diff options
| author | anand <anekkunt@redhat.com> | 2015-07-21 15:42:24 +0530 | 
|---|---|---|
| committer | Atin Mukherjee <amukherj@redhat.com> | 2015-07-27 20:42:17 -0700 | 
| commit | 65e6ab1bfbbec7755f7ae2294cb83334ac65a296 (patch) | |
| tree | e8c0febdc939c1baa3df83b7656dc58bd93a3a4f | |
| parent | ca67ac071c56a3bd6f2b2ba3a958f0305db50a3d (diff) | |
glusterd: getting txn_id from frame->cookie in op_sm call back
RCA: If rebalance start is triggered from one node and one of other nodes in the cluster goes down simultaneously
we might end up in a case where callback will use the txn_id from priv->global_txn_id which is always zeros and
this means injecting an event with an incorrect txn_id will result into op-sm getting stuck.
fix: set txn_id in frame->cookie during sumbit_and_request, so that we can get txn_id in call back
functions.
Change-Id: I519176c259ea9d37897791a77a7c92eb96d10052
BUG: 1245142
Signed-off-by: anand <anekkunt@redhat.com>
Reviewed-on: http://review.gluster.org/11728
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
Tested-by: NetBSD Build System <jenkins@build.gluster.org>
Tested-by: Gluster Build System <jenkins@build.gluster.com>
| -rw-r--r-- | tests/bugs/glusterd/bug-1245142-rebalance_test.t | 28 | ||||
| -rw-r--r-- | xlators/mgmt/glusterd/src/glusterd-rpc-ops.c | 77 | 
2 files changed, 89 insertions, 16 deletions
diff --git a/tests/bugs/glusterd/bug-1245142-rebalance_test.t b/tests/bugs/glusterd/bug-1245142-rebalance_test.t new file mode 100644 index 00000000000..a28810ea71c --- /dev/null +++ b/tests/bugs/glusterd/bug-1245142-rebalance_test.t @@ -0,0 +1,28 @@ +#!/bin/bash + +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../cluster.rc +. $(dirname $0)/../../volume.rc + + +cleanup; +TEST launch_cluster 2; +TEST $CLI_1 peer probe $H2; + +EXPECT_WITHIN $PROBE_TIMEOUT 1 peer_count + +$CLI_1 volume create $V0 $H1:$B1/$V0  $H2:$B2/$V0 +EXPECT 'Created' cluster_volinfo_field 1 $V0 'Status'; + +$CLI_1 volume start $V0 +EXPECT 'Started' cluster_volinfo_field 1 $V0 'Status'; + +$CLI_1 volume rebalance $V0  start & +#kill glusterd2 after requst sent, so that call back is called +#with rpc->status fail ,so roughly 1sec delay is introduced to get this scenario. +sleep 1 +kill_glusterd 2 +#check glusterd commands are working after rebalance start command +EXPECT 'Started' cluster_volinfo_field 1 $V0 'Status'; + +cleanup; diff --git a/xlators/mgmt/glusterd/src/glusterd-rpc-ops.c b/xlators/mgmt/glusterd/src/glusterd-rpc-ops.c index 99f37b05ae9..7a54866d192 100644 --- a/xlators/mgmt/glusterd/src/glusterd-rpc-ops.c +++ b/xlators/mgmt/glusterd/src/glusterd-rpc-ops.c @@ -1126,14 +1126,17 @@ __glusterd_stage_op_cbk (struct rpc_req *req, struct iovec *iov,          xlator_t                      *this = NULL;          glusterd_conf_t               *priv = NULL;          uuid_t                        *txn_id = NULL; +        call_frame_t                  *frame = NULL;          this = THIS;          GF_ASSERT (this);          GF_ASSERT (req);          priv = this->private;          GF_ASSERT (priv); +        GF_ASSERT(myframe); -        txn_id = &priv->global_txn_id; +        frame = myframe; +        txn_id = frame->cookie;          if (-1 == req->rpc_status) {                  rsp.op_ret   = -1; @@ -1191,10 +1194,6 @@ out:                          uuid_utoa (rsp.uuid));          } -        ret = dict_get_bin (dict, "transaction_id", (void **)&txn_id); -        gf_msg_debug (this->name, 0, "transaction ID = %s", -                uuid_utoa (*txn_id)); -          rcu_read_lock ();          peerinfo = glusterd_peerinfo_find (rsp.uuid, NULL);          if (peerinfo == NULL) { @@ -1242,6 +1241,7 @@ out:          } else {                  free (rsp.dict.dict_val); //malloced by xdr          } +        GF_FREE (frame->cookie);          GLUSTERD_STACK_DESTROY (((call_frame_t *)myframe));          return ret;  } @@ -1270,14 +1270,17 @@ __glusterd_commit_op_cbk (struct rpc_req *req, struct iovec *iov,          glusterd_conf_t               *priv = NULL;          uuid_t                        *txn_id = NULL;          glusterd_op_info_t            txn_op_info = {{0},}; +        call_frame_t                  *frame  = NULL;          this = THIS;          GF_ASSERT (this);          GF_ASSERT (req);          priv = this->private;          GF_ASSERT (priv); +        GF_ASSERT(myframe); -        txn_id = &priv->global_txn_id; +        frame = myframe; +        txn_id = frame->cookie;          if (-1 == req->rpc_status) {                  rsp.op_ret   = -1; @@ -1335,9 +1338,6 @@ __glusterd_commit_op_cbk (struct rpc_req *req, struct iovec *iov,                          "Received commit ACC from uuid: %s",                          uuid_utoa (rsp.uuid));          } -        ret = dict_get_bin (dict, "transaction_id", (void **)&txn_id); -        gf_msg_debug (this->name, 0, "transaction ID = %s", -                uuid_utoa (*txn_id));          ret = glusterd_get_txn_opinfo (txn_id, &txn_op_info);          if (ret) { @@ -1411,6 +1411,7 @@ out:          if (dict)                  dict_unref (dict);          free (rsp.op_errstr); //malloced by xdr +        GF_FREE (frame->cookie);          GLUSTERD_STACK_DESTROY (((call_frame_t *)myframe));          return ret;  } @@ -1895,9 +1896,9 @@ glusterd_stage_op (call_frame_t *frame, xlator_t *this,          int                             ret = -1;          glusterd_peerinfo_t             *peerinfo = NULL;          glusterd_conf_t                 *priv = NULL; -        call_frame_t                    *dummy_frame = NULL;          dict_t                          *dict = NULL;          gf_boolean_t                    is_alloc = _gf_true; +        uuid_t                          *txn_id      = NULL;          if (!this) {                  goto out; @@ -1927,13 +1928,34 @@ glusterd_stage_op (call_frame_t *frame, xlator_t *this,                          "to request buffer");                  goto out;          } +        /* Sending valid transaction ID to peers */ +        ret = dict_get_bin (dict, "transaction_id", +                            (void **)&txn_id); +        if (ret) { +                gf_msg (this->name, GF_LOG_ERROR, 0, +                        GD_MSG_TRANS_ID_GET_FAIL, +                       "Failed to get transaction id."); +                goto out; +        } else { +                gf_msg_debug (this->name, 0, +                        "Transaction_id = %s", uuid_utoa (*txn_id)); +        } +        if (!frame) +                frame = create_frame (this, this->ctx->pool); -        dummy_frame = create_frame (this, this->ctx->pool); -        if (!dummy_frame) +        if (!frame) { +                ret = -1;                  goto out; +        } +        frame->cookie = GF_CALLOC (1, sizeof(uuid_t), gf_common_mt_uuid_t); +        if (!frame->cookie) { +                ret = -1; +                goto out; +        } +        gf_uuid_copy (frame->cookie, *txn_id); -        ret = glusterd_submit_request (peerinfo->rpc, &req, dummy_frame, +        ret = glusterd_submit_request (peerinfo->rpc, &req, frame,                                         peerinfo->mgmt, GLUSTERD_MGMT_STAGE_OP,                                         NULL,                                         this, glusterd_stage_op_cbk, @@ -1958,6 +1980,7 @@ glusterd_commit_op (call_frame_t *frame, xlator_t *this,          call_frame_t           *dummy_frame = NULL;          dict_t                 *dict        = NULL;          gf_boolean_t            is_alloc    = _gf_true; +        uuid_t                 *txn_id      = NULL;          if (!this) {                  goto out; @@ -1986,12 +2009,34 @@ glusterd_commit_op (call_frame_t *frame, xlator_t *this,                          "request buffer");                  goto out;          } +        /* Sending valid transaction ID to peers */ +        ret = dict_get_bin (dict, "transaction_id", +                            (void **)&txn_id); +        if (ret) { +                gf_msg (this->name, GF_LOG_ERROR, 0, +                        GD_MSG_TRANS_ID_GET_FAIL, +                       "Failed to get transaction id."); +                goto out; +        } else { +                gf_msg_debug (this->name, 0, +                        "Transaction_id = %s", uuid_utoa (*txn_id)); +        } -        dummy_frame = create_frame (this, this->ctx->pool); -        if (!dummy_frame) +        if (!frame) +                frame = create_frame (this, this->ctx->pool); + +        if (!frame) { +                ret = -1;                  goto out; +        } +        frame->cookie = GF_CALLOC (1, sizeof(uuid_t), gf_common_mt_uuid_t); +        if (!frame->cookie) { +                ret = -1; +                goto out; +        } +        gf_uuid_copy (frame->cookie, *txn_id); -        ret = glusterd_submit_request (peerinfo->rpc, &req, dummy_frame, +        ret = glusterd_submit_request (peerinfo->rpc, &req, frame,                                         peerinfo->mgmt, GLUSTERD_MGMT_COMMIT_OP,                                         NULL,                                         this, glusterd_commit_op_cbk,  | 
