From c5a5fea9e6a1f1709e6826c1eea89dfd25cc496b Mon Sep 17 00:00:00 2001 From: Pavan Sondur Date: Thu, 23 Sep 2010 09:18:37 +0000 Subject: mgmt/glusterd: Misc fixes to pump / cli / glusterd wrt replace brick. Patches from Vijay and Shishir have been pulled in into this one big patch. Signed-off-by: Pavan Vilas Sondur Signed-off-by: Vijay Bellur BUG: 1235 (Bug for all pump/migrate commits) URL: http://bugs.gluster.com/cgi-bin/bugzilla3/show_bug.cgi?id=1235 --- xlators/mgmt/glusterd/src/glusterd-op-sm.c | 370 ++++++++++++++++++----------- 1 file changed, 235 insertions(+), 135 deletions(-) (limited to 'xlators/mgmt/glusterd/src/glusterd-op-sm.c') diff --git a/xlators/mgmt/glusterd/src/glusterd-op-sm.c b/xlators/mgmt/glusterd/src/glusterd-op-sm.c index b8d2778e39a..81762320f55 100644 --- a/xlators/mgmt/glusterd/src/glusterd-op-sm.c +++ b/xlators/mgmt/glusterd/src/glusterd-op-sm.c @@ -1643,7 +1643,6 @@ rb_src_brick_restart (glusterd_volinfo_t *volinfo, } glusterd_delete_volfile (volinfo, src_brickinfo); - glusterd_store_delete_brick (volinfo, src_brickinfo); if (activate_pump) { ret = rb_regenerate_volfiles (volinfo, 1); @@ -1694,12 +1693,12 @@ rb_send_xattr_command (glusterd_volinfo_t *volinfo, RB_CLIENT_MOUNTPOINT); ret = stat (mount_point_path, &buf); - if (ret) { - gf_log ("", GF_LOG_DEBUG, - "stat failed. Could not send " - " %s command", xattr_key); - goto out; - } + if (ret) { + gf_log ("", GF_LOG_DEBUG, + "stat failed. Could not send " + " %s command", xattr_key); + goto out; + } ret = lsetxattr (mount_point_path, xattr_key, value, @@ -1789,13 +1788,13 @@ rb_spawn_glusterfs_client (glusterd_volinfo_t *volinfo, ret = stat (cmd_str, &buf); if (ret) { - gf_log ("", GF_LOG_DEBUG, - "stat on mountpoint failed"); - goto out; - } + gf_log ("", GF_LOG_DEBUG, + "stat on mountpoint failed"); + goto out; + } - gf_log ("", GF_LOG_DEBUG, - "stat on mountpoint succeeded"); + gf_log ("", GF_LOG_DEBUG, + "stat on mountpoint succeeded"); ret = 0; @@ -2083,13 +2082,6 @@ rb_do_operation_start (glusterd_volinfo_t *volinfo, char start_value[8192] = {0,}; int ret = -1; - ret = rb_src_brick_restart (volinfo, src_brickinfo, - 1); - if (ret) { - gf_log ("", GF_LOG_DEBUG, - "Could not restart src-brick"); - goto out; - } gf_log ("", GF_LOG_DEBUG, "replace-brick sending start xattr"); @@ -2172,12 +2164,21 @@ rb_do_operation_pause (glusterd_volinfo_t *volinfo, goto out; } + gf_log ("", GF_LOG_DEBUG, "unmounted the replace brick client"); ret = 0; out: + if (!glusterd_is_local_addr (src_brickinfo->hostname)) { + ret = rb_src_brick_restart (volinfo, src_brickinfo, + 0); + if (ret) { + gf_log ("", GF_LOG_DEBUG, + "Could not restart src-brick"); + } + } return ret; } @@ -2194,7 +2195,7 @@ rb_kill_destination_brick (glusterd_volinfo_t *volinfo, priv->workdir, volinfo->volname, RB_DSTBRICK_PIDFILE); - return glusterd_service_stop ("brick", pidfile, SIGQUIT, _gf_true); + return glusterd_service_stop ("brick", pidfile, SIGTERM, _gf_true); } static int @@ -2204,14 +2205,6 @@ rb_do_operation_abort (glusterd_volinfo_t *volinfo, { int ret = -1; - ret = rb_src_brick_restart (volinfo, src_brickinfo, - 0); - if (ret) { - gf_log ("", GF_LOG_DEBUG, - "Could not restart src-brick"); - goto out; - } - gf_log ("", GF_LOG_DEBUG, "replace-brick sending abort xattr"); @@ -2283,7 +2276,7 @@ rb_get_xattr_command (glusterd_volinfo_t *volinfo, glusterd_brickinfo_t *src_brickinfo, glusterd_brickinfo_t *dst_brickinfo, const char *xattr_key, - const char **value) + char *value) { glusterd_conf_t *priv = NULL; char mount_point_path[PATH_MAX] = {0,}; @@ -2296,19 +2289,19 @@ rb_get_xattr_command (glusterd_volinfo_t *volinfo, priv->workdir, volinfo->volname, RB_CLIENT_MOUNTPOINT); - ret = stat (mount_point_path, &buf); - if (ret) { - gf_log ("", GF_LOG_DEBUG, - "stat failed. Could not send " - " %s command", xattr_key); - goto out; - } + ret = stat (mount_point_path, &buf); + if (ret) { + gf_log ("", GF_LOG_DEBUG, + "stat failed. Could not send " + " %s command", xattr_key); + goto out; + } ret = lgetxattr (mount_point_path, xattr_key, - (char *)(*value), + value, 8192); - if (ret) { + if (ret < 0) { gf_log ("", GF_LOG_DEBUG, "getxattr failed"); goto out; @@ -2325,7 +2318,7 @@ rb_do_operation_status (glusterd_volinfo_t *volinfo, glusterd_brickinfo_t *src_brickinfo, glusterd_brickinfo_t *dst_brickinfo) { - const char *status = NULL; + char status[2048] = {0,}; char *status_reply = NULL; dict_t *ctx = NULL; int ret = -1; @@ -2346,7 +2339,7 @@ rb_do_operation_status (glusterd_volinfo_t *volinfo, ret = rb_get_xattr_command (volinfo, src_brickinfo, dst_brickinfo, RB_PUMP_STATUS_CMD, - &status); + status); if (ret) { gf_log ("", GF_LOG_DEBUG, "Failed to get status from pump"); @@ -2395,7 +2388,7 @@ out: } static int -glusterd_op_replace_brick (gd1_mgmt_stage_op_req *req) +glusterd_op_replace_brick (gd1_mgmt_stage_op_req *req, dict_t *rsp_dict) { int ret = 0; dict_t *dict = NULL; @@ -2485,21 +2478,34 @@ glusterd_op_replace_brick (gd1_mgmt_stage_op_req *req) gf_log ("", GF_LOG_NORMAL, "adding src-brick port no"); - ctx = glusterd_op_get_ctx (GD_OP_REPLACE_BRICK); - if (!ctx) { - gf_log ("", GF_LOG_ERROR, - "Operation Context is not present"); - ret = -1; - goto out; + src_brickinfo->port = pmap_registry_search (this, + src_brickinfo->path, GF_PMAP_PORT_BRICKSERVER); + if (!src_brickinfo->port) { + gf_log ("", GF_LOG_ERROR, + "Src brick port not available"); + ret = -1; + goto out; } - ret = dict_set_int32 (ctx, "src-brick-port", - src_brickinfo->port); - if (ret) { - gf_log ("", GF_LOG_DEBUG, - "Could not set src-brick port no"); - goto out; + if (rsp_dict) { + ret = dict_set_int32 (rsp_dict, "src-brick-port", src_brickinfo->port); + if (ret) { + gf_log ("", GF_LOG_DEBUG, + "Could not set src-brick port no"); + goto out; + } + } else { + ctx = glusterd_op_get_ctx (GD_OP_REPLACE_BRICK); + GF_ASSERT (ctx); + + ret = dict_set_int32 (ctx, "src-brick-port", src_brickinfo->port); + if (ret) { + gf_log ("", GF_LOG_DEBUG, + "Could not set src-brick port no"); + goto out; + } } + } switch (replace_op) { @@ -2515,8 +2521,18 @@ glusterd_op_replace_brick (gd1_mgmt_stage_op_req *req) goto out; } } - } - break; + + if (!glusterd_is_local_addr (src_brickinfo->hostname)) { + ret = rb_src_brick_restart (volinfo, src_brickinfo, + 1); + if (ret) { + gf_log ("", GF_LOG_DEBUG, + "Could not restart src-brick"); + goto out; + } + } + break; + } case GF_REPLACE_OP_COMMIT: { @@ -2544,6 +2560,7 @@ glusterd_op_replace_brick (gd1_mgmt_stage_op_req *req) goto out; } } + } break; @@ -3492,8 +3509,23 @@ out: return ret; } -static int -glusterd_op_ac_rcvd_commit_op_acc (glusterd_op_sm_event_t *event, void *ctx) +static gf_boolean_t +rb_check_brick_signin (glusterd_brickinfo_t *brickinfo) +{ + gf_boolean_t value; + + value = brickinfo->signed_in; + + if (value == _gf_true) { + gf_log ("", GF_LOG_DEBUG, + "Brick has signed in. Continuing..."); + } + + return value; +} + +void +glusterd_do_replace_brick (void *data) { glusterd_volinfo_t *volinfo = NULL; int32_t op = 0; @@ -3502,110 +3534,178 @@ glusterd_op_ac_rcvd_commit_op_acc (glusterd_op_sm_event_t *event, void *ctx) char *src_brick = NULL; char *dst_brick = NULL; char *volname = NULL; + gf_boolean_t brick_signin = _gf_false; glusterd_brickinfo_t *src_brickinfo = NULL; glusterd_brickinfo_t *dst_brickinfo = NULL; - int ret = 0; + glusterd_conf_t *priv = NULL; - GF_ASSERT (event); + int ret = 0; - opinfo.pending_count--; + dict = data; - if (opinfo.pending_count) - goto out; + GF_ASSERT (THIS); - dict = glusterd_op_get_ctx (GD_OP_REPLACE_BRICK); - if (dict) { + priv = THIS->private; + + if (priv->timer) { + gf_timer_call_cancel (THIS->ctx, priv->timer); + priv->timer = NULL; gf_log ("", GF_LOG_DEBUG, - "Replace brick operation detected"); + "Cancelled timer thread"); + } - ret = dict_get_int32 (dict, "operation", &op); - if (ret) { - gf_log ("", GF_LOG_DEBUG, - "dict_get on operation failed"); - goto out; - } - ret = dict_get_str (dict, "src-brick", &src_brick); - if (ret) { - gf_log ("", GF_LOG_ERROR, "Unable to get src brick"); - goto out; - } + gf_log ("", GF_LOG_DEBUG, + "Replace brick operation detected"); + ret = dict_get_int32 (dict, "operation", &op); + if (ret) { gf_log ("", GF_LOG_DEBUG, - "src brick=%s", src_brick); + "dict_get on operation failed"); + goto out; + } + ret = dict_get_str (dict, "src-brick", &src_brick); + if (ret) { + gf_log ("", GF_LOG_ERROR, "Unable to get src brick"); + goto out; + } - ret = dict_get_str (dict, "dst-brick", &dst_brick); - if (ret) { - gf_log ("", GF_LOG_ERROR, "Unable to get dst brick"); - goto out; - } + gf_log ("", GF_LOG_DEBUG, + "src brick=%s", src_brick); - gf_log ("", GF_LOG_DEBUG, - "dst brick=%s", dst_brick); + ret = dict_get_str (dict, "dst-brick", &dst_brick); + if (ret) { + gf_log ("", GF_LOG_ERROR, "Unable to get dst brick"); + goto out; + } - ret = dict_get_str (dict, "volname", &volname); + gf_log ("", GF_LOG_DEBUG, + "dst brick=%s", dst_brick); - if (ret) { - gf_log ("", GF_LOG_ERROR, "Unable to get volume name"); - goto out; - } + ret = dict_get_str (dict, "volname", &volname); - ret = glusterd_volinfo_find (volname, &volinfo); - if (ret) { - gf_log ("", GF_LOG_ERROR, "Unable to allocate memory"); - goto out; - } + if (ret) { + gf_log ("", GF_LOG_ERROR, "Unable to get volume name"); + goto out; + } - ret = glusterd_brickinfo_get (src_brick, volinfo, &src_brickinfo); - if (ret) { - gf_log ("", GF_LOG_DEBUG, "Unable to get src-brickinfo"); - goto out; - } + ret = glusterd_volinfo_find (volname, &volinfo); + if (ret) { + gf_log ("", GF_LOG_ERROR, "Unable to allocate memory"); + goto out; + } - ret = glusterd_brickinfo_from_brick (dst_brick, &dst_brickinfo); - if (ret) { - gf_log ("", GF_LOG_DEBUG, "Unable to get dst-brickinfo"); - goto out; - } + ret = glusterd_brickinfo_get (src_brick, volinfo, &src_brickinfo); + if (ret) { + gf_log ("", GF_LOG_DEBUG, "Unable to get src-brickinfo"); + goto out; + } - ret = dict_get_int32 (dict, "src-brick-port", &src_port); - if (ret) { - gf_log ("", GF_LOG_ERROR, "Unable to get src-brick port"); - goto out; - } + ret = glusterd_brickinfo_from_brick (dst_brick, &dst_brickinfo); + if (ret) { + gf_log ("", GF_LOG_DEBUG, "Unable to get dst-brickinfo"); + goto out; + } - src_brickinfo->port = src_port; + ret = dict_get_int32 (dict, "src-brick-port", &src_port); + if (ret) { + gf_log ("", GF_LOG_ERROR, "Unable to get src-brick port"); + goto out; + } - switch (op) { - case GF_REPLACE_OP_START: - ret = rb_do_operation_start (volinfo, src_brickinfo, dst_brickinfo); - break; - case GF_REPLACE_OP_COMMIT: - ret = rb_do_operation_commit (volinfo, src_brickinfo, dst_brickinfo); - break; - case GF_REPLACE_OP_PAUSE: - ret = rb_do_operation_pause (volinfo, src_brickinfo, dst_brickinfo); - break; - case GF_REPLACE_OP_ABORT: - ret = rb_do_operation_abort (volinfo, src_brickinfo, dst_brickinfo); - break; - case GF_REPLACE_OP_STATUS: - ret = rb_do_operation_status (volinfo, src_brickinfo, dst_brickinfo); - break; - default: - ret = -1; - goto out; - } + src_brickinfo->port = src_port; + brick_signin = rb_check_brick_signin (src_brickinfo); + if (brick_signin == _gf_false) { + gf_log ("", GF_LOG_DEBUG, + "Marking replace brick to fail due to brick " + "not having signed-in in 10secs"); + ret = -1; + goto out; } + switch (op) { + case GF_REPLACE_OP_START: + ret = rb_do_operation_start (volinfo, src_brickinfo, dst_brickinfo); + break; + case GF_REPLACE_OP_COMMIT: + ret = rb_do_operation_commit (volinfo, src_brickinfo, dst_brickinfo); + break; + case GF_REPLACE_OP_PAUSE: + ret = rb_do_operation_pause (volinfo, src_brickinfo, dst_brickinfo); + break; + case GF_REPLACE_OP_ABORT: + ret = rb_do_operation_abort (volinfo, src_brickinfo, dst_brickinfo); + break; + case GF_REPLACE_OP_STATUS: + ret = rb_do_operation_status (volinfo, src_brickinfo, dst_brickinfo); + break; + default: + ret = -1; + goto out; + } + +out: if (ret) ret = glusterd_op_sm_inject_event (GD_OP_EVENT_RCVD_RJT, NULL); else ret = glusterd_op_sm_inject_event (GD_OP_EVENT_COMMIT_ACC, NULL); - gf_log ("", GF_LOG_DEBUG, "Returning %d", ret); + glusterd_op_sm (); +} + +static int +glusterd_op_ac_rcvd_commit_op_acc (glusterd_op_sm_event_t *event, void *ctx) +{ + glusterd_conf_t *priv = NULL; + dict_t *dict = NULL; + int ret = 0; + gf_boolean_t commit_ack_inject = _gf_false; + int32_t op = 0; + struct timeval timeout = {0, }; + priv = THIS->private; + GF_ASSERT (event); + + opinfo.pending_count--; + + if (opinfo.pending_count) + goto out; + + dict = glusterd_op_get_ctx (GD_OP_REPLACE_BRICK); + if (dict) { + if (op == GF_REPLACE_OP_START || + op == GF_REPLACE_OP_ABORT) + timeout.tv_sec = 5; + else + timeout.tv_sec = 1; + + timeout.tv_usec = 0; + + ret = dict_get_int32 (dict, "operation", &op); + if (ret) { + gf_log ("", GF_LOG_DEBUG, + "dict_get on operation failed"); + goto out; + } + + priv->timer = gf_timer_call_after (THIS->ctx, timeout, + glusterd_do_replace_brick, + (void *) dict); + + ret = 0; + commit_ack_inject = _gf_false; + goto out; + } + + commit_ack_inject = _gf_true; out: + if (commit_ack_inject) { + if (ret) + ret = glusterd_op_sm_inject_event (GD_OP_EVENT_RCVD_RJT, NULL); + else + ret = glusterd_op_sm_inject_event (GD_OP_EVENT_COMMIT_ACC, NULL); + } + return ret; } @@ -4003,8 +4103,8 @@ glusterd_op_sm_transition_state (glusterd_op_info_t *opinfo, GF_ASSERT (state); GF_ASSERT (opinfo); - gf_log ("", GF_LOG_NORMAL, "Transitioning from %d to %d", - opinfo->state.state, state[event_type].next_state); + gf_log ("", GF_LOG_NORMAL, "Transitioning from %d to %d due to event %d", + opinfo->state.state, state[event_type].next_state, event_type); opinfo->state.state = state[event_type].next_state; return 0; @@ -4103,7 +4203,7 @@ glusterd_op_commit_perform (gd1_mgmt_stage_op_req *req, char **op_errstr, break; case GD_OP_REPLACE_BRICK: - ret = glusterd_op_replace_brick (req); + ret = glusterd_op_replace_brick (req, rsp_dict); break; case GD_OP_SET_VOLUME: -- cgit