From 45301bcd97825206f7f19b25a4ad722e7dc13cc6 Mon Sep 17 00:00:00 2001
From: Ravishankar N
Date: Mon, 18 Jan 2016 12:16:31 +0000
Subject: cli/ afr: op_ret for index heal launch

Backport of http://review.gluster.org/#/c/13303/

Problem:
If index heal is launched when some of the bricks are down, glustershd
of that node sends a -1 op_ret to glusterd, which eventually propagates
it to the CLI. Also, glusterd sometimes sends an err_str and sometimes
does not (depending on whether the failure happens in the brick-op
phase or the commit-op phase), so the message that gets displayed
varies from case to case:

"Launching heal operation to perform index self heal on volume testvol
has been unsuccessful"
(OR)
"Commit failed on . Please check log file for details."

Fix:
1. Modify afr_xl_op() to return -1 if index healing fails for at least
   one brick.
2. Ignore glusterd's error string in gf_cli_heal_volume_cbk() and print
   a more meaningful message.

The patch also fixes a bug in glusterfs_handle_translator_op(): if we
encounter an error in the notify of one xlator, we break out of the
loop instead of sending the notify to the remaining xlators.

Change-Id: I957f6c4b4d0a45453ffd5488e425cab5a3e0acca
BUG: 1306922
Signed-off-by: Ravishankar N
Reviewed-on: http://review.gluster.org/13435
Smoke: Gluster Build System
NetBSD-regression: NetBSD Build System
CentOS-regression: Gluster Build System
Reviewed-by: Pranith Kumar Karampuri
---
 cli/src/cli-rpc-ops.c                    | 11 ++++-------
 glusterfsd/src/glusterfsd-mgmt.c         |  6 +++++-
 tests/basic/afr/arbiter.t                |  2 +-
 xlators/cluster/afr/src/afr-self-heald.c |  5 +++--
 4 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/cli/src/cli-rpc-ops.c b/cli/src/cli-rpc-ops.c
index b4fbd29c1f1..416b1e09539 100644
--- a/cli/src/cli-rpc-ops.c
+++ b/cli/src/cli-rpc-ops.c
@@ -8475,13 +8475,10 @@ gf_cli_heal_volume_cbk (struct rpc_req *req, struct iovec *iov,
         }
 
         if (rsp.op_ret) {
-                if (strcmp (rsp.op_errstr, "")) {
-                        cli_err ("%s", rsp.op_errstr);
-                } else {
-                        cli_err ("%s%s on volume %s has been unsuccessful",
-                                 operation, heal_op_str, volname);
-                }
-
+                cli_err ("%s%s on volume %s has been unsuccessful on "
+                         "bricks that are down. Please check if all brick "
+                         "processes are running.",
+                         operation, heal_op_str, volname);
                 ret = rsp.op_ret;
                 goto out;
         } else {
diff --git a/glusterfsd/src/glusterfsd-mgmt.c b/glusterfsd/src/glusterfsd-mgmt.c
index 877adba2938..da801779079 100644
--- a/glusterfsd/src/glusterfsd-mgmt.c
+++ b/glusterfsd/src/glusterfsd-mgmt.c
@@ -563,6 +563,7 @@ int
 glusterfs_handle_translator_op (rpcsvc_request_t *req)
 {
         int32_t ret = -1;
+        int32_t op_ret = 0;
         gd1_mgmt_brick_op_req xlator_req = {0,};
         dict_t *input = NULL;
         xlator_t *xlator = NULL;
@@ -632,9 +633,12 @@ glusterfs_handle_translator_op (rpcsvc_request_t *req)
                 ret = dict_get_str (input, key, &xname);
                 xlator = xlator_search_by_name (any, xname);
                 XLATOR_NOTIFY (xlator, GF_EVENT_TRANSLATOR_OP, input, output);
+                /* If notify fails for an xlator we need to capture it but
+                 * continue with the loop. */
                 if (ret)
-                        break;
+                        op_ret = -1;
         }
+        ret = op_ret;
 out:
         glusterfs_xlator_op_response_send (req, ret, "", output);
         if (input)
diff --git a/tests/basic/afr/arbiter.t b/tests/basic/afr/arbiter.t
index f06fdb1c49f..be8f676d1ec 100644
--- a/tests/basic/afr/arbiter.t
+++ b/tests/basic/afr/arbiter.t
@@ -49,7 +49,7 @@ TEST $CLI volume set $V0 cluster.self-heal-daemon on
 EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status
 EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0
 EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2
-TEST $CLI volume heal $V0
+$CLI volume heal $V0
 EXPECT_WITHIN $HEAL_TIMEOUT '1' echo $(count_sh_entries $B0/$V0"1")
 EXPECT_WITHIN $HEAL_TIMEOUT '1' echo $(count_sh_entries $B0/$V0"2")
diff --git a/xlators/cluster/afr/src/afr-self-heald.c b/xlators/cluster/afr/src/afr-self-heald.c
index 61b8b01afb4..9b8b8e85f2b 100644
--- a/xlators/cluster/afr/src/afr-self-heald.c
+++ b/xlators/cluster/afr/src/afr-self-heald.c
@@ -1058,7 +1058,7 @@ afr_xl_op (xlator_t *this, dict_t *input, dict_t *output)
                 goto out;
         switch (op) {
         case GF_SHD_OP_HEAL_INDEX:
-                op_ret = -1;
+                op_ret = 0;
 
                 for (i = 0; i < priv->child_count; i++) {
                         healer = &shd->index_healers[i];
@@ -1067,10 +1067,12 @@ afr_xl_op (xlator_t *this, dict_t *input, dict_t *output)
                         if (!priv->child_up[i]) {
                                 ret = dict_set_str (output, key,
                                                     "Brick is not connected");
+                                op_ret = -1;
                         } else if (AFR_COUNT (priv->child_up,
                                               priv->child_count) < 2) {
                                 ret = dict_set_str (output, key,
                                                     "< 2 bricks in replica are up");
+                                op_ret = -1;
                         } else if (!afr_shd_is_subvol_local (this, healer->subvol)) {
                                 ret = dict_set_str (output, key,
                                                     "Brick is remote");
@@ -1078,7 +1080,6 @@ afr_xl_op (xlator_t *this, dict_t *input, dict_t *output)
                                 ret = dict_set_str (output, key,
                                                     "Started self-heal");
                                 afr_shd_index_healer_spawn (this, i);
-                                op_ret = 0;
                         }
                 }
                 break;
--
cgit
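
Note: both glusterfs_handle_translator_op() and afr_xl_op() now follow the
same pattern: on a per-xlator/per-brick failure, record it in an aggregate
op_ret and keep iterating instead of breaking out of the loop, so the
remaining children still receive the operation and a single -1 is reported
at the end. The following is a minimal standalone C sketch of that pattern,
not Gluster code; the names CHILD_COUNT, child_is_up and run_op_on_child
are made up for illustration.

/*
 * Sketch of the error-aggregation pattern used by this patch: loop over
 * all children, remember that at least one failed, report -1 once.
 */
#include <stdbool.h>
#include <stdio.h>

#define CHILD_COUNT 3

/* Pretend the second brick/child is down. */
static bool child_is_up[CHILD_COUNT] = { true, false, true };

static int
run_op_on_child (int i)
{
        if (!child_is_up[i]) {
                fprintf (stderr, "child %d: brick is not connected\n", i);
                return -1;
        }
        printf ("child %d: started self-heal\n", i);
        return 0;
}

int
main (void)
{
        int op_ret = 0;   /* aggregate result, like op_ret in the patch */
        int i;

        for (i = 0; i < CHILD_COUNT; i++) {
                /* Capture the failure but continue with the loop, so the
                 * remaining children are still processed. */
                if (run_op_on_child (i))
                        op_ret = -1;
        }

        /* Non-zero exit if the operation failed on any child. */
        return op_ret ? 1 : 0;
}

With the old behaviour, the equivalent of "break" on the first failure
meant that children after the first down brick never saw the operation
at all.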