From d06692d1deec425f74747e2c463e56f7eca981c8 Mon Sep 17 00:00:00 2001 From: Anuradha Date: Wed, 25 Feb 2015 15:09:28 +0530 Subject: cluster/afr : enable inspection & resolution of files in split-brain Part 2/2 patch to enable users analyze and resolve split-brain. This patch enables : 1) Users to inspect the files in data and metadata split-brain. 2) Resolve the split-brain. Both using a series of setfattr commands. Consider a volume "test" with 2 bricks. 1) To inspect a file f1: setfattr -n replica.split-brain-choice -v test-client-0 f1 After the execution of this command, if no read_subvol is found, reads will be served from test-client-0 (corresponding to brick-0). 2) To resolve split-brain : setfattr -n replica.split-brain-heal-finalize -v test-client-0 f1 Execution of this command will lead to the resolution of data and metadata split-brain with subvol mentioned in the command (test-client-0 here) as the source and the rest as sink. Change-Id: Ia20f3ee5abd3119e3d54fcc599f1e55ac65fd179 BUG: 1191396 Signed-off-by: Anuradha Reviewed-on: http://review.gluster.org/9743 Reviewed-by: Pranith Kumar Karampuri Tested-by: Gluster Build System --- xlators/cluster/afr/src/afr-common.c | 184 ++++++++++++++++++++++--- xlators/cluster/afr/src/afr-inode-write.c | 146 +++++++++++++++++++- xlators/cluster/afr/src/afr-read-txn.c | 7 + xlators/cluster/afr/src/afr-self-heal-common.c | 18 +-- xlators/cluster/afr/src/afr.h | 14 ++ 5 files changed, 336 insertions(+), 33 deletions(-) (limited to 'xlators/cluster') diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index f7cc202d4d1..0af46993a34 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -67,6 +67,37 @@ afr_copy_frame (call_frame_t *base) return frame; } +int +__afr_inode_ctx_get (xlator_t *this, inode_t *inode, afr_inode_ctx_t **ctx) +{ + uint64_t ctx_int = 0; + int ret = -1; + afr_inode_ctx_t *tmp_ctx = NULL; + + ret = __inode_ctx_get (inode, 
this, &ctx_int); + if (ret) { + tmp_ctx = GF_CALLOC (1, sizeof (afr_inode_ctx_t), + gf_afr_mt_inode_ctx_t); + if (!tmp_ctx) + goto out; + + ctx_int = (long) tmp_ctx; + ret = __inode_ctx_set (inode, this, &ctx_int); + if (ret) { + GF_FREE (tmp_ctx); + goto out; + } + tmp_ctx->spb_choice = -1; + tmp_ctx->read_subvol = 0; + } else { + tmp_ctx = (afr_inode_ctx_t *) ctx_int; + } + + *ctx = tmp_ctx; + ret = 0; +out: + return ret; +} /* * INODE CTX 64-bit VALUE FORMAT FOR SMALL (<= 16) SUBVOL COUNTS: * @@ -109,13 +140,16 @@ __afr_inode_read_subvol_get_small (inode_t *inode, xlator_t *this, uint32_t event = 0; uint64_t val = 0; int i = 0; + afr_inode_ctx_t *ctx = NULL; priv = this->private; - ret = __inode_ctx_get (inode, this, &val); + ret = __afr_inode_ctx_get (this, inode, &ctx); if (ret < 0) return ret; + val = ctx->read_subvol; + metadatamap = (val & 0x000000000000ffff); datamap = (val & 0x00000000ffff0000) >> 16; event = (val & 0xffffffff00000000) >> 32; @@ -143,9 +177,15 @@ __afr_inode_read_subvol_set_small (inode_t *inode, xlator_t *this, uint16_t metadatamap = 0; uint64_t val = 0; int i = 0; + int ret = -1; + afr_inode_ctx_t *ctx = NULL; priv = this->private; + ret = __afr_inode_ctx_get (this, inode, &ctx); + if (ret) + goto out; + for (i = 0; i < priv->child_count; i++) { if (data[i]) datamap |= (1 << i); @@ -157,9 +197,12 @@ __afr_inode_read_subvol_set_small (inode_t *inode, xlator_t *this, (((uint64_t) datamap) << 16) | (((uint64_t) event) << 32); - return __inode_ctx_set (inode, this, &val); -} + ctx->read_subvol = val; + ret = 0; +out: + return ret; +} int __afr_inode_read_subvol_reset_small (inode_t *inode, xlator_t *this) @@ -169,9 +212,13 @@ __afr_inode_read_subvol_reset_small (inode_t *inode, xlator_t *this) uint16_t metadatamap = 0; uint32_t event = 0; uint64_t val = 0; + afr_inode_ctx_t *ctx = NULL; + + ret = __afr_inode_ctx_get (this, inode, &ctx); + if (ret) + return ret; - ret = __inode_ctx_get (inode, this, &val); - (void) ret; + val = 
ctx->read_subvol; metadatamap = (val & 0x000000000000ffff) >> 0; datamap = (val & 0x00000000ffff0000) >> 16; @@ -181,7 +228,9 @@ __afr_inode_read_subvol_reset_small (inode_t *inode, xlator_t *this) (((uint64_t) datamap) << 16) | (((uint64_t) event) << 32); - return __inode_ctx_set (inode, this, &val); + ctx->read_subvol = val; + + return ret; } @@ -205,6 +254,20 @@ __afr_inode_read_subvol_get (inode_t *inode, xlator_t *this, return ret; } +int +__afr_inode_split_brain_choice_get (inode_t *inode, xlator_t *this, + int *spb_choice) +{ + afr_inode_ctx_t *ctx = NULL; + int ret = -1; + + ret = __afr_inode_ctx_get (this, inode, &ctx); + if (ret < 0) + return ret; + + *spb_choice = ctx->spb_choice; + return 0; +} int __afr_inode_read_subvol_set (inode_t *inode, xlator_t *this, unsigned char *data, @@ -224,6 +287,23 @@ __afr_inode_read_subvol_set (inode_t *inode, xlator_t *this, unsigned char *data return ret; } +int +__afr_inode_split_brain_choice_set (inode_t *inode, xlator_t *this, + int spb_choice) +{ + afr_inode_ctx_t *ctx = NULL; + int ret = -1; + + ret = __afr_inode_ctx_get (this, inode, &ctx); + if (ret) + goto out; + + ctx->spb_choice = spb_choice; + + ret = 0; +out: + return ret; +} int __afr_inode_read_subvol_reset (inode_t *inode, xlator_t *this) @@ -258,6 +338,22 @@ afr_inode_read_subvol_get (inode_t *inode, xlator_t *this, unsigned char *data, return ret; } +int +afr_inode_split_brain_choice_get (inode_t *inode, xlator_t *this, + int *spb_choice) +{ + int ret = -1; + + LOCK(&inode->lock); + { + ret = __afr_inode_split_brain_choice_get (inode, this, + spb_choice); + } + UNLOCK(&inode->lock); + + return ret; +} + int afr_inode_read_subvol_set (inode_t *inode, xlator_t *this, unsigned char *data, @@ -275,6 +371,22 @@ afr_inode_read_subvol_set (inode_t *inode, xlator_t *this, unsigned char *data, return ret; } +int +afr_inode_split_brain_choice_set (inode_t *inode, xlator_t *this, + int spb_choice) +{ + int ret = -1; + + LOCK(&inode->lock); + { + ret = 
__afr_inode_split_brain_choice_set (inode, this, + spb_choice); + } + UNLOCK(&inode->lock); + + return ret; +} + int afr_inode_read_subvol_reset (inode_t *inode, xlator_t *this) @@ -1220,6 +1332,7 @@ afr_lookup_done (call_frame_t *frame, xlator_t *this) gf_boolean_t locked_entry = _gf_false; gf_boolean_t can_interpret = _gf_true; inode_t *parent = NULL; + int spb_choice = -1; priv = this->private; local = frame->local; @@ -1232,6 +1345,8 @@ afr_lookup_done (call_frame_t *frame, xlator_t *this) afr_inode_read_subvol_get (parent, this, readable, NULL, &event); + afr_inode_split_brain_choice_get (local->inode, this, + &spb_choice); /* First, check if we have a gfid-change from somewhere, If so, propagate that so that a fresh lookup can be issued @@ -1321,18 +1436,24 @@ afr_lookup_done (call_frame_t *frame, xlator_t *this) } } else { cant_interpret: - if (read_subvol == -1) - dict_del (replies[0].xdata, GF_CONTENT_KEY); - else - dict_del (replies[read_subvol].xdata, GF_CONTENT_KEY); + if (read_subvol == -1) { + if (spb_choice >= 0) + read_subvol = spb_choice; + else + read_subvol = 0; + } + dict_del (replies[read_subvol].xdata, GF_CONTENT_KEY); } afr_handle_quota_size (frame, this); unwind: - if (read_subvol == -1) - read_subvol = 0; - + if (read_subvol == -1) { + if (spb_choice >= 0) + read_subvol = spb_choice; + else + read_subvol = 0; + } par_read_subvol = afr_get_parent_read_subvol (this, parent, replies, readable); @@ -1741,8 +1862,12 @@ afr_discover_done (call_frame_t *frame, xlator_t *this) } unwind: - if (read_subvol == -1) - read_subvol = 0; + if (read_subvol == -1) { + afr_inode_split_brain_choice_get (local->inode, this, + &read_subvol); + if (read_subvol == -1) + read_subvol = 0; + } AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, local->inode, &local->replies[read_subvol].poststat, @@ -3468,6 +3593,15 @@ out: int afr_forget (xlator_t *this, inode_t *inode) { + uint64_t ctx_int = 0; + afr_inode_ctx_t *ctx = NULL; + + inode_ctx_del (inode, 
this, &ctx_int); + if (!ctx_int) + return 0; + + ctx = (afr_inode_ctx_t *)ctx_int; + GF_FREE (ctx); return 0; } @@ -4594,8 +4728,26 @@ afr_heal_splitbrain_file(call_frame_t *frame, xlator_t *this, loc_t *loc) } out: - AFR_STACK_UNWIND (getxattr, frame, ret, op_errno, dict, NULL); + if (local->op == GF_FOP_GETXATTR) + AFR_STACK_UNWIND (getxattr, frame, ret, op_errno, dict, NULL); + else if (local->op == GF_FOP_SETXATTR) + AFR_STACK_UNWIND (setxattr, frame, ret, op_errno, NULL); if (dict) dict_unref(dict); return ret; } + +int +afr_get_child_index_from_name (xlator_t *this, char *name) +{ + afr_private_t *priv = this->private; + int index = -1; + + for (index = 0; index < priv->child_count; index++) { + if (!strcmp (priv->children[index]->name, name)) + goto out; + } + index = -1; +out: + return index; +} diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c index 0c96d069ae5..776933892ff 100644 --- a/xlators/cluster/afr/src/afr-inode-write.c +++ b/xlators/cluster/afr/src/afr-inode-write.c @@ -34,8 +34,8 @@ #include "common-utils.h" #include "compat-errno.h" #include "compat.h" +#include "protocol-common.h" -#include "afr.h" #include "afr-transaction.h" @@ -961,6 +961,145 @@ afr_setxattr_wind (call_frame_t *frame, xlator_t *this, int subvol) return 0; } +int +afr_split_brain_resolve_do (call_frame_t *frame, xlator_t *this, loc_t *loc, + char *data) +{ + afr_local_t *local = NULL; + int ret = -1; + int op_errno = EINVAL; + + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; + + local->op = GF_FOP_SETXATTR; + + local->xdata_req = dict_new (); + + if (!local->xdata_req) { + op_errno = ENOMEM; + goto out; + } + + ret = dict_set_int32 (local->xdata_req, "heal-op", + GF_SHD_OP_SBRAIN_HEAL_FROM_BRICK); + if (ret) { + op_errno = -ret; + ret = -1; + goto out; + } + ret = dict_set_str (local->xdata_req, "child-name", data); + if (ret) { + op_errno = -ret; + ret = -1; + goto out; + } + afr_heal_splitbrain_file 
(frame, this, loc); +out: + if (ret < 0) + AFR_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL); + return 0; +} + +int +afr_set_split_brain_choice (call_frame_t *frame, xlator_t *this, loc_t *loc, + int spb_choice) +{ + int ret = -1; + int op_errno = ENOMEM; + afr_private_t *priv = NULL; + + priv = this->private; + + ret = afr_inode_split_brain_choice_set (loc->inode, this, spb_choice); + if (ret) { /* spb_choice is -1 for "none": do not index children[] */ + gf_log (this->name, GF_LOG_ERROR, "Failed to set " + "split-brain choice as %s for %s", + (spb_choice < 0) ? "none" : + priv->children[spb_choice]->name, loc->name); + } + inode_invalidate (loc->inode); + AFR_STACK_UNWIND (setxattr, frame, ret, op_errno, NULL); + return ret; +} + +int +afr_get_split_brain_child_index (xlator_t *this, void *value, size_t len) +{ + int spb_child_index = -1; + char *spb_child_str = NULL; + + spb_child_str = alloca0 (len + 1); + memcpy (spb_child_str, value, len); + + if (!strcmp (spb_child_str, "none")) + return -2; + + spb_child_index = afr_get_child_index_from_name (this, + spb_child_str); + if (spb_child_index < 0) { + gf_log (this->name, GF_LOG_ERROR, "Invalid subvol: %s", + spb_child_str); + } + return spb_child_index; +} + +int +afr_handle_split_brain_commands (xlator_t *this, call_frame_t *frame, + loc_t *loc, dict_t *dict) +{ + int len = 0; + void *value = NULL; + int spb_child_index = -1; + int ret = -1; + int op_errno = EINVAL; + afr_private_t *priv = NULL; + + priv = this->private; + + ret = dict_get_ptr_and_len (dict, GF_AFR_SBRAIN_CHOICE, &value, + &len); + if (value) { + spb_child_index = afr_get_split_brain_child_index (this, value, + len); + if (spb_child_index < 0) { + /* Case where value was "none" */ + if (spb_child_index == -2) + spb_child_index = -1; + else { + ret = 1; + goto out; + } + } + + afr_set_split_brain_choice (frame, this, loc, + spb_child_index); + ret = 0; + goto out; + } + + ret = dict_get_ptr_and_len (dict, GF_AFR_SBRAIN_RESOLVE, &value, &len); + if 
(value) { + spb_child_index = afr_get_split_brain_child_index (this, value, + 
len); + if (spb_child_index < 0) { + ret = 1; + goto out; + } + + afr_split_brain_resolve_do (frame, this, loc, + priv->children[spb_child_index]->name); + ret = 0; + } +out: + /* key was correct but value was invalid when ret == 1 */ + if (ret == 1) { + AFR_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL); + ret = 0; + } + return ret; +} int afr_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, @@ -977,6 +1116,11 @@ afr_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.afr.*", dict, op_errno, out); + ret = afr_handle_split_brain_commands (this, frame, loc, dict); + + if (ret == 0) + return 0; + transaction_frame = copy_frame (frame); if (!transaction_frame) goto out; diff --git a/xlators/cluster/afr/src/afr-read-txn.c b/xlators/cluster/afr/src/afr-read-txn.c index ec67a20e624..eaa73d9be20 100644 --- a/xlators/cluster/afr/src/afr-read-txn.c +++ b/xlators/cluster/afr/src/afr-read-txn.c @@ -56,6 +56,7 @@ afr_read_txn_refresh_done (call_frame_t *frame, xlator_t *this, int err) int event_generation = 0; inode_t *inode = NULL; int ret = -1; + int spb_choice = -1; local = frame->local; inode = local->inode; @@ -96,6 +97,12 @@ afr_read_txn_refresh_done (call_frame_t *frame, xlator_t *this, int err) local->read_attempted[read_subvol] = 1; readfn: + if (read_subvol == -1) { + ret = afr_inode_split_brain_choice_get (inode, this, + &spb_choice); + if ((ret == 0) && spb_choice >= 0) + read_subvol = spb_choice; + } local->readfn (frame, this, read_subvol); return 0; diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c index 2441f413f3e..21b4c4414d9 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.c +++ b/xlators/cluster/afr/src/afr-self-heal-common.c @@ -389,9 +389,11 @@ afr_mark_split_brain_source_sinks (call_frame_t *frame, xlator_t *this, local = frame->local; priv = this->private; xdata_req = local->xdata_req; + ret = 
dict_get_int32 (xdata_req, "heal-op", &heal_op); if (ret) goto out; + for (i = 0; i < priv->child_count; i++) { if (locked_on[i]) if (sources[i] || !sinks[i] || !healed_sinks[i]) { @@ -468,22 +470,6 @@ out: } -int -afr_get_child_index_from_name (xlator_t *this, char *name) -{ - afr_private_t *priv = this->private; - int index = -1; - - for (index = 0; index < priv->child_count; index++) { - if (!strcmp (priv->children[index]->name, name)) - goto out; - } - index = -1; -out: - return index; -} - - gf_boolean_t afr_does_witness_exist (xlator_t *this, uint64_t *witness) { diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index d7d15c69845..0885b582d77 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -733,6 +733,11 @@ typedef struct _afr_local { } afr_local_t; +typedef struct _afr_inode_ctx { + uint64_t read_subvol; + int spb_choice; +} afr_inode_ctx_t; + /* did a call fail due to a child failing? */ #define child_went_down(op_ret, op_errno) (((op_ret) < 0) && \ ((op_errno == ENOTCONN) || \ @@ -1026,4 +1031,13 @@ afr_heal_splitbrain_file(call_frame_t *frame, xlator_t *this, loc_t *loc); int afr_get_split_brain_status (call_frame_t *frame, xlator_t *this, loc_t *loc); + +int +afr_inode_split_brain_choice_set (inode_t *inode, xlator_t *this, + int spb_choice); +int +afr_inode_split_brain_choice_get (inode_t *inode, xlator_t *this, + int *spb_choice); +int +afr_get_child_index_from_name (xlator_t *this, char *name); #endif /* __AFR_H__ */ -- cgit