From 719c927592cfdb0de88243769d477ca211a2b494 Mon Sep 17 00:00:00 2001 From: Anuradha Date: Sat, 9 May 2015 10:25:08 +0530 Subject: cluster/afr : Prevent inode-evict during split-brain resolution Backport of: http://review.gluster.org/#/c/10134/ 1) Provided setfattr command to set timeout for split-brain choice. 2) If split-brain inspection/resolution is being done from the mount for a file, ref the inode when split-brain-choice is set. This inode will be unconditionally unref-ed after timeout seconds set by the user/default otherwise. 3) Updated the doc and testcase to reflect the changes. Change-Id: I15c9037dee28855f21e680e7e3632e1f48dba4e1 BUG: 1219388 Reviewed-on: http://review.gluster.org/10134 Reviewed-by: Krutika Dhananjay Reviewed-by: Ravishankar N Tested-by: Gluster Build System Reviewed-by: Pranith Kumar Karampuri Signed-off-by: Anuradha Reviewed-on: http://review.gluster.org/10679 --- xlators/cluster/afr/src/afr-common.c | 198 ++++++++++++++++++++++++++---- xlators/cluster/afr/src/afr-inode-write.c | 140 ++++++++++++++++----- xlators/cluster/afr/src/afr-mem-types.h | 1 + xlators/cluster/afr/src/afr.c | 2 + xlators/cluster/afr/src/afr.h | 20 +++ 5 files changed, 303 insertions(+), 58 deletions(-) (limited to 'xlators/cluster/afr/src') diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index 8fbca0b6f42..46f726da734 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -413,6 +413,142 @@ out: return ret; } +int +afr_spb_choice_timeout_cancel (xlator_t *this, inode_t *inode) +{ + afr_inode_ctx_t *ctx = NULL; + int ret = -1; + + if (!inode) + return ret; + + LOCK(&inode->lock); + { + __afr_inode_ctx_get (this, inode, &ctx); + if (!ctx) { + gf_log (this->name, GF_LOG_WARNING, "Failed to cancel" + " split-brain choice timer."); + goto out; + } + ctx->spb_choice = -1; + if (ctx->timer) { + gf_timer_call_cancel (this->ctx, ctx->timer); + ctx->timer = NULL; + } + ret = 0; + } +out: + UNLOCK(&inode->lock); + return ret; +} + +void +afr_set_split_brain_choice_cbk (void *data) +{ + inode_t *inode = data; + xlator_t *this = THIS; + + afr_spb_choice_timeout_cancel (this, inode); + inode_unref (inode); + return; +} + + +int +afr_set_split_brain_choice (int ret, call_frame_t *frame, void *opaque) +{ + int op_errno = ENOMEM; + afr_private_t *priv = NULL; + afr_inode_ctx_t *ctx = NULL; + inode_t *inode = NULL; + loc_t *loc = NULL; + xlator_t *this = NULL; + afr_spbc_timeout_t *data = opaque; + struct timespec delta = {0, }; + + if (ret) + goto out; + + frame = data->frame; + loc = data->loc; + this = frame->this; + priv = this->private; + + delta.tv_sec = priv->spb_choice_timeout; + delta.tv_nsec = 0; + + inode = loc->inode; + if (!inode) + goto out; + + if (!(data->d_spb || data->m_spb)) { + gf_log (this->name, GF_LOG_WARNING, "Cannot set " + "replica.split-brain-choice on %s. File is" + " not in data/metadata split-brain.", + uuid_utoa (loc->gfid)); + ret = -1; + op_errno = EINVAL; + goto out; + } + + LOCK(&inode->lock); + { + ret = __afr_inode_ctx_get (this, inode, &ctx); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to get" + "inode_ctx for %s", loc->name); + goto unlock; + } + + ctx->spb_choice = data->spb_child_index; + + /* Possible changes in spb-choice : + * -1 to valid : ref and inject timer + * + * valid to valid : cancel timer and inject new one + * + * valid to -1 : cancel timer and unref + * + * -1 to -1 : do not do anything + */ + + /* ctx->timer is NULL iff previous value of + * ctx->spb_choice is -1 + */ + if (ctx->timer) { + if (ctx->spb_choice == -1) { + gf_timer_call_cancel (this->ctx, ctx->timer); + ctx->timer = NULL; + inode_unref (inode); + goto unlock; + } + goto reset_timer; + } else { + if (ctx->spb_choice == -1) + goto unlock; + } + + inode = inode_ref (loc->inode); + goto set_timer; + +reset_timer: + gf_timer_call_cancel (this->ctx, ctx->timer); + ctx->timer = NULL; + +set_timer: + ctx->timer = gf_timer_call_after (this->ctx, delta, + afr_set_split_brain_choice_cbk, + inode); + } +unlock: + UNLOCK(&inode->lock); + inode_invalidate (inode); +out: + if (data) + GF_FREE (data); + AFR_STACK_UNWIND (setxattr, frame, ret, op_errno, NULL); + return 0; +} int afr_accused_fill (xlator_t *this, dict_t *xdata, unsigned char *accused, @@ -3589,6 +3725,7 @@ afr_forget (xlator_t *this, inode_t *inode) uint64_t ctx_int = 0; afr_inode_ctx_t *ctx = NULL; + afr_spb_choice_timeout_cancel (this, inode); inode_ctx_del (inode, this, &ctx_int); if (!ctx_int) return 0; @@ -4552,10 +4689,10 @@ out: } int -afr_set_split_brain_status (call_frame_t *frame, xlator_t *this, - struct afr_reply *replies, - afr_transaction_type type, - gf_boolean_t *spb) +_afr_is_split_brain (call_frame_t *frame, xlator_t *this, + struct afr_reply *replies, + afr_transaction_type type, + gf_boolean_t *spb) { afr_private_t *priv = NULL; uint64_t *witness = NULL; @@ -4583,6 +4720,37 @@ afr_set_split_brain_status (call_frame_t *frame, xlator_t *this, return ret; } +int +afr_is_split_brain (call_frame_t *frame, xlator_t *this, inode_t *inode, + uuid_t gfid, gf_boolean_t *d_spb, gf_boolean_t *m_spb) +{ + int ret = -1; + afr_private_t *priv = NULL; + struct afr_reply *replies = NULL; + + priv = this->private; + + replies = alloca0 (sizeof (*replies) * priv->child_count); + + ret = afr_selfheal_unlocked_discover (frame, inode, gfid, replies); + if (ret) + goto out; + + ret = _afr_is_split_brain (frame, this, replies, + AFR_DATA_TRANSACTION, d_spb); + if (ret) + goto out; + + ret = _afr_is_split_brain (frame, this, replies, + AFR_METADATA_TRANSACTION, m_spb); +out: + if (replies) { + afr_replies_wipe (replies, priv->child_count); + replies = NULL; + } + return ret; +} + int afr_get_split_brain_status (call_frame_t *frame, xlator_t *this, loc_t *loc) { @@ -4594,7 +4762,6 @@ afr_get_split_brain_status (call_frame_t *frame, xlator_t *this, loc_t *loc) char *choices = NULL; char *status = NULL; dict_t *dict = NULL; - struct afr_reply *replies = NULL; inode_t *inode = NULL; afr_private_t *priv = NULL; xlator_t **children = NULL; @@ -4605,7 +4772,6 @@ afr_get_split_brain_status (call_frame_t *frame, xlator_t *this, loc_t *loc) inode = afr_inode_find (this, loc->gfid); if (!inode) goto out; - replies = alloca0 (sizeof (*replies) * priv->child_count); /* Calculation for string length : * (child_count X length of child-name) + strlen (" Choices :") @@ -4615,23 +4781,9 @@ afr_get_split_brain_status (call_frame_t *frame, xlator_t *this, loc_t *loc) */ choices = alloca0 (priv->child_count * (256 + strlen ("-client-00,")) + strlen (" Choices:")); - ret = afr_selfheal_unlocked_discover (frame, inode, loc->gfid, replies); - if (ret) { - op_errno = -ret; - ret = -1; - goto out; - } - - ret = afr_set_split_brain_status (frame, this, replies, - AFR_DATA_TRANSACTION, &d_spb); - if (ret) { - op_errno = -ret; - ret = -1; - goto out; - } - ret = afr_set_split_brain_status (frame, this, replies, - AFR_METADATA_TRANSACTION, &m_spb); + ret = afr_is_split_brain (frame, this, inode, loc->gfid, &d_spb, + &m_spb); if (ret) { op_errno = -ret; ret = -1; @@ -4678,8 +4830,6 @@ out: AFR_STACK_UNWIND (getxattr, frame, ret, op_errno, dict, NULL); if (dict) dict_unref (dict); - if (replies) - afr_replies_wipe (replies, priv->child_count); if (inode) inode_unref (inode); return ret; diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c index f9fde44e9e4..3db4010e997 100644 --- a/xlators/cluster/afr/src/afr-inode-write.c +++ b/xlators/cluster/afr/src/afr-inode-write.c @@ -979,12 +979,7 @@ afr_split_brain_resolve_do (call_frame_t *frame, xlator_t *this, loc_t *loc, int ret = -1; int op_errno = EINVAL; - local = AFR_FRAME_INIT (frame, op_errno); - if (!local) - goto out; - - local->op = GF_FOP_SETXATTR; - + local = frame->local; local->xdata_req = dict_new (); if (!local->xdata_req) { @@ -1005,35 +1000,27 @@ afr_split_brain_resolve_do (call_frame_t *frame, xlator_t *this, loc_t *loc, ret = -1; goto out; } + /* set spb choice to -1 whether heal succeeds or not: + * If heal succeeds : spb-choice should be set to -1 as + * it is no longer valid; file is not + * in split-brain anymore. + * If heal doesn't succeed: + * spb-choice should be set to -1 + * otherwise reads will be served + * from spb-choice which is misleading. + */ + ret = afr_inode_split_brain_choice_set (loc->inode, this, -1); + if (ret) + gf_log (this->name, GF_LOG_WARNING, "Failed to set" + "split-brain choice to -1"); afr_heal_splitbrain_file (frame, this, loc); + ret = 0; out: if (ret < 0) AFR_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL); return 0; } -int -afr_set_split_brain_choice (call_frame_t *frame, xlator_t *this, loc_t *loc, - int spb_choice) -{ - int ret = -1; - int op_errno = ENOMEM; - afr_private_t *priv = NULL; - - priv = this->private; - - ret = afr_inode_split_brain_choice_set (loc->inode, this, spb_choice); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "Failed to set" - "split-brain choice as %s for %s", - priv->children[spb_choice]->name, - loc->name); - } - inode_invalidate (loc->inode); - AFR_STACK_UNWIND (setxattr, frame, ret, op_errno, NULL); - return ret; -} - int afr_get_split_brain_child_index (xlator_t *this, void *value, size_t len) { @@ -1055,19 +1042,53 @@ afr_get_split_brain_child_index (xlator_t *this, void *value, size_t len) return spb_child_index; } +int +afr_can_set_split_brain_choice (void *opaque) +{ + afr_spbc_timeout_t *data = opaque; + call_frame_t *frame = NULL; + xlator_t *this = NULL; + loc_t *loc = NULL; + int ret = -1; + + frame = data->frame; + loc = data->loc; + this = frame->this; + + ret = afr_is_split_brain (frame, this, loc->inode, loc->gfid, + &data->d_spb, &data->m_spb); + + if (ret) + gf_log (this->name, GF_LOG_ERROR, "Failed to determine if %s" + " is in split-brain. " + "Aborting split-brain-choice set.", + uuid_utoa (loc->gfid)); + return ret; +} + int afr_handle_split_brain_commands (xlator_t *this, call_frame_t *frame, loc_t *loc, dict_t *dict) { - int len = 0; void *value = NULL; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_spbc_timeout_t *data = NULL; + int len = 0; int spb_child_index = -1; int ret = -1; int op_errno = EINVAL; - afr_private_t *priv = NULL; priv = this->private; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) { + ret = 1; + goto out; + } + + local->op = GF_FOP_SETXATTR; + ret = dict_get_ptr_and_len (dict, GF_AFR_SBRAIN_CHOICE, &value, &len); if (value) { @@ -1079,12 +1100,29 @@ afr_handle_split_brain_commands (xlator_t *this, call_frame_t *frame, spb_child_index = -1; else { ret = 1; + op_errno = EINVAL; goto out; } } - afr_set_split_brain_choice (frame, this, loc, - spb_child_index); + data = GF_CALLOC (1, sizeof (*data), gf_afr_mt_spbc_timeout_t); + if (!data) { + ret = 1; + goto out; + } + data->spb_child_index = spb_child_index; + data->frame = frame; + data->loc = loc; + ret = synctask_new (this->ctx->env, + afr_can_set_split_brain_choice, + afr_set_split_brain_choice, NULL, data); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to create" + " synctask. Aborting split-brain choice set" + " for %s", loc->name); + ret = 1; + goto out; + } ret = 0; goto out; } @@ -1111,6 +1149,41 @@ out: return ret; } +int +afr_handle_spb_choice_timeout (xlator_t *this, call_frame_t *frame, + dict_t *dict) +{ + int ret = -1; + int op_errno = 0; + uint64_t timeout = 0; + afr_private_t *priv = NULL; + + priv = this->private; + + ret = dict_get_uint64 (dict, GF_AFR_SPB_CHOICE_TIMEOUT, &timeout); + if (!ret) { + priv->spb_choice_timeout = timeout * 60; + AFR_STACK_UNWIND (setxattr, frame, ret, op_errno, NULL); + } + + return ret; +} + +static int +afr_handle_special_xattr (xlator_t *this, call_frame_t *frame, loc_t *loc, + dict_t *dict) +{ + int ret = -1; + + ret = afr_handle_split_brain_commands (this, frame, loc, dict); + if (ret == 0) + goto out; + + ret = afr_handle_spb_choice_timeout (this, frame, dict); +out: + return ret; +} + int afr_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, int32_t flags, dict_t *xdata) @@ -1126,8 +1199,7 @@ afr_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.afr.*", dict, op_errno, out); - ret = afr_handle_split_brain_commands (this, frame, loc, dict); - + ret = afr_handle_special_xattr (this, frame, loc, dict); if (ret == 0) return 0; diff --git a/xlators/cluster/afr/src/afr-mem-types.h b/xlators/cluster/afr/src/afr-mem-types.h index 05df90cc0ee..a11063c1f25 100644 --- a/xlators/cluster/afr/src/afr-mem-types.h +++ b/xlators/cluster/afr/src/afr-mem-types.h @@ -43,6 +43,7 @@ enum gf_afr_mem_types_ { gf_afr_mt_pos_data_t, gf_afr_mt_reply_t, gf_afr_mt_subvol_healer_t, + gf_afr_mt_spbc_timeout_t, gf_afr_mt_end }; #endif diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c index 21575fed2de..26efe93de99 100644 --- a/xlators/cluster/afr/src/afr.c +++ b/xlators/cluster/afr/src/afr.c @@ -276,6 +276,8 @@ init (xlator_t *this) GF_OPTION_INIT ("arbiter-count", priv->arbiter_count, uint32, out); + priv->spb_choice_timeout = AFR_DEFAULT_SPB_CHOICE_TIMEOUT; + GF_OPTION_INIT ("afr-dirty-xattr", priv->afr_dirty, str, out); GF_OPTION_INIT ("metadata-splitbrain-forced-heal", diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index 6cb708ffbd7..855d3a3680e 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -38,6 +38,7 @@ #define AFR_LOCKEE_COUNT_MAX 3 #define AFR_DOM_COUNT_MAX 3 #define AFR_NUM_CHANGE_LOGS 3 /*data + metadata + entry*/ +#define AFR_DEFAULT_SPB_CHOICE_TIMEOUT 300 /*in seconds*/ #define ARBITER_BRICK_INDEX 2 @@ -130,6 +131,7 @@ typedef struct _afr_private { void *pump_private; gf_boolean_t use_afr_in_pump; gf_boolean_t consistent_metadata; + uint64_t spb_choice_timeout; } afr_private_t; @@ -742,8 +744,17 @@ typedef struct _afr_local { typedef struct _afr_inode_ctx { uint64_t read_subvol; int spb_choice; + gf_timer_t *timer; } afr_inode_ctx_t; +typedef struct afr_spbc_timeout { + call_frame_t *frame; + gf_boolean_t d_spb; + gf_boolean_t m_spb; + loc_t *loc; + int spb_child_index; +} afr_spbc_timeout_t; + /* did a call fail due to a child failing? */ #define child_went_down(op_ret, op_errno) (((op_ret) < 0) && \ ((op_errno == ENOTCONN) || \ @@ -1046,4 +1057,13 @@ afr_inode_split_brain_choice_get (inode_t *inode, xlator_t *this, int *spb_choice); int afr_get_child_index_from_name (xlator_t *this, char *name); + +int +afr_is_split_brain (call_frame_t *frame, xlator_t *this, inode_t *inode, + uuid_t gfid, gf_boolean_t *d_spb, gf_boolean_t *m_spb); +int +afr_spb_choice_timeout_cancel (xlator_t *this, inode_t *inode); + +int +afr_set_split_brain_choice (int ret, call_frame_t *frame, void *opaque); #endif /* __AFR_H__ */ -- cgit