From 7d62749f14663ea6c0000a4aab2c32041cbb4e75 Mon Sep 17 00:00:00 2001
From: Pranith Kumar K
Date: Tue, 3 Apr 2012 08:12:34 +0530
Subject: self-heald: Find self-heal failures, split-brain

Change-Id: Ib967f0fe0b537fe60e51d7d05462b58a7f16596e
BUG: 806745
Signed-off-by: Pranith Kumar K
Reviewed-on: http://review.gluster.com/3077
Tested-by: Gluster Build System
Reviewed-by: Jeff Darcy
Reviewed-by: Vijay Bellur
---
 xlators/cluster/afr/src/afr-common.c           | 15 +++++++++++++--
 xlators/cluster/afr/src/afr-dir-read.c         |  2 +-
 xlators/cluster/afr/src/afr-self-heal-common.c |  5 +++--
 xlators/cluster/afr/src/afr-self-heal-data.c   |  3 ++-
 xlators/cluster/afr/src/afr-self-heald.c       | 19 ++++++++++++++-----
 xlators/cluster/afr/src/afr.h                  |  5 +++--
 6 files changed, 36 insertions(+), 13 deletions(-)

(limited to 'xlators')

diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index 2cfe92acf..9874b2619 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -1301,7 +1301,8 @@ afr_launch_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode,
                       void (*gfid_sh_success_cbk) (call_frame_t *sh_frame,
                                                    xlator_t *this),
                       int (*unwind) (call_frame_t *frame, xlator_t *this,
-                                     int32_t op_ret, int32_t op_errno))
+                                     int32_t op_ret, int32_t op_errno,
+                                     int32_t sh_failed))
 {
         afr_local_t *local = NULL;
         char sh_type_str[256] = {0,};
@@ -1527,9 +1528,12 @@ afr_lookup_set_self_heal_params (afr_local_t *local, xlator_t *this)
 
 int
 afr_self_heal_lookup_unwind (call_frame_t *frame, xlator_t *this,
-                             int32_t op_ret, int32_t op_errno)
+                             int32_t op_ret, int32_t op_errno,
+                             int32_t sh_failed)
 {
         afr_local_t *local = NULL;
+        int ret = -1;
+        dict_t *xattr = NULL;
 
         local = frame->local;
 
@@ -1544,6 +1548,13 @@ afr_self_heal_lookup_unwind (call_frame_t *frame, xlator_t *this,
         }
 
         afr_lookup_done_success_action (frame, this, _gf_true);
+        xattr = local->cont.lookup.xattr;
+        if (xattr) {
+                ret = dict_set_int32 (xattr, "sh-failed", sh_failed);
+                if (ret)
+                        gf_log (this->name, GF_LOG_ERROR, "%s: Failed to set "
+                                "sh-failed to %d", local->loc.path, sh_failed);
+        }
 out:
         AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno,
                           local->cont.lookup.inode, &local->cont.lookup.buf,
diff --git a/xlators/cluster/afr/src/afr-dir-read.c b/xlators/cluster/afr/src/afr-dir-read.c
index 5abbd9c13..872822565 100644
--- a/xlators/cluster/afr/src/afr-dir-read.c
+++ b/xlators/cluster/afr/src/afr-dir-read.c
@@ -51,7 +51,7 @@
 
 int
 afr_examine_dir_sh_unwind (call_frame_t *frame, xlator_t *this, int32_t op_ret,
-                           int32_t op_errno)
+                           int32_t op_errno, int32_t sh_failed)
 {
         afr_local_t *local = NULL;
 
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c
index bd0e04626..af5aadc3c 100644
--- a/xlators/cluster/afr/src/afr-self-heal-common.c
+++ b/xlators/cluster/afr/src/afr-self-heal-common.c
@@ -2048,7 +2048,8 @@ afr_self_heal_completion_cbk (call_frame_t *bgsh_frame, xlator_t *this)
         FRAME_SU_UNDO (bgsh_frame, afr_local_t);
 
         if (!sh->unwound && sh->unwind) {
-                sh->unwind (sh->orig_frame, this, sh->op_ret, sh->op_errno);
+                sh->unwind (sh->orig_frame, this, sh->op_ret, sh->op_errno,
+                            sh->op_failed);
         }
 
         if (sh->background) {
@@ -2187,7 +2188,7 @@ afr_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode)
 
 out:
         if (op_errno) {
-                orig_sh->unwind (frame, this, -1, op_errno);
+                orig_sh->unwind (frame, this, -1, op_errno, 1);
                 if (sh_frame)
                         AFR_STACK_DESTROY (sh_frame);
         }
diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c
index d362d5add..69494157c 100644
--- a/xlators/cluster/afr/src/afr-self-heal-data.c
+++ b/xlators/cluster/afr/src/afr-self-heal-data.c
@@ -769,7 +769,8 @@ afr_sh_data_fix (call_frame_t *frame, xlator_t *this)
         }
 
         if (sh->background && sh->unwind) {
-                sh->unwind (sh->orig_frame, this, sh->op_ret, sh->op_errno);
+                sh->unwind (sh->orig_frame, this, sh->op_ret, sh->op_errno,
+                            sh->op_failed);
                 sh->unwound = _gf_true;
         }
 
diff --git a/xlators/cluster/afr/src/afr-self-heald.c b/xlators/cluster/afr/src/afr-self-heald.c
index 3068d5c46..55ede78ad 100644
--- a/xlators/cluster/afr/src/afr-self-heald.c
+++ b/xlators/cluster/afr/src/afr-self-heald.c
@@ -277,7 +277,7 @@ out:
 
 void
 _crawl_post_sh_action (xlator_t *this, loc_t *parent, loc_t *child,
-                       int32_t op_ret, int32_t op_errno,
+                       int32_t op_ret, int32_t op_errno, dict_t *xattr_rsp,
                        afr_crawl_data_t *crawl_data)
 {
         int ret = 0;
@@ -286,6 +286,8 @@ _crawl_post_sh_action (xlator_t *this, loc_t *parent, loc_t *child,
         eh_t *eh = NULL;
         char *path = NULL;
         shd_event_t *event = NULL;
+        int32_t sh_failed = 0;
+        gf_boolean_t split_brain = 0;
 
         priv = this->private;
         shd = &priv->shd;
@@ -307,9 +309,12 @@ _crawl_post_sh_action (xlator_t *this, loc_t *parent, loc_t *child,
                 }
         }
 
-        if (op_ret < 0 && op_errno == EIO)
+        if (xattr_rsp)
+                ret = dict_get_int32 (xattr_rsp, "sh-failed", &sh_failed);
+        split_brain = afr_is_split_brain (this, child->inode);
+        if ((op_ret < 0 && op_errno == EIO) || split_brain)
                 eh = shd->split_brain;
-        else if (op_ret < 0)
+        else if ((op_ret < 0) || sh_failed)
                 eh = shd->heal_failed;
         else
                 eh = shd->healed;
@@ -338,6 +343,7 @@ _self_heal_entry (xlator_t *this, afr_crawl_data_t *crawl_data, gf_dirent_t *ent
 {
         struct iatt parentbuf = {0};
         int ret = 0;
+        dict_t *xattr_rsp = NULL;
 
         if (uuid_is_null (child->gfid))
                 gf_log (this->name, GF_LOG_DEBUG, "lookup %s", child->path);
@@ -346,8 +352,11 @@ _self_heal_entry (xlator_t *this, afr_crawl_data_t *crawl_data, gf_dirent_t *ent
                         uuid_utoa (child->gfid));
 
         ret = syncop_lookup (this, child, NULL,
-                             iattr, NULL, &parentbuf);
-        _crawl_post_sh_action (this, parent, child, ret, errno, crawl_data);
+                             iattr, &xattr_rsp, &parentbuf);
+        _crawl_post_sh_action (this, parent, child, ret, errno, xattr_rsp,
+                               crawl_data);
+        if (xattr_rsp)
+                dict_unref (xattr_rsp);
         return ret;
 }
 
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index 815f4667c..fccb39b1a 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -191,7 +191,7 @@ typedef struct {
            background, this function will be called as soon as possible. */
 
         int (*unwind) (call_frame_t *frame, xlator_t *this, int32_t op_ret,
-                       int32_t op_errno);
+                       int32_t op_errno, int32_t sh_failed);
 
         /* End of external interface members */
 
@@ -1016,7 +1016,8 @@ afr_launch_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode,
                       void (*gfid_sh_success_cbk) (call_frame_t *sh_frame,
                                                    xlator_t *this),
                       int (*unwind) (call_frame_t *frame, xlator_t *this,
-                                     int32_t op_ret, int32_t op_errno));
+                                     int32_t op_ret, int32_t op_errno,
+                                     int32_t sh_failed));
 int
 afr_fix_open (call_frame_t *frame, xlator_t *this, afr_fd_ctx_t *fd_ctx,
               int need_open_count, int *need_open);
-- 
cgit