summary | refs | log | tree | commit | diff | stats
path: root/xlators/cluster/afr
diff options
context:
space:
mode:
author: Pranith Kumar K <pkarampu@redhat.com> 2014-09-17 11:48:24 +0530
committer: Niels de Vos <ndevos@redhat.com> 2014-10-01 00:14:21 -0700
commit: bee0c740b54669a8be11acea405d021bb50d3c54 (patch)
tree: ec1c33dbce0411688cea6432c0ad97ad5aaef90d /xlators/cluster/afr
parent: 91a8e6940a0a32528ec9a55ee19e82021d08fd86 (diff)
cluster/afr: Launch self-heal only when all the brick status is known
Problem:
A file goes into split-brain because its xattrs are erased incorrectly.

RCA:
The issue happens because index self-heal is triggered before all the bricks are up. As a result, when the xattrs are erased on the sink brick, only the xattr corresponding to the brick that self-heal believes is up gets erased, which leads to split-brain.

Example: let's say the xattrs before the heal started are:
brick 2:
trusted.afr.vol1-client-2=0x000000020000000000000000
trusted.afr.vol1-client-3=0x000000020000000000000000
brick 3:
trusted.afr.vol1-client-2=0x000010040000000000000000
trusted.afr.vol1-client-3=0x000000000000000000000000

If only brick-2 had come up at the time the self-heal was triggered, only 'trusted.afr.vol1-client-2' is erased, leading to the following xattrs:
brick 2:
trusted.afr.vol1-client-2=0x000000000000000000000000
trusted.afr.vol1-client-3=0x000000020000000000000000
brick 3:
trusted.afr.vol1-client-2=0x000010040000000000000000
trusted.afr.vol1-client-3=0x000000000000000000000000

So the file goes into split-brain.

Change-Id: I79f9a289d2118a715d262398221037b684a53d2a
BUG: 1142614
Signed-off-by: Pranith Kumar K <pkarampu@redhat.com>
Reviewed-on: http://review.gluster.org/8757
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Krutika Dhananjay <kdhananj@redhat.com>
Reviewed-by: Niels de Vos <ndevos@redhat.com>
Diffstat (limited to 'xlators/cluster/afr')
-rw-r--r-- xlators/cluster/afr/src/afr-common.c 22
1 files changed, 20 insertions, 2 deletions
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index 58444ddb896..6fd3e321930 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -4385,8 +4385,26 @@ afr_notify (xlator_t *this, int32_t event,
ret = 0;
if (propagate)
ret = default_notify (this, event, data);
- if (call_psh && priv->shd.iamshd)
- afr_proactive_self_heal ((void*) (long) up_child);
+ if (priv->shd.iamshd && have_heard_from_all) {
+ if (!had_heard_from_all) {
+ /*
+ * Since self-heal is supposed to be launched only after
+ * the responses from all the bricks are collected,
+ * launch self-heals now on all up subvols.
+ */
+ for (i = 0; i < priv->child_count; i++) {
+ if (!priv->child_up[i])
+ continue;
+ afr_proactive_self_heal ((void*) (long) i);
+ }
+ } else if (call_psh) {
+ /*
+ * Already heard from everyone. Just launch heal on now
+ * up subvolume.
+ */
+ afr_proactive_self_heal ((void*) (long) up_child);
+ }
+ }
out:
return ret;