summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorPranith Kumar K <pkarampu@redhat.com>2012-11-16 07:08:30 +0530
committerAnand Avati <avati@redhat.com>2012-12-11 16:23:53 -0800
commitcc9701d4f533bf7337d2925424e2498ab64fdf13 (patch)
tree565527862f7f235c7b747a325155a126ad006d89
parent179e7333740fe990cac6fc2e63fbace844b17b8d (diff)
cluster/afr: Remember type of split-brain in inode-ctx
Along with this change, fixed the race of setting the split-brain status in inode-ctx after unwinding the fop from self-heal in case of back-ground self-heal. Change-Id: Ifc829300df485f50f139443802e8b6dc7038b4ad BUG: 873962 Signed-off-by: Pranith Kumar K <pkarampu@redhat.com> Reviewed-on: http://review.gluster.org/4198 Tested-by: Gluster Build System <jenkins@build.gluster.com> Reviewed-by: Anand Avati <avati@redhat.com>
-rwxr-xr-xtests/bugs/bug-873962.t85
-rw-r--r--xlators/cluster/afr/src/afr-common.c169
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-common.c7
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-data.c12
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-metadata.c28
-rw-r--r--xlators/cluster/afr/src/afr.h15
6 files changed, 199 insertions, 117 deletions
diff --git a/tests/bugs/bug-873962.t b/tests/bugs/bug-873962.t
new file mode 100755
index 00000000000..c18c71f6012
--- /dev/null
+++ b/tests/bugs/bug-873962.t
@@ -0,0 +1,85 @@
+#!/bin/bash
+
+#AFR TEST-IDENTIFIER SPLIT-BRAIN
+. $(dirname $0)/../include.rc
+
+cleanup;
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume info;
+
+B0_hiphenated=`echo $B0 | tr '/' '-'`
+TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{1,2}
+
+#Make sure self-heal is not triggered when the bricks are re-started
+TEST $CLI volume set $V0 cluster.self-heal-daemon off
+TEST $CLI volume set $V0 performance.stat-prefetch off
+TEST $CLI volume start $V0
+TEST glusterfs -s $H0 --volfile-id=$V0 $M0
+TEST glusterfs -s $H0 --volfile-id=$V0 $M1
+TEST cd $M0
+TEST touch a
+TEST touch b
+TEST touch c
+TEST touch d
+echo "1" > b
+echo "1" > d
+kill -9 `cat /var/lib/glusterd/vols/$V0/run/$H0$B0_hiphenated-${V0}2.pid`
+echo "1" > a
+echo "1" > c
+TEST setfattr -n trusted.mdata -v abc b
+TEST setfattr -n trusted.mdata -v abc d
+TEST $CLI volume start $V0 force
+sleep 5
+kill -9 `cat /var/lib/glusterd/vols/$V0/run/$H0$B0_hiphenated-${V0}1.pid`
+echo "2" > a
+echo "2" > c
+TEST setfattr -n trusted.mdata -v def b
+TEST setfattr -n trusted.mdata -v def d
+TEST $CLI volume start $V0 force
+sleep 10
+
+#Files are in split-brain, so open should fail
+TEST ! cat a;
+TEST ! cat $M1/a;
+TEST ! cat b;
+TEST ! cat $M1/b;
+
+#Reset split-brain status
+TEST setfattr -n trusted.afr.$V0-client-1 -v 0x000000000000000000000000 $B0/${V0}1/a;
+TEST setfattr -n trusted.afr.$V0-client-1 -v 0x000000000000000000000000 $B0/${V0}1/b;
+
+sleep 1
+#The operations should do self-heal and give correct output
+EXPECT "2" cat a;
+EXPECT "2" cat $M1/a;
+EXPECT "def" getfattr -n trusted.mdata --only-values b 2>/dev/null
+EXPECT "def" getfattr -n trusted.mdata --only-values $M1/b 2>/dev/null
+
+TEST $CLI volume set $V0 cluster.data-self-heal off
+TEST $CLI volume set $V0 cluster.metadata-self-heal off
+sleep 5 #Wait for vol file to be updated
+
+sleep 1
+#Files are in split-brain, so open should fail
+TEST ! cat c
+TEST ! cat $M1/c
+TEST ! cat d
+TEST ! cat $M1/d
+
+TEST setfattr -n trusted.afr.$V0-client-1 -v 0x000000000000000000000000 $B0/${V0}1/c
+TEST setfattr -n trusted.afr.$V0-client-1 -v 0x000000000000000000000000 $B0/${V0}1/d
+
+sleep 1
+#The operations should NOT do self-heal but give correct output
+EXPECT "2" cat c
+EXPECT "2" cat $M1/c
+EXPECT "1" cat d
+EXPECT "1" cat $M1/d
+
+#Check that the self-heal is not triggered.
+EXPECT "1" cat $B0/${V0}1/c
+EXPECT "abc" getfattr -n trusted.mdata --only-values $B0/${V0}1/d 2>/dev/null
+cd
+cleanup;
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index c38e527259b..ee96944f827 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -49,8 +49,7 @@
#include "afr-self-heald.h"
#include "pump.h"
-#define AFR_ICTX_OPENDIR_DONE_MASK 0x0000000200000000ULL
-#define AFR_ICTX_SPLIT_BRAIN_MASK 0x0000000100000000ULL
+#define AFR_ICTX_OPENDIR_DONE_MASK 0x0000000100000000ULL
#define AFR_ICTX_READ_CHILD_MASK 0x00000000FFFFFFFFULL
int
@@ -203,59 +202,86 @@ out:
return ret;
}
-afr_inode_ctx_t*
-afr_inode_ctx_get_from_addr (uint64_t addr, int32_t child_count)
+void
+afr_inode_ctx_destroy (afr_inode_ctx_t *ctx)
{
- int ret = -1;
- afr_inode_ctx_t *ctx = NULL;
- size_t size = 0;
+ if (!ctx)
+ return;
+ GF_FREE (ctx->fresh_children);
+ GF_FREE (ctx);
+}
- GF_ASSERT (child_count > 0);
+afr_inode_ctx_t*
+__afr_inode_ctx_get (inode_t *inode, xlator_t *this)
+{
+ int ret = 0;
+ uint64_t ctx_addr = 0;
+ afr_inode_ctx_t *ctx = NULL;
+ afr_private_t *priv = NULL;
- if (!addr) {
- ctx = GF_CALLOC (1, sizeof (*ctx),
- gf_afr_mt_inode_ctx_t);
- if (!ctx)
- goto out;
- size = sizeof (*ctx->fresh_children);
- ctx->fresh_children = GF_CALLOC (child_count, size,
- gf_afr_mt_int32_t);
- if (!ctx->fresh_children)
- goto out;
- } else {
- ctx = (afr_inode_ctx_t*) (long) addr;
+ priv = this->private;
+ ret = __inode_ctx_get (inode, this, &ctx_addr);
+ if (ret < 0)
+ ctx_addr = 0;
+ if (ctx_addr != 0) {
+ ctx = (afr_inode_ctx_t*) (long) ctx_addr;
+ goto out;
}
- ret = 0;
+ ctx = GF_CALLOC (1, sizeof (*ctx),
+ gf_afr_mt_inode_ctx_t);
+ if (!ctx)
+ goto fail;
+ ctx->fresh_children = GF_CALLOC (priv->child_count,
+ sizeof (*ctx->fresh_children),
+ gf_afr_mt_int32_t);
+ if (!ctx->fresh_children)
+ goto fail;
+ ret = __inode_ctx_put (inode, this, (uint64_t)ctx);
+ if (ret) {
+ gf_log_callingfn (this->name, GF_LOG_ERROR, "failed to "
+ "set the inode ctx (%s)",
+ uuid_utoa (inode->gfid));
+ goto fail;
+ }
+
out:
- if (ret && ctx) {
- GF_FREE (ctx->fresh_children);
- GF_FREE (ctx);
- ctx = NULL;
+ return ctx;
+
+fail:
+ afr_inode_ctx_destroy (ctx);
+ return NULL;
+}
+
+afr_inode_ctx_t*
+afr_inode_ctx_get (inode_t *inode, xlator_t *this)
+{
+ afr_inode_ctx_t *ctx = NULL;
+
+ LOCK (&inode->lock);
+ {
+ ctx = __afr_inode_ctx_get (inode, this);
}
+ UNLOCK (&inode->lock);
return ctx;
}
void
-afr_inode_get_ctx (xlator_t *this, inode_t *inode, afr_inode_params_t *params)
+afr_inode_get_ctx_params (xlator_t *this, inode_t *inode,
+ afr_inode_params_t *params)
{
GF_ASSERT (inode);
GF_ASSERT (params);
- int ret = 0;
afr_inode_ctx_t *ctx = NULL;
afr_private_t *priv = NULL;
int i = 0;
- uint64_t ctx_addr = 0;
int32_t read_child = -1;
int32_t *fresh_children = NULL;
priv = this->private;
LOCK (&inode->lock);
{
- ret = __inode_ctx_get (inode, this, &ctx_addr);
- if (ret < 0)
- goto unlock;
- ctx = afr_inode_ctx_get_from_addr (ctx_addr, priv->child_count);
+ ctx = __afr_inode_ctx_get (inode, this);
if (!ctx)
goto unlock;
switch (params->op) {
@@ -274,12 +300,6 @@ afr_inode_get_ctx (xlator_t *this, inode_t *inode, afr_inode_params_t *params)
if (ctx->masks & AFR_ICTX_OPENDIR_DONE_MASK)
params->u.value = _gf_true;
break;
- case AFR_INODE_GET_SPLIT_BRAIN:
- params->u.value = _gf_false;
- if (ctx->masks & AFR_ICTX_SPLIT_BRAIN_MASK)
- params->u.value = _gf_true;
- ;
- break;
default:
GF_ASSERT (0);
break;
@@ -292,11 +312,16 @@ unlock:
gf_boolean_t
afr_is_split_brain (xlator_t *this, inode_t *inode)
{
- afr_inode_params_t params = {0};
+ afr_inode_ctx_t *ctx = NULL;
+ gf_boolean_t spb = _gf_false;
- params.op = AFR_INODE_GET_SPLIT_BRAIN;
- afr_inode_get_ctx (this, inode, &params);
- return params.u.value;
+ ctx = afr_inode_ctx_get (inode, this);
+ if (!ctx)
+ goto out;
+ if ((ctx->mdata_spb == SPB) || (ctx->data_spb == SPB))
+ spb = _gf_true;
+out:
+ return spb;
}
gf_boolean_t
@@ -305,11 +330,10 @@ afr_is_opendir_done (xlator_t *this, inode_t *inode)
afr_inode_params_t params = {0};
params.op = AFR_INODE_GET_OPENDIR_DONE;
- afr_inode_get_ctx (this, inode, &params);
+ afr_inode_get_ctx_params (this, inode, &params);
return params.u.value;
}
-
int32_t
afr_inode_get_read_ctx (xlator_t *this, inode_t *inode, int32_t *fresh_children)
{
@@ -317,7 +341,7 @@ afr_inode_get_read_ctx (xlator_t *this, inode_t *inode, int32_t *fresh_children)
params.op = AFR_INODE_GET_READ_CTX;
params.u.read_ctx.children = fresh_children;
- afr_inode_get_ctx (this, inode, &params);
+ afr_inode_get_ctx_params (this, inode, &params);
return params.u.read_ctx.read_child;
}
@@ -379,31 +403,14 @@ afr_inode_ctx_set_opendir_done (afr_inode_ctx_t *ctx)
}
void
-afr_inode_ctx_set_splitbrain (afr_inode_ctx_t *ctx, gf_boolean_t set)
-{
- uint64_t remaining_mask = 0;
- uint64_t mask = 0;
-
- if (set) {
- remaining_mask = (~AFR_ICTX_SPLIT_BRAIN_MASK & ctx->masks);
- mask = (0xFFFFFFFFFFFFFFFFULL & AFR_ICTX_SPLIT_BRAIN_MASK);
- ctx->masks = remaining_mask | mask;
- } else {
- ctx->masks = (~AFR_ICTX_SPLIT_BRAIN_MASK & ctx->masks);
- }
-}
-
-void
-afr_inode_set_ctx (xlator_t *this, inode_t *inode, afr_inode_params_t *params)
+afr_inode_set_ctx_params (xlator_t *this, inode_t *inode,
+ afr_inode_params_t *params)
{
GF_ASSERT (inode);
GF_ASSERT (params);
- int ret = 0;
afr_inode_ctx_t *ctx = NULL;
afr_private_t *priv = NULL;
- uint64_t ctx_addr = 0;
- gf_boolean_t set = _gf_false;
int32_t read_child = -1;
int32_t *fresh_children = NULL;
int32_t *stale_children = NULL;
@@ -411,10 +418,7 @@ afr_inode_set_ctx (xlator_t *this, inode_t *inode, afr_inode_params_t *params)
priv = this->private;
LOCK (&inode->lock);
{
- ret = __inode_ctx_get (inode, this, &ctx_addr);
- if (ret < 0)
- ctx_addr = 0;
- ctx = afr_inode_ctx_get_from_addr (ctx_addr, priv->child_count);
+ ctx = __afr_inode_ctx_get (inode, this);
if (!ctx)
goto unlock;
switch (params->op) {
@@ -434,33 +438,26 @@ afr_inode_set_ctx (xlator_t *this, inode_t *inode, afr_inode_params_t *params)
case AFR_INODE_SET_OPENDIR_DONE:
afr_inode_ctx_set_opendir_done (ctx);
break;
- case AFR_INODE_SET_SPLIT_BRAIN:
- set = params->u.value;
- afr_inode_ctx_set_splitbrain (ctx, set);
- break;
default:
GF_ASSERT (0);
break;
}
- ret = __inode_ctx_put (inode, this, (uint64_t)ctx);
- if (ret) {
- gf_log_callingfn (this->name, GF_LOG_ERROR, "failed to "
- "set the inode ctx (%s)",
- uuid_utoa (inode->gfid));
- }
}
unlock:
UNLOCK (&inode->lock);
}
void
-afr_set_split_brain (xlator_t *this, inode_t *inode, gf_boolean_t set)
+afr_set_split_brain (xlator_t *this, inode_t *inode, afr_spb_state_t mdata_spb,
+ afr_spb_state_t data_spb)
{
- afr_inode_params_t params = {0};
+ afr_inode_ctx_t *ctx = NULL;
- params.op = AFR_INODE_SET_SPLIT_BRAIN;
- params.u.value = set;
- afr_inode_set_ctx (this, inode, &params);
+ ctx = afr_inode_ctx_get (inode, this);
+ if (mdata_spb != DONT_KNOW)
+ ctx->mdata_spb = mdata_spb;
+ if (data_spb != DONT_KNOW)
+ ctx->data_spb = data_spb;
}
void
@@ -469,7 +466,7 @@ afr_set_opendir_done (xlator_t *this, inode_t *inode)
afr_inode_params_t params = {0};
params.op = AFR_INODE_SET_OPENDIR_DONE;
- afr_inode_set_ctx (this, inode, &params);
+ afr_inode_set_ctx_params (this, inode, &params);
}
void
@@ -488,7 +485,7 @@ afr_inode_set_read_ctx (xlator_t *this, inode_t *inode, int32_t read_child,
params.op = AFR_INODE_SET_READ_CTX;
params.u.read_ctx.read_child = read_child;
params.u.read_ctx.children = fresh_children;
- afr_inode_set_ctx (this, inode, &params);
+ afr_inode_set_ctx_params (this, inode, &params);
}
void
@@ -501,7 +498,7 @@ afr_inode_rm_stale_children (xlator_t *this, inode_t *inode,
params.op = AFR_INODE_RM_STALE_CHILDREN;
params.u.read_ctx.children = stale_children;
- afr_inode_set_ctx (this, inode, &params);
+ afr_inode_set_ctx_params (this, inode, &params);
}
gf_boolean_t
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c
index 07603c5b2a2..a0dfa59d9a6 100644
--- a/xlators/cluster/afr/src/afr-self-heal-common.c
+++ b/xlators/cluster/afr/src/afr-self-heal-common.c
@@ -2067,19 +2067,16 @@ afr_self_heal_completion_cbk (call_frame_t *bgsh_frame, xlator_t *this)
afr_local_t * orig_frame_local = NULL;
afr_self_heal_t * orig_frame_sh = NULL;
char sh_type_str[256] = {0,};
- gf_boolean_t split_brain = _gf_false;
priv = this->private;
local = bgsh_frame->local;
sh = &local->self_heal;
- if (local->govinda_gOvinda || sh->mdata_spb || sh->data_spb) {
- split_brain = _gf_true;
+ if (local->govinda_gOvinda) {
+ afr_set_split_brain (this, sh->inode, SPB, SPB);
sh->op_failed = 1;
}
- afr_set_split_brain (this, sh->inode, split_brain);
-
afr_self_heal_type_str_get (sh, sh_type_str,
sizeof(sh_type_str));
if (sh->op_failed) {
diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c
index c90ed7c1a4f..9694436879b 100644
--- a/xlators/cluster/afr/src/afr-self-heal-data.c
+++ b/xlators/cluster/afr/src/afr-self-heal-data.c
@@ -716,12 +716,13 @@ afr_sh_data_fxattrop_fstat_done (call_frame_t *frame, xlator_t *this)
"split-brain). Please delete the file from all but "
"the preferred subvolume.", local->loc.path);
- sh->data_spb = _gf_true;
+ afr_set_split_brain (this, sh->inode, DONT_KNOW, SPB);
+
afr_sh_data_fail (frame, this);
return 0;
}
- sh->data_spb = _gf_false;
+ afr_set_split_brain (this, sh->inode, DONT_KNOW, NO_SPB);
ret = afr_sh_inode_set_read_ctx (sh, this);
if (ret) {
gf_log (this->name, GF_LOG_DEBUG,
@@ -1401,13 +1402,6 @@ afr_self_heal_data (call_frame_t *frame, xlator_t *this)
local = frame->local;
sh = &local->self_heal;
- /* Self-heal completion cbk changes inode split-brain status based on
- * govinda_gOvinda, mdata_spb, data_spb value.
- * Initialize data_spb with current split-brain status.
- * If for some reason self-heal fails(locking phase etc), it makes sure
- * we retain the split-brain status before this self-heal started.
- */
- sh->data_spb = afr_is_split_brain (this, sh->inode);
if (afr_can_start_data_self_heal (sh, priv)) {
if (IA_ISREG (sh->type)) {
afr_sh_data_open (frame, this);
diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c
index 647ee47a178..cf8a33add34 100644
--- a/xlators/cluster/afr/src/afr-self-heal-metadata.c
+++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c
@@ -88,6 +88,19 @@ afr_sh_metadata_finish (call_frame_t *frame, xlator_t *this)
return 0;
}
+int
+afr_sh_metadata_fail (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
+
+ local = frame->local;
+ sh = &local->self_heal;
+
+ sh->op_failed = 1;
+ afr_sh_metadata_finish (frame, this);
+ return 0;
+}
int
afr_sh_metadata_erase_pending_cbk (call_frame_t *frame, void *cookie,
@@ -397,13 +410,12 @@ afr_sh_metadata_fix (call_frame_t *frame, xlator_t *this,
"(possible split-brain). Please fix the file on "
"all backend volumes", local->loc.path);
- sh->mdata_spb = _gf_true;
-
- afr_sh_metadata_finish (frame, this);
+ afr_set_split_brain (this, sh->inode, SPB, DONT_KNOW);
+ afr_sh_metadata_fail (frame, this);
goto out;
}
- sh->mdata_spb = _gf_false;
+ afr_set_split_brain (this, sh->inode, NO_SPB, DONT_KNOW);
if (nsources == 0) {
gf_log (this->name, GF_LOG_TRACE,
"No self-heal needed for %s",
@@ -529,14 +541,6 @@ afr_self_heal_metadata (call_frame_t *frame, xlator_t *this)
local = frame->local;
sh = &local->self_heal;
- /* Self-heal completion cbk changes inode split-brain status based on
- * govinda_gOvinda, mdata_spb, data_spb value.
- * Initialize mdata_spb with current split-brain status.
- * If for some reason self-heal fails(locking phase etc), it makes sure
- * we retain the split-brain status before this self-heal started.
- */
- sh->mdata_spb = afr_is_split_brain (this, sh->inode);
-
if (afr_can_start_metadata_self_heal (sh, priv)) {
afr_sh_metadata_lock (frame, this);
} else {
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index da9cc67c606..475e5dda43d 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -58,10 +58,8 @@ typedef enum {
AFR_INODE_SET_READ_CTX = 1,
AFR_INODE_RM_STALE_CHILDREN,
AFR_INODE_SET_OPENDIR_DONE,
- AFR_INODE_SET_SPLIT_BRAIN,
AFR_INODE_GET_READ_CTX,
AFR_INODE_GET_OPENDIR_DONE,
- AFR_INODE_GET_SPLIT_BRAIN,
} afr_inode_op_t;
typedef struct afr_inode_params_ {
@@ -75,9 +73,17 @@ typedef struct afr_inode_params_ {
} u;
} afr_inode_params_t;
+typedef enum afr_spb_state {
+ DONT_KNOW,
+ SPB,
+ NO_SPB
+} afr_spb_state_t;
+
typedef struct afr_inode_ctx_ {
uint64_t masks;
int32_t *fresh_children;//increasing order of latency
+ afr_spb_state_t mdata_spb;
+ afr_spb_state_t data_spb;
} afr_inode_ctx_t;
typedef enum {
@@ -272,8 +278,6 @@ typedef struct {
int (*algo_completion_cbk) (call_frame_t *frame, xlator_t *this);
int (*algo_abort_cbk) (call_frame_t *frame, xlator_t *this);
void (*gfid_sh_success_cbk) (call_frame_t *sh_frame, xlator_t *this);
- gf_boolean_t mdata_spb;
- gf_boolean_t data_spb;
call_frame_t *sh_frame;
} afr_self_heal_t;
@@ -833,7 +837,8 @@ gf_boolean_t
afr_is_split_brain (xlator_t *this, inode_t *inode);
void
-afr_set_split_brain (xlator_t *this, inode_t *inode, gf_boolean_t set);
+afr_set_split_brain (xlator_t *this, inode_t *inode, afr_spb_state_t mdata_spb,
+ afr_spb_state_t data_spb);
int
afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,