summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--tests/basic/gfid_unsplit.t102
-rw-r--r--tests/include.rc2
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-common.c38
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-name.c296
-rw-r--r--xlators/cluster/afr/src/afr.h1
5 files changed, 426 insertions, 13 deletions
diff --git a/tests/basic/gfid_unsplit.t b/tests/basic/gfid_unsplit.t
new file mode 100644
index 00000000000..9bb52f4533a
--- /dev/null
+++ b/tests/basic/gfid_unsplit.t
@@ -0,0 +1,102 @@
+#!/bin/bash
+
+. $(dirname $0)/../include.rc
+. $(dirname $0)/../volume.rc
+
+cleanup;
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume info;
+
+# Setup a cluster with 3 replicas, and fav child by majority on
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{1..3};
+TEST $CLI volume set $V0 performance.stat-prefetch off
+TEST $CLI volume set $V0 cluster.choose-local off
+TEST $CLI volume set $V0 cluster.self-heal-daemon off
+TEST $CLI volume set $V0 nfs.disable off
+#EST $CLI volume set $V0 cluster.favorite-child-by-majority on
+#EST $CLI volume set $V0 cluster.favorite-child-by-mtime on
+TEST $CLI volume set $V0 cluster.favorite-child-policy majority
+TEST $CLI volume set $V0 cluster.metadata-self-heal off
+TEST $CLI volume set $V0 cluster.data-self-heal off
+TEST $CLI volume set $V0 cluster.entry-self-heal off
+TEST $CLI volume start $V0
+sleep 5
+
+# Part I: FUSE Test
+TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 \
+ --attribute-timeout=0 --entry-timeout=0
+
+dd if=/dev/urandom of=$M0/splitfile bs=128k count=5 2>/dev/null
+
+MD5=$(md5sum $M0/splitfile | cut -d\ -f1)
+
+# Create a split-brain by downing a brick, and flipping the
+# gfid on the down brick, then bring the brick back up.
+TEST kill_brick $V0 $H0 $B0/${V0}1
+GFID_DIR_B1="$B0/${V0}1/.glusterfs/$(getfattr -n trusted.gfid -e hex $B0/${V0}1/splitfile 2>/dev/null | grep ^trusted | cut -d= -f2 | awk '{print substr($0,3,2)}')"
+rm -rf $GFID_DIR_B1
+TEST setfattr -n "trusted.gfid" -v "0xfd551a5cfddd4c1aa4d096ef09ef5c08" $B0/${V0}1/splitfile
+
+GFID_DIR_B3="$B0/${V0}3/.glusterfs/$(getfattr -n trusted.gfid -e hex $B0/${V0}3/splitfile 2>/dev/null | grep ^trusted | cut -d= -f2 | awk '{print substr($0,3,2)}')"
+#EST rm -f $B0/${V0}3/splitfile
+#m -rf $GFID_DIR_B3
+
+# Restart the down brick
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN 20 "1" afr_child_up_status $V0 0
+sleep 5
+
+# Tickle the file to trigger the gfid unsplit
+TEST stat $M0/splitfile
+sleep 1
+
+# Verify the file is readable
+TEST dd if=$M0/splitfile of=/dev/null 2>/dev/null
+
+# Verify the MD5 signature of the file
+HEALED_MD5=$(md5sum $M0/splitfile | cut -d\ -f1)
+TEST [ "$MD5" == "$HEALED_MD5" ]
+
+# Verify the file can be removed
+TEST rm -f $M0/splitfile
+
+# Part II: NFS test
+TEST mount -t nfs -o nolock,noatime,noacl,soft,intr $H0:/$V0 $N0;
+
+dd if=/dev/urandom of=$N0/splitfile bs=128k count=5 2>/dev/null
+
+MD5=$(md5sum $N0/splitfile | cut -d\ -f1)
+
+# Create a split-brain by downing a brick, and flipping the
+# gfid on the down brick, then bring the brick back up.
+TEST kill_brick $V0 $H0 $B0/${V0}1
+GFID_DIR_B1="$B0/${V0}1/.glusterfs/$(getfattr -n trusted.gfid -e hex $B0/${V0}1/splitfile 2>/dev/null | grep ^trusted | cut -d= -f2 | awk '{print substr($0,3,2)}')"
+rm -rf $GFID_DIR_B1
+TEST setfattr -n "trusted.gfid" -v "0xfd551a5cfddd4c1aa4d096ef09ef5c08" $B0/${V0}1/splitfile
+
+GFID_DIR_B3="$B0/${V0}3/.glusterfs/$(getfattr -n trusted.gfid -e hex $B0/${V0}3/splitfile 2>/dev/null | grep ^trusted | cut -d= -f2 | awk '{print substr($0,3,2)}')"
+#EST rm -f $B0/${V0}3/splitfile
+#m -rf $GFID_DIR_B3
+
+# Restart the down brick
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN 20 "1" afr_child_up_status $V0 0
+sleep 5
+
+# Tickle the file to trigger the gfid unsplit
+TEST stat $N0/splitfile
+sleep 1
+
+# Verify the file is readable
+TEST dd if=$N0/splitfile of=/dev/null 2>/dev/null
+
+# Verify the MD5 signature of the file
+HEALED_MD5=$(md5sum $N0/splitfile | cut -d\ -f1)
+TEST [ "$MD5" == "$HEALED_MD5" ]
+
+# Verify the file can be removed
+TEST rm -f $N0/splitfile
+
+cleanup
diff --git a/tests/include.rc b/tests/include.rc
index de3b90ee86b..872aec11edf 100644
--- a/tests/include.rc
+++ b/tests/include.rc
@@ -431,7 +431,7 @@ function cleanup()
# Prepare flags for umount
case `uname -s` in
Linux)
- flag="-l"
+ flag="-l -f --no-canonicalize"
;;
NetBSD)
flag="-f -R"
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c
index 549c3c1ba71..04f16cc0122 100644
--- a/xlators/cluster/afr/src/afr-self-heal-common.c
+++ b/xlators/cluster/afr/src/afr-self-heal-common.c
@@ -664,6 +664,20 @@ out:
}
+static int
+replies_are_same (struct afr_reply *replies, int i, int k)
+{
+ if (replies[k].poststat.ia_mtime != replies[i].poststat.ia_mtime) {
+ return _gf_false;
+ }
+ if (replies[k].poststat.ia_size != replies[i].poststat.ia_size) {
+ return _gf_false;
+ }
+
+ return gf_uuid_compare (replies[i].poststat.ia_gfid,
+ replies[k].poststat.ia_gfid) == 0;
+}
+
int
afr_sh_fav_by_majority (xlator_t *this, struct afr_reply *replies,
inode_t *inode)
@@ -685,12 +699,8 @@ afr_sh_fav_by_majority (xlator_t *this, struct afr_reply *replies,
replies[i].poststat.ia_size,
uuid_utoa (inode->gfid));
vote_count = 0;
- for (k = 0; k < priv->child_count; k++) {
- if ((replies[k].poststat.ia_mtime ==
- replies[i].poststat.ia_mtime) &&
- (replies[k].poststat.ia_size ==
- replies[i].poststat.ia_size)
- ) {
+ for (k = 1; k < priv->child_count; k++) {
+ if (replies_are_same (replies, i, k)) {
vote_count++;
}
}
@@ -2248,3 +2258,19 @@ afr_choose_source_by_policy (afr_private_t *priv, unsigned char *sources,
out:
return source;
}
+
+void
+afr_sh_get_source_by_policy (xlator_t *this,
+ unsigned char *sources,
+ unsigned char *healed_sinks,
+ unsigned char *locked_on,
+ struct afr_reply *replies, inode_t *inode)
+{
+ int fav_child = -1;
+ char *policy_str;
+
+ fav_child = afr_sh_get_fav_by_policy (this, replies, inode,
+ &policy_str);
+ sources[fav_child] = 1;
+ healed_sinks[fav_child] = 0;
+}
diff --git a/xlators/cluster/afr/src/afr-self-heal-name.c b/xlators/cluster/afr/src/afr-self-heal-name.c
index b28ce4170f1..deb3b4e59af 100644
--- a/xlators/cluster/afr/src/afr-self-heal-name.c
+++ b/xlators/cluster/afr/src/afr-self-heal-name.c
@@ -13,6 +13,287 @@
#include "afr-self-heal.h"
#include "afr-messages.h"
+
+
+/*
+ * Helper function to create the destination location for the copy
+ * of the directory entry we are moving out of the way.
+ */
+static int
+_afr_sh_create_unsplit_loc (struct afr_reply *replies, const int child_idx,
+ loc_t *loc, loc_t *unsplit_loc)
+{
+ int ret = 0;
+ int new_path_len = 0;
+ int new_name_len = 0;
+ char *new_path = NULL;
+ char *new_name = NULL;
+ char *tmp_gfid_str;
+ const char *filename = NULL;
+ uuid_t rand_uuid;
+
+ tmp_gfid_str = alloca (sizeof (UUID0_STR));
+
+ /*
+ * All of these allocations will be cleaned up
+ * @ afr_sh_gfid_unsplit_rename_done via loc_wipe.
+ */
+ if (loc_copy (unsplit_loc, loc)) {
+ ret = EINVAL;
+ goto err;
+ }
+
+ inode_unref (unsplit_loc->inode);
+ unsplit_loc->inode = inode_new (loc->inode->table);
+ unsplit_loc->parent = inode_ref (loc->parent);
+ gf_uuid_copy (unsplit_loc->inode->gfid,
+ replies[child_idx].poststat.ia_gfid);
+ unsplit_loc->inode->ia_type = loc->inode->ia_type;
+
+ gf_uuid_generate (rand_uuid);
+ /* Note: Use re-entrant version of uuid_utoa! */
+ tmp_gfid_str = uuid_utoa_r (rand_uuid, tmp_gfid_str);
+
+ /* Copy the GFIDs, file + parent directory */
+ gf_uuid_copy (unsplit_loc->gfid, rand_uuid);
+ gf_uuid_copy (unsplit_loc->pargfid,
+ replies[child_idx].postparent.ia_gfid);
+
+ filename = loc->name;
+
+ /*
+ * New path: Add 11 for null + ".unsplit_" + "_". We _could_ nuke
+ * tmp_gfid_str entirely here, iff we assume the uuid_utoa
+ * formatting to _never_change. If we assume this we can just add
+ * 32 to the length and call uuid_utoa directly in the snprintf.
+ */
+ new_path_len = strlen (filename) + strlen (tmp_gfid_str) + 11;
+ new_path = GF_CALLOC (1, new_path_len, gf_common_mt_char);
+ if (!new_path) {
+ ret = ENOMEM;
+ goto err;
+ }
+ snprintf (new_path, new_path_len, ".unsplit_%s_%s", tmp_gfid_str,
+ filename);
+ unsplit_loc->path = new_path;
+
+ /* New name: Add 11 for null + ".unsplit_" + "_" */
+ new_name_len = strlen (loc->name) + strlen (tmp_gfid_str) + 11;
+ new_name = GF_CALLOC (1, new_name_len, gf_common_mt_char);
+ if (!new_name) {
+ ret = ENOMEM;
+ goto err;
+ }
+ snprintf (new_name, new_name_len, ".unsplit_%s_%s", tmp_gfid_str,
+ loc->name);
+ unsplit_loc->name = new_name;
+
+ return 0;
+err:
+ GF_FREE (new_path);
+ GF_FREE (new_name);
+ return ret;
+}
+
+static int
+_afr_gfid_unsplit_rename_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno,
+ struct iatt *buf, struct iatt *preoldparent,
+ struct iatt *postoldparent, struct iatt *prenewparent,
+ struct iatt *postnewparent, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int child_index = (long) cookie;
+
+ local = frame->local;
+
+ if ((op_ret == -1) && (op_errno != ENOENT)) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "rename entry %s/%s failed, on child %d reason, %s",
+ uuid_utoa (local->loc.pargfid),
+ local->loc.name, child_index, strerror (op_errno));
+ }
+ gf_log (this->name, GF_LOG_DEBUG,
+ "GFID unsplit successful on %s/%s, on child %d",
+ uuid_utoa (local->loc.pargfid), local->loc.name, child_index);
+
+ syncbarrier_wake (&local->barrier);
+ return 0;
+}
+int
+__afr_selfheal_do_gfid_unsplit (xlator_t *this, unsigned char *locked_on,
+ struct afr_reply *replies, inode_t *inode,
+ loc_t *loc)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ call_frame_t *frame = NULL;
+ loc_t *unsplit_loc;
+ int fav_child = -1;
+ unsigned char *fav_gfid;
+ unsigned int i = 0;
+ unsigned int split_count = 0;
+ unsigned char *rename_list;
+ int ret = 0;
+ char *policy_str;
+
+ frame = afr_frame_create (this);
+
+ local = frame->local; // Local variables for our frame
+ priv = this->private; // xlator specific variables
+ rename_list = alloca0 (priv->child_count);
+
+ if (loc_copy (&local->loc, loc)) {
+ ret = ENOMEM;
+ goto out;
+ }
+
+ /*
+ * Ok, go find our favorite child by one of the active policies:
+ * majority -> ctime -> mtime -> size -> predefined
+ * we'll use this gfid as the "real" one.
+ */
+ fav_child = afr_sh_get_fav_by_policy (this, replies, inode,
+ &policy_str);
+ if (fav_child == -1) { /* No policies are in place, bail */
+ gf_log (this->name, GF_LOG_WARNING, "Unable to resolve GFID "
+ "split brain, there are no favorite child policies "
+ "set.");
+ ret = -EIO;
+ goto out;
+ }
+ fav_gfid = replies[fav_child].poststat.ia_gfid;
+ gf_log (this->name, GF_LOG_INFO, "Using child %d to resolve gfid "
+ "split-brain. GFID is %s.", fav_child, uuid_utoa (fav_gfid));
+
+ /* Pre-compute the number of rename calls we will be doing */
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i] &&
+ !gf_uuid_is_null (replies[i].poststat.ia_gfid) &&
+ gf_uuid_compare (replies[i].poststat.ia_gfid, fav_gfid)) {
+ split_count++;
+ }
+ }
+
+ gf_log (this->name, GF_LOG_INFO, "Found %d split-brained gfid's.",
+ split_count);
+
+ local->unsplit_locs = GF_CALLOC (priv->child_count,
+ sizeof (*unsplit_loc), gf_afr_mt_loc_t);
+ if (!local->unsplit_locs) {
+ ret = ENOMEM;
+ goto out;
+ }
+
+ afr_local_replies_wipe (local, priv);
+ local->call_count = 0;
+ for (i = 0; i < priv->child_count; i++) {
+ unsplit_loc = &local->unsplit_locs[i];
+ if (locked_on[i] && local->child_up[i] &&
+ replies[i].op_errno != ENOENT &&
+ !gf_uuid_is_null (replies[i].poststat.ia_gfid) &&
+ gf_uuid_compare (replies[i].poststat.ia_gfid, fav_gfid)) {
+ ret = _afr_sh_create_unsplit_loc (replies, i,
+ loc, unsplit_loc);
+ gf_log (this->name, GF_LOG_INFO, "Renaming child %d to "
+ " %s/%s to resolve gfid split-brain.", i,
+ uuid_utoa (unsplit_loc->pargfid),
+ unsplit_loc->name);
+ rename_list[i] = 1;
+ /* frame, rfn, cky, obj, fn, params */
+ STACK_WIND_COOKIE (frame,
+ _afr_gfid_unsplit_rename_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->rename,
+ loc, unsplit_loc, NULL);
+ local->call_count++;
+ }
+ }
+ syncbarrier_wait (&local->barrier, local->call_count);
+
+out:
+ for (i = 0; i < priv->child_count; i++) {
+ if (rename_list[i])
+ loc_wipe (&local->unsplit_locs[i]);
+ }
+ if (frame)
+ AFR_STACK_DESTROY (frame);
+ return ret;
+}
+
+int
+__afr_selfheal_gfid_unsplit (xlator_t *this, inode_t *parent, uuid_t pargfid,
+ const char *bname, inode_t *inode,
+ struct afr_reply *replies, void *gfid,
+ unsigned char *locked_on)
+{
+ int ret = 0;
+ afr_private_t *priv = NULL;
+ dict_t *xdata = NULL;
+ loc_t loc = {0, };
+ call_frame_t *new_frame = NULL;
+ afr_local_t *new_local = NULL;
+
+ priv = this->private;
+
+ new_frame = afr_frame_create (this);
+ if (!new_frame) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ new_local = new_frame->local;
+
+ gf_uuid_copy (parent->gfid, pargfid);
+
+ xdata = dict_new ();
+ if (!xdata) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = dict_set_static_bin (xdata, "gfid-req", gfid, 16);
+ if (ret) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ loc.parent = inode_ref (parent);
+ loc.inode = inode_ref (inode);
+ gf_uuid_copy (loc.pargfid, pargfid);
+ loc.name = bname;
+
+ ret = __afr_selfheal_do_gfid_unsplit (this, locked_on, replies,
+ inode, &loc);
+
+ if (ret)
+ goto out;
+
+ /* Clear out old replies here and wind lookup on all locked
+ * subvolumes to achieve two things:
+ * a. gfid heal on those subvolumes that do not have gfid associated
+ * with the inode, and
+ * b. refresh replies, which can be consumed by
+ * __afr_selfheal_name_impunge().
+ */
+ afr_replies_wipe (replies, priv->child_count);
+ /* This sends out lookups to all bricks and blocks once we have
+ * them.
+ */
+ AFR_ONLIST (locked_on, new_frame, afr_selfheal_discover_cbk, lookup,
+ &loc, xdata);
+ afr_replies_copy (replies, new_local->replies, priv->child_count);
+out:
+ loc_wipe (&loc);
+ if (xdata)
+ dict_unref (xdata);
+ if (new_frame)
+ AFR_STACK_DESTROY (new_frame);
+
+ return ret;
+}
+
int
__afr_selfheal_assign_gfid (xlator_t *this, inode_t *parent, uuid_t pargfid,
const char *bname, inode_t *inode,
@@ -429,12 +710,6 @@ __afr_selfheal_name_do (call_frame_t *frame, xlator_t *this, inode_t *parent,
if (ret)
return ret;
- ret = afr_selfheal_name_gfid_mismatch_check (this, replies, source,
- sources, &gfid_idx,
- pargfid, bname);
- if (ret)
- return ret;
-
if (gfid_idx == -1) {
if (!gfid_req || gf_uuid_is_null (gfid_req))
return -1;
@@ -443,6 +718,15 @@ __afr_selfheal_name_do (call_frame_t *frame, xlator_t *this, inode_t *parent,
gfid = &replies[gfid_idx].poststat.ia_gfid;
}
+ ret = afr_selfheal_name_gfid_mismatch_check (this, replies, source,
+ sources, &gfid_idx,
+ pargfid, bname);
+ if (ret)
+ ret = __afr_selfheal_gfid_unsplit (this, parent, pargfid,
+ bname, inode, replies, gfid, locked_on);
+ if (ret)
+ return ret;
+
is_gfid_absent = (gfid_idx == -1) ? _gf_true : _gf_false;
ret = __afr_selfheal_assign_gfid (this, parent, pargfid, bname, inode,
replies, gfid, locked_on,
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index bbb444c7974..53abeaace11 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -823,6 +823,7 @@ typedef struct _afr_local {
gf_boolean_t need_full_crawl;
gf_boolean_t is_read_txn;
+ loc_t *unsplit_locs; /* Un-split targets */
} afr_local_t;