summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAnand Avati <avati@redhat.com>2013-08-29 23:35:23 -0700
committerVijay Bellur <vbellur@redhat.com>2013-09-10 01:14:48 -0700
commit23f0ad993c5b12e8a8444fa2519864ccfc9cd8cf (patch)
tree928144d6d799d5e6f82262c0f4d26450e3410765
parenta95bf63d063b3e2345936d53410b515b7198f2ac (diff)
cluster/dht: assign layout onto missing directories too
The current self-healing algorithm is ignoring missing directories for assigning new layout. When lookup() is racing against mkdir() or when self-healing a half-done mkdir(), the layout assignment split must happen based on the final number of directories, and not the currently existing number of directories (because we finish mkdir() of missing directories before hash layout assignment). Without this fix, concurrent mkdir() and lookup() will step on each others feet, create a messed up layout on disk, and end up with different in-memory layouts. Once two clients have different in-memory layouts, creation of subdirectory will not arbitrate on the same hashed subvolume and will result in GFID mismatch of the sub-directory. Change-Id: Ia47acad67c265060405984c822b4d37512b9dbb3 BUG: 907072 Signed-off-by: Anand Avati <avati@redhat.com> Reviewed-on: http://review.gluster.org/5871 Tested-by: Gluster Build System <jenkins@build.gluster.com> Reviewed-by: Vijay Bellur <vbellur@redhat.com>
-rwxr-xr-xtests/bugs/bug-907072.t46
-rw-r--r--xlators/cluster/dht/src/dht-selfheal.c32
2 files changed, 74 insertions, 4 deletions
diff --git a/tests/bugs/bug-907072.t b/tests/bugs/bug-907072.t
new file mode 100755
index 00000000000..49b477767ac
--- /dev/null
+++ b/tests/bugs/bug-907072.t
@@ -0,0 +1,46 @@
+#!/bin/bash
+
+. $(dirname $0)/../include.rc
+. $(dirname $0)/../fileio.rc
+. $(dirname $0)/../dht.rc
+
+cleanup;
+
+TEST glusterd;
+TEST pidof glusterd;
+
+TEST $CLI volume create $V0 $H0:$B0/${V0}{0,1,2,3};
+TEST $CLI volume start $V0;
+
+TEST glusterfs -s $H0 --volfile-id $V0 $M0;
+
+TEST mkdir $M0/test;
+
+OLD_LAYOUT0=`get_layout $B0/${V0}0/test`;
+OLD_LAYOUT1=`get_layout $B0/${V0}1/test`;
+OLD_LAYOUT2=`get_layout $B0/${V0}2/test`;
+OLD_LAYOUT3=`get_layout $B0/${V0}3/test`;
+
+TEST killall glusterfsd;
+
+# Delete directory on one brick
+TEST rm -rf $B0/${V}1/test;
+
+# And only layout xattr on another brick
+TEST setfattr -x trusted.glusterfs.dht $B0/${V0}2/test;
+
+TEST $CLI volume start $V0 force;
+
+TEST umount $M0;
+TEST glusterfs -s $H0 --volfile-id $V0 $M0;
+TEST stat $M0/test;
+
+NEW_LAYOUT0=`get_layout $B0/${V0}0/test`;
+NEW_LAYOUT1=`get_layout $B0/${V0}1/test`;
+NEW_LAYOUT2=`get_layout $B0/${V0}2/test`;
+NEW_LAYOUT3=`get_layout $B0/${V0}3/test`;
+
+EXPECT $OLD_LAYOUT0 echo $NEW_LAYOUT0;
+EXPECT $OLD_LAYOUT1 echo $NEW_LAYOUT1;
+EXPECT $OLD_LAYOUT2 echo $NEW_LAYOUT2;
+EXPECT $OLD_LAYOUT3 echo $NEW_LAYOUT3;
diff --git a/xlators/cluster/dht/src/dht-selfheal.c b/xlators/cluster/dht/src/dht-selfheal.c
index 2d0b37e6974..39ee2b70be3 100644
--- a/xlators/cluster/dht/src/dht-selfheal.c
+++ b/xlators/cluster/dht/src/dht-selfheal.c
@@ -562,9 +562,33 @@ dht_get_layout_count (xlator_t *this, dht_layout_t *layout, int new_layout)
for (i = 0; i < layout->cnt; i++) {
err = layout->list[i].err;
- if (err == -1 || err == 0) {
- layout->list[i].err = -1;
+ if (err == -1 || err == 0 || err == ENOENT) {
+ /* Setting list[i].err = -1 is an indication for
+ dht_selfheal_layout_new_directory() to assign
+ a range. We set it to -1 based on any one of
+ the three criteria:
+
+ - err == -1 already, which means directory
+ existed but layout was not set on it.
+
+ - err == 0, which means directory exists and
+ has an old layout piece which will be
+ overwritten now.
+
+ - err == ENOENT, which means directory does
+ not exist (possibly racing with mkdir or
+ finishing half done mkdir). The missing
+ directory will be attempted to be recreated.
+
+ It is important to note that it is safe
+ to race with mkdir() as self-heal and
+ mkdir are idempotent operations. Both will
+ strive to set the directory and layouts to
+ the same final state.
+ */
count++;
+ if (!err)
+ layout->list[i].err = -1;
}
}
@@ -767,7 +791,7 @@ dht_selfheal_layout_new_directory (call_frame_t *frame, loc_t *loc,
DHT_RESET_LAYOUT_RANGE (layout);
for (i = start_subvol; i < layout->cnt; i++) {
err = layout->list[i].err;
- if (err == -1) {
+ if (err == -1 || err == ENOENT) {
DHT_SET_LAYOUT_RANGE(layout, i, start, chunk,
cnt, loc->path);
if (--cnt == 0) {
@@ -780,7 +804,7 @@ dht_selfheal_layout_new_directory (call_frame_t *frame, loc_t *loc,
for (i = 0; i < start_subvol; i++) {
err = layout->list[i].err;
- if (err == -1) {
+ if (err == -1 || err == ENOENT) {
DHT_SET_LAYOUT_RANGE(layout, i, start, chunk,
cnt, loc->path);
if (--cnt == 0) {