author     Krutika Dhananjay <kdhananj@redhat.com>        2016-07-28 21:29:59 +0530
committer  Pranith Kumar Karampuri <pkarampu@redhat.com>  2016-08-22 03:05:08 -0700
commit     febaa1e46d3a91a29c4786a17abf29cfc7178254 (patch)
tree       0fe52522cb3bfe318d9032243283f3ab6751ec9e /tests
parent     888ad44a9da3006b3e5695e5e5b40d6e446aa109 (diff)
cluster/afr: Prevent split-brain when bricks are brought off and on in cyclic order
Backport of: http://review.gluster.org/15080

When the bricks are brought offline and then online in cyclic order while writes are in progress on a file, thanks to inode refresh in write txns, AFR will mostly fail the write attempt when the only good copy is offline. However, there is still a remote possibility that the file will run into split-brain if the brick that has the lone good copy goes offline *after* the inode refresh but *before* the write txn completes (I call it in-flight split-brain in the patch for ease of reference), requiring intervention from the admin to resolve the split-brain before IO can resume normally on the file.

To get around this, the patch does the following things:
i) retains the dirty xattrs on the file
ii) avoids marking the last of the good copies as bad (or accused) in case it is the one to go down during the course of a write
iii) fails that particular write with the appropriate errno.

This way we still have one good copy left despite the split-brain situation; once its brick is back online, it will be chosen as the source for the heal.

Change-Id: I7c13c6ddd5b8fe88b0f2684e8ce5f4a9c3a24a08
BUG: 1367270
Signed-off-by: Krutika Dhananjay <kdhananj@redhat.com>
Reviewed-on: http://review.gluster.org/15222
Smoke: Gluster Build System <jenkins@build.gluster.org>
NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
Reviewed-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Reviewed-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
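The dirty and accused bookkeeping described above is stored as extended attributes on the brick backend, which is exactly what the test below inspects. As a minimal sketch only (the brick path /bricks/r3-0 and volume name patchy are hypothetical placeholders, not part of this patch), the same xattrs can be dumped on any replica with getfattr:

# Run as root on the server hosting the brick.
getfattr -d -m . -e hex /bricks/r3-0/file1

# Typical output includes the per-replica pending changelog and the dirty xattr,
# each a 12-byte counter laid out as data | metadata | entry, e.g.:
#   trusted.afr.patchy-client-2=0x000000000000000000000000  # no accusations against brick 2
#   trusted.afr.dirty=0x000000010000000000000000            # dirty data counter retained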
Diffstat (limited to 'tests')
-rw-r--r--  tests/bugs/replicate/bug-1363721.t  112
1 file changed, 112 insertions(+), 0 deletions(-)
diff --git a/tests/bugs/replicate/bug-1363721.t b/tests/bugs/replicate/bug-1363721.t
new file mode 100644
index 00000000000..ec39889b27e
--- /dev/null
+++ b/tests/bugs/replicate/bug-1363721.t
@@ -0,0 +1,112 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+
+FILE_UPDATE_TIMEOUT=20
+cleanup
+
+function size_increased {
+ local file=$1
+ local size=$2
+ local new_size=$(stat -c%s $file)
+ if [ $new_size -gt $size ];
+ then
+ echo "Y"
+ else
+ echo "N"
+ fi
+}
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
+TEST $CLI volume set $V0 performance.write-behind off
+TEST $CLI volume set $V0 cluster.self-heal-daemon off
+TEST $CLI volume set $V0 cluster.data-self-heal off
+TEST $CLI volume set $V0 cluster.metadata-self-heal off
+TEST $CLI volume set $V0 cluster.entry-self-heal off
+TEST $CLI volume start $V0
+TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0 --direct-io-mode=enable
+
+cd $M0
+
+# Start writing to a file.
+(dd if=/dev/urandom of=$M0/file1 bs=1k 2>/dev/null 1>/dev/null)&
+dd_pid=$!
+
+# Let IO happen
+EXPECT_WITHIN $FILE_UPDATE_TIMEOUT "Y" size_increased file1 0
+
+# Now kill the zeroth brick
+kill_brick $V0 $H0 $B0/${V0}0
+
+# Let IO continue
+EXPECT_WITHIN $FILE_UPDATE_TIMEOUT "Y" size_increased file1 $(stat -c%s file1)
+
+# Now bring the brick back up
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 0
+
+# Let IO continue
+EXPECT_WITHIN $FILE_UPDATE_TIMEOUT "Y" size_increased file1 $(stat -c%s file1)
+
+# Now kill the first brick
+kill_brick $V0 $H0 $B0/${V0}1
+
+# Let IO continue
+EXPECT_WITHIN $FILE_UPDATE_TIMEOUT "Y" size_increased file1 $(stat -c%s file1)
+
+# Now bring the brick back up
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 1
+
+# Let IO continue for 3 seconds
+sleep 3
+
+# Now kill the second brick
+kill_brick $V0 $H0 $B0/${V0}2
+
+# At this point the write should have failed. But make sure that the second
+# brick is never accused.
+
+md5sum_2=$(md5sum $B0/${V0}2/file1 | awk '{print $1}')
+
+EXPECT_NOT "$md5sum_2" echo `md5sum $B0/${V0}0/file1 | awk '{print $1}'`
+EXPECT_NOT "$md5sum_2" echo `md5sum $B0/${V0}1/file1 | awk '{print $1}'`
+
+EXPECT_NOT "00000000" afr_get_specific_changelog_xattr $B0/${V0}0/file1 trusted.afr.dirty data
+EXPECT_NOT "00000000" afr_get_specific_changelog_xattr $B0/${V0}1/file1 trusted.afr.dirty data
+
+EXPECT "00000000" afr_get_specific_changelog_xattr $B0/${V0}0/file1 trusted.afr.$V0-client-2 data
+EXPECT "00000000" afr_get_specific_changelog_xattr $B0/${V0}1/file1 trusted.afr.$V0-client-2 data
+EXPECT "00000000" afr_get_specific_changelog_xattr $B0/${V0}2/file1 trusted.afr.$V0-client-2 data
+EXPECT "00000000" afr_get_specific_changelog_xattr $B0/${V0}0/file1 trusted.afr.$V0-client-2 metadata
+EXPECT "00000000" afr_get_specific_changelog_xattr $B0/${V0}1/file1 trusted.afr.$V0-client-2 metadata
+EXPECT "00000000" afr_get_specific_changelog_xattr $B0/${V0}2/file1 trusted.afr.$V0-client-2 metadata
+
+# Now bring the brick back up
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 2
+
+# Enable shd
+TEST $CLI volume set $V0 self-heal-daemon on
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2
+
+TEST $CLI volume heal $V0
+
+# Wait for heal to complete
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+
+EXPECT "$md5sum_2" echo `md5sum $B0/${V0}0/file1 | awk '{print $1}'`
+EXPECT "$md5sum_2" echo `md5sum $B0/${V0}1/file1 | awk '{print $1}'`
+EXPECT "$md5sum_2" echo `md5sum $B0/${V0}2/file1 | awk '{print $1}'`
+
+cd ~
+
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
+
+cleanup
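
For reference outside the test harness, the heal sequence exercised at the end of the script corresponds roughly to the following admin-side commands; VOLNAME is a placeholder for a real volume name (the script uses the harness variable $V0):

# Re-enable the self-heal daemon that was switched off earlier, trigger a heal,
# and watch the pending-heal count drain to zero.
gluster volume set VOLNAME cluster.self-heal-daemon on
gluster volume heal VOLNAME
gluster volume heal VOLNAME info    # lists entries still needing heal, per brick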