diff options
authorRavishankar N <>2016-02-05 15:10:06 +0530
committerPranith Kumar Karampuri <>2016-06-27 00:13:36 -0700
commite4ea25e9eea0f7259c11333f7a75049f3dccb7a7 (patch)
parentcdaa5dd7f91d93fce3d900e4321dd83d021e96dd (diff)
afr: Don't wind reads for files in metadata split-brain
Backport of upstream commit 7a1c1e2904701496968ed14b6d7479fb706c3188. Problem: For a read on a file in metadata split-brain: 1. lookup_done resets event_generation to zero. 2. readv is issued, and goes to inode refresh due to the mismatching event_gen. 3. After the refresh is successful, we update event_generation, data readable and metadata readable. 4. We then call afr_read_txn_refresh_done(), which in turn calls afr_inode_get_readable() but doesn't check its return value for EIO. So afr_readv_wind is called with local->readable (which is populated with data_readable), thus winding the read to a brick. 5. Also, further parallel reads that come in go directly to the wind path because no inode refresh is needed. Fix: 1. For any afr_read_txn(), readable must be the intersection of data and metadata readable. 2. Check for EIO in afr_read_txn_refresh_done(). Change-Id: I22dd221fdfaf96d7aced2f474e28ed1337d69f0e BUG: 1349881 Signed-off-by: Ravishankar N <> (cherry picked from commit 7a1c1e2904701496968ed14b6d7479fb706c3188) Reviewed-on: NetBSD-regression: NetBSD Build System <> CentOS-regression: Gluster Build System <> Tested-by: Gluster Build System <> Smoke: Gluster Build System <> Reviewed-by: Pranith Kumar Karampuri <>
3 files changed, 47 insertions, 11 deletions
diff --git a/tests/bugs/replicate/bug-1305031-block-reads-on-metadata-sbrain.t b/tests/bugs/replicate/bug-1305031-block-reads-on-metadata-sbrain.t
new file mode 100644
index 00000000000..780ddb9250c
--- /dev/null
+++ b/tests/bugs/replicate/bug-1305031-block-reads-on-metadata-sbrain.t
@@ -0,0 +1,40 @@
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+#Test that for files in metadata-split-brain, we do not wind even a single read.
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{0,1}
+TEST $CLI volume set $V0 self-heal-daemon off
+TEST $CLI volume set $V0 off
+TEST $CLI volume set $V0 performance.write-behind off
+TEST $CLI volume set $V0 off
+TEST $CLI volume set $V0 performance.stat-prefetch off
+TEST $CLI volume set $V0 performance.quick-read off
+TEST $CLI volume set $V0 off
+TEST $CLI volume start $V0
+TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0
+TEST dd if=/dev/urandom of=$M0/file bs=1024 count=1024
+TEST kill_brick $V0 $H0 $B0/${V0}0
+TEST chmod 700 $M0/file
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0
+TEST kill_brick $V0 $H0 $B0/${V0}1
+TEST chmod 777 $M0/file
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1
+TEST umount $M0
+TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0
+lines=`cat $M0/file|wc|awk '{print $1}'`
+EXPECT 0 echo $lines
+TEST umount $M0
diff --git a/tests/bugs/replicate/bug-977797.t b/tests/bugs/replicate/bug-977797.t
index 72c616ba68e..ea9a98adc23 100755
--- a/tests/bugs/replicate/bug-977797.t
+++ b/tests/bugs/replicate/bug-977797.t
@@ -53,7 +53,9 @@ TEST chmod 757 $M0/a/file
TEST $CLI volume start $V0 force
EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 1;
-TEST dd if=$M0/a/file of=/dev/null bs=1024k
+dd if=$M0/a/file of=/dev/null bs=1024k
+#read fails, but heal is triggered.
+TEST [ $? -ne 0 ]
afr_get_specific_changelog_xattr $B0/$V0"1"/a/file trusted.afr.$V0-client-0 "data"
diff --git a/xlators/cluster/afr/src/afr-read-txn.c b/xlators/cluster/afr/src/afr-read-txn.c
index 32ad6a46d17..74749f029c8 100644
--- a/xlators/cluster/afr/src/afr-read-txn.c
+++ b/xlators/cluster/afr/src/afr-read-txn.c
@@ -83,7 +83,7 @@ afr_read_txn_refresh_done (call_frame_t *frame, xlator_t *this, int err)
- if (ret == -1 || !event_generation)
+ if (ret == -EIO || !event_generation)
/* Even after refresh, we don't have a good
read subvolume. Time to bail */
@@ -218,18 +218,12 @@ afr_read_txn (call_frame_t *frame, xlator_t *this, inode_t *inode,
local->transaction.type = type;
- if (local->op == GF_FOP_FSTAT || local->op == GF_FOP_STAT) {
- ret = afr_inode_read_subvol_get (inode, this, data, metadata,
- &event_generation);
- AFR_INTERSECT (local->readable, data, metadata,
- priv->child_count);
- } else {
- ret = afr_inode_read_subvol_type_get (inode, this, local->readable,
- &event_generation, type);
- }
+ ret = afr_inode_read_subvol_get (inode, this, data, metadata,
+ &event_generation);
if (ret == -1)
/* very first transaction on this inode */
goto refresh;
+ AFR_INTERSECT (local->readable, data, metadata, priv->child_count);
gf_msg_debug (this->name, 0, "%s: generation now vs cached: %d, "
"%d", uuid_utoa (inode->gfid), local->event_generation,