diff options
53 files changed, 8915 insertions, 19619 deletions
diff --git a/libglusterfs/src/gf-dirent.c b/libglusterfs/src/gf-dirent.c index bb028c9671d..0cda83a27c3 100644 --- a/libglusterfs/src/gf-dirent.c +++ b/libglusterfs/src/gf-dirent.c @@ -83,6 +83,8 @@ gf_link_inodes_from_dirent (xlator_t *this, inode_t *parent, if (entry->inode) { link_inode = inode_link (entry->inode, parent, entry->d_name, &entry->d_stat); + if (!link_inode) + continue; inode_lookup (link_inode); inode_unref (link_inode); } diff --git a/libglusterfs/src/glusterfs.h b/libglusterfs/src/glusterfs.h index 31c46b74efc..5ce0d6e70bb 100644 --- a/libglusterfs/src/glusterfs.h +++ b/libglusterfs/src/glusterfs.h @@ -132,7 +132,7 @@ /* Index xlator related */ #define GF_XATTROP_INDEX_GFID "glusterfs.xattrop_index_gfid" -#define GF_BASE_INDICES_HOLDER_GFID "glusterfs.base_indicies_holder_gfid" +#define GF_XATTROP_INDEX_COUNT "glusterfs.xattrop_index_count" #define GF_GFIDLESS_LOOKUP "gfidless-lookup" /* replace-brick and pump related internal xattrs */ diff --git a/libglusterfs/src/xlator.c b/libglusterfs/src/xlator.c index f3df8e2aeaf..1bded6d3d11 100644 --- a/libglusterfs/src/xlator.c +++ b/libglusterfs/src/xlator.c @@ -680,7 +680,9 @@ loc_copy_overload_parent (loc_t *dst, loc_t *src, inode_t *parent) dst->name = strrchr (dst->path, '/'); if (dst->name) dst->name++; - } + } else if (src->name) { + dst->name = src->name; + } ret = 0; out: @@ -718,7 +720,9 @@ loc_copy (loc_t *dst, loc_t *src) dst->name = strrchr (dst->path, '/'); if (dst->name) dst->name++; - } + } else if (src->name) { + dst->name = src->name; + } ret = 0; out: diff --git a/tests/basic/pump.t b/tests/basic/pump.t index 3faf06f0502..23bdc187d90 100644 --- a/tests/basic/pump.t +++ b/tests/basic/pump.t @@ -22,7 +22,7 @@ done cd TEST umount $M0 TEST $CLI volume replace-brick $V0 $H0:$B0/${V0}0 $H0:$B0/${V0}1 start -EXPECT_WITHIN 60 "Y" gd_is_replace_brick_completed $H0 $V0 $H0:$B0/${V0}0 $H0:$B0/${V0}1 +EXPECT_WITHIN 600 "Y" gd_is_replace_brick_completed $H0 $V0 $H0:$B0/${V0}0 $H0:$B0/${V0}1 TEST 
$CLI volume replace-brick $V0 $H0:$B0/${V0}0 $H0:$B0/${V0}1 commit TEST $CLI volume stop $V0 TEST diff -r --exclude=.glusterfs $B0/${V0}0 $B0/${V0}1 diff --git a/tests/bugs/859927/repl.t b/tests/bugs/859927/repl.t index 73c86e7be3c..856b057fbcb 100755 --- a/tests/bugs/859927/repl.t +++ b/tests/bugs/859927/repl.t @@ -33,20 +33,20 @@ TEST $CLI volume set $V0 cluster.data-self-heal-algorithm full EXPECT full volume_option $V0 cluster.data-self-heal-algorithm create_setup_for_self_heal $M0/a EXPECT_WITHIN 20 "1" afr_child_up_status $V0 0 -ls -l $file 2>&1 > /dev/null +cat $file 2>&1 > /dev/null TEST cmp $B0/${V0}1/a $B0/${V0}2/a TEST $CLI volume set $V0 cluster.data-self-heal-algorithm diff EXPECT diff volume_option $V0 cluster.data-self-heal-algorithm create_setup_for_self_heal $M0/a EXPECT_WITHIN 20 "1" afr_child_up_status $V0 0 -ls -l $file 2>&1 > /dev/null +cat $file 2>&1 > /dev/null TEST cmp $B0/${V0}1/a $B0/${V0}2/a TEST $CLI volume reset $V0 cluster.data-self-heal-algorithm create_setup_for_self_heal $M0/a EXPECT_WITHIN 20 "1" afr_child_up_status $V0 0 -ls -l $file 2>&1 > /dev/null +cat $file 2>&1 > /dev/null TEST cmp $B0/${V0}1/a $B0/${V0}2/a TEST ! 
$CLI volume set $V0 cluster.data-self-heal-algorithm "" diff --git a/tests/bugs/bug-1015990-rep.t b/tests/bugs/bug-1015990-rep.t index f59bb2f75ba..bca0d7aff07 100755 --- a/tests/bugs/bug-1015990-rep.t +++ b/tests/bugs/bug-1015990-rep.t @@ -35,7 +35,6 @@ for i in {1..100}; do echo "STRING" > $M0/File$i; done brick_2_sh_entries=$(count_sh_entries $B0/$V0"2") brick_4_sh_entries=$(count_sh_entries $B0/$V0"4") - command_output=$(gluster volume heal $V0 statistics heal-count replica $H0:$B0/$V0"1") diff --git a/tests/bugs/bug-1035576.t b/tests/bugs/bug-1035576.t index 52d93dd87df..938306a8503 100644 --- a/tests/bugs/bug-1035576.t +++ b/tests/bugs/bug-1035576.t @@ -34,7 +34,8 @@ quota_limit_val1=$(get_hex_xattr trusted.glusterfs.quota.limit-set $B0/${V0}1/a) quota_size_val1=$(get_hex_xattr trusted.glusterfs.quota.size $B0/${V0}1/a) #Trigger entry,metadata self-heal -TEST stat $M0/a +TEST ls $M0/a + quota_limit_val0=$(get_hex_xattr trusted.glusterfs.quota.limit-set $B0/${V0}0/a) quota_size_val0=$(get_hex_xattr trusted.glusterfs.quota.size $B0/${V0}0/a) @@ -43,7 +44,7 @@ TEST [ $quota_limit_val0 == $quota_limit_val1 ] #Only entry, metadata self-heal is done quota size value should not be same TEST [ $quota_size_val0 != $quota_size_val1 ] -TEST stat $M0/a/f +TEST cat $M0/a/f #Now that data self-heal is done quota size value should be same quota_size_val0=$(get_hex_xattr trusted.glusterfs.quota.size $B0/${V0}0/a) diff --git a/tests/bugs/bug-1037501.t b/tests/bugs/bug-1037501.t index d11c788a093..596122a727e 100755 --- a/tests/bugs/bug-1037501.t +++ b/tests/bugs/bug-1037501.t @@ -24,14 +24,6 @@ TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}-{0,1,2} EXPECT "$V0" volinfo_field $V0 'Volume Name'; EXPECT 'Created' volinfo_field $V0 'Status'; -## Make sure io-cache and write-behind don't interfere. -TEST $CLI volume set $V0 data-self-heal off; - -## Make sure automatic self-heal doesn't perturb our results. 
-TEST $CLI volume set $V0 cluster.self-heal-daemon off - -TEST $CLI volume set $V0 background-self-heal-count 0 - ## Start volume and verify TEST $CLI volume start $V0; EXPECT 'Started' volinfo_field $V0 'Status'; @@ -48,206 +40,38 @@ TEST $CLI volume add-brick $V0 replica 4 $H0:$B0/$V0-3 force TEST $CLI volume add-brick $V0 replica 5 $H0:$B0/$V0-4 force TEST $CLI volume add-brick $V0 replica 6 $H0:$B0/$V0-5 force -sleep 10 - -TEST ls $M0/ - - -function compare() -{ - var=-1; - if [ $1 == $2 ]; then - var=0; - else - var=-1; - fi - - echo $var -} - -var2="000000000000000000000000" - -var1=`getfattr -d -m . $B0/$V0-0/File -e hex 2>&1 | grep "client-3"` -EXPECT "0" echo $? -var3=`echo $var1 | cut -d x -f 2` -EXPECT_NOT $var2 echo $var3 - -var1=`getfattr -d -m . $B0/$V0-0/File -e hex 2>&1 | grep "client-4"` -EXPECT "0" echo $? -var3=`echo $var1 | cut -d x -f 2` -EXPECT_NOT $var2 echo $var3 - -var1=`getfattr -d -m . $B0/$V0-0/File -e hex 2>&1 | grep "client-5"` -EXPECT "0" echo $? -var3=`echo $var1 | cut -d x -f 2` -EXPECT_NOT $var2 echo $var3 - -var1=`getfattr -d -m . $B0/$V0-1/File -e hex 2>&1 | grep "client-3"` -EXPECT "0" echo $? -var3=`echo $var1| cut -d x -f 2` -EXPECT_NOT $var2 echo $var3 - -var1=`getfattr -d -m . $B0/$V0-1/File -e hex 2>&1 | grep "client-4"` -EXPECT "0" echo $? -var3=`echo $var1 | cut -d x -f 2` -EXPECT_NOT $var2 echo $var3 - -var1=`getfattr -d -m . $B0/$V0-1/File -e hex 2>&1 | grep "client-5"` -EXPECT "0" echo $? -var3=`echo $var1 | cut -d x -f 2` -EXPECT_NOT $var2 echo $var3 - -var1=`getfattr -d -m . $B0/$V0-2/File -e hex 2>&1 | grep "client-3"` -EXPECT "0" echo $? -var3=`echo $var1 | cut -d x -f 2` -EXPECT_NOT $var2 echo $var3 - -var1=`getfattr -d -m . $B0/$V0-2/File -e hex 2>&1 | grep "client-4"` -EXPECT "0" echo $? -var3=`echo $var1 | cut -d x -f 2` -EXPECT_NOT $var2 echo $var3 - -var1=`getfattr -d -m . $B0/$V0-2/File -e hex 2>&1 | grep "client-5"` -EXPECT "0" echo $? 
-var3=`echo $var1 | cut -d x -f 2` -EXPECT_NOT $var2 echo $var3 - -var1=`getfattr -d -m . $B0/$V0-0/Dir -e hex 2>&1 | grep "client-3"` -EXPECT "0" echo $? -var3=`echo $var1 | cut -d x -f 2` -EXPECT_NOT $var2 echo $var3 - -var1=`getfattr -d -m . $B0/$V0-0/Dir -e hex 2>&1 | grep "client-4"` -EXPECT "0" echo $? -var3=`echo $var1 | cut -d x -f 2` -EXPECT_NOT $var2 echo $var3 - -var1=`getfattr -d -m . $B0/$V0-0/Dir -e hex 2>&1 | grep "client-5"` -EXPECT "0" echo $? -var3=`echo $var1 | cut -d x -f 2` -EXPECT_NOT $var2 echo $var3 - -var1=`getfattr -d -m . $B0/$V0-1/Dir -e hex 2>&1 | grep "client-3"` -EXPECT "0" echo $? -var3=`echo $var1 | cut -d x -f 2` -EXPECT_NOT $var2 echo $var3 - -var1=`getfattr -d -m . $B0/$V0-1/Dir -e hex 2>&1 | grep "client-4"` -EXPECT "0" echo $? -var3=`echo $var1 | cut -d x -f 2` -EXPECT_NOT $var2 echo $var3 - -var1=`getfattr -d -m . $B0/$V0-1/Dir -e hex 2>&1 | grep "client-5"` -EXPECT "0" echo $? -var3=`echo $var1 | cut -d x -f 2` -EXPECT_NOT $var2 echo $var3 - -var1=`getfattr -d -m . $B0/$V0-2/Dir -e hex 2>&1 | grep "client-3"` -EXPECT "0" echo $? -var3=`echo $var1 | cut -d x -f 2` -EXPECT_NOT $var2 echo $var3 - -var1=`getfattr -d -m . $B0/$V0-2/Dir -e hex 2>&1 | grep "client-4"` -EXPECT "0" echo $? -var3=`echo $var1 | cut -d x -f 2` -EXPECT_NOT $var2 echo $var3 - -var1=`getfattr -d -m . $B0/$V0-2/Dir -e hex 2>&1 | grep "client-5"` -EXPECT "0" echo $? -var3=`echo $var1 | cut -d x -f 2` -EXPECT_NOT $var2 echo $var3 - - -var1=`getfattr -d -m . $B0/$V0-0/Link -e hex 2>&1 | grep "client-3"` -EXPECT "0" echo $? -var3=`echo $var1 | cut -d x -f 2` -EXPECT_NOT $var2 echo $var3 - -var1=`getfattr -d -m . $B0/$V0-0/Link -e hex 2>&1 | grep "client-4"` -EXPECT "0" echo $? -var3=`echo $var1 | cut -d x -f 2` -EXPECT_NOT $var2 echo $var3 - -var1=`getfattr -d -m . $B0/$V0-0/Link -e hex 2>&1 | grep "client-5"` -EXPECT "0" echo $? -var3=`echo $var1 | cut -d x -f 2` -EXPECT_NOT $var2 echo $var3 - -var1=`getfattr -d -m . 
$B0/$V0-1/Link -e hex 2>&1 | grep "client-3"` -EXPECT "0" echo $? -var3=`echo $var1 | cut -d x -f 2` -EXPECT_NOT $var2 echo $var3 - -var1=`getfattr -d -m . $B0/$V0-1/Link -e hex 2>&1 | grep "client-4"` -EXPECT "0" echo $? -var3=`echo $var1 | cut -d x -f 2` -EXPECT_NOT $var2 echo $var3 - -var1=`getfattr -d -m . $B0/$V0-1/Link -e hex 2>&1 | grep "client-5"` -EXPECT "0" echo $? -var3=`echo $var1 | cut -d x -f 2` -EXPECT_NOT $var2 echo $var3 - -var1=`getfattr -d -m . $B0/$V0-2/Link -e hex 2>&1 | grep "client-3"` -EXPECT "0" echo $? -var3=`echo $var1 | cut -d x -f 2` -EXPECT_NOT $var2 echo $var3 - -var1=`getfattr -d -m . $B0/$V0-2/Link -e hex 2>&1 | grep "client-4"` -EXPECT "0" echo $? -var3=`echo $var1 | cut -d x -f 2` -EXPECT_NOT $var2 echo $var3 - -var1=`getfattr -d -m . $B0/$V0-2/Link -e hex 2>&1 | grep "client-5"` -EXPECT "0" echo $? -var3=`echo $var1 | cut -d x -f 2` -EXPECT_NOT $var2 echo $var3 - - - -var1=`getfattr -d -m . $B0/$V0-0/FIFO -e hex 2>&1 | grep "client-3"` -EXPECT "0" echo $? -var3=`echo $var1 | cut -d x -f 2` -EXPECT_NOT $var2 echo $var3 - -var1=`getfattr -d -m . $B0/$V0-0/FIFO -e hex 2>&1 | grep "client-4"` -EXPECT "0" echo $? -var3=`echo $var1 | cut -d x -f 2` -EXPECT_NOT $var2 echo $var3 - -var1=`getfattr -d -m . $B0/$V0-0/FIFO -e hex 2>&1 | grep "client-5"` -EXPECT "0" echo $? -var3=`echo $var1 | cut -d x -f 2` -EXPECT_NOT $var2 echo $var3 - -var1=`getfattr -d -m . $B0/$V0-1/FIFO -e hex 2>&1 | grep "client-3"` -EXPECT "0" echo $? -var3=`echo $var1 | cut -d x -f 2` -EXPECT_NOT $var2 echo $var3 - -var1=`getfattr -d -m . $B0/$V0-1/FIFO -e hex 2>&1 | grep "client-4"` -EXPECT "0" echo $? -var3=`echo $var1 | cut -d x -f 2` -EXPECT_NOT $var2 echo $var3 - -var1=`getfattr -d -m . $B0/$V0-1/FIFO -e hex 2>&1 | grep "client-5"` -EXPECT "0" echo $? -var3=`echo $var1 | cut -d x -f 2` -EXPECT_NOT $var2 echo $var3 - -var1=`getfattr -d -m . $B0/$V0-2/FIFO -e hex 2>&1 | grep "client-3"` -EXPECT "0" echo $? 
-var3=`echo $var1 | cut -d x -f 2` -EXPECT_NOT $var2 echo $var3 - -var1=`getfattr -d -m . $B0/$V0-2/FIFO -e hex 2>&1 | grep "client-4"` -EXPECT "0" echo $? -var3=`echo $var1 | cut -d x -f 2` -EXPECT_NOT $var2 echo $var3 - -var1=`getfattr -d -m . $B0/$V0-2/FIFO -e hex 2>&1 | grep "client-5"` -EXPECT "0" echo $? -var3=`echo $var1 | cut -d x -f 2` -EXPECT_NOT $var2 echo $var3 +sleep 5 + +TEST gluster volume heal $V0 full + +sleep 5 + +EXPECT 10 stat -c '%s' $B0/$V0-0/File +EXPECT 10 stat -c '%s' $B0/$V0-1/File +EXPECT 10 stat -c '%s' $B0/$V0-2/File +EXPECT 10 stat -c '%s' $B0/$V0-3/File +EXPECT 10 stat -c '%s' $B0/$V0-4/File +EXPECT 10 stat -c '%s' $B0/$V0-5/File + +EXPECT 3 stat -c '%h' $B0/$V0-0/Link +EXPECT 3 stat -c '%h' $B0/$V0-1/Link +EXPECT 3 stat -c '%h' $B0/$V0-2/Link +EXPECT 3 stat -c '%h' $B0/$V0-3/Link +EXPECT 3 stat -c '%h' $B0/$V0-4/Link +EXPECT 3 stat -c '%h' $B0/$V0-5/Link + +EXPECT 'directory' stat -c '%F' $B0/$V0-0/Dir +EXPECT 'directory' stat -c '%F' $B0/$V0-1/Dir +EXPECT 'directory' stat -c '%F' $B0/$V0-2/Dir +EXPECT 'directory' stat -c '%F' $B0/$V0-3/Dir +EXPECT 'directory' stat -c '%F' $B0/$V0-4/Dir +EXPECT 'directory' stat -c '%F' $B0/$V0-5/Dir + +EXPECT 'fifo' stat -c '%F' $B0/$V0-0/FIFO +EXPECT 'fifo' stat -c '%F' $B0/$V0-1/FIFO +EXPECT 'fifo' stat -c '%F' $B0/$V0-2/FIFO +EXPECT 'fifo' stat -c '%F' $B0/$V0-3/FIFO +EXPECT 'fifo' stat -c '%F' $B0/$V0-4/FIFO +EXPECT 'fifo' stat -c '%F' $B0/$V0-5/FIFO cleanup; diff --git a/tests/bugs/bug-1058797.t b/tests/bugs/bug-1058797.t index 2b80794cf06..1e9f09af0a8 100644 --- a/tests/bugs/bug-1058797.t +++ b/tests/bugs/bug-1058797.t @@ -29,7 +29,7 @@ EXPECT "s" echo $setuid_bit1 #Restart volume and do lookup from mount to trigger heal TEST $CLI volume start $V0 force EXPECT_WITHIN 20 "1" afr_child_up_status $V0 1 -TEST ls -l $M0/file +TEST dd if=$M0/file of=/dev/null #Get file permissions from healed brick1 and verify that S_ISUID is indeed set file_permissions2=`ls -l $B0/brick1/file | awk '{print $1}' | 
cut -d. -f1 | cut -d- -f2,3,4,5,6` diff --git a/tests/bugs/bug-767585-gfid.t b/tests/bugs/bug-767585-gfid.t index 49cf7423fa0..41043a0b247 100755 --- a/tests/bugs/bug-767585-gfid.t +++ b/tests/bugs/bug-767585-gfid.t @@ -26,10 +26,9 @@ TEST setfattr -n trusted.gfid -v $gfid2 $B0/${V0}1/c sleep 2 -cd $M0 -TEST ls -l a -TEST ls -l b -TEST ls -l c +TEST stat $M0/a +TEST stat $M0/b +TEST stat $M0/c TEST gf_get_gfid_xattr $B0/${V0}0/a TEST gf_get_gfid_xattr $B0/${V0}1/a diff --git a/tests/bugs/bug-802417.t b/tests/bugs/bug-802417.t index 314141f6b6d..b596df30385 100755 --- a/tests/bugs/bug-802417.t +++ b/tests/bugs/bug-802417.t @@ -55,7 +55,7 @@ EXPECT_WITHIN 20 "1" afr_child_up_status $V0 0 EXPECT_WITHIN 20 "1" afr_child_up_status $V0 1 EXPECT_WITHIN 20 "1" afr_child_up_status $V0 2 TEST kill_brick ${V0} ${H0} ${B0}/${V0}-2 -TEST ls -l ${M0}/a_file +TEST dd if=${M0}/a_file of=/dev/null obs_path_0=${B0}/${V0}-0/a_file @@ -67,31 +67,31 @@ tgt_xattr_1="trusted.afr.${V0}-client-1" tgt_xattr_2="trusted.afr.${V0}-client-2" actual=$(afr_get_changelog_xattr $obs_path_0 $tgt_xattr_0) -EXPECT "0x000000000000000000000000" echo $actual +EXPECT "0x000000000000000000000000|^\$" echo $actual actual=$(afr_get_changelog_xattr $obs_path_0 $tgt_xattr_1) -EXPECT "0x000000000000000000000000" echo $actual +EXPECT "0x000000000000000000000000|^\$" echo $actual actual=$(afr_get_changelog_xattr $obs_path_0 $tgt_xattr_2) -EXPECT "0x000000020000000000000000" echo $actual +EXPECT "0x000000030000000000000000" echo $actual actual=$(afr_get_changelog_xattr $obs_path_1 $tgt_xattr_0) -EXPECT "0x000000000000000000000000" echo $actual +EXPECT "0x000000000000000000000000|^\$" echo $actual actual=$(afr_get_changelog_xattr $obs_path_1 $tgt_xattr_1) -EXPECT "0x000000000000000000000000" echo $actual +EXPECT "0x000000000000000000000000|^\$" echo $actual actual=$(afr_get_changelog_xattr $obs_path_1 $tgt_xattr_2) -EXPECT "0x000000020000000000000000" echo $actual +EXPECT "0x000000010000000000000000" echo $actual 
actual=$(afr_get_changelog_xattr $obs_path_2 $tgt_xattr_0) -EXPECT "0x000000000000000000000000" echo $actual +EXPECT "0x000000000000000000000000|^\$" echo $actual actual=$(afr_get_changelog_xattr $obs_path_2 $tgt_xattr_1) -EXPECT "0x000000000000000000000000" echo $actual +EXPECT "0x000000000000000000000000|^\$" echo $actual actual=$(afr_get_changelog_xattr $obs_path_2 $tgt_xattr_2) -EXPECT "0x000000000000000000000000" echo $actual +EXPECT "0x000000000000000000000000|^\$" echo $actual if [ "$EXIT_EARLY" = "1" ]; then exit 0; diff --git a/tests/bugs/bug-830665.t b/tests/bugs/bug-830665.t index 0073ff1d948..dd6f3ce2f12 100755 --- a/tests/bugs/bug-830665.t +++ b/tests/bugs/bug-830665.t @@ -81,15 +81,17 @@ ls -l $N0 &> /dev/null; sleep 5; ## Force entry self-heal. -find $N0 | xargs stat > /dev/null; +TEST $CLI volume set $V0 cluster.self-heal-daemon on +sleep 1 +TEST gluster volume heal $V0 full #ls -lR $N0 > /dev/null; ## Do NOT check through the NFS mount here. That will force a new self-heal ## check, but we want to test whether self-heal already happened. ## Make sure everything's in order on the recreated brick. -EXPECT 'test_data' cat $B0/${V0}-0/a_file; -EXPECT 'more_test_data' cat $B0/${V0}-0/a_dir/another_file; +EXPECT_WITHIN 20 'test_data' cat $B0/${V0}-0/a_file; +EXPECT_WITHIN 20 'more_test_data' cat $B0/${V0}-0/a_dir/another_file; if [ "$EXIT_EARLY" = "1" ]; then exit 0; diff --git a/tests/bugs/bug-853690.t b/tests/bugs/bug-853690.t index 77a581f5444..c2f82d1030a 100755 --- a/tests/bugs/bug-853690.t +++ b/tests/bugs/bug-853690.t @@ -66,7 +66,6 @@ TEST glusterfs --volfile=$B0/test.vol --attribute-timeout=0 --entry-timeout=0 $M # file sizes and immediate split-brain (EIO). TEST dd if=/dev/zero of=$M0/file bs=128k count=1 TEST dd if=$M0/file of=/dev/null bs=128k count=1 - ######## # # Test self-heal with short writes... @@ -76,14 +75,11 @@ TEST dd if=$M0/file of=/dev/null bs=128k count=1 # Cause a lookup and wait a few seconds for posterity. 
This self-heal also fails # due to a short write. TEST ls $M0/file - # Verify the attributes on the healthy replica do not reflect consistency with # the other replica. -TEST "getfattr -n trusted.afr.test-locks-0 $B0/test2/file --only-values > $B0/out1 2> /dev/null" -TEST "getfattr -n trusted.afr.test-locks-1 $B0/test2/file --only-values > $B0/out2 2> /dev/null" -TEST ! cmp $B0/out1 $B0/out2 +xa=`getfattr -n trusted.afr.test-locks-0 -e hex $B0/test2/file 2>&1 | grep = | cut -f2 -d=` +EXPECT_NOT 0x000000000000000000000000 echo $xa -TEST rm -f $B0/out1 $B0/out2 TEST rm -f $M0/file TEST umount $M0 diff --git a/tests/bugs/bug-865825.t b/tests/bugs/bug-865825.t index 6bb1c23482d..8ee75186484 100755 --- a/tests/bugs/bug-865825.t +++ b/tests/bugs/bug-865825.t @@ -2,6 +2,8 @@ . $(dirname $0)/../include.rc +cleanup; + TEST glusterd TEST pidof glusterd TEST $CLI volume info; @@ -28,6 +30,7 @@ EXPECT 'Created' volinfo_field $V0 'Status'; ## Make sure io-cache and write-behind don't interfere. TEST $CLI volume set $V0 cluster.background-self-heal-count 0 TEST $CLI volume set $V0 performance.io-cache off; +TEST $CLI volume set $V0 performance.quick-read off; TEST $CLI volume set $V0 performance.write-behind off; TEST $CLI volume set $V0 performance.stat-prefetch off @@ -54,19 +57,18 @@ setfattr -n trusted.afr.${V0}-client-2 -v $value $B0/${V0}-0/a_file setfattr -x trusted.afr.${V0}-client-2 $B0/${V0}-1/a_file echo "wrong_data" > $B0/${V0}-2/a_file -## Remount and force a self-heal. -TEST glusterfs --volfile-server=$H0 --volfile-id=$V0 $M0 -stat ${M0}/a_file > /dev/null +gluster volume set $V0 cluster.self-heal-daemon on +sleep 3 +gluster volume heal $V0 full ## Make sure brick 2 now has the correct contents. 
-EXPECT "test_data" cat $B0/${V0}-2/a_file +EXPECT_WITHIN 30 "test_data" cat $B0/${V0}-2/a_file if [ "$EXIT_EARLY" = "1" ]; then exit 0; fi ## Finish up -TEST umount $M0; TEST $CLI volume stop $V0; EXPECT 'Stopped' volinfo_field $V0 'Status'; diff --git a/tests/bugs/bug-873962.t b/tests/bugs/bug-873962.t index b245cc3dab5..0281417f07f 100755 --- a/tests/bugs/bug-873962.t +++ b/tests/bugs/bug-873962.t @@ -61,11 +61,12 @@ EXPECT_WITHIN 20 "1" afr_child_up_status $V0 0 EXPECT_WITHIN 20 "1" afr_child_up_status $V0 1 TEST glusterfs --entry-timeout=0 --attribute-timeout=0 -s $H0 --volfile-id=$V0 $M1 --direct-io-mode=enable + #Files are in split-brain, so open should fail TEST ! cat $M0/a; TEST ! cat $M1/a; -TEST ! cat $M0/b; -TEST ! cat $M1/b; +TEST cat $M0/b; +TEST cat $M1/b; #Reset split-brain status TEST setfattr -n trusted.afr.$V0-client-1 -v 0x000000000000000000000000 $B0/${V0}1/a; @@ -75,6 +76,7 @@ TEST setfattr -n trusted.afr.$V0-client-1 -v 0x000000000000000000000000 $B0/${V0 EXPECT "2" cat $M0/a; # FAIL HERE - see comment about cluster.self-heal-background-count above. EXPECT "2" cat $M1/a; +TEST dd if=$M0/b of=/dev/null bs=1M EXPECT "def" getfattr -n trusted.mdata --only-values $M0/b 2>/dev/null EXPECT "def" getfattr -n trusted.mdata --only-values $M1/b 2>/dev/null @@ -90,8 +92,8 @@ TEST glusterfs --entry-timeout=0 --attribute-timeout=0 -s $H0 --volfile-id=$V0 $ #Files are in split-brain, so open should fail TEST ! cat $M0/c TEST ! cat $M1/c -TEST ! cat $M0/d -TEST ! cat $M1/d +TEST cat $M0/d +TEST cat $M1/d TEST setfattr -n trusted.afr.$V0-client-1 -v 0x000000000000000000000000 $B0/${V0}1/c TEST setfattr -n trusted.afr.$V0-client-1 -v 0x000000000000000000000000 $B0/${V0}1/d @@ -102,7 +104,4 @@ EXPECT "2" cat $M1/c EXPECT "1" cat $M0/d EXPECT "1" cat $M1/d -#Check that the self-heal is not triggered. 
-EXPECT "1" cat $B0/${V0}1/c -EXPECT "abc" getfattr -n trusted.mdata --only-values $B0/${V0}1/d 2>/dev/null cleanup; diff --git a/tests/bugs/bug-888174.t b/tests/bugs/bug-888174.t index 4ea34645bd6..ef653f76da2 100644 --- a/tests/bugs/bug-888174.t +++ b/tests/bugs/bug-888174.t @@ -38,10 +38,9 @@ TEST [ -z $inodelk_max_latency ] TEST dd of=$M0/a if=/dev/urandom bs=1M count=10 conv=fsync #Check for no trace of pending changelog. Flush should make sure of it. -EXPECT "0x000000000000000000000000" afr_get_changelog_xattr $B0/r2_0/a trusted.afr.$V0-client-0 -EXPECT "0x000000000000000000000000" afr_get_changelog_xattr $B0/r2_0/a trusted.afr.$V0-client-1 -EXPECT "0x000000000000000000000000" afr_get_changelog_xattr $B0/r2_1/a trusted.afr.$V0-client-0 -EXPECT "0x000000000000000000000000" afr_get_changelog_xattr $B0/r2_1/a trusted.afr.$V0-client-1 +EXPECT "0x000000000000000000000000" afr_get_changelog_xattr $B0/r2_0/a trusted.afr.dirty +EXPECT "0x000000000000000000000000" afr_get_changelog_xattr $B0/r2_1/a trusted.afr.dirty + dd of=$M0/a if=/dev/urandom bs=1M count=1024 2>/dev/null & p=$! 
@@ -51,15 +50,13 @@ TEST $CLI volume set $V0 performance.io-cache off TEST $CLI volume set $V0 performance.stat-prefetch off TEST $CLI volume set $V0 performance.read-ahead off -kill -SIGTERM $p +kill -TERM $p #wait for dd to exit wait > /dev/null 2>&1 #Goal is to check if there is permanent FOOL changelog sleep 5 -EXPECT "0x000000000000000000000000" afr_get_changelog_xattr $B0/r2_0/a trusted.afr.$V0-client-0 -EXPECT "0x000000000000000000000000" afr_get_changelog_xattr $B0/r2_0/a trusted.afr.$V0-client-1 -EXPECT "0x000000000000000000000000" afr_get_changelog_xattr $B0/r2_1/a trusted.afr.$V0-client-0 -EXPECT "0x000000000000000000000000" afr_get_changelog_xattr $B0/r2_1/a trusted.afr.$V0-client-1 +EXPECT "0x000000000000000000000000" afr_get_changelog_xattr $B0/r2_0/a trusted.afr.dirty +EXPECT "0x000000000000000000000000" afr_get_changelog_xattr $B0/r2_1/a trusted.afr.dirty cleanup; diff --git a/tests/bugs/bug-906646.t b/tests/bugs/bug-906646.t index 0e6a3bcb695..b2cbf6bc32e 100644 --- a/tests/bugs/bug-906646.t +++ b/tests/bugs/bug-906646.t @@ -84,7 +84,7 @@ TEST $CLI volume start $V0 force EXPECT_WITHIN 20 "1" afr_child_up_status $V0 `expr $brick_id - 1` -stat $pth +cat $pth >/dev/null # check backends - xattr should not be present anywhere EXPECT 1 xattr_query_check ${backend_paths_array[0]} "trusted.name" diff --git a/tests/bugs/bug-913051.t b/tests/bugs/bug-913051.t index 69e90cf66c2..9a59424f412 100644 --- a/tests/bugs/bug-913051.t +++ b/tests/bugs/bug-913051.t @@ -48,8 +48,8 @@ EXPECT "N" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}0 $B0/${V0}0/dir/b #attempt self-heal so that the files are created on brick-0 -TEST ls -l $M0/dir/a -TEST ls -l $M0/dir/b +TEST dd if=$M0/dir/a of=/dev/null bs=1M +TEST dd if=$M0/dir/b of=/dev/null bs=1M #trigger writev for attempting open-fd-fix in afr TEST fd_write $wfd "open sesame" diff --git a/tests/bugs/bug-913544.t b/tests/bugs/bug-913544.t index 790bc08980f..db28ca814ce 100644 --- a/tests/bugs/bug-913544.t +++ 
b/tests/bugs/bug-913544.t @@ -17,7 +17,7 @@ TEST touch a #simulate no-changelog data split-brain echo "abc" > $B0/${V0}1/a echo "abcd" > $B0/${V0}0/a -TEST ! truncate -s 0 a +TEST truncate -s 0 a TEST ls cd diff --git a/tests/bugs/bug-918437-sh-mtime.t b/tests/bugs/bug-918437-sh-mtime.t index 080956f519a..11155ad1629 100644 --- a/tests/bugs/bug-918437-sh-mtime.t +++ b/tests/bugs/bug-918437-sh-mtime.t @@ -38,7 +38,12 @@ TEST $CLI volume start $V0 force EXPECT_WITHIN 20 "1" afr_child_up_status $V0 0 EXPECT_WITHIN 20 "1" afr_child_up_status $V0 1 -find $M0 | xargs stat 1>/dev/null +TEST $CLI volume set $V0 cluster.self-heal-daemon on +sleep 1 +TEST gluster volume heal $V0 full + +size=`stat -c '%s' /etc/passwd` +EXPECT_WITHIN 60 $size stat -c '%s' $B0/gfs0/brick01/a TEST modify_atstamp1=$(get_mtime $B0/gfs0/brick01/a) TEST modify_atstamp2=$(get_mtime $B0/gfs0/brick02/a) diff --git a/tests/bugs/bug-977797.t b/tests/bugs/bug-977797.t index 08cdbe8f119..f2252159a21 100755 --- a/tests/bugs/bug-977797.t +++ b/tests/bugs/bug-977797.t @@ -54,7 +54,7 @@ TEST chmod 757 $M0/a/file TEST $CLI volume start $V0 force EXPECT_WITHIN 20 "1" afr_child_up_status $V0 1; -TEST ls -l $M0/a/file +TEST dd if=$M0/a/file of=/dev/null bs=1M b1c0dir=$(afr_get_specific_changelog_xattr $B0/$V0"1"/a \ trusted.afr.$V0-client-0 "entry") @@ -75,34 +75,15 @@ b2c0f=$(afr_get_specific_changelog_xattr $B0/$V0"2"/a/file \ b2c1f=$(afr_get_specific_changelog_xattr $B0/$V0"2"/a/file \ trusted.afr.$V0-client-1 "data") -EXPECT "00000000" echo $b1c0f -EXPECT "00000000" echo $b1c1f -EXPECT "00000000" echo $b2c0f -EXPECT "00000000" echo $b2c1f - -EXPECT "00000000" echo $b1c0dir -EXPECT "00000000" echo $b1c1dir -EXPECT "00000000" echo $b2c0dir -EXPECT "00000000" echo $b2c1dir - -contains() { - string="$1" - substring="$2" - var="-1" - if test "${string#*$substring}" != "$string" - then - var="0" # $substring is in $string - else - var="1" # $substring is not in $string - fi - echo $var -} - -var1=$(cat $M0/a/file 
2>&1) -var2="Input/output error" - - -EXPECT "0" contains "$var1" "$var2" +EXPECT "00000000|^$" echo $b1c0f +EXPECT "00000000|^$" echo $b1c1f +EXPECT "00000000|^$" echo $b2c0f +EXPECT "00000000|^$" echo $b2c1f + +EXPECT "00000000|^$" echo $b1c0dir +EXPECT "00000000|^$" echo $b1c1dir +EXPECT "00000000|^$" echo $b2c0dir +EXPECT "00000000|^$" echo $b2c1dir ## Finish up TEST $CLI volume stop $V0; diff --git a/tests/volume.rc b/tests/volume.rc index 5e2f95e766d..9a06687cd86 100644 --- a/tests/volume.rc +++ b/tests/volume.rc @@ -169,7 +169,7 @@ function check_option_help_presence { function afr_get_changelog_xattr { local file=$1 local xkey=$2 - getfattr -n $xkey -e hex $file 2>/dev/null | grep "client-" | cut -f2 -d'=' + getfattr -n $xkey -e hex $file 2>/dev/null | grep "$xkey" | cut -f2 -d'=' } function afr_get_pending_heal_count { diff --git a/xlators/cluster/afr/src/Makefile.am b/xlators/cluster/afr/src/Makefile.am index 35d18a6c0da..ea5a90abbdb 100644 --- a/xlators/cluster/afr/src/Makefile.am +++ b/xlators/cluster/afr/src/Makefile.am @@ -2,24 +2,26 @@ xlator_LTLIBRARIES = afr.la pump.la xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster afr_common_source = afr-dir-read.c afr-dir-write.c afr-inode-read.c \ - afr-inode-write.c afr-open.c afr-transaction.c afr-self-heal-data.c \ - afr-self-heal-common.c afr-self-heal-metadata.c afr-self-heal-entry.c \ - afr-self-heal-algorithm.c afr-lk-common.c afr-self-heald.c \ + afr-inode-write.c afr-open.c afr-transaction.c afr-lk-common.c \ + afr-read-txn.c \ $(top_builddir)/xlators/lib/src/libxlator.c +AFR_SELFHEAL_SOURCES = afr-self-heal-common.c afr-self-heal-data.c \ + afr-self-heal-entry.c afr-self-heal-metadata.c afr-self-heald.c \ + afr-self-heal-name.c + afr_la_LDFLAGS = -module -avoid-version -afr_la_SOURCES = $(afr_common_source) afr.c +afr_la_SOURCES = $(afr_common_source) $(AFR_SELFHEAL_SOURCES) afr.c afr_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la pump_la_LDFLAGS = -module 
-avoid-version -pump_la_SOURCES = $(afr_common_source) pump.c +pump_la_SOURCES = $(afr_common_source) $(AFR_SELFHEAL_SOURCES) pump.c pump_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la noinst_HEADERS = afr.h afr-transaction.h afr-inode-write.h afr-inode-read.h \ - afr-dir-read.h afr-dir-write.h afr-self-heal.h afr-self-heal-common.h \ - afr-self-heal-algorithm.h pump.h afr-mem-types.h afr-common.c \ - afr-self-heald.h $(top_builddir)/xlators/lib/src/libxlator.h \ - $(top_builddir)/glusterfsd/src/glusterfsd.h + afr-dir-read.h afr-dir-write.h afr-self-heal.h afr-mem-types.h \ + afr-common.c afr-self-heald.h pump.h \ + $(top_builddir)/xlators/lib/src/libxlator.h AM_CPPFLAGS = $(GF_CPPFLAGS) \ -I$(top_srcdir)/libglusterfs/src -I$(top_srcdir)/xlators/lib/src \ @@ -31,7 +33,6 @@ CLEANFILES = uninstall-local: rm -f $(DESTDIR)$(xlatordir)/replicate.so - rm -f $(DESTDIR)$(xlatordir)/pump.so install-data-hook: ln -sf afr.so $(DESTDIR)$(xlatordir)/replicate.so diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index 224d3054626..2bab0f8533d 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -45,787 +45,797 @@ #include "afr-dir-write.h" #include "afr-transaction.h" #include "afr-self-heal.h" -#include "afr-self-heal-common.h" #include "afr-self-heald.h" -#include "pump.h" -#define AFR_ICTX_OPENDIR_DONE_MASK 0x0000000100000000ULL -#define AFR_ICTX_READ_CHILD_MASK 0x00000000FFFFFFFFULL -#define AFR_STATISTICS_HISTORY_SIZE 50 -int -afr_lookup_done_success_action (call_frame_t *frame, xlator_t *this, - gf_boolean_t fail_conflict); -void -afr_children_copy (int32_t *dst, int32_t *src, unsigned int child_count) -{ - int i = 0; - for (i = 0; i < child_count; i++) - dst[i] = src[i]; -} - -void -afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req, const char *path) +call_frame_t * +afr_copy_frame (call_frame_t *base) { - int i = 0; - afr_private_t *priv = NULL; - int ret = 0; + 
afr_local_t *local = NULL; + call_frame_t *frame = NULL; + int op_errno = 0; - priv = this->private; + frame = copy_frame (base); + if (!frame) + return NULL; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) { + AFR_STACK_DESTROY (frame); + return NULL; + } - for (i = 0; i < priv->child_count; i++) { - ret = dict_set_uint64 (xattr_req, priv->pending_key[i], - 3 * sizeof(int32_t)); - if (ret < 0) - gf_log (this->name, GF_LOG_WARNING, - "%s: Unable to set dict value for %s", - path, priv->pending_key[i]); - /* 3 = data+metadata+entry */ - } - ret = dict_set_int32 (xattr_req, GF_GFIDLESS_LOOKUP, 1); - if (ret) { - gf_log (this->name, GF_LOG_DEBUG, "%s: failed to set gfidless " - "lookup", path); - } + return frame; } +/* + * INODE CTX 64-bit VALUE FORMAT FOR SMALL (<= 16) SUBVOL COUNTS: + * + * |<---------- 64bit ------------>| + * 63 32 31 16 15 0 + * | EVENT_GEN | DATA | METADATA | + * + * + * METADATA (bit-0 .. bit-15): bitmap representing subvolumes from which + * metadata can be attempted to be read. + * + * bit-0 => priv->subvolumes[0] + * bit-1 => priv->subvolumes[1] + * ... etc. till bit-15 + * + * DATA (bit-16 .. bit-31): bitmap representing subvolumes from which data + * can be attempted to be read. + * + * bit-16 => priv->subvolumes[0] + * bit-17 => priv->subvolumes[1] + * ... etc. till bit-31 + * + * EVENT_GEN (bit-32 .. bit-63): event generation (i.e priv->event_generation) + * when DATA and METADATA was last updated. + * + * If EVENT_GEN is < priv->event_generation, + * or is 0, it means afr_inode_refresh() needs + * to be called to recalculate the bitmaps. 
+ */ + int -afr_lookup_xattr_req_prepare (afr_local_t *local, xlator_t *this, - dict_t *xattr_req, loc_t *loc, void **gfid_req) +__afr_inode_read_subvol_get_small (inode_t *inode, xlator_t *this, + unsigned char *data, unsigned char *metadata, + int *event_p) { - int ret = -ENOMEM; + afr_private_t *priv = NULL; + int ret = -1; + uint16_t datamap = 0; + uint16_t metadatamap = 0; + uint32_t event = 0; + uint64_t val = 0; + int i = 0; - GF_ASSERT (gfid_req); + priv = this->private; - *gfid_req = NULL; - local->xattr_req = dict_new (); - if (!local->xattr_req) - goto out; - if (xattr_req) - dict_copy (xattr_req, local->xattr_req); + ret = __inode_ctx_get (inode, this, &val); + if (ret < 0) + return ret; - afr_xattr_req_prepare (this, local->xattr_req, loc->path); - ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_INODELK_COUNT, 0); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "%s: Unable to set dict value for %s", - loc->path, GLUSTERFS_INODELK_COUNT); - } - ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_ENTRYLK_COUNT, 0); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "%s: Unable to set dict value for %s", - loc->path, GLUSTERFS_ENTRYLK_COUNT); - } + metadatamap = (val & 0x000000000000ffff); + datamap = (val & 0x00000000ffff0000) >> 16; + event = (val & 0xffffffff00000000) >> 32; - ret = dict_set_uint32 (local->xattr_req, GLUSTERFS_PARENT_ENTRYLK, 0); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "%s: Unable to set dict value for %s", - loc->path, GLUSTERFS_PARENT_ENTRYLK); - } + for (i = 0; i < priv->child_count; i++) { + if (metadata) + metadata[i] = (metadatamap >> i) & 1; + if (data) + data[i] = (datamap >> i) & 1; + } - ret = dict_get_ptr (local->xattr_req, "gfid-req", gfid_req); - if (ret) { - gf_log (this->name, GF_LOG_DEBUG, - "%s: failed to get the gfid from dict", loc->path); - *gfid_req = NULL; - } else { - if (loc->parent != NULL) - dict_del (local->xattr_req, "gfid-req"); - } - ret = 0; -out: - return ret; + if 
(event_p) + *event_p = event; + return ret; } -void -afr_lookup_save_gfid (uuid_t dst, void* new, const loc_t *loc) -{ - inode_t *inode = NULL; - - inode = loc->inode; - if (inode && !uuid_is_null (inode->gfid)) - uuid_copy (dst, inode->gfid); - else if (!uuid_is_null (loc->gfid)) - uuid_copy (dst, loc->gfid); - else if (new && !uuid_is_null (new)) - uuid_copy (dst, new); -} int -afr_errno_count (int32_t *children, int *child_errno, - unsigned int child_count, int32_t op_errno) -{ - int i = 0; - int errno_count = 0; - int child = 0; +__afr_inode_read_subvol_set_small (inode_t *inode, xlator_t *this, + unsigned char *data, unsigned char *metadata, + int event) +{ + afr_private_t *priv = NULL; + uint16_t datamap = 0; + uint16_t metadatamap = 0; + uint64_t val = 0; + int i = 0; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (data[i]) + datamap |= (1 << i); + if (metadata[i]) + metadatamap |= (1 << i); + } - for (i = 0; i < child_count; i++) { - if (children) { - child = children[i]; - if (child == -1) - break; - } else { - child = i; - } - if (child_errno[child] == op_errno) - errno_count++; - } - return errno_count; -} + val = ((uint64_t) metadatamap) | + (((uint64_t) datamap) << 16) | + (((uint64_t) event) << 32); -int32_t -afr_set_dict_gfid (dict_t *dict, uuid_t gfid) -{ - int ret = 0; - uuid_t *pgfid = NULL; + return __inode_ctx_set (inode, this, &val); +} - GF_ASSERT (gfid); - pgfid = GF_CALLOC (1, sizeof (uuid_t), gf_common_mt_char); - if (!pgfid) { - ret = -1; - goto out; - } +int +__afr_inode_read_subvol_reset_small (inode_t *inode, xlator_t *this) +{ + int ret = -1; + uint16_t datamap = 0; + uint16_t metadatamap = 0; + uint32_t event = 0; + uint64_t val = 0; - uuid_copy (*pgfid, gfid); + ret = __inode_ctx_get (inode, this, &val); + (void) ret; - ret = dict_set_dynptr (dict, "gfid-req", pgfid, sizeof (uuid_t)); - if (ret) - gf_log (THIS->name, GF_LOG_ERROR, "gfid set failed"); + metadatamap = (val & 0x000000000000ffff) >> 0; + 
datamap = (val & 0x00000000ffff0000) >> 16; + event = 0; -out: - if (ret && pgfid) - GF_FREE (pgfid); + val = ((uint64_t) metadatamap) | + (((uint64_t) datamap) << 16) | + (((uint64_t) event) << 32); - return ret; + return __inode_ctx_set (inode, this, &val); } -void -afr_inode_ctx_destroy (afr_inode_ctx_t *ctx) -{ - if (!ctx) - return; - GF_FREE (ctx->fresh_children); - GF_FREE (ctx); -} -afr_inode_ctx_t* -__afr_inode_ctx_get (inode_t *inode, xlator_t *this) +int +__afr_inode_read_subvol_get (inode_t *inode, xlator_t *this, + unsigned char *data, unsigned char *metadata, + int *event_p) { - int ret = 0; - uint64_t ctx_addr = 0; - afr_inode_ctx_t *ctx = NULL; - afr_private_t *priv = NULL; + afr_private_t *priv = NULL; + int ret = -1; - priv = this->private; - ret = __inode_ctx_get (inode, this, &ctx_addr); - if (ret < 0) - ctx_addr = 0; - if (ctx_addr != 0) { - ctx = (afr_inode_ctx_t*) (long) ctx_addr; - goto out; - } - ctx = GF_CALLOC (1, sizeof (*ctx), - gf_afr_mt_inode_ctx_t); - if (!ctx) - goto fail; - ctx->fresh_children = GF_CALLOC (priv->child_count, - sizeof (*ctx->fresh_children), - gf_afr_mt_int32_t); - if (!ctx->fresh_children) - goto fail; - ret = __inode_ctx_put (inode, this, (uint64_t)ctx); - if (ret) { - gf_log_callingfn (this->name, GF_LOG_ERROR, "failed to " - "set the inode ctx (%s)", - uuid_utoa (inode->gfid)); - goto fail; - } + priv = this->private; -out: - return ctx; + if (priv->child_count <= 16) + ret = __afr_inode_read_subvol_get_small (inode, this, data, + metadata, event_p); + else + /* TBD: allocate structure with array and read from it */ + ret = -1; -fail: - afr_inode_ctx_destroy (ctx); - return NULL; + return ret; } -afr_inode_ctx_t* -afr_inode_ctx_get (inode_t *inode, xlator_t *this) + +int +__afr_inode_read_subvol_set (inode_t *inode, xlator_t *this, unsigned char *data, + unsigned char *metadata, int event) { - afr_inode_ctx_t *ctx = NULL; + afr_private_t *priv = NULL; + int ret = -1; - LOCK (&inode->lock); - { - ctx = 
__afr_inode_ctx_get (inode, this); - } - UNLOCK (&inode->lock); - return ctx; + priv = this->private; + + if (priv->child_count <= 16) + ret = __afr_inode_read_subvol_set_small (inode, this, data, + metadata, event); + else + ret = -1; + + return ret; } -void -afr_inode_get_ctx_params (xlator_t *this, inode_t *inode, - afr_inode_params_t *params) + +int +__afr_inode_read_subvol_reset (inode_t *inode, xlator_t *this) { - GF_ASSERT (inode); - GF_ASSERT (params); + afr_private_t *priv = NULL; + int ret = -1; - afr_inode_ctx_t *ctx = NULL; - afr_private_t *priv = NULL; - int i = 0; - int32_t read_child = -1; - int32_t *fresh_children = NULL; + priv = this->private; - priv = this->private; - LOCK (&inode->lock); - { - ctx = __afr_inode_ctx_get (inode, this); - if (!ctx) - goto unlock; - switch (params->op) { - case AFR_INODE_GET_READ_CTX: - fresh_children = params->u.read_ctx.children; - read_child = (int32_t)(ctx->masks & - AFR_ICTX_READ_CHILD_MASK); - params->u.read_ctx.read_child = read_child; - if (!fresh_children) - goto unlock; - for (i = 0; i < priv->child_count; i++) - fresh_children[i] = ctx->fresh_children[i]; - break; - case AFR_INODE_GET_OPENDIR_DONE: - params->u.value = _gf_false; - if (ctx->masks & AFR_ICTX_OPENDIR_DONE_MASK) - params->u.value = _gf_true; - break; - default: - GF_ASSERT (0); - break; - } - } -unlock: - UNLOCK (&inode->lock); + if (priv->child_count <= 16) + ret = __afr_inode_read_subvol_reset_small (inode, this); + else + ret = -1; + + return ret; } -gf_boolean_t -afr_is_split_brain (xlator_t *this, inode_t *inode) + +int +afr_inode_read_subvol_get (inode_t *inode, xlator_t *this, unsigned char *data, + unsigned char *metadata, int *event_p) { - afr_inode_ctx_t *ctx = NULL; - gf_boolean_t spb = _gf_false; + int ret = -1; - ctx = afr_inode_ctx_get (inode, this); - if (!ctx) - goto out; - if ((ctx->mdata_spb == SPB) || (ctx->data_spb == SPB)) - spb = _gf_true; -out: - return spb; + LOCK(&inode->lock); + { + ret = __afr_inode_read_subvol_get 
(inode, this, data, + metadata, event_p); + } + UNLOCK(&inode->lock); + + return ret; } -gf_boolean_t -afr_is_opendir_done (xlator_t *this, inode_t *inode) + +int +afr_inode_read_subvol_set (inode_t *inode, xlator_t *this, unsigned char *data, + unsigned char *metadata, int event) { - afr_inode_params_t params = {0}; + int ret = -1; + + LOCK(&inode->lock); + { + ret = __afr_inode_read_subvol_set (inode, this, data, metadata, + event); + } + UNLOCK(&inode->lock); - params.op = AFR_INODE_GET_OPENDIR_DONE; - afr_inode_get_ctx_params (this, inode, ¶ms); - return params.u.value; + return ret; } -int32_t -afr_inode_get_read_ctx (xlator_t *this, inode_t *inode, int32_t *fresh_children) + +int +afr_inode_read_subvol_reset (inode_t *inode, xlator_t *this) { - afr_inode_params_t params = {0}; + int ret = -1; + + LOCK(&inode->lock); + { + ret = __afr_inode_read_subvol_reset (inode, this); + } + UNLOCK(&inode->lock); - params.op = AFR_INODE_GET_READ_CTX; - params.u.read_ctx.children = fresh_children; - afr_inode_get_ctx_params (this, inode, ¶ms); - return params.u.read_ctx.read_child; + return ret; } -void -afr_inode_ctx_set_read_child (afr_inode_ctx_t *ctx, int32_t read_child) -{ - uint64_t remaining_mask = 0; - uint64_t mask = 0; - remaining_mask = (~AFR_ICTX_READ_CHILD_MASK & ctx->masks); - mask = (AFR_ICTX_READ_CHILD_MASK & read_child); - ctx->masks = remaining_mask | mask; -} +int +afr_accused_fill (xlator_t *this, dict_t *xdata, unsigned char *accused, + afr_transaction_type type) +{ + afr_private_t *priv = NULL; + int i = 0; + int idx = afr_index_for_transaction_type (type); + void *pending_raw = NULL; + int pending[3]; + int ret = 0; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + ret = dict_get_ptr (xdata, priv->pending_key[i], + &pending_raw); + if (ret) /* no pending flags */ + continue; + memcpy (pending, pending_raw, sizeof(pending)); + + if (ntoh32 (pending[idx])) + accused[i] = 1; + } -void -afr_inode_ctx_set_read_ctx (afr_inode_ctx_t 
*ctx, int32_t read_child, - int32_t *fresh_children, int32_t child_count) -{ - int i = 0; - - afr_inode_ctx_set_read_child (ctx, read_child); - for (i = 0; i < child_count; i++) { - if (fresh_children) - ctx->fresh_children[i] = fresh_children[i]; - else - ctx->fresh_children[i] = -1; - } + return 0; } -void -afr_inode_ctx_rm_stale_children (afr_inode_ctx_t *ctx, int32_t *stale_children, - int32_t child_count) + +int +afr_accuse_smallfiles (xlator_t *this, struct afr_reply *replies, + unsigned char *data_accused) { - int i = 0; - int32_t read_child = -1; + int i = 0; + afr_private_t *priv = NULL; + uint64_t maxsize = 0; - GF_ASSERT (stale_children); - for (i = 0; i < child_count; i++) { - if (stale_children[i] == -1) - break; - afr_children_rm_child (ctx->fresh_children, - stale_children[i], child_count); - } - read_child = (int32_t)(ctx->masks & AFR_ICTX_READ_CHILD_MASK); - if (!afr_is_child_present (ctx->fresh_children, child_count, - read_child)) - afr_inode_ctx_set_read_child (ctx, ctx->fresh_children[0]); -} + priv = this->private; -void -afr_inode_ctx_set_opendir_done (afr_inode_ctx_t *ctx) -{ - uint64_t remaining_mask = 0; - uint64_t mask = 0; + for (i = 0; i < priv->child_count; i++) { + if (data_accused[i]) + continue; + if (replies[i].poststat.ia_size > maxsize) + maxsize = replies[i].poststat.ia_size; + } - remaining_mask = (~AFR_ICTX_OPENDIR_DONE_MASK & ctx->masks); - mask = (0xFFFFFFFFFFFFFFFFULL & AFR_ICTX_OPENDIR_DONE_MASK); - ctx->masks = remaining_mask | mask; + for (i = 0; i < priv->child_count; i++) { + if (data_accused[i]) + continue; + if (replies[i].poststat.ia_size < maxsize) + data_accused[i] = 1; + } + + return 0; } -void -afr_inode_set_ctx_params (xlator_t *this, inode_t *inode, - afr_inode_params_t *params) -{ - GF_ASSERT (inode); - GF_ASSERT (params); - afr_inode_ctx_t *ctx = NULL; - afr_private_t *priv = NULL; - int32_t read_child = -1; - int32_t *fresh_children = NULL; - int32_t *stale_children = NULL; +int +afr_replies_interpret 
(call_frame_t *frame, xlator_t *this, inode_t *inode) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + struct afr_reply *replies = NULL; + int event_generation = 0; + int i = 0; + unsigned char *data_accused = NULL; + unsigned char *metadata_accused = NULL; + unsigned char *data_readable = NULL; + unsigned char *metadata_readable = NULL; + int ret = 0; - priv = this->private; - LOCK (&inode->lock); - { - ctx = __afr_inode_ctx_get (inode, this); - if (!ctx) - goto unlock; - switch (params->op) { - case AFR_INODE_SET_READ_CTX: - read_child = params->u.read_ctx.read_child; - fresh_children = params->u.read_ctx.children; - afr_inode_ctx_set_read_ctx (ctx, read_child, - fresh_children, - priv->child_count); - break; - case AFR_INODE_RM_STALE_CHILDREN: - stale_children = params->u.read_ctx.children; - afr_inode_ctx_rm_stale_children (ctx, - stale_children, - priv->child_count); - break; - case AFR_INODE_SET_OPENDIR_DONE: - afr_inode_ctx_set_opendir_done (ctx); - break; - default: - GF_ASSERT (0); - break; - } - } -unlock: - UNLOCK (&inode->lock); -} + local = frame->local; + priv = this->private; + replies = local->replies; + event_generation = local->event_generation; + + data_accused = alloca0 (priv->child_count); + data_readable = alloca0 (priv->child_count); + metadata_accused = alloca0 (priv->child_count); + metadata_readable = alloca0 (priv->child_count); + + for (i = 0; i < priv->child_count; i++) { + data_readable[i] = 1; + metadata_readable[i] = 1; + } -void -afr_set_split_brain (xlator_t *this, inode_t *inode, afr_spb_state_t mdata_spb, - afr_spb_state_t data_spb) -{ - afr_inode_ctx_t *ctx = NULL; + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) { + data_readable[i] = 0; + metadata_readable[i] = 0; + continue; + } + + if (replies[i].op_ret == -1) { + data_readable[i] = 0; + metadata_readable[i] = 0; + continue; + } + + afr_accused_fill (this, replies[i].xdata, data_accused, + (inode->ia_type == IA_IFDIR) ? 
+ AFR_ENTRY_TRANSACTION : AFR_DATA_TRANSACTION); + + afr_accused_fill (this, replies[i].xdata, + metadata_accused, AFR_METADATA_TRANSACTION); + + } - ctx = afr_inode_ctx_get (inode, this); - if (mdata_spb != DONT_KNOW) - ctx->mdata_spb = mdata_spb; - if (data_spb != DONT_KNOW) - ctx->data_spb = data_spb; + if (inode->ia_type != IA_IFDIR) + afr_accuse_smallfiles (this, replies, data_accused); + + for (i = 0; i < priv->child_count; i++) { + if (data_accused[i]) { + data_readable[i] = 0; + ret = 1; + } + if (metadata_accused[i]) { + metadata_readable[i] = 0; + ret = 1; + } + } + + afr_inode_read_subvol_set (inode, this, data_readable, + metadata_readable, event_generation); + return ret; } -void -afr_set_opendir_done (xlator_t *this, inode_t *inode) -{ - afr_inode_params_t params = {0}; - params.op = AFR_INODE_SET_OPENDIR_DONE; - afr_inode_set_ctx_params (this, inode, ¶ms); + +int +afr_refresh_selfheal_done (int ret, call_frame_t *heal, void *opaque) +{ + if (heal) + STACK_DESTROY (heal->root); + return 0; } -void -afr_inode_set_read_ctx (xlator_t *this, inode_t *inode, int32_t read_child, - int32_t *fresh_children) +int +afr_inode_refresh_err (call_frame_t *frame, xlator_t *this) { - afr_inode_params_t params = {0}; - afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; + int err = 0; - priv = this->private; - GF_ASSERT (read_child >= 0); - GF_ASSERT (fresh_children); - GF_ASSERT (afr_is_child_present (fresh_children, priv->child_count, - read_child)); - - params.op = AFR_INODE_SET_READ_CTX; - params.u.read_ctx.read_child = read_child; - params.u.read_ctx.children = fresh_children; - afr_inode_set_ctx_params (this, inode, ¶ms); + local = frame->local; + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (local->replies[i].valid && !local->replies[i].op_ret) { + err = 0; + goto ret; + } + } + + err = afr_final_errno (local, priv); +ret: + return -err; } -void -afr_inode_rm_stale_children (xlator_t 
*this, inode_t *inode, - int32_t *stale_children) + +int +afr_refresh_selfheal_wrap (void *opaque) { - afr_inode_params_t params = {0}; + call_frame_t *frame = opaque; + afr_local_t *local = NULL; + xlator_t *this = NULL; + int err = 0; + + local = frame->local; + this = frame->this; - GF_ASSERT (stale_children); + afr_selfheal (frame->this, local->refreshinode->gfid); - params.op = AFR_INODE_RM_STALE_CHILDREN; - params.u.read_ctx.children = stale_children; - afr_inode_set_ctx_params (this, inode, ¶ms); + afr_selfheal_unlocked_discover (frame, local->refreshinode, + local->refreshinode->gfid, + local->replies); + + afr_replies_interpret (frame, this, local->refreshinode); + + err = afr_inode_refresh_err (frame, this); + + afr_replies_wipe (local, this->private); + + local->refreshfn (frame, this, err); + + return 0; } + gf_boolean_t -afr_is_source_child (int32_t *sources, int32_t child_count, int32_t child) +afr_selfheal_enabled (xlator_t *this) { - gf_boolean_t source_xattrs = _gf_false; + afr_private_t *priv = NULL; + gf_boolean_t data = _gf_false; - GF_ASSERT (child < child_count); + priv = this->private; - if ((child >= 0) && (child < child_count) && - sources[child]) { - source_xattrs = _gf_true; - } - return source_xattrs; + gf_string2boolean (priv->data_self_heal, &data); + + return data || priv->metadata_self_heal || priv->entry_self_heal; } -gf_boolean_t -afr_is_child_present (int32_t *success_children, int32_t child_count, - int32_t child) + + +int +afr_inode_refresh_done (call_frame_t *frame, xlator_t *this) { - gf_boolean_t success_child = _gf_false; - int i = 0; + call_frame_t *heal = NULL; + afr_local_t *local = NULL; + int ret = 0; + int err = 0; - GF_ASSERT (child < child_count); + local = frame->local; - for (i = 0; i < child_count; i++) { - if (success_children[i] == -1) - break; - if (child == success_children[i]) { - success_child = _gf_true; - break; - } - } - return success_child; + ret = afr_replies_interpret (frame, this, 
local->refreshinode); + + err = afr_inode_refresh_err (frame, this); + + afr_replies_wipe (local, this->private); + + if (ret && afr_selfheal_enabled (this)) { + heal = copy_frame (frame); + if (heal) + heal->root->pid = -1; + ret = synctask_new (this->ctx->env, afr_refresh_selfheal_wrap, + afr_refresh_selfheal_done, heal, frame); + if (ret) + goto refresh_done; + } else { + refresh_done: + local->refreshfn (frame, this, err); + } + + return 0; } -gf_boolean_t -afr_is_read_child (int32_t *success_children, int32_t *sources, - int32_t child_count, int32_t child) + +int +afr_inode_refresh_subvol_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, + struct iatt *buf, dict_t *xdata, struct iatt *par) { - gf_boolean_t success_child = _gf_false; - gf_boolean_t source = _gf_false; + afr_local_t *local = NULL; + int call_child = (long) cookie; + int call_count = 0; - if (child < 0) { - return _gf_false; - } + local = frame->local; - GF_ASSERT (success_children); - GF_ASSERT (child_count > 0); + local->replies[call_child].valid = 1; + local->replies[call_child].op_ret = op_ret; + local->replies[call_child].op_errno = op_errno; + if (op_ret != -1) { + local->replies[call_child].poststat = *buf; + local->replies[call_child].postparent = *par; + local->replies[call_child].xdata = dict_ref (xdata); + } - success_child = afr_is_child_present (success_children, child_count, - child); - if (!success_child) - goto out; - if (NULL == sources) { - source = _gf_true; - goto out; - } - source = afr_is_source_child (sources, child_count, child); -out: - return (success_child && source); + call_count = afr_frame_return (frame); + + if (call_count == 0) + afr_inode_refresh_done (frame, this); + + return 0; } -int32_t -afr_hash_child (int32_t *success_children, int32_t child_count, - unsigned int hmode, uuid_t gfid) + +int +afr_inode_refresh_subvol (call_frame_t *frame, xlator_t *this, int i, + inode_t *inode, dict_t *xdata) { - uuid_t 
gfid_copy = {0,}; - pid_t pid; + loc_t loc = {0, }; + afr_private_t *priv = NULL; - if (!hmode) { - return -1; - } + priv = this->private; - if (gfid) { - uuid_copy(gfid_copy,gfid); - } - if (hmode > 1) { - /* - * Why getpid? Because it's one of the cheapest calls - * available - faster than gethostname etc. - and returns a - * constant-length value that's sure to be shorter than a UUID. - * It's still very unlikely to be the same across clients, so - * it still provides good mixing. We're not trying for - * perfection here. All we need is a low probability that - * multiple clients won't converge on the same subvolume. - */ - pid = getpid(); - memcpy (gfid_copy, &pid, sizeof(pid)); - } + loc.inode = inode; + uuid_copy (loc.gfid, inode->gfid); - return SuperFastHash((char *)gfid_copy, - sizeof(gfid_copy)) % child_count; + STACK_WIND_COOKIE (frame, afr_inode_refresh_subvol_cbk, + (void *) (long) i, priv->children[i], + priv->children[i]->fops->lookup, &loc, xdata); + return 0; } -/* If sources is NULL the xattrs are assumed to be of source for all - * success_children. 
- */ + int -afr_select_read_child_from_policy (int32_t *success_children, - int32_t child_count, int32_t prev_read_child, - int32_t config_read_child, int32_t *sources, - unsigned int hmode, uuid_t gfid) +afr_inode_refresh_do (call_frame_t *frame, xlator_t *this) { - int32_t read_child = -1; - int i = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + int i = 0; + dict_t *xdata = NULL; - GF_ASSERT (success_children); + priv = this->private; + local = frame->local; - read_child = config_read_child; - if (afr_is_read_child (success_children, sources, child_count, - read_child)) - goto out; + afr_replies_wipe (local, priv); - read_child = prev_read_child; - if (afr_is_read_child (success_children, sources, child_count, - read_child)) - goto out; + xdata = dict_new (); + if (!xdata) { + afr_inode_refresh_done (frame, this); + return 0; + } - read_child = afr_hash_child (success_children, child_count, - hmode, gfid); - if (afr_is_read_child (success_children, sources, child_count, - read_child)) { - goto out; - } + if (afr_xattr_req_prepare (this, xdata) != 0) { + dict_unref (xdata); + afr_inode_refresh_done (frame, this); + return 0; + } - for (i = 0; i < child_count; i++) { - read_child = success_children[i]; - if (read_child < 0) - break; - if (afr_is_read_child (success_children, sources, child_count, - read_child)) - goto out; - } - read_child = -1; + local->call_count = AFR_COUNT (local->child_up, priv->child_count); -out: - return read_child; + call_count = local->call_count; + for (i = 0; i < priv->child_count; i++) { + if (!local->child_up[i]) + continue; + + afr_inode_refresh_subvol (frame, this, i, local->refreshinode, + xdata); + + if (!--call_count) + break; + } + + dict_unref (xdata); + + return 0; } -/* This function should be used when all the success_children are sources - */ -void -afr_set_read_ctx_from_policy (xlator_t *this, inode_t *inode, - int32_t *fresh_children, int32_t prev_read_child, - int32_t 
config_read_child, uuid_t gfid) + +int +afr_inode_refresh (call_frame_t *frame, xlator_t *this, inode_t *inode, + afr_inode_refresh_cbk_t refreshfn) { - int read_child = -1; - afr_private_t *priv = NULL; + afr_local_t *local = NULL; - priv = this->private; - read_child = afr_select_read_child_from_policy (fresh_children, - priv->child_count, - prev_read_child, - config_read_child, - NULL, - priv->hash_mode, gfid); - if (read_child >= 0) - afr_inode_set_read_ctx (this, inode, read_child, - fresh_children); + local = frame->local; + + local->refreshfn = refreshfn; + + if (local->refreshinode) { + inode_unref (local->refreshinode); + local->refreshinode = NULL; + } + + local->refreshinode = inode_ref (inode); + + afr_inode_refresh_do (frame, this); + + return 0; } -/* afr_next_call_child () - * This is a common function used by all the read-type fops - * This function should not be called with the inode's read_children array. - * The fop's handler should make a copy of the inode's read_children, - * preferred read_child into the local vars, because while this function is - * in execution there is a chance for inode's read_ctx to change. 
- */ -int32_t -afr_next_call_child (int32_t *fresh_children, unsigned char *child_up, - size_t child_count, int32_t *last_index, - int32_t read_child) + +int +afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req) { - int next_index = 0; - int32_t next_call_child = -1; + int i = 0; + afr_private_t *priv = NULL; + int ret = 0; - GF_ASSERT (last_index); + priv = this->private; - next_index = *last_index; -retry: - next_index++; - if ((next_index >= child_count) || - (fresh_children[next_index] == -1)) - goto out; - if ((fresh_children[next_index] == read_child) || - (!child_up[fresh_children[next_index]])) - goto retry; - *last_index = next_index; - next_call_child = fresh_children[next_index]; -out: - return next_call_child; + for (i = 0; i < priv->child_count; i++) { + ret = dict_set_uint64 (xattr_req, priv->pending_key[i], + AFR_NUM_CHANGE_LOGS * sizeof(int)); + if (ret < 0) + gf_log (this->name, GF_LOG_WARNING, + "Unable to set dict value for %s", + priv->pending_key[i]); + /* 3 = data+metadata+entry */ + } + ret = dict_set_uint64 (xattr_req, AFR_DIRTY, + AFR_NUM_CHANGE_LOGS * sizeof(int)); + if (ret) { + gf_log (this->name, GF_LOG_DEBUG, "failed to set dirty " + "query flag"); + } + + return ret; } - /* This function should not be called with the inode's read_children array. - * The fop's handler should make a copy of the inode's read_children, - * preferred read_child into the local vars, because while this function is - * in execution there is a chance for inode's read_ctx to change. 
- */ -int32_t -afr_get_call_child (xlator_t *this, unsigned char *child_up, int32_t read_child, - int32_t *fresh_children, - int32_t *call_child, int32_t *last_index) +int +afr_lookup_xattr_req_prepare (afr_local_t *local, xlator_t *this, + dict_t *xattr_req, loc_t *loc) { - int ret = 0; - afr_private_t *priv = NULL; - int i = 0; - - GF_ASSERT (child_up); - GF_ASSERT (call_child); - GF_ASSERT (last_index); - GF_ASSERT (fresh_children); + int ret = -ENOMEM; - if (read_child < 0) { - ret = -EIO; + local->xattr_req = dict_new (); + if (!local->xattr_req) goto out; - } - priv = this->private; - *call_child = -1; - *last_index = -1; + if (xattr_req) + dict_copy (xattr_req, local->xattr_req); - if (child_up[read_child]) { - *call_child = read_child; - } else { - for (i = 0; i < priv->child_count; i++) { - if (fresh_children[i] == -1) - break; - if (child_up[fresh_children[i]]) { - *call_child = fresh_children[i]; - ret = 0; - break; - } - } + ret = afr_xattr_req_prepare (this, local->xattr_req); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "%s: Unable to prepare xattr_req", loc->path); + } - if (*call_child == -1) { - ret = -ENOTCONN; - goto out; - } + ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_INODELK_COUNT, 0); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "%s: Unable to set dict value for %s", + loc->path, GLUSTERFS_INODELK_COUNT); + } + ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_ENTRYLK_COUNT, 0); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "%s: Unable to set dict value for %s", + loc->path, GLUSTERFS_ENTRYLK_COUNT); + } - *last_index = i; + ret = dict_set_uint32 (local->xattr_req, GLUSTERFS_PARENT_ENTRYLK, 0); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "%s: Unable to set dict value for %s", + loc->path, GLUSTERFS_PARENT_ENTRYLK); } + + ret = 0; out: - gf_log (this->name, GF_LOG_DEBUG, "Returning %d, call_child: %d, " - "last_index: %d", ret, *call_child, *last_index); return ret; } -void 
-afr_reset_xattr (dict_t **xattr, unsigned int child_count) + +int +afr_hash_child (inode_t *inode, int32_t child_count, int hashmode) { - unsigned int i = 0; + uuid_t gfid_copy = {0,}; + pid_t pid; - if (!xattr) - goto out; - for (i = 0; i < child_count; i++) { - if (xattr[i]) { - dict_unref (xattr[i]); - xattr[i] = NULL; - } + if (!hashmode) { + return -1; } -out: - return; -} -void -afr_xattr_array_destroy (dict_t **xattr, unsigned int child_count) -{ - afr_reset_xattr (xattr, child_count); - GF_FREE (xattr); -} + if (inode) { + uuid_copy (gfid_copy, inode->gfid); + } -void -afr_local_sh_cleanup (afr_local_t *local, xlator_t *this) -{ - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; + if (hashmode > 1) { + /* + * Why getpid? Because it's one of the cheapest calls + * available - faster than gethostname etc. - and returns a + * constant-length value that's sure to be shorter than a UUID. + * It's still very unlikely to be the same across clients, so + * it still provides good mixing. We're not trying for + * perfection here. All we need is a low probability that + * multiple clients won't converge on the same subvolume. 
+ */ + pid = getpid(); + memcpy (gfid_copy, &pid, sizeof(pid)); + } - sh = &local->self_heal; - priv = this->private; + return SuperFastHash((char *)gfid_copy, + sizeof(gfid_copy)) % child_count; +} - if (sh->data_sh_info && strcmp (sh->data_sh_info, "")) - GF_FREE (sh->data_sh_info); - if (sh->metadata_sh_info && strcmp (sh->metadata_sh_info, "")) - GF_FREE (sh->metadata_sh_info); +int +afr_read_subvol_select_by_policy (inode_t *inode, xlator_t *this, + unsigned char *readable) +{ + afr_private_t *priv = NULL; + int read_subvol = -1; + int i = 0; - GF_FREE (sh->buf); + priv = this->private; - GF_FREE (sh->parentbufs); + /* first preference - explicitly specified or local subvolume */ + if (priv->read_child >= 0 && readable[priv->read_child]) + return priv->read_child; - if (sh->inode) - inode_unref (sh->inode); + /* second preference - use hashed mode */ + read_subvol = afr_hash_child (inode, priv->child_count, + priv->hash_mode); + if (read_subvol >= 0 && readable[read_subvol]) + return read_subvol; - afr_xattr_array_destroy (sh->xattr, priv->child_count); + for (i = 0; i < priv->child_count; i++) { + if (readable[i]) + return i; + } - GF_FREE (sh->child_errno); + /* no readable subvolumes, either split brain or all subvols down */ - afr_matrix_cleanup (sh->pending_matrix, priv->child_count); - afr_matrix_cleanup (sh->delta_matrix, priv->child_count); + return -1; +} - GF_FREE (sh->sources); - GF_FREE (sh->success); +int +afr_inode_read_subvol_type_get (inode_t *inode, xlator_t *this, + unsigned char *readable, int *event_p, + int type) +{ + int ret = -1; - GF_FREE (sh->locked_nodes); + if (type == AFR_METADATA_TRANSACTION) + ret = afr_inode_read_subvol_get (inode, this, 0, readable, + event_p); + else + ret = afr_inode_read_subvol_get (inode, this, readable, 0, + event_p); + return ret; +} - if (sh->healing_fd) { - fd_unref (sh->healing_fd); - sh->healing_fd = NULL; - } - GF_FREE ((char *)sh->linkname); +int +afr_read_subvol_get (inode_t *inode, xlator_t *this, 
int *subvol_p, + int *event_p, afr_transaction_type type) +{ + afr_private_t *priv = NULL; + unsigned char *data_readable = NULL; + unsigned char *metadata_readable = NULL; + unsigned char *readable = NULL; + unsigned char *intersection = NULL; + int subvol = -1; + int event = 0; - GF_FREE (sh->success_children); + priv = this->private; - GF_FREE (sh->fresh_children); + readable = alloca0 (priv->child_count); + data_readable = alloca0 (priv->child_count); + metadata_readable = alloca0 (priv->child_count); + intersection = alloca0 (priv->child_count); - GF_FREE (sh->fresh_parent_dirs); + afr_inode_read_subvol_type_get (inode, this, readable, &event, type); - loc_wipe (&sh->parent_loc); - loc_wipe (&sh->lookup_loc); + afr_inode_read_subvol_get (inode, this, data_readable, metadata_readable, + &event); - GF_FREE (sh->checksum); + AFR_INTERSECT (intersection, data_readable, metadata_readable, + priv->child_count); - GF_FREE (sh->write_needed); - if (sh->healing_fd) - fd_unref (sh->healing_fd); + if (AFR_COUNT (intersection, priv->child_count) > 0) + subvol = afr_read_subvol_select_by_policy (inode, this, + intersection); + else + subvol = afr_read_subvol_select_by_policy (inode, this, + readable); + if (subvol_p) + *subvol_p = subvol; + if (event_p) + *event_p = event; + return subvol; } @@ -838,8 +848,6 @@ afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this) priv = this->private; afr_matrix_cleanup (local->pending, priv->child_count); - afr_matrix_cleanup (local->transaction.txn_changelog, - priv->child_count); GF_FREE (local->internal_lock.locked_nodes); @@ -860,7 +868,25 @@ afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this) loc_wipe (&local->transaction.parent_loc); loc_wipe (&local->transaction.new_parent_loc); - GF_FREE (local->transaction.postop_piggybacked); +} + + +void +afr_replies_wipe (afr_local_t *local, afr_private_t *priv) +{ + int i; + + if (!local->replies) + return; + + for (i = 0; i < priv->child_count; i++) { + if 
(local->replies[i].xdata) { + dict_unref (local->replies[i].xdata); + local->replies[i].xdata = NULL; + } + } + + memset (local->replies, 0, sizeof(*local->replies) * priv->child_count); } @@ -872,7 +898,7 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this) if (!local) return; - afr_local_sh_cleanup (local, this); + syncbarrier_destroy (&local->barrier); afr_local_transaction_cleanup (local, this); @@ -890,40 +916,26 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this) if (local->dict) dict_unref (local->dict); + afr_replies_wipe (local, priv); GF_FREE(local->replies); GF_FREE (local->child_up); - GF_FREE (local->child_errno); + GF_FREE (local->read_attempted); - GF_FREE (local->fresh_children); + GF_FREE (local->readable); - { /* lookup */ - if (local->cont.lookup.xattrs) { - afr_reset_xattr (local->cont.lookup.xattrs, - priv->child_count); - GF_FREE (local->cont.lookup.xattrs); - local->cont.lookup.xattrs = NULL; - } + if (local->inode) + inode_unref (local->inode); - if (local->cont.lookup.xattr) { - dict_unref (local->cont.lookup.xattr); - } - - if (local->cont.lookup.inode) { - inode_unref (local->cont.lookup.inode); - } + if (local->parent) + inode_unref (local->parent); - GF_FREE (local->cont.lookup.postparents); + if (local->parent2) + inode_unref (local->parent2); - GF_FREE (local->cont.lookup.bufs); - - GF_FREE (local->cont.lookup.success_children); - - GF_FREE (local->cont.lookup.sources); - afr_matrix_cleanup (local->cont.lookup.pending_matrix, - priv->child_count); - } + if (local->refreshinode) + inode_unref (local->refreshinode); { /* getxattr */ GF_FREE (local->cont.getxattr.name); @@ -1018,67 +1030,29 @@ afr_frame_return (call_frame_t *frame) return call_count; } -int -afr_set_elem_count_get (unsigned char *elems, int child_count) -{ - int i = 0; - int ret = 0; - - for (i = 0; i < child_count; i++) - if (elems[i]) - ret++; - return ret; -} - -/** - * up_children_count - return the number of children that are up - */ - -unsigned int 
-afr_up_children_count (unsigned char *child_up, unsigned int child_count) -{ - return afr_set_elem_count_get (child_up, child_count); -} - -unsigned int -afr_locked_children_count (unsigned char *children, unsigned int child_count) -{ - return afr_set_elem_count_get (children, child_count); -} - -unsigned int -afr_pre_op_done_children_count (unsigned char *pre_op, - unsigned int child_count) -{ - return afr_set_elem_count_get (pre_op, child_count); -} gf_boolean_t -afr_is_fresh_lookup (loc_t *loc, xlator_t *this) -{ - uint64_t ctx = 0; - int32_t ret = 0; - - GF_ASSERT (loc); - GF_ASSERT (this); - GF_ASSERT (loc->inode); +afr_is_entry_possibly_under_txn (afr_local_t *local, xlator_t *this) +{ + int i = 0; + int tmp = 0; + afr_private_t *priv = NULL; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].xdata) + continue; + if (dict_get_int32 (local->replies[i].xdata, + GLUSTERFS_PARENT_ENTRYLK, + &tmp) == 0) + if (tmp) + return _gf_true; + } - ret = inode_ctx_get (loc->inode, this, &ctx); - if (0 == ret) - return _gf_false; - return _gf_true; + return _gf_false; } -void -afr_update_loc_gfids (loc_t *loc, struct iatt *buf, struct iatt *postparent) -{ - GF_ASSERT (loc); - GF_ASSERT (buf); - - uuid_copy (loc->gfid, buf->ia_gfid); - if (postparent) - uuid_copy (loc->pargfid, postparent->ia_gfid); -} /* * Quota size xattrs are not maintained by afr. 
There is a @@ -1090,1467 +1064,845 @@ afr_update_loc_gfids (loc_t *loc, struct iatt *buf, struct iatt *postparent) * */ static void -afr_handle_quota_size (afr_local_t *local, xlator_t *this, - dict_t *rsp_dict) +afr_handle_quota_size (call_frame_t *frame, xlator_t *this) { - int32_t *sources = NULL; - dict_t *xattr = NULL; - data_t *max_data = NULL; - int64_t max_quota_size = -1; - data_t *data = NULL; - int64_t *size = NULL; - int64_t quota_size = -1; - afr_private_t *priv = NULL; - int i = 0; - int ret = -1; - gf_boolean_t source_present = _gf_false; - - priv = this->private; - sources = local->cont.lookup.sources; - - if (rsp_dict == NULL) { - gf_log_callingfn (this->name, GF_LOG_ERROR, "%s: Invalid " - "response dictionary", local->loc.path); - return; - } - - for (i = 0; i < priv->child_count; i++) { - if (sources[i]) { - source_present = _gf_true; - break; - } - } - - for (i = 0; i < priv->child_count; i++) { - /* - * If there is at least one source lets check - * for maximum quota sizes among sources, otherwise take the - * maximum of the ones present to be on the safer side. 
- */ - if (source_present && !sources[i]) - continue; - - xattr = local->cont.lookup.xattrs[i]; - if (!xattr) - continue; - - data = dict_get (xattr, QUOTA_SIZE_KEY); - if (!data) - continue; - - size = (int64_t*)data->data; - quota_size = ntoh64(*size); - gf_log (this->name, GF_LOG_DEBUG, "%s: %d, size: %"PRId64, - local->loc.path, i, quota_size); - if (quota_size > max_quota_size) { - if (max_data) - data_unref (max_data); - - max_quota_size = quota_size; - max_data = data_ref (data); - } - } + unsigned char *readable = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + struct afr_reply *replies = NULL; + int i = 0; + uint64_t size = 0; + uint64_t max_size = 0; + int readable_cnt = 0; - if (max_data) { - ret = dict_set (rsp_dict, QUOTA_SIZE_KEY, max_data); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "%s: Failed to set " - "quota size", local->loc.path); - } + local = frame->local; + priv = this->private; + replies = local->replies; + + readable = alloca0 (priv->child_count); + + afr_inode_read_subvol_get (local->inode, this, readable, 0, 0); + + readable_cnt = AFR_COUNT (readable, priv->child_count); + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid || replies[i].op_ret == -1) + continue; + if (readable_cnt && !readable[i]) + continue; + if (!replies[i].xdata) + continue; + if (dict_get_uint64 (replies[i].xdata, QUOTA_SIZE_KEY, &size)) + continue; + if (size > max_size) + max_size = size; + } - data_unref (max_data); - } + if (!max_size) + return; + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid || replies[i].op_ret == -1) + continue; + if (readable_cnt && !readable[i]) + continue; + if (!replies[i].xdata) + continue; + if (dict_set_uint64 (replies[i].xdata, QUOTA_SIZE_KEY, max_size)) + continue; + } } -int -afr_lookup_build_response_params (afr_local_t *local, xlator_t *this) -{ - struct iatt *buf = NULL; - struct iatt *postparent = NULL; - dict_t **xattr = NULL; - int32_t *success_children = NULL; - 
int32_t *sources = NULL; - afr_private_t *priv = NULL; - int32_t read_child = -1; - int ret = 0; - int i = 0; - - GF_ASSERT (local); - - buf = &local->cont.lookup.buf; - postparent = &local->cont.lookup.postparent; - xattr = &local->cont.lookup.xattr; - priv = this->private; - - read_child = afr_inode_get_read_ctx (this, local->cont.lookup.inode, - local->fresh_children); - if (read_child < 0) { - ret = -1; - goto out; - } - success_children = local->cont.lookup.success_children; - sources = local->cont.lookup.sources; - memset (sources, 0, sizeof (*sources) * priv->child_count); - afr_children_intersection_get (local->fresh_children, success_children, - sources, priv->child_count); - if (!sources[read_child]) { - read_child = -1; - for (i = 0; i < priv->child_count; i++) { - if (sources[i]) { - read_child = i; - break; - } - } - } - if (read_child < 0) { - ret = -1; - goto out; - } - - gf_log (this->name, GF_LOG_DEBUG, "Building lookup response from %d", - read_child); - if (!*xattr) - *xattr = dict_ref (local->cont.lookup.xattrs[read_child]); - - *buf = local->cont.lookup.bufs[read_child]; - *postparent = local->cont.lookup.postparents[read_child]; - - if (dict_get (local->xattr_req, QUOTA_SIZE_KEY)) - afr_handle_quota_size (local, this, *xattr); - - if (IA_INVAL == local->cont.lookup.inode->ia_type) { - /* fix for RT #602 */ - local->cont.lookup.inode->ia_type = buf->ia_type; - } -out: - return ret; -} static void -afr_lookup_update_lk_counts (afr_local_t *local, xlator_t *this, - int child_index, dict_t *xattr) +afr_lookup_done (call_frame_t *frame, xlator_t *this) { - uint32_t inodelk_count = 0; - uint32_t entrylk_count = 0; - int ret = -1; - uint32_t parent_entrylk = 0; - - GF_ASSERT (local); - GF_ASSERT (this); - GF_ASSERT (xattr); - GF_ASSERT (child_index >= 0); + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int i = -1; + int op_errno = 0; + int read_subvol = 0; + unsigned char *readable = NULL; + int event = 0; + struct afr_reply *replies = 
NULL; + uuid_t read_gfid = {0, }; + gf_boolean_t locked_entry = _gf_false; + gf_boolean_t can_interpret = _gf_true; - ret = dict_get_uint32 (xattr, GLUSTERFS_INODELK_COUNT, - &inodelk_count); - if (ret == 0) - local->inodelk_count += inodelk_count; + priv = this->private; + local = frame->local; + replies = local->replies; - ret = dict_get_uint32 (xattr, GLUSTERFS_ENTRYLK_COUNT, - &entrylk_count); - if (ret == 0) - local->entrylk_count += entrylk_count; - ret = dict_get_uint32 (xattr, GLUSTERFS_PARENT_ENTRYLK, - &parent_entrylk); - if (!ret) - local->cont.lookup.parent_entrylk += parent_entrylk; -} + locked_entry = afr_is_entry_possibly_under_txn (local, this); -/* - * It's important to maintain a commutative property on do_*_self_heal and - * found*; once set, they must not be cleared by a subsequent iteration or - * call, so that they represent a logical OR of all iterations and calls - * regardless of child/key order. That allows the caller to call us multiple - * times without having to use a separate variable as a "reduce" accumulator. 
- */ -static void -afr_lookup_set_self_heal_params_by_xattr (afr_local_t *local, xlator_t *this, - dict_t *xattr) -{ - afr_private_t *priv = NULL; - int i = 0; - int ret = -1; - void *pending_raw = NULL; - int32_t *pending = NULL; + readable = alloca0 (priv->child_count); - GF_ASSERT (local); - GF_ASSERT (this); - GF_ASSERT (xattr); + afr_inode_read_subvol_get (local->loc.parent, this, readable, + NULL, &event); - priv = this->private; - - for (i = 0; i < priv->child_count; i++) { - ret = dict_get_ptr (xattr, priv->pending_key[i], - &pending_raw); - if (ret != 0) { - continue; - } - pending = pending_raw; + /* First, check if we have an ESTALE from somewhere, + If so, propagate that so that a revalidate can be + issued + */ + op_errno = afr_final_errno (frame->local, this->private); + local->op_errno = op_errno; + if (op_errno == ESTALE) { + local->op_errno = op_errno; + local->op_ret = -1; + goto unwind; + } - if (pending[AFR_METADATA_TRANSACTION]) { - gf_log(this->name, GF_LOG_DEBUG, - "metadata self-heal is pending for %s.", - local->loc.path); - local->self_heal.do_metadata_self_heal = _gf_true; - } + read_subvol = -1; + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) + continue; + + if (locked_entry && replies[i].op_ret == -1 && + replies[i].op_errno == ENOENT) { + /* Second, check entry is still + "underway" in creation */ + local->op_ret = -1; + local->op_errno = ENOENT; + read_subvol = i; + goto unwind; + } - if (pending[AFR_ENTRY_TRANSACTION]) { - gf_log(this->name, GF_LOG_DEBUG, - "entry self-heal is pending for %s.", - local->loc.path); - local->self_heal.do_entry_self_heal = _gf_true; - } + if (replies[i].op_ret == -1) + continue; - if (pending[AFR_DATA_TRANSACTION]) { - gf_log(this->name, GF_LOG_DEBUG, - "data self-heal is pending for %s.", - local->loc.path); - local->self_heal.do_data_self_heal = _gf_true; - } - } -} + if (read_subvol == -1 || !readable[read_subvol]) { + read_subvol = i; + uuid_copy (read_gfid, 
replies[i].poststat.ia_gfid); + local->op_ret = 0; + } + } -void -afr_lookup_check_set_metadata_split_brain (afr_local_t *local, xlator_t *this) -{ - int32_t *sources = NULL; - afr_private_t *priv = NULL; - int32_t subvol_status = 0; - int32_t *success_children = NULL; - dict_t **xattrs = NULL; - struct iatt *bufs = NULL; - int32_t **pending_matrix = NULL; + if (read_subvol == -1) + goto unwind; + /* We now have a read_subvol, which is readable[] (if there + were any). Next we look for GFID mismatches. We don't + consider a GFID mismatch as an error if read_subvol is + readable[] but the mismatching GFID subvol is not. + */ + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid || replies[i].op_ret == -1) { + if (priv->child_up[i]) + can_interpret = _gf_false; + continue; + } - priv = this->private; + if (!uuid_compare (replies[i].poststat.ia_gfid, + read_gfid)) + continue; - sources = GF_CALLOC (priv->child_count, sizeof (*sources), - gf_afr_mt_int32_t); - if (NULL == sources) - goto out; - success_children = local->cont.lookup.success_children; - xattrs = local->cont.lookup.xattrs; - bufs = local->cont.lookup.bufs; - pending_matrix = local->cont.lookup.pending_matrix; - afr_build_sources (this, xattrs, bufs, pending_matrix, - sources, success_children, AFR_METADATA_TRANSACTION, - &subvol_status, _gf_false); - if (subvol_status & SPLIT_BRAIN) - local->cont.lookup.possible_spb = _gf_true; -out: - GF_FREE (sources); -} + can_interpret = _gf_false; -static void -afr_detect_self_heal_by_iatt (afr_local_t *local, xlator_t *this, - struct iatt *buf, struct iatt *lookup_buf) -{ - if (PERMISSION_DIFFERS (buf, lookup_buf)) { - /* mismatching permissions */ - gf_log (this->name, GF_LOG_DEBUG, - "permissions differ for %s ", local->loc.path); - local->self_heal.do_metadata_self_heal = _gf_true; - } + if (locked_entry) + continue; - if (OWNERSHIP_DIFFERS (buf, lookup_buf)) { - /* mismatching permissions */ - local->self_heal.do_metadata_self_heal = _gf_true; - 
gf_log (this->name, GF_LOG_DEBUG, - "ownership differs for %s ", local->loc.path); - } + /* Now GFIDs mismatch. It's OK as long as this subvol + is not readable[] but read_subvol is */ + if (readable[read_subvol] && !readable[i]) + continue; - if (SIZE_DIFFERS (buf, lookup_buf) - && IA_ISREG (buf->ia_type)) { - gf_log (this->name, GF_LOG_DEBUG, - "size differs for %s ", local->loc.path); - local->self_heal.do_data_self_heal = _gf_true; - } + /* LOG ERROR */ + local->op_ret = -1; + local->op_errno = EIO; + goto unwind; + } - if (uuid_compare (buf->ia_gfid, lookup_buf->ia_gfid)) { - /* mismatching gfid */ - gf_log (this->name, GF_LOG_DEBUG, - "%s: gfid different on subvolume", local->loc.path); - } -} + /* Forth, for the finalized GFID, pick the best subvolume + to return stats from. + */ + if (can_interpret) { + /* It is safe to call afr_replies_interpret() because we have + a response from all the UP subvolumes and all of them resolved + to the same GFID + */ + if (afr_replies_interpret (frame, this, local->inode)) { + read_subvol = afr_data_subvol_get (local->inode, this, + 0, 0); + afr_inode_read_subvol_reset (local->inode, this); + goto cant_interpret; + } else { + read_subvol = afr_data_subvol_get (local->inode, this, + 0, 0); + } + } else { + cant_interpret: + if (read_subvol == -1) + dict_del (replies[0].xdata, GF_CONTENT_KEY); + else + dict_del (replies[read_subvol].xdata, GF_CONTENT_KEY); + } -static void -afr_detect_self_heal_by_split_brain_status (afr_local_t *local, xlator_t *this) -{ - gf_boolean_t split_brain = _gf_false; - afr_self_heal_t *sh = NULL; + afr_handle_quota_size (frame, this); - sh = &local->self_heal; +unwind: + if (read_subvol == -1) + read_subvol = 0; - split_brain = afr_is_split_brain (this, local->cont.lookup.inode); - split_brain = split_brain || local->cont.lookup.possible_spb; - if ((local->success_count > 0) && split_brain && - IA_ISREG (local->cont.lookup.inode->ia_type)) { - sh->force_confirm_spb = _gf_true; - gf_log 
(this->name, GF_LOG_DEBUG, - "split brain detected during lookup of %s.", - local->loc.path); - } + AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, + local->inode, &local->replies[read_subvol].poststat, + local->replies[read_subvol].xdata, + &local->replies[read_subvol].postparent); } -static void -afr_detect_self_heal_by_lookup_status (afr_local_t *local, xlator_t *this) -{ - GF_ASSERT (local); - GF_ASSERT (this); - - if ((local->success_count > 0) && (local->enoent_count > 0)) { - local->self_heal.do_metadata_self_heal = _gf_true; - local->self_heal.do_data_self_heal = _gf_true; - local->self_heal.do_entry_self_heal = _gf_true; - local->self_heal.do_gfid_self_heal = _gf_true; - local->self_heal.do_missing_entry_self_heal = _gf_true; - gf_log(this->name, GF_LOG_DEBUG, - "entries are missing in lookup of %s.", - local->loc.path); - } - - return; -} +/* + * During a lookup, some errors are more "important" than + * others in that they must be given higher priority while + * returning to the user. 
+ * + * The hierarchy is ESTALE > ENOENT > others + */ -gf_boolean_t -afr_can_self_heal_proceed (afr_self_heal_t *sh, afr_private_t *priv) +int +afr_higher_errno (int32_t old_errno, int32_t new_errno) { - GF_ASSERT (sh); - GF_ASSERT (priv); + if (old_errno == ENODATA || new_errno == ENODATA) + return ENODATA; + if (old_errno == ESTALE || new_errno == ESTALE) + return ESTALE; + if (old_errno == ENOENT || new_errno == ENOENT) + return ENOENT; - if (sh->force_confirm_spb) - return _gf_true; - return (sh->do_gfid_self_heal - || sh->do_missing_entry_self_heal - || (afr_data_self_heal_enabled (priv->data_self_heal) && - sh->do_data_self_heal) - || (priv->metadata_self_heal && sh->do_metadata_self_heal) - || (priv->entry_self_heal && sh->do_entry_self_heal)); + return new_errno; } -afr_transaction_type -afr_transaction_type_get (ia_type_t ia_type) -{ - afr_transaction_type type = AFR_METADATA_TRANSACTION; - GF_ASSERT (ia_type != IA_INVAL); +int +afr_final_errno (afr_local_t *local, afr_private_t *priv) +{ + int i = 0; + int op_errno = 0; + int tmp_errno = 0; + + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid) + continue; + if (local->replies[i].op_ret == 0) + continue; + tmp_errno = local->replies[i].op_errno; + op_errno = afr_higher_errno (op_errno, tmp_errno); + } - if (IA_ISDIR (ia_type)) { - type = AFR_ENTRY_TRANSACTION; - } else if (IA_ISREG (ia_type)) { - type = AFR_DATA_TRANSACTION; - } - return type; + return op_errno; } -int -afr_lookup_select_read_child (afr_local_t *local, xlator_t *this, - int32_t *read_child) +static int +get_pathinfo_host (char *pathinfo, char *hostname, size_t size) { - ia_type_t ia_type = IA_INVAL; - int32_t source = -1; - int ret = -1; - dict_t **xattrs = NULL; - int32_t *success_children = NULL; - afr_transaction_type type = AFR_METADATA_TRANSACTION; - uuid_t *gfid = NULL; - - GF_ASSERT (local); - GF_ASSERT (this); - GF_ASSERT (local->success_count > 0); + char *start = NULL; + char *end = NULL; + int ret = -1; 
+ int i = 0; - success_children = local->cont.lookup.success_children; - /*We can take the success_children[0] only because we already - *handle the conflicting children other wise, we could select the - *read_child based on wrong file type - */ - ia_type = local->cont.lookup.bufs[success_children[0]].ia_type; - type = afr_transaction_type_get (ia_type); - xattrs = local->cont.lookup.xattrs; - gfid = &local->cont.lookup.buf.ia_gfid; - source = afr_lookup_select_read_child_by_txn_type (this, local, xattrs, - type, *gfid); - if (source < 0) { - gf_log (this->name, GF_LOG_DEBUG, "failed to select source " - "for %s", local->loc.path); + if (!pathinfo) + goto out; + + start = strchr (pathinfo, ':'); + if (!start) + goto out; + end = strrchr (pathinfo, ':'); + if (start == end) goto out; - } - gf_log (this->name, GF_LOG_DEBUG, "Source selected as %d for %s", - source, local->loc.path); - *read_child = source; + memset (hostname, 0, size); + i = 0; + while (++start != end) + hostname[i++] = *start; ret = 0; out: return ret; } -static inline gf_boolean_t -afr_is_transaction_running (afr_local_t *local) -{ - GF_ASSERT (local->fop == GF_FOP_LOOKUP); - return ((local->inodelk_count > 0) || (local->entrylk_count > 0)); -} - -void -afr_launch_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode, - gf_boolean_t background, ia_type_t ia_type, char *reason, - void (*gfid_sh_success_cbk) (call_frame_t *sh_frame, - xlator_t *this), - int (*unwind) (call_frame_t *frame, xlator_t *this, - int32_t op_ret, int32_t op_errno, - int32_t sh_failed)) -{ - afr_local_t *local = NULL; - char sh_type_str[256] = {0,}; - char *bg = ""; - - GF_ASSERT (frame); - GF_ASSERT (this); - GF_ASSERT (inode); - GF_ASSERT (ia_type != IA_INVAL); - - local = frame->local; - local->self_heal.background = background; - local->self_heal.type = ia_type; - local->self_heal.unwind = unwind; - local->self_heal.gfid_sh_success_cbk = gfid_sh_success_cbk; - - afr_self_heal_type_str_get (&local->self_heal, - 
sh_type_str, - sizeof (sh_type_str)); - - if (background) - bg = "background"; - gf_log (this->name, GF_LOG_DEBUG, - "%s %s self-heal triggered. path: %s, reason: %s", bg, - sh_type_str, local->loc.path, reason); - - afr_self_heal (frame, this, inode); -} - -unsigned int -afr_gfid_missing_count (const char *xlator_name, int32_t *success_children, - struct iatt *bufs, unsigned int child_count, - const char *path) +int +afr_local_pathinfo (char *pathinfo, gf_boolean_t *local) { - unsigned int gfid_miss_count = 0; - int i = 0; - struct iatt *child1 = NULL; + int ret = 0; + char pathinfohost[1024] = {0}; + char localhost[1024] = {0}; + xlator_t *this = THIS; - for (i = 0; i < child_count; i++) { - if (success_children[i] == -1) - break; - child1 = &bufs[success_children[i]]; - if (uuid_is_null (child1->ia_gfid)) { - gf_log (xlator_name, GF_LOG_DEBUG, "%s: gfid is null" - " on subvolume %d", path, success_children[i]); - gfid_miss_count++; - } + *local = _gf_false; + ret = get_pathinfo_host (pathinfo, pathinfohost, sizeof (pathinfohost)); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Invalid pathinfo: %s", + pathinfo); + goto out; } - return gfid_miss_count; -} - -static int -afr_lookup_gfid_missing_count (afr_local_t *local, xlator_t *this) -{ - int32_t *success_children = NULL; - afr_private_t *priv = NULL; - struct iatt *bufs = NULL; - int miss_count = 0; - - priv = this->private; - bufs = local->cont.lookup.bufs; - success_children = local->cont.lookup.success_children; - - miss_count = afr_gfid_missing_count (this->name, success_children, - bufs, priv->child_count, - local->loc.path); - return miss_count; -} - -gf_boolean_t -afr_conflicting_iattrs (struct iatt *bufs, int32_t *success_children, - unsigned int child_count, const char *path, - const char *xlator_name) -{ - gf_boolean_t conflicting = _gf_false; - int i = 0; - struct iatt *child1 = NULL; - struct iatt *child2 = NULL; - uuid_t *gfid = NULL; - - for (i = 0; i < child_count; i++) { - if 
(success_children[i] == -1) - break; - child1 = &bufs[success_children[i]]; - if ((!gfid) && (!uuid_is_null (child1->ia_gfid))) - gfid = &child1->ia_gfid; - - if (i == 0) - continue; - - child2 = &bufs[success_children[i-1]]; - if (FILETYPE_DIFFERS (child1, child2)) { - gf_log (xlator_name, GF_LOG_DEBUG, "%s: filetype " - "differs on subvolumes (%d, %d)", path, - success_children[i-1], success_children[i]); - conflicting = _gf_true; - goto out; - } - if (!gfid || uuid_is_null (child1->ia_gfid)) - continue; - if (uuid_compare (*gfid, child1->ia_gfid)) { - gf_log (xlator_name, GF_LOG_DEBUG, "%s: gfid differs" - " on subvolume %d", path, success_children[i]); - conflicting = _gf_true; - goto out; - } + ret = gethostname (localhost, sizeof (localhost)); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "gethostname() failed, " + "reason: %s", strerror (errno)); + goto out; } -out: - return conflicting; -} -/* afr_update_gfid_from_iatts: This function should be called only if the - * iatts are not conflicting. 
- */ -void -afr_update_gfid_from_iatts (uuid_t uuid, struct iatt *bufs, - int32_t *success_children, unsigned int child_count) -{ - uuid_t *gfid = NULL; - int i = 0; - int child = 0; - - for (i = 0; i < child_count; i++) { - child = success_children[i]; - if (child == -1) - break; - if ((!gfid) && (!uuid_is_null (bufs[child].ia_gfid))) { - gfid = &bufs[child].ia_gfid; - } else if (gfid && (!uuid_is_null (bufs[child].ia_gfid))) { - if (uuid_compare (*gfid, bufs[child].ia_gfid)) { - GF_ASSERT (0); - goto out; - } - } - } - if (gfid && (!uuid_is_null (*gfid))) - uuid_copy (uuid, *gfid); + if (!strcmp (localhost, pathinfohost)) + *local = _gf_true; out: - return; -} - -static gf_boolean_t -afr_lookup_conflicting_entries (afr_local_t *local, xlator_t *this) -{ - afr_private_t *priv = NULL; - gf_boolean_t conflict = _gf_false; - - priv = this->private; - conflict = afr_conflicting_iattrs (local->cont.lookup.bufs, - local->cont.lookup.success_children, - priv->child_count, local->loc.path, - this->name); - return conflict; -} - -gf_boolean_t -afr_open_only_data_self_heal (char *data_self_heal) -{ - return !strcmp (data_self_heal, "open"); + return ret; } -gf_boolean_t -afr_data_self_heal_enabled (char *data_self_heal) +static int32_t +afr_local_discovery_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata) { - gf_boolean_t enabled = _gf_false; + int ret = 0; + char *pathinfo = NULL; + gf_boolean_t is_local = _gf_false; + afr_private_t *priv = NULL; + int32_t child_index = -1; - if (gf_string2boolean (data_self_heal, &enabled) == -1) { - enabled = !strcmp (data_self_heal, "open"); - GF_ASSERT (enabled); + if (op_ret != 0) { + goto out; } - return enabled; -} - -static void -afr_lookup_set_self_heal_params (afr_local_t *local, xlator_t *this) -{ - int i = 0; - struct iatt *bufs = NULL; - dict_t **xattr = NULL; - afr_private_t *priv = NULL; - int32_t child1 = -1; - int32_t child2 = -1; - afr_self_heal_t 
*sh = NULL; - - priv = this->private; - sh = &local->self_heal; - - afr_detect_self_heal_by_lookup_status (local, this); - - if (afr_lookup_gfid_missing_count (local, this)) - local->self_heal.do_gfid_self_heal = _gf_true; - - if (_gf_true == afr_lookup_conflicting_entries (local, this)) - local->self_heal.do_missing_entry_self_heal = _gf_true; - else - afr_update_gfid_from_iatts (local->self_heal.sh_gfid_req, - local->cont.lookup.bufs, - local->cont.lookup.success_children, - priv->child_count); - - bufs = local->cont.lookup.bufs; - for (i = 1; i < local->success_count; i++) { - child1 = local->cont.lookup.success_children[i-1]; - child2 = local->cont.lookup.success_children[i]; - afr_detect_self_heal_by_iatt (local, this, - &bufs[child1], &bufs[child2]); - } + priv = this->private; + child_index = (int32_t)(long)cookie; - xattr = local->cont.lookup.xattrs; - for (i = 0; i < local->success_count; i++) { - child1 = local->cont.lookup.success_children[i]; - afr_lookup_set_self_heal_params_by_xattr (local, this, - xattr[child1]); + ret = dict_get_str (dict, GF_XATTR_PATHINFO_KEY, &pathinfo); + if (ret != 0) { + goto out; } - if (afr_open_only_data_self_heal (priv->data_self_heal)) - sh->do_data_self_heal = _gf_false; - if (sh->do_metadata_self_heal) - afr_lookup_check_set_metadata_split_brain (local, this); - afr_detect_self_heal_by_split_brain_status (local, this); -} - -int -afr_self_heal_lookup_unwind (call_frame_t *frame, xlator_t *this, - int32_t op_ret, int32_t op_errno, - int32_t sh_failed) -{ - afr_local_t *local = NULL; - int ret = -1; - dict_t *xattr = NULL; - - local = frame->local; - - if (op_ret == -1) { - local->op_ret = -1; - local->op_errno = afr_most_important_error(local->op_errno, - op_errno, _gf_true); + ret = afr_local_pathinfo (pathinfo, &is_local); + if (ret) { goto out; - } else { - local->op_ret = 0; } - afr_lookup_done_success_action (frame, this, _gf_true); - xattr = local->cont.lookup.xattr; - if (xattr) { - ret = dict_set_int32 (xattr, 
"sh-failed", sh_failed); - if (ret) - gf_log (this->name, GF_LOG_ERROR, "%s: Failed to set " - "sh-failed to %d", local->loc.path, sh_failed); - - if (local->self_heal.actual_sh_started == _gf_true && - sh_failed == 0) { - ret = dict_set_int32 (xattr, "actual-sh-done", 1); - if (ret) - gf_log(this->name, GF_LOG_ERROR, "%s: Failed to" - " set actual-sh-done to %d", - local->loc.path, - local->self_heal.actual_sh_started); - } + /* + * Note that one local subvolume will override another here. The only + * way to avoid that would be to retain extra information about whether + * the previous read_child is local, and it's just not worth it. Even + * the slowest local subvolume is far preferable to a remote one. + */ + if (is_local) { + gf_log (this->name, GF_LOG_INFO, + "selecting local read_child %s", + priv->children[child_index]->name); + priv->read_child = child_index; } out: - AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, - local->cont.lookup.inode, &local->cont.lookup.buf, - local->cont.lookup.xattr, - &local->cont.lookup.postparent); - + STACK_DESTROY(frame->root); return 0; } -//TODO: At the moment only lookup needs this, so not doing any checks, in the -// future we will have to do fop specific operations -void -afr_post_gfid_sh_success (call_frame_t *sh_frame, xlator_t *this) +static void +afr_attempt_local_discovery (xlator_t *this, int32_t child_index) { - afr_local_t *local = NULL; - afr_local_t *sh_local = NULL; - afr_private_t *priv = NULL; - afr_self_heal_t *sh = NULL; - int i = 0; - struct iatt *lookup_bufs = NULL; - struct iatt *lookup_parentbufs = NULL; - - sh_local = sh_frame->local; - sh = &sh_local->self_heal; - local = sh->orig_frame->local; - lookup_bufs = local->cont.lookup.bufs; - lookup_parentbufs = local->cont.lookup.postparents; - priv = this->private; - - memcpy (lookup_bufs, sh->buf, priv->child_count * sizeof (*sh->buf)); - memcpy (lookup_parentbufs, sh->parentbufs, - priv->child_count * sizeof (*sh->parentbufs)); - - 
afr_reset_xattr (local->cont.lookup.xattrs, priv->child_count); - if (local->cont.lookup.xattr) { - dict_unref (local->cont.lookup.xattr); - local->cont.lookup.xattr = NULL; - } + call_frame_t *newframe = NULL; + loc_t tmploc = {0,}; + afr_private_t *priv = this->private; - for (i = 0; i < priv->child_count; i++) { - if (sh->xattr[i]) - local->cont.lookup.xattrs[i] = dict_ref (sh->xattr[i]); + newframe = create_frame(this,this->ctx->pool); + if (!newframe) { + return; } - afr_reset_children (local->cont.lookup.success_children, - priv->child_count); - afr_children_copy (local->cont.lookup.success_children, - sh->fresh_children, priv->child_count); + tmploc.gfid[sizeof(tmploc.gfid)-1] = 1; + STACK_WIND_COOKIE (newframe, afr_local_discovery_cbk, + (void *)(long)child_index, + priv->children[child_index], + priv->children[child_index]->fops->getxattr, + &tmploc, GF_XATTR_PATHINFO_KEY, NULL); } -static void -afr_lookup_perform_self_heal (call_frame_t *frame, xlator_t *this, - gf_boolean_t *sh_launched) -{ - unsigned int up_count = 0; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - char *reason = NULL; - - GF_ASSERT (sh_launched); - *sh_launched = _gf_false; - priv = this->private; - local = frame->local; - - up_count = afr_up_children_count (local->child_up, priv->child_count); - if (up_count == 1) { - gf_log (this->name, GF_LOG_DEBUG, - "Only 1 child up - do not attempt to detect self heal"); - goto out; - } - - afr_lookup_set_self_heal_params (local, this); - if (afr_can_self_heal_proceed (&local->self_heal, priv)) { - if (afr_is_transaction_running (local) && - /*Forcefully call afr_launch_self_heal (which will go on to - fail) for SB files.This prevents stale data being served - due to race in afr_is_transaction_running() when - multiple clients access the same SB file*/ - !local->cont.lookup.possible_spb && - (!local->attempt_self_heal)) - goto out; - reason = "lookup detected pending operations"; - afr_launch_self_heal (frame, this, 
local->cont.lookup.inode, - !local->foreground_self_heal, - local->cont.lookup.buf.ia_type, - reason, afr_post_gfid_sh_success, - afr_self_heal_lookup_unwind); - *sh_launched = _gf_true; - } -out: - return; -} - -void -afr_get_fresh_children (int32_t *success_children, int32_t *sources, - int32_t *fresh_children, unsigned int child_count) +int +afr_lookup_selfheal_wrap (void *opaque) { - unsigned int i = 0; - unsigned int j = 0; - - GF_ASSERT (success_children); - GF_ASSERT (sources); - GF_ASSERT (fresh_children); + call_frame_t *frame = opaque; + afr_local_t *local = NULL; + xlator_t *this = NULL; + inode_t *inode = NULL; - afr_reset_children (fresh_children, child_count); - for (i = 0; i < child_count; i++) { - if (success_children[i] == -1) - break; - if (afr_is_read_child (success_children, sources, child_count, - success_children[i])) { - fresh_children[j] = success_children[i]; - j++; - } - } -} + local = frame->local; + this = frame->this; -static int -afr_lookup_set_read_ctx (afr_local_t *local, xlator_t *this, int32_t read_child) -{ - afr_private_t *priv = NULL; + afr_selfheal_name (frame->this, local->loc.pargfid, local->loc.name); - GF_ASSERT (read_child >= 0); + afr_replies_wipe (local, this->private); - priv = this->private; - afr_get_fresh_children (local->cont.lookup.success_children, - local->cont.lookup.sources, - local->fresh_children, priv->child_count); - afr_inode_set_read_ctx (this, local->cont.lookup.inode, read_child, - local->fresh_children); + inode = afr_selfheal_unlocked_lookup_on (frame, local->loc.parent, + local->loc.name, local->replies, + local->child_up); + if (inode) + inode_unref (inode); + afr_lookup_done (frame, this); - return 0; + return 0; } + int -afr_lookup_done_success_action (call_frame_t *frame, xlator_t *this, - gf_boolean_t fail_conflict) +afr_lookup_entry_heal (call_frame_t *frame, xlator_t *this) { - int32_t read_child = -1; - int32_t ret = -1; - afr_local_t *local = NULL; - gf_boolean_t fresh_lookup = _gf_false; + 
afr_local_t *local = NULL; + afr_private_t *priv = NULL; + call_frame_t *heal = NULL; + int i = 0, first = -1; + gf_boolean_t need_heal = _gf_false; + struct afr_reply *replies = NULL; + int ret = 0; - local = frame->local; - fresh_lookup = local->cont.lookup.fresh_lookup; + local = frame->local; + replies = local->replies; + priv = this->private; - if (local->loc.parent == NULL) - fail_conflict = _gf_true; + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) + continue; - if (afr_lookup_conflicting_entries (local, this)) { - if (fail_conflict == _gf_false) - ret = 0; - goto out; - } + if (first == -1) { + first = i; + continue; + } - ret = afr_lookup_select_read_child (local, this, &read_child); - if (!afr_is_transaction_running (local) || fresh_lookup) { - if (read_child < 0) - goto out; + if (replies[i].op_ret != replies[first].op_ret) { + need_heal = _gf_true; + break; + } - ret = afr_lookup_set_read_ctx (local, this, read_child); - if (ret) - goto out; - } + if (uuid_compare (replies[i].poststat.ia_gfid, + replies[first].poststat.ia_gfid)) { + need_heal = _gf_true; + break; + } + } - ret = afr_lookup_build_response_params (local, this); - if (ret) - goto out; - afr_update_loc_gfids (&local->loc, - &local->cont.lookup.buf, - &local->cont.lookup.postparent); + if (need_heal) { + heal = copy_frame (frame); + if (heal) + heal->root->pid = -1; + ret = synctask_new (this->ctx->env, afr_lookup_selfheal_wrap, + afr_refresh_selfheal_done, heal, frame); + if (ret) + goto lookup_done; + } else { + lookup_done: + afr_lookup_done (frame, this); + } - ret = 0; -out: - if (ret) { - local->op_ret = -1; - local->op_errno = EIO; - } - return ret; + return ret; } + int -afr_lookup_get_latest_subvol (afr_local_t *local, xlator_t *this) +afr_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, struct iatt *buf, + dict_t *xdata, struct iatt *postparent) { - afr_private_t *priv = NULL; - int32_t *success_children 
= NULL; - struct iatt *bufs = NULL; - int i = 0; - int child = 0; - int lsubvol = -1; - - priv = this->private; - success_children = local->cont.lookup.success_children; - bufs = local->cont.lookup.bufs; - for (i = 0; i < priv->child_count; i++) { - child = success_children[i]; - if (child == -1) - break; - if (uuid_is_null (bufs[child].ia_gfid)) - continue; - if (lsubvol < 0) { - lsubvol = child; - } else if (bufs[lsubvol].ia_ctime < bufs[child].ia_ctime) { - lsubvol = child; - } else if ((bufs[lsubvol].ia_ctime == bufs[child].ia_ctime) && - (bufs[lsubvol].ia_ctime_nsec < bufs[child].ia_ctime_nsec)) { - lsubvol = child; - } - } - return lsubvol; -} + afr_local_t * local = NULL; + int call_count = -1; + int child_index = -1; -void -afr_lookup_mark_other_entries_stale (afr_local_t *local, xlator_t *this, - int subvol) -{ - afr_private_t *priv = NULL; - int32_t *success_children = NULL; - struct iatt *bufs = NULL; - int i = 0; - int child = 0; + child_index = (long) cookie; - priv = this->private; - success_children = local->cont.lookup.success_children; - bufs = local->cont.lookup.bufs; - memcpy (local->fresh_children, success_children, - sizeof (*success_children) * priv->child_count); - for (i = 0; i < priv->child_count; i++) { - child = local->fresh_children[i]; - if (child == -1) - break; - if (child == subvol) - continue; - if (uuid_is_null (bufs[child].ia_gfid) && - (bufs[child].ia_type == bufs[subvol].ia_type)) - continue; - afr_children_rm_child (success_children, child, - priv->child_count); - local->success_count--; - } - afr_reset_children (local->fresh_children, priv->child_count); -} + local = frame->local; -void -afr_succeed_lookup_on_latest_iatt (afr_local_t *local, xlator_t *this) -{ - int lsubvol = 0; + local->replies[child_index].valid = 1; + local->replies[child_index].op_ret = op_ret; + local->replies[child_index].op_errno = op_errno; + if (op_ret != -1) { + local->replies[child_index].poststat = *buf; + local->replies[child_index].postparent = 
*postparent; + if (xdata) + local->replies[child_index].xdata = dict_ref (xdata); + } - if (!afr_lookup_conflicting_entries (local, this)) - goto out; + call_count = afr_frame_return (frame); + if (call_count == 0) { + afr_lookup_entry_heal (frame, this); + } - lsubvol = afr_lookup_get_latest_subvol (local, this); - if (lsubvol < 0) - goto out; - afr_lookup_mark_other_entries_stale (local, this, lsubvol); -out: - return; + return 0; } -gf_boolean_t -afr_is_entry_possibly_under_creation (afr_local_t *local, xlator_t *this) -{ - /* - * We need to perform this test in lookup done and treat on going - * create/DELETE as ENOENT. - * Reason: - Multiple clients A, B and C are attempting 'mkdir -p /mnt/a/b/c' - - 1 Client A is in the middle of mkdir(/a). It has acquired lock. - It has performed mkdir(/a) on one subvol, and second one is still - in progress - 2 Client B performs a lookup, sees directory /a on one, - ENOENT on the other, succeeds lookup. - 3 Client B performs lookup on /a/b on both subvols, both return ENOENT - (one subvol because /a/b does not exist, another because /a - itself does not exist) - 4 Client B proceeds to mkdir /a/b. It obtains entrylk on inode=/a with - basename=b on one subvol, but fails on other subvol as /a is yet to - be created by Client A. - 5 Client A finishes mkdir of /a on other subvol - 6 Client C also attempts to create /a/b, lookup returns ENOENT on - both subvols. - 7 Client C tries to obtain entrylk on on inode=/a with basename=b, - obtains on one subvol (where B had failed), and waits for B to unlock - on other subvol. - 8 Client B finishes mkdir() on one subvol with GFID-1 and completes - transaction and unlocks - 9 Client C gets the lock on the second subvol, At this stage second - subvol already has /a/b created from Client B, but Client C does not - check that in the middle of mkdir transaction - 10 Client C attempts mkdir /a/b on both subvols. 
It succeeds on - ONLY ONE (where Client B could not get lock because of - missing parent /a dir) with GFID-2, and gets EEXIST from ONE subvol. - This way we have /a/b in GFID mismatch. One subvol got GFID-1 because - Client B performed transaction on only one subvol (because entrylk() - could not be obtained on second subvol because of missing parent dir -- - caused by premature/speculative succeeding of lookup() on /a when locks - are detected). Other subvol gets GFID-2 from Client C because while - it was waiting for entrylk() on both subvols, Client B was in the - middle of creating mkdir() on only one subvol, and Client C does not - "expect" this when it is between lock() and pre-op()/op() phase of the - transaction. - */ - if (local->cont.lookup.parent_entrylk && local->enoent_count) - return _gf_true; - - return _gf_false; -} static void -afr_lookup_done (call_frame_t *frame, xlator_t *this) +afr_discover_done (call_frame_t *frame, xlator_t *this) { - int unwind = 1; afr_private_t *priv = NULL; afr_local_t *local = NULL; - int ret = -1; - gf_boolean_t sh_launched = _gf_false; - gf_boolean_t fail_conflict = _gf_false; - int gfid_miss_count = 0; - int enotconn_count = 0; - int up_children_count = 0; + int i = -1; + int op_errno = 0; + int read_subvol = 0; priv = this->private; local = frame->local; - if (afr_is_entry_possibly_under_creation (local, this)) { - local->op_ret = -1; - local->op_errno = ENOENT; - goto unwind; + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid) + continue; + if (local->replies[i].op_ret == 0) + local->op_ret = 0; } - if (local->op_ret < 0) - goto unwind; + op_errno = afr_final_errno (frame->local, this->private); - if (local->cont.lookup.parent_entrylk && local->success_count > 1) - afr_succeed_lookup_on_latest_iatt (local, this); - - gfid_miss_count = afr_lookup_gfid_missing_count (local, this); - up_children_count = afr_up_children_count (local->child_up, - priv->child_count); - enotconn_count = 
priv->child_count - up_children_count; - if ((gfid_miss_count == local->success_count) && - (enotconn_count > 0)) { - local->op_ret = -1; - local->op_errno = EIO; - gf_log (this->name, GF_LOG_ERROR, "Failing lookup for %s, " - "LOOKUP on a file without gfid is not allowed when " - "some of the children are down", local->loc.path); - goto unwind; - } - - if ((gfid_miss_count == local->success_count) && - uuid_is_null (local->cont.lookup.gfid_req)) { - local->op_ret = -1; - local->op_errno = ENODATA; - gf_log (this->name, GF_LOG_ERROR, "%s: No gfid present", - local->loc.path); + if (local->op_ret < 0) { + local->op_errno = op_errno; + local->op_ret = -1; goto unwind; - } + } - if (gfid_miss_count && uuid_is_null (local->cont.lookup.gfid_req)) - fail_conflict = _gf_true; - ret = afr_lookup_done_success_action (frame, this, fail_conflict); - if (ret) - goto unwind; - uuid_copy (local->self_heal.sh_gfid_req, local->cont.lookup.gfid_req); + afr_replies_interpret (frame, this, local->inode); - afr_lookup_perform_self_heal (frame, this, &sh_launched); - if (sh_launched) { - unwind = 0; - goto unwind; - } + read_subvol = afr_data_subvol_get (local->inode, this, 0, 0); + if (read_subvol == -1) { + gf_log (this->name, GF_LOG_WARNING, "no read subvols for %s", + local->loc.path); - unwind: - if (unwind) { - AFR_STACK_UNWIND (lookup, frame, local->op_ret, - local->op_errno, local->cont.lookup.inode, - &local->cont.lookup.buf, - local->cont.lookup.xattr, - &local->cont.lookup.postparent); - } -} + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid || + local->replies[i].op_ret == -1) + continue; + read_subvol = i; + break; + } + } -/* - * During a lookup, some errors are more "important" than - * others in that they must be given higher priority while - * returning to the user. 
- * - * The hierarchy is ESTALE > EIO > ENOENT > others - */ -int32_t -afr_most_important_error(int32_t old_errno, int32_t new_errno, - gf_boolean_t eio) -{ - if (old_errno == ESTALE || new_errno == ESTALE) - return ESTALE; - if (eio && (old_errno == EIO || new_errno == EIO)) - return EIO; - if (old_errno == ENOENT || new_errno == ENOENT) - return ENOENT; +unwind: + if (read_subvol == -1) + read_subvol = 0; - return new_errno; + AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, + local->inode, &local->replies[read_subvol].poststat, + local->replies[read_subvol].xdata, + &local->replies[read_subvol].postparent); } -int32_t -afr_resultant_errno_get (int32_t *children, - int *child_errno, unsigned int child_count) -{ - int i = 0; - int32_t op_errno = 0; - int child = 0; - - for (i = 0; i < child_count; i++) { - if (children) { - child = children[i]; - if (child == -1) - break; - } else { - child = i; - } - op_errno = afr_most_important_error(op_errno, - child_errno[child], - _gf_false); - } - return op_errno; -} -static void -afr_lookup_handle_error (afr_local_t *local, int32_t op_ret, int32_t op_errno) +int +afr_discover_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, struct iatt *buf, + dict_t *xdata, struct iatt *postparent) { - GF_ASSERT (local); - if (op_errno == ENOENT) - local->enoent_count++; + afr_local_t * local = NULL; + int call_count = -1; + int child_index = -1; - local->op_errno = afr_most_important_error(local->op_errno, op_errno, - _gf_false); + child_index = (long) cookie; - if (local->op_errno == ESTALE) { - local->op_ret = -1; - } -} + local = frame->local; -static void -afr_set_root_inode_on_first_lookup (afr_local_t *local, xlator_t *this, - inode_t *inode) -{ - afr_private_t *priv = NULL; - GF_ASSERT (inode); + local->replies[child_index].valid = 1; + local->replies[child_index].op_ret = op_ret; + local->replies[child_index].op_errno = op_errno; + if (op_ret != -1) { + 
local->replies[child_index].poststat = *buf; + local->replies[child_index].postparent = *postparent; + if (xdata) + local->replies[child_index].xdata = dict_ref (xdata); + } - if (!__is_root_gfid (inode->gfid)) - goto out; - if (!afr_is_fresh_lookup (&local->loc, this)) - goto out; - priv = this->private; - if ((priv->first_lookup)) { - gf_log (this->name, GF_LOG_INFO, "added root inode"); - priv->root_inode = inode_ref (inode); - priv->first_lookup = 0; + if (local->do_discovery && (op_ret == 0)) + afr_attempt_local_discovery (this, child_index); + + call_count = afr_frame_return (frame); + if (call_count == 0) { + afr_discover_done (frame, this); } -out: - return; -} -static void -afr_lookup_cache_args (afr_local_t *local, int child_index, dict_t *xattr, - struct iatt *buf, struct iatt *postparent) -{ - GF_ASSERT (child_index >= 0); - local->cont.lookup.xattrs[child_index] = dict_ref (xattr); - local->cont.lookup.postparents[child_index] = *postparent; - local->cont.lookup.bufs[child_index] = *buf; + return 0; } -static void -afr_lookup_handle_first_success (afr_local_t *local, xlator_t *this, - inode_t *inode, struct iatt *buf) -{ - local->cont.lookup.inode = inode_ref (inode); - local->cont.lookup.buf = *buf; - afr_set_root_inode_on_first_lookup (local, this, inode); -} -static int32_t -afr_discovery_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *dict, - dict_t *xdata) +int +afr_discover_do (call_frame_t *frame, xlator_t *this, int err) { - int ret = 0; - char *pathinfo = NULL; - gf_boolean_t is_local = _gf_false; - afr_private_t *priv = NULL; - int32_t child_index = -1; + int ret = 0; + int i = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = 0; - if (op_ret != 0) { - goto out; - } + local = frame->local; + priv = this->private; - ret = dict_get_str (dict, GF_XATTR_PATHINFO_KEY, &pathinfo); - if (ret != 0) { - goto out; - } + if (err) { + local->op_errno = -err; + ret = -1; + 
goto out; + } - ret = afr_local_pathinfo (pathinfo, &is_local); + call_count = local->call_count = AFR_COUNT (local->child_up, + priv->child_count); + + ret = afr_lookup_xattr_req_prepare (local, this, local->xattr_req, + &local->loc); if (ret) { + local->op_errno = -ret; + ret = -1; goto out; } - priv = this->private; - /* - * Note that one local subvolume will override another here. The only - * way to avoid that would be to retain extra information about whether - * the previous read_child is local, and it's just not worth it. Even - * the slowest local subvolume is far preferable to a remote one. - */ - if (is_local) { - child_index = (int32_t)(long)cookie; - gf_log (this->name, GF_LOG_INFO, - "selecting local read_child %s", - priv->children[child_index]->name); - priv->read_child = child_index; + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_discover_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->lookup, + &local->loc, local->xattr_req); + if (!--call_count) + break; + } } + return 0; out: - STACK_DESTROY(frame->root); - return 0; + AFR_STACK_UNWIND (lookup, frame, -1, local->op_errno, 0, 0, 0, 0); + return 0; } -static void -afr_attempt_local_discovery (xlator_t *this, int32_t child_index) + +int +afr_discover (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req) { - call_frame_t *newframe = NULL; - loc_t tmploc = {0,}; - afr_private_t *priv = this->private; + int op_errno = ENOMEM; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int event = 0; - newframe = create_frame(this,this->ctx->pool); - if (!newframe) { - return; - } + priv = this->private; - tmploc.gfid[sizeof(tmploc.gfid)-1] = 1; - STACK_WIND_COOKIE (newframe, afr_discovery_cbk, - (void *)(long)child_index, - priv->children[child_index], - priv->children[child_index]->fops->getxattr, - &tmploc, GF_XATTR_PATHINFO_KEY, NULL); -} - -static void -afr_lookup_handle_success (afr_local_t *local, 
xlator_t *this, int32_t child_index, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, dict_t *xattr, - struct iatt *postparent) -{ - afr_private_t *priv = this->private; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; - if (local->success_count == 0) { - if (local->op_errno != ESTALE) { - local->op_ret = op_ret; - local->op_errno = 0; - } - afr_lookup_handle_first_success (local, this, inode, buf); + if (!local->call_count) { + op_errno = ENOTCONN; + goto out; } - afr_lookup_update_lk_counts (local, this, - child_index, xattr); - afr_lookup_cache_args (local, child_index, xattr, - buf, postparent); + if (__is_root_gfid (loc->inode->gfid)) { + if (!this->itable) + this->itable = loc->inode->table; + if (!priv->root_inode) + priv->root_inode = inode_ref (loc->inode); - if (local->do_discovery && (priv->read_child == (-1))) { - afr_attempt_local_discovery(this,child_index); - } + if (priv->choose_local && !priv->did_discovery) { + /* Logic to detect which subvolumes of AFR are + local, in order to prefer them for reads + */ + local->do_discovery = _gf_true; + priv->did_discovery = _gf_true; + } + } - local->cont.lookup.success_children[local->success_count] = child_index; - local->success_count++; -} + local->op = GF_FOP_LOOKUP; -int -afr_lookup_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *buf, dict_t *xattr, - struct iatt *postparent) -{ - afr_local_t * local = NULL; - int call_count = -1; - int child_index = -1; + loc_copy (&local->loc, loc); - child_index = (long) cookie; + local->inode = inode_ref (loc->inode); - LOCK (&frame->lock); - { - local = frame->local; + if (xattr_req) + /* If xattr_req was null, afr_lookup_xattr_req_prepare() will + allocate one for us */ + local->xattr_req = dict_ref (xattr_req); - if (op_ret == -1) { - afr_lookup_handle_error (local, op_ret, op_errno); - goto unlock; - } - afr_lookup_handle_success (local, this, 
child_index, op_ret, - op_errno, inode, buf, xattr, - postparent); + if (uuid_is_null (loc->inode->gfid)) { + afr_discover_do (frame, this, 0); + return 0; + } - } -unlock: - UNLOCK (&frame->lock); + afr_read_subvol_get (loc->inode, this, NULL, &event, + AFR_DATA_TRANSACTION); - call_count = afr_frame_return (frame); - if (call_count == 0) { - afr_lookup_done (frame, this); - } + if (event != local->event_generation) + afr_inode_refresh (frame, this, loc->inode, afr_discover_do); + else + afr_discover_do (frame, this, 0); - return 0; + return 0; +out: + AFR_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL); + return 0; } + int -afr_lookup_cont_init (afr_local_t *local, unsigned int child_count) +afr_lookup_do (call_frame_t *frame, xlator_t *this, int err) { - int ret = -ENOMEM; - struct iatt *iatts = NULL; - int32_t *success_children = NULL; - int32_t *sources = NULL; - int32_t **pending_matrix = NULL; - - GF_ASSERT (local); - local->cont.lookup.xattrs = GF_CALLOC (child_count, - sizeof (*local->cont.lookup.xattr), - gf_afr_mt_dict_t); - if (NULL == local->cont.lookup.xattrs) - goto out; - - iatts = GF_CALLOC (child_count, sizeof (*iatts), gf_afr_mt_iatt); - if (NULL == iatts) - goto out; - local->cont.lookup.postparents = iatts; + int ret = 0; + int i = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = 0; - iatts = GF_CALLOC (child_count, sizeof (*iatts), gf_afr_mt_iatt); - if (NULL == iatts) - goto out; - local->cont.lookup.bufs = iatts; + local = frame->local; + priv = this->private; - success_children = afr_children_create (child_count); - if (NULL == success_children) - goto out; - local->cont.lookup.success_children = success_children; + if (err < 0) { + local->op_errno = -err; + ret = -1; + goto out; + } - local->fresh_children = afr_children_create (child_count); - if (NULL == local->fresh_children) - goto out; + call_count = local->call_count = AFR_COUNT (local->child_up, + priv->child_count); - sources = 
GF_CALLOC (sizeof (*sources), child_count, gf_afr_mt_int32_t); - if (NULL == sources) - goto out; - local->cont.lookup.sources = sources; - - pending_matrix = afr_matrix_create (child_count, child_count); - if (NULL == pending_matrix) + ret = afr_lookup_xattr_req_prepare (local, this, local->xattr_req, + &local->loc); + if (ret) { + local->op_errno = -ret; + ret = -1; goto out; - local->cont.lookup.pending_matrix = pending_matrix; + } - ret = 0; + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_lookup_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->lookup, + &local->loc, local->xattr_req); + if (!--call_count) + break; + } + } + return 0; out: - return ret; + AFR_STACK_UNWIND (lookup, frame, -1, local->op_errno, 0, 0, 0, 0); + return 0; } +/* + * afr_lookup() + * + * The goal here is to figure out what the element getting looked up is. + * i.e what is the GFID, inode type and a conservative estimate of the + * inode attributes are. + * + * As we lookup, operations may be underway on the entry name and the + * inode. In lookup() we are primarily concerned only with the entry + * operations. If the entry is getting unlinked or renamed, we detect + * what operation is underway by querying for on-going transactions and + * pending self-healing on the entry through xdata. + * + * If the entry is a file/dir, it may need self-heal and/or in a + * split-brain condition. Lookup is not the place to worry about these + * conditions. Outcast marking will naturally handle them in the read + * paths. + * + * Here is a brief goal of what we are trying to achieve: + * + * - LOOKUP on all subvolumes concurrently, querying on-going transaction + * and pending self-heal info from the servers. + * + * - If all servers reply the same inode type and GFID, the overall call + * MUST be a success. 
+ * + * - If inode types or GFIDs mismatch, and there IS either an on-going + * transaction or pending self-heal, inspect what the nature of the + * transaction or pending heal is, and select the appropriate subvolume's + * reply as the winner. + * + * - If inode types or GFIDs mismatch, and there are no on-going transactions + * or pending self-heal on the entry name on any of the servers, fail the + * lookup with EIO. Something has gone wrong beyond reasonable action. + */ + int -afr_lookup (call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *xattr_req) +afr_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - void *gfid_req = NULL; - int ret = -1; - int i = 0; - int call_count = 0; - uint64_t ctx = 0; - int32_t op_errno = 0; - priv = this->private; - - AFR_LOCAL_ALLOC_OR_GOTO (local, out); + afr_local_t *local = NULL; + int32_t op_errno = 0; + int event = 0; - local->op_ret = -1; + if (!loc->parent) { + afr_discover (frame, this, loc, xattr_req); + return 0; + } - frame->local = local; - local->fop = GF_FOP_LOOKUP; + if (__is_root_gfid (loc->parent->gfid)) { + if (!strcmp (loc->name, GF_REPLICATE_TRASH_DIR)) { + op_errno = EPERM; + goto out; + } + } - loc_copy (&local->loc, loc); - ret = loc_path (&local->loc, NULL); - if (ret < 0) { - op_errno = EINVAL; - goto out; - } + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; - if (local->loc.path && - (strcmp (local->loc.path, "/" GF_REPLICATE_TRASH_DIR) == 0)) { - op_errno = EPERM; - ret = -1; + if (!local->call_count) { + op_errno = ENOTCONN; goto out; } - ret = inode_ctx_get (local->loc.inode, this, &ctx); - if (ret == 0) { - /* lookup is a revalidate */ + local->op = GF_FOP_LOOKUP; - local->read_child_index = afr_inode_get_read_ctx (this, - local->loc.inode, - NULL); - } else { - LOCK (&priv->read_child_lock); - { - if (priv->hash_mode) { - local->read_child_index = -1; - } - else { - 
local->read_child_index = - (++priv->read_child_rr) % - (priv->child_count); - } - } - UNLOCK (&priv->read_child_lock); - local->cont.lookup.fresh_lookup = _gf_true; - } + loc_copy (&local->loc, loc); - local->child_up = memdup (priv->child_up, - sizeof (*local->child_up) * priv->child_count); - if (NULL == local->child_up) { - op_errno = ENOMEM; - goto out; - } + local->inode = inode_ref (loc->inode); - ret = afr_lookup_cont_init (local, priv->child_count); - if (ret < 0) { - op_errno = -ret; - goto out; - } + if (xattr_req) + /* If xattr_req was null, afr_lookup_xattr_req_prepare() will + allocate one for us */ + local->xattr_req = dict_ref (xattr_req); - local->call_count = afr_up_children_count (local->child_up, - priv->child_count); - call_count = local->call_count; - if (local->call_count == 0) { - ret = -1; - op_errno = ENOTCONN; - goto out; - } + afr_read_subvol_get (loc->parent, this, NULL, &event, + AFR_DATA_TRANSACTION); - /* By default assume ENOTCONN. On success it will be set to 0. 
*/ - local->op_errno = ENOTCONN; + if (event != local->event_generation) + afr_inode_refresh (frame, this, loc->parent, afr_lookup_do); + else + afr_lookup_do (frame, this, 0); - ret = dict_get_int32 (xattr_req, "attempt-self-heal", - &local->attempt_self_heal); - dict_del (xattr_req, "attempt-self-heal"); + return 0; +out: + AFR_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL); - ret = dict_get_int32 (xattr_req, "foreground-self-heal", - &local->foreground_self_heal); - dict_del (xattr_req, "foreground-self-heal"); + return 0; +} - ret = afr_lookup_xattr_req_prepare (local, this, xattr_req, &local->loc, - &gfid_req); - if (ret) { - local->op_errno = -ret; - goto out; - } - afr_lookup_save_gfid (local->cont.lookup.gfid_req, gfid_req, - &local->loc); - local->fop = GF_FOP_LOOKUP; - if (priv->choose_local && !priv->did_discovery) { - if (gfid_req && __is_root_gfid(gfid_req)) { - local->do_discovery = _gf_true; - priv->did_discovery = _gf_true; - } - } - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_lookup_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->lookup, - &local->loc, local->xattr_req); - if (!--call_count) - break; - } + +/* {{{ open */ + +afr_fd_ctx_t * +__afr_fd_ctx_get (fd_t *fd, xlator_t *this) +{ + uint64_t ctx = 0; + int ret = 0; + afr_fd_ctx_t *fd_ctx = NULL; + + ret = __fd_ctx_get (fd, this, &ctx); + + if (ret < 0) { + ret = __afr_fd_ctx_set (this, fd); + if (ret < 0) + goto out; + + ret = __fd_ctx_get (fd, this, &ctx); + if (ret < 0) + goto out; } - ret = 0; + fd_ctx = (afr_fd_ctx_t *)(long) ctx; out: - if (ret) - AFR_STACK_UNWIND (lookup, frame, -1, op_errno, - NULL, NULL, NULL, NULL); - - return 0; + return fd_ctx; } -/* {{{ open */ +afr_fd_ctx_t * +afr_fd_ctx_get (fd_t *fd, xlator_t *this) +{ + afr_fd_ctx_t *fd_ctx = NULL; + + LOCK(&fd->lock); + { + fd_ctx = __afr_fd_ctx_get (fd, this); + } + UNLOCK(&fd->lock); + + return fd_ctx; +} + int 
__afr_fd_ctx_set (xlator_t *this, fd_t *fd) @@ -2559,6 +1911,7 @@ __afr_fd_ctx_set (xlator_t *this, fd_t *fd) int ret = -1; uint64_t ctx = 0; afr_fd_ctx_t * fd_ctx = NULL; + int i = 0; VALIDATE_OR_GOTO (this->private, out); VALIDATE_OR_GOTO (fd, out); @@ -2577,21 +1930,15 @@ __afr_fd_ctx_set (xlator_t *this, fd_t *fd) goto out; } - fd_ctx->pre_op_done = GF_CALLOC (sizeof (*fd_ctx->pre_op_done), - priv->child_count, - gf_afr_mt_char); - if (!fd_ctx->pre_op_done) { - ret = -ENOMEM; - goto out; - } - - fd_ctx->pre_op_piggyback = GF_CALLOC (sizeof (*fd_ctx->pre_op_piggyback), - priv->child_count, - gf_afr_mt_char); - if (!fd_ctx->pre_op_piggyback) { - ret = -ENOMEM; - goto out; - } + for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) { + fd_ctx->pre_op_done[i] = GF_CALLOC (sizeof (*fd_ctx->pre_op_done[i]), + priv->child_count, + gf_afr_mt_int32_t); + if (!fd_ctx->pre_op_done[i]) { + ret = -ENOMEM; + goto out; + } + } fd_ctx->opened_on = GF_CALLOC (sizeof (*fd_ctx->opened_on), priv->child_count, @@ -2601,6 +1948,13 @@ __afr_fd_ctx_set (xlator_t *this, fd_t *fd) goto out; } + for (i = 0; i < priv->child_count; i++) { + if (fd_is_anonymous (fd)) + fd_ctx->opened_on[i] = AFR_FD_OPENED; + else + fd_ctx->opened_on[i] = AFR_FD_NOT_OPENED; + } + fd_ctx->lock_piggyback = GF_CALLOC (sizeof (*fd_ctx->lock_piggyback), priv->child_count, gf_afr_mt_char); @@ -2617,20 +1971,7 @@ __afr_fd_ctx_set (xlator_t *this, fd_t *fd) goto out; } - fd_ctx->up_count = priv->up_count; - fd_ctx->down_count = priv->down_count; - - fd_ctx->locked_on = GF_CALLOC (sizeof (*fd_ctx->locked_on), - priv->child_count, - gf_afr_mt_char); - if (!fd_ctx->locked_on) { - ret = -ENOMEM; - goto out; - } - pthread_mutex_init (&fd_ctx->delay_lock, NULL); - INIT_LIST_HEAD (&fd_ctx->entries); - fd_ctx->call_child = -1; INIT_LIST_HEAD (&fd_ctx->eager_locked); @@ -2660,32 +2001,31 @@ afr_fd_ctx_set (xlator_t *this, fd_t *fd) /* {{{ flush */ int -afr_flush_cbk(call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, 
int32_t op_errno, dict_t *xdata) +afr_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_local_t * local = NULL; - int call_count = -1; + afr_local_t *local = NULL; + int call_count = -1; local = frame->local; LOCK (&frame->lock); { if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - } - local->success_count++; - } - - local->op_errno = op_errno; + local->op_ret = op_ret; + if (!local->xdata_rsp && xdata) + local->xdata_rsp = dict_ref (xdata); + } else { + local->op_errno = op_errno; + } } UNLOCK (&frame->lock); call_count = afr_frame_return (frame); if (call_count == 0) - AFR_STACK_UNWIND(flush, frame, local->op_ret, - local->op_errno, NULL); + AFR_STACK_UNWIND (flush, frame, local->op_ret, + local->op_errno, local->xdata_rsp); return 0; } @@ -2708,7 +2048,7 @@ afr_flush_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) (void *) (long) i, priv->children[i], priv->children[i]->fops->flush, - local->fd, NULL); + local->fd, xdata); if (!--call_count) break; @@ -2721,40 +2061,30 @@ afr_flush_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) int afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { - afr_private_t *priv = NULL; afr_local_t *local = NULL; call_stub_t *stub = NULL; - int ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + int op_errno = ENOMEM; - priv = this->private; - - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; - ret = afr_local_init(local, priv, &op_errno); - if (ret < 0) + if (!local->call_count) { + op_errno = ENOTCONN; goto out; + } local->fd = fd_ref(fd); + stub = fop_flush_stub (frame, afr_flush_wrapper, fd, xdata); - if (!stub) { - ret = -1; - op_errno = ENOMEM; + if (!stub) goto out; - } 
afr_delayed_changelog_wake_resume (this, fd, stub); - ret = 0; + return 0; out: - if (ret < 0) - AFR_STACK_UNWIND(flush, frame, -1, op_errno, NULL); - + AFR_STACK_UNWIND (flush, frame, -1, op_errno, NULL); return 0; } @@ -2767,6 +2097,7 @@ afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd) uint64_t ctx = 0; afr_fd_ctx_t *fd_ctx = NULL; int ret = 0; + int i = 0; ret = fd_ctx_get (fd, this, &ctx); if (ret < 0) @@ -2775,13 +2106,11 @@ afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd) fd_ctx = (afr_fd_ctx_t *)(long) ctx; if (fd_ctx) { - GF_FREE (fd_ctx->pre_op_done); + for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) + GF_FREE (fd_ctx->pre_op_done[i]); GF_FREE (fd_ctx->opened_on); - GF_FREE (fd_ctx->locked_on); - - GF_FREE (fd_ctx->pre_op_piggyback); GF_FREE (fd_ctx->lock_piggyback); GF_FREE (fd_ctx->lock_acquired); @@ -2799,24 +2128,8 @@ out: int afr_release (xlator_t *this, fd_t *fd) { - afr_locked_fd_t *locked_fd = NULL; - afr_locked_fd_t *tmp = NULL; - afr_private_t *priv = NULL; - - priv = this->private; - afr_cleanup_fd_ctx (this, fd); - list_for_each_entry_safe (locked_fd, tmp, &priv->saved_fds, - list) { - - if (locked_fd->fd == fd) { - list_del_init (&locked_fd->list); - GF_FREE (locked_fd); - } - - } - return 0; } @@ -2841,36 +2154,38 @@ afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, afr_local_t *local = NULL; int call_count = -1; int child_index = (long) cookie; - int read_child = 0; + int read_subvol = 0; call_stub_t *stub = NULL; local = frame->local; - read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); + read_subvol = afr_data_subvol_get (local->inode, this, 0, 0); LOCK (&frame->lock); { - if (child_index == read_child) { - local->read_child_returned = _gf_true; - } - if (op_ret == 0) { - local->op_ret = 0; + if (local->op_ret == -1) { + local->op_ret = 0; - if (local->success_count == 0) { local->cont.inode_wfop.prebuf = *prebuf; local->cont.inode_wfop.postbuf = *postbuf; + + if (xdata) + local->xdata_rsp = dict_ref (xdata); } - if 
(child_index == read_child) { + if (child_index == read_subvol) { local->cont.inode_wfop.prebuf = *prebuf; local->cont.inode_wfop.postbuf = *postbuf; + if (xdata) { + if (local->xdata_rsp) + dict_unref (local->xdata_rsp); + local->xdata_rsp = dict_ref (xdata); + } } - - local->success_count++; - } - - local->op_errno = op_errno; + } else { + local->op_errno = op_errno; + } } UNLOCK (&frame->lock); @@ -2890,7 +2205,7 @@ afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->op_ret, local->op_errno, &local->cont.inode_wfop.prebuf, &local->cont.inode_wfop.postbuf, - xdata); + local->xdata_rsp); if (!stub) { AFR_STACK_UNWIND (fsync, frame, -1, ENOMEM, 0, 0, 0); return 0; @@ -2910,37 +2225,35 @@ afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int -afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, - int32_t datasync, dict_t *xdata) +afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, + dict_t *xdata) { - afr_private_t *priv = NULL; + afr_private_t *priv = NULL; afr_local_t *local = NULL; - int ret = -1; int i = 0; int32_t call_count = 0; - int32_t op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; + int32_t op_errno = ENOMEM; - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; + priv = this->private; - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; call_count = local->call_count; + if (!call_count) { + op_errno = ENOTCONN; + goto out; + } - local->fd = fd_ref (fd); + local->fd = fd_ref (fd); if (afr_fd_has_witnessed_unstable_write (this, fd)) { /* don't care. 
we only wanted to CLEAR the bit */ } + local->inode = inode_ref (fd->inode); + for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { STACK_WIND_COOKIE (frame, afr_fsync_cbk, @@ -2953,10 +2266,10 @@ afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, } } - ret = 0; + return 0; out: - if (ret < 0) - AFR_STACK_UNWIND (fsync, frame, -1, op_errno, NULL, NULL, NULL); + AFR_STACK_UNWIND (fsync, frame, -1, op_errno, NULL, NULL, NULL); + return 0; } @@ -2964,10 +2277,9 @@ out: /* {{{ fsync */ -int32_t -afr_fsyncdir_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *xdata) +int +afr_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { afr_local_t *local = NULL; int call_count = -1; @@ -2976,10 +2288,13 @@ afr_fsyncdir_cbk (call_frame_t *frame, void *cookie, LOCK (&frame->lock); { - if (op_ret == 0) + if (op_ret == 0) { local->op_ret = 0; - - local->op_errno = op_errno; + if (!local->xdata_rsp && xdata) + local->xdata_rsp = dict_ref (xdata); + } else { + local->op_errno = op_errno; + } } UNLOCK (&frame->lock); @@ -2987,37 +2302,33 @@ afr_fsyncdir_cbk (call_frame_t *frame, void *cookie, if (call_count == 0) AFR_STACK_UNWIND (fsyncdir, frame, local->op_ret, - local->op_errno, xdata); + local->op_errno, local->xdata_rsp); return 0; } -int32_t -afr_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, - int32_t datasync, dict_t *xdata) +int +afr_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, + dict_t *xdata) { - afr_private_t *priv = NULL; + afr_private_t *priv = NULL; afr_local_t *local = NULL; - int ret = -1; int i = 0; int32_t call_count = 0; - int32_t op_errno = 0; + int32_t op_errno = ENOMEM; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + priv = this->private; - priv = this->private; - - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = 
frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; call_count = local->call_count; + if (!call_count) { + op_errno = ENOTCONN; + goto out; + } for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { @@ -3030,10 +2341,10 @@ afr_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, } } - ret = 0; + return 0; out: - if (ret < 0) - AFR_STACK_UNWIND (fsyncdir, frame, -1, op_errno, NULL); + AFR_STACK_UNWIND (fsyncdir, frame, -1, op_errno, NULL); + return 0; } @@ -3056,6 +2367,10 @@ afr_xattrop_cbk (call_frame_t *frame, void *cookie, if (op_ret == 0) { if (!local->cont.xattrop.xattr) local->cont.xattrop.xattr = dict_ref (xattr); + + if (!local->xdata_rsp && xdata) + local->xdata_rsp = dict_ref (xdata); + local->op_ret = 0; } @@ -3067,7 +2382,7 @@ afr_xattrop_cbk (call_frame_t *frame, void *cookie, if (call_count == 0) AFR_STACK_UNWIND (xattrop, frame, local->op_ret, local->op_errno, - local->cont.xattrop.xattr, xdata); + local->cont.xattrop.xattr, local->xdata_rsp); return 0; } @@ -3079,25 +2394,21 @@ afr_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, { afr_private_t *priv = NULL; afr_local_t *local = NULL; - int ret = -1; int i = 0; int32_t call_count = 0; - int32_t op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + int32_t op_errno = ENOMEM; priv = this->private; - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; call_count = local->call_count; + if (!call_count) { + op_errno = ENOTCONN; + goto out; + } for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { @@ -3110,10 +2421,10 @@ afr_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, } } - ret = 0; + return 0; out: - if (ret < 
0) - AFR_STACK_UNWIND (xattrop, frame, -1, op_errno, NULL, NULL); + AFR_STACK_UNWIND (xattrop, frame, -1, op_errno, NULL, NULL); + return 0; } @@ -3138,6 +2449,8 @@ afr_fxattrop_cbk (call_frame_t *frame, void *cookie, if (!local->cont.fxattrop.xattr) local->cont.fxattrop.xattr = dict_ref (xattr); + if (!local->xdata_rsp && xdata) + local->xdata_rsp = dict_ref (xdata); local->op_ret = 0; } @@ -3149,7 +2462,7 @@ afr_fxattrop_cbk (call_frame_t *frame, void *cookie, if (call_count == 0) AFR_STACK_UNWIND (fxattrop, frame, local->op_ret, local->op_errno, - local->cont.fxattrop.xattr, xdata); + local->cont.fxattrop.xattr, local->xdata_rsp); return 0; } @@ -3161,25 +2474,21 @@ afr_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd, { afr_private_t *priv = NULL; afr_local_t *local = NULL; - int ret = -1; int i = 0; int32_t call_count = 0; int32_t op_errno = 0; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - priv = this->private; - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) goto out; call_count = local->call_count; + if (!call_count) { + op_errno = ENOTCONN; + goto out; + } for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { @@ -3192,10 +2501,10 @@ afr_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd, } } - ret = 0; + return 0; out: - if (ret < 0) - AFR_STACK_UNWIND (fxattrop, frame, -1, op_errno, NULL, NULL); + AFR_STACK_UNWIND (fxattrop, frame, -1, op_errno, NULL, NULL); + return 0; } @@ -3203,8 +2512,8 @@ out: int32_t -afr_inodelk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) +afr_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { afr_local_t *local = NULL; @@ -3238,25 +2547,21 @@ afr_inodelk (call_frame_t *frame, 
xlator_t *this, { afr_private_t *priv = NULL; afr_local_t *local = NULL; - int ret = -1; int i = 0; int32_t call_count = 0; - int32_t op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + int32_t op_errno = ENOMEM; priv = this->private; - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) goto out; call_count = local->call_count; + if (!call_count) { + op_errno = ENOMEM; + goto out; + } for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { @@ -3270,18 +2575,17 @@ afr_inodelk (call_frame_t *frame, xlator_t *this, } } - ret = 0; + return 0; out: - if (ret < 0) - AFR_STACK_UNWIND (inodelk, frame, -1, op_errno, NULL); + AFR_STACK_UNWIND (inodelk, frame, -1, op_errno, NULL); + return 0; } int32_t -afr_finodelk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *xdata) +afr_finodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { afr_local_t *local = NULL; @@ -3309,31 +2613,26 @@ afr_finodelk_cbk (call_frame_t *frame, void *cookie, int32_t -afr_finodelk (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, int32_t cmd, struct gf_flock *flock, - dict_t *xdata) +afr_finodelk (call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, + int32_t cmd, struct gf_flock *flock, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; - int ret = -1; int i = 0; int32_t call_count = 0; - int32_t op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + int32_t op_errno = ENOMEM; priv = this->private; - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; 
+ local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; call_count = local->call_count; + if (!call_count) { + op_errno = ENOTCONN; + goto out; + } for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { @@ -3347,10 +2646,10 @@ afr_finodelk (call_frame_t *frame, xlator_t *this, } } - ret = 0; + return 0; out: - if (ret < 0) - AFR_STACK_UNWIND (finodelk, frame, -1, op_errno, NULL); + AFR_STACK_UNWIND (finodelk, frame, -1, op_errno, NULL); + return 0; } @@ -3383,33 +2682,28 @@ afr_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, } -int32_t -afr_entrylk (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, - const char *basename, entrylk_cmd cmd, entrylk_type type, - dict_t *xdata) +int +afr_entrylk (call_frame_t *frame, xlator_t *this, const char *volume, + loc_t *loc, const char *basename, entrylk_cmd cmd, + entrylk_type type, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; - int ret = -1; int i = 0; int32_t call_count = 0; int32_t op_errno = 0; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - priv = this->private; - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; call_count = local->call_count; + if (!call_count) { + op_errno = ENOTCONN; + goto out; + } for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { @@ -3423,18 +2717,18 @@ afr_entrylk (call_frame_t *frame, xlator_t *this, } } - ret = 0; + return 0; out: - if (ret < 0) - AFR_STACK_UNWIND (entrylk, frame, -1, op_errno, NULL); + AFR_STACK_UNWIND (entrylk, frame, -1, op_errno, NULL); + return 0; } -int32_t -afr_fentrylk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) +int +afr_fentrylk_cbk (call_frame_t *frame, void *cookie, xlator_t 
*this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { afr_local_t *local = NULL; @@ -3461,33 +2755,28 @@ afr_fentrylk_cbk (call_frame_t *frame, void *cookie, } -int32_t -afr_fentrylk (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, - const char *basename, entrylk_cmd cmd, - entrylk_type type, dict_t *xdata) +int +afr_fentrylk (call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, + const char *basename, entrylk_cmd cmd, entrylk_type type, + dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; - int ret = -1; int i = 0; int32_t call_count = 0; - int32_t op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + int32_t op_errno = ENOMEM; priv = this->private; - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; call_count = local->call_count; + if (!call_count) { + op_errno = ENOTCONN; + goto out; + } for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { @@ -3501,82 +2790,85 @@ afr_fentrylk (call_frame_t *frame, xlator_t *this, } } - ret = 0; + return 0; out: - if (ret < 0) - AFR_STACK_UNWIND (fentrylk, frame, -1, op_errno, NULL); + AFR_STACK_UNWIND (fentrylk, frame, -1, op_errno, NULL); + return 0; } -int32_t -afr_statfs_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - struct statvfs *statvfs, dict_t *xdata) + +int +afr_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct statvfs *statvfs, dict_t *xdata) { afr_local_t *local = NULL; int call_count = 0; + struct statvfs *buf = NULL; LOCK (&frame->lock); { local = frame->local; - if (op_ret == 0) { - local->op_ret = op_ret; - - if (local->cont.statfs.buf_set) { - if (statvfs->f_bavail < local->cont.statfs.buf.f_bavail) - 
local->cont.statfs.buf = *statvfs; - } else { - local->cont.statfs.buf = *statvfs; - local->cont.statfs.buf_set = 1; - } - } - - if (op_ret == -1) + if (op_ret != 0) { local->op_errno = op_errno; + goto unlock; + } + local->op_ret = op_ret; + + buf = &local->cont.statfs.buf; + if (local->cont.statfs.buf_set) { + if (statvfs->f_bavail < buf->f_bavail) { + *buf = *statvfs; + if (xdata) { + if (local->xdata_rsp) + dict_unref (local->xdata_rsp); + local->xdata_rsp = dict_ref (xdata); + } + } + } else { + *buf = *statvfs; + local->cont.statfs.buf_set = 1; + if (xdata) + local->xdata_rsp = dict_ref (xdata); + } } +unlock: UNLOCK (&frame->lock); call_count = afr_frame_return (frame); if (call_count == 0) AFR_STACK_UNWIND (statfs, frame, local->op_ret, local->op_errno, - &local->cont.statfs.buf, xdata); + &local->cont.statfs.buf, local->xdata_rsp); return 0; } -int32_t -afr_statfs (call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *xdata) +int +afr_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { - afr_private_t * priv = NULL; - int child_count = 0; afr_local_t * local = NULL; + afr_private_t *priv = NULL; int i = 0; - int ret = -1; int call_count = 0; - int32_t op_errno = 0; - - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - VALIDATE_OR_GOTO (loc, out); - - priv = this->private; - child_count = priv->child_count; + int32_t op_errno = ENOMEM; - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; + priv = this->private; - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; call_count = local->call_count; + if (!call_count) { + op_errno = ENOTCONN; + goto out; + } - for (i = 0; i < child_count; i++) { + for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { STACK_WIND (frame, afr_statfs_cbk, priv->children[i], @@ -3587,10 +2879,10 @@ afr_statfs (call_frame_t *frame, xlator_t *this, } } - ret = 0; 
+ return 0; out: - if (ret < 0) - AFR_STACK_UNWIND (statfs, frame, -1, op_errno, NULL, NULL); + AFR_STACK_UNWIND (statfs, frame, -1, op_errno, NULL, NULL); + return 0; } @@ -3699,21 +2991,6 @@ afr_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, AFR_STACK_UNWIND (lk, frame, -1, ENOTCONN, &local->cont.lk.ret_flock, NULL); } else { - /* locking has succeeded on all nodes that are up */ - - /* temporarily - ret = afr_mark_locked_nodes (this, local->fd, - local->cont.lk.locked_nodes); - if (ret) - gf_log (this->name, GF_LOG_DEBUG, - "Could not save locked nodes info in fdctx"); - - ret = afr_save_locked_fd (this, local->fd); - if (ret) - gf_log (this->name, GF_LOG_DEBUG, - "Could not save locked fd"); - - */ AFR_STACK_UNWIND (lk, frame, local->op_ret, local->op_errno, &local->cont.lk.ret_flock, NULL); } @@ -3729,20 +3006,12 @@ afr_lk (call_frame_t *frame, xlator_t *this, afr_private_t *priv = NULL; afr_local_t *local = NULL; int i = 0; - int32_t op_errno = 0; - int ret = -1; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + int32_t op_errno = ENOMEM; priv = this->private; - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) goto out; local->cont.lk.locked_nodes = GF_CALLOC (priv->child_count, @@ -3764,28 +3033,16 @@ afr_lk (call_frame_t *frame, xlator_t *this, priv->children[i]->fops->lk, fd, cmd, flock, xdata); - ret = 0; + return 0; out: - if (ret < 0) - AFR_STACK_UNWIND (lk, frame, -1, op_errno, NULL, NULL); + AFR_STACK_UNWIND (lk, frame, -1, op_errno, NULL, NULL); + return 0; } int afr_forget (xlator_t *this, inode_t *inode) { - uint64_t ctx_addr = 0; - afr_inode_ctx_t *ctx = NULL; - - inode_ctx_get (inode, this, &ctx_addr); - - if (!ctx_addr) - goto out; - - ctx = (afr_inode_ctx_t *)(long)ctx_addr; - GF_FREE (ctx->fresh_children); - GF_FREE (ctx); -out: 
return 0; } @@ -3805,7 +3062,6 @@ afr_priv_dump (xlator_t *this) snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.%s", this->type, this->name); gf_proc_dump_add_section(key_prefix); gf_proc_dump_write("child_count", "%u", priv->child_count); - gf_proc_dump_write("read_child_rr", "%u", priv->read_child_rr); for (i = 0; i < priv->child_count; i++) { sprintf (key, "child_up[%d]", i); gf_proc_dump_write(key, "%d", priv->child_up[i]); @@ -3862,7 +3118,7 @@ afr_notify (xlator_t *this, int32_t event, int idx = -1; int ret = -1; int call_psh = 0; - int up_child = AFR_ALL_CHILDREN; + int up_child = -1; dict_t *input = NULL; dict_t *output = NULL; @@ -3914,6 +3170,7 @@ afr_notify (xlator_t *this, int32_t event, */ if (priv->child_up[idx] != 1) { priv->up_count++; + priv->event_generation++; } priv->child_up[idx] = 1; @@ -3953,6 +3210,7 @@ afr_notify (xlator_t *this, int32_t event, */ if (priv->child_up[idx] == 1) { priv->down_count++; + priv->event_generation++; } priv->child_up[idx] = 0; @@ -4019,8 +3277,7 @@ afr_notify (xlator_t *this, int32_t event, LOCK (&priv->lock); { - up_children = afr_up_children_count (priv->child_up, - priv->child_count); + up_children = AFR_COUNT (priv->child_up, priv->child_count); for (i = 0; i < priv->child_count; i++) { if (priv->last_event[i] == GF_EVENT_CHILD_UP) { event = GF_EVENT_CHILD_UP; @@ -4040,39 +3297,23 @@ afr_notify (xlator_t *this, int32_t event, ret = 0; if (propagate) ret = default_notify (this, event, data); - if (call_psh && priv->shd.iamshd) - afr_proactive_self_heal ((void*) (long) up_child); + if (call_psh && priv->shd.iamshd) { + afr_selfheal_childup (this, up_child); + } out: return ret; } -int -afr_first_up_child (unsigned char *child_up, size_t child_count) -{ - int ret = -1; - int i = 0; - - GF_ASSERT (child_up); - - for (i = 0; i < child_count; i++) { - if (child_up[i]) { - ret = i; - break; - } - } - - return ret; -} int afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno) { - int ret = -1; - 
local->op_ret = -1; local->op_errno = EUCLEAN; + syncbarrier_init (&local->barrier); + local->child_up = GF_CALLOC (priv->child_count, sizeof (*local->child_up), gf_afr_mt_char); @@ -4084,38 +3325,42 @@ afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno) memcpy (local->child_up, priv->child_up, sizeof (*local->child_up) * priv->child_count); - local->call_count = afr_up_children_count (local->child_up, - priv->child_count); + local->call_count = AFR_COUNT (local->child_up, priv->child_count); if (local->call_count == 0) { gf_log (THIS->name, GF_LOG_INFO, "no subvolumes up"); if (op_errno) *op_errno = ENOTCONN; goto out; } + local->event_generation = priv->event_generation; - local->child_errno = GF_CALLOC (priv->child_count, - sizeof (*local->child_errno), - gf_afr_mt_int32_t); - if (!local->child_errno) { - if (op_errno) - *op_errno = ENOMEM; - goto out; - } + local->read_attempted = GF_CALLOC (priv->child_count, sizeof (char), + gf_afr_mt_char); + if (!local->read_attempted) { + if (op_errno) + *op_errno = ENOMEM; + goto out; + } - local->transaction.postop_piggybacked = GF_CALLOC (priv->child_count, - sizeof (int), - gf_afr_mt_int32_t); - if (!local->transaction.postop_piggybacked) { - if (op_errno) - *op_errno = ENOMEM; - goto out; - } + local->readable = GF_CALLOC (priv->child_count, sizeof (char), + gf_afr_mt_char); + if (!local->readable) { + if (op_errno) + *op_errno = ENOMEM; + goto out; + } - local->append_write = _gf_false; + local->replies = GF_CALLOC(priv->child_count, sizeof(*local->replies), + gf_afr_mt_reply_t); + if (!local->replies) { + if (op_errno) + *op_errno = ENOMEM; + goto out; + } - ret = 0; + return 0; out: - return ret; + return -1; } int @@ -4218,13 +3463,11 @@ afr_transaction_local_init (afr_local_t *local, xlator_t *this) } ret = -ENOMEM; - child_up_count = afr_up_children_count (local->child_up, - priv->child_count); + child_up_count = AFR_COUNT (local->child_up, priv->child_count); if 
(priv->optimistic_change_log && child_up_count == priv->child_count) local->optimistic_change_log = 1; - local->first_up_child = afr_first_up_child (local->child_up, - priv->child_count); + local->pre_op_compat = priv->pre_op_compat; local->transaction.eager_lock = GF_CALLOC (sizeof (*local->transaction.eager_lock), @@ -4234,26 +3477,29 @@ afr_transaction_local_init (afr_local_t *local, xlator_t *this) if (!local->transaction.eager_lock) goto out; - local->fresh_children = afr_children_create (priv->child_count); - if (!local->fresh_children) - goto out; - local->transaction.pre_op = GF_CALLOC (sizeof (*local->transaction.pre_op), priv->child_count, gf_afr_mt_char); if (!local->transaction.pre_op) goto out; + local->transaction.fop_subvols = GF_CALLOC (sizeof (*local->transaction.fop_subvols), + priv->child_count, + gf_afr_mt_char); + if (!local->transaction.fop_subvols) + goto out; + + local->transaction.failed_subvols = GF_CALLOC (sizeof (*local->transaction.failed_subvols), + priv->child_count, + gf_afr_mt_char); + if (!local->transaction.failed_subvols) + goto out; + local->pending = afr_matrix_create (priv->child_count, AFR_NUM_CHANGE_LOGS); if (!local->pending) goto out; - local->transaction.txn_changelog = afr_matrix_create (priv->child_count, - AFR_NUM_CHANGE_LOGS); - if (!local->transaction.txn_changelog) - goto out; - INIT_LIST_HEAD (&local->transaction.eager_locked); ret = 0; @@ -4261,86 +3507,6 @@ out: return ret; } -void -afr_reset_children (int32_t *fresh_children, int32_t child_count) -{ - unsigned int i = 0; - for (i = 0; i < child_count; i++) - fresh_children[i] = -1; -} - -int32_t* -afr_children_create (int32_t child_count) -{ - int32_t *children = NULL; - int i = 0; - - GF_ASSERT (child_count > 0); - - children = GF_CALLOC (child_count, sizeof (*children), - gf_afr_mt_int32_t); - if (NULL == children) - goto out; - for (i = 0; i < child_count; i++) - children[i] = -1; -out: - return children; -} - -void -afr_children_add_child (int32_t *children, 
int32_t child, - int32_t child_count) -{ - gf_boolean_t child_found = _gf_false; - int i = 0; - - for (i = 0; i < child_count; i++) { - if (children[i] == -1) - break; - if (children[i] == child) { - child_found = _gf_true; - break; - } - } - - if (!child_found) { - GF_ASSERT (i < child_count); - children[i] = child; - } -} - -void -afr_children_rm_child (int32_t *children, int32_t child, int32_t child_count) -{ - int i = 0; - - GF_ASSERT ((child >= 0) && (child < child_count)); - for (i = 0; i < child_count; i++) { - if (children[i] == -1) - break; - if (children[i] == child) { - if (i != (child_count - 1)) - memmove (children + i, children + i + 1, - sizeof (*children)*(child_count - i - 1)); - children[child_count - 1] = -1; - break; - } - } -} - -int -afr_get_children_count (int32_t *children, unsigned int child_count) -{ - int count = 0; - int i = 0; - - for (i = 0; i < child_count; i++) { - if (children[i] == -1) - break; - count++; - } - return count; -} void afr_set_low_priority (call_frame_t *frame) @@ -4348,38 +3514,6 @@ afr_set_low_priority (call_frame_t *frame) frame->root->pid = LOW_PRIO_PROC_PID; } -int -afr_child_fd_ctx_set (xlator_t *this, fd_t *fd, int32_t child, - int flags) -{ - int ret = 0; - uint64_t ctx = 0; - afr_fd_ctx_t *fd_ctx = NULL; - - GF_ASSERT (fd && fd->inode); - ret = afr_fd_ctx_set (this, fd); - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, - "could not set fd ctx for fd=%p", fd); - goto out; - } - - ret = fd_ctx_get (fd, this, &ctx); - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, - "could not get fd ctx for fd=%p", fd); - goto out; - } - - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - fd_ctx->opened_on[child] = AFR_FD_OPENED; - if (!IA_ISDIR (fd->inode->ia_type)) { - fd_ctx->flags = flags; - } - ret = 0; -out: - return ret; -} gf_boolean_t afr_have_quorum (char *logname, afr_private_t *priv) @@ -4426,33 +3560,6 @@ afr_priv_destroy (afr_private_t *priv) if (!priv) goto out; inode_unref (priv->root_inode); - GF_FREE 
(priv->shd.pos); - GF_FREE (priv->shd.pending); - GF_FREE (priv->shd.inprogress); -// for (i = 0; i < priv->child_count; i++) -// if (priv->shd.timer && priv->shd.timer[i]) -// gf_timer_call_cancel (this->ctx, priv->shd.timer[i]); - GF_FREE (priv->shd.timer); - - if (priv->shd.healed) - eh_destroy (priv->shd.healed); - - if (priv->shd.heal_failed) - eh_destroy (priv->shd.heal_failed); - - if (priv->shd.split_brain) - eh_destroy (priv->shd.split_brain); - - for (i = 0; i < priv->child_count; i++) - { - if (priv->shd.statistics[i]) - eh_destroy (priv->shd.statistics[i]); - } - - GF_FREE (priv->shd.statistics); - - GF_FREE (priv->shd.crawl_events); - GF_FREE (priv->last_event); if (priv->pending_key) { for (i = 0; i < priv->child_count; i++) @@ -4462,8 +3569,7 @@ afr_priv_destroy (afr_private_t *priv) GF_FREE (priv->children); GF_FREE (priv->child_up); LOCK_DESTROY (&priv->lock); - LOCK_DESTROY (&priv->read_child_lock); - pthread_mutex_destroy (&priv->mutex); + GF_FREE (priv); out: return; @@ -4480,124 +3586,21 @@ xlator_subvolume_count (xlator_t *this) return i; } -inline gf_boolean_t -afr_is_errno_set (int *child_errno, int child) -{ - return child_errno[child]; -} - -inline gf_boolean_t -afr_is_errno_unset (int *child_errno, int child) -{ - return !afr_is_errno_set (child_errno, child); -} - -void -afr_prepare_new_entry_pending_matrix (int32_t **pending, - gf_boolean_t (*is_pending) (int *, int), - int *ctx, struct iatt *buf, - unsigned int child_count) -{ - int midx = 0; - int idx = 0; - int i = 0; - - midx = afr_index_for_transaction_type (AFR_METADATA_TRANSACTION); - if (IA_ISDIR (buf->ia_type)) - idx = afr_index_for_transaction_type (AFR_ENTRY_TRANSACTION); - else if (IA_ISREG (buf->ia_type)) - idx = afr_index_for_transaction_type (AFR_DATA_TRANSACTION); - else - idx = -1; - for (i = 0; i < child_count; i++) { - if (is_pending (ctx, i)) { - pending[i][midx] = hton32 (1); - if (idx == -1) - continue; - pending[i][idx] = hton32 (1); - } - } -} - -gf_boolean_t 
-afr_is_fd_fixable (fd_t *fd) -{ - if (!fd || !fd->inode) - return _gf_false; - else if (fd_is_anonymous (fd)) - return _gf_false; - else if (uuid_is_null (fd->inode->gfid)) - return _gf_false; - - return _gf_true; -} void afr_handle_open_fd_count (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; - inode_t *inode = NULL; - afr_inode_ctx_t *ctx = NULL; + afr_fd_ctx_t *fd_ctx = NULL; local = frame->local; - if (local->fd) - inode = local->fd->inode; - else - inode = local->loc.inode; - - if (!inode) - return; - - LOCK (&inode->lock); - { - ctx = __afr_inode_ctx_get (inode, this); - ctx->open_fd_count = local->open_fd_count; - } - UNLOCK (&inode->lock); -} - -int -afr_initialise_statistics (xlator_t *this) -{ - afr_private_t *priv = NULL; - int ret = -1; - int i = 0; - int child_count = 0; - eh_t *stats_per_brick = NULL; - shd_crawl_event_t ***shd_crawl_events = NULL; - priv = this->private; - - priv->shd.statistics = GF_CALLOC (sizeof(eh_t *), priv->child_count, - gf_common_mt_eh_t); - if (!priv->shd.statistics) { - ret = -1; - goto out; - } - child_count = priv->child_count; - for (i=0; i < child_count ; i++) { - stats_per_brick = eh_new (AFR_STATISTICS_HISTORY_SIZE, - _gf_false, - _destroy_crawl_event_data); - if (!stats_per_brick) { - ret = -1; - goto out; - } - priv->shd.statistics[i] = stats_per_brick; - - } - - shd_crawl_events = (shd_crawl_event_t***)(&priv->shd.crawl_events); - *shd_crawl_events = GF_CALLOC (sizeof(shd_crawl_event_t*), - priv->child_count, - gf_afr_mt_shd_crawl_event_t); + if (!local->fd) + return; - if (!priv->shd.crawl_events) { - ret = -1; - goto out; - } - ret = 0; -out: - return ret; + fd_ctx = afr_fd_ctx_get (local->fd, this); + if (!fd_ctx) + return; + fd_ctx->open_fd_count = local->open_fd_count; } diff --git a/xlators/cluster/afr/src/afr-dir-read.c b/xlators/cluster/afr/src/afr-dir-read.c index 689dd84e646..fa1da3958df 100644 --- a/xlators/cluster/afr/src/afr-dir-read.c +++ b/xlators/cluster/afr/src/afr-dir-read.c @@ 
-37,177 +37,7 @@ #include "checksum.h" #include "afr.h" -#include "afr-self-heal.h" -#include "afr-self-heal-common.h" - -int -afr_examine_dir_sh_unwind (call_frame_t *frame, xlator_t *this, int32_t op_ret, - int32_t op_errno, int32_t sh_failed) -{ - afr_local_t *local = NULL; - - local = frame->local; - - afr_set_opendir_done (this, local->fd->inode); - - AFR_STACK_UNWIND (opendir, frame, local->op_ret, - local->op_errno, local->fd, NULL); - - return 0; -} - - -gf_boolean_t -__checksums_differ (uint32_t *checksum, int child_count, - unsigned char *child_up) -{ - int ret = _gf_false; - int i = 0; - uint32_t cksum = 0; - gf_boolean_t activate_check = _gf_false; - - for (i = 0; i < child_count; i++) { - if (!child_up[i]) - continue; - if (_gf_false == activate_check) { - cksum = checksum[i]; - activate_check = _gf_true; - continue; - } - - if (cksum != checksum[i]) { - ret = _gf_true; - break; - } - - cksum = checksum[i]; - } - - return ret; -} - - -int32_t -afr_examine_dir_readdir_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - gf_dirent_t *entries, dict_t *xdata) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - afr_self_heal_t * sh = NULL; - gf_dirent_t * entry = NULL; - gf_dirent_t * tmp = NULL; - char *reason = NULL; - int child_index = 0; - uint32_t entry_cksum = 0; - int call_count = 0; - off_t last_offset = 0; - inode_t *inode = NULL; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - inode = local->fd->inode; - - child_index = (long) cookie; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_INFO, - "%s: failed to do opendir on %s", - local->loc.path, priv->children[child_index]->name); - local->op_ret = -1; - local->op_ret = op_errno; - goto out; - } - - if (op_ret == 0) { - gf_log (this->name, GF_LOG_DEBUG, - "%s: no entries found in %s", - local->loc.path, priv->children[child_index]->name); - goto out; - } - - list_for_each_entry_safe (entry, tmp, &entries->list, 
list) { - entry_cksum = gf_rsync_weak_checksum ((unsigned char *)entry->d_name, - strlen (entry->d_name)); - local->cont.opendir.checksum[child_index] ^= entry_cksum; - } - - list_for_each_entry (entry, &entries->list, list) { - last_offset = entry->d_off; - } - - /* read more entries */ - - STACK_WIND_COOKIE (frame, afr_examine_dir_readdir_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->readdir, - local->fd, 131072, last_offset, NULL); - - return 0; - -out: - call_count = afr_frame_return (frame); - - if (call_count == 0) { - if (__checksums_differ (local->cont.opendir.checksum, - priv->child_count, - local->child_up)) { - - sh->do_entry_self_heal = _gf_true; - sh->forced_merge = _gf_true; - - reason = "checksums of directory differ"; - afr_launch_self_heal (frame, this, inode, _gf_false, - inode->ia_type, reason, NULL, - afr_examine_dir_sh_unwind); - } else { - afr_set_opendir_done (this, inode); - - AFR_STACK_UNWIND (opendir, frame, local->op_ret, - local->op_errno, local->fd, NULL); - } - } - - return 0; -} - - -int -afr_examine_dir (call_frame_t *frame, xlator_t *this) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - int i = 0; - int call_count = 0; - - local = frame->local; - priv = this->private; - - local->cont.opendir.checksum = GF_CALLOC (priv->child_count, - sizeof (*local->cont.opendir.checksum), - gf_afr_mt_int32_t); - - call_count = afr_up_children_count (local->child_up, priv->child_count); - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_examine_dir_readdir_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->readdir, - local->fd, 131072, 0, NULL); - - if (!--call_count) - break; - } - } - - return 0; -} +#include "afr-transaction.h" int32_t @@ -215,112 +45,66 @@ afr_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, 
fd_t *fd, dict_t *xdata) { - afr_private_t *priv = NULL; afr_local_t *local = NULL; - int32_t up_children_count = 0; - int ret = -1; int call_count = -1; int32_t child_index = 0; + afr_fd_ctx_t *fd_ctx = NULL; - priv = this->private; local = frame->local; + fd_ctx = local->fd_ctx; child_index = (long) cookie; - up_children_count = afr_up_children_count (local->child_up, - priv->child_count); - LOCK (&frame->lock); { - if (op_ret >= 0) { + if (op_ret == -1) { + local->op_errno = op_errno; + fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED; + } else { local->op_ret = op_ret; - ret = afr_child_fd_ctx_set (this, fd, child_index, 0); - if (ret) { - local->op_ret = -1; - local->op_errno = -ret; - goto unlock; - } + fd_ctx->opened_on[child_index] = AFR_FD_OPENED; + if (!local->xdata_rsp && xdata) + local->xdata_rsp = dict_ref (xdata); } - - local->op_errno = op_errno; } -unlock: UNLOCK (&frame->lock); call_count = afr_frame_return (frame); - if (call_count == 0) { - if (local->op_ret != 0) - goto out; - - if (!afr_is_opendir_done (this, local->fd->inode) && - up_children_count > 1 && priv->entry_self_heal) { - - /* - * This is the first opendir on this inode. We need - * to check if the directory's entries are the same - * on all subvolumes. This is needed in addition - * to regular entry self-heal because the readdir - * call is sent only to the first subvolume, and - * thus files that exist only there will never be healed - * otherwise (assuming changelog shows no anomalies). 
- */ - - gf_log (this->name, GF_LOG_TRACE, - "reading contents of directory %s looking for mismatch", - local->loc.path); - - afr_examine_dir (frame, this); - - } else { - /* do the unwind */ - goto out; - } - } - - return 0; - -out: - AFR_STACK_UNWIND (opendir, frame, local->op_ret, - local->op_errno, local->fd, NULL); - + if (call_count == 0) + AFR_STACK_UNWIND (opendir, frame, local->op_ret, + local->op_errno, local->fd, NULL); return 0; } -int32_t -afr_opendir (call_frame_t *frame, xlator_t *this, - loc_t *loc, fd_t *fd) +int +afr_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd) { afr_private_t * priv = NULL; afr_local_t * local = NULL; - int child_count = 0; int i = 0; - int ret = -1; int call_count = -1; - int32_t op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + int32_t op_errno = ENOMEM; + afr_fd_ctx_t *fd_ctx = NULL; priv = this->private; - child_count = priv->child_count; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + fd_ctx = afr_fd_ctx_get (fd, this); + if (!fd_ctx) + goto out; loc_copy (&local->loc, loc); local->fd = fd_ref (fd); + local->fd_ctx = fd_ctx; call_count = local->call_count; - for (i = 0; i < child_count; i++) { + for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { STACK_WIND_COOKIE (frame, afr_opendir_cbk, (void*) (long) i, @@ -333,182 +117,280 @@ afr_opendir (call_frame_t *frame, xlator_t *this, } } - ret = 0; + return 0; out: - if (ret < 0) - AFR_STACK_UNWIND (opendir, frame, -1, op_errno, fd, NULL); - + AFR_STACK_UNWIND (opendir, frame, -1, op_errno, fd, NULL); return 0; } -/** - * Common algorithm for directory read calls: - * - * - Try the fop on the first child that is up - * - if we have failed due to ENOTCONN: - * try the next child - * - * Applicable 
to: readdir - */ +#define BACKEND_D_OFF_BITS 63 +#define PRESENT_D_OFF_BITS 63 +#define ONE 1ULL +#define MASK (~0ULL) +#define PRESENT_MASK (MASK >> (64 - PRESENT_D_OFF_BITS)) +#define BACKEND_MASK (MASK >> (64 - BACKEND_D_OFF_BITS)) -struct entry_name { - char *name; - struct list_head list; -}; +#define TOP_BIT (ONE << (PRESENT_D_OFF_BITS - 1)) +#define SHIFT_BITS (max (0, (BACKEND_D_OFF_BITS - PRESENT_D_OFF_BITS + 1))) -static void -afr_forget_entries (fd_t *fd) +static uint64_t +afr_bits_for (uint64_t num) { - struct entry_name *entry = NULL; - struct entry_name *tmp = NULL; - int ret = 0; - uint64_t ctx = 0; - afr_fd_ctx_t *fd_ctx = NULL; - - ret = fd_ctx_get (fd, THIS, &ctx); - if (ret < 0) { - gf_log (THIS->name, GF_LOG_INFO, - "could not get fd ctx for fd=%p", fd); - return; - } + uint64_t bits = 0, ctrl = 1; - fd_ctx = (afr_fd_ctx_t *)(long) ctx; + while (ctrl < num) { + ctrl *= 2; + bits ++; + } - list_for_each_entry_safe (entry, tmp, &fd_ctx->entries, list) { - GF_FREE (entry->name); - list_del (&entry->list); - GF_FREE (entry); - } + return bits; } -static void -afr_readdir_filter_trash_dir (gf_dirent_t *entries, fd_t *fd) +int +afr_itransform (xlator_t *this, int subvol, uint64_t x, uint64_t *y_p) { - gf_dirent_t * entry = NULL; - gf_dirent_t * tmp = NULL; + afr_private_t *conf = NULL; + int cnt = 0; + int max = 0; + uint64_t y = 0; + uint64_t hi_mask = 0; + uint64_t off_mask = 0; + int max_bits = 0; + + if (x == ((uint64_t) -1)) { + y = (uint64_t) -1; + goto out; + } - list_for_each_entry_safe (entry, tmp, &entries->list, list) { - if (__is_root_gfid (fd->inode->gfid) && - !strcmp (entry->d_name, GF_REPLICATE_TRASH_DIR)) { - list_del_init (&entry->list); - GF_FREE (entry); - } + conf = this->private; + if (!conf) + goto out; + + max = conf->child_count; + cnt = subvol; + + if (max == 1) { + y = x; + goto out; + } + + max_bits = afr_bits_for (max); + + hi_mask = ~(PRESENT_MASK >> (max_bits + 1)); + + if (x & hi_mask) { + /* HUGE d_off */ + off_mask = 
MASK << max_bits; + y = TOP_BIT | ((x >> SHIFT_BITS) & off_mask) | cnt; + } else { + /* small d_off */ + y = ((x * max) + cnt); } + +out: + if (y_p) + *y_p = y; + + return 0; } -int32_t -afr_readdir_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - gf_dirent_t *entries, dict_t *xdata) + +int +afr_deitransform (xlator_t *this, uint64_t y, int *subvol_p, + uint64_t *x_p) { - afr_local_t *local = NULL; + afr_private_t *conf = NULL; + int cnt = 0; + int max = 0; + uint64_t x = 0; + int subvol = 0; + int max_bits = 0; + uint64_t off_mask = 0; + uint64_t host_mask = 0; + + if (!this->private) + return -1; + + conf = this->private; + max = conf->child_count; + + if (max == 1) { + x = y; + cnt = 0; + goto out; + } + + if (y & TOP_BIT) { + /* HUGE d_off */ + max_bits = afr_bits_for (max); + off_mask = (MASK << max_bits); + host_mask = ~(off_mask); + + x = ((y & ~TOP_BIT) & off_mask) << SHIFT_BITS; + + cnt = y & host_mask; + } else { + /* small d_off */ + cnt = y % max; + x = y / max; + } - if (op_ret == -1) - goto out; +out: + subvol = cnt; - local = frame->local; - afr_readdir_filter_trash_dir (entries, local->fd); + if (subvol_p) + *subvol_p = subvol; + + if (x_p) + *x_p = x; -out: - AFR_STACK_UNWIND (readdir, frame, op_ret, op_errno, entries, NULL); return 0; } -int32_t -afr_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, gf_dirent_t *entries, - dict_t *xdata) +static void +afr_readdir_transform_entries (gf_dirent_t *subvol_entries, int subvol, + gf_dirent_t *entries, fd_t *fd) { - afr_local_t *local = NULL; + afr_private_t *priv = NULL; + gf_dirent_t *entry = NULL; + gf_dirent_t *tmp = NULL; + unsigned char *data_readable = NULL; + unsigned char *metadata_readable = NULL; + int gen = 0; - if (op_ret == -1) - goto out; + priv = THIS->private; - local = frame->local; - afr_readdir_filter_trash_dir (entries, local->fd); + data_readable = alloca0 (priv->child_count); + 
metadata_readable = alloca0 (priv->child_count); -out: - AFR_STACK_UNWIND (readdirp, frame, op_ret, op_errno, entries, NULL); - return 0; + list_for_each_entry_safe (entry, tmp, &subvol_entries->list, list) { + if (__is_root_gfid (fd->inode->gfid) && + !strcmp (entry->d_name, GF_REPLICATE_TRASH_DIR)) { + continue; + } + + list_del_init (&entry->list); + afr_itransform (THIS, subvol, entry->d_off, &entry->d_off); + list_add_tail (&entry->list, &entries->list); + + if (entry->inode) { + gen = 0; + afr_inode_read_subvol_get (entry->inode, THIS, + data_readable, + metadata_readable, &gen); + + if (gen != priv->event_generation || + !data_readable[subvol] || + !metadata_readable[subvol]) { + + inode_unref (entry->inode); + entry->inode = NULL; + } + } + } } + int32_t -afr_do_readdir (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t offset, int whichop, dict_t *dict) +afr_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, gf_dirent_t *subvol_entries, + dict_t *xdata) { - afr_private_t *priv = NULL; - xlator_t **children = NULL; - int call_child = 0; - afr_local_t *local = NULL; - afr_fd_ctx_t *fd_ctx = NULL; - int ret = -1; - int32_t op_errno = 0; - uint64_t read_child = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + afr_local_t *local = NULL; + gf_dirent_t entries; - priv = this->private; - children = priv->children; + INIT_LIST_HEAD (&entries.list); - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); local = frame->local; - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + if (op_ret < 0 && !local->cont.readdir.offset) { + /* failover only if this was first readdir, detected + by offset == 0 */ + local->op_ret = op_ret; + local->op_errno = op_errno; - local->fresh_children = afr_children_create (priv->child_count); - if (!local->fresh_children) { - op_errno = ENOMEM; - goto out; - } + afr_read_txn_continue (frame, 
this, (long) cookie); + return 0; + } - read_child = afr_inode_get_read_ctx (this, fd->inode, - local->fresh_children); - ret = afr_get_call_child (this, local->child_up, read_child, - local->fresh_children, - &call_child, - &local->cont.readdir.last_index); - if (ret < 0) { - op_errno = -ret; - goto out; - } + if (op_ret >= 0) + afr_readdir_transform_entries (subvol_entries, (long) cookie, + &entries, local->fd); - fd_ctx = afr_fd_ctx_get (fd, this); - if (!fd_ctx) { - op_errno = EBADF; - goto out; - } + AFR_STACK_UNWIND (readdir, frame, op_ret, op_errno, &entries, xdata); - if ((offset == 0) || (fd_ctx->call_child == -1)) { - fd_ctx->call_child = call_child; - } else if ((priv->readdir_failover == _gf_false) && - (call_child != fd_ctx->call_child)) { - op_errno = EBADF; - goto out; - } + return 0; +} + + +int +afr_readdir_wind (call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + priv = this->private; + local = frame->local; - local->fd = fd_ref (fd); - local->cont.readdir.size = size; - local->cont.readdir.dict = (dict)? 
dict_ref (dict) : NULL; + if (subvol == -1) { + AFR_STACK_UNWIND (readdir, frame, local->op_ret, + local->op_errno, 0, 0); + return 0; + } - if (whichop == GF_FOP_READDIR) + if (local->op == GF_FOP_READDIR) STACK_WIND_COOKIE (frame, afr_readdir_cbk, - (void *) (long) call_child, - children[call_child], - children[call_child]->fops->readdir, fd, - size, offset, dict); + (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->readdir, + local->fd, local->cont.readdir.size, + local->cont.readdir.offset, + local->xdata_req); else - STACK_WIND_COOKIE (frame, afr_readdirp_cbk, - (void *) (long) call_child, - children[call_child], - children[call_child]->fops->readdirp, fd, - size, offset, dict); + STACK_WIND_COOKIE (frame, afr_readdir_cbk, + (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->readdirp, + local->fd, local->cont.readdir.size, + local->cont.readdir.offset, + local->xdata_req); + return 0; +} + + +int +afr_do_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, int whichop, dict_t *dict) +{ + afr_local_t *local = NULL; + int32_t op_errno = 0; + int subvol = -1; + + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; + + local->op = whichop; + local->fd = fd_ref (fd); + local->cont.readdir.size = size; + local->cont.readdir.offset = offset; + local->xdata_req = (dict)? 
dict_ref (dict) : NULL; + + if (offset == 0) { + /* First readdir has option of failing over and selecting + an appropriate read subvolume */ + afr_read_txn (frame, this, fd->inode, afr_readdir_wind, + AFR_DATA_TRANSACTION); + } else { + /* But continued readdirs MUST stick to the same subvolume + without an option to failover */ + afr_deitransform (this, offset, &subvol, + (uint64_t *)&local->cont.readdir.offset); + afr_readdir_wind (frame, this, subvol); + } return 0; out: @@ -521,7 +403,8 @@ int32_t afr_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, off_t offset, dict_t *xdata) { - afr_do_readdir (frame, this, fd, size, offset, GF_FOP_READDIR, xdata); + afr_do_readdir (frame, this, fd, size, offset, GF_FOP_READDIR, xdata); + return 0; } @@ -531,6 +414,7 @@ afr_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, off_t offset, dict_t *dict) { afr_do_readdir (frame, this, fd, size, offset, GF_FOP_READDIRP, dict); + return 0; } @@ -538,7 +422,6 @@ afr_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, int32_t afr_releasedir (xlator_t *this, fd_t *fd) { - afr_forget_entries (fd); afr_cleanup_fd_ctx (this, fd); return 0; diff --git a/xlators/cluster/afr/src/afr-dir-write.c b/xlators/cluster/afr/src/afr-dir-write.c index 1943b719bb5..465dde54f9c 100644 --- a/xlators/cluster/afr/src/afr-dir-write.c +++ b/xlators/cluster/afr/src/afr-dir-write.c @@ -34,10 +34,14 @@ #include "common-utils.h" #include "compat-errno.h" #include "compat.h" +#include "byte-order.h" #include "afr.h" #include "afr-transaction.h" +void +afr_mark_entry_pending_changelog (call_frame_t *frame, xlator_t *this); + int afr_build_parent_loc (loc_t *parent, loc_t *child, int32_t *op_errno) { @@ -56,79 +60,214 @@ afr_build_parent_loc (loc_t *parent, loc_t *child, int32_t *op_errno) *op_errno = ENOMEM; goto out; } - parent->path = gf_strdup( dirname (child_path) ); - if (!parent->path) { + + parent->path = gf_strdup (dirname (child_path)); + if 
(!parent->path) { if (op_errno) *op_errno = ENOMEM; goto out; } - parent->inode = inode_ref (child->parent); - uuid_copy (parent->gfid, child->pargfid); + + parent->inode = inode_ref (child->parent); + uuid_copy (parent->gfid, child->pargfid); ret = 0; out: - GF_FREE(child_path); + GF_FREE (child_path); return ret; } -void -__dir_entry_fop_common_cbk (call_frame_t *frame, int child_index, - xlator_t *this, int32_t op_ret, - int32_t op_errno, inode_t *inode, - struct iatt *buf, struct iatt *preparent, - struct iatt *postparent, struct iatt *prenewparent, - struct iatt *postnewparent) + +static void +__afr_dir_write_finalize (call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int inode_read_subvol = -1; + int parent_read_subvol = -1; + int parent2_read_subvol = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + if (local->inode) { + afr_replies_interpret (frame, this, local->inode); + inode_read_subvol = afr_data_subvol_get (local->inode, this, + NULL, NULL); + } + if (local->parent) + parent_read_subvol = afr_data_subvol_get (local->parent, this, + NULL, NULL); + if (local->parent2) + parent2_read_subvol = afr_data_subvol_get (local->parent2, this, + NULL, NULL); + + local->op_ret = -1; + local->op_errno = afr_final_errno (local, priv); + + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid) + continue; + if (local->replies[i].op_ret < 0) { + if (local->inode) + afr_inode_read_subvol_reset (local->inode, + this); + if (local->parent) + afr_inode_read_subvol_reset (local->parent, + this); + if (local->parent2) + afr_inode_read_subvol_reset (local->parent2, + this); + continue; + } + + if (local->op_ret == -1) { + local->op_ret = local->replies[i].op_ret; + local->op_errno = local->replies[i].op_errno; + + local->cont.dir_fop.buf = + local->replies[i].poststat; + local->cont.dir_fop.preparent = + local->replies[i].preparent; + 
local->cont.dir_fop.postparent = + local->replies[i].postparent; + local->cont.dir_fop.prenewparent = + local->replies[i].preparent2; + local->cont.dir_fop.postnewparent = + local->replies[i].postparent2; + if (local->replies[i].xdata) + local->xdata_rsp = + dict_ref (local->replies[i].xdata); + continue; + } + + if (i == inode_read_subvol) { + local->cont.dir_fop.buf = + local->replies[i].poststat; + if (local->replies[i].xdata) { + if (local->xdata_rsp) + dict_unref (local->xdata_rsp); + local->xdata_rsp = + dict_ref (local->replies[i].xdata); + } + } + + if (i == parent_read_subvol) { + local->cont.dir_fop.preparent = + local->replies[i].preparent; + local->cont.dir_fop.postparent = + local->replies[i].postparent; + } + + if (i == parent2_read_subvol) { + local->cont.dir_fop.prenewparent = + local->replies[i].preparent2; + local->cont.dir_fop.postnewparent = + local->replies[i].postparent2; + } + } +} + + +static void +__afr_dir_write_fill (call_frame_t *frame, xlator_t *this, int child_index, + int op_ret, int op_errno, struct iatt *poststat, + struct iatt *preparent, struct iatt *postparent, + struct iatt *preparent2, struct iatt *postparent2, + dict_t *xdata) +{ + afr_local_t *local = NULL; + afr_fd_ctx_t *fd_ctx = NULL; local = frame->local; + fd_ctx = local->fd_ctx; + + local->replies[child_index].valid = 1; + local->replies[child_index].op_ret = op_ret; + local->replies[child_index].op_errno = op_errno; + + if (op_ret >= 0) { + if (poststat) + local->replies[child_index].poststat = *poststat; + if (preparent) + local->replies[child_index].preparent = *preparent; + if (postparent) + local->replies[child_index].postparent = *postparent; + if (preparent2) + local->replies[child_index].preparent2 = *preparent2; + if (postparent2) + local->replies[child_index].postparent2 = *postparent2; + if (xdata) + local->replies[child_index].xdata = dict_ref (xdata); + + if (fd_ctx) + fd_ctx->opened_on[child_index] = AFR_FD_OPENED; + } else { + if (op_errno != ENOTEMPTY) + 
afr_transaction_fop_failed (frame, this, child_index); + if (fd_ctx) + fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED; + } + + return; +} + + +static int +__afr_dir_write_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *buf, + struct iatt *preparent, struct iatt *postparent, + struct iatt *preparent2, struct iatt *postparent2, + dict_t *xdata) +{ + afr_local_t *local = NULL; + int child_index = (long) cookie; + int call_count = -1; + + local = frame->local; + + LOCK (&frame->lock); + { + __afr_dir_write_fill (frame, this, child_index, op_ret, + op_errno, buf, preparent, postparent, + preparent2, postparent2, xdata); + } + UNLOCK (&frame->lock); + call_count = afr_frame_return (frame); + + if (call_count == 0) { + __afr_dir_write_finalize (frame, this); + + if (afr_txn_nothing_failed (frame, this)) + local->transaction.unwind (frame, this); - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - - if (op_ret > -1) { - local->op_ret = op_ret; - - if ((local->success_count == 0) || - (child_index == local->read_child_index)) { - local->cont.dir_fop.preparent = *preparent; - local->cont.dir_fop.postparent = *postparent; - if (buf) - local->cont.dir_fop.buf = *buf; - if (prenewparent) - local->cont.dir_fop.prenewparent = *prenewparent; - if (postnewparent) - local->cont.dir_fop.postnewparent = *postnewparent; - } - - local->cont.dir_fop.inode = inode; - - local->fresh_children[local->success_count] = child_index; - local->success_count++; - local->child_errno[child_index] = 0; - } else { - local->child_errno[child_index] = op_errno; + afr_mark_entry_pending_changelog (frame, this); + + local->transaction.resume (frame, this); } - local->op_errno = op_errno; + return 0; } + int afr_mark_new_entry_changelog_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, + xlator_t *this, int op_ret, int op_errno, dict_t *xattr, dict_t *xdata) { - 
int call_count = 0; + int call_count = 0; call_count = afr_frame_return (frame); - if (call_count == 0) { + + if (call_count == 0) AFR_STACK_DESTROY (frame); - } + return 0; } + void afr_mark_new_entry_changelog (call_frame_t *frame, xlator_t *this) { @@ -136,125 +275,109 @@ afr_mark_new_entry_changelog (call_frame_t *frame, xlator_t *this) afr_local_t *local = NULL; afr_local_t *new_local = NULL; afr_private_t *priv = NULL; - dict_t **xattr = NULL; + dict_t *xattr = NULL; int32_t **changelog = NULL; int i = 0; - GF_UNUSED int op_errno = 0; + int idx = 0; + int op_errno = ENOMEM; + unsigned char *pending = NULL; + int call_count = 0; local = frame->local; priv = this->private; new_frame = copy_frame (frame); - if (!new_frame) { + if (!new_frame) goto out; - } - AFR_LOCAL_ALLOC_OR_GOTO (new_frame->local, out); - new_local = new_frame->local; + new_local = AFR_FRAME_INIT (new_frame, op_errno); + if (!new_local) + goto out; + changelog = afr_matrix_create (priv->child_count, AFR_NUM_CHANGE_LOGS); if (!changelog) goto out; - xattr = GF_CALLOC (priv->child_count, sizeof (*xattr), - gf_afr_mt_dict_t); - if (!xattr) - goto out; - for (i = 0; i < priv->child_count; i++) { - if (local->child_errno[i]) - continue; - xattr[i] = dict_new (); - if (!xattr[i]) - goto out; - } + xattr = dict_new (); + if (!xattr) + goto out; + + idx = afr_index_for_transaction_type (AFR_DATA_TRANSACTION); - afr_prepare_new_entry_pending_matrix (changelog, - afr_is_errno_set, - local->child_errno, - &local->cont.dir_fop.buf, - priv->child_count); + pending = alloca0 (priv->child_count); + + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i] && + !local->transaction.failed_subvols[i]) { + call_count ++; + continue; + } + + changelog[i][idx] = hton32(1); + pending[i] = 1; + } new_local->pending = changelog; uuid_copy (new_local->loc.gfid, local->cont.dir_fop.buf.ia_gfid); - new_local->loc.inode = inode_ref (local->cont.dir_fop.inode); - new_local->call_count = 
local->success_count; + new_local->loc.inode = inode_ref (local->inode); + + + afr_set_pending_dict (priv, xattr, changelog); + + new_local->call_count = call_count; for (i = 0; i < priv->child_count; i++) { - if (local->child_errno[i]) + if (pending[i]) continue; - afr_set_pending_dict (priv, xattr[i], changelog, i, LOCAL_LAST); STACK_WIND_COOKIE (new_frame, afr_mark_new_entry_changelog_cbk, (void *) (long) i, priv->children[i], priv->children[i]->fops->xattrop, &new_local->loc, GF_XATTROP_ADD_ARRAY, - xattr[i], NULL); + xattr, NULL); + if (!--call_count) + break; } + new_frame = NULL; out: if (new_frame) AFR_STACK_DESTROY (new_frame); - afr_xattr_array_destroy (xattr, priv->child_count); + if (xattr) + dict_unref (xattr); return; } -gf_boolean_t -afr_is_new_entry_changelog_needed (glusterfs_fop_t fop) -{ - glusterfs_fop_t fops[] = {GF_FOP_CREATE, GF_FOP_MKNOD, GF_FOP_NULL}; - int i = 0; - - for (i = 0; fops[i] != GF_FOP_NULL; i++) { - if (fop == fops[i]) - return _gf_true; - } - return _gf_false; -} void -afr_dir_fop_mark_entry_pending_changelog (call_frame_t *frame, xlator_t *this) +afr_mark_entry_pending_changelog (call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int pre_op_count = 0; + int failed_count = 0; local = frame->local; priv = this->private; if (local->op_ret < 0) - goto out; + return; - if (local->success_count == priv->child_count) - goto out; + if (local->op != GF_FOP_CREATE && local->op != GF_FOP_MKNOD) + return; - if (!afr_is_new_entry_changelog_needed (local->op)) - goto out; + pre_op_count = AFR_COUNT (local->transaction.pre_op, priv->child_count); + failed_count = AFR_COUNT (local->transaction.failed_subvols, + priv->child_count); + + if (pre_op_count == priv->child_count && !failed_count) + return; afr_mark_new_entry_changelog (frame, this); -out: return; } -void -afr_dir_fop_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t 
*local = NULL; - afr_private_t *priv = NULL; - - local = frame->local; - priv = this->private; - - if (local->cont.dir_fop.inode == NULL) - goto done; - afr_set_read_ctx_from_policy (this, local->cont.dir_fop.inode, - local->fresh_children, - local->read_child_index, - priv->read_child, - local->cont.dir_fop.buf.ia_gfid); -done: - local->transaction.unwind (frame, this); - afr_dir_fop_mark_entry_pending_changelog (frame, this); - local->transaction.resume (frame, this); -} /* {{{ create */ @@ -266,26 +389,16 @@ afr_create_unwind (call_frame_t *frame, xlator_t *this) local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) { - main_frame = local->transaction.main_frame; - } - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - AFR_STACK_UNWIND (create, main_frame, - local->op_ret, local->op_errno, - local->cont.create.fd, - local->cont.dir_fop.inode, - &local->cont.dir_fop.buf, - &local->cont.dir_fop.preparent, - &local->cont.dir_fop.postparent, - local->xdata_rsp); - } + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; + + AFR_STACK_UNWIND (create, main_frame, local->op_ret, local->op_errno, + local->cont.create.fd, local->inode, + &local->cont.dir_fop.buf, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, local->xdata_rsp); return 0; } @@ -297,175 +410,79 @@ afr_create_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { - afr_local_t *local = NULL; - uint64_t ctx = 0; - afr_fd_ctx_t *fd_ctx = NULL; - int ret = 0; - int call_count = -1; - int child_index = -1; - - local = frame->local; - - child_index = (long) cookie; - - LOCK (&frame->lock); - { - if (op_ret > -1) { - ret = afr_fd_ctx_set (this, fd); - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, - "could not set ctx on fd=%p", fd); - - local->op_ret = -1; - local->op_errno = -ret; - goto unlock; - } - - ret = 
fd_ctx_get (fd, this, &ctx); - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, - "could not get fd ctx for fd=%p", fd); - local->op_ret = -1; - local->op_errno = -ret; - goto unlock; - } - - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - - fd_ctx->opened_on[child_index] = AFR_FD_OPENED; - fd_ctx->flags = local->cont.create.flags; - - if (local->success_count == 0) { - if (xdata) - local->xdata_rsp = dict_ref(xdata); - } - } - __dir_entry_fop_common_cbk (frame, child_index, this, - op_ret, op_errno, inode, buf, - preparent, postparent, NULL, NULL); - } - -unlock: - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) - afr_dir_fop_done (frame, this); - - return 0; + return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, buf, + preparent, postparent, NULL, NULL, xdata); } int -afr_create_wind (call_frame_t *frame, xlator_t *this) +afr_create_wind (call_frame_t *frame, xlator_t *this, int subvol) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; local = frame->local; priv = this->private; - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_create_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->create, - &local->loc, - local->cont.create.flags, - local->cont.create.mode, - local->umask, - local->cont.create.fd, - local->xdata_req); - if (!--call_count) - break; - } - } - - return 0; -} - - -int -afr_create_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t * local = NULL; - - local = frame->local; - - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); - + STACK_WIND_COOKIE (frame, afr_create_wind_cbk, (void *) (long) subvol, + 
priv->children[subvol], + priv->children[subvol]->fops->create, + &local->loc, local->cont.create.flags, + local->cont.create.mode, local->umask, + local->cont.create.fd, local->xdata_req); return 0; } int -afr_create (call_frame_t *frame, xlator_t *this, - loc_t *loc, int32_t flags, mode_t mode, - mode_t umask, fd_t *fd, dict_t *params) +afr_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; afr_internal_lock_t *int_lock = NULL; call_frame_t *transaction_frame = NULL; int ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + int op_errno = ENOMEM; priv = this->private; QUORUM_CHECK(create,out); transaction_frame = copy_frame (frame); - if (!transaction_frame) { - op_errno = ENOMEM; + if (!transaction_frame) goto out; - } - AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); - local = transaction_frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) + goto out; loc_copy (&local->loc, loc); - LOCK (&priv->read_child_lock); - { - local->read_child_index = (++priv->read_child_rr) - % (priv->child_count); - } - UNLOCK (&priv->read_child_lock); + local->fd_ctx = afr_fd_ctx_get (fd, this); + if (!local->fd_ctx) + goto out; + + local->inode = inode_ref (loc->inode); + local->parent = inode_ref (loc->parent); local->op = GF_FOP_CREATE; local->cont.create.flags = flags; local->cont.create.mode = mode; local->cont.create.fd = fd_ref (fd); local->umask = umask; - if (params) - local->xdata_req = dict_ref (params); - local->transaction.fop = afr_create_wind; - local->transaction.done = afr_create_done; + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); + + if (!local->xdata_req) + goto out; + + 
local->transaction.wind = afr_create_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; local->transaction.unwind = afr_create_unwind; ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, @@ -492,15 +509,13 @@ afr_create (call_frame_t *frame, xlator_t *this, goto out; } - ret = 0; + return 0; out: - if (ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (create, frame, -1, op_errno, - NULL, NULL, NULL, NULL, NULL, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (create, frame, -1, op_errno, NULL, NULL, NULL, NULL, + NULL, NULL); return 0; } @@ -516,25 +531,14 @@ afr_mknod_unwind (call_frame_t *frame, xlator_t *this) local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) { - main_frame = local->transaction.main_frame; - } - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - AFR_STACK_UNWIND (mknod, main_frame, - local->op_ret, local->op_errno, - local->cont.dir_fop.inode, - &local->cont.dir_fop.buf, - &local->cont.dir_fop.preparent, - &local->cont.dir_fop.postparent, - NULL); - } + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; + AFR_STACK_UNWIND (mknod, main_frame, local->op_ret, local->op_errno, + local->inode, &local->cont.dir_fop.buf, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, local->xdata_rsp); return 0; } @@ -545,131 +549,72 @@ afr_mknod_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, struct iatt *buf, struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { - int call_count = -1; - int child_index = -1; - - child_index = (long) cookie; - - LOCK (&frame->lock); - { - __dir_entry_fop_common_cbk (frame, child_index, this, - op_ret, op_errno, inode, buf, - preparent, postparent, NULL, NULL); - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return 
(frame); - - if (call_count == 0) - afr_dir_fop_done (frame, this); - - return 0; + return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, buf, + preparent, postparent, NULL, NULL, xdata); } -int32_t -afr_mknod_wind (call_frame_t *frame, xlator_t *this) +int +afr_mknod_wind (call_frame_t *frame, xlator_t *this, int subvol) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; local = frame->local; priv = this->private; - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_mknod_wind_cbk, (void *) (long) i, - priv->children[i], - priv->children[i]->fops->mknod, - &local->loc, local->cont.mknod.mode, - local->cont.mknod.dev, - local->umask, - local->xdata_req); - if (!--call_count) - break; - } - } - - return 0; -} - - -int -afr_mknod_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t * local = NULL; - - local = frame->local; - - local->transaction.unwind (frame, this); - AFR_STACK_DESTROY (frame); - + STACK_WIND_COOKIE (frame, afr_mknod_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->mknod, + &local->loc, local->cont.mknod.mode, + local->cont.mknod.dev, local->umask, + local->xdata_req); return 0; } - int afr_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, - dev_t dev, mode_t umask, dict_t *params) + dev_t dev, mode_t umask, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; afr_internal_lock_t *int_lock = NULL; call_frame_t *transaction_frame = NULL; int ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + int op_errno = ENOMEM; priv = this->private; 
QUORUM_CHECK(mknod,out); transaction_frame = copy_frame (frame); - if (!transaction_frame) { - op_errno = ENOMEM; + if (!transaction_frame) goto out; - } - AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); - local = transaction_frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) + goto out; loc_copy (&local->loc, loc); - - LOCK (&priv->read_child_lock); - { - local->read_child_index = (++priv->read_child_rr) - % (priv->child_count); - } - UNLOCK (&priv->read_child_lock); + local->inode = inode_ref (loc->inode); + local->parent = inode_ref (loc->parent); local->op = GF_FOP_MKNOD; local->cont.mknod.mode = mode; local->cont.mknod.dev = dev; local->umask = umask; - if (params) - local->xdata_req = dict_ref (params); - local->transaction.fop = afr_mknod_wind; - local->transaction.done = afr_mknod_done; + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); + + if (!local->xdata_req) + goto out; + + local->transaction.wind = afr_mknod_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; local->transaction.unwind = afr_mknod_unwind; ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, @@ -692,19 +637,17 @@ afr_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, int_lock->lockee_count++; ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); if (ret < 0) { - op_errno = -ret; - goto out; + op_errno = -ret; + goto out; } - ret = 0; + return 0; out: - if (ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (mknod, frame, -1, op_errno, - NULL, NULL, NULL, NULL, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (mknod, frame, -1, op_errno, NULL, NULL, NULL, NULL, + NULL); return 0; } @@ -721,25 +664,14 @@ afr_mkdir_unwind 
(call_frame_t *frame, xlator_t *this) local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) { - main_frame = local->transaction.main_frame; - } - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - AFR_STACK_UNWIND (mkdir, main_frame, - local->op_ret, local->op_errno, - local->cont.dir_fop.inode, - &local->cont.dir_fop.buf, - &local->cont.dir_fop.preparent, - &local->cont.dir_fop.postparent, - NULL); - } + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; + AFR_STACK_UNWIND (mkdir, main_frame, local->op_ret, local->op_errno, + local->inode, &local->cont.dir_fop.buf, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, local->xdata_rsp); return 0; } @@ -750,130 +682,71 @@ afr_mkdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, struct iatt *buf, struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { - int call_count = -1; - int child_index = -1; - - child_index = (long) cookie; - - LOCK (&frame->lock); - { - __dir_entry_fop_common_cbk (frame, child_index, this, - op_ret, op_errno, inode, buf, - preparent, postparent, NULL, NULL); - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) - afr_dir_fop_done (frame, this); - - return 0; + return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, buf, + preparent, postparent, NULL, NULL, xdata); } int -afr_mkdir_wind (call_frame_t *frame, xlator_t *this) +afr_mkdir_wind (call_frame_t *frame, xlator_t *this, int subvol) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; local = frame->local; priv = this->private; - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if 
(local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_mkdir_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->mkdir, - &local->loc, local->cont.mkdir.mode, - local->umask, - local->xdata_req); - if (!--call_count) - break; - } - } - + STACK_WIND_COOKIE (frame, afr_mkdir_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->mkdir, &local->loc, + local->cont.mkdir.mode, local->umask, + local->xdata_req); return 0; } int -afr_mkdir_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t * local = NULL; - - local = frame->local; - - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); - - return 0; -} - -int -afr_mkdir (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, mode_t umask, dict_t *params) +afr_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + mode_t umask, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; afr_internal_lock_t *int_lock = NULL; call_frame_t *transaction_frame = NULL; int ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + int op_errno = ENOMEM; priv = this->private; QUORUM_CHECK(mkdir,out); transaction_frame = copy_frame (frame); - if (!transaction_frame) { - op_errno = ENOMEM; + if (!transaction_frame) goto out; - } - AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); - local = transaction_frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) + goto out; loc_copy (&local->loc, loc); - - LOCK (&priv->read_child_lock); - { - local->read_child_index = (++priv->read_child_rr) - % (priv->child_count); - } - UNLOCK (&priv->read_child_lock); + local->inode = inode_ref (loc->inode); + local->parent = inode_ref (loc->parent); local->cont.mkdir.mode = mode; local->umask = umask; - if (params) - 
local->xdata_req = dict_ref (params); + + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); + + if (!local->xdata_req) + goto out; local->op = GF_FOP_MKDIR; - local->transaction.fop = afr_mkdir_wind; - local->transaction.done = afr_mkdir_done; + local->transaction.wind = afr_mkdir_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; local->transaction.unwind = afr_mkdir_unwind; ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, @@ -896,20 +769,17 @@ afr_mkdir (call_frame_t *frame, xlator_t *this, int_lock->lockee_count++; ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); if (ret < 0) { - op_errno = -ret; - goto out; + op_errno = -ret; + goto out; } - ret = 0; + return 0; out: - if (ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - - AFR_STACK_UNWIND (mkdir, frame, -1, op_errno, - NULL, NULL, NULL, NULL, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (mkdir, frame, -1, op_errno, NULL, NULL, NULL, NULL, + NULL); return 0; } @@ -926,25 +796,14 @@ afr_link_unwind (call_frame_t *frame, xlator_t *this) local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) { - main_frame = local->transaction.main_frame; - } - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - AFR_STACK_UNWIND (link, main_frame, - local->op_ret, local->op_errno, - local->cont.dir_fop.inode, - &local->cont.dir_fop.buf, - &local->cont.dir_fop.preparent, - &local->cont.dir_fop.postparent, - NULL); - } + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; + AFR_STACK_UNWIND (link, main_frame, local->op_ret, local->op_errno, + local->inode, &local->cont.dir_fop.buf, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, local->xdata_rsp); return 0; } @@ -955,127 +814,70 @@ 
afr_link_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, struct iatt *buf, struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { - int call_count = -1; - int child_index = -1; - - child_index = (long) cookie; - - LOCK (&frame->lock); - { - __dir_entry_fop_common_cbk (frame, child_index, this, - op_ret, op_errno, inode, buf, - preparent, postparent, NULL, NULL); - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) - afr_dir_fop_done (frame, this); - - return 0; + return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, buf, + preparent, postparent, NULL, NULL, xdata); } int -afr_link_wind (call_frame_t *frame, xlator_t *this) +afr_link_wind (call_frame_t *frame, xlator_t *this, int subvol) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; local = frame->local; priv = this->private; - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_link_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->link, - &local->loc, - &local->newloc, local->xdata_req); - - if (!--call_count) - break; - } - } - + STACK_WIND_COOKIE (frame, afr_link_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->link, + &local->loc, &local->newloc, local->xdata_req); return 0; } int -afr_link_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t * local = frame->local; - - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); - - return 0; -} - - -int -afr_link (call_frame_t *frame, xlator_t *this, - loc_t *oldloc, loc_t *newloc, dict_t *xdata) +afr_link (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t 
*newloc, + dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; afr_internal_lock_t *int_lock = NULL; call_frame_t *transaction_frame = NULL; int ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + int op_errno = ENOMEM; priv = this->private; QUORUM_CHECK(link,out); transaction_frame = copy_frame (frame); - if (!transaction_frame) { - op_errno = ENOMEM; + if (!transaction_frame) goto out; - } - AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); - local = transaction_frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) + goto out; loc_copy (&local->loc, oldloc); loc_copy (&local->newloc, newloc); + + local->inode = inode_ref (oldloc->inode); + local->parent = inode_ref (newloc->parent); + if (xdata) - local->xdata_req = dict_ref (xdata); + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); - LOCK (&priv->read_child_lock); - { - local->read_child_index = (++priv->read_child_rr) - % (priv->child_count); - } - UNLOCK (&priv->read_child_lock); + if (!local->xdata_req) + goto out; local->op = GF_FOP_LINK; - local->transaction.fop = afr_link_wind; - local->transaction.done = afr_link_done; + + local->transaction.wind = afr_link_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; local->transaction.unwind = afr_link_unwind; ret = afr_build_parent_loc (&local->transaction.parent_loc, newloc, @@ -1098,18 +900,17 @@ afr_link (call_frame_t *frame, xlator_t *this, int_lock->lockee_count++; ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); if (ret < 0) { - op_errno = -ret; - goto out; + op_errno = -ret; + goto out; } - ret = 0; + + return 0; out: - if (ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND 
(link, frame, -1, op_errno, - NULL, NULL, NULL, NULL, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (link, frame, -1, op_errno, NULL, NULL, NULL, NULL, + NULL); return 0; } @@ -1126,25 +927,14 @@ afr_symlink_unwind (call_frame_t *frame, xlator_t *this) local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) { - main_frame = local->transaction.main_frame; - } - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - AFR_STACK_UNWIND (symlink, main_frame, - local->op_ret, local->op_errno, - local->cont.dir_fop.inode, - &local->cont.dir_fop.buf, - &local->cont.dir_fop.preparent, - &local->cont.dir_fop.postparent, - NULL); - } + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; + AFR_STACK_UNWIND (symlink, main_frame, local->op_ret, local->op_errno, + local->inode, &local->cont.dir_fop.buf, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, local->xdata_rsp); return 0; } @@ -1155,132 +945,71 @@ afr_symlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, struct iatt *buf, struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { - int call_count = -1; - int child_index = -1; - - child_index = (long) cookie; - - LOCK (&frame->lock); - { - __dir_entry_fop_common_cbk (frame, child_index, this, - op_ret, op_errno, inode, buf, - preparent, postparent, NULL, NULL); - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) - afr_dir_fop_done (frame, this); - - return 0; + return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, buf, + preparent, postparent, NULL, NULL, xdata); } int -afr_symlink_wind (call_frame_t *frame, xlator_t *this) +afr_symlink_wind (call_frame_t *frame, xlator_t *this, int subvol) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; local = frame->local; priv = this->private; - 
call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_symlink_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->symlink, - local->cont.symlink.linkpath, - &local->loc, - local->umask, - local->xdata_req); - - if (!--call_count) - break; - - } - } - - return 0; -} - - -int -afr_symlink_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t * local = frame->local; - - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); - + STACK_WIND_COOKIE (frame, afr_symlink_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->symlink, + local->cont.symlink.linkpath, &local->loc, + local->umask, local->xdata_req); return 0; } int -afr_symlink (call_frame_t *frame, xlator_t *this, - const char *linkpath, loc_t *loc, mode_t umask, dict_t *params) +afr_symlink (call_frame_t *frame, xlator_t *this, const char *linkpath, + loc_t *loc, mode_t umask, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; afr_internal_lock_t *int_lock = NULL; call_frame_t *transaction_frame = NULL; int ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + int op_errno = ENOMEM; priv = this->private; QUORUM_CHECK(symlink,out); transaction_frame = copy_frame (frame); - if (!transaction_frame) { - op_errno = ENOMEM; + if (!transaction_frame) goto out; - } - - AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); - local = transaction_frame->local; - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) + goto out; loc_copy (&local->loc, loc); - - LOCK 
(&priv->read_child_lock); - { - local->read_child_index = (++priv->read_child_rr) - % (priv->child_count); - } - UNLOCK (&priv->read_child_lock); + local->inode = inode_ref (loc->inode); + local->parent = inode_ref (loc->parent); local->cont.symlink.linkpath = gf_strdup (linkpath); local->umask = umask; - if (params) - local->xdata_req = dict_ref (params); + + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); + + if (!local->xdata_req) + goto out; local->op = GF_FOP_SYMLINK; - local->transaction.fop = afr_symlink_wind; - local->transaction.done = afr_symlink_done; + local->transaction.wind = afr_symlink_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; local->transaction.unwind = afr_symlink_unwind; ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, @@ -1303,19 +1032,17 @@ afr_symlink (call_frame_t *frame, xlator_t *this, int_lock->lockee_count++; ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); if (ret < 0) { - op_errno = -ret; - goto out; + op_errno = -ret; + goto out; } - ret = 0; + return 0; out: - if (ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (symlink, frame, -1, op_errno, - NULL, NULL, NULL, NULL, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (symlink, frame, -1, op_errno, NULL, NULL, NULL, + NULL, NULL); return 0; } @@ -1331,26 +1058,16 @@ afr_rename_unwind (call_frame_t *frame, xlator_t *this) local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) { - main_frame = local->transaction.main_frame; - } - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - AFR_STACK_UNWIND (rename, main_frame, - local->op_ret, local->op_errno, - &local->cont.dir_fop.buf, - &local->cont.dir_fop.preparent, - &local->cont.dir_fop.postparent, - 
&local->cont.dir_fop.prenewparent, - &local->cont.dir_fop.postnewparent, - NULL); - } + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; + AFR_STACK_UNWIND (rename, main_frame, local->op_ret, local->op_errno, + &local->cont.dir_fop.buf, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, + &local->cont.dir_fop.prenewparent, + &local->cont.dir_fop.postnewparent, local->xdata_rsp); return 0; } @@ -1362,131 +1079,72 @@ afr_rename_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, struct iatt *prenewparent, struct iatt *postnewparent, dict_t *xdata) { - afr_local_t * local = NULL; - int call_count = -1; - int child_index = -1; - - local = frame->local; - - child_index = (long) cookie; - - LOCK (&frame->lock); - { - if (afr_fop_failed (op_ret, op_errno) && op_errno != ENOTEMPTY) - afr_transaction_fop_failed (frame, this, child_index); - local->op_errno = op_errno; - local->child_errno[child_index] = op_errno; - - if (op_ret > -1) - __dir_entry_fop_common_cbk (frame, child_index, this, - op_ret, op_errno, NULL, buf, - preoldparent, postoldparent, - prenewparent, postnewparent); - - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) - afr_dir_fop_done (frame, this); - - return 0; + return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, buf, + preoldparent, postoldparent, prenewparent, + postnewparent, xdata); } -int32_t -afr_rename_wind (call_frame_t *frame, xlator_t *this) +int +afr_rename_wind (call_frame_t *frame, xlator_t *this, int subvol) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; - local = frame->local; - priv = this->private; - - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if 
(local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_rename_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->rename, - &local->loc, - &local->newloc, NULL); - if (!--call_count) - break; - } - } + local = frame->local; + priv = this->private; + STACK_WIND_COOKIE (frame, afr_rename_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->rename, + &local->loc, &local->newloc, local->xdata_req); return 0; } int -afr_rename_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t * local = frame->local; - - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); - - return 0; -} - - -int -afr_rename (call_frame_t *frame, xlator_t *this, - loc_t *oldloc, loc_t *newloc, dict_t *xdata) +afr_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; afr_internal_lock_t *int_lock = NULL; call_frame_t *transaction_frame = NULL; int ret = -1; - int op_errno = 0; + int op_errno = ENOMEM; int nlockee = 0; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - priv = this->private; QUORUM_CHECK(rename,out); transaction_frame = copy_frame (frame); - if (!transaction_frame) { - op_errno = ENOMEM; + if (!transaction_frame) goto out; - } - AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); - local = transaction_frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) + goto out; loc_copy (&local->loc, oldloc); loc_copy (&local->newloc, newloc); - local->read_child_index = afr_inode_get_read_ctx (this, oldloc->inode, NULL); + local->inode = inode_ref (oldloc->inode); + local->parent = inode_ref (oldloc->parent); + local->parent2 = inode_ref (newloc->parent); + + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + 
local->xdata_req = dict_new (); + + if (!local->xdata_req) + goto out; local->op = GF_FOP_RENAME; - local->transaction.fop = afr_rename_wind; - local->transaction.done = afr_rename_done; + local->transaction.wind = afr_rename_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; local->transaction.unwind = afr_rename_unwind; ret = afr_build_parent_loc (&local->transaction.parent_loc, oldloc, @@ -1536,20 +1194,17 @@ afr_rename (call_frame_t *frame, xlator_t *this, ret = afr_transaction (transaction_frame, this, AFR_ENTRY_RENAME_TRANSACTION); if (ret < 0) { - op_errno = -ret; - goto out; + op_errno = -ret; + goto out; } - ret = 0; + return 0; out: - if (ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - - AFR_STACK_UNWIND (rename, frame, -1, op_errno, - NULL, NULL, NULL, NULL, NULL, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (rename, frame, -1, op_errno, NULL, NULL, NULL, NULL, + NULL, NULL); return 0; } @@ -1565,23 +1220,13 @@ afr_unlink_unwind (call_frame_t *frame, xlator_t *this) local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) { - main_frame = local->transaction.main_frame; - } - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - AFR_STACK_UNWIND (unlink, main_frame, - local->op_ret, local->op_errno, - &local->cont.dir_fop.preparent, - &local->cont.dir_fop.postparent, - NULL); - } + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; + AFR_STACK_UNWIND (unlink, main_frame, local->op_ret, local->op_errno, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, local->xdata_rsp); return 0; } @@ -1591,123 +1236,69 @@ afr_unlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { - afr_local_t * local = NULL; 
- int call_count = -1; - int child_index = (long) cookie; - - local = frame->local; - - LOCK (&frame->lock); - { - if (child_index == local->read_child_index) { - local->read_child_returned = _gf_true; - } - __dir_entry_fop_common_cbk (frame, child_index, this, - op_ret, op_errno, NULL, NULL, - preparent, postparent, NULL, NULL); - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - if (call_count == 0) - afr_dir_fop_done (frame, this); - - return 0; + return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, NULL, + preparent, postparent, NULL, NULL, xdata); } -int32_t -afr_unlink_wind (call_frame_t *frame, xlator_t *this) +int +afr_unlink_wind (call_frame_t *frame, xlator_t *this, int subvol) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; local = frame->local; priv = this->private; - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_unlink_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->unlink, - &local->loc, local->xflag, - local->xdata_req); - - if (!--call_count) - break; - } - } - - return 0; -} - - -int32_t -afr_unlink_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t * local = frame->local; - - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); - + STACK_WIND_COOKIE (frame, afr_unlink_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->unlink, + &local->loc, local->xflag, local->xdata_req); return 0; } -int32_t -afr_unlink (call_frame_t *frame, xlator_t *this, - loc_t *loc, int xflag, dict_t *xdata) +int +afr_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, + dict_t *xdata) { 
afr_private_t *priv = NULL; afr_local_t *local = NULL; afr_internal_lock_t *int_lock = NULL; call_frame_t *transaction_frame = NULL; int ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + int op_errno = ENOMEM; priv = this->private; QUORUM_CHECK(unlink,out); transaction_frame = copy_frame (frame); - if (!transaction_frame) { - op_errno = ENOMEM; + if (!transaction_frame) goto out; - } - - AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); - local = transaction_frame->local; - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) + goto out; loc_copy (&local->loc, loc); local->xflag = xflag; + + local->inode = inode_ref (loc->inode); + local->parent = inode_ref (loc->parent); + if (xdata) - local->xdata_req = dict_ref (xdata); + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); + + if (!local->xdata_req) + goto out; local->op = GF_FOP_UNLINK; - local->transaction.fop = afr_unlink_wind; - local->transaction.done = afr_unlink_done; + local->transaction.wind = afr_unlink_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; local->transaction.unwind = afr_unlink_unwind; ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, @@ -1730,19 +1321,16 @@ afr_unlink (call_frame_t *frame, xlator_t *this, int_lock->lockee_count++; ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); if (ret < 0) { - op_errno = -ret; - goto out; + op_errno = -ret; + goto out; } - ret = 0; + return 0; out: - if (ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (unlink, frame, -1, op_errno, - NULL, NULL, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (unlink, frame, -1, op_errno, NULL, NULL, NULL); 
return 0; } @@ -1760,23 +1348,13 @@ afr_rmdir_unwind (call_frame_t *frame, xlator_t *this) local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) { - main_frame = local->transaction.main_frame; - } - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - AFR_STACK_UNWIND (rmdir, main_frame, - local->op_ret, local->op_errno, - &local->cont.dir_fop.preparent, - &local->cont.dir_fop.postparent, - NULL); - } + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; + AFR_STACK_UNWIND (rmdir, main_frame, local->op_ret, local->op_errno, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, local->xdata_rsp); return 0; } @@ -1786,130 +1364,71 @@ afr_rmdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { - afr_local_t * local = NULL; - int call_count = -1; - int child_index = (long) cookie; - int read_child = 0; - - local = frame->local; - - LOCK (&frame->lock); - { - if (child_index == read_child) { - local->read_child_returned = _gf_true; - } - if (afr_fop_failed (op_ret, op_errno) && (op_errno != ENOTEMPTY)) - afr_transaction_fop_failed (frame, this, child_index); - local->op_errno = op_errno; - local->child_errno[child_index] = op_errno; - if (op_ret > -1) - __dir_entry_fop_common_cbk (frame, child_index, this, - op_ret, op_errno, NULL, NULL, - preparent, postparent, NULL, - NULL); - - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - if (call_count == 0) - afr_dir_fop_done (frame, this); - - return 0; + return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, NULL, + preparent, postparent, NULL, NULL, xdata); } int -afr_rmdir_wind (call_frame_t *frame, xlator_t *this) +afr_rmdir_wind (call_frame_t *frame, xlator_t *this, int subvol) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - int call_count = -1; - int 
i = 0; local = frame->local; priv = this->private; - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_rmdir_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->rmdir, - &local->loc, local->cont.rmdir.flags, - NULL); - - if (!--call_count) - break; - } - } - - return 0; -} - - -int -afr_rmdir_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t * local = frame->local; - - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); - + STACK_WIND_COOKIE (frame, afr_rmdir_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->rmdir, + &local->loc, local->cont.rmdir.flags, local->xdata_req); return 0; } int -afr_rmdir (call_frame_t *frame, xlator_t *this, - loc_t *loc, int flags, dict_t *xdata) +afr_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, + dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; afr_internal_lock_t *int_lock = NULL; call_frame_t *transaction_frame = NULL; int ret = -1; - int op_errno = 0; + int op_errno = ENOMEM; int nlockee = 0; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - priv = this->private; QUORUM_CHECK(rmdir,out); transaction_frame = copy_frame (frame); - if (!transaction_frame) { - op_errno = ENOMEM; + if (!transaction_frame) goto out; - } - AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); - local = transaction_frame->local; + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) + goto out; - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; - local->cont.rmdir.flags = flags; loc_copy (&local->loc, loc); + local->inode = inode_ref 
(loc->inode); + local->parent = inode_ref (loc->parent); + + local->cont.rmdir.flags = flags; + + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); + + if (!local->xdata_req) + goto out; local->op = GF_FOP_RMDIR; - local->transaction.fop = afr_rmdir_wind; - local->transaction.done = afr_rmdir_done; + local->transaction.wind = afr_rmdir_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; local->transaction.unwind = afr_rmdir_unwind; ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, @@ -1944,18 +1463,16 @@ afr_rmdir (call_frame_t *frame, xlator_t *this, ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); if (ret < 0) { - op_errno = -ret; - goto out; + op_errno = -ret; + goto out; } - ret = 0; + return 0; out: - if (ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (rmdir, frame, -1, op_errno, NULL, NULL, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (rmdir, frame, -1, op_errno, NULL, NULL, NULL); return 0; } diff --git a/xlators/cluster/afr/src/afr-inode-read.c b/xlators/cluster/afr/src/afr-inode-read.c index 0cfebcb9d55..01e078c13e6 100644 --- a/xlators/cluster/afr/src/afr-inode-read.c +++ b/xlators/cluster/afr/src/afr-inode-read.c @@ -35,241 +35,153 @@ #include "compat-errno.h" #include "compat.h" -/** - * Common algorithm for inode read calls: - * - * - Try the fop on the first child that is up - * - if we have failed due to ENOTCONN: - * try the next child - * - * Applicable to: access, stat, fstat, readlink, getxattr - */ +#include "afr-transaction.h" + /* {{{ access */ -int32_t -afr_access_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) +int +afr_access_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) { - afr_private_t 
* priv = NULL; - afr_local_t * local = NULL; - xlator_t ** children = NULL; - int unwind = 1; - int32_t *last_index = NULL; - int32_t next_call_child = -1; - int32_t read_child = -1; - int32_t *fresh_children = NULL; - - priv = this->private; - children = priv->children; + afr_local_t *local = NULL; local = frame->local; - read_child = (long) cookie; + if (op_ret < 0) { + local->op_ret = op_ret; + local->op_errno = op_errno; - if (op_ret == -1) { - last_index = &local->cont.access.last_index; - fresh_children = local->fresh_children; - next_call_child = afr_next_call_child (fresh_children, - local->child_up, - priv->child_count, - last_index, read_child); - if (next_call_child < 0) - goto out; + afr_read_txn_continue (frame, this, (long) cookie); + return 0; + } - unwind = 0; - - STACK_WIND_COOKIE (frame, afr_access_cbk, - (void *) (long) read_child, - children[next_call_child], - children[next_call_child]->fops->access, - &local->loc, local->cont.access.mask, - NULL); - } - -out: - if (unwind) { - AFR_STACK_UNWIND (access, frame, op_ret, op_errno, xdata); - } + AFR_STACK_UNWIND (access, frame, op_ret, op_errno, xdata); return 0; } -int32_t -afr_access (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask, - dict_t *xdata) +int +afr_access_wind (call_frame_t *frame, xlator_t *this, int subvol) { - afr_private_t *priv = NULL; - xlator_t **children = NULL; - int call_child = 0; - afr_local_t *local = NULL; - int32_t op_errno = 0; - int32_t read_child = -1; - int ret = -1; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv->children, out); - - children = priv->children; - - AFR_SBRAIN_CHECK_LOC (loc, out); - - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; - - local->fresh_children = afr_children_create (priv->child_count); - if (!local->fresh_children) { 
- op_errno = ENOMEM; - goto out; - } + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + + priv = this->private; + local = frame->local; + + if (subvol == -1) { + AFR_STACK_UNWIND (access, frame, local->op_ret, + local->op_errno, 0); + return 0; + } + + STACK_WIND_COOKIE (frame, afr_access_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->access, + &local->loc, local->cont.access.mask, + local->xdata_req); + return 0; +} +int +afr_access (call_frame_t *frame, xlator_t *this, loc_t *loc, + int mask, dict_t *xdata) +{ + afr_local_t *local = NULL; + int op_errno = 0; - read_child = afr_inode_get_read_ctx (this, loc->inode, - local->fresh_children); - ret = afr_get_call_child (this, local->child_up, read_child, - local->fresh_children, - &call_child, - &local->cont.access.last_index); - if (ret < 0) { - op_errno = -ret; - goto out; - } + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; - loc_copy (&local->loc, loc); - local->cont.access.mask = mask; + local->op = GF_FOP_ACCESS; + loc_copy (&local->loc, loc); + local->cont.access.mask = mask; + if (xdata) + local->xdata_req = dict_ref (xdata); - STACK_WIND_COOKIE (frame, afr_access_cbk, - (void *) (long) call_child, - children[call_child], - children[call_child]->fops->access, - loc, mask, xdata); + afr_read_txn (frame, this, loc->inode, afr_access_wind, + AFR_METADATA_TRANSACTION); - ret = 0; + return 0; out: - if (ret < 0) - AFR_STACK_UNWIND (access, frame, -1, op_errno, NULL); + AFR_STACK_UNWIND (access, frame, -1, op_errno, NULL); + return 0; } - /* }}} */ /* {{{ stat */ -int32_t +int afr_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *buf, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - xlator_t ** children = NULL; - int unwind = 1; - int32_t *last_index = NULL; - int32_t next_call_child = -1; - int32_t read_child = -1; - int32_t *fresh_children = NULL; - - 
priv = this->private; - children = priv->children; - - read_child = (long) cookie; + afr_local_t *local = NULL; local = frame->local; - if (op_ret == -1) { - last_index = &local->cont.stat.last_index; - fresh_children = local->fresh_children; - next_call_child = afr_next_call_child (fresh_children, - local->child_up, - priv->child_count, - last_index, read_child); - if (next_call_child < 0) - goto out; + if (op_ret < 0) { + local->op_ret = op_ret; + local->op_errno = op_errno; - unwind = 0; + afr_read_txn_continue (frame, this, (long) cookie); + return 0; + } - STACK_WIND_COOKIE (frame, afr_stat_cbk, - (void *) (long) read_child, - children[next_call_child], - children[next_call_child]->fops->stat, - &local->loc, NULL); - } - -out: - if (unwind) { - AFR_STACK_UNWIND (stat, frame, op_ret, op_errno, buf, xdata); - } + AFR_STACK_UNWIND (stat, frame, op_ret, op_errno, buf, xdata); return 0; } -int32_t -afr_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +int +afr_stat_wind (call_frame_t *frame, xlator_t *this, int subvol) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - xlator_t **children = NULL; - int call_child = 0; - int32_t op_errno = 0; - int32_t read_child = -1; - int ret = -1; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv->children, out); - - children = priv->children; - - AFR_SBRAIN_CHECK_LOC (loc, out); - - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + + priv = this->private; + local = frame->local; + + if (subvol == -1) { + AFR_STACK_UNWIND (stat, frame, local->op_ret, local->op_errno, + 0, 0); + return 0; + } + + STACK_WIND_COOKIE (frame, afr_stat_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->stat, + &local->loc, local->xdata_req); + return 0; +} - ret = afr_local_init (local, priv, 
&op_errno); - if (ret < 0) - goto out; +int +afr_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + afr_local_t *local = NULL; + int op_errno = 0; - local->fresh_children = afr_children_create (priv->child_count); - if (!local->fresh_children) { - op_errno = ENOMEM; - goto out; - } + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; - read_child = afr_inode_get_read_ctx (this, loc->inode, - local->fresh_children); - ret = afr_get_call_child (this, local->child_up, read_child, - local->fresh_children, - &call_child, - &local->cont.stat.last_index); - if (ret < 0) { - op_errno = -ret; - goto out; - } - loc_copy (&local->loc, loc); + local->op = GF_FOP_STAT; + loc_copy (&local->loc, loc); + if (xdata) + local->xdata_req = dict_ref (xdata); - STACK_WIND_COOKIE (frame, afr_stat_cbk, (void *) (long) call_child, - children[call_child], - children[call_child]->fops->stat, - loc, xdata); + afr_read_txn (frame, this, loc->inode, afr_stat_wind, + AFR_DATA_TRANSACTION); - ret = 0; + return 0; out: - if (ret < 0) - AFR_STACK_UNWIND (stat, frame, -1, op_errno, NULL, NULL); + AFR_STACK_UNWIND (stat, frame, -1, op_errno, NULL, NULL); return 0; } @@ -279,52 +191,49 @@ out: /* {{{ fstat */ -int32_t +int afr_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *buf, dict_t *xdata) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - xlator_t **children = NULL; - int unwind = 1; - int32_t *last_index = NULL; - int32_t next_call_child = -1; - int32_t read_child = -1; - int32_t *fresh_children = NULL; - - priv = this->private; - children = priv->children; + afr_local_t *local = NULL; local = frame->local; - read_child = (long) cookie; + if (op_ret < 0) { + local->op_ret = op_ret; + local->op_errno = op_errno; - if (op_ret == -1) { - last_index = &local->cont.fstat.last_index; - fresh_children = local->fresh_children; - next_call_child = afr_next_call_child (fresh_children, - 
local->child_up, - priv->child_count, - last_index, read_child); - if (next_call_child < 0) - goto out; + afr_read_txn_continue (frame, this, (long) cookie); + return 0; + } - unwind = 0; + AFR_STACK_UNWIND (fstat, frame, op_ret, op_errno, buf, xdata); - STACK_WIND_COOKIE (frame, afr_fstat_cbk, - (void *) (long) read_child, - children[next_call_child], - children[next_call_child]->fops->fstat, - local->fd, NULL); - } + return 0; +} -out: - if (unwind) { - AFR_STACK_UNWIND (fstat, frame, op_ret, op_errno, buf, xdata); - } - return 0; +int +afr_fstat_wind (call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + + priv = this->private; + local = frame->local; + + if (subvol == -1) { + AFR_STACK_UNWIND (fstat, frame, local->op_ret, local->op_errno, + 0, 0); + return 0; + } + + STACK_WIND_COOKIE (frame, afr_fstat_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->fstat, + local->fd, local->xdata_req); + return 0; } @@ -332,68 +241,26 @@ int32_t afr_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - xlator_t **children = NULL; - int call_child = 0; - int32_t op_errno = 0; - int32_t read_child = 0; - int ret = -1; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (fd, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv->children, out); - - children = priv->children; - - VALIDATE_OR_GOTO (fd->inode, out); - - AFR_SBRAIN_CHECK_FD (fd, out); - - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; - - local->fresh_children = afr_children_create (priv->child_count); - if (!local->fresh_children) { - op_errno = ENOMEM; - goto out; - } + afr_local_t *local = NULL; + int op_errno = 0; - read_child = afr_inode_get_read_ctx (this, 
fd->inode, - local->fresh_children); + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; + local->op = GF_FOP_FSTAT; + local->fd = fd_ref (fd); + if (xdata) + local->xdata_req = dict_ref (xdata); + afr_fix_open (fd, this); - ret = afr_get_call_child (this, local->child_up, read_child, - local->fresh_children, - &call_child, - &local->cont.fstat.last_index); - if (ret < 0) { - op_errno = -ret; - goto out; - } - - local->fd = fd_ref (fd); - - afr_open_fd_fix (fd, this); + afr_read_txn (frame, this, fd->inode, afr_fstat_wind, + AFR_DATA_TRANSACTION); - STACK_WIND_COOKIE (frame, afr_fstat_cbk, (void *) (long) call_child, - children[call_child], - children[call_child]->fops->fstat, - fd, xdata); - - ret = 0; + return 0; out: - if (ret < 0) - AFR_STACK_UNWIND (fstat, frame, -1, op_errno, NULL, NULL); + AFR_STACK_UNWIND (fstat, frame, -1, op_errno, NULL, NULL); return 0; } @@ -402,117 +269,77 @@ out: /* {{{ readlink */ -int32_t +int afr_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, const char *buf, struct iatt *sbuf, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - xlator_t ** children = NULL; - int unwind = 1; - int32_t *last_index = NULL; - int32_t next_call_child = -1; - int32_t read_child = -1; - int32_t *fresh_children = NULL; + afr_local_t *local = NULL; - priv = this->private; - children = priv->children; + local = frame->local; - local = frame->local; + if (op_ret < 0) { + local->op_ret = -1; + local->op_errno = op_errno; - read_child = (long) cookie; + afr_read_txn_continue (frame, this, (long) cookie); + return 0; + } - if (op_ret == -1) { - last_index = &local->cont.readlink.last_index; - fresh_children = local->fresh_children; - next_call_child = afr_next_call_child (fresh_children, - local->child_up, - priv->child_count, - last_index, read_child); - if (next_call_child < 0) - goto out; - - unwind = 0; - STACK_WIND_COOKIE (frame, afr_readlink_cbk, - (void *) 
(long) read_child, - children[next_call_child], - children[next_call_child]->fops->readlink, - &local->loc, - local->cont.readlink.size, NULL); - } - -out: - if (unwind) { - AFR_STACK_UNWIND (readlink, frame, op_ret, op_errno, buf, sbuf, - xdata); - } + AFR_STACK_UNWIND (readlink, frame, op_ret, op_errno, + buf, sbuf, xdata); + return 0; +} - return 0; +int +afr_readlink_wind (call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; + + if (subvol == -1) { + AFR_STACK_UNWIND (readlink, frame, local->op_ret, + local->op_errno, 0, 0, 0); + return 0; + } + + STACK_WIND_COOKIE (frame, afr_readlink_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->readlink, + &local->loc, local->cont.readlink.size, + local->xdata_req); + return 0; } -int32_t +int afr_readlink (call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size, dict_t *xdata) { - afr_private_t *priv = NULL; - xlator_t **children = NULL; - int call_child = 0; - afr_local_t *local = NULL; + afr_local_t * local = NULL; int32_t op_errno = 0; - int32_t read_child = -1; - int ret = -1; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv->children, out); - children = priv->children; - - AFR_SBRAIN_CHECK_LOC (loc, out); - - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; - - local->fresh_children = afr_children_create (priv->child_count); - if (!local->fresh_children) { - op_errno = ENOMEM; - goto out; - } - read_child = afr_inode_get_read_ctx (this, loc->inode, - local->fresh_children); - ret = afr_get_call_child (this, local->child_up, read_child, - local->fresh_children, - &call_child, - &local->cont.readlink.last_index); - if (ret < 0) { - op_errno = -ret; - goto out; - } + 
local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; + local->op = GF_FOP_READLINK; loc_copy (&local->loc, loc); + local->cont.readlink.size = size; + if (xdata) + local->xdata_req = dict_ref (xdata); - local->cont.readlink.size = size; - - STACK_WIND_COOKIE (frame, afr_readlink_cbk, - (void *) (long) call_child, - children[call_child], - children[call_child]->fops->readlink, - loc, size, xdata); + afr_read_txn (frame, this, loc->inode, afr_readlink_wind, + AFR_DATA_TRANSACTION); - ret = 0; -out: - if (ret < 0) - AFR_STACK_UNWIND (readlink, frame, -1, op_errno, NULL, NULL, NULL); return 0; +out: + AFR_STACK_UNWIND(readlink, frame, -1, op_errno, 0, 0, 0); + + return 0; } @@ -550,7 +377,7 @@ __gather_xattr_keys (dict_t *dict, char *key, data_t *value, void -__filter_xattrs (dict_t *dict) +afr_filter_xattrs (dict_t *dict) { struct list_head keys = {0,}; struct _xattr_key *key = NULL; @@ -571,59 +398,56 @@ __filter_xattrs (dict_t *dict) } - -int32_t +int afr_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - xlator_t ** children = NULL; - int unwind = 1; - int32_t *last_index = NULL; - int32_t next_call_child = -1; - int32_t read_child = -1; - int32_t *fresh_children = NULL; - - priv = this->private; - children = priv->children; + afr_local_t *local = NULL; local = frame->local; - read_child = (long) cookie; - - if (op_ret == -1) { - last_index = &local->cont.getxattr.last_index; - fresh_children = local->fresh_children; - next_call_child = afr_next_call_child (fresh_children, - local->child_up, - priv->child_count, - last_index, read_child); - if (next_call_child < 0) - goto out; + if (op_ret < 0) { + local->op_ret = op_ret; + local->op_errno = op_errno; - unwind = 0; - STACK_WIND_COOKIE (frame, afr_getxattr_cbk, - (void *) (long) read_child, - children[next_call_child], - 
children[next_call_child]->fops->getxattr, - &local->loc, - local->cont.getxattr.name, - NULL); - } + afr_read_txn_continue (frame, this, (long) cookie); + return 0; + } -out: - if (unwind) { - if (op_ret >= 0 && dict) - __filter_xattrs (dict); + if (dict) + afr_filter_xattrs (dict); - AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict, xdata); - } + AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict, xdata); return 0; } + +int +afr_getxattr_wind (call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; + + if (subvol == -1) { + AFR_STACK_UNWIND (getxattr, frame, local->op_ret, + local->op_errno, NULL, NULL); + return 0; + } + + STACK_WIND_COOKIE (frame, afr_getxattr_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->getxattr, + &local->loc, local->cont.getxattr.name, + local->xdata_req); + return 0; +} + + int32_t afr_getxattr_unwind (call_frame_t *frame, int op_ret, int op_errno, dict_t *dict, dict_t *xdata) @@ -659,7 +483,7 @@ afr_fgetxattr_clrlk_cbk (call_frame_t *frame, void *cookie, { callcnt = --local->call_count; if (op_ret == -1) - local->child_errno[cky] = op_errno; + local->replies[cky].op_errno = op_errno; if (!local->dict) local->dict = dict_new (); @@ -710,12 +534,10 @@ unlock: unwind: // Updating child_errno with more recent 'events' - local->child_errno[cky] = op_errno; - op_errno = afr_resultant_errno_get (NULL, local->child_errno, - priv->child_count); + op_errno = afr_final_errno (local, priv); + AFR_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, xattr, xdata); - if (xattr) dict_unref (xattr); } @@ -749,7 +571,7 @@ afr_getxattr_clrlk_cbk (call_frame_t *frame, void *cookie, { callcnt = --local->call_count; if (op_ret == -1) - local->child_errno[cky] = op_errno; + local->replies[cky].op_errno = op_errno; if (!local->dict) local->dict = dict_new (); @@ -800,9 +622,8 @@ unlock: unwind: // 
Updating child_errno with more recent 'events' - local->child_errno[cky] = op_errno; - op_errno = afr_resultant_errno_get (NULL, local->child_errno, - priv->child_count); + op_errno = afr_final_errno (local, priv); + AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, xattr, xdata); if (xattr) @@ -1411,7 +1232,7 @@ afr_is_special_xattr (const char *name, fop_getxattr_cbk_t *cbk, } if (!strcmp (name, GF_XATTR_PATHINFO_KEY) || - !strcmp (name, GF_XATTR_USER_PATHINFO_KEY)) { + !strcmp (name, GF_XATTR_USER_PATHINFO_KEY)) { if (is_fgetxattr) { *cbk = afr_fgetxattr_pathinfo_cbk; } else { @@ -1442,18 +1263,16 @@ out: } static void -afr_getxattr_frm_all_children (xlator_t *this, call_frame_t *frame, - const char *name, loc_t *loc, - fop_getxattr_cbk_t cbk) +afr_getxattr_all_subvols (xlator_t *this, call_frame_t *frame, + const char *name, loc_t *loc, + fop_getxattr_cbk_t cbk) { afr_private_t *priv = NULL; afr_local_t *local = NULL; - xlator_t **children = NULL; int i = 0; int call_count = 0; priv = this->private; - children = priv->children; local = frame->local; //local->call_count set in afr_local_init @@ -1465,8 +1284,8 @@ afr_getxattr_frm_all_children (xlator_t *this, call_frame_t *frame, for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { STACK_WIND_COOKIE (frame, cbk, - (void *) (long) i, children[i], - children[i]->fops->getxattr, + (void *) (long) i, priv->children[i], + priv->children[i]->fops->getxattr, loc, name, NULL); if (!--call_count) break; @@ -1481,41 +1300,41 @@ afr_getxattr (call_frame_t *frame, xlator_t *this, { afr_private_t *priv = NULL; xlator_t **children = NULL; - int call_child = 0; afr_local_t *local = NULL; xlator_list_t *trav = NULL; xlator_t **sub_volumes = NULL; int i = 0; int32_t op_errno = 0; - int32_t read_child = -1; int ret = -1; fop_getxattr_cbk_t cbk = NULL; int afr_xtime_gauge[MCNT_MAX] = {0,}; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + + local = 
AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; priv = this->private; - VALIDATE_OR_GOTO (priv->children, out); children = priv->children; - AFR_SBRAIN_CHECK_LOC (loc, out); + loc_copy (&local->loc, loc); - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; + local->op = GF_FOP_GETXATTR; - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + if (xdata) + local->xdata_req = dict_ref (xdata); - loc_copy (&local->loc, loc); if (!name) goto no_name; local->cont.getxattr.name = gf_strdup (name); + if (!local->cont.getxattr.name) { + op_errno = ENOMEM; + goto out; + } + if (!strncmp (name, AFR_XATTR_PREFIX, strlen (AFR_XATTR_PREFIX))) { gf_log (this->name, GF_LOG_INFO, @@ -1559,8 +1378,7 @@ afr_getxattr (call_frame_t *frame, xlator_t *this, * collect information from all childs */ if (afr_is_special_xattr (name, &cbk, 0)) { - afr_getxattr_frm_all_children (this, frame, name, - loc, cbk); + afr_getxattr_all_subvols (this, frame, name, loc, cbk); return 0; } @@ -1615,28 +1433,9 @@ afr_getxattr (call_frame_t *frame, xlator_t *this, } no_name: - local->fresh_children = afr_children_create (priv->child_count); - if (!local->fresh_children) { - op_errno = ENOMEM; - goto out; - } - read_child = afr_inode_get_read_ctx (this, loc->inode, - local->fresh_children); - ret = afr_get_call_child (this, local->child_up, read_child, - local->fresh_children, - &call_child, - &local->cont.getxattr.last_index); - if (ret < 0) { - op_errno = -ret; - goto out; - } - - STACK_WIND_COOKIE (frame, afr_getxattr_cbk, - (void *) (long) call_child, - children[call_child], - children[call_child]->fops->getxattr, - loc, name, xdata); + afr_read_txn (frame, this, local->loc.inode, afr_getxattr_wind, + AFR_METADATA_TRANSACTION); ret = 0; out: @@ -1653,76 +1452,60 @@ afr_fgetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = 
NULL; - xlator_t ** children = NULL; - int unwind = 1; - int32_t *last_index = NULL; - int32_t next_call_child = -1; - int32_t read_child = -1; - int32_t *fresh_children = NULL; - - priv = this->private; - children = priv->children; - - local = frame->local; + afr_local_t *local = NULL; - read_child = (long) cookie; + local = frame->local; - if (op_ret == -1) { - last_index = &local->cont.getxattr.last_index; - fresh_children = local->fresh_children; - next_call_child = afr_next_call_child (fresh_children, - local->child_up, - priv->child_count, - last_index, read_child); - if (next_call_child < 0) - goto out; + if (op_ret < 0) { + local->op_ret = -1; + local->op_errno = op_errno; - unwind = 0; - STACK_WIND_COOKIE (frame, afr_fgetxattr_cbk, - (void *) (long) read_child, - children[next_call_child], - children[next_call_child]->fops->fgetxattr, - local->fd, - local->cont.getxattr.name, - NULL); - } + afr_read_txn_continue (frame, this, (long) cookie); + return 0; + } -out: - if (unwind) { - if (op_ret >= 0 && dict) - __filter_xattrs (dict); + if (dict) + afr_filter_xattrs (dict); - AFR_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, dict, - xdata); - } + AFR_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, dict, xdata); - return 0; + return 0; } -int32_t -afr_fgetxattr_unwind (call_frame_t *frame, - int op_ret, int op_errno, dict_t *dict, dict_t *xdata) - +int +afr_fgetxattr_wind (call_frame_t *frame, xlator_t *this, int subvol) { - AFR_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, dict, xdata); - return 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; + + if (subvol == -1) { + AFR_STACK_UNWIND (fgetxattr, frame, local->op_ret, + local->op_errno, NULL, NULL); + return 0; + } + + STACK_WIND_COOKIE (frame, afr_fgetxattr_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->fgetxattr, + local->fd, local->cont.getxattr.name, + local->xdata_req); + return 0; } + + static 
void -afr_fgetxattr_frm_all_children (xlator_t *this, call_frame_t *frame, - const char *name, fd_t *fd, - fop_fgetxattr_cbk_t cbk) +afr_fgetxattr_all_subvols (xlator_t *this, call_frame_t *frame, + fop_fgetxattr_cbk_t cbk) { afr_private_t *priv = NULL; afr_local_t *local = NULL; - xlator_t **children = NULL; int i = 0; int call_count = 0; priv = this->private; - children = priv->children; local = frame->local; //local->call_count set in afr_local_init @@ -1735,9 +1518,10 @@ afr_fgetxattr_frm_all_children (xlator_t *this, call_frame_t *frame, if (local->child_up[i]) { STACK_WIND_COOKIE (frame, cbk, (void *) (long) i, - children[i], - children[i]->fops->fgetxattr, - fd, name, NULL); + priv->children[i], + priv->children[i]->fops->fgetxattr, + local->fd, local->cont.getxattr.name, + NULL); if (!--call_count) break; } @@ -1746,42 +1530,30 @@ afr_fgetxattr_frm_all_children (xlator_t *this, call_frame_t *frame, return; } -int32_t + +int afr_fgetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name, dict_t *xdata) { - afr_private_t *priv = NULL; - xlator_t **children = NULL; - int call_child = 0; afr_local_t *local = NULL; - int32_t op_ret = -1; int32_t op_errno = 0; - int32_t read_child = -1; fop_fgetxattr_cbk_t cbk = NULL; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv->children, out); - - children = priv->children; - - AFR_SBRAIN_CHECK_FD (fd, out); - - AFR_LOCAL_ALLOC_OR_GOTO (local, out); - frame->local = local; - - op_ret = afr_local_init (local, priv, &op_errno); - if (op_ret < 0) { - op_errno = -op_ret; - goto out; - } + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; + local->op = GF_FOP_FGETXATTR; local->fd = fd_ref (fd); - if (name) + if (name) { local->cont.getxattr.name = gf_strdup (name); + if (!local->cont.getxattr.name) { + op_errno = ENOMEM; + goto out; + } + } + if (xdata) + local->xdata_req = dict_ref 
(xdata); /* pathinfo gets handled only in getxattr(), but we need to handle * lockinfo. @@ -1789,42 +1561,19 @@ afr_fgetxattr (call_frame_t *frame, xlator_t *this, * collect information from all children. */ if (afr_is_special_xattr (name, &cbk, 1)) { - afr_fgetxattr_frm_all_children (this, frame, name, - fd, cbk); + afr_fgetxattr_all_subvols (this, frame, cbk); return 0; } + afr_fix_open (fd, this); - local->fresh_children = afr_children_create (priv->child_count); - if (!local->fresh_children) { - op_errno = ENOMEM; - goto out; - } - - read_child = afr_inode_get_read_ctx (this, fd->inode, - local->fresh_children); - op_ret = afr_get_call_child (this, local->child_up, read_child, - local->fresh_children, - &call_child, - &local->cont.getxattr.last_index); - if (op_ret < 0) { - op_errno = -op_ret; - op_ret = -1; - goto out; - } - - STACK_WIND_COOKIE (frame, afr_fgetxattr_cbk, - (void *) (long) call_child, - children[call_child], - children[call_child]->fops->fgetxattr, - fd, name, xdata); + afr_read_txn (frame, this, fd->inode, afr_fgetxattr_wind, + AFR_METADATA_TRANSACTION); - op_ret = 0; + return 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, NULL, - NULL); - } + AFR_STACK_UNWIND (fgetxattr, frame, -1, op_errno, NULL, NULL); + return 0; } @@ -1833,144 +1582,84 @@ out: /* {{{ readv */ -/** - * read algorithm: - * - * if the user has specified a read subvolume, use it - * otherwise - - * use the inode number to hash it to one of the subvolumes, and - * read from there (to balance read load) - * - * if any of the above read's fail, try the children in sequence - * beginning at the beginning - */ - -int32_t +int afr_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iovec *vector, int32_t count, struct iatt *buf, struct iobref *iobref, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - xlator_t ** children = NULL; - int unwind = 1; - int32_t 
*last_index = NULL; - int32_t next_call_child = -1; - int32_t *fresh_children = NULL; - int32_t read_child = -1; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv->children, out); - - children = priv->children; + afr_local_t *local = NULL; - local = frame->local; + local = frame->local; - read_child = (long) cookie; + if (op_ret < 0) { + local->op_ret = -1; + local->op_errno = op_errno; - if (op_ret == -1) { - last_index = &local->cont.readv.last_index; - fresh_children = local->fresh_children; - next_call_child = afr_next_call_child (fresh_children, - local->child_up, - priv->child_count, - last_index, read_child); - if (next_call_child < 0) - goto out; - - unwind = 0; + afr_read_txn_continue (frame, this, (long) cookie); + return 0; + } - STACK_WIND_COOKIE (frame, afr_readv_cbk, - (void *) (long) read_child, - children[next_call_child], - children[next_call_child]->fops->readv, - local->fd, local->cont.readv.size, - local->cont.readv.offset, - local->cont.readv.flags, - NULL); - } + AFR_STACK_UNWIND (readv, frame, op_ret, op_errno, + vector, count, buf, iobref, xdata); + return 0; +} -out: - if (unwind) { - AFR_STACK_UNWIND (readv, frame, op_ret, op_errno, - vector, count, buf, iobref, xdata); - } - return 0; +int +afr_readv_wind (call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; + + if (subvol == -1) { + AFR_STACK_UNWIND (readv, frame, local->op_ret, local->op_errno, + 0, 0, 0, 0, 0); + return 0; + } + + STACK_WIND_COOKIE (frame, afr_readv_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->readv, + local->fd, local->cont.readv.size, + local->cont.readv.offset, local->cont.readv.flags, + local->xdata_req); + return 0; } -int32_t -afr_readv (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t 
offset, uint32_t flags, dict_t *xdata) +int +afr_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata) { - afr_private_t * priv = NULL; afr_local_t * local = NULL; - xlator_t ** children = NULL; - int call_child = 0; int32_t op_errno = 0; - int32_t read_child = -1; - int ret = -1; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - VALIDATE_OR_GOTO (fd, out); - - priv = this->private; - children = priv->children; - AFR_SBRAIN_CHECK_FD (fd, out); + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; - - local->fresh_children = afr_children_create (priv->child_count); - if (!local->fresh_children) { - op_errno = ENOMEM; - goto out; - } - - read_child = afr_inode_get_read_ctx (this, fd->inode, local->fresh_children); - ret = afr_get_call_child (this, local->child_up, read_child, - local->fresh_children, - &call_child, - &local->cont.readv.last_index); - if (ret < 0) { - op_errno = -ret; - goto out; - } - - local->fd = fd_ref (fd); - - local->cont.readv.size = size; - local->cont.readv.offset = offset; - local->cont.readv.flags = flags; + local->op = GF_FOP_READ; + local->fd = fd_ref (fd); + local->cont.readv.size = size; + local->cont.readv.offset = offset; + local->cont.readv.flags = flags; + if (xdata) + local->xdata_req = dict_ref (xdata); - afr_open_fd_fix (fd, this); + afr_fix_open (fd, this); - STACK_WIND_COOKIE (frame, afr_readv_cbk, - (void *) (long) call_child, - children[call_child], - children[call_child]->fops->readv, - fd, size, offset, flags, xdata); + afr_read_txn (frame, this, fd->inode, afr_readv_wind, + AFR_DATA_TRANSACTION); - ret = 0; -out: - if (ret < 0) { - AFR_STACK_UNWIND (readv, frame, -1, op_errno, NULL, 0, NULL, - NULL, NULL); - } return 0; +out: + 
AFR_STACK_UNWIND(readv, frame, -1, op_errno, 0, 0, 0, 0, 0); + + return 0; } /* }}} */ diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c index d62847defa3..3dacfc8dd5d 100644 --- a/xlators/cluster/afr/src/afr-inode-write.c +++ b/xlators/cluster/afr/src/afr-inode-write.c @@ -37,46 +37,128 @@ #include "afr.h" #include "afr-transaction.h" -#include "afr-self-heal-common.h" +//#include "afr-self-heal-common.h" -void -__inode_write_fop_cbk (call_frame_t *frame, int child_index, int read_child, - xlator_t *this, int32_t *op_ret, int32_t *op_errno, - struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) + +static void +__afr_inode_write_finalize (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int read_subvol = 0; + int i = 0; + + local = frame->local; + priv = this->private; + + if (local->inode) { + if (local->transaction.type == AFR_METADATA_TRANSACTION) + read_subvol = afr_metadata_subvol_get (local->inode, this, + NULL, NULL); + else + read_subvol = afr_data_subvol_get (local->inode, this, + NULL, NULL); + } + + local->op_ret = -1; + local->op_errno = afr_final_errno (local, priv); + + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid) + continue; + if (local->replies[i].op_ret < 0) { + afr_inode_read_subvol_reset (local->inode, this); + continue; + } + + /* Order of checks in the compound conditional + below is important. 
+ + - Highest precedence: largest op_ret + - Next precendence: if all op_rets are equal, read subvol + - Least precedence: any succeeded subvol + */ + if ((local->op_ret < local->replies[i].op_ret) || + ((local->op_ret == local->replies[i].op_ret) && + (i == read_subvol))) { + + local->op_ret = local->replies[i].op_ret; + local->op_errno = local->replies[i].op_errno; + + local->cont.inode_wfop.prebuf = + local->replies[i].prestat; + local->cont.inode_wfop.postbuf = + local->replies[i].poststat; + + if (local->replies[i].xdata) { + if (local->xdata_rsp) + dict_unref (local->xdata_rsp); + local->xdata_rsp = + dict_ref (local->replies[i].xdata); + } + } + } +} + + +static void +__afr_inode_write_fill (call_frame_t *frame, xlator_t *this, int child_index, + int op_ret, int op_errno, + struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) { - afr_local_t *local = NULL; + afr_local_t *local = NULL; local = frame->local; - if (afr_fop_failed (*op_ret, *op_errno)) { - local->child_errno[child_index] = *op_errno; - - switch (local->op) { - case GF_FOP_TRUNCATE: - case GF_FOP_FTRUNCATE: - if (*op_errno != EFBIG) - afr_transaction_fop_failed (frame, this, - child_index); - break; - default: - afr_transaction_fop_failed (frame, this, child_index); - break; - } - local->op_errno = *op_errno; - goto out; + local->replies[child_index].valid = 1; + local->replies[child_index].op_ret = op_ret; + local->replies[child_index].op_errno = op_errno; + + if (op_ret >= 0) { + if (prebuf) + local->replies[child_index].prestat = *prebuf; + if (postbuf) + local->replies[child_index].poststat = *postbuf; + if (xdata) + local->replies[child_index].xdata = dict_ref (xdata); + } else { + afr_transaction_fop_failed (frame, this, child_index); + } + + return; +} + + +static int +__afr_inode_write_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + afr_local_t *local = NULL; + int child_index 
= (long) cookie; + int call_count = -1; + + local = frame->local; + + LOCK (&frame->lock); + { + __afr_inode_write_fill (frame, this, child_index, op_ret, + op_errno, prebuf, postbuf, xdata); } + UNLOCK (&frame->lock); - if ((local->success_count == 0) || (read_child == child_index)) { - local->op_ret = *op_ret; - if (prebuf) - local->cont.inode_wfop.prebuf = *prebuf; - if (postbuf) - local->cont.inode_wfop.postbuf = *postbuf; + call_count = afr_frame_return (frame); + + if (call_count == 0) { + __afr_inode_write_finalize (frame, this); + + if (afr_txn_nothing_failed (frame, this)) + local->transaction.unwind (frame, this); + + local->transaction.resume (frame, this); } - local->success_count++; -out: - return; + return 0; } /* {{{ writev */ @@ -94,6 +176,8 @@ afr_writev_copy_outvars (call_frame_t *src_frame, call_frame_t *dst_frame) dst_local->op_errno = src_local->op_errno; dst_local->cont.inode_wfop.prebuf = src_local->cont.inode_wfop.prebuf; dst_local->cont.inode_wfop.postbuf = src_local->cont.inode_wfop.postbuf; + if (src_local->xdata_rsp) + dst_local->xdata_rsp = dict_ref (src_local->xdata_rsp); } void @@ -106,26 +190,9 @@ afr_writev_unwind (call_frame_t *frame, xlator_t *this) local->op_ret, local->op_errno, &local->cont.inode_wfop.prebuf, &local->cont.inode_wfop.postbuf, - NULL); + local->xdata_rsp); } -call_frame_t* -afr_transaction_detach_fop_frame (call_frame_t *frame) -{ - afr_local_t * local = NULL; - call_frame_t *fop_frame = NULL; - - local = frame->local; - - LOCK (&frame->lock); - { - fop_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - return fop_frame; -} int afr_transaction_writev_unwind (call_frame_t *frame, xlator_t *this) @@ -173,82 +240,60 @@ afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, struct iatt *postbuf, dict_t *xdata) { afr_local_t * local = NULL; - afr_private_t *priv = NULL; call_frame_t *fop_frame = NULL; int child_index = (long) cookie; int 
call_count = -1; - int read_child = 0; - int ret = 0; + int ret = 0; uint32_t open_fd_count = 0; uint32_t write_is_append = 0; local = frame->local; - priv = this->private; - - read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); LOCK (&frame->lock); { - if (child_index == read_child) { - local->read_child_returned = _gf_true; - } - - __inode_write_fop_cbk (frame, child_index, read_child, this, - &op_ret, &op_errno, prebuf, postbuf, - xdata); - - local->replies[child_index].valid = 1; - local->replies[child_index].op_ret = op_ret; - local->replies[child_index].op_errno = op_errno; - - - /* stage the best case return value for unwind */ - if ((local->success_count == 0) || (op_ret > local->op_ret)) { - local->op_ret = op_ret; - local->op_errno = op_errno; - } - - if (op_ret != -1) { - if (xdata) { - ret = dict_get_uint32 (xdata, - GLUSTERFS_OPEN_FD_COUNT, - &open_fd_count); - if ((ret == 0) && - (open_fd_count > local->open_fd_count)) { - local->open_fd_count = open_fd_count; - local->update_open_fd_count = _gf_true; - } - - write_is_append = 0; - ret = dict_get_uint32 (xdata, - GLUSTERFS_WRITE_IS_APPEND, - &write_is_append); - if (ret || !write_is_append) - local->append_write = _gf_false; - } - + __afr_inode_write_fill (frame, this, child_index, op_ret, + op_errno, prebuf, postbuf, xdata); + if (op_ret == -1 || !xdata) + goto unlock; + + write_is_append = 0; + ret = dict_get_uint32 (xdata, GLUSTERFS_WRITE_IS_APPEND, + &write_is_append); + if (ret || !write_is_append) + local->append_write = _gf_false; + + ret = dict_get_uint32 (xdata, GLUSTERFS_OPEN_FD_COUNT, + &open_fd_count); + if (ret == -1) + goto unlock; + if ((open_fd_count > local->open_fd_count)) { + local->open_fd_count = open_fd_count; + local->update_open_fd_count = _gf_true; } } +unlock: UNLOCK (&frame->lock); call_count = afr_frame_return (frame); if (call_count == 0) { - - if (local->update_open_fd_count) - afr_handle_open_fd_count (frame, this); - - if (!local->stable_write && 
!local->append_write) + if (!local->stable_write && !local->append_write) /* An appended write removes the necessity to fsync() the file. This is because self-heal has the logic to check for larger file when the xattrs are not reliably pointing at a stale file. */ - afr_fd_report_unstable_write (this, local->fd); + afr_fd_report_unstable_write (this, local->fd); + + __afr_inode_write_finalize (frame, this); afr_writev_handle_short_writes (frame, this); - if (afr_any_fops_failed (local, priv)) { + + if (local->update_open_fd_count) + afr_handle_open_fd_count (frame, this); + + if (!afr_txn_nothing_failed (frame, this)) { //Don't unwind until post-op is complete local->transaction.resume (frame, this); } else { @@ -272,91 +317,23 @@ afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, return 0; } + int -afr_writev_wind (call_frame_t *frame, xlator_t *this) +afr_writev_wind (call_frame_t *frame, xlator_t *this, int subvol) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - int i = 0; - int call_count = -1; - dict_t *xdata = NULL; - GF_UNUSED int ret = 0; local = frame->local; priv = this->private; - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - local->replies = GF_CALLOC(priv->child_count, sizeof(*local->replies), - gf_afr_mt_reply_t); - if (!local->replies) { - local->op_ret = -1; - local->op_errno = ENOMEM; - local->transaction.unwind(frame, this); - local->transaction.resume(frame, this); - return 0; - } - - xdata = dict_new (); - if (xdata) { - ret = dict_set_uint32 (xdata, GLUSTERFS_OPEN_FD_COUNT, - sizeof (uint32_t)); - ret = dict_set_uint32 (xdata, GLUSTERFS_WRITE_IS_APPEND, - 0); - /* Set append_write to be true speculatively. If on any - server it turns not be true, we unset it in the - callback. 
- */ - local->append_write = _gf_true; - } - - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_writev_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->writev, - local->fd, - local->cont.writev.vector, - local->cont.writev.count, - local->cont.writev.offset, - local->cont.writev.flags, - local->cont.writev.iobref, - xdata); - - if (!--call_count) - break; - } - } - - if (xdata) - dict_unref (xdata); - - return 0; -} - - -int -afr_writev_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - - local = frame->local; - - iobref_unref (local->cont.writev.iobref); - local->cont.writev.iobref = NULL; - - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); - + STACK_WIND_COOKIE (frame, afr_writev_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->writev, + local->fd, local->cont.writev.vector, + local->cont.writev.count, local->cont.writev.offset, + local->cont.writev.flags, local->cont.writev.iobref, + local->xdata_req); return 0; } @@ -366,29 +343,29 @@ afr_do_writev (call_frame_t *frame, xlator_t *this) { call_frame_t *transaction_frame = NULL; afr_local_t *local = NULL; - int op_ret = -1; - int op_errno = 0; - - local = frame->local; + int ret = -1; + int op_errno = ENOMEM; transaction_frame = copy_frame (frame); - if (!transaction_frame) { - op_errno = ENOMEM; + if (!transaction_frame) goto out; - } + local = frame->local; transaction_frame->local = local; - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + frame->local = NULL; - local->op = GF_FOP_WRITE; + if (!AFR_FRAME_INIT (frame, op_errno)) + goto out; - local->success_count = 0; + local->op = GF_FOP_WRITE; - local->transaction.fop = afr_writev_wind; - local->transaction.done = afr_writev_done; + local->transaction.wind = afr_writev_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; 
local->transaction.unwind = afr_transaction_writev_unwind; local->transaction.main_frame = frame; + if (local->fd->flags & O_APPEND) { /* * Backend vfs ignores the 'offset' for append mode fd so @@ -405,179 +382,86 @@ afr_do_writev (call_frame_t *frame, xlator_t *this) local->cont.writev.count); } - op_ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); - if (op_ret < 0) { - op_errno = -op_ret; + ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; goto out; } - op_ret = 0; + return 0; out: - if (op_ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (writev, frame, op_ret, op_errno, NULL, NULL, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (writev, frame, -1, op_errno, NULL, NULL, NULL); return 0; } -static void -afr_trigger_open_fd_self_heal (fd_t *fd, xlator_t *this) -{ - call_frame_t *frame = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - char *reason = NULL; - int32_t op_errno = 0; - int ret = 0; - - if (!fd || !fd->inode || uuid_is_null (fd->inode->gfid)) { - gf_log_callingfn (this->name, GF_LOG_ERROR, "Invalid args: " - "fd: %p, inode: %p", fd, - fd ? 
fd->inode : NULL); - goto out; - } - - frame = create_frame (this, this->ctx->pool); - if (!frame) - goto out; - - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; - ret = afr_local_init (local, this->private, &op_errno); - if (ret < 0) - goto out; - - local->loc.inode = inode_ref (fd->inode); - ret = loc_path (&local->loc, NULL); - if (ret < 0) - goto out; - - sh = &local->self_heal; - sh->do_metadata_self_heal = _gf_true; - if (fd->inode->ia_type == IA_IFREG) - sh->do_data_self_heal = _gf_true; - else if (fd->inode->ia_type == IA_IFDIR) - sh->do_entry_self_heal = _gf_true; - - reason = "subvolume came online"; - afr_launch_self_heal (frame, this, fd->inode, _gf_true, - fd->inode->ia_type, reason, NULL, NULL); - return; -out: - AFR_STACK_DESTROY (frame); -} - -void -afr_open_fd_fix (fd_t *fd, xlator_t *this) -{ - int ret = 0; - int i = 0; - afr_fd_ctx_t *fd_ctx = NULL; - gf_boolean_t need_self_heal = _gf_false; - int *need_open = NULL; - size_t need_open_count = 0; - afr_private_t *priv = NULL; - - priv = this->private; - - if (!afr_is_fd_fixable (fd)) - goto out; - - fd_ctx = afr_fd_ctx_get (fd, this); - if (!fd_ctx) - goto out; - - LOCK (&fd->lock); - { - if (fd_ctx->up_count < priv->up_count) { - need_self_heal = _gf_true; - fd_ctx->up_count = priv->up_count; - fd_ctx->down_count = priv->down_count; - } - - need_open = alloca (priv->child_count * sizeof (*need_open)); - for (i = 0; i < priv->child_count; i++) { - need_open[i] = 0; - if (fd_ctx->opened_on[i] != AFR_FD_NOT_OPENED) - continue; - - if (!priv->child_up[i]) - continue; - - fd_ctx->opened_on[i] = AFR_FD_OPENING; - - need_open[i] = 1; - need_open_count++; - } - } - UNLOCK (&fd->lock); - if (ret) - goto out; - - if (need_self_heal) - afr_trigger_open_fd_self_heal (fd, this); - - if (!need_open_count) - goto out; - - afr_fix_open (this, fd, need_open_count, need_open); -out: - return; -} int afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, int32_t 
count, off_t offset, uint32_t flags, struct iobref *iobref, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - int ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int op_errno = ENOMEM; priv = this->private; - if (afr_is_split_brain (this, fd->inode)) { - op_errno = EIO; - goto out; - } - QUORUM_CHECK(writev,out); - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; - local->cont.writev.vector = iov_dup (vector, count); + local->cont.writev.vector = iov_dup (vector, count); + if (!local->cont.writev.vector) + goto out; local->cont.writev.count = count; local->cont.writev.offset = offset; local->cont.writev.flags = flags; local->cont.writev.iobref = iobref_ref (iobref); - local->fd = fd_ref (fd); + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); + + if (!local->xdata_req) + goto out; + + local->fd = fd_ref (fd); + local->inode = inode_ref (fd->inode); + + if (dict_set_uint32 (local->xdata_req, GLUSTERFS_OPEN_FD_COUNT, 4)) { + op_errno = ENOMEM; + goto out; + } + + if (dict_set_uint32 (local->xdata_req, GLUSTERFS_WRITE_IS_APPEND, 4)) { + op_errno = ENOMEM; + goto out; + } + + /* Set append_write to be true speculatively. If on any + server it turns not be true, we unset it in the + callback. 
+ */ + local->append_write = _gf_true; /* detect here, but set it in writev_wind_cbk *after* the unstable write is performed */ local->stable_write = !!((fd->flags|flags)&(O_SYNC|O_DSYNC)); - afr_open_fd_fix (fd, this); + afr_fix_open (fd, this); afr_do_writev (frame, this); - ret = 0; + return 0; out: - if (ret < 0) - AFR_STACK_UNWIND (writev, frame, -1, op_errno, NULL, NULL, NULL); + AFR_STACK_UNWIND (writev, frame, -1, op_errno, NULL, NULL, NULL); return 0; } @@ -595,22 +479,13 @@ afr_truncate_unwind (call_frame_t *frame, xlator_t *this) local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - AFR_STACK_UNWIND (truncate, main_frame, local->op_ret, - local->op_errno, - &local->cont.inode_wfop.prebuf, - &local->cont.inode_wfop.postbuf, - NULL); - } + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; + AFR_STACK_UNWIND (truncate, main_frame, local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, local->xdata_rsp); return 0; } @@ -620,96 +495,32 @@ afr_truncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) { - afr_local_t * local = NULL; - int child_index = (long) cookie; - int read_child = 0; - int call_count = -1; - - local = frame->local; - - read_child = afr_inode_get_read_ctx (this, local->loc.inode, NULL); + afr_local_t *local = NULL; - LOCK (&frame->lock); - { - if (child_index == read_child) { - local->read_child_returned = _gf_true; - } + local = frame->local; - if (op_ret != -1) { - if (prebuf->ia_size != postbuf->ia_size) - local->stable_write = _gf_false; - } - __inode_write_fop_cbk (frame, child_index, read_child, this, - &op_ret, &op_errno, prebuf, postbuf, - xdata); - } - UNLOCK (&frame->lock); + if 
(op_ret == 0 && prebuf->ia_size != postbuf->ia_size) + local->stable_write = _gf_false; - call_count = afr_frame_return (frame); - - if (call_count == 0) { - if (local->stable_write && afr_txn_nothing_failed (frame, this)) - local->transaction.unwind (frame, this); - - local->transaction.resume (frame, this); - } - - return 0; + return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, + prebuf, postbuf, xdata); } -int32_t -afr_truncate_wind (call_frame_t *frame, xlator_t *this) +int +afr_truncate_wind (call_frame_t *frame, xlator_t *this, int subvol) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; local = frame->local; priv = this->private; - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - local->stable_write = _gf_true; - - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_truncate_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->truncate, - &local->loc, - local->cont.truncate.offset, - NULL); - - if (!--call_count) - break; - } - } - - return 0; -} - - -int -afr_truncate_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - - local = frame->local; - - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); - + STACK_WIND_COOKIE (frame, afr_truncate_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->truncate, + &local->loc, local->cont.truncate.offset, + local->xdata_req); return 0; } @@ -721,56 +532,60 @@ afr_truncate (call_frame_t *frame, xlator_t *this, afr_private_t * priv = NULL; afr_local_t * local = NULL; call_frame_t *transaction_frame = NULL; - int ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO 
(this->private, out); + int ret = -1; + int op_errno = ENOMEM; priv = this->private; QUORUM_CHECK(truncate,out); transaction_frame = copy_frame (frame); - if (!transaction_frame) { - op_errno = ENOMEM; + if (!transaction_frame) goto out; - } - AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); - local = transaction_frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) + goto out; local->cont.truncate.offset = offset; + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); + + if (!local->xdata_req) + goto out; - local->transaction.fop = afr_truncate_wind; - local->transaction.done = afr_truncate_done; + local->transaction.wind = afr_truncate_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; local->transaction.unwind = afr_truncate_unwind; loc_copy (&local->loc, loc); + local->inode = inode_ref (loc->inode); + + local->op = GF_FOP_TRUNCATE; local->transaction.main_frame = frame; local->transaction.start = offset; local->transaction.len = 0; + /* Set it true speculatively, will get reset in afr_truncate_wind_cbk + if truncate was not a NOP */ + local->stable_write = _gf_true; + ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); if (ret < 0) { - op_errno = -ret; - goto out; + op_errno = -ret; + goto out; } - ret = 0; + return 0; out: - if (ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (truncate, frame, -1, op_errno, NULL, NULL, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (truncate, frame, -1, op_errno, NULL, NULL, NULL); return 0; } @@ -788,21 +603,13 @@ afr_ftruncate_unwind (call_frame_t *frame, xlator_t *this) local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) - main_frame = 
local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; - if (main_frame) { - AFR_STACK_UNWIND (ftruncate, main_frame, local->op_ret, - local->op_errno, - &local->cont.inode_wfop.prebuf, - &local->cont.inode_wfop.postbuf, - NULL); - } + AFR_STACK_UNWIND (ftruncate, main_frame, local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, local->xdata_rsp); return 0; } @@ -812,122 +619,75 @@ afr_ftruncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) { - afr_local_t * local = NULL; - int child_index = (long) cookie; - int call_count = -1; - int read_child = 0; - - local = frame->local; - - read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); - - LOCK (&frame->lock); - { - if (child_index == read_child) { - local->read_child_returned = _gf_true; - } - - if (op_ret != -1) { - if (prebuf->ia_size != postbuf->ia_size) - local->stable_write = _gf_false; - } - __inode_write_fop_cbk (frame, child_index, read_child, this, - &op_ret, &op_errno, prebuf, postbuf, - xdata); - } - UNLOCK (&frame->lock); + afr_local_t *local = NULL; - call_count = afr_frame_return (frame); + local = frame->local; - if (call_count == 0) { - if (local->stable_write && afr_txn_nothing_failed (frame, this)) - local->transaction.unwind (frame, this); + if (op_ret == 0 && prebuf->ia_size != postbuf->ia_size) + local->stable_write = _gf_false; - local->transaction.resume (frame, this); - } - - return 0; + return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, + prebuf, postbuf, xdata); } int -afr_ftruncate_wind (call_frame_t *frame, xlator_t *this) +afr_ftruncate_wind (call_frame_t *frame, xlator_t *this, int subvol) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - int call_count = -1; - int 
i = 0; - - local = frame->local; - priv = this->private; - - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - local->call_count = call_count; - local->stable_write = _gf_true; - - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_ftruncate_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->ftruncate, - local->fd, - local->cont.ftruncate.offset, - NULL); - - if (!--call_count) - break; - } - } + local = frame->local; + priv = this->private; + STACK_WIND_COOKIE (frame, afr_ftruncate_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->ftruncate, + local->fd, local->cont.ftruncate.offset, + local->xdata_req); return 0; } int -afr_ftruncate_done (call_frame_t *frame, xlator_t *this) +afr_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata) { + afr_private_t *priv = NULL; afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; - local = frame->local; - - local->transaction.unwind (frame, this); + priv = this->private; - AFR_STACK_DESTROY (frame); + QUORUM_CHECK(ftruncate,out); - return 0; -} + transaction_frame = copy_frame (frame); + if (!frame) + goto out; + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) + goto out; -int -afr_do_ftruncate (call_frame_t *frame, xlator_t *this) -{ - call_frame_t * transaction_frame = NULL; - afr_local_t * local = NULL; - int op_ret = -1; - int op_errno = 0; + local->cont.ftruncate.offset = offset; + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); - local = frame->local; + if (!local->xdata_req) + goto out; - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - goto out; - } - - 
transaction_frame->local = local; - frame->local = NULL; + local->fd = fd_ref (fd); + local->inode = inode_ref (fd->inode); local->op = GF_FOP_FTRUNCATE; - local->transaction.fop = afr_ftruncate_wind; - local->transaction.done = afr_ftruncate_done; + local->transaction.wind = afr_ftruncate_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; local->transaction.unwind = afr_ftruncate_unwind; local->transaction.main_frame = frame; @@ -935,69 +695,21 @@ afr_do_ftruncate (call_frame_t *frame, xlator_t *this) local->transaction.start = local->cont.ftruncate.offset; local->transaction.len = 0; - op_ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); - if (op_ret < 0) { - op_errno = -op_ret; - goto out; - } + afr_fix_open (fd, this); - op_ret = 0; -out: - if (op_ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (ftruncate, frame, op_ret, op_errno, NULL, - NULL, NULL); - } - - return 0; -} - - -int -afr_ftruncate (call_frame_t *frame, xlator_t *this, - fd_t *fd, off_t offset, dict_t *xdata) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t *transaction_frame = NULL; - int ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; + /* Set it true speculatively, will get reset in afr_ftruncate_wind_cbk + if truncate was not a NOP */ + local->stable_write = _gf_true; - if (afr_is_split_brain (this, fd->inode)) { - op_errno = EIO; - goto out; + ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; } - QUORUM_CHECK(ftruncate,out); - - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; - local->cont.ftruncate.offset = offset; - - local->fd = fd_ref (fd); - - afr_open_fd_fix 
(fd, this); - - afr_do_ftruncate (frame, this); - - ret = 0; + return 0; out: - if (ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (ftruncate, frame, -1, op_errno, NULL, NULL, NULL); - } + AFR_STACK_UNWIND (ftruncate, frame, -1, op_errno, NULL, NULL, NULL); return 0; } @@ -1009,173 +721,92 @@ out: int afr_setattr_unwind (call_frame_t *frame, xlator_t *this) { - afr_local_t * local = NULL; - call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; + call_frame_t *main_frame = NULL; local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - AFR_STACK_UNWIND (setattr, main_frame, local->op_ret, - local->op_errno, - &local->cont.inode_wfop.prebuf, - &local->cont.inode_wfop.postbuf, - NULL); - } + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; + AFR_STACK_UNWIND (setattr, main_frame, local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, + local->xdata_rsp); return 0; } int afr_setattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, + int op_ret, int op_errno, struct iatt *preop, struct iatt *postop, dict_t *xdata) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - int child_index = (long) cookie; - int read_child = 0; - int call_count = -1; - int need_unwind = 0; - - local = frame->local; - priv = this->private; - - read_child = afr_inode_get_read_ctx (this, local->loc.inode, NULL); - - LOCK (&frame->lock); - { - if (child_index == read_child) { - local->read_child_returned = _gf_true; - } - - __inode_write_fop_cbk (frame, child_index, read_child, this, - &op_ret, &op_errno, preop, postop, - xdata); - - if ((local->success_count >= priv->wait_count) - && local->read_child_returned) { - need_unwind = 1; - } - 
} - UNLOCK (&frame->lock); - - if (need_unwind) - local->transaction.unwind (frame, this); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - local->transaction.resume (frame, this); - } - - return 0; + return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, + preop, postop, xdata); } -int32_t -afr_setattr_wind (call_frame_t *frame, xlator_t *this) +int +afr_setattr_wind (call_frame_t *frame, xlator_t *this, int subvol) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; local = frame->local; priv = this->private; - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_setattr_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->setattr, - &local->loc, - &local->cont.setattr.in_buf, - local->cont.setattr.valid, - NULL); - - if (!--call_count) - break; - } - } - + STACK_WIND_COOKIE (frame, afr_setattr_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->setattr, + &local->loc, &local->cont.setattr.in_buf, + local->cont.setattr.valid, local->xdata_req); return 0; } int -afr_setattr_done (call_frame_t *frame, xlator_t *this) +afr_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *buf, + int32_t valid, dict_t *xdata) { + afr_private_t *priv = NULL; afr_local_t *local = NULL; - - local = frame->local; - - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); - - return 0; -} - - -int -afr_setattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, struct iatt *buf, int32_t valid, dict_t *xdata) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t *transaction_frame = NULL; + call_frame_t 
*transaction_frame = NULL; int ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + int op_errno = ENOMEM; priv = this->private; QUORUM_CHECK(setattr,out); transaction_frame = copy_frame (frame); - if (!transaction_frame) { - op_errno = ENOMEM; + if (!transaction_frame) goto out; - } - AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); - local = transaction_frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) + goto out; local->cont.setattr.in_buf = *buf; local->cont.setattr.valid = valid; + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); - local->transaction.fop = afr_setattr_wind; - local->transaction.done = afr_setattr_done; + if (!local->xdata_req) + goto out; + + local->transaction.wind = afr_setattr_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; local->transaction.unwind = afr_setattr_unwind; loc_copy (&local->loc, loc); + local->inode = inode_ref (loc->inode); + + local->op = GF_FOP_SETATTR; local->transaction.main_frame = frame; local->transaction.start = LLONG_MAX - 1; @@ -1183,18 +814,16 @@ afr_setattr (call_frame_t *frame, xlator_t *this, ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); if (ret < 0) { - op_errno = -ret; - goto out; + op_errno = -ret; + goto out; } - ret = 0; + return 0; out: - if (ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (setattr, frame, -1, op_errno, NULL, NULL, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (setattr, frame, -1, op_errno, NULL, NULL, NULL); return 0; } @@ -1208,22 +837,13 @@ afr_fsetattr_unwind (call_frame_t *frame, xlator_t *this) local = frame->local; - LOCK (&frame->lock); - { 
- if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - AFR_STACK_UNWIND (fsetattr, main_frame, local->op_ret, - local->op_errno, - &local->cont.inode_wfop.prebuf, - &local->cont.inode_wfop.postbuf, - NULL); - } + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; + AFR_STACK_UNWIND (fsetattr, main_frame, local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, local->xdata_rsp); return 0; } @@ -1233,149 +853,72 @@ afr_fsetattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *preop, struct iatt *postop, dict_t *xdata) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - int child_index = (long) cookie; - int read_child = 0; - int call_count = -1; - int need_unwind = 0; - - local = frame->local; - priv = this->private; - - read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); - - LOCK (&frame->lock); - { - if (child_index == read_child) { - local->read_child_returned = _gf_true; - } - - __inode_write_fop_cbk (frame, child_index, read_child, this, - &op_ret, &op_errno, preop, postop, - xdata); - - if ((local->success_count >= priv->wait_count) - && local->read_child_returned) { - need_unwind = 1; - } - } - UNLOCK (&frame->lock); - - if (need_unwind) - local->transaction.unwind (frame, this); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - local->transaction.resume (frame, this); - } - - return 0; + return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, + preop, postop, xdata); } -int32_t -afr_fsetattr_wind (call_frame_t *frame, xlator_t *this) +int +afr_fsetattr_wind (call_frame_t *frame, xlator_t *this, int subvol) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; local = frame->local; priv = this->private; 
- call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_fsetattr_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fsetattr, - local->fd, - &local->cont.fsetattr.in_buf, - local->cont.fsetattr.valid, - NULL); - - if (!--call_count) - break; - } - } - + STACK_WIND_COOKIE (frame, afr_fsetattr_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->fsetattr, + local->fd, &local->cont.fsetattr.in_buf, + local->cont.fsetattr.valid, local->xdata_req); return 0; } int -afr_fsetattr_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - - local = frame->local; - - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); - - return 0; -} - -int afr_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *buf, int32_t valid, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t *transaction_frame = NULL; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; int ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + int op_errno = ENOMEM; priv = this->private; - if (afr_is_split_brain (this, fd->inode)) { - op_errno = EIO; - goto out; - } - QUORUM_CHECK(fsetattr,out); transaction_frame = copy_frame (frame); - if (!transaction_frame) { - op_errno = ENOMEM; + if (!transaction_frame) goto out; - } - AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); - local = transaction_frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local = AFR_FRAME_INIT (transaction_frame, op_errno); + 
if (!local) + goto out; local->cont.fsetattr.in_buf = *buf; local->cont.fsetattr.valid = valid; + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); - local->transaction.fop = afr_fsetattr_wind; - local->transaction.done = afr_fsetattr_done; + if (!local->xdata_req) + goto out; + + local->transaction.wind = afr_fsetattr_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; local->transaction.unwind = afr_fsetattr_unwind; local->fd = fd_ref (fd); + local->inode = inode_ref (fd->inode); + + local->op = GF_FOP_FSETATTR; - afr_open_fd_fix (fd, this); + afr_fix_open (fd, this); local->transaction.main_frame = frame; local->transaction.start = LLONG_MAX - 1; @@ -1383,18 +926,16 @@ afr_fsetattr (call_frame_t *frame, xlator_t *this, ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); if (ret < 0) { - op_errno = -ret; - goto out; + op_errno = -ret; + goto out; } - ret = 0; + return 0; out: - if (ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (fsetattr, frame, -1, op_errno, NULL, NULL, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (fsetattr, frame, -1, op_errno, NULL, NULL, NULL); return 0; } @@ -1410,19 +951,12 @@ afr_setxattr_unwind (call_frame_t *frame, xlator_t *this) local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; - if (main_frame) { - AFR_STACK_UNWIND (setxattr, main_frame, - local->op_ret, local->op_errno, - NULL); - } + AFR_STACK_UNWIND (setxattr, main_frame, local->op_ret, local->op_errno, + local->xdata_rsp); return 0; } @@ -1431,95 +965,32 @@ int afr_setxattr_wind_cbk (call_frame_t *frame, void *cookie, 
xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int call_count = -1; - int need_unwind = 0; - int child_index = (long) cookie; - - local = frame->local; - priv = this->private; - - LOCK (&frame->lock); - { - __inode_write_fop_cbk (frame, child_index, -1, this, - &op_ret, &op_errno, NULL, NULL, - xdata); - if (local->success_count == priv->child_count) { - need_unwind = 1; - } - } - UNLOCK (&frame->lock); - - if (need_unwind) - local->transaction.unwind (frame, this); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - local->transaction.resume (frame, this); - } - - return 0; + return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, + NULL, NULL, xdata); } int -afr_setxattr_wind (call_frame_t *frame, xlator_t *this) +afr_setxattr_wind (call_frame_t *frame, xlator_t *this, int subvol) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; local = frame->local; priv = this->private; - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_setxattr_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->setxattr, - &local->loc, - local->cont.setxattr.dict, - local->cont.setxattr.flags, - NULL); - - if (!--call_count) - break; - } - } - + STACK_WIND_COOKIE (frame, afr_setxattr_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->setxattr, + &local->loc, local->cont.setxattr.dict, + local->cont.setxattr.flags, local->xdata_req); return 0; } int -afr_setxattr_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = frame->local; - - local->transaction.unwind (frame, this); - - 
AFR_STACK_DESTROY (frame); - - return 0; -} - -int -afr_setxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *dict, int32_t flags, dict_t *xdata) +afr_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, + int32_t flags, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; @@ -1527,59 +998,60 @@ afr_setxattr (call_frame_t *frame, xlator_t *this, int ret = -1; int op_errno = EINVAL; - VALIDATE_OR_GOTO (this, out); - GF_IF_INTERNAL_XATTR_GOTO ("trusted.afr.*", dict, op_errno, out); GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.afr.*", dict, op_errno, out); - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this->private, out); - priv = this->private; QUORUM_CHECK(setxattr,out); + transaction_frame = copy_frame (frame); - if (!transaction_frame) { - op_errno = ENOMEM; + if (!transaction_frame) goto out; - } - - AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); - local = transaction_frame->local; - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) goto out; local->cont.setxattr.dict = dict_ref (dict); local->cont.setxattr.flags = flags; + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); - local->transaction.fop = afr_setxattr_wind; - local->transaction.done = afr_setxattr_done; + if (!local->xdata_req) + goto out; + + local->transaction.wind = afr_setxattr_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; local->transaction.unwind = afr_setxattr_unwind; loc_copy (&local->loc, loc); + local->inode = inode_ref (loc->inode); local->transaction.main_frame = frame; local->transaction.start = LLONG_MAX - 1; local->transaction.len = 0; + local->op = GF_FOP_SETXATTR; + ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); if (ret < 0) { - op_errno = -ret; - goto out; + op_errno = -ret; + goto out; } - 
ret = 0; + return 0; out: - if (ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + + AFR_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL); return 0; } @@ -1595,19 +1067,12 @@ afr_fsetxattr_unwind (call_frame_t *frame, xlator_t *this) local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; - if (main_frame) { - AFR_STACK_UNWIND (fsetxattr, main_frame, - local->op_ret, local->op_errno, - NULL); - } + AFR_STACK_UNWIND (fsetxattr, main_frame, local->op_ret, local->op_errno, + local->xdata_rsp); return 0; } @@ -1616,94 +1081,30 @@ int afr_fsetxattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int call_count = -1; - int need_unwind = 0; - int child_index = (long) cookie; - - local = frame->local; - priv = this->private; - - LOCK (&frame->lock); - { - - __inode_write_fop_cbk (frame, child_index, -1, this, - &op_ret, &op_errno, NULL, NULL, - xdata); - if (local->success_count == priv->child_count) { - need_unwind = 1; - } - } - UNLOCK (&frame->lock); - - if (need_unwind) - local->transaction.unwind (frame, this); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - local->transaction.resume (frame, this); - } - - return 0; + return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, + NULL, NULL, xdata); } int -afr_fsetxattr_wind (call_frame_t *frame, xlator_t *this) +afr_fsetxattr_wind (call_frame_t *frame, xlator_t *this, int subvol) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; local = 
frame->local; priv = this->private; - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_fsetxattr_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fsetxattr, - local->fd, - local->cont.fsetxattr.dict, - local->cont.fsetxattr.flags, - NULL); - - if (!--call_count) - break; - } - } - + STACK_WIND_COOKIE (frame, afr_fsetxattr_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->fsetxattr, + local->fd, local->cont.fsetxattr.dict, + local->cont.fsetxattr.flags, local->xdata_req); return 0; } int -afr_fsetxattr_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = frame->local; - - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); - - return 0; -} - -int afr_fsetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict, int32_t flags, dict_t *xdata) { @@ -1711,11 +1112,7 @@ afr_fsetxattr (call_frame_t *frame, xlator_t *this, afr_local_t *local = NULL; call_frame_t *transaction_frame = NULL; int ret = -1; - int op_errno = EINVAL; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + int op_errno = ENOMEM; GF_IF_INTERNAL_XATTR_GOTO ("trusted.afr.*", dict, op_errno, out); @@ -1725,36 +1122,36 @@ afr_fsetxattr (call_frame_t *frame, xlator_t *this, priv = this->private; - if (afr_is_split_brain (this, fd->inode)) { - op_errno = EIO; - goto out; - } - QUORUM_CHECK(fsetxattr,out); - AFR_LOCAL_ALLOC_OR_GOTO (local, out); - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; - transaction_frame = copy_frame (frame); - if (!transaction_frame) { + if (!transaction_frame) goto out; - } - - transaction_frame->local 
= local; - local->op_ret = -1; + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) + goto out; local->cont.fsetxattr.dict = dict_ref (dict); local->cont.fsetxattr.flags = flags; - local->transaction.fop = afr_fsetxattr_wind; - local->transaction.done = afr_fsetxattr_done; + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); + + if (!local->xdata_req) + goto out; + + local->transaction.wind = afr_fsetxattr_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; local->transaction.unwind = afr_fsetxattr_unwind; local->fd = fd_ref (fd); + local->inode = inode_ref (fd->inode); + + local->op = GF_FOP_FSETXATTR; local->transaction.main_frame = frame; local->transaction.start = LLONG_MAX - 1; @@ -1762,18 +1159,16 @@ afr_fsetxattr (call_frame_t *frame, xlator_t *this, ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); if (ret < 0) { - op_errno = -ret; - goto out; + op_errno = -ret; + goto out; } - ret = 0; + return 0; out: - if (ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (fsetxattr, frame, -1, op_errno, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (fsetxattr, frame, -1, op_errno, NULL); return 0; } @@ -1791,19 +1186,12 @@ afr_removexattr_unwind (call_frame_t *frame, xlator_t *this) local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; - if (main_frame) { - AFR_STACK_UNWIND (removexattr, main_frame, - local->op_ret, local->op_errno, - NULL); - } + AFR_STACK_UNWIND (removexattr, main_frame, local->op_ret, local->op_errno, + local->xdata_rsp); return 0; } @@ -1812,88 +1200,25 @@ int 
afr_removexattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int call_count = -1; - int need_unwind = 0; - int child_index = (long) cookie; - - local = frame->local; - priv = this->private; - - LOCK (&frame->lock); - { - __inode_write_fop_cbk (frame, child_index, -1, this, - &op_ret, &op_errno, NULL, NULL, - xdata); - if (local->success_count == priv->wait_count) { - need_unwind = 1; - } - } - UNLOCK (&frame->lock); - - if (need_unwind) - local->transaction.unwind (frame, this); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - local->transaction.resume (frame, this); - } - - return 0; + return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, + NULL, NULL, xdata); } -int32_t -afr_removexattr_wind (call_frame_t *frame, xlator_t *this) +int +afr_removexattr_wind (call_frame_t *frame, xlator_t *this, int subvol) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; - - local = frame->local; - priv = this->private; - - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_removexattr_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->removexattr, - &local->loc, - local->cont.removexattr.name, - NULL); - - if (!--call_count) - break; - } - } - - return 0; -} - - -int -afr_removexattr_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t * local = frame->local; - - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); + local = frame->local; + priv = this->private; + STACK_WIND_COOKIE (frame, afr_removexattr_wind_cbk, (void *) (long) subvol, + 
priv->children[subvol], + priv->children[subvol]->fops->removexattr, + &local->loc, local->cont.removexattr.name, + local->xdata_req); return 0; } @@ -1906,9 +1231,7 @@ afr_removexattr (call_frame_t *frame, xlator_t *this, afr_local_t *local = NULL; call_frame_t *transaction_frame = NULL; int ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (this, out); + int op_errno = ENOMEM; GF_IF_NATIVE_XATTR_GOTO ("trusted.afr.*", name, op_errno, out); @@ -1916,34 +1239,37 @@ afr_removexattr (call_frame_t *frame, xlator_t *this, GF_IF_NATIVE_XATTR_GOTO ("trusted.glusterfs.afr.*", name, op_errno, out); - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this->private, out); - VALIDATE_OR_GOTO (loc, out); - priv = this->private; QUORUM_CHECK(removexattr,out); transaction_frame = copy_frame (frame); - if (!transaction_frame) { - op_errno = ENOMEM; + if (!transaction_frame) goto out; - } - AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); - local = transaction_frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) + goto out; local->cont.removexattr.name = gf_strdup (name); - local->transaction.fop = afr_removexattr_wind; - local->transaction.done = afr_removexattr_done; + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); + + if (!local->xdata_req) + goto out; + + local->transaction.wind = afr_removexattr_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; local->transaction.unwind = afr_removexattr_unwind; loc_copy (&local->loc, loc); + local->inode = inode_ref (loc->inode); + + local->op = GF_FOP_REMOVEXATTR; local->transaction.main_frame = frame; local->transaction.start = LLONG_MAX - 1; @@ -1951,18 +1277,16 @@ afr_removexattr (call_frame_t *frame, xlator_t *this, ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); if (ret < 0) { - 
op_errno = -ret; - goto out; + op_errno = -ret; + goto out; } - ret = 0; + return 0; out: - if (ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (removexattr, frame, -1, op_errno, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (removexattr, frame, -1, op_errno, NULL); return 0; } @@ -1975,19 +1299,12 @@ afr_fremovexattr_unwind (call_frame_t *frame, xlator_t *this) local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; - if (main_frame) { - AFR_STACK_UNWIND (fremovexattr, main_frame, - local->op_ret, local->op_errno, - NULL); - } + AFR_STACK_UNWIND (fremovexattr, main_frame, local->op_ret, local->op_errno, + local->xdata_rsp); return 0; } @@ -1996,105 +1313,38 @@ int afr_fremovexattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - int call_count = -1; - int need_unwind = 0; - int child_index = (long) cookie; - - local = frame->local; - priv = this->private; - - LOCK (&frame->lock); - { - __inode_write_fop_cbk (frame, child_index, -1, this, - &op_ret, &op_errno, NULL, NULL, - xdata); - - if (local->success_count == priv->wait_count) { - need_unwind = 1; - } - } - UNLOCK (&frame->lock); - - if (need_unwind) - local->transaction.unwind (frame, this); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - local->transaction.resume (frame, this); - } - - return 0; + return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, + NULL, NULL, xdata); } -int32_t -afr_fremovexattr_wind (call_frame_t *frame, xlator_t *this) +int +afr_fremovexattr_wind (call_frame_t *frame, xlator_t *this, int subvol) { 
afr_local_t *local = NULL; afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; local = frame->local; priv = this->private; - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_fremovexattr_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fremovexattr, - local->fd, - local->cont.removexattr.name, - NULL); - - if (!--call_count) - break; - } - } - + STACK_WIND_COOKIE (frame, afr_fremovexattr_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->fremovexattr, + local->fd, local->cont.removexattr.name, + local->xdata_req); return 0; } int -afr_fremovexattr_done (call_frame_t *frame, xlator_t *this) +afr_fremovexattr (call_frame_t *frame, xlator_t *this, fd_t *fd, + const char *name, dict_t *xdata) { - afr_local_t * local = frame->local; - - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); - - return 0; -} - - -int -afr_fremovexattr (call_frame_t *frame, xlator_t *this, - fd_t *fd, const char *name, dict_t *xdata) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t *transaction_frame = NULL; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; int ret = -1; - int op_ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (this, out); + int op_errno = ENOMEM; GF_IF_NATIVE_XATTR_GOTO ("trusted.afr.*", name, op_errno, out); @@ -2102,64 +1352,59 @@ afr_fremovexattr (call_frame_t *frame, xlator_t *this, GF_IF_NATIVE_XATTR_GOTO ("trusted.glusterfs.afr.*", name, op_errno, out); - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - if (afr_is_split_brain (this, fd->inode)) { - op_errno 
= EIO; - goto out; - } + priv = this->private; QUORUM_CHECK(fremovexattr, out); transaction_frame = copy_frame (frame); - if (!transaction_frame) { + if (!transaction_frame) goto out; - } - AFR_LOCAL_ALLOC_OR_GOTO (local, out); - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) { - op_errno = -ret; + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) goto out; - } - - transaction_frame->local = local; - - local->op_ret = -1; local->cont.removexattr.name = gf_strdup (name); + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); - local->transaction.fop = afr_fremovexattr_wind; - local->transaction.done = afr_fremovexattr_done; + if (!local->xdata_req) + goto out; + + local->transaction.wind = afr_fremovexattr_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; local->transaction.unwind = afr_fremovexattr_unwind; local->fd = fd_ref (fd); + local->inode = inode_ref (fd->inode); + + local->op = GF_FOP_FREMOVEXATTR; local->transaction.main_frame = frame; local->transaction.start = LLONG_MAX - 1; local->transaction.len = 0; - op_ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); - if (op_ret < 0) { - op_errno = -op_ret; - goto out; + ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; } - op_ret = 0; + return 0; out: - if (op_ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (fremovexattr, frame, op_ret, op_errno, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + + AFR_STACK_UNWIND (fremovexattr, frame, -1, op_errno, NULL); return 0; } -static int + +int afr_fallocate_unwind (call_frame_t *frame, xlator_t *this) { afr_local_t * local = NULL; @@ -2167,147 +1412,88 @@ afr_fallocate_unwind (call_frame_t *frame, xlator_t *this) local = frame->local; - LOCK (&frame->lock); 
- { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; - if (main_frame) { - AFR_STACK_UNWIND (fallocate, main_frame, local->op_ret, - local->op_errno, - &local->cont.inode_wfop.prebuf, - &local->cont.inode_wfop.postbuf, - NULL); - } + AFR_STACK_UNWIND (fallocate, main_frame, local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, local->xdata_rsp); return 0; } -static int + +int afr_fallocate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - int child_index = (long) cookie; - int call_count = -1; - int need_unwind = 0; - int read_child = 0; - - local = frame->local; - priv = this->private; - - read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); - - LOCK (&frame->lock); - { - if (child_index == read_child) { - local->read_child_returned = _gf_true; - } - - __inode_write_fop_cbk (frame, child_index, read_child, this, - &op_ret, &op_errno, prebuf, postbuf, - xdata); - - if ((local->success_count >= priv->wait_count) - && local->read_child_returned) { - need_unwind = 1; - } - } - UNLOCK (&frame->lock); - - if (need_unwind) - local->transaction.unwind (frame, this); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - local->transaction.resume (frame, this); - } - - return 0; + return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, + prebuf, postbuf, xdata); } -static int -afr_fallocate_wind (call_frame_t *frame, xlator_t *this) + +int +afr_fallocate_wind (call_frame_t *frame, xlator_t *this, int subvol) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; local = frame->local; priv = 
this->private; - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_fallocate_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fallocate, - local->fd, - local->cont.fallocate.mode, - local->cont.fallocate.offset, - local->cont.fallocate.len, - NULL); - - if (!--call_count) - break; - } - } - + STACK_WIND_COOKIE (frame, afr_fallocate_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->fallocate, + local->fd, local->cont.fallocate.mode, + local->cont.fallocate.offset, + local->cont.fallocate.len, local->xdata_req); return 0; } -static int -afr_fallocate_done (call_frame_t *frame, xlator_t *this) + +int +afr_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, + off_t offset, size_t len, dict_t *xdata) { + afr_private_t *priv = NULL; + call_frame_t *transaction_frame = NULL; afr_local_t *local = NULL; + int ret = -1; + int op_errno = ENOMEM; - local = frame->local; + priv = this->private; - local->transaction.unwind (frame, this); + QUORUM_CHECK(fallocate,out); - AFR_STACK_DESTROY (frame); + transaction_frame = copy_frame (frame); + if (!transaction_frame) + goto out; - return 0; -} + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) + goto out; -static int -afr_do_fallocate (call_frame_t *frame, xlator_t *this) -{ - call_frame_t * transaction_frame = NULL; - afr_local_t * local = NULL; - int op_ret = -1; - int op_errno = 0; + local->cont.fallocate.mode = mode; + local->cont.fallocate.offset = offset; + local->cont.fallocate.len = len; - local = frame->local; + local->fd = fd_ref (fd); + local->inode = inode_ref (fd->inode); - transaction_frame = copy_frame (frame); - if 
(!transaction_frame) { - goto out; - } + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); - transaction_frame->local = local; - frame->local = NULL; + if (!local->xdata_req) + goto out; local->op = GF_FOP_FALLOCATE; - local->transaction.fop = afr_fallocate_wind; - local->transaction.done = afr_fallocate_done; + local->transaction.wind = afr_fallocate_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; local->transaction.unwind = afr_fallocate_unwind; local->transaction.main_frame = frame; @@ -2315,80 +1501,29 @@ afr_do_fallocate (call_frame_t *frame, xlator_t *this) local->transaction.start = local->cont.fallocate.offset; local->transaction.len = 0; - /* fallocate can modify the file size */ - op_ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); - if (op_ret < 0) { - op_errno = -op_ret; - goto out; - } - - op_ret = 0; -out: - if (op_ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (fallocate, frame, op_ret, op_errno, NULL, - NULL, NULL); - } - - return 0; -} - -int -afr_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, - off_t offset, size_t len, dict_t *xdata) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t *transaction_frame = NULL; - int ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + afr_fix_open (fd, this); - priv = this->private; - - if (afr_is_split_brain (this, fd->inode)) { - op_errno = EIO; - goto out; + ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; } - QUORUM_CHECK(fallocate,out); - - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; - local->cont.fallocate.mode = mode; 
- local->cont.fallocate.offset = offset; - local->cont.fallocate.len = len; - - local->fd = fd_ref (fd); - - afr_open_fd_fix (fd, this); - - afr_do_fallocate (frame, this); - - ret = 0; + return 0; out: - if (ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (fallocate, frame, -1, op_errno, NULL, NULL, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (fallocate, frame, -1, op_errno, NULL, NULL, NULL); return 0; } + /* }}} */ /* {{{ discard */ -static int +int afr_discard_unwind (call_frame_t *frame, xlator_t *this) { afr_local_t * local = NULL; @@ -2396,146 +1531,86 @@ afr_discard_unwind (call_frame_t *frame, xlator_t *this) local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; - if (main_frame) { - AFR_STACK_UNWIND (discard, main_frame, local->op_ret, - local->op_errno, - &local->cont.inode_wfop.prebuf, - &local->cont.inode_wfop.postbuf, - NULL); - } + AFR_STACK_UNWIND (discard, main_frame, local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, local->xdata_rsp); return 0; } -static int + +int afr_discard_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - int child_index = (long) cookie; - int call_count = -1; - int need_unwind = 0; - int read_child = 0; - - local = frame->local; - priv = this->private; - - read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); - - LOCK (&frame->lock); - { - if (child_index == read_child) { - local->read_child_returned = _gf_true; - } - - __inode_write_fop_cbk (frame, child_index, 
read_child, this, - &op_ret, &op_errno, prebuf, postbuf, - xdata); - - if ((local->success_count >= priv->wait_count) - && local->read_child_returned) { - need_unwind = 1; - } - } - UNLOCK (&frame->lock); - - if (need_unwind) - local->transaction.unwind (frame, this); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - local->transaction.resume (frame, this); - } - - return 0; + return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, + prebuf, postbuf, xdata); } -static int -afr_discard_wind (call_frame_t *frame, xlator_t *this) + +int +afr_discard_wind (call_frame_t *frame, xlator_t *this, int subvol) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; local = frame->local; priv = this->private; - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_discard_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->discard, - local->fd, - local->cont.discard.offset, - local->cont.discard.len, - NULL); - - if (!--call_count) - break; - } - } - + STACK_WIND_COOKIE (frame, afr_discard_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->discard, + local->fd, local->cont.discard.offset, + local->cont.discard.len, local->xdata_req); return 0; } -static int -afr_discard_done (call_frame_t *frame, xlator_t *this) + +int +afr_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) { + afr_private_t *priv = NULL; afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; - local = frame->local; + priv = this->private; - local->transaction.unwind (frame, this); + 
QUORUM_CHECK(discard, out); - AFR_STACK_DESTROY (frame); + transaction_frame = copy_frame (frame); + if (!transaction_frame) + goto out; - return 0; -} + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; -static int -afr_do_discard (call_frame_t *frame, xlator_t *this) -{ - call_frame_t * transaction_frame = NULL; - afr_local_t * local = NULL; - int op_ret = -1; - int op_errno = 0; + local->cont.discard.offset = offset; + local->cont.discard.len = len; - local = frame->local; + local->fd = fd_ref (fd); + local->inode = inode_ref (fd->inode); - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - goto out; - } + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); - transaction_frame->local = local; - frame->local = NULL; + if (!local->xdata_req) + goto out; local->op = GF_FOP_DISCARD; - local->transaction.fop = afr_discard_wind; - local->transaction.done = afr_discard_done; + local->transaction.wind = afr_discard_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; local->transaction.unwind = afr_discard_unwind; local->transaction.main_frame = frame; @@ -2543,316 +1618,134 @@ afr_do_discard (call_frame_t *frame, xlator_t *this) local->transaction.start = local->cont.discard.offset; local->transaction.len = 0; - op_ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); - if (op_ret < 0) { - op_errno = -op_ret; - goto out; - } + afr_fix_open (fd, this); - op_ret = 0; -out: - if (op_ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (discard, frame, op_ret, op_errno, NULL, - NULL, NULL); - } - - return 0; -} - -int -afr_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, - size_t len, dict_t *xdata) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t *transaction_frame = NULL; - int ret = -1; - int op_errno = 0; - - 
VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - - if (afr_is_split_brain (this, fd->inode)) { - op_errno = EIO; - goto out; + ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; } - QUORUM_CHECK(discard, out); - - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; - local->cont.discard.offset = offset; - local->cont.discard.len = len; - - local->fd = fd_ref (fd); - - afr_open_fd_fix (fd, this); - - afr_do_discard(frame, this); - - ret = 0; + return 0; out: - if (ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (discard, frame, -1, op_errno, NULL, NULL, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (discard, frame, -1, op_errno, NULL, NULL, NULL); return 0; } /* {{{ zerofill */ -static int +int afr_zerofill_unwind (call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - call_frame_t *main_frame = NULL; + afr_local_t * local = NULL; + call_frame_t *main_frame = NULL; local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) { - main_frame = local->transaction.main_frame; - } - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; - if (main_frame) { - AFR_STACK_UNWIND (zerofill, main_frame, local->op_ret, - local->op_errno, - &local->cont.zerofill.prebuf, - &local->cont.zerofill.postbuf, - NULL); - } + AFR_STACK_UNWIND (discard, main_frame, local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, local->xdata_rsp); return 0; } -static int -afr_zerofill_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, 
struct iatt *prebuf, - struct iatt *postbuf, dict_t *xdata) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int child_index = (long) cookie; - int call_count = -1; - int need_unwind = 0; - int read_child = 0; - - local = frame->local; - priv = this->private; - - read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); - - LOCK (&frame->lock); - { - if (child_index == read_child) { - local->read_child_returned = _gf_true; - } - - if (afr_fop_failed (op_ret, op_errno)) { - afr_transaction_fop_failed (frame, this, child_index); - } - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - local->cont.zerofill.prebuf = *prebuf; - local->cont.zerofill.postbuf = *postbuf; - } - - if (child_index == read_child) { - local->cont.zerofill.prebuf = *prebuf; - local->cont.zerofill.postbuf = *postbuf; - } - - local->success_count++; - - if ((local->success_count >= priv->wait_count) - && local->read_child_returned) { - need_unwind = 1; - } - } - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); - - if (need_unwind) { - local->transaction.unwind (frame, this); - } - call_count = afr_frame_return (frame); - - if (call_count == 0) { - local->transaction.resume (frame, this); - } - - return 0; +int +afr_zerofill_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, + prebuf, postbuf, xdata); } -static int -afr_zerofill_wind (call_frame_t *frame, xlator_t *this) + +int +afr_zerofill_wind (call_frame_t *frame, xlator_t *this, int subvol) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; local = frame->local; priv = this->private; - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); - - if (call_count == 
0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_zerofill_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->zerofill, - local->fd, - local->cont.zerofill.offset, - local->cont.zerofill.len, - NULL); - - if (!--call_count) - break; - } - } - + STACK_WIND_COOKIE (frame, afr_zerofill_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->zerofill, + local->fd, local->cont.zerofill.offset, + local->cont.zerofill.len, local->xdata_req); return 0; } -static int -afr_zerofill_done (call_frame_t *frame, xlator_t *this) +int +afr_zerofill (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) { + afr_private_t *priv = NULL; afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; - local = frame->local; + priv = this->private; - local->transaction.unwind (frame, this); + QUORUM_CHECK(discard, out); - AFR_STACK_DESTROY (frame); + transaction_frame = copy_frame (frame); + if (!transaction_frame) + goto out; - return 0; -} + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; -static int -afr_do_zerofill(call_frame_t *frame, xlator_t *this) -{ - call_frame_t *transaction_frame = NULL; - afr_local_t *local = NULL; - int op_ret = -1; - int op_errno = 0; + local->cont.zerofill.offset = offset; + local->cont.zerofill.len = len; - local = frame->local; + local->fd = fd_ref (fd); + local->inode = inode_ref (fd->inode); - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - goto out; - } + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); - transaction_frame->local = local; - frame->local = NULL; + if (!local->xdata_req) + goto out; local->op = GF_FOP_ZEROFILL; - 
local->transaction.fop = afr_zerofill_wind; - local->transaction.done = afr_zerofill_done; + local->transaction.wind = afr_zerofill_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; local->transaction.unwind = afr_zerofill_unwind; local->transaction.main_frame = frame; - local->transaction.start = local->cont.zerofill.offset; - local->transaction.len = 0; - - op_ret = afr_transaction (transaction_frame, this, - AFR_DATA_TRANSACTION); - if (op_ret < 0) { - op_errno = -op_ret; - goto out; - } - - op_ret = 0; -out: - if (op_ret < 0) { - if (transaction_frame) { - AFR_STACK_DESTROY (transaction_frame); - } - AFR_STACK_UNWIND (zerofill, frame, op_ret, op_errno, NULL, - NULL, NULL); - } - - return 0; -} - -int -afr_zerofill (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, - off_t len, dict_t *xdata) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - call_frame_t *transaction_frame = NULL; - int ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - - if (afr_is_split_brain (this, fd->inode)) { - op_errno = EIO; - goto out; - } - QUORUM_CHECK(zerofill, out); + local->transaction.start = local->cont.discard.offset; + local->transaction.len = len; - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; + afr_fix_open (fd, this); - ret = afr_local_init (local, priv, &op_errno); + ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); if (ret < 0) { - goto out; + op_errno = -ret; + goto out; } - local->cont.zerofill.offset = offset; - local->cont.zerofill.len = len; - - local->fd = fd_ref (fd); - - afr_open_fd_fix (fd, this); - afr_do_zerofill(frame, this); - - ret = 0; + return 0; out: - if (ret < 0) { - if (transaction_frame) { - AFR_STACK_DESTROY (transaction_frame); - } - AFR_STACK_UNWIND (zerofill, frame, -1, op_errno, NULL, - NULL, NULL); - } + if 
(transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (zerofill, frame, -1, op_errno, NULL, NULL, NULL); return 0; } diff --git a/xlators/cluster/afr/src/afr-lk-common.c b/xlators/cluster/afr/src/afr-lk-common.c index 060d78f3505..a2a758f35af 100644 --- a/xlators/cluster/afr/src/afr-lk-common.c +++ b/xlators/cluster/afr/src/afr-lk-common.c @@ -580,22 +580,6 @@ initialize_inodelk_variables (call_frame_t *frame, xlator_t *this) return 0; } -loc_t * -lower_path (loc_t *l1, const char *b1, loc_t *l2, const char *b2) -{ - int ret = 0; - - ret = uuid_compare (l1->inode->gfid, l2->inode->gfid); - - if (ret == 0) - ret = strcmp (b1, b2); - - if (ret <= 0) - return l1; - else - return l2; -} - int afr_lockee_locked_nodes_count (afr_internal_lock_t *int_lock) { @@ -1213,8 +1197,7 @@ afr_blocking_lock (call_frame_t *frame, xlator_t *this) case AFR_ENTRY_RENAME_TRANSACTION: case AFR_ENTRY_TRANSACTION: - up_count = afr_up_children_count (local->child_up, - priv->child_count); + up_count = AFR_COUNT (local->child_up, priv->child_count); int_lock->lk_call_count = int_lock->lk_expected_count = (int_lock->lockee_count * up_count); @@ -1648,496 +1631,6 @@ afr_unlock (call_frame_t *frame, xlator_t *this) } int -afr_mark_locked_nodes (xlator_t *this, fd_t *fd, - unsigned char *locked_nodes) -{ - afr_private_t *priv = NULL; - afr_fd_ctx_t *fdctx = NULL; - uint64_t tmp = 0; - int ret = 0; - - priv = this->private; - - ret = afr_fd_ctx_set (this, fd); - if (ret) - goto out; - - ret = fd_ctx_get (fd, this, &tmp); - if (ret) { - gf_log (this->name, GF_LOG_INFO, - "failed to get the fd ctx"); - goto out; - } - fdctx = (afr_fd_ctx_t *) (long) tmp; - - GF_ASSERT (fdctx->locked_on); - - memcpy (fdctx->locked_on, locked_nodes, - priv->child_count); - -out: - return ret; -} - -static int -__is_fd_saved (xlator_t *this, fd_t *fd) -{ - afr_locked_fd_t *locked_fd = NULL; - afr_private_t *priv = NULL; - int found = 0; - - priv = this->private; - - list_for_each_entry 
(locked_fd, &priv->saved_fds, list) { - if (locked_fd->fd == fd) { - found = 1; - break; - } - } - - return found; -} - -static int -__afr_save_locked_fd (xlator_t *this, fd_t *fd) -{ - afr_private_t *priv = NULL; - afr_locked_fd_t *locked_fd = NULL; - int ret = 0; - - priv = this->private; - - locked_fd = GF_CALLOC (1, sizeof (*locked_fd), - gf_afr_mt_locked_fd); - if (!locked_fd) { - ret = -1; - goto out; - } - - locked_fd->fd = fd; - INIT_LIST_HEAD (&locked_fd->list); - - list_add_tail (&locked_fd->list, &priv->saved_fds); - -out: - return ret; -} - -int -afr_save_locked_fd (xlator_t *this, fd_t *fd) -{ - afr_private_t *priv = NULL; - int ret = 0; - - priv = this->private; - - pthread_mutex_lock (&priv->mutex); - { - if (__is_fd_saved (this, fd)) { - gf_log (this->name, GF_LOG_DEBUG, - "fd=%p already saved", fd); - goto unlock; - } - - ret = __afr_save_locked_fd (this, fd); - if (ret) { - gf_log (this->name, GF_LOG_INFO, - "fd=%p could not be saved", fd); - goto unlock; - } - } -unlock: - pthread_mutex_unlock (&priv->mutex); - - return ret; -} - -static int -afr_lock_recovery_cleanup (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_locked_fd_t *locked_fd = NULL; - - local = frame->local; - - locked_fd = local->locked_fd; - - STACK_DESTROY (frame->root); - afr_local_cleanup (local, this); - - afr_save_locked_fd (this, locked_fd->fd); - - return 0; - -} - -static int -afr_get_source_lock_recovery (xlator_t *this, fd_t *fd) -{ - afr_fd_ctx_t *fdctx = NULL; - afr_private_t *priv = NULL; - uint64_t tmp = 0; - int i = 0; - int source_child = -1; - int ret = 0; - - priv = this->private; - - ret = fd_ctx_get (fd, this, &tmp); - if (ret) - goto out; - - fdctx = (afr_fd_ctx_t *) (long) tmp; - - for (i = 0; i < priv->child_count; i++) { - if (fdctx->locked_on[i]) { - gf_log (this->name, GF_LOG_DEBUG, - "Found lock recovery source=%d", i); - source_child = i; - break; - } - } - -out: - return source_child; - -} - -int32_t -afr_get_locks_fd_cbk 
(call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct gf_flock *lock, - dict_t *xdata); -int32_t -afr_recover_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct gf_flock *lock, - dict_t *xdata) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int32_t source_child = 0; - struct gf_flock flock = {0,}; - - local = frame->local; - priv = this->private; - - if (op_ret) { - gf_log (this->name, GF_LOG_INFO, - "lock recovery failed"); - goto cleanup; - } - - source_child = local->source_child; - - memcpy (&flock, lock, sizeof (*lock)); - - STACK_WIND_COOKIE (frame, afr_get_locks_fd_cbk, - (void *) (long) source_child, - priv->children[source_child], - priv->children[source_child]->fops->lk, - local->fd, F_GETLK_FD, &flock, NULL); - - return 0; - -cleanup: - afr_lock_recovery_cleanup (frame, this); - return 0; -} - -int -afr_recover_lock (call_frame_t *frame, xlator_t *this, - struct gf_flock *flock) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int32_t lock_recovery_child = 0; - - priv = this->private; - local = frame->local; - - lock_recovery_child = local->lock_recovery_child; - - frame->root->lk_owner = flock->l_owner; - - STACK_WIND_COOKIE (frame, afr_recover_lock_cbk, - (void *) (long) lock_recovery_child, - priv->children[lock_recovery_child], - priv->children[lock_recovery_child]->fops->lk, - local->fd, F_SETLK, flock, NULL); - - return 0; -} - -static int -is_afr_lock_eol (struct gf_flock *lock) -{ - int ret = 0; - - if ((lock->l_type == GF_LK_EOL)) - ret = 1; - - return ret; -} - -int32_t -afr_get_locks_fd_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct gf_flock *lock, - dict_t *xdata) -{ - if (op_ret) { - gf_log (this->name, GF_LOG_INFO, - "Failed to get locks on fd"); - goto cleanup; - } - - gf_log (this->name, GF_LOG_DEBUG, - "Got a lock on fd"); - - if (is_afr_lock_eol (lock)) { - 
gf_log (this->name, GF_LOG_INFO, - "Reached EOL on locks on fd"); - goto cleanup; - } - - afr_recover_lock (frame, this, lock); - - return 0; - -cleanup: - afr_lock_recovery_cleanup (frame, this); - - return 0; -} - -static int -afr_lock_recovery (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - fd_t *fd = NULL; - int ret = 0; - int32_t source_child = 0; - struct gf_flock flock = {0,}; - - priv = this->private; - local = frame->local; - - fd = local->fd; - - source_child = afr_get_source_lock_recovery (this, fd); - if (source_child < 0) { - gf_log (this->name, GF_LOG_ERROR, - "Could not recover locks due to lock " - "split brain"); - ret = -1; - goto out; - } - - local->source_child = source_child; - - /* the flock can be zero filled as we're querying incrementally - the locks held on the fd. - */ - STACK_WIND_COOKIE (frame, afr_get_locks_fd_cbk, - (void *) (long) source_child, - priv->children[source_child], - priv->children[source_child]->fops->lk, - local->fd, F_GETLK_FD, &flock, NULL); - -out: - return ret; -} - - -static int -afr_mark_fd_opened (xlator_t *this, fd_t *fd, int32_t child_index) -{ - afr_fd_ctx_t *fdctx = NULL; - uint64_t tmp = 0; - int ret = 0; - - ret = fd_ctx_get (fd, this, &tmp); - if (ret) - goto out; - - fdctx = (afr_fd_ctx_t *) (long) tmp; - - fdctx->opened_on[child_index] = AFR_FD_OPENED; - -out: - return ret; -} - -int32_t -afr_lock_recovery_preopen_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd, - dict_t *xdata) -{ - int32_t child_index = (long )cookie; - int ret = 0; - - if (op_ret) { - gf_log (this->name, GF_LOG_INFO, - "Reopen during lock-recovery failed"); - goto cleanup; - } - - gf_log (this->name, GF_LOG_DEBUG, - "Open succeeded => proceed to recover locks"); - - ret = afr_lock_recovery (frame, this); - if (ret) { - gf_log (this->name, GF_LOG_INFO, - "Lock recovery failed"); - goto cleanup; - } - - ret = afr_mark_fd_opened 
(this, fd, child_index); - if (ret) { - gf_log (this->name, GF_LOG_INFO, - "Marking fd open failed"); - goto cleanup; - } - - return 0; - -cleanup: - afr_lock_recovery_cleanup (frame, this); - return 0; -} - -static int -afr_lock_recovery_preopen (call_frame_t *frame, xlator_t *this) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - uint64_t tmp = 0; - afr_fd_ctx_t *fdctx = NULL; - loc_t loc = {0,}; - int32_t child_index = 0; - int ret = 0; - - priv = this->private; - local = frame->local; - - GF_ASSERT (local && local->fd); - - ret = fd_ctx_get (local->fd, this, &tmp); - if (ret) - gf_log (this->name, GF_LOG_WARNING, - "%s: failed to get the context of fd", - uuid_utoa (local->fd->inode->gfid)); - fdctx = (afr_fd_ctx_t *) (long) tmp; - /* TODO: instead we should return from the function */ - GF_ASSERT (fdctx); - - child_index = local->lock_recovery_child; - - inode_path (local->fd->inode, NULL, (char **)&loc.path); - loc.name = strrchr (loc.path, '/'); - loc.inode = inode_ref (local->fd->inode); - loc.parent = inode_parent (local->fd->inode, 0, NULL); - - - STACK_WIND_COOKIE (frame, afr_lock_recovery_preopen_cbk, - (void *)(long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->open, - &loc, fdctx->flags, local->fd, NULL); - - return 0; -} - -static int -is_fd_opened (fd_t *fd, int32_t child_index) -{ - afr_fd_ctx_t *fdctx = NULL; - uint64_t tmp = 0; - int ret = 0; - - ret = fd_ctx_get (fd, THIS, &tmp); - if (ret) - goto out; - - fdctx = (afr_fd_ctx_t *) (long) tmp; - - if (fdctx->opened_on[child_index] == AFR_FD_OPENED) - ret = 1; - -out: - return ret; -} - -int -afr_attempt_lock_recovery (xlator_t *this, int32_t child_index) -{ - call_frame_t *frame = NULL; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_locked_fd_t *locked_fd = NULL; - afr_locked_fd_t *tmp = NULL; - int ret = -1; - struct list_head locks_list = {0,}; - int32_t op_errno = 0; - - - priv = this->private; - - if (list_empty 
(&priv->saved_fds)) - goto out; - - frame = create_frame (this, this->ctx->pool); - if (!frame) { - ret = -1; - goto out; - } - - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) { - ret = -1; - goto out; - } - - frame->local = local; - - INIT_LIST_HEAD (&locks_list); - - pthread_mutex_lock (&priv->mutex); - { - list_splice_init (&priv->saved_fds, &locks_list); - } - pthread_mutex_unlock (&priv->mutex); - - list_for_each_entry_safe (locked_fd, tmp, - &locks_list, list) { - - list_del_init (&locked_fd->list); - - local->fd = fd_ref (locked_fd->fd); - local->lock_recovery_child = child_index; - local->locked_fd = locked_fd; - - if (!is_fd_opened (locked_fd->fd, child_index)) { - gf_log (this->name, GF_LOG_DEBUG, - "attempting open before lock " - "recovery"); - afr_lock_recovery_preopen (frame, this); - } else { - gf_log (this->name, GF_LOG_DEBUG, - "attempting lock recovery " - "without a preopen"); - afr_lock_recovery (frame, this); - } - } - -out: - if ((ret < 0) && frame) - AFR_STACK_DESTROY (frame); - return ret; -} - -int afr_lk_transfer_datalock (call_frame_t *dst, call_frame_t *src, char *dom, unsigned int child_count) { diff --git a/xlators/cluster/afr/src/afr-mem-types.h b/xlators/cluster/afr/src/afr-mem-types.h index 73594f26526..05df90cc0ee 100644 --- a/xlators/cluster/afr/src/afr-mem-types.h +++ b/xlators/cluster/afr/src/afr-mem-types.h @@ -41,10 +41,8 @@ enum gf_afr_mem_types_ { gf_afr_mt_shd_event_t, gf_afr_mt_time_t, gf_afr_mt_pos_data_t, - gf_afr_mt_reply_t, - gf_afr_mt_stats_t, - gf_afr_mt_shd_crawl_event_t, - gf_afr_mt_uint64_t, + gf_afr_mt_reply_t, + gf_afr_mt_subvol_healer_t, gf_afr_mt_end }; #endif diff --git a/xlators/cluster/afr/src/afr-open.c b/xlators/cluster/afr/src/afr-open.c index 643a5d692df..f86aa7fd80d 100644 --- a/xlators/cluster/afr/src/afr-open.c +++ b/xlators/cluster/afr/src/afr-open.c @@ -43,85 +43,29 @@ #include "afr-dir-read.h" #include 
"afr-dir-write.h" #include "afr-transaction.h" -#include "afr-self-heal.h" -#include "afr-self-heal-common.h" -int -afr_stale_child_up (afr_local_t *local, xlator_t *this) -{ - int i = 0; - afr_private_t *priv = NULL; - int up = -1; - - priv = this->private; - - if (!local->fresh_children) - local->fresh_children = afr_children_create (priv->child_count); - if (!local->fresh_children) - goto out; - - afr_inode_get_read_ctx (this, local->fd->inode, local->fresh_children); - if (priv->child_count == afr_get_children_count (local->fresh_children, - priv->child_count)) - goto out; - for (i = 0; i < priv->child_count; i++) { - if (!local->child_up[i]) - continue; - if (afr_is_child_present (local->fresh_children, - priv->child_count, i)) - continue; - up = i; - break; - } -out: - return up; -} - -void -afr_perform_data_self_heal (call_frame_t *frame, xlator_t *this) +gf_boolean_t +afr_is_fd_fixable (fd_t *fd) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - inode_t *inode = NULL; - int st_child = -1; - char reason[64] = {0}; - - local = frame->local; - sh = &local->self_heal; - inode = local->fd->inode; - - if (!IA_ISREG (inode->ia_type)) - goto out; - - st_child = afr_stale_child_up (local, this); - if (st_child < 0) - goto out; - - sh->do_data_self_heal = _gf_true; - sh->do_metadata_self_heal = _gf_true; - sh->do_gfid_self_heal = _gf_true; - sh->do_missing_entry_self_heal = _gf_true; - - snprintf (reason, sizeof (reason), "stale subvolume %d detected", - st_child); - afr_launch_self_heal (frame, this, inode, _gf_true, inode->ia_type, - reason, NULL, NULL); -out: - return; + if (!fd || !fd->inode) + return _gf_false; + else if (fd_is_anonymous (fd)) + return _gf_false; + else if (uuid_is_null (fd->inode->gfid)) + return _gf_false; + + return _gf_true; } + int afr_open_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) { afr_local_t * local = 
frame->local; - afr_private_t *priv = NULL; - priv = this->private; - if (afr_open_only_data_self_heal (priv->data_self_heal)) - afr_perform_data_self_heal (frame, this); AFR_STACK_UNWIND (open, frame, local->op_ret, local->op_errno, local->fd, xdata); return 0; @@ -134,49 +78,38 @@ afr_open_cbk (call_frame_t *frame, void *cookie, fd_t *fd, dict_t *xdata) { afr_local_t * local = NULL; - int ret = 0; int call_count = -1; int child_index = (long) cookie; - afr_private_t *priv = NULL; + afr_fd_ctx_t *fd_ctx = NULL; - priv = this->private; local = frame->local; + fd_ctx = local->fd_ctx; LOCK (&frame->lock); { if (op_ret == -1) { local->op_errno = op_errno; - } - - if (op_ret >= 0) { + fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED; + } else { local->op_ret = op_ret; - local->success_count++; - - ret = afr_child_fd_ctx_set (this, fd, child_index, - local->cont.open.flags); - if (ret) { - local->op_ret = -1; - local->op_errno = -ret; - goto unlock; - } + fd_ctx->opened_on[child_index] = AFR_FD_OPENED; + if (!local->xdata_rsp && xdata) + local->xdata_rsp = dict_ref (xdata); } } -unlock: UNLOCK (&frame->lock); call_count = afr_frame_return (frame); if (call_count == 0) { - if ((local->cont.open.flags & O_TRUNC) - && (local->op_ret >= 0)) { + if ((fd_ctx->flags & O_TRUNC) && (local->op_ret >= 0)) { STACK_WIND (frame, afr_open_ftruncate_cbk, this, this->fops->ftruncate, fd, 0, NULL); } else { - if (afr_open_only_data_self_heal (priv->data_self_heal)) - afr_perform_data_self_heal (frame, this); AFR_STACK_UNWIND (open, frame, local->op_ret, - local->op_errno, local->fd, xdata); + local->op_errno, local->fd, + local->xdata_rsp); } } @@ -190,16 +123,11 @@ afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, afr_private_t * priv = NULL; afr_local_t * local = NULL; int i = 0; - int ret = -1; int32_t call_count = 0; int32_t op_errno = 0; - int32_t wind_flags = flags & (~O_TRUNC); - //We can't let truncation to happen outside transaction. 
+ afr_fd_ctx_t *fd_ctx = NULL; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - VALIDATE_OR_GOTO (loc, out); + //We can't let truncation to happen outside transaction. priv = this->private; @@ -207,44 +135,38 @@ afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, QUORUM_CHECK(open,out); } - if (afr_is_split_brain (this, loc->inode)) { - /* self-heal failed */ - gf_log (this->name, GF_LOG_WARNING, - "failed to open as split brain seen, returning EIO"); - op_errno = EIO; - goto out; - } - - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + fd_ctx = afr_fd_ctx_get (fd, this); + if (!fd_ctx) { + op_errno = ENOMEM; + goto out; + } - call_count = local->call_count; - loc_copy (&local->loc, loc); + local->fd = fd_ref (fd); + local->fd_ctx = fd_ctx; + fd_ctx->flags = flags; - local->cont.open.flags = flags; + call_count = local->call_count; - local->fd = fd_ref (fd); + local->cont.open.flags = flags; for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { STACK_WIND_COOKIE (frame, afr_open_cbk, (void *) (long) i, priv->children[i], priv->children[i]->fops->open, - loc, wind_flags, fd, xdata); - + loc, (flags & ~O_TRUNC), fd, xdata); if (!--call_count) break; } } - ret = 0; + return 0; out: - if (ret < 0) - AFR_STACK_UNWIND (open, frame, -1, op_errno, fd, xdata); + AFR_STACK_UNWIND (open, frame, -1, op_errno, fd, NULL); return 0; } @@ -273,12 +195,7 @@ afr_openfd_fix_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, priv->children[child_index]->name); } - fd_ctx = afr_fd_ctx_get (local->fd, this); - if (!fd_ctx) { - gf_log (this->name, GF_LOG_WARNING, - "failed to get fd context, %p", local->fd); - goto out; - } + fd_ctx = local->fd_ctx; LOCK (&local->fd->lock); { @@ -289,7 +206,7 @@ afr_openfd_fix_open_cbk 
(call_frame_t *frame, void *cookie, xlator_t *this, } } UNLOCK (&local->fd->lock); -out: + call_count = afr_frame_return (frame); if (call_count == 0) AFR_STACK_DESTROY (frame); @@ -297,8 +214,42 @@ out: return 0; } + +static int +afr_fd_ctx_need_open (fd_t *fd, xlator_t *this, unsigned char *need_open) +{ + afr_fd_ctx_t *fd_ctx = NULL; + afr_private_t *priv = NULL; + int i = 0; + int count = 0; + + priv = this->private; + + fd_ctx = afr_fd_ctx_get (fd, this); + if (!fd_ctx) + return 0; + + LOCK (&fd->lock); + { + for (i = 0; i < priv->child_count; i++) { + if (fd_ctx->opened_on[i] == AFR_FD_NOT_OPENED && + priv->child_up[i]) { + fd_ctx->opened_on[i] = AFR_FD_OPENING; + need_open[i] = 1; + count++; + } else { + need_open[i] = 0; + } + } + } + UNLOCK (&fd->lock); + + return count; +} + + void -afr_fix_open (xlator_t *this, fd_t *fd, size_t need_open_count, int *need_open) +afr_fix_open (fd_t *fd, xlator_t *this) { afr_private_t *priv = NULL; int i = 0; @@ -307,29 +258,31 @@ afr_fix_open (xlator_t *this, fd_t *fd, size_t need_open_count, int *need_open) int ret = -1; int32_t op_errno = 0; afr_fd_ctx_t *fd_ctx = NULL; + unsigned char *need_open = NULL; + int call_count = 0; priv = this->private; - if (!afr_is_fd_fixable (fd) || !need_open || !need_open_count) + if (!afr_is_fd_fixable (fd)) goto out; fd_ctx = afr_fd_ctx_get (fd, this); - if (!fd_ctx) { - ret = -1; + if (!fd_ctx) goto out; - } + + need_open = alloca0 (priv->child_count); + + call_count = afr_fd_ctx_need_open (fd, this, need_open); + if (!call_count) + goto out; frame = create_frame (this, this->ctx->pool); - if (!frame) { - ret = -1; + if (!frame) goto out; - } - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; local->loc.inode = inode_ref (fd->inode); ret = loc_path (&local->loc, NULL); @@ -337,10 +290,12 @@ afr_fix_open (xlator_t *this, 
fd_t *fd, size_t need_open_count, int *need_open) goto out; local->fd = fd_ref (fd); - local->call_count = need_open_count; + local->fd_ctx = fd_ctx; + + local->call_count = call_count; - gf_log (this->name, GF_LOG_DEBUG, "need open count: %zd", - need_open_count); + gf_log (this->name, GF_LOG_DEBUG, "need open count: %d", + call_count); for (i = 0; i < priv->child_count; i++) { if (!need_open[i]) @@ -371,12 +326,12 @@ afr_fix_open (xlator_t *this, fd_t *fd, size_t need_open_count, int *need_open) local->fd, NULL); } + if (!--call_count) + break; } - op_errno = 0; - ret = 0; + + return; out: - if (op_errno) - ret = -1; //For handling ALLOC_OR_GOTO - if (ret && frame) + if (frame) AFR_STACK_DESTROY (frame); } diff --git a/xlators/cluster/afr/src/afr-read-txn.c b/xlators/cluster/afr/src/afr-read-txn.c new file mode 100644 index 00000000000..186f68c3359 --- /dev/null +++ b/xlators/cluster/afr/src/afr-read-txn.c @@ -0,0 +1,239 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include "afr.h" +#include "afr-transaction.h" + +int +afr_read_txn_next_subvol (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; + int subvol = -1; + + local = frame->local; + priv = this->private; + + + for (i = 0; i < priv->child_count; i++) { + if (!local->readable[i]) { + /* don't even bother trying here. + just mark as attempted and move on. */ + local->read_attempted[i] = 1; + continue; + } + + if (!local->read_attempted[i]) { + subvol = i; + break; + } + } + + /* If no more subvols were available for reading, we leave + @subvol as -1, which is an indication we have run out of + readable subvols. 
*/ + if (subvol != -1) + local->read_attempted[subvol] = 1; + local->readfn (frame, this, subvol); + + return 0; +} + + +int +afr_read_txn_refresh_done (call_frame_t *frame, xlator_t *this, int err) +{ + afr_local_t *local = NULL; + int read_subvol = 0; + int event_generation = 0; + inode_t *inode = NULL; + int ret = -1; + + local = frame->local; + inode = local->inode; + + if (err) { + local->op_errno = -err; + local->op_ret = -1; + read_subvol = -1; + goto readfn; + } + + ret = afr_inode_read_subvol_type_get (inode, this, local->readable, + &event_generation, + local->transaction.type); + + if (ret == -1 || !event_generation) { + /* Even after refresh, we don't have a good + read subvolume. Time to bail */ + local->op_ret = -1; + local->op_errno = EIO; + read_subvol = -1; + goto readfn; + } + + read_subvol = afr_read_subvol_select_by_policy (inode, this, + local->readable); + + if (read_subvol == -1) { + local->op_ret = -1; + local->op_errno = EIO; + goto readfn; + } + + if (local->read_attempted[read_subvol]) { + afr_read_txn_next_subvol (frame, this); + return 0; + } + + local->read_attempted[read_subvol] = 1; +readfn: + local->readfn (frame, this, read_subvol); + + return 0; +} + + +int +afr_read_txn_continue (call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + + local = frame->local; + + if (!local->refreshed) { + local->refreshed = _gf_true; + afr_inode_refresh (frame, this, local->inode, + afr_read_txn_refresh_done); + } else { + afr_read_txn_next_subvol (frame, this); + } + + return 0; +} + + +/* afr_read_txn_wipe: + + clean internal variables in @local in order to make + it possible to call afr_read_txn() multiple times from + the same frame +*/ + +void +afr_read_txn_wipe (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; + + local = frame->local; + priv = this->private; + + local->readfn = NULL; + + if (local->inode) + inode_unref (local->inode); + + for (i = 0; 
i < priv->child_count; i++) { + local->read_attempted[i] = 0; + local->readable[i] = 0; + } +} + + +/* + afr_read_txn: + + This is the read transaction function. The way it works: + + - Determine read-subvolume from inode ctx. + + - If read-subvolume's generation was stale, refresh ctx once by + calling afr_inode_refresh() + + Else make an attempt to read on read-subvolume. + + - If attempted read on read-subvolume fails, refresh ctx once + by calling afr_inode_refresh() + + - After ctx refresh, query read-subvolume freshly and attempt + read once. + + - If read fails, try every other readable[] subvolume before + finally giving up. readable[] elements are set by afr_inode_refresh() + based on dirty and pending flags. + + - If file is in split brain in the backend, generation will be + kept 0 by afr_inode_refresh() and readable[] will be set 0 for + all elements. Therefore reads always fail. +*/ + +int +afr_read_txn (call_frame_t *frame, xlator_t *this, inode_t *inode, + afr_read_txn_wind_t readfn, afr_transaction_type type) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int read_subvol = -1; + int event_generation = 0; + int ret = -1; + + priv = this->private; + local = frame->local; + + afr_read_txn_wipe (frame, this); + + local->readfn = readfn; + local->inode = inode_ref (inode); + + local->transaction.type = type; + ret = afr_inode_read_subvol_type_get (inode, this, local->readable, + &event_generation, type); + if (ret == -1) + /* very first transaction on this inode */ + goto refresh; + + if (local->event_generation != event_generation) + /* servers have disconnected / reconnected, and possibly + rebooted, very likely changing the state of freshness + of copies */ + goto refresh; + + read_subvol = afr_read_subvol_select_by_policy (inode, this, + local->readable); + + if (read_subvol < 0 || read_subvol > priv->child_count) { + gf_log (this->name, GF_LOG_WARNING, "Unreadable subvolume %d " + "found with event generation %d", read_subvol, + 
event_generation); + goto refresh; + } + + if (!local->child_up[read_subvol]) { + /* should never happen, just in case */ + gf_log (this->name, GF_LOG_WARNING, "subvolume %d is the " + "read subvolume in this generation, but is not up", + read_subvol); + goto refresh; + } + + local->read_attempted[read_subvol] = 1; + + local->readfn (frame, this, read_subvol); + + return 0; + +refresh: + afr_inode_refresh (frame, this, inode, afr_read_txn_refresh_done); + + return 0; +} diff --git a/xlators/cluster/afr/src/afr-self-heal-algorithm.c b/xlators/cluster/afr/src/afr-self-heal-algorithm.c deleted file mode 100644 index 83846f152d2..00000000000 --- a/xlators/cluster/afr/src/afr-self-heal-algorithm.c +++ /dev/null @@ -1,837 +0,0 @@ -/* - Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> - This file is part of GlusterFS. - - This file is licensed to you under your choice of the GNU Lesser - General Public License, version 3 or any later version (LGPLv3 or - later), or the GNU General Public License, version 2 (GPLv2), in all - cases as published by the Free Software Foundation. 
-*/ - - -#include <openssl/md5.h> -#include "glusterfs.h" -#include "afr.h" -#include "xlator.h" -#include "dict.h" -#include "xlator.h" -#include "hashfn.h" -#include "logging.h" -#include "stack.h" -#include "list.h" -#include "call-stub.h" -#include "defaults.h" -#include "common-utils.h" -#include "compat-errno.h" -#include "compat.h" -#include "byte-order.h" - -#include "afr-transaction.h" -#include "afr-self-heal.h" -#include "afr-self-heal-common.h" -#include "afr-self-heal-algorithm.h" - -/* - This file contains the various self-heal algorithms -*/ - -static int -sh_loop_driver (call_frame_t *sh_frame, xlator_t *this, - gf_boolean_t is_first_call, call_frame_t *old_loop_frame); -static int -sh_loop_return (call_frame_t *sh_frame, xlator_t *this, call_frame_t *loop_frame, - int32_t op_ret, int32_t op_errno); -static int -sh_destroy_frame (call_frame_t *frame, xlator_t *this) -{ - if (!frame) - goto out; - - AFR_STACK_DESTROY (frame); -out: - return 0; -} - -static void -sh_private_cleanup (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_sh_algo_private_t *sh_priv = NULL; - - local = frame->local; - sh = &local->self_heal; - - sh_priv = sh->private; - GF_FREE (sh_priv); -} - -static int -sh_number_of_writes_needed (unsigned char *write_needed, int child_count) -{ - int writes = 0; - int i = 0; - - for (i = 0; i < child_count; i++) { - if (write_needed[i]) - writes++; - } - - return writes; -} - - -static int -sh_loop_driver_done (call_frame_t *sh_frame, xlator_t *this, - call_frame_t *last_loop_frame) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_sh_algo_private_t *sh_priv = NULL; - int32_t total_blocks = 0; - int32_t diff_blocks = 0; - - local = sh_frame->local; - sh = &local->self_heal; - sh_priv = sh->private; - if (sh_priv) { - total_blocks = sh_priv->total_blocks; - diff_blocks = sh_priv->diff_blocks; - } - - sh_private_cleanup (sh_frame, this); - if (is_self_heal_failed (sh, 
AFR_CHECK_SPECIFIC)) { - GF_ASSERT (!last_loop_frame); - //loop_finish should have happened and the old_loop should be NULL - gf_log (this->name, GF_LOG_DEBUG, - "self-heal aborting on %s", - local->loc.path); - - local->self_heal.algo_abort_cbk (sh_frame, this); - } else { - GF_ASSERT (last_loop_frame); - if (diff_blocks == total_blocks) { - gf_log (this->name, GF_LOG_DEBUG, "full self-heal " - "completed on %s",local->loc.path); - } else { - gf_log (this->name, GF_LOG_DEBUG, - "diff self-heal on %s: completed. " - "(%d blocks of %d were different (%.2f%%))", - local->loc.path, diff_blocks, total_blocks, - ((diff_blocks * 1.0)/total_blocks) * 100); - } - - sh->old_loop_frame = last_loop_frame; - local->self_heal.algo_completion_cbk (sh_frame, this); - } - - return 0; -} - -int -sh_loop_finish (call_frame_t *loop_frame, xlator_t *this) -{ - afr_local_t *loop_local = NULL; - afr_self_heal_t *loop_sh = NULL; - - if (!loop_frame) - goto out; - - loop_local = loop_frame->local; - if (loop_local) { - loop_sh = &loop_local->self_heal; - } - - if (loop_sh && loop_sh->data_lock_held) { - afr_sh_data_unlock (loop_frame, this, this->name, - sh_destroy_frame); - } else { - sh_destroy_frame (loop_frame, this); - } -out: - return 0; -} - -static int -sh_loop_lock_success (call_frame_t *loop_frame, xlator_t *this) -{ - afr_local_t *loop_local = NULL; - afr_self_heal_t *loop_sh = NULL; - - loop_local = loop_frame->local; - loop_sh = &loop_local->self_heal; - - sh_loop_finish (loop_sh->old_loop_frame, this); - loop_sh->old_loop_frame = NULL; - - gf_log (this->name, GF_LOG_DEBUG, "Acquired lock for range %"PRIu64 - " %"PRIu64, loop_sh->offset, loop_sh->block_size); - loop_sh->data_lock_held = _gf_true; - loop_sh->sh_data_algo_start (loop_frame, this); - return 0; -} - -static int -sh_loop_lock_failure (call_frame_t *loop_frame, xlator_t *this) -{ - call_frame_t *sh_frame = NULL; - afr_local_t *loop_local = NULL; - afr_self_heal_t *loop_sh = NULL; - - loop_local = loop_frame->local; 
- loop_sh = &loop_local->self_heal; - sh_frame = loop_sh->sh_frame; - - gf_log (this->name, GF_LOG_ERROR, "failed lock for range %"PRIu64 - " %"PRIu64, loop_sh->offset, loop_sh->block_size); - sh_loop_finish (loop_sh->old_loop_frame, this); - loop_sh->old_loop_frame = NULL; - sh_loop_return (sh_frame, this, loop_frame, -1, ENOTCONN); - return 0; -} - -static int -sh_loop_frame_create (call_frame_t *sh_frame, xlator_t *this, - call_frame_t *old_loop_frame, call_frame_t **loop_frame) -{ - call_frame_t *new_loop_frame = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_local_t *new_loop_local = NULL; - afr_self_heal_t *new_loop_sh = NULL; - afr_private_t *priv = NULL; - - GF_ASSERT (sh_frame); - GF_ASSERT (loop_frame); - - *loop_frame = NULL; - local = sh_frame->local; - sh = &local->self_heal; - priv = this->private; - - new_loop_frame = copy_frame (sh_frame); - if (!new_loop_frame) - goto out; - //We want the frame to have same lk_owner as sh_frame - //so that locks translator allows conflicting locks - new_loop_local = afr_self_heal_local_init (local, this); - if (!new_loop_local) - goto out; - new_loop_frame->local = new_loop_local; - - new_loop_sh = &new_loop_local->self_heal; - new_loop_sh->sources = memdup (sh->sources, - priv->child_count * sizeof (*sh->sources)); - if (!new_loop_sh->sources) - goto out; - new_loop_sh->write_needed = GF_CALLOC (priv->child_count, - sizeof (*new_loop_sh->write_needed), - gf_afr_mt_char); - if (!new_loop_sh->write_needed) - goto out; - new_loop_sh->checksum = GF_CALLOC (priv->child_count, MD5_DIGEST_LENGTH, - gf_afr_mt_uint8_t); - if (!new_loop_sh->checksum) - goto out; - new_loop_sh->inode = inode_ref (sh->inode); - new_loop_sh->sh_data_algo_start = sh->sh_data_algo_start; - new_loop_sh->source = sh->source; - new_loop_sh->active_sinks = sh->active_sinks; - new_loop_sh->healing_fd = fd_ref (sh->healing_fd); - new_loop_sh->file_has_holes = sh->file_has_holes; - new_loop_sh->old_loop_frame = old_loop_frame; - 
new_loop_sh->sh_frame = sh_frame; - *loop_frame = new_loop_frame; - return 0; -out: - sh_destroy_frame (new_loop_frame, this); - return -ENOMEM; -} - -static int -sh_loop_start (call_frame_t *sh_frame, xlator_t *this, off_t offset, - call_frame_t *old_loop_frame) -{ - call_frame_t *new_loop_frame = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_local_t *new_loop_local = NULL; - afr_self_heal_t *new_loop_sh = NULL; - int ret = 0; - - GF_ASSERT (sh_frame); - - local = sh_frame->local; - sh = &local->self_heal; - - ret = sh_loop_frame_create (sh_frame, this, old_loop_frame, - &new_loop_frame); - if (ret) - goto out; - new_loop_local = new_loop_frame->local; - new_loop_sh = &new_loop_local->self_heal; - new_loop_sh->offset = offset; - new_loop_sh->block_size = sh->block_size; - afr_sh_data_lock (new_loop_frame, this, offset, new_loop_sh->block_size, - _gf_true, this->name, sh_loop_lock_success, sh_loop_lock_failure); - return 0; -out: - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - if (old_loop_frame) - sh_loop_finish (old_loop_frame, this); - sh_loop_return (sh_frame, this, new_loop_frame, -1, ENOMEM); - return 0; -} - -static int -sh_loop_driver (call_frame_t *sh_frame, xlator_t *this, - gf_boolean_t is_first_call, call_frame_t *old_loop_frame) -{ - afr_local_t * local = NULL; - afr_self_heal_t * sh = NULL; - afr_sh_algo_private_t *sh_priv = NULL; - gf_boolean_t is_driver_done = _gf_false; - blksize_t block_size = 0; - int loop = 0; - off_t offset = 0; - afr_private_t *priv = NULL; - - priv = this->private; - local = sh_frame->local; - sh = &local->self_heal; - sh_priv = sh->private; - - LOCK (&sh_priv->lock); - { - if (!is_first_call) - sh_priv->loops_running--; - offset = sh_priv->offset; - block_size = sh->block_size; - while ((!sh->eof_reached) && - (!is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) && - (sh_priv->loops_running < priv->data_self_heal_window_size) - && (sh_priv->offset < sh->file_size)) { - - loop++; - sh_priv->offset 
+= block_size; - sh_priv->loops_running++; - - if (!is_first_call) - break; - } - if (0 == sh_priv->loops_running) { - is_driver_done = _gf_true; - } - } - UNLOCK (&sh_priv->lock); - - if (0 == loop) { - //loop finish does unlock, but the erasing of the pending - //xattrs needs to happen before that so do not finish the loop - if (is_driver_done && - !is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) - goto driver_done; - if (old_loop_frame) { - sh_loop_finish (old_loop_frame, this); - old_loop_frame = NULL; - } - } - - //If we have more loops to form we should finish previous loop after - //the next loop lock - while (loop--) { - if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { - // op failed in other loop, stop spawning more loops - if (old_loop_frame) { - sh_loop_finish (old_loop_frame, this); - old_loop_frame = NULL; - } - sh_loop_driver (sh_frame, this, _gf_false, NULL); - } else { - gf_log (this->name, GF_LOG_TRACE, "spawning a loop " - "for offset %"PRId64, offset); - - sh_loop_start (sh_frame, this, offset, old_loop_frame); - old_loop_frame = NULL; - offset += block_size; - } - } - -driver_done: - if (is_driver_done) { - sh_loop_driver_done (sh_frame, this, old_loop_frame); - } - return 0; -} - -static int -sh_loop_return (call_frame_t *sh_frame, xlator_t *this, call_frame_t *loop_frame, - int32_t op_ret, int32_t op_errno) -{ - afr_local_t * loop_local = NULL; - afr_self_heal_t * loop_sh = NULL; - afr_local_t * sh_local = NULL; - afr_self_heal_t *sh = NULL; - - sh_local = sh_frame->local; - sh = &sh_local->self_heal; - - if (loop_frame) { - loop_local = loop_frame->local; - if (loop_local) - loop_sh = &loop_local->self_heal; - if (loop_sh) - gf_log (this->name, GF_LOG_TRACE, "loop for offset " - "%"PRId64" returned", loop_sh->offset); - } - - if (op_ret == -1) { - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - afr_sh_set_error (sh, op_errno); - if (loop_frame) { - sh_loop_finish (loop_frame, this); - loop_frame = NULL; - } - } - - sh_loop_driver 
(sh_frame, this, _gf_false, loop_frame); - - return 0; -} - -static int -sh_loop_write_cbk (call_frame_t *loop_frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *buf, - struct iatt *postbuf, dict_t *xdata) -{ - afr_private_t * priv = NULL; - afr_local_t * loop_local = NULL; - afr_self_heal_t * loop_sh = NULL; - call_frame_t *sh_frame = NULL; - afr_local_t * sh_local = NULL; - afr_self_heal_t *sh = NULL; - int call_count = 0; - int child_index = 0; - - priv = this->private; - loop_local = loop_frame->local; - loop_sh = &loop_local->self_heal; - - sh_frame = loop_sh->sh_frame; - sh_local = sh_frame->local; - sh = &sh_local->self_heal; - - child_index = (long) cookie; - - gf_log (this->name, GF_LOG_TRACE, - "wrote %d bytes of data from %s to child %d, offset %"PRId64"", - op_ret, sh_local->loc.path, child_index, loop_sh->offset); - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "write to %s failed on subvolume %s (%s)", - sh_local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - afr_sh_set_error (loop_sh, op_errno); - } else if (op_ret < loop_local->cont.writev.vector->iov_len) { - gf_log (this->name, GF_LOG_ERROR, - "incomplete write to %s on subvolume %s " - "(expected %lu, returned %d)", sh_local->loc.path, - priv->children[child_index]->name, - loop_local->cont.writev.vector->iov_len, op_ret); - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - } - - call_count = afr_frame_return (loop_frame); - - if (call_count == 0) { - iobref_unref(loop_local->cont.writev.iobref); - - sh_loop_return (sh_frame, this, loop_frame, - loop_sh->op_ret, loop_sh->op_errno); - } - - return 0; -} - -static void -sh_prune_writes_needed (call_frame_t *sh_frame, call_frame_t *loop_frame, - afr_private_t *priv) -{ - afr_local_t *sh_local = NULL; - afr_self_heal_t *sh = NULL; - afr_local_t *loop_local = NULL; - afr_self_heal_t *loop_sh = NULL; - int i = 0; - - 
sh_local = sh_frame->local; - sh = &sh_local->self_heal; - - if (!strcmp (sh->algo->name, "diff")) - return; - - loop_local = loop_frame->local; - loop_sh = &loop_local->self_heal; - - /* full self-heal guarantees there exists atleast 1 file with size 0 - * That means for other files we can preserve holes that come after - * its size before 'trim' - */ - for (i = 0; i < priv->child_count; i++) { - if (loop_sh->write_needed[i] && - ((loop_sh->offset + 1) > sh->buf[i].ia_size)) - loop_sh->write_needed[i] = 0; - } -} - -static int -sh_loop_read_cbk (call_frame_t *loop_frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - struct iovec *vector, int32_t count, struct iatt *buf, - struct iobref *iobref, dict_t *xdata) -{ - afr_private_t * priv = NULL; - afr_local_t * loop_local = NULL; - afr_self_heal_t * loop_sh = NULL; - call_frame_t *sh_frame = NULL; - int i = 0; - int call_count = 0; - afr_local_t * sh_local = NULL; - afr_self_heal_t * sh = NULL; - - priv = this->private; - loop_local = loop_frame->local; - loop_sh = &loop_local->self_heal; - - sh_frame = loop_sh->sh_frame; - sh_local = sh_frame->local; - sh = &sh_local->self_heal; - - gf_log (this->name, GF_LOG_TRACE, - "read %d bytes of data from %s, offset %"PRId64"", - op_ret, loop_local->loc.path, loop_sh->offset); - - if (op_ret <= 0) { - if (op_ret < 0) { - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - gf_log (this->name, GF_LOG_ERROR, "read failed on %d " - "for %s reason :%s", sh->source, - sh_local->loc.path, strerror (errno)); - } else { - sh->eof_reached = _gf_true; - gf_log (this->name, GF_LOG_DEBUG, "Eof reached for %s", - sh_local->loc.path); - } - sh_loop_return (sh_frame, this, loop_frame, op_ret, op_errno); - goto out; - } - - if (loop_sh->file_has_holes && iov_0filled (vector, count) == 0) - sh_prune_writes_needed (sh_frame, loop_frame, priv); - - call_count = sh_number_of_writes_needed (loop_sh->write_needed, - priv->child_count); - if (call_count == 0) { - 
sh_loop_return (sh_frame, this, loop_frame, 0, 0); - goto out; - } - - loop_local->call_count = call_count; - - /* - * We only really need the request size at the moment, but the buffer - * is required if we want to issue a retry in the event of a short write. - * Therefore, we duplicate the vector and ref the iobref here... - */ - loop_local->cont.writev.vector = iov_dup(vector, count); - loop_local->cont.writev.iobref = iobref_ref(iobref); - - for (i = 0; i < priv->child_count; i++) { - if (!loop_sh->write_needed[i]) - continue; - STACK_WIND_COOKIE (loop_frame, sh_loop_write_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->writev, - loop_sh->healing_fd, vector, count, - loop_sh->offset, 0, iobref, NULL); - - if (!--call_count) - break; - } - -out: - return 0; -} - - -static int -sh_loop_read (call_frame_t *loop_frame, xlator_t *this) -{ - afr_private_t *priv = NULL; - afr_local_t *loop_local = NULL; - afr_self_heal_t *loop_sh = NULL; - - priv = this->private; - loop_local = loop_frame->local; - loop_sh = &loop_local->self_heal; - - STACK_WIND_COOKIE (loop_frame, sh_loop_read_cbk, - (void *) (long) loop_sh->source, - priv->children[loop_sh->source], - priv->children[loop_sh->source]->fops->readv, - loop_sh->healing_fd, loop_sh->block_size, - loop_sh->offset, 0, NULL); - - return 0; -} - - -static int -sh_diff_checksum_cbk (call_frame_t *loop_frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - uint32_t weak_checksum, uint8_t *strong_checksum, - dict_t *xdata) -{ - afr_private_t *priv = NULL; - afr_local_t *loop_local = NULL; - afr_self_heal_t *loop_sh = NULL; - call_frame_t *sh_frame = NULL; - afr_local_t *sh_local = NULL; - afr_self_heal_t *sh = NULL; - afr_sh_algo_private_t *sh_priv = NULL; - int child_index = 0; - int call_count = 0; - int i = 0; - int write_needed = 0; - - priv = this->private; - - loop_local = loop_frame->local; - loop_sh = &loop_local->self_heal; - - sh_frame = loop_sh->sh_frame; - sh_local = 
sh_frame->local; - sh = &sh_local->self_heal; - - sh_priv = sh->private; - - child_index = (long) cookie; - - if (op_ret < 0) { - gf_log (this->name, GF_LOG_ERROR, - "checksum on %s failed on subvolume %s (%s)", - sh_local->loc.path, priv->children[child_index]->name, - strerror (op_errno)); - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - } else { - memcpy (loop_sh->checksum + child_index * MD5_DIGEST_LENGTH, - strong_checksum, MD5_DIGEST_LENGTH); - } - - call_count = afr_frame_return (loop_frame); - - if (call_count == 0) { - for (i = 0; i < priv->child_count; i++) { - if (sh->sources[i] || !sh_local->child_up[i]) - continue; - - if (memcmp (loop_sh->checksum + (i * MD5_DIGEST_LENGTH), - loop_sh->checksum + (sh->source * MD5_DIGEST_LENGTH), - MD5_DIGEST_LENGTH)) { - /* - Checksums differ, so this block - must be written to this sink - */ - - gf_log (this->name, GF_LOG_DEBUG, - "checksum on subvolume %s at offset %" - PRId64" differs from that on source", - priv->children[i]->name, loop_sh->offset); - - write_needed = loop_sh->write_needed[i] = 1; - } - } - - LOCK (&sh_priv->lock); - { - sh_priv->total_blocks++; - if (write_needed) - sh_priv->diff_blocks++; - } - UNLOCK (&sh_priv->lock); - - if (write_needed && - !is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { - sh_loop_read (loop_frame, this); - } else { - sh_loop_return (sh_frame, this, loop_frame, - op_ret, op_errno); - } - } - - return 0; -} - -static int -sh_diff_checksum (call_frame_t *loop_frame, xlator_t *this) -{ - afr_private_t *priv = NULL; - afr_local_t *loop_local = NULL; - afr_self_heal_t *loop_sh = NULL; - int call_count = 0; - int i = 0; - - priv = this->private; - loop_local = loop_frame->local; - loop_sh = &loop_local->self_heal; - - call_count = loop_sh->active_sinks + 1; /* sinks and source */ - - loop_local->call_count = call_count; - - STACK_WIND_COOKIE (loop_frame, sh_diff_checksum_cbk, - (void *) (long) loop_sh->source, - priv->children[loop_sh->source], - 
priv->children[loop_sh->source]->fops->rchecksum, - loop_sh->healing_fd, - loop_sh->offset, loop_sh->block_size, NULL); - - for (i = 0; i < priv->child_count; i++) { - if (loop_sh->sources[i] || !loop_local->child_up[i]) - continue; - - STACK_WIND_COOKIE (loop_frame, sh_diff_checksum_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->rchecksum, - loop_sh->healing_fd, - loop_sh->offset, loop_sh->block_size, NULL); - - if (!--call_count) - break; - } - - return 0; -} - -static int -sh_full_read_write_to_sinks (call_frame_t *loop_frame, xlator_t *this) -{ - afr_private_t *priv = NULL; - afr_local_t *loop_local = NULL; - afr_self_heal_t *loop_sh = NULL; - int i = 0; - - priv = this->private; - loop_local = loop_frame->local; - loop_sh = &loop_local->self_heal; - - for (i = 0; i < priv->child_count; i++) { - if (loop_sh->sources[i] || !loop_local->child_up[i]) - continue; - loop_sh->write_needed[i] = 1; - } - sh_loop_read (loop_frame, this); - return 0; -} - -afr_sh_algo_private_t* -afr_sh_priv_init () -{ - afr_sh_algo_private_t *sh_priv = NULL; - - sh_priv = GF_CALLOC (1, sizeof (*sh_priv), - gf_afr_mt_afr_private_t); - if (!sh_priv) - goto out; - - LOCK_INIT (&sh_priv->lock); -out: - return sh_priv; -} - -int -afr_sh_transfer_lock (call_frame_t *dst, call_frame_t *src, char *dom, - unsigned int child_count) -{ - afr_local_t *dst_local = NULL; - afr_self_heal_t *dst_sh = NULL; - afr_local_t *src_local = NULL; - afr_self_heal_t *src_sh = NULL; - int ret = -1; - - dst_local = dst->local; - dst_sh = &dst_local->self_heal; - src_local = src->local; - src_sh = &src_local->self_heal; - GF_ASSERT (src_sh->data_lock_held); - GF_ASSERT (!dst_sh->data_lock_held); - ret = afr_lk_transfer_datalock (dst, src, dom, child_count); - if (ret) - return ret; - src_sh->data_lock_held = _gf_false; - dst_sh->data_lock_held = _gf_true; - return 0; -} - -int -afr_sh_start_loops (call_frame_t *sh_frame, xlator_t *this, - afr_sh_algo_fn sh_data_algo_start) -{ - 
call_frame_t *first_loop_frame = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int ret = 0; - afr_private_t *priv = NULL; - - local = sh_frame->local; - sh = &local->self_heal; - priv = this->private; - - sh->sh_data_algo_start = sh_data_algo_start; - local->call_count = 0; - ret = sh_loop_frame_create (sh_frame, this, NULL, &first_loop_frame); - if (ret) - goto out; - ret = afr_sh_transfer_lock (first_loop_frame, sh_frame, this->name, - priv->child_count); - if (ret) - goto out; - sh->private = afr_sh_priv_init (); - if (!sh->private) { - ret = -1; - goto out; - } - sh_loop_driver (sh_frame, this, _gf_true, first_loop_frame); - ret = 0; -out: - if (ret) { - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - sh_loop_driver_done (sh_frame, this, NULL); - } - return 0; -} - -int -afr_sh_algo_diff (call_frame_t *sh_frame, xlator_t *this) -{ - afr_sh_start_loops (sh_frame, this, sh_diff_checksum); - return 0; -} - -int -afr_sh_algo_full (call_frame_t *sh_frame, xlator_t *this) -{ - afr_sh_start_loops (sh_frame, this, sh_full_read_write_to_sinks); - return 0; -} - -struct afr_sh_algorithm afr_self_heal_algorithms[] = { - {.name = "full", .fn = afr_sh_algo_full}, - {.name = "diff", .fn = afr_sh_algo_diff}, - {0, 0}, -}; diff --git a/xlators/cluster/afr/src/afr-self-heal-algorithm.h b/xlators/cluster/afr/src/afr-self-heal-algorithm.h deleted file mode 100644 index 6b20789b1bb..00000000000 --- a/xlators/cluster/afr/src/afr-self-heal-algorithm.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> - This file is part of GlusterFS. - - This file is licensed to you under your choice of the GNU Lesser - General Public License, version 3 or any later version (LGPLv3 or - later), or the GNU General Public License, version 2 (GPLv2), in all - cases as published by the Free Software Foundation. 
-*/ - -#ifndef __AFR_SELF_HEAL_ALGORITHM_H__ -#define __AFR_SELF_HEAL_ALGORITHM_H__ - -typedef int (*afr_sh_algo_fn) (call_frame_t *frame, - xlator_t *this); - -struct afr_sh_algorithm { - const char *name; - afr_sh_algo_fn fn; -}; - -extern struct afr_sh_algorithm afr_self_heal_algorithms[3]; -typedef struct { - gf_lock_t lock; - unsigned int loops_running; - off_t offset; - - int32_t total_blocks; - int32_t diff_blocks; -} afr_sh_algo_private_t; - -#endif /* __AFR_SELF_HEAL_ALGORITHM_H__ */ diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c index ef92b420551..4dac8311340 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.c +++ b/xlators/cluster/afr/src/afr-self-heal-common.c @@ -1,5 +1,5 @@ /* - Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. This file is licensed to you under your choice of the GNU Lesser @@ -8,2805 +8,1002 @@ cases as published by the Free Software Foundation. 
*/ -#include "glusterfs.h" -#include "xlator.h" -#include "byte-order.h" + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif #include "afr.h" -#include "afr-transaction.h" -#include "afr-self-heal-common.h" #include "afr-self-heal.h" -#include "pump.h" - -#define ADD_FMT_STRING(msg, off, sh_str, status, print_log) \ - do { \ - if (AFR_SELF_HEAL_NOT_ATTEMPTED != status) { \ - off += snprintf (msg + off, sizeof (msg) - off, \ - " "sh_str" self heal %s,", \ - get_sh_completion_status (status));\ - print_log = 1; \ - } \ - } while (0) - -#define ADD_FMT_STRING_SYNC(msg, off, sh_str, status, print_log) \ - do { \ - if (AFR_SELF_HEAL_SYNC_BEGIN == status || \ - AFR_SELF_HEAL_FAILED == status) { \ - off += snprintf (msg + off, sizeof (msg) - off, \ - " "sh_str" self heal %s,", \ - get_sh_completion_status (status));\ - print_log = 1; \ - } \ - } while (0) +#include "byte-order.h" -void -afr_sh_reset (call_frame_t *frame, xlator_t *this) +int +afr_selfheal_post_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xattr, dict_t *xdata) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - memset (sh->child_errno, 0, - sizeof (*sh->child_errno) * priv->child_count); - memset (sh->buf, 0, sizeof (*sh->buf) * priv->child_count); - memset (sh->parentbufs, 0, - sizeof (*sh->parentbufs) * priv->child_count); - memset (sh->success, 0, sizeof (*sh->success) * priv->child_count); - memset (sh->locked_nodes, 0, - sizeof (*sh->locked_nodes) * priv->child_count); - sh->active_sinks = 0; - - afr_reset_xattr (sh->xattr, priv->child_count); -} + afr_local_t *local = NULL; -//Intersection[child]=1 if child is part of intersection -void -afr_children_intersection_get (int32_t *set1, int32_t *set2, - int *intersection, unsigned int child_count) -{ - int i = 0; - - memset (intersection, 0, sizeof (*intersection) * 
child_count); - for (i = 0; i < child_count; i++) { - intersection[i] = afr_is_child_present (set1, child_count, i) - && afr_is_child_present (set2, child_count, - i); - } + local = frame->local; + + syncbarrier_wake (&local->barrier); + + return 0; } -/** - * select_source - select a source and return it - */ int -afr_sh_select_source (int sources[], int child_count) +afr_selfheal_post_op (call_frame_t *frame, xlator_t *this, inode_t *inode, + int subvol, dict_t *xattr) { - int i = 0; - for (i = 0; i < child_count; i++) - if (sources[i]) - return i; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + loc_t loc = {0, }; - return -1; -} + priv = this->private; + local = frame->local; -void -afr_sh_mark_source_sinks (call_frame_t *frame, xlator_t *this) -{ - int i = 0; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int active_sinks = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - for (i = 0; i < priv->child_count; i++) { - if (sh->sources[i] == 0 && local->child_up[i] == 1) { - active_sinks++; - sh->success[i] = 1; - } else if (sh->sources[i] == 1 && local->child_up[i] == 1) { - sh->success[i] = 1; - } - } - sh->active_sinks = active_sinks; -} + loc.inode = inode_ref (inode); + uuid_copy (loc.gfid, inode->gfid); -int -afr_sh_source_count (int sources[], int child_count) -{ - int i = 0; - int nsource = 0; + STACK_WIND (frame, afr_selfheal_post_op_cbk, priv->children[subvol], + priv->children[subvol]->fops->xattrop, &loc, + GF_XATTROP_ADD_ARRAY, xattr, NULL); - for (i = 0; i < child_count; i++) - if (sources[i]) - nsource++; - return nsource; -} + syncbarrier_wait (&local->barrier, 1); -void -afr_sh_set_error (afr_self_heal_t *sh, int32_t op_errno) -{ - sh->op_ret = -1; - sh->op_errno = afr_most_important_error(sh->op_errno, op_errno, - _gf_false); + return 0; } -void -afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this) -{ - afr_private_t * priv = this->private; 
- char *buf = NULL; - char *ptr = NULL; - int i = 0; - int j = 0; - - /* 10 digits per entry + 1 space + '[' and ']' */ - buf = GF_MALLOC (priv->child_count * 11 + 8, gf_afr_mt_char); - - for (i = 0; i < priv->child_count; i++) { - ptr = buf; - ptr += sprintf (ptr, "[ "); - for (j = 0; j < priv->child_count; j++) { - ptr += sprintf (ptr, "%d ", pending_matrix[i][j]); - } - sprintf (ptr, "]"); - gf_log (this->name, GF_LOG_DEBUG, "pending_matrix: %s", buf); - } - - GF_FREE (buf); -} -char* -afr_get_pending_matrix_str (int32_t *pending_matrix[], xlator_t *this) +dict_t * +afr_selfheal_output_xattr (xlator_t *this, afr_transaction_type type, + int *output_dirty, int **output_matrix, int subvol) { - afr_private_t * priv = this->private; - char *buf = NULL; - char *ptr = NULL; - int i = 0; - int j = 0; - int child_count = priv->child_count; - char *matrix_begin = "[ [ "; - char *matrix_end = "] ]"; - char *seperator = "] [ "; - int pending_entry_strlen = 12; //Including space after entry - int matrix_begin_strlen = 0; - int matrix_end_strlen = 0; - int seperator_strlen = 0; - int string_length = 0; - char *msg = "- Pending matrix: "; - - /* - * for a list of lists of [ [ a b ] [ c d ] ] - * */ - - matrix_begin_strlen = strlen (matrix_begin); - matrix_end_strlen = strlen (matrix_end); - seperator_strlen = strlen (seperator); - string_length = matrix_begin_strlen + matrix_end_strlen - + (child_count -1) * seperator_strlen - + (child_count * child_count * pending_entry_strlen); - - buf = GF_CALLOC (1, 1 + strlen (msg) + string_length , gf_afr_mt_char); - if (!buf) - goto out; - - ptr = buf; - ptr += sprintf (ptr, "%s", msg); - ptr += sprintf (ptr, "%s", matrix_begin); - for (i = 0; i < priv->child_count; i++) { - for (j = 0; j < priv->child_count; j++) { - ptr += sprintf (ptr, "%d ", pending_matrix[i][j]); - } - if (i < priv->child_count -1) - ptr += sprintf (ptr, "%s", seperator); - } - - ptr += sprintf (ptr, "%s", matrix_end); + dict_t *xattr = NULL; + afr_private_t *priv 
= NULL; + int j = 0; + int idx = 0; + int ret = 0; + int *raw = 0; -out: - return buf; -} + priv = this->private; + idx = afr_index_for_transaction_type (type); -void -afr_sh_print_split_brain_log (int32_t *pending_matrix[], xlator_t *this, - const char *loc) -{ - char *buf = NULL; - char *free_ptr = NULL; + xattr = dict_new (); + if (!xattr) + return NULL; - buf = afr_get_pending_matrix_str (pending_matrix, this); - if (buf) - free_ptr = buf; - else - buf = ""; + if (output_dirty[subvol]) { + /* clear dirty */ + raw = GF_CALLOC (sizeof(int), AFR_NUM_CHANGE_LOGS, gf_afr_mt_int32_t); + if (!raw) + goto err; + raw[idx] = hton32 (output_dirty[subvol]); + ret = dict_set_bin (xattr, AFR_DIRTY, raw, + sizeof(int) * AFR_NUM_CHANGE_LOGS); + if (ret) + goto err; + } - gf_log (this->name, GF_LOG_ERROR, "Unable to self-heal contents of '%s'" - " (possible split-brain). Please delete the file from all but " - "the preferred subvolume.%s", loc, buf); - GF_FREE (free_ptr); - return; -} + /* clear/set pending */ + for (j = 0; j < priv->child_count; j++) { + if (!output_matrix[subvol][j]) + continue; + raw = GF_CALLOC (sizeof(int), AFR_NUM_CHANGE_LOGS, + gf_afr_mt_int32_t); + if (!raw) + goto err; -void -afr_init_pending_matrix (int32_t **pending_matrix, size_t child_count) -{ - int i = 0; - int j = 0; + raw[idx] = hton32 (output_matrix[subvol][j]); - GF_ASSERT (pending_matrix); + ret = dict_set_bin (xattr, priv->pending_key[j], + raw, sizeof(int) * AFR_NUM_CHANGE_LOGS); + if (ret) + goto err; + } - for (i = 0; i < child_count; i++) { - for (j = 0; j < child_count; j++) { - pending_matrix[i][j] = 0; - } - } + return xattr; +err: + if (xattr) + dict_unref (xattr); + return NULL; } -void -afr_mark_ignorant_subvols_as_pending (int32_t **pending_matrix, - unsigned char *ignorant_subvols, - size_t child_count) -{ - int i = 0; - int j = 0; - - GF_ASSERT (pending_matrix); - GF_ASSERT (ignorant_subvols); - - for (i = 0; i < child_count; i++) { - if (ignorant_subvols[i]) { - for (j = 0; j 
< child_count; j++) { - if (!ignorant_subvols[j]) - pending_matrix[j][i] += 1; - } - } - } -} int -afr_build_pending_matrix (char **pending_key, int32_t **pending_matrix, - unsigned char *ignorant_subvols, - dict_t *xattr[], afr_transaction_type type, - size_t child_count) -{ - /* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. */ - int32_t pending[3] = {0,}; - void *pending_raw = NULL; - int ret = -1; - int i = 0; - int j = 0; - int k = 0; - - afr_init_pending_matrix (pending_matrix, child_count); - - for (i = 0; i < child_count; i++) { - pending_raw = NULL; - - for (j = 0; j < child_count; j++) { - ret = dict_get_ptr (xattr[i], pending_key[j], - &pending_raw); - - if (ret != 0) { - /* - * There is no xattr present. This means this - * subvolume should be considered an 'ignorant' - * subvolume. - */ - - if (ignorant_subvols) - ignorant_subvols[i] = 1; - continue; - } - - memcpy (pending, pending_raw, sizeof(pending)); - k = afr_index_for_transaction_type (type); - - pending_matrix[i][j] = ntoh32 (pending[k]); - } - } - - return ret; -} +afr_selfheal_undo_pending (call_frame_t *frame, xlator_t *this, inode_t *inode, + unsigned char *sources, unsigned char *sinks, + unsigned char *healed_sinks, afr_transaction_type type, + struct afr_reply *replies, unsigned char *locked_on) +{ + afr_private_t *priv = NULL; + int i = 0; + int j = 0; + unsigned char *pending = NULL; + int *input_dirty = NULL; + int **input_matrix = NULL; + int *output_dirty = NULL; + int **output_matrix = NULL; + dict_t *xattr = NULL; + + priv = this->private; + + pending = alloca0 (priv->child_count); + + input_dirty = alloca0 (priv->child_count * sizeof (int)); + input_matrix = ALLOC_MATRIX (priv->child_count, int); + output_dirty = alloca0 (priv->child_count * sizeof (int)); + output_matrix = ALLOC_MATRIX (priv->child_count, int); + + afr_selfheal_extract_xattr (this, replies, type, input_dirty, + input_matrix); + + for (i = 0; i < priv->child_count; i++) + if (sinks[i] && 
!healed_sinks[i]) + pending[i] = 1; + + for (i = 0; i < priv->child_count; i++) { + for (j = 0; j < priv->child_count; j++) { + if (pending[j]) + output_matrix[i][j] = 1; + else + output_matrix[i][j] = -input_matrix[i][j]; + } + } -typedef enum { - AFR_NODE_INVALID, - AFR_NODE_INNOCENT, - AFR_NODE_FOOL, - AFR_NODE_WISE, -} afr_node_type; + for (i = 0; i < priv->child_count; i++) { + if (!pending[i]) + output_dirty[i] = -input_dirty[i]; + } -typedef struct { - afr_node_type type; - int wisdom; -} afr_node_character; + for (i = 0; i < priv->child_count; i++) { + if (!locked_on[i]) + /* perform post-op only on subvols we had locked + and inspected on. + */ + continue; + xattr = afr_selfheal_output_xattr (this, type, output_dirty, + output_matrix, i); + if (!xattr) { + gf_log (this->name, GF_LOG_ERROR, + "unable to allocate xdata for subvol %d", i); + continue; + } -static int -afr_sh_is_innocent (int32_t *array, int child_count) -{ - int i = 0; - int ret = 1; /* innocent until proven guilty */ + afr_selfheal_post_op (frame, this, inode, i, xattr); - for (i = 0; i < child_count; i++) { - if (array[i]) { - ret = 0; - break; - } - } + dict_unref (xattr); + } - return ret; + return 0; } -static int -afr_sh_is_fool (int32_t *array, int i, int child_count) -{ - return array[i]; /* fool if accuses itself */ +void +afr_replies_copy (struct afr_reply *dst, struct afr_reply *src, int count) +{ + int i = 0; + dict_t *xdata = NULL; + + if (dst == src) + return; + + for (i = 0; i < count; i++) { + dst[i].valid = src[i].valid; + dst[i].op_ret = src[i].op_ret; + dst[i].op_errno = src[i].op_errno; + dst[i].prestat = src[i].prestat; + dst[i].poststat = src[i].poststat; + dst[i].preparent = src[i].preparent; + dst[i].postparent = src[i].postparent; + dst[i].preparent2 = src[i].preparent2; + dst[i].postparent2 = src[i].postparent2; + if (src[i].xdata) + xdata = dict_ref (src[i].xdata); + else + xdata = NULL; + if (dst[i].xdata) + dict_unref (dst[i].xdata); + dst[i].xdata = xdata; + 
memcpy (dst[i].checksum, src[i].checksum, + MD5_DIGEST_LENGTH); + } } -static int -afr_sh_is_wise (int32_t *array, int i, int child_count) +int +afr_selfheal_fill_dirty (xlator_t *this, int *dirty, int subvol, + int idx, dict_t *xdata) { - return !array[i]; /* wise if does not accuse itself */ -} + void *pending_raw = NULL; + int pending[3] = {0, }; + if (dict_get_ptr (xdata, AFR_DIRTY, &pending_raw)) + return -1; -static int -afr_sh_all_nodes_innocent (afr_node_character *characters, - int child_count) -{ - int i = 0; - int ret = 1; + if (!pending_raw) + return -1; + + memcpy (pending, pending_raw, sizeof(pending)); - for (i = 0; i < child_count; i++) { - if (characters[i].type != AFR_NODE_INNOCENT) { - ret = 0; - break; - } - } + dirty[subvol] = ntoh32 (pending[idx]); - return ret; + return 0; } -static int -afr_sh_wise_nodes_exist (afr_node_character *characters, int child_count) +int +afr_selfheal_fill_matrix (xlator_t *this, int **matrix, int subvol, + int idx, dict_t *xdata) { - int i = 0; - int ret = 0; + int i = 0; + void *pending_raw = NULL; + int pending[3] = {0, }; + afr_private_t *priv = NULL; - for (i = 0; i < child_count; i++) { - if (characters[i].type == AFR_NODE_WISE) { - ret = 1; - break; - } - } + priv = this->private; - return ret; -} + for (i = 0; i < priv->child_count; i++) { + if (dict_get_ptr (xdata, priv->pending_key[i], &pending_raw)) + continue; + if (!pending_raw) + continue; -/* - * The 'wisdom' of a wise node is 0 if any other wise node accuses it. - * It is 1 if no other wise node accuses it. - * Only wise nodes with wisdom 1 are sources. - * - * If no nodes with wisdom 1 exist, a split-brain has occurred. 
- */ + memcpy (pending, pending_raw, sizeof(pending)); -static void -afr_sh_compute_wisdom (int32_t *pending_matrix[], - afr_node_character characters[], int child_count) -{ - int i = 0; - int j = 0; - - for (i = 0; i < child_count; i++) { - if (characters[i].type == AFR_NODE_WISE) { - characters[i].wisdom = 1; - - for (j = 0; j < child_count; j++) { - if ((characters[j].type == AFR_NODE_WISE) - && pending_matrix[j][i]) { - - characters[i].wisdom = 0; - } - } - } - } + matrix[subvol][i] = ntoh32 (pending[idx]); + } + + return 0; } -static int -afr_sh_wise_nodes_conflict (afr_node_character *characters, - int child_count) +int +afr_selfheal_extract_xattr (xlator_t *this, struct afr_reply *replies, + afr_transaction_type type, int *dirty, int **matrix) { - int i = 0; - int ret = 1; + afr_private_t *priv = NULL; + int i = 0; + dict_t *xdata = NULL; + int idx = -1; + + idx = afr_index_for_transaction_type (type); - for (i = 0; i < child_count; i++) { - if ((characters[i].type == AFR_NODE_WISE) - && characters[i].wisdom == 1) { + priv = this->private; - /* There is atleast one bona-fide wise node */ - ret = 0; - break; - } - } + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].xdata) + continue; + + xdata = replies[i].xdata; - return ret; + afr_selfheal_fill_dirty (this, dirty, i, idx, xdata); + afr_selfheal_fill_matrix (this, matrix, i, idx, xdata); + } + + return 0; } -static int -afr_sh_mark_wisest_as_sources (int sources[], - afr_node_character *characters, - int child_count) -{ - int nsources = 0; - int i = 0; - for (i = 0; i < child_count; i++) { - if (characters[i].wisdom == 1) { - sources[i] = 1; - nsources++; - } - } +/* + * This function determines if a self-heal is required for a given inode, + * and if needed, in what direction. + * + * locked_on[] is the array representing servers which have been locked and + * from which xattrs have been fetched for analysis. + * + * The output of the function is by filling the arrays sources[] and sinks[]. 
+ * + * sources[i] is set if i'th server is an eligible source for a selfheal. + * + * sinks[i] is set if i'th server needs to be healed. + * + * if sources[0..N] are all set, there is no need for a selfheal. + * + * if sinks[0..N] are all set, the inode is in split brain. + * + */ - return nsources; -} +int +afr_selfheal_find_direction (call_frame_t *frame, xlator_t *this, + struct afr_reply *replies, + afr_transaction_type type, unsigned char *locked_on, + unsigned char *sources, unsigned char *sinks) +{ + afr_private_t *priv = NULL; + int i = 0; + int j = 0; + int *dirty = NULL; + int **matrix = NULL; + char *accused = NULL; + + priv = this->private; + + dirty = alloca0 (priv->child_count * sizeof (int)); + accused = alloca0 (priv->child_count); + matrix = ALLOC_MATRIX(priv->child_count, int); + + /* First construct the pending matrix for further analysis */ + afr_selfheal_extract_xattr (this, replies, type, dirty, matrix); + + /* Next short list all accused to exclude them from being sources */ + for (i = 0; i < priv->child_count; i++) { + for (j = 0; j < priv->child_count; j++) { + if (matrix[i][j]) + accused[j] = 1; + } + } -static void -afr_compute_witness_of_fools (int32_t *witnesses, int32_t **pending_matrix, - afr_node_character *characters, - int32_t child_count) -{ - int i = 0; - int j = 0; - int witness = 0; - - GF_ASSERT (witnesses); - GF_ASSERT (pending_matrix); - GF_ASSERT (characters); - GF_ASSERT (child_count > 0); - - for (i = 0; i < child_count; i++) { - if (characters[i].type != AFR_NODE_FOOL) - continue; - - witness = 0; - for (j = 0; j < child_count; j++) { - if (i == j) - continue; - witness += pending_matrix[i][j]; - } - witnesses[i] = witness; - } -} + /* Short list all non-accused as sources */ + memset (sources, 0, priv->child_count); + for (i = 0; i < priv->child_count; i++) { + if (!accused[i] && locked_on[i]) + sources[i] = 1; + } -static int32_t -afr_find_biggest_witness_among_fools (int32_t *witnesses, - afr_node_character 
*characters, - int32_t child_count) -{ - int i = 0; - int biggest_witness = -1; - int biggest_witness_idx = -1; - int biggest_witness_cnt = -1; - - GF_ASSERT (witnesses); - GF_ASSERT (characters); - GF_ASSERT (child_count > 0); - - for (i = 0; i < child_count; i++) { - if (characters[i].type != AFR_NODE_FOOL) - continue; - - if (biggest_witness < witnesses[i]) { - biggest_witness = witnesses[i]; - biggest_witness_idx = i; - biggest_witness_cnt = 1; + /* Everyone accused by sources are sinks */ + memset (sinks, 0, priv->child_count); + for (i = 0; i < priv->child_count; i++) { + if (!sources[i]) continue; + for (j = 0; j < priv->child_count; j++) { + if (matrix[i][j]) + sinks[j] = 1; } + } - if (biggest_witness == witnesses[i]) - biggest_witness_cnt++; - } + /* If any source has 'dirty' bit, pick first + 'dirty' source and make everybody else sinks */ + for (i = 0; i < priv->child_count; i++) { + if (sources[i] && dirty[i]) { + for (j = 0; j < priv->child_count; j++) { + if (j != i) { + sources[j] = 0; + sinks[j] = 1; + } + } + break; + } + } - if (biggest_witness_cnt != 1) - return -1; + /* If no sources, all locked nodes are sinks - split brain */ + if (AFR_COUNT (sources, priv->child_count) == 0) { + for (i = 0; i < priv->child_count; i++) { + if (locked_on[i]) + sinks[i] = 1; + } + } - return biggest_witness_idx; + return 0; } + int -afr_mark_fool_as_source_by_witness (int32_t *sources, int32_t *witnesses, - afr_node_character *characters, - int32_t child_count, int32_t witness) +afr_selfheal_discover_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, + struct iatt *buf, dict_t *xdata, struct iatt *parbuf) { - int i = 0; - int nsources = 0; - - GF_ASSERT (sources); - GF_ASSERT (witnesses); - GF_ASSERT (characters); - GF_ASSERT (child_count > 0); - - for (i = 0; i < child_count; i++) { - if (characters[i].type != AFR_NODE_FOOL) - continue; - - if (witness == witnesses[i]) { - sources[i] = 1; - nsources++; - } - } - 
return nsources; -} + afr_local_t *local = NULL; + int i = -1; + local = frame->local; + i = (long) cookie; + + local->replies[i].valid = 1; + local->replies[i].op_ret = op_ret; + local->replies[i].op_errno = op_errno; + if (buf) + local->replies[i].poststat = *buf; + if (parbuf) + local->replies[i].postparent = *parbuf; + if (xdata) + local->replies[i].xdata = dict_ref (xdata); + + syncbarrier_wake (&local->barrier); -int -afr_mark_fool_as_source_by_idx (int32_t *sources, int child_count, int idx) -{ - if (idx >= 0 && idx < child_count) { - sources[idx] = 1; - return 1; - } return 0; } -static int -afr_find_largest_file_size (struct iatt *bufs, int32_t *success_children, - int child_count) +inode_t * +afr_selfheal_unlocked_lookup_on (call_frame_t *frame, inode_t *parent, + const char *name, struct afr_reply *replies, + unsigned char *lookup_on) { - int idx = -1; - int i = -1; - int child = -1; - uint64_t max_size = 0; - uint64_t min_size = 0; - int num_children = 0; + loc_t loc = {0, }; + dict_t *xattr_req = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + inode_t *inode = NULL; - for (i = 0; i < child_count; i++) { - if (success_children[i] == -1) - break; + local = frame->local; + priv = frame->this->private; - child = success_children[i]; - if (bufs[child].ia_size > max_size) { - max_size = bufs[child].ia_size; - idx = child; - } - - if ((num_children == 0) || (bufs[child].ia_size < min_size)) { - min_size = bufs[child].ia_size; - } + xattr_req = dict_new (); + if (!xattr_req) + return NULL; - num_children++; + if (afr_xattr_req_prepare (frame->this, xattr_req) != 0) { + dict_destroy (xattr_req); + return NULL; } - /* If sizes are same for all of them, finding sources will have to - * happen with pending changelog. 
So return -1 - */ - if ((num_children > 1) && (min_size == max_size)) - return -1; - return idx; -} + inode = inode_new (parent->table); + if (!inode) { + dict_destroy (xattr_req); + return NULL; + } + loc.parent = inode_ref (parent); + uuid_copy (loc.pargfid, parent->gfid); + loc.name = name; + loc.inode = inode_ref (inode); -static int -afr_find_newest_file (struct iatt *bufs, int32_t *success_children, - int child_count) -{ - int idx = -1; - int i = -1; - int child = -1; - uint64_t max_ctime = 0; + AFR_ONLIST (lookup_on, frame, afr_selfheal_discover_cbk, lookup, &loc, + xattr_req); - for (i = 0; i < child_count; i++) { - if (success_children[i] == -1) - break; + afr_replies_copy (replies, local->replies, priv->child_count); - child = success_children[i]; - if (bufs[child].ia_ctime > max_ctime) { - max_ctime = bufs[child].ia_ctime; - idx = child; - } - } + loc_wipe (&loc); + dict_unref (xattr_req); - return idx; + return inode; } -static int -afr_mark_biggest_of_fools_as_source (int32_t *sources, int32_t **pending_matrix, - afr_node_character *characters, - int32_t *success_children, - int child_count, struct iatt *bufs) -{ - int32_t biggest_witness = 0; - int nsources = 0; - int32_t *witnesses = NULL; - - GF_ASSERT (child_count > 0); - - biggest_witness = afr_find_largest_file_size (bufs, success_children, - child_count); - if (biggest_witness != -1) - goto found; - - witnesses = GF_CALLOC (child_count, sizeof (*witnesses), - gf_afr_mt_int32_t); - if (NULL == witnesses) { - nsources = -1; - goto out; - } - - afr_compute_witness_of_fools (witnesses, pending_matrix, characters, - child_count); - biggest_witness = afr_find_biggest_witness_among_fools (witnesses, - characters, - child_count); - if (biggest_witness != -1) - goto found; - - biggest_witness = afr_find_newest_file (bufs, success_children, - child_count); - -found: - nsources = afr_mark_fool_as_source_by_idx (sources, child_count, - biggest_witness); -out: - GF_FREE (witnesses); - return nsources; -} - 
int -afr_mark_child_as_source_by_uid (int32_t *sources, struct iatt *bufs, - int32_t *success_children, - unsigned int child_count, uint32_t uid) +afr_selfheal_unlocked_discover_on (call_frame_t *frame, inode_t *inode, + uuid_t gfid, struct afr_reply *replies, + unsigned char *discover_on) { - int i = 0; - int nsources = 0; - int child = 0; - - for (i = 0; i < child_count; i++) { - if (-1 == success_children[i]) - break; - - child = success_children[i]; - if (uid == bufs[child].ia_uid) { - sources[child] = 1; - nsources++; - } - } - return nsources; -} + loc_t loc = {0, }; + dict_t *xattr_req = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; -int -afr_get_child_with_lowest_uid (struct iatt *bufs, int32_t *success_children, - unsigned int child_count) -{ - int i = 0; - int smallest = -1; - int child = 0; - - for (i = 0; i < child_count; i++) { - if (-1 == success_children[i]) - break; - child = success_children[i]; - if ((smallest == -1) || - (bufs[child].ia_uid < bufs[smallest].ia_uid)) { - smallest = child; - } - } - return smallest; -} + local = frame->local; + priv = frame->this->private; -static int -afr_sh_mark_lowest_uid_as_source (struct iatt *bufs, int32_t *success_children, - int child_count, int32_t *sources) -{ - int nsources = 0; - int smallest = 0; - - smallest = afr_get_child_with_lowest_uid (bufs, success_children, - child_count); - if (smallest < 0) { - nsources = -1; - goto out; - } - nsources = afr_mark_child_as_source_by_uid (sources, bufs, - success_children, child_count, - bufs[smallest].ia_uid); -out: - return nsources; -} + xattr_req = dict_new (); + if (!xattr_req) + return -ENOMEM; -int -afr_get_no_xattr_dir_read_child (xlator_t *this, int32_t *success_children, - struct iatt *bufs) -{ - afr_private_t *priv = NULL; - int i = 0; - int child = -1; - int read_child = -1; - - priv = this->private; - for (i = 0; i < priv->child_count; i++) { - child = success_children[i]; - if (child < 0) - break; - if (read_child < 0) - 
read_child = child; - else if (bufs[read_child].ia_size < bufs[child].ia_size) - read_child = child; - } - return read_child; -} + if (afr_xattr_req_prepare (frame->this, xattr_req) != 0) { + dict_destroy (xattr_req); + return -ENOMEM; + } -int -afr_sh_mark_zero_size_file_as_sink (struct iatt *bufs, int32_t *success_children, - int child_count, int32_t *sources) -{ - int nsources = 0; - int i = 0; - int child = 0; - gf_boolean_t sink_exists = _gf_false; - gf_boolean_t source_exists = _gf_false; - int source = -1; - - for (i = 0; i < child_count; i++) { - child = success_children[i]; - if (child < 0) - break; - if (!bufs[child].ia_size) { - sink_exists = _gf_true; - continue; - } - if (!source_exists) { - source_exists = _gf_true; - source = child; - continue; - } - if (bufs[source].ia_size != bufs[child].ia_size) { - nsources = -1; - goto out; - } - } - if (!source_exists && !sink_exists) { - nsources = -1; - goto out; - } - - if (!source_exists || !sink_exists) - goto out; - - for (i = 0; i < child_count; i++) { - child = success_children[i]; - if (child < 0) - break; - if (bufs[child].ia_size) { - sources[child] = 1; - nsources++; - } - } -out: - return nsources; -} + loc.inode = inode_ref (inode); + uuid_copy (loc.gfid, gfid); -char * -afr_get_character_str (afr_node_type type) -{ - char *character = NULL; - - switch (type) { - case AFR_NODE_INNOCENT: - character = "innocent"; - break; - case AFR_NODE_FOOL: - character = "fool"; - break; - case AFR_NODE_WISE: - character = "wise"; - break; - default: - character = "invalid"; - break; - } - return character; -} + AFR_ONLIST (discover_on, frame, afr_selfheal_discover_cbk, lookup, &loc, + xattr_req); -afr_node_type -afr_find_child_character_type (int32_t *pending_row, int32_t child, - unsigned int child_count) -{ - afr_node_type type = AFR_NODE_INVALID; + afr_replies_copy (replies, local->replies, priv->child_count); - GF_ASSERT ((child >= 0) && (child < child_count)); + loc_wipe (&loc); + dict_unref (xattr_req); - 
if (afr_sh_is_innocent (pending_row, child_count)) - type = AFR_NODE_INNOCENT; - else if (afr_sh_is_fool (pending_row, child, child_count)) - type = AFR_NODE_FOOL; - else if (afr_sh_is_wise (pending_row, child, child_count)) - type = AFR_NODE_WISE; - return type; + return 0; } int -afr_build_sources (xlator_t *this, dict_t **xattr, struct iatt *bufs, - int32_t **pending_matrix, int32_t *sources, - int32_t *success_children, afr_transaction_type type, - int32_t *subvol_status, gf_boolean_t ignore_ignorant) +afr_selfheal_unlocked_discover (call_frame_t *frame, inode_t *inode, + uuid_t gfid, struct afr_reply *replies) { - afr_private_t *priv = NULL; - afr_self_heal_type sh_type = AFR_SELF_HEAL_INVALID; - int nsources = -1; - unsigned char *ignorant_subvols = NULL; - unsigned int child_count = 0; - - priv = this->private; - child_count = priv->child_count; - - if (afr_get_children_count (success_children, priv->child_count) == 0) - goto out; - - if (!ignore_ignorant) { - ignorant_subvols = GF_CALLOC (sizeof (*ignorant_subvols), - child_count, gf_afr_mt_char); - if (NULL == ignorant_subvols) - goto out; - } - - afr_build_pending_matrix (priv->pending_key, pending_matrix, - ignorant_subvols, xattr, type, - priv->child_count); - - if (!ignore_ignorant) - afr_mark_ignorant_subvols_as_pending (pending_matrix, - ignorant_subvols, - priv->child_count); - sh_type = afr_self_heal_type_for_transaction (type); - if (AFR_SELF_HEAL_INVALID == sh_type) - goto out; - - afr_sh_print_pending_matrix (pending_matrix, this); - - nsources = afr_mark_sources (this, sources, pending_matrix, bufs, - sh_type, success_children, subvol_status); -out: - GF_FREE (ignorant_subvols); - return nsources; -} + afr_private_t *priv = NULL; -void -afr_find_character_types (afr_node_character *characters, - int32_t **pending_matrix, int32_t *success_children, - unsigned int child_count) -{ - afr_node_type type = AFR_NODE_INVALID; - int child = 0; - int i = 0; - - for (i = 0; i < child_count; i++) { - child 
= success_children[i]; - if (child == -1) - break; - type = afr_find_child_character_type (pending_matrix[child], - child, child_count); - characters[child].type = type; - } -} + priv = frame->this->private; -void -afr_mark_success_children_sources (int32_t *sources, int32_t *success_children, - unsigned int child_count) -{ - int i = 0; - for (i = 0; i < child_count; i++) { - if (success_children[i] == -1) - break; - sources[success_children[i]] = 1; - } + return afr_selfheal_unlocked_discover_on (frame, inode, gfid, replies, + priv->child_up); } -/** - * mark_sources: Mark all 'source' nodes and return number of source - * nodes found - * - * A node (a row in the pending matrix) belongs to one of - * three categories: - * - * M is the pending matrix. - * - * 'innocent' - M[i] is all zeroes - * 'fool' - M[i] has i'th element = 1 (self-reference) - * 'wise' - M[i] has i'th element = 0, others are 1 or 0. - * - * All 'innocent' nodes are sinks. If all nodes are innocent, no self-heal is - * needed. - * - * A 'wise' node can be a source. If two 'wise' nodes conflict, it is - * a split-brain. If one wise node refers to the other but the other doesn't - * refer back, the referrer is a source. - * - * All fools are sinks, unless there are no 'wise' nodes. In that case, - * one of the fools is made a source. 
- */ + int -afr_mark_sources (xlator_t *this, int32_t *sources, int32_t **pending_matrix, - struct iatt *bufs, afr_self_heal_type type, - int32_t *success_children, int32_t *subvol_status) +afr_selfheal_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) { - /* stores the 'characters' (innocent, fool, wise) of the nodes */ - afr_node_character *characters = NULL; - int nsources = -1; - unsigned int child_count = 0; - afr_private_t *priv = NULL; - - priv = this->private; - child_count = priv->child_count; - characters = GF_CALLOC (sizeof (afr_node_character), - child_count, gf_afr_mt_afr_node_character); - if (!characters) - goto out; - - this = THIS; - - /* start clean */ - memset (sources, 0, sizeof (*sources) * child_count); - nsources = 0; - afr_find_character_types (characters, pending_matrix, success_children, - child_count); - if (afr_sh_all_nodes_innocent (characters, child_count)) { - switch (type) { - case AFR_SELF_HEAL_METADATA: - nsources = afr_sh_mark_lowest_uid_as_source (bufs, - success_children, - child_count, - sources); - break; - case AFR_SELF_HEAL_DATA: - nsources = afr_sh_mark_zero_size_file_as_sink (bufs, - success_children, - child_count, - sources); - if ((nsources < 0) && subvol_status) - *subvol_status |= SPLIT_BRAIN; - break; - default: - break; - } - goto out; - } - - if (afr_sh_wise_nodes_exist (characters, child_count)) { - afr_sh_compute_wisdom (pending_matrix, characters, child_count); - - if (afr_sh_wise_nodes_conflict (characters, child_count)) { - if (subvol_status) - *subvol_status |= SPLIT_BRAIN; - nsources = -1; - } else { - nsources = afr_sh_mark_wisest_as_sources (sources, - characters, - child_count); - } - } else { - if (subvol_status) - *subvol_status |= ALL_FOOLS; - nsources = afr_mark_biggest_of_fools_as_source (sources, - pending_matrix, - characters, - success_children, - child_count, bufs); - } + afr_local_t *local = NULL; + int i = 0; -out: - if (nsources == 0) - 
afr_mark_success_children_sources (sources, success_children, - child_count); - GF_FREE (characters); + local = frame->local; + i = (long) cookie; - gf_log (this->name, GF_LOG_DEBUG, "Number of sources: %d", nsources); - return nsources; -} + local->replies[i].valid = 1; + local->replies[i].op_ret = op_ret; + local->replies[i].op_errno = op_errno; -void -afr_sh_pending_to_delta (afr_private_t *priv, dict_t **xattr, - int32_t *delta_matrix[], unsigned char success[], - int child_count, afr_transaction_type type) -{ - int tgt = 0; - int src = 0; - int value = 0; - - afr_build_pending_matrix (priv->pending_key, delta_matrix, NULL, - xattr, type, priv->child_count); - - /* - * The algorithm here has two parts. First, for each subvol indexed - * as tgt, we try to figure out what count everyone should have for it. - * If the self-heal succeeded, that's easy; the value is zero. - * Otherwise, the value is the maximum of the succeeding nodes' counts. - * Once we know the value, we loop through (possibly for a second time) - * setting each count to the difference so that when we're done all - * succeeding nodes will have the same count for tgt. - */ - for (tgt = 0; tgt < priv->child_count; ++tgt) { - value = 0; - if (!success[tgt]) { - /* Find the maximum. */ - for (src = 0; src < priv->child_count; ++src) { - if (!success[src]) { - continue; - } - if (delta_matrix[src][tgt] > value) { - value = delta_matrix[src][tgt]; - } - } - } - /* Force everyone who succeeded to the chosen value. 
*/ - for (src = 0; src < priv->child_count; ++src) { - if (success[src]) { - delta_matrix[src][tgt] = value - - delta_matrix[src][tgt]; - } - else { - delta_matrix[src][tgt] = 0; - } - } - } + syncbarrier_wake (&local->barrier); + + return 0; } int -afr_sh_delta_to_xattr (xlator_t *this, - int32_t *delta_matrix[], dict_t *xattr[], - int child_count, afr_transaction_type type) -{ - int i = 0; - int j = 0; - int k = 0; - int ret = 0; - int32_t *pending = NULL; - int32_t *local_pending = NULL; - afr_private_t *priv = NULL; - - priv = this->private; - for (i = 0; i < child_count; i++) { - if (!xattr[i]) - continue; - - local_pending = NULL; - for (j = 0; j < child_count; j++) { - pending = GF_CALLOC (sizeof (int32_t), 3, - gf_afr_mt_int32_t); - - if (!pending) { - gf_log (this->name, GF_LOG_ERROR, - "failed to allocate pending entry " - "for %s[%d] on %s", - priv->pending_key[j], type, - priv->children[i]->name); - continue; - } - /* 3 = data+metadata+entry */ - - k = afr_index_for_transaction_type (type); - - pending[k] = hton32 (delta_matrix[i][j]); - - if (j == i) { - local_pending = pending; - continue; - } - ret = dict_set_bin (xattr[i], priv->pending_key[j], - pending, - AFR_NUM_CHANGE_LOGS * sizeof (int32_t)); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "Unable to set dict value."); - GF_FREE (pending); - } - } - if (local_pending) { - ret = dict_set_bin (xattr[i], priv->pending_key[i], - local_pending, - AFR_NUM_CHANGE_LOGS * sizeof (int32_t)); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "Unable to set dict value."); - GF_FREE (local_pending); - } - } - } - return 0; +afr_selfheal_locked_fill (call_frame_t *frame, xlator_t *this, + unsigned char *locked_on) +{ + int i = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int count = 0; + + local = frame->local; + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (local->replies[i].valid && local->replies[i].op_ret == 0) { + locked_on[i] = 1; + 
count++; + } else { + locked_on[i] = 0; + } + } + + return count; } int -afr_sh_missing_entries_done (call_frame_t *frame, xlator_t *this) +afr_selfheal_tryinodelk (call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, off_t off, size_t size, + unsigned char *locked_on) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - sh = &local->self_heal; - - afr_sh_reset (frame, this); - - if (local->unhealable) { - gf_log (this->name, GF_LOG_DEBUG, - "split brain found, aborting selfheal of %s", - local->loc.path); - } - - if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { - sh->completion_cbk (frame, this); - } else { - gf_log (this->name, GF_LOG_TRACE, - "proceeding to metadata check on %s", - local->loc.path); - afr_self_heal_metadata (frame, this); - } - - return 0; -} + loc_t loc = {0,}; + struct gf_flock flock = {0, }; + loc.inode = inode_ref (inode); + uuid_copy (loc.gfid, inode->gfid); -static int -afr_sh_missing_entries_finish (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; + flock.l_type = F_WRLCK; + flock.l_start = off; + flock.l_len = size; - local = frame->local; - int_lock = &local->internal_lock; + AFR_ONALL (frame, afr_selfheal_lock_cbk, inodelk, dom, + &loc, F_SETLK, &flock, NULL); - int_lock->lock_cbk = afr_sh_missing_entries_done; - afr_unlock (frame, this); + loc_wipe (&loc); - return 0; + return afr_selfheal_locked_fill (frame, this, locked_on); } + int -afr_sh_common_create (afr_self_heal_t *sh, unsigned int child_count) -{ - int ret = -ENOMEM; - sh->buf = GF_CALLOC (child_count, sizeof (*sh->buf), - gf_afr_mt_iatt); - if (!sh->buf) - goto out; - sh->parentbufs = GF_CALLOC (child_count, sizeof (*sh->parentbufs), - gf_afr_mt_iatt); - if (!sh->parentbufs) - goto out; - sh->child_errno = GF_CALLOC (child_count, sizeof (*sh->child_errno), - gf_afr_mt_int); - if (!sh->child_errno) - goto out; - sh->success_children = afr_children_create 
(child_count); - if (!sh->success_children) - goto out; - sh->fresh_children = afr_children_create (child_count); - if (!sh->fresh_children) - goto out; - sh->xattr = GF_CALLOC (child_count, sizeof (*sh->xattr), - gf_afr_mt_dict_t); - if (!sh->xattr) - goto out; - ret = 0; -out: - return ret; -} +afr_selfheal_inodelk (call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, off_t off, size_t size, + unsigned char *locked_on) +{ + loc_t loc = {0,}; + struct gf_flock flock = {0, }; + afr_local_t *local = NULL; + int i = 0; + afr_private_t *priv = NULL; + + priv = this->private; + local = frame->local; + + loc.inode = inode_ref (inode); + uuid_copy (loc.gfid, inode->gfid); + + flock.l_type = F_WRLCK; + flock.l_start = off; + flock.l_len = size; + + AFR_ONALL (frame, afr_selfheal_lock_cbk, inodelk, dom, + &loc, F_SETLK, &flock, NULL); + + for (i = 0; i < priv->child_count; i++) { + if (local->replies[i].op_ret == -1 && + local->replies[i].op_errno == EAGAIN) { + afr_selfheal_locked_fill (frame, this, locked_on); + afr_selfheal_uninodelk (frame, this, inode, dom, off, + size, locked_on); + + AFR_SEQ (frame, afr_selfheal_lock_cbk, inodelk, dom, + &loc, F_SETLKW, &flock, NULL); + break; + } + } -void -afr_sh_common_lookup_resp_handler (call_frame_t *frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *buf, - dict_t *xattr, struct iatt *postparent, - loc_t *loc) -{ - int child_index = 0; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - priv = this->private; - sh = &local->self_heal; - child_index = (long) cookie; - - LOCK (&frame->lock); - { - if (op_ret == 0) { - sh->buf[child_index] = *buf; - sh->parentbufs[child_index] = *postparent; - sh->success_children[sh->success_count] = child_index; - sh->success_count++; - sh->xattr[child_index] = dict_ref (xattr); - } else { - gf_log (this->name, GF_LOG_DEBUG, "path %s on subvolume" - " %s => -1 
(%s)", loc->path, - priv->children[child_index]->name, - strerror (op_errno)); - local->self_heal.child_errno[child_index] = op_errno; - } - } - UNLOCK (&frame->lock); - return; -} + loc_wipe (&loc); -gf_boolean_t -afr_valid_ia_type (ia_type_t ia_type) -{ - switch (ia_type) { - case IA_IFSOCK: - case IA_IFREG: - case IA_IFBLK: - case IA_IFCHR: - case IA_IFIFO: - case IA_IFLNK: - case IA_IFDIR: - return _gf_true; - default: - return _gf_false; - } - return _gf_false; + return afr_selfheal_locked_fill (frame, this, locked_on); } + int -afr_impunge_frame_create (call_frame_t *frame, xlator_t *this, - int active_source, call_frame_t **impunge_frame) +afr_selfheal_uninodelk (call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, off_t off, size_t size, + const unsigned char *locked_on) { - afr_local_t *local = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - int32_t op_errno = 0; - afr_private_t *priv = NULL; - int ret = 0; - call_frame_t *new_frame = NULL; - - op_errno = ENOMEM; - priv = this->private; - new_frame = copy_frame (frame); - if (!new_frame) { - goto out; - } - - AFR_LOCAL_ALLOC_OR_GOTO (impunge_local, out); - - local = frame->local; - new_frame->local = impunge_local; - impunge_sh = &impunge_local->self_heal; - impunge_sh->sh_frame = frame; - impunge_sh->active_source = active_source; - impunge_local->child_up = memdup (local->child_up, - sizeof (*local->child_up) * - priv->child_count); - if (!impunge_local->child_up) - goto out; - - impunge_local->pending = afr_matrix_create (priv->child_count, - AFR_NUM_CHANGE_LOGS); - if (!impunge_local->pending) - goto out; - - ret = afr_sh_common_create (impunge_sh, priv->child_count); - if (ret) { - op_errno = -ret; - goto out; - } - op_errno = 0; - *impunge_frame = new_frame; -out: - if (op_errno && new_frame) - AFR_STACK_DESTROY (new_frame); - return -op_errno; -} + loc_t loc = {0,}; + struct gf_flock flock = {0, }; -void -afr_sh_missing_entry_call_impunge_recreate 
(call_frame_t *frame, xlator_t *this, - struct iatt *buf, - struct iatt *postparent, - afr_impunge_done_cbk_t impunge_done) -{ - call_frame_t *impunge_frame = NULL; - afr_local_t *local = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *sh = NULL; - afr_self_heal_t *impunge_sh = NULL; - int ret = 0; - unsigned int enoent_count = 0; - afr_private_t *priv = NULL; - int i = 0; - int32_t op_errno = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - enoent_count = afr_errno_count (NULL, sh->child_errno, - priv->child_count, ENOENT); - if (!enoent_count) { - gf_log (this->name, GF_LOG_INFO, - "no missing files - %s. proceeding to metadata check", - local->loc.path); - goto out; - } - sh->impunge_done = impunge_done; - ret = afr_impunge_frame_create (frame, this, sh->source, &impunge_frame); - if (ret) - goto out; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - loc_copy (&impunge_local->loc, &local->loc); - ret = afr_build_parent_loc (&impunge_sh->parent_loc, - &impunge_local->loc, &op_errno); - if (ret) { - ret = -op_errno; - goto out; - } - impunge_local->call_count = enoent_count; - impunge_sh->entrybuf = sh->buf[sh->source]; - impunge_sh->parentbuf = sh->parentbufs[sh->source]; - for (i = 0; i < priv->child_count; i++) { - if (!impunge_local->child_up[i]) { - impunge_sh->child_errno[i] = ENOTCONN; - continue; - } - if (sh->child_errno[i] != ENOENT) { - impunge_sh->child_errno[i] = EEXIST; - continue; - } - } - for (i = 0; i < priv->child_count; i++) { - if (sh->child_errno[i] != ENOENT) - continue; - afr_sh_entry_impunge_create (impunge_frame, this, i); - enoent_count--; - } - GF_ASSERT (!enoent_count); - return; -out: - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "impunge of %s failed, " - "reason: %s", local->loc.path, strerror (-ret)); - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - } - afr_sh_missing_entries_finish (frame, this); -} -int -afr_sh_create_entry_cbk 
(call_frame_t *frame, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - sh = &local->self_heal; - if (op_ret < 0) - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - afr_sh_missing_entries_finish (frame, this); - return 0; -} + loc.inode = inode_ref (inode); + uuid_copy (loc.gfid, inode->gfid); -static int -sh_missing_entries_create (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int type = 0; - struct iatt *buf = NULL; - struct iatt *postparent = NULL; - - local = frame->local; - sh = &local->self_heal; - - buf = &sh->buf[sh->source]; - postparent = &sh->parentbufs[sh->source]; - - type = buf->ia_type; - if (!afr_valid_ia_type (type)) { - gf_log (this->name, GF_LOG_ERROR, - "%s: unknown file type: 0%o", local->loc.path, type); - afr_set_local_for_unhealable (local); - afr_sh_missing_entries_finish (frame, this); - goto out; - } - - afr_sh_missing_entry_call_impunge_recreate (frame, this, - buf, postparent, - afr_sh_create_entry_cbk); -out: - return 0; -} + flock.l_type = F_UNLCK; + flock.l_start = off; + flock.l_len = size; -void -afr_sh_missing_entries_lookup_done (call_frame_t *frame, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - ia_type_t ia_type = IA_INVAL; - int32_t nsources = 0; - loc_t *loc = NULL; - int32_t subvol_status = 0; - afr_transaction_type txn_type = AFR_DATA_TRANSACTION; - gf_boolean_t split_brain = _gf_false; - int read_child = -1; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - loc = &local->loc; - - if (op_ret < 0) { - if (op_errno == EIO) { - afr_set_local_for_unhealable (local); - } - // EIO can happen if finding the fresh parent dir failed - goto out; - } - - //now No chance for the ia_type to conflict - ia_type = sh->buf[sh->success_children[0]].ia_type; - 
txn_type = afr_transaction_type_get (ia_type); - nsources = afr_build_sources (this, sh->xattr, sh->buf, - sh->pending_matrix, sh->sources, - sh->success_children, txn_type, - &subvol_status, _gf_false); - if (nsources < 0) { - gf_log (this->name, GF_LOG_INFO, "No sources for dir of %s," - " in missing entry self-heal, continuing with the rest" - " of the self-heals", local->loc.path); - if (subvol_status & SPLIT_BRAIN) { - split_brain = _gf_true; - switch (txn_type) { - case AFR_DATA_TRANSACTION: - nsources = 1; - sh->sources[sh->success_children[0]] = 1; - break; - case AFR_ENTRY_TRANSACTION: - read_child = afr_get_no_xattr_dir_read_child - (this, - sh->success_children, - sh->buf); - sh->sources[read_child] = 1; - nsources = 1; - break; - default: - op_errno = EIO; - goto out; - } - } else { - op_errno = EIO; - goto out; - } - } - - afr_get_fresh_children (sh->success_children, sh->sources, - sh->fresh_children, priv->child_count); - sh->source = sh->fresh_children[0]; - if (sh->source == -1) { - gf_log (this->name, GF_LOG_DEBUG, "No active sources found."); - op_errno = EIO; - goto out; - } - - if (sh->gfid_sh_success_cbk) - sh->gfid_sh_success_cbk (frame, this); - sh->type = sh->buf[sh->source].ia_type; - if (uuid_is_null (loc->inode->gfid)) - uuid_copy (loc->gfid, sh->buf[sh->source].ia_gfid); - if (split_brain) { - afr_sh_missing_entries_finish (frame, this); - } else { - sh_missing_entries_create (frame, this); - } - return; -out: - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - afr_sh_set_error (sh, op_errno); - afr_sh_missing_entries_finish (frame, this); - return; -} + AFR_ONLIST (locked_on, frame, afr_selfheal_lock_cbk, inodelk, + dom, &loc, F_SETLK, &flock, NULL); -static int -afr_sh_common_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, dict_t *xattr, - struct iatt *postparent) -{ - int call_count = 0; - afr_local_t *local = NULL; - afr_self_heal_t *sh = 
NULL; - afr_private_t *priv = NULL; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - afr_sh_common_lookup_resp_handler (frame, cookie, this, op_ret, - op_errno, inode, buf, xattr, - postparent, &sh->lookup_loc); - call_count = afr_frame_return (frame); - - if (call_count) - goto out; - op_ret = -1; - if (!sh->success_count) { - op_errno = afr_resultant_errno_get (NULL, sh->child_errno, - priv->child_count); - gf_log (this->name, GF_LOG_ERROR, "Failed to lookup %s, " - "reason %s", sh->lookup_loc.path, - strerror (op_errno)); - goto done; - } - - if ((sh->lookup_flags & AFR_LOOKUP_FAIL_CONFLICTS) && - (afr_conflicting_iattrs (sh->buf, sh->success_children, - priv->child_count, - sh->lookup_loc.path, this->name))) { - op_errno = EIO; - gf_log (this->name, GF_LOG_ERROR, "Conflicting entries " - "for %s", sh->lookup_loc.path); - goto done; - } - - if ((sh->lookup_flags & AFR_LOOKUP_FAIL_MISSING_GFIDS) && - (afr_gfid_missing_count (this->name, sh->success_children, - sh->buf, priv->child_count, - sh->lookup_loc.path))) { - op_errno = ENODATA; - gf_log (this->name, GF_LOG_ERROR, "Missing Gfids " - "for %s", sh->lookup_loc.path); - goto done; - } - op_ret = 0; - -done: - sh->lookup_done (frame, this, op_ret, op_errno); -out: - return 0; + loc_wipe (&loc); + + return 0; } + int -afr_sh_remove_entry_cbk (call_frame_t *frame, xlator_t *this, int child, - int32_t op_ret, int32_t op_errno) +afr_selfheal_tryentrylk (call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, const char *name, unsigned char *locked_on) { - int call_count = 0; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - sh = &local->self_heal; - - GF_ASSERT (sh->post_remove_call); - if ((op_ret == -1) && (op_errno != ENOENT)) { - gf_log (this->name, GF_LOG_ERROR, - "purge entry %s failed, on child %d reason, %s", - local->loc.path, child, strerror (op_errno)); - LOCK (&frame->lock); - { - afr_sh_set_error (sh, EIO); - 
afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - } - UNLOCK (&frame->lock); - } - call_count = afr_frame_return (frame); - if (call_count == 0) - sh->post_remove_call (frame, this); - return 0; -} + loc_t loc = {0,}; -void -afr_sh_call_entry_expunge_remove (call_frame_t *frame, xlator_t *this, - int child_index, struct iatt *buf, - struct iatt *parentbuf, - afr_expunge_done_cbk_t expunge_done) -{ - call_frame_t *expunge_frame = NULL; - afr_local_t *local = NULL; - afr_local_t *expunge_local = NULL; - afr_self_heal_t *sh = NULL; - afr_self_heal_t *expunge_sh = NULL; - int32_t op_errno = 0; - int ret = 0; - - expunge_frame = copy_frame (frame); - if (!expunge_frame) { - goto out; - } - - AFR_LOCAL_ALLOC_OR_GOTO (expunge_local, out); - - local = frame->local; - sh = &local->self_heal; - expunge_frame->local = expunge_local; - expunge_sh = &expunge_local->self_heal; - expunge_sh->sh_frame = frame; - loc_copy (&expunge_local->loc, &local->loc); - ret = afr_build_parent_loc (&expunge_sh->parent_loc, - &expunge_local->loc, &op_errno); - if (ret) { - ret = -op_errno; - goto out; - } - sh->expunge_done = expunge_done; - afr_sh_entry_expunge_remove (expunge_frame, this, child_index, buf, - parentbuf); - return; -out: - gf_log (this->name, GF_LOG_ERROR, "Expunge of %s failed, reason: %s", - local->loc.path, strerror (op_errno)); - expunge_done (frame, this, child_index, -1, op_errno); -} + loc.inode = inode_ref (inode); + uuid_copy (loc.gfid, inode->gfid); |