diff options
| author | Ravishankar N <ravishankar@redhat.com> | 2014-04-03 11:47:28 +0530 | 
|---|---|---|
| committer | Pranith Kumar Karampuri <pkarampu@redhat.com> | 2014-04-03 21:32:52 -0700 | 
| commit | 52aac0ae61913c6c6997f27710b6bfcf8ca73bce (patch) | |
| tree | 2d80239535469984f9c731915b2cdcdbec0d2125 | |
| parent | 8596ecba074081d7843a6bea2299951db48aa3b5 (diff) | |
cluster/afr: Add the non-refactored afr code into the tree (development)
Reverted all afr commits from "6d37392 - cluster/afr: refactor <Anand Avati>"
onwards. The resulting afr code was moved to the afr-v1 folder, resulting in
the following changes:
	modified:   libglusterfs/src/gf-dirent.c
	modified:   libglusterfs/src/glusterfs.h
	modified:   libglusterfs/src/xlator.c
	modified:   tests/basic/pump.t
	modified:   tests/bugs/859927/repl.t
	modified:   tests/bugs/bug-1015990-rep.t
	modified:   tests/bugs/bug-1035576.t
	modified:   tests/bugs/bug-1037501.t
	modified:   tests/bugs/bug-1058797.t
	modified:   tests/bugs/bug-767585-gfid.t
	modified:   tests/bugs/bug-802417.t
	modified:   tests/bugs/bug-830665.t
	modified:   tests/bugs/bug-853690.t
	modified:   tests/bugs/bug-865825.t
	modified:   tests/bugs/bug-873962.t
	modified:   tests/bugs/bug-888174.t
	modified:   tests/bugs/bug-906646.t
	modified:   tests/bugs/bug-913051.t
	modified:   tests/bugs/bug-913544.t
	modified:   tests/bugs/bug-918437-sh-mtime.t
	modified:   tests/bugs/bug-977797.t
	modified:   tests/volume.rc
	new file:   xlators/cluster/afr-v1/Makefile.am
	new file:   xlators/cluster/afr-v1/src/Makefile.am
	new file:   xlators/cluster/afr-v1/src/afr-common.c
	new file:   xlators/cluster/afr-v1/src/afr-dir-read.c
	new file:   xlators/cluster/afr-v1/src/afr-dir-read.h
	new file:   xlators/cluster/afr-v1/src/afr-dir-write.c
	new file:   xlators/cluster/afr-v1/src/afr-dir-write.h
	new file:   xlators/cluster/afr-v1/src/afr-inode-read.c
	new file:   xlators/cluster/afr-v1/src/afr-inode-read.h
	new file:   xlators/cluster/afr-v1/src/afr-inode-write.c
	new file:   xlators/cluster/afr-v1/src/afr-inode-write.h
	new file:   xlators/cluster/afr-v1/src/afr-lk-common.c
	new file:   xlators/cluster/afr-v1/src/afr-mem-types.h
	new file:   xlators/cluster/afr-v1/src/afr-open.c
	new file:   xlators/cluster/afr-v1/src/afr-self-heal-algorithm.c
	new file:   xlators/cluster/afr-v1/src/afr-self-heal-algorithm.h
	new file:   xlators/cluster/afr-v1/src/afr-self-heal-common.c
	new file:   xlators/cluster/afr-v1/src/afr-self-heal-common.h
	new file:   xlators/cluster/afr-v1/src/afr-self-heal-data.c
	new file:   xlators/cluster/afr-v1/src/afr-self-heal-entry.c
	new file:   xlators/cluster/afr-v1/src/afr-self-heal-metadata.c
	new file:   xlators/cluster/afr-v1/src/afr-self-heal.h
	new file:   xlators/cluster/afr-v1/src/afr-self-heald.c
	new file:   xlators/cluster/afr-v1/src/afr-self-heald.h
	new file:   xlators/cluster/afr-v1/src/afr-transaction.c
	new file:   xlators/cluster/afr-v1/src/afr-transaction.h
	new file:   xlators/cluster/afr-v1/src/afr.c
	new file:   xlators/cluster/afr-v1/src/afr.h
	new file:   xlators/cluster/afr-v1/src/pump.c
	new file:   xlators/cluster/afr-v1/src/pump.h
	modified:   xlators/cluster/dht/src/dht-common.c
	modified:   xlators/cluster/stripe/src/stripe.c
	modified:   xlators/features/index/src/index.c
	modified:   xlators/features/index/src/index.h
Also making changes to compile cluster/afr-v1 instead of cluster/afr:
	modified:   configure.ac
	modified:   xlators/cluster/Makefile.am
Change-Id: I3e33ac361e381f5475d1a58ea938d2676f6d5a2f
Signed-off-by: Ravishankar N <ravishankar@redhat.com>
Reviewed-on: http://review.gluster.org/7388
Reviewed-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
Tested-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
58 files changed, 32908 insertions, 187 deletions
diff --git a/configure.ac b/configure.ac index 3a3d8712b..a77db4c39 100644 --- a/configure.ac +++ b/configure.ac @@ -56,8 +56,8 @@ AC_CONFIG_FILES([Makefile                  xlators/storage/bd/Makefile                  xlators/storage/bd/src/Makefile                  xlators/cluster/Makefile -                xlators/cluster/afr/Makefile -                xlators/cluster/afr/src/Makefile +                xlators/cluster/afr-v1/Makefile +                xlators/cluster/afr-v1/src/Makefile                  xlators/cluster/stripe/Makefile                  xlators/cluster/stripe/src/Makefile                  xlators/cluster/dht/Makefile diff --git a/libglusterfs/src/gf-dirent.c b/libglusterfs/src/gf-dirent.c index 0cda83a27..bb028c967 100644 --- a/libglusterfs/src/gf-dirent.c +++ b/libglusterfs/src/gf-dirent.c @@ -83,8 +83,6 @@ gf_link_inodes_from_dirent (xlator_t *this, inode_t *parent,                  if (entry->inode) {                          link_inode = inode_link (entry->inode, parent,                                                   entry->d_name, &entry->d_stat); -			if (!link_inode) -				continue;                          inode_lookup (link_inode);                          inode_unref (link_inode);                  } diff --git a/libglusterfs/src/glusterfs.h b/libglusterfs/src/glusterfs.h index 5dd26b451..ebda76d93 100644 --- a/libglusterfs/src/glusterfs.h +++ b/libglusterfs/src/glusterfs.h @@ -132,7 +132,7 @@  /* Index xlator related */  #define GF_XATTROP_INDEX_GFID "glusterfs.xattrop_index_gfid" -#define GF_XATTROP_INDEX_COUNT "glusterfs.xattrop_index_count" +#define GF_BASE_INDICES_HOLDER_GFID "glusterfs.base_indicies_holder_gfid"  #define GF_GFIDLESS_LOOKUP "gfidless-lookup"  /* replace-brick and pump related internal xattrs */ diff --git a/libglusterfs/src/xlator.c b/libglusterfs/src/xlator.c index 1bded6d3d..f3df8e2ae 100644 --- a/libglusterfs/src/xlator.c +++ b/libglusterfs/src/xlator.c @@ -680,9 +680,7 @@ loc_copy_overload_parent (loc_t *dst, loc_t 
*src, inode_t *parent)                          dst->name = strrchr (dst->path, '/');                  if (dst->name)                          dst->name++; -        } else if (src->name) { -		dst->name = src->name; -	} +        }          ret = 0;  out: @@ -720,9 +718,7 @@ loc_copy (loc_t *dst, loc_t *src)                          dst->name = strrchr (dst->path, '/');                  if (dst->name)                          dst->name++; -        } else if (src->name) { -		dst->name = src->name; -	} +        }          ret = 0;  out: diff --git a/tests/basic/pump.t b/tests/basic/pump.t index 23bdc187d..3faf06f05 100644 --- a/tests/basic/pump.t +++ b/tests/basic/pump.t @@ -22,7 +22,7 @@ done  cd  TEST umount $M0  TEST $CLI volume replace-brick $V0 $H0:$B0/${V0}0 $H0:$B0/${V0}1 start -EXPECT_WITHIN 600 "Y" gd_is_replace_brick_completed $H0 $V0 $H0:$B0/${V0}0 $H0:$B0/${V0}1 +EXPECT_WITHIN 60 "Y" gd_is_replace_brick_completed $H0 $V0 $H0:$B0/${V0}0 $H0:$B0/${V0}1  TEST $CLI volume replace-brick $V0 $H0:$B0/${V0}0 $H0:$B0/${V0}1 commit  TEST $CLI volume stop $V0  TEST diff -r --exclude=.glusterfs $B0/${V0}0 $B0/${V0}1 diff --git a/tests/bugs/859927/repl.t b/tests/bugs/859927/repl.t index 856b057fb..73c86e7be 100755 --- a/tests/bugs/859927/repl.t +++ b/tests/bugs/859927/repl.t @@ -33,20 +33,20 @@ TEST $CLI volume set $V0 cluster.data-self-heal-algorithm full  EXPECT full volume_option $V0 cluster.data-self-heal-algorithm  create_setup_for_self_heal $M0/a  EXPECT_WITHIN 20 "1" afr_child_up_status $V0 0 -cat $file 2>&1 > /dev/null +ls -l $file 2>&1 > /dev/null  TEST cmp $B0/${V0}1/a $B0/${V0}2/a  TEST $CLI volume set $V0 cluster.data-self-heal-algorithm diff  EXPECT diff volume_option $V0 cluster.data-self-heal-algorithm  create_setup_for_self_heal $M0/a  EXPECT_WITHIN 20 "1" afr_child_up_status $V0 0 -cat $file 2>&1 > /dev/null +ls -l $file 2>&1 > /dev/null  TEST cmp $B0/${V0}1/a $B0/${V0}2/a  TEST $CLI volume reset $V0 cluster.data-self-heal-algorithm  
create_setup_for_self_heal $M0/a  EXPECT_WITHIN 20 "1" afr_child_up_status $V0 0 -cat $file 2>&1 > /dev/null +ls -l $file 2>&1 > /dev/null  TEST cmp $B0/${V0}1/a $B0/${V0}2/a  TEST ! $CLI volume set $V0 cluster.data-self-heal-algorithm "" diff --git a/tests/bugs/bug-1015990-rep.t b/tests/bugs/bug-1015990-rep.t index bca0d7aff..f59bb2f75 100755 --- a/tests/bugs/bug-1015990-rep.t +++ b/tests/bugs/bug-1015990-rep.t @@ -35,6 +35,7 @@ for  i in  {1..100}; do echo "STRING" > $M0/File$i; done  brick_2_sh_entries=$(count_sh_entries $B0/$V0"2")  brick_4_sh_entries=$(count_sh_entries $B0/$V0"4") +  command_output=$(gluster volume heal $V0 statistics heal-count replica $H0:$B0/$V0"1") diff --git a/tests/bugs/bug-1035576.t b/tests/bugs/bug-1035576.t index 938306a85..52d93dd87 100644 --- a/tests/bugs/bug-1035576.t +++ b/tests/bugs/bug-1035576.t @@ -34,8 +34,7 @@ quota_limit_val1=$(get_hex_xattr trusted.glusterfs.quota.limit-set $B0/${V0}1/a)  quota_size_val1=$(get_hex_xattr trusted.glusterfs.quota.size $B0/${V0}1/a)  #Trigger entry,metadata self-heal -TEST ls $M0/a - +TEST stat $M0/a  quota_limit_val0=$(get_hex_xattr trusted.glusterfs.quota.limit-set $B0/${V0}0/a)  quota_size_val0=$(get_hex_xattr trusted.glusterfs.quota.size $B0/${V0}0/a) @@ -44,7 +43,7 @@ TEST [ $quota_limit_val0 == $quota_limit_val1 ]  #Only entry, metadata self-heal is done quota size value should not be same  TEST [ $quota_size_val0 != $quota_size_val1 ] -TEST cat $M0/a/f +TEST stat $M0/a/f  #Now that data self-heal is done quota size value should be same  quota_size_val0=$(get_hex_xattr trusted.glusterfs.quota.size $B0/${V0}0/a) diff --git a/tests/bugs/bug-1037501.t b/tests/bugs/bug-1037501.t index 596122a72..d11c788a0 100755 --- a/tests/bugs/bug-1037501.t +++ b/tests/bugs/bug-1037501.t @@ -24,6 +24,14 @@ TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}-{0,1,2}  EXPECT "$V0" volinfo_field $V0 'Volume Name';  EXPECT 'Created' volinfo_field $V0 'Status'; +## Make sure io-cache and write-behind don't 
interfere. +TEST $CLI volume set $V0 data-self-heal off; + +## Make sure automatic self-heal doesn't perturb our results. +TEST $CLI volume set $V0 cluster.self-heal-daemon off + +TEST $CLI volume set $V0 background-self-heal-count 0 +  ## Start volume and verify  TEST $CLI volume start $V0;  EXPECT 'Started' volinfo_field $V0 'Status'; @@ -40,38 +48,206 @@ TEST $CLI volume add-brick $V0 replica 4 $H0:$B0/$V0-3 force  TEST $CLI volume add-brick $V0 replica 5 $H0:$B0/$V0-4 force  TEST $CLI volume add-brick $V0 replica 6 $H0:$B0/$V0-5 force -sleep 5 - -TEST gluster volume heal $V0 full - -sleep 5 - -EXPECT 10 stat -c '%s' $B0/$V0-0/File -EXPECT 10 stat -c '%s' $B0/$V0-1/File -EXPECT 10 stat -c '%s' $B0/$V0-2/File -EXPECT 10 stat -c '%s' $B0/$V0-3/File -EXPECT 10 stat -c '%s' $B0/$V0-4/File -EXPECT 10 stat -c '%s' $B0/$V0-5/File - -EXPECT 3 stat -c '%h' $B0/$V0-0/Link -EXPECT 3 stat -c '%h' $B0/$V0-1/Link -EXPECT 3 stat -c '%h' $B0/$V0-2/Link -EXPECT 3 stat -c '%h' $B0/$V0-3/Link -EXPECT 3 stat -c '%h' $B0/$V0-4/Link -EXPECT 3 stat -c '%h' $B0/$V0-5/Link - -EXPECT 'directory' stat -c '%F' $B0/$V0-0/Dir -EXPECT 'directory' stat -c '%F' $B0/$V0-1/Dir -EXPECT 'directory' stat -c '%F' $B0/$V0-2/Dir -EXPECT 'directory' stat -c '%F' $B0/$V0-3/Dir -EXPECT 'directory' stat -c '%F' $B0/$V0-4/Dir -EXPECT 'directory' stat -c '%F' $B0/$V0-5/Dir - -EXPECT 'fifo' stat -c '%F' $B0/$V0-0/FIFO -EXPECT 'fifo' stat -c '%F' $B0/$V0-1/FIFO -EXPECT 'fifo' stat -c '%F' $B0/$V0-2/FIFO -EXPECT 'fifo' stat -c '%F' $B0/$V0-3/FIFO -EXPECT 'fifo' stat -c '%F' $B0/$V0-4/FIFO -EXPECT 'fifo' stat -c '%F' $B0/$V0-5/FIFO +sleep 10 + +TEST ls $M0/ + + +function compare() +{ +	var=-1; +	if [ $1 == $2 ]; then +		var=0; +	else +		var=-1; +	fi + +	echo $var +} + +var2="000000000000000000000000" + +var1=`getfattr -d -m . $B0/$V0-0/File -e hex 2>&1 | grep "client-3"` +EXPECT "0" echo $? +var3=`echo $var1 | cut -d x -f 2` +EXPECT_NOT $var2 echo $var3 + +var1=`getfattr -d -m . 
$B0/$V0-0/File -e hex 2>&1 | grep "client-4"` +EXPECT "0" echo $? +var3=`echo $var1 | cut -d x -f 2` +EXPECT_NOT $var2 echo $var3 + +var1=`getfattr -d -m . $B0/$V0-0/File -e hex 2>&1 | grep "client-5"` +EXPECT "0" echo $? +var3=`echo $var1 | cut -d x -f 2` +EXPECT_NOT $var2 echo $var3 + +var1=`getfattr -d -m . $B0/$V0-1/File -e hex 2>&1 | grep "client-3"` +EXPECT "0" echo $? +var3=`echo $var1| cut -d x -f 2` +EXPECT_NOT $var2 echo $var3 + +var1=`getfattr -d -m . $B0/$V0-1/File -e hex 2>&1 | grep "client-4"` +EXPECT "0" echo $? +var3=`echo $var1 | cut -d x -f 2` +EXPECT_NOT $var2 echo $var3 + +var1=`getfattr -d -m . $B0/$V0-1/File -e hex 2>&1 | grep "client-5"` +EXPECT "0" echo $? +var3=`echo $var1 | cut -d x -f 2` +EXPECT_NOT $var2 echo $var3 + +var1=`getfattr -d -m . $B0/$V0-2/File -e hex 2>&1 | grep "client-3"` +EXPECT "0" echo $? +var3=`echo $var1 | cut -d x -f 2` +EXPECT_NOT $var2 echo $var3 + +var1=`getfattr -d -m . $B0/$V0-2/File -e hex 2>&1 | grep "client-4"` +EXPECT "0" echo $? +var3=`echo $var1 | cut -d x -f 2` +EXPECT_NOT $var2 echo $var3 + +var1=`getfattr -d -m . $B0/$V0-2/File -e hex 2>&1 | grep "client-5"` +EXPECT "0" echo $? +var3=`echo $var1 | cut -d x -f 2` +EXPECT_NOT $var2 echo $var3 + +var1=`getfattr -d -m . $B0/$V0-0/Dir -e hex 2>&1 | grep "client-3"` +EXPECT "0" echo $? +var3=`echo $var1 | cut -d x -f 2` +EXPECT_NOT $var2 echo $var3 + +var1=`getfattr -d -m . $B0/$V0-0/Dir -e hex 2>&1 | grep "client-4"` +EXPECT "0" echo $? +var3=`echo $var1 | cut -d x -f 2` +EXPECT_NOT $var2 echo $var3 + +var1=`getfattr -d -m . $B0/$V0-0/Dir -e hex 2>&1 | grep "client-5"` +EXPECT "0" echo $? +var3=`echo $var1 | cut -d x -f 2` +EXPECT_NOT $var2 echo $var3 + +var1=`getfattr -d -m . $B0/$V0-1/Dir -e hex 2>&1 | grep "client-3"` +EXPECT "0" echo $? +var3=`echo $var1 | cut -d x -f 2` +EXPECT_NOT $var2 echo $var3 + +var1=`getfattr -d -m . $B0/$V0-1/Dir -e hex 2>&1 | grep "client-4"` +EXPECT "0" echo $? 
+var3=`echo $var1 | cut -d x -f 2` +EXPECT_NOT $var2 echo $var3 + +var1=`getfattr -d -m . $B0/$V0-1/Dir -e hex 2>&1 | grep "client-5"` +EXPECT "0" echo $? +var3=`echo $var1 | cut -d x -f 2` +EXPECT_NOT $var2 echo $var3 + +var1=`getfattr -d -m . $B0/$V0-2/Dir -e hex 2>&1 | grep "client-3"` +EXPECT "0" echo $? +var3=`echo $var1 | cut -d x -f 2` +EXPECT_NOT $var2 echo $var3 + +var1=`getfattr -d -m . $B0/$V0-2/Dir -e hex 2>&1 | grep "client-4"` +EXPECT "0" echo $? +var3=`echo $var1 | cut -d x -f 2` +EXPECT_NOT $var2 echo $var3 + +var1=`getfattr -d -m . $B0/$V0-2/Dir -e hex 2>&1 | grep "client-5"` +EXPECT "0" echo $? +var3=`echo $var1 | cut -d x -f 2` +EXPECT_NOT $var2 echo $var3 + + +var1=`getfattr -d -m . $B0/$V0-0/Link -e hex 2>&1 | grep "client-3"` +EXPECT "0" echo $? +var3=`echo $var1 | cut -d x -f 2` +EXPECT_NOT $var2 echo $var3 + +var1=`getfattr -d -m . $B0/$V0-0/Link -e hex 2>&1 | grep "client-4"` +EXPECT "0" echo $? +var3=`echo $var1 | cut -d x -f 2` +EXPECT_NOT $var2 echo $var3 + +var1=`getfattr -d -m . $B0/$V0-0/Link -e hex 2>&1 | grep "client-5"` +EXPECT "0" echo $? +var3=`echo $var1 | cut -d x -f 2` +EXPECT_NOT $var2 echo $var3 + +var1=`getfattr -d -m . $B0/$V0-1/Link -e hex 2>&1 | grep "client-3"` +EXPECT "0" echo $? +var3=`echo $var1 | cut -d x -f 2` +EXPECT_NOT $var2 echo $var3 + +var1=`getfattr -d -m . $B0/$V0-1/Link -e hex 2>&1 | grep "client-4"` +EXPECT "0" echo $? +var3=`echo $var1 | cut -d x -f 2` +EXPECT_NOT $var2 echo $var3 + +var1=`getfattr -d -m . $B0/$V0-1/Link -e hex 2>&1 | grep "client-5"` +EXPECT "0" echo $? +var3=`echo $var1 | cut -d x -f 2` +EXPECT_NOT $var2 echo $var3 + +var1=`getfattr -d -m . $B0/$V0-2/Link -e hex 2>&1 | grep "client-3"` +EXPECT "0" echo $? +var3=`echo $var1 | cut -d x -f 2` +EXPECT_NOT $var2 echo $var3 + +var1=`getfattr -d -m . $B0/$V0-2/Link -e hex 2>&1 | grep "client-4"` +EXPECT "0" echo $? +var3=`echo $var1 | cut -d x -f 2` +EXPECT_NOT $var2 echo $var3 + +var1=`getfattr -d -m . 
$B0/$V0-2/Link -e hex 2>&1 | grep "client-5"` +EXPECT "0" echo $? +var3=`echo $var1 | cut -d x -f 2` +EXPECT_NOT $var2 echo $var3 + + + +var1=`getfattr -d -m . $B0/$V0-0/FIFO -e hex 2>&1 | grep "client-3"` +EXPECT "0" echo $? +var3=`echo $var1 | cut -d x -f 2` +EXPECT_NOT $var2 echo $var3 + +var1=`getfattr -d -m . $B0/$V0-0/FIFO -e hex 2>&1 | grep "client-4"` +EXPECT "0" echo $? +var3=`echo $var1 | cut -d x -f 2` +EXPECT_NOT $var2 echo $var3 + +var1=`getfattr -d -m . $B0/$V0-0/FIFO -e hex 2>&1 | grep "client-5"` +EXPECT "0" echo $? +var3=`echo $var1 | cut -d x -f 2` +EXPECT_NOT $var2 echo $var3 + +var1=`getfattr -d -m . $B0/$V0-1/FIFO -e hex 2>&1 | grep "client-3"` +EXPECT "0" echo $? +var3=`echo $var1 | cut -d x -f 2` +EXPECT_NOT $var2 echo $var3 + +var1=`getfattr -d -m . $B0/$V0-1/FIFO -e hex 2>&1 | grep "client-4"` +EXPECT "0" echo $? +var3=`echo $var1 | cut -d x -f 2` +EXPECT_NOT $var2 echo $var3 + +var1=`getfattr -d -m . $B0/$V0-1/FIFO -e hex 2>&1 | grep "client-5"` +EXPECT "0" echo $? +var3=`echo $var1 | cut -d x -f 2` +EXPECT_NOT $var2 echo $var3 + +var1=`getfattr -d -m . $B0/$V0-2/FIFO -e hex 2>&1 | grep "client-3"` +EXPECT "0" echo $? +var3=`echo $var1 | cut -d x -f 2` +EXPECT_NOT $var2 echo $var3 + +var1=`getfattr -d -m . $B0/$V0-2/FIFO -e hex 2>&1 | grep "client-4"` +EXPECT "0" echo $? +var3=`echo $var1 | cut -d x -f 2` +EXPECT_NOT $var2 echo $var3 + +var1=`getfattr -d -m . $B0/$V0-2/FIFO -e hex 2>&1 | grep "client-5"` +EXPECT "0" echo $? 
+var3=`echo $var1 | cut -d x -f 2` +EXPECT_NOT $var2 echo $var3  cleanup; diff --git a/tests/bugs/bug-1058797.t b/tests/bugs/bug-1058797.t index 1e9f09af0..2b80794cf 100644 --- a/tests/bugs/bug-1058797.t +++ b/tests/bugs/bug-1058797.t @@ -29,7 +29,7 @@ EXPECT "s" echo $setuid_bit1  #Restart volume and do lookup from mount to trigger heal  TEST $CLI volume start $V0 force  EXPECT_WITHIN 20 "1" afr_child_up_status $V0 1 -TEST dd if=$M0/file of=/dev/null +TEST ls -l $M0/file  #Get file permissions from healed brick1 and verify that S_ISUID is indeed set  file_permissions2=`ls -l $B0/brick1/file | awk '{print $1}' | cut -d. -f1 | cut -d- -f2,3,4,5,6` diff --git a/tests/bugs/bug-767585-gfid.t b/tests/bugs/bug-767585-gfid.t index 41043a0b2..49cf7423f 100755 --- a/tests/bugs/bug-767585-gfid.t +++ b/tests/bugs/bug-767585-gfid.t @@ -26,9 +26,10 @@ TEST setfattr -n trusted.gfid -v $gfid2 $B0/${V0}1/c  sleep 2 -TEST stat $M0/a -TEST stat $M0/b -TEST stat $M0/c +cd $M0 +TEST ls -l a +TEST ls -l b +TEST ls -l c  TEST gf_get_gfid_xattr $B0/${V0}0/a  TEST gf_get_gfid_xattr $B0/${V0}1/a diff --git a/tests/bugs/bug-802417.t b/tests/bugs/bug-802417.t index b596df303..314141f6b 100755 --- a/tests/bugs/bug-802417.t +++ b/tests/bugs/bug-802417.t @@ -55,7 +55,7 @@ EXPECT_WITHIN 20 "1" afr_child_up_status $V0 0  EXPECT_WITHIN 20 "1" afr_child_up_status $V0 1  EXPECT_WITHIN 20 "1" afr_child_up_status $V0 2  TEST kill_brick ${V0} ${H0} ${B0}/${V0}-2 -TEST dd if=${M0}/a_file of=/dev/null +TEST ls -l ${M0}/a_file  obs_path_0=${B0}/${V0}-0/a_file @@ -67,31 +67,31 @@ tgt_xattr_1="trusted.afr.${V0}-client-1"  tgt_xattr_2="trusted.afr.${V0}-client-2"  actual=$(afr_get_changelog_xattr $obs_path_0 $tgt_xattr_0) -EXPECT "0x000000000000000000000000|^\$" echo $actual +EXPECT "0x000000000000000000000000" echo $actual  actual=$(afr_get_changelog_xattr $obs_path_0 $tgt_xattr_1) -EXPECT "0x000000000000000000000000|^\$" echo $actual +EXPECT "0x000000000000000000000000" echo $actual  
actual=$(afr_get_changelog_xattr $obs_path_0 $tgt_xattr_2) -EXPECT "0x000000030000000000000000" echo $actual +EXPECT "0x000000020000000000000000" echo $actual  actual=$(afr_get_changelog_xattr $obs_path_1 $tgt_xattr_0) -EXPECT "0x000000000000000000000000|^\$" echo $actual +EXPECT "0x000000000000000000000000" echo $actual  actual=$(afr_get_changelog_xattr $obs_path_1 $tgt_xattr_1) -EXPECT "0x000000000000000000000000|^\$" echo $actual +EXPECT "0x000000000000000000000000" echo $actual  actual=$(afr_get_changelog_xattr $obs_path_1 $tgt_xattr_2) -EXPECT "0x000000010000000000000000" echo $actual +EXPECT "0x000000020000000000000000" echo $actual  actual=$(afr_get_changelog_xattr $obs_path_2 $tgt_xattr_0) -EXPECT "0x000000000000000000000000|^\$" echo $actual +EXPECT "0x000000000000000000000000" echo $actual  actual=$(afr_get_changelog_xattr $obs_path_2 $tgt_xattr_1) -EXPECT "0x000000000000000000000000|^\$" echo $actual +EXPECT "0x000000000000000000000000" echo $actual  actual=$(afr_get_changelog_xattr $obs_path_2 $tgt_xattr_2) -EXPECT "0x000000000000000000000000|^\$" echo $actual +EXPECT "0x000000000000000000000000" echo $actual  if [ "$EXIT_EARLY" = "1" ]; then  	exit 0; diff --git a/tests/bugs/bug-830665.t b/tests/bugs/bug-830665.t index dd6f3ce2f..0073ff1d9 100755 --- a/tests/bugs/bug-830665.t +++ b/tests/bugs/bug-830665.t @@ -81,17 +81,15 @@ ls -l $N0 &> /dev/null;  sleep 5;  ## Force entry self-heal. -TEST $CLI volume set $V0 cluster.self-heal-daemon on -sleep 1 -TEST gluster volume heal $V0 full +find $N0 | xargs stat > /dev/null;  #ls -lR $N0 > /dev/null;  ## Do NOT check through the NFS mount here. That will force a new self-heal  ## check, but we want to test whether self-heal already happened.  ## Make sure everything's in order on the recreated brick. 
-EXPECT_WITHIN 20 'test_data' cat $B0/${V0}-0/a_file; -EXPECT_WITHIN 20 'more_test_data' cat $B0/${V0}-0/a_dir/another_file; +EXPECT 'test_data' cat $B0/${V0}-0/a_file; +EXPECT 'more_test_data' cat $B0/${V0}-0/a_dir/another_file;  if [ "$EXIT_EARLY" = "1" ]; then  	exit 0; diff --git a/tests/bugs/bug-853690.t b/tests/bugs/bug-853690.t index c2f82d103..77a581f54 100755 --- a/tests/bugs/bug-853690.t +++ b/tests/bugs/bug-853690.t @@ -66,6 +66,7 @@ TEST glusterfs --volfile=$B0/test.vol --attribute-timeout=0 --entry-timeout=0 $M  # file sizes and immediate split-brain (EIO).  TEST dd if=/dev/zero of=$M0/file bs=128k count=1  TEST dd if=$M0/file of=/dev/null bs=128k count=1 +  ########  #  # Test self-heal with short writes... @@ -75,11 +76,14 @@ TEST dd if=$M0/file of=/dev/null bs=128k count=1  # Cause a lookup and wait a few seconds for posterity. This self-heal also fails  # due to a short write.  TEST ls $M0/file +  # Verify the attributes on the healthy replica do not reflect consistency with  # the other replica. -xa=`getfattr -n trusted.afr.test-locks-0 -e hex $B0/test2/file 2>&1 | grep = | cut -f2 -d=` -EXPECT_NOT 0x000000000000000000000000 echo $xa +TEST "getfattr -n trusted.afr.test-locks-0 $B0/test2/file --only-values > $B0/out1 2> /dev/null" +TEST "getfattr -n trusted.afr.test-locks-1 $B0/test2/file --only-values > $B0/out2 2> /dev/null" +TEST ! cmp $B0/out1 $B0/out2 +TEST rm -f $B0/out1 $B0/out2  TEST rm -f $M0/file  TEST umount $M0 diff --git a/tests/bugs/bug-865825.t b/tests/bugs/bug-865825.t index 8ee751864..6bb1c2348 100755 --- a/tests/bugs/bug-865825.t +++ b/tests/bugs/bug-865825.t @@ -2,8 +2,6 @@  . $(dirname $0)/../include.rc -cleanup; -  TEST glusterd  TEST pidof glusterd  TEST $CLI volume info; @@ -30,7 +28,6 @@ EXPECT 'Created' volinfo_field $V0 'Status';  ## Make sure io-cache and write-behind don't interfere.  
TEST $CLI volume set $V0 cluster.background-self-heal-count 0  TEST $CLI volume set $V0 performance.io-cache off; -TEST $CLI volume set $V0 performance.quick-read off;  TEST $CLI volume set $V0 performance.write-behind off;  TEST $CLI volume set $V0 performance.stat-prefetch off @@ -57,18 +54,19 @@ setfattr -n trusted.afr.${V0}-client-2 -v $value $B0/${V0}-0/a_file  setfattr -x trusted.afr.${V0}-client-2 $B0/${V0}-1/a_file  echo "wrong_data" > $B0/${V0}-2/a_file -gluster volume set $V0 cluster.self-heal-daemon on -sleep 3 -gluster volume heal $V0 full +## Remount and force a self-heal. +TEST glusterfs --volfile-server=$H0 --volfile-id=$V0 $M0 +stat ${M0}/a_file > /dev/null  ## Make sure brick 2 now has the correct contents. -EXPECT_WITHIN 30 "test_data" cat $B0/${V0}-2/a_file +EXPECT "test_data" cat $B0/${V0}-2/a_file  if [ "$EXIT_EARLY" = "1" ]; then  	exit 0;  fi  ## Finish up +TEST umount $M0;  TEST $CLI volume stop $V0;  EXPECT 'Stopped' volinfo_field $V0 'Status'; diff --git a/tests/bugs/bug-873962.t b/tests/bugs/bug-873962.t index 0281417f0..b245cc3da 100755 --- a/tests/bugs/bug-873962.t +++ b/tests/bugs/bug-873962.t @@ -61,12 +61,11 @@ EXPECT_WITHIN 20 "1" afr_child_up_status $V0 0  EXPECT_WITHIN 20 "1" afr_child_up_status $V0 1  TEST glusterfs --entry-timeout=0 --attribute-timeout=0 -s $H0 --volfile-id=$V0 $M1 --direct-io-mode=enable -  #Files are in split-brain, so open should fail  TEST ! cat $M0/a;  TEST ! cat $M1/a; -TEST cat $M0/b; -TEST cat $M1/b; +TEST ! cat $M0/b; +TEST ! cat $M1/b;  #Reset split-brain status  TEST setfattr -n trusted.afr.$V0-client-1 -v 0x000000000000000000000000 $B0/${V0}1/a; @@ -76,7 +75,6 @@ TEST setfattr -n trusted.afr.$V0-client-1 -v 0x000000000000000000000000 $B0/${V0  EXPECT "2" cat $M0/a;  # FAIL HERE - see comment about cluster.self-heal-background-count above.  
EXPECT "2" cat $M1/a; -TEST dd if=$M0/b of=/dev/null bs=1M  EXPECT "def" getfattr -n trusted.mdata --only-values $M0/b 2>/dev/null  EXPECT "def" getfattr -n trusted.mdata --only-values $M1/b 2>/dev/null @@ -92,8 +90,8 @@ TEST glusterfs --entry-timeout=0 --attribute-timeout=0 -s $H0 --volfile-id=$V0 $  #Files are in split-brain, so open should fail  TEST ! cat $M0/c  TEST ! cat $M1/c -TEST cat $M0/d -TEST cat $M1/d +TEST ! cat $M0/d +TEST ! cat $M1/d  TEST setfattr -n trusted.afr.$V0-client-1 -v 0x000000000000000000000000 $B0/${V0}1/c  TEST setfattr -n trusted.afr.$V0-client-1 -v 0x000000000000000000000000 $B0/${V0}1/d @@ -104,4 +102,7 @@ EXPECT "2" cat $M1/c  EXPECT "1" cat $M0/d  EXPECT "1" cat $M1/d +#Check that the self-heal is not triggered. +EXPECT "1" cat $B0/${V0}1/c +EXPECT "abc" getfattr -n trusted.mdata --only-values $B0/${V0}1/d 2>/dev/null  cleanup; diff --git a/tests/bugs/bug-888174.t b/tests/bugs/bug-888174.t index ef653f76d..4ea34645b 100644 --- a/tests/bugs/bug-888174.t +++ b/tests/bugs/bug-888174.t @@ -38,9 +38,10 @@ TEST [ -z $inodelk_max_latency ]  TEST dd of=$M0/a if=/dev/urandom bs=1M count=10 conv=fsync  #Check for no trace of pending changelog. Flush should make sure of it. -EXPECT "0x000000000000000000000000" afr_get_changelog_xattr $B0/r2_0/a trusted.afr.dirty -EXPECT "0x000000000000000000000000" afr_get_changelog_xattr $B0/r2_1/a trusted.afr.dirty - +EXPECT "0x000000000000000000000000" afr_get_changelog_xattr $B0/r2_0/a trusted.afr.$V0-client-0 +EXPECT "0x000000000000000000000000" afr_get_changelog_xattr $B0/r2_0/a trusted.afr.$V0-client-1 +EXPECT "0x000000000000000000000000" afr_get_changelog_xattr $B0/r2_1/a trusted.afr.$V0-client-0 +EXPECT "0x000000000000000000000000" afr_get_changelog_xattr $B0/r2_1/a trusted.afr.$V0-client-1  dd of=$M0/a if=/dev/urandom bs=1M count=1024 2>/dev/null &  p=$! 
@@ -50,13 +51,15 @@ TEST $CLI volume set $V0 performance.io-cache off  TEST $CLI volume set $V0 performance.stat-prefetch off  TEST $CLI volume set $V0 performance.read-ahead off -kill -TERM $p +kill -SIGTERM $p  #wait for dd to exit  wait  > /dev/null 2>&1  #Goal is to check if there is permanent FOOL changelog  sleep 5 -EXPECT "0x000000000000000000000000" afr_get_changelog_xattr $B0/r2_0/a trusted.afr.dirty -EXPECT "0x000000000000000000000000" afr_get_changelog_xattr $B0/r2_1/a trusted.afr.dirty +EXPECT "0x000000000000000000000000" afr_get_changelog_xattr $B0/r2_0/a trusted.afr.$V0-client-0 +EXPECT "0x000000000000000000000000" afr_get_changelog_xattr $B0/r2_0/a trusted.afr.$V0-client-1 +EXPECT "0x000000000000000000000000" afr_get_changelog_xattr $B0/r2_1/a trusted.afr.$V0-client-0 +EXPECT "0x000000000000000000000000" afr_get_changelog_xattr $B0/r2_1/a trusted.afr.$V0-client-1  cleanup; diff --git a/tests/bugs/bug-906646.t b/tests/bugs/bug-906646.t index b2cbf6bc3..0e6a3bcb6 100644 --- a/tests/bugs/bug-906646.t +++ b/tests/bugs/bug-906646.t @@ -84,7 +84,7 @@ TEST $CLI volume start $V0 force  EXPECT_WITHIN 20 "1" afr_child_up_status $V0 `expr $brick_id - 1` -cat $pth >/dev/null +stat $pth  # check backends - xattr should not be present anywhere  EXPECT 1 xattr_query_check ${backend_paths_array[0]} "trusted.name" diff --git a/tests/bugs/bug-913051.t b/tests/bugs/bug-913051.t index 9a59424f4..69e90cf66 100644 --- a/tests/bugs/bug-913051.t +++ b/tests/bugs/bug-913051.t @@ -48,8 +48,8 @@ EXPECT "N" gf_check_file_opened_in_brick $V0 $H0 $B0/${V0}0 $B0/${V0}0/dir/b  #attempt self-heal so that the files are created on brick-0 -TEST dd if=$M0/dir/a of=/dev/null bs=1M -TEST dd if=$M0/dir/b of=/dev/null bs=1M +TEST ls -l $M0/dir/a +TEST ls -l $M0/dir/b  #trigger writev for attempting open-fd-fix in afr  TEST fd_write $wfd "open sesame" diff --git a/tests/bugs/bug-913544.t b/tests/bugs/bug-913544.t index db28ca814..790bc0898 100644 --- a/tests/bugs/bug-913544.t +++ 
b/tests/bugs/bug-913544.t @@ -17,7 +17,7 @@ TEST touch a  #simulate no-changelog data split-brain  echo "abc" > $B0/${V0}1/a  echo "abcd" > $B0/${V0}0/a -TEST truncate -s 0 a +TEST ! truncate -s 0 a  TEST ls  cd diff --git a/tests/bugs/bug-918437-sh-mtime.t b/tests/bugs/bug-918437-sh-mtime.t index 11155ad16..080956f51 100644 --- a/tests/bugs/bug-918437-sh-mtime.t +++ b/tests/bugs/bug-918437-sh-mtime.t @@ -38,12 +38,7 @@ TEST $CLI volume start $V0 force  EXPECT_WITHIN 20 "1" afr_child_up_status $V0 0  EXPECT_WITHIN 20 "1" afr_child_up_status $V0 1 -TEST $CLI volume set $V0 cluster.self-heal-daemon on -sleep 1 -TEST gluster volume heal $V0 full - -size=`stat -c '%s' /etc/passwd` -EXPECT_WITHIN 60 $size stat -c '%s' $B0/gfs0/brick01/a +find $M0 | xargs stat 1>/dev/null  TEST modify_atstamp1=$(get_mtime $B0/gfs0/brick01/a)  TEST modify_atstamp2=$(get_mtime $B0/gfs0/brick02/a) diff --git a/tests/bugs/bug-977797.t b/tests/bugs/bug-977797.t index f2252159a..08cdbe8f1 100755 --- a/tests/bugs/bug-977797.t +++ b/tests/bugs/bug-977797.t @@ -54,7 +54,7 @@ TEST chmod 757 $M0/a/file  TEST $CLI volume start $V0 force  EXPECT_WITHIN 20 "1" afr_child_up_status $V0 1; -TEST dd if=$M0/a/file of=/dev/null bs=1M +TEST ls   -l $M0/a/file  b1c0dir=$(afr_get_specific_changelog_xattr $B0/$V0"1"/a \            trusted.afr.$V0-client-0 "entry") @@ -75,15 +75,34 @@ b2c0f=$(afr_get_specific_changelog_xattr $B0/$V0"2"/a/file \  b2c1f=$(afr_get_specific_changelog_xattr $B0/$V0"2"/a/file \          trusted.afr.$V0-client-1 "data") -EXPECT "00000000|^$" echo $b1c0f -EXPECT "00000000|^$" echo $b1c1f -EXPECT "00000000|^$" echo $b2c0f -EXPECT "00000000|^$" echo $b2c1f - -EXPECT "00000000|^$" echo $b1c0dir -EXPECT "00000000|^$" echo $b1c1dir -EXPECT "00000000|^$" echo $b2c0dir -EXPECT "00000000|^$" echo $b2c1dir +EXPECT "00000000" echo $b1c0f +EXPECT "00000000" echo $b1c1f +EXPECT "00000000" echo $b2c0f +EXPECT "00000000" echo $b2c1f + +EXPECT "00000000" echo $b1c0dir +EXPECT "00000000" echo $b1c1dir 
+EXPECT "00000000" echo $b2c0dir +EXPECT "00000000" echo $b2c1dir + +contains() { +    string="$1" +    substring="$2" +    var="-1" +    if test "${string#*$substring}" != "$string" +    then +        var="0"    # $substring is in $string +    else +        var="1"    # $substring is not in $string +    fi +    echo $var +} + +var1=$(cat $M0/a/file 2>&1) +var2="Input/output error" + + +EXPECT "0" contains "$var1" "$var2"  ## Finish up  TEST $CLI volume stop $V0; diff --git a/tests/volume.rc b/tests/volume.rc index 9e4843e06..4cfb89f27 100644 --- a/tests/volume.rc +++ b/tests/volume.rc @@ -169,7 +169,7 @@ function check_option_help_presence {  function afr_get_changelog_xattr {          local file=$1          local xkey=$2 -        getfattr -n $xkey -e hex $file 2>/dev/null | grep "$xkey" | cut -f2 -d'=' +        getfattr -n $xkey -e hex $file 2>/dev/null | grep "client-" | cut -f2 -d'='  }  function afr_get_pending_heal_count { diff --git a/xlators/cluster/Makefile.am b/xlators/cluster/Makefile.am index 0990822a7..d891181a1 100644 --- a/xlators/cluster/Makefile.am +++ b/xlators/cluster/Makefile.am @@ -1,3 +1,3 @@ -SUBDIRS = stripe afr dht +SUBDIRS = stripe afr-v1 dht -CLEANFILES =  +CLEANFILES = diff --git a/xlators/cluster/afr-v1/Makefile.am b/xlators/cluster/afr-v1/Makefile.am new file mode 100644 index 000000000..a985f42a8 --- /dev/null +++ b/xlators/cluster/afr-v1/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/xlators/cluster/afr-v1/src/Makefile.am b/xlators/cluster/afr-v1/src/Makefile.am new file mode 100644 index 000000000..35d18a6c0 --- /dev/null +++ b/xlators/cluster/afr-v1/src/Makefile.am @@ -0,0 +1,37 @@ +xlator_LTLIBRARIES = afr.la pump.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster + +afr_common_source = afr-dir-read.c afr-dir-write.c afr-inode-read.c \ +	afr-inode-write.c afr-open.c afr-transaction.c afr-self-heal-data.c \ +	afr-self-heal-common.c afr-self-heal-metadata.c afr-self-heal-entry.c \ +	
afr-self-heal-algorithm.c afr-lk-common.c afr-self-heald.c \ +	$(top_builddir)/xlators/lib/src/libxlator.c + +afr_la_LDFLAGS = -module -avoid-version +afr_la_SOURCES = $(afr_common_source) afr.c +afr_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +pump_la_LDFLAGS = -module -avoid-version +pump_la_SOURCES =  $(afr_common_source) pump.c +pump_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = afr.h afr-transaction.h afr-inode-write.h afr-inode-read.h \ +	afr-dir-read.h afr-dir-write.h afr-self-heal.h afr-self-heal-common.h \ +	afr-self-heal-algorithm.h pump.h afr-mem-types.h afr-common.c \ +	afr-self-heald.h $(top_builddir)/xlators/lib/src/libxlator.h \ +	$(top_builddir)/glusterfsd/src/glusterfsd.h + +AM_CPPFLAGS = $(GF_CPPFLAGS) \ +	-I$(top_srcdir)/libglusterfs/src -I$(top_srcdir)/xlators/lib/src \ +	-I$(top_srcdir)/rpc/rpc-lib/src + +AM_CFLAGS = -Wall $(GF_CFLAGS) + +CLEANFILES = + +uninstall-local: +	rm -f $(DESTDIR)$(xlatordir)/replicate.so +	rm -f $(DESTDIR)$(xlatordir)/pump.so + +install-data-hook: +	ln -sf afr.so $(DESTDIR)$(xlatordir)/replicate.so diff --git a/xlators/cluster/afr-v1/src/afr-common.c b/xlators/cluster/afr-v1/src/afr-common.c new file mode 100644 index 000000000..224d30546 --- /dev/null +++ b/xlators/cluster/afr-v1/src/afr-common.c @@ -0,0 +1,4603 @@ +/* +  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> +  This file is part of GlusterFS. + +  This file is licensed to you under your choice of the GNU Lesser +  General Public License, version 3 or any later version (LGPLv3 or +  later), or the GNU General Public License, version 2 (GPLv2), in all +  cases as published by the Free Software Foundation. 
+*/ + +#include <libgen.h> +#include <unistd.h> +#include <fnmatch.h> +#include <sys/time.h> +#include <stdlib.h> +#include <signal.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "afr.h" +#include "dict.h" +#include "xlator.h" +#include "hashfn.h" +#include "logging.h" +#include "stack.h" +#include "list.h" +#include "call-stub.h" +#include "defaults.h" +#include "common-utils.h" +#include "compat-errno.h" +#include "compat.h" +#include "byte-order.h" +#include "statedump.h" +#include "inode.h" + +#include "fd.h" + +#include "afr-inode-read.h" +#include "afr-inode-write.h" +#include "afr-dir-read.h" +#include "afr-dir-write.h" +#include "afr-transaction.h" +#include "afr-self-heal.h" +#include "afr-self-heal-common.h" +#include "afr-self-heald.h" +#include "pump.h" + +#define AFR_ICTX_OPENDIR_DONE_MASK     0x0000000100000000ULL +#define AFR_ICTX_READ_CHILD_MASK       0x00000000FFFFFFFFULL +#define AFR_STATISTICS_HISTORY_SIZE    50 +int +afr_lookup_done_success_action (call_frame_t *frame, xlator_t *this, +                                gf_boolean_t fail_conflict); +void +afr_children_copy (int32_t *dst, int32_t *src, unsigned int child_count) +{ +        int     i = 0; + +        for (i = 0; i < child_count; i++) +                dst[i] = src[i]; +} + +void +afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req, const char *path) +{ +        int             i           = 0; +        afr_private_t   *priv       = NULL; +        int             ret         = 0; + +        priv   = this->private; + +        for (i = 0; i < priv->child_count; i++) { +                ret = dict_set_uint64 (xattr_req, priv->pending_key[i], +                                       3 * sizeof(int32_t)); +                if (ret < 0) +                        gf_log (this->name, GF_LOG_WARNING, +                                "%s: Unable to set dict value for %s", +                                path, priv->pending_key[i]); +  
              /* 3 = data+metadata+entry */ +        } +        ret = dict_set_int32 (xattr_req, GF_GFIDLESS_LOOKUP, 1); +        if (ret) { +                gf_log (this->name, GF_LOG_DEBUG, "%s: failed to set gfidless " +                        "lookup", path); +        } +} + +int +afr_lookup_xattr_req_prepare (afr_local_t *local, xlator_t *this, +                              dict_t *xattr_req, loc_t *loc, void **gfid_req) +{ +        int     ret = -ENOMEM; + +        GF_ASSERT (gfid_req); + +        *gfid_req = NULL; +        local->xattr_req = dict_new (); +        if (!local->xattr_req) +                goto out; +        if (xattr_req) +                dict_copy (xattr_req, local->xattr_req); + +        afr_xattr_req_prepare (this, local->xattr_req, loc->path); +        ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_INODELK_COUNT, 0); +        if (ret < 0) { +                gf_log (this->name, GF_LOG_WARNING, +                        "%s: Unable to set dict value for %s", +                        loc->path, GLUSTERFS_INODELK_COUNT); +        } +        ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_ENTRYLK_COUNT, 0); +        if (ret < 0) { +                gf_log (this->name, GF_LOG_WARNING, +                        "%s: Unable to set dict value for %s", +                        loc->path, GLUSTERFS_ENTRYLK_COUNT); +        } + +        ret = dict_set_uint32 (local->xattr_req, GLUSTERFS_PARENT_ENTRYLK, 0); +        if (ret < 0) { +                gf_log (this->name, GF_LOG_WARNING, +                        "%s: Unable to set dict value for %s", +                        loc->path, GLUSTERFS_PARENT_ENTRYLK); +        } + +        ret = dict_get_ptr (local->xattr_req, "gfid-req", gfid_req); +        if (ret) { +                gf_log (this->name, GF_LOG_DEBUG, +                        "%s: failed to get the gfid from dict", loc->path); +                *gfid_req = NULL; +        } else { +                if (loc->parent != NULL) +                     
   dict_del (local->xattr_req, "gfid-req"); +        } +        ret = 0; +out: +        return ret; +} + +void +afr_lookup_save_gfid (uuid_t dst, void* new, const loc_t *loc) +{ +        inode_t  *inode = NULL; + +        inode = loc->inode; +        if (inode && !uuid_is_null (inode->gfid)) +                uuid_copy (dst, inode->gfid); +        else if (!uuid_is_null (loc->gfid)) +                uuid_copy (dst, loc->gfid); +        else if (new && !uuid_is_null (new)) +                uuid_copy (dst, new); +} + +int +afr_errno_count (int32_t *children, int *child_errno, +                 unsigned int child_count, int32_t op_errno) +{ +        int i = 0; +        int errno_count = 0; +        int child = 0; + +        for (i = 0; i < child_count; i++) { +                if (children) { +                        child = children[i]; +                        if (child == -1) +                                break; +                } else { +                        child = i; +                } +                if (child_errno[child] == op_errno) +                        errno_count++; +        } +        return errno_count; +} + +int32_t +afr_set_dict_gfid (dict_t *dict, uuid_t gfid) +{ +        int ret       = 0; +        uuid_t *pgfid = NULL; + +        GF_ASSERT (gfid); + +        pgfid = GF_CALLOC (1, sizeof (uuid_t), gf_common_mt_char); +        if (!pgfid) { +                ret = -1; +                goto out; +        } + +        uuid_copy (*pgfid, gfid); + +        ret = dict_set_dynptr (dict, "gfid-req", pgfid, sizeof (uuid_t)); +        if (ret) +                gf_log (THIS->name, GF_LOG_ERROR, "gfid set failed"); + +out: +        if (ret && pgfid) +                GF_FREE (pgfid); + +        return ret; +} + +void +afr_inode_ctx_destroy (afr_inode_ctx_t *ctx) +{ +        if (!ctx) +                return; +        GF_FREE (ctx->fresh_children); +        GF_FREE (ctx); +} + +afr_inode_ctx_t* +__afr_inode_ctx_get (inode_t *inode, xlator_t *this) +{ +     
   int             ret      = 0; +        uint64_t        ctx_addr = 0; +        afr_inode_ctx_t *ctx     = NULL; +        afr_private_t   *priv    = NULL; + +        priv = this->private; +        ret = __inode_ctx_get (inode, this, &ctx_addr); +        if (ret < 0) +                ctx_addr = 0; +        if (ctx_addr != 0) { +                ctx = (afr_inode_ctx_t*) (long) ctx_addr; +                goto out; +        } +        ctx = GF_CALLOC (1, sizeof (*ctx), +                         gf_afr_mt_inode_ctx_t); +        if (!ctx) +                goto fail; +        ctx->fresh_children = GF_CALLOC (priv->child_count, +                                         sizeof (*ctx->fresh_children), +                                         gf_afr_mt_int32_t); +        if (!ctx->fresh_children) +                goto fail; +        ret = __inode_ctx_put (inode, this, (uint64_t)ctx); +        if (ret) { +                gf_log_callingfn (this->name, GF_LOG_ERROR, "failed to " +                                  "set the inode ctx (%s)", +                                  uuid_utoa (inode->gfid)); +                goto fail; +        } + +out: +        return ctx; + +fail: +        afr_inode_ctx_destroy (ctx); +        return NULL; +} + +afr_inode_ctx_t* +afr_inode_ctx_get (inode_t *inode, xlator_t *this) +{ +        afr_inode_ctx_t *ctx = NULL; + +        LOCK (&inode->lock); +        { +                ctx = __afr_inode_ctx_get (inode, this); +        } +        UNLOCK (&inode->lock); +        return ctx; +} + +void +afr_inode_get_ctx_params (xlator_t *this, inode_t *inode, +                          afr_inode_params_t *params) +{ +        GF_ASSERT (inode); +        GF_ASSERT (params); + +        afr_inode_ctx_t *ctx = NULL; +        afr_private_t   *priv = NULL; +        int             i = 0; +        int32_t         read_child = -1; +        int32_t         *fresh_children = NULL; + +        priv = this->private; +        LOCK (&inode->lock); +        { +                
ctx = __afr_inode_ctx_get (inode, this); +                if (!ctx) +                        goto unlock; +                switch (params->op) { +                case AFR_INODE_GET_READ_CTX: +                        fresh_children = params->u.read_ctx.children; +                        read_child = (int32_t)(ctx->masks & +                                               AFR_ICTX_READ_CHILD_MASK); +                        params->u.read_ctx.read_child = read_child; +                        if (!fresh_children) +                                goto unlock; +                        for (i = 0; i < priv->child_count; i++) +                                fresh_children[i] = ctx->fresh_children[i]; +                        break; +                case AFR_INODE_GET_OPENDIR_DONE: +                        params->u.value = _gf_false; +                        if (ctx->masks & AFR_ICTX_OPENDIR_DONE_MASK) +                                params->u.value = _gf_true; +                        break; +                default: +                        GF_ASSERT (0); +                        break; +                } +        } +unlock: +        UNLOCK (&inode->lock); +} + +gf_boolean_t +afr_is_split_brain (xlator_t *this, inode_t *inode) +{ +        afr_inode_ctx_t *ctx = NULL; +        gf_boolean_t    spb  = _gf_false; + +        ctx = afr_inode_ctx_get (inode, this); +        if (!ctx) +                goto out; +        if ((ctx->mdata_spb == SPB) || (ctx->data_spb == SPB)) +                spb = _gf_true; +out: +        return spb; +} + +gf_boolean_t +afr_is_opendir_done (xlator_t *this, inode_t *inode) +{ +        afr_inode_params_t params = {0}; + +        params.op = AFR_INODE_GET_OPENDIR_DONE; +        afr_inode_get_ctx_params (this, inode, &params); +        return params.u.value; +} + +int32_t +afr_inode_get_read_ctx (xlator_t *this, inode_t *inode, int32_t *fresh_children) +{ +        afr_inode_params_t      params = {0}; + +        params.op = AFR_INODE_GET_READ_CTX; +      
  params.u.read_ctx.children = fresh_children; +        afr_inode_get_ctx_params (this, inode, &params); +        return params.u.read_ctx.read_child; +} + +void +afr_inode_ctx_set_read_child (afr_inode_ctx_t *ctx, int32_t read_child) +{ +        uint64_t        remaining_mask = 0; +        uint64_t        mask         = 0; + +        remaining_mask = (~AFR_ICTX_READ_CHILD_MASK & ctx->masks); +        mask = (AFR_ICTX_READ_CHILD_MASK & read_child); +        ctx->masks = remaining_mask | mask; +} + +void +afr_inode_ctx_set_read_ctx (afr_inode_ctx_t *ctx, int32_t read_child, +                            int32_t *fresh_children, int32_t child_count) +{ +        int             i            = 0; + +        afr_inode_ctx_set_read_child (ctx, read_child); +        for (i = 0; i < child_count; i++) { +                if (fresh_children) +                        ctx->fresh_children[i] = fresh_children[i]; +                else +                        ctx->fresh_children[i] = -1; +        } +} + +void +afr_inode_ctx_rm_stale_children (afr_inode_ctx_t *ctx, int32_t *stale_children, +                                 int32_t child_count) +{ +        int             i            = 0; +        int32_t         read_child   = -1; + +        GF_ASSERT (stale_children); +        for (i = 0; i < child_count; i++) { +                if (stale_children[i] == -1) +                        break; +                afr_children_rm_child (ctx->fresh_children, +                                       stale_children[i], child_count); +        } +        read_child = (int32_t)(ctx->masks & AFR_ICTX_READ_CHILD_MASK); +        if (!afr_is_child_present (ctx->fresh_children, child_count, +                                   read_child)) +                afr_inode_ctx_set_read_child (ctx, ctx->fresh_children[0]); +} + +void +afr_inode_ctx_set_opendir_done (afr_inode_ctx_t *ctx) +{ +        uint64_t        remaining_mask = 0; +        uint64_t        mask = 0; + +        remaining_mask = 
(~AFR_ICTX_OPENDIR_DONE_MASK & ctx->masks); +        mask = (0xFFFFFFFFFFFFFFFFULL & AFR_ICTX_OPENDIR_DONE_MASK); +        ctx->masks = remaining_mask | mask; +} + +void +afr_inode_set_ctx_params (xlator_t *this, inode_t *inode, +                          afr_inode_params_t *params) +{ +        GF_ASSERT (inode); +        GF_ASSERT (params); + +        afr_inode_ctx_t *ctx            = NULL; +        afr_private_t   *priv           = NULL; +        int32_t         read_child      = -1; +        int32_t         *fresh_children = NULL; +        int32_t         *stale_children = NULL; + +        priv = this->private; +        LOCK (&inode->lock); +        { +                ctx = __afr_inode_ctx_get (inode, this); +                if (!ctx) +                        goto unlock; +                switch (params->op) { +                case AFR_INODE_SET_READ_CTX: +                        read_child = params->u.read_ctx.read_child; +                        fresh_children = params->u.read_ctx.children; +                        afr_inode_ctx_set_read_ctx (ctx, read_child, +                                                    fresh_children, +                                                    priv->child_count); +                        break; +                case AFR_INODE_RM_STALE_CHILDREN: +                        stale_children = params->u.read_ctx.children; +                        afr_inode_ctx_rm_stale_children (ctx, +                                                         stale_children, +                                                         priv->child_count); +                        break; +                case AFR_INODE_SET_OPENDIR_DONE: +                        afr_inode_ctx_set_opendir_done (ctx); +                        break; +                default: +                        GF_ASSERT (0); +                        break; +                } +        } +unlock: +        UNLOCK (&inode->lock); +} + +void +afr_set_split_brain (xlator_t *this, inode_t 
*inode, afr_spb_state_t mdata_spb, +                     afr_spb_state_t data_spb) +{ +        afr_inode_ctx_t *ctx = NULL; + +        ctx = afr_inode_ctx_get (inode, this); +        if (mdata_spb != DONT_KNOW) +                ctx->mdata_spb = mdata_spb; +        if (data_spb != DONT_KNOW) +                ctx->data_spb = data_spb; +} + +void +afr_set_opendir_done (xlator_t *this, inode_t *inode) +{ +        afr_inode_params_t params = {0}; + +        params.op = AFR_INODE_SET_OPENDIR_DONE; +        afr_inode_set_ctx_params (this, inode, &params); +} + +void +afr_inode_set_read_ctx (xlator_t *this, inode_t *inode, int32_t read_child, +                        int32_t *fresh_children) +{ +        afr_inode_params_t params = {0}; +        afr_private_t      *priv  = NULL; + +        priv = this->private; +        GF_ASSERT (read_child >= 0); +        GF_ASSERT (fresh_children); +        GF_ASSERT (afr_is_child_present (fresh_children, priv->child_count, +                                         read_child)); + +        params.op = AFR_INODE_SET_READ_CTX; +        params.u.read_ctx.read_child     = read_child; +        params.u.read_ctx.children = fresh_children; +        afr_inode_set_ctx_params (this, inode, &params); +} + +void +afr_inode_rm_stale_children (xlator_t *this, inode_t *inode, +                             int32_t *stale_children) +{ +        afr_inode_params_t params = {0}; + +        GF_ASSERT (stale_children); + +        params.op = AFR_INODE_RM_STALE_CHILDREN; +        params.u.read_ctx.children = stale_children; +        afr_inode_set_ctx_params (this, inode, &params); +} + +gf_boolean_t +afr_is_source_child (int32_t *sources, int32_t child_count, int32_t child) +{ +        gf_boolean_t             source_xattrs = _gf_false; + +        GF_ASSERT (child < child_count); + +        if ((child >= 0) && (child < child_count) && +             sources[child]) { +                source_xattrs = _gf_true; +        } +        return source_xattrs; +} + +gf_boolean_t 
+afr_is_child_present (int32_t *success_children, int32_t child_count, +                      int32_t child) +{ +        gf_boolean_t             success_child = _gf_false; +        int                      i = 0; + +        GF_ASSERT (child < child_count); + +        for (i = 0; i < child_count; i++) { +                if (success_children[i] == -1) +                        break; +                if (child == success_children[i]) { +                        success_child = _gf_true; +                        break; +                } +        } +        return success_child; +} + +gf_boolean_t +afr_is_read_child (int32_t *success_children, int32_t *sources, +                   int32_t child_count, int32_t child) +{ +        gf_boolean_t             success_child = _gf_false; +        gf_boolean_t             source        = _gf_false; + +        if (child < 0) { +                return _gf_false; +        } + +        GF_ASSERT (success_children); +        GF_ASSERT (child_count > 0); + +        success_child = afr_is_child_present (success_children, child_count, +                                              child); +        if (!success_child) +                goto out; +        if (NULL == sources) { +                source = _gf_true; +                goto out; +        } +        source = afr_is_source_child (sources, child_count, child); +out: +        return (success_child && source); +} + +int32_t +afr_hash_child (int32_t *success_children, int32_t child_count, +                unsigned int hmode, uuid_t gfid) +{ +        uuid_t  gfid_copy = {0,}; +        pid_t pid; + +        if (!hmode) { +                return -1; +        } + +        if (gfid) { +               uuid_copy(gfid_copy,gfid); +        } +        if (hmode > 1) { +                /* +                 * Why getpid?  Because it's one of the cheapest calls +                 * available - faster than gethostname etc. 
- and returns a +                 * constant-length value that's sure to be shorter than a UUID. +                 * It's still very unlikely to be the same across clients, so +                 * it still provides good mixing.  We're not trying for +                 * perfection here.  All we need is a low probability that +                 * multiple clients won't converge on the same subvolume. +                 */ +                pid = getpid(); +                memcpy (gfid_copy, &pid, sizeof(pid)); +        } + +        return SuperFastHash((char *)gfid_copy, +                             sizeof(gfid_copy)) % child_count; +} + +/* If sources is NULL the xattrs are assumed to be of source for all + * success_children. + */ +int +afr_select_read_child_from_policy (int32_t *success_children, +                                   int32_t child_count, int32_t prev_read_child, +                                   int32_t config_read_child, int32_t *sources, +                                   unsigned int hmode, uuid_t gfid) +{ +        int32_t                  read_child   = -1; +        int                      i            = 0; + +        GF_ASSERT (success_children); + +        read_child = config_read_child; +        if (afr_is_read_child (success_children, sources, child_count, +                               read_child)) +                goto out; + +        read_child = prev_read_child; +        if (afr_is_read_child (success_children, sources, child_count, +                               read_child)) +                goto out; + +        read_child = afr_hash_child (success_children, child_count, +                                     hmode, gfid); +        if (afr_is_read_child (success_children, sources, child_count, +                               read_child)) { +                goto out; +        } + +        for (i = 0; i < child_count; i++) { +                read_child = success_children[i]; +                if (read_child < 0) +                        
break; +                if (afr_is_read_child (success_children, sources, child_count, +                                       read_child)) +                        goto out; +        } +        read_child = -1; + +out: +        return read_child; +} + +/* This function should be used when all the success_children are sources + */ +void +afr_set_read_ctx_from_policy (xlator_t *this, inode_t *inode, +                              int32_t *fresh_children, int32_t prev_read_child, +                              int32_t config_read_child, uuid_t gfid) +{ +        int                      read_child = -1; +        afr_private_t            *priv = NULL; + +        priv = this->private; +        read_child = afr_select_read_child_from_policy (fresh_children, +                                                        priv->child_count, +                                                        prev_read_child, +                                                        config_read_child, +                                                        NULL, +                                                        priv->hash_mode, gfid); +        if (read_child >= 0) +                afr_inode_set_read_ctx (this, inode, read_child, +                                        fresh_children); +} + +/* afr_next_call_child () + * This is a common function used by all the read-type fops + * This function should not be called with the inode's read_children array. + * The fop's handler should make a copy of the inode's read_children, + * preferred read_child into the local vars, because while this function is + * in execution there is a chance for inode's read_ctx to change. 
+ */ +int32_t +afr_next_call_child (int32_t *fresh_children, unsigned char *child_up, +                     size_t child_count, int32_t *last_index, +                     int32_t read_child) +{ +        int             next_index      = 0; +        int32_t         next_call_child = -1; + +        GF_ASSERT (last_index); + +        next_index = *last_index; +retry: +        next_index++; +        if ((next_index >= child_count) || +           (fresh_children[next_index] == -1)) +                goto out; +        if ((fresh_children[next_index] == read_child) || +           (!child_up[fresh_children[next_index]])) +                goto retry; +        *last_index = next_index; +        next_call_child = fresh_children[next_index]; +out: +        return next_call_child; +} + + /* This function should not be called with the inode's read_children array. + * The fop's handler should make a copy of the inode's read_children, + * preferred read_child into the local vars, because while this function is + * in execution there is a chance for inode's read_ctx to change. 
+ */ +int32_t +afr_get_call_child (xlator_t *this, unsigned char *child_up, int32_t read_child, +                    int32_t *fresh_children, +                    int32_t *call_child, int32_t *last_index) +{ +        int             ret   = 0; +        afr_private_t   *priv = NULL; +        int             i     = 0; + +        GF_ASSERT (child_up); +        GF_ASSERT (call_child); +        GF_ASSERT (last_index); +        GF_ASSERT (fresh_children); + +        if (read_child < 0) { +                ret = -EIO; +                goto out; +        } +        priv = this->private; +        *call_child = -1; +        *last_index = -1; + +        if (child_up[read_child]) { +                *call_child = read_child; +        } else { +                for (i = 0; i < priv->child_count; i++) { +                        if (fresh_children[i] == -1) +                                break; +                        if (child_up[fresh_children[i]]) { +                                *call_child = fresh_children[i]; +                                ret = 0; +                                break; +                        } +                } + +                if (*call_child == -1) { +                        ret = -ENOTCONN; +                        goto out; +                } + +                *last_index = i; +        } +out: +        gf_log (this->name, GF_LOG_DEBUG, "Returning %d, call_child: %d, " +                "last_index: %d", ret, *call_child, *last_index); +        return ret; +} + +void +afr_reset_xattr (dict_t **xattr, unsigned int child_count) +{ +        unsigned int i = 0; + +        if (!xattr) +                goto out; +        for (i = 0; i < child_count; i++) { +                if (xattr[i]) { +                        dict_unref (xattr[i]); +                        xattr[i] = NULL; +                } +        } +out: +        return; +} + +void +afr_xattr_array_destroy (dict_t **xattr, unsigned int child_count) +{ +        afr_reset_xattr (xattr, 
child_count); +        GF_FREE (xattr); +} + +void +afr_local_sh_cleanup (afr_local_t *local, xlator_t *this) +{ +        afr_self_heal_t *sh = NULL; +        afr_private_t   *priv = NULL; + +        sh = &local->self_heal; +        priv = this->private; + +        if (sh->data_sh_info && strcmp (sh->data_sh_info, "")) +                GF_FREE (sh->data_sh_info); + +        if (sh->metadata_sh_info && strcmp (sh->metadata_sh_info, "")) +                GF_FREE (sh->metadata_sh_info); + +        GF_FREE (sh->buf); + +        GF_FREE (sh->parentbufs); + +        if (sh->inode) +                inode_unref (sh->inode); + +        afr_xattr_array_destroy (sh->xattr, priv->child_count); + +        GF_FREE (sh->child_errno); + +        afr_matrix_cleanup (sh->pending_matrix, priv->child_count); +        afr_matrix_cleanup (sh->delta_matrix, priv->child_count); + +        GF_FREE (sh->sources); + +        GF_FREE (sh->success); + +        GF_FREE (sh->locked_nodes); + +        if (sh->healing_fd) { +                fd_unref (sh->healing_fd); +                sh->healing_fd = NULL; +        } + +        GF_FREE ((char *)sh->linkname); + +        GF_FREE (sh->success_children); + +        GF_FREE (sh->fresh_children); + +        GF_FREE (sh->fresh_parent_dirs); + +        loc_wipe (&sh->parent_loc); +        loc_wipe (&sh->lookup_loc); + +        GF_FREE (sh->checksum); + +        GF_FREE (sh->write_needed); +        if (sh->healing_fd) +                fd_unref (sh->healing_fd); +} + + +void +afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this) +{ +        afr_private_t *priv    = NULL; +        int           i        = 0; + +        priv = this->private; + +        afr_matrix_cleanup (local->pending, priv->child_count); +        afr_matrix_cleanup (local->transaction.txn_changelog, +                            priv->child_count); + +        GF_FREE (local->internal_lock.locked_nodes); + +        for (i = 0; local->internal_lock.inodelk[i].domain; i++) { +    
            GF_FREE (local->internal_lock.inodelk[i].locked_nodes); +        } + +        GF_FREE (local->internal_lock.lower_locked_nodes); + +        afr_entry_lockee_cleanup (&local->internal_lock); + +        GF_FREE (local->transaction.pre_op); +        GF_FREE (local->transaction.eager_lock); + +        GF_FREE (local->transaction.basename); +        GF_FREE (local->transaction.new_basename); + +        loc_wipe (&local->transaction.parent_loc); +        loc_wipe (&local->transaction.new_parent_loc); + +        GF_FREE (local->transaction.postop_piggybacked); +} + + +void +afr_local_cleanup (afr_local_t *local, xlator_t *this) +{ +        afr_private_t * priv = NULL; + +        if (!local) +                return; + +        afr_local_sh_cleanup (local, this); + +        afr_local_transaction_cleanup (local, this); + +        priv = this->private; + +        loc_wipe (&local->loc); +        loc_wipe (&local->newloc); + +        if (local->fd) +                fd_unref (local->fd); + +        if (local->xattr_req) +                dict_unref (local->xattr_req); + +        if (local->dict) +                dict_unref (local->dict); + +	GF_FREE(local->replies); + +        GF_FREE (local->child_up); + +        GF_FREE (local->child_errno); + +        GF_FREE (local->fresh_children); + +        { /* lookup */ +                if (local->cont.lookup.xattrs) { +                        afr_reset_xattr (local->cont.lookup.xattrs, +                                         priv->child_count); +                        GF_FREE (local->cont.lookup.xattrs); +                        local->cont.lookup.xattrs = NULL; +                } + +                if (local->cont.lookup.xattr) { +                        dict_unref (local->cont.lookup.xattr); +                } + +                if (local->cont.lookup.inode) { +                        inode_unref (local->cont.lookup.inode); +                } + +                GF_FREE (local->cont.lookup.postparents); + +              
  GF_FREE (local->cont.lookup.bufs); + +                GF_FREE (local->cont.lookup.success_children); + +                GF_FREE (local->cont.lookup.sources); +                afr_matrix_cleanup (local->cont.lookup.pending_matrix, +                                    priv->child_count); +        } + +        { /* getxattr */ +                GF_FREE (local->cont.getxattr.name); +        } + +        { /* lk */ +                GF_FREE (local->cont.lk.locked_nodes); +        } + +        { /* create */ +                if (local->cont.create.fd) +                        fd_unref (local->cont.create.fd); +                if (local->cont.create.params) +                        dict_unref (local->cont.create.params); +        } + +        { /* mknod */ +                if (local->cont.mknod.params) +                        dict_unref (local->cont.mknod.params); +        } + +        { /* mkdir */ +                if (local->cont.mkdir.params) +                        dict_unref (local->cont.mkdir.params); +        } + +        { /* symlink */ +                if (local->cont.symlink.params) +                        dict_unref (local->cont.symlink.params); +        } + +        { /* writev */ +                GF_FREE (local->cont.writev.vector); +        } + +        { /* setxattr */ +                if (local->cont.setxattr.dict) +                        dict_unref (local->cont.setxattr.dict); +        } + +        { /* fsetxattr */ +                if (local->cont.fsetxattr.dict) +                        dict_unref (local->cont.fsetxattr.dict); +        } + +        { /* removexattr */ +                GF_FREE (local->cont.removexattr.name); +        } +        { /* xattrop */ +                if (local->cont.xattrop.xattr) +                        dict_unref (local->cont.xattrop.xattr); +        } +        { /* fxattrop */ +                if (local->cont.fxattrop.xattr) +                        dict_unref (local->cont.fxattrop.xattr); +        } +        { /* 
symlink */ +                GF_FREE (local->cont.symlink.linkpath); +        } + +        { /* opendir */ +                GF_FREE (local->cont.opendir.checksum); +        } + +        { /* readdirp */ +                if (local->cont.readdir.dict) +                        dict_unref (local->cont.readdir.dict); +        } + +        if (local->xdata_req) +                dict_unref (local->xdata_req); + +        if (local->xdata_rsp) +                dict_unref (local->xdata_rsp); +} + + +int +afr_frame_return (call_frame_t *frame) +{ +        afr_local_t *local = NULL; +        int          call_count = 0; + +        local = frame->local; + +        LOCK (&frame->lock); +        { +                call_count = --local->call_count; +        } +        UNLOCK (&frame->lock); + +        return call_count; +} + +int +afr_set_elem_count_get (unsigned char *elems, int child_count) +{ +        int i   = 0; +        int ret = 0; + +        for (i = 0; i < child_count; i++) +                if (elems[i]) +                        ret++; +        return ret; +} + +/** + * up_children_count - return the number of children that are up + */ + +unsigned int +afr_up_children_count (unsigned char *child_up, unsigned int child_count) +{ +        return afr_set_elem_count_get (child_up, child_count); +} + +unsigned int +afr_locked_children_count (unsigned char *children, unsigned int child_count) +{ +        return afr_set_elem_count_get (children, child_count); +} + +unsigned int +afr_pre_op_done_children_count (unsigned char *pre_op, +                                unsigned int child_count) +{ +        return afr_set_elem_count_get (pre_op, child_count); +} + +gf_boolean_t +afr_is_fresh_lookup (loc_t *loc, xlator_t *this) +{ +        uint64_t          ctx = 0; +        int32_t           ret = 0; + +        GF_ASSERT (loc); +        GF_ASSERT (this); +        GF_ASSERT (loc->inode); + +        ret = inode_ctx_get (loc->inode, this, &ctx); +        if (0 == ret) +                
return _gf_false; +        return _gf_true; +} + +void +afr_update_loc_gfids (loc_t *loc, struct iatt *buf, struct iatt *postparent) +{ +        GF_ASSERT (loc); +        GF_ASSERT (buf); + +        uuid_copy (loc->gfid, buf->ia_gfid); +        if (postparent) +                uuid_copy (loc->pargfid, postparent->ia_gfid); +} + +/* + * Quota size xattrs are not maintained by afr. There is a + * possibility that they differ even when both the directory changelog xattrs + * suggest everything is fine. So if there is at least one 'source' check among + * the sources which has the maximum quota size. Otherwise check among all the + * available ones for maximum quota size. This way if there is a source and + * stale copies it always votes for the 'source'. + * */ + +static void +afr_handle_quota_size (afr_local_t *local, xlator_t *this, +                       dict_t *rsp_dict) +{ +        int32_t       *sources       = NULL; +        dict_t        *xattr         = NULL; +        data_t        *max_data      = NULL; +        int64_t       max_quota_size = -1; +        data_t        *data          = NULL; +        int64_t       *size          = NULL; +        int64_t       quota_size     = -1; +        afr_private_t *priv          = NULL; +        int           i              = 0; +        int           ret            = -1; +        gf_boolean_t  source_present = _gf_false; + +        priv    = this->private; +        sources = local->cont.lookup.sources; + +        if (rsp_dict == NULL) { +                gf_log_callingfn (this->name, GF_LOG_ERROR, "%s: Invalid " +                                  "response dictionary", local->loc.path); +                return; +        } + +        for (i = 0; i < priv->child_count; i++) { +                if (sources[i]) { +                        source_present = _gf_true; +                        break; +                } +        } + +        for (i = 0; i < priv->child_count; i++) { +                /* +                 * If 
there is at least one source lets check +                 * for maximum quota sizes among sources, otherwise take the +                 * maximum of the ones present to be on the safer side. +                 */ +                if (source_present && !sources[i]) +                        continue; + +                xattr = local->cont.lookup.xattrs[i]; +                if (!xattr) +                        continue; + +                data = dict_get (xattr, QUOTA_SIZE_KEY); +                if (!data) +                        continue; + +                size = (int64_t*)data->data; +                quota_size = ntoh64(*size); +                gf_log (this->name, GF_LOG_DEBUG, "%s: %d, size: %"PRId64, +                        local->loc.path, i, quota_size); +                if (quota_size > max_quota_size) { +                        if (max_data) +                                data_unref (max_data); + +                        max_quota_size = quota_size; +                        max_data = data_ref (data); +                } +        } + +        if (max_data) { +                ret = dict_set (rsp_dict, QUOTA_SIZE_KEY, max_data); +                if (ret) { +                        gf_log (this->name, GF_LOG_ERROR, "%s: Failed to set " +                                "quota size", local->loc.path); +                } + +                data_unref (max_data); +        } +} + +int +afr_lookup_build_response_params (afr_local_t *local, xlator_t *this) +{ +        struct iatt     *buf = NULL; +        struct iatt     *postparent = NULL; +        dict_t          **xattr = NULL; +        int32_t         *success_children = NULL; +        int32_t         *sources = NULL; +        afr_private_t   *priv = NULL; +        int32_t         read_child = -1; +        int             ret = 0; +        int             i = 0; + +        GF_ASSERT (local); + +        buf = &local->cont.lookup.buf; +        postparent = &local->cont.lookup.postparent; +        xattr = 
&local->cont.lookup.xattr; +        priv = this->private; + +        read_child = afr_inode_get_read_ctx (this, local->cont.lookup.inode, +                                             local->fresh_children); +        if (read_child < 0) { +                ret = -1; +                goto out; +        } +        success_children = local->cont.lookup.success_children; +        sources = local->cont.lookup.sources; +        memset (sources, 0, sizeof (*sources) * priv->child_count); +        afr_children_intersection_get (local->fresh_children, success_children, +                                       sources, priv->child_count); +        if (!sources[read_child]) { +                read_child = -1; +                for (i = 0; i < priv->child_count; i++) { +                        if (sources[i]) { +                                read_child = i; +                                break; +                        } +                } +        } +        if (read_child < 0) { +                ret = -1; +                goto out; +        } + +        gf_log (this->name, GF_LOG_DEBUG, "Building lookup response from %d", +                read_child); +        if (!*xattr) +                *xattr = dict_ref (local->cont.lookup.xattrs[read_child]); + +        *buf = local->cont.lookup.bufs[read_child]; +        *postparent = local->cont.lookup.postparents[read_child]; + +        if (dict_get (local->xattr_req, QUOTA_SIZE_KEY)) +                afr_handle_quota_size (local, this, *xattr); + +        if (IA_INVAL == local->cont.lookup.inode->ia_type) { +                /* fix for RT #602 */ +                local->cont.lookup.inode->ia_type = buf->ia_type; +        } +out: +        return ret; +} + +static void +afr_lookup_update_lk_counts (afr_local_t *local, xlator_t *this, +                            int child_index, dict_t *xattr) +{ +        uint32_t inodelk_count = 0; +        uint32_t entrylk_count = 0; +        int      ret           = -1; +        uint32_t 
parent_entrylk = 0; + +        GF_ASSERT (local); +        GF_ASSERT (this); +        GF_ASSERT (xattr); +        GF_ASSERT (child_index >= 0); + +        ret = dict_get_uint32 (xattr, GLUSTERFS_INODELK_COUNT, +                               &inodelk_count); +        if (ret == 0) +                local->inodelk_count += inodelk_count; + +        ret = dict_get_uint32 (xattr, GLUSTERFS_ENTRYLK_COUNT, +                               &entrylk_count); +        if (ret == 0) +                local->entrylk_count += entrylk_count; +        ret = dict_get_uint32 (xattr, GLUSTERFS_PARENT_ENTRYLK, +                               &parent_entrylk); +        if (!ret) +                local->cont.lookup.parent_entrylk += parent_entrylk; +} + +/* + * It's important to maintain a commutative property on do_*_self_heal and + * found*; once set, they must not be cleared by a subsequent iteration or + * call, so that they represent a logical OR of all iterations and calls + * regardless of child/key order.  That allows the caller to call us multiple + * times without having to use a separate variable as a "reduce" accumulator. 
+ */ +static void +afr_lookup_set_self_heal_params_by_xattr (afr_local_t *local, xlator_t *this, +                                          dict_t *xattr) +{ +        afr_private_t *priv        = NULL; +        int            i           = 0; +        int            ret         = -1; +        void          *pending_raw = NULL; +        int32_t       *pending     = NULL; + +        GF_ASSERT (local); +        GF_ASSERT (this); +        GF_ASSERT (xattr); + +        priv = this->private; + +        for (i = 0; i < priv->child_count; i++) { +                ret = dict_get_ptr (xattr, priv->pending_key[i], +                                    &pending_raw); +                if (ret != 0) { +                        continue; +                } +                pending = pending_raw; + +                if (pending[AFR_METADATA_TRANSACTION]) { +                        gf_log(this->name, GF_LOG_DEBUG, +                               "metadata self-heal is pending for %s.", +                               local->loc.path); +                        local->self_heal.do_metadata_self_heal = _gf_true; +                } + +                if (pending[AFR_ENTRY_TRANSACTION]) { +                        gf_log(this->name, GF_LOG_DEBUG, +                               "entry self-heal is pending for %s.", +                               local->loc.path); +                        local->self_heal.do_entry_self_heal = _gf_true; +                } + +                if (pending[AFR_DATA_TRANSACTION]) { +                        gf_log(this->name, GF_LOG_DEBUG, +                               "data self-heal is pending for %s.", +                               local->loc.path); +                        local->self_heal.do_data_self_heal = _gf_true; +                } +        } +} + +void +afr_lookup_check_set_metadata_split_brain (afr_local_t *local, xlator_t *this) +{ +        int32_t                  *sources = NULL; +        afr_private_t            *priv = NULL; +        int32_t    
              subvol_status = 0; +        int32_t                  *success_children   = NULL; +        dict_t                   **xattrs = NULL; +        struct iatt              *bufs = NULL; +        int32_t                  **pending_matrix = NULL; + +        priv = this->private; + +        sources = GF_CALLOC (priv->child_count, sizeof (*sources), +                             gf_afr_mt_int32_t); +        if (NULL == sources) +                goto out; +        success_children = local->cont.lookup.success_children; +        xattrs = local->cont.lookup.xattrs; +        bufs = local->cont.lookup.bufs; +        pending_matrix = local->cont.lookup.pending_matrix; +        afr_build_sources (this, xattrs, bufs, pending_matrix, +                           sources, success_children, AFR_METADATA_TRANSACTION, +                           &subvol_status, _gf_false); +        if (subvol_status & SPLIT_BRAIN) +                local->cont.lookup.possible_spb = _gf_true; +out: +        GF_FREE (sources); +} + +static void +afr_detect_self_heal_by_iatt (afr_local_t *local, xlator_t *this, +                            struct iatt *buf, struct iatt *lookup_buf) +{ +        if (PERMISSION_DIFFERS (buf, lookup_buf)) { +                /* mismatching permissions */ +                gf_log (this->name, GF_LOG_DEBUG, +                        "permissions differ for %s ", local->loc.path); +                local->self_heal.do_metadata_self_heal = _gf_true; +        } + +        if (OWNERSHIP_DIFFERS (buf, lookup_buf)) { +                /* mismatching permissions */ +                local->self_heal.do_metadata_self_heal = _gf_true; +                gf_log (this->name, GF_LOG_DEBUG, +                        "ownership differs for %s ", local->loc.path); +        } + +        if (SIZE_DIFFERS (buf, lookup_buf) +            && IA_ISREG (buf->ia_type)) { +                gf_log (this->name, GF_LOG_DEBUG, +                        "size differs for %s ", local->loc.path); +             
   local->self_heal.do_data_self_heal = _gf_true; +        } + +        if (uuid_compare (buf->ia_gfid, lookup_buf->ia_gfid)) { +                /* mismatching gfid */ +                gf_log (this->name, GF_LOG_DEBUG, +                        "%s: gfid different on subvolume", local->loc.path); +        } +} + +static void +afr_detect_self_heal_by_split_brain_status (afr_local_t *local, xlator_t *this) +{ +        gf_boolean_t split_brain = _gf_false; +        afr_self_heal_t *sh = NULL; + +        sh = &local->self_heal; + +        split_brain = afr_is_split_brain (this, local->cont.lookup.inode); +        split_brain = split_brain || local->cont.lookup.possible_spb; +        if ((local->success_count > 0) && split_brain && +            IA_ISREG (local->cont.lookup.inode->ia_type)) { +                sh->force_confirm_spb = _gf_true; +                gf_log (this->name, GF_LOG_DEBUG, +                        "split brain detected during lookup of %s.", +                        local->loc.path); +        } +} + +static void +afr_detect_self_heal_by_lookup_status (afr_local_t *local, xlator_t *this) +{ +        GF_ASSERT (local); +        GF_ASSERT (this); + +        if ((local->success_count > 0) && (local->enoent_count > 0)) { +                local->self_heal.do_metadata_self_heal = _gf_true; +                local->self_heal.do_data_self_heal     = _gf_true; +                local->self_heal.do_entry_self_heal    = _gf_true; +                local->self_heal.do_gfid_self_heal    = _gf_true; +                local->self_heal.do_missing_entry_self_heal    = _gf_true; +                gf_log(this->name, GF_LOG_DEBUG, +                       "entries are missing in lookup of %s.", +                       local->loc.path); +        } + +        return; +} + +gf_boolean_t +afr_can_self_heal_proceed (afr_self_heal_t *sh, afr_private_t *priv) +{ +        GF_ASSERT (sh); +        GF_ASSERT (priv); + +        if (sh->force_confirm_spb) +                return _gf_true; + 
       return (sh->do_gfid_self_heal +                || sh->do_missing_entry_self_heal +                || (afr_data_self_heal_enabled (priv->data_self_heal) && +                    sh->do_data_self_heal) +                || (priv->metadata_self_heal && sh->do_metadata_self_heal) +                || (priv->entry_self_heal && sh->do_entry_self_heal)); +} + +afr_transaction_type +afr_transaction_type_get (ia_type_t ia_type) +{ +        afr_transaction_type    type = AFR_METADATA_TRANSACTION; + +        GF_ASSERT (ia_type != IA_INVAL); + +        if (IA_ISDIR (ia_type)) { +                type = AFR_ENTRY_TRANSACTION; +        } else if (IA_ISREG (ia_type)) { +                type = AFR_DATA_TRANSACTION; +        } +        return type; +} + +int +afr_lookup_select_read_child (afr_local_t *local, xlator_t *this, +                              int32_t *read_child) +{ +        ia_type_t               ia_type        = IA_INVAL; +        int32_t                 source         = -1; +        int                     ret            = -1; +        dict_t                  **xattrs       = NULL; +        int32_t                 *success_children = NULL; +        afr_transaction_type    type           = AFR_METADATA_TRANSACTION; +        uuid_t                  *gfid          = NULL; + +        GF_ASSERT (local); +        GF_ASSERT (this); +        GF_ASSERT (local->success_count > 0); + +        success_children = local->cont.lookup.success_children; +        /*We can take the success_children[0] only because we already +         *handle the conflicting children other wise, we could select the +         *read_child based on wrong file type +         */ +        ia_type = local->cont.lookup.bufs[success_children[0]].ia_type; +        type = afr_transaction_type_get (ia_type); +        xattrs = local->cont.lookup.xattrs; +        gfid = &local->cont.lookup.buf.ia_gfid; +        source = afr_lookup_select_read_child_by_txn_type (this, local, xattrs, +                              
                             type, *gfid); +        if (source < 0) { +                gf_log (this->name, GF_LOG_DEBUG, "failed to select source " +                        "for %s", local->loc.path); +                goto out; +        } + +        gf_log (this->name, GF_LOG_DEBUG, "Source selected as %d for %s", +                source, local->loc.path); +        *read_child = source; +        ret = 0; +out: +        return ret; +} + +static inline gf_boolean_t +afr_is_transaction_running (afr_local_t *local) +{ +        GF_ASSERT (local->fop == GF_FOP_LOOKUP); +        return ((local->inodelk_count > 0) || (local->entrylk_count > 0)); +} + +void +afr_launch_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode, +                      gf_boolean_t background, ia_type_t ia_type, char *reason, +                      void (*gfid_sh_success_cbk) (call_frame_t *sh_frame, +                                                   xlator_t *this), +                      int (*unwind) (call_frame_t *frame, xlator_t *this, +                                     int32_t op_ret, int32_t op_errno, +                                     int32_t sh_failed)) +{ +        afr_local_t             *local = NULL; +        char                    sh_type_str[256] = {0,}; +        char                    *bg = ""; + +        GF_ASSERT (frame); +        GF_ASSERT (this); +        GF_ASSERT (inode); +        GF_ASSERT (ia_type != IA_INVAL); + +        local = frame->local; +        local->self_heal.background = background; +        local->self_heal.type       = ia_type; +        local->self_heal.unwind     = unwind; +        local->self_heal.gfid_sh_success_cbk     = gfid_sh_success_cbk; + +        afr_self_heal_type_str_get (&local->self_heal, +                                    sh_type_str, +                                    sizeof (sh_type_str)); + +        if (background) +                bg = "background"; +        gf_log (this->name, GF_LOG_DEBUG, +                "%s %s 
self-heal triggered. path: %s, reason: %s", bg, +                sh_type_str, local->loc.path, reason); + +        afr_self_heal (frame, this, inode); +} + +unsigned int +afr_gfid_missing_count (const char *xlator_name, int32_t *success_children, +                        struct iatt *bufs, unsigned int child_count, +                        const char *path) +{ +        unsigned int    gfid_miss_count   = 0; +        int             i              = 0; +        struct iatt     *child1        = NULL; + +        for (i = 0; i < child_count; i++) { +                if (success_children[i] == -1) +                        break; +                child1 = &bufs[success_children[i]]; +                if (uuid_is_null (child1->ia_gfid)) { +                        gf_log (xlator_name, GF_LOG_DEBUG, "%s: gfid is null" +                                " on subvolume %d", path, success_children[i]); +                        gfid_miss_count++; +                } +        } + +        return gfid_miss_count; +} + +static int +afr_lookup_gfid_missing_count (afr_local_t *local, xlator_t *this) +{ +        int32_t         *success_children = NULL; +        afr_private_t   *priv          = NULL; +        struct iatt     *bufs          = NULL; +        int             miss_count     = 0; + +        priv = this->private; +        bufs = local->cont.lookup.bufs; +        success_children = local->cont.lookup.success_children; + +        miss_count =  afr_gfid_missing_count (this->name, success_children, +                                              bufs, priv->child_count, +                                              local->loc.path); +        return miss_count; +} + +gf_boolean_t +afr_conflicting_iattrs (struct iatt *bufs, int32_t *success_children, +                        unsigned int child_count, const char *path, +                        const char *xlator_name) +{ +        gf_boolean_t    conflicting    = _gf_false; +        int             i              = 0; +        struct 
iatt     *child1        = NULL; +        struct iatt     *child2        = NULL; +        uuid_t          *gfid          = NULL; + +        for (i = 0; i < child_count; i++) { +                if (success_children[i] == -1) +                        break; +                child1 = &bufs[success_children[i]]; +                if ((!gfid) && (!uuid_is_null (child1->ia_gfid))) +                        gfid = &child1->ia_gfid; + +                if (i == 0) +                        continue; + +                child2 = &bufs[success_children[i-1]]; +                if (FILETYPE_DIFFERS (child1, child2)) { +                        gf_log (xlator_name, GF_LOG_DEBUG, "%s: filetype " +                                "differs on subvolumes (%d, %d)", path, +                                success_children[i-1], success_children[i]); +                        conflicting = _gf_true; +                        goto out; +                } +                if (!gfid || uuid_is_null (child1->ia_gfid)) +                        continue; +                if (uuid_compare (*gfid, child1->ia_gfid)) { +                       gf_log (xlator_name, GF_LOG_DEBUG, "%s: gfid differs" +                               " on subvolume %d", path, success_children[i]); +                       conflicting = _gf_true; +                       goto out; +                } +        } +out: +        return conflicting; +} + +/* afr_update_gfid_from_iatts: This function should be called only if the + * iatts are not conflicting. 
+ */ +void +afr_update_gfid_from_iatts (uuid_t uuid, struct iatt *bufs, +                            int32_t *success_children, unsigned int child_count) +{ +        uuid_t          *gfid = NULL; +        int             i = 0; +        int             child = 0; + +        for (i = 0; i < child_count; i++) { +                child = success_children[i]; +                if (child == -1) +                        break; +                if ((!gfid) && (!uuid_is_null (bufs[child].ia_gfid))) { +                        gfid = &bufs[child].ia_gfid; +                } else if (gfid && (!uuid_is_null (bufs[child].ia_gfid))) { +                        if (uuid_compare (*gfid, bufs[child].ia_gfid)) { +                                GF_ASSERT (0); +                                goto out; +                        } +                } +        } +        if (gfid && (!uuid_is_null (*gfid))) +                uuid_copy (uuid, *gfid); +out: +        return; +} + +static gf_boolean_t +afr_lookup_conflicting_entries (afr_local_t *local, xlator_t *this) +{ +        afr_private_t           *priv = NULL; +        gf_boolean_t            conflict = _gf_false; + +        priv = this->private; +        conflict =  afr_conflicting_iattrs (local->cont.lookup.bufs, +                                            local->cont.lookup.success_children, +                                            priv->child_count, local->loc.path, +                                            this->name); +        return conflict; +} + +gf_boolean_t +afr_open_only_data_self_heal (char *data_self_heal) +{ +        return !strcmp (data_self_heal, "open"); +} + +gf_boolean_t +afr_data_self_heal_enabled (char *data_self_heal) +{ +        gf_boolean_t    enabled = _gf_false; + +        if (gf_string2boolean (data_self_heal, &enabled) == -1) { +                enabled = !strcmp (data_self_heal, "open"); +                GF_ASSERT (enabled); +        } + +        return enabled; +} + +static void 
+afr_lookup_set_self_heal_params (afr_local_t *local, xlator_t *this) +{ +        int                     i = 0; +        struct iatt             *bufs = NULL; +        dict_t                  **xattr = NULL; +        afr_private_t           *priv = NULL; +        int32_t                 child1 = -1; +        int32_t                 child2 = -1; +        afr_self_heal_t         *sh = NULL; + +        priv  = this->private; +        sh = &local->self_heal; + +        afr_detect_self_heal_by_lookup_status (local, this); + +        if (afr_lookup_gfid_missing_count (local, this)) +                local->self_heal.do_gfid_self_heal    = _gf_true; + +        if (_gf_true == afr_lookup_conflicting_entries (local, this)) +                local->self_heal.do_missing_entry_self_heal    = _gf_true; +        else +                afr_update_gfid_from_iatts (local->self_heal.sh_gfid_req, +                                            local->cont.lookup.bufs, +                                            local->cont.lookup.success_children, +                                            priv->child_count); + +        bufs = local->cont.lookup.bufs; +        for (i = 1; i < local->success_count; i++) { +                child1 = local->cont.lookup.success_children[i-1]; +                child2 = local->cont.lookup.success_children[i]; +                afr_detect_self_heal_by_iatt (local, this, +                                              &bufs[child1], &bufs[child2]); +        } + +        xattr = local->cont.lookup.xattrs; +        for (i = 0; i < local->success_count; i++) { +                child1 = local->cont.lookup.success_children[i]; +                afr_lookup_set_self_heal_params_by_xattr (local, this, +                                                          xattr[child1]); +        } +        if (afr_open_only_data_self_heal (priv->data_self_heal)) +                sh->do_data_self_heal = _gf_false; +        if (sh->do_metadata_self_heal) +                
afr_lookup_check_set_metadata_split_brain (local, this); +        afr_detect_self_heal_by_split_brain_status (local, this); +} + +int +afr_self_heal_lookup_unwind (call_frame_t *frame, xlator_t *this, +                             int32_t op_ret, int32_t op_errno, +                             int32_t sh_failed) +{ +        afr_local_t *local = NULL; +        int         ret    = -1; +        dict_t      *xattr = NULL; + +        local = frame->local; + +        if (op_ret == -1) { +                local->op_ret = -1; +		local->op_errno = afr_most_important_error(local->op_errno, +							   op_errno, _gf_true); + +                goto out; +        } else { +                local->op_ret = 0; +        } + +        afr_lookup_done_success_action (frame, this, _gf_true); +        xattr = local->cont.lookup.xattr; +        if (xattr) { +                ret = dict_set_int32 (xattr, "sh-failed", sh_failed); +                if (ret) +                        gf_log (this->name, GF_LOG_ERROR, "%s: Failed to set " +                                "sh-failed to %d", local->loc.path, sh_failed); + +                if (local->self_heal.actual_sh_started == _gf_true && +                    sh_failed == 0) { +                        ret = dict_set_int32 (xattr, "actual-sh-done", 1); +                        if (ret) +                                gf_log(this->name, GF_LOG_ERROR, "%s: Failed to" +                                       " set actual-sh-done to %d", +                                       local->loc.path, +                                       local->self_heal.actual_sh_started); +                } +        } +out: +        AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, +                          local->cont.lookup.inode, &local->cont.lookup.buf, +                          local->cont.lookup.xattr, +                          &local->cont.lookup.postparent); + +        return 0; +} + +//TODO: At the moment only lookup needs this, so not doing 
any checks, in the +// future we will have to do fop specific operations +void +afr_post_gfid_sh_success (call_frame_t *sh_frame, xlator_t *this) +{ +        afr_local_t             *local = NULL; +        afr_local_t             *sh_local = NULL; +        afr_private_t           *priv = NULL; +        afr_self_heal_t         *sh = NULL; +        int                     i = 0; +        struct iatt             *lookup_bufs = NULL; +        struct iatt             *lookup_parentbufs = NULL; + +        sh_local = sh_frame->local; +        sh       = &sh_local->self_heal; +        local = sh->orig_frame->local; +        lookup_bufs = local->cont.lookup.bufs; +        lookup_parentbufs = local->cont.lookup.postparents; +        priv = this->private; + +        memcpy (lookup_bufs, sh->buf, priv->child_count * sizeof (*sh->buf)); +        memcpy (lookup_parentbufs, sh->parentbufs, +                priv->child_count * sizeof (*sh->parentbufs)); + +        afr_reset_xattr (local->cont.lookup.xattrs, priv->child_count); +        if (local->cont.lookup.xattr) { +                dict_unref (local->cont.lookup.xattr); +                local->cont.lookup.xattr = NULL; +        } + +        for (i = 0; i < priv->child_count; i++) { +                if (sh->xattr[i]) +                        local->cont.lookup.xattrs[i] = dict_ref (sh->xattr[i]); +        } + +        afr_reset_children (local->cont.lookup.success_children, +                            priv->child_count); +        afr_children_copy (local->cont.lookup.success_children, +                           sh->fresh_children, priv->child_count); +} + +static void +afr_lookup_perform_self_heal (call_frame_t *frame, xlator_t *this, +                              gf_boolean_t *sh_launched) +{ +        unsigned int         up_count = 0; +        afr_private_t       *priv    = NULL; +        afr_local_t         *local   = NULL; +        char                *reason  = NULL; + +        GF_ASSERT (sh_launched); +        
*sh_launched = _gf_false; +        priv         = this->private; +        local        = frame->local; + +        up_count  = afr_up_children_count (local->child_up, priv->child_count); +        if (up_count == 1) { +                gf_log (this->name, GF_LOG_DEBUG, +                        "Only 1 child up - do not attempt to detect self heal"); +                goto out; +        } + +        afr_lookup_set_self_heal_params (local, this); +        if (afr_can_self_heal_proceed (&local->self_heal, priv)) { +                if  (afr_is_transaction_running (local) && +                     /*Forcefully call afr_launch_self_heal (which will go on to +                       fail) for SB files.This prevents stale data being served +                       due to race in  afr_is_transaction_running() when +                       multiple clients access the same SB file*/ +                     !local->cont.lookup.possible_spb && +                     (!local->attempt_self_heal)) +                        goto out; + +                reason = "lookup detected pending operations"; +                afr_launch_self_heal (frame, this, local->cont.lookup.inode, +                                      !local->foreground_self_heal, +                                      local->cont.lookup.buf.ia_type, +                                      reason, afr_post_gfid_sh_success, +                                      afr_self_heal_lookup_unwind); +                *sh_launched = _gf_true; +        } +out: +        return; +} + +void +afr_get_fresh_children (int32_t *success_children, int32_t *sources, +                        int32_t *fresh_children, unsigned int child_count) +{ +        unsigned int i = 0; +        unsigned int j = 0; + +        GF_ASSERT (success_children); +        GF_ASSERT (sources); +        GF_ASSERT (fresh_children); + +        afr_reset_children (fresh_children, child_count); +        for (i = 0; i < child_count; i++) { +                if (success_children[i] == 
-1) +                        break; +                if (afr_is_read_child (success_children, sources, child_count, +                                       success_children[i])) { +                        fresh_children[j] = success_children[i]; +                        j++; +                } +        } +} + +static int +afr_lookup_set_read_ctx (afr_local_t *local, xlator_t *this, int32_t read_child) +{ +        afr_private_t           *priv = NULL; + +        GF_ASSERT (read_child >= 0); + +        priv = this->private; +        afr_get_fresh_children (local->cont.lookup.success_children, +                                local->cont.lookup.sources, +                                local->fresh_children, priv->child_count); +        afr_inode_set_read_ctx (this, local->cont.lookup.inode, read_child, +                                local->fresh_children); + +        return 0; +} + +int +afr_lookup_done_success_action (call_frame_t *frame, xlator_t *this, +                                gf_boolean_t fail_conflict) +{ +        int32_t             read_child = -1; +        int32_t             ret        = -1; +        afr_local_t         *local     = NULL; +        gf_boolean_t        fresh_lookup = _gf_false; + +        local   = frame->local; +        fresh_lookup = local->cont.lookup.fresh_lookup; + +        if (local->loc.parent == NULL) +                fail_conflict = _gf_true; + +        if (afr_lookup_conflicting_entries (local, this)) { +                if (fail_conflict == _gf_false) +                        ret = 0; +                goto out; +        } + +        ret = afr_lookup_select_read_child (local, this, &read_child); +        if (!afr_is_transaction_running (local) || fresh_lookup) { +                if (read_child < 0) +                        goto out; + +                ret = afr_lookup_set_read_ctx (local, this, read_child); +                if (ret) +                        goto out; +        } + +        ret = 
afr_lookup_build_response_params (local, this); +        if (ret) +                goto out; +        afr_update_loc_gfids (&local->loc, +                              &local->cont.lookup.buf, +                              &local->cont.lookup.postparent); + +        ret = 0; +out: +        if (ret) { +                local->op_ret = -1; +                local->op_errno = EIO; +        } +        return ret; +} + +int +afr_lookup_get_latest_subvol (afr_local_t *local, xlator_t *this) +{ +        afr_private_t *priv = NULL; +        int32_t       *success_children = NULL; +        struct iatt   *bufs = NULL; +        int           i = 0; +        int           child = 0; +        int           lsubvol = -1; + +        priv = this->private; +        success_children = local->cont.lookup.success_children; +        bufs = local->cont.lookup.bufs; +        for (i = 0; i < priv->child_count; i++) { +                child = success_children[i]; +                if (child == -1) +                        break; +                if (uuid_is_null (bufs[child].ia_gfid)) +                        continue; +                if (lsubvol < 0) { +                        lsubvol = child; +                } else if (bufs[lsubvol].ia_ctime < bufs[child].ia_ctime) { +                        lsubvol = child; +                } else if ((bufs[lsubvol].ia_ctime == bufs[child].ia_ctime) && +                  (bufs[lsubvol].ia_ctime_nsec < bufs[child].ia_ctime_nsec)) { +                        lsubvol = child; +                } +        } +        return lsubvol; +} + +void +afr_lookup_mark_other_entries_stale (afr_local_t *local, xlator_t *this, +                                     int subvol) +{ +        afr_private_t *priv = NULL; +        int32_t       *success_children = NULL; +        struct iatt   *bufs = NULL; +        int           i = 0; +        int           child = 0; + +        priv = this->private; +        success_children = local->cont.lookup.success_children; +        
bufs = local->cont.lookup.bufs; +        memcpy (local->fresh_children, success_children, +                sizeof (*success_children) * priv->child_count); +        for (i = 0; i < priv->child_count; i++) { +                child = local->fresh_children[i]; +                if (child == -1) +                        break; +                if (child == subvol) +                        continue; +                if (uuid_is_null (bufs[child].ia_gfid) && +                    (bufs[child].ia_type == bufs[subvol].ia_type)) +                        continue; +                afr_children_rm_child (success_children, child, +                                       priv->child_count); +                local->success_count--; +        } +        afr_reset_children (local->fresh_children, priv->child_count); +} + +void +afr_succeed_lookup_on_latest_iatt (afr_local_t *local, xlator_t *this) +{ +        int    lsubvol = 0; + +        if (!afr_lookup_conflicting_entries (local, this)) +                goto out; + +        lsubvol = afr_lookup_get_latest_subvol (local, this); +        if (lsubvol < 0) +                goto out; +        afr_lookup_mark_other_entries_stale (local, this, lsubvol); +out: +        return; +} + +gf_boolean_t +afr_is_entry_possibly_under_creation (afr_local_t *local, xlator_t *this) +{ +        /* +         * We need to perform this test in lookup done and treat on going +         * create/DELETE as ENOENT. +         * Reason: +        Multiple clients A, B and C are attempting 'mkdir -p /mnt/a/b/c' + +        1 Client A is in the middle of mkdir(/a). It has acquired lock. +          It has performed mkdir(/a) on one subvol, and second one is still +          in progress +        2 Client B performs a lookup, sees directory /a on one, +          ENOENT on the other, succeeds lookup. 
        3 Client B performs lookup on /a/b on both subvols, both return ENOENT
          (one subvol because /a/b does not exist, another because /a
          itself does not exist)
        4 Client B proceeds to mkdir /a/b. It obtains entrylk on inode=/a with
          basename=b on one subvol, but fails on other subvol as /a is yet to
          be created by Client A.
        5 Client A finishes mkdir of /a on other subvol
        6 Client C also attempts to create /a/b, lookup returns ENOENT on
          both subvols.
        7 Client C tries to obtain entrylk on inode=/a with basename=b,
          obtains on one subvol (where B had failed), and waits for B to unlock
          on other subvol.
        8 Client B finishes mkdir() on one subvol with GFID-1 and completes
          transaction and unlocks
        9 Client C gets the lock on the second subvol, At this stage second
          subvol already has /a/b created from Client B, but Client C does not
          check that in the middle of mkdir transaction
        10 Client C attempts mkdir /a/b on both subvols. It succeeds on
           ONLY ONE (where Client B could not get lock because of
           missing parent /a dir) with GFID-2, and gets EEXIST from ONE subvol.
        This way we have /a/b in GFID mismatch. One subvol got GFID-1 because
        Client B performed transaction on only one subvol (because entrylk()
        could not be obtained on second subvol because of missing parent dir --
        caused by premature/speculative succeeding of lookup() on /a when locks
        are detected). Other subvol gets GFID-2 from Client C because while
        it was waiting for entrylk() on both subvols, Client B was in the
        middle of creating mkdir() on only one subvol, and Client C does not
        "expect" this when it is between lock() and pre-op()/op() phase of the
        transaction.
+         */ +	if (local->cont.lookup.parent_entrylk && local->enoent_count) +		return _gf_true; + +	return _gf_false; +} + + +static void +afr_lookup_done (call_frame_t *frame, xlator_t *this) +{ +        int                 unwind = 1; +        afr_private_t       *priv  = NULL; +        afr_local_t         *local = NULL; +        int                 ret = -1; +        gf_boolean_t        sh_launched = _gf_false; +        gf_boolean_t        fail_conflict = _gf_false; +        int                 gfid_miss_count = 0; +        int                 enotconn_count = 0; +        int                 up_children_count = 0; + +        priv  = this->private; +        local = frame->local; + +	if (afr_is_entry_possibly_under_creation (local, this)) { +		local->op_ret = -1; +		local->op_errno = ENOENT; +		goto unwind; +	} + +        if (local->op_ret < 0) +                goto unwind; + +        if (local->cont.lookup.parent_entrylk && local->success_count > 1) +                afr_succeed_lookup_on_latest_iatt (local, this); + +        gfid_miss_count = afr_lookup_gfid_missing_count (local, this); +        up_children_count = afr_up_children_count (local->child_up, +                                                   priv->child_count); +        enotconn_count = priv->child_count - up_children_count; +        if ((gfid_miss_count == local->success_count) && +            (enotconn_count > 0)) { +                local->op_ret = -1; +                local->op_errno = EIO; +                gf_log (this->name, GF_LOG_ERROR, "Failing lookup for %s, " +                        "LOOKUP on a file without gfid is not allowed when " +                        "some of the children are down", local->loc.path); +                goto unwind; +        } + +        if ((gfid_miss_count == local->success_count) && +            uuid_is_null (local->cont.lookup.gfid_req)) { +                local->op_ret = -1; +                local->op_errno = ENODATA; +                gf_log (this->name, 
GF_LOG_ERROR, "%s: No gfid present", +                        local->loc.path); +                goto unwind; +        } + +        if (gfid_miss_count && uuid_is_null (local->cont.lookup.gfid_req)) +                fail_conflict = _gf_true; +        ret = afr_lookup_done_success_action (frame, this, fail_conflict); +        if (ret) +                goto unwind; +        uuid_copy (local->self_heal.sh_gfid_req, local->cont.lookup.gfid_req); + +        afr_lookup_perform_self_heal (frame, this, &sh_launched); +        if (sh_launched) { +                unwind = 0; +                goto unwind; +        } + + unwind: +         if (unwind) { +                 AFR_STACK_UNWIND (lookup, frame, local->op_ret, +                                   local->op_errno, local->cont.lookup.inode, +                                   &local->cont.lookup.buf, +                                   local->cont.lookup.xattr, +                                   &local->cont.lookup.postparent); +        } +} + +/* + * During a lookup, some errors are more "important" than + * others in that they must be given higher priority while + * returning to the user. 
+ * + * The hierarchy is ESTALE > EIO > ENOENT > others + */ +int32_t +afr_most_important_error(int32_t old_errno, int32_t new_errno, +			 gf_boolean_t eio) +{ +	if (old_errno == ESTALE || new_errno == ESTALE) +		return ESTALE; +	if (eio && (old_errno == EIO || new_errno == EIO)) +		return EIO; +	if (old_errno == ENOENT || new_errno == ENOENT) +		return ENOENT; + +	return new_errno; +} + +int32_t +afr_resultant_errno_get (int32_t *children, +                         int *child_errno, unsigned int child_count) +{ +        int     i = 0; +        int32_t op_errno = 0; +        int     child = 0; + +        for (i = 0; i < child_count; i++) { +                if (children) { +                        child = children[i]; +                        if (child == -1) +                                break; +                } else { +                        child = i; +                } +		op_errno = afr_most_important_error(op_errno, +						    child_errno[child], +						    _gf_false); +        } +        return op_errno; +} + +static void +afr_lookup_handle_error (afr_local_t *local, int32_t op_ret,  int32_t op_errno) +{ +        GF_ASSERT (local); +        if (op_errno == ENOENT) +                local->enoent_count++; + +	local->op_errno = afr_most_important_error(local->op_errno, op_errno, +						   _gf_false); + +        if (local->op_errno == ESTALE) { +                local->op_ret = -1; +        } +} + +static void +afr_set_root_inode_on_first_lookup (afr_local_t *local, xlator_t *this, +                                    inode_t *inode) +{ +        afr_private_t           *priv = NULL; +        GF_ASSERT (inode); + +        if (!__is_root_gfid (inode->gfid)) +                goto out; +        if (!afr_is_fresh_lookup (&local->loc, this)) +                goto out; +        priv = this->private; +        if ((priv->first_lookup)) { +                gf_log (this->name, GF_LOG_INFO, "added root inode"); +                priv->root_inode = inode_ref (inode); +       
         priv->first_lookup = 0; +        } +out: +        return; +} + +static void +afr_lookup_cache_args (afr_local_t *local, int child_index, dict_t *xattr, +                       struct iatt *buf, struct iatt *postparent) +{ +        GF_ASSERT (child_index >= 0); +        local->cont.lookup.xattrs[child_index] = dict_ref (xattr); +        local->cont.lookup.postparents[child_index] = *postparent; +        local->cont.lookup.bufs[child_index] = *buf; +} + +static void +afr_lookup_handle_first_success (afr_local_t *local, xlator_t *this, +                                 inode_t *inode, struct iatt *buf) +{ +        local->cont.lookup.inode      = inode_ref (inode); +        local->cont.lookup.buf        = *buf; +        afr_set_root_inode_on_first_lookup (local, this, inode); +} + +static int32_t +afr_discovery_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                   int32_t op_ret, int32_t op_errno, dict_t *dict, +                   dict_t *xdata) +{ +        int              ret            = 0; +        char            *pathinfo       = NULL; +        gf_boolean_t     is_local        = _gf_false; +        afr_private_t   *priv           = NULL; +        int32_t          child_index    = -1; + +        if (op_ret != 0) { +                goto out; +        } + +        ret = dict_get_str (dict, GF_XATTR_PATHINFO_KEY, &pathinfo); +        if (ret != 0) { +                goto out; +        } + +        ret = afr_local_pathinfo (pathinfo, &is_local); +        if (ret) { +                goto out; +        } + +        priv = this->private; +        /* +         * Note that one local subvolume will override another here.  The only +         * way to avoid that would be to retain extra information about whether +         * the previous read_child is local, and it's just not worth it.  Even +         * the slowest local subvolume is far preferable to a remote one. 
+         */ +        if (is_local) { +                child_index = (int32_t)(long)cookie; +                gf_log (this->name, GF_LOG_INFO, +                        "selecting local read_child %s", +                        priv->children[child_index]->name); +                priv->read_child = child_index; +        } + +out: +        STACK_DESTROY(frame->root); +        return 0; +} + +static void +afr_attempt_local_discovery (xlator_t *this, int32_t child_index) +{ +        call_frame_t    *newframe = NULL; +        loc_t            tmploc = {0,}; +        afr_private_t   *priv = this->private; + +        newframe = create_frame(this,this->ctx->pool); +        if (!newframe) { +                return; +        } + +        tmploc.gfid[sizeof(tmploc.gfid)-1] = 1; +        STACK_WIND_COOKIE (newframe, afr_discovery_cbk, +                           (void *)(long)child_index, +                           priv->children[child_index], +                           priv->children[child_index]->fops->getxattr, +                           &tmploc, GF_XATTR_PATHINFO_KEY, NULL); +} + +static void +afr_lookup_handle_success (afr_local_t *local, xlator_t *this, int32_t child_index, +                           int32_t op_ret, int32_t op_errno, inode_t *inode, +                           struct iatt *buf, dict_t *xattr, +                           struct iatt *postparent) +{ +        afr_private_t   *priv   = this->private; + +        if (local->success_count == 0) { +                if (local->op_errno != ESTALE) { +                        local->op_ret = op_ret; +                        local->op_errno = 0; +                } +                afr_lookup_handle_first_success (local, this, inode, buf); +        } +        afr_lookup_update_lk_counts (local, this, +                                     child_index, xattr); + +        afr_lookup_cache_args (local, child_index, xattr, +                               buf, postparent); + +        if (local->do_discovery && 
(priv->read_child == (-1))) { +                afr_attempt_local_discovery(this,child_index); +        } + +        local->cont.lookup.success_children[local->success_count] = child_index; +        local->success_count++; +} + +int +afr_lookup_cbk (call_frame_t *frame, void *cookie, +                xlator_t *this,  int32_t op_ret,  int32_t op_errno, +                inode_t *inode,   struct iatt *buf, dict_t *xattr, +                struct iatt *postparent) +{ +        afr_local_t *   local = NULL; +        int             call_count      = -1; +        int             child_index     = -1; + +         child_index = (long) cookie; + +        LOCK (&frame->lock); +        { +                local = frame->local; + +                if (op_ret == -1) { +                        afr_lookup_handle_error (local, op_ret, op_errno); +                        goto unlock; +                } +                afr_lookup_handle_success (local, this, child_index, op_ret, +                                           op_errno, inode, buf, xattr, +                                           postparent); + +         } +unlock: +        UNLOCK (&frame->lock); + +        call_count = afr_frame_return (frame); +        if (call_count == 0) { +               afr_lookup_done (frame, this); +        } + +         return 0; +} + +int +afr_lookup_cont_init (afr_local_t *local, unsigned int child_count) +{ +        int               ret            = -ENOMEM; +        struct iatt       *iatts         = NULL; +        int32_t           *success_children = NULL; +        int32_t           *sources       = NULL; +        int32_t           **pending_matrix = NULL; + +        GF_ASSERT (local); +        local->cont.lookup.xattrs = GF_CALLOC (child_count, +                                               sizeof (*local->cont.lookup.xattr), +                                               gf_afr_mt_dict_t); +        if (NULL == local->cont.lookup.xattrs) +                goto out; + +        iatts = 
GF_CALLOC (child_count, sizeof (*iatts), gf_afr_mt_iatt); +        if (NULL == iatts) +                goto out; +        local->cont.lookup.postparents = iatts; + +        iatts = GF_CALLOC (child_count, sizeof (*iatts), gf_afr_mt_iatt); +        if (NULL == iatts) +                goto out; +        local->cont.lookup.bufs = iatts; + +        success_children = afr_children_create (child_count); +        if (NULL == success_children) +                goto out; +        local->cont.lookup.success_children = success_children; + +        local->fresh_children = afr_children_create (child_count); +        if (NULL == local->fresh_children) +                goto out; + +        sources = GF_CALLOC (sizeof (*sources), child_count, gf_afr_mt_int32_t); +        if (NULL == sources) +                goto out; +        local->cont.lookup.sources = sources; + +        pending_matrix = afr_matrix_create (child_count, child_count); +        if (NULL == pending_matrix) +                goto out; +        local->cont.lookup.pending_matrix = pending_matrix; + +        ret = 0; +out: +        return ret; +} + +int +afr_lookup (call_frame_t *frame, xlator_t *this, +            loc_t *loc, dict_t *xattr_req) +{ +        afr_private_t  *priv      = NULL; +        afr_local_t    *local     = NULL; +        void           *gfid_req  = NULL; +        int            ret        = -1; +        int            i          = 0; +        int            call_count = 0; +        uint64_t       ctx        = 0; +        int32_t        op_errno   = 0; +                       priv       = this->private; + +        AFR_LOCAL_ALLOC_OR_GOTO (local, out); + +        local->op_ret = -1; + +        frame->local = local; +        local->fop = GF_FOP_LOOKUP; + +        loc_copy (&local->loc, loc); +        ret = loc_path (&local->loc, NULL); +        if (ret < 0) { +                op_errno = EINVAL; +                goto out; +        } + +        if (local->loc.path && +            (strcmp 
(local->loc.path, "/" GF_REPLICATE_TRASH_DIR) == 0)) { +                op_errno = EPERM; +                ret = -1; +                goto out; +        } + +        ret = inode_ctx_get (local->loc.inode, this, &ctx); +        if (ret == 0) { +                /* lookup is a revalidate */ + +                local->read_child_index = afr_inode_get_read_ctx (this, +                                                               local->loc.inode, +                                                               NULL); +        } else { +                LOCK (&priv->read_child_lock); +                { +                        if (priv->hash_mode) { +                                local->read_child_index = -1; +                        } +                        else { +                                local->read_child_index = +                                        (++priv->read_child_rr) % +                                        (priv->child_count); +                        } +                } +                UNLOCK (&priv->read_child_lock); +                local->cont.lookup.fresh_lookup = _gf_true; +        } + +        local->child_up = memdup (priv->child_up, +                                  sizeof (*local->child_up) * priv->child_count); +        if (NULL == local->child_up) { +                op_errno = ENOMEM; +                goto out; +        } + +        ret = afr_lookup_cont_init (local, priv->child_count); +        if (ret < 0) { +                op_errno = -ret; +                goto out; +        } + +        local->call_count = afr_up_children_count (local->child_up, +                                                   priv->child_count); +        call_count = local->call_count; +        if (local->call_count == 0) { +                ret      = -1; +                op_errno = ENOTCONN; +                goto out; +        } + +        /* By default assume ENOTCONN. On success it will be set to 0. 
*/ +        local->op_errno = ENOTCONN; + +        ret = dict_get_int32 (xattr_req, "attempt-self-heal", +                              &local->attempt_self_heal); +        dict_del (xattr_req, "attempt-self-heal"); + +        ret = dict_get_int32 (xattr_req, "foreground-self-heal", +                              &local->foreground_self_heal); +        dict_del (xattr_req, "foreground-self-heal"); + +        ret = afr_lookup_xattr_req_prepare (local, this, xattr_req, &local->loc, +                                            &gfid_req); +        if (ret) { +                local->op_errno = -ret; +                goto out; +        } +        afr_lookup_save_gfid (local->cont.lookup.gfid_req, gfid_req, +                              &local->loc); +        local->fop = GF_FOP_LOOKUP; +        if (priv->choose_local && !priv->did_discovery) { +                if (gfid_req && __is_root_gfid(gfid_req)) { +                        local->do_discovery = _gf_true; +                        priv->did_discovery = _gf_true; +                } +        } +        for (i = 0; i < priv->child_count; i++) { +                if (local->child_up[i]) { +                        STACK_WIND_COOKIE (frame, afr_lookup_cbk, +                                           (void *) (long) i, +                                           priv->children[i], +                                           priv->children[i]->fops->lookup, +                                           &local->loc, local->xattr_req); +                        if (!--call_count) +                                break; +                } +        } + +        ret = 0; +out: +        if (ret) +                AFR_STACK_UNWIND (lookup, frame, -1, op_errno, +                                  NULL, NULL, NULL, NULL); + +        return 0; +} + + +/* {{{ open */ + +int +__afr_fd_ctx_set (xlator_t *this, fd_t *fd) +{ +        afr_private_t * priv   = NULL; +        int             ret    = -1; +        uint64_t        ctx    = 0; +      
  afr_fd_ctx_t *  fd_ctx = NULL; + +        VALIDATE_OR_GOTO (this->private, out); +        VALIDATE_OR_GOTO (fd, out); + +        priv = this->private; + +        ret = __fd_ctx_get (fd, this, &ctx); + +        if (ret == 0) +                goto out; + +        fd_ctx = GF_CALLOC (1, sizeof (afr_fd_ctx_t), +                            gf_afr_mt_afr_fd_ctx_t); +        if (!fd_ctx) { +                ret = -ENOMEM; +                goto out; +        } + +        fd_ctx->pre_op_done = GF_CALLOC (sizeof (*fd_ctx->pre_op_done), +                                         priv->child_count, +                                         gf_afr_mt_char); +        if (!fd_ctx->pre_op_done) { +                ret = -ENOMEM; +                goto out; +        } + +        fd_ctx->pre_op_piggyback = GF_CALLOC (sizeof (*fd_ctx->pre_op_piggyback), +                                              priv->child_count, +                                              gf_afr_mt_char); +        if (!fd_ctx->pre_op_piggyback) { +                ret = -ENOMEM; +                goto out; +        } + +        fd_ctx->opened_on = GF_CALLOC (sizeof (*fd_ctx->opened_on), +                                       priv->child_count, +                                       gf_afr_mt_int32_t); +        if (!fd_ctx->opened_on) { +                ret = -ENOMEM; +                goto out; +        } + +        fd_ctx->lock_piggyback = GF_CALLOC (sizeof (*fd_ctx->lock_piggyback), +                                            priv->child_count, +                                            gf_afr_mt_char); +        if (!fd_ctx->lock_piggyback) { +                ret = -ENOMEM; +                goto out; +        } + +        fd_ctx->lock_acquired = GF_CALLOC (sizeof (*fd_ctx->lock_acquired), +                                           priv->child_count, +                                           gf_afr_mt_char); +        if (!fd_ctx->lock_acquired) { +                ret = -ENOMEM; +                goto out; 
+        } + +        fd_ctx->up_count   = priv->up_count; +        fd_ctx->down_count = priv->down_count; + +        fd_ctx->locked_on = GF_CALLOC (sizeof (*fd_ctx->locked_on), +                                       priv->child_count, +                                       gf_afr_mt_char); +        if (!fd_ctx->locked_on) { +                ret = -ENOMEM; +                goto out; +        } + +	pthread_mutex_init (&fd_ctx->delay_lock, NULL); +        INIT_LIST_HEAD (&fd_ctx->entries); +        fd_ctx->call_child = -1; + +        INIT_LIST_HEAD (&fd_ctx->eager_locked); + +        ret = __fd_ctx_set (fd, this, (uint64_t)(long) fd_ctx); +        if (ret) +                gf_log (this->name, GF_LOG_DEBUG, +                        "failed to set fd ctx (%p)", fd); +out: +        return ret; +} + + +int +afr_fd_ctx_set (xlator_t *this, fd_t *fd) +{ +        int ret = -1; + +        LOCK (&fd->lock); +        { +                ret = __afr_fd_ctx_set (this, fd); +        } +        UNLOCK (&fd->lock); + +        return ret; +} + +/* {{{ flush */ + +int +afr_flush_cbk(call_frame_t *frame, void *cookie, xlator_t *this, +              int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ +        afr_local_t *   local = NULL; +        int call_count  = -1; + +        local = frame->local; + +        LOCK (&frame->lock); +        { +                if (op_ret != -1) { +                        if (local->success_count == 0) { +                                local->op_ret = op_ret; +                        } +                        local->success_count++; +                } + +                local->op_errno = op_errno; +        } +        UNLOCK (&frame->lock); + +	call_count = afr_frame_return (frame); + +	if (call_count == 0) +		AFR_STACK_UNWIND(flush, frame, local->op_ret, +				 local->op_errno, NULL); + +        return 0; +} + +static int +afr_flush_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ +        int           i      = 0; +        
afr_local_t   *local = NULL; +        afr_private_t *priv  = NULL; +        int call_count       = -1; + +        priv = this->private; +        local = frame->local; +        call_count = local->call_count; + +        for (i = 0; i < priv->child_count; i++) { +                if (local->child_up[i]) { +                        STACK_WIND_COOKIE (frame, afr_flush_cbk, +                                           (void *) (long) i, +                                           priv->children[i], +                                           priv->children[i]->fops->flush, +                                           local->fd, NULL); +                        if (!--call_count) +                                break; + +                } +        } + +        return 0; +} + +int +afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ +        afr_private_t *priv  = NULL; +        afr_local_t   *local = NULL; +        call_stub_t   *stub = NULL; +        int            ret        = -1; +        int            op_errno   = 0; + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (this->private, out); + +        priv = this->private; + +	AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); +	local = frame->local; + +	ret = afr_local_init(local, priv, &op_errno); +	if (ret < 0) +		goto out; + +	local->fd = fd_ref(fd); +        stub = fop_flush_stub (frame, afr_flush_wrapper, fd, xdata); +        if (!stub) { +                ret = -1; +                op_errno = ENOMEM; +                goto out; +        } + +        afr_delayed_changelog_wake_resume (this, fd, stub); +	ret = 0; + +out: +	if (ret < 0) +		AFR_STACK_UNWIND(flush, frame, -1, op_errno, NULL); + +        return 0; +} + +/* }}} */ + + +int +afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd) +{ +        uint64_t        ctx = 0; +        afr_fd_ctx_t    *fd_ctx = NULL; +        int             ret = 0; + +        ret = fd_ctx_get (fd, this, &ctx); +        
if (ret < 0) +                goto out; + +        fd_ctx = (afr_fd_ctx_t *)(long) ctx; + +        if (fd_ctx) { +                GF_FREE (fd_ctx->pre_op_done); + +                GF_FREE (fd_ctx->opened_on); + +                GF_FREE (fd_ctx->locked_on); + +                GF_FREE (fd_ctx->pre_op_piggyback); +                GF_FREE (fd_ctx->lock_piggyback); + +                GF_FREE (fd_ctx->lock_acquired); + +		pthread_mutex_destroy (&fd_ctx->delay_lock); + +                GF_FREE (fd_ctx); +        } + +out: +        return 0; +} + + +int +afr_release (xlator_t *this, fd_t *fd) +{ +        afr_locked_fd_t *locked_fd = NULL; +        afr_locked_fd_t *tmp       = NULL; +        afr_private_t   *priv      = NULL; + +        priv = this->private; + +        afr_cleanup_fd_ctx (this, fd); + +        list_for_each_entry_safe (locked_fd, tmp, &priv->saved_fds, +                                  list) { + +                if (locked_fd->fd == fd) { +                        list_del_init (&locked_fd->list); +                        GF_FREE (locked_fd); +                } + +        } + +        return 0; +} + + +/* {{{ fsync */ + +int +afr_fsync_unwind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                      int32_t op_ret, int32_t op_errno, struct iatt *prebuf, +                      struct iatt *postbuf, dict_t *xdata) +{ +        AFR_STACK_UNWIND (fsync, frame, op_ret, op_errno, prebuf, postbuf, +                          xdata); +        return 0; +} + +int +afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +               int32_t op_ret, int32_t op_errno, struct iatt *prebuf, +               struct iatt *postbuf, dict_t *xdata) +{ +        afr_local_t *local = NULL; +        int call_count = -1; +        int child_index = (long) cookie; +        int read_child  = 0; +	call_stub_t *stub = NULL; + +        local = frame->local; + +        read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); + +        LOCK 
(&frame->lock); +        { +                if (child_index == read_child) { +                        local->read_child_returned = _gf_true; +                } + +                if (op_ret == 0) { +                        local->op_ret = 0; + +                        if (local->success_count == 0) { +                                local->cont.inode_wfop.prebuf  = *prebuf; +                                local->cont.inode_wfop.postbuf = *postbuf; +                        } + +                        if (child_index == read_child) { +                                local->cont.inode_wfop.prebuf  = *prebuf; +                                local->cont.inode_wfop.postbuf = *postbuf; +                        } + +                        local->success_count++; +                } + +                local->op_errno = op_errno; +        } +        UNLOCK (&frame->lock); + +        call_count = afr_frame_return (frame); + +        if (call_count == 0) { +		/* Make a stub out of the frame, and register it +		   with the waking up post-op. When the call-stub resumes, +		   we are guaranteed that there was no post-op pending +		   (i.e changelogs were unset in the server). This is an +		   essential "guarantee", that fsync() returns only after +		   completely finishing EVERYTHING, including the delayed +		   post-op. This guarantee is expected by FUSE graph switching +		   for example. 
+		*/ +		stub = fop_fsync_cbk_stub (frame, afr_fsync_unwind_cbk, +                                           local->op_ret, local->op_errno, +                                           &local->cont.inode_wfop.prebuf, +                                           &local->cont.inode_wfop.postbuf, +                                           xdata); +		if (!stub) { +			AFR_STACK_UNWIND (fsync, frame, -1, ENOMEM, 0, 0, 0); +			return 0; +		} + +		/* If no new unstable writes happened between the +		   time we cleared the unstable write witness flag in afr_fsync +		   and now, calling afr_delayed_changelog_wake_up() should +		   wake up and skip over the fsync phase and go straight to +		   afr_changelog_post_op_now() +		*/ +		afr_delayed_changelog_wake_resume (this, local->fd, stub); +        } + +        return 0; +} + + +int +afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, +           int32_t datasync, dict_t *xdata) +{ +        afr_private_t *priv = NULL; +        afr_local_t *local = NULL; +        int ret = -1; +        int i = 0; +        int32_t call_count = 0; +        int32_t op_errno = 0; + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (this->private, out); + +        priv = this->private; + +        AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); +        local = frame->local; + +        ret = afr_local_init (local, priv, &op_errno); +        if (ret < 0) +                goto out; + +        call_count = local->call_count; + +        local->fd             = fd_ref (fd); + +	if (afr_fd_has_witnessed_unstable_write (this, fd)) { +		/* don't care. 
we only wanted to CLEAR the bit */ +	} + +        for (i = 0; i < priv->child_count; i++) { +                if (local->child_up[i]) { +                        STACK_WIND_COOKIE (frame, afr_fsync_cbk, +                                           (void *) (long) i, +                                           priv->children[i], +                                           priv->children[i]->fops->fsync, +                                           fd, datasync, xdata); +                        if (!--call_count) +                                break; +                } +        } + +        ret = 0; +out: +        if (ret < 0) +                AFR_STACK_UNWIND (fsync, frame, -1, op_errno, NULL, NULL, NULL); +        return 0; +} + +/* }}} */ + +/* {{{ fsync */ + +int32_t +afr_fsyncdir_cbk (call_frame_t *frame, void *cookie, +                  xlator_t *this, int32_t op_ret, int32_t op_errno, +                  dict_t *xdata) +{ +        afr_local_t *local = NULL; +        int call_count = -1; + +        local = frame->local; + +        LOCK (&frame->lock); +        { +                if (op_ret == 0) +                        local->op_ret = 0; + +                local->op_errno = op_errno; +        } +        UNLOCK (&frame->lock); + +        call_count = afr_frame_return (frame); + +        if (call_count == 0) +                AFR_STACK_UNWIND (fsyncdir, frame, local->op_ret, +                                  local->op_errno, xdata); + +        return 0; +} + + +int32_t +afr_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, +              int32_t datasync, dict_t *xdata) +{ +        afr_private_t *priv = NULL; +        afr_local_t *local = NULL; +        int ret = -1; +        int i = 0; +        int32_t call_count = 0; +        int32_t op_errno = 0; + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (this->private, out); + +        priv = this->private; + +        AFR_LOCAL_ALLOC_OR_GOTO (frame->local, 
out); +        local = frame->local; + +        ret = afr_local_init (local, priv, &op_errno); +        if (ret < 0) +                goto out; + +        call_count = local->call_count; + +        for (i = 0; i < priv->child_count; i++) { +                if (local->child_up[i]) { +                        STACK_WIND (frame, afr_fsyncdir_cbk, +                                    priv->children[i], +                                    priv->children[i]->fops->fsyncdir, +                                    fd, datasync, xdata); +                        if (!--call_count) +                                break; +                } +        } + +        ret = 0; +out: +        if (ret < 0) +                AFR_STACK_UNWIND (fsyncdir, frame, -1, op_errno, NULL); +        return 0; +} + +/* }}} */ + +/* {{{ xattrop */ + +int32_t +afr_xattrop_cbk (call_frame_t *frame, void *cookie, +                 xlator_t *this, int32_t op_ret, int32_t op_errno, +                 dict_t *xattr, dict_t *xdata) +{ +        afr_local_t *local = NULL; +        int call_count = -1; + +        local = frame->local; + +        LOCK (&frame->lock); +        { +                if (op_ret == 0) { +                        if (!local->cont.xattrop.xattr) +                                local->cont.xattrop.xattr = dict_ref (xattr); +                        local->op_ret = 0; +                } + +                local->op_errno = op_errno; +        } +        UNLOCK (&frame->lock); + +        call_count = afr_frame_return (frame); + +        if (call_count == 0) +                AFR_STACK_UNWIND (xattrop, frame, local->op_ret, local->op_errno, +                local->cont.xattrop.xattr, xdata); + +        return 0; +} + + +int32_t +afr_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, +             gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata) +{ +        afr_private_t *priv = NULL; +        afr_local_t *local  = NULL; +        int ret = -1; +        int i = 0; +        int32_t 
call_count = 0; +        int32_t op_errno = 0; + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (this->private, out); + +        priv = this->private; + +        AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); +        local = frame->local; + +        ret = afr_local_init (local, priv, &op_errno); +        if (ret < 0) +                goto out; + +        call_count = local->call_count; + +        for (i = 0; i < priv->child_count; i++) { +                if (local->child_up[i]) { +                        STACK_WIND (frame, afr_xattrop_cbk, +                                    priv->children[i], +                                    priv->children[i]->fops->xattrop, +                                    loc, optype, xattr, xdata); +                        if (!--call_count) +                                break; +                } +        } + +        ret = 0; +out: +        if (ret < 0) +                AFR_STACK_UNWIND (xattrop, frame, -1, op_errno, NULL, NULL); +        return 0; +} + +/* }}} */ + +/* {{{ fxattrop */ + +int32_t +afr_fxattrop_cbk (call_frame_t *frame, void *cookie, +                  xlator_t *this, int32_t op_ret, int32_t op_errno, +                  dict_t *xattr, dict_t *xdata) +{ +        afr_local_t *local = NULL; + +        int call_count = -1; + +        local = frame->local; + +        LOCK (&frame->lock); +        { +                if (op_ret == 0) { +                        if (!local->cont.fxattrop.xattr) +                                local->cont.fxattrop.xattr = dict_ref (xattr); + +                        local->op_ret = 0; +                } + +                local->op_errno = op_errno; +        } +        UNLOCK (&frame->lock); + +        call_count = afr_frame_return (frame); + +        if (call_count == 0) +                AFR_STACK_UNWIND (fxattrop, frame, local->op_ret, local->op_errno, +                                  local->cont.fxattrop.xattr, xdata); + +        
return 0; +} + + +int32_t +afr_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd, +              gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata) +{ +        afr_private_t *priv = NULL; +        afr_local_t *local  = NULL; +        int ret = -1; +        int i = 0; +        int32_t call_count = 0; +        int32_t op_errno = 0; + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (this->private, out); + +        priv = this->private; + +        AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); +        local = frame->local; + +        ret = afr_local_init (local, priv, &op_errno); +        if (ret < 0) +                goto out; + +        call_count = local->call_count; + +        for (i = 0; i < priv->child_count; i++) { +                if (local->child_up[i]) { +                        STACK_WIND (frame, afr_fxattrop_cbk, +                                    priv->children[i], +                                    priv->children[i]->fops->fxattrop, +                                    fd, optype, xattr, xdata); +                        if (!--call_count) +                                break; +                } +        } + +        ret = 0; +out: +        if (ret < 0) +                AFR_STACK_UNWIND (fxattrop, frame, -1, op_errno, NULL, NULL); +        return 0; +} + +/* }}} */ + + +int32_t +afr_inodelk_cbk (call_frame_t *frame, void *cookie, +                 xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) + +{ +        afr_local_t *local = NULL; +        int call_count = -1; + +        local = frame->local; + +        LOCK (&frame->lock); +        { +                if (op_ret == 0) +                        local->op_ret = 0; + +                local->op_errno = op_errno; +        } +        UNLOCK (&frame->lock); + +        call_count = afr_frame_return (frame); + +        if (call_count == 0) +                AFR_STACK_UNWIND (inodelk, frame, local->op_ret, +                  
                local->op_errno, xdata); + +        return 0; +} + + +int32_t +afr_inodelk (call_frame_t *frame, xlator_t *this, +             const char *volume, loc_t *loc, int32_t cmd, +             struct gf_flock *flock, dict_t *xdata) +{ +        afr_private_t *priv = NULL; +        afr_local_t *local  = NULL; +        int ret = -1; +        int i = 0; +        int32_t call_count = 0; +        int32_t op_errno = 0; + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (this->private, out); + +        priv = this->private; + +        AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); +        local = frame->local; + +        ret = afr_local_init (local, priv, &op_errno); +        if (ret < 0) +                goto out; + +        call_count = local->call_count; + +        for (i = 0; i < priv->child_count; i++) { +                if (local->child_up[i]) { +                        STACK_WIND (frame, afr_inodelk_cbk, +                                    priv->children[i], +                                    priv->children[i]->fops->inodelk, +                                    volume, loc, cmd, flock, xdata); + +                        if (!--call_count) +                                break; +                } +        } + +        ret = 0; +out: +        if (ret < 0) +                AFR_STACK_UNWIND (inodelk, frame, -1, op_errno, NULL); +        return 0; +} + + +int32_t +afr_finodelk_cbk (call_frame_t *frame, void *cookie, +                  xlator_t *this, int32_t op_ret, int32_t op_errno, +                  dict_t *xdata) + +{ +        afr_local_t *local = NULL; +        int call_count = -1; + +        local = frame->local; + +        LOCK (&frame->lock); +        { +                if (op_ret == 0) +                        local->op_ret = 0; + +                local->op_errno = op_errno; +        } +        UNLOCK (&frame->lock); + +        call_count = afr_frame_return (frame); + +        if (call_count == 
0) +                AFR_STACK_UNWIND (finodelk, frame, local->op_ret, +                                  local->op_errno, xdata); + +        return 0; +} + + +int32_t +afr_finodelk (call_frame_t *frame, xlator_t *this, +              const char *volume, fd_t *fd, int32_t cmd, struct gf_flock *flock, +              dict_t *xdata) +{ +        afr_private_t *priv = NULL; +        afr_local_t *local  = NULL; +        int ret = -1; +        int i = 0; +        int32_t call_count = 0; +        int32_t op_errno = 0; + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (this->private, out); + +        priv = this->private; + +        AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); +        local = frame->local; + +        ret = afr_local_init (local, priv, &op_errno); +        if (ret < 0) +                goto out; + +        call_count = local->call_count; + +        for (i = 0; i < priv->child_count; i++) { +                if (local->child_up[i]) { +                        STACK_WIND (frame, afr_finodelk_cbk, +                                    priv->children[i], +                                    priv->children[i]->fops->finodelk, +                                    volume, fd, cmd, flock, xdata); + +                        if (!--call_count) +                                break; +                } +        } + +        ret = 0; +out: +        if (ret < 0) +                AFR_STACK_UNWIND (finodelk, frame, -1, op_errno, NULL); +        return 0; +} + + +int32_t +afr_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                 int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ +        afr_local_t *local = NULL; +        int call_count = -1; + +        local = frame->local; + +        LOCK (&frame->lock); +        { +                if (op_ret == 0) +                        local->op_ret = 0; + +                local->op_errno = op_errno; +        } +        UNLOCK (&frame->lock); + +       
 call_count = afr_frame_return (frame); + +        if (call_count == 0) +                AFR_STACK_UNWIND (entrylk, frame, local->op_ret, +                                  local->op_errno, xdata); + +        return 0; +} + + +int32_t +afr_entrylk (call_frame_t *frame, xlator_t *this, +             const char *volume, loc_t *loc, +             const char *basename, entrylk_cmd cmd, entrylk_type type, +             dict_t *xdata) +{ +        afr_private_t *priv = NULL; +        afr_local_t *local  = NULL; +        int ret = -1; +        int i = 0; +        int32_t call_count = 0; +        int32_t op_errno = 0; + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (this->private, out); + +        priv = this->private; + +        AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); +        local = frame->local; + +        ret = afr_local_init (local, priv, &op_errno); +        if (ret < 0) +                goto out; + +        call_count = local->call_count; + +        for (i = 0; i < priv->child_count; i++) { +                if (local->child_up[i]) { +                        STACK_WIND (frame, afr_entrylk_cbk, +                                    priv->children[i], +                                    priv->children[i]->fops->entrylk, +                                    volume, loc, basename, cmd, type, xdata); + +                        if (!--call_count) +                                break; +                } +        } + +        ret = 0; +out: +        if (ret < 0) +                AFR_STACK_UNWIND (entrylk, frame, -1, op_errno, NULL); +        return 0; +} + + + +int32_t +afr_fentrylk_cbk (call_frame_t *frame, void *cookie, +                  xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) + +{ +        afr_local_t *local = NULL; +        int call_count = -1; + +        local = frame->local; + +        LOCK (&frame->lock); +        { +                if (op_ret == 0) +                        
local->op_ret = 0; + +                local->op_errno = op_errno; +        } +        UNLOCK (&frame->lock); + +        call_count = afr_frame_return (frame); + +        if (call_count == 0) +                AFR_STACK_UNWIND (fentrylk, frame, local->op_ret, +                                  local->op_errno, xdata); + +        return 0; +} + + +int32_t +afr_fentrylk (call_frame_t *frame, xlator_t *this, +              const char *volume, fd_t *fd, +              const char *basename, entrylk_cmd cmd, +              entrylk_type type, dict_t *xdata) +{ +        afr_private_t *priv = NULL; +        afr_local_t *local  = NULL; +        int ret = -1; +        int i = 0; +        int32_t call_count = 0; +        int32_t op_errno = 0; + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (this->private, out); + +        priv = this->private; + +        AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); +        local = frame->local; + +        ret = afr_local_init (local, priv, &op_errno); +        if (ret < 0) +                goto out; + +        call_count = local->call_count; + +        for (i = 0; i < priv->child_count; i++) { +                if (local->child_up[i]) { +                        STACK_WIND (frame, afr_fentrylk_cbk, +                                    priv->children[i], +                                    priv->children[i]->fops->fentrylk, +                                    volume, fd, basename, cmd, type, xdata); + +                        if (!--call_count) +                                break; +                } +        } + +        ret = 0; +out: +        if (ret < 0) +                AFR_STACK_UNWIND (fentrylk, frame, -1, op_errno, NULL); +        return 0; +} + +int32_t +afr_statfs_cbk (call_frame_t *frame, void *cookie, +                xlator_t *this, int32_t op_ret, int32_t op_errno, +                struct statvfs *statvfs, dict_t *xdata) +{ +        afr_local_t *local = NULL; +        int 
call_count = 0; + +        LOCK (&frame->lock); +        { +                local = frame->local; + +                if (op_ret == 0) { +                        local->op_ret   = op_ret; + +                        if (local->cont.statfs.buf_set) { +                                if (statvfs->f_bavail < local->cont.statfs.buf.f_bavail) +                                        local->cont.statfs.buf = *statvfs; +                        } else { +                                local->cont.statfs.buf = *statvfs; +                                local->cont.statfs.buf_set = 1; +                        } +                } + +                if (op_ret == -1) +                        local->op_errno = op_errno; + +        } +        UNLOCK (&frame->lock); + +        call_count = afr_frame_return (frame); + +        if (call_count == 0) +                AFR_STACK_UNWIND (statfs, frame, local->op_ret, local->op_errno, +                                  &local->cont.statfs.buf, xdata); + +        return 0; +} + + +int32_t +afr_statfs (call_frame_t *frame, xlator_t *this, +            loc_t *loc, dict_t *xdata) +{ +        afr_private_t *  priv        = NULL; +        int              child_count = 0; +        afr_local_t   *  local       = NULL; +        int              i           = 0; +        int              ret = -1; +        int              call_count = 0; +        int32_t          op_errno    = 0; + +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (this->private, out); +        VALIDATE_OR_GOTO (loc, out); + +        priv = this->private; +        child_count = priv->child_count; + +        AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); +        local = frame->local; + +        ret = afr_local_init (local, priv, &op_errno); +        if (ret < 0) +                goto out; + +        call_count = local->call_count; + +        for (i = 0; i < child_count; i++) { +                if (local->child_up[i]) { +                        STACK_WIND (frame, 
afr_statfs_cbk, +                                    priv->children[i], +                                    priv->children[i]->fops->statfs, +                                    loc, xdata); +                        if (!--call_count) +                                break; +                } +        } + +        ret = 0; +out: +        if (ret < 0) +                AFR_STACK_UNWIND (statfs, frame, -1, op_errno, NULL, NULL); +        return 0; +} + + +int32_t +afr_lk_unlock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                   int32_t op_ret, int32_t op_errno, struct gf_flock *lock, +                   dict_t *xdata) +{ +        afr_local_t * local = NULL; +        int call_count = -1; + +        local = frame->local; +        call_count = afr_frame_return (frame); + +        if (call_count == 0) +                AFR_STACK_UNWIND (lk, frame, local->op_ret, local->op_errno, +                                  lock, xdata); + +        return 0; +} + + +int32_t +afr_lk_unlock (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t   * local = NULL; +        afr_private_t * priv  = NULL; +        int i = 0; +        int call_count = 0; + +        local = frame->local; +        priv  = this->private; + +        call_count = afr_locked_nodes_count (local->cont.lk.locked_nodes, +                                             priv->child_count); + +        if (call_count == 0) { +                AFR_STACK_UNWIND (lk, frame, local->op_ret, local->op_errno, +                                  &local->cont.lk.ret_flock, NULL); +                return 0; +        } + +        local->call_count = call_count; + +        local->cont.lk.user_flock.l_type = F_UNLCK; + +        for (i = 0; i < priv->child_count; i++) { +                if (local->cont.lk.locked_nodes[i]) { +                        STACK_WIND (frame, afr_lk_unlock_cbk, +                                    priv->children[i], +                                    priv->children[i]->fops->lk, +  
                                  local->fd, F_SETLK, +                                    &local->cont.lk.user_flock, NULL); + +                        if (!--call_count) +                                break; +                } +        } + +        return 0; +} + + +int32_t +afr_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +            int32_t op_ret, int32_t op_errno, struct gf_flock *lock, dict_t *xdata) +{ +        afr_local_t *local = NULL; +        afr_private_t *priv = NULL; +        int child_index = -1; +/*        int            ret  = 0; */ + + +        local = frame->local; +        priv  = this->private; + +        child_index = (long) cookie; + +        if (!child_went_down (op_ret, op_errno) && (op_ret == -1)) { +                local->op_ret   = -1; +                local->op_errno = op_errno; + +                afr_lk_unlock (frame, this); +                return 0; +        } + +        if (op_ret == 0) { +                local->op_ret        = 0; +                local->op_errno      = 0; +                local->cont.lk.locked_nodes[child_index] = 1; +                local->cont.lk.ret_flock = *lock; +        } + +        child_index++; + +        if (child_index < priv->child_count) { +                STACK_WIND_COOKIE (frame, afr_lk_cbk, (void *) (long) child_index, +                                   priv->children[child_index], +                                   priv->children[child_index]->fops->lk, +                                   local->fd, local->cont.lk.cmd, +                                   &local->cont.lk.user_flock, xdata); +        } else if (local->op_ret == -1) { +                /* all nodes have gone down */ + +                AFR_STACK_UNWIND (lk, frame, -1, ENOTCONN, +                                  &local->cont.lk.ret_flock, NULL); +        } else { +                /* locking has succeeded on all nodes that are up */ + +                /* temporarily +                   ret = afr_mark_locked_nodes 
(this, local->fd, +                   local->cont.lk.locked_nodes); +                   if (ret) +                   gf_log (this->name, GF_LOG_DEBUG, +                   "Could not save locked nodes info in fdctx"); + +                   ret = afr_save_locked_fd (this, local->fd); +                   if (ret) +                   gf_log (this->name, GF_LOG_DEBUG, +                   "Could not save locked fd"); + +                */ +                AFR_STACK_UNWIND (lk, frame, local->op_ret, local->op_errno, +                                  &local->cont.lk.ret_flock, NULL); +        } + +        return 0; +} + + +int +afr_lk (call_frame_t *frame, xlator_t *this, +        fd_t *fd, int32_t cmd, struct gf_flock *flock, dict_t *xdata) +{ +        afr_private_t *priv = NULL; +        afr_local_t *local = NULL; +        int i = 0; +        int32_t op_errno = 0; +        int     ret      = -1; + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (this->private, out); + +        priv = this->private; + +        AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); +        local = frame->local; + +        ret = afr_local_init (local, priv, &op_errno); +        if (ret < 0) +                goto out; + +        local->cont.lk.locked_nodes = GF_CALLOC (priv->child_count, +                                                 sizeof (*local->cont.lk.locked_nodes), +                                                 gf_afr_mt_char); + +        if (!local->cont.lk.locked_nodes) { +                op_errno = ENOMEM; +                goto out; +        } + +        local->fd            = fd_ref (fd); +        local->cont.lk.cmd   = cmd; +        local->cont.lk.user_flock = *flock; +        local->cont.lk.ret_flock = *flock; + +        STACK_WIND_COOKIE (frame, afr_lk_cbk, (void *) (long) 0, +                           priv->children[i], +                           priv->children[i]->fops->lk, +                           fd, cmd, flock, 
xdata); + +        ret = 0; +out: +        if (ret < 0) +                AFR_STACK_UNWIND (lk, frame, -1, op_errno, NULL, NULL); +        return 0; +} + +int +afr_forget (xlator_t *this, inode_t *inode) +{ +        uint64_t        ctx_addr = 0; +        afr_inode_ctx_t *ctx     = NULL; + +        inode_ctx_get (inode, this, &ctx_addr); + +        if (!ctx_addr) +                goto out; + +        ctx = (afr_inode_ctx_t *)(long)ctx_addr; +        GF_FREE (ctx->fresh_children); +        GF_FREE (ctx); +out: +        return 0; +} + +int +afr_priv_dump (xlator_t *this) +{ +        afr_private_t *priv = NULL; +        char  key_prefix[GF_DUMP_MAX_BUF_LEN]; +        char  key[GF_DUMP_MAX_BUF_LEN]; +        int   i = 0; + + +        GF_ASSERT (this); +        priv = this->private; + +        GF_ASSERT (priv); +        snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.%s", this->type, this->name); +        gf_proc_dump_add_section(key_prefix); +        gf_proc_dump_write("child_count", "%u", priv->child_count); +        gf_proc_dump_write("read_child_rr", "%u", priv->read_child_rr); +        for (i = 0; i < priv->child_count; i++) { +                sprintf (key, "child_up[%d]", i); +                gf_proc_dump_write(key, "%d", priv->child_up[i]); +                sprintf (key, "pending_key[%d]", i); +                gf_proc_dump_write(key, "%s", priv->pending_key[i]); +        } +        gf_proc_dump_write("data_self_heal", "%s", priv->data_self_heal); +        gf_proc_dump_write("metadata_self_heal", "%d", priv->metadata_self_heal); +        gf_proc_dump_write("entry_self_heal", "%d", priv->entry_self_heal); +        gf_proc_dump_write("data_change_log", "%d", priv->data_change_log); +        gf_proc_dump_write("metadata_change_log", "%d", priv->metadata_change_log); +        gf_proc_dump_write("entry-change_log", "%d", priv->entry_change_log); +        gf_proc_dump_write("read_child", "%d", priv->read_child); +        gf_proc_dump_write("favorite_child", "%d", 
priv->favorite_child); +        gf_proc_dump_write("wait_count", "%u", priv->wait_count); + +        return 0; +} + + +/** + * find_child_index - find the child's index in the array of subvolumes + * @this: AFR + * @child: child + */ + +static int +find_child_index (xlator_t *this, xlator_t *child) +{ +        afr_private_t *priv = NULL; +        int i = -1; + +        priv = this->private; + +        for (i = 0; i < priv->child_count; i++) { +                if ((xlator_t *) child == priv->children[i]) +                        break; +        } + +        return i; +} + +int32_t +afr_notify (xlator_t *this, int32_t event, +            void *data, void *data2) +{ +        afr_private_t   *priv               = NULL; +        int             i                   = -1; +        int             up_children         = 0; +        int             down_children       = 0; +        int             propagate           = 0; +        int             had_heard_from_all  = 0; +        int             have_heard_from_all = 0; +        int             idx                 = -1; +        int             ret                 = -1; +        int             call_psh            = 0; +        int             up_child            = AFR_ALL_CHILDREN; +        dict_t          *input              = NULL; +        dict_t          *output             = NULL; + +        priv = this->private; + +        if (!priv) +                return 0; + +        /* +         * We need to reset this in case children come up in "staggered" +         * fashion, so that we discover a late-arriving local subvolume.  Note +         * that we could end up issuing N lookups to the first subvolume, and +         * O(N^2) overall, but N is small for AFR so it shouldn't be an issue. 
+         */ +        priv->did_discovery = _gf_false; + +        had_heard_from_all = 1; +        for (i = 0; i < priv->child_count; i++) { +                if (!priv->last_event[i]) { +                        had_heard_from_all = 0; +                } +        } + +        /* parent xlators dont need to know about every child_up, child_down +         * because of afr ha. If all subvolumes go down, child_down has +         * to be triggered. In that state when 1 subvolume comes up child_up +         * needs to be triggered. dht optimizes revalidate lookup by sending +         * it only to one of its subvolumes. When child up/down happens +         * for afr's subvolumes dht should be notified by child_modified. The +         * subsequent revalidate lookup happens on all the dht's subvolumes +         * which triggers afr self-heals if any. +         */ +        idx = find_child_index (this, data); +        if (idx < 0) { +                gf_log (this->name, GF_LOG_ERROR, "Received child_up " +                        "from invalid subvolume"); +                goto out; +        } + +        switch (event) { +        case GF_EVENT_CHILD_UP: +                LOCK (&priv->lock); +                { +                        /* +                         * This only really counts if the child was never up +                         * (value = -1) or had been down (value = 0).  See +                         * comment at GF_EVENT_CHILD_DOWN for a more detailed +                         * explanation. 
+                         */ +                        if (priv->child_up[idx] != 1) { +                                priv->up_count++; +                        } +                        priv->child_up[idx] = 1; + +                        call_psh = 1; +                        up_child = idx; +                        for (i = 0; i < priv->child_count; i++) +                                if (priv->child_up[i] == 1) +                                        up_children++; +                        if (up_children == 1) { +                                gf_log (this->name, GF_LOG_INFO, +                                        "Subvolume '%s' came back up; " +                                        "going online.", ((xlator_t *)data)->name); +                        } else { +                                event = GF_EVENT_CHILD_MODIFIED; +                        } + +                        priv->last_event[idx] = event; +                } +                UNLOCK (&priv->lock); + +                break; + +        case GF_EVENT_CHILD_DOWN: +                LOCK (&priv->lock); +                { +                        /* +                         * If a brick is down when we start, we'll get a +                         * CHILD_DOWN to indicate its initial state.  There +                         * was never a CHILD_UP in this case, so if we +                         * increment "down_count" the difference between than +                         * and "up_count" will no longer be the number of +                         * children that are currently up.  This has serious +                         * implications e.g. for quorum enforcement, so we +                         * don't increment these values unless the event +                         * represents an actual state transition between "up" +                         * (value = 1) and anything else. 
+                         */ +                        if (priv->child_up[idx] == 1) { +                                priv->down_count++; +                        } +                        priv->child_up[idx] = 0; + +                        for (i = 0; i < priv->child_count; i++) +                                if (priv->child_up[i] == 0) +                                        down_children++; +                        if (down_children == priv->child_count) { +                                gf_log (this->name, GF_LOG_ERROR, +                                        "All subvolumes are down. Going offline " +                                        "until atleast one of them comes back up."); +                        } else { +                                event = GF_EVENT_CHILD_MODIFIED; +                        } + +                        priv->last_event[idx] = event; +                } +                UNLOCK (&priv->lock); + +                break; + +        case GF_EVENT_CHILD_CONNECTING: +                LOCK (&priv->lock); +                { +                        priv->last_event[idx] = event; +                } +                UNLOCK (&priv->lock); + +                break; + +        case GF_EVENT_TRANSLATOR_OP: +                input = data; +                output = data2; +                if (!had_heard_from_all) { +                        ret = -1; +                        goto out; +                } +                ret = afr_xl_op (this, input, output); +                goto out; +                break; + +        default: +                propagate = 1; +                break; +        } + +        /* have all subvolumes reported status once by now? 
/*
 * Returns the index of the first non-zero entry in @child_up (an array of
 * @child_count flags), or -1 when no entry is set.
 */
int
afr_first_up_child (unsigned char *child_up, size_t child_count)
{
        int idx = 0;

        GF_ASSERT (child_up);

        while (idx < child_count) {
                if (child_up[idx])
                        return idx;
                idx++;
        }

        return -1;
}
size_t child_count) +{ +        int         ret      = -1; +        int         i        = 0; + +        GF_ASSERT (child_up); + +        for (i = 0; i < child_count; i++) { +                if (child_up[i]) { +                        ret = i; +                        break; +                } +        } + +        return ret; +} + +int +afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno) +{ +        int     ret = -1; + +        local->op_ret = -1; +        local->op_errno = EUCLEAN; + +        local->child_up = GF_CALLOC (priv->child_count, +                                     sizeof (*local->child_up), +                                     gf_afr_mt_char); +        if (!local->child_up) { +                if (op_errno) +                        *op_errno = ENOMEM; +                goto out; +        } + +        memcpy (local->child_up, priv->child_up, +                sizeof (*local->child_up) * priv->child_count); +        local->call_count = afr_up_children_count (local->child_up, +                                                   priv->child_count); +        if (local->call_count == 0) { +                gf_log (THIS->name, GF_LOG_INFO, "no subvolumes up"); +                if (op_errno) +                        *op_errno = ENOTCONN; +                goto out; +        } + +        local->child_errno = GF_CALLOC (priv->child_count, +                                        sizeof (*local->child_errno), +                                        gf_afr_mt_int32_t); +        if (!local->child_errno) { +                if (op_errno) +                        *op_errno = ENOMEM; +                goto out; +        } + +        local->transaction.postop_piggybacked = GF_CALLOC (priv->child_count, +							   sizeof (int), +							   gf_afr_mt_int32_t); +        if (!local->transaction.postop_piggybacked) { +                if (op_errno) +                        *op_errno = ENOMEM; +                goto out; +        } + +	local->append_write = 
_gf_false; + +        ret = 0; +out: +        return ret; +} + +int +afr_internal_lock_init (afr_internal_lock_t *lk, size_t child_count, +                        transaction_lk_type_t lk_type) +{ +        int             ret = -ENOMEM; + +        lk->locked_nodes = GF_CALLOC (sizeof (*lk->locked_nodes), +                                      child_count, gf_afr_mt_char); +        if (NULL == lk->locked_nodes) +                goto out; + +        lk->lower_locked_nodes = GF_CALLOC (sizeof (*lk->lower_locked_nodes), +                                            child_count, gf_afr_mt_char); +        if (NULL == lk->lower_locked_nodes) +                goto out; + +        lk->lock_op_ret   = -1; +        lk->lock_op_errno = EUCLEAN; +        lk->transaction_lk_type = lk_type; + +        ret = 0; +out: +        return ret; +} + +void +afr_matrix_cleanup (int32_t **matrix, unsigned int m) +{ +        int             i         = 0; + +        if (!matrix) +                goto out; +        for (i = 0; i < m; i++) { +                GF_FREE (matrix[i]); +        } + +        GF_FREE (matrix); +out: +        return; +} + +int32_t** +afr_matrix_create (unsigned int m, unsigned int n) +{ +        int32_t         **matrix = NULL; +        int             i       = 0; + +        matrix = GF_CALLOC (sizeof (*matrix), m, gf_afr_mt_int32_t); +        if (!matrix) +                goto out; + +        for (i = 0; i < m; i++) { +                matrix[i] = GF_CALLOC (sizeof (*matrix[i]), n, +                                       gf_afr_mt_int32_t); +                if (!matrix[i]) +                        goto out; +        } +        return matrix; +out: +        afr_matrix_cleanup (matrix, m); +        return NULL; +} + +int +afr_inodelk_init (afr_inodelk_t *lk, char *dom, size_t child_count) +{ +        int             ret = -ENOMEM; + +        lk->domain = dom; +        lk->locked_nodes = GF_CALLOC (sizeof (*lk->locked_nodes), +                                      
child_count, gf_afr_mt_char); +        if (NULL == lk->locked_nodes) +                goto out; +        ret = 0; +out: +        return ret; +} + +int +afr_transaction_local_init (afr_local_t *local, xlator_t *this) +{ +        int            child_up_count = 0; +        int            ret = -ENOMEM; +        afr_private_t *priv = NULL; + +        priv = this->private; +        ret = afr_internal_lock_init (&local->internal_lock, priv->child_count, +                                      AFR_TRANSACTION_LK); +        if (ret < 0) +                goto out; + +        if ((local->transaction.type == AFR_DATA_TRANSACTION) || +            (local->transaction.type == AFR_METADATA_TRANSACTION)) { +                ret = afr_inodelk_init (&local->internal_lock.inodelk[0], +                                        this->name, priv->child_count); +                if (ret < 0) +                        goto out; +        } + +        ret = -ENOMEM; +        child_up_count = afr_up_children_count (local->child_up, +                                                priv->child_count); +        if (priv->optimistic_change_log && child_up_count == priv->child_count) +                local->optimistic_change_log = 1; + +        local->first_up_child = afr_first_up_child (local->child_up, +                                                    priv->child_count); + +        local->transaction.eager_lock = +                GF_CALLOC (sizeof (*local->transaction.eager_lock), +                           priv->child_count, +                           gf_afr_mt_int32_t); + +        if (!local->transaction.eager_lock) +                goto out; + +        local->fresh_children = afr_children_create (priv->child_count); +        if (!local->fresh_children) +                goto out; + +        local->transaction.pre_op = GF_CALLOC (sizeof (*local->transaction.pre_op), +                                               priv->child_count, +                                               
gf_afr_mt_char); +        if (!local->transaction.pre_op) +                goto out; + +        local->pending = afr_matrix_create (priv->child_count, +                                            AFR_NUM_CHANGE_LOGS); +        if (!local->pending) +                goto out; + +        local->transaction.txn_changelog = afr_matrix_create (priv->child_count, +                                                           AFR_NUM_CHANGE_LOGS); +        if (!local->transaction.txn_changelog) +                goto out; + +	INIT_LIST_HEAD (&local->transaction.eager_locked); + +        ret = 0; +out: +        return ret; +} + +void +afr_reset_children (int32_t *fresh_children, int32_t child_count) +{ +        unsigned int i = 0; +        for (i = 0; i < child_count; i++) +                fresh_children[i] = -1; +} + +int32_t* +afr_children_create (int32_t child_count) +{ +        int32_t           *children = NULL; +        int               i               = 0; + +        GF_ASSERT (child_count > 0); + +        children = GF_CALLOC (child_count, sizeof (*children), +                              gf_afr_mt_int32_t); +        if (NULL == children) +                goto out; +        for (i = 0; i < child_count; i++) +                children[i] = -1; +out: +        return children; +} + +void +afr_children_add_child (int32_t *children, int32_t child, +                        int32_t child_count) +{ +        gf_boolean_t child_found = _gf_false; +        int          i               = 0; + +        for (i = 0; i < child_count; i++) { +                if (children[i] == -1) +                        break; +                if (children[i] == child) { +                        child_found = _gf_true; +                        break; +                } +        } + +        if (!child_found) { +                GF_ASSERT (i < child_count); +                children[i] = child; +        } +} + +void +afr_children_rm_child (int32_t *children, int32_t child, int32_t child_count) +{ +     
   int          i = 0; + +        GF_ASSERT ((child >= 0) && (child < child_count)); +        for (i = 0; i < child_count; i++) { +                if (children[i] == -1) +                        break; +                if (children[i] == child) { +                        if (i != (child_count - 1)) +                                memmove (children + i, children + i + 1, +                                         sizeof (*children)*(child_count - i - 1)); +                        children[child_count - 1] = -1; +                        break; +                } +        } +} + +int +afr_get_children_count (int32_t *children, unsigned int child_count) +{ +        int count = 0; +        int i = 0; + +        for (i = 0; i < child_count; i++) { +                if (children[i] == -1) +                        break; +                count++; +        } +        return count; +} + +void +afr_set_low_priority (call_frame_t *frame) +{ +        frame->root->pid = LOW_PRIO_PROC_PID; +} + +int +afr_child_fd_ctx_set (xlator_t *this, fd_t *fd, int32_t child, +                      int flags) +{ +        int             ret = 0; +        uint64_t        ctx = 0; +        afr_fd_ctx_t    *fd_ctx      = NULL; + +        GF_ASSERT (fd && fd->inode); +        ret = afr_fd_ctx_set (this, fd); +        if (ret < 0) { +                gf_log (this->name, GF_LOG_ERROR, +                        "could not set fd ctx for fd=%p", fd); +                goto out; +        } + +        ret = fd_ctx_get (fd, this, &ctx); +        if (ret < 0) { +                gf_log (this->name, GF_LOG_ERROR, +                        "could not get fd ctx for fd=%p", fd); +                goto out; +        } + +        fd_ctx = (afr_fd_ctx_t *)(long) ctx; +        fd_ctx->opened_on[child] = AFR_FD_OPENED; +        if (!IA_ISDIR (fd->inode->ia_type)) { +                fd_ctx->flags            = flags; +        } +        ret = 0; +out: +        return ret; +} + +gf_boolean_t +afr_have_quorum (char 
*logname, afr_private_t *priv) +{ +        unsigned int        quorum = 0; + +        GF_VALIDATE_OR_GOTO(logname,priv,out); + +        quorum = priv->quorum_count; +        if (quorum != AFR_QUORUM_AUTO) { +                return (priv->up_count >= (priv->down_count + quorum)); +        } + +        quorum = priv->child_count / 2 + 1; +        if (priv->up_count >= (priv->down_count + quorum)) { +                return _gf_true; +        } + +        /* +         * Special case for even numbers of nodes: if we have exactly half +         * and that includes the first ("senior-most") node, then that counts +         * as quorum even if it wouldn't otherwise.  This supports e.g. N=2 +         * while preserving the critical property that there can only be one +         * such group. +         */ +        if ((priv->child_count % 2) == 0) { +                quorum = priv->child_count / 2; +                if (priv->up_count >= (priv->down_count + quorum)) { +                        if (priv->child_up[0]) { +                                return _gf_true; +                        } +                } +        } + +out: +        return _gf_false; +} + +void +afr_priv_destroy (afr_private_t *priv) +{ +        int            i           = 0; + +        if (!priv) +                goto out; +        inode_unref (priv->root_inode); +        GF_FREE (priv->shd.pos); +        GF_FREE (priv->shd.pending); +        GF_FREE (priv->shd.inprogress); +//        for (i = 0; i < priv->child_count; i++) +//                if (priv->shd.timer && priv->shd.timer[i]) +//                        gf_timer_call_cancel (this->ctx, priv->shd.timer[i]); +        GF_FREE (priv->shd.timer); + +        if (priv->shd.healed) +                eh_destroy (priv->shd.healed); + +        if (priv->shd.heal_failed) +                eh_destroy (priv->shd.heal_failed); + +        if (priv->shd.split_brain) +                eh_destroy (priv->shd.split_brain); + +        for (i = 0; i < priv->child_count; 
i++) +        { +                if (priv->shd.statistics[i]) +                        eh_destroy (priv->shd.statistics[i]); +        } + +        GF_FREE (priv->shd.statistics); + +        GF_FREE (priv->shd.crawl_events); + +        GF_FREE (priv->last_event); +        if (priv->pending_key) { +                for (i = 0; i < priv->child_count; i++) +                        GF_FREE (priv->pending_key[i]); +        } +        GF_FREE (priv->pending_key); +        GF_FREE (priv->children); +        GF_FREE (priv->child_up); +        LOCK_DESTROY (&priv->lock); +        LOCK_DESTROY (&priv->read_child_lock); +        pthread_mutex_destroy (&priv->mutex); +        GF_FREE (priv); +out: +        return; +} + +int +xlator_subvolume_count (xlator_t *this) +{ +        int i = 0; +        xlator_list_t *list = NULL; + +        for (list = this->children; list; list = list->next) +                i++; +        return i; +} + +inline gf_boolean_t +afr_is_errno_set (int *child_errno, int child) +{ +        return child_errno[child]; +} + +inline gf_boolean_t +afr_is_errno_unset (int *child_errno, int child) +{ +        return !afr_is_errno_set (child_errno, child); +} + +void +afr_prepare_new_entry_pending_matrix (int32_t **pending, +                                      gf_boolean_t (*is_pending) (int *, int), +                                      int *ctx, struct iatt *buf, +                                      unsigned int child_count) +{ +        int midx = 0; +        int idx  = 0; +        int i    = 0; + +        midx = afr_index_for_transaction_type (AFR_METADATA_TRANSACTION); +        if (IA_ISDIR (buf->ia_type)) +                idx = afr_index_for_transaction_type (AFR_ENTRY_TRANSACTION); +        else if (IA_ISREG (buf->ia_type)) +                idx = afr_index_for_transaction_type (AFR_DATA_TRANSACTION); +        else +                idx = -1; +        for (i = 0; i < child_count; i++) { +                if (is_pending (ctx, i)) { +                        
pending[i][midx] = hton32 (1); +                        if (idx == -1) +                                continue; +                        pending[i][idx] = hton32 (1); +                } +        } +} + +gf_boolean_t +afr_is_fd_fixable (fd_t *fd) +{ +        if (!fd || !fd->inode) +                return _gf_false; +        else if (fd_is_anonymous (fd)) +                return _gf_false; +        else if (uuid_is_null (fd->inode->gfid)) +                return _gf_false; + +        return _gf_true; +} + +void +afr_handle_open_fd_count (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t     *local = NULL; +        inode_t         *inode = NULL; +        afr_inode_ctx_t *ctx   = NULL; + +        local = frame->local; + +        if (local->fd) +                inode = local->fd->inode; +        else +                inode = local->loc.inode; + +        if (!inode) +                return; + +        LOCK (&inode->lock); +        { +                ctx = __afr_inode_ctx_get (inode, this); +                ctx->open_fd_count = local->open_fd_count; +        } +        UNLOCK (&inode->lock); +} + +int +afr_initialise_statistics (xlator_t *this) +{ +        afr_private_t       *priv = NULL; +        int                 ret = -1; +        int                 i = 0; +        int                 child_count = 0; +        eh_t                *stats_per_brick = NULL; +        shd_crawl_event_t   ***shd_crawl_events = NULL; +        priv = this->private; + +        priv->shd.statistics = GF_CALLOC (sizeof(eh_t *), priv->child_count, +                                          gf_common_mt_eh_t); +        if (!priv->shd.statistics) { +                ret = -1; +                goto out; +        } +        child_count = priv->child_count; +        for (i=0; i < child_count ; i++) { +                stats_per_brick = eh_new (AFR_STATISTICS_HISTORY_SIZE, +                                          _gf_false, +                                          
_destroy_crawl_event_data); +                if (!stats_per_brick) { +                        ret = -1; +                        goto out; +                } +                priv->shd.statistics[i] = stats_per_brick; + +        } + +        shd_crawl_events = (shd_crawl_event_t***)(&priv->shd.crawl_events); +        *shd_crawl_events  = GF_CALLOC (sizeof(shd_crawl_event_t*), +                                        priv->child_count, +                                        gf_afr_mt_shd_crawl_event_t); + +        if (!priv->shd.crawl_events) { +                ret = -1; +                goto out; +        } +        ret = 0; +out: +        return ret; + +} diff --git a/xlators/cluster/afr-v1/src/afr-dir-read.c b/xlators/cluster/afr-v1/src/afr-dir-read.c new file mode 100644 index 000000000..689dd84e6 --- /dev/null +++ b/xlators/cluster/afr-v1/src/afr-dir-read.c @@ -0,0 +1,545 @@ +/* +  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> +  This file is part of GlusterFS. + +  This file is licensed to you under your choice of the GNU Lesser +  General Public License, version 3 or any later version (LGPLv3 or +  later), or the GNU General Public License, version 2 (GPLv2), in all +  cases as published by the Free Software Foundation. 
+*/ + + +#include <libgen.h> +#include <unistd.h> +#include <fnmatch.h> +#include <sys/time.h> +#include <stdlib.h> +#include <signal.h> +#include <string.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "dict.h" +#include "xlator.h" +#include "hashfn.h" +#include "logging.h" +#include "stack.h" +#include "list.h" +#include "call-stub.h" +#include "defaults.h" +#include "common-utils.h" +#include "compat-errno.h" +#include "compat.h" +#include "checksum.h" + +#include "afr.h" +#include "afr-self-heal.h" +#include "afr-self-heal-common.h" + +int +afr_examine_dir_sh_unwind (call_frame_t *frame, xlator_t *this, int32_t op_ret, +                           int32_t op_errno, int32_t sh_failed) +{ +        afr_local_t *local  = NULL; + +        local = frame->local; + +        afr_set_opendir_done (this, local->fd->inode); + +        AFR_STACK_UNWIND (opendir, frame, local->op_ret, +                          local->op_errno, local->fd, NULL); + +        return 0; +} + + +gf_boolean_t +__checksums_differ (uint32_t *checksum, int child_count, +                    unsigned char *child_up) +{ +        int          ret            = _gf_false; +        int          i              = 0; +        uint32_t     cksum          = 0; +        gf_boolean_t activate_check = _gf_false; + +        for (i = 0; i < child_count; i++) { +                if (!child_up[i]) +                        continue; +                if (_gf_false == activate_check) { +                        cksum          = checksum[i]; +                        activate_check = _gf_true; +                        continue; +                } + +                if (cksum != checksum[i]) { +                        ret = _gf_true; +                        break; +                } + +                cksum = checksum[i]; +        } + +        return ret; +} + + +int32_t +afr_examine_dir_readdir_cbk (call_frame_t *frame, void *cookie, +                             
xlator_t *this, int32_t op_ret, int32_t op_errno, +                             gf_dirent_t *entries, dict_t *xdata) +{ +        afr_private_t *   priv        = NULL; +        afr_local_t *     local       = NULL; +        afr_self_heal_t * sh          = NULL; +        gf_dirent_t *     entry       = NULL; +        gf_dirent_t *     tmp         = NULL; +        char              *reason     = NULL; +        int               child_index = 0; +        uint32_t          entry_cksum = 0; +        int               call_count  = 0; +        off_t             last_offset = 0; +        inode_t           *inode      = NULL; + +        priv  = this->private; +        local = frame->local; +        sh    = &local->self_heal; +        inode = local->fd->inode; + +        child_index = (long) cookie; + +        if (op_ret == -1) { +                gf_log (this->name, GF_LOG_INFO, +                        "%s: failed to do opendir on %s", +                        local->loc.path, priv->children[child_index]->name); +                local->op_ret = -1; +                local->op_ret = op_errno; +                goto out; +        } + +        if (op_ret == 0) { +                gf_log (this->name, GF_LOG_DEBUG, +                        "%s: no entries found in %s", +                        local->loc.path, priv->children[child_index]->name); +                goto out; +        } + +        list_for_each_entry_safe (entry, tmp, &entries->list, list) { +                entry_cksum = gf_rsync_weak_checksum ((unsigned char *)entry->d_name, +                                                      strlen (entry->d_name)); +                local->cont.opendir.checksum[child_index] ^= entry_cksum; +        } + +        list_for_each_entry (entry, &entries->list, list) { +                last_offset = entry->d_off; +        } + +        /* read more entries */ + +        STACK_WIND_COOKIE (frame, afr_examine_dir_readdir_cbk, +                           (void *) (long) child_index, +       
                    priv->children[child_index], +                           priv->children[child_index]->fops->readdir, +                           local->fd, 131072, last_offset, NULL); + +        return 0; + +out: +        call_count = afr_frame_return (frame); + +        if (call_count == 0) { +                if (__checksums_differ (local->cont.opendir.checksum, +                                        priv->child_count, +                                        local->child_up)) { + +                        sh->do_entry_self_heal  = _gf_true; +                        sh->forced_merge          = _gf_true; + +                        reason = "checksums of directory differ"; +                        afr_launch_self_heal (frame, this, inode, _gf_false, +                                              inode->ia_type, reason, NULL, +                                              afr_examine_dir_sh_unwind); +                } else { +                        afr_set_opendir_done (this, inode); + +                        AFR_STACK_UNWIND (opendir, frame, local->op_ret, +                                          local->op_errno, local->fd, NULL); +                } +        } + +        return 0; +} + + +int +afr_examine_dir (call_frame_t *frame, xlator_t *this) +{ +        afr_private_t * priv       = NULL; +        afr_local_t *   local      = NULL; +        int             i          = 0; +        int             call_count = 0; + +        local = frame->local; +        priv  = this->private; + +        local->cont.opendir.checksum = GF_CALLOC (priv->child_count, +                                                  sizeof (*local->cont.opendir.checksum), +                                                  gf_afr_mt_int32_t); + +        call_count = afr_up_children_count (local->child_up, priv->child_count); + +        local->call_count = call_count; + +        for (i = 0; i < priv->child_count; i++) { +                if (local->child_up[i]) { +                        
STACK_WIND_COOKIE (frame, afr_examine_dir_readdir_cbk, +                                           (void *) (long) i, +                                           priv->children[i], +                                           priv->children[i]->fops->readdir, +                                           local->fd, 131072, 0, NULL); + +                        if (!--call_count) +                                break; +                } +        } + +        return 0; +} + + +int32_t +afr_opendir_cbk (call_frame_t *frame, void *cookie, +                 xlator_t *this, int32_t op_ret, int32_t op_errno, +                 fd_t *fd, dict_t *xdata) +{ +        afr_private_t *priv              = NULL; +        afr_local_t   *local             = NULL; +        int32_t        up_children_count = 0; +        int            ret               = -1; +        int            call_count        = -1; +        int32_t        child_index       = 0; + +        priv  = this->private; +        local = frame->local; +        child_index = (long) cookie; + +        up_children_count = afr_up_children_count (local->child_up, +                                                   priv->child_count); + +        LOCK (&frame->lock); +        { +                if (op_ret >= 0) { +                        local->op_ret = op_ret; +                        ret = afr_child_fd_ctx_set (this, fd, child_index, 0); +                        if (ret) { +                                local->op_ret = -1; +                                local->op_errno = -ret; +                                goto unlock; +                        } +                } + +                local->op_errno = op_errno; +        } +unlock: +        UNLOCK (&frame->lock); + +        call_count = afr_frame_return (frame); + +        if (call_count == 0) { +                if (local->op_ret != 0) +                        goto out; + +                if (!afr_is_opendir_done (this, local->fd->inode) && +                    
up_children_count > 1 && priv->entry_self_heal) { + +                        /* +                         * This is the first opendir on this inode. We need +                         * to check if the directory's entries are the same +                         * on all subvolumes. This is needed in addition +                         * to regular entry self-heal because the readdir +                         * call is sent only to the first subvolume, and +                         * thus files that exist only there will never be healed +                         * otherwise (assuming changelog shows no anomalies). +                         */ + +                        gf_log (this->name, GF_LOG_TRACE, +                                "reading contents of directory %s looking for mismatch", +                                local->loc.path); + +                        afr_examine_dir (frame, this); + +                } else { +                        /* do the unwind */ +                        goto out; +                } +        } + +        return 0; + +out: +        AFR_STACK_UNWIND (opendir, frame, local->op_ret, +                          local->op_errno, local->fd, NULL); + +        return 0; +} + + +int32_t +afr_opendir (call_frame_t *frame, xlator_t *this, +             loc_t *loc, fd_t *fd) +{ +        afr_private_t * priv        = NULL; +        afr_local_t   * local       = NULL; +        int             child_count = 0; +        int             i           = 0; +        int             ret         = -1; +        int             call_count  = -1; +        int32_t         op_errno    = 0; + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (this->private, out); + +        priv = this->private; + +        child_count = priv->child_count; + +        AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); +        local = frame->local; + +        ret = afr_local_init (local, priv, &op_errno); +        if (ret < 0) +   
             goto out; + +        loc_copy (&local->loc, loc); + +        local->fd    = fd_ref (fd); + +        call_count = local->call_count; + +        for (i = 0; i < child_count; i++) { +                if (local->child_up[i]) { +                        STACK_WIND_COOKIE (frame, afr_opendir_cbk, +                                           (void*) (long) i, +                                           priv->children[i], +                                           priv->children[i]->fops->opendir, +                                           loc, fd, NULL); + +                        if (!--call_count) +                                break; +                } +        } + +        ret = 0; +out: +        if (ret < 0) +                AFR_STACK_UNWIND (opendir, frame, -1, op_errno, fd, NULL); + +        return 0; +} + + +/** + * Common algorithm for directory read calls: + * + * - Try the fop on the first child that is up + * - if we have failed due to ENOTCONN: + *     try the next child + * + * Applicable to: readdir + */ + + +struct entry_name { +        char *name; +        struct list_head list; +}; + +static void +afr_forget_entries (fd_t *fd) +{ +        struct entry_name *entry  = NULL; +        struct entry_name *tmp    = NULL; +        int                ret    = 0; +        uint64_t           ctx    = 0; +        afr_fd_ctx_t      *fd_ctx = NULL; + +        ret = fd_ctx_get (fd, THIS, &ctx); +        if (ret < 0) { +                gf_log (THIS->name, GF_LOG_INFO, +                        "could not get fd ctx for fd=%p", fd); +                return; +        } + +        fd_ctx = (afr_fd_ctx_t *)(long) ctx; + +        list_for_each_entry_safe (entry, tmp, &fd_ctx->entries, list) { +                GF_FREE (entry->name); +                list_del (&entry->list); +                GF_FREE (entry); +        } +} + +static void +afr_readdir_filter_trash_dir (gf_dirent_t *entries, fd_t *fd) +{ +        gf_dirent_t *   entry       = NULL; +        
/**
 * afr_do_readdir - common implementation behind readdir and readdirp.
 *
 * Picks a single child to serve the request and winds either readdir or
 * readdirp to it, depending on @whichop.  The chosen child is recorded
 * in the fd context so that later calls on the same fd can be checked
 * against it.
 *
 * @frame:   call frame of the fop
 * @this:    the replicate translator instance
 * @fd:      directory fd being read
 * @size:    size passed through to the child
 * @offset:  directory offset passed through to the child
 * @whichop: GF_FOP_READDIR selects readdir; any other value selects
 *           readdirp
 * @dict:    optional dict passed through to the child (may be NULL)
 *
 * Always returns 0; failures are reported by unwinding with -1 and an
 * errno value.
 */
int32_t
afr_do_readdir (call_frame_t *frame, xlator_t *this,
                fd_t *fd, size_t size, off_t offset, int whichop, dict_t *dict)
{
        afr_private_t *priv      = NULL;
        xlator_t      **children = NULL;
        int           call_child = 0;
        afr_local_t   *local     = NULL;
        afr_fd_ctx_t  *fd_ctx    = NULL;
        int           ret        = -1;
        int32_t       op_errno   = 0;
        uint64_t      read_child = 0;

        VALIDATE_OR_GOTO (frame, out);
        VALIDATE_OR_GOTO (this, out);
        VALIDATE_OR_GOTO (this->private, out);

        priv     = this->private;
        children = priv->children;

        AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
        local = frame->local;

        ret = afr_local_init (local, priv, &op_errno);
        if (ret < 0)
                goto out;

        local->fresh_children = afr_children_create (priv->child_count);
        if (!local->fresh_children) {
                op_errno = ENOMEM;
                goto out;
        }

        /* choose the child to read from; a negative return from
         * afr_get_call_child is a negated errno */
        read_child = afr_inode_get_read_ctx (this, fd->inode,
                                             local->fresh_children);
        ret = afr_get_call_child (this, local->child_up, read_child,
                                  local->fresh_children,
                                  &call_child,
                                  &local->cont.readdir.last_index);
        if (ret < 0) {
                op_errno = -ret;
                goto out;
        }

        fd_ctx  = afr_fd_ctx_get (fd, this);
        if (!fd_ctx) {
                op_errno = EBADF;
                goto out;
        }

        /*
         * A read from offset 0, or an fd with no child recorded yet (-1),
         * adopts the child chosen above.  Otherwise, when readdir_failover
         * is disabled, continuing on a different child than the recorded
         * one is refused with EBADF.
         */
        if ((offset == 0) || (fd_ctx->call_child == -1)) {
                fd_ctx->call_child = call_child;
        } else if ((priv->readdir_failover == _gf_false) &&
                   (call_child != fd_ctx->call_child)) {
                op_errno = EBADF;
                goto out;
        }

        local->fd                  = fd_ref (fd);
        local->cont.readdir.size   = size;
        local->cont.readdir.dict   = (dict)? dict_ref (dict) : NULL;

        /* the cookie carries the index of the child being read from */
        if (whichop == GF_FOP_READDIR)
                STACK_WIND_COOKIE (frame, afr_readdir_cbk,
                                   (void *) (long) call_child,
                                   children[call_child],
                                   children[call_child]->fops->readdir, fd,
                                   size, offset, dict);
        else
                STACK_WIND_COOKIE (frame, afr_readdirp_cbk,
                                   (void *) (long) call_child,
                                   children[call_child],
                                   children[call_child]->fops->readdirp, fd,
                                   size, offset, dict);

        return 0;
out:
        /* NOTE(review): the failure is unwound as "readdir" even when
         * whichop selected readdirp — confirm this is intentional. */
        AFR_STACK_UNWIND (readdir, frame, -1, op_errno, NULL, NULL);
        return 0;
}
+ +  This file is licensed to you under your choice of the GNU Lesser +  General Public License, version 3 or any later version (LGPLv3 or +  later), or the GNU General Public License, version 2 (GPLv2), in all +  cases as published by the Free Software Foundation. +*/ + +#ifndef __DIR_READ_H__ +#define __DIR_READ_H__ + + +int32_t +afr_opendir (call_frame_t *frame, xlator_t *this, +	     loc_t *loc, fd_t *fd, dict_t *xdata); + +int32_t +afr_releasedir (xlator_t *this, fd_t *fd); + +int32_t +afr_readdir (call_frame_t *frame, xlator_t *this, +	     fd_t *fd, size_t size, off_t offset, dict_t *xdata); + + +int32_t +afr_readdirp (call_frame_t *frame, xlator_t *this, +              fd_t *fd, size_t size, off_t offset, dict_t *dict); + +int32_t +afr_checksum (call_frame_t *frame, xlator_t *this, +	      loc_t *loc, int32_t flags, dict_t *xdata); + + +#endif /* __DIR_READ_H__ */ diff --git a/xlators/cluster/afr-v1/src/afr-dir-write.c b/xlators/cluster/afr-v1/src/afr-dir-write.c new file mode 100644 index 000000000..1943b719b --- /dev/null +++ b/xlators/cluster/afr-v1/src/afr-dir-write.c @@ -0,0 +1,1962 @@ +/* +  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> +  This file is part of GlusterFS. + +  This file is licensed to you under your choice of the GNU Lesser +  General Public License, version 3 or any later version (LGPLv3 or +  later), or the GNU General Public License, version 2 (GPLv2), in all +  cases as published by the Free Software Foundation. 
+*/ + + +#include <libgen.h> +#include <unistd.h> +#include <fnmatch.h> +#include <sys/time.h> +#include <stdlib.h> +#include <signal.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "afr.h" +#include "dict.h" +#include "xlator.h" +#include "hashfn.h" +#include "logging.h" +#include "stack.h" +#include "list.h" +#include "call-stub.h" +#include "defaults.h" +#include "common-utils.h" +#include "compat-errno.h" +#include "compat.h" + +#include "afr.h" +#include "afr-transaction.h" + +int +afr_build_parent_loc (loc_t *parent, loc_t *child, int32_t *op_errno) +{ +        int     ret = -1; +        char    *child_path = NULL; + +        if (!child->parent) { +                if (op_errno) +                        *op_errno = EINVAL; +                goto out; +        } + +        child_path = gf_strdup (child->path); +        if (!child_path) { +                if (op_errno) +                        *op_errno = ENOMEM; +                goto out; +        } +        parent->path = gf_strdup( dirname (child_path) ); +	if (!parent->path) { +                if (op_errno) +                        *op_errno = ENOMEM; +                goto out; +        } +        parent->inode  = inode_ref (child->parent); +        uuid_copy (parent->gfid, child->pargfid); + +        ret = 0; +out: +	GF_FREE(child_path); + +        return ret; +} + +void +__dir_entry_fop_common_cbk (call_frame_t *frame, int child_index, +                            xlator_t *this, int32_t op_ret, +                            int32_t op_errno, inode_t *inode, +                            struct iatt *buf, struct iatt *preparent, +                            struct iatt *postparent, struct iatt *prenewparent, +                            struct iatt *postnewparent) +{ +        afr_local_t     *local          = NULL; + +        local = frame->local; + +        if (afr_fop_failed (op_ret, op_errno)) +                afr_transaction_fop_failed (frame, 
this, child_index); + +        if (op_ret > -1) { +                local->op_ret = op_ret; + +                if ((local->success_count == 0) || +                    (child_index == local->read_child_index)) { +                        local->cont.dir_fop.preparent      = *preparent; +                        local->cont.dir_fop.postparent     = *postparent; +                        if (buf) +                                local->cont.dir_fop.buf            = *buf; +                        if (prenewparent) +                             local->cont.dir_fop.prenewparent  = *prenewparent; +                        if (postnewparent) +                             local->cont.dir_fop.postnewparent = *postnewparent; +                } + +                local->cont.dir_fop.inode = inode; + +                local->fresh_children[local->success_count] = child_index; +                local->success_count++; +                local->child_errno[child_index] = 0; +        } else { +                local->child_errno[child_index] = op_errno; +        } + +        local->op_errno = op_errno; +} + +int +afr_mark_new_entry_changelog_cbk (call_frame_t *frame, void *cookie, +                                  xlator_t *this, +                                  int32_t op_ret, int32_t op_errno, +                                  dict_t *xattr, dict_t *xdata) +{ +        int     call_count = 0; + +        call_count = afr_frame_return (frame); +        if (call_count == 0) { +                AFR_STACK_DESTROY (frame); +        } +        return 0; +} + +void +afr_mark_new_entry_changelog (call_frame_t *frame, xlator_t *this) +{ +        call_frame_t  *new_frame  = NULL; +        afr_local_t   *local      = NULL; +        afr_local_t   *new_local  = NULL; +        afr_private_t *priv       = NULL; +        dict_t        **xattr     = NULL; +        int32_t       **changelog = NULL; +        int           i           = 0; +        GF_UNUSED int op_errno    = 0; + +        local = 
frame->local; +        priv = this->private; + +        new_frame = copy_frame (frame); +        if (!new_frame) { +                goto out; +        } + +        AFR_LOCAL_ALLOC_OR_GOTO (new_frame->local, out); +        new_local = new_frame->local; +        changelog = afr_matrix_create (priv->child_count, AFR_NUM_CHANGE_LOGS); +        if (!changelog) +                goto out; + +        xattr = GF_CALLOC (priv->child_count, sizeof (*xattr), +                           gf_afr_mt_dict_t); +        if (!xattr) +                goto out; +        for (i = 0; i < priv->child_count; i++) { +                if (local->child_errno[i]) +                        continue; +                xattr[i] = dict_new (); +                if (!xattr[i]) +                        goto out; +        } + +        afr_prepare_new_entry_pending_matrix (changelog, +                                              afr_is_errno_set, +                                              local->child_errno, +                                              &local->cont.dir_fop.buf, +                                              priv->child_count); + +        new_local->pending = changelog; +        uuid_copy (new_local->loc.gfid, local->cont.dir_fop.buf.ia_gfid); +        new_local->loc.inode = inode_ref (local->cont.dir_fop.inode); +        new_local->call_count = local->success_count; + +        for (i = 0; i < priv->child_count; i++) { +                if (local->child_errno[i]) +                        continue; + +                afr_set_pending_dict (priv, xattr[i], changelog, i, LOCAL_LAST); +                STACK_WIND_COOKIE (new_frame, afr_mark_new_entry_changelog_cbk, +                                   (void *) (long) i, priv->children[i], +                                   priv->children[i]->fops->xattrop, +                                   &new_local->loc, GF_XATTROP_ADD_ARRAY, +                                   xattr[i], NULL); +        } +        new_frame = NULL; +out: +        if 
(new_frame) +                AFR_STACK_DESTROY (new_frame); +        afr_xattr_array_destroy (xattr, priv->child_count); +        return; +} + +gf_boolean_t +afr_is_new_entry_changelog_needed (glusterfs_fop_t fop) +{ +        glusterfs_fop_t fops[]   = {GF_FOP_CREATE, GF_FOP_MKNOD, GF_FOP_NULL}; +        int             i        = 0; + +        for (i = 0; fops[i] != GF_FOP_NULL; i++) { +                if (fop == fops[i]) +                        return _gf_true; +        } +        return _gf_false; +} + +void +afr_dir_fop_mark_entry_pending_changelog (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t   *local      = NULL; +        afr_private_t *priv       = NULL; + +        local = frame->local; +        priv  = this->private; + +        if (local->op_ret < 0) +                goto out; + +        if (local->success_count == priv->child_count) +                goto out; + +        if (!afr_is_new_entry_changelog_needed (local->op)) +                goto out; + +        afr_mark_new_entry_changelog (frame, this); + +out: +        return; +} + +void +afr_dir_fop_done (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t     *local          = NULL; +        afr_private_t   *priv           = NULL; + +        local = frame->local; +        priv  = this->private; + +        if (local->cont.dir_fop.inode == NULL) +                goto done; +        afr_set_read_ctx_from_policy (this, local->cont.dir_fop.inode, +                                      local->fresh_children, +                                      local->read_child_index, +                                      priv->read_child, +                                      local->cont.dir_fop.buf.ia_gfid); +done: +        local->transaction.unwind (frame, this); +        afr_dir_fop_mark_entry_pending_changelog (frame, this); +        local->transaction.resume (frame, this); +} + +/* {{{ create */ + +int +afr_create_unwind (call_frame_t *frame, xlator_t *this) +{ +        call_frame_t 
/**
 * afr_create_wind_cbk - per-child reply handler for the create fop.
 *
 * @cookie carries the index of the replying child.  Under frame->lock,
 * a successful reply (op_ret > -1) gets the fd context set up and marked
 * as opened on that child, and the common directory-fop bookkeeping is
 * updated through __dir_entry_fop_common_cbk.  Once the last expected
 * reply has arrived (afr_frame_return yields 0) the fop is finished via
 * afr_dir_fop_done.
 *
 * Always returns 0.
 */
int
afr_create_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
                     int32_t op_ret, int32_t op_errno,
                     fd_t *fd, inode_t *inode, struct iatt *buf,
                     struct iatt *preparent, struct iatt *postparent,
                     dict_t *xdata)
{
        afr_local_t     *local = NULL;
        uint64_t        ctx = 0;
        afr_fd_ctx_t    *fd_ctx = NULL;
        int             ret = 0;
        int             call_count = -1;
        int             child_index = -1;

        local = frame->local;

        child_index = (long) cookie;

        LOCK (&frame->lock);
        {
                if (op_ret > -1) {
                        ret = afr_fd_ctx_set (this, fd);
                        if (ret < 0) {
                                gf_log (this->name, GF_LOG_ERROR,
                                        "could not set ctx on fd=%p", fd);

                                /* NOTE(review): this path jumps past
                                 * __dir_entry_fop_common_cbk, so this
                                 * child's reply is not recorded there —
                                 * confirm that is intended. */
                                local->op_ret   = -1;
                                local->op_errno = -ret;
                                goto unlock;
                        }

                        ret = fd_ctx_get (fd, this, &ctx);
                        if (ret < 0) {
                                gf_log (this->name, GF_LOG_ERROR,
                                        "could not get fd ctx for fd=%p", fd);
                                local->op_ret   = -1;
                                local->op_errno = -ret;
                                goto unlock;
                        }

                        fd_ctx = (afr_fd_ctx_t *)(long) ctx;

                        /* remember that the fd is open on this child and
                         * with which flags it was created */
                        fd_ctx->opened_on[child_index] = AFR_FD_OPENED;
                        fd_ctx->flags                  = local->cont.create.flags;

                        /* keep the response dict of the first success only */
                        if (local->success_count == 0) {
                                if (xdata)
                                        local->xdata_rsp = dict_ref(xdata);
                        }
                }
                __dir_entry_fop_common_cbk (frame, child_index, this,
                                            op_ret, op_errno, inode, buf,
                                            preparent, postparent, NULL, NULL);
        }

unlock:
        UNLOCK (&frame->lock);

        call_count = afr_frame_return (frame);

        /* last reply in: finish the directory fop */
        if (call_count == 0)
                afr_dir_fop_done (frame, this);

        return 0;
}
/**
 * afr_create - create fop entry point of the replicate translator.
 *
 * Copies @frame into a transaction frame, stores the create arguments
 * and the transaction hooks (afr_create_wind / afr_create_done /
 * afr_create_unwind) in its local, sets up one entry lockee on the
 * parent directory with the entry's basename, and starts an
 * AFR_ENTRY_TRANSACTION.  The caller's frame is kept as
 * local->transaction.main_frame.
 *
 * If ret is negative at the "out" label, the transaction frame (if any)
 * is destroyed and the create is unwound on @frame with -1 and op_errno.
 *
 * Always returns 0.
 */
int
afr_create (call_frame_t *frame, xlator_t *this,
            loc_t *loc, int32_t flags, mode_t mode,
            mode_t umask, fd_t *fd, dict_t *params)
{
        afr_private_t           *priv                   = NULL;
        afr_local_t             *local                  = NULL;
        afr_internal_lock_t     *int_lock               = NULL;
        call_frame_t            *transaction_frame      = NULL;
        int                     ret                     = -1;
        int                     op_errno                = 0;

        VALIDATE_OR_GOTO (frame, out);
        VALIDATE_OR_GOTO (this, out);
        VALIDATE_OR_GOTO (this->private, out);

        priv = this->private;

        /* presumably jumps to "out" with op_errno set when quorum is not
         * met — confirm against the QUORUM_CHECK definition */
        QUORUM_CHECK(create,out);

        transaction_frame = copy_frame (frame);
        if (!transaction_frame) {
                op_errno = ENOMEM;
                goto out;
        }

        AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out);
        local = transaction_frame->local;

        ret = afr_local_init (local, priv, &op_errno);
        if (ret < 0)
                goto out;

        loc_copy (&local->loc, loc);

        /* advance the shared round-robin counter under its lock and derive
         * this call's read_child_index from it */
        LOCK (&priv->read_child_lock);
        {
                local->read_child_index = (++priv->read_child_rr)
                        % (priv->child_count);
        }
        UNLOCK (&priv->read_child_lock);

        local->op                = GF_FOP_CREATE;
        local->cont.create.flags = flags;
        local->cont.create.mode  = mode;
        local->cont.create.fd    = fd_ref (fd);
        local->umask  = umask;
        if (params)
                local->xdata_req = dict_ref (params);

        local->transaction.fop    = afr_create_wind;
        local->transaction.done   = afr_create_done;
        local->transaction.unwind = afr_create_unwind;

        ret = afr_build_parent_loc (&local->transaction.parent_loc, loc,
                                    &op_errno);
        if (ret)
                goto out;

        local->transaction.main_frame = frame;
        local->transaction.basename = AFR_BASENAME (loc->path);
        int_lock = &local->internal_lock;

        int_lock->lockee_count = 0;
        /* NOTE(review): op_errno is not assigned on this failure path, and
         * "out" only unwinds when ret < 0 — confirm the return convention
         * of afr_init_entry_lockee. */
        ret = afr_init_entry_lockee (&int_lock->lockee[0], local,
                                     &local->transaction.parent_loc,
                                     local->transaction.basename,
                                     priv->child_count);
        if (ret)
                goto out;

        int_lock->lockee_count++;
        /* a negative return from afr_transaction is a negated errno */
        ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
        if (ret < 0) {
            op_errno = -ret;
            goto out;
        }

        ret = 0;
out:
        if (ret < 0) {
                if (transaction_frame)
                        AFR_STACK_DESTROY (transaction_frame);
                AFR_STACK_UNWIND (create, frame, -1, op_errno,
                                  NULL, NULL, NULL, NULL, NULL, NULL);
        }

        return 0;
}
+afr_mknod_wind (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t *local = NULL; +        afr_private_t *priv = NULL; +        int call_count = -1; +        int i = 0; + +        local = frame->local; +        priv  = this->private; + +        call_count = afr_pre_op_done_children_count (local->transaction.pre_op, +                                                     priv->child_count); + +        if (call_count == 0) { +                local->transaction.resume (frame, this); +                return 0; +        } + +        local->call_count = call_count; + +        for (i = 0; i < priv->child_count; i++) { +                if (local->transaction.pre_op[i]) { +                        STACK_WIND_COOKIE (frame, afr_mknod_wind_cbk, (void *) (long) i, +                                           priv->children[i], +                                           priv->children[i]->fops->mknod, +                                           &local->loc, local->cont.mknod.mode, +                                           local->cont.mknod.dev, +                                           local->umask, +                                           local->xdata_req); +                        if (!--call_count) +                                break; +                } +        } + +        return 0; +} + + +int +afr_mknod_done (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t * local = NULL; + +        local = frame->local; + +        local->transaction.unwind (frame, this); +        AFR_STACK_DESTROY (frame); + +        return 0; +} + + +int +afr_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, +           dev_t dev, mode_t umask, dict_t *params) +{ +        afr_private_t           *priv                   = NULL; +        afr_local_t             *local                  = NULL; +        afr_internal_lock_t     *int_lock               = NULL; +        call_frame_t            *transaction_frame      = NULL; +        int                     
ret                     = -1; +        int                     op_errno                = 0; + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (this->private, out); + +        priv = this->private; + +        QUORUM_CHECK(mknod,out); + +        transaction_frame = copy_frame (frame); +        if (!transaction_frame) { +                op_errno = ENOMEM; +                goto out; +        } + +        AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); +        local = transaction_frame->local; + +        ret = afr_local_init (local, priv, &op_errno); +        if (ret < 0) +                goto out; + +        loc_copy (&local->loc, loc); + +        LOCK (&priv->read_child_lock); +        { +                local->read_child_index = (++priv->read_child_rr) +                        % (priv->child_count); +        } +        UNLOCK (&priv->read_child_lock); + +        local->op               = GF_FOP_MKNOD; +        local->cont.mknod.mode  = mode; +        local->cont.mknod.dev   = dev; +        local->umask = umask; +        if (params) +                local->xdata_req = dict_ref (params); + +        local->transaction.fop    = afr_mknod_wind; +        local->transaction.done   = afr_mknod_done; +        local->transaction.unwind = afr_mknod_unwind; + +        ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, +                                    &op_errno); +        if (ret) +                goto out; + +        local->transaction.main_frame = frame; +        local->transaction.basename = AFR_BASENAME (loc->path); +        int_lock = &local->internal_lock; + +        int_lock->lockee_count = 0; +        ret = afr_init_entry_lockee (&int_lock->lockee[0], local, +                                     &local->transaction.parent_loc, +                                     local->transaction.basename, +                                     priv->child_count); +        if (ret) +                goto 
out; + +        int_lock->lockee_count++; +        ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); +        if (ret < 0) { +            op_errno = -ret; +            goto out; +        } + +        ret = 0; +out: +        if (ret < 0) { +                if (transaction_frame) +                        AFR_STACK_DESTROY (transaction_frame); +                AFR_STACK_UNWIND (mknod, frame, -1, op_errno, +                                  NULL, NULL, NULL, NULL, NULL); +        } + +        return 0; +} + +/* }}} */ + +/* {{{ mkdir */ + + +int +afr_mkdir_unwind (call_frame_t *frame, xlator_t *this) +{ +        call_frame_t *main_frame = NULL; +        afr_local_t  *local = NULL; + +        local = frame->local; + +        LOCK (&frame->lock); +        { +                if (local->transaction.main_frame) { +                        main_frame = local->transaction.main_frame; +                } +                local->transaction.main_frame = NULL; +        } +        UNLOCK (&frame->lock); + +        if (main_frame) { +                AFR_STACK_UNWIND (mkdir, main_frame, +                                  local->op_ret, local->op_errno, +                                  local->cont.dir_fop.inode, +                                  &local->cont.dir_fop.buf, +                                  &local->cont.dir_fop.preparent, +                                  &local->cont.dir_fop.postparent, +                                  NULL); +        } + +        return 0; +} + + +int +afr_mkdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                    int32_t op_ret, int32_t op_errno, inode_t *inode, +                    struct iatt *buf, struct iatt *preparent, +                    struct iatt *postparent, dict_t *xdata) +{ +        int             call_count      = -1; +        int             child_index     = -1; + +        child_index = (long) cookie; + +        LOCK (&frame->lock); +        { +                
__dir_entry_fop_common_cbk (frame, child_index, this, +                                            op_ret, op_errno, inode, buf, +                                            preparent, postparent, NULL, NULL); +        } +        UNLOCK (&frame->lock); + +        call_count = afr_frame_return (frame); + +        if (call_count == 0) +                afr_dir_fop_done (frame, this); + +        return 0; +} + + +int +afr_mkdir_wind (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t *local = NULL; +        afr_private_t *priv = NULL; +        int call_count = -1; +        int i = 0; + +        local = frame->local; +        priv  = this->private; + +        call_count = afr_pre_op_done_children_count (local->transaction.pre_op, +                                                     priv->child_count); + +        if (call_count == 0) { +                local->transaction.resume (frame, this); +                return 0; +        } + +        local->call_count = call_count; + +        for (i = 0; i < priv->child_count; i++) { +                if (local->transaction.pre_op[i]) { +                        STACK_WIND_COOKIE (frame, afr_mkdir_wind_cbk, +                                           (void *) (long) i, +                                           priv->children[i], +                                           priv->children[i]->fops->mkdir, +                                           &local->loc, local->cont.mkdir.mode, +                                           local->umask, +                                           local->xdata_req); +                        if (!--call_count) +                                break; +                } +        } + +        return 0; +} + + +int +afr_mkdir_done (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t * local = NULL; + +        local = frame->local; + +        local->transaction.unwind (frame, this); + +        AFR_STACK_DESTROY (frame); + +        return 0; +} + +int +afr_mkdir (call_frame_t 
*frame, xlator_t *this, +           loc_t *loc, mode_t mode, mode_t umask, dict_t *params) +{ +        afr_private_t           *priv                   = NULL; +        afr_local_t             *local                  = NULL; +        afr_internal_lock_t     *int_lock               = NULL; +        call_frame_t            *transaction_frame      = NULL; +        int                     ret                     = -1; +        int                     op_errno                = 0; + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (this->private, out); + +        priv = this->private; + +        QUORUM_CHECK(mkdir,out); + +        transaction_frame = copy_frame (frame); +        if (!transaction_frame) { +                op_errno = ENOMEM; +                goto out; +        } + +        AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); +        local = transaction_frame->local; + +        ret = afr_local_init (local, priv, &op_errno); +        if (ret < 0) +                goto out; + +        loc_copy (&local->loc, loc); + +        LOCK (&priv->read_child_lock); +        { +                local->read_child_index = (++priv->read_child_rr) +                        % (priv->child_count); +        } +        UNLOCK (&priv->read_child_lock); + +        local->cont.mkdir.mode  = mode; +        local->umask = umask; +        if (params) +                local->xdata_req = dict_ref (params); + +        local->op = GF_FOP_MKDIR; +        local->transaction.fop    = afr_mkdir_wind; +        local->transaction.done   = afr_mkdir_done; +        local->transaction.unwind = afr_mkdir_unwind; + +        ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, +                                    &op_errno); +        if (ret) +                goto out; + +        local->transaction.main_frame = frame; +        local->transaction.basename = AFR_BASENAME (loc->path); +        int_lock = &local->internal_lock; + +     
   int_lock->lockee_count = 0; +        ret = afr_init_entry_lockee (&int_lock->lockee[0], local, +                                     &local->transaction.parent_loc, +                                     local->transaction.basename, +                                     priv->child_count); +        if (ret) +                goto out; + +        int_lock->lockee_count++; +        ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); +        if (ret < 0) { +            op_errno = -ret; +            goto out; +        } + +        ret = 0; +out: +        if (ret < 0) { +                if (transaction_frame) +                        AFR_STACK_DESTROY (transaction_frame); + +                AFR_STACK_UNWIND (mkdir, frame, -1, op_errno, +                                  NULL, NULL, NULL, NULL, NULL); +        } + +        return 0; +} + +/* }}} */ + +/* {{{ link */ + + +int +afr_link_unwind (call_frame_t *frame, xlator_t *this) +{ +        call_frame_t *main_frame = NULL; +        afr_local_t  *local = NULL; + +        local = frame->local; + +        LOCK (&frame->lock); +        { +                if (local->transaction.main_frame) { +                        main_frame = local->transaction.main_frame; +                } +                local->transaction.main_frame = NULL; +        } +        UNLOCK (&frame->lock); + +        if (main_frame) { +                AFR_STACK_UNWIND (link, main_frame, +                                  local->op_ret, local->op_errno, +                                  local->cont.dir_fop.inode, +                                  &local->cont.dir_fop.buf, +                                  &local->cont.dir_fop.preparent, +                                  &local->cont.dir_fop.postparent, +                                  NULL); +        } + +        return 0; +} + + +int +afr_link_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                   int32_t op_ret, int32_t op_errno, inode_t *inode, +        
           struct iatt *buf, struct iatt *preparent, +                   struct iatt *postparent, dict_t *xdata) +{ +        int             call_count      = -1; +        int             child_index     = -1; + +        child_index = (long) cookie; + +        LOCK (&frame->lock); +        { +                __dir_entry_fop_common_cbk (frame, child_index, this, +                                            op_ret, op_errno, inode, buf, +                                            preparent, postparent, NULL, NULL); +        } +        UNLOCK (&frame->lock); + +        call_count = afr_frame_return (frame); + +        if (call_count == 0) +                afr_dir_fop_done (frame, this); + +        return 0; +} + + +int +afr_link_wind (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t *local = NULL; +        afr_private_t *priv = NULL; +        int call_count = -1; +        int i = 0; + +        local = frame->local; +        priv  = this->private; + +        call_count = afr_pre_op_done_children_count (local->transaction.pre_op, +                                                     priv->child_count); + +        if (call_count == 0) { +                local->transaction.resume (frame, this); +                return 0; +        } + +        local->call_count = call_count; + +        for (i = 0; i < priv->child_count; i++) { +                if (local->transaction.pre_op[i]) { +                        STACK_WIND_COOKIE (frame, afr_link_wind_cbk, +                                           (void *) (long) i, +                                           priv->children[i], +                                           priv->children[i]->fops->link, +                                           &local->loc, +                                           &local->newloc, local->xdata_req); + +                        if (!--call_count) +                                break; +                } +        } + +        return 0; +} + + +int +afr_link_done (call_frame_t *frame, 
xlator_t *this) +{ +        afr_local_t * local = frame->local; + +        local->transaction.unwind (frame, this); + +        AFR_STACK_DESTROY (frame); + +        return 0; +} + + +int +afr_link (call_frame_t *frame, xlator_t *this, +          loc_t *oldloc, loc_t *newloc, dict_t *xdata) +{ +        afr_private_t           *priv                   = NULL; +        afr_local_t             *local                  = NULL; +        afr_internal_lock_t     *int_lock               = NULL; +        call_frame_t            *transaction_frame      = NULL; +        int                     ret                     = -1; +        int                     op_errno                = 0; + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (this->private, out); + +        priv = this->private; + +        QUORUM_CHECK(link,out); + +        transaction_frame = copy_frame (frame); +        if (!transaction_frame) { +                op_errno = ENOMEM; +                goto out; +        } + +        AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); +        local = transaction_frame->local; + +        ret = afr_local_init (local, priv, &op_errno); +        if (ret < 0) +                goto out; + +        loc_copy (&local->loc,    oldloc); +        loc_copy (&local->newloc, newloc); +        if (xdata) +                local->xdata_req = dict_ref (xdata); + +        LOCK (&priv->read_child_lock); +        { +                local->read_child_index = (++priv->read_child_rr) +                        % (priv->child_count); +        } +        UNLOCK (&priv->read_child_lock); + +        local->op = GF_FOP_LINK; +        local->transaction.fop    = afr_link_wind; +        local->transaction.done   = afr_link_done; +        local->transaction.unwind = afr_link_unwind; + +        ret = afr_build_parent_loc (&local->transaction.parent_loc, newloc, +                                    &op_errno); +        if (ret) +                
goto out; + +        local->transaction.main_frame   = frame; +        local->transaction.basename     = AFR_BASENAME (newloc->path); +        int_lock = &local->internal_lock; + +        int_lock->lockee_count = 0; +        ret = afr_init_entry_lockee (&int_lock->lockee[0], local, +                                     &local->transaction.parent_loc, +                                     local->transaction.basename, +                                     priv->child_count); +        if (ret) +                goto out; + +        int_lock->lockee_count++; +        ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); +        if (ret < 0) { +            op_errno = -ret; +            goto out; +        } +        ret = 0; +out: +        if (ret < 0) { +                if (transaction_frame) +                        AFR_STACK_DESTROY (transaction_frame); +                AFR_STACK_UNWIND (link, frame, -1, op_errno, +                                  NULL, NULL, NULL, NULL, NULL); +        } + +        return 0; +} + +/* }}} */ + +/* {{{ symlink */ + + +int +afr_symlink_unwind (call_frame_t *frame, xlator_t *this) +{ +        call_frame_t *main_frame = NULL; +        afr_local_t  *local = NULL; + +        local = frame->local; + +        LOCK (&frame->lock); +        { +                if (local->transaction.main_frame) { +                        main_frame = local->transaction.main_frame; +                } +                local->transaction.main_frame = NULL; +        } +        UNLOCK (&frame->lock); + +        if (main_frame) { +                AFR_STACK_UNWIND (symlink, main_frame, +                                  local->op_ret, local->op_errno, +                                  local->cont.dir_fop.inode, +                                  &local->cont.dir_fop.buf, +                                  &local->cont.dir_fop.preparent, +                                  &local->cont.dir_fop.postparent, +                                  NULL); +    
    } + +        return 0; +} + + +int +afr_symlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                      int32_t op_ret, int32_t op_errno, inode_t *inode, +                      struct iatt *buf, struct iatt *preparent, +                      struct iatt *postparent, dict_t *xdata) +{ +        int             call_count      = -1; +        int             child_index     = -1; + +        child_index = (long) cookie; + +        LOCK (&frame->lock); +        { +                __dir_entry_fop_common_cbk (frame, child_index, this, +                                            op_ret, op_errno, inode, buf, +                                            preparent, postparent, NULL, NULL); +        } +        UNLOCK (&frame->lock); + +        call_count = afr_frame_return (frame); + +        if (call_count == 0) +                afr_dir_fop_done (frame, this); + +        return 0; +} + + +int +afr_symlink_wind (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t *local = NULL; +        afr_private_t *priv = NULL; +        int call_count = -1; +        int i = 0; + +        local = frame->local; +        priv = this->private; + +        call_count = afr_pre_op_done_children_count (local->transaction.pre_op, +                                                     priv->child_count); + +        if (call_count == 0) { +                local->transaction.resume (frame, this); +                return 0; +        } + +        local->call_count = call_count; + +        for (i = 0; i < priv->child_count; i++) { +                if (local->transaction.pre_op[i]) { +                        STACK_WIND_COOKIE (frame, afr_symlink_wind_cbk, +                                           (void *) (long) i, +                                           priv->children[i], +                                           priv->children[i]->fops->symlink, +                                           local->cont.symlink.linkpath, +                                      
     &local->loc, +                                           local->umask, +                                           local->xdata_req); + +                        if (!--call_count) +                                break; + +                } +        } + +        return 0; +} + + +int +afr_symlink_done (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t * local = frame->local; + +        local->transaction.unwind (frame, this); + +        AFR_STACK_DESTROY (frame); + +        return 0; +} + + +int +afr_symlink (call_frame_t *frame, xlator_t *this, +             const char *linkpath, loc_t *loc, mode_t umask, dict_t *params) +{ +        afr_private_t           *priv                   = NULL; +        afr_local_t             *local                  = NULL; +        afr_internal_lock_t     *int_lock               = NULL; +        call_frame_t            *transaction_frame      = NULL; +        int                     ret                     = -1; +        int                     op_errno                = 0; + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (this->private, out); + +        priv = this->private; + +        QUORUM_CHECK(symlink,out); + +        transaction_frame = copy_frame (frame); +        if (!transaction_frame) { +                op_errno = ENOMEM; +                goto out; +        } + +        AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); +        local = transaction_frame->local; + +        ret = afr_local_init (local, priv, &op_errno); +        if (ret < 0) +                goto out; + +        loc_copy (&local->loc, loc); + +        LOCK (&priv->read_child_lock); +        { +                local->read_child_index = (++priv->read_child_rr) +                        % (priv->child_count); +        } +        UNLOCK (&priv->read_child_lock); + +        local->cont.symlink.linkpath = gf_strdup (linkpath); +        local->umask = umask; +        if (params) +          
      local->xdata_req = dict_ref (params); + +        local->op = GF_FOP_SYMLINK; +        local->transaction.fop    = afr_symlink_wind; +        local->transaction.done   = afr_symlink_done; +        local->transaction.unwind = afr_symlink_unwind; + +        ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, +                                    &op_errno); +        if (ret) +                goto out; + +        local->transaction.main_frame   = frame; +        local->transaction.basename     = AFR_BASENAME (loc->path); +        int_lock = &local->internal_lock; + +        int_lock->lockee_count = 0; +        ret = afr_init_entry_lockee (&int_lock->lockee[0], local, +                                     &local->transaction.parent_loc, +                                     local->transaction.basename, +                                     priv->child_count); +        if (ret) +                goto out; + +        int_lock->lockee_count++; +        ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); +        if (ret < 0) { +            op_errno = -ret; +            goto out; +        } + +        ret = 0; +out: +        if (ret  < 0) { +                if (transaction_frame) +                        AFR_STACK_DESTROY (transaction_frame); +                AFR_STACK_UNWIND (symlink, frame, -1, op_errno, +                                  NULL, NULL, NULL, NULL, NULL); +        } + +        return 0; +} + +/* }}} */ + +/* {{{ rename */ + +int +afr_rename_unwind (call_frame_t *frame, xlator_t *this) +{ +        call_frame_t *main_frame = NULL; +        afr_local_t  *local = NULL; + +        local = frame->local; + +        LOCK (&frame->lock); +        { +                if (local->transaction.main_frame) { +                        main_frame = local->transaction.main_frame; +                } +                local->transaction.main_frame = NULL; +        } +        UNLOCK (&frame->lock); + +        if (main_frame) { +                
AFR_STACK_UNWIND (rename, main_frame, +                                  local->op_ret, local->op_errno, +                                  &local->cont.dir_fop.buf, +                                  &local->cont.dir_fop.preparent, +                                  &local->cont.dir_fop.postparent, +                                  &local->cont.dir_fop.prenewparent, +                                  &local->cont.dir_fop.postnewparent, +                                  NULL); +        } + +        return 0; +} + + +int +afr_rename_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                     int32_t op_ret, int32_t op_errno, struct iatt *buf, +                     struct iatt *preoldparent, struct iatt *postoldparent, +                     struct iatt *prenewparent, struct iatt *postnewparent, +                     dict_t *xdata) +{ +        afr_local_t *   local = NULL; +        int call_count = -1; +        int child_index = -1; + +        local = frame->local; + +        child_index = (long) cookie; + +        LOCK (&frame->lock); +        { +                if (afr_fop_failed (op_ret, op_errno) && op_errno != ENOTEMPTY) +                        afr_transaction_fop_failed (frame, this, child_index); +                local->op_errno = op_errno; +                local->child_errno[child_index] = op_errno; + +                if (op_ret > -1) +                        __dir_entry_fop_common_cbk (frame, child_index, this, +                                                   op_ret, op_errno, NULL, buf, +                                                   preoldparent, postoldparent, +                                                   prenewparent, postnewparent); + +        } +        UNLOCK (&frame->lock); + +        call_count = afr_frame_return (frame); + +        if (call_count == 0) +                afr_dir_fop_done (frame, this); + +        return 0; +} + + +int32_t +afr_rename_wind (call_frame_t *frame, xlator_t *this) +{ +        
afr_local_t *local = NULL; +        afr_private_t *priv = NULL; +        int call_count = -1; +        int i = 0; + +        local = frame->local; +        priv = this->private; + +        call_count = afr_pre_op_done_children_count (local->transaction.pre_op, +                                                     priv->child_count); + +        if (call_count == 0) { +                local->transaction.resume (frame, this); +                return 0; +        } + +        local->call_count = call_count; + +        for (i = 0; i < priv->child_count; i++) { +                if (local->transaction.pre_op[i]) { +                        STACK_WIND_COOKIE (frame, afr_rename_wind_cbk, +                                           (void *) (long) i, +                                           priv->children[i], +                                           priv->children[i]->fops->rename, +                                           &local->loc, +                                           &local->newloc, NULL); +                        if (!--call_count) +                                break; +                } +        } + +        return 0; +} + + +int +afr_rename_done (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t * local = frame->local; + +        local->transaction.unwind (frame, this); + +        AFR_STACK_DESTROY (frame); + +        return 0; +} + + +int +afr_rename (call_frame_t *frame, xlator_t *this, +            loc_t *oldloc, loc_t *newloc, dict_t *xdata) +{ +        afr_private_t           *priv                   = NULL; +        afr_local_t             *local                  = NULL; +        afr_internal_lock_t     *int_lock               = NULL; +        call_frame_t            *transaction_frame      = NULL; +        int                     ret                     = -1; +        int                     op_errno                = 0; +        int                     nlockee                 = 0; + +        VALIDATE_OR_GOTO (frame, out); +        
VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (this->private, out); + +        priv = this->private; + +        QUORUM_CHECK(rename,out); + +        transaction_frame = copy_frame (frame); +        if (!transaction_frame) { +                op_errno = ENOMEM; +                goto out; +        } + +        AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); +        local = transaction_frame->local; + +        ret = afr_local_init (local, priv, &op_errno); +        if (ret < 0) +                goto out; + +        loc_copy (&local->loc,    oldloc); +        loc_copy (&local->newloc, newloc); + +        local->read_child_index = afr_inode_get_read_ctx (this, oldloc->inode, NULL); + +        local->op = GF_FOP_RENAME; +        local->transaction.fop    = afr_rename_wind; +        local->transaction.done   = afr_rename_done; +        local->transaction.unwind = afr_rename_unwind; + +        ret = afr_build_parent_loc (&local->transaction.parent_loc, oldloc, +                                    &op_errno); +        if (ret) +                goto out; +        ret = afr_build_parent_loc (&local->transaction.new_parent_loc, newloc, +                                    &op_errno); +        if (ret) +                goto out; + +        local->transaction.main_frame   = frame; +        local->transaction.basename     = AFR_BASENAME (oldloc->path); +        local->transaction.new_basename = AFR_BASENAME (newloc->path); +        int_lock = &local->internal_lock; + +        int_lock->lockee_count = nlockee = 0; +        ret = afr_init_entry_lockee (&int_lock->lockee[nlockee], local, +                                     &local->transaction.new_parent_loc, +                                     local->transaction.new_basename, +                                     priv->child_count); +        if (ret) +                goto out; + +        nlockee++; +        ret = afr_init_entry_lockee (&int_lock->lockee[nlockee], local, +                                     
&local->transaction.parent_loc, +                                     local->transaction.basename, +                                     priv->child_count); +        if (ret) +                goto out; + +        nlockee++; +        if (local->newloc.inode && IA_ISDIR (local->newloc.inode->ia_type)) { +                ret = afr_init_entry_lockee (&int_lock->lockee[nlockee], local, +                                             &local->newloc, +                                             NULL, +                                             priv->child_count); +                if (ret) +                        goto out; + +                nlockee++; +        } +        qsort (int_lock->lockee, nlockee, sizeof (*int_lock->lockee), +               afr_entry_lockee_cmp); +        int_lock->lockee_count = nlockee; + +        ret = afr_transaction (transaction_frame, this, AFR_ENTRY_RENAME_TRANSACTION); +        if (ret < 0) { +            op_errno = -ret; +            goto out; +        } + +        ret = 0; +out: +        if (ret < 0) { +                if (transaction_frame) +                        AFR_STACK_DESTROY (transaction_frame); + +                AFR_STACK_UNWIND (rename, frame, -1, op_errno, +                                  NULL, NULL, NULL, NULL, NULL, NULL); +        } + +        return 0; +} + +/* }}} */ + +/* {{{ unlink */ + +int +afr_unlink_unwind (call_frame_t *frame, xlator_t *this) +{ +        call_frame_t *main_frame = NULL; +        afr_local_t  *local = NULL; + +        local = frame->local; + +        LOCK (&frame->lock); +        { +                if (local->transaction.main_frame) { +                        main_frame = local->transaction.main_frame; +                } +                local->transaction.main_frame = NULL; +        } +        UNLOCK (&frame->lock); + +        if (main_frame) { +                AFR_STACK_UNWIND (unlink, main_frame, +                                  local->op_ret, local->op_errno, +                             
     &local->cont.dir_fop.preparent, +                                  &local->cont.dir_fop.postparent, +                                  NULL); +        } + +        return 0; +} + + +int +afr_unlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                     int32_t op_ret, int32_t op_errno, struct iatt *preparent, +                     struct iatt *postparent, dict_t *xdata) +{ +        afr_local_t *   local = NULL; +        int call_count  = -1; +        int child_index = (long) cookie; + +        local = frame->local; + +        LOCK (&frame->lock); +        { +                if (child_index == local->read_child_index) { +                        local->read_child_returned = _gf_true; +                } +                __dir_entry_fop_common_cbk (frame, child_index, this, +                                            op_ret, op_errno, NULL, NULL, +                                            preparent, postparent, NULL, NULL); +        } +        UNLOCK (&frame->lock); + +        call_count = afr_frame_return (frame); +        if (call_count == 0) +                afr_dir_fop_done (frame, this); + +        return 0; +} + + +int32_t +afr_unlink_wind (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t *local = NULL; +        afr_private_t *priv = NULL; +        int call_count = -1; +        int i = 0; + +        local = frame->local; +        priv  = this->private; + +        call_count = afr_pre_op_done_children_count (local->transaction.pre_op, +                                                     priv->child_count); + +        if (call_count == 0) { +                local->transaction.resume (frame, this); +                return 0; +        } + +        local->call_count = call_count; + +        for (i = 0; i < priv->child_count; i++) { +                if (local->transaction.pre_op[i]) { +                        STACK_WIND_COOKIE (frame, afr_unlink_wind_cbk, +                                           (void *) (long) i, +   
                                        priv->children[i], +                                           priv->children[i]->fops->unlink, +                                           &local->loc, local->xflag, +                                           local->xdata_req); + +                        if (!--call_count) +                                break; +                } +        } + +        return 0; +} + + +int32_t +afr_unlink_done (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t * local = frame->local; + +        local->transaction.unwind (frame, this); + +        AFR_STACK_DESTROY (frame); + +        return 0; +} + + +int32_t +afr_unlink (call_frame_t *frame, xlator_t *this, +            loc_t *loc, int xflag, dict_t *xdata) +{ +        afr_private_t           *priv                   = NULL; +        afr_local_t             *local                  = NULL; +        afr_internal_lock_t     *int_lock               = NULL; +        call_frame_t            *transaction_frame      = NULL; +        int                     ret                     = -1; +        int                     op_errno                = 0; + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (this->private, out); + +        priv = this->private; + +        QUORUM_CHECK(unlink,out); + +        transaction_frame = copy_frame (frame); +        if (!transaction_frame) { +                op_errno = ENOMEM; +                goto out; +        } + +        AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); +        local = transaction_frame->local; + +        ret = afr_local_init (local, priv, &op_errno); +        if (ret < 0) +                goto out; + +        loc_copy (&local->loc, loc); +        local->xflag = xflag; +        if (xdata) +                local->xdata_req = dict_ref (xdata); + +        local->op = GF_FOP_UNLINK; +        local->transaction.fop    = afr_unlink_wind; +        local->transaction.done   = 
afr_unlink_done; +        local->transaction.unwind = afr_unlink_unwind; + +        ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, +                                    &op_errno); +        if (ret) +                goto out; + +        local->transaction.main_frame = frame; +        local->transaction.basename = AFR_BASENAME (loc->path); +        int_lock = &local->internal_lock; + +        int_lock->lockee_count = 0; +        ret = afr_init_entry_lockee (&int_lock->lockee[0], local, +                                     &local->transaction.parent_loc, +                                     local->transaction.basename, +                                     priv->child_count); +        if (ret) +                goto out; + +        int_lock->lockee_count++; +        ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); +        if (ret < 0) { +            op_errno = -ret; +            goto out; +        } + +        ret = 0; +out: +        if (ret < 0) { +                if (transaction_frame) +                        AFR_STACK_DESTROY (transaction_frame); +                AFR_STACK_UNWIND (unlink, frame, -1, op_errno, +                                  NULL, NULL, NULL); +        } + +        return 0; +} + +/* }}} */ + +/* {{{ rmdir */ + + + +int +afr_rmdir_unwind (call_frame_t *frame, xlator_t *this) +{ +        call_frame_t *main_frame = NULL; +        afr_local_t  *local = NULL; + +        local = frame->local; + +        LOCK (&frame->lock); +        { +                if (local->transaction.main_frame) { +                        main_frame = local->transaction.main_frame; +                } +                local->transaction.main_frame = NULL; +        } +        UNLOCK (&frame->lock); + +        if (main_frame) { +                AFR_STACK_UNWIND (rmdir, main_frame, +                                  local->op_ret, local->op_errno, +                                  &local->cont.dir_fop.preparent, +                         
         &local->cont.dir_fop.postparent, +                                  NULL); +        } + +        return 0; +} + + +int +afr_rmdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                    int32_t op_ret, int32_t op_errno, struct iatt *preparent, +                    struct iatt *postparent, dict_t *xdata) +{ +        afr_local_t *   local = NULL; +        int call_count  = -1; +        int child_index = (long) cookie; +        int read_child  = 0; + +        local = frame->local; + +        LOCK (&frame->lock); +        { +                if (child_index == read_child) { +                        local->read_child_returned = _gf_true; +                } +                if (afr_fop_failed (op_ret, op_errno) && (op_errno != ENOTEMPTY)) +                        afr_transaction_fop_failed (frame, this, child_index); +                local->op_errno = op_errno; +                local->child_errno[child_index] = op_errno; +                if (op_ret > -1) +                        __dir_entry_fop_common_cbk (frame, child_index, this, +                                                   op_ret, op_errno, NULL, NULL, +                                                   preparent, postparent, NULL, +                                                   NULL); + +        } +        UNLOCK (&frame->lock); + +        call_count = afr_frame_return (frame); +        if (call_count == 0) +                afr_dir_fop_done (frame, this); + +        return 0; +} + + +int +afr_rmdir_wind (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t *local = NULL; +        afr_private_t *priv = NULL; +        int call_count = -1; +        int i = 0; + +        local = frame->local; +        priv  = this->private; + +        call_count = afr_pre_op_done_children_count (local->transaction.pre_op, +                                                     priv->child_count); + +        if (call_count == 0) { +                local->transaction.resume (frame, this); 
+                return 0; +        } + +        local->call_count = call_count; + +        for (i = 0; i < priv->child_count; i++) { +                if (local->transaction.pre_op[i]) { +                        STACK_WIND_COOKIE (frame, afr_rmdir_wind_cbk, +                                           (void *) (long) i, +                                           priv->children[i], +                                           priv->children[i]->fops->rmdir, +                                           &local->loc, local->cont.rmdir.flags, +                                           NULL); + +                        if (!--call_count) +                                break; +                } +        } + +        return 0; +} + + +int +afr_rmdir_done (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t * local = frame->local; + +        local->transaction.unwind (frame, this); + +        AFR_STACK_DESTROY (frame); + +        return 0; +} + + +int +afr_rmdir (call_frame_t *frame, xlator_t *this, +           loc_t *loc, int flags, dict_t *xdata) +{ +        afr_private_t           *priv                   = NULL; +        afr_local_t             *local                  = NULL; +        afr_internal_lock_t     *int_lock               = NULL; +        call_frame_t            *transaction_frame      = NULL; +        int                     ret                     = -1; +        int                     op_errno                = 0; +        int                     nlockee                 = 0; + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (this->private, out); + +        priv = this->private; + +        QUORUM_CHECK(rmdir,out); + +        transaction_frame = copy_frame (frame); +        if (!transaction_frame) { +                op_errno = ENOMEM; +                goto out; +        } + +        AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); +        local = transaction_frame->local; + +        ret 
= afr_local_init (local, priv, &op_errno); +        if (ret < 0) +                goto out; + +        local->cont.rmdir.flags = flags; +        loc_copy (&local->loc, loc); + +        local->op = GF_FOP_RMDIR; +        local->transaction.fop    = afr_rmdir_wind; +        local->transaction.done   = afr_rmdir_done; +        local->transaction.unwind = afr_rmdir_unwind; + +        ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, +                                    &op_errno); +        if (ret) +                goto out; + +        local->transaction.main_frame = frame; +        local->transaction.basename = AFR_BASENAME (loc->path); +        int_lock = &local->internal_lock; + +        int_lock->lockee_count = nlockee = 0; +        ret = afr_init_entry_lockee (&int_lock->lockee[nlockee], local, +                                     &local->transaction.parent_loc, +                                     local->transaction.basename, +                                     priv->child_count); +        if (ret) +                goto out; + +        nlockee++; +        ret = afr_init_entry_lockee (&int_lock->lockee[nlockee], local, +                                     &local->loc, +                                     NULL, +                                     priv->child_count); +        if (ret) +                goto out; + +        nlockee++; +        qsort (int_lock->lockee, nlockee, sizeof (*int_lock->lockee), +               afr_entry_lockee_cmp); +        int_lock->lockee_count = nlockee; + +        ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); +        if (ret < 0) { +            op_errno = -ret; +            goto out; +        } + +        ret = 0; +out: +        if (ret < 0) { +                if (transaction_frame) +                        AFR_STACK_DESTROY (transaction_frame); +                AFR_STACK_UNWIND (rmdir, frame, -1, op_errno, NULL, NULL, NULL); +        } + +        return 0; +} + +/* }}} */ diff --git 
a/xlators/cluster/afr-v1/src/afr-dir-write.h b/xlators/cluster/afr-v1/src/afr-dir-write.h new file mode 100644 index 000000000..02f0a3682 --- /dev/null +++ b/xlators/cluster/afr-v1/src/afr-dir-write.h @@ -0,0 +1,47 @@ +/* +  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> +  This file is part of GlusterFS. + +  This file is licensed to you under your choice of the GNU Lesser +  General Public License, version 3 or any later version (LGPLv3 or +  later), or the GNU General Public License, version 2 (GPLv2), in all +  cases as published by the Free Software Foundation. +*/ + +#ifndef __DIR_WRITE_H__ +#define __DIR_WRITE_H__ + +int32_t +afr_create (call_frame_t *frame, xlator_t *this, +            loc_t *loc, int32_t flags, mode_t mode, +            mode_t umask, fd_t *fd, dict_t *xdata); + +int32_t +afr_mknod (call_frame_t *frame, xlator_t *this, +	   loc_t *loc, mode_t mode, dev_t dev, mode_t umask, dict_t *xdata); + +int32_t +afr_mkdir (call_frame_t *frame, xlator_t *this, +	   loc_t *loc, mode_t mode, mode_t umask, dict_t *xdata); + +int32_t +afr_unlink (call_frame_t *frame, xlator_t *this, +	    loc_t *loc, int xflag, dict_t *xdata); + +int32_t +afr_rmdir (call_frame_t *frame, xlator_t *this, +	   loc_t *loc, int flags, dict_t *xdata); + +int32_t +afr_link (call_frame_t *frame, xlator_t *this, +	  loc_t *oldloc, loc_t *newloc, dict_t *xdata); + +int32_t +afr_rename (call_frame_t *frame, xlator_t *this, +	    loc_t *oldloc, loc_t *newloc, dict_t *xdata); + +int +afr_symlink (call_frame_t *frame, xlator_t *this, +	     const char *linkpath, loc_t *oldloc, mode_t umask, dict_t *params); + +#endif /* __DIR_WRITE_H__ */ diff --git a/xlators/cluster/afr-v1/src/afr-inode-read.c b/xlators/cluster/afr-v1/src/afr-inode-read.c new file mode 100644 index 000000000..0cfebcb9d --- /dev/null +++ b/xlators/cluster/afr-v1/src/afr-inode-read.c @@ -0,0 +1,1976 @@ +/* +  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> +  This file is part of GlusterFS. 
+ +  This file is licensed to you under your choice of the GNU Lesser +  General Public License, version 3 or any later version (LGPLv3 or +  later), or the GNU General Public License, version 2 (GPLv2), in all +  cases as published by the Free Software Foundation. +*/ + + +#include <libgen.h> +#include <unistd.h> +#include <fnmatch.h> +#include <sys/time.h> +#include <stdlib.h> +#include <signal.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "afr.h" +#include "dict.h" +#include "xlator.h" +#include "hashfn.h" +#include "logging.h" +#include "stack.h" +#include "list.h" +#include "call-stub.h" +#include "defaults.h" +#include "common-utils.h" +#include "compat-errno.h" +#include "compat.h" + +/** + * Common algorithm for inode read calls: + * + * - Try the fop on the first child that is up + * - if we have failed due to ENOTCONN: + *     try the next child + * + * Applicable to: access, stat, fstat, readlink, getxattr + */ + +/* {{{ access */ + +int32_t +afr_access_cbk (call_frame_t *frame, void *cookie, +                xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ +        afr_private_t * priv            = NULL; +        afr_local_t *   local           = NULL; +        xlator_t **     children        = NULL; +        int             unwind          = 1; +        int32_t         *last_index     = NULL; +        int32_t         next_call_child = -1; +        int32_t         read_child      = -1; +        int32_t         *fresh_children  = NULL; + +        priv     = this->private; +        children = priv->children; + +        local = frame->local; + +        read_child = (long) cookie; + +        if (op_ret == -1) { +                last_index = &local->cont.access.last_index; +                fresh_children = local->fresh_children; +                next_call_child = afr_next_call_child (fresh_children, +                                                       local->child_up, +       
                                                priv->child_count, +                                                       last_index, read_child); +                if (next_call_child < 0) +                        goto out; + +                unwind = 0; + +                STACK_WIND_COOKIE (frame, afr_access_cbk, +                                   (void *) (long) read_child, +                                   children[next_call_child], +                                   children[next_call_child]->fops->access, +                                   &local->loc, local->cont.access.mask, +                                   NULL); +        } + +out: +        if (unwind) { +                AFR_STACK_UNWIND (access, frame, op_ret, op_errno, xdata); +        } + +        return 0; +} + + +int32_t +afr_access (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask, +            dict_t *xdata) +{ +        afr_private_t   *priv      = NULL; +        xlator_t        **children = NULL; +        int             call_child = 0; +        afr_local_t     *local     = NULL; +        int32_t         op_errno   = 0; +        int32_t         read_child = -1; +        int             ret        = -1; + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (this->private, out); + +        priv     = this->private; +        VALIDATE_OR_GOTO (priv->children, out); + +        children = priv->children; + +        AFR_SBRAIN_CHECK_LOC (loc, out); + +        AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); +        local = frame->local; + +        ret = afr_local_init (local, priv, &op_errno); +        if (ret < 0) +                goto out; + +        local->fresh_children = afr_children_create (priv->child_count); +        if (!local->fresh_children) { +                op_errno = ENOMEM; +                goto out; +        } + + +        read_child = afr_inode_get_read_ctx (this, loc->inode, +                                          
   local->fresh_children); +        ret = afr_get_call_child (this, local->child_up, read_child, +                                     local->fresh_children, +                                     &call_child, +                                     &local->cont.access.last_index); +        if (ret < 0) { +                op_errno = -ret; +                goto out; +        } + +        loc_copy (&local->loc, loc); +        local->cont.access.mask = mask; + +        STACK_WIND_COOKIE (frame, afr_access_cbk, +                           (void *) (long) call_child, +                           children[call_child], +                           children[call_child]->fops->access, +                           loc, mask, xdata); + +        ret = 0; +out: +        if (ret < 0) +                AFR_STACK_UNWIND (access, frame, -1, op_errno, NULL); +        return 0; +} + + +/* }}} */ + +/* {{{ stat */ + +int32_t +afr_stat_cbk (call_frame_t *frame, void *cookie, +              xlator_t *this, int32_t op_ret, int32_t op_errno, +              struct iatt *buf, dict_t *xdata) +{ +        afr_private_t * priv            = NULL; +        afr_local_t *   local           = NULL; +        xlator_t **     children        = NULL; +        int             unwind          = 1; +        int32_t         *last_index     = NULL; +        int32_t         next_call_child = -1; +        int32_t         read_child      = -1; +        int32_t         *fresh_children  = NULL; + +        priv     = this->private; +        children = priv->children; + +        read_child = (long) cookie; + +        local = frame->local; + +        if (op_ret == -1) { +                last_index = &local->cont.stat.last_index; +                fresh_children = local->fresh_children; +                next_call_child = afr_next_call_child (fresh_children, +                                                       local->child_up, +                                                       priv->child_count, +                      
                                 last_index, read_child); +                if (next_call_child < 0) +                        goto out; + +                unwind = 0; + +                STACK_WIND_COOKIE (frame, afr_stat_cbk, +                                   (void *) (long) read_child, +                                   children[next_call_child], +                                   children[next_call_child]->fops->stat, +                                   &local->loc, NULL); +        } + +out: +        if (unwind) { +                AFR_STACK_UNWIND (stat, frame, op_ret, op_errno, buf, xdata); +        } + +        return 0; +} + + +int32_t +afr_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ +        afr_private_t   *priv      = NULL; +        afr_local_t     *local     = NULL; +        xlator_t        **children = NULL; +        int             call_child = 0; +        int32_t         op_errno   = 0; +        int32_t         read_child = -1; +        int             ret        = -1; + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (this->private, out); + +        priv     = this->private; +        VALIDATE_OR_GOTO (priv->children, out); + +        children = priv->children; + +        AFR_SBRAIN_CHECK_LOC (loc, out); + +        AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); +        local = frame->local; + +        ret = afr_local_init (local, priv, &op_errno); +        if (ret < 0) +                goto out; + +        local->fresh_children = afr_children_create (priv->child_count); +        if (!local->fresh_children) { +                op_errno = ENOMEM; +                goto out; +        } + +        read_child = afr_inode_get_read_ctx (this, loc->inode, +                                             local->fresh_children); +        ret = afr_get_call_child (this, local->child_up, read_child, +                                     local->fresh_children, +                        
             &call_child, +                                     &local->cont.stat.last_index); +        if (ret < 0) { +                op_errno = -ret; +                goto out; +        } +        loc_copy (&local->loc, loc); + +        STACK_WIND_COOKIE (frame, afr_stat_cbk, (void *) (long) call_child, +                           children[call_child], +                           children[call_child]->fops->stat, +                           loc, xdata); + +        ret = 0; +out: +        if (ret < 0) +                AFR_STACK_UNWIND (stat, frame, -1, op_errno, NULL, NULL); + +        return 0; +} + + +/* }}} */ + +/* {{{ fstat */ + +int32_t +afr_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +               int32_t op_ret, int32_t op_errno, struct iatt *buf, +               dict_t *xdata) +{ +        afr_private_t   *priv           = NULL; +        afr_local_t     *local          = NULL; +        xlator_t        **children      = NULL; +        int             unwind          = 1; +        int32_t         *last_index     = NULL; +        int32_t         next_call_child = -1; +        int32_t         read_child      = -1; +        int32_t         *fresh_children  = NULL; + +        priv     = this->private; +        children = priv->children; + +        local = frame->local; + +        read_child = (long) cookie; + +        if (op_ret == -1) { +                last_index = &local->cont.fstat.last_index; +                fresh_children = local->fresh_children; +                next_call_child = afr_next_call_child (fresh_children, +                                                       local->child_up, +                                                       priv->child_count, +                                                       last_index, read_child); +                if (next_call_child < 0) +                        goto out; + +                unwind = 0; + +                STACK_WIND_COOKIE (frame, afr_fstat_cbk, +                            
       (void *) (long) read_child, +                                   children[next_call_child], +                                   children[next_call_child]->fops->fstat, +                                   local->fd, NULL); +        } + +out: +        if (unwind) { +                AFR_STACK_UNWIND (fstat, frame, op_ret, op_errno, buf, xdata); +        } + +        return 0; +} + + +int32_t +afr_fstat (call_frame_t *frame, xlator_t *this, +           fd_t *fd, dict_t *xdata) +{ +        afr_private_t   *priv      = NULL; +        afr_local_t     *local     = NULL; +        xlator_t        **children = NULL; +        int             call_child = 0; +        int32_t         op_errno   = 0; +        int32_t         read_child = 0; +        int             ret        = -1; + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (fd, out); +        VALIDATE_OR_GOTO (this->private, out); + +        priv     = this->private; +        VALIDATE_OR_GOTO (priv->children, out); + +        children = priv->children; + +        VALIDATE_OR_GOTO (fd->inode, out); + +        AFR_SBRAIN_CHECK_FD (fd, out); + +        AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); +        local = frame->local; + +        ret = afr_local_init (local, priv, &op_errno); +        if (ret < 0) +                goto out; + +        local->fresh_children = afr_children_create (priv->child_count); +        if (!local->fresh_children) { +                op_errno = ENOMEM; +                goto out; +        } + +        read_child = afr_inode_get_read_ctx (this, fd->inode, +                                             local->fresh_children); + + + +        ret = afr_get_call_child (this, local->child_up, read_child, +                                     local->fresh_children, +                                     &call_child, +                                     &local->cont.fstat.last_index); +        if (ret < 0) { +                op_errno = -ret; +   
             goto out; +        } + +        local->fd = fd_ref (fd); + +        afr_open_fd_fix (fd, this); + +        STACK_WIND_COOKIE (frame, afr_fstat_cbk, (void *) (long) call_child, +                           children[call_child], +                           children[call_child]->fops->fstat, +                           fd, xdata); + +        ret = 0; +out: +        if (ret < 0) +                AFR_STACK_UNWIND (fstat, frame, -1, op_errno, NULL, NULL); + +        return 0; +} + +/* }}} */ + +/* {{{ readlink */ + +int32_t +afr_readlink_cbk (call_frame_t *frame, void *cookie, +                  xlator_t *this, int32_t op_ret, int32_t op_errno, +                  const char *buf, struct iatt *sbuf, dict_t *xdata) +{ +        afr_private_t * priv                  = NULL; +        afr_local_t *   local                 = NULL; +        xlator_t **     children              = NULL; +        int             unwind                = 1; +        int32_t         *last_index           = NULL; +        int32_t         next_call_child       = -1; +        int32_t         read_child            = -1; +        int32_t         *fresh_children        = NULL; + +        priv     = this->private; +        children = priv->children; + +        local = frame->local; + +        read_child = (long) cookie; + +        if (op_ret == -1) { +                last_index = &local->cont.readlink.last_index; +                fresh_children = local->fresh_children; +                next_call_child = afr_next_call_child (fresh_children, +                                                       local->child_up, +                                                       priv->child_count, +                                                       last_index, read_child); +                if (next_call_child < 0) +                        goto out; + +                unwind = 0; +                STACK_WIND_COOKIE (frame, afr_readlink_cbk, +                                   (void *) (long) read_child, +  
                                 children[next_call_child], +                                   children[next_call_child]->fops->readlink, +                                   &local->loc, +                                   local->cont.readlink.size, NULL); +        } + +out: +        if (unwind) { +                AFR_STACK_UNWIND (readlink, frame, op_ret, op_errno, buf, sbuf, +                                  xdata); +        } + +        return 0; +} + + +int32_t +afr_readlink (call_frame_t *frame, xlator_t *this, +              loc_t *loc, size_t size, dict_t *xdata) +{ +        afr_private_t   *priv      = NULL; +        xlator_t        **children = NULL; +        int             call_child = 0; +        afr_local_t     *local     = NULL; +        int32_t         op_errno   = 0; +        int32_t         read_child = -1; +        int             ret        = -1; + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (this->private, out); + +        priv     = this->private; +        VALIDATE_OR_GOTO (priv->children, out); + +        children = priv->children; + +        AFR_SBRAIN_CHECK_LOC (loc, out); + +        AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); +        local = frame->local; + +        ret = afr_local_init (local, priv, &op_errno); +        if (ret < 0) +                goto out; + +        local->fresh_children = afr_children_create (priv->child_count); +        if (!local->fresh_children) { +                op_errno = ENOMEM; +                goto out; +        } +        read_child = afr_inode_get_read_ctx (this, loc->inode, +                                             local->fresh_children); +        ret = afr_get_call_child (this, local->child_up, read_child, +                                     local->fresh_children, +                                     &call_child, +                                     &local->cont.readlink.last_index); +        if (ret < 0) { +                op_errno 
= -ret; +                goto out; +        } + +        loc_copy (&local->loc, loc); + +        local->cont.readlink.size       = size; + +        STACK_WIND_COOKIE (frame, afr_readlink_cbk, +                           (void *) (long) call_child, +                           children[call_child], +                           children[call_child]->fops->readlink, +                           loc, size, xdata); + +        ret = 0; +out: +        if (ret < 0) +                AFR_STACK_UNWIND (readlink, frame, -1, op_errno, NULL, NULL, NULL); +        return 0; +} + + +/* }}} */ + +/* {{{ getxattr */ + +struct _xattr_key { +        char *key; +        struct list_head list; +}; + + +int +__gather_xattr_keys (dict_t *dict, char *key, data_t *value, +                     void *data) +{ +        struct list_head *  list  = data; +        struct _xattr_key * xkey  = NULL; + +        if (!strncmp (key, AFR_XATTR_PREFIX, +                      strlen (AFR_XATTR_PREFIX))) { + +                xkey = GF_CALLOC (1, sizeof (*xkey), gf_afr_mt_xattr_key); +                if (!xkey) +                        return -1; + +                xkey->key = key; +                INIT_LIST_HEAD (&xkey->list); + +                list_add_tail (&xkey->list, list); +        } +        return 0; +} + + +void +__filter_xattrs (dict_t *dict) +{ +        struct list_head   keys = {0,}; +        struct _xattr_key *key  = NULL; +        struct _xattr_key *tmp  = NULL; + +        INIT_LIST_HEAD (&keys); + +        dict_foreach (dict, __gather_xattr_keys, +                      (void *) &keys); + +        list_for_each_entry_safe (key, tmp, &keys, list) { +                dict_del (dict, key->key); + +                list_del_init (&key->list); + +                GF_FREE (key); +        } +} + + + +int32_t +afr_getxattr_cbk (call_frame_t *frame, void *cookie, +                  xlator_t *this, int32_t op_ret, int32_t op_errno, +                  dict_t *dict, dict_t *xdata) +{ +        afr_private_t * 
priv            = NULL; +        afr_local_t *   local           = NULL; +        xlator_t **     children        = NULL; +        int             unwind          = 1; +        int32_t         *last_index     = NULL; +        int32_t         next_call_child = -1; +        int32_t         read_child      = -1; +        int32_t         *fresh_children  = NULL; + +        priv     = this->private; +        children = priv->children; + +        local = frame->local; + +        read_child = (long) cookie; + +        if (op_ret == -1) { +                last_index = &local->cont.getxattr.last_index; +                fresh_children = local->fresh_children; +                next_call_child = afr_next_call_child (fresh_children, +                                                       local->child_up, +                                                       priv->child_count, +                                                       last_index, read_child); +                if (next_call_child < 0) +                        goto out; + +                unwind = 0; +                STACK_WIND_COOKIE (frame, afr_getxattr_cbk, +                                   (void *) (long) read_child, +                                   children[next_call_child], +                                   children[next_call_child]->fops->getxattr, +                                   &local->loc, +                                   local->cont.getxattr.name, +                                   NULL); +        } + +out: +        if (unwind) { +                if (op_ret >= 0 && dict) +                        __filter_xattrs (dict); + +                AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict, xdata); +        } + +        return 0; +} + +int32_t +afr_getxattr_unwind (call_frame_t *frame, int op_ret, int op_errno, +                     dict_t *dict, dict_t *xdata) + +{ +        AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict, xdata); +        return 0; +} + +int32_t 
+afr_fgetxattr_clrlk_cbk (call_frame_t *frame, void *cookie, +                         xlator_t *this, int32_t op_ret, int32_t op_errno, +                         dict_t *dict, dict_t *xdata) +{ +        afr_local_t     *local                  = NULL; +        afr_private_t   *priv                   = NULL; +        xlator_t        **children              = NULL; +        dict_t          *xattr                  = NULL; +        char            *tmp_report             = NULL; +        char            lk_summary[1024]        = {0,}; +        int             serz_len                = 0; +        int32_t         callcnt                 = 0; +        long int        cky                     = 0; +        int             ret                     = 0; + +        priv     = this->private; +        children = priv->children; + +        local = frame->local; +        cky = (long) cookie; + +        LOCK (&frame->lock); +        { +                callcnt = --local->call_count; +                if (op_ret == -1) +                        local->child_errno[cky] = op_errno; + +                if (!local->dict) +                        local->dict = dict_new (); +                if (local->dict) { +                        ret = dict_get_str (dict, local->cont.getxattr.name, +                                            &tmp_report); +                        if (ret) +                                goto unlock; +                        ret = dict_set_dynstr (local->dict, +                                               children[cky]->name, +                                               gf_strdup (tmp_report)); +                        if (ret) +                                goto unlock; +                } +        } +unlock: +        UNLOCK (&frame->lock); + +        if (!callcnt) { +                xattr = dict_new (); +                if (!xattr) { +                        op_ret = -1; +                        op_errno = ENOMEM; +                        goto unwind; +           
     } +                ret = dict_serialize_value_with_delim (local->dict, +                                                       lk_summary, +                                                       &serz_len, '\n'); +                if (ret) { +                        op_ret = -1; +                        op_errno = ENOMEM; +                        gf_log (this->name, GF_LOG_ERROR, +                                "Error serializing dictionary"); +                        goto unwind; +                } +                if (serz_len == -1) +                        snprintf (lk_summary, sizeof (lk_summary), +                                  "No locks cleared."); +                ret = dict_set_dynstr (xattr, local->cont.getxattr.name, +                                       gf_strdup (lk_summary)); +                if (ret) { +                        op_ret = -1; +                        op_errno = ENOMEM; +                        gf_log (this->name, GF_LOG_ERROR, +                                "Error setting dictionary"); +                        goto unwind; +                } + +        unwind: +                // Updating child_errno with more recent 'events' +                local->child_errno[cky] = op_errno; +                op_errno = afr_resultant_errno_get (NULL, local->child_errno, +                                                    priv->child_count); +                AFR_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, xattr, +                                  xdata); + +                if (xattr) +                        dict_unref (xattr); +        } + +        return ret; +} + +int32_t +afr_getxattr_clrlk_cbk (call_frame_t *frame, void *cookie, +                        xlator_t *this, int32_t op_ret, int32_t op_errno, +                        dict_t *dict, dict_t *xdata) +{ +        afr_local_t     *local                  = NULL; +        afr_private_t   *priv                   = NULL; +        xlator_t        **children              = NULL; +  
      dict_t          *xattr                  = NULL; +        char            *tmp_report             = NULL; +        char            lk_summary[1024]        = {0,}; +        int             serz_len                = 0; +        int32_t         callcnt                 = 0; +        long int        cky                     = 0; +        int             ret                     = 0; + +        priv     = this->private; +        children = priv->children; + +        local = frame->local; +        cky = (long) cookie; + +        LOCK (&frame->lock); +        { +                callcnt = --local->call_count; +                if (op_ret == -1) +                        local->child_errno[cky] = op_errno; + +                if (!local->dict) +                        local->dict = dict_new (); +                if (local->dict) { +                        ret = dict_get_str (dict, local->cont.getxattr.name, +                                            &tmp_report); +                        if (ret) +                                goto unlock; +                        ret = dict_set_dynstr (local->dict, +                                               children[cky]->name, +                                               gf_strdup (tmp_report)); +                        if (ret) +                                goto unlock; +                } +        } +unlock: +        UNLOCK (&frame->lock); + +        if (!callcnt) { +                xattr = dict_new (); +                if (!xattr) { +                        op_ret = -1; +                        op_errno = ENOMEM; +                        goto unwind; +                } +                ret = dict_serialize_value_with_delim (local->dict, +                                                       lk_summary, +                                                       &serz_len, '\n'); +                if (ret) { +                        op_ret = -1; +                        op_errno = ENOMEM; +                        gf_log 
(this->name, GF_LOG_ERROR, +                                "Error serializing dictionary"); +                        goto unwind; +                } +                if (serz_len == -1) +                        snprintf (lk_summary, sizeof (lk_summary), +                                  "No locks cleared."); +                ret = dict_set_dynstr (xattr, local->cont.getxattr.name, +                                       gf_strdup (lk_summary)); +                if (ret) { +                        op_ret = -1; +                        op_errno = ENOMEM; +                        gf_log (this->name, GF_LOG_ERROR, +                                "Error setting dictionary"); +                        goto unwind; +                } + +        unwind: +                // Updating child_errno with more recent 'events' +                local->child_errno[cky] = op_errno; +                op_errno = afr_resultant_errno_get (NULL, local->child_errno, +                                                    priv->child_count); +                AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, xattr, xdata); + +                if (xattr) +                        dict_unref (xattr); +        } + +        return ret; +} + +/** + * node-uuid cbk uses next child querying mechanism + */ +int32_t +afr_getxattr_node_uuid_cbk (call_frame_t *frame, void *cookie, +                            xlator_t *this, int32_t op_ret, int32_t op_errno, +                            dict_t *dict, dict_t *xdata) +{ +        afr_private_t  *priv            = NULL; +        afr_local_t    *local           = NULL; +        xlator_t      **children        = NULL; +        int             unwind          = 1; +        int             curr_call_child = 0; + +        priv = this->private; +        children = priv->children; + +        local = frame->local; + +        if (op_ret == -1) { /** query the _next_ child */ + +                /** +                 * _current_ becomes _next_ +                 * If 
done with all childs and yet no success; give up ! +                 */ +                curr_call_child = (int) ((long)cookie); +                if (++curr_call_child == priv->child_count) +                        goto unwind; + +                gf_log (this->name, GF_LOG_WARNING, +                        "op_ret (-1): Re-querying afr-child (%d/%d)", +                        curr_call_child, priv->child_count); + +                unwind = 0; +                STACK_WIND_COOKIE (frame, afr_getxattr_node_uuid_cbk, +                                   (void *) (long) curr_call_child, +                                   children[curr_call_child], +                                   children[curr_call_child]->fops->getxattr, +                                   &local->loc, +                                   local->cont.getxattr.name, +                                   NULL); +        } + + unwind: +        if (unwind) +                AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict, +                                  NULL); + +        return 0; +} + +int32_t +afr_getxattr_lockinfo_cbk (call_frame_t *frame, void *cookie, +                           xlator_t *this, int32_t op_ret, int32_t op_errno, +                           dict_t *dict, dict_t *xdata) +{ +        int          call_cnt     = 0, len = 0; +        char        *lockinfo_buf = NULL; +        dict_t      *lockinfo     = NULL, *newdict = NULL; +        afr_local_t *local        = NULL; + +        LOCK (&frame->lock); +        { +                local = frame->local; + +                call_cnt = --local->call_count; + +                if ((op_ret < 0) || (!dict && !xdata)) { +                        goto unlock; +                } + +                if (xdata) { +                        if (!local->xdata_rsp) { +                                local->xdata_rsp = dict_new (); +                                if (!local->xdata_rsp) { +                                        local->op_ret = -1; +      
                                  local->op_errno = ENOMEM; +                                        goto unlock; +                                } +                        } +                } + +                if (!dict) { +                        goto unlock; +                } + +                op_ret = dict_get_ptr_and_len (dict, GF_XATTR_LOCKINFO_KEY, +                                               (void **)&lockinfo_buf, &len); + +                if (!lockinfo_buf) { +                        goto unlock; +                } + +                if (!local->dict) { +                        local->dict = dict_new (); +                        if (!local->dict) { +                                local->op_ret = -1; +                                local->op_errno = ENOMEM; +                                goto unlock; +                        } +                } +        } +unlock: +        UNLOCK (&frame->lock); + +        if (lockinfo_buf != NULL) { +                lockinfo = dict_new (); +                if (lockinfo == NULL) { +                        local->op_ret = -1; +                        local->op_errno = ENOMEM; +                } else { +                        op_ret = dict_unserialize (lockinfo_buf, len, +                                                   &lockinfo); + +                        if (lockinfo && local->dict) { +                                dict_copy (lockinfo, local->dict); +                        } +                } +        } + +        if (xdata && local->xdata_rsp) { +                dict_copy (xdata, local->xdata_rsp); +        } + +        if (!call_cnt) { +                newdict = dict_new (); +                if (!newdict) { +                        local->op_ret = -1; +                        local->op_errno = ENOMEM; +                        goto unwind; +                } + +                len = dict_serialized_length (local->dict); +                if (len == 0) { +                        goto unwind; +          
      } + +                lockinfo_buf = GF_CALLOC (1, len, gf_common_mt_char); +                if (!lockinfo_buf) { +                        local->op_ret = -1; +                        local->op_errno = ENOMEM; +                        goto unwind; +                } + +                op_ret = dict_serialize (local->dict, lockinfo_buf); +                if (op_ret < 0) { +                        local->op_ret = -1; +                        local->op_errno = -op_ret; +                } + +                op_ret = dict_set_dynptr (newdict, GF_XATTR_LOCKINFO_KEY, +                                          (void *)lockinfo_buf, len); +                if (op_ret < 0) { +                        local->op_ret = -1; +                        local->op_errno = -op_ret; +                        goto unwind; +                } + +        unwind: +                AFR_STACK_UNWIND (getxattr, frame, op_ret, +                                  op_errno, newdict, +                                  local->xdata_rsp); +        } + +        dict_unref (lockinfo); + +        return 0; +} + +int32_t +afr_fgetxattr_lockinfo_cbk (call_frame_t *frame, void *cookie, +                            xlator_t *this, int32_t op_ret, int32_t op_errno, +                            dict_t *dict, dict_t *xdata) +{ +        int          call_cnt     = 0, len = 0; +        char        *lockinfo_buf = NULL; +        dict_t      *lockinfo     = NULL, *newdict = NULL; +        afr_local_t *local        = NULL; + +        LOCK (&frame->lock); +        { +                local = frame->local; + +                call_cnt = --local->call_count; + +                if ((op_ret < 0) || (!dict && !xdata)) { +                        goto unlock; +                } + +                if (xdata) { +                        if (!local->xdata_rsp) { +                                local->xdata_rsp = dict_new (); +                                if (!local->xdata_rsp) { +                                        
local->op_ret = -1; +                                        local->op_errno = ENOMEM; +                                        goto unlock; +                                } +                        } +                } + +                if (!dict) { +                        goto unlock; +                } + +                op_ret = dict_get_ptr_and_len (dict, GF_XATTR_LOCKINFO_KEY, +                                               (void **)&lockinfo_buf, &len); + +                if (!lockinfo_buf) { +                        goto unlock; +                } + +                if (!local->dict) { +                        local->dict = dict_new (); +                        if (!local->dict) { +                                local->op_ret = -1; +                                local->op_errno = ENOMEM; +                                goto unlock; +                        } +                } +        } +unlock: +        UNLOCK (&frame->lock); + +        if (lockinfo_buf != NULL) { +                lockinfo = dict_new (); +                if (lockinfo == NULL) { +                        local->op_ret = -1; +                        local->op_errno = ENOMEM; +                } else { +                        op_ret = dict_unserialize (lockinfo_buf, len, +                                                   &lockinfo); + +                        if (lockinfo && local->dict) { +                                dict_copy (lockinfo, local->dict); +                        } +                } +        } + +        if (xdata && local->xdata_rsp) { +                dict_copy (xdata, local->xdata_rsp); +        } + +        if (!call_cnt) { +                newdict = dict_new (); +                if (!newdict) { +                        local->op_ret = -1; +                        local->op_errno = ENOMEM; +                        goto unwind; +                } + +                len = dict_serialized_length (local->dict); +                if (len <= 0) { +                     
   goto unwind; +                } + +                lockinfo_buf = GF_CALLOC (1, len, gf_common_mt_char); +                if (!lockinfo_buf) { +                        local->op_ret = -1; +                        local->op_errno = ENOMEM; +                        goto unwind; +                } + +                op_ret = dict_serialize (local->dict, lockinfo_buf); +                if (op_ret < 0) { +                        local->op_ret = -1; +                        local->op_errno = -op_ret; +                } + +                op_ret = dict_set_dynptr (newdict, GF_XATTR_LOCKINFO_KEY, +                                          (void *)lockinfo_buf, len); +                if (op_ret < 0) { +                        local->op_ret = -1; +                        local->op_errno = -op_ret; +                        goto unwind; +                } + +        unwind: +                AFR_STACK_UNWIND (fgetxattr, frame, op_ret, +                                  op_errno, newdict, +                                  local->xdata_rsp); +        } + +        dict_unref (lockinfo); + +        return 0; +} + +int32_t +afr_fgetxattr_pathinfo_cbk (call_frame_t *frame, void *cookie, +                            xlator_t *this, int32_t op_ret, int32_t op_errno, +                            dict_t *dict, dict_t *xdata) +{ +        afr_local_t *local          = NULL; +        int32_t      callcnt        = 0; +        int          ret            = 0; +        char        *xattr          = NULL; +        char        *xattr_serz     = NULL; +        char        xattr_cky[1024] = {0,}; +        dict_t      *nxattr         = NULL; +        long         cky            = 0; +        int32_t      padding        = 0; +        int32_t      tlen           = 0; + +        if (!frame || !frame->local || !this) { +                gf_log ("", GF_LOG_ERROR, "possible NULL deref"); +                goto out; +        } + +        local = frame->local; +        cky = (long) cookie; + +        
LOCK (&frame->lock); +        { +                callcnt = --local->call_count; + +                if (op_ret < 0) { +                        local->op_errno = op_errno; +                } else { +                        local->op_ret = op_ret; +                        if (!local->xdata_rsp && xdata) +                                local->xdata_rsp = dict_ref (xdata); +                } + +                if (!dict || (op_ret < 0)) +                        goto out; + +                if (!local->dict) +                        local->dict = dict_new (); + +                if (local->dict) { +                        ret = dict_get_str (dict, +                                            local->cont.getxattr.name, +                                            &xattr); +                        if (ret) +                                goto out; + +                        xattr = gf_strdup (xattr); + +                        (void)snprintf (xattr_cky, 1024, "%s-%ld", +                                        local->cont.getxattr.name, cky); +                        ret = dict_set_dynstr (local->dict, +                                               xattr_cky, xattr); +                        if (ret) { +                                gf_log (this->name, GF_LOG_ERROR, +                                        "Cannot set xattr cookie key"); +                                goto out; +                        } + +                        local->cont.getxattr.xattr_len +                                += strlen (xattr) + 1; +                } +        } +out: +        UNLOCK (&frame->lock); + +        if (!callcnt) { +                if (!local->cont.getxattr.xattr_len) +                        goto unwind; + +                nxattr = dict_new (); +                if (!nxattr) +                        goto unwind; + +                /* extra bytes for decorations (brackets and <>'s) */ +                padding += strlen (this->name) +                        + strlen 
(AFR_PATHINFO_HEADER) + 4; +                local->cont.getxattr.xattr_len += (padding + 2); + +                xattr_serz = GF_CALLOC (local->cont.getxattr.xattr_len, +                                        sizeof (char), gf_common_mt_char); + +                if (!xattr_serz) +                        goto unwind; + +                /* the xlator info */ +                (void) sprintf (xattr_serz, "(<"AFR_PATHINFO_HEADER"%s> ", +                                this->name); + +                /* actual series of pathinfo */ +                ret = dict_serialize_value_with_delim (local->dict, +                                                       xattr_serz +                                                       + strlen (xattr_serz), +                                                       &tlen, ' '); +                if (ret) { +                        gf_log (this->name, GF_LOG_ERROR, "Error serializing" +                                " dictionary"); +                        goto unwind; +                } + +                /* closing part */ +                *(xattr_serz + padding + tlen) = ')'; +                *(xattr_serz + padding + tlen + 1) = '\0'; + +                ret = dict_set_dynstr (nxattr, local->cont.getxattr.name, +                                       xattr_serz); +                if (ret) +                        gf_log (this->name, GF_LOG_ERROR, "Cannot set pathinfo" +                                " key in dict"); + +        unwind: +                AFR_STACK_UNWIND (fgetxattr, frame, local->op_ret, +                                  local->op_errno, nxattr, local->xdata_rsp); + +                if (nxattr) +                        dict_unref (nxattr); +        } + +        return ret; +} + +int32_t +afr_getxattr_pathinfo_cbk (call_frame_t *frame, void *cookie, +                           xlator_t *this, int32_t op_ret, int32_t op_errno, +                           dict_t *dict, dict_t *xdata) +{ +        afr_local_t *local          = 
NULL; +        int32_t      callcnt        = 0; +        int          ret            = 0; +        char        *xattr          = NULL; +        char        *xattr_serz     = NULL; +        char        xattr_cky[1024] = {0,}; +        dict_t      *nxattr         = NULL; +        long         cky            = 0; +        int32_t      padding        = 0; +        int32_t      tlen           = 0; + +        if (!frame || !frame->local || !this) { +                gf_log ("", GF_LOG_ERROR, "possible NULL deref"); +                goto out; +        } + +        local = frame->local; +        cky = (long) cookie; + +        LOCK (&frame->lock); +                { +                        callcnt = --local->call_count; + +                        if (op_ret < 0) { +                                local->op_errno = op_errno; +                        } else { +                                local->op_ret = op_ret; +                                if (!local->xdata_rsp && xdata) +                                        local->xdata_rsp = dict_ref (xdata); +                        } + +                        if (!dict || (op_ret < 0)) +                                goto out; + +                        if (!local->dict) +                                local->dict = dict_new (); + +                        if (local->dict) { +                                ret = dict_get_str (dict, +                                                    local->cont.getxattr.name, +                                                    &xattr); +                                if (ret) +                                        goto out; + +                                xattr = gf_strdup (xattr); + +                                (void)snprintf (xattr_cky, 1024, "%s-%ld", +                                                local->cont.getxattr.name, cky); +                                ret = dict_set_dynstr (local->dict, +                                                       xattr_cky, xattr); +  
                              if (ret) { +                                        gf_log (this->name, GF_LOG_ERROR, +                                                "Cannot set xattr cookie key"); +                                        goto out; +                                } + +                                local->cont.getxattr.xattr_len += strlen (xattr) + 1; +                        } +                } + out: +        UNLOCK (&frame->lock); + +        if (!callcnt) { +                if (!local->cont.getxattr.xattr_len) +                        goto unwind; + +                nxattr = dict_new (); +                if (!nxattr) +                        goto unwind; + +                /* extra bytes for decorations (brackets and <>'s) */ +                padding += strlen (this->name) + strlen (AFR_PATHINFO_HEADER) + 4; +                local->cont.getxattr.xattr_len += (padding + 2); + +                xattr_serz = GF_CALLOC (local->cont.getxattr.xattr_len, +                                        sizeof (char), gf_common_mt_char); + +                if (!xattr_serz) +                        goto unwind; + +                /* the xlator info */ +                (void) sprintf (xattr_serz, "(<"AFR_PATHINFO_HEADER"%s> ", +                                this->name); + +                /* actual series of pathinfo */ +                ret = dict_serialize_value_with_delim (local->dict, +                                                       xattr_serz + strlen (xattr_serz), +                                                       &tlen, ' '); +                if (ret) { +                        gf_log (this->name, GF_LOG_ERROR, "Error serializing" +                                " dictionary"); +                        goto unwind; +                } + +                /* closing part */ +                *(xattr_serz + padding + tlen) = ')'; +                *(xattr_serz + padding + tlen + 1) = '\0'; + +                ret = dict_set_dynstr (nxattr, 
local->cont.getxattr.name, +                                       xattr_serz); +                if (ret) +                        gf_log (this->name, GF_LOG_ERROR, "Cannot set pathinfo" +                                " key in dict"); + +        unwind: +                AFR_STACK_UNWIND (getxattr, frame, local->op_ret, +                                  local->op_errno, nxattr, local->xdata_rsp); + +                if (nxattr) +                        dict_unref (nxattr); +        } + +        return ret; +} + +static int +afr_aggregate_stime_xattr (dict_t *this, char *key, data_t *value, void *data) +{ +        int ret = 0; + +        if (fnmatch (GF_XATTR_STIME_PATTERN, key, FNM_NOESCAPE) == 0) +                ret = gf_get_max_stime (THIS, data, key, value); + +        return ret; +} + +int32_t +afr_common_getxattr_stime_cbk (call_frame_t *frame, void *cookie, +                               xlator_t *this, int32_t op_ret, int32_t op_errno, +                               dict_t *dict, dict_t *xdata) +{ +        afr_local_t *local          = NULL; +        int32_t      callcnt        = 0; + +        if (!frame || !frame->local || !this) { +                gf_log ("", GF_LOG_ERROR, "possible NULL deref"); +                goto out; +        } + +        local = frame->local; + +        LOCK (&frame->lock); +        { +                callcnt = --local->call_count; + +                if (!dict || (op_ret < 0)) { +                        local->op_errno = op_errno; +                        goto cleanup; +                } + +                if (!local->dict) +                        local->dict = dict_copy_with_ref (dict, NULL); +                else +                        dict_foreach (dict, afr_aggregate_stime_xattr, +                                      local->dict); +                local->op_ret = 0; +        } + +cleanup: +        UNLOCK (&frame->lock); + +        if (!callcnt) { +                AFR_STACK_UNWIND (getxattr, frame, local->op_ret, +        
                          local->op_errno, local->dict, xdata); +        } + +out: +        return 0; +} + + +static gf_boolean_t +afr_is_special_xattr (const char *name, fop_getxattr_cbk_t *cbk, +                      gf_boolean_t is_fgetxattr) +{ +        gf_boolean_t    is_spl = _gf_true; + +        GF_ASSERT (cbk); +        if (!cbk || !name) { +                is_spl = _gf_false; +                goto out; +        } + +        if (!strcmp (name, GF_XATTR_PATHINFO_KEY) || +                        !strcmp (name, GF_XATTR_USER_PATHINFO_KEY)) { +                if (is_fgetxattr) { +                        *cbk = afr_fgetxattr_pathinfo_cbk; +                } else { +                        *cbk = afr_getxattr_pathinfo_cbk; +                } +        } else if (!strncmp (name, GF_XATTR_CLRLK_CMD, +                             strlen (GF_XATTR_CLRLK_CMD))) { +                if (is_fgetxattr) { +                        *cbk = afr_fgetxattr_clrlk_cbk; +                } else { +                        *cbk = afr_getxattr_clrlk_cbk; +                } +        } else if (!strncmp (name, GF_XATTR_LOCKINFO_KEY, +                             strlen (GF_XATTR_LOCKINFO_KEY))) { +                if (is_fgetxattr) { +                        *cbk = afr_fgetxattr_lockinfo_cbk; +                } else { +                        *cbk = afr_getxattr_lockinfo_cbk; +                } +        } else if (fnmatch (GF_XATTR_STIME_PATTERN, name, FNM_NOESCAPE) == 0) { +                *cbk = afr_common_getxattr_stime_cbk; +        } else { +                is_spl = _gf_false; +        } + +out: +        return is_spl; +} + +static void +afr_getxattr_frm_all_children (xlator_t *this, call_frame_t *frame, +                               const char *name, loc_t *loc, +                               fop_getxattr_cbk_t cbk) +{ +        afr_private_t   *priv           = NULL; +        afr_local_t     *local          = NULL; +        xlator_t        **children      = NULL; +        int       
      i               = 0; +        int             call_count      = 0; + +        priv     = this->private; +        children = priv->children; + +        local = frame->local; +        //local->call_count set in afr_local_init +        call_count = local->call_count; + +        //If up-children count is 0, afr_local_init would have failed already +        //and the call would have unwound so not handling it here. + +        for (i = 0; i < priv->child_count; i++) { +                if (local->child_up[i]) { +                        STACK_WIND_COOKIE (frame, cbk, +                                           (void *) (long) i, children[i], +                                           children[i]->fops->getxattr, +                                           loc, name, NULL); +                        if (!--call_count) +                                break; +                } +        } +        return; +} + +int32_t +afr_getxattr (call_frame_t *frame, xlator_t *this, +              loc_t *loc, const char *name, dict_t *xdata) +{ +        afr_private_t           *priv         = NULL; +        xlator_t                **children    = NULL; +        int                     call_child    = 0; +        afr_local_t             *local        = NULL; +        xlator_list_t           *trav         = NULL; +        xlator_t                **sub_volumes = NULL; +        int                     i             = 0; +        int32_t                 op_errno      = 0; +        int32_t                 read_child    = -1; +        int                     ret           = -1; +        fop_getxattr_cbk_t      cbk           = NULL; +        int                     afr_xtime_gauge[MCNT_MAX] = {0,}; + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (this->private, out); + +        priv     = this->private; +        VALIDATE_OR_GOTO (priv->children, out); + +        children = priv->children; + +        AFR_SBRAIN_CHECK_LOC (loc, out); 
+ +        AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); +        local = frame->local; + +        ret = afr_local_init (local, priv, &op_errno); +        if (ret < 0) +                goto out; + +        loc_copy (&local->loc, loc); +        if (!name) +                goto no_name; + +        local->cont.getxattr.name = gf_strdup (name); + +        if (!strncmp (name, AFR_XATTR_PREFIX, +                      strlen (AFR_XATTR_PREFIX))) { +                gf_log (this->name, GF_LOG_INFO, +                        "%s: no data present for key %s", +                        loc->path, name); +                op_errno = ENODATA; +                goto out; +        } +        if ((strcmp (GF_XATTR_MARKER_KEY, name) == 0) +            && (GF_CLIENT_PID_GSYNCD == frame->root->pid)) { + +                local->marker.call_count = priv->child_count; + +                sub_volumes = alloca ( priv->child_count * sizeof (xlator_t *)); +                for (i = 0, trav = this->children; trav ; +                     trav = trav->next, i++) { + +                        *(sub_volumes + i)  = trav->xlator; +                } + +                if (cluster_getmarkerattr (frame, this, loc, name, +                                           local, afr_getxattr_unwind, +                                           sub_volumes, +                                           priv->child_count, +                                           MARKER_UUID_TYPE, +                                           marker_uuid_default_gauge, +                                           priv->vol_uuid)) { + +                        gf_log (this->name, GF_LOG_INFO, +                                "%s: failed to get marker attr (%s)", +                                loc->path, name); +                        op_errno = EINVAL; +                        goto out; +                } + +                return 0; +        } + +        /* +         * if we are doing getxattr with pathinfo as the key then we +         * 
collect information from all childs +         */ +        if (afr_is_special_xattr (name, &cbk, 0)) { +                afr_getxattr_frm_all_children (this, frame, name, +                                               loc, cbk); +                return 0; +        } + +        if (XATTR_IS_NODE_UUID (name)) { +                i = 0; +                STACK_WIND_COOKIE (frame, afr_getxattr_node_uuid_cbk, +                                   (void *) (long) i, +                                   children[i], +                                   children[i]->fops->getxattr, +                                   loc, name, xdata); +                return 0; +        } + +        if (*priv->vol_uuid) { +                if ((match_uuid_local (name, priv->vol_uuid) == 0) +                    && (GF_CLIENT_PID_GSYNCD == frame->root->pid)) { +                        local->marker.call_count = priv->child_count; + +                        sub_volumes = alloca ( priv->child_count +                                               * sizeof (xlator_t *)); +                        for (i = 0, trav = this->children; trav ; +                             trav = trav->next, i++) { + +                                *(sub_volumes + i)  = trav->xlator; + +                        } + +                        /* don't err out on getting ENOTCONN (brick down) +                         * from a subset of the bricks +                         */ +                        memcpy (afr_xtime_gauge, marker_xtime_default_gauge, +                                sizeof (afr_xtime_gauge)); +                        afr_xtime_gauge[MCNT_NOTFOUND] = 0; +                        afr_xtime_gauge[MCNT_ENOTCONN] = 0; +                        if (cluster_getmarkerattr (frame, this, loc, +                                                   name, local, +                                                   afr_getxattr_unwind, +                                                   sub_volumes, +                               
                    priv->child_count, +                                                   MARKER_XTIME_TYPE, +                                                   afr_xtime_gauge, +                                                   priv->vol_uuid)) { +                                gf_log (this->name, GF_LOG_INFO, +                                        "%s: failed to get marker attr (%s)", +                                        loc->path, name); +                                op_errno = EINVAL; +                                goto out; +                        } + +                        return 0; +                } +        } + +no_name: +        local->fresh_children = afr_children_create (priv->child_count); +        if (!local->fresh_children) { +                op_errno = ENOMEM; +                goto out; +        } + +        read_child = afr_inode_get_read_ctx (this, loc->inode, +                                             local->fresh_children); +        ret = afr_get_call_child (this, local->child_up, read_child, +                                     local->fresh_children, +                                     &call_child, +                                     &local->cont.getxattr.last_index); +        if (ret < 0) { +                op_errno = -ret; +                goto out; +        } + +        STACK_WIND_COOKIE (frame, afr_getxattr_cbk, +                           (void *) (long) call_child, +                           children[call_child], +                           children[call_child]->fops->getxattr, +                           loc, name, xdata); + +        ret = 0; +out: +        if (ret < 0) +                AFR_STACK_UNWIND (getxattr, frame, -1, op_errno, NULL, NULL); +        return 0; +} + +/* {{{ fgetxattr */ + + +int32_t +afr_fgetxattr_cbk (call_frame_t *frame, void *cookie, +                   xlator_t *this, int32_t op_ret, int32_t op_errno, +                   dict_t *dict, dict_t *xdata) +{ +        afr_private_t * priv      
      = NULL; +        afr_local_t *   local           = NULL; +        xlator_t **     children        = NULL; +        int             unwind          = 1; +        int32_t         *last_index     = NULL; +        int32_t         next_call_child = -1; +        int32_t         read_child      = -1; +        int32_t         *fresh_children  = NULL; + +        priv     = this->private; +        children = priv->children; + +        local = frame->local; + +        read_child = (long) cookie; + +        if (op_ret == -1) { +                last_index = &local->cont.getxattr.last_index; +                fresh_children = local->fresh_children; +                next_call_child = afr_next_call_child (fresh_children, +                                                       local->child_up, +                                                       priv->child_count, +                                                       last_index, read_child); +                if (next_call_child < 0) +                        goto out; + +                unwind = 0; +                STACK_WIND_COOKIE (frame, afr_fgetxattr_cbk, +                                   (void *) (long) read_child, +                                   children[next_call_child], +                                   children[next_call_child]->fops->fgetxattr, +                                   local->fd, +                                   local->cont.getxattr.name, +                                   NULL); +        } + +out: +        if (unwind) { +                if (op_ret >= 0 && dict) +                        __filter_xattrs (dict); + +                AFR_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, dict, +                                  xdata); +        } + +        return 0; +} + +int32_t +afr_fgetxattr_unwind (call_frame_t *frame, +                      int op_ret, int op_errno, dict_t *dict, dict_t *xdata) + +{ +        AFR_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, dict, xdata); +        return 
0;
+}
+
+static void
+afr_fgetxattr_frm_all_children (xlator_t *this, call_frame_t *frame,
+                                const char *name, fd_t *fd,
+                                fop_fgetxattr_cbk_t cbk)
+{
+        afr_private_t   *priv           = NULL;
+        afr_local_t     *local          = NULL;
+        xlator_t        **children      = NULL;
+        int             i               = 0;
+        int             call_count      = 0;
+
+        priv     = this->private;
+        children = priv->children;
+
+        local = frame->local;
+        //local->call_count set in afr_local_init
+        call_count = local->call_count;
+
+        //If up-children count is 0, afr_local_init would have failed already
+        //and the call would have unwound so not handling it here.
+
+        for (i = 0; i < priv->child_count; i++) {
+                if (local->child_up[i]) {
+                        STACK_WIND_COOKIE (frame, cbk,
+                                           (void *) (long) i,
+                                           children[i],
+                                           children[i]->fops->fgetxattr,
+                                           fd, name, NULL);
+                        if (!--call_count)
+                                break;
+                }
+        }
+
+        return;
+}
+
+/*
+ * fgetxattr fop entry point.
+ *
+ * Keys accepted by afr_is_special_xattr (name, &cbk, 1) are wound to every
+ * child that is up, using the callback that helper selects. Any other key
+ * is wound to the single child picked by afr_get_call_child(), with
+ * afr_fgetxattr_cbk as the callback.
+ *
+ * Always returns 0. Every failure is reported by unwinding @frame with
+ * op_ret -1 and the errno collected in op_errno. To guarantee that, op_ret
+ * stays at -1 until the wind has actually been issued, and helper results
+ * are kept in a separate variable, so every "goto out" reaches the unwind.
+ */
+int32_t
+afr_fgetxattr (call_frame_t *frame, xlator_t *this,
+               fd_t *fd, const char *name, dict_t *xdata)
+{
+        afr_private_t        *priv       = NULL;
+        xlator_t            **children   = NULL;
+        int                   call_child = 0;
+        afr_local_t          *local      = NULL;
+        int32_t               op_ret     = -1;
+        int32_t               op_errno   = 0;
+        int32_t               read_child = -1;
+        int                   ret        = 0;
+        fop_fgetxattr_cbk_t   cbk        = NULL;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+        /* fd is dereferenced below; validate it the way afr_readv does */
+        VALIDATE_OR_GOTO (fd, out);
+
+        priv     = this->private;
+        VALIDATE_OR_GOTO (priv->children, out);
+
+        children = priv->children;
+
+        AFR_SBRAIN_CHECK_FD (fd, out);
+
+        AFR_LOCAL_ALLOC_OR_GOTO (local, out);
+        frame->local = local;
+
+        /* Keep the errno afr_local_init reports through &op_errno (as
+         * afr_readv and afr_writev do) instead of deriving one from its
+         * return value.
+         */
+        ret = afr_local_init (local, priv, &op_errno);
+        if (ret < 0)
+                goto out;
+
+        local->fd = fd_ref (fd);
+        if (name) {
+                local->cont.getxattr.name = gf_strdup (name);
+                /* afr_fgetxattr_cbk re-winds with this copy on retry, so a
+                 * failed duplication must not be silently turned into a
+                 * NULL key.
+                 */
+                if (!local->cont.getxattr.name) {
+                        op_errno = ENOMEM;
+                        goto out;
+                }
+        }
+
+        /* pathinfo gets handled only in getxattr(), but we need to handle
+         * lockinfo.
+         * If we are doing fgetxattr with lockinfo as the key then we
+         * collect information from all children.
+         */
+        if (afr_is_special_xattr (name, &cbk, 1)) {
+                afr_fgetxattr_frm_all_children (this, frame, name,
+                                                fd, cbk);
+                return 0;
+        }
+
+
+        local->fresh_children = afr_children_create (priv->child_count);
+        if (!local->fresh_children) {
+                op_errno = ENOMEM;
+                goto out;
+        }
+
+        read_child = afr_inode_get_read_ctx (this, fd->inode,
+                                             local->fresh_children);
+        ret = afr_get_call_child (this, local->child_up, read_child,
+                                  local->fresh_children,
+                                  &call_child,
+                                  &local->cont.getxattr.last_index);
+        if (ret < 0) {
+                /* afr_get_call_child returns a negated errno */
+                op_errno = -ret;
+                goto out;
+        }
+
+        STACK_WIND_COOKIE (frame, afr_fgetxattr_cbk,
+                           (void *) (long) call_child,
+                           children[call_child],
+                           children[call_child]->fops->fgetxattr,
+                           fd, name, xdata);
+
+        op_ret = 0;
+out:
+        if (op_ret == -1) {
+                AFR_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, NULL,
+                                  NULL);
+        }
+        return 0;
+}
+
+
+/* }}} */
+
+/* {{{ readv */
+
+/**
+ * read algorithm:
+ *
+ * if the user has specified a read subvolume, use it
+ * otherwise -
+ *   use the inode number to hash it to one of the subvolumes, and
+ *   read from there (to balance read load)
+ *
+ * if any of the above read's fail, try the children in sequence
+ * beginning at the beginning
+ */
+
+int32_t
+afr_readv_cbk (call_frame_t *frame, void *cookie,
+               xlator_t *this, int32_t op_ret, int32_t op_errno,
+               struct iovec *vector, int32_t count, struct iatt *buf,
+               struct iobref *iobref, dict_t *xdata)
+{
+        afr_private_t * priv            = NULL;
+        afr_local_t *   local           = NULL;
+        xlator_t **     children        = NULL;
+        int             unwind          = 1;
+        int32_t         *last_index     = NULL;
+        int32_t         next_call_child = -1;
+        int32_t         *fresh_children  = NULL;
+        int32_t         read_child      = -1;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        priv     = this->private;
+        VALIDATE_OR_GOTO (priv->children, out);
+
+        children = priv->children;
+
+        local = frame->local;
+
+        read_child = (long) cookie;
+
+        if (op_ret == -1) {
+                last_index = &local->cont.readv.last_index;
+                fresh_children = local->fresh_children;
+                next_call_child = afr_next_call_child (fresh_children,
+                                                       local->child_up,
+                                                       priv->child_count,
+                                                       last_index, read_child);
+                if (next_call_child < 0)
+             
           goto out; + +                unwind = 0; + +                STACK_WIND_COOKIE (frame, afr_readv_cbk, +                                   (void *) (long) read_child, +                                   children[next_call_child], +                                   children[next_call_child]->fops->readv, +                                   local->fd, local->cont.readv.size, +                                   local->cont.readv.offset, +                                   local->cont.readv.flags, +                                   NULL); +        } + +out: +        if (unwind) { +                AFR_STACK_UNWIND (readv, frame, op_ret, op_errno, +                                  vector, count, buf, iobref, xdata); +        } + +        return 0; +} + + +int32_t +afr_readv (call_frame_t *frame, xlator_t *this, +           fd_t *fd, size_t size, off_t offset, uint32_t flags, dict_t *xdata) +{ +        afr_private_t * priv       = NULL; +        afr_local_t   * local      = NULL; +        xlator_t **     children   = NULL; +        int             call_child = 0; +        int32_t         op_errno   = 0; +        int32_t         read_child = -1; +        int             ret        = -1; + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (this->private, out); +        VALIDATE_OR_GOTO (fd, out); + +        priv     = this->private; +        children = priv->children; + +        AFR_SBRAIN_CHECK_FD (fd, out); + +        AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); +        local = frame->local; + +        ret = afr_local_init (local, priv, &op_errno); +        if (ret < 0) +                goto out; + +        local->fresh_children = afr_children_create (priv->child_count); +        if (!local->fresh_children) { +                op_errno = ENOMEM; +                goto out; +        } + +        read_child = afr_inode_get_read_ctx (this, fd->inode, local->fresh_children); +        ret = afr_get_call_child 
(this, local->child_up, read_child, +                                     local->fresh_children, +                                     &call_child, +                                     &local->cont.readv.last_index); +        if (ret < 0) { +                op_errno = -ret; +                goto out; +        } + +        local->fd                    = fd_ref (fd); + +        local->cont.readv.size       = size; +        local->cont.readv.offset     = offset; +        local->cont.readv.flags      = flags; + +        afr_open_fd_fix (fd, this); + +        STACK_WIND_COOKIE (frame, afr_readv_cbk, +                           (void *) (long) call_child, +                           children[call_child], +                           children[call_child]->fops->readv, +                           fd, size, offset, flags, xdata); + +        ret = 0; +out: +        if (ret < 0) { +                AFR_STACK_UNWIND (readv, frame, -1, op_errno, NULL, 0, NULL, +                                  NULL, NULL); +        } +        return 0; +} + +/* }}} */ diff --git a/xlators/cluster/afr-v1/src/afr-inode-read.h b/xlators/cluster/afr-v1/src/afr-inode-read.h new file mode 100644 index 000000000..e4091a793 --- /dev/null +++ b/xlators/cluster/afr-v1/src/afr-inode-read.h @@ -0,0 +1,42 @@ +/* +  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> +  This file is part of GlusterFS. + +  This file is licensed to you under your choice of the GNU Lesser +  General Public License, version 3 or any later version (LGPLv3 or +  later), or the GNU General Public License, version 2 (GPLv2), in all +  cases as published by the Free Software Foundation. 
+*/ + +#ifndef __INODE_READ_H__ +#define __INODE_READ_H__ + +int32_t +afr_access (call_frame_t *frame, xlator_t *this, +	    loc_t *loc, int32_t mask, dict_t *xdata); + +int32_t +afr_stat (call_frame_t *frame, xlator_t *this, +	  loc_t *loc, dict_t *xdata); + +int32_t +afr_fstat (call_frame_t *frame, xlator_t *this, +	   fd_t *fd, dict_t *xdata); + +int32_t +afr_readlink (call_frame_t *frame, xlator_t *this, +	      loc_t *loc, size_t size, dict_t *xdata); + +int32_t +afr_readv (call_frame_t *frame, xlator_t *this, +	   fd_t *fd, size_t size, off_t offset, uint32_t flags, dict_t *xdata); + +int32_t +afr_getxattr (call_frame_t *frame, xlator_t *this, +	      loc_t *loc, const char *name, dict_t *xdata); + +int32_t +afr_fgetxattr (call_frame_t *frame, xlator_t *this, +               fd_t *fd, const char *name, dict_t *xdata); + +#endif /* __INODE_READ_H__ */ diff --git a/xlators/cluster/afr-v1/src/afr-inode-write.c b/xlators/cluster/afr-v1/src/afr-inode-write.c new file mode 100644 index 000000000..d62847def --- /dev/null +++ b/xlators/cluster/afr-v1/src/afr-inode-write.c @@ -0,0 +1,2861 @@ +/* +  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> +  This file is part of GlusterFS. + +  This file is licensed to you under your choice of the GNU Lesser +  General Public License, version 3 or any later version (LGPLv3 or +  later), or the GNU General Public License, version 2 (GPLv2), in all +  cases as published by the Free Software Foundation. 
+*/ + + +#include <libgen.h> +#include <unistd.h> +#include <fnmatch.h> +#include <sys/time.h> +#include <stdlib.h> +#include <signal.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "afr.h" +#include "dict.h" +#include "xlator.h" +#include "hashfn.h" +#include "logging.h" +#include "stack.h" +#include "list.h" +#include "call-stub.h" +#include "defaults.h" +#include "common-utils.h" +#include "compat-errno.h" +#include "compat.h" + +#include "afr.h" +#include "afr-transaction.h" +#include "afr-self-heal-common.h" + +void +__inode_write_fop_cbk (call_frame_t *frame, int child_index, int read_child, +                       xlator_t *this, int32_t *op_ret, int32_t *op_errno, +                       struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) +{ +        afr_local_t     *local = NULL; + +        local = frame->local; + +        if (afr_fop_failed (*op_ret, *op_errno)) { +                local->child_errno[child_index] = *op_errno; + +                switch (local->op) { +                case GF_FOP_TRUNCATE: +                case GF_FOP_FTRUNCATE: +                        if (*op_errno != EFBIG) +                                afr_transaction_fop_failed (frame, this, +                                                            child_index); +                break; +                default: +                        afr_transaction_fop_failed (frame, this, child_index); +                break; +                } +                local->op_errno = *op_errno; +                goto out; +        } + +        if ((local->success_count == 0) || (read_child == child_index)) { +                local->op_ret              = *op_ret; +                if (prebuf) +                        local->cont.inode_wfop.prebuf  = *prebuf; +                if (postbuf) +                        local->cont.inode_wfop.postbuf = *postbuf; +        } + +        local->success_count++; +out: +        return; +} + +/* {{{ 
writev */ + +void +afr_writev_copy_outvars (call_frame_t *src_frame, call_frame_t *dst_frame) +{ +        afr_local_t *src_local = NULL; +        afr_local_t *dst_local = NULL; + +        src_local = src_frame->local; +        dst_local = dst_frame->local; + +        dst_local->op_ret = src_local->op_ret; +        dst_local->op_errno = src_local->op_errno; +        dst_local->cont.inode_wfop.prebuf = src_local->cont.inode_wfop.prebuf; +        dst_local->cont.inode_wfop.postbuf = src_local->cont.inode_wfop.postbuf; +} + +void +afr_writev_unwind (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t *   local = NULL; +        local = frame->local; + +        AFR_STACK_UNWIND (writev, frame, +                          local->op_ret, local->op_errno, +                          &local->cont.inode_wfop.prebuf, +                          &local->cont.inode_wfop.postbuf, +                          NULL); +} + +call_frame_t* +afr_transaction_detach_fop_frame (call_frame_t *frame) +{ +        afr_local_t *   local = NULL; +        call_frame_t   *fop_frame = NULL; + +        local = frame->local; + +        LOCK (&frame->lock); +        { +                fop_frame = local->transaction.main_frame; +                local->transaction.main_frame = NULL; +        } +        UNLOCK (&frame->lock); + +        return fop_frame; +} + +int +afr_transaction_writev_unwind (call_frame_t *frame, xlator_t *this) +{ +        call_frame_t *fop_frame = NULL; + +        fop_frame = afr_transaction_detach_fop_frame (frame); + +        if (fop_frame) { +                afr_writev_copy_outvars (frame, fop_frame); +                afr_writev_unwind (fop_frame, this); +        } +        return 0; +} + +static void +afr_writev_handle_short_writes (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t   *local = NULL; +        afr_private_t *priv  = NULL; +        int           i      = 0; + +        local = frame->local; +        priv = this->private; +        /* +         * We 
already have the best case result of the writev calls staged +         * as the return value. Any writev that returns some value less +         * than the best case is now out of sync, so mark the fop as +         * failed. Note that fops that have returned with errors have +         * already been marked as failed. +         */ +        for (i = 0; i < priv->child_count; i++) { +                if ((!local->replies[i].valid) || +                    (local->replies[i].op_ret == -1)) +                        continue; + +                if (local->replies[i].op_ret < local->op_ret) +                        afr_transaction_fop_failed(frame, this, i); +        } +} + +int +afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                     int32_t op_ret, int32_t op_errno, struct iatt *prebuf, +                     struct iatt *postbuf, dict_t *xdata) +{ +        afr_local_t *   local = NULL; +        afr_private_t  *priv  = NULL; +        call_frame_t    *fop_frame = NULL; +        int child_index = (long) cookie; +        int call_count  = -1; +        int read_child  = 0; +        int      ret = 0; +        uint32_t open_fd_count = 0; +        uint32_t write_is_append = 0; + +        local = frame->local; +        priv  = this->private; + +        read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); + +        LOCK (&frame->lock); +        { +                if (child_index == read_child) { +                        local->read_child_returned = _gf_true; +                } + +                __inode_write_fop_cbk (frame, child_index, read_child, this, +                                       &op_ret, &op_errno, prebuf, postbuf, +                                       xdata); + +		local->replies[child_index].valid = 1; +		local->replies[child_index].op_ret = op_ret; +		local->replies[child_index].op_errno = op_errno; + + +		/* stage the best case return value for unwind */ +                if ((local->success_count == 0) || 
(op_ret > local->op_ret)) { +                        local->op_ret              = op_ret; +			local->op_errno		   = op_errno; +		} + +		if (op_ret != -1) { +                        if (xdata) { +                                ret = dict_get_uint32 (xdata, +                                                       GLUSTERFS_OPEN_FD_COUNT, +                                                       &open_fd_count); +                                if ((ret == 0) && +                                    (open_fd_count > local->open_fd_count)) { +                                        local->open_fd_count = open_fd_count; +                                        local->update_open_fd_count = _gf_true; +                                } + +				write_is_append = 0; +                                ret = dict_get_uint32 (xdata, +                                                       GLUSTERFS_WRITE_IS_APPEND, +                                                       &write_is_append); +                                if (ret || !write_is_append) +					local->append_write = _gf_false; +                        } + +		} +        } +        UNLOCK (&frame->lock); + +        call_count = afr_frame_return (frame); + +        if (call_count == 0) { + +                if (local->update_open_fd_count) +                        afr_handle_open_fd_count (frame, this); + +                if (!local->stable_write && !local->append_write) +			/* An appended write removes the necessity to +			   fsync() the file. This is because self-heal +			   has the logic to check for larger file when +			   the xattrs are not reliably pointing at +			   a stale file. 
+			*/ +                        afr_fd_report_unstable_write (this, local->fd); + +                afr_writev_handle_short_writes (frame, this); +                if (afr_any_fops_failed (local, priv)) { +                        //Don't unwind until post-op is complete +                        local->transaction.resume (frame, this); +                } else { +                /* +                 * Generally inode-write fops do transaction.unwind then +                 * transaction.resume, but writev needs to make sure that +                 * delayed post-op frame is placed in fdctx before unwind +                 * happens. This prevents the race of flush doing the +                 * changelog wakeup first in fuse thread and then this +                 * writev placing its delayed post-op frame in fdctx. +                 * This helps flush make sure all the delayed post-ops are +                 * completed. +                 */ + +                        fop_frame = afr_transaction_detach_fop_frame (frame); +                        afr_writev_copy_outvars (frame, fop_frame); +                        local->transaction.resume (frame, this); +                        afr_writev_unwind (fop_frame, this); +                } +        } +        return 0; +} + +int +afr_writev_wind (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t *local = NULL; +        afr_private_t *priv = NULL; +        int i = 0; +        int call_count = -1; +        dict_t *xdata = NULL; +        GF_UNUSED int     ret = 0; + +        local = frame->local; +        priv = this->private; + +        call_count = afr_pre_op_done_children_count (local->transaction.pre_op, +                                                     priv->child_count); + +        if (call_count == 0) { +                local->transaction.resume (frame, this); +                return 0; +        } + +        local->call_count = call_count; +	local->replies = GF_CALLOC(priv->child_count, sizeof(*local->replies), 
+				   gf_afr_mt_reply_t); +	if (!local->replies) { +		local->op_ret = -1; +		local->op_errno = ENOMEM; +		local->transaction.unwind(frame, this); +		local->transaction.resume(frame, this); +		return 0; +	} + +        xdata = dict_new (); +        if (xdata) { +                ret = dict_set_uint32 (xdata, GLUSTERFS_OPEN_FD_COUNT, +                                       sizeof (uint32_t)); +		ret = dict_set_uint32 (xdata, GLUSTERFS_WRITE_IS_APPEND, +				       0); +		/* Set append_write to be true speculatively. If on any +		   server it turns not be true, we unset it in the +		   callback. +		*/ +		local->append_write = _gf_true; +        } + +        for (i = 0; i < priv->child_count; i++) { +                if (local->transaction.pre_op[i]) { +                        STACK_WIND_COOKIE (frame, afr_writev_wind_cbk, +                                           (void *) (long) i, +                                           priv->children[i], +                                           priv->children[i]->fops->writev, +                                           local->fd, +                                           local->cont.writev.vector, +                                           local->cont.writev.count, +                                           local->cont.writev.offset, +                                           local->cont.writev.flags, +                                           local->cont.writev.iobref, +                                           xdata); + +                        if (!--call_count) +                                break; +                } +        } + +        if (xdata) +                dict_unref (xdata); + +        return 0; +} + + +int +afr_writev_done (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t *local = NULL; + +        local = frame->local; + +        iobref_unref (local->cont.writev.iobref); +        local->cont.writev.iobref = NULL; + +        local->transaction.unwind (frame, this); + +        
AFR_STACK_DESTROY (frame); + +        return 0; +} + + +int +afr_do_writev (call_frame_t *frame, xlator_t *this) +{ +        call_frame_t    *transaction_frame = NULL; +        afr_local_t     *local             = NULL; +        int             op_ret   = -1; +        int             op_errno = 0; + +        local = frame->local; + +        transaction_frame = copy_frame (frame); +        if (!transaction_frame) { +                op_errno = ENOMEM; +                goto out; +        } + +        transaction_frame->local = local; +        AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + +        local->op = GF_FOP_WRITE; + +        local->success_count      = 0; + +        local->transaction.fop    = afr_writev_wind; +        local->transaction.done   = afr_writev_done; +        local->transaction.unwind = afr_transaction_writev_unwind; + +        local->transaction.main_frame = frame; +        if (local->fd->flags & O_APPEND) { +               /* +                * Backend vfs ignores the 'offset' for append mode fd so +                * locking just the region provided for the writev does not +                * give consistency gurantee. The actual write may happen at a +                * completely different range than the one provided by the +                * offset, len in the fop. So lock the entire file. 
+                */
+                local->transaction.start   = 0;
+                local->transaction.len     = 0;
+        } else {
+                local->transaction.start   = local->cont.writev.offset;
+                local->transaction.len     = iov_length (local->cont.writev.vector,
+                                                         local->cont.writev.count);
+        }
+
+        op_ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION);
+        if (op_ret < 0) {
+            op_errno = -op_ret;
+            goto out;
+        }
+
+        op_ret = 0;
+out:
+        if (op_ret < 0) {
+                if (transaction_frame)
+                        AFR_STACK_DESTROY (transaction_frame);
+                AFR_STACK_UNWIND (writev, frame, op_ret, op_errno, NULL, NULL, NULL);
+        }
+
+        return 0;
+}
+
+/*
+ * Launch a self-heal for the inode behind an open fd.
+ *
+ * A new frame is created and initialised, local->loc is built from
+ * fd->inode, and the frame is handed to afr_launch_self_heal(). Metadata
+ * heal is always requested; data heal is added for regular files and entry
+ * heal for directories.
+ *
+ * Nothing is returned. If the arguments are invalid or any setup step
+ * fails, no heal is launched and the frame, if one was created, is
+ * destroyed.
+ */
+static void
+afr_trigger_open_fd_self_heal (fd_t *fd, xlator_t *this)
+{
+        call_frame_t    *frame   = NULL;
+        afr_local_t     *local   = NULL;
+        afr_self_heal_t *sh      = NULL;
+        char            *reason  = NULL;
+        int32_t         op_errno = 0;
+        int             ret      = 0;
+
+        if (!fd || !fd->inode || uuid_is_null (fd->inode->gfid)) {
+                gf_log_callingfn (this->name, GF_LOG_ERROR, "Invalid args: "
+                                  "fd: %p, inode: %p", fd,
+                                  fd ? fd->inode : NULL);
+                goto out;
+        }
+
+        frame = create_frame (this, this->ctx->pool);
+        if (!frame)
+                goto out;
+
+        AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
+        local = frame->local;
+        ret = afr_local_init (local, this->private, &op_errno);
+        if (ret < 0)
+                goto out;
+
+        local->loc.inode = inode_ref (fd->inode);
+        ret = loc_path (&local->loc, NULL);
+        if (ret < 0)
+                goto out;
+
+        sh    = &local->self_heal;
+        sh->do_metadata_self_heal = _gf_true;
+        if (fd->inode->ia_type == IA_IFREG)
+                sh->do_data_self_heal = _gf_true;
+        else if (fd->inode->ia_type == IA_IFDIR)
+                sh->do_entry_self_heal = _gf_true;
+
+        reason = "subvolume came online";
+        afr_launch_self_heal (frame, this, fd->inode, _gf_true,
+                              fd->inode->ia_type, reason, NULL, NULL);
+        return;
+out:
+        /* frame is still NULL when the arguments were rejected or
+         * create_frame() failed; only tear down a frame that exists,
+         * the same guard afr_do_writev applies before this macro.
+         */
+        if (frame)
+                AFR_STACK_DESTROY (frame);
+}
+
+void
+afr_open_fd_fix (fd_t *fd, xlator_t *this)
+{
+        int           ret             = 0;
+        int           i               = 0;
+        afr_fd_ctx_t  *fd_ctx         = NULL;
+        gf_boolean_t  need_self_heal  = _gf_false;
+        int           *need_open      = NULL;
+        size_t        need_open_count = 0;
+        afr_private_t *priv           = NULL;
+
+        priv  = this->private;
+
+        if (!afr_is_fd_fixable (fd))
+                goto out;
+
+        fd_ctx = afr_fd_ctx_get (fd, this);
+        if (!fd_ctx)
+                goto out;
+
+        LOCK (&fd->lock);
+        {
+                if (fd_ctx->up_count < priv->up_count) {
+                        need_self_heal = _gf_true;
+                        fd_ctx->up_count   = priv->up_count;
+                        fd_ctx->down_count = priv->down_count;
+                }
+
+                need_open = alloca (priv->child_count * sizeof (*need_open));
+         
       for (i = 0; i < priv->child_count; i++) { +                        need_open[i] = 0; +                        if (fd_ctx->opened_on[i] != AFR_FD_NOT_OPENED) +                                continue; + +                        if (!priv->child_up[i]) +                                continue; + +                        fd_ctx->opened_on[i] = AFR_FD_OPENING; + +                        need_open[i] = 1; +                        need_open_count++; +                } +        } +        UNLOCK (&fd->lock); +        if (ret) +                goto out; + +        if (need_self_heal) +                afr_trigger_open_fd_self_heal (fd, this); + +        if (!need_open_count) +                goto out; + +        afr_fix_open (this, fd, need_open_count, need_open); +out: +        return; +} + +int +afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, +            struct iovec *vector, int32_t count, off_t offset, +            uint32_t flags, struct iobref *iobref, dict_t *xdata) +{ +        afr_private_t * priv  = NULL; +        afr_local_t   * local = NULL; +        int ret = -1; +        int op_errno = 0; + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (this->private, out); + +        priv = this->private; + +        if (afr_is_split_brain (this, fd->inode)) { +                op_errno = EIO; +                goto out; +        } + +        QUORUM_CHECK(writev,out); + +        AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); +        local = frame->local; + +        ret = afr_local_init (local, priv, &op_errno); +        if (ret < 0) +                goto out; + +        local->cont.writev.vector     = iov_dup (vector, count); +        local->cont.writev.count      = count; +        local->cont.writev.offset     = offset; +        local->cont.writev.flags      = flags; +        local->cont.writev.iobref     = iobref_ref (iobref); + +        local->fd                = fd_ref (fd); + +	/* detect here, but 
set it in writev_wind_cbk *after* the unstable +	   write is performed +	*/ +	local->stable_write = !!((fd->flags|flags)&(O_SYNC|O_DSYNC)); + +        afr_open_fd_fix (fd, this); + +        afr_do_writev (frame, this); + +        ret = 0; +out: +        if (ret < 0) +                AFR_STACK_UNWIND (writev, frame, -1, op_errno, NULL, NULL, NULL); + +        return 0; +} + + +/* }}} */ + +/* {{{ truncate */ + +int +afr_truncate_unwind (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t *   local = NULL; +        call_frame_t   *main_frame = NULL; + +        local = frame->local; + +        LOCK (&frame->lock); +        { +                if (local->transaction.main_frame) +                        main_frame = local->transaction.main_frame; +                local->transaction.main_frame = NULL; +        } +        UNLOCK (&frame->lock); + +        if (main_frame) { +                AFR_STACK_UNWIND (truncate, main_frame, local->op_ret, +                                  local->op_errno, +                                  &local->cont.inode_wfop.prebuf, +                                  &local->cont.inode_wfop.postbuf, +                                  NULL); +        } + +        return 0; +} + + +int +afr_truncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                       int32_t op_ret, int32_t op_errno, struct iatt *prebuf, +                       struct iatt *postbuf, dict_t *xdata) +{ +        afr_local_t *   local = NULL; +        int child_index = (long) cookie; +        int read_child  = 0; +        int call_count  = -1; + +        local = frame->local; + +        read_child = afr_inode_get_read_ctx (this, local->loc.inode, NULL); + +        LOCK (&frame->lock); +        { +                if (child_index == read_child) { +                        local->read_child_returned = _gf_true; +                } + +                if (op_ret != -1) { +			if (prebuf->ia_size != postbuf->ia_size) +				local->stable_write = 
_gf_false; +                } +                __inode_write_fop_cbk (frame, child_index, read_child, this, +                                       &op_ret, &op_errno, prebuf, postbuf, +                                       xdata); +        } +        UNLOCK (&frame->lock); + +        call_count = afr_frame_return (frame); + +        if (call_count == 0) { +		if (local->stable_write && afr_txn_nothing_failed (frame, this)) +			local->transaction.unwind (frame, this); + +                local->transaction.resume (frame, this); +        } + +        return 0; +} + + +int32_t +afr_truncate_wind (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t *local = NULL; +        afr_private_t *priv = NULL; +        int call_count = -1; +        int i = 0; + +        local = frame->local; +        priv = this->private; + +        call_count = afr_pre_op_done_children_count (local->transaction.pre_op, +                                                     priv->child_count); + +        if (call_count == 0) { +                local->transaction.resume (frame, this); +                return 0; +        } + +        local->call_count = call_count; +	local->stable_write = _gf_true; + +        for (i = 0; i < priv->child_count; i++) { +                if (local->transaction.pre_op[i]) { +                        STACK_WIND_COOKIE (frame, afr_truncate_wind_cbk, +                                           (void *) (long) i, +                                           priv->children[i], +                                           priv->children[i]->fops->truncate, +                                           &local->loc, +                                           local->cont.truncate.offset, +                                           NULL); + +                        if (!--call_count) +                                break; +                } +        } + +        return 0; +} + + +int +afr_truncate_done (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t *local = 
NULL; + +        local = frame->local; + +        local->transaction.unwind (frame, this); + +        AFR_STACK_DESTROY (frame); + +        return 0; +} + + +int +afr_truncate (call_frame_t *frame, xlator_t *this, +              loc_t *loc, off_t offset, dict_t *xdata) +{ +        afr_private_t * priv  = NULL; +        afr_local_t   * local = NULL; +        call_frame_t   *transaction_frame = NULL; +        int ret = -1; +        int op_errno = 0; + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (this->private, out); + +        priv = this->private; + +        QUORUM_CHECK(truncate,out); + +        transaction_frame = copy_frame (frame); +        if (!transaction_frame) { +                op_errno = ENOMEM; +                goto out; +        } + +        AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); +        local = transaction_frame->local; + +        ret = afr_local_init (local, priv, &op_errno); +        if (ret < 0) +                goto out; + +        local->cont.truncate.offset  = offset; + +        local->transaction.fop    = afr_truncate_wind; +        local->transaction.done   = afr_truncate_done; +        local->transaction.unwind = afr_truncate_unwind; + +        loc_copy (&local->loc, loc); + +        local->transaction.main_frame = frame; +        local->transaction.start   = offset; +        local->transaction.len     = 0; + +        ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); +        if (ret < 0) { +            op_errno = -ret; +            goto out; +        } + +        ret = 0; +out: +        if (ret < 0) { +                if (transaction_frame) +                        AFR_STACK_DESTROY (transaction_frame); +                AFR_STACK_UNWIND (truncate, frame, -1, op_errno, NULL, NULL, NULL); +        } + +        return 0; +} + + +/* }}} */ + +/* {{{ ftruncate */ + + +int +afr_ftruncate_unwind (call_frame_t *frame, xlator_t *this) +{ +        
afr_local_t *   local = NULL; +        call_frame_t   *main_frame = NULL; + +        local = frame->local; + +        LOCK (&frame->lock); +        { +                if (local->transaction.main_frame) +                        main_frame = local->transaction.main_frame; +                local->transaction.main_frame = NULL; +        } +        UNLOCK (&frame->lock); + +        if (main_frame) { +                AFR_STACK_UNWIND (ftruncate, main_frame, local->op_ret, +                                  local->op_errno, +                                  &local->cont.inode_wfop.prebuf, +                                  &local->cont.inode_wfop.postbuf, +                                  NULL); +        } +        return 0; +} + + +int +afr_ftruncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                        int32_t op_ret, int32_t op_errno, struct iatt *prebuf, +                        struct iatt *postbuf, dict_t *xdata) +{ +        afr_local_t *   local = NULL; +        int child_index = (long) cookie; +        int call_count  = -1; +        int read_child  = 0; + +        local = frame->local; + +        read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); + +        LOCK (&frame->lock); +        { +                if (child_index == read_child) { +                        local->read_child_returned = _gf_true; +                } + +                if (op_ret != -1) { +			if (prebuf->ia_size != postbuf->ia_size) +				local->stable_write = _gf_false; +                } +                __inode_write_fop_cbk (frame, child_index, read_child, this, +                                       &op_ret, &op_errno, prebuf, postbuf, +                                       xdata); +        } +        UNLOCK (&frame->lock); + +        call_count = afr_frame_return (frame); + +        if (call_count == 0) { +		if (local->stable_write && afr_txn_nothing_failed (frame, this)) +			local->transaction.unwind (frame, this); + +                
local->transaction.resume (frame, this); +        } + +        return 0; +} + + +int +afr_ftruncate_wind (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t *local = NULL; +        afr_private_t *priv = NULL; +        int call_count = -1; +        int i = 0; + +        local = frame->local; +        priv = this->private; + +        call_count = afr_pre_op_done_children_count (local->transaction.pre_op, +                                                     priv->child_count); + +        if (call_count == 0) { +                local->transaction.resume (frame, this); +                return 0; +        } + +        local->call_count = call_count; +	local->stable_write = _gf_true; + +        for (i = 0; i < priv->child_count; i++) { +                if (local->transaction.pre_op[i]) { +                        STACK_WIND_COOKIE (frame, afr_ftruncate_wind_cbk, +                                           (void *) (long) i, +                                           priv->children[i], +                                           priv->children[i]->fops->ftruncate, +                                           local->fd, +                                           local->cont.ftruncate.offset, +                                           NULL); + +                        if (!--call_count) +                                break; +                } +        } + +        return 0; +} + + +int +afr_ftruncate_done (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t *local = NULL; + +        local = frame->local; + +        local->transaction.unwind (frame, this); + +        AFR_STACK_DESTROY (frame); + +        return 0; +} + + +int +afr_do_ftruncate (call_frame_t *frame, xlator_t *this) +{ +        call_frame_t * transaction_frame = NULL; +        afr_local_t *  local             = NULL; +        int op_ret   = -1; +        int op_errno = 0; + +        local = frame->local; + +        transaction_frame = copy_frame (frame); +        if (!transaction_frame) { 
+                goto out; +        } + +        transaction_frame->local = local; +        frame->local = NULL; + +        local->op = GF_FOP_FTRUNCATE; + +        local->transaction.fop    = afr_ftruncate_wind; +        local->transaction.done   = afr_ftruncate_done; +        local->transaction.unwind = afr_ftruncate_unwind; + +        local->transaction.main_frame = frame; + +        local->transaction.start   = local->cont.ftruncate.offset; +        local->transaction.len     = 0; + +        op_ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); +        if (op_ret < 0) { +            op_errno = -op_ret; +            goto out; +        } + +        op_ret = 0; +out: +        if (op_ret < 0) { +                if (transaction_frame) +                        AFR_STACK_DESTROY (transaction_frame); +                AFR_STACK_UNWIND (ftruncate, frame, op_ret, op_errno, NULL, +                                  NULL, NULL); +        } + +        return 0; +} + + +int +afr_ftruncate (call_frame_t *frame, xlator_t *this, +               fd_t *fd, off_t offset, dict_t *xdata) +{ +        afr_private_t * priv  = NULL; +        afr_local_t   * local = NULL; +        call_frame_t   *transaction_frame = NULL; +        int ret = -1; +        int op_errno = 0; + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (this->private, out); + +        priv = this->private; + +        if (afr_is_split_brain (this, fd->inode)) { +                op_errno = EIO; +                goto out; +        } +        QUORUM_CHECK(ftruncate,out); + +        AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); +        local = frame->local; + +        ret = afr_local_init (local, priv, &op_errno); +        if (ret < 0) +                goto out; + +        local->cont.ftruncate.offset  = offset; + +        local->fd = fd_ref (fd); + +        afr_open_fd_fix (fd, this); + +        afr_do_ftruncate (frame, this); + +        ret = 0; 
+out: +        if (ret < 0) { +                if (transaction_frame) +                        AFR_STACK_DESTROY (transaction_frame); +                AFR_STACK_UNWIND (ftruncate, frame, -1, op_errno, NULL, NULL, NULL); +        } + +        return 0; +} + +/* }}} */ + +/* {{{ setattr */ + +int +afr_setattr_unwind (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t *   local = NULL; +        call_frame_t   *main_frame = NULL; + +        local = frame->local; + +        LOCK (&frame->lock); +        { +                if (local->transaction.main_frame) +                        main_frame = local->transaction.main_frame; +                local->transaction.main_frame = NULL; +        } +        UNLOCK (&frame->lock); + +        if (main_frame) { +                AFR_STACK_UNWIND (setattr, main_frame, local->op_ret, +                                  local->op_errno, +                                  &local->cont.inode_wfop.prebuf, +                                  &local->cont.inode_wfop.postbuf, +                                  NULL); +        } + +        return 0; +} + + +int +afr_setattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                      int32_t op_ret, int32_t op_errno, +                      struct iatt *preop, struct iatt *postop, dict_t *xdata) +{ +        afr_local_t *   local = NULL; +        afr_private_t * priv  = NULL; +        int child_index = (long) cookie; +        int read_child  = 0; +        int call_count  = -1; +        int need_unwind = 0; + +        local = frame->local; +        priv  = this->private; + +        read_child = afr_inode_get_read_ctx (this, local->loc.inode, NULL); + +        LOCK (&frame->lock); +        { +                if (child_index == read_child) { +                        local->read_child_returned = _gf_true; +                } + +                __inode_write_fop_cbk (frame, child_index, read_child, this, +                                       &op_ret, &op_errno, preop, 
postop, +                                       xdata); + +                if ((local->success_count >= priv->wait_count) +                    && local->read_child_returned) { +                        need_unwind = 1; +                } +        } +        UNLOCK (&frame->lock); + +        if (need_unwind) +                local->transaction.unwind (frame, this); + +        call_count = afr_frame_return (frame); + +        if (call_count == 0) { +                local->transaction.resume (frame, this); +        } + +        return 0; +} + + +int32_t +afr_setattr_wind (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t *local = NULL; +        afr_private_t *priv = NULL; +        int call_count = -1; +        int i = 0; + +        local = frame->local; +        priv = this->private; + +        call_count = afr_pre_op_done_children_count (local->transaction.pre_op, +                                                     priv->child_count); + +        if (call_count == 0) { +                local->transaction.resume (frame, this); +                return 0; +        } + +        local->call_count = call_count; + +        for (i = 0; i < priv->child_count; i++) { +                if (local->transaction.pre_op[i]) { +                        STACK_WIND_COOKIE (frame, afr_setattr_wind_cbk, +                                           (void *) (long) i, +                                           priv->children[i], +                                           priv->children[i]->fops->setattr, +                                           &local->loc, +                                           &local->cont.setattr.in_buf, +                                           local->cont.setattr.valid, +                                           NULL); + +                        if (!--call_count) +                                break; +                } +        } + +        return 0; +} + + +int +afr_setattr_done (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t 
*local = NULL; + +        local = frame->local; + +        local->transaction.unwind (frame, this); + +        AFR_STACK_DESTROY (frame); + +        return 0; +} + + +int +afr_setattr (call_frame_t *frame, xlator_t *this, +             loc_t *loc, struct iatt *buf, int32_t valid, dict_t *xdata) +{ +        afr_private_t * priv  = NULL; +        afr_local_t   * local = NULL; +        call_frame_t   *transaction_frame = NULL; +        int ret = -1; +        int op_errno = 0; + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (this->private, out); + +        priv = this->private; + +        QUORUM_CHECK(setattr,out); + +        transaction_frame = copy_frame (frame); +        if (!transaction_frame) { +                op_errno = ENOMEM; +                goto out; +        } + +        AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); +        local = transaction_frame->local; + +        ret = afr_local_init (local, priv, &op_errno); +        if (ret < 0) +                goto out; + +        local->cont.setattr.in_buf = *buf; +        local->cont.setattr.valid  = valid; + +        local->transaction.fop    = afr_setattr_wind; +        local->transaction.done   = afr_setattr_done; +        local->transaction.unwind = afr_setattr_unwind; + +        loc_copy (&local->loc, loc); + +        local->transaction.main_frame = frame; +        local->transaction.start   = LLONG_MAX - 1; +        local->transaction.len     = 0; + +        ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); +        if (ret < 0) { +            op_errno = -ret; +            goto out; +        } + +        ret = 0; +out: +        if (ret < 0) { +                if (transaction_frame) +                        AFR_STACK_DESTROY (transaction_frame); +                AFR_STACK_UNWIND (setattr, frame, -1, op_errno, NULL, NULL, NULL); +        } + +        return 0; +} + +/* {{{ fsetattr */ + +int +afr_fsetattr_unwind 
(call_frame_t *frame, xlator_t *this) +{ +        afr_local_t *   local = NULL; +        call_frame_t   *main_frame = NULL; + +        local = frame->local; + +        LOCK (&frame->lock); +        { +                if (local->transaction.main_frame) +                        main_frame = local->transaction.main_frame; +                local->transaction.main_frame = NULL; +        } +        UNLOCK (&frame->lock); + +        if (main_frame) { +                AFR_STACK_UNWIND (fsetattr, main_frame, local->op_ret, +                                  local->op_errno, +                                  &local->cont.inode_wfop.prebuf, +                                  &local->cont.inode_wfop.postbuf, +                                  NULL); +        } + +        return 0; +} + + +int +afr_fsetattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                       int32_t op_ret, int32_t op_errno, +                       struct iatt *preop, struct iatt *postop, dict_t *xdata) +{ +        afr_local_t *   local = NULL; +        afr_private_t * priv  = NULL; +        int child_index = (long) cookie; +        int read_child  = 0; +        int call_count  = -1; +        int need_unwind = 0; + +        local = frame->local; +        priv  = this->private; + +        read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); + +        LOCK (&frame->lock); +        { +                if (child_index == read_child) { +                        local->read_child_returned = _gf_true; +                } + +                __inode_write_fop_cbk (frame, child_index, read_child, this, +                                       &op_ret, &op_errno, preop, postop, +                                       xdata); + +                if ((local->success_count >= priv->wait_count) +                    && local->read_child_returned) { +                        need_unwind = 1; +                } +        } +        UNLOCK (&frame->lock); + +        if (need_unwind) +    
            local->transaction.unwind (frame, this); + +        call_count = afr_frame_return (frame); + +        if (call_count == 0) { +                local->transaction.resume (frame, this); +        } + +        return 0; +} + + +int32_t +afr_fsetattr_wind (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t *local = NULL; +        afr_private_t *priv = NULL; +        int call_count = -1; +        int i = 0; + +        local = frame->local; +        priv = this->private; + +        call_count = afr_pre_op_done_children_count (local->transaction.pre_op, +                                                     priv->child_count); + +        if (call_count == 0) { +                local->transaction.resume (frame, this); +                return 0; +        } + +        local->call_count = call_count; + +        for (i = 0; i < priv->child_count; i++) { +                if (local->transaction.pre_op[i]) { +                        STACK_WIND_COOKIE (frame, afr_fsetattr_wind_cbk, +                                           (void *) (long) i, +                                           priv->children[i], +                                           priv->children[i]->fops->fsetattr, +                                           local->fd, +                                           &local->cont.fsetattr.in_buf, +                                           local->cont.fsetattr.valid, +                                           NULL); + +                        if (!--call_count) +                                break; +                } +        } + +        return 0; +} + + +int +afr_fsetattr_done (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t *local = NULL; + +        local = frame->local; + +        local->transaction.unwind (frame, this); + +        AFR_STACK_DESTROY (frame); + +        return 0; +} + +int +afr_fsetattr (call_frame_t *frame, xlator_t *this, +              fd_t *fd, struct iatt *buf, int32_t valid, dict_t *xdata) +{ +        
afr_private_t * priv  = NULL; +        afr_local_t   * local = NULL; +        call_frame_t   *transaction_frame = NULL; +        int ret = -1; +        int op_errno = 0; + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (this->private, out); + +        priv = this->private; + +        if (afr_is_split_brain (this, fd->inode)) { +                op_errno = EIO; +                goto out; +        } + +        QUORUM_CHECK(fsetattr,out); + +        transaction_frame = copy_frame (frame); +        if (!transaction_frame) { +                op_errno = ENOMEM; +                goto out; +        } + +        AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); +        local = transaction_frame->local; + +        ret = afr_local_init (local, priv, &op_errno); +        if (ret < 0) +                goto out; + +        local->cont.fsetattr.in_buf = *buf; +        local->cont.fsetattr.valid  = valid; + +        local->transaction.fop    = afr_fsetattr_wind; +        local->transaction.done   = afr_fsetattr_done; +        local->transaction.unwind = afr_fsetattr_unwind; + +        local->fd                 = fd_ref (fd); + +        afr_open_fd_fix (fd, this); + +        local->transaction.main_frame = frame; +        local->transaction.start   = LLONG_MAX - 1; +        local->transaction.len     = 0; + +        ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); +        if (ret < 0) { +            op_errno = -ret; +            goto out; +        } + +        ret = 0; +out: +        if (ret < 0) { +                if (transaction_frame) +                        AFR_STACK_DESTROY (transaction_frame); +                AFR_STACK_UNWIND (fsetattr, frame, -1, op_errno, NULL, NULL, NULL); +        } + +        return 0; +} + + +/* {{{ setxattr */ + + +int +afr_setxattr_unwind (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t *   local = NULL; +        call_frame_t   *main_frame = 
NULL; + +        local = frame->local; + +        LOCK (&frame->lock); +        { +                if (local->transaction.main_frame) +                        main_frame = local->transaction.main_frame; +                local->transaction.main_frame = NULL; +        } +        UNLOCK (&frame->lock); + +        if (main_frame) { +                AFR_STACK_UNWIND (setxattr, main_frame, +                                  local->op_ret, local->op_errno, +                                  NULL); +        } +        return 0; +} + + +int +afr_setxattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                       int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ +        afr_local_t   *local      = NULL; +        afr_private_t *priv       = NULL; +        int           call_count  = -1; +        int           need_unwind = 0; +        int           child_index = (long) cookie; + +        local = frame->local; +        priv = this->private; + +        LOCK (&frame->lock); +        { +                __inode_write_fop_cbk (frame, child_index, -1, this, +                                       &op_ret, &op_errno, NULL, NULL, +                                       xdata); +                if (local->success_count == priv->child_count) { +                        need_unwind = 1; +                } +        } +        UNLOCK (&frame->lock); + +        if (need_unwind) +                local->transaction.unwind (frame, this); + +        call_count = afr_frame_return (frame); + +        if (call_count == 0) { +                local->transaction.resume (frame, this); +        } + +        return 0; +} + + +int +afr_setxattr_wind (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t      *local         = NULL; +        afr_private_t    *priv          = NULL; +        int               call_count    = -1; +        int               i             = 0; + +        local = frame->local; +        priv = this->private; + +        call_count = 
afr_pre_op_done_children_count (local->transaction.pre_op, +                                                     priv->child_count); + +        if (call_count == 0) { +                local->transaction.resume (frame, this); +                return 0; +        } + +        local->call_count = call_count; + +        for (i = 0; i < priv->child_count; i++) { +                if (local->transaction.pre_op[i]) { +                        STACK_WIND_COOKIE (frame, afr_setxattr_wind_cbk, +                                           (void *) (long) i, +                                           priv->children[i], +                                           priv->children[i]->fops->setxattr, +                                           &local->loc, +                                           local->cont.setxattr.dict, +                                           local->cont.setxattr.flags, +                                           NULL); + +                        if (!--call_count) +                                break; +                } +        } + +        return 0; +} + + +int +afr_setxattr_done (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t   *local    = frame->local; + +        local->transaction.unwind (frame, this); + +        AFR_STACK_DESTROY (frame); + +        return 0; +} + +int +afr_setxattr (call_frame_t *frame, xlator_t *this, +              loc_t *loc, dict_t *dict, int32_t flags, dict_t *xdata) +{ +        afr_private_t  *priv              = NULL; +        afr_local_t    *local             = NULL; +        call_frame_t   *transaction_frame = NULL; +        int             ret               = -1; +        int             op_errno          = EINVAL; + +        VALIDATE_OR_GOTO (this, out); + +        GF_IF_INTERNAL_XATTR_GOTO ("trusted.afr.*", dict, +                                   op_errno, out); + +        GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.afr.*", dict, +                                   op_errno, out); + +        
VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this->private, out); + +        priv = this->private; + +        QUORUM_CHECK(setxattr,out); +        transaction_frame = copy_frame (frame); +        if (!transaction_frame) { +                op_errno = ENOMEM; +                goto out; +        } + +        AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); +        local = transaction_frame->local; + +        ret = afr_local_init (local, priv, &op_errno); +        if (ret < 0) +                goto out; + +        local->cont.setxattr.dict  = dict_ref (dict); +        local->cont.setxattr.flags = flags; + +        local->transaction.fop    = afr_setxattr_wind; +        local->transaction.done   = afr_setxattr_done; +        local->transaction.unwind = afr_setxattr_unwind; + +        loc_copy (&local->loc, loc); + +        local->transaction.main_frame = frame; +        local->transaction.start   = LLONG_MAX - 1; +        local->transaction.len     = 0; + +        ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); +        if (ret < 0) { +            op_errno = -ret; +            goto out; +        } + +        ret = 0; +out: +        if (ret < 0) { +                if (transaction_frame) +                        AFR_STACK_DESTROY (transaction_frame); +                AFR_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL); +        } + +        return 0; +} + +/* {{{ fsetxattr */ + + +int +afr_fsetxattr_unwind (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t    *local         = NULL; +        call_frame_t   *main_frame    = NULL; + +        local = frame->local; + +        LOCK (&frame->lock); +        { +                if (local->transaction.main_frame) +                        main_frame = local->transaction.main_frame; +                local->transaction.main_frame = NULL; +        } +        UNLOCK (&frame->lock); + +        if (main_frame) { +                AFR_STACK_UNWIND (fsetxattr, main_frame, +       
                           local->op_ret, local->op_errno, +                                  NULL); +        } +        return 0; +} + + +int +afr_fsetxattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                        int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ +        afr_local_t   *local      = NULL; +        afr_private_t *priv       = NULL; +        int           call_count  = -1; +        int           need_unwind = 0; +        int           child_index = (long) cookie; + +        local = frame->local; +        priv = this->private; + +        LOCK (&frame->lock); +        { + +                __inode_write_fop_cbk (frame, child_index, -1, this, +                                       &op_ret, &op_errno, NULL, NULL, +                                       xdata); +                if (local->success_count == priv->child_count) { +                        need_unwind = 1; +                } +        } +        UNLOCK (&frame->lock); + +        if (need_unwind) +                local->transaction.unwind (frame, this); + +        call_count = afr_frame_return (frame); + +        if (call_count == 0) { +                local->transaction.resume (frame, this); +        } + +        return 0; +} + + +int +afr_fsetxattr_wind (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t        *local       = NULL; +        afr_private_t      *priv        = NULL; +        int                 call_count  = -1; +        int                 i           = 0; + +        local = frame->local; +        priv = this->private; + +        call_count = afr_pre_op_done_children_count (local->transaction.pre_op, +                                                     priv->child_count); + +        if (call_count == 0) { +                local->transaction.resume (frame, this); +                return 0; +        } + +        local->call_count = call_count; + +        for (i = 0; i < priv->child_count; i++) { +                if 
(local->transaction.pre_op[i]) { +                        STACK_WIND_COOKIE (frame, afr_fsetxattr_wind_cbk, +                                           (void *) (long) i, +                                           priv->children[i], +                                           priv->children[i]->fops->fsetxattr, +                                           local->fd, +                                           local->cont.fsetxattr.dict, +                                           local->cont.fsetxattr.flags, +                                           NULL); + +                        if (!--call_count) +                                break; +                } +        } + +        return 0; +} + + +int +afr_fsetxattr_done (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t   *local   = frame->local; + +        local->transaction.unwind (frame, this); + +        AFR_STACK_DESTROY (frame); + +        return 0; +} + +int +afr_fsetxattr (call_frame_t *frame, xlator_t *this, +               fd_t *fd, dict_t *dict, int32_t flags, dict_t *xdata) +{ +        afr_private_t    *priv              = NULL; +        afr_local_t      *local             = NULL; +        call_frame_t     *transaction_frame = NULL; +        int               ret               = -1; +        int               op_errno          = EINVAL; + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (this->private, out); + +        GF_IF_INTERNAL_XATTR_GOTO ("trusted.afr.*", dict, +                                   op_errno, out); + +        GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.afr.*", dict, +                                   op_errno, out); + +        priv = this->private; + +        if (afr_is_split_brain (this, fd->inode)) { +                op_errno = EIO; +                goto out; +        } + +        QUORUM_CHECK(fsetxattr,out); + +        AFR_LOCAL_ALLOC_OR_GOTO (local, out); + +        ret = afr_local_init (local, priv, 
&op_errno); +        if (ret < 0) +                goto out; + +        transaction_frame = copy_frame (frame); +        if (!transaction_frame) { +                goto out; +        } + +        transaction_frame->local = local; + +        local->op_ret = -1; + +        local->cont.fsetxattr.dict  = dict_ref (dict); +        local->cont.fsetxattr.flags = flags; + +        local->transaction.fop    = afr_fsetxattr_wind; +        local->transaction.done   = afr_fsetxattr_done; +        local->transaction.unwind = afr_fsetxattr_unwind; + +        local->fd                 = fd_ref (fd); + +        local->transaction.main_frame = frame; +        local->transaction.start  = LLONG_MAX - 1; +        local->transaction.len    = 0; + +        ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); +        if (ret < 0) { +            op_errno = -ret; +            goto out; +        } + +        ret = 0; +out: +        if (ret < 0) { +                if (transaction_frame) +                        AFR_STACK_DESTROY (transaction_frame); +                AFR_STACK_UNWIND (fsetxattr, frame, -1, op_errno, NULL); +        } + +        return 0; +} + +/* }}} */ + + +/* {{{ removexattr */ + + +int +afr_removexattr_unwind (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t *   local = NULL; +        call_frame_t   *main_frame = NULL; + +        local = frame->local; + +        LOCK (&frame->lock); +        { +                if (local->transaction.main_frame) +                        main_frame = local->transaction.main_frame; +                local->transaction.main_frame = NULL; +        } +        UNLOCK (&frame->lock); + +        if (main_frame) { +                AFR_STACK_UNWIND (removexattr, main_frame, +                                  local->op_ret, local->op_errno, +                                  NULL); +        } +        return 0; +} + + +int +afr_removexattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                  
        int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ +        afr_local_t   *local      = NULL; +        afr_private_t *priv       = NULL; +        int           call_count  = -1; +        int           need_unwind = 0; +        int           child_index = (long) cookie; + +        local = frame->local; +        priv = this->private; + +        LOCK (&frame->lock); +        { +                __inode_write_fop_cbk (frame, child_index, -1, this, +                                       &op_ret, &op_errno, NULL, NULL, +                                       xdata); +                if (local->success_count == priv->wait_count) { +                        need_unwind = 1; +                } +        } +        UNLOCK (&frame->lock); + +        if (need_unwind) +                local->transaction.unwind (frame, this); + +        call_count = afr_frame_return (frame); + +        if (call_count == 0) { +                local->transaction.resume (frame, this); +        } + +        return 0; +} + + +int32_t +afr_removexattr_wind (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t *local = NULL; +        afr_private_t *priv = NULL; +        int call_count = -1; +        int i = 0; + +        local = frame->local; +        priv = this->private; + +        call_count = afr_pre_op_done_children_count (local->transaction.pre_op, +                                                     priv->child_count); + +        if (call_count == 0) { +                local->transaction.resume (frame, this); +                return 0; +        } + +        local->call_count = call_count; + +        for (i = 0; i < priv->child_count; i++) { +                if (local->transaction.pre_op[i]) { +                        STACK_WIND_COOKIE (frame, afr_removexattr_wind_cbk, +                                           (void *) (long) i, +                                           priv->children[i], +                                           priv->children[i]->fops->removexattr, +       
                                    &local->loc, +                                           local->cont.removexattr.name, +                                           NULL); + +                        if (!--call_count) +                                break; +                } +        } + +        return 0; +} + + +int +afr_removexattr_done (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t * local = frame->local; + +        local->transaction.unwind (frame, this); + +        AFR_STACK_DESTROY (frame); + +        return 0; +} + + +int +afr_removexattr (call_frame_t *frame, xlator_t *this, +                 loc_t *loc, const char *name, dict_t *xdata) +{ +        afr_private_t   *priv              = NULL; +        afr_local_t     *local             = NULL; +        call_frame_t    *transaction_frame = NULL; +        int              ret               = -1; +        int              op_errno          = 0; + +        VALIDATE_OR_GOTO (this, out); + +        GF_IF_NATIVE_XATTR_GOTO ("trusted.afr.*", +                                 name, op_errno, out); + +        GF_IF_NATIVE_XATTR_GOTO ("trusted.glusterfs.afr.*", +                                 name, op_errno, out); + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this->private, out); +        VALIDATE_OR_GOTO (loc, out); + +        priv = this->private; + +        QUORUM_CHECK(removexattr,out); + +        transaction_frame = copy_frame (frame); +        if (!transaction_frame) { +                op_errno = ENOMEM; +                goto out; +        } + +        AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); +        local = transaction_frame->local; + +        ret = afr_local_init (local, priv, &op_errno); +        if (ret < 0) +                goto out; + +        local->cont.removexattr.name = gf_strdup (name); + +        local->transaction.fop    = afr_removexattr_wind; +        local->transaction.done   = afr_removexattr_done; +        local->transaction.unwind = 
afr_removexattr_unwind; + +        loc_copy (&local->loc, loc); + +        local->transaction.main_frame = frame; +        local->transaction.start   = LLONG_MAX - 1; +        local->transaction.len     = 0; + +        ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); +        if (ret < 0) { +            op_errno = -ret; +            goto out; +        } + +        ret = 0; +out: +        if (ret < 0) { +                if (transaction_frame) +                        AFR_STACK_DESTROY (transaction_frame); +                AFR_STACK_UNWIND (removexattr, frame, -1, op_errno, NULL); +        } + +        return 0; +} + +/* ffremovexattr */ +int +afr_fremovexattr_unwind (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t *   local = NULL; +        call_frame_t   *main_frame = NULL; + +        local = frame->local; + +        LOCK (&frame->lock); +        { +                if (local->transaction.main_frame) +                        main_frame = local->transaction.main_frame; +                local->transaction.main_frame = NULL; +        } +        UNLOCK (&frame->lock); + +        if (main_frame) { +                AFR_STACK_UNWIND (fremovexattr, main_frame, +                                  local->op_ret, local->op_errno, +                                  NULL); +        } +        return 0; +} + + +int +afr_fremovexattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                          int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ +        afr_local_t *   local       = NULL; +        afr_private_t * priv        = NULL; +        int             call_count  = -1; +        int             need_unwind = 0; +        int             child_index = (long) cookie; + +        local = frame->local; +        priv = this->private; + +        LOCK (&frame->lock); +        { +                __inode_write_fop_cbk (frame, child_index, -1, this, +                                       &op_ret, &op_errno, NULL, NULL, +   
                                    xdata); + +                if (local->success_count == priv->wait_count) { +                        need_unwind = 1; +                } +        } +        UNLOCK (&frame->lock); + +        if (need_unwind) +                local->transaction.unwind (frame, this); + +        call_count = afr_frame_return (frame); + +        if (call_count == 0) { +                local->transaction.resume (frame, this); +        } + +        return 0; +} + + +int32_t +afr_fremovexattr_wind (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t *local = NULL; +        afr_private_t *priv = NULL; +        int call_count = -1; +        int i = 0; + +        local = frame->local; +        priv = this->private; + +        call_count = afr_pre_op_done_children_count (local->transaction.pre_op, +                                                     priv->child_count); + +        if (call_count == 0) { +                local->transaction.resume (frame, this); +                return 0; +        } + +        local->call_count = call_count; + +        for (i = 0; i < priv->child_count; i++) { +                if (local->transaction.pre_op[i]) { +                        STACK_WIND_COOKIE (frame, afr_fremovexattr_wind_cbk, +                                           (void *) (long) i, +                                           priv->children[i], +                                           priv->children[i]->fops->fremovexattr, +                                           local->fd, +                                           local->cont.removexattr.name, +                                           NULL); + +                        if (!--call_count) +                                break; +                } +        } + +        return 0; +} + + +int +afr_fremovexattr_done (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t * local = frame->local; + +        local->transaction.unwind (frame, this); + +        AFR_STACK_DESTROY (frame); + +    
    return 0; +} + + +int +afr_fremovexattr (call_frame_t *frame, xlator_t *this, +                  fd_t *fd, const char *name, dict_t *xdata) +{ +        afr_private_t * priv  = NULL; +        afr_local_t   * local = NULL; +        call_frame_t   *transaction_frame = NULL; +        int ret = -1; +        int op_ret   = -1; +        int op_errno = 0; + +        VALIDATE_OR_GOTO (this, out); + +        GF_IF_NATIVE_XATTR_GOTO ("trusted.afr.*", +                                 name, op_errno, out); + +        GF_IF_NATIVE_XATTR_GOTO ("trusted.glusterfs.afr.*", +                                 name, op_errno, out); + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this->private, out); + +        priv = this->private; +        if (afr_is_split_brain (this, fd->inode)) { +                op_errno = EIO; +                goto out; +        } + +        QUORUM_CHECK(fremovexattr, out); + +        transaction_frame = copy_frame (frame); +        if (!transaction_frame) { +                goto out; +        } + +        AFR_LOCAL_ALLOC_OR_GOTO (local, out); + +        ret = afr_local_init (local, priv, &op_errno); +        if (ret < 0) { +                op_errno = -ret; +                goto out; +        } + +        transaction_frame->local = local; + +        local->op_ret = -1; + +        local->cont.removexattr.name = gf_strdup (name); + +        local->transaction.fop    = afr_fremovexattr_wind; +        local->transaction.done   = afr_fremovexattr_done; +        local->transaction.unwind = afr_fremovexattr_unwind; + +        local->fd = fd_ref (fd); + +        local->transaction.main_frame = frame; +        local->transaction.start   = LLONG_MAX - 1; +        local->transaction.len     = 0; + +        op_ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); +        if (op_ret < 0) { +            op_errno = -op_ret; +            goto out; +        } + +        op_ret = 0; +out: +        if (op_ret < 0) { +                if 
(transaction_frame) +                        AFR_STACK_DESTROY (transaction_frame); +                AFR_STACK_UNWIND (fremovexattr, frame, op_ret, op_errno, NULL); +        } + +        return 0; +} + +static int +afr_fallocate_unwind (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t *   local = NULL; +        call_frame_t   *main_frame = NULL; + +        local = frame->local; + +        LOCK (&frame->lock); +        { +                if (local->transaction.main_frame) +                        main_frame = local->transaction.main_frame; +                local->transaction.main_frame = NULL; +        } +        UNLOCK (&frame->lock); + +        if (main_frame) { +                AFR_STACK_UNWIND (fallocate, main_frame, local->op_ret, +                                  local->op_errno, +                                  &local->cont.inode_wfop.prebuf, +                                  &local->cont.inode_wfop.postbuf, +                                  NULL); +        } +        return 0; +} + +static int +afr_fallocate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                        int32_t op_ret, int32_t op_errno, struct iatt *prebuf, +                        struct iatt *postbuf, dict_t *xdata) +{ +        afr_local_t *   local = NULL; +        afr_private_t * priv  = NULL; +        int child_index = (long) cookie; +        int call_count  = -1; +        int need_unwind = 0; +        int read_child  = 0; + +        local = frame->local; +        priv  = this->private; + +        read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); + +        LOCK (&frame->lock); +        { +                if (child_index == read_child) { +                        local->read_child_returned = _gf_true; +                } + +                __inode_write_fop_cbk (frame, child_index, read_child, this, +                                       &op_ret, &op_errno, prebuf, postbuf, +                                       xdata); + +       
         if ((local->success_count >= priv->wait_count) +                    && local->read_child_returned) { +                        need_unwind = 1; +                } +        } +        UNLOCK (&frame->lock); + +        if (need_unwind) +                local->transaction.unwind (frame, this); + +        call_count = afr_frame_return (frame); + +        if (call_count == 0) { +                local->transaction.resume (frame, this); +        } + +        return 0; +} + +static int +afr_fallocate_wind (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t *local = NULL; +        afr_private_t *priv = NULL; +        int call_count = -1; +        int i = 0; + +        local = frame->local; +        priv = this->private; + +        call_count = afr_pre_op_done_children_count (local->transaction.pre_op, +                                                     priv->child_count); + +        if (call_count == 0) { +                local->transaction.resume (frame, this); +                return 0; +        } + +        local->call_count = call_count; + +        for (i = 0; i < priv->child_count; i++) { +                if (local->transaction.pre_op[i]) { +                        STACK_WIND_COOKIE (frame, afr_fallocate_wind_cbk, +                                           (void *) (long) i, +                                           priv->children[i], +                                           priv->children[i]->fops->fallocate, +                                           local->fd, +                                           local->cont.fallocate.mode, +                                           local->cont.fallocate.offset, +                                           local->cont.fallocate.len, +                                           NULL); + +                        if (!--call_count) +                                break; +                } +        } + +        return 0; +} + +static int +afr_fallocate_done (call_frame_t *frame, xlator_t *this) +{ +     
   afr_local_t *local = NULL; + +        local = frame->local; + +        local->transaction.unwind (frame, this); + +        AFR_STACK_DESTROY (frame); + +        return 0; +} + +static int +afr_do_fallocate (call_frame_t *frame, xlator_t *this) +{ +        call_frame_t * transaction_frame = NULL; +        afr_local_t *  local             = NULL; +        int op_ret   = -1; +        int op_errno = 0; + +        local = frame->local; + +        transaction_frame = copy_frame (frame); +        if (!transaction_frame) { +                goto out; +        } + +        transaction_frame->local = local; +        frame->local = NULL; + +        local->op = GF_FOP_FALLOCATE; + +        local->transaction.fop    = afr_fallocate_wind; +        local->transaction.done   = afr_fallocate_done; +        local->transaction.unwind = afr_fallocate_unwind; + +        local->transaction.main_frame = frame; + +        local->transaction.start   = local->cont.fallocate.offset; +        local->transaction.len     = 0; + +        /* fallocate can modify the file size */ +        op_ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); +        if (op_ret < 0) { +            op_errno = -op_ret; +            goto out; +        } + +        op_ret = 0; +out: +        if (op_ret < 0) { +                if (transaction_frame) +                        AFR_STACK_DESTROY (transaction_frame); +                AFR_STACK_UNWIND (fallocate, frame, op_ret, op_errno, NULL, +                                  NULL, NULL); +        } + +        return 0; +} + +int +afr_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, +               off_t offset, size_t len, dict_t *xdata) +{ +        afr_private_t * priv  = NULL; +        afr_local_t   * local = NULL; +        call_frame_t   *transaction_frame = NULL; +        int ret = -1; +        int op_errno = 0; + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO 
(this->private, out); + +        priv = this->private; + +        if (afr_is_split_brain (this, fd->inode)) { +                op_errno = EIO; +                goto out; +        } +        QUORUM_CHECK(fallocate,out); + +        AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); +        local = frame->local; + +        ret = afr_local_init (local, priv, &op_errno); +        if (ret < 0) +                goto out; + +        local->cont.fallocate.mode = mode; +        local->cont.fallocate.offset  = offset; +        local->cont.fallocate.len = len; + +        local->fd = fd_ref (fd); + +        afr_open_fd_fix (fd, this); + +        afr_do_fallocate (frame, this); + +        ret = 0; +out: +        if (ret < 0) { +                if (transaction_frame) +                        AFR_STACK_DESTROY (transaction_frame); +                AFR_STACK_UNWIND (fallocate, frame, -1, op_errno, NULL, NULL, NULL); +        } + +        return 0; +} + +/* }}} */ + +/* {{{ discard */ + +static int +afr_discard_unwind (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t *   local = NULL; +        call_frame_t   *main_frame = NULL; + +        local = frame->local; + +        LOCK (&frame->lock); +        { +                if (local->transaction.main_frame) +                        main_frame = local->transaction.main_frame; +                local->transaction.main_frame = NULL; +        } +        UNLOCK (&frame->lock); + +        if (main_frame) { +                AFR_STACK_UNWIND (discard, main_frame, local->op_ret, +                                  local->op_errno, +                                  &local->cont.inode_wfop.prebuf, +                                  &local->cont.inode_wfop.postbuf, +                                  NULL); +        } +        return 0; +} + +static int +afr_discard_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                      int32_t op_ret, int32_t op_errno, struct iatt *prebuf, +                      struct iatt 
*postbuf, dict_t *xdata) +{ +        afr_local_t *   local = NULL; +        afr_private_t * priv  = NULL; +        int child_index = (long) cookie; +        int call_count  = -1; +        int need_unwind = 0; +        int read_child  = 0; + +        local = frame->local; +        priv  = this->private; + +        read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); + +        LOCK (&frame->lock); +        { +                if (child_index == read_child) { +                        local->read_child_returned = _gf_true; +                } + +                __inode_write_fop_cbk (frame, child_index, read_child, this, +                                       &op_ret, &op_errno, prebuf, postbuf, +                                       xdata); + +                if ((local->success_count >= priv->wait_count) +                    && local->read_child_returned) { +                        need_unwind = 1; +                } +        } +        UNLOCK (&frame->lock); + +        if (need_unwind) +                local->transaction.unwind (frame, this); + +        call_count = afr_frame_return (frame); + +        if (call_count == 0) { +                local->transaction.resume (frame, this); +        } + +        return 0; +} + +static int +afr_discard_wind (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t *local = NULL; +        afr_private_t *priv = NULL; +        int call_count = -1; +        int i = 0; + +        local = frame->local; +        priv = this->private; + +        call_count = afr_pre_op_done_children_count (local->transaction.pre_op, +                                                     priv->child_count); + +        if (call_count == 0) { +                local->transaction.resume (frame, this); +                return 0; +        } + +        local->call_count = call_count; + +        for (i = 0; i < priv->child_count; i++) { +                if (local->transaction.pre_op[i]) { +                        STACK_WIND_COOKIE (frame, 
afr_discard_wind_cbk, +                                           (void *) (long) i, +                                           priv->children[i], +                                           priv->children[i]->fops->discard, +                                           local->fd, +                                           local->cont.discard.offset, +                                           local->cont.discard.len, +                                           NULL); + +                        if (!--call_count) +                                break; +                } +        } + +        return 0; +} + +static int +afr_discard_done (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t *local = NULL; + +        local = frame->local; + +        local->transaction.unwind (frame, this); + +        AFR_STACK_DESTROY (frame); + +        return 0; +} + +static int +afr_do_discard (call_frame_t *frame, xlator_t *this) +{ +        call_frame_t * transaction_frame = NULL; +        afr_local_t *  local             = NULL; +        int op_ret   = -1; +        int op_errno = 0; + +        local = frame->local; + +        transaction_frame = copy_frame (frame); +        if (!transaction_frame) { +                goto out; +        } + +        transaction_frame->local = local; +        frame->local = NULL; + +        local->op = GF_FOP_DISCARD; + +        local->transaction.fop    = afr_discard_wind; +        local->transaction.done   = afr_discard_done; +        local->transaction.unwind = afr_discard_unwind; + +        local->transaction.main_frame = frame; + +        local->transaction.start   = local->cont.discard.offset; +        local->transaction.len     = 0; + +        op_ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); +        if (op_ret < 0) { +            op_errno = -op_ret; +            goto out; +        } + +        op_ret = 0; +out: +        if (op_ret < 0) { +                if (transaction_frame) +                        
AFR_STACK_DESTROY (transaction_frame); +                AFR_STACK_UNWIND (discard, frame, op_ret, op_errno, NULL, +                                  NULL, NULL); +        } + +        return 0; +} + +int +afr_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, +             size_t len, dict_t *xdata) +{ +        afr_private_t * priv  = NULL; +        afr_local_t   * local = NULL; +        call_frame_t   *transaction_frame = NULL; +        int ret = -1; +        int op_errno = 0; + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (this->private, out); + +        priv = this->private; + +        if (afr_is_split_brain (this, fd->inode)) { +                op_errno = EIO; +                goto out; +        } +        QUORUM_CHECK(discard, out); + +        AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); +        local = frame->local; + +        ret = afr_local_init (local, priv, &op_errno); +        if (ret < 0) +                goto out; + +        local->cont.discard.offset  = offset; +        local->cont.discard.len = len; + +        local->fd = fd_ref (fd); + +        afr_open_fd_fix (fd, this); + +        afr_do_discard(frame, this); + +        ret = 0; +out: +        if (ret < 0) { +                if (transaction_frame) +                        AFR_STACK_DESTROY (transaction_frame); +                AFR_STACK_UNWIND (discard, frame, -1, op_errno, NULL, NULL, NULL); +        } + +        return 0; +} + + +/* {{{ zerofill */ + +static int +afr_zerofill_unwind (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t     *local            = NULL; +        call_frame_t    *main_frame       = NULL; + +        local = frame->local; + +        LOCK (&frame->lock); +        { +                if (local->transaction.main_frame) { +                        main_frame = local->transaction.main_frame; +                } +                local->transaction.main_frame = NULL; +        } +        UNLOCK 
(&frame->lock); + +        if (main_frame) { +                AFR_STACK_UNWIND (zerofill, main_frame, local->op_ret, +                                  local->op_errno, +                                  &local->cont.zerofill.prebuf, +                                  &local->cont.zerofill.postbuf, +                                  NULL); +        } +        return 0; +} + +static int +afr_zerofill_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, +                     int32_t op_ret, int32_t op_errno, struct iatt *prebuf, +                     struct iatt *postbuf, dict_t *xdata) +{ +        afr_local_t       *local             = NULL; +        afr_private_t     *priv              = NULL; +        int                child_index       = (long) cookie; +        int                call_count        = -1; +        int                need_unwind       = 0; +        int                read_child        = 0; + +        local = frame->local; +        priv  = this->private; + +        read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); + +        LOCK (&frame->lock); +        { +                if (child_index == read_child) { +                        local->read_child_returned = _gf_true; +                } + +                if (afr_fop_failed (op_ret, op_errno)) { +                        afr_transaction_fop_failed (frame, this, child_index); +                } + +                if (op_ret != -1) { +                        if (local->success_count == 0) { +                                local->op_ret = op_ret; +                                local->cont.zerofill.prebuf  = *prebuf; +                                local->cont.zerofill.postbuf = *postbuf; +                        } + +                        if (child_index == read_child) { +                                local->cont.zerofill.prebuf  = *prebuf; +                                local->cont.zerofill.postbuf = *postbuf; +                        } + +                        
local->success_count++; + +                        if ((local->success_count >= priv->wait_count) +                            && local->read_child_returned) { +                                need_unwind = 1; +                        } +                } +                local->op_errno = op_errno; +        } +        UNLOCK (&frame->lock); + +        if (need_unwind) { +                local->transaction.unwind (frame, this); +        } +        call_count = afr_frame_return (frame); + +        if (call_count == 0) { +                local->transaction.resume (frame, this); +        } + +        return 0; +} + +static int +afr_zerofill_wind (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t    *local         = NULL; +        afr_private_t  *priv          = NULL; +        int             call_count    = -1; +        int             i             = 0; + +        local = frame->local; +        priv = this->private; + +        call_count = afr_pre_op_done_children_count (local->transaction.pre_op, +                                                     priv->child_count); + +        if (call_count == 0) { +                local->transaction.resume (frame, this); +                return 0; +        } + +        local->call_count = call_count; + +        for (i = 0; i < priv->child_count; i++) { +                if (local->transaction.pre_op[i]) { +                        STACK_WIND_COOKIE (frame, afr_zerofill_wind_cbk, +                                           (void *) (long) i, +                                           priv->children[i], +                                           priv->children[i]->fops->zerofill, +                                           local->fd, +                                           local->cont.zerofill.offset, +                                           local->cont.zerofill.len, +                                           NULL); + +                        if (!--call_count) +                                break; +          
      } +        } + +        return 0; +} + +static int +afr_zerofill_done (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t *local = NULL; + +        local = frame->local; + +        local->transaction.unwind (frame, this); + +        AFR_STACK_DESTROY (frame); + +        return 0; +} + +static int +afr_do_zerofill(call_frame_t *frame, xlator_t *this) +{ +        call_frame_t  *transaction_frame = NULL; +        afr_local_t   *local             = NULL; +        int            op_ret            = -1; +        int            op_errno          = 0; + +        local = frame->local; + +        transaction_frame = copy_frame (frame); +        if (!transaction_frame) { +                goto out; +        } + +        transaction_frame->local = local; +        frame->local = NULL; + +        local->op = GF_FOP_ZEROFILL; + +        local->transaction.fop    = afr_zerofill_wind; +        local->transaction.done   = afr_zerofill_done; +        local->transaction.unwind = afr_zerofill_unwind; + +        local->transaction.main_frame = frame; + +        local->transaction.start   = local->cont.zerofill.offset; +        local->transaction.len     = 0; + +        op_ret = afr_transaction (transaction_frame, this, +                                  AFR_DATA_TRANSACTION); +        if (op_ret < 0) { +                op_errno = -op_ret; +                goto out; +        } + +        op_ret = 0; +out: +        if (op_ret < 0) { +                if (transaction_frame) { +                        AFR_STACK_DESTROY (transaction_frame); +                } +                AFR_STACK_UNWIND (zerofill, frame, op_ret, op_errno, NULL, +                                  NULL, NULL); +        } + +        return 0; +} + +int +afr_zerofill (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, +              off_t len, dict_t *xdata) +{ +        afr_private_t   *priv               = NULL; +        afr_local_t     *local              = NULL; +        call_frame_t    
*transaction_frame  = NULL; +        int              ret                = -1; +        int              op_errno           = 0; + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (this->private, out); + +        priv = this->private; + +        if (afr_is_split_brain (this, fd->inode)) { +                op_errno = EIO; +                goto out; +        } +        QUORUM_CHECK(zerofill, out); + +        AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); +        local = frame->local; + +        ret = afr_local_init (local, priv, &op_errno); +        if (ret < 0) { +                goto out; +        } +        local->cont.zerofill.offset  = offset; +        local->cont.zerofill.len = len; + +        local->fd = fd_ref (fd); + +        afr_open_fd_fix (fd, this); + +        afr_do_zerofill(frame, this); + +        ret = 0; +out: +        if (ret < 0) { +                if (transaction_frame) { +                        AFR_STACK_DESTROY (transaction_frame); +                } +                AFR_STACK_UNWIND (zerofill, frame, -1, op_errno, NULL, +                                  NULL, NULL); +        } + +        return 0; +} + +/* }}} */ + + diff --git a/xlators/cluster/afr-v1/src/afr-inode-write.h b/xlators/cluster/afr-v1/src/afr-inode-write.h new file mode 100644 index 000000000..7b1fc5528 --- /dev/null +++ b/xlators/cluster/afr-v1/src/afr-inode-write.h @@ -0,0 +1,82 @@ +/* +  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> +  This file is part of GlusterFS. + +  This file is licensed to you under your choice of the GNU Lesser +  General Public License, version 3 or any later version (LGPLv3 or +  later), or the GNU General Public License, version 2 (GPLv2), in all +  cases as published by the Free Software Foundation. 
+*/ + +#ifndef __INODE_WRITE_H__ +#define __INODE_WRITE_H__ + +int32_t +afr_chmod (call_frame_t *frame, xlator_t *this, +	   loc_t *loc, mode_t mode, dict_t *xdata); + +int32_t +afr_chown (call_frame_t *frame, xlator_t *this, +	   loc_t *loc, uid_t uid, gid_t gid, dict_t *xdata); + +int +afr_fchown (call_frame_t *frame, xlator_t *this, +	    fd_t *fd, uid_t uid, gid_t gid, dict_t *xdata); + +int32_t +afr_fchmod (call_frame_t *frame, xlator_t *this, +	    fd_t *fd, mode_t mode, dict_t *xdata); + +int32_t +afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, +	    struct iovec *vector, int32_t count, off_t offset, +            uint32_t flags, struct iobref *iobref, dict_t *xdata); + +int32_t +afr_truncate (call_frame_t *frame, xlator_t *this, +	      loc_t *loc, off_t offset, dict_t *xdata); + +int32_t +afr_ftruncate (call_frame_t *frame, xlator_t *this, +	       fd_t *fd, off_t offset, dict_t *xdata); + +int32_t +afr_utimens (call_frame_t *frame, xlator_t *this, +	     loc_t *loc, struct timespec tv[2], dict_t *xdata); + +int +afr_setattr (call_frame_t *frame, xlator_t *this, +             loc_t *loc, struct iatt *buf, int32_t valid, dict_t *xdata); + +int +afr_fsetattr (call_frame_t *frame, xlator_t *this, +              fd_t *fd, struct iatt *buf, int32_t valid, dict_t *xdata); + +int32_t +afr_setxattr (call_frame_t *frame, xlator_t *this, +              loc_t *loc, dict_t *dict, int32_t flags, dict_t *xdata); + +int32_t +afr_fsetxattr (call_frame_t *frame, xlator_t *this, +               fd_t *fd, dict_t *dict, int32_t flags, dict_t *xdata); + +int32_t +afr_removexattr (call_frame_t *frame, xlator_t *this, +		 loc_t *loc, const char *name, dict_t *xdata); + +int32_t +afr_fremovexattr (call_frame_t *frame, xlator_t *this, +                  fd_t *fd, const char *name, dict_t *xdata); + +int +afr_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, +             size_t len, dict_t *xdata); + +int +afr_fallocate (call_frame_t *frame, xlator_t 
*this, fd_t *fd, int32_t mode, +               off_t offset, size_t len, dict_t *xdata); + +int +afr_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, +             off_t len, dict_t *xdata); +#endif /* __INODE_WRITE_H__ */ diff --git a/xlators/cluster/afr-v1/src/afr-lk-common.c b/xlators/cluster/afr-v1/src/afr-lk-common.c new file mode 100644 index 000000000..060d78f35 --- /dev/null +++ b/xlators/cluster/afr-v1/src/afr-lk-common.c @@ -0,0 +1,2174 @@ +/* +  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> +  This file is part of GlusterFS. + +  This file is licensed to you under your choice of the GNU Lesser +  General Public License, version 3 or any later version (LGPLv3 or +  later), or the GNU General Public License, version 2 (GPLv2), in all +  cases as published by the Free Software Foundation. +*/ + +#include "dict.h" +#include "byte-order.h" +#include "common-utils.h" + +#include "afr.h" +#include "afr-transaction.h" + +#include <signal.h> + + +#define LOCKED_NO       0x0        /* no lock held */ +#define LOCKED_YES      0x1        /* for DATA, METADATA, ENTRY and higher_path */ +#define LOCKED_LOWER    0x2        /* for lower path */ + +#define AFR_TRACE_INODELK_IN(frame, this, params ...)           \ +        do {                                                    \ +                afr_private_t *_priv = this->private;           \ +                if (!_priv->inodelk_trace)                      \ +                        break;                                  \ +                afr_trace_inodelk_in (frame, this, params);     \ +        } while (0); + +#define AFR_TRACE_INODELK_OUT(frame, this, params ...)          
\ +        do {                                                    \ +                afr_private_t *_priv = this->private;           \ +                if (!_priv->inodelk_trace)                      \ +                        break;                                  \ +                afr_trace_inodelk_out (frame, this, params);    \ +        } while (0); + +#define AFR_TRACE_ENTRYLK_IN(frame, this, params ...)           \ +        do {                                                    \ +                afr_private_t *_priv = this->private;           \ +                if (!_priv->entrylk_trace)                      \ +                        break;                                  \ +                afr_trace_entrylk_in (frame, this, params);     \ +        } while (0); + +#define AFR_TRACE_ENTRYLK_OUT(frame, this, params ...)          \ +        do {                                                    \ +                afr_private_t *_priv = this->private;           \ +                if (!_priv->entrylk_trace)                      \ +                        break;                                  \ +                afr_trace_entrylk_out (frame, this, params);    \ +        } while (0); + +int +afr_entry_lockee_cmp (const void *l1, const void *l2) +{ +        const afr_entry_lockee_t       *r1 = l1; +        const afr_entry_lockee_t       *r2 = l2; +        int                            ret = 0; +        uuid_t                         gfid1 = {0}; +        uuid_t                         gfid2 = {0}; + +        loc_gfid ((loc_t*)&r1->loc, gfid1); +        loc_gfid ((loc_t*)&r2->loc, gfid2); +        ret = uuid_compare (gfid1, gfid2); +        /*Entrylks with NULL basename are the 'smallest'*/ +        if (ret == 0) { +                if (!r1->basename) +                        return -1; +                if (!r2->basename) +                        return 1; +                ret = strcmp (r1->basename, r2->basename); +        } + +        if (ret <= 0) +        
        return -1; +        else +                return 1; +} + +int afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index); + +static int +afr_copy_locked_nodes (call_frame_t *frame, xlator_t *this); + +static uint64_t afr_lock_number = 1; + +static uint64_t +get_afr_lock_number () +{ +        return (++afr_lock_number); +} + +int +afr_set_lock_number (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t         *local    = NULL; +        afr_internal_lock_t *int_lock = NULL; + +        local    = frame->local; +        int_lock = &local->internal_lock; + +        int_lock->lock_number = get_afr_lock_number (); + +        return 0; +} + +void +afr_set_lk_owner (call_frame_t *frame, xlator_t *this, void *lk_owner) +{ +        gf_log (this->name, GF_LOG_TRACE, +                "Setting lk-owner=%llu", +                (unsigned long long) (unsigned long)lk_owner); + +        set_lk_owner_from_ptr (&frame->root->lk_owner, lk_owner); +} + +static int +is_afr_lock_selfheal (afr_local_t *local) +{ +        afr_internal_lock_t *int_lock = NULL; +        int                  ret      = -1; + +        int_lock = &local->internal_lock; + +        switch (int_lock->selfheal_lk_type) { +        case AFR_DATA_SELF_HEAL_LK: +        case AFR_METADATA_SELF_HEAL_LK: +                ret = 1; +                break; +        case AFR_ENTRY_SELF_HEAL_LK: +                ret = 0; +                break; +        } + +        return ret; + +} + +int32_t +internal_lock_count (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t   *local = NULL; +        afr_private_t *priv  = NULL; +        int32_t call_count = 0; +        int i = 0; + +        local = frame->local; +        priv  = this->private; + +        for (i = 0; i < priv->child_count; i++) { +                if (local->child_up[i]) +                        ++call_count; +        } + +        return call_count; +} + +static void +afr_print_inodelk (char *str, int size, int cmd, +              
     struct gf_flock *flock, gf_lkowner_t *owner) +{ +        char *cmd_str = NULL; +        char *type_str = NULL; + +        switch (cmd) { +#if F_GETLK != F_GETLK64 +        case F_GETLK64: +#endif +        case F_GETLK: +                cmd_str = "GETLK"; +                break; + +#if F_SETLK != F_SETLK64 +        case F_SETLK64: +#endif +        case F_SETLK: +                cmd_str = "SETLK"; +                break; + +#if F_SETLKW != F_SETLKW64 +        case F_SETLKW64: +#endif +        case F_SETLKW: +                cmd_str = "SETLKW"; +                break; + +        default: +                cmd_str = "<null>"; +                break; +        } + +        switch (flock->l_type) { +        case F_RDLCK: +                type_str = "READ"; +                break; +        case F_WRLCK: +                type_str = "WRITE"; +                break; +        case F_UNLCK: +                type_str = "UNLOCK"; +                break; +        default: +                type_str = "UNKNOWN"; +                break; +        } + +        snprintf (str, size, "lock=INODELK, cmd=%s, type=%s, " +                  "start=%llu, len=%llu, pid=%llu, lk-owner=%s", +                  cmd_str, type_str, (unsigned long long) flock->l_start, +                  (unsigned long long) flock->l_len, +                  (unsigned long long) flock->l_pid, +                  lkowner_utoa (owner)); + +} + +static void +afr_print_lockee (char *str, int size, loc_t *loc, fd_t *fd, +                  int child_index) +{ +        snprintf (str, size, "path=%s, fd=%p, child=%d", +                  loc->path ? loc->path : "<nul>", +                  fd ? fd : NULL, +                  child_index); +} + +void +afr_print_entrylk (char *str, int size, const char *basename, +                   gf_lkowner_t *owner) +{ +        snprintf (str, size, "Basename=%s, lk-owner=%s", +                  basename ? 
basename : "<nul>", +                  lkowner_utoa (owner)); +} + +static void +afr_print_verdict (int op_ret, int op_errno, char *str) +{ +        if (op_ret < 0) { +                if (op_errno == EAGAIN) +                        strcpy (str, "EAGAIN"); +                else +                        strcpy (str, "FAILED"); +        } +        else +                strcpy (str, "GRANTED"); +} + +static void +afr_set_lock_call_type (afr_lock_call_type_t lock_call_type, +                        char *lock_call_type_str, +                        afr_internal_lock_t *int_lock) +{ +        switch (lock_call_type) { +        case AFR_INODELK_TRANSACTION: +                if (int_lock->transaction_lk_type == AFR_TRANSACTION_LK) +                        strcpy (lock_call_type_str, "AFR_INODELK_TRANSACTION"); +                else +                        strcpy (lock_call_type_str, "AFR_INODELK_SELFHEAL"); +                break; +        case AFR_INODELK_NB_TRANSACTION: +                if (int_lock->transaction_lk_type == AFR_TRANSACTION_LK) +                        strcpy (lock_call_type_str, "AFR_INODELK_NB_TRANSACTION"); +                else +                        strcpy (lock_call_type_str, "AFR_INODELK_NB_SELFHEAL"); +                break; +        case AFR_ENTRYLK_TRANSACTION: +                if (int_lock->transaction_lk_type == AFR_TRANSACTION_LK) +                        strcpy (lock_call_type_str, "AFR_ENTRYLK_TRANSACTION"); +                else +                        strcpy (lock_call_type_str, "AFR_ENTRYLK_SELFHEAL"); +                break; +        case AFR_ENTRYLK_NB_TRANSACTION: +                if (int_lock->transaction_lk_type == AFR_TRANSACTION_LK) +                        strcpy (lock_call_type_str, "AFR_ENTRYLK_NB_TRANSACTION"); +                else +                        strcpy (lock_call_type_str, "AFR_ENTRYLK_NB_SELFHEAL"); +                break; +        default: +                strcpy (lock_call_type_str, "UNKNOWN"); +              
  break; +        } + +} + +static void +afr_trace_inodelk_out (call_frame_t *frame, xlator_t *this, +                       afr_lock_call_type_t lock_call_type, +                       afr_lock_op_type_t lk_op_type, struct gf_flock *flock, +                       int op_ret, int op_errno, int32_t child_index) +{ +        afr_internal_lock_t *int_lock = NULL; +        afr_local_t         *local    = NULL; + +        char lockee[256]; +        char lock_call_type_str[256]; +        char verdict[16]; + +        local    = frame->local; +        int_lock = &local->internal_lock; + +        afr_print_lockee (lockee, 256, &local->loc, local->fd, child_index); + +        afr_set_lock_call_type (lock_call_type, lock_call_type_str, int_lock); + +        afr_print_verdict (op_ret, op_errno, verdict); + +        gf_log (this->name, GF_LOG_INFO, +                "[%s %s] [%s] lk-owner=%s Lockee={%s} Number={%llu}", +                lock_call_type_str, +                lk_op_type == AFR_LOCK_OP ? "LOCK REPLY" : "UNLOCK REPLY", +                verdict, lkowner_utoa (&frame->root->lk_owner), lockee, +                (unsigned long long) int_lock->lock_number); + +} + +static void +afr_trace_inodelk_in (call_frame_t *frame, xlator_t *this, +                      afr_lock_call_type_t lock_call_type, +                      afr_lock_op_type_t lk_op_type, struct gf_flock *flock, +                      int32_t cmd, int32_t child_index) +{ +        afr_local_t         *local    = NULL; +        afr_internal_lock_t *int_lock = NULL; + +        char lock[256]; +        char lockee[256]; +        char lock_call_type_str[256]; + +        local    = frame->local; +        int_lock = &local->internal_lock; + +        afr_print_inodelk (lock, 256, cmd, flock, &frame->root->lk_owner); +        afr_print_lockee (lockee, 256, &local->loc, local->fd, child_index); + +        afr_set_lock_call_type (lock_call_type, lock_call_type_str, int_lock); + +        gf_log (this->name, GF_LOG_INFO, +       
         "[%s %s] Lock={%s} Lockee={%s} Number={%llu}", +                lock_call_type_str, +                lk_op_type == AFR_LOCK_OP ? "LOCK REQUEST" : "UNLOCK REQUEST", +                lock, lockee, +                (unsigned long long) int_lock->lock_number); + +} + +static void +afr_trace_entrylk_in (call_frame_t *frame, xlator_t *this, +                      afr_lock_call_type_t lock_call_type, +                      afr_lock_op_type_t lk_op_type, const char *basename, +                      int32_t cookie) +{ +        afr_local_t         *local    = NULL; +        afr_internal_lock_t *int_lock = NULL; +        afr_private_t       *priv     = NULL; +        int                 child_index = 0; +        int                 lockee_no = 0; + +        char lock[256]; +        char lockee[256]; +        char lock_call_type_str[256]; + +        local    = frame->local; +        int_lock = &local->internal_lock; +        priv     = this->private; + +        if (!priv->entrylk_trace) { +                return; +        } +        lockee_no = cookie / priv->child_count; +        child_index = cookie % priv->child_count; + +        afr_print_entrylk (lock, 256, basename, &frame->root->lk_owner); +        afr_print_lockee (lockee, 256, &int_lock->lockee[lockee_no].loc, local->fd, +                          child_index); + +        afr_set_lock_call_type (lock_call_type, lock_call_type_str, int_lock); + +        gf_log (this->name, GF_LOG_INFO, +                "[%s %s] Lock={%s} Lockee={%s} Number={%llu}, Cookie={%d}", +                lock_call_type_str, +                lk_op_type == AFR_LOCK_OP ? 
"LOCK REQUEST" : "UNLOCK REQUEST", +                lock, lockee, +                (unsigned long long) int_lock->lock_number, +                cookie); +} + +static void +afr_trace_entrylk_out (call_frame_t *frame, xlator_t *this, +                       afr_lock_call_type_t lock_call_type, +                       afr_lock_op_type_t lk_op_type, const char *basename, +                       int op_ret, int op_errno, int32_t cookie) +{ +        afr_internal_lock_t *int_lock = NULL; +        afr_local_t         *local    = NULL; +        afr_private_t       *priv     = NULL; +        int                 lockee_no = 0; +        int                 child_index = 0; + +        char lock[256]; +        char lockee[256]; +        char lock_call_type_str[256]; +        char verdict[16]; + +        local    = frame->local; +        int_lock = &local->internal_lock; +        priv     = this->private; + +        if (!priv->entrylk_trace) { +                return; +        } +        lockee_no = cookie / priv->child_count; +        child_index = cookie % priv->child_count; + +        afr_print_entrylk (lock, 256, basename, &frame->root->lk_owner); +        afr_print_lockee (lockee, 256, &int_lock->lockee[lockee_no].loc, local->fd, +                          child_index); + +        afr_set_lock_call_type (lock_call_type, lock_call_type_str, int_lock); + +        afr_print_verdict (op_ret, op_errno, verdict); + +        gf_log (this->name, GF_LOG_INFO, +                "[%s %s] [%s] Lock={%s} Lockee={%s} Number={%llu} Cookie={%d}", +                lock_call_type_str, +                lk_op_type == AFR_LOCK_OP ? 
"LOCK REPLY" : "UNLOCK REPLY", +                verdict, +                lock, lockee, +                (unsigned long long) int_lock->lock_number, +                cookie); + +} + +static int +transaction_lk_op (afr_local_t *local) +{ +        afr_internal_lock_t *int_lock = NULL; +        int ret = -1; + +        int_lock = &local->internal_lock; + +        if (int_lock->transaction_lk_type == AFR_TRANSACTION_LK) { +                gf_log (THIS->name, GF_LOG_DEBUG, +                        "lk op is for a transaction"); +                ret = 1; +        } +        else if (int_lock->transaction_lk_type == AFR_SELFHEAL_LK) { +                gf_log (THIS->name, GF_LOG_DEBUG, +                        "lk op is for a self heal"); + +                ret = 0; +        } + +        if (ret == -1) +                gf_log (THIS->name, GF_LOG_DEBUG, +                        "lk op is not set"); + +        return ret; + +} + +static int +is_afr_lock_transaction (afr_local_t *local) +{ +        int ret = 0; + +        switch (local->transaction.type) { +        case AFR_DATA_TRANSACTION: +        case AFR_METADATA_TRANSACTION: +                ret = 1; +                break; + +        case AFR_ENTRY_RENAME_TRANSACTION: +        case AFR_ENTRY_TRANSACTION: +                ret = 0; +                break; + +        } + +        return ret; +} + +int +afr_init_entry_lockee (afr_entry_lockee_t *lockee, afr_local_t *local, +                       loc_t *loc, char *basename, int child_count) +{ +        int     ret     = -1; + +        loc_copy (&lockee->loc, loc); +        lockee->basename        = (basename)? 
gf_strdup (basename): NULL; +        if (basename && !lockee->basename) +                goto out; + +        lockee->locked_count    = 0; +        lockee->locked_nodes    = GF_CALLOC (child_count, +                                             sizeof (*lockee->locked_nodes), +                                             gf_afr_mt_afr_node_character); + +        if (!lockee->locked_nodes) +                goto out; + +        ret = 0; +out: +        return ret; + +} + +void +afr_entry_lockee_cleanup (afr_internal_lock_t *int_lock) +{ +        int     i   = 0; + +        for (i = 0; i < int_lock->lockee_count; i++) { +                loc_wipe (&int_lock->lockee[i].loc); +                if (int_lock->lockee[i].basename) +                        GF_FREE (int_lock->lockee[i].basename); +                if (int_lock->lockee[i].locked_nodes) +                       GF_FREE (int_lock->lockee[i].locked_nodes); +        } + +        return; +} + +static int +initialize_entrylk_variables (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t         *local    = NULL; +        afr_internal_lock_t *int_lock = NULL; +        afr_private_t       *priv     = NULL; + +        int i = 0; + +        priv     = this->private; +        local    = frame->local; +        int_lock = &local->internal_lock; + +        int_lock->entrylk_lock_count = 0; +        int_lock->lock_op_ret        = -1; +        int_lock->lock_op_errno      = 0; + +        for (i = 0; i < AFR_LOCKEE_COUNT_MAX; i++) { +                if (!int_lock->lockee[i].locked_nodes) +                        break; +                int_lock->lockee[i].locked_count = 0; +                memset (int_lock->lockee[i].locked_nodes, 0, +                        sizeof (*int_lock->lockee[i].locked_nodes) * +                        priv->child_count); +        } + +        return 0; +} + +static int +initialize_inodelk_variables (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t         *local    = NULL; +        
afr_internal_lock_t *int_lock = NULL; +        afr_private_t       *priv     = NULL; +        afr_inodelk_t       *inodelk  = NULL; + +        priv     = this->private; +        local    = frame->local; +        int_lock = &local->internal_lock; + +        inodelk = afr_get_inodelk (int_lock, int_lock->domain); + +        inodelk->lock_count    = 0; +        int_lock->lk_attempted_count = 0; +        int_lock->lock_op_ret   = -1; +        int_lock->lock_op_errno = 0; + +        memset (inodelk->locked_nodes, 0, +                sizeof (*inodelk->locked_nodes) * priv->child_count); +        memset (int_lock->locked_nodes, 0, +                sizeof (*int_lock->locked_nodes) * priv->child_count); + +        return 0; +} + +loc_t * +lower_path (loc_t *l1, const char *b1, loc_t *l2, const char *b2) +{ +        int ret = 0; + +        ret = uuid_compare (l1->inode->gfid, l2->inode->gfid); + +        if (ret == 0) +                ret = strcmp (b1, b2); + +        if (ret <= 0) +                return l1; +        else +                return l2; +} + +int +afr_lockee_locked_nodes_count (afr_internal_lock_t *int_lock) +{ +        int call_count  = 0; +        int i           = 0; + +        for (i = 0; i < int_lock->lockee_count; i++) +                call_count += int_lock->lockee[i].locked_count; + +        return call_count; +} + +int +afr_locked_nodes_count (unsigned char *locked_nodes, int child_count) + +{ +        int i = 0; +        int call_count = 0; + +        for (i = 0; i < child_count; i++) { +                if (locked_nodes[i] & LOCKED_YES) +                        call_count++; +        } + +        return call_count; +} + +/* FIXME: What if UNLOCK fails */ +static int32_t +afr_unlock_common_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                       int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ +        afr_local_t         *local    = NULL; +        afr_internal_lock_t *int_lock = NULL; +        int call_count = 0; + +        
local    = frame->local; +        int_lock = &local->internal_lock; + +        LOCK (&frame->lock); +        { +                call_count = --int_lock->lk_call_count; +        } +        UNLOCK (&frame->lock); + +        if (call_count == 0) { +                gf_log (this->name, GF_LOG_TRACE, +                        "All internal locks unlocked"); +                int_lock->lock_cbk (frame, this); +        } + +        return 0; +} + +static int32_t +afr_unlock_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                        int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ +        afr_local_t         *local = NULL; +        afr_internal_lock_t *int_lock = NULL; +        afr_inodelk_t       *inodelk = NULL; +        int32_t             child_index = (long)cookie; +        afr_private_t       *priv = NULL; + +        local = frame->local; +        int_lock = &local->internal_lock; + +        AFR_TRACE_INODELK_OUT (frame, this, AFR_INODELK_TRANSACTION, +                               AFR_UNLOCK_OP, NULL, op_ret, +                               op_errno, child_index); + +        priv = this->private; + +        if (op_ret < 0 && op_errno != ENOTCONN && op_errno != EBADFD) { +                gf_log (this->name, GF_LOG_INFO, "%s: unlock failed on subvolume %s " +                        "with lock owner %s", local->loc.path, +                        priv->children[child_index]->name, +                        lkowner_utoa (&frame->root->lk_owner)); +        } + + +        inodelk = afr_get_inodelk (int_lock, int_lock->domain); +        inodelk->locked_nodes[child_index] &= LOCKED_NO; +        if (local->transaction.eager_lock) +                local->transaction.eager_lock[child_index] = 0; + +        afr_unlock_common_cbk (frame, cookie, this, op_ret, op_errno, xdata); + +        return 0; + +} + +static int +afr_unlock_inodelk (call_frame_t *frame, xlator_t *this) +{ +        afr_internal_lock_t *int_lock = NULL; +        afr_inodelk_t      
 *inodelk  = NULL; +        afr_local_t         *local    = NULL; +        afr_private_t       *priv     = NULL; +        struct gf_flock flock = {0,}; +        struct gf_flock full_flock = {0,}; +        struct gf_flock *flock_use = NULL; +        int call_count = 0; +        int i = 0; +        int piggyback = 0; +        afr_fd_ctx_t        *fd_ctx      = NULL; + + +        local    = frame->local; +        int_lock = &local->internal_lock; +        priv     = this->private; + +        inodelk = afr_get_inodelk (int_lock, int_lock->domain); + +        flock.l_start = inodelk->flock.l_start; +        flock.l_len   = inodelk->flock.l_len; +        flock.l_type  = F_UNLCK; + +        full_flock.l_type = F_UNLCK; +        call_count = afr_locked_nodes_count (inodelk->locked_nodes, +                                             priv->child_count); + +        int_lock->lk_call_count = call_count; + +        if (!call_count) { +                gf_log (this->name, GF_LOG_TRACE, +                        "No internal locks unlocked"); +                int_lock->lock_cbk (frame, this); +                goto out; +        } + +        if (local->fd) +                fd_ctx = afr_fd_ctx_get (local->fd, this); + +        for (i = 0; i < priv->child_count; i++) { +                if ((inodelk->locked_nodes[i] & LOCKED_YES) != LOCKED_YES) +                        continue; + +                if (local->fd) { +                        flock_use = &flock; +                        if (!local->transaction.eager_lock[i]) { +                                goto wind; +                        } + +                        piggyback = 0; + +                        LOCK (&local->fd->lock); +                        { +                                if (fd_ctx->lock_piggyback[i]) { +                                        fd_ctx->lock_piggyback[i]--; +                                        piggyback = 1; +                                } else { +                                        
fd_ctx->lock_acquired[i]--; +                                } +                        } +                        UNLOCK (&local->fd->lock); + +                        if (piggyback) { +                                afr_unlock_inodelk_cbk (frame, (void *) (long) i, +                                                        this, 1, 0, NULL); +                                if (!--call_count) +                                        break; +                                continue; +                        } + +                        flock_use = &full_flock; +                wind: +                        AFR_TRACE_INODELK_IN (frame, this, +                                              AFR_INODELK_TRANSACTION, +                                              AFR_UNLOCK_OP, flock_use, F_SETLK, +                                              i); + +                        STACK_WIND_COOKIE (frame, afr_unlock_inodelk_cbk, +                                           (void *) (long)i, +                                           priv->children[i], +                                           priv->children[i]->fops->finodelk, +                                           int_lock->domain, local->fd, +                                           F_SETLK, flock_use, NULL); + +                        if (!--call_count) +                                break; + +                } else { +                        AFR_TRACE_INODELK_IN (frame, this, +                                              AFR_INODELK_TRANSACTION, +                                              AFR_UNLOCK_OP, &flock, F_SETLK, i); + +                        STACK_WIND_COOKIE (frame, afr_unlock_inodelk_cbk, +                                           (void *) (long)i, +                                           priv->children[i], +                                           priv->children[i]->fops->inodelk, +                                           int_lock->domain, &local->loc, +                                   
        F_SETLK, &flock, NULL); + +                        if (!--call_count) +                                break; +                } +        } +out: +        return 0; +} + +static int32_t +afr_unlock_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                        int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ +        afr_local_t         *local = NULL; +        afr_private_t       *priv  = NULL; +        afr_internal_lock_t *int_lock = NULL; +        int32_t             child_index = 0; +        int                 lockee_no   = 0; + +        priv = this->private; +        lockee_no = (int)((long) cookie) / priv->child_count; +        child_index = (int) ((long) cookie) % priv->child_count; + +        local = frame->local; +        int_lock = &local->internal_lock; + +        AFR_TRACE_ENTRYLK_OUT (frame, this, AFR_ENTRYLK_TRANSACTION, +                               AFR_UNLOCK_OP, +                               int_lock->lockee[lockee_no].basename, op_ret, +                               op_errno, (int) ((long)cookie)); + +        if (op_ret < 0) { +                gf_log (this->name, GF_LOG_ERROR, +                        "%s: unlock failed on %d, reason: %s", +                        local->loc.path, child_index, strerror (op_errno)); +        } + +        int_lock->lockee[lockee_no].locked_nodes[child_index] &= LOCKED_NO; +        afr_unlock_common_cbk (frame, cookie, this, op_ret, op_errno, NULL); + +        return 0; +} + +static int +afr_unlock_entrylk (call_frame_t *frame, xlator_t *this) +{ +        afr_internal_lock_t     *int_lock       = NULL; +        afr_local_t             *local          = NULL; +        afr_private_t           *priv           = NULL; +        int                     call_count      = 0; +        int                     index           = 0; +        int                     lockee_no       = 0; +        int                     copies          = 0; +        int                     i               = 
-1; + +        local    = frame->local; +        int_lock = &local->internal_lock; +        priv     = this->private; +        copies   = priv->child_count; + +        call_count = afr_lockee_locked_nodes_count (int_lock); + +        int_lock->lk_call_count = call_count; + +        if (!call_count){ +                gf_log (this->name, GF_LOG_TRACE, +                        "No internal locks unlocked"); +                int_lock->lock_cbk (frame, this); +                goto out; +        } + +        for (i = 0; i < int_lock->lockee_count * priv->child_count; i++) { +                lockee_no = i / copies; +                index     = i % copies; +                if (int_lock->lockee[lockee_no].locked_nodes[index] & LOCKED_YES) { +                        AFR_TRACE_ENTRYLK_IN (frame, this, AFR_ENTRYLK_NB_TRANSACTION, +                                              AFR_UNLOCK_OP, +                                              int_lock->lockee[lockee_no].basename, +                                              i); + +                        STACK_WIND_COOKIE (frame, afr_unlock_entrylk_cbk, +                                           (void *) (long) i, +                                           priv->children[index], +                                           priv->children[index]->fops->entrylk, +                                           int_lock->domain, +                                           &int_lock->lockee[lockee_no].loc, +                                           int_lock->lockee[lockee_no].basename, +                                           ENTRYLK_UNLOCK, ENTRYLK_WRLCK, NULL); + +                        if (!--call_count) +                                break; +                } +        } + +out: +        return 0; + +} + +static int32_t +afr_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +              int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ +        afr_internal_lock_t     *int_lock       = NULL; +        afr_local_t   
          *local          = NULL; +        afr_private_t           *priv           = NULL; +        int                     cky             = (long) cookie; +        int                     child_index     = 0; +        int                     lockee_no       = 0; + +        priv     = this->private; +        local    = frame->local; +        int_lock = &local->internal_lock; + +        child_index = ((int)cky) % priv->child_count; +        lockee_no   = ((int)cky) / priv->child_count; + +        LOCK (&frame->lock); +        { +                if (op_ret == -1) { +                        if (op_errno == ENOSYS) { +                                /* return ENOTSUP */ +                                gf_log (this->name, GF_LOG_ERROR, +                                        "subvolume does not support locking. " +                                        "please load features/locks xlator on server"); +                                local->op_ret = op_ret; +                                int_lock->lock_op_ret = op_ret; +                        } + +                        local->op_errno              = op_errno; +                        int_lock->lock_op_errno      = op_errno; +                } + +		int_lock->lk_attempted_count++; +        } +        UNLOCK (&frame->lock); + +        if ((op_ret == -1) && +            (op_errno == ENOSYS)) { +                afr_unlock (frame, this); +        } else { +                if (op_ret == 0) { +                        if (local->transaction.type == AFR_ENTRY_TRANSACTION || +                            local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) { +                                int_lock->lockee[lockee_no].locked_nodes[child_index] |= LOCKED_YES; +                                int_lock->lockee[lockee_no].locked_count++; +                                int_lock->entrylk_lock_count++; +                        } else { +                                int_lock->locked_nodes[child_index] |= LOCKED_YES; +       
                         int_lock->lock_count++; +                        } +                } +                afr_lock_blocking (frame, this, cky + 1); +        } + +        return 0; +} + +static int32_t +afr_blocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                          int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ +        AFR_TRACE_INODELK_OUT (frame, this, AFR_INODELK_TRANSACTION, +                               AFR_LOCK_OP, NULL, op_ret, +                               op_errno, (long) cookie); + +        afr_lock_cbk (frame, cookie, this, op_ret, op_errno, xdata); +        return 0; + +} + +static int32_t +afr_blocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                          int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ +        AFR_TRACE_ENTRYLK_OUT (frame, this, AFR_ENTRYLK_TRANSACTION, +                               AFR_LOCK_OP, NULL, op_ret, +                               op_errno, (long)cookie); + +        afr_lock_cbk (frame, cookie, this, op_ret, op_errno, xdata); +        return 0; +} + +static int +afr_copy_locked_nodes (call_frame_t *frame, xlator_t *this) +{ +        afr_internal_lock_t *int_lock = NULL; +        afr_inodelk_t       *inodelk  = NULL; +        afr_local_t         *local    = NULL; +        afr_private_t       *priv     = NULL; + +        priv     = this->private; +        local    = frame->local; +        int_lock = &local->internal_lock; + +        switch (local->transaction.type) { +        case AFR_DATA_TRANSACTION: +        case AFR_METADATA_TRANSACTION: +                inodelk = afr_get_inodelk (int_lock, int_lock->domain); +                memcpy (inodelk->locked_nodes, int_lock->locked_nodes, +                        sizeof (*inodelk->locked_nodes) * priv->child_count); +                inodelk->lock_count = int_lock->lock_count; +                break; + +        case AFR_ENTRY_RENAME_TRANSACTION: +        case AFR_ENTRY_TRANSACTION: +   
             /*entrylk_count is being used in both non-blocking and blocking +                 * modes */ +                break; +        } + +        return 0; + +} + +static inline gf_boolean_t +afr_is_entrylk (afr_internal_lock_t *int_lock, +                afr_transaction_type trans_type) +{ +        gf_boolean_t is_entrylk = _gf_false; + +        if ((int_lock->transaction_lk_type == AFR_SELFHEAL_LK) && +            int_lock->selfheal_lk_type == AFR_ENTRY_SELF_HEAL_LK) { + +                is_entrylk = _gf_true; + +        } else if ((int_lock->transaction_lk_type == AFR_TRANSACTION_LK) && +                 (trans_type == AFR_ENTRY_TRANSACTION || +                  trans_type == AFR_ENTRY_RENAME_TRANSACTION)) { + +                is_entrylk = _gf_true; + +        } else { +                is_entrylk = _gf_false; +        } + +        return is_entrylk; +} + +static gf_boolean_t +_is_lock_wind_needed (afr_local_t *local, int child_index) +{ +        if (!local->child_up[child_index]) +                return _gf_false; + +        return _gf_true; +} + +int +afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie) +{ +        afr_internal_lock_t *int_lock    = NULL; +        afr_inodelk_t       *inodelk     = NULL; +        afr_local_t         *local       = NULL; +        afr_private_t       *priv        = NULL; +        struct gf_flock flock = {0,}; +        uint64_t ctx = 0; +        int ret = 0; +        int child_index = 0; +        int lockee_no   = 0; +        gf_boolean_t is_entrylk = _gf_false; + +        local         = frame->local; +        int_lock      = &local->internal_lock; +        priv          = this->private; +        child_index   = cookie % priv->child_count; +        lockee_no     = cookie / priv->child_count; +        is_entrylk    = afr_is_entrylk (int_lock, local->transaction.type); + + +        if (!is_entrylk) { +                inodelk = afr_get_inodelk (int_lock, int_lock->domain); +                flock.l_start = 
inodelk->flock.l_start; +                flock.l_len   = inodelk->flock.l_len; +                flock.l_type  = inodelk->flock.l_type; +        } + +        if (local->fd) { +                ret = fd_ctx_get (local->fd, this, &ctx); + +                if (ret < 0) { +                        gf_log (this->name, GF_LOG_INFO, +                                "unable to get fd ctx for fd=%p", +                                local->fd); + +                        local->op_ret           = -1; +                        int_lock->lock_op_ret   = -1; + +                        afr_copy_locked_nodes (frame, this); + +                        afr_unlock (frame, this); + +                        return 0; +                } +        } + +        if (int_lock->lk_expected_count == int_lock->lk_attempted_count) { +                if ((is_entrylk && int_lock->entrylk_lock_count == 0) || +                    (!is_entrylk && int_lock->lock_count == 0)) { +                        gf_log (this->name, GF_LOG_INFO, +                                "unable to lock on even one child"); + +                        local->op_ret           = -1; +                        int_lock->lock_op_ret   = -1; + +                        afr_copy_locked_nodes (frame, this); + +                        afr_unlock(frame, this); + +                        return 0; +                } +        } + +        if (int_lock->lk_expected_count == int_lock->lk_attempted_count) { +                /* we're done locking */ + +                gf_log (this->name, GF_LOG_DEBUG, +                        "we're done locking"); + +                afr_copy_locked_nodes (frame, this); + +                int_lock->lock_op_ret = 0; +                int_lock->lock_cbk (frame, this); +                return 0; +        } + +        if (!_is_lock_wind_needed (local, child_index)) { +                afr_lock_blocking (frame, this, cookie + 1); +                return 0; +        } + +        switch (local->transaction.type) { +     
   case AFR_DATA_TRANSACTION: +        case AFR_METADATA_TRANSACTION: + +                if (local->fd) { +                        AFR_TRACE_INODELK_IN (frame, this, +                                              AFR_INODELK_TRANSACTION, +                                              AFR_LOCK_OP, &flock, F_SETLKW, +                                              child_index); + +                        STACK_WIND_COOKIE (frame, afr_blocking_inodelk_cbk, +                                           (void *) (long) child_index, +                                           priv->children[child_index], +                                           priv->children[child_index]->fops->finodelk, +                                           int_lock->domain, local->fd, +                                           F_SETLKW, &flock, NULL); + +                } else { +                        AFR_TRACE_INODELK_IN (frame, this, +                                              AFR_INODELK_TRANSACTION, +                                              AFR_LOCK_OP, &flock, F_SETLKW, +                                              child_index); + +                        STACK_WIND_COOKIE (frame, afr_blocking_inodelk_cbk, +                                           (void *) (long) child_index, +                                           priv->children[child_index], +                                           priv->children[child_index]->fops->inodelk, +                                           int_lock->domain, &local->loc, +                                           F_SETLKW, &flock, NULL); +                } + +                break; + +        case AFR_ENTRY_RENAME_TRANSACTION: +        case AFR_ENTRY_TRANSACTION: +                /*Accounting for child_index increments on 'down' +                 *and 'fd-less' children */ + +                if (local->fd) { +                        AFR_TRACE_ENTRYLK_IN (frame, this, AFR_ENTRYLK_TRANSACTION, +                                              
AFR_LOCK_OP, +                                              int_lock->lockee[lockee_no].basename, +                                              cookie); + +                        STACK_WIND_COOKIE (frame, afr_blocking_entrylk_cbk, +                                           (void *) (long) cookie, +                                           priv->children[child_index], +                                           priv->children[child_index]->fops->fentrylk, +                                           int_lock->domain, local->fd, +                                           int_lock->lockee[lockee_no].basename, +                                           ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL); +                } else { +                        AFR_TRACE_ENTRYLK_IN (frame, this, +                                              AFR_ENTRYLK_TRANSACTION, +                                              AFR_LOCK_OP, local->transaction.basename, +                                              child_index); + +                        STACK_WIND_COOKIE (frame, afr_blocking_entrylk_cbk, +                                           (void *) (long) cookie, +                                           priv->children[child_index], +                                           priv->children[child_index]->fops->entrylk, +                                           int_lock->domain, +                                           &int_lock->lockee[lockee_no].loc, +                                           int_lock->lockee[lockee_no].basename, +                                           ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL); +                } + +                break; +        } + +        return 0; +} + +int32_t +afr_blocking_lock (call_frame_t *frame, xlator_t *this) +{ +        afr_internal_lock_t *int_lock = NULL; +        afr_local_t         *local    = NULL; +        afr_private_t       *priv     = NULL; +        int                  up_count = 0; + +        priv     = this->private; +        
local    = frame->local; +        int_lock = &local->internal_lock; + +        switch (local->transaction.type) { +        case AFR_DATA_TRANSACTION: +        case AFR_METADATA_TRANSACTION: +                initialize_inodelk_variables (frame, this); +                break; + +        case AFR_ENTRY_RENAME_TRANSACTION: +        case AFR_ENTRY_TRANSACTION: +                up_count = afr_up_children_count (local->child_up, +                                                  priv->child_count); +                int_lock->lk_call_count = int_lock->lk_expected_count +                                        = (int_lock->lockee_count * +                                           up_count); +                initialize_entrylk_variables (frame, this); +                break; +        } + +        afr_lock_blocking (frame, this, 0); + +        return 0; +} + +static int32_t +afr_nonblocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                             int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ +        afr_internal_lock_t *int_lock = NULL; +        afr_local_t         *local    = NULL; +        int call_count          = 0; +        int child_index         = (long) cookie; +        int copies              = 0; +        int index               = 0; +        int lockee_no           = 0; +        afr_private_t       *priv = NULL; + +        priv = this->private; + +        copies = priv->child_count; +        index = child_index % copies; +        lockee_no = child_index / copies; + +        local    = frame->local; +        int_lock = &local->internal_lock; + +        AFR_TRACE_ENTRYLK_OUT (frame, this, AFR_ENTRYLK_TRANSACTION, +                               AFR_LOCK_OP, +                               int_lock->lockee[lockee_no].basename, op_ret, +                               op_errno, (long) cookie); + +	LOCK (&frame->lock); +	{ +		if (op_ret < 0 ) { +			if (op_errno == ENOSYS) { +                        /* return ENOTSUP */ +				
gf_log (this->name, GF_LOG_ERROR, +					"subvolume does not support locking. " +					"please load features/locks xlator on server"); +				local->op_ret         = op_ret; +				int_lock->lock_op_ret = op_ret; + +				int_lock->lock_op_errno      = op_errno; +				local->op_errno              = op_errno; +			} +		} else if (op_ret == 0) { +			int_lock->lockee[lockee_no].locked_nodes[index] |= \ +				LOCKED_YES; +			int_lock->lockee[lockee_no].locked_count++; +			int_lock->entrylk_lock_count++; +		} + +                call_count = --int_lock->lk_call_count; +        } +        UNLOCK (&frame->lock); + +        if (call_count == 0) { +                gf_log (this->name, GF_LOG_TRACE, +                        "Last locking reply received"); +                /* all locks successful. Proceed to call FOP */ +                if (int_lock->entrylk_lock_count == +                                int_lock->lk_expected_count) { +                        gf_log (this->name, GF_LOG_TRACE, +                                "All servers locked. Calling the cbk"); +                        int_lock->lock_op_ret = 0; +                        int_lock->lock_cbk (frame, this); +                } +                /* Not all locks were successful. Unlock and try locking +                   again, this time with serially blocking locks */ +                else { +                        gf_log (this->name, GF_LOG_TRACE, +                                "%d servers locked. 
Trying again with blocking calls", +                                int_lock->lock_count); + +                        afr_unlock(frame, this); +                } +        } + +        return 0; +} + +int +afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this) +{ +        afr_internal_lock_t *int_lock   = NULL; +        afr_local_t         *local      = NULL; +        afr_private_t       *priv       = NULL; +        afr_fd_ctx_t        *fd_ctx     = NULL; +        int                 copies      = 0; +        int                 index       = 0; +        int                 lockee_no   = 0; +        int32_t             call_count  = 0; +        int i = 0; + +        local    = frame->local; +        int_lock = &local->internal_lock; +        priv     = this->private; + +        copies = priv->child_count; +        initialize_entrylk_variables (frame, this); + +        if (local->fd) { +                fd_ctx = afr_fd_ctx_get (local->fd, this); +                if (!fd_ctx) { +                        gf_log (this->name, GF_LOG_INFO, +                                "unable to get fd ctx for fd=%p", +                                local->fd); + +                        local->op_ret           = -1; +                        int_lock->lock_op_ret   = -1; +                        local->op_errno         = EINVAL; +                        int_lock->lock_op_errno = EINVAL; + +			afr_unlock (frame, this); +                        return -1; +                } + +                call_count = int_lock->lockee_count * internal_lock_count (frame, this); +                int_lock->lk_call_count = call_count; +                int_lock->lk_expected_count = call_count; + +                if (!call_count) { +                        gf_log (this->name, GF_LOG_INFO, +                                "fd not open on any subvolumes. 
aborting."); +                        afr_unlock (frame, this); +                        goto out; +                } + +                /* Send non-blocking entrylk calls only on up children +                   and where the fd has been opened */ +                for (i = 0; i < int_lock->lockee_count*priv->child_count; i++) { +                        index = i%copies; +                        lockee_no = i/copies; +                        if (local->child_up[index]) { +                                AFR_TRACE_ENTRYLK_IN (frame, this, AFR_ENTRYLK_NB_TRANSACTION, +                                                      AFR_LOCK_OP, +                                                      int_lock->lockee[lockee_no].basename, +                                                      i); + +                                STACK_WIND_COOKIE (frame, afr_nonblocking_entrylk_cbk, +                                                   (void *) (long) i, +                                                   priv->children[index], +                                                   priv->children[index]->fops->fentrylk, +                                                   this->name, local->fd, +                                                   int_lock->lockee[lockee_no].basename, +                                                   ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, +                                                   NULL); +                                if (!--call_count) +                                        break; +                        } +                } +        } else { +                call_count = int_lock->lockee_count * internal_lock_count (frame, this); +                int_lock->lk_call_count = call_count; +                int_lock->lk_expected_count = call_count; + +                for (i = 0; i < int_lock->lockee_count*priv->child_count; i++) { +                        index = i%copies; +                        lockee_no = i/copies; +                        if 
(local->child_up[index]) { +                                AFR_TRACE_ENTRYLK_IN (frame, this, AFR_ENTRYLK_NB_TRANSACTION, +                                                      AFR_LOCK_OP, +                                                      int_lock->lockee[lockee_no].basename, +                                                      i); + +                                STACK_WIND_COOKIE (frame, afr_nonblocking_entrylk_cbk, +                                                   (void *) (long) i, +                                                   priv->children[index], +                                                   priv->children[index]->fops->entrylk, +                                                   this->name, &int_lock->lockee[lockee_no].loc, +                                                   int_lock->lockee[lockee_no].basename, +                                                   ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, +                                                   NULL); + +                                if (!--call_count) +                                        break; +                        } +                } +        } +out: +        return 0; +} + +int32_t +afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                             int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ +        afr_internal_lock_t *int_lock = NULL; +        afr_inodelk_t       *inodelk  = NULL; +        afr_local_t         *local    = NULL; +        int call_count  = 0; +        int child_index = (long) cookie; +        afr_fd_ctx_t        *fd_ctx = NULL; + + +        local    = frame->local; +        int_lock = &local->internal_lock; +        inodelk = afr_get_inodelk (int_lock, int_lock->domain); + +        AFR_TRACE_INODELK_OUT (frame, this, AFR_INODELK_NB_TRANSACTION, +                               AFR_LOCK_OP, NULL, op_ret, +                               op_errno, (long) cookie); + +	if (local->fd) +		fd_ctx = 
afr_fd_ctx_get (local->fd, this); + +        LOCK (&frame->lock); +        { +		if (op_ret < 0) { +			if (op_errno == ENOSYS) { +				/* return ENOTSUP */ +				gf_log (this->name, GF_LOG_ERROR, +					"subvolume does not support locking. " +					"please load features/locks xlator on " +					"server"); +				local->op_ret                = op_ret; +				int_lock->lock_op_ret        = op_ret; +				int_lock->lock_op_errno      = op_errno; +				local->op_errno              = op_errno; +			} +			if (local->transaction.eager_lock) +				local->transaction.eager_lock[child_index] = 0; +		} else { +			inodelk->locked_nodes[child_index] |= LOCKED_YES; +			inodelk->lock_count++; + +			if (local->transaction.eager_lock && +			    local->transaction.eager_lock[child_index] && +			    local->fd) { +				/* piggybacked */ +				if (op_ret == 1) { +					/* piggybacked */ +				} else if (op_ret == 0) { +					/* lock acquired from server */ +                                        fd_ctx->lock_acquired[child_index]++; +				} +			} +		} + +                call_count = --int_lock->lk_call_count; +        } +        UNLOCK (&frame->lock); + +        if (call_count == 0) { +                gf_log (this->name, GF_LOG_TRACE, +                        "Last inode locking reply received"); +                /* all locks successful. Proceed to call FOP */ +                if (inodelk->lock_count == int_lock->lk_expected_count) { +                        gf_log (this->name, GF_LOG_TRACE, +                                "All servers locked. Calling the cbk"); +                        int_lock->lock_op_ret = 0; +                        int_lock->lock_cbk (frame, this); +                } +                /* Not all locks were successful. Unlock and try locking +                   again, this time with serially blocking locks */ +                else { +                        gf_log (this->name, GF_LOG_TRACE, +                                "%d servers locked. 
Trying again with blocking calls", +                                int_lock->lock_count); + +                        afr_unlock(frame, this); +                } +        } + +        return 0; +} + +int +afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this) +{ +        afr_internal_lock_t *int_lock = NULL; +        afr_inodelk_t       *inodelk  = NULL; +        afr_local_t         *local    = NULL; +        afr_private_t       *priv     = NULL; +        afr_fd_ctx_t        *fd_ctx   = NULL; +        int32_t             call_count = 0; +        int                 i          = 0; +        int                 ret        = 0; +        struct              gf_flock flock = {0,}; +        struct              gf_flock full_flock = {0,}; +        struct              gf_flock *flock_use = NULL; +        int                 piggyback = 0; + +        local    = frame->local; +        int_lock = &local->internal_lock; +        priv     = this->private; + +        inodelk = afr_get_inodelk (int_lock, int_lock->domain); + +        flock.l_start = inodelk->flock.l_start; +        flock.l_len   = inodelk->flock.l_len; +        flock.l_type  = inodelk->flock.l_type; + +        full_flock.l_type = inodelk->flock.l_type; + +        initialize_inodelk_variables (frame, this); + +        if (local->fd) { +                fd_ctx = afr_fd_ctx_get (local->fd, this); +                if (!fd_ctx) { +                        gf_log (this->name, GF_LOG_INFO, +                                "unable to get fd ctx for fd=%p", +                                local->fd); + +                        local->op_ret           = -1; +                        int_lock->lock_op_ret   = -1; +                        local->op_errno         = EINVAL; +                        int_lock->lock_op_errno = EINVAL; + +			afr_unlock (frame, this); +                        ret = -1; +                        goto out; +                } + +                call_count = internal_lock_count (frame, this); +     
           int_lock->lk_call_count = call_count; +                int_lock->lk_expected_count = call_count; + +                if (!call_count) { +                        gf_log (this->name, GF_LOG_INFO, +                                "fd not open on any subvolumes. aborting."); +                        afr_unlock (frame, this); +                        goto out; +                } + +                /* Send non-blocking inodelk calls only on up children +                   and where the fd has been opened */ +                for (i = 0; i < priv->child_count; i++) { +                        if (!local->child_up[i]) +                                continue; + +                        flock_use = &flock; +                        if (!local->transaction.eager_lock_on) { +                                goto wind; +                        } + +                        piggyback = 0; +                        local->transaction.eager_lock[i] = 1; + +			afr_set_delayed_post_op (frame, this); + +                        LOCK (&local->fd->lock); +                        { +                                if (fd_ctx->lock_acquired[i]) { +                                        fd_ctx->lock_piggyback[i]++; +                                        piggyback = 1; +                                } +                        } +                        UNLOCK (&local->fd->lock); + +                        if (piggyback) { +                                /* (op_ret == 1) => indicate piggybacked lock */ +                                afr_nonblocking_inodelk_cbk (frame, (void *) (long) i, +                                                             this, 1, 0, NULL); +                                if (!--call_count) +                                        break; +                                continue; +                        } +                        flock_use = &full_flock; +                wind: +                        AFR_TRACE_INODELK_IN (frame, this, +              
                                AFR_INODELK_NB_TRANSACTION, +                                              AFR_LOCK_OP, flock_use, F_SETLK, i); + +                        STACK_WIND_COOKIE (frame, afr_nonblocking_inodelk_cbk, +                                           (void *) (long) i, +                                           priv->children[i], +                                           priv->children[i]->fops->finodelk, +                                           int_lock->domain, local->fd, +                                           F_SETLK, flock_use, NULL); + +                        if (!--call_count) +                                break; +                } +        } else { +                call_count = internal_lock_count (frame, this); +                int_lock->lk_call_count = call_count; +                int_lock->lk_expected_count = call_count; + +                for (i = 0; i < priv->child_count; i++) { +                        if (!local->child_up[i]) +                                continue; +                        AFR_TRACE_INODELK_IN (frame, this, +                                              AFR_INODELK_NB_TRANSACTION, +                                              AFR_LOCK_OP, &flock, F_SETLK, i); + +                        STACK_WIND_COOKIE (frame, afr_nonblocking_inodelk_cbk, +                                           (void *) (long) i, +                                           priv->children[i], +                                           priv->children[i]->fops->inodelk, +                                           int_lock->domain, &local->loc, +                                           F_SETLK, &flock, NULL); + +                        if (!--call_count) +                                break; +                } +        } +out: +        return ret; +} + +int32_t +afr_unlock (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t *local = NULL; + +        local = frame->local; + +        if (transaction_lk_op (local)) { 
+                if (is_afr_lock_transaction (local)) +                        afr_unlock_inodelk (frame, this); +                else +                        afr_unlock_entrylk (frame, this); + +        } else { +                if (is_afr_lock_selfheal (local)) +                        afr_unlock_inodelk (frame, this); +                else +                        afr_unlock_entrylk (frame, this); +        } + +        return 0; +} + +int +afr_mark_locked_nodes (xlator_t *this, fd_t *fd, +                       unsigned char *locked_nodes) +{ +        afr_private_t *priv  = NULL; +        afr_fd_ctx_t  *fdctx = NULL; +        uint64_t       tmp   = 0; +        int            ret   = 0; + +        priv = this->private; + +        ret = afr_fd_ctx_set (this, fd); +        if (ret) +                goto out; + +        ret = fd_ctx_get (fd, this, &tmp); +        if (ret) { +                gf_log (this->name, GF_LOG_INFO, +                        "failed to get the fd ctx"); +                goto out; +        } +        fdctx = (afr_fd_ctx_t *) (long) tmp; + +        GF_ASSERT (fdctx->locked_on); + +        memcpy (fdctx->locked_on, locked_nodes, +                priv->child_count); + +out: +        return ret; +} + +static int +__is_fd_saved (xlator_t *this, fd_t *fd) +{ +        afr_locked_fd_t *locked_fd = NULL; +        afr_private_t   *priv      = NULL; +        int              found     = 0; + +        priv = this->private; + +        list_for_each_entry (locked_fd, &priv->saved_fds, list) { +                if (locked_fd->fd == fd) { +                        found = 1; +                        break; +                } +        } + +        return found; +} + +static int +__afr_save_locked_fd (xlator_t *this, fd_t *fd) +{ +        afr_private_t   *priv      = NULL; +        afr_locked_fd_t *locked_fd = NULL; +        int              ret       = 0; + +        priv = this->private; + +        locked_fd = GF_CALLOC (1, sizeof (*locked_fd), +                 
              gf_afr_mt_locked_fd); +        if (!locked_fd) { +                ret = -1; +                goto out; +        } + +        locked_fd->fd = fd; +        INIT_LIST_HEAD (&locked_fd->list); + +        list_add_tail (&locked_fd->list, &priv->saved_fds); + +out: +        return ret; +} + +int +afr_save_locked_fd (xlator_t *this, fd_t *fd) +{ +        afr_private_t   *priv      = NULL; +        int              ret       = 0; + +        priv = this->private; + +        pthread_mutex_lock (&priv->mutex); +        { +                if (__is_fd_saved (this, fd)) { +                        gf_log (this->name, GF_LOG_DEBUG, +                                "fd=%p already saved", fd); +                        goto unlock; +                } + +                ret = __afr_save_locked_fd (this, fd); +                if (ret) { +                        gf_log (this->name, GF_LOG_INFO, +                                "fd=%p could not be saved", fd); +                        goto unlock; +                } +        } +unlock: +        pthread_mutex_unlock (&priv->mutex); + +        return ret; +} + +static int +afr_lock_recovery_cleanup (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t     *local     = NULL; +        afr_locked_fd_t *locked_fd = NULL; + +        local = frame->local; + +        locked_fd = local->locked_fd; + +        STACK_DESTROY (frame->root); +        afr_local_cleanup (local, this); + +        afr_save_locked_fd (this, locked_fd->fd); + +        return 0; + +} + +static int +afr_get_source_lock_recovery (xlator_t *this, fd_t *fd) +{ +        afr_fd_ctx_t  *fdctx        = NULL; +        afr_private_t *priv         = NULL; +        uint64_t      tmp           = 0; +        int           i             = 0; +        int           source_child  = -1; +        int           ret           = 0; + +        priv = this->private; + +        ret = fd_ctx_get (fd, this, &tmp); +        if (ret) +                goto out; + +        fdctx = 
(afr_fd_ctx_t *) (long) tmp; + +        for (i = 0; i < priv->child_count; i++) { +                if (fdctx->locked_on[i]) { +                        gf_log (this->name, GF_LOG_DEBUG, +                                "Found lock recovery source=%d", i); +                        source_child = i; +                        break; +                } +        } + +out: +        return source_child; + +} + +int32_t +afr_get_locks_fd_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                      int32_t op_ret, int32_t op_errno, struct gf_flock *lock, +                      dict_t *xdata); +int32_t +afr_recover_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                      int32_t op_ret, int32_t op_errno, struct gf_flock *lock, +                      dict_t *xdata) +{ +        afr_local_t   *local = NULL; +        afr_private_t *priv  = NULL; +        int32_t source_child = 0; +        struct gf_flock flock   = {0,}; + +        local = frame->local; +        priv  = this->private; + +        if (op_ret) { +                gf_log (this->name, GF_LOG_INFO, +                        "lock recovery failed"); +                goto cleanup; +        } + +        source_child = local->source_child; + +        memcpy (&flock, lock, sizeof (*lock)); + +        STACK_WIND_COOKIE (frame, afr_get_locks_fd_cbk, +                           (void *) (long) source_child, +                           priv->children[source_child], +                           priv->children[source_child]->fops->lk, +                           local->fd, F_GETLK_FD, &flock, NULL); + +        return 0; + +cleanup: +        afr_lock_recovery_cleanup (frame, this); +        return 0; +} + +int +afr_recover_lock (call_frame_t *frame, xlator_t *this, +                  struct gf_flock *flock) +{ +        afr_local_t   *local             = NULL; +        afr_private_t *priv              = NULL; +        int32_t      lock_recovery_child = 0; + +        priv  = this->private; +   
     local = frame->local; + +        lock_recovery_child = local->lock_recovery_child; + +        frame->root->lk_owner = flock->l_owner; + +        STACK_WIND_COOKIE (frame, afr_recover_lock_cbk, +                           (void *) (long) lock_recovery_child, +                           priv->children[lock_recovery_child], +                           priv->children[lock_recovery_child]->fops->lk, +                           local->fd, F_SETLK, flock, NULL); + +        return 0; +} + +static int +is_afr_lock_eol (struct gf_flock *lock) +{ +        int ret = 0; + +        if ((lock->l_type == GF_LK_EOL)) +                ret = 1; + +        return ret; +} + +int32_t +afr_get_locks_fd_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                      int32_t op_ret, int32_t op_errno, struct gf_flock *lock, +                      dict_t *xdata) +{ +        if (op_ret) { +                gf_log (this->name, GF_LOG_INFO, +                        "Failed to get locks on fd"); +                goto cleanup; +        } + +        gf_log (this->name, GF_LOG_DEBUG, +                "Got a lock on fd"); + +        if (is_afr_lock_eol (lock)) { +                gf_log (this->name, GF_LOG_INFO, +                        "Reached EOL on locks on fd"); +                goto cleanup; +        } + +        afr_recover_lock (frame, this, lock); + +        return 0; + +cleanup: +        afr_lock_recovery_cleanup (frame, this); + +        return 0; +} + +static int +afr_lock_recovery (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t   *local        = NULL; +        afr_private_t *priv         = NULL; +        fd_t          *fd           = NULL; +        int            ret          = 0; +        int32_t        source_child = 0; +        struct gf_flock   flock        = {0,}; + +        priv  = this->private; +        local = frame->local; + +        fd = local->fd; + +        source_child = afr_get_source_lock_recovery (this, fd); +        if (source_child < 0) 
{ +                gf_log (this->name, GF_LOG_ERROR, +                        "Could not recover locks due to lock " +                        "split brain"); +                ret = -1; +                goto out; +        } + +        local->source_child = source_child; + +        /* the flock can be zero filled as we're querying incrementally +           the locks held on the fd. +        */ +        STACK_WIND_COOKIE (frame, afr_get_locks_fd_cbk, +                           (void *) (long) source_child, +                           priv->children[source_child], +                           priv->children[source_child]->fops->lk, +                           local->fd, F_GETLK_FD, &flock, NULL); + +out: +        return ret; +} + + +static int +afr_mark_fd_opened (xlator_t *this, fd_t *fd, int32_t child_index) +{ +        afr_fd_ctx_t *fdctx = NULL; +        uint64_t      tmp   = 0; +        int           ret   = 0; + +        ret = fd_ctx_get (fd, this, &tmp); +        if (ret) +                goto out; + +        fdctx = (afr_fd_ctx_t *) (long) tmp; + +        fdctx->opened_on[child_index] = AFR_FD_OPENED; + +out: +        return ret; +} + +int32_t +afr_lock_recovery_preopen_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                               int32_t op_ret, int32_t op_errno, fd_t *fd, +                               dict_t *xdata) +{ +        int32_t child_index = (long )cookie; +        int ret = 0; + +        if (op_ret) { +                gf_log (this->name, GF_LOG_INFO, +                        "Reopen during lock-recovery failed"); +                goto cleanup; +        } + +        gf_log (this->name, GF_LOG_DEBUG, +                "Open succeeded => proceed to recover locks"); + +        ret = afr_lock_recovery (frame, this); +        if (ret) { +                gf_log (this->name, GF_LOG_INFO, +                        "Lock recovery failed"); +                goto cleanup; +        } + +        ret = afr_mark_fd_opened (this, fd, 
child_index); +        if (ret) { +                gf_log (this->name, GF_LOG_INFO, +                        "Marking fd open failed"); +                goto cleanup; +        } + +        return 0; + +cleanup: +        afr_lock_recovery_cleanup (frame, this); +        return 0; +} + +static int +afr_lock_recovery_preopen (call_frame_t *frame, xlator_t *this) +{ +        afr_private_t *priv  = NULL; +        afr_local_t   *local = NULL; +        uint64_t       tmp   = 0; +        afr_fd_ctx_t  *fdctx = NULL; +        loc_t          loc   = {0,}; +        int32_t        child_index = 0; +        int            ret   = 0; + +        priv  = this->private; +        local = frame->local; + +        GF_ASSERT (local && local->fd); + +        ret = fd_ctx_get (local->fd, this, &tmp); +        if (ret) +                gf_log (this->name, GF_LOG_WARNING, +                        "%s: failed to get the context of fd", +                        uuid_utoa (local->fd->inode->gfid)); +        fdctx = (afr_fd_ctx_t *) (long) tmp; +        /* TODO: instead we should return from the function */ +        GF_ASSERT (fdctx); + +        child_index = local->lock_recovery_child; + +        inode_path (local->fd->inode, NULL, (char **)&loc.path); +        loc.name   = strrchr (loc.path, '/'); +        loc.inode  = inode_ref (local->fd->inode); +        loc.parent = inode_parent (local->fd->inode, 0, NULL); + + +        STACK_WIND_COOKIE (frame, afr_lock_recovery_preopen_cbk, +                           (void *)(long) child_index, +                           priv->children[child_index], +                           priv->children[child_index]->fops->open, +                           &loc, fdctx->flags, local->fd, NULL); + +        return 0; +} + +static int +is_fd_opened (fd_t *fd, int32_t child_index) +{ +        afr_fd_ctx_t *fdctx = NULL; +        uint64_t      tmp = 0; +        int           ret = 0; + +        ret = fd_ctx_get (fd, THIS, &tmp); +        if (ret) +                goto 
out; + +        fdctx = (afr_fd_ctx_t *) (long) tmp; + +        if (fdctx->opened_on[child_index] == AFR_FD_OPENED) +                ret = 1; + +out: +        return ret; +} + +int +afr_attempt_lock_recovery (xlator_t *this, int32_t child_index) +{ +        call_frame_t    *frame      = NULL; +        afr_private_t   *priv       = NULL; +        afr_local_t     *local      = NULL; +        afr_locked_fd_t *locked_fd  = NULL; +        afr_locked_fd_t  *tmp       = NULL; +        int              ret        = -1; +        struct list_head locks_list = {0,}; +        int32_t          op_errno   = 0; + + +        priv = this->private; + +        if (list_empty (&priv->saved_fds)) +                goto out; + +        frame = create_frame (this, this->ctx->pool); +        if (!frame) { +                ret = -1; +                goto out; +        } + +        AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); +        local = frame->local; +        ret = afr_local_init (local, priv, &op_errno); +        if (ret < 0) { +                ret = -1; +                goto out; +        } + +        frame->local = local; + +        INIT_LIST_HEAD (&locks_list); + +        pthread_mutex_lock (&priv->mutex); +        { +                list_splice_init (&priv->saved_fds, &locks_list); +        } +        pthread_mutex_unlock (&priv->mutex); + +        list_for_each_entry_safe (locked_fd, tmp, +                                  &locks_list, list) { + +                list_del_init (&locked_fd->list); + +                local->fd                  = fd_ref (locked_fd->fd); +                local->lock_recovery_child = child_index; +                local->locked_fd           = locked_fd; + +                if (!is_fd_opened (locked_fd->fd, child_index)) { +                        gf_log (this->name, GF_LOG_DEBUG, +                                "attempting open before lock " +                                "recovery"); +                        afr_lock_recovery_preopen (frame, this); + 
               } else { +                        gf_log (this->name, GF_LOG_DEBUG, +                                "attempting lock recovery " +                                "without a preopen"); +                        afr_lock_recovery (frame, this); +                } +        } + +out: +        if ((ret < 0) && frame) +                AFR_STACK_DESTROY (frame); +        return ret; +} + +int +afr_lk_transfer_datalock (call_frame_t *dst, call_frame_t *src, char *dom, +                          unsigned int child_count) +{ +        afr_local_t         *dst_local   = NULL; +        afr_local_t         *src_local   = NULL; +        afr_internal_lock_t *dst_lock    = NULL; +        afr_internal_lock_t *src_lock    = NULL; +        afr_inodelk_t       *dst_inodelk = NULL; +        afr_inodelk_t       *src_inodelk = NULL; +        int                 ret = -1; + +        src_local = src->local; +        src_lock  = &src_local->internal_lock; +        src_inodelk = afr_get_inodelk (src_lock, dom); +        dst_local = dst->local; +        dst_lock  = &dst_local->internal_lock; +        dst_inodelk = afr_get_inodelk (dst_lock, dom); +        if (!dst_inodelk || !src_inodelk) +                goto out; +        if (src_inodelk->locked_nodes) { +                memcpy (dst_inodelk->locked_nodes, src_inodelk->locked_nodes, +                        sizeof (*dst_inodelk->locked_nodes) * child_count); +                memset (src_inodelk->locked_nodes, 0, +                        sizeof (*src_inodelk->locked_nodes) * child_count); +        } + +        dst_lock->transaction_lk_type = src_lock->transaction_lk_type; +        dst_lock->selfheal_lk_type    = src_lock->selfheal_lk_type; +        dst_inodelk->lock_count = src_inodelk->lock_count; +        src_inodelk->lock_count = 0; +        ret = 0; +out: +        return ret; +} diff --git a/xlators/cluster/afr-v1/src/afr-mem-types.h b/xlators/cluster/afr-v1/src/afr-mem-types.h new file mode 100644 index 000000000..73594f265 
--- /dev/null +++ b/xlators/cluster/afr-v1/src/afr-mem-types.h @@ -0,0 +1,51 @@ +/* +  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> +  This file is part of GlusterFS. + +  This file is licensed to you under your choice of the GNU Lesser +  General Public License, version 3 or any later version (LGPLv3 or +  later), or the GNU General Public License, version 2 (GPLv2), in all +  cases as published by the Free Software Foundation. +*/ + + +#ifndef __AFR_MEM_TYPES_H__ +#define __AFR_MEM_TYPES_H__ + +#include "mem-types.h" + +enum gf_afr_mem_types_ { +        gf_afr_mt_iovec  = gf_common_mt_end + 1, +        gf_afr_mt_afr_fd_ctx_t, +        gf_afr_mt_afr_private_t, +        gf_afr_mt_int32_t, +        gf_afr_mt_char, +        gf_afr_mt_xattr_key, +        gf_afr_mt_dict_t, +        gf_afr_mt_xlator_t, +        gf_afr_mt_iatt, +        gf_afr_mt_int, +        gf_afr_mt_afr_node_character, +        gf_afr_mt_sh_diff_loop_state, +        gf_afr_mt_uint8_t, +        gf_afr_mt_loc_t, +        gf_afr_mt_entry_name, +        gf_afr_mt_pump_priv, +        gf_afr_mt_locked_fd, +        gf_afr_mt_inode_ctx_t, +        gf_afr_fd_paused_call_t, +        gf_afr_mt_crawl_data_t, +        gf_afr_mt_brick_pos_t, +        gf_afr_mt_shd_bool_t, +        gf_afr_mt_shd_timer_t, +        gf_afr_mt_shd_event_t, +        gf_afr_mt_time_t, +        gf_afr_mt_pos_data_t, +        gf_afr_mt_reply_t, +        gf_afr_mt_stats_t, +        gf_afr_mt_shd_crawl_event_t, +        gf_afr_mt_uint64_t, +        gf_afr_mt_end +}; +#endif + diff --git a/xlators/cluster/afr-v1/src/afr-open.c b/xlators/cluster/afr-v1/src/afr-open.c new file mode 100644 index 000000000..643a5d692 --- /dev/null +++ b/xlators/cluster/afr-v1/src/afr-open.c @@ -0,0 +1,382 @@ +/* +  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> +  This file is part of GlusterFS. 
+ +  This file is licensed to you under your choice of the GNU Lesser +  General Public License, version 3 or any later version (LGPLv3 or +  later), or the GNU General Public License, version 2 (GPLv2), in all +  cases as published by the Free Software Foundation. +*/ + +#include <libgen.h> +#include <unistd.h> +#include <fnmatch.h> +#include <sys/time.h> +#include <stdlib.h> +#include <signal.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "afr.h" +#include "dict.h" +#include "xlator.h" +#include "hashfn.h" +#include "logging.h" +#include "stack.h" +#include "list.h" +#include "call-stub.h" +#include "defaults.h" +#include "common-utils.h" +#include "compat-errno.h" +#include "compat.h" +#include "byte-order.h" +#include "statedump.h" + +#include "fd.h" + +#include "afr-inode-read.h" +#include "afr-inode-write.h" +#include "afr-dir-read.h" +#include "afr-dir-write.h" +#include "afr-transaction.h" +#include "afr-self-heal.h" +#include "afr-self-heal-common.h" + +int +afr_stale_child_up (afr_local_t *local, xlator_t *this) +{ +        int             i = 0; +        afr_private_t   *priv = NULL; +        int             up = -1; + +        priv = this->private; + +        if (!local->fresh_children) +                local->fresh_children = afr_children_create (priv->child_count); +        if (!local->fresh_children) +                goto out; + +        afr_inode_get_read_ctx (this, local->fd->inode, local->fresh_children); +        if (priv->child_count == afr_get_children_count (local->fresh_children, +                                                         priv->child_count)) +                goto out; + +        for (i = 0; i < priv->child_count; i++) { +                if (!local->child_up[i]) +                        continue; +                if (afr_is_child_present (local->fresh_children, +                                          priv->child_count, i)) +                        continue; +        
        up = i; +                break; +        } +out: +        return up; +} + +void +afr_perform_data_self_heal (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t     *local = NULL; +        afr_self_heal_t *sh = NULL; +        inode_t         *inode = NULL; +        int             st_child = -1; +        char            reason[64] = {0}; + +        local = frame->local; +        sh = &local->self_heal; +        inode = local->fd->inode; + +        if (!IA_ISREG (inode->ia_type)) +                goto out; + +        st_child = afr_stale_child_up (local, this); +        if (st_child < 0) +                goto out; + +        sh->do_data_self_heal          = _gf_true; +        sh->do_metadata_self_heal      = _gf_true; +        sh->do_gfid_self_heal          = _gf_true; +        sh->do_missing_entry_self_heal = _gf_true; + +        snprintf (reason, sizeof (reason), "stale subvolume %d detected", +                  st_child); +        afr_launch_self_heal (frame, this, inode, _gf_true, inode->ia_type, +                              reason, NULL, NULL); +out: +        return; +} + +int +afr_open_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                        int32_t op_ret, int32_t op_errno, struct iatt *prebuf, +                        struct iatt *postbuf, dict_t *xdata) +{ +        afr_local_t * local = frame->local; +        afr_private_t *priv = NULL; + +        priv = this->private; +        if (afr_open_only_data_self_heal (priv->data_self_heal)) +                afr_perform_data_self_heal (frame, this); +        AFR_STACK_UNWIND (open, frame, local->op_ret, local->op_errno, +                          local->fd, xdata); +        return 0; +} + + +int +afr_open_cbk (call_frame_t *frame, void *cookie, +              xlator_t *this, int32_t op_ret, int32_t op_errno, +              fd_t *fd, dict_t *xdata) +{ +        afr_local_t *  local       = NULL; +        int            ret         = 0; +        int            
call_count  = -1; +        int            child_index = (long) cookie; +        afr_private_t *priv        = NULL; + +        priv = this->private; +        local = frame->local; + +        LOCK (&frame->lock); +        { +                if (op_ret == -1) { +                        local->op_errno = op_errno; +                } + +                if (op_ret >= 0) { +                        local->op_ret = op_ret; +                        local->success_count++; + +                        ret = afr_child_fd_ctx_set (this, fd, child_index, +                                                    local->cont.open.flags); +                        if (ret) { +                                local->op_ret = -1; +                                local->op_errno = -ret; +                                goto unlock; +                        } +                } +        } +unlock: +        UNLOCK (&frame->lock); + +        call_count = afr_frame_return (frame); + +        if (call_count == 0) { +                if ((local->cont.open.flags & O_TRUNC) +                    && (local->op_ret >= 0)) { +                        STACK_WIND (frame, afr_open_ftruncate_cbk, +                                    this, this->fops->ftruncate, +                                    fd, 0, NULL); +                } else { +                        if (afr_open_only_data_self_heal (priv->data_self_heal)) +                                afr_perform_data_self_heal (frame, this); +                        AFR_STACK_UNWIND (open, frame, local->op_ret, +                                          local->op_errno, local->fd, xdata); +                } +        } + +        return 0; +} + +int +afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, +          fd_t *fd, dict_t *xdata) +{ +        afr_private_t * priv       = NULL; +        afr_local_t *   local      = NULL; +        int             i          = 0; +        int             ret        = -1; +        int32_t         
call_count = 0; +        int32_t         op_errno   = 0; +        int32_t         wind_flags = flags & (~O_TRUNC); +        //We can't let truncation to happen outside transaction. + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (this->private, out); +        VALIDATE_OR_GOTO (loc, out); + +        priv = this->private; + +        if (flags & (O_CREAT|O_TRUNC)) { +                QUORUM_CHECK(open,out); +        } + +        if (afr_is_split_brain (this, loc->inode)) { +                /* self-heal failed */ +                gf_log (this->name, GF_LOG_WARNING, +                        "failed to open as split brain seen, returning EIO"); +                op_errno = EIO; +                goto out; +        } + +        AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); +        local = frame->local; + +        ret = afr_local_init (local, priv, &op_errno); +        if (ret < 0) +                goto out; + +        call_count   = local->call_count; +        loc_copy (&local->loc, loc); + +        local->cont.open.flags   = flags; + +        local->fd = fd_ref (fd); + +        for (i = 0; i < priv->child_count; i++) { +                if (local->child_up[i]) { +                        STACK_WIND_COOKIE (frame, afr_open_cbk, (void *) (long) i, +                                           priv->children[i], +                                           priv->children[i]->fops->open, +                                           loc, wind_flags, fd, xdata); + +                        if (!--call_count) +                                break; +                } +        } + +        ret = 0; +out: +        if (ret < 0) +                AFR_STACK_UNWIND (open, frame, -1, op_errno, fd, xdata); + +        return 0; +} + +int +afr_openfd_fix_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                         int32_t op_ret, int32_t op_errno, fd_t *fd, +                         dict_t *xdata) +{ +        
afr_local_t   *local      = NULL; +        afr_private_t *priv       = NULL; +        afr_fd_ctx_t  *fd_ctx     = NULL; +        int           call_count  = 0; +        int           child_index = (long) cookie; + +        priv     = this->private; +        local    = frame->local; + +        if (op_ret >= 0) { +                gf_log (this->name, GF_LOG_DEBUG, "fd for %s opened " +                        "successfully on subvolume %s", local->loc.path, +                        priv->children[child_index]->name); +        } else { +                gf_log (this->name, GF_LOG_ERROR, "Failed to open %s " +                        "on subvolume %s", local->loc.path, +                        priv->children[child_index]->name); +        } + +        fd_ctx = afr_fd_ctx_get (local->fd, this); +        if (!fd_ctx) { +                gf_log (this->name, GF_LOG_WARNING, +                        "failed to get fd context, %p", local->fd); +                goto out; +        } + +        LOCK (&local->fd->lock); +        { +                if (op_ret >= 0) { +                        fd_ctx->opened_on[child_index] = AFR_FD_OPENED; +                } else { +                        fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED; +                } +        } +        UNLOCK (&local->fd->lock); +out: +        call_count = afr_frame_return (frame); +        if (call_count == 0) +                AFR_STACK_DESTROY (frame); + +        return 0; +} + +void +afr_fix_open (xlator_t *this, fd_t *fd, size_t need_open_count, int *need_open) +{ +        afr_private_t *priv    = NULL; +        int           i        = 0; +        call_frame_t  *frame   = NULL; +        afr_local_t   *local   = NULL; +        int           ret      = -1; +        int32_t       op_errno = 0; +        afr_fd_ctx_t  *fd_ctx  = NULL; + +        priv  = this->private; + +        if (!afr_is_fd_fixable (fd) || !need_open || !need_open_count) +                goto out; + +        fd_ctx = afr_fd_ctx_get (fd, 
this); +        if (!fd_ctx) { +                ret = -1; +                goto out; +        } + +        frame = create_frame (this, this->ctx->pool); +        if (!frame) { +                ret = -1; +                goto out; +        } + +        AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); +        local = frame->local; +        ret = afr_local_init (local, priv, &op_errno); +        if (ret < 0) +                goto out; + +        local->loc.inode = inode_ref (fd->inode); +        ret = loc_path (&local->loc, NULL); +        if (ret < 0) +                goto out; + +        local->fd = fd_ref (fd); +        local->call_count = need_open_count; + +        gf_log (this->name, GF_LOG_DEBUG, "need open count: %zd", +                need_open_count); + +        for (i = 0; i < priv->child_count; i++) { +                if (!need_open[i]) +                        continue; + +                if (IA_IFDIR == fd->inode->ia_type) { +                        gf_log (this->name, GF_LOG_DEBUG, +                                "opening fd for dir %s on subvolume %s", +                                local->loc.path, priv->children[i]->name); + +                        STACK_WIND_COOKIE (frame, afr_openfd_fix_open_cbk, +                                           (void*) (long) i, +                                           priv->children[i], +                                           priv->children[i]->fops->opendir, +                                           &local->loc, local->fd, +                                           NULL); +                } else { +                        gf_log (this->name, GF_LOG_DEBUG, +                                "opening fd for file %s on subvolume %s", +                                local->loc.path, priv->children[i]->name); + +                        STACK_WIND_COOKIE (frame, afr_openfd_fix_open_cbk, +                                           (void *)(long) i, +                                           priv->children[i], +    
                                       priv->children[i]->fops->open, +                                           &local->loc, +                                           fd_ctx->flags & (~O_TRUNC), +                                           local->fd, NULL); +                } + +        } +        op_errno = 0; +        ret = 0; +out: +        if (op_errno) +                ret = -1; //For handling ALLOC_OR_GOTO +        if (ret && frame) +                AFR_STACK_DESTROY (frame); +} diff --git a/xlators/cluster/afr-v1/src/afr-self-heal-algorithm.c b/xlators/cluster/afr-v1/src/afr-self-heal-algorithm.c new file mode 100644 index 000000000..83846f152 --- /dev/null +++ b/xlators/cluster/afr-v1/src/afr-self-heal-algorithm.c @@ -0,0 +1,837 @@ +/* +  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> +  This file is part of GlusterFS. + +  This file is licensed to you under your choice of the GNU Lesser +  General Public License, version 3 or any later version (LGPLv3 or +  later), or the GNU General Public License, version 2 (GPLv2), in all +  cases as published by the Free Software Foundation. 
+*/ + + +#include <openssl/md5.h> +#include "glusterfs.h" +#include "afr.h" +#include "xlator.h" +#include "dict.h" +#include "xlator.h" +#include "hashfn.h" +#include "logging.h" +#include "stack.h" +#include "list.h" +#include "call-stub.h" +#include "defaults.h" +#include "common-utils.h" +#include "compat-errno.h" +#include "compat.h" +#include "byte-order.h" + +#include "afr-transaction.h" +#include "afr-self-heal.h" +#include "afr-self-heal-common.h" +#include "afr-self-heal-algorithm.h" + +/* +  This file contains the various self-heal algorithms +*/ + +static int +sh_loop_driver (call_frame_t *sh_frame, xlator_t *this, +                gf_boolean_t is_first_call, call_frame_t *old_loop_frame); +static int +sh_loop_return (call_frame_t *sh_frame, xlator_t *this, call_frame_t *loop_frame, +                int32_t op_ret, int32_t op_errno); +static int +sh_destroy_frame (call_frame_t *frame, xlator_t *this) +{ +        if (!frame) +                goto out; + +        AFR_STACK_DESTROY (frame); +out: +        return 0; +} + +static void +sh_private_cleanup (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t             *local   = NULL; +        afr_self_heal_t         *sh      = NULL; +        afr_sh_algo_private_t   *sh_priv = NULL; + +        local = frame->local; +        sh    = &local->self_heal; + +        sh_priv = sh->private; +        GF_FREE (sh_priv); +} + +static int +sh_number_of_writes_needed (unsigned char *write_needed, int child_count) +{ +        int writes = 0; +        int i      = 0; + +        for (i = 0; i < child_count; i++) { +                if (write_needed[i]) +                        writes++; +        } + +        return writes; +} + + +static int +sh_loop_driver_done (call_frame_t *sh_frame, xlator_t *this, +                     call_frame_t *last_loop_frame) +{ +        afr_local_t             *local        = NULL; +        afr_self_heal_t         *sh           = NULL; +        afr_sh_algo_private_t   *sh_priv      = 
NULL; +        int32_t                 total_blocks = 0; +        int32_t                 diff_blocks  = 0; + +        local        = sh_frame->local; +        sh           = &local->self_heal; +        sh_priv      = sh->private; +        if (sh_priv) { +                total_blocks = sh_priv->total_blocks; +                diff_blocks  = sh_priv->diff_blocks; +        } + +        sh_private_cleanup (sh_frame, this); +        if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { +                GF_ASSERT (!last_loop_frame); +                //loop_finish should have happened and the old_loop should be NULL +                gf_log (this->name, GF_LOG_DEBUG, +                        "self-heal aborting on %s", +                        local->loc.path); + +                local->self_heal.algo_abort_cbk (sh_frame, this); +        } else { +                GF_ASSERT (last_loop_frame); +                if (diff_blocks == total_blocks) { +                        gf_log (this->name, GF_LOG_DEBUG, "full self-heal " +                                "completed on %s",local->loc.path); +                } else { +                        gf_log (this->name, GF_LOG_DEBUG, +                                "diff self-heal on %s: completed. 
" +                                "(%d blocks of %d were different (%.2f%%))", +                                local->loc.path, diff_blocks, total_blocks, +                                ((diff_blocks * 1.0)/total_blocks) * 100); +                } + +                sh->old_loop_frame = last_loop_frame; +                local->self_heal.algo_completion_cbk (sh_frame, this); +        } + +        return 0; +} + +int +sh_loop_finish (call_frame_t *loop_frame, xlator_t *this) +{ +        afr_local_t             *loop_local = NULL; +        afr_self_heal_t         *loop_sh = NULL; + +        if (!loop_frame) +                goto out; + +        loop_local = loop_frame->local; +        if (loop_local) { +                loop_sh = &loop_local->self_heal; +        } + +        if (loop_sh && loop_sh->data_lock_held) { +                afr_sh_data_unlock (loop_frame, this, this->name, +                                    sh_destroy_frame); +        } else { +                sh_destroy_frame (loop_frame, this); +        } +out: +        return 0; +} + +static int +sh_loop_lock_success (call_frame_t *loop_frame, xlator_t *this) +{ +        afr_local_t                 *loop_local = NULL; +        afr_self_heal_t             *loop_sh    = NULL; + +        loop_local = loop_frame->local; +        loop_sh = &loop_local->self_heal; + +        sh_loop_finish (loop_sh->old_loop_frame, this); +        loop_sh->old_loop_frame = NULL; + +        gf_log (this->name, GF_LOG_DEBUG, "Acquired lock for range %"PRIu64 +                " %"PRIu64, loop_sh->offset, loop_sh->block_size); +        loop_sh->data_lock_held = _gf_true; +        loop_sh->sh_data_algo_start (loop_frame, this); +        return 0; +} + +static int +sh_loop_lock_failure (call_frame_t *loop_frame, xlator_t *this) +{ +        call_frame_t                *sh_frame = NULL; +        afr_local_t                 *loop_local = NULL; +        afr_self_heal_t             *loop_sh    = NULL; + +        loop_local = 
loop_frame->local; +        loop_sh = &loop_local->self_heal; +        sh_frame = loop_sh->sh_frame; + +        gf_log (this->name, GF_LOG_ERROR, "failed lock for range %"PRIu64 +                " %"PRIu64, loop_sh->offset, loop_sh->block_size); +        sh_loop_finish (loop_sh->old_loop_frame, this); +        loop_sh->old_loop_frame = NULL; +        sh_loop_return (sh_frame, this, loop_frame, -1, ENOTCONN); +        return 0; +} + +static int +sh_loop_frame_create (call_frame_t *sh_frame, xlator_t *this, +                      call_frame_t *old_loop_frame, call_frame_t **loop_frame) +{ +        call_frame_t                *new_loop_frame = NULL; +        afr_local_t                 *local          = NULL; +        afr_self_heal_t             *sh             = NULL; +        afr_local_t                 *new_loop_local = NULL; +        afr_self_heal_t             *new_loop_sh    = NULL; +        afr_private_t               *priv           = NULL; + +        GF_ASSERT (sh_frame); +        GF_ASSERT (loop_frame); + +        *loop_frame = NULL; +        local   = sh_frame->local; +        sh      = &local->self_heal; +        priv    = this->private; + +        new_loop_frame = copy_frame (sh_frame); +        if (!new_loop_frame) +                goto out; +        //We want the frame to have same lk_owner as sh_frame +        //so that locks translator allows conflicting locks +        new_loop_local = afr_self_heal_local_init (local, this); +        if (!new_loop_local) +                goto out; +        new_loop_frame->local = new_loop_local; + +        new_loop_sh = &new_loop_local->self_heal; +        new_loop_sh->sources = memdup (sh->sources, +                                       priv->child_count * sizeof (*sh->sources)); +        if (!new_loop_sh->sources) +                goto out; +        new_loop_sh->write_needed = GF_CALLOC (priv->child_count, +                                               sizeof (*new_loop_sh->write_needed), +                         
                      gf_afr_mt_char); +        if (!new_loop_sh->write_needed) +                goto out; +        new_loop_sh->checksum = GF_CALLOC (priv->child_count, MD5_DIGEST_LENGTH, +                                           gf_afr_mt_uint8_t); +        if (!new_loop_sh->checksum) +                goto out; +        new_loop_sh->inode      = inode_ref (sh->inode); +        new_loop_sh->sh_data_algo_start = sh->sh_data_algo_start; +        new_loop_sh->source = sh->source; +        new_loop_sh->active_sinks = sh->active_sinks; +        new_loop_sh->healing_fd = fd_ref (sh->healing_fd); +        new_loop_sh->file_has_holes = sh->file_has_holes; +        new_loop_sh->old_loop_frame = old_loop_frame; +        new_loop_sh->sh_frame = sh_frame; +        *loop_frame = new_loop_frame; +        return 0; +out: +        sh_destroy_frame (new_loop_frame, this); +        return -ENOMEM; +} + +static int +sh_loop_start (call_frame_t *sh_frame, xlator_t *this, off_t offset, +               call_frame_t *old_loop_frame) +{ +        call_frame_t                *new_loop_frame = NULL; +        afr_local_t                 *local          = NULL; +        afr_self_heal_t             *sh             = NULL; +        afr_local_t                 *new_loop_local = NULL; +        afr_self_heal_t             *new_loop_sh    = NULL; +        int                         ret             = 0; + +        GF_ASSERT (sh_frame); + +        local   = sh_frame->local; +        sh      = &local->self_heal; + +        ret = sh_loop_frame_create (sh_frame, this, old_loop_frame, +                                    &new_loop_frame); +        if (ret) +                goto out; +        new_loop_local = new_loop_frame->local; +        new_loop_sh = &new_loop_local->self_heal; +        new_loop_sh->offset = offset; +        new_loop_sh->block_size = sh->block_size; +        afr_sh_data_lock (new_loop_frame, this, offset, new_loop_sh->block_size, +                          _gf_true, this->name, 
sh_loop_lock_success, sh_loop_lock_failure); +        return 0; +out: +        afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); +        if (old_loop_frame) +                sh_loop_finish (old_loop_frame, this); +        sh_loop_return (sh_frame, this, new_loop_frame, -1, ENOMEM); +        return 0; +} + +static int +sh_loop_driver (call_frame_t *sh_frame, xlator_t *this, +                gf_boolean_t is_first_call, call_frame_t *old_loop_frame) +{ +        afr_local_t *               local          = NULL; +        afr_self_heal_t *           sh             = NULL; +        afr_sh_algo_private_t       *sh_priv        = NULL; +        gf_boolean_t                is_driver_done = _gf_false; +        blksize_t                   block_size     = 0; +        int                         loop           = 0; +        off_t                       offset         = 0; +        afr_private_t               *priv          = NULL; + +        priv    = this->private; +        local   = sh_frame->local; +        sh      = &local->self_heal; +        sh_priv = sh->private; + +        LOCK (&sh_priv->lock); +        { +                if (!is_first_call) +                        sh_priv->loops_running--; +                offset = sh_priv->offset; +                block_size = sh->block_size; +                while ((!sh->eof_reached) && +                       (!is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) && +                     (sh_priv->loops_running < priv->data_self_heal_window_size) +                       && (sh_priv->offset < sh->file_size)) { + +                        loop++; +                        sh_priv->offset += block_size; +                        sh_priv->loops_running++; + +                        if (!is_first_call) +                                break; +                } +                if (0 == sh_priv->loops_running) { +                        is_driver_done = _gf_true; +                } +        } +        UNLOCK (&sh_priv->lock); + +        if (0 
== loop) { +                //loop finish does unlock, but the erasing of the pending +                //xattrs needs to happen before that so do not finish the loop +                if (is_driver_done && +                    !is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) +                        goto driver_done; +                if (old_loop_frame) { +                        sh_loop_finish (old_loop_frame, this); +                        old_loop_frame = NULL; +                } +        } + +        //If we have more loops to form we should finish previous loop after +        //the next loop lock +        while (loop--) { +                if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { +                        // op failed in other loop, stop spawning more loops +                        if (old_loop_frame) { +                                sh_loop_finish (old_loop_frame, this); +                                old_loop_frame = NULL; +                        } +                        sh_loop_driver (sh_frame, this, _gf_false, NULL); +                } else { +                        gf_log (this->name, GF_LOG_TRACE, "spawning a loop " +                                "for offset %"PRId64, offset); + +                        sh_loop_start (sh_frame, this, offset, old_loop_frame); +                        old_loop_frame = NULL; +                        offset += block_size; +                } +        } + +driver_done: +        if (is_driver_done) { +                sh_loop_driver_done (sh_frame, this, old_loop_frame); +        } +        return 0; +} + +static int +sh_loop_return (call_frame_t *sh_frame, xlator_t *this, call_frame_t *loop_frame, +                int32_t op_ret, int32_t op_errno) +{ +        afr_local_t *               loop_local = NULL; +        afr_self_heal_t *           loop_sh    = NULL; +        afr_local_t *               sh_local = NULL; +        afr_self_heal_t            *sh       = NULL; + +        sh_local = sh_frame->local; +     
   sh       = &sh_local->self_heal; + +        if (loop_frame) { +                loop_local = loop_frame->local; +                if (loop_local) +                        loop_sh    = &loop_local->self_heal; +                if (loop_sh) +                        gf_log (this->name, GF_LOG_TRACE, "loop for offset " +                                "%"PRId64" returned", loop_sh->offset); +        } + +        if (op_ret == -1) { +                afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); +                afr_sh_set_error (sh, op_errno); +                if (loop_frame) { +                        sh_loop_finish (loop_frame, this); +                        loop_frame = NULL; +                } +        } + +        sh_loop_driver (sh_frame, this, _gf_false, loop_frame); + +        return 0; +} + +static int +sh_loop_write_cbk (call_frame_t *loop_frame, void *cookie, xlator_t *this, +                   int32_t op_ret, int32_t op_errno, struct iatt *buf, +                   struct iatt *postbuf, dict_t *xdata) +{ +        afr_private_t *             priv        = NULL; +        afr_local_t *               loop_local    = NULL; +        afr_self_heal_t *           loop_sh       = NULL; +        call_frame_t               *sh_frame    = NULL; +        afr_local_t *               sh_local    = NULL; +        afr_self_heal_t            *sh          = NULL; +        int                         call_count  = 0; +        int                         child_index = 0; + +        priv     = this->private; +        loop_local = loop_frame->local; +        loop_sh    = &loop_local->self_heal; + +        sh_frame = loop_sh->sh_frame; +        sh_local = sh_frame->local; +        sh       = &sh_local->self_heal; + +        child_index =  (long) cookie; + +        gf_log (this->name, GF_LOG_TRACE, +                "wrote %d bytes of data from %s to child %d, offset %"PRId64"", +                op_ret, sh_local->loc.path, child_index, loop_sh->offset); + +        if (op_ret == 
-1) { +                gf_log (this->name, GF_LOG_ERROR, +                        "write to %s failed on subvolume %s (%s)", +                        sh_local->loc.path, +                        priv->children[child_index]->name, +                        strerror (op_errno)); + +                afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); +                afr_sh_set_error (loop_sh, op_errno); +        } else if (op_ret < loop_local->cont.writev.vector->iov_len) { +                gf_log (this->name, GF_LOG_ERROR, +                        "incomplete write to %s on subvolume %s " +                        "(expected %lu, returned %d)", sh_local->loc.path, +                        priv->children[child_index]->name, +                        loop_local->cont.writev.vector->iov_len, op_ret); +                afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); +        } + +        call_count = afr_frame_return (loop_frame); + +        if (call_count == 0) { +		iobref_unref(loop_local->cont.writev.iobref); + +                sh_loop_return (sh_frame, this, loop_frame, +                                loop_sh->op_ret, loop_sh->op_errno); +        } + +        return 0; +} + +static void +sh_prune_writes_needed (call_frame_t *sh_frame, call_frame_t *loop_frame, +                        afr_private_t *priv) +{ +        afr_local_t     *sh_local     = NULL; +        afr_self_heal_t *sh           = NULL; +        afr_local_t     *loop_local   = NULL; +        afr_self_heal_t *loop_sh      = NULL; +        int             i             = 0; + +        sh_local   = sh_frame->local; +        sh         = &sh_local->self_heal; + +        if (!strcmp (sh->algo->name, "diff")) +                return; + +        loop_local = loop_frame->local; +        loop_sh    = &loop_local->self_heal; + +        /* full self-heal guarantees there exists atleast 1 file with size 0 +         * That means for other files we can preserve holes that come after +         * its size before 
'trim' +         */ +        for (i = 0; i < priv->child_count; i++) { +                if (loop_sh->write_needed[i] && +                    ((loop_sh->offset + 1) > sh->buf[i].ia_size)) +                        loop_sh->write_needed[i] = 0; +        } +} + +static int +sh_loop_read_cbk (call_frame_t *loop_frame, void *cookie, +                  xlator_t *this, int32_t op_ret, int32_t op_errno, +                  struct iovec *vector, int32_t count, struct iatt *buf, +                  struct iobref *iobref, dict_t *xdata) +{ +        afr_private_t *               priv       = NULL; +        afr_local_t *                 loop_local   = NULL; +        afr_self_heal_t *             loop_sh      = NULL; +        call_frame_t                 *sh_frame   = NULL; +        int                           i          = 0; +        int                           call_count = 0; +        afr_local_t *                 sh_local   = NULL; +        afr_self_heal_t *             sh      = NULL; + +        priv       = this->private; +        loop_local = loop_frame->local; +        loop_sh    = &loop_local->self_heal; + +        sh_frame = loop_sh->sh_frame; +        sh_local = sh_frame->local; +        sh       = &sh_local->self_heal; + +        gf_log (this->name, GF_LOG_TRACE, +                "read %d bytes of data from %s, offset %"PRId64"", +                op_ret, loop_local->loc.path, loop_sh->offset); + +        if (op_ret <= 0) { +                if (op_ret < 0) { +                        afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); +                        gf_log (this->name, GF_LOG_ERROR, "read failed on %d " +                                "for %s reason :%s", sh->source, +                                sh_local->loc.path, strerror (errno)); +                } else { +                        sh->eof_reached = _gf_true; +                        gf_log (this->name, GF_LOG_DEBUG, "Eof reached for %s", +                                sh_local->loc.path); +         
       } +                sh_loop_return (sh_frame, this, loop_frame, op_ret, op_errno); +                goto out; +        } + +        if (loop_sh->file_has_holes && iov_0filled (vector, count) == 0) +                sh_prune_writes_needed (sh_frame, loop_frame, priv); + +        call_count = sh_number_of_writes_needed (loop_sh->write_needed, +                                                 priv->child_count); +        if (call_count == 0) { +                sh_loop_return (sh_frame, this, loop_frame, 0, 0); +                goto out; +        } + +        loop_local->call_count = call_count; + +	/* +	 * We only really need the request size at the moment, but the buffer +	 * is required if we want to issue a retry in the event of a short write. +	 * Therefore, we duplicate the vector and ref the iobref here... +	 */ +	loop_local->cont.writev.vector = iov_dup(vector, count); +	loop_local->cont.writev.iobref = iobref_ref(iobref); + +        for (i = 0; i < priv->child_count; i++) { +                if (!loop_sh->write_needed[i]) +                        continue; +                STACK_WIND_COOKIE (loop_frame, sh_loop_write_cbk, +                                   (void *) (long) i, +                                   priv->children[i], +                                   priv->children[i]->fops->writev, +                                   loop_sh->healing_fd, vector, count, +                                   loop_sh->offset, 0, iobref, NULL); + +                if (!--call_count) +                        break; +        } + +out: +        return 0; +} + + +static int +sh_loop_read (call_frame_t *loop_frame, xlator_t *this) +{ +        afr_private_t           *priv       = NULL; +        afr_local_t             *loop_local   = NULL; +        afr_self_heal_t         *loop_sh      = NULL; + +        priv     = this->private; +        loop_local = loop_frame->local; +        loop_sh    = &loop_local->self_heal; + +        STACK_WIND_COOKIE (loop_frame, 
sh_loop_read_cbk, +                           (void *) (long) loop_sh->source, +                           priv->children[loop_sh->source], +                           priv->children[loop_sh->source]->fops->readv, +                           loop_sh->healing_fd, loop_sh->block_size, +                           loop_sh->offset, 0, NULL); + +        return 0; +} + + +static int +sh_diff_checksum_cbk (call_frame_t *loop_frame, void *cookie, xlator_t *this, +                      int32_t op_ret, int32_t op_errno, +                      uint32_t weak_checksum, uint8_t *strong_checksum, +                      dict_t *xdata) +{ +        afr_private_t                 *priv         = NULL; +        afr_local_t                   *loop_local   = NULL; +        afr_self_heal_t               *loop_sh      = NULL; +        call_frame_t                  *sh_frame     = NULL; +        afr_local_t                   *sh_local     = NULL; +        afr_self_heal_t               *sh           = NULL; +        afr_sh_algo_private_t         *sh_priv      = NULL; +        int                           child_index  = 0; +        int                           call_count   = 0; +        int                           i            = 0; +        int                           write_needed = 0; + +        priv  = this->private; + +        loop_local = loop_frame->local; +        loop_sh    = &loop_local->self_heal; + +        sh_frame = loop_sh->sh_frame; +        sh_local = sh_frame->local; +        sh       = &sh_local->self_heal; + +        sh_priv = sh->private; + +        child_index = (long) cookie; + +        if (op_ret < 0) { +                gf_log (this->name, GF_LOG_ERROR, +                        "checksum on %s failed on subvolume %s (%s)", +                        sh_local->loc.path, priv->children[child_index]->name, +                        strerror (op_errno)); +                afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); +        } else { +                memcpy 
(loop_sh->checksum + child_index * MD5_DIGEST_LENGTH, +                        strong_checksum, MD5_DIGEST_LENGTH); +        } + +        call_count = afr_frame_return (loop_frame); + +        if (call_count == 0) { +                for (i = 0; i < priv->child_count; i++) { +                        if (sh->sources[i] || !sh_local->child_up[i]) +                                continue; + +                        if (memcmp (loop_sh->checksum + (i * MD5_DIGEST_LENGTH), +                                    loop_sh->checksum + (sh->source * MD5_DIGEST_LENGTH), +                                    MD5_DIGEST_LENGTH)) { +                                /* +                                  Checksums differ, so this block +                                  must be written to this sink +                                */ + +                                gf_log (this->name, GF_LOG_DEBUG, +                                        "checksum on subvolume %s at offset %" +                                        PRId64" differs from that on source", +                                        priv->children[i]->name, loop_sh->offset); + +                                write_needed = loop_sh->write_needed[i] = 1; +                        } +                } + +                LOCK (&sh_priv->lock); +                { +                        sh_priv->total_blocks++; +                        if (write_needed) +                                sh_priv->diff_blocks++; +                } +                UNLOCK (&sh_priv->lock); + +                if (write_needed && +                    !is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { +                        sh_loop_read (loop_frame, this); +                } else { +                        sh_loop_return (sh_frame, this, loop_frame, +                                        op_ret, op_errno); +                } +        } + +        return 0; +} + +static int +sh_diff_checksum (call_frame_t *loop_frame, xlator_t *this) +{ +        
afr_private_t           *priv         = NULL; +        afr_local_t             *loop_local   = NULL; +        afr_self_heal_t         *loop_sh      = NULL; +        int                     call_count    = 0; +        int                     i             = 0; + +        priv         = this->private; +        loop_local   = loop_frame->local; +        loop_sh      = &loop_local->self_heal; + +        call_count = loop_sh->active_sinks + 1;  /* sinks and source */ + +        loop_local->call_count = call_count; + +        STACK_WIND_COOKIE (loop_frame, sh_diff_checksum_cbk, +                           (void *) (long) loop_sh->source, +                           priv->children[loop_sh->source], +                           priv->children[loop_sh->source]->fops->rchecksum, +                           loop_sh->healing_fd, +                           loop_sh->offset, loop_sh->block_size, NULL); + +        for (i = 0; i < priv->child_count; i++) { +                if (loop_sh->sources[i] || !loop_local->child_up[i]) +                        continue; + +                STACK_WIND_COOKIE (loop_frame, sh_diff_checksum_cbk, +                                   (void *) (long) i, +                                   priv->children[i], +                                   priv->children[i]->fops->rchecksum, +                                   loop_sh->healing_fd, +                                   loop_sh->offset, loop_sh->block_size, NULL); + +                if (!--call_count) +                        break; +        } + +        return 0; +} + +static int +sh_full_read_write_to_sinks (call_frame_t *loop_frame, xlator_t *this) +{ +        afr_private_t           *priv         = NULL; +        afr_local_t             *loop_local   = NULL; +        afr_self_heal_t         *loop_sh      = NULL; +        int                     i             = 0; + +        priv         = this->private; +        loop_local   = loop_frame->local; +        loop_sh      = &loop_local->self_heal; + +   
     for (i = 0; i < priv->child_count; i++) { +                if (loop_sh->sources[i] || !loop_local->child_up[i]) +                        continue; +                loop_sh->write_needed[i] = 1; +        } +        sh_loop_read (loop_frame, this); +        return 0; +} + +afr_sh_algo_private_t* +afr_sh_priv_init () +{ +        afr_sh_algo_private_t   *sh_priv = NULL; + +        sh_priv = GF_CALLOC (1, sizeof (*sh_priv), +                             gf_afr_mt_afr_private_t); +        if (!sh_priv) +                goto out; + +        LOCK_INIT (&sh_priv->lock); +out: +        return sh_priv; +} + +int +afr_sh_transfer_lock (call_frame_t *dst, call_frame_t *src, char *dom, +                      unsigned int child_count) +{ +        afr_local_t             *dst_local   = NULL; +        afr_self_heal_t         *dst_sh      = NULL; +        afr_local_t             *src_local   = NULL; +        afr_self_heal_t         *src_sh      = NULL; +        int                     ret          = -1; + +        dst_local = dst->local; +        dst_sh = &dst_local->self_heal; +        src_local = src->local; +        src_sh = &src_local->self_heal; +        GF_ASSERT (src_sh->data_lock_held); +        GF_ASSERT (!dst_sh->data_lock_held); +        ret = afr_lk_transfer_datalock (dst, src, dom, child_count); +        if (ret) +                return ret; +        src_sh->data_lock_held = _gf_false; +        dst_sh->data_lock_held = _gf_true; +        return 0; +} + +int +afr_sh_start_loops (call_frame_t *sh_frame, xlator_t *this, +                    afr_sh_algo_fn sh_data_algo_start) +{ +        call_frame_t            *first_loop_frame = NULL; +        afr_local_t             *local   = NULL; +        afr_self_heal_t         *sh      = NULL; +        int                     ret      = 0; +        afr_private_t           *priv    = NULL; + +        local = sh_frame->local; +        sh    = &local->self_heal; +        priv  = this->private; + +        sh->sh_data_algo_start = 
sh_data_algo_start; +        local->call_count = 0; +        ret = sh_loop_frame_create (sh_frame, this, NULL, &first_loop_frame); +        if (ret) +                goto out; +        ret = afr_sh_transfer_lock (first_loop_frame, sh_frame, this->name, +                                    priv->child_count); +        if (ret) +                goto out; +        sh->private = afr_sh_priv_init (); +        if (!sh->private) { +                ret = -1; +                goto out; +        } +        sh_loop_driver (sh_frame, this, _gf_true, first_loop_frame); +        ret = 0; +out: +        if (ret) { +                afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); +                /* the driver never took ownership of first_loop_frame, so +                 * release it here; sh_loop_finish accepts NULL and unlocks +                 * first when the data lock was already transferred to it */ +                sh_loop_finish (first_loop_frame, this); +                sh_loop_driver_done (sh_frame, this, NULL); +        } +        return 0; +} + +int +afr_sh_algo_diff (call_frame_t *sh_frame, xlator_t *this) +{ +        afr_sh_start_loops (sh_frame, this, sh_diff_checksum); +        return 0; +} + +int +afr_sh_algo_full (call_frame_t *sh_frame, xlator_t *this) +{ +        afr_sh_start_loops (sh_frame, this, sh_full_read_write_to_sinks); +        return 0; +} + +struct afr_sh_algorithm afr_self_heal_algorithms[] = { +        {.name = "full",  .fn = afr_sh_algo_full}, +        {.name = "diff",  .fn = afr_sh_algo_diff}, +        {0, 0}, +}; diff --git a/xlators/cluster/afr-v1/src/afr-self-heal-algorithm.h b/xlators/cluster/afr-v1/src/afr-self-heal-algorithm.h new file mode 100644 index 000000000..6b20789b1 --- /dev/null +++ b/xlators/cluster/afr-v1/src/afr-self-heal-algorithm.h @@ -0,0 +1,32 @@ +/* +  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> +  This file is part of GlusterFS. + +  This file is licensed to you under your choice of the GNU Lesser +  General Public License, version 3 or any later version (LGPLv3 or +  later), or the GNU General Public License, version 2 (GPLv2), in all +  cases as published by the Free Software Foundation. 
+*/ + +#ifndef __AFR_SELF_HEAL_ALGORITHM_H__ +#define __AFR_SELF_HEAL_ALGORITHM_H__ + +typedef int (*afr_sh_algo_fn) (call_frame_t *frame, +                               xlator_t *this); + +struct afr_sh_algorithm { +        const char *name; +        afr_sh_algo_fn fn; +}; + +extern struct afr_sh_algorithm afr_self_heal_algorithms[3]; +typedef struct { +        gf_lock_t lock; +        unsigned int loops_running; +        off_t offset; + +        int32_t total_blocks; +        int32_t diff_blocks; +} afr_sh_algo_private_t; + +#endif /* __AFR_SELF_HEAL_ALGORITHM_H__ */ diff --git a/xlators/cluster/afr-v1/src/afr-self-heal-common.c b/xlators/cluster/afr-v1/src/afr-self-heal-common.c new file mode 100644 index 000000000..ef92b4205 --- /dev/null +++ b/xlators/cluster/afr-v1/src/afr-self-heal-common.c @@ -0,0 +1,2812 @@ +/* +  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> +  This file is part of GlusterFS. + +  This file is licensed to you under your choice of the GNU Lesser +  General Public License, version 3 or any later version (LGPLv3 or +  later), or the GNU General Public License, version 2 (GPLv2), in all +  cases as published by the Free Software Foundation. 
+*/ + +#include "glusterfs.h" +#include "xlator.h" +#include "byte-order.h" + +#include "afr.h" +#include "afr-transaction.h" +#include "afr-self-heal-common.h" +#include "afr-self-heal.h" +#include "pump.h" + +#define ADD_FMT_STRING(msg, off, sh_str, status, print_log)                 \ +        do {                                                                \ +                if (AFR_SELF_HEAL_NOT_ATTEMPTED != status) {                \ +                        off += snprintf (msg + off, sizeof (msg) - off,     \ +                                         " "sh_str" self heal %s,",         \ +                                         get_sh_completion_status (status));\ +                        print_log = 1;                                      \ +                }                                                           \ +        } while (0) + +#define ADD_FMT_STRING_SYNC(msg, off, sh_str, status, print_log)            \ +        do {                                                                \ +                if (AFR_SELF_HEAL_SYNC_BEGIN == status ||                   \ +                    AFR_SELF_HEAL_FAILED == status)  {                      \ +                        off += snprintf (msg + off, sizeof (msg) - off,     \ +                                         " "sh_str" self heal %s,",         \ +                                         get_sh_completion_status (status));\ +                        print_log = 1;                                      \ +                }                                                           \ +        } while (0) + + +void +afr_sh_reset (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t     *local = NULL; +        afr_self_heal_t *sh = NULL; +        afr_private_t   *priv = NULL; + +        local = frame->local; +        sh = &local->self_heal; +        priv = this->private; + +        memset (sh->child_errno, 0, +                sizeof (*sh->child_errno) * priv->child_count); +        memset 
(sh->buf, 0, sizeof (*sh->buf) * priv->child_count); +        memset (sh->parentbufs, 0, +                sizeof (*sh->parentbufs) * priv->child_count); +        memset (sh->success, 0, sizeof (*sh->success) * priv->child_count); +        memset (sh->locked_nodes, 0, +                sizeof (*sh->locked_nodes) * priv->child_count); +        sh->active_sinks = 0; + +        afr_reset_xattr (sh->xattr, priv->child_count); +} + +//Intersection[child]=1 if child is part of intersection +void +afr_children_intersection_get (int32_t *set1, int32_t *set2, +                               int *intersection, unsigned int child_count) +{ +        int                      i = 0; + +        memset (intersection, 0, sizeof (*intersection) * child_count); +        for (i = 0; i < child_count; i++) { +                intersection[i] = afr_is_child_present (set1, child_count, i) +                                     && afr_is_child_present (set2, child_count, +                                                              i); +        } +} + +/** + * select_source - select a source and return it + */ + +int +afr_sh_select_source (int sources[], int child_count) +{ +        int i = 0; +        for (i = 0; i < child_count; i++) +                if (sources[i]) +                        return i; + +        return -1; +} + +void +afr_sh_mark_source_sinks (call_frame_t *frame, xlator_t *this) +{ +        int              i = 0; +        afr_local_t     *local      = NULL; +        afr_self_heal_t *sh = NULL; +        afr_private_t   *priv = NULL; +        int              active_sinks = 0; + +        local = frame->local; +        sh = &local->self_heal; +        priv = this->private; + +        for (i = 0; i < priv->child_count; i++) { +                if (sh->sources[i] == 0 && local->child_up[i] == 1) { +                        active_sinks++; +                        sh->success[i] = 1; +                } else if (sh->sources[i] == 1 && local->child_up[i] == 1) { +                    
    sh->success[i] = 1; +                } +        } +        sh->active_sinks = active_sinks; +} + +int +afr_sh_source_count (int sources[], int child_count) +{ +        int i = 0; +        int nsource = 0; + +        for (i = 0; i < child_count; i++) +                if (sources[i]) +                        nsource++; +        return nsource; +} + +void +afr_sh_set_error (afr_self_heal_t *sh, int32_t op_errno) +{ +        sh->op_ret = -1; +	sh->op_errno = afr_most_important_error(sh->op_errno, op_errno, +						_gf_false); +} + +/* + * Logs the pending matrix at DEBUG level, one "[ a b ... ]" line per row, + * for the priv->child_count x priv->child_count entries. Logs nothing if + * the scratch buffer cannot be allocated. + */ +void +afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this) +{ +        afr_private_t *  priv = this->private; +        char            *buf  = NULL; +        char            *ptr  = NULL; +        int              i    = 0; +        int              j    = 0; + +        /* an int32_t prints as at most 11 characters ("-2147483648") plus +         * 1 space per entry; the extra 8 bytes cover "[ ", "]" and the NUL */ +        buf = GF_MALLOC (priv->child_count * 12 + 8, gf_afr_mt_char); +        if (!buf) +                return; + +        for (i = 0; i < priv->child_count; i++) { +                ptr = buf; +                ptr += sprintf (ptr, "[ "); +                for (j = 0; j < priv->child_count; j++) { +                        ptr += sprintf (ptr, "%d ", pending_matrix[i][j]); +                } +                sprintf (ptr, "]"); +                gf_log (this->name, GF_LOG_DEBUG, "pending_matrix: %s", buf); +        } + +        GF_FREE (buf); +} + +char* +afr_get_pending_matrix_str (int32_t *pending_matrix[], xlator_t *this) +{ +        afr_private_t *  priv = this->private; +        char            *buf  = NULL; +        char            *ptr  = NULL; +        int              i    = 0; +        int              j    = 0; +        int             child_count = priv->child_count; +        char            *matrix_begin = "[ [ "; +        char            *matrix_end = "] ]"; +        char            *seperator = "] [ "; +        int             pending_entry_strlen = 12; //Including space after entry +        int             matrix_begin_strlen = 0; +  
      int             matrix_end_strlen = 0; +        int             seperator_strlen = 0; +        int             string_length = 0; +        char            *msg = "- Pending matrix:  "; + +        /* +         *  for a list of lists of [ [ a b ] [ c d ] ] +         * */ + +        matrix_begin_strlen = strlen (matrix_begin); +        matrix_end_strlen = strlen (matrix_end); +        seperator_strlen = strlen (seperator); +        string_length = matrix_begin_strlen + matrix_end_strlen +                        + (child_count -1) * seperator_strlen +                        + (child_count * child_count * pending_entry_strlen); + +        buf = GF_CALLOC (1, 1 + strlen (msg) + string_length , gf_afr_mt_char); +        if (!buf) +                goto out; + +        ptr = buf; +        ptr += sprintf (ptr, "%s", msg); +        ptr += sprintf (ptr, "%s", matrix_begin); +        for (i = 0; i < priv->child_count; i++) { +                for (j = 0; j < priv->child_count; j++) { +                        ptr += sprintf (ptr, "%d ", pending_matrix[i][j]); +                } +                if (i < priv->child_count -1) +                        ptr += sprintf (ptr, "%s", seperator); +        } + +        ptr += sprintf (ptr, "%s", matrix_end); + +out: +        return buf; +} + +void +afr_sh_print_split_brain_log (int32_t *pending_matrix[], xlator_t *this, +                              const char *loc) +{ +        char *buf      = NULL; +        char *free_ptr = NULL; + +        buf = afr_get_pending_matrix_str (pending_matrix, this); +        if (buf) +                free_ptr = buf; +        else +                buf = ""; + + +        gf_log (this->name, GF_LOG_ERROR, "Unable to self-heal contents of '%s'" +                " (possible split-brain). 
Please delete the file from all but " +                "the preferred subvolume.%s", loc, buf); +        GF_FREE (free_ptr); +        return; +} + + +void +afr_init_pending_matrix (int32_t **pending_matrix, size_t child_count) +{ +        int             i   = 0; +        int             j   = 0; + +        GF_ASSERT (pending_matrix); + +        for (i = 0; i < child_count; i++) { +                for (j = 0; j < child_count; j++) { +                        pending_matrix[i][j] = 0; +                } +        } +} + +void +afr_mark_ignorant_subvols_as_pending (int32_t **pending_matrix, +                                      unsigned char *ignorant_subvols, +                                      size_t  child_count) +{ +        int            i                = 0; +        int            j                = 0; + +        GF_ASSERT (pending_matrix); +        GF_ASSERT (ignorant_subvols); + +        for (i = 0; i < child_count; i++) { +                if (ignorant_subvols[i]) { +                        for (j = 0; j < child_count; j++) { +                                if (!ignorant_subvols[j]) +                                        pending_matrix[j][i] += 1; +                        } +                } +        } +} + +int +afr_build_pending_matrix (char **pending_key, int32_t **pending_matrix, +                          unsigned char *ignorant_subvols, +                          dict_t *xattr[], afr_transaction_type type, +                          size_t child_count) +{ +        /* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. 
*/ +        int32_t        pending[3]       = {0,}; +        void          *pending_raw      = NULL; +        int            ret              = -1; +        int            i                = 0; +        int            j                = 0; +        int            k                = 0; + +        afr_init_pending_matrix (pending_matrix, child_count); + +        for (i = 0; i < child_count; i++) { +                pending_raw = NULL; + +                for (j = 0; j < child_count; j++) { +                        ret = dict_get_ptr (xattr[i], pending_key[j], +                                            &pending_raw); + +                        if (ret != 0) { +                                /* +                                 * There is no xattr present. This means this +                                 * subvolume should be considered an 'ignorant' +                                 * subvolume. +                                 */ + +                                if (ignorant_subvols) +                                        ignorant_subvols[i] = 1; +                                continue; +                        } + +                        memcpy (pending, pending_raw, sizeof(pending)); +                        k = afr_index_for_transaction_type (type); + +                        pending_matrix[i][j] = ntoh32 (pending[k]); +                } +        } + +        return ret; +} + +typedef enum { +        AFR_NODE_INVALID, +        AFR_NODE_INNOCENT, +        AFR_NODE_FOOL, +        AFR_NODE_WISE, +} afr_node_type; + +typedef struct { +        afr_node_type type; +        int           wisdom; +} afr_node_character; + + +static int +afr_sh_is_innocent (int32_t *array, int child_count) +{ +        int i   = 0; +        int ret = 1;   /* innocent until proven guilty */ + +        for (i = 0; i < child_count; i++) { +                if (array[i]) { +                        ret = 0; +                        break; +                } +        } + +        return 
ret; +} + + +static int +afr_sh_is_fool (int32_t *array, int i, int child_count) +{ +        return array[i];   /* fool if accuses itself */ +} + + +static int +afr_sh_is_wise (int32_t *array, int i, int child_count) +{ +        return !array[i];  /* wise if does not accuse itself */ +} + + +static int +afr_sh_all_nodes_innocent (afr_node_character *characters, +                           int child_count) +{ +        int i   = 0; +        int ret = 1; + +        for (i = 0; i < child_count; i++) { +                if (characters[i].type != AFR_NODE_INNOCENT) { +                        ret = 0; +                        break; +                } +        } + +        return ret; +} + + +static int +afr_sh_wise_nodes_exist (afr_node_character *characters, int child_count) +{ +        int i   = 0; +        int ret = 0; + +        for (i = 0; i < child_count; i++) { +                if (characters[i].type == AFR_NODE_WISE) { +                        ret = 1; +                        break; +                } +        } + +        return ret; +} + + +/* + * The 'wisdom' of a wise node is 0 if any other wise node accuses it. + * It is 1 if no other wise node accuses it. + * Only wise nodes with wisdom 1 are sources. + * + * If no nodes with wisdom 1 exist, a split-brain has occurred. 
+ */ + +static void +afr_sh_compute_wisdom (int32_t *pending_matrix[], +                       afr_node_character characters[], int child_count) +{ +        int i = 0; +        int j = 0; + +        for (i = 0; i < child_count; i++) { +                if (characters[i].type == AFR_NODE_WISE) { +                        characters[i].wisdom = 1; + +                        for (j = 0; j < child_count; j++) { +                                if ((characters[j].type == AFR_NODE_WISE) +                                    && pending_matrix[j][i]) { + +                                        characters[i].wisdom = 0; +                                } +                        } +                } +        } +} + + +static int +afr_sh_wise_nodes_conflict (afr_node_character *characters, +                            int child_count) +{ +        int i   = 0; +        int ret = 1; + +        for (i = 0; i < child_count; i++) { +                if ((characters[i].type == AFR_NODE_WISE) +                    && characters[i].wisdom == 1) { + +                        /* There is atleast one bona-fide wise node */ +                        ret = 0; +                        break; +                } +        } + +        return ret; +} + + +static int +afr_sh_mark_wisest_as_sources (int sources[], +                               afr_node_character *characters, +                               int child_count) +{ +        int nsources = 0; +        int i        = 0; + +        for (i = 0; i < child_count; i++) { +                if (characters[i].wisdom == 1) { +                        sources[i] = 1; +                        nsources++; +                } +        } + +        return nsources; +} + +static void +afr_compute_witness_of_fools (int32_t *witnesses, int32_t **pending_matrix, +                              afr_node_character *characters, +                              int32_t child_count) +{ +        int i       = 0; +        int j       = 0; +        int witness = 0; + +   
     GF_ASSERT (witnesses); +        GF_ASSERT (pending_matrix); +        GF_ASSERT (characters); +        GF_ASSERT (child_count > 0); + +        for (i = 0; i < child_count; i++) { +                if (characters[i].type != AFR_NODE_FOOL) +                        continue; + +                witness = 0; +                for (j = 0; j < child_count; j++) { +                        if (i == j) +                                continue; +                        witness += pending_matrix[i][j]; +                } +                witnesses[i] = witness; +        } +} + +static int32_t +afr_find_biggest_witness_among_fools (int32_t *witnesses, +                                      afr_node_character *characters, +                                      int32_t child_count) +{ +        int i               = 0; +        int biggest_witness = -1; +        int biggest_witness_idx = -1; +        int biggest_witness_cnt = -1; + +        GF_ASSERT (witnesses); +        GF_ASSERT (characters); +        GF_ASSERT (child_count > 0); + +        for (i = 0; i < child_count; i++) { +                if (characters[i].type != AFR_NODE_FOOL) +                        continue; + +                if (biggest_witness < witnesses[i]) { +                        biggest_witness = witnesses[i]; +			biggest_witness_idx = i; +			biggest_witness_cnt = 1; +			continue; +		} + +		if (biggest_witness == witnesses[i]) +			biggest_witness_cnt++; +        } + +	if (biggest_witness_cnt != 1) +		return -1; + +        return biggest_witness_idx; +} + +int +afr_mark_fool_as_source_by_witness (int32_t *sources, int32_t *witnesses, +                                    afr_node_character *characters, +                                    int32_t child_count, int32_t witness) +{ +        int i        = 0; +        int nsources = 0; + +        GF_ASSERT (sources); +        GF_ASSERT (witnesses); +        GF_ASSERT (characters); +        GF_ASSERT (child_count > 0); + +        for (i = 0; i < child_count; i++) 
{ +                if (characters[i].type != AFR_NODE_FOOL) +                        continue; + +                if (witness == witnesses[i]) { +                        sources[i] = 1; +                        nsources++; +                } +        } +        return nsources; +} + + +int +afr_mark_fool_as_source_by_idx (int32_t *sources, int child_count, int idx) +{ +	if (idx >= 0 && idx < child_count) { +		sources[idx] = 1; +		return 1; +	} +	return 0; +} + + +static int +afr_find_largest_file_size (struct iatt *bufs, int32_t *success_children, +			    int child_count) +{ +	int idx = -1; +	int i = -1; +	int child = -1; +	uint64_t max_size = 0; +        uint64_t min_size = 0; +        int      num_children = 0; + +	for (i = 0; i < child_count; i++) { +		if (success_children[i] == -1) +			break; + +		child = success_children[i]; +		if (bufs[child].ia_size > max_size) { +			max_size = bufs[child].ia_size; +			idx = child; +		} + +                if ((num_children == 0) || (bufs[child].ia_size < min_size)) { +                        min_size = bufs[child].ia_size; +                } + +                num_children++; +	} + +        /* If sizes are same for all of them, finding sources will have to +         * happen with pending changelog. 
So return -1 +         */ +        if ((num_children > 1) && (min_size == max_size)) +                return -1; +	return idx; +} + + +static int +afr_find_newest_file (struct iatt *bufs, int32_t *success_children, +		      int child_count) +{ +	int idx = -1; +	int i = -1; +	int child = -1; +	uint64_t max_ctime = 0; + +	for (i = 0; i < child_count; i++) { +		if (success_children[i] == -1) +			break; + +		child = success_children[i]; +		if (bufs[child].ia_ctime > max_ctime) { +			max_ctime = bufs[child].ia_ctime; +			idx = child; +		} +	} + +	return idx; +} + + +static int +afr_mark_biggest_of_fools_as_source (int32_t *sources, int32_t **pending_matrix, +                                     afr_node_character *characters, +				     int32_t *success_children, +                                     int child_count, struct iatt *bufs) +{ +        int32_t       biggest_witness = 0; +        int           nsources        = 0; +        int32_t       *witnesses      = NULL; + +        GF_ASSERT (child_count > 0); + +	biggest_witness = afr_find_largest_file_size (bufs, success_children, +						      child_count); +	if (biggest_witness != -1) +		goto found; + +        witnesses = GF_CALLOC (child_count, sizeof (*witnesses), +                               gf_afr_mt_int32_t); +        if (NULL == witnesses) { +                nsources = -1; +                goto out; +        } + +        afr_compute_witness_of_fools (witnesses, pending_matrix, characters, +                                      child_count); +        biggest_witness = afr_find_biggest_witness_among_fools (witnesses, +                                                                characters, +                                                                child_count); +	if (biggest_witness != -1) +		goto found; + +	biggest_witness = afr_find_newest_file (bufs, success_children, +						child_count); + +found: +	nsources = afr_mark_fool_as_source_by_idx (sources, child_count, +						   biggest_witness); +out: + 
       GF_FREE (witnesses); +        return nsources; +} + +int +afr_mark_child_as_source_by_uid (int32_t *sources, struct iatt *bufs, +                                 int32_t *success_children, +                                 unsigned int child_count, uint32_t uid) +{ +        int     i        = 0; +        int     nsources = 0; +        int     child    = 0; + +        for (i = 0; i < child_count; i++) { +                if (-1 == success_children[i]) +                        break; + +                child = success_children[i]; +                if (uid == bufs[child].ia_uid) { +                        sources[child] = 1; +                        nsources++; +                } +        } +        return nsources; +} + +int +afr_get_child_with_lowest_uid (struct iatt *bufs, int32_t *success_children, +                               unsigned int child_count) +{ +        int     i        = 0; +        int     smallest = -1; +        int     child    = 0; + +        for (i = 0; i < child_count; i++) { +                if (-1 == success_children[i]) +                        break; +                child = success_children[i]; +                if ((smallest == -1) || +                    (bufs[child].ia_uid < bufs[smallest].ia_uid)) { +                        smallest = child; +                } +        } +        return smallest; +} + +static int +afr_sh_mark_lowest_uid_as_source (struct iatt *bufs, int32_t *success_children, +                                  int child_count, int32_t *sources) +{ +        int   nsources              = 0; +        int   smallest              = 0; + +        smallest = afr_get_child_with_lowest_uid (bufs, success_children, +                                                  child_count); +        if (smallest < 0) { +                nsources = -1; +                goto out; +        } +        nsources = afr_mark_child_as_source_by_uid (sources, bufs, +                                                    success_children, 
child_count, +                                                    bufs[smallest].ia_uid); +out: +        return nsources; +} + +int +afr_get_no_xattr_dir_read_child (xlator_t *this, int32_t *success_children, +                                 struct iatt *bufs) +{ +        afr_private_t *priv = NULL; +        int            i = 0; +        int            child = -1; +        int            read_child = -1; + +        priv = this->private; +        for (i = 0; i < priv->child_count; i++) { +                child = success_children[i]; +                if (child < 0) +                        break; +                if (read_child < 0) +                        read_child = child; +                else if (bufs[read_child].ia_size < bufs[child].ia_size) +                        read_child = child; +        } +        return read_child; +} + +int +afr_sh_mark_zero_size_file_as_sink (struct iatt *bufs, int32_t *success_children, +                                    int child_count, int32_t *sources) +{ +        int             nsources = 0; +        int             i = 0; +        int             child = 0; +        gf_boolean_t    sink_exists = _gf_false; +        gf_boolean_t    source_exists = _gf_false; +        int             source = -1; + +        for (i = 0; i < child_count; i++) { +                child = success_children[i]; +                if (child < 0) +                        break; +                if (!bufs[child].ia_size) { +                        sink_exists = _gf_true; +                        continue; +                } +                if (!source_exists) { +                        source_exists = _gf_true; +                        source = child; +                        continue; +                } +                if (bufs[source].ia_size != bufs[child].ia_size) { +                        nsources = -1; +                        goto out; +                } +        } +        if (!source_exists && !sink_exists) { +                nsources = 
-1; +                goto out; +        } + +        if (!source_exists || !sink_exists) +                goto out; + +        for (i = 0; i < child_count; i++) { +                child = success_children[i]; +                if (child < 0) +                        break; +                if (bufs[child].ia_size) { +                        sources[child] = 1; +                        nsources++; +                } +        } +out: +        return nsources; +} + +char * +afr_get_character_str (afr_node_type type) +{ +        char *character = NULL; + +        switch (type) { +        case AFR_NODE_INNOCENT: +                character = "innocent"; +                break; +        case AFR_NODE_FOOL: +                character = "fool"; +                break; +        case AFR_NODE_WISE: +                character = "wise"; +                break; +        default: +                character = "invalid"; +                break; +        } +        return character; +} + +afr_node_type +afr_find_child_character_type (int32_t *pending_row, int32_t child, +                               unsigned int child_count) +{ +        afr_node_type type = AFR_NODE_INVALID; + +        GF_ASSERT ((child >= 0) && (child < child_count)); + +        if (afr_sh_is_innocent (pending_row, child_count)) +                type = AFR_NODE_INNOCENT; +        else if (afr_sh_is_fool (pending_row, child, child_count)) +                type = AFR_NODE_FOOL; +        else if (afr_sh_is_wise (pending_row, child, child_count)) +                type = AFR_NODE_WISE; +        return type; +} + +int +afr_build_sources (xlator_t *this, dict_t **xattr, struct iatt *bufs, +                   int32_t **pending_matrix, int32_t *sources, +                   int32_t *success_children, afr_transaction_type type, +                   int32_t *subvol_status, gf_boolean_t ignore_ignorant) +{ +        afr_private_t           *priv = NULL; +        afr_self_heal_type      sh_type    = AFR_SELF_HEAL_INVALID; +       
 int                     nsources   = -1; +        unsigned char           *ignorant_subvols = NULL; +        unsigned int            child_count = 0; + +        priv = this->private; +        child_count = priv->child_count; + +        if (afr_get_children_count (success_children, priv->child_count) == 0) +                goto out; + +        if (!ignore_ignorant) { +                ignorant_subvols = GF_CALLOC (sizeof (*ignorant_subvols), +                                              child_count, gf_afr_mt_char); +                if (NULL == ignorant_subvols) +                        goto out; +        } + +        afr_build_pending_matrix (priv->pending_key, pending_matrix, +                                  ignorant_subvols, xattr, type, +                                  priv->child_count); + +        if (!ignore_ignorant) +                afr_mark_ignorant_subvols_as_pending (pending_matrix, +                                                      ignorant_subvols, +                                                      priv->child_count); +        sh_type = afr_self_heal_type_for_transaction (type); +        if (AFR_SELF_HEAL_INVALID == sh_type) +                goto out; + +        afr_sh_print_pending_matrix (pending_matrix, this); + +        nsources = afr_mark_sources (this, sources, pending_matrix, bufs, +                                     sh_type, success_children, subvol_status); +out: +        GF_FREE (ignorant_subvols); +        return nsources; +} + +void +afr_find_character_types (afr_node_character *characters, +                          int32_t **pending_matrix, int32_t *success_children, +                          unsigned int child_count) +{ +        afr_node_type type  = AFR_NODE_INVALID; +        int           child = 0; +        int           i     = 0; + +        for (i = 0; i < child_count; i++) { +                child = success_children[i]; +                if (child == -1) +                        break; +                type = 
afr_find_child_character_type (pending_matrix[child], +                                                      child, child_count); +                characters[child].type = type; +        } +} + +void +afr_mark_success_children_sources (int32_t *sources, int32_t *success_children, +                                   unsigned int child_count) +{ +        int i = 0; +        for (i = 0; i < child_count; i++) { +                if (success_children[i] == -1) +                        break; +                sources[success_children[i]] = 1; +        } +} +/** + * mark_sources: Mark all 'source' nodes and return number of source + * nodes found + * + * A node (a row in the pending matrix) belongs to one of + * three categories: + * + * M is the pending matrix. + * + * 'innocent' - M[i] is all zeroes + * 'fool'     - M[i] has i'th element = 1 (self-reference) + * 'wise'     - M[i] has i'th element = 0, others are 1 or 0. + * + * All 'innocent' nodes are sinks. If all nodes are innocent, no self-heal is + * needed. + * + * A 'wise' node can be a source. If two 'wise' nodes conflict, it is + * a split-brain. If one wise node refers to the other but the other doesn't + * refer back, the referrer is a source. + * + * All fools are sinks, unless there are no 'wise' nodes. In that case, + * one of the fools is made a source. 
+ */ + +int +afr_mark_sources (xlator_t *this, int32_t *sources, int32_t **pending_matrix, +                  struct iatt *bufs, afr_self_heal_type type, +                  int32_t *success_children, int32_t *subvol_status) +{ +        /* stores the 'characters' (innocent, fool, wise) of the nodes */ +        afr_node_character *characters =  NULL; +        int                nsources    = -1; +        unsigned int       child_count = 0; +        afr_private_t      *priv       = NULL; + +        priv = this->private; +        child_count = priv->child_count; +        characters = GF_CALLOC (sizeof (afr_node_character), +                                child_count, gf_afr_mt_afr_node_character); +        if (!characters) +                goto out; + +        this = THIS; + +        /* start clean */ +        memset (sources, 0, sizeof (*sources) * child_count); +        nsources = 0; +        afr_find_character_types (characters, pending_matrix, success_children, +                                  child_count); +        if (afr_sh_all_nodes_innocent (characters, child_count)) { +                switch (type) { +                case AFR_SELF_HEAL_METADATA: +                        nsources = afr_sh_mark_lowest_uid_as_source (bufs, +                                                             success_children, +                                                             child_count, +                                                             sources); +                        break; +                case AFR_SELF_HEAL_DATA: +                        nsources = afr_sh_mark_zero_size_file_as_sink (bufs, +                                                             success_children, +                                                             child_count, +                                                             sources); +                        if ((nsources < 0) && subvol_status) +                                *subvol_status |= SPLIT_BRAIN; +                  
      break; +                default: +                        break; +                } +                goto out; +        } + +        if (afr_sh_wise_nodes_exist (characters, child_count)) { +                afr_sh_compute_wisdom (pending_matrix, characters, child_count); + +                if (afr_sh_wise_nodes_conflict (characters, child_count)) { +                        if (subvol_status) +                                *subvol_status |= SPLIT_BRAIN; +                        nsources = -1; +                } else { +                        nsources = afr_sh_mark_wisest_as_sources (sources, +                                                                  characters, +                                                                  child_count); +                } +        } else { +                if (subvol_status) +                        *subvol_status |= ALL_FOOLS; +                nsources = afr_mark_biggest_of_fools_as_source (sources, +                                                                pending_matrix, +                                                                characters, +								success_children, +                                                                child_count, bufs); +        } + +out: +        if (nsources == 0) +                afr_mark_success_children_sources (sources, success_children, +                                                   child_count); +        GF_FREE (characters); + +        gf_log (this->name, GF_LOG_DEBUG, "Number of sources: %d", nsources); +        return nsources; +} + +void +afr_sh_pending_to_delta (afr_private_t *priv, dict_t **xattr, +                         int32_t *delta_matrix[], unsigned char success[], +                         int child_count, afr_transaction_type type) +{ +        int     tgt     = 0; +        int     src     = 0; +        int     value   = 0; + +        afr_build_pending_matrix (priv->pending_key, delta_matrix, NULL, +                                  xattr, 
type, priv->child_count); + +        /* +         * The algorithm here has two parts.  First, for each subvol indexed +         * as tgt, we try to figure out what count everyone should have for it. +         * If the self-heal succeeded, that's easy; the value is zero. +         * Otherwise, the value is the maximum of the succeeding nodes' counts. +         * Once we know the value, we loop through (possibly for a second time) +         * setting each count to the difference so that when we're done all +         * succeeding nodes will have the same count for tgt. +         */ +        for (tgt = 0; tgt < priv->child_count; ++tgt) { +                value = 0; +                if (!success[tgt]) { +                        /* Find the maximum. */ +                        for (src = 0; src < priv->child_count; ++src) { +                                if (!success[src]) { +                                        continue; +                                } +                                if (delta_matrix[src][tgt] > value) { +                                        value = delta_matrix[src][tgt]; +                                } +                        } +                } +                /* Force everyone who succeeded to the chosen value. 
*/ +                for (src = 0; src < priv->child_count; ++src) { +                        if (success[src]) { +                                delta_matrix[src][tgt] = value +                                                       - delta_matrix[src][tgt]; +                        } +                        else { +                                delta_matrix[src][tgt] = 0; +                        } +                } +        } +} + + +int +afr_sh_delta_to_xattr (xlator_t *this, +                       int32_t *delta_matrix[], dict_t *xattr[], +                       int child_count, afr_transaction_type type) +{ +        int              i       = 0; +        int              j       = 0; +        int              k       = 0; +        int              ret     = 0; +        int32_t         *pending = NULL; +        int32_t         *local_pending = NULL; +        afr_private_t   *priv = NULL; + +        priv = this->private; +        for (i = 0; i < child_count; i++) { +                if (!xattr[i]) +                        continue; + +                local_pending = NULL; +                for (j = 0; j < child_count; j++) { +                        pending = GF_CALLOC (sizeof (int32_t), 3, +                                             gf_afr_mt_int32_t); + +                        if (!pending) { +                                gf_log (this->name, GF_LOG_ERROR, +                                        "failed to allocate pending entry " +                                        "for %s[%d] on %s", +                                        priv->pending_key[j], type, +                                        priv->children[i]->name); +                                continue; +                        } +                        /* 3 = data+metadata+entry */ + +                        k = afr_index_for_transaction_type (type); + +                        pending[k] = hton32 (delta_matrix[i][j]); + +                        if (j == i) { +                            
    local_pending = pending; +                                continue; +                        } +                        ret = dict_set_bin (xattr[i], priv->pending_key[j], +                                            pending, +                                        AFR_NUM_CHANGE_LOGS * sizeof (int32_t)); +                        if (ret < 0) { +                                gf_log (this->name, GF_LOG_WARNING, +                                        "Unable to set dict value."); +                                GF_FREE (pending); +                        } +                } +                if (local_pending) { +                        ret = dict_set_bin (xattr[i], priv->pending_key[i], +                                            local_pending, +                                        AFR_NUM_CHANGE_LOGS * sizeof (int32_t)); +                        if (ret < 0) { +                                gf_log (this->name, GF_LOG_WARNING, +                                        "Unable to set dict value."); +                                GF_FREE (local_pending); +                        } +                } +        } +        return 0; +} + + +int +afr_sh_missing_entries_done (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t     *local = NULL; +        afr_self_heal_t *sh = NULL; + +        local = frame->local; +        sh = &local->self_heal; + +        afr_sh_reset (frame, this); + +        if (local->unhealable) { +                gf_log (this->name, GF_LOG_DEBUG, +                        "split brain found, aborting selfheal of %s", +                        local->loc.path); +        } + +        if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { +                sh->completion_cbk (frame, this); +        } else { +                gf_log (this->name, GF_LOG_TRACE, +                        "proceeding to metadata check on %s", +                        local->loc.path); +                afr_self_heal_metadata (frame, this); +        } + +     
   return 0; +} + + +static int +afr_sh_missing_entries_finish (call_frame_t *frame, xlator_t *this) +{ +        afr_internal_lock_t *int_lock = NULL; +        afr_local_t         *local    = NULL; + +        local = frame->local; +        int_lock = &local->internal_lock; + +        int_lock->lock_cbk = afr_sh_missing_entries_done; +        afr_unlock (frame, this); + +        return 0; +} + +int +afr_sh_common_create (afr_self_heal_t *sh, unsigned int child_count) +{ +        int     ret = -ENOMEM; +        sh->buf = GF_CALLOC (child_count, sizeof (*sh->buf), +                             gf_afr_mt_iatt); +        if (!sh->buf) +                goto out; +        sh->parentbufs = GF_CALLOC (child_count, sizeof (*sh->parentbufs), +                                    gf_afr_mt_iatt); +        if (!sh->parentbufs) +                goto out; +        sh->child_errno = GF_CALLOC (child_count, sizeof (*sh->child_errno), +                                     gf_afr_mt_int); +        if (!sh->child_errno) +                goto out; +        sh->success_children = afr_children_create (child_count); +        if (!sh->success_children) +                goto out; +        sh->fresh_children = afr_children_create (child_count); +        if (!sh->fresh_children) +                goto out; +        sh->xattr = GF_CALLOC (child_count, sizeof (*sh->xattr), +                               gf_afr_mt_dict_t); +        if (!sh->xattr) +                goto out; +        ret = 0; +out: +        return ret; +} + +void +afr_sh_common_lookup_resp_handler (call_frame_t *frame, void *cookie, +                                   xlator_t *this, +                                   int32_t op_ret, int32_t op_errno, +                                   inode_t *inode, struct iatt *buf, +                                   dict_t *xattr, struct iatt *postparent, +                                   loc_t *loc) +{ +        int              child_index = 0; +        afr_local_t     *local = NULL; +   
     afr_private_t   *priv = NULL; +        afr_self_heal_t *sh = NULL; + +        local = frame->local; +        priv = this->private; +        sh   = &local->self_heal; +        child_index = (long) cookie; + +        LOCK (&frame->lock); +        { +                if (op_ret == 0) { +                        sh->buf[child_index] = *buf; +                        sh->parentbufs[child_index] = *postparent; +                        sh->success_children[sh->success_count] = child_index; +                        sh->success_count++; +                        sh->xattr[child_index] = dict_ref (xattr); +                } else { +                        gf_log (this->name, GF_LOG_DEBUG, "path %s on subvolume" +                                " %s => -1 (%s)", loc->path, +                                priv->children[child_index]->name, +                                strerror (op_errno)); +                        local->self_heal.child_errno[child_index] = op_errno; +                } +        } +        UNLOCK (&frame->lock); +        return; +} + +gf_boolean_t +afr_valid_ia_type (ia_type_t ia_type) +{ +        switch (ia_type) { +        case IA_IFSOCK: +        case IA_IFREG: +        case IA_IFBLK: +        case IA_IFCHR: +        case IA_IFIFO: +        case IA_IFLNK: +        case IA_IFDIR: +                return _gf_true; +        default: +                return _gf_false; +        } +        return _gf_false; +} + +int +afr_impunge_frame_create (call_frame_t *frame, xlator_t *this, +                          int active_source, call_frame_t **impunge_frame) +{ +        afr_local_t     *local         = NULL; +        afr_local_t     *impunge_local = NULL; +        afr_self_heal_t *impunge_sh    = NULL; +        int32_t         op_errno       = 0; +        afr_private_t   *priv          = NULL; +        int             ret            = 0; +        call_frame_t    *new_frame     = NULL; + +        op_errno = ENOMEM; +        priv = this->private; +        
new_frame = copy_frame (frame); +        if (!new_frame) { +                goto out; +        } + +        AFR_LOCAL_ALLOC_OR_GOTO (impunge_local, out); + +        local = frame->local; +        new_frame->local = impunge_local; +        impunge_sh = &impunge_local->self_heal; +        impunge_sh->sh_frame = frame; +        impunge_sh->active_source = active_source; +        impunge_local->child_up  = memdup (local->child_up, +                                           sizeof (*local->child_up) * +                                           priv->child_count); +        if (!impunge_local->child_up) +                goto out; + +        impunge_local->pending = afr_matrix_create (priv->child_count, +                                                    AFR_NUM_CHANGE_LOGS); +        if (!impunge_local->pending) +                goto out; + +        ret = afr_sh_common_create (impunge_sh, priv->child_count); +        if (ret) { +                op_errno = -ret; +                goto out; +        } +        op_errno = 0; +        *impunge_frame = new_frame; +out: +        if (op_errno && new_frame) +                AFR_STACK_DESTROY (new_frame); +        return -op_errno; +} + +void +afr_sh_missing_entry_call_impunge_recreate (call_frame_t *frame, xlator_t *this, +                                            struct iatt *buf, +                                            struct iatt *postparent, +                                            afr_impunge_done_cbk_t impunge_done) +{ +        call_frame_t    *impunge_frame = NULL; +        afr_local_t     *local = NULL; +        afr_local_t     *impunge_local = NULL; +        afr_self_heal_t *sh = NULL; +        afr_self_heal_t *impunge_sh = NULL; +        int             ret = 0; +        unsigned int    enoent_count = 0; +        afr_private_t   *priv = NULL; +        int             i = 0; +        int32_t         op_errno = 0; + +        local = frame->local; +        sh    = &local->self_heal; +        priv  = 
this->private; + +        enoent_count = afr_errno_count (NULL, sh->child_errno, +                                        priv->child_count, ENOENT); +        if (!enoent_count) { +                gf_log (this->name, GF_LOG_INFO, +                        "no missing files - %s. proceeding to metadata check", +                        local->loc.path); +                goto out; +        } +        sh->impunge_done = impunge_done; +        ret = afr_impunge_frame_create (frame, this, sh->source, &impunge_frame); +        if (ret) +                goto out; +        impunge_local = impunge_frame->local; +        impunge_sh    = &impunge_local->self_heal; +        loc_copy (&impunge_local->loc, &local->loc); +        ret = afr_build_parent_loc (&impunge_sh->parent_loc, +                                    &impunge_local->loc, &op_errno); +        if (ret) { +                ret = -op_errno; +                goto out; +        } +        impunge_local->call_count = enoent_count; +        impunge_sh->entrybuf = sh->buf[sh->source]; +        impunge_sh->parentbuf = sh->parentbufs[sh->source]; +        for (i = 0; i < priv->child_count; i++) { +                if (!impunge_local->child_up[i]) { +                        impunge_sh->child_errno[i] = ENOTCONN; +                        continue; +                } +                if (sh->child_errno[i] != ENOENT) { +                        impunge_sh->child_errno[i] = EEXIST; +                        continue; +                } +        } +        for (i = 0; i < priv->child_count; i++) { +                if (sh->child_errno[i] != ENOENT) +                        continue; +                afr_sh_entry_impunge_create (impunge_frame, this, i); +                enoent_count--; +        } +        GF_ASSERT (!enoent_count); +        return; +out: +        if (ret) { +                gf_log (this->name, GF_LOG_ERROR, "impunge of %s failed, " +                        "reason: %s", local->loc.path, strerror (-ret)); +              
  afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); +        } +        afr_sh_missing_entries_finish (frame, this); +} + +int +afr_sh_create_entry_cbk (call_frame_t *frame, xlator_t *this, +                         int32_t op_ret, int32_t op_errno) +{ +        afr_local_t     *local = NULL; +        afr_self_heal_t *sh = NULL; + +        local = frame->local; +        sh = &local->self_heal; +        if (op_ret < 0) +                afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); +        afr_sh_missing_entries_finish (frame, this); +        return 0; +} + +static int +sh_missing_entries_create (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t     *local = NULL; +        afr_self_heal_t *sh = NULL; +        int              type = 0; +        struct iatt     *buf = NULL; +        struct iatt     *postparent = NULL; + +        local = frame->local; +        sh = &local->self_heal; + +        buf = &sh->buf[sh->source]; +        postparent = &sh->parentbufs[sh->source]; + +        type = buf->ia_type; +        if (!afr_valid_ia_type (type)) { +                gf_log (this->name, GF_LOG_ERROR, +                        "%s: unknown file type: 0%o", local->loc.path, type); +                afr_set_local_for_unhealable (local); +                afr_sh_missing_entries_finish (frame, this); +                goto out; +        } + +        afr_sh_missing_entry_call_impunge_recreate (frame, this, +                                                    buf, postparent, +                                                    afr_sh_create_entry_cbk); +out: +        return 0; +} + +void +afr_sh_missing_entries_lookup_done (call_frame_t *frame, xlator_t *this, +                                    int32_t op_ret, int32_t op_errno) +{ +        afr_local_t     *local = NULL; +        afr_self_heal_t *sh = NULL; +        afr_private_t   *priv = NULL; +        ia_type_t       ia_type = IA_INVAL; +        int32_t         nsources = 0; +        loc_t           *loc = 
NULL; +        int32_t         subvol_status = 0; +        afr_transaction_type txn_type = AFR_DATA_TRANSACTION; +        gf_boolean_t    split_brain = _gf_false; +        int             read_child = -1; + +        local = frame->local; +        sh = &local->self_heal; +        priv = this->private; +        loc = &local->loc; + +        if (op_ret < 0) { +                if (op_errno == EIO) { +                        afr_set_local_for_unhealable (local); +                } +                // EIO can happen if finding the fresh parent dir failed +                goto out; +        } + +        //now No chance for the ia_type to conflict +        ia_type = sh->buf[sh->success_children[0]].ia_type; +        txn_type = afr_transaction_type_get (ia_type); +        nsources = afr_build_sources (this, sh->xattr, sh->buf, +                                      sh->pending_matrix, sh->sources, +                                      sh->success_children, txn_type, +                                      &subvol_status, _gf_false); +        if (nsources < 0) { +                gf_log (this->name, GF_LOG_INFO, "No sources for dir of %s," +                        " in missing entry self-heal, continuing with the rest" +                        " of the self-heals", local->loc.path); +                if (subvol_status & SPLIT_BRAIN) { +                        split_brain = _gf_true; +                        switch (txn_type) { +                        case AFR_DATA_TRANSACTION: +                                nsources = 1; +                                sh->sources[sh->success_children[0]] = 1; +                                break; +                        case AFR_ENTRY_TRANSACTION: +                                read_child = afr_get_no_xattr_dir_read_child +                                                          (this, +                                                           sh->success_children, +                                                           
sh->buf); +                                sh->sources[read_child] = 1; +                                nsources = 1; +                                break; +                        default: +                                op_errno = EIO; +                                goto out; +                        } +                } else { +                        op_errno = EIO; +                        goto out; +                } +        } + +        afr_get_fresh_children (sh->success_children, sh->sources, +                                sh->fresh_children, priv->child_count); +        sh->source = sh->fresh_children[0]; +        if (sh->source == -1) { +                gf_log (this->name, GF_LOG_DEBUG, "No active sources found."); +                op_errno = EIO; +                goto out; +        } + +        if (sh->gfid_sh_success_cbk) +                sh->gfid_sh_success_cbk (frame, this); +        sh->type = sh->buf[sh->source].ia_type; +        if (uuid_is_null (loc->inode->gfid)) +                uuid_copy (loc->gfid, sh->buf[sh->source].ia_gfid); +        if (split_brain) { +                afr_sh_missing_entries_finish (frame, this); +        } else { +                sh_missing_entries_create (frame, this); +        } +        return; +out: +        afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); +        afr_sh_set_error (sh, op_errno); +        afr_sh_missing_entries_finish (frame, this); +        return; +} + +static int +afr_sh_common_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                          int32_t op_ret, int32_t op_errno, inode_t *inode, +                          struct iatt *buf, dict_t *xattr, +                          struct iatt *postparent) +{ +        int                     call_count = 0; +        afr_local_t             *local = NULL; +        afr_self_heal_t         *sh    = NULL; +        afr_private_t           *priv  = NULL; + +        local = frame->local; +        sh    = &local->self_heal; 
+        priv  = this->private; + +        afr_sh_common_lookup_resp_handler (frame, cookie, this, op_ret, +                                           op_errno, inode, buf, xattr, +                                           postparent, &sh->lookup_loc); +        call_count = afr_frame_return (frame); + +        if (call_count) +                goto out; +        op_ret = -1; +        if (!sh->success_count) { +                op_errno = afr_resultant_errno_get (NULL, sh->child_errno, +                                                    priv->child_count); +                gf_log (this->name, GF_LOG_ERROR, "Failed to lookup %s, " +                        "reason %s", sh->lookup_loc.path, +                        strerror (op_errno)); +                goto done; +        } + +        if ((sh->lookup_flags & AFR_LOOKUP_FAIL_CONFLICTS) && +            (afr_conflicting_iattrs (sh->buf, sh->success_children, +                                     priv->child_count, +                                     sh->lookup_loc.path, this->name))) { +                op_errno = EIO; +                gf_log (this->name, GF_LOG_ERROR, "Conflicting entries " +                        "for %s", sh->lookup_loc.path); +                goto done; +        } + +        if ((sh->lookup_flags & AFR_LOOKUP_FAIL_MISSING_GFIDS) && +            (afr_gfid_missing_count (this->name, sh->success_children, +                                     sh->buf, priv->child_count, +                                     sh->lookup_loc.path))) { +                op_errno = ENODATA; +                gf_log (this->name, GF_LOG_ERROR, "Missing Gfids " +                        "for %s", sh->lookup_loc.path); +                goto done; +        } +        op_ret = 0; + +done: +        sh->lookup_done (frame, this, op_ret, op_errno); +out: +        return 0; +} + +int +afr_sh_remove_entry_cbk (call_frame_t *frame, xlator_t *this, int child, +                         int32_t op_ret, int32_t op_errno) +{ +        int      
       call_count = 0; +        afr_local_t     *local = NULL; +        afr_self_heal_t *sh = NULL; + +        local = frame->local; +        sh = &local->self_heal; + +        GF_ASSERT (sh->post_remove_call); +        if ((op_ret == -1) && (op_errno != ENOENT)) { +                gf_log (this->name, GF_LOG_ERROR, +                        "purge entry %s failed, on child %d reason, %s", +                        local->loc.path, child, strerror (op_errno)); +                LOCK (&frame->lock); +                { +                        afr_sh_set_error (sh, EIO); +                        afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); +                } +                UNLOCK (&frame->lock); +        } +        call_count = afr_frame_return (frame); +        if (call_count == 0) +                sh->post_remove_call (frame, this); +        return 0; +} + +void +afr_sh_call_entry_expunge_remove (call_frame_t *frame, xlator_t *this, +                                  int child_index, struct iatt *buf, +                                  struct iatt *parentbuf, +                                  afr_expunge_done_cbk_t expunge_done) +{ +        call_frame_t    *expunge_frame = NULL; +        afr_local_t     *local = NULL; +        afr_local_t     *expunge_local = NULL; +        afr_self_heal_t *sh = NULL; +        afr_self_heal_t *expunge_sh = NULL; +        int32_t         op_errno = 0; +        int             ret = 0; + +        expunge_frame = copy_frame (frame); +        if (!expunge_frame) { +                goto out; +        } + +        AFR_LOCAL_ALLOC_OR_GOTO (expunge_local, out); + +        local = frame->local; +        sh = &local->self_heal; +        expunge_frame->local = expunge_local; +        expunge_sh = &expunge_local->self_heal; +        expunge_sh->sh_frame = frame; +        loc_copy (&expunge_local->loc, &local->loc); +        ret = afr_build_parent_loc (&expunge_sh->parent_loc, +                                    &expunge_local->loc, 
&op_errno); +        if (ret) { +                ret = -op_errno; +                goto out; +        } +        sh->expunge_done = expunge_done; +        afr_sh_entry_expunge_remove (expunge_frame, this, child_index, buf, +                                     parentbuf); +        return; +out: +        gf_log (this->name, GF_LOG_ERROR, "Expunge of %s failed, reason: %s", +                local->loc.path, strerror (op_errno)); +        expunge_done (frame, this, child_index, -1, op_errno); +} + +void +afr_sh_remove_stale_lookup_info (afr_self_heal_t *sh, int32_t *success_children, +                                 int32_t *fresh_children, +                                 unsigned int child_count) +{ +        int     i = 0; + +        for (i = 0; i < child_count; i++) { +                if (afr_is_child_present (success_children, child_count, i) && +                    !afr_is_child_present (fresh_children, child_count, i)) { +                        sh->child_errno[i] = ENOENT; +                        GF_ASSERT (sh->xattr[i]); +                        dict_unref (sh->xattr[i]); +                        sh->xattr[i] = NULL; +                } +        } +} + +int +afr_sh_purge_stale_entries_done (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t         *local    = NULL; +        afr_self_heal_t     *sh       = NULL; +        afr_private_t       *priv     = NULL; + +        local    = frame->local; +        sh       = &local->self_heal; +        priv     = this->private; + +        if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { +                afr_sh_missing_entries_finish (frame, this); +        } else { +                if (afr_gfid_missing_count (this->name, sh->fresh_children, +                                            sh->buf, priv->child_count, +                                            local->loc.path)) { +                        afr_sh_common_lookup (frame, this, &local->loc, +                                              
afr_sh_missing_entries_lookup_done, +                                              sh->sh_gfid_req, +                                              AFR_LOOKUP_FAIL_CONFLICTS| +                                              AFR_LOOKUP_FAIL_MISSING_GFIDS, +                                              NULL); +                } else { +                        //No need to set gfid so goto missing entries lookup done +                        //Behave as if you have done the lookup +                        afr_sh_remove_stale_lookup_info (sh, +                                                         sh->success_children, +                                                         sh->fresh_children, +                                                         priv->child_count); +                        afr_children_copy (sh->success_children, +                                           sh->fresh_children, +                                           priv->child_count); +                        afr_sh_missing_entries_lookup_done (frame, this, 0, 0); +                } +        } +        return 0; +} + +gf_boolean_t +afr_sh_purge_entry_condition (afr_local_t *local, afr_private_t *priv, +                              int child) +{ +        afr_self_heal_t *sh = NULL; + +        sh = &local->self_heal; + +        if (local->child_up[child] && +            (!afr_is_child_present (sh->fresh_parent_dirs, priv->child_count, +                                    child)) +            && (sh->child_errno[child] != ENOENT)) +                return _gf_true; + +        return _gf_false; +} + +gf_boolean_t +afr_sh_purge_stale_entry_condition (afr_local_t *local, afr_private_t *priv, +                                    int child) +{ +        afr_self_heal_t *sh = NULL; + +        sh = &local->self_heal; + +        if (local->child_up[child] && +            (!afr_is_child_present (sh->fresh_children, priv->child_count, +                                    child)) +             && 
(sh->child_errno[child] != ENOENT)) +                return _gf_true; + +        return _gf_false; +} + +void +afr_sh_purge_entry_common (call_frame_t *frame, xlator_t *this, +                           gf_boolean_t purge_condition (afr_local_t *local, +                                                         afr_private_t *priv, +                                                         int child)) +{ +        afr_local_t     *local = NULL; +        afr_private_t   *priv = NULL; +        afr_self_heal_t *sh = NULL; +        int             i = 0; +        int             call_count = 0; + +        local = frame->local; +        sh = &local->self_heal; +        priv = this->private; + +        for (i = 0; i < priv->child_count; i++) { +                if (purge_condition (local, priv, i)) +                        call_count++; +        } + +        if (call_count == 0) { +                sh->post_remove_call (frame, this); +                goto out; +        } + +        local->call_count = call_count; +        for (i = 0; i < priv->child_count; i++) { +                if (!purge_condition (local, priv, i)) +                        continue; +                gf_log (this->name, GF_LOG_INFO, "purging the stale entry %s " +                        "on %s", local->loc.path, priv->children[i]->name); +                afr_sh_call_entry_expunge_remove (frame, this, +                                                  (long) i, &sh->buf[i], +                                                  &sh->parentbufs[i], +                                                  afr_sh_remove_entry_cbk); +        } +out: +        return; +} + +void +afr_sh_purge_entry (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t     *local = NULL; +        afr_self_heal_t *sh = NULL; + +        local = frame->local; +        sh = &local->self_heal; +        sh->post_remove_call = afr_sh_missing_entries_finish; + +        afr_sh_purge_entry_common (frame, this, afr_sh_purge_entry_condition); +} 
+ +void +afr_sh_purge_stale_entry (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t     *local = NULL; +        afr_self_heal_t *sh = NULL; +        afr_private_t   *priv = NULL; +        int             i = 0; + +        local = frame->local; +        sh = &local->self_heal; +        priv = this->private; + +        sh->post_remove_call = afr_sh_purge_stale_entries_done; + +        for (i = 0; i < priv->child_count; i++) { +                if (afr_is_child_present (sh->fresh_children, +                                          priv->child_count, i)) +                        continue; + +                if ((!local->child_up[i]) || sh->child_errno[i] != 0) +                        continue; + +                GF_ASSERT (!uuid_is_null (sh->entrybuf.ia_gfid) || +                           uuid_is_null (sh->buf[i].ia_gfid)); + +                if ((sh->entrybuf.ia_type != sh->buf[i].ia_type) || +                    (uuid_compare (sh->buf[i].ia_gfid, +                                   sh->entrybuf.ia_gfid))) +                        continue; + +                afr_children_add_child (sh->fresh_children, i, +                                        priv->child_count); + +        } +        afr_sh_purge_entry_common (frame, this, +                                   afr_sh_purge_stale_entry_condition); +} + +void +afr_sh_save_child_iatts_from_policy (int32_t *children, struct iatt *bufs, +                                     struct iatt *save, +                                     unsigned int child_count) +{ +        int             i = 0; +        int             child = 0; +        gf_boolean_t    saved = _gf_false; + +        GF_ASSERT (save); +        //if iatt buf with gfid exists sets it +        for (i = 0; i < child_count; i++) { +                child = children[i]; +                if (child == -1) +                        break; +                *save = bufs[child]; +                saved = _gf_true; +                if (!uuid_is_null 
(save->ia_gfid)) +                        break; +        } +        GF_ASSERT (saved); +} + +void +afr_get_children_of_fresh_parent_dirs (afr_self_heal_t *sh, +                                       unsigned int child_count) +{ +        afr_children_intersection_get (sh->success_children, +                                       sh->fresh_parent_dirs, +                                       sh->sources, child_count); +        afr_get_fresh_children (sh->success_children, sh->sources, +                                sh->fresh_children, child_count); +        memset (sh->sources, 0, sizeof (*sh->sources) * child_count); +} + +void +afr_sh_children_lookup_done (call_frame_t *frame, xlator_t *this, +                             int32_t op_ret, int32_t op_errno) +{ +        afr_local_t      *local = NULL; +        afr_self_heal_t  *sh = NULL; +        afr_private_t    *priv = NULL; +        int32_t          fresh_child_enoents = 0; +        int32_t          fresh_parent_count = 0; + +        local = frame->local; +        sh = &local->self_heal; +        priv = this->private; + +        if (op_ret < 0) +                goto fail; +        afr_get_children_of_fresh_parent_dirs (sh, priv->child_count); +        fresh_parent_count = afr_get_children_count (sh->fresh_parent_dirs, +                                                     priv->child_count); +        //we need the enoent count of the subvols present in fresh_parent_dirs +        fresh_child_enoents = afr_errno_count (sh->fresh_parent_dirs, +                                               sh->child_errno, +                                               priv->child_count, ENOENT); +        if (fresh_child_enoents == fresh_parent_count) { +                afr_sh_set_error (sh, ENOENT); +                afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); +                afr_sh_purge_entry (frame, this); +        } else if (!afr_conflicting_iattrs (sh->buf, sh->fresh_children, +                                       
     priv->child_count, local->loc.path, +                                            this->name)) { +                afr_sh_save_child_iatts_from_policy (sh->fresh_children, +                                                     sh->buf, &sh->entrybuf, +                                                     priv->child_count); +                afr_update_gfid_from_iatts (sh->sh_gfid_req, sh->buf, +                                            sh->fresh_children, +                                            priv->child_count); +                afr_sh_purge_stale_entry (frame, this); +        } else { +                op_errno = EIO; +                afr_set_local_for_unhealable (local); +                goto fail; +        } + +        return; + +fail: +        afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); +        afr_sh_set_error (sh, op_errno); +        afr_sh_missing_entries_finish (frame, this); +        return; +} + +static void +afr_sh_find_fresh_parents (call_frame_t *frame, xlator_t *this, +                           int32_t op_ret, int32_t op_errno) +{ +        afr_self_heal_t *sh  = NULL; +        afr_private_t   *priv = NULL; +        afr_local_t     *local = NULL; +        int             enoent_count = 0; +        int             nsources = 0; +        int             source  = -1; +        int32_t         subvol_status = 0; + +        local = frame->local; +        sh = &local->self_heal; +        priv = this->private; + +        if (op_ret < 0) +                goto out; +        enoent_count = afr_errno_count (NULL, sh->child_errno, +                                        priv->child_count, ENOENT); +        if (enoent_count > 0) { +                gf_log (this->name, GF_LOG_INFO, "Parent dir missing for %s," +                        " in missing entry self-heal, aborting missing-entry " +                        "self-heal", +                        local->loc.path); +                afr_sh_missing_entries_finish (frame, this); +                
return; +        } + +        nsources = afr_build_sources (this, sh->xattr, sh->buf, +                                      sh->pending_matrix, sh->sources, +                                      sh->success_children, +                                      AFR_ENTRY_TRANSACTION, &subvol_status, +                                      _gf_true); +        if ((subvol_status & ALL_FOOLS) || +            (subvol_status & SPLIT_BRAIN)) { +                gf_log (this->name, GF_LOG_INFO, "%s: Performing conservative " +                        "merge", sh->parent_loc.path); +                afr_mark_success_children_sources (sh->sources, +                                                   sh->success_children, +                                                   priv->child_count); +        } else if (nsources < 0) { +                gf_log (this->name, GF_LOG_ERROR, "No sources for dir " +                        "of %s, in missing entry self-heal, aborting " +                        "self-heal", local->loc.path); +                op_errno = EIO; +                goto out; +        } + +        source = afr_sh_select_source (sh->sources, priv->child_count); +        if (source == -1) { +                GF_ASSERT (0); +                gf_log (this->name, GF_LOG_DEBUG, "No active sources found."); +                op_errno = EIO; +                goto out; +        } +        afr_get_fresh_children (sh->success_children, sh->sources, +                                sh->fresh_parent_dirs, priv->child_count); +        afr_sh_common_lookup (frame, this, &local->loc, +                              afr_sh_children_lookup_done, NULL, 0, +                              NULL); +        return; + +out: +        afr_sh_set_error (sh, op_errno); +        afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); +	afr_sh_missing_entries_finish (frame, this); +        return; +} + +void +afr_sh_common_reset (afr_self_heal_t *sh, unsigned int child_count) +{ +        int             i = 0; + + 
       for (i = 0; i < child_count; i++) { +                memset (&sh->buf[i], 0, sizeof (sh->buf[i])); +                memset (&sh->parentbufs[i], 0, sizeof (sh->parentbufs[i])); +                sh->child_errno[i] = 0; +        } +        memset (&sh->parentbuf, 0, sizeof (sh->parentbuf)); +        sh->success_count = 0; +        afr_reset_children (sh->success_children, child_count); +        afr_reset_children (sh->fresh_children, child_count); +        afr_reset_xattr (sh->xattr, child_count); +        loc_wipe (&sh->lookup_loc); +} + +/* afr self-heal state will be lost if this call is made + * please check the afr_sh_common_reset that is called in this function + */ +int +afr_sh_common_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, +                      afr_lookup_done_cbk_t lookup_done , uuid_t gfid, +                      int32_t flags, dict_t *xdata) +{ +        afr_local_t    *local = NULL; +        int             i = 0; +        int             call_count = 0; +        afr_private_t  *priv = NULL; +        dict_t         *xattr_req = NULL; +        afr_self_heal_t *sh = NULL; + +        local = frame->local; +        priv  = this->private; +        sh    = &local->self_heal; + +        call_count = afr_up_children_count (local->child_up, priv->child_count); + +        local->call_count = call_count; + +        xattr_req = dict_new(); + +        if (xattr_req) { +                afr_xattr_req_prepare (this, xattr_req, loc->path); +                if (gfid) { +                        gf_log (this->name, GF_LOG_DEBUG, +                                "looking up %s with gfid: %s", +                                loc->path, uuid_utoa (gfid)); +                        GF_ASSERT (!uuid_is_null (gfid)); +                        afr_set_dict_gfid (xattr_req, gfid); +                } +        } + +        afr_sh_common_reset (sh, priv->child_count); +        sh->lookup_done = lookup_done; +        loc_copy (&sh->lookup_loc, loc); +        
sh->lookup_flags = flags; +        for (i = 0; i < priv->child_count; i++) { +                if (local->child_up[i]) { +                        gf_log (this->name, GF_LOG_DEBUG, +                                "looking up %s on subvolume %s", +                                loc->path, priv->children[i]->name); + +                        STACK_WIND_COOKIE (frame, +                                           afr_sh_common_lookup_cbk, +                                           (void *) (long) i, +                                           priv->children[i], +                                           priv->children[i]->fops->lookup, +                                           loc, xattr_req); + +                        if (!--call_count) +                                break; +                } +        } + +        if (xattr_req) +                dict_unref (xattr_req); + +        return 0; +} + + + +int +afr_sh_post_nb_entrylk_missing_entry_sh_cbk (call_frame_t *frame, +                                             xlator_t *this) +{ +        afr_internal_lock_t *int_lock = NULL; +        afr_local_t         *local    = NULL; +        afr_self_heal_t     *sh       = NULL; + +        local    = frame->local; +        int_lock = &local->internal_lock; +        sh       = &local->self_heal; + +        if (int_lock->lock_op_ret < 0) { +                gf_log (this->name, GF_LOG_INFO, +                        "Non blocking entrylks failed."); +                afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); +                afr_sh_missing_entries_done (frame, this); +        } else { + +                gf_log (this->name, GF_LOG_DEBUG, +                        "Non blocking entrylks done. 
Proceeding to FOP"); +                afr_sh_common_lookup (frame, this, &sh->parent_loc, +                                      afr_sh_find_fresh_parents, +                                      NULL, AFR_LOOKUP_FAIL_CONFLICTS, +                                      NULL); +        } + +        return 0; +} + +int +afr_sh_entrylk (call_frame_t *frame, xlator_t *this, loc_t *loc, +                char *base_name, afr_lock_cbk_t lock_cbk) +{ +        afr_internal_lock_t *int_lock = NULL; +        afr_local_t         *local    = NULL; +        afr_private_t       *priv     = NULL; + +        priv     = this->private; +        local    = frame->local; +        int_lock = &local->internal_lock; + +        int_lock->transaction_lk_type = AFR_SELFHEAL_LK; +        int_lock->selfheal_lk_type    = AFR_ENTRY_SELF_HEAL_LK; + +        afr_set_lock_number (frame, this); + +        int_lock->lk_basename = base_name; +        int_lock->lk_loc      = loc; +        int_lock->lock_cbk    = lock_cbk; +        int_lock->domain      = this->name; + +        int_lock->lockee_count = 0; +        afr_init_entry_lockee (&int_lock->lockee[0], local, loc, +                               base_name, priv->child_count); +        int_lock->lockee_count++; +        afr_nonblocking_entrylk (frame, this); + +        return 0; +} + +static int +afr_self_heal_parent_entrylk (call_frame_t *frame, xlator_t *this, +                              afr_lock_cbk_t lock_cbk) +{ +        afr_local_t         *local    = NULL; +        afr_self_heal_t     *sh       = NULL; +        afr_internal_lock_t *int_lock = NULL; +        int                 ret       = -1; +        int32_t             op_errno  = 0; + +        local    = frame->local; +        sh       = &local->self_heal; + +        gf_log (this->name, GF_LOG_TRACE, +                "attempting to recreate missing entries for path=%s", +                local->loc.path); + +        ret = afr_build_parent_loc (&sh->parent_loc, &local->loc, &op_errno); +    
    if (ret) +                goto out; + +        afr_sh_entrylk (frame, this, &sh->parent_loc, NULL, +                        lock_cbk); +        return 0; +out: +        int_lock = &local->internal_lock; +        int_lock->lock_op_ret = -1; +        lock_cbk (frame, this); +        return 0; +} + +static int +afr_self_heal_missing_entries (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t     *local = NULL; +        afr_self_heal_t *sh = NULL; + +        local = frame->local; +        sh = &local->self_heal; + +        sh->sh_type_in_action  = AFR_SELF_HEAL_GFID_OR_MISSING_ENTRY; + +        afr_set_self_heal_status (sh, AFR_SELF_HEAL_STARTED); + +        afr_self_heal_parent_entrylk (frame, this, +                                      afr_sh_post_nb_entrylk_missing_entry_sh_cbk); +        return 0; +} + +afr_local_t* +afr_self_heal_local_init (afr_local_t *l, xlator_t *this) +{ +        afr_private_t   *priv  = NULL; +        afr_local_t     *lc    = NULL; +        afr_self_heal_t *sh    = NULL; +        afr_self_heal_t *shc   = NULL; +        int             ret    = 0; + +        priv = this->private; + +        sh = &l->self_heal; + +        lc = mem_get0 (this->local_pool); +        if (!lc) +                goto out; + +        shc = &lc->self_heal; + +        shc->unwind = sh->unwind; +        shc->gfid_sh_success_cbk = sh->gfid_sh_success_cbk; +        shc->do_missing_entry_self_heal = sh->do_missing_entry_self_heal; +        shc->do_gfid_self_heal = sh->do_gfid_self_heal; +        shc->do_data_self_heal = sh->do_data_self_heal; +        shc->do_metadata_self_heal = sh->do_metadata_self_heal; +        shc->do_entry_self_heal = sh->do_entry_self_heal; +        shc->force_confirm_spb = sh->force_confirm_spb; +        shc->forced_merge = sh->forced_merge; +        shc->background = sh->background; +        shc->type = sh->type; +        shc->data_sh_info = ""; +        shc->metadata_sh_info =  ""; + +        uuid_copy (shc->sh_gfid_req, 
sh->sh_gfid_req); +        if (l->loc.path) { +                ret = loc_copy (&lc->loc, &l->loc); +                if (ret < 0) +                        goto out; +        } + +        lc->child_up  = memdup (l->child_up, +                                sizeof (*lc->child_up) * priv->child_count); +        if (!lc->child_up) { +                ret = -1; +                goto out; +        } + +        if (l->xattr_req) +                lc->xattr_req = dict_ref (l->xattr_req); + +        if (l->cont.lookup.inode) +                lc->cont.lookup.inode = inode_ref (l->cont.lookup.inode); +        if (l->cont.lookup.xattr) +                lc->cont.lookup.xattr = dict_ref (l->cont.lookup.xattr); + +        lc->internal_lock.locked_nodes = +                             GF_CALLOC (sizeof (*l->internal_lock.locked_nodes), +                                        priv->child_count, gf_afr_mt_char); +        if (!lc->internal_lock.locked_nodes) { +                ret = -1; +                goto out; +        } + +        ret = afr_inodelk_init (&lc->internal_lock.inodelk[0], +                                this->name, priv->child_count); +        if (ret) +                goto out; + +out: +        if (ret) { +                afr_local_cleanup (lc, this); +                lc = NULL; +        } +        return lc; +} + +int +afr_self_heal_completion_cbk (call_frame_t *bgsh_frame, xlator_t *this) +{ +        afr_private_t *   priv  = NULL; +        afr_local_t *     local = NULL; +        afr_self_heal_t * sh    = NULL; +        afr_local_t *     orig_frame_local = NULL; +        afr_self_heal_t * orig_frame_sh = NULL; +        char              sh_type_str[256] = {0,}; +        gf_loglevel_t     loglevel = 0; + +        priv  = this->private; +        local = bgsh_frame->local; +        sh    = &local->self_heal; + +        if (local->unhealable) { +                afr_set_split_brain (this, sh->inode, SPB, SPB); +        } + +        afr_self_heal_type_str_get (sh, 
sh_type_str, +                                    sizeof(sh_type_str)); +        if (is_self_heal_failed (sh, AFR_CHECK_ALL) && !priv->shd.iamshd) { +                loglevel = GF_LOG_ERROR; +        } else if (!is_self_heal_failed (sh, AFR_CHECK_ALL)) { +                loglevel = GF_LOG_INFO; +        } else { +                loglevel = GF_LOG_DEBUG; +        } + +        afr_log_self_heal_completion_status (local, loglevel); + +        FRAME_SU_UNDO (bgsh_frame, afr_local_t); + +        if (!sh->unwound && sh->unwind) { +                orig_frame_local = sh->orig_frame->local; +                orig_frame_sh = &orig_frame_local->self_heal; +                orig_frame_sh->actual_sh_started = _gf_true; +                sh->unwind (sh->orig_frame, this, sh->op_ret, sh->op_errno, +                            is_self_heal_failed (sh, AFR_CHECK_ALL)); +        } + +        if (sh->background) { +                LOCK (&priv->lock); +                { +                        priv->background_self_heals_started--; +                } +                UNLOCK (&priv->lock); +        } + +        AFR_STACK_DESTROY (bgsh_frame); + +        return 0; +} + +int +afr_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode) +{ +        afr_local_t     *local = NULL; +        afr_self_heal_t *sh = NULL; +        afr_private_t   *priv = NULL; +        int32_t          op_errno = 0; +        int              ret = 0; +        afr_self_heal_t *orig_sh = NULL; +        call_frame_t    *sh_frame = NULL; +        afr_local_t     *sh_local = NULL; +        loc_t           *loc   = NULL; + +        local = frame->local; +        orig_sh = &local->self_heal; +        priv  = this->private; + +        GF_ASSERT (local->loc.path); + +        gf_log (this->name, GF_LOG_TRACE, +                "performing self heal on %s (metadata=%d data=%d entry=%d)", +                local->loc.path, +                local->self_heal.do_metadata_self_heal, +                
local->self_heal.do_data_self_heal, +                local->self_heal.do_entry_self_heal); + +        op_errno        = ENOMEM; +        sh_frame        = copy_frame (frame); +        if (!sh_frame) +                goto out; +        afr_set_lk_owner (sh_frame, this, sh_frame->root); +        afr_set_low_priority (sh_frame); + +        sh_local        = afr_self_heal_local_init (local, this); +        if (!sh_local) +                goto out; +        sh_frame->local = sh_local; +        sh              = &sh_local->self_heal; + +        sh->inode       = inode_ref (inode); +        sh->orig_frame  = frame; + +        sh->completion_cbk = afr_self_heal_completion_cbk; + +        sh->success = GF_CALLOC (priv->child_count, sizeof (*sh->success), +                                 gf_afr_mt_char); +        if (!sh->success) +                goto out; +        sh->sources = GF_CALLOC (sizeof (*sh->sources), priv->child_count, +                                 gf_afr_mt_int); +        if (!sh->sources) +                goto out; +        sh->locked_nodes = GF_CALLOC (sizeof (*sh->locked_nodes), +                                      priv->child_count, +                                      gf_afr_mt_int); +        if (!sh->locked_nodes) +                goto out; + +        sh->pending_matrix = afr_matrix_create (priv->child_count, +                                                priv->child_count); +        if (!sh->pending_matrix) +                goto out; + +        sh->delta_matrix = afr_matrix_create (priv->child_count, +                                              priv->child_count); +        if (!sh->delta_matrix) +                goto out; + +        sh->fresh_parent_dirs = afr_children_create (priv->child_count); +        if (!sh->fresh_parent_dirs) +                goto out; +        ret = afr_sh_common_create (sh, priv->child_count); +        if (ret) { +                op_errno = -ret; +                goto out; +        } + +        if 
(local->self_heal.background) { +                LOCK (&priv->lock); +                { +                        if (priv->background_self_heals_started +                            < priv->background_self_heal_count) { +                                priv->background_self_heals_started++; + + +                        } else { +                                local->self_heal.background = _gf_false; +                                sh->background = _gf_false; +                        } +                } +                UNLOCK (&priv->lock); +        } + +        if (!local->loc.parent) { +                sh->do_missing_entry_self_heal = _gf_false; +                sh->do_gfid_self_heal = _gf_false; +        } + +        sh->sh_type_in_action = AFR_SELF_HEAL_INVALID; + +        FRAME_SU_DO (sh_frame, afr_local_t); +        if (sh->do_missing_entry_self_heal || sh->do_gfid_self_heal) { +                afr_self_heal_missing_entries (sh_frame, this); +        } else { +                loc = &sh_local->loc; +                if (uuid_is_null (loc->inode->gfid) && uuid_is_null (loc->gfid)) { +                        if (!uuid_is_null (inode->gfid)) +                                GF_ASSERT (!uuid_compare (inode->gfid, +                                           sh->sh_gfid_req)); +                        uuid_copy (loc->gfid, sh->sh_gfid_req); +                } +                gf_log (this->name, GF_LOG_TRACE, +                        "proceeding to metadata check on %s", +                        local->loc.path); + +                afr_sh_missing_entries_done (sh_frame, this); +        } +        op_errno = 0; + +out: +        if (op_errno) { +                orig_sh->unwind (frame, this, -1, op_errno, 1); +                if (sh_frame) +                        AFR_STACK_DESTROY (sh_frame); +        } +        return 0; +} + +void +afr_self_heal_type_str_get (afr_self_heal_t *self_heal_p, char *str, +                            size_t size) +{ +        GF_ASSERT 
(str && (size > strlen (" missing-entry gfid " +                                          "meta-data data entry"))); + +        if (self_heal_p->do_metadata_self_heal) { +                snprintf (str, size, " meta-data"); +        } + +        if (self_heal_p->do_data_self_heal) { +                snprintf (str + strlen(str), size - strlen(str), " data"); +        } + +        if (self_heal_p->do_entry_self_heal) { +                snprintf (str + strlen(str), size - strlen(str), " entry"); +        } + +        if (self_heal_p->do_missing_entry_self_heal) { +                snprintf (str + strlen(str), size - strlen(str), +                         " missing-entry"); +        } + +        if (self_heal_p->do_gfid_self_heal) { +                snprintf (str + strlen(str), size - strlen(str), " gfid"); +        } +} + +afr_self_heal_type +afr_self_heal_type_for_transaction (afr_transaction_type type) +{ +        afr_self_heal_type sh_type = AFR_SELF_HEAL_INVALID; + +        switch (type) { +        case AFR_DATA_TRANSACTION: +                sh_type = AFR_SELF_HEAL_DATA; +                break; +        case AFR_METADATA_TRANSACTION: +                sh_type = AFR_SELF_HEAL_METADATA; +                break; +        case AFR_ENTRY_TRANSACTION: +                sh_type = AFR_SELF_HEAL_ENTRY; +                break; +        case AFR_ENTRY_RENAME_TRANSACTION: +                GF_ASSERT (0); +                break; +        } +        return sh_type; +} + +int +afr_build_child_loc (xlator_t *this, loc_t *child, loc_t *parent, char *name) +{ +        int   ret = -1; +        uuid_t pargfid = {0}; + +        if (!child) +                goto out; + +        if (!uuid_is_null (parent->inode->gfid)) +                uuid_copy (pargfid, parent->inode->gfid); +        else if (!uuid_is_null (parent->gfid)) +                uuid_copy (pargfid, parent->gfid); + +        if (uuid_is_null (pargfid)) +                goto out; + +        if (strcmp (parent->path, "/") == 0) +     
           ret = gf_asprintf ((char **)&child->path, "/%s", name); +        else +                ret = gf_asprintf ((char **)&child->path, "%s/%s", parent->path, +                                   name); + +        if (-1 == ret) { +                gf_log (this->name, GF_LOG_ERROR, +                        "asprintf failed while setting child path"); +        } + +        child->name = strrchr (child->path, '/'); +        if (child->name) +                child->name++; + +        child->parent = inode_ref (parent->inode); +        child->inode = inode_new (parent->inode->table); +        uuid_copy (child->pargfid, pargfid); + +        if (!child->inode) { +                ret = -1; +                goto out; +        } + +        ret = 0; +out: +        if ((ret == -1) && child) +                loc_wipe (child); + +        return ret; +} + +int +afr_sh_erase_pending (call_frame_t *frame, xlator_t *this, +                      afr_transaction_type type, afr_fxattrop_cbk_t cbk, +                      int (*finish)(call_frame_t *frame, xlator_t *this)) +{ +        afr_local_t     *local = NULL; +        afr_self_heal_t *sh = NULL; +        afr_private_t   *priv = NULL; +        int              call_count = 0; +        int              i = 0; +        dict_t          **erase_xattr = NULL; +        int             ret = -1; + +        local = frame->local; +        sh = &local->self_heal; +        priv = this->private; + +        afr_sh_pending_to_delta (priv, sh->xattr, sh->delta_matrix, +                                 sh->success, priv->child_count, type); + +        erase_xattr = GF_CALLOC (sizeof (*erase_xattr), priv->child_count, +                                 gf_afr_mt_dict_t); +        if (!erase_xattr) +                goto out; + +        for (i = 0; i < priv->child_count; i++) { +                if (sh->xattr[i]) { +                        call_count++; +                        erase_xattr[i] = dict_new (); +                        if 
(!erase_xattr[i]) +                                goto out; +                } +        } + +        afr_sh_delta_to_xattr (this, sh->delta_matrix, erase_xattr, +                               priv->child_count, type); + +        gf_log (this->name, GF_LOG_DEBUG, "Delta matrix for: %s", +                lkowner_utoa (&frame->root->lk_owner)); +        afr_sh_print_pending_matrix (sh->delta_matrix, this); +        local->call_count = call_count; +        if (call_count == 0) { +                ret = 0; +                finish (frame, this); +                goto out; +        } + +        for (i = 0; i < priv->child_count; i++) { +                if (!erase_xattr[i]) +                        continue; + +                if (sh->healing_fd) {//true for ENTRY, reg file DATA transaction +                        STACK_WIND_COOKIE (frame, cbk, (void *) (long) i, +                                           priv->children[i], +                                           priv->children[i]->fops->fxattrop, +                                           sh->healing_fd, +                                           GF_XATTROP_ADD_ARRAY, erase_xattr[i], +                                           NULL); +                } else { +                        STACK_WIND_COOKIE (frame, cbk, (void *) (long) i, +                                           priv->children[i], +                                           priv->children[i]->fops->xattrop, +                                           &local->loc, +                                           GF_XATTROP_ADD_ARRAY, erase_xattr[i], +                                           NULL); +                } +        } + +        ret = 0; +out: +        if (erase_xattr) { +                for (i = 0; i < priv->child_count; i++) { +                        if (erase_xattr[i]) { +                                dict_unref (erase_xattr[i]); +                        } +                } +        } + +        GF_FREE (erase_xattr); + +        if (ret 
< 0) { +                afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); +                finish (frame, this); +        } + +        return 0; +} + +void +afr_set_self_heal_status(afr_self_heal_t *sh, afr_self_heal_status status) +{ +        xlator_t                *this = NULL; +        afr_sh_status_for_all_type *sh_status = &(sh->afr_all_sh_status); +        afr_self_heal_type  sh_type_in_action = sh->sh_type_in_action; +        this = THIS; + +        if (!sh) { +                gf_log_callingfn (this->name, GF_LOG_ERROR, "Null self heal" +                                  "Structure"); +                goto out; +        } + +        switch (sh_type_in_action) { +                case AFR_SELF_HEAL_GFID_OR_MISSING_ENTRY: +                       sh_status->gfid_or_missing_entry_self_heal = status; +                        break; +                case AFR_SELF_HEAL_METADATA: +                        sh_status->metadata_self_heal = status; +                        break; +                case AFR_SELF_HEAL_DATA: +                        sh_status->data_self_heal = status; +                        break; +                case AFR_SELF_HEAL_ENTRY: +                        sh_status->entry_self_heal = status; +                        break; +                case AFR_SELF_HEAL_INVALID: +                        gf_log_callingfn (this->name, GF_LOG_ERROR, "Invalid" +                                          "self heal type in action"); +                        break; +        } +out: +        return; +} + +void +afr_set_local_for_unhealable (afr_local_t *local) +{ +        afr_self_heal_t  *sh = NULL; + +        sh = &local->self_heal; + +        local->unhealable = 1; +        afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); +} + +int +is_self_heal_failed (afr_self_heal_t *sh, afr_sh_fail_check_type type) +{ +        afr_sh_status_for_all_type      sh_status = sh->afr_all_sh_status; +        afr_self_heal_type   sh_type_in_action =  AFR_SELF_HEAL_INVALID; +    
    afr_self_heal_status    status = AFR_SELF_HEAL_FAILED; +        xlator_t                *this = NULL; +        int                     sh_failed = 0; + +        this = THIS; + +        if (!sh) { +                gf_log_callingfn (this->name, GF_LOG_ERROR, "Null self heal " +                                  "structure"); +                sh_failed = 1; +                goto out; +        } + +        if (type == AFR_CHECK_ALL) { +                if ((sh_status.gfid_or_missing_entry_self_heal == AFR_SELF_HEAL_FAILED) +                    || (sh_status.metadata_self_heal == AFR_SELF_HEAL_FAILED) +                    || (sh_status.data_self_heal == AFR_SELF_HEAL_FAILED) +                    || (sh_status.entry_self_heal == AFR_SELF_HEAL_FAILED)) +                sh_failed = 1; +        } else if (type == AFR_CHECK_SPECIFIC) { +                sh_type_in_action = sh->sh_type_in_action; +                switch (sh_type_in_action) { +                        case AFR_SELF_HEAL_GFID_OR_MISSING_ENTRY: +                             status = sh_status.gfid_or_missing_entry_self_heal; +                                break; +                        case AFR_SELF_HEAL_METADATA: +                                status = sh_status.metadata_self_heal; +                                break; +                        case AFR_SELF_HEAL_ENTRY: +                                status = sh_status.entry_self_heal; +                                break; +                        case AFR_SELF_HEAL_DATA: +                                status = sh_status.data_self_heal; +                                break; +                        case AFR_SELF_HEAL_INVALID: +                                status = AFR_SELF_HEAL_NOT_ATTEMPTED; +                                break; +                } +                if (status == AFR_SELF_HEAL_FAILED) +                        sh_failed = 1; + +        } + +out: +        return sh_failed; +} + +char * +get_sh_completion_status 
(afr_self_heal_status status) +{ + +        char *not_attempted       = " is not attempted"; +        char *failed              = " failed"; +        char *started             = " is started"; +        char *sync_begin          = " is successfully completed"; +        char *result              = " has unknown status"; + +        switch (status) +        { +                case AFR_SELF_HEAL_NOT_ATTEMPTED: +                        result = not_attempted; +                        break; +                case AFR_SELF_HEAL_FAILED: +                        result = failed; +                        break; +                case AFR_SELF_HEAL_STARTED: +                        result = started; +                        break; +                case AFR_SELF_HEAL_SYNC_BEGIN: +                        result = sync_begin; +                        break; +        } + +        return result; + +} + +void +afr_log_self_heal_completion_status (afr_local_t *local, gf_loglevel_t loglvl) +{ + +        char sh_log[4096]              = {0}; +        afr_self_heal_t *sh            = &local->self_heal; +        afr_sh_status_for_all_type   all_status = sh->afr_all_sh_status; +        xlator_t      *this            = NULL; +        size_t        off              = 0; +        int           data_sh          = 0; +        int           metadata_sh      = 0; +        int           print_log        = 0; + +        this = THIS; + +        ADD_FMT_STRING (sh_log, off, "gfid or missing entry", +                        all_status.gfid_or_missing_entry_self_heal, print_log); +        ADD_FMT_STRING_SYNC (sh_log, off, "metadata", +                             all_status.metadata_self_heal, print_log); +        if (sh->background) { +                ADD_FMT_STRING_SYNC (sh_log, off, "backgroung data", +                                all_status.data_self_heal, print_log); +        } else { +                ADD_FMT_STRING_SYNC (sh_log, off, "foreground data", +                                
all_status.data_self_heal, print_log); +        } +        ADD_FMT_STRING_SYNC (sh_log, off, "entry", all_status.entry_self_heal, +                             print_log); + +        if (AFR_SELF_HEAL_SYNC_BEGIN == all_status.data_self_heal && +	    strcmp (sh->data_sh_info, "") && sh->data_sh_info ) +                data_sh = 1; +        if (AFR_SELF_HEAL_SYNC_BEGIN == all_status.metadata_self_heal && +	    strcmp (sh->metadata_sh_info, "") && sh->metadata_sh_info) +                metadata_sh = 1; + +        if (!print_log) +                return; + +        gf_log (this->name, loglvl, "%s %s %s on %s", sh_log, +                ((data_sh == 1) ? sh->data_sh_info : ""), +                ((metadata_sh == 1) ? sh->metadata_sh_info : ""), +                local->loc.path); +} diff --git a/xlators/cluster/afr-v1/src/afr-self-heal-common.h b/xlators/cluster/afr-v1/src/afr-self-heal-common.h new file mode 100644 index 000000000..473264776 --- /dev/null +++ b/xlators/cluster/afr-v1/src/afr-self-heal-common.h @@ -0,0 +1,144 @@ +/* +  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> +  This file is part of GlusterFS. + +  This file is licensed to you under your choice of the GNU Lesser +  General Public License, version 3 or any later version (LGPLv3 or +  later), or the GNU General Public License, version 2 (GPLv2), in all +  cases as published by the Free Software Foundation. 
+*/ + +#ifndef __AFR_SELF_HEAL_COMMON_H__ +#define __AFR_SELF_HEAL_COMMON_H__ + +#define FILE_HAS_HOLES(buf) (((buf)->ia_size) > ((buf)->ia_blocks * 512)) +#define AFR_SH_MIN_PARTICIPANTS 2 + +typedef enum { +        AFR_LOOKUP_FAIL_CONFLICTS = 1, +        AFR_LOOKUP_FAIL_MISSING_GFIDS = 2, +} afr_lookup_flags_t; + +int +afr_sh_select_source (int sources[], int child_count); + +int +afr_sh_source_count (int sources[], int child_count); + +void +afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this); + +void +afr_sh_print_split_brain_log (int32_t *pending_matrix[], xlator_t *this, +                              const char *loc); + +int +afr_build_pending_matrix (char **pending_key, int32_t **pending_matrix, +                          unsigned char *ignorant_subvols, +                          dict_t *xattr[], afr_transaction_type type, +                          size_t child_count); + +void +afr_sh_pending_to_delta (afr_private_t *priv, dict_t **xattr, +                         int32_t *delta_matrix[], unsigned char success[], +                         int child_count, afr_transaction_type type); + +int +afr_mark_sources (xlator_t *this, int32_t *sources, int32_t **pending_matrix, +                  struct iatt *bufs, afr_self_heal_type type, +                  int32_t *success_children, int32_t *subvol_status); + +int +afr_sh_delta_to_xattr (xlator_t *this, +                       int32_t *delta_matrix[], dict_t *xattr[], +		       int child_count, afr_transaction_type type); + +void +afr_self_heal_type_str_get (afr_self_heal_t *self_heal_p, char *str, +                            size_t size); + +afr_self_heal_type +afr_self_heal_type_for_transaction (afr_transaction_type type); + +int +afr_build_sources (xlator_t *this, dict_t **xattr, struct iatt *bufs, +                   int32_t **pending_matrix, int32_t *sources, +                   int32_t *success_children, afr_transaction_type type, +                   int32_t *subvol_status, gf_boolean_t 
ignore_ignorant); +void +afr_sh_common_reset (afr_self_heal_t *sh, unsigned int child_count); + +void +afr_sh_common_lookup_resp_handler (call_frame_t *frame, void *cookie, +                                   xlator_t *this, +                                   int32_t op_ret, int32_t op_errno, +                                   inode_t *inode, struct iatt *buf, +                                   dict_t *xattr, struct iatt *postparent, +                                   loc_t *loc); + +int +afr_sh_common_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, +                      afr_lookup_done_cbk_t lookup_cbk, uuid_t uuid, +                      int32_t flags, dict_t *xdata); +int +afr_sh_entry_expunge_remove (call_frame_t *expunge_frame, xlator_t *this, +                             int active_src, struct iatt *buf, +                             struct iatt *parentbuf); +int +afr_sh_entrylk (call_frame_t *frame, xlator_t *this, loc_t *loc, +                char *base_name, afr_lock_cbk_t lock_cbk); +int +afr_sh_entry_impunge_create (call_frame_t *impunge_frame, xlator_t *this, +                             int child_index); +int +afr_sh_data_unlock (call_frame_t *frame, xlator_t *this, char *dom, +                    afr_lock_cbk_t lock_cbk); +afr_local_t * +afr_self_heal_local_init (afr_local_t *l, xlator_t *this); +int +afr_sh_data_lock (call_frame_t *frame, xlator_t *this, +                  off_t start, off_t len, gf_boolean_t block, char *dom, +                  afr_lock_cbk_t success_handler, +                  afr_lock_cbk_t failure_handler); +void +afr_sh_set_error (afr_self_heal_t *sh, int32_t op_errno); +void +afr_sh_mark_source_sinks (call_frame_t *frame, xlator_t *this); +typedef int +(*afr_fxattrop_cbk_t) (call_frame_t *frame, void *cookie, +                       xlator_t *this, int32_t op_ret, int32_t op_errno, +                       dict_t *xattr, dict_t *xdata); +int +afr_build_child_loc (xlator_t *this, loc_t *child, loc_t *parent, char 
*name); +int +afr_impunge_frame_create (call_frame_t *frame, xlator_t *this, +                          int active_source, call_frame_t **impunge_frame); +void +afr_sh_reset (call_frame_t *frame, xlator_t *this); + +void +afr_children_intersection_get (int32_t *set1, int32_t *set2, +                               int *intersection, unsigned int child_count); +int +afr_get_no_xattr_dir_read_child (xlator_t *this, int32_t *success_children, +                                 struct iatt *bufs); +int +afr_sh_erase_pending (call_frame_t *frame, xlator_t *this, +                      afr_transaction_type type, afr_fxattrop_cbk_t cbk, +                      int (*finish)(call_frame_t *frame, xlator_t *this)); + +void +afr_set_local_for_unhealable (afr_local_t *local); + +int +is_self_heal_failed (afr_self_heal_t *sh, afr_sh_fail_check_type type); + +void +afr_set_self_heal_status (afr_self_heal_t *sh, afr_self_heal_status status); + +void +afr_log_self_heal_completion_status (afr_local_t *local, gf_loglevel_t  logl); + +char* +afr_get_pending_matrix_str (int32_t *pending_matrix[], xlator_t *this); +#endif /* __AFR_SELF_HEAL_COMMON_H__ */ diff --git a/xlators/cluster/afr-v1/src/afr-self-heal-data.c b/xlators/cluster/afr-v1/src/afr-self-heal-data.c new file mode 100644 index 000000000..9de26ee56 --- /dev/null +++ b/xlators/cluster/afr-v1/src/afr-self-heal-data.c @@ -0,0 +1,1754 @@ +/* +  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> +  This file is part of GlusterFS. + +  This file is licensed to you under your choice of the GNU Lesser +  General Public License, version 3 or any later version (LGPLv3 or +  later), or the GNU General Public License, version 2 (GPLv2), in all +  cases as published by the Free Software Foundation. 
+*/ + +#include <libgen.h> +#include <unistd.h> +#include <fnmatch.h> +#include <sys/time.h> +#include <stdlib.h> +#include <signal.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "afr.h" +#include "dict.h" +#include "xlator.h" +#include "hashfn.h" +#include "logging.h" +#include "stack.h" +#include "list.h" +#include "call-stub.h" +#include "defaults.h" +#include "common-utils.h" +#include "compat-errno.h" +#include "compat.h" +#include "byte-order.h" + +#include "afr-transaction.h" +#include "afr-self-heal.h" +#include "afr-self-heal-common.h" +#include "afr-self-heal-algorithm.h" + +int +afr_sh_data_fail (call_frame_t *frame, xlator_t *this); + +static inline gf_boolean_t +afr_sh_data_proceed (unsigned int success_count) +{ +        return (success_count >= AFR_SH_MIN_PARTICIPANTS); +} + +extern int +sh_loop_finish (call_frame_t *loop_frame, xlator_t *this); + +int +afr_post_sh_big_lock_success (call_frame_t *frame, xlator_t *this); + +int +afr_post_sh_big_lock_failure (call_frame_t *frame, xlator_t *this); + +int +afr_sh_data_finish (call_frame_t *frame, xlator_t *this); + +int +afr_sh_data_done (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t     *local = NULL; +        afr_self_heal_t *sh = NULL; + +        local = frame->local; +        sh = &local->self_heal; + +        sh->completion_cbk (frame, this); + +        return 0; +} + + +int +afr_sh_data_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                       int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ +        afr_local_t   *local       = NULL; +        afr_private_t *priv        = NULL; +        int            call_count  = 0; +        int            child_index = (long) cookie; + +        local = frame->local; +        priv = this->private; + +        LOCK (&frame->lock); +        { +                if (op_ret == -1) { +                        gf_log (this->name, GF_LOG_ERROR, +                        
        "flush failed on %s on subvolume %s: %s", +                                local->loc.path, priv->children[child_index]->name, +                                strerror (op_errno)); +                } +        } +        UNLOCK (&frame->lock); + +        call_count = afr_frame_return (frame); + +        if (call_count == 0) { +                afr_sh_data_done (frame, this); +        } + +        return 0; +} + +int +afr_sh_data_close (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t     *local      = NULL; +        afr_private_t   *priv       = NULL; +        afr_self_heal_t *sh         = NULL; +        int              i          = 0; +        int              call_count = 0; + +        local = frame->local; +        sh    = &local->self_heal; +        priv  = this->private; + +        if (!sh->healing_fd) { +                //This happens when file is non-reg +                afr_sh_data_done (frame, this); +                return 0; +        } +        call_count        = afr_set_elem_count_get (sh->success, +                                                    priv->child_count); +        local->call_count = call_count; + +        if (call_count == 0) { +                afr_sh_data_done (frame, this); +                return 0; +        } + +        for (i = 0; i < priv->child_count; i++) { +                if (!sh->success[i]) +                        continue; +                gf_log (this->name, GF_LOG_DEBUG, +                        "closing fd of %s on %s", +                        local->loc.path, priv->children[i]->name); + +                STACK_WIND_COOKIE (frame, afr_sh_data_flush_cbk, +                                   (void *) (long) i, +                                   priv->children[i], +                                   priv->children[i]->fops->flush, +                                   sh->healing_fd, NULL); + +                if (!--call_count) +                        break; +        } + +        return 0; +} + +int 
+afr_sh_dom_unlock (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t     *local = NULL; +        afr_self_heal_t *sh    = NULL; +        afr_private_t   *priv  = NULL; + +        local = frame->local; +        sh    = &local->self_heal; +        priv  = this->private; + +        if (sh->sh_dom_lock_held) +                afr_sh_data_unlock (frame, this, priv->sh_domain, +                                    afr_sh_data_close); +        else +                afr_sh_data_close (frame, this); +        return 0; +} + +int +afr_sh_data_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                         int32_t op_ret, int32_t op_errno, struct iatt *statpre, +                         struct iatt *statpost, dict_t *xdata) +{ + +        afr_local_t   *local       = NULL; +        afr_private_t *priv        = NULL; +        int            call_count  = 0; +        int            child_index = (long) cookie; + +        local = frame->local; +        priv = this->private; + +        LOCK (&frame->lock); +        { +                if (op_ret == -1) { +                        gf_log (this->name, GF_LOG_INFO, +                                "setattr failed on %s on subvolume %s: %s", +                                local->loc.path, priv->children[child_index]->name, +                                strerror (op_errno)); +                } +        } +        UNLOCK (&frame->lock); + +        call_count = afr_frame_return (frame); + +        if (call_count == 0) { +                afr_sh_data_finish (frame, this); +        } + +        return 0; +} + +int +afr_sh_data_setattr (call_frame_t *frame, xlator_t *this, struct iatt* stbuf) +{ +        afr_local_t     *local      = NULL; +        afr_private_t   *priv       = NULL; +        afr_self_heal_t *sh         = NULL; +        int              i          = 0; +        int              call_count = 0; +        int32_t          valid      = 0; + +        local = frame->local; +        sh    = 
&local->self_heal; +        priv  = this->private; + +        valid = (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME); + +        call_count        = afr_set_elem_count_get (sh->success, +                                                    priv->child_count); +        local->call_count = call_count; + +        if (call_count == 0) { +                GF_ASSERT (0); +                afr_sh_data_finish (frame, this); +                return 0; +        } + +        for (i = 0; i < priv->child_count; i++) { +                if (!sh->success[i]) +                        continue; + +                STACK_WIND_COOKIE (frame, afr_sh_data_setattr_cbk, +                                   (void *) (long) i, +                                   priv->children[i], +                                   priv->children[i]->fops->setattr, +                                   &local->loc, stbuf, valid, NULL); + +                if (!--call_count) +                        break; +        } + +        return 0; +} + +int +afr_sh_data_setattr_fstat_cbk (call_frame_t *frame, void *cookie, +                               xlator_t *this, int32_t op_ret, int32_t op_errno, +                               struct iatt *buf, dict_t *xdata) +{ +        afr_local_t     *local = NULL; +        afr_self_heal_t *sh = NULL; +        int child_index = (long) cookie; + +        local = frame->local; +        sh = &local->self_heal; + +        GF_ASSERT (sh->source == child_index); +        if (op_ret != -1) { +                sh->buf[child_index] = *buf; +                afr_sh_data_setattr (frame, this, buf); +        } else { +                gf_log (this->name, GF_LOG_ERROR, "%s: Failed to set " +                        "time-stamps after self-heal", local->loc.path); +                afr_sh_data_fail (frame, this); +        } + +        return 0; +} + +/* + * If there are any writes after the self-heal is triggered then the + * stbuf stored in local->self_heal.buf[] will be invalid so we do one more + * stat 
on the source and then set the [am]times + */ +int +afr_sh_set_timestamps (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t     *local      = NULL; +        afr_private_t   *priv       = NULL; +        afr_self_heal_t *sh         = NULL; + +        local = frame->local; +        sh    = &local->self_heal; +        priv  = this->private; + +        STACK_WIND_COOKIE (frame, afr_sh_data_setattr_fstat_cbk, +                           (void *) (long) sh->source, +                           priv->children[sh->source], +                           priv->children[sh->source]->fops->fstat, +                           sh->healing_fd, NULL); +        return 0; +} + +//Fun fact, lock_cbk is being used for both lock & unlock +int +afr_sh_data_unlock (call_frame_t *frame, xlator_t *this, char *dom, +                    afr_lock_cbk_t lock_cbk) +{ +        afr_local_t         *local    = NULL; +        afr_internal_lock_t *int_lock = NULL; +        afr_self_heal_t     *sh       = NULL; +        afr_private_t       *priv     = NULL; +        int                 ret       = 0; + +        local    = frame->local; +        int_lock = &local->internal_lock; +        sh       = &local->self_heal; +        priv     = this->private; + +        if (strcmp (dom, this->name) == 0) { +                sh->data_lock_held = _gf_false; +        } else if (strcmp (dom, priv->sh_domain) == 0) { +                sh->sh_dom_lock_held = _gf_false; +        } else { +                ret = -1; +                goto out; +        } +        int_lock->lock_cbk = lock_cbk; +        int_lock->domain = dom; +        afr_unlock (frame, this); + +out: +        if (ret) { +                int_lock->lock_op_ret = -1; +                int_lock->lock_cbk (frame, this); +        } +        return 0; +} + +int +afr_sh_data_finish (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t     *local = NULL; +        afr_self_heal_t *sh    = NULL; + +        local = frame->local; +        sh = 
&local->self_heal; + +        gf_log (this->name, GF_LOG_DEBUG, +                "finishing data selfheal of %s", local->loc.path); + +        if (sh->data_lock_held) +                afr_sh_data_unlock (frame, this, this->name, afr_sh_dom_unlock); +        else +                afr_sh_dom_unlock (frame, this); + +        return 0; +} + +int +afr_sh_data_fail (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t   *local = NULL; +        afr_self_heal_t *sh = NULL; + +        local = frame->local; +        sh = &local->self_heal; + +        gf_log (this->name, GF_LOG_DEBUG, +                "finishing failed data selfheal of %s", local->loc.path); + +        afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); +        afr_sh_data_finish (frame, this); +        return 0; +} + +int +afr_sh_data_erase_pending_cbk (call_frame_t *frame, void *cookie, +                               xlator_t *this, int32_t op_ret, +                               int32_t op_errno, dict_t *xattr, dict_t *xdata) +{ +        int             call_count = 0; +        afr_local_t     *local = NULL; +        afr_self_heal_t *sh = NULL; +        afr_private_t   *priv = NULL; +        int32_t         child_index = (long) cookie; + +        priv  = this->private; +        local = frame->local; +        sh    = &local->self_heal; +        if (op_ret < 0) { +                gf_log (this->name, GF_LOG_ERROR, "Erasing of pending change " +                        "log failed on %s for subvol %s, reason: %s", +                        local->loc.path, priv->children[child_index]->name, +                        strerror (op_errno)); +                afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); +        } + +        call_count = afr_frame_return (frame); + +        if (call_count == 0) { +                if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { +                        if (sh->old_loop_frame) +                                sh_loop_finish (sh->old_loop_frame, this); +            
            sh->old_loop_frame = NULL; +                        afr_sh_data_fail (frame, this); +                        goto out; +                } +                if (!IA_ISREG (sh->type)) { +                        afr_sh_data_finish (frame, this); +                        goto out; +                } +                GF_ASSERT (sh->old_loop_frame); +                afr_sh_data_lock (frame, this, 0, 0, _gf_true, this->name, +                                  afr_post_sh_big_lock_success, +                                  afr_post_sh_big_lock_failure); +        } +out: +        return 0; +} + +int +afr_sh_data_erase_pending (call_frame_t *frame, xlator_t *this) +{ +        afr_sh_erase_pending (frame, this, AFR_DATA_TRANSACTION, +                              afr_sh_data_erase_pending_cbk, +                              afr_sh_data_finish); +        return 0; +} + +int +afr_sh_data_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                       int op_ret, int op_errno, struct iatt *pre, +                       struct iatt *post, dict_t *xdata) +{ +        afr_local_t     *local      = NULL; +        afr_private_t   *priv       = NULL; +        afr_self_heal_t *sh         = NULL; +        int             call_count  = 0; +        int             child_index = (long) cookie; + +        local = frame->local; +        priv = this->private; +        sh   = &local->self_heal; + +        if (op_ret < 0) { +                gf_log (this->name, GF_LOG_ERROR, "%s: Failed to fsync on " +                        "%s - %s", local->loc.path, +                        priv->children[child_index]->name, strerror (op_errno)); +                LOCK (&frame->lock); +                { +                        afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); +                } +                UNLOCK (&frame->lock); +                if (sh->old_loop_frame) +                        sh_loop_finish (sh->old_loop_frame, this); +                
sh->old_loop_frame = NULL; +        } + +        call_count = afr_frame_return (frame); +        if (call_count == 0) { +                if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) +                        afr_sh_data_fail (frame, this); +                else +                        afr_sh_data_erase_pending (frame, this); +        } +        return 0; +} + +/* + * Before erasing xattrs, make sure the data is written to disk + */ +int +afr_sh_data_fsync (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t     *local     = NULL; +        afr_private_t   *priv      = NULL; +        afr_self_heal_t *sh        = NULL; +        int             i          = 0; +        int             call_count = 0; + +        local = frame->local; +        priv = this->private; +        sh   = &local->self_heal; + +        call_count        = sh->active_sinks; +        if (call_count == 0) { +                afr_sh_data_erase_pending (frame, this); +                return 0; +        } + +        local->call_count = call_count; +        for (i = 0; i < priv->child_count; i++) { +                if (!sh->success[i] || sh->sources[i]) +                        continue; + +                STACK_WIND_COOKIE (frame, afr_sh_data_fsync_cbk, +                                   (void *) (long) i, priv->children[i], +                                   priv->children[i]->fops->fsync, +                                   sh->healing_fd, 1, NULL); +        } + +        return 0; +} + +static struct afr_sh_algorithm * +sh_algo_from_name (xlator_t *this, char *name) +{ +        int i = 0; + +        if (name == NULL) +                goto out; + +        while (afr_self_heal_algorithms[i].name) { +                if (!strcmp (name, afr_self_heal_algorithms[i].name)) { +                        return &afr_self_heal_algorithms[i]; +                } + +                i++; +        } + +out: +        return NULL; +} + + +static int +sh_zero_byte_files_exist (afr_local_t *local, int 
child_count) +{ +        int             i = 0; +        int             ret = 0; +        afr_self_heal_t *sh = NULL; + +        sh = &local->self_heal; +        for (i = 0; i < child_count; i++) { +                if (!local->child_up[i] || sh->child_errno[i]) +                        continue; +                if (sh->buf[i].ia_size == 0) { +                        ret = 1; +                        break; +                } +        } + +        return ret; +} + + +struct afr_sh_algorithm * +afr_sh_data_pick_algo (call_frame_t *frame, xlator_t *this) +{ +        afr_private_t *           priv  = NULL; +        struct afr_sh_algorithm * algo  = NULL; +        afr_local_t *             local = NULL; +        afr_self_heal_t *         sh    = NULL; + +        priv  = this->private; +        local = frame->local; +        sh    = &local->self_heal; +        algo  = sh_algo_from_name (this, priv->data_self_heal_algorithm); + +        if (algo == NULL) { +                /* option not set, so fall back on heuristics */ + +                if (sh_zero_byte_files_exist (local, priv->child_count) +                    || (sh->file_size <= (priv->data_self_heal_window_size * +                                          this->ctx->page_size))) { + +                        /* +                         * If the file does not exist on one of the subvolumes, +                         * or a zero-byte file exists (created by entry self-heal) +                         * the entire content has to be copied anyway, so there +                         * is no benefit from using the "diff" algorithm. +                         * +                         * If the file size is about the same as page size, +                         * the entire file can be read and written with a few +                         * (pipelined) STACK_WINDs, which will be faster +                         * than "diff" which has to read checksums and then +                         * read and write. 
+                         */ + +                        algo = sh_algo_from_name (this, "full"); + +                } else { +                        algo = sh_algo_from_name (this, "diff"); +                } +        } + +        return algo; +} + + +int +afr_sh_data_sync_prepare (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t     *local = NULL; +        afr_self_heal_t *sh = NULL; +        struct afr_sh_algorithm *sh_algo = NULL; + +        local = frame->local; +        sh = &local->self_heal; + +        sh->algo_completion_cbk = afr_sh_data_fsync; +        sh->algo_abort_cbk      = afr_sh_data_fail; + +        sh_algo = afr_sh_data_pick_algo (frame, this); + +        sh->algo = sh_algo; +        sh_algo->fn (frame, this); + +        return 0; +} + +int +afr_sh_data_trim_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                      int32_t op_ret, int32_t op_errno, struct iatt *prebuf, +                      struct iatt *postbuf, dict_t *xdata) +{ +        int              call_count = 0; +        int              child_index = 0; +        afr_private_t    *priv = NULL; +        afr_local_t      *local  = NULL; +        afr_self_heal_t  *sh = NULL; + +        priv  = this->private; +        local = frame->local; +        sh    = &local->self_heal; + +        child_index = (long) cookie; + +        LOCK (&frame->lock); +        { +                if (op_ret == -1) { +                        gf_log (this->name, GF_LOG_ERROR, +                                "ftruncate of %s on subvolume %s failed (%s)", +                                local->loc.path, +                                priv->children[child_index]->name, +                                strerror (op_errno)); +                        afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); +                } else { +                        gf_log (this->name, GF_LOG_DEBUG, +                                "ftruncate of %s on subvolume %s completed", +                       
         local->loc.path, +                                priv->children[child_index]->name); +                } +        } +        UNLOCK (&frame->lock); + +        call_count = afr_frame_return (frame); + +        if (call_count == 0) { +                if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) +                        afr_sh_data_fail (frame, this); +                else +                        afr_sh_data_sync_prepare (frame, this); +        } + +        return 0; +} + + +int +afr_sh_data_trim_sinks (call_frame_t *frame, xlator_t *this) +{ +        afr_private_t * priv = NULL; +        afr_local_t * local  = NULL; +        afr_self_heal_t *sh  = NULL; +        int             *sources = NULL; +        int              call_count = 0; +        int              i = 0; + + +        priv = this->private; +        local = frame->local; +        sh = &local->self_heal; + +        sources = sh->sources; +        call_count = sh->active_sinks; + +        local->call_count = call_count; + +        for (i = 0; i < priv->child_count; i++) { +                if (sources[i] || !local->child_up[i]) +                        continue; + +                STACK_WIND_COOKIE (frame, afr_sh_data_trim_cbk, +                                   (void *) (long) i, +                                   priv->children[i], +                                   priv->children[i]->fops->ftruncate, +                                   sh->healing_fd, sh->file_size, +                                   NULL); + +                if (!--call_count) +                        break; +        } + +        return 0; +} + +int +afr_sh_inode_set_read_ctx (afr_self_heal_t *sh, xlator_t *this) +{ +        afr_private_t   *priv = NULL; +        int             ret = 0; +        int             i = 0; + +        priv = this->private; +        sh->source = afr_sh_select_source (sh->sources, priv->child_count); +        if (sh->source < 0) { +                ret = -1; +                goto out; +        
} + +        /* detect changes not visible through pending flags -- JIC */ +        for (i = 0; i < priv->child_count; i++) { +                if (i == sh->source || sh->child_errno[i]) +                        continue; + +                if (SIZE_DIFFERS (&sh->buf[i], &sh->buf[sh->source])) +                        sh->sources[i] = 0; +        } + +        afr_reset_children (sh->fresh_children, priv->child_count); +        afr_get_fresh_children (sh->success_children, sh->sources, +                                sh->fresh_children, priv->child_count); +        afr_inode_set_read_ctx (this, sh->inode, sh->source, +                                sh->fresh_children); +out: +        return ret; +} + +char* +afr_get_sizes_str (afr_local_t *local, struct iatt *bufs, xlator_t *this) +{ +        afr_private_t *priv = NULL; +        int           i     = 0; +        char          num[1024] = {0}; +        size_t        len = 0; +        char          *sizes_str = NULL; +        size_t        off = 0; +        char          *fmt_str = "%llu bytes on %s, "; +        char          *child_down =  " %s,"; +        char          *child_unknown = " %s,"; +        int           down_child_present = 0; +        int           down_count = 0; +        int           unknown_count = 0; +        int           unknown_child_present = 0; +        char          *down_subvol_1 = " down subvolume is "; +        char          *unknown_subvol_1 = " unknown subvolume is "; +        char          *down_subvol_2 = " down subvolumes are "; +        char          *unknown_subvol_2 = " unknown subvolumes are "; + +        priv = this->private; + +        for (i = 0; i < priv->child_count; i++) { +                if (local->child_up[i] == 1) { +                        len += snprintf (num, sizeof (num), fmt_str, +                                         (unsigned long long) bufs[i].ia_size, +                                         priv->children[i]->name); +                } else if 
(local->child_up[i] == 0) { +                        len += snprintf (num, sizeof (num), child_down, +                                         priv->children[i]->name); +                        if (!down_child_present) +                                down_child_present = 1; +                        down_count ++; +                } else if (local->child_up[i] == -1) { +                        len += snprintf (num, sizeof (num), child_unknown, +                                         priv->children[i]->name); +                        if (!unknown_child_present) +                                unknown_child_present = 1; +                        unknown_count++; +                } + +        } + +        if (down_child_present) { +                if (down_count > 1) +                        len += snprintf (num, sizeof (num), "%s", +                                         down_subvol_2); +                else +                        len += snprintf (num, sizeof (num), "%s", +                                        down_subvol_1); +        } +        if (unknown_child_present) { +                if (unknown_count > 1) +                        len += snprintf (num, sizeof (num), "%s", +                                         unknown_subvol_2); +                else +                        len += snprintf (num, sizeof (num), "%s", +                                         unknown_subvol_1); +        } + +        len++;//for '\0' + +        sizes_str = GF_CALLOC (len, sizeof (char), gf_common_mt_char); + +        if (!sizes_str) +                return NULL; + +        for (i = 0; i < priv->child_count; i++) { +                if (local->child_up[i] == 1) { +                        off += snprintf (sizes_str + off, len - off, fmt_str, +                                         (unsigned long long) bufs[i].ia_size, +                                         priv->children[i]->name); +                } +        } + +        if (down_child_present) { +                if 
(down_count > 1) { +                        off += snprintf (sizes_str + off, len - off, "%s", +                                         down_subvol_2); +                } else { +                        off += snprintf (sizes_str + off, len - off, "%s", +                                         down_subvol_1); +                } +        } + +        for (i = 0; i < priv->child_count; i++) { +                if (local->child_up[i] == 0) { +                        off += snprintf (sizes_str + off, len - off, child_down, +                                         priv->children[i]->name); +                } +        } + +        if (unknown_child_present) { +                if (unknown_count > 1) { +                        off += snprintf (sizes_str + off, len - off, "%s", +                                        unknown_subvol_2); +                } else { +                        off += snprintf (sizes_str + off, len - off, "%s", +                                         unknown_subvol_1); +                } +        } + +        for (i = 0; i < priv->child_count; i++) { +                if (local->child_up[i] == -1) { +                        off += snprintf (sizes_str + off, len - off, +                                         child_unknown, +                                         priv->children[i]->name); + +                } +        } + +        return sizes_str; +} + +char* +afr_get_sinks_str (xlator_t *this, afr_local_t *local, afr_self_heal_t *sh) +{ +        afr_private_t   *priv = NULL; +        int             i = 0; +        char            num[1024] = {0}; +        size_t          len = 0; +        char            *sinks_str = NULL; +        char            *temp_str = " to sinks "; +        char            *str_format = " %s,"; +        char            off = 0; + +        priv = this->private; + +        len += snprintf (num, sizeof (num), "%s", temp_str); +        for (i = 0; i < priv->child_count; i++) { +                if ((sh->sources[i] == 0) 
&& (local->child_up[i] == 1)) { +                        len += snprintf (num, sizeof (num), str_format, +                                         priv->children[i]->name); +                } +        } + +        len ++; + +        sinks_str = GF_CALLOC (len, sizeof (char), gf_common_mt_char); + +        if (!sinks_str) +                return NULL; + +        off += snprintf (sinks_str + off, len - off, "%s", temp_str); + +        for (i = 0; i < priv->child_count; i++) { +                if ((sh->sources[i] == 0) && (local->child_up[i] == 1)) { +                        off += snprintf (sinks_str + off, len - off, +                                         str_format, +                                         priv->children[i]->name); +                } +        } + +        return sinks_str; + +} + + +void +afr_set_data_sh_info_str (afr_local_t *local, afr_self_heal_t *sh, xlator_t *this) +{ +        char            *pending_matrix_str = NULL; +        char            *sizes_str = NULL; +        char            *sinks_str = NULL; +        afr_private_t   *priv = NULL; + +        priv = this->private; + +        pending_matrix_str = afr_get_pending_matrix_str (sh->pending_matrix, +                                                         this); +        if (!pending_matrix_str) +                pending_matrix_str = ""; + +        sizes_str = afr_get_sizes_str (local, sh->buf, this); +        if (!sizes_str) +                sizes_str = ""; + +        sinks_str = afr_get_sinks_str (this, local, sh); +        if (!sinks_str) +                sinks_str = ""; + +        gf_asprintf (&sh->data_sh_info, " data self heal from %s %s with " +                     "%s data %s", priv->children[sh->source]->name, sinks_str, +                     sizes_str, pending_matrix_str); + +        if (pending_matrix_str && strcmp (pending_matrix_str, "")) +                GF_FREE (pending_matrix_str); + +        if (sizes_str && strcmp (sizes_str, "")) +                GF_FREE 
(sizes_str); +} + +void +afr_sh_data_fix (call_frame_t *frame, xlator_t *this) +{ +        int              source = 0; +        afr_local_t     *local      = NULL; +        afr_self_heal_t *sh = NULL; +        afr_private_t   *priv = NULL; + +        local = frame->local; +        sh = &local->self_heal; +        priv = this->private; + +        source     = sh->source; +        sh->block_size = this->ctx->page_size; +        sh->file_size  = sh->buf[source].ia_size; + +        if (FILE_HAS_HOLES (&sh->buf[source])) +                sh->file_has_holes = 1; + +        if (sh->background && sh->unwind && !sh->unwound) { +                sh->unwind (sh->orig_frame, this, sh->op_ret, sh->op_errno, +                            is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)); +                sh->unwound = _gf_true; +        } + +        afr_sh_mark_source_sinks (frame, this); +        if (sh->active_sinks == 0) { +                gf_log (this->name, GF_LOG_INFO, +                        "no active sinks for performing self-heal on file %s", +                        local->loc.path); +                afr_sh_data_finish (frame, this); +                return; +        } + +        gf_log (this->name, GF_LOG_DEBUG, +                "self-healing file %s from subvolume %s to %d other", +                local->loc.path, priv->children[sh->source]->name, +                sh->active_sinks); + +        sh->actual_sh_started = _gf_true; +        afr_set_self_heal_status (sh, AFR_SELF_HEAL_SYNC_BEGIN); +        afr_sh_data_trim_sinks (frame, this); +} + +int +afr_sh_data_fxattrop_fstat_done (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t     *local      = NULL; +        afr_self_heal_t *sh = NULL; +        afr_private_t   *priv = NULL; +        int              nsources = 0; +        int              ret = 0; +        int             *old_sources = NULL; +        int             tstamp_source = 0; +        int             i = 0; + +        local = frame->local; +     
   sh = &local->self_heal; +        priv = this->private; + +        gf_log (this->name, GF_LOG_DEBUG, "Pending matrix for: %s", +                lkowner_utoa (&frame->root->lk_owner)); +        if (sh->sync_done) { +                //store sources before sync so that mtime can be set using the +                //iatt buf from one of them. +                old_sources = alloca (priv->child_count*sizeof (*old_sources)); +                memcpy (old_sources, sh->sources, +                        priv->child_count * sizeof (*old_sources)); +        } + +        nsources = afr_build_sources (this, sh->xattr, sh->buf, sh->pending_matrix, +                                      sh->sources, sh->success_children, +                                      AFR_DATA_TRANSACTION, NULL, _gf_true); +        if ((nsources == -1) +            && (priv->favorite_child != -1) +            && (sh->child_errno[priv->favorite_child] == 0)) { + +                gf_log (this->name, GF_LOG_DEBUG, +                        "Picking favorite child %s as authentic source to " +                        "resolve conflicting data of %s", +                        priv->children[priv->favorite_child]->name, +                        local->loc.path); + +                sh->sources[priv->favorite_child] = 1; + +                nsources = afr_sh_source_count (sh->sources, +                                                priv->child_count); +        } + +        if (nsources == -1) { +                afr_sh_print_split_brain_log (sh->pending_matrix, this, +                                              local->loc.path); +                afr_set_split_brain (this, sh->inode, DONT_KNOW, SPB); + +                afr_sh_data_fail (frame, this); +                return 0; +        } + +        afr_set_split_brain (this, sh->inode, DONT_KNOW, NO_SPB); + +        ret = afr_sh_inode_set_read_ctx (sh, this); +        if (ret) { +                gf_log (this->name, GF_LOG_DEBUG, +                        "No active 
sources found."); + +                afr_sh_data_fail (frame, this); +                return 0; +        } + +        if (sh->sync_done) { +                /* Perform setattr from one of the old_sources if possible +                 * Because only they have the correct mtime, the new sources +                 * (i.e. old sinks) have mtime from last writev in sync. +                 */ +                tstamp_source = sh->source; +                for (i = 0; i < priv->child_count; i++) { +                        if (old_sources[i] && sh->sources[i]) +                                tstamp_source = i; +                } +                afr_sh_data_setattr (frame, this, &sh->buf[tstamp_source]); +        } else { +                afr_set_data_sh_info_str (local, sh, this); +                if (nsources == 0) { +                        gf_log (this->name, GF_LOG_DEBUG, +                                "No self-heal needed for %s", +                                local->loc.path); + +                        afr_sh_data_finish (frame, this); +                        return 0; +                } + +                if (sh->do_data_self_heal && +                    afr_data_self_heal_enabled (priv->data_self_heal)) +                        afr_sh_data_fix (frame, this); +                else +                        afr_sh_data_finish (frame, this); +        } +        return 0; +} + +int +afr_lookup_select_read_child_by_txn_type (xlator_t *this, afr_local_t *local, +                                          dict_t **xattr, +                                          afr_transaction_type txn_type, +                                          uuid_t gfid) +{ +        afr_private_t            *priv      = NULL; +        int                      read_child = -1; +        int32_t                  **pending_matrix = NULL; +        int32_t                  *sources         = NULL; +        int32_t                  *success_children   = NULL; +        struct iatt              *bufs 
           = NULL; +        int32_t                  nsources         = 0; +        int32_t                  prev_read_child  = -1; +        int32_t                  config_read_child = -1; +        int32_t                  subvol_status = 0; + +        priv = this->private; +        bufs = local->cont.lookup.bufs; +        success_children = local->cont.lookup.success_children; + +        pending_matrix = local->cont.lookup.pending_matrix; +        sources = local->cont.lookup.sources; +        memset (sources, 0, sizeof (*sources) * priv->child_count); + +        nsources = afr_build_sources (this, xattr, bufs, pending_matrix, +                                      sources, success_children, txn_type, +                                      &subvol_status, _gf_false); +        if (subvol_status & SPLIT_BRAIN) { +                gf_log (this->name, GF_LOG_DEBUG, "%s: Possible split-brain", +                        local->loc.path); +                switch (txn_type) { +                case AFR_DATA_TRANSACTION: +                        local->cont.lookup.possible_spb = _gf_true; +                        nsources = 1; +                        sources[success_children[0]] = 1; +                        break; +                case AFR_ENTRY_TRANSACTION: +                        read_child = afr_get_no_xattr_dir_read_child (this, +                                                             success_children, +                                                             bufs); +                        sources[read_child] = 1; +                        nsources = 1; +                        break; +                default: +                        break; +                } +        } +        if (nsources < 0) +                goto out; + +        prev_read_child = local->read_child_index; +        config_read_child = priv->read_child; +        read_child = afr_select_read_child_from_policy (success_children, +                                                        
priv->child_count, +                                                        prev_read_child, +                                                        config_read_child, +                                                        sources, +                                                        priv->hash_mode, gfid); +out: +        gf_log (this->name, GF_LOG_DEBUG, "returning read_child: %d", +                read_child); +        return read_child; +} + +int +afr_sh_data_fstat_cbk (call_frame_t *frame, void *cookie, +                       xlator_t *this, int32_t op_ret, int32_t op_errno, +                       struct iatt *buf, dict_t *xdata) +{ +        afr_private_t   *priv  = NULL; +        afr_local_t     *local = NULL; +        afr_self_heal_t *sh = NULL; +        int call_count  = -1; +        int child_index = (long) cookie; + +        local = frame->local; +        sh = &local->self_heal; +        priv = this->private; + +        LOCK (&frame->lock); +        { +                if (op_ret != -1) { +                        gf_log (this->name, GF_LOG_TRACE, +                                "fstat of %s on %s succeeded", +                                local->loc.path, +                                priv->children[child_index]->name); + +                        sh->buf[child_index] = *buf; +                        sh->success_children[sh->success_count] = child_index; +                        sh->success_count++; +                } else { +                        gf_log (this->name, GF_LOG_ERROR, "%s: fstat failed " +                                "on %s, reason %s", local->loc.path, +                                priv->children[child_index]->name, +                                strerror (op_errno)); +                        sh->child_errno[child_index] = op_errno; +                } +        } +        UNLOCK (&frame->lock); + +        call_count = afr_frame_return (frame); + +        if (call_count == 0) { +                /* Previous versions of 
glusterfs might have set +                 * the pending data xattrs which need to be erased +                 */ +                if (!afr_sh_data_proceed (sh->success_count)) { +                        gf_log (this->name, GF_LOG_ERROR, "inspecting metadata " +                                "succeeded on < %d children, aborting " +                                "self-heal for %s", AFR_SH_MIN_PARTICIPANTS, +                                local->loc.path); +                        afr_sh_data_fail (frame, this); +                        goto out; +                } +                afr_sh_data_fxattrop_fstat_done (frame, this); +        } +out: +        return 0; +} + + +int +afr_sh_data_fstat (call_frame_t *frame, xlator_t *this) +{ +        afr_self_heal_t *sh    = NULL; +        afr_local_t     *local = NULL; +        afr_private_t   *priv  = NULL; +        int             call_count = 0; +        int             i = 0; +        int             child = 0; +        int32_t         *fstat_children = NULL; + +        priv  = this->private; +        local = frame->local; +        sh    = &local->self_heal; + +        fstat_children = memdup (sh->success_children, +                                 sizeof (*fstat_children) * priv->child_count); +        if (!fstat_children) { +                afr_sh_data_fail (frame, this); +                goto out; +        } +        call_count = sh->success_count; +        local->call_count = call_count; + +        memset (sh->buf, 0, sizeof (*sh->buf) * priv->child_count); +        afr_reset_children (sh->success_children, priv->child_count); +        sh->success_count = 0; +        for (i = 0; i < priv->child_count; i++) { +                child = fstat_children[i]; +                if (child == -1) +                        break; +                STACK_WIND_COOKIE (frame, afr_sh_data_fstat_cbk, +                                   (void *) (long) child, +                                   priv->children[child], +               
                    priv->children[child]->fops->fstat, +                                   sh->healing_fd, NULL); +                --call_count; +        } +        GF_ASSERT (!call_count); +out: +        GF_FREE (fstat_children); +        return 0; +} + +void +afr_sh_common_fxattrop_resp_handler (call_frame_t *frame, void *cookie, +                                     xlator_t *this, int32_t op_ret, +                                     int32_t op_errno, dict_t *xattr) +{ +        afr_private_t   *priv  = NULL; +        afr_local_t     *local = NULL; +        afr_self_heal_t *sh = NULL; +        int child_index = (long) cookie; + +        local = frame->local; +        sh = &local->self_heal; +        priv = this->private; + +        LOCK (&frame->lock); +        { +                if (op_ret != -1) { +                        gf_log (this->name, GF_LOG_TRACE, +                                "fxattrop of %s on %s succeeded", +                                local->loc.path, +                                priv->children[child_index]->name); + +                        sh->xattr[child_index] = dict_ref (xattr); +                        sh->success_children[sh->success_count] = child_index; +                        sh->success_count++; +                } else { +                        gf_log (this->name, GF_LOG_ERROR, "fxattrop of %s " +                                "failed on %s, reason %s", local->loc.path, +                                priv->children[child_index]->name, +                                strerror (op_errno)); +                        sh->child_errno[child_index] = op_errno; +                } +        } +        UNLOCK (&frame->lock); +} + +int +afr_sh_data_fxattrop_cbk (call_frame_t *frame, void *cookie, +                          xlator_t *this, int32_t op_ret, int32_t op_errno, +                          dict_t *xattr, dict_t *xdata) +{ +        int             call_count  = -1; +        afr_local_t     *local  = NULL; +        
afr_self_heal_t *sh     = NULL; + +        local = frame->local; +        sh    = &local->self_heal; + +        afr_sh_common_fxattrop_resp_handler (frame, cookie, this, op_ret, +                                             op_errno, xattr); + +        call_count = afr_frame_return (frame); +        if (call_count == 0) { +                if (!afr_sh_data_proceed (sh->success_count)) { +                        gf_log (this->name, GF_LOG_ERROR, "%s, inspecting " +                                "change log succeeded on < %d children", +                                local->loc.path, AFR_SH_MIN_PARTICIPANTS); +                        afr_sh_data_fail (frame, this); +                        goto out; +                } +                afr_sh_data_fstat (frame, this); +        } +out: +        return 0; +} + + +/* + * Inspect the data change log on every child that is up. + * + * One request dict is built per child; each dict carries, for every + * priv->pending_key[], a freshly allocated array of three zeroed int32 + * values. The dicts are wound with fxattrop (GF_XATTROP_ADD_ARRAY) on + * sh->healing_fd and the replies are collected by + * afr_sh_data_fxattrop_cbk. Any allocation or dict failure aborts the + * heal through afr_sh_data_fail. Always returns 0. + */ +int +afr_sh_data_fxattrop (call_frame_t *frame, xlator_t *this) +{ +        afr_self_heal_t *sh    = NULL; +        afr_local_t     *local = NULL; +        afr_private_t   *priv  = NULL; +        dict_t          **xattr_req = NULL; +        int32_t         *zero_pending = NULL; +        int call_count = 0; +        int i = 0; +        int ret = 0; +	int j; + +        priv  = this->private; +        local = frame->local; +        sh    = &local->self_heal; + +        call_count = afr_up_children_count (local->child_up, +                                            priv->child_count); + +        local->call_count = call_count; + +	xattr_req = GF_CALLOC(priv->child_count, sizeof(*xattr_req), +			      gf_afr_mt_dict_t); +	if (!xattr_req) { +		/* flag the failure; with ret == 0 the out: path would skip +		 * afr_sh_data_fail and nothing would continue the heal */ +		ret = -1; +		goto out; +	} + +	for (i = 0; i < priv->child_count; i++) { +		xattr_req[i] = dict_new(); +		if (!xattr_req[i]) { +			ret = -1; +			goto out; +		} +	} + +	for (i = 0; i < priv->child_count; i++) { +		for (j = 0; j < priv->child_count; j++) { +			zero_pending = GF_CALLOC (3, sizeof (*zero_pending), +						  gf_afr_mt_int32_t); +			if (!zero_pending) { +				ret = -1; +				goto out; +			} +			ret = dict_set_dynptr (xattr_req[i], 
priv->pending_key[j], +					       zero_pending, +					       3 * sizeof (*zero_pending)); +			if (ret < 0) { +				gf_log (this->name, GF_LOG_WARNING, +					"Unable to set dict value"); +				goto out; +			} else { +				zero_pending = NULL; +			} +		} +	} + +        afr_reset_xattr (sh->xattr, priv->child_count); +        afr_reset_children (sh->success_children, priv->child_count); +        memset (sh->child_errno, 0, +                sizeof (*sh->child_errno) * priv->child_count); +        sh->success_count = 0; +        for (i = 0; i < priv->child_count; i++) { +                if (local->child_up[i]) { +                        STACK_WIND_COOKIE (frame, afr_sh_data_fxattrop_cbk, +                                           (void *) (long) i, +                                           priv->children[i], +                                           priv->children[i]->fops->fxattrop, +                                           sh->healing_fd, GF_XATTROP_ADD_ARRAY, +                                           xattr_req[i], NULL); + +                        if (!--call_count) +                                break; +                } +        } + +out: +	if (xattr_req) { +		for (i = 0; i < priv->child_count; i++) +			if (xattr_req[i]) +				dict_unref(xattr_req[i]); +		GF_FREE(xattr_req); +	} + +        if (ret) { +                GF_FREE (zero_pending); +                afr_sh_data_fail (frame, this); +        } + +        return 0; +} + +int +afr_sh_data_big_lock_success (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t   *local = NULL; +        afr_self_heal_t *sh = NULL; + +        local = frame->local; +        sh = &local->self_heal; + +        sh->data_lock_held = _gf_true; +        afr_sh_data_fxattrop (frame, this); +        return 0; +} + +int +afr_sh_dom_lock_success (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t   *local = NULL; +        afr_self_heal_t *sh = NULL; + +        local = frame->local; +        sh = &local->self_heal; + 
+        sh->sh_dom_lock_held = _gf_true; +        afr_sh_data_lock (frame, this, 0, 0, _gf_true, this->name, +                          afr_sh_data_big_lock_success, +                          afr_sh_data_fail); +        return 0; +} + +int +afr_sh_data_post_blocking_inodelk_cbk (call_frame_t *frame, xlator_t *this) +{ +        afr_internal_lock_t *int_lock = NULL; +        afr_local_t         *local    = NULL; +        afr_self_heal_t     *sh       = NULL; + +        local    = frame->local; +        int_lock = &local->internal_lock; +        sh       = &local->self_heal; + +        if (int_lock->lock_op_ret < 0) { +                gf_log (this->name, GF_LOG_ERROR, "Blocking data inodelks " +                        "failed for %s. by %s", +                        local->loc.path, lkowner_utoa (&frame->root->lk_owner)); + +                sh->data_lock_failure_handler (frame, this); +        } else { + +                gf_log (this->name, GF_LOG_DEBUG, "Blocking data inodelks " +                        "done for %s by %s. Proceding to self-heal", +                        local->loc.path, lkowner_utoa (&frame->root->lk_owner)); + +                sh->data_lock_success_handler (frame, this); +        } + +        return 0; +} + +int +afr_sh_data_post_nonblocking_inodelk_cbk (call_frame_t *frame, xlator_t *this) +{ +        afr_internal_lock_t *int_lock = NULL; +        afr_local_t         *local    = NULL; +        afr_self_heal_t     *sh       = NULL; + +        local    = frame->local; +        int_lock = &local->internal_lock; +        sh       = &local->self_heal; + +        if (int_lock->lock_op_ret < 0) { +                gf_log (this->name, GF_LOG_DEBUG, "Non Blocking data inodelks " +                        "failed for %s. 
by %s", +                        local->loc.path, lkowner_utoa (&frame->root->lk_owner)); + +		if (!sh->data_lock_block) { +			sh->data_lock_failure_handler(frame, this); +		} else { +			int_lock->lock_cbk = +				afr_sh_data_post_blocking_inodelk_cbk; +			afr_blocking_lock (frame, this); +		} +        } else { + +                gf_log (this->name, GF_LOG_DEBUG, "Non Blocking data inodelks " +                        "done for %s by %s. Proceeding to self-heal", +                        local->loc.path, lkowner_utoa (&frame->root->lk_owner)); +                sh->data_lock_success_handler (frame, this); +        } + +        return 0; +} + +int +afr_sh_data_lock_rec (call_frame_t *frame, xlator_t *this, char *dom, +                      off_t start, off_t len) +{ +        afr_internal_lock_t *int_lock = NULL; +        afr_inodelk_t       *inodelk  = NULL; +        afr_local_t         *local    = NULL; + +        local    = frame->local; +        int_lock = &local->internal_lock; + +        int_lock->transaction_lk_type = AFR_SELFHEAL_LK; +        int_lock->selfheal_lk_type    = AFR_DATA_SELF_HEAL_LK; + +        afr_set_lock_number (frame, this); + +        int_lock->lock_cbk         = afr_sh_data_post_nonblocking_inodelk_cbk; + +        int_lock->domain = dom; +        inodelk = afr_get_inodelk (int_lock, int_lock->domain); +        inodelk->flock.l_start = start; +        inodelk->flock.l_len   = len; +        inodelk->flock.l_type  = F_WRLCK; + +        afr_nonblocking_inodelk (frame, this); + +        return 0; +} + +int +afr_post_sh_big_lock_success (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t     *local     = NULL; +        afr_self_heal_t *sh        = NULL; + +        local = frame->local; +        sh = &local->self_heal; + +        GF_ASSERT (sh->old_loop_frame); +        sh_loop_finish (sh->old_loop_frame, this); +        sh->old_loop_frame = NULL; +        sh->data_lock_held = _gf_true; +        sh->sync_done = _gf_true; +        
afr_sh_data_fxattrop (frame, this); +        return 0; +} + +int +afr_post_sh_big_lock_failure (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t     *local     = NULL; +        afr_self_heal_t *sh        = NULL; + +        local = frame->local; +        sh = &local->self_heal; + +        GF_ASSERT (sh->old_loop_frame); +        sh_loop_finish (sh->old_loop_frame, this); +        sh->old_loop_frame = NULL; +        afr_sh_set_timestamps (frame, this); +        return 0; +} + + +int +afr_sh_data_lock (call_frame_t *frame, xlator_t *this, +                  off_t start, off_t len, gf_boolean_t block, +                  char *dom, afr_lock_cbk_t success_handler, +                  afr_lock_cbk_t failure_handler) +{ +        afr_local_t *   local = NULL; +        afr_self_heal_t * sh  = NULL; + +        local = frame->local; +        sh    = &local->self_heal; + +        sh->data_lock_success_handler = success_handler; +        sh->data_lock_failure_handler = failure_handler; +	sh->data_lock_block = block; +        return afr_sh_data_lock_rec (frame, this, dom, start, len); +} + +int +afr_sh_data_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                      int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) +{ +        afr_local_t     *local = NULL; +        afr_self_heal_t *sh = NULL; +        afr_private_t   *priv = NULL; +        int              call_count = 0; +        int              child_index = 0; + +        local = frame->local; +        sh = &local->self_heal; +        priv = this->private; + +        child_index = (long) cookie; + +        /* TODO: some of the open's might fail. 
+           In that case, modify cleanup fn to send flush on those +           fd's which are already open */ + +        LOCK (&frame->lock); +        { +                if (op_ret == -1) { +                        gf_log (this->name, GF_LOG_ERROR, +                                "open of %s failed on child %s (%s)", +                                local->loc.path, +                                priv->children[child_index]->name, +                                strerror (op_errno)); +                        afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); +                } else { +                        gf_log (this->name, GF_LOG_TRACE, +                                "open of %s succeeded on child %s", +                                local->loc.path, +                                priv->children[child_index]->name); +                } +        } +        UNLOCK (&frame->lock); + +        call_count = afr_frame_return (frame); + +        if (call_count == 0) { +                if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { +                        afr_sh_data_fail (frame, this); +                        return 0; +                } + +                gf_log (this->name, GF_LOG_TRACE, +                        "fd for %s opened, commencing sync", +                        local->loc.path); + +                afr_sh_data_lock (frame, this, 0, 0, _gf_true, priv->sh_domain, +                                  afr_sh_dom_lock_success, afr_sh_data_fail); +        } + +        return 0; +} + + +/* + * Create the fd used for data self-heal (stored in sh->healing_fd) and + * wind open (O_RDWR|O_LARGEFILE) on local->loc to every child that is + * marked up in local->child_up. Replies are handled by + * afr_sh_data_open_cbk. If no fd can be created the heal is aborted + * through afr_sh_data_fail. Always returns 0. + */ +int +afr_sh_data_open (call_frame_t *frame, xlator_t *this) +{ +        int i = 0; +        int call_count = 0; +        fd_t *fd = NULL; +        afr_local_t *   local = NULL; +        afr_private_t * priv  = NULL; +        afr_self_heal_t *sh = NULL; + +        local = frame->local; +        sh = &local->self_heal; +        priv = this->private; + +        call_count = afr_up_children_count (local->child_up, priv->child_count); +        local->call_count = 
call_count; + +        fd = fd_create (local->loc.inode, frame->root->pid); +        if (!fd) { +                /* without an fd there is nothing to open or heal */ +                gf_log (this->name, GF_LOG_ERROR, +                        "%s: failed to create fd for data self-heal", +                        local->loc.path); +                afr_sh_data_fail (frame, this); +                return 0; +        } +        sh->healing_fd = fd; + +        /* wind open to every child that is up */ +        for (i = 0; i < priv->child_count; i++) { +                if(!local->child_up[i]) +                        continue; + +                STACK_WIND_COOKIE (frame, afr_sh_data_open_cbk, +                                   (void *) (long) i, +                                   priv->children[i], +                                   priv->children[i]->fops->open, +                                   &local->loc, +                                   O_RDWR|O_LARGEFILE, fd, NULL); + +                if (!--call_count) +                        break; +        } + +        return 0; +} + +void +afr_sh_non_reg_fix (call_frame_t *frame, xlator_t *this, +                    int32_t op_ret, int32_t op_errno) +{ +        afr_private_t   *priv = NULL; +        afr_self_heal_t *sh = NULL; +        afr_local_t     *local = NULL; +        int             i = 0; + +        if (op_ret < 0) { +                afr_sh_data_fail (frame, this); +                return; +        } + +        local = frame->local; +        sh = &local->self_heal; +        priv = this->private; + +        for (i = 0; i < priv->child_count ; i++) { +                if (1 == local->child_up[i]) +                        sh->success[i] = 1; +        } + +        afr_sh_erase_pending (frame, this, AFR_DATA_TRANSACTION, +                              afr_sh_data_erase_pending_cbk, +                              afr_sh_data_finish); +} + +int +afr_sh_non_reg_lock_success (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t   *local = NULL; +        afr_self_heal_t *sh = NULL; + +        local = frame->local; +        sh = &local->self_heal; +        sh->data_lock_held = _gf_true; +        afr_sh_common_lookup (frame, this, &local->loc, +                              afr_sh_non_reg_fix, NULL, +                              AFR_LOOKUP_FAIL_CONFLICTS | +                  
            AFR_LOOKUP_FAIL_MISSING_GFIDS, +                              NULL); +        return 0; +} + +gf_boolean_t +afr_can_start_data_self_heal (afr_self_heal_t *sh, afr_private_t *priv) +{ +        if (sh->force_confirm_spb) +                return _gf_true; +        if (sh->do_data_self_heal && +            afr_data_self_heal_enabled (priv->data_self_heal)) +                return _gf_true; +        return _gf_false; +} + +int +afr_self_heal_data (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t     *local = NULL; +        afr_self_heal_t *sh    = NULL; +        afr_private_t   *priv  = this->private; +        int             ret    = -1; + +        local = frame->local; +        sh = &local->self_heal; + +        sh->sh_type_in_action = AFR_SELF_HEAL_DATA; + +        if (afr_can_start_data_self_heal (sh, priv)) { +                afr_set_self_heal_status (sh, AFR_SELF_HEAL_STARTED); +                ret = afr_inodelk_init (&local->internal_lock.inodelk[1], +                                        priv->sh_domain, priv->child_count); +                if (ret < 0) { +                        afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); +                        afr_sh_data_done (frame, this); +                        return 0; +                } + +                if (IA_ISREG (sh->type)) { +                        afr_sh_data_open (frame, this); +                } else { +                        afr_sh_data_lock (frame, this, 0, 0, _gf_true, +                                          this->name, +                                          afr_sh_non_reg_lock_success, +                                          afr_sh_data_fail); +                } +        } else { +                gf_log (this->name, GF_LOG_TRACE, +                        "not doing data self heal on %s", +                        local->loc.path); +                afr_sh_data_done (frame, this); +        } + +        return 0; +} diff --git 
a/xlators/cluster/afr-v1/src/afr-self-heal-entry.c b/xlators/cluster/afr-v1/src/afr-self-heal-entry.c new file mode 100644 index 000000000..00f1a9cb9 --- /dev/null +++ b/xlators/cluster/afr-v1/src/afr-self-heal-entry.c @@ -0,0 +1,2406 @@ +/* +  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> +  This file is part of GlusterFS. + +  This file is licensed to you under your choice of the GNU Lesser +  General Public License, version 3 or any later version (LGPLv3 or +  later), or the GNU General Public License, version 2 (GPLv2), in all +  cases as published by the Free Software Foundation. +*/ + +#include <libgen.h> +#include <unistd.h> +#include <fnmatch.h> +#include <sys/time.h> +#include <stdlib.h> +#include <signal.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "inode.h" +#include "afr.h" +#include "dict.h" +#include "xlator.h" +#include "hashfn.h" +#include "logging.h" +#include "stack.h" +#include "list.h" +#include "call-stub.h" +#include "defaults.h" +#include "common-utils.h" +#include "compat-errno.h" +#include "compat.h" +#include "byte-order.h" + +#include "afr-transaction.h" +#include "afr-self-heal.h" +#include "afr-self-heal-common.h" + +#define AFR_INIT_SH_FRAME_VALS(_frame, _local, _sh, _sh_frame, _sh_local, _sh_sh)\ +        do {\ +                _local = _frame->local;\ +                _sh = &_local->self_heal;\ +                _sh_frame = _sh->sh_frame;\ +                _sh_local = _sh_frame->local;\ +                _sh_sh    = &_sh_local->self_heal;\ +        } while (0); + +int +afr_sh_entry_impunge_create_file (call_frame_t *impunge_frame, xlator_t *this, +                                  int child_index); +int +afr_sh_entry_done (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t     *local = NULL; +        afr_self_heal_t *sh = NULL; + +        local = frame->local; +        sh = &local->self_heal; + +        sh->completion_cbk (frame, this); + +        
return 0; +} + + +int +afr_sh_entry_unlock (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t         *local    = NULL; +        afr_internal_lock_t *int_lock = NULL; + +        local    = frame->local; +        int_lock = &local->internal_lock; + +        int_lock->lock_cbk = afr_sh_entry_done; +        afr_unlock (frame, this); + +        return 0; +} + + +int +afr_sh_entry_finish (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t   *local = NULL; + +        local = frame->local; + +        gf_log (this->name, GF_LOG_TRACE, +                "finishing entry selfheal of %s", local->loc.path); + +        afr_sh_entry_unlock (frame, this); + +        return 0; +} + + +int +afr_sh_entry_erase_pending_cbk (call_frame_t *frame, void *cookie, +                                xlator_t *this, int32_t op_ret, +                                int32_t op_errno, dict_t *xattr, dict_t *xdata) +{ +        long                 i          = 0; +        int                  call_count = 0; +        afr_local_t         *local      = NULL; +        afr_self_heal_t     *sh         = NULL; +        afr_local_t         *orig_local = NULL; +        call_frame_t        *orig_frame = NULL; +        afr_private_t       *priv       = NULL; +        int32_t             read_child  = -1; + +        local = frame->local; +        priv  = this->private; +        sh = &local->self_heal; +        i = (long)cookie; + + +        afr_children_add_child (sh->fresh_children, i, priv->child_count); +        if (op_ret == -1) { +                gf_log (this->name, GF_LOG_INFO, +                        "%s: failed to erase pending xattrs on %s (%s)", +                        local->loc.path, priv->children[i]->name, +                        strerror (op_errno)); +        } + +        call_count = afr_frame_return (frame); + +        if (call_count == 0) { +                if (sh->source == -1) { +                        //this happens if the forced merge option is set +              
          read_child = sh->fresh_children[0]; +                } else { +                        read_child = sh->source; +                } +                afr_inode_set_read_ctx (this, sh->inode, read_child, +                                        sh->fresh_children); +                orig_frame = sh->orig_frame; +                orig_local = orig_frame->local; + +                if (sh->source != -1) { +                        orig_local->cont.lookup.buf.ia_nlink = sh->buf[sh->source].ia_nlink; +                } + +                afr_sh_entry_finish (frame, this); +        } + +        return 0; +} + + +int +afr_sh_entry_erase_pending (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t     *local = NULL; +        afr_self_heal_t *sh = NULL; + +        local = frame->local; +        sh = &local->self_heal; + +        if (sh->entries_skipped) { +                afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); +                goto out; +        } +        afr_sh_erase_pending (frame, this, AFR_ENTRY_TRANSACTION, +                              afr_sh_entry_erase_pending_cbk, +                              afr_sh_entry_finish); +        return 0; +out: +        afr_sh_entry_finish (frame, this); +        return 0; +} + + + +static int +next_active_source (call_frame_t *frame, xlator_t *this, +                    int current_active_source) +{ +        afr_private_t   *priv = NULL; +        afr_local_t     *local  = NULL; +        afr_self_heal_t *sh  = NULL; +        int              source = -1; +        int              next_active_source = -1; +        int              i = 0; + +        priv = this->private; +        local = frame->local; +        sh = &local->self_heal; + +        source = sh->source; + +        if (source != -1) { +                if (current_active_source != source) +                        next_active_source = source; +                goto out; +        } + +        /* +          the next active sink becomes the source for 
the +          'conservative decision' of merging all entries +        */ + +        for (i = 0; i < priv->child_count; i++) { +                if ((sh->sources[i] == 0) +                    && (local->child_up[i] == 1) +                    && (i > current_active_source)) { + +                        next_active_source = i; +                        break; +                } +        } +out: +        return next_active_source; +} + + + +static int +next_active_sink (call_frame_t *frame, xlator_t *this, +                  int current_active_sink) +{ +        afr_private_t   *priv = NULL; +        afr_local_t     *local  = NULL; +        afr_self_heal_t *sh  = NULL; +        int              next_active_sink = -1; +        int              i = 0; + +        priv = this->private; +        local = frame->local; +        sh = &local->self_heal; + +        /* +          the next active sink becomes the source for the +          'conservative decision' of merging all entries +        */ + +        for (i = 0; i < priv->child_count; i++) { +                if ((sh->sources[i] == 0) +                    && (local->child_up[i] == 1) +                    && (i > current_active_sink)) { + +                        next_active_sink = i; +                        break; +                } +        } + +        return next_active_sink; +} + +int +afr_sh_entry_impunge_all (call_frame_t *frame, xlator_t *this); + +int +afr_sh_entry_impunge_subvol (call_frame_t *frame, xlator_t *this); + +int +afr_sh_entry_expunge_all (call_frame_t *frame, xlator_t *this); + +int +afr_sh_entry_expunge_subvol (call_frame_t *frame, xlator_t *this, +                             int active_src); + +int +afr_sh_entry_expunge_entry_done (call_frame_t *frame, xlator_t *this, +                                 int active_src, int32_t op_ret, +                                 int32_t op_errno) +{ +        int              call_count = 0; + +        call_count = afr_frame_return (frame); + +        if (call_count 
== 0) +                afr_sh_entry_expunge_subvol (frame, this, active_src); + +        return 0; +} + +int +afr_sh_entry_expunge_parent_setattr_cbk (call_frame_t *expunge_frame, +                                         void *cookie, xlator_t *this, +                                         int32_t op_ret, int32_t op_errno, +                                         struct iatt *preop, struct iatt *postop, +                                         dict_t *xdata) +{ +        afr_private_t   *priv          = NULL; +        afr_local_t     *expunge_local = NULL; +        afr_self_heal_t *expunge_sh    = NULL; +        call_frame_t    *frame         = NULL; +        int              active_src    = (long) cookie; +        afr_self_heal_t *sh            = NULL; +        afr_local_t     *local         = NULL; + +        priv          = this->private; +        expunge_local = expunge_frame->local; +        expunge_sh    = &expunge_local->self_heal; +        frame         = expunge_sh->sh_frame; +        local         = frame->local; +        sh            = &local->self_heal; + +        if (op_ret != 0) { +                gf_log (this->name, GF_LOG_ERROR, +                        "setattr on parent directory of %s on subvolume %s failed: %s", +                        expunge_local->loc.path, +                        priv->children[active_src]->name, strerror (op_errno)); +        } + +        AFR_STACK_DESTROY (expunge_frame); +        sh->expunge_done (frame, this, active_src, op_ret, op_errno); + +        return 0; +} + + +int +afr_sh_entry_expunge_remove_cbk (call_frame_t *expunge_frame, void *cookie, +                                 xlator_t *this, +                                 int32_t op_ret, int32_t op_errno, +                                 struct iatt *preparent, +                                 struct iatt *postparent, dict_t *xdata) +{ +        afr_private_t   *priv = NULL; +        afr_local_t     *expunge_local = NULL; +        afr_self_heal_t 
*expunge_sh = NULL; +        int              active_src = 0; +        int32_t          valid = 0; + +        priv = this->private; +        expunge_local = expunge_frame->local; +        expunge_sh = &expunge_local->self_heal; + +        active_src = (long) cookie; + +        if (op_ret == 0) { +                gf_log (this->name, GF_LOG_DEBUG, +                        "removed %s on %s", +                        expunge_local->loc.path, +                        priv->children[active_src]->name); +        } else { +                gf_log (this->name, GF_LOG_INFO, +                        "removing %s on %s failed (%s)", +                        expunge_local->loc.path, +                        priv->children[active_src]->name, +                        strerror (op_errno)); +        } + +        valid = GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME; + +        STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_parent_setattr_cbk, +                           (void *) (long) active_src, +                           priv->children[active_src], +                           priv->children[active_src]->fops->setattr, +                           &expunge_sh->parent_loc, +                           &expunge_sh->parentbuf, +                           valid, NULL); + +        return 0; +} + + +int +afr_sh_entry_expunge_unlink (call_frame_t *expunge_frame, xlator_t *this, +                             int active_src) +{ +        afr_private_t   *priv = NULL; +        afr_local_t     *expunge_local = NULL; + +        priv          = this->private; +        expunge_local = expunge_frame->local; + +        gf_log (this->name, GF_LOG_TRACE, +                "expunging file %s on %s", +                expunge_local->loc.path, priv->children[active_src]->name); + +        STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_remove_cbk, +                           (void *) (long) active_src, +                           priv->children[active_src], +                           
priv->children[active_src]->fops->unlink, +                           &expunge_local->loc, 0, NULL); + +        return 0; +} + + + +int +afr_sh_entry_expunge_rmdir (call_frame_t *expunge_frame, xlator_t *this, +                            int active_src) +{ +        afr_private_t   *priv = NULL; +        afr_local_t     *expunge_local = NULL; + +        priv          = this->private; +        expunge_local = expunge_frame->local; + +        gf_log (this->name, GF_LOG_DEBUG, +                "expunging directory %s on %s", +                expunge_local->loc.path, priv->children[active_src]->name); + +        STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_remove_cbk, +                           (void *) (long) active_src, +                           priv->children[active_src], +                           priv->children[active_src]->fops->rmdir, +                           &expunge_local->loc, 1, NULL); + +        return 0; +} + + +int +afr_sh_entry_expunge_remove (call_frame_t *expunge_frame, xlator_t *this, +                             int active_src, struct iatt *buf, +                             struct iatt *parentbuf) +{ +        afr_private_t   *priv = NULL; +        afr_local_t     *expunge_local = NULL; +        afr_self_heal_t *expunge_sh = NULL; +        call_frame_t    *frame = NULL; +        int              type = 0; +        afr_self_heal_t *sh            = NULL; +        afr_local_t     *local         = NULL; +        loc_t           *loc           = NULL; + +        priv = this->private; +        expunge_local = expunge_frame->local; +        expunge_sh = &expunge_local->self_heal; +        frame = expunge_sh->sh_frame; +        local         = frame->local; +        sh            = &local->self_heal; +        loc           = &expunge_local->loc; + +        type = buf->ia_type; +        if (loc->parent && uuid_is_null (loc->parent->gfid)) +                uuid_copy (loc->pargfid, parentbuf->ia_gfid); + +        switch (type) { +        case 
IA_IFSOCK: +        case IA_IFREG: +        case IA_IFBLK: +        case IA_IFCHR: +        case IA_IFIFO: +        case IA_IFLNK: +                afr_sh_entry_expunge_unlink (expunge_frame, this, active_src); +                break; +        case IA_IFDIR: +                afr_sh_entry_expunge_rmdir (expunge_frame, this, active_src); +                break; +        default: +                gf_log (this->name, GF_LOG_ERROR, +                        "%s has unknown file type on %s: 0%o", +                        expunge_local->loc.path, +                        priv->children[active_src]->name, type); +                goto out; +                break; +        } + +        return 0; +out: +        AFR_STACK_DESTROY (expunge_frame); +        sh->expunge_done (frame, this, active_src, -1, EINVAL); + +        return 0; +} + + +int +afr_sh_entry_expunge_lookup_cbk (call_frame_t *expunge_frame, void *cookie, +                                 xlator_t *this, +                                 int32_t op_ret, int32_t op_errno, +                                 inode_t *inode, struct iatt *buf, dict_t *x, +                                 struct iatt *postparent) +{ +        afr_private_t   *priv = NULL; +        afr_local_t     *expunge_local = NULL; +        afr_self_heal_t *expunge_sh = NULL; +        call_frame_t    *frame = NULL; +        int              active_src = 0; +        afr_self_heal_t *sh            = NULL; +        afr_local_t     *local         = NULL; + +        priv = this->private; +        expunge_local = expunge_frame->local; +        expunge_sh = &expunge_local->self_heal; +        frame = expunge_sh->sh_frame; +        active_src = (long) cookie; +        local         = frame->local; +        sh            = &local->self_heal; + +        if (op_ret == -1) { +                gf_log (this->name, GF_LOG_ERROR, +                        "lookup of %s on %s failed (%s)", +                        expunge_local->loc.path, +                        
priv->children[active_src]->name, +                        strerror (op_errno)); +                goto out; +        } + +        afr_sh_entry_expunge_remove (expunge_frame, this, active_src, buf, +                                     postparent); + +        return 0; +out: +        AFR_STACK_DESTROY (expunge_frame); +        sh->expunge_done (frame, this, active_src, op_ret, op_errno); + +        return 0; +} + + +int +afr_sh_entry_expunge_purge (call_frame_t *expunge_frame, xlator_t *this, +                            int active_src) +{ +        afr_private_t   *priv = NULL; +        afr_local_t     *expunge_local = NULL; + +        priv = this->private; +        expunge_local = expunge_frame->local; + +        gf_log (this->name, GF_LOG_TRACE, +                "looking up %s on %s", +                expunge_local->loc.path, priv->children[active_src]->name); + +        STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_lookup_cbk, +                           (void *) (long) active_src, +                           priv->children[active_src], +                           priv->children[active_src]->fops->lookup, +                           &expunge_local->loc, NULL); + +        return 0; +} + +int +afr_sh_entry_expunge_entry_cbk (call_frame_t *expunge_frame, void *cookie, +                                xlator_t *this, +                                int32_t op_ret, int32_t op_errno, +                                inode_t *inode, struct iatt *buf, dict_t *x, +                                struct iatt *postparent) +{ +        afr_private_t   *priv = NULL; +        afr_local_t     *expunge_local = NULL; +        afr_self_heal_t *expunge_sh = NULL; +        int              source = 0; +        call_frame_t    *frame = NULL; +        int              active_src = 0; +        int              need_expunge = 0; +        afr_self_heal_t *sh            = NULL; +        afr_local_t     *local         = NULL; + +        priv = this->private; +        expunge_local 
= expunge_frame->local; +        expunge_sh = &expunge_local->self_heal; +        frame = expunge_sh->sh_frame; +        active_src = expunge_sh->active_source; +        source = (long) cookie; +        local         = frame->local; +        sh            = &local->self_heal; + +        if (op_ret == -1 && op_errno == ENOENT) +                need_expunge = 1; +        else if (op_ret == -1) +                goto out; + +        if (!uuid_is_null (expunge_sh->entrybuf.ia_gfid) && +            !uuid_is_null (buf->ia_gfid) && +            (uuid_compare (expunge_sh->entrybuf.ia_gfid, buf->ia_gfid) != 0)) { +                char uuidbuf1[64]; +                char uuidbuf2[64]; +                gf_log (this->name, GF_LOG_DEBUG, +                        "entry %s found on %s with mismatching gfid (%s/%s)", +                        expunge_local->loc.path, +                        priv->children[source]->name, +                        uuid_utoa_r (expunge_sh->entrybuf.ia_gfid, uuidbuf1), +                        uuid_utoa_r (buf->ia_gfid, uuidbuf2)); +                need_expunge = 1; +        } + +        if (need_expunge) { +                gf_log (this->name, GF_LOG_INFO, +                        "Entry %s is missing on %s and deleting from " +                        "replica's other bricks", +                        expunge_local->loc.path, +                        priv->children[source]->name); + +                if (postparent) +                        expunge_sh->parentbuf = *postparent; + +                afr_sh_entry_expunge_purge (expunge_frame, this, active_src); + +                return 0; +        } + +out: +        if (op_ret == 0) { +                gf_log (this->name, GF_LOG_TRACE, +                        "%s exists under %s", +                        expunge_local->loc.path, +                        priv->children[source]->name); +        } else { +                gf_log (this->name, GF_LOG_INFO, +                        "looking up %s under %s failed 
(%s)", +                        expunge_local->loc.path, +                        priv->children[source]->name, +                        strerror (op_errno)); +        } + +        AFR_STACK_DESTROY (expunge_frame); +        sh->expunge_done (frame, this, active_src, op_ret, op_errno); + +        return 0; +} + +static gf_boolean_t +can_skip_entry_self_heal (char *name, loc_t *parent_loc) +{ +        if (strcmp (name, ".") == 0) { +                return _gf_true; +        } else if (strcmp (name, "..") == 0) { +                return _gf_true; +        } else if (loc_is_root (parent_loc) && +                   (strcmp (name, GF_REPLICATE_TRASH_DIR) == 0)) { +                return _gf_true; +        } +        return _gf_false; +} + +int +afr_sh_entry_expunge_entry (call_frame_t *frame, xlator_t *this, +                            gf_dirent_t *entry) +{ +        afr_private_t   *priv = NULL; +        afr_local_t     *local  = NULL; +        afr_self_heal_t *sh  = NULL; +        int              ret = -1; +        call_frame_t    *expunge_frame = NULL; +        afr_local_t     *expunge_local = NULL; +        afr_self_heal_t *expunge_sh = NULL; +        int              active_src = 0; +        int              source = 0; +        int              op_errno = 0; +        char            *name = NULL; +        int             op_ret = -1; + +        priv = this->private; +        local = frame->local; +        sh = &local->self_heal; + +        active_src = sh->active_source; +        source = sh->source; +        sh->expunge_done = afr_sh_entry_expunge_entry_done; + +        name = entry->d_name; +        if (can_skip_entry_self_heal (name, &local->loc)) { +                op_ret = 0; +                goto out; +        } + +        gf_log (this->name, GF_LOG_TRACE, +                "inspecting existence of %s under %s", +                name, local->loc.path); + +        expunge_frame = copy_frame (frame); +        if (!expunge_frame) { +                op_errno = 
ENOMEM; +                goto out; +        } + +        AFR_LOCAL_ALLOC_OR_GOTO (expunge_local, out); + +        expunge_frame->local = expunge_local; +        expunge_sh = &expunge_local->self_heal; +        expunge_sh->sh_frame = frame; +        expunge_sh->active_source = active_src; +        expunge_sh->entrybuf = entry->d_stat; +        loc_copy (&expunge_sh->parent_loc, &local->loc); + +        ret = afr_build_child_loc (this, &expunge_local->loc, &local->loc, +                                   name); +        if (ret != 0) { +                op_errno = EINVAL; +                goto out; +        } + +        gf_log (this->name, GF_LOG_TRACE, +                "looking up %s on %s", expunge_local->loc.path, +                priv->children[source]->name); + +        STACK_WIND_COOKIE (expunge_frame, +                           afr_sh_entry_expunge_entry_cbk, +                           (void *) (long) source, +                           priv->children[source], +                           priv->children[source]->fops->lookup, +                           &expunge_local->loc, NULL); + +        ret = 0; +out: +        if (ret == -1) +                sh->expunge_done (frame, this, active_src, op_ret, op_errno); + +        return 0; +} + + +int +afr_sh_entry_expunge_readdir_cbk (call_frame_t *frame, void *cookie, +                                  xlator_t *this, +                                  int32_t op_ret, int32_t op_errno, +                                  gf_dirent_t *entries, dict_t *xdata) +{ +        afr_private_t   *priv = NULL; +        afr_local_t     *local  = NULL; +        afr_self_heal_t *sh  = NULL; +        gf_dirent_t     *entry = NULL; +        off_t            last_offset = 0; +        int              active_src = 0; +        int              entry_count = 0; + +        priv = this->private; +        local = frame->local; +        sh = &local->self_heal; + +        active_src = sh->active_source; + +        if (op_ret <= 0) { +             
   if (op_ret < 0) { +                        gf_log (this->name, GF_LOG_INFO, +                                "readdir of %s on subvolume %s failed (%s)", +                                local->loc.path, +                                priv->children[active_src]->name, +                                strerror (op_errno)); +                } else { +                        gf_log (this->name, GF_LOG_TRACE, +                                "readdir of %s on subvolume %s complete", +                                local->loc.path, +                                priv->children[active_src]->name); +                } + +                afr_sh_entry_expunge_all (frame, this); +                return 0; +        } + +        list_for_each_entry (entry, &entries->list, list) { +                last_offset = entry->d_off; +                entry_count++; +        } + +        gf_log (this->name, GF_LOG_TRACE, +                "readdir'ed %d entries from %s", +                entry_count, priv->children[active_src]->name); + +        sh->offset = last_offset; +        local->call_count = entry_count; + +        list_for_each_entry (entry, &entries->list, list) { +                afr_sh_entry_expunge_entry (frame, this, entry); +        } + +        return 0; +} + +int +afr_sh_entry_expunge_subvol (call_frame_t *frame, xlator_t *this, +                             int active_src) +{ +        afr_private_t   *priv = NULL; +        afr_local_t     *local  = NULL; +        afr_self_heal_t *sh  = NULL; + +        priv = this->private; +        local = frame->local; +        sh = &local->self_heal; + +        STACK_WIND (frame, afr_sh_entry_expunge_readdir_cbk, +                    priv->children[active_src], +                    priv->children[active_src]->fops->readdirp, +                    sh->healing_fd, sh->block_size, sh->offset, NULL); + +        return 0; +} + + +int +afr_sh_entry_expunge_all (call_frame_t *frame, xlator_t *this) +{ +        afr_private_t   *priv = 
NULL; +        afr_local_t     *local  = NULL; +        afr_self_heal_t *sh  = NULL; +        int              active_src = -1; + +        priv = this->private; +        local = frame->local; +        sh = &local->self_heal; + +        sh->offset = 0; + +        if (sh->source == -1) { +                gf_log (this->name, GF_LOG_DEBUG, +                        "no active sources for %s to expunge entries", +                        local->loc.path); +                goto out; +        } + +        active_src = next_active_sink (frame, this, sh->active_source); +        sh->active_source = active_src; + +        if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { +                goto out; +        } + +        if (active_src == -1) { +                /* completed creating missing files on all subvolumes */ +                goto out; +        } + +        gf_log (this->name, GF_LOG_TRACE, +                "expunging entries of %s on %s to other sinks", +                local->loc.path, priv->children[active_src]->name); + +        afr_sh_entry_expunge_subvol (frame, this, active_src); + +        return 0; +out: +        afr_sh_entry_impunge_all (frame, this); +        return 0; + +} + + +int +afr_sh_entry_impunge_entry_done (call_frame_t *frame, xlator_t *this, +                                 int32_t op_ret, int32_t op_errno) +{ +        int              call_count = 0; +        afr_local_t     *local = NULL; +        afr_self_heal_t *sh = NULL; + +        local = frame->local; +        sh = &local->self_heal; +        if (op_ret < 0) +                sh->entries_skipped = _gf_true; +        call_count = afr_frame_return (frame); +        if (call_count == 0) +                afr_sh_entry_impunge_subvol (frame, this); + +        return 0; +} + +void +afr_sh_entry_call_impunge_done (call_frame_t *impunge_frame, xlator_t *this, +                                int32_t op_ret, int32_t op_errno) +{ +        afr_local_t     *impunge_local = NULL; +        afr_local_t    
 *local = NULL; +        afr_self_heal_t *sh = NULL; +        afr_self_heal_t *impunge_sh = NULL; +        call_frame_t    *frame = NULL; + +        AFR_INIT_SH_FRAME_VALS (impunge_frame, impunge_local, impunge_sh, +                                frame, local, sh); + +        AFR_STACK_DESTROY (impunge_frame); +        sh->impunge_done (frame, this, op_ret, op_errno); +} + +int +afr_sh_entry_impunge_setattr_cbk (call_frame_t *impunge_frame, void *cookie, +                                  xlator_t *this, +                                  int32_t op_ret, int32_t op_errno, +                                  struct iatt *preop, struct iatt *postop, +                                  dict_t *xdata) +{ +        int              call_count = 0; +        afr_private_t   *priv = NULL; +        afr_local_t     *impunge_local = NULL; +        int              child_index = 0; + +        priv = this->private; +        impunge_local = impunge_frame->local; +        child_index = (long) cookie; + +        if (op_ret == 0) { +                gf_log (this->name, GF_LOG_DEBUG, +                        "setattr done for %s on %s", +                        impunge_local->loc.path, +                        priv->children[child_index]->name); +        } else { +                gf_log (this->name, GF_LOG_INFO, +                        "setattr (%s) on %s failed (%s)", +                        impunge_local->loc.path, +                        priv->children[child_index]->name, +                        strerror (op_errno)); +        } + +        call_count = afr_frame_return (impunge_frame); +        if (call_count == 0) { +                afr_sh_entry_call_impunge_done (impunge_frame, this, +                                                0, op_errno); +        } + +        return 0; +} + +int +afr_sh_entry_impunge_parent_setattr_cbk (call_frame_t *setattr_frame, +                                         void *cookie, xlator_t *this, +                                         int32_t 
op_ret, int32_t op_errno, +                                         struct iatt *preop, struct iatt *postop, +                                         dict_t *xdata) +{ +        int             call_count = 0; +        afr_local_t     *setattr_local = NULL; + +        setattr_local = setattr_frame->local; +        if (op_ret != 0) { +                gf_log (this->name, GF_LOG_INFO, +                        "setattr on parent directory (%s) failed: %s", +                        setattr_local->loc.path, strerror (op_errno)); +        } + +        call_count = afr_frame_return (setattr_frame); +        if (call_count == 0) +                AFR_STACK_DESTROY (setattr_frame); +        return 0; +} + +int +afr_sh_entry_impunge_setattr (call_frame_t *impunge_frame, xlator_t *this) +{ +        afr_private_t   *priv = NULL; +        afr_local_t     *impunge_local = NULL; +        afr_local_t     *setattr_local = NULL; +        afr_self_heal_t *impunge_sh = NULL; +        call_frame_t    *setattr_frame = NULL; +        int32_t          valid = 0; +        int32_t          op_errno = 0; +        int              child_index = 0; +        int              call_count = 0; +        int              i = 0; + +        priv          = this->private; +        impunge_local = impunge_frame->local; +        impunge_sh    = &impunge_local->self_heal; + +        gf_log (this->name, GF_LOG_DEBUG, +                "setting ownership of %s on %s to %d/%d", +                impunge_local->loc.path, +                priv->children[child_index]->name, +                impunge_sh->entrybuf.ia_uid, +                impunge_sh->entrybuf.ia_gid); + +        setattr_frame = copy_frame (impunge_frame); +        if (!setattr_frame) { +                op_errno = ENOMEM; +                goto out; +        } +        AFR_LOCAL_ALLOC_OR_GOTO (setattr_frame->local, out); +        setattr_local = setattr_frame->local; +        call_count = afr_errno_count (NULL, impunge_sh->child_errno, +                 
                     priv->child_count, 0); +        loc_copy (&setattr_local->loc, &impunge_sh->parent_loc); +        impunge_local->call_count = call_count; +        setattr_local->call_count = call_count; +        for (i = 0; i < priv->child_count; i++) { +                if (impunge_sh->child_errno[i]) +                        continue; +                valid         = GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME; +                STACK_WIND_COOKIE (setattr_frame, +                                   afr_sh_entry_impunge_parent_setattr_cbk, +                                   (void *) (long) i, priv->children[i], +                                   priv->children[i]->fops->setattr, +                                   &setattr_local->loc, +                                   &impunge_sh->parentbuf, valid, NULL); + +                valid = GF_SET_ATTR_UID   | GF_SET_ATTR_GID | +                        GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME; +                STACK_WIND_COOKIE (impunge_frame, +                                   afr_sh_entry_impunge_setattr_cbk, +                                   (void *) (long) i, priv->children[i], +                                   priv->children[i]->fops->setattr, +                                   &impunge_local->loc, +                                   &impunge_sh->entrybuf, valid, NULL); +                call_count--; +        } +        GF_ASSERT (!call_count); +        return 0; +out: +        if (setattr_frame) +                AFR_STACK_DESTROY (setattr_frame); +        afr_sh_entry_call_impunge_done (impunge_frame, this, 0, op_errno); +        return 0; +} + +int +afr_sh_entry_impunge_xattrop_cbk (call_frame_t *impunge_frame, void *cookie, +                                  xlator_t *this, +                                  int32_t op_ret, int32_t op_errno, +                                  dict_t *xattr, dict_t *xdata) +{ +        afr_private_t   *priv = NULL; +        afr_local_t     *impunge_local = NULL; +        int       
       child_index = 0; +        int              call_count = -1; + +        priv          = this->private; +        impunge_local = impunge_frame->local; + +        child_index = (long) cookie; + +        if (op_ret == -1) { +                gf_log (this->name, GF_LOG_INFO, +                        "%s: failed to perform xattrop on %s (%s)", +                        impunge_local->loc.path, +                        priv->children[child_index]->name, strerror (op_errno)); + +                        LOCK (&impunge_frame->lock); +                        { +                                impunge_local->op_ret = -1; +                                impunge_local->op_errno = op_errno; +                        } +                        UNLOCK (&impunge_frame->lock); +        } + +        call_count = afr_frame_return (impunge_frame); + +        if (call_count == 0) { +                if (impunge_local->op_ret == 0) { +                        afr_sh_entry_impunge_setattr (impunge_frame, this); +                } else { +                        afr_sh_entry_call_impunge_done (impunge_frame, this, +                                                -1, impunge_local->op_errno); +                } +        } +        return 0; +} + +int +afr_sh_entry_impunge_perform_xattrop (call_frame_t *impunge_frame, +                                      xlator_t *this) +{ +        int              active_src       = 0; +        dict_t          *xattr            = NULL; +        afr_private_t   *priv             = NULL; +        afr_local_t     *impunge_local    = NULL; +        afr_self_heal_t *impunge_sh       = NULL; +        int32_t         op_errno          = 0; +        int32_t         call_count        = 0; +        int32_t         i                 = 0; + + +        priv = this->private; +        impunge_local = impunge_frame->local; +        impunge_sh = &impunge_local->self_heal; +        active_src = impunge_sh->active_source; +        impunge_local->op_ret = 0; + +        
afr_prepare_new_entry_pending_matrix (impunge_local->pending, +                                              afr_is_errno_unset, +                                              impunge_sh->child_errno, +                                              &impunge_sh->entrybuf, +                                              priv->child_count); +        xattr = dict_new (); +        if (!xattr) { +                op_errno = ENOMEM; +                goto out; +        } + +        afr_set_pending_dict (priv, xattr, impunge_local->pending, active_src, +                              LOCAL_LAST); + +        for (i = 0; i < priv->child_count; i++) { +                if ((impunge_sh->child_errno[i] == EEXIST) && +                    (impunge_local->child_up[i] == 1)) + +                        call_count++; +        } + +        impunge_local->call_count  = call_count; + +        for (i = 0; i < priv->child_count; i++) { + +                if ((impunge_sh->child_errno[i] == EEXIST) +                    && (impunge_local->child_up[i] == 1)) { + + +                        STACK_WIND_COOKIE (impunge_frame, +                                           afr_sh_entry_impunge_xattrop_cbk, +                                           (void *) (long) i, +                                           priv->children[i], +                                           priv->children[i]->fops->xattrop, +                                           &impunge_local->loc, +                                           GF_XATTROP_ADD_ARRAY, xattr, NULL); +                        if (!--call_count) +                                break; +                } +        } + +        if (xattr) +                dict_unref (xattr); +        return 0; +out: +        afr_sh_entry_call_impunge_done (impunge_frame, this, +                                        -1, op_errno); +        return 0; +} + +int +afr_sh_entry_impunge_newfile_cbk (call_frame_t *impunge_frame, void *cookie, +                                  xlator_t 
*this, +                                  int32_t op_ret, int32_t op_errno, +                                  inode_t *inode, struct iatt *stbuf, +                                  struct iatt *preparent, +                                  struct iatt *postparent, dict_t *xdata) +{ +        int              call_count       = 0; +        afr_private_t   *priv             = NULL; +        afr_local_t     *impunge_local    = NULL; +        afr_self_heal_t *impunge_sh       = NULL; +        int              child_index      = 0; + +        priv = this->private; +        impunge_local = impunge_frame->local; +        impunge_sh = &impunge_local->self_heal; + +        child_index = (long) cookie; + +        if (op_ret == -1) { +                impunge_sh->child_errno[child_index] = op_errno; +                gf_log (this->name, GF_LOG_ERROR, +                        "creation of %s on %s failed (%s)", +                        impunge_local->loc.path, +                        priv->children[child_index]->name, +                        strerror (op_errno)); +        } else { +                impunge_sh->child_errno[child_index] = 0; +        } + +        call_count = afr_frame_return (impunge_frame); +        if (call_count == 0) { +                if (!afr_errno_count (NULL, impunge_sh->child_errno, +                                      priv->child_count, 0)) { +                        // new_file creation failed every where +                        afr_sh_entry_call_impunge_done (impunge_frame, this, +                                                        -1, op_errno); +                        goto out; +                } +                afr_sh_entry_impunge_perform_xattrop (impunge_frame, this); +        } +out: +        return 0; +} + +int +afr_sh_entry_impunge_hardlink_cbk (call_frame_t *impunge_frame, void *cookie, +                                   xlator_t *this, int32_t op_ret, +                                   int32_t op_errno, inode_t *inode, +          
                         struct iatt *buf, struct iatt *preparent, +                                   struct iatt *postparent, dict_t *xdata) +{ +        int              call_count        = 0; +        afr_local_t     *impunge_local = NULL; +        afr_self_heal_t *impunge_sh  = NULL; + +        impunge_local = impunge_frame->local; +        impunge_sh = &impunge_local->self_heal; + +        if (IA_IFLNK == impunge_sh->entrybuf.ia_type) { +                //For symlinks impunge is attempted un-conditionally +                //So the file can already exist. +                if ((op_ret < 0) && (op_errno == EEXIST)) +                        op_ret = 0; +        } + +        call_count = afr_frame_return (impunge_frame); +        if (call_count == 0) +                afr_sh_entry_call_impunge_done (impunge_frame, this, +                                                op_ret, op_errno); + +        return 0; +} + +int +afr_sh_entry_impunge_hardlink (call_frame_t *impunge_frame, xlator_t *this, +                               int child_index) +{ +        afr_private_t   *priv          = NULL; +        afr_local_t     *impunge_local = NULL; +        afr_self_heal_t *impunge_sh  = NULL; +        loc_t           *loc           = NULL; +        struct iatt     *buf           = NULL; +        loc_t            oldloc        = {0}; + +        priv = this->private; +        impunge_local = impunge_frame->local; +        impunge_sh = &impunge_local->self_heal; +        loc = &impunge_local->loc; +        buf = &impunge_sh->entrybuf; + +        oldloc.inode = inode_ref (loc->inode); +        uuid_copy (oldloc.gfid, buf->ia_gfid); +        gf_log (this->name, GF_LOG_DEBUG, "linking missing file %s on %s", +                loc->path, priv->children[child_index]->name); + +        STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_hardlink_cbk, +                           (void *) (long) child_index, +                           priv->children[child_index], +                    
       priv->children[child_index]->fops->link, +                           &oldloc, loc, NULL); +        loc_wipe (&oldloc); + +        return 0; +} + +int +afr_sh_nameless_lookup_cbk (call_frame_t *impunge_frame, void *cookie, +                            xlator_t *this, +                            int32_t op_ret, int32_t op_errno, inode_t *inode, +                            struct iatt *buf, dict_t *xattr, +                            struct iatt *postparent) +{ +        if (op_ret < 0) { +                 afr_sh_entry_impunge_create_file (impunge_frame, this, +                                                   (long)cookie); +        } else { +                afr_sh_entry_impunge_hardlink (impunge_frame, this, +                                               (long)cookie); +        } +        return 0; +} + +int +afr_sh_entry_impunge_check_hardlink (call_frame_t *impunge_frame, +                                     xlator_t *this, +                                     int child_index, struct iatt *stbuf) +{ +        afr_private_t   *priv          = NULL; +        call_frame_t    *frame             = NULL; +        afr_local_t     *impunge_local     = NULL; +        afr_local_t     *local             = NULL; +        afr_self_heal_t *impunge_sh        = NULL; +        afr_self_heal_t *sh                = NULL; +        loc_t           *loc           = NULL; +        dict_t          *xattr_req     = NULL; +        loc_t            oldloc        = {0}; +        int              ret           = -1; + +        priv = this->private; +        AFR_INIT_SH_FRAME_VALS (impunge_frame, impunge_local, impunge_sh, +                                frame, local, sh); +        loc = &impunge_local->loc; + +        xattr_req = dict_new (); +        if (!xattr_req) +                goto out; +        oldloc.inode = inode_ref (loc->inode); +        uuid_copy (oldloc.gfid, stbuf->ia_gfid); + +        STACK_WIND_COOKIE (impunge_frame, afr_sh_nameless_lookup_cbk, +                   
        (void *) (long) child_index, +                           priv->children[child_index], +                           priv->children[child_index]->fops->lookup, +                           &oldloc, xattr_req); +        ret = 0; +out: +        if (xattr_req) +                dict_unref (xattr_req); +        loc_wipe (&oldloc); +        if (ret) +                sh->impunge_done (frame, this, -1, ENOMEM); +        return 0; +} + +int +afr_sh_entry_impunge_mknod (call_frame_t *impunge_frame, xlator_t *this, +                            int child_index, struct iatt *stbuf) +{ +        afr_private_t *priv          = NULL; +        afr_local_t   *impunge_local = NULL; +        dict_t        *dict          = NULL; +        int            ret           = 0; + +        priv = this->private; +        impunge_local = impunge_frame->local; + +        gf_log (this->name, GF_LOG_DEBUG, +                "creating missing file %s on %s", +                impunge_local->loc.path, +                priv->children[child_index]->name); + +        dict = dict_new (); +        if (!dict) +                gf_log (this->name, GF_LOG_ERROR, "Out of memory"); + +        GF_ASSERT (!uuid_is_null (stbuf->ia_gfid)); +        ret = afr_set_dict_gfid (dict, stbuf->ia_gfid); +        if (ret) +                gf_log (this->name, GF_LOG_INFO, "%s: gfid set failed", +                        impunge_local->loc.path); + +        /* +         * Reason for adding GLUSTERFS_INTERNAL_FOP_KEY : +         * +         * Problem: +         * While a brick is down in a replica pair, lets say the user creates +         * one file(file-A) and a hard link to that file(h-file-A). After the +         * brick comes back up, entry self-heal is attempted on parent dir of +         * these two files. 
As part of readdir in self-heal it reads both the +         * entries file-A and h-file-A for both of them it does name less lookup +         * to check if there are any hardlinks already present in the +         * destination brick. It finds that there are no hard links already +         * present for files file-A, h-file-A. Self-heal does mknods for both +         * file-A and h-file-A. This leads to file-A and h-file-A not being +         * hardlinks anymore. +         * +         * Fix: (More like shrinking of race-window, the race itself is still +         * present in posix-mknod). +         * If mknod comes with the presence of GLUSTERFS_INTERNAL_FOP_KEY then +         * posix_mknod checks if there are already any gfid-links and does +         * link() instead of mknod. There still can be a race where two +         * posix_mknods same gfid see that +         * gfid-link file is not present and proceeds with mknods and result in +         * two different files with same gfid. +         */ +        ret = dict_set_str (dict, GLUSTERFS_INTERNAL_FOP_KEY, "yes"); +        if (ret) +                gf_log (this->name, GF_LOG_INFO, "%s: %s set failed", +                        impunge_local->loc.path, GLUSTERFS_INTERNAL_FOP_KEY); + +        STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk, +                           (void *) (long) child_index, +                           priv->children[child_index], +                           priv->children[child_index]->fops->mknod, +                           &impunge_local->loc, +                           st_mode_from_ia (stbuf->ia_prot, stbuf->ia_type), +                           makedev (ia_major (stbuf->ia_rdev), +                                    ia_minor (stbuf->ia_rdev)), 0, dict); + +        if (dict) +                dict_unref (dict); + +        return 0; +} + + + +int +afr_sh_entry_impunge_mkdir (call_frame_t *impunge_frame, xlator_t *this, +                            int child_index, struct 
iatt *stbuf) +{ +        afr_private_t   *priv = NULL; +        afr_local_t     *impunge_local = NULL; +        dict_t          *dict = NULL; + +        int ret = 0; + +        priv = this->private; +        impunge_local = impunge_frame->local; + +        dict = dict_new (); +        if (!dict) { +                gf_log (this->name, GF_LOG_ERROR, +                        "Out of memory"); +                return 0; +        } + +        GF_ASSERT (!uuid_is_null (stbuf->ia_gfid)); +        ret = afr_set_dict_gfid (dict, stbuf->ia_gfid); +        if (ret) +                gf_log (this->name, GF_LOG_INFO, "%s: gfid set failed", +                        impunge_local->loc.path); + +        gf_log (this->name, GF_LOG_DEBUG, +                "creating missing directory %s on %s", +                impunge_local->loc.path, +                priv->children[child_index]->name); + +        STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk, +                           (void *) (long) child_index, +                           priv->children[child_index], +                           priv->children[child_index]->fops->mkdir, +                           &impunge_local->loc, +                           st_mode_from_ia (stbuf->ia_prot, stbuf->ia_type), +                           0, dict); + +        if (dict) +                dict_unref (dict); + +        return 0; +} + + +int +afr_sh_entry_impunge_symlink (call_frame_t *impunge_frame, xlator_t *this, +                              int child_index, const char *linkname) +{ +        afr_private_t   *priv          = NULL; +        afr_local_t     *impunge_local = NULL; +        dict_t          *dict          = NULL; +        struct iatt     *buf           = NULL; +        int              ret           = 0; + +        priv = this->private; +        impunge_local = impunge_frame->local; + +        buf = &impunge_local->cont.dir_fop.buf; + +        dict = dict_new (); +        if (!dict) { +                
afr_sh_entry_call_impunge_done (impunge_frame, this, +                                                -1, ENOMEM); +                goto out; +        } + +        GF_ASSERT (!uuid_is_null (buf->ia_gfid)); +        ret = afr_set_dict_gfid (dict, buf->ia_gfid); +        if (ret) +                gf_log (this->name, GF_LOG_INFO, +                        "%s: dict set gfid failed", +                        impunge_local->loc.path); + +        gf_log (this->name, GF_LOG_DEBUG, +                "creating missing symlink %s -> %s on %s", +                impunge_local->loc.path, linkname, +                priv->children[child_index]->name); + +        STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk, +                           (void *) (long) child_index, +                           priv->children[child_index], +                           priv->children[child_index]->fops->symlink, +                           linkname, &impunge_local->loc, 0, dict); + +        if (dict) +                dict_unref (dict); +out: +        return 0; +} + + +int +afr_sh_entry_impunge_symlink_unlink_cbk (call_frame_t *impunge_frame, +                                         void *cookie, xlator_t *this, +                                         int32_t op_ret, int32_t op_errno, +                                         struct iatt *preparent, +                                         struct iatt *postparent, dict_t *xdata) +{ +        afr_private_t   *priv = NULL; +        afr_local_t     *impunge_local = NULL; +        afr_self_heal_t *impunge_sh = NULL; +        int              child_index = -1; +        int              call_count = -1; + +        priv          = this->private; +        impunge_local = impunge_frame->local; +        impunge_sh    = &impunge_local->self_heal; + +        child_index = (long) cookie; + +        if (op_ret == -1) { +                gf_log (this->name, GF_LOG_INFO, +                        "unlink of %s on %s failed (%s)", +                  
      impunge_local->loc.path, +                        priv->children[child_index]->name, +                        strerror (op_errno)); +                goto out; +        } + +        afr_sh_entry_impunge_symlink (impunge_frame, this, child_index, +                                      impunge_sh->linkname); + +        return 0; +out: +        LOCK (&impunge_frame->lock); +        { +                call_count = --impunge_local->call_count; +        } +        UNLOCK (&impunge_frame->lock); + +        if (call_count == 0) +                afr_sh_entry_call_impunge_done (impunge_frame, this, +                                                op_ret, op_errno); + +        return 0; +} + + +int +afr_sh_entry_impunge_symlink_unlink (call_frame_t *impunge_frame, xlator_t *this, +                                     int child_index) +{ +        afr_private_t   *priv          = NULL; +        afr_local_t     *impunge_local = NULL; + +        priv          = this->private; +        impunge_local = impunge_frame->local; + +        gf_log (this->name, GF_LOG_DEBUG, +                "unlinking symlink %s with wrong target on %s", +                impunge_local->loc.path, +                priv->children[child_index]->name); + +        STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_symlink_unlink_cbk, +                           (void *) (long) child_index, +                           priv->children[child_index], +                           priv->children[child_index]->fops->unlink, +                           &impunge_local->loc, 0, NULL); + +        return 0; +} + + +int +afr_sh_entry_impunge_readlink_sink_cbk (call_frame_t *impunge_frame, void *cookie, +                                        xlator_t *this, +                                        int32_t op_ret, int32_t op_errno, +                                        const char *linkname, struct iatt *sbuf, dict_t *xdata) +{ +        afr_private_t   *priv = NULL; +        afr_local_t     *impunge_local = NULL; 
+        afr_self_heal_t *impunge_sh = NULL; +        int              child_index = -1; +        int              call_count = -1; +        int              active_src = -1; + +        priv          = this->private; +        impunge_local = impunge_frame->local; +        impunge_sh    = &impunge_local->self_heal; +        active_src    = impunge_sh->active_source; + +        child_index = (long) cookie; + +        if ((op_ret == -1) && (!afr_inode_missing(op_errno))) { +                gf_log (this->name, GF_LOG_INFO, +                        "readlink of %s on %s failed (%s)", +                        impunge_local->loc.path, +                        priv->children[active_src]->name, +                        strerror (op_errno)); +                goto out; +        } + +        /* symlink doesn't exist on the sink */ + +        if ((op_ret == -1) && (afr_inode_missing(op_errno))) { +                afr_sh_entry_impunge_symlink (impunge_frame, this, +                                              child_index, impunge_sh->linkname); +                return 0; +        } + + +        /* symlink exists on the sink, so check if targets match */ + +        if (strcmp (linkname, impunge_sh->linkname) == 0) { +                /* targets match, nothing to do */ + +                goto out; +        } else { +                /* +                 * Hah! Sneaky wolf in sheep's clothing! 
+                 */ +                afr_sh_entry_impunge_symlink_unlink (impunge_frame, this, +                                                     child_index); +                return 0; +        } + +out: +        LOCK (&impunge_frame->lock); +        { +                call_count = --impunge_local->call_count; +        } +        UNLOCK (&impunge_frame->lock); + +        if (call_count == 0) +                afr_sh_entry_call_impunge_done (impunge_frame, this, +                                                op_ret, op_errno); + +        return 0; +} + + +int +afr_sh_entry_impunge_readlink_sink (call_frame_t *impunge_frame, xlator_t *this, +                                    int child_index) +{ +        afr_private_t   *priv = NULL; +        afr_local_t     *impunge_local = NULL; + +        priv = this->private; +        impunge_local = impunge_frame->local; + +        gf_log (this->name, GF_LOG_DEBUG, +                "checking symlink target of %s on %s", +                impunge_local->loc.path, priv->children[child_index]->name); + +        STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_readlink_sink_cbk, +                           (void *) (long) child_index, +                           priv->children[child_index], +                           priv->children[child_index]->fops->readlink, +                           &impunge_local->loc, 4096, NULL); + +        return 0; +} + + +int +afr_sh_entry_impunge_readlink_cbk (call_frame_t *impunge_frame, void *cookie, +                                   xlator_t *this, +                                   int32_t op_ret, int32_t op_errno, +                                   const char *linkname, struct iatt *sbuf, dict_t *xdata) +{ +        afr_private_t   *priv = NULL; +        afr_local_t     *impunge_local = NULL; +        afr_self_heal_t *impunge_sh = NULL; +        int              child_index = -1; +        int              call_count = -1; +        int              active_src = -1; + +        priv = 
this->private; +        impunge_local = impunge_frame->local; +        impunge_sh = &impunge_local->self_heal; +        active_src = impunge_sh->active_source; + +        child_index = (long) cookie; + +        if (op_ret == -1) { +                gf_log (this->name, GF_LOG_INFO, +                        "readlink of %s on %s failed (%s)", +                        impunge_local->loc.path, +                        priv->children[active_src]->name, +                        strerror (op_errno)); +                goto out; +        } + +        impunge_sh->linkname = gf_strdup (linkname); +        afr_sh_entry_impunge_readlink_sink (impunge_frame, this, child_index); + +        return 0; + +out: +        LOCK (&impunge_frame->lock); +        { +                call_count = --impunge_local->call_count; +        } +        UNLOCK (&impunge_frame->lock); + +        if (call_count == 0) +                afr_sh_entry_call_impunge_done (impunge_frame, this, +                                                op_ret, op_errno); + +        return 0; +} + + +int +afr_sh_entry_impunge_readlink (call_frame_t *impunge_frame, xlator_t *this, +                               int child_index, struct iatt *stbuf) +{ +        afr_private_t   *priv = NULL; +        afr_local_t     *impunge_local = NULL; +        afr_self_heal_t *impunge_sh = NULL; +        int              active_src = -1; + +        priv = this->private; +        impunge_local = impunge_frame->local; +        impunge_sh = &impunge_local->self_heal; +        active_src = impunge_sh->active_source; +        impunge_local->cont.dir_fop.buf = *stbuf; + +        STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_readlink_cbk, +                           (void *) (long) child_index, +                           priv->children[active_src], +                           priv->children[active_src]->fops->readlink, +                           &impunge_local->loc, 4096, NULL); + +        return 0; +} + +int 
+afr_sh_entry_impunge_create (call_frame_t *impunge_frame, xlator_t *this, +                             int child_index) +{ +        call_frame_t    *frame             = NULL; +        afr_local_t     *impunge_local     = NULL; +        afr_local_t     *local             = NULL; +        afr_self_heal_t *impunge_sh        = NULL; +        afr_self_heal_t *sh                = NULL; +        afr_private_t   *priv = NULL; +        ia_type_t       type = IA_INVAL; +        int             active_src = 0; +        struct iatt     *buf = NULL; + +        AFR_INIT_SH_FRAME_VALS (impunge_frame, impunge_local, impunge_sh, +                                frame, local, sh); +        active_src = impunge_sh->active_source; +        afr_update_loc_gfids (&impunge_local->loc, &impunge_sh->entrybuf, +                              &impunge_sh->parentbuf); + +        buf = &impunge_sh->entrybuf; +        type = buf->ia_type; + +        switch (type) { +        case IA_IFSOCK: +        case IA_IFREG: +        case IA_IFBLK: +        case IA_IFCHR: +        case IA_IFIFO: +        case IA_IFLNK: +                afr_sh_entry_impunge_check_hardlink (impunge_frame, this, +                                                     child_index, buf); +                break; +        case IA_IFDIR: +                afr_sh_entry_impunge_mkdir (impunge_frame, this, +                                            child_index, buf); +                break; +        default: +                gf_log (this->name, GF_LOG_ERROR, +                        "%s has unknown file type on %s: 0%o", +                        impunge_local->loc.path, +                        priv->children[active_src]->name, type); +                sh->impunge_done (frame, this, -1, EINVAL); +                break; +        } + +        return 0; +} + +int +afr_sh_entry_impunge_create_file (call_frame_t *impunge_frame, xlator_t *this, +                                  int child_index) +{ +        call_frame_t    *frame            
 = NULL; +        afr_local_t     *impunge_local     = NULL; +        afr_local_t     *local             = NULL; +        afr_self_heal_t *impunge_sh        = NULL; +        afr_self_heal_t *sh                = NULL; +        afr_private_t   *priv = NULL; +        ia_type_t       type = IA_INVAL; +        int             active_src = 0; +        struct iatt     *buf = NULL; + +        AFR_INIT_SH_FRAME_VALS (impunge_frame, impunge_local, impunge_sh, +                                frame, local, sh); +        active_src = impunge_sh->active_source; +        buf = &impunge_sh->entrybuf; +        type = buf->ia_type; + +        switch (type) { +        case IA_IFSOCK: +        case IA_IFREG: +        case IA_IFBLK: +        case IA_IFCHR: +        case IA_IFIFO: +                afr_sh_entry_impunge_mknod (impunge_frame, this, +                                            child_index, buf); +                break; +        case IA_IFLNK: +                afr_sh_entry_impunge_readlink (impunge_frame, this, +                                               child_index, buf); +                break; +        default: +                gf_log (this->name, GF_LOG_ERROR, +                        "%s has unknown file type on %s: 0%o", +                        impunge_local->loc.path, +                        priv->children[active_src]->name, type); +                sh->impunge_done (frame, this, -1, EINVAL); +                break; +        } + +        return 0; +} + +gf_boolean_t +afr_sh_need_recreate (afr_self_heal_t *impunge_sh, unsigned int child, +                      unsigned int child_count) +{ +        gf_boolean_t    recreate = _gf_false; + +        GF_ASSERT (impunge_sh->child_errno); + +        if (child == impunge_sh->active_source) +                goto out; + +        if (IA_IFLNK == impunge_sh->entrybuf.ia_type) { +                recreate = _gf_true; +                goto out; +        } + +        if (impunge_sh->child_errno[child] == ENOENT) +                
recreate = _gf_true; +out: +        return recreate; +} + +unsigned int +afr_sh_recreate_count (afr_self_heal_t *impunge_sh, int *sources, +                       unsigned int child_count) +{ +        int             count = 0; +        int             i = 0; + +        for (i = 0; i < child_count; i++) { +                if (afr_sh_need_recreate (impunge_sh, i, child_count)) +                        count++; +        } + +        return count; +} + +int +afr_sh_entry_call_impunge_recreate (call_frame_t *impunge_frame, +                                    xlator_t *this) +{ +        afr_private_t   *priv = NULL; +        afr_local_t     *impunge_local = NULL; +        afr_self_heal_t *impunge_sh = NULL; +        call_frame_t    *frame = NULL; +        afr_local_t     *local = NULL; +        afr_self_heal_t *sh = NULL; +        unsigned int     recreate_count = 0; +        int              i = 0; +        int              active_src = 0; + +        priv          = this->private; +        AFR_INIT_SH_FRAME_VALS (impunge_frame, impunge_local, impunge_sh, +                                frame, local, sh); +        active_src = impunge_sh->active_source; +        impunge_sh->entrybuf = impunge_sh->buf[active_src]; +        impunge_sh->parentbuf = impunge_sh->parentbufs[active_src]; +        recreate_count = afr_sh_recreate_count (impunge_sh, sh->sources, +                                                priv->child_count); +        if (!recreate_count) { +                afr_sh_entry_call_impunge_done (impunge_frame, this, 0, 0); +                goto out; +        } +        impunge_local->call_count = recreate_count; +        for (i = 0; i < priv->child_count; i++) { +                if (!impunge_local->child_up[i]) { +                        impunge_sh->child_errno[i] = ENOTCONN; +                        continue; +                } +                if (!afr_sh_need_recreate (impunge_sh, i, priv->child_count)) { +                        impunge_sh->child_errno[i] = 
EEXIST; +                        continue; +                } +        } +        for (i = 0; i < priv->child_count; i++) { +                if (!afr_sh_need_recreate (impunge_sh, i, priv->child_count)) +                        continue; +                (void)afr_sh_entry_impunge_create (impunge_frame, this, i); +                recreate_count--; +        } +        GF_ASSERT (!recreate_count); +out: +        return 0; +} + +void +afr_sh_entry_common_lookup_done (call_frame_t *impunge_frame, xlator_t *this, +                                 int32_t op_ret, int32_t op_errno) +{ +        afr_private_t   *priv = NULL; +        afr_local_t     *impunge_local = NULL; +        afr_self_heal_t *impunge_sh = NULL; +        call_frame_t    *frame = NULL; +        afr_local_t     *local = NULL; +        afr_self_heal_t *sh = NULL; +        unsigned int     gfid_miss_count = 0; +        unsigned int     children_up_count = 0; +        uuid_t           gfid = {0}; +        int              active_src = 0; + +        priv          = this->private; +        AFR_INIT_SH_FRAME_VALS (impunge_frame, impunge_local, impunge_sh, +                                frame, local, sh); +        active_src    = impunge_sh->active_source; + +        if (op_ret < 0) +                goto done; +        if (impunge_sh->child_errno[active_src]) { +                op_ret = -1; +                op_errno = impunge_sh->child_errno[active_src]; +                goto done; +        } + +        gfid_miss_count = afr_gfid_missing_count (this->name, +                                                  impunge_sh->success_children, +                                                  impunge_sh->buf, priv->child_count, +                                                  impunge_local->loc.path); +        children_up_count = afr_up_children_count (impunge_local->child_up, +                                                   priv->child_count); +        if ((gfid_miss_count == children_up_count) && +            
(children_up_count < priv->child_count)) { +                op_ret = -1; +                op_errno = ENODATA; +                gf_log (this->name, GF_LOG_ERROR, "Not all children are up, " +                        "gfid should not be assigned in this state for %s", +                        impunge_local->loc.path); +                goto done; +        } + +        if (gfid_miss_count) { +                afr_update_gfid_from_iatts (gfid, impunge_sh->buf, +                                            impunge_sh->success_children, +                                            priv->child_count); +                if (uuid_is_null (gfid)) { +                        sh->entries_skipped = _gf_true; +                        gf_log (this->name, GF_LOG_INFO, "%s: Skipping entry " +                                "self-heal because of gfid absence", +                                impunge_local->loc.path); +                        goto done; +                } +                afr_sh_common_lookup (impunge_frame, this, &impunge_local->loc, +                                      afr_sh_entry_common_lookup_done, gfid, +                                      AFR_LOOKUP_FAIL_CONFLICTS | +                                      AFR_LOOKUP_FAIL_MISSING_GFIDS, +                                      NULL); +        } else { +                afr_sh_entry_call_impunge_recreate (impunge_frame, this); +        } +        return; +done: +        afr_sh_entry_call_impunge_done (impunge_frame, this, +                                        op_ret, op_errno); +        return; +} + +int +afr_sh_entry_impunge_entry (call_frame_t *frame, xlator_t *this, +                            gf_dirent_t *entry) +{ +        afr_local_t     *local  = NULL; +        afr_self_heal_t *sh  = NULL; +        afr_self_heal_t *impunge_sh  = NULL; +        int              ret = -1; +        call_frame_t    *impunge_frame = NULL; +        afr_local_t     *impunge_local = NULL; +        int              active_src = 0; 
+        int              op_errno = 0; +        int              op_ret = -1; + +        local = frame->local; +        sh = &local->self_heal; + +        active_src = sh->active_source; +        sh->impunge_done = afr_sh_entry_impunge_entry_done; + +        if (can_skip_entry_self_heal (entry->d_name, &local->loc)) { +                op_ret = 0; +                goto out; +        } + +        gf_log (this->name, GF_LOG_TRACE, +                "inspecting existence of %s under %s", +                entry->d_name, local->loc.path); + +        ret = afr_impunge_frame_create (frame, this, active_src, +                                        &impunge_frame); +        if (ret) { +                op_errno = -ret; +                goto out; +        } + +        impunge_local = impunge_frame->local; +        impunge_sh = &impunge_local->self_heal; +        ret = afr_build_child_loc (this, &impunge_local->loc, &local->loc, +                                   entry->d_name); +        loc_copy (&impunge_sh->parent_loc, &local->loc); +        if (ret != 0) { +                op_errno = ENOMEM; +                goto out; +        } + +        afr_sh_common_lookup (impunge_frame, this, &impunge_local->loc, +                              afr_sh_entry_common_lookup_done, NULL, +                              AFR_LOOKUP_FAIL_CONFLICTS, NULL); + +        op_ret = 0; +out: +        if (ret) { +                if (impunge_frame) +                        AFR_STACK_DESTROY (impunge_frame); +                sh->impunge_done (frame, this, op_ret, op_errno); +        } + +        return 0; +} + + +int +afr_sh_entry_impunge_readdir_cbk (call_frame_t *frame, void *cookie, +                                  xlator_t *this, +                                  int32_t op_ret, int32_t op_errno, +                                  gf_dirent_t *entries, dict_t *xdata) +{ +        afr_private_t   *priv = NULL; +        afr_local_t     *local  = NULL; +        afr_self_heal_t *sh  = NULL; +        
gf_dirent_t     *entry = NULL; +        off_t            last_offset = 0; +        int              active_src = 0; +        int              entry_count = 0; + +        priv = this->private; +        local = frame->local; +        sh = &local->self_heal; + +        active_src = sh->active_source; + +        if (op_ret <= 0) { +                if (op_ret < 0) { +                        gf_log (this->name, GF_LOG_INFO, +                                "readdir of %s on subvolume %s failed (%s)", +                                local->loc.path, +                                priv->children[active_src]->name, +                                strerror (op_errno)); +                        afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); +                } else { +                        gf_log (this->name, GF_LOG_TRACE, +                                "readdir of %s on subvolume %s complete", +                                local->loc.path, +                                priv->children[active_src]->name); +                } + +                afr_sh_entry_impunge_all (frame, this); +                return 0; +        } + +        list_for_each_entry (entry, &entries->list, list) { +                last_offset = entry->d_off; +                entry_count++; +        } + +        gf_log (this->name, GF_LOG_DEBUG, +                "readdir'ed %d entries from %s", +                entry_count, priv->children[active_src]->name); + +        sh->offset = last_offset; +        local->call_count = entry_count; + +        list_for_each_entry (entry, &entries->list, list) { +                afr_sh_entry_impunge_entry (frame, this, entry); +        } + +        return 0; +} + + +int +afr_sh_entry_impunge_subvol (call_frame_t *frame, xlator_t *this) +{ +        afr_private_t   *priv = NULL; +        afr_local_t     *local  = NULL; +        afr_self_heal_t *sh  = NULL; +        int32_t         active_src = 0; + +        priv = this->private; +        local = frame->local; 
+        sh = &local->self_heal; +        active_src = sh->active_source; +        /* sh->offset is loaded from an off_t in the readdir callback; cast it +         * and print it with %lld, since %zd is the ssize_t conversion. */ +        gf_log (this->name, GF_LOG_DEBUG, "%s: readdir from offset %lld", +                local->loc.path, (long long) sh->offset); + +        STACK_WIND (frame, afr_sh_entry_impunge_readdir_cbk, +                    priv->children[active_src], +                    priv->children[active_src]->fops->readdirp, +                    sh->healing_fd, sh->block_size, sh->offset, NULL); + +        return 0; +} + +/* Move on to the next active source and read its entries from offset 0. Ends the entry self-heal if it has already failed, and goes on to erase the pending xattrs once next_active_source returns -1. */ +int +afr_sh_entry_impunge_all (call_frame_t *frame, xlator_t *this) +{ +        afr_private_t   *priv = NULL; +        afr_local_t     *local  = NULL; +        afr_self_heal_t *sh  = NULL; +        int              active_src = -1; + +        priv = this->private; +        local = frame->local; +        sh = &local->self_heal; + +        sh->offset = 0; + +        active_src = next_active_source (frame, this, sh->active_source); +        sh->active_source = active_src; + +        if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { +                afr_sh_entry_finish (frame, this); +                return 0; +        } + +        if (active_src == -1) { +                /* completed creating missing files on all subvolumes */ +                afr_sh_entry_erase_pending (frame, this); +                return 0; +        } + +        gf_log (this->name, GF_LOG_TRACE, +                "impunging entries of %s on %s to other sinks", +                local->loc.path, priv->children[active_src]->name); + +        afr_sh_entry_impunge_subvol (frame, this); + +        return 0; +} + +/* Reply to one opendir wound by afr_sh_entry_open; cookie carries the child index. A failed opendir marks the self-heal as failed. On the last reply the heal is either finished (failure) or continues with afr_sh_entry_expunge_all. */ +int +afr_sh_entry_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                          int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) +{ +        afr_local_t     *local = NULL; +        afr_self_heal_t *sh = NULL; +        afr_private_t   *priv = NULL; +        int              call_count = 0; +        int              child_index = 0; + +        local = frame->local; +        sh = 
&local->self_heal; +        priv = this->private; + +        child_index = (long) cookie; + +        /* TODO: some of the open's might fail. +           In that case, modify cleanup fn to send flush on those +           fd's which are already open */ + +        LOCK (&frame->lock); +        { +                if (op_ret == -1) { +                        gf_log (this->name, GF_LOG_ERROR, +                                "opendir of %s failed on child %s (%s)", +                                local->loc.path, +                                priv->children[child_index]->name, +                                strerror (op_errno)); +                        afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); +                } +        } +        UNLOCK (&frame->lock); + +        call_count = afr_frame_return (frame); + +        if (call_count == 0) { +                if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { +                        afr_sh_entry_finish (frame, this); +                        return 0; +                } +                gf_log (this->name, GF_LOG_TRACE, +                        "fd for %s opened, commencing sync", +                        local->loc.path); + +                sh->active_source = -1; +                afr_sh_entry_expunge_all (frame, this); +        } + +        return 0; +} + +/* Create the healing fd and wind opendir on the source (when there is one) and on each up child that is not a source; one reply per call is expected in afr_sh_entry_opendir_cbk. */ +int +afr_sh_entry_open (call_frame_t *frame, xlator_t *this) +{ +        int i = 0; +        int call_count = 0; + +        int source = -1; +        int *sources = NULL; + +        fd_t *fd = NULL; + +        afr_local_t *   local = NULL; +        afr_private_t * priv  = NULL; +        afr_self_heal_t *sh = NULL; + +        local = frame->local; +        sh = &local->self_heal; +        priv = this->private; + +        source  = local->self_heal.source; +        sources = local->self_heal.sources; + +        sh->block_size = priv->sh_readdir_size; +        sh->offset = 0; + +        call_count = sh->active_sinks; +        if (source != -1) +          
      call_count++; + +        local->call_count = call_count; + +        fd = fd_create (local->loc.inode, frame->root->pid); +        if (!fd) { +                /* No fd could be created: do not wind opendir with a NULL fd. +                 * Fail the self-heal the same way a failed opendir does. */ +                gf_log (this->name, GF_LOG_ERROR, +                        "failed to create fd for %s", local->loc.path); +                afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); +                afr_sh_entry_finish (frame, this); +                return 0; +        } +        sh->healing_fd = fd; + +        if (source != -1) { +                gf_log (this->name, GF_LOG_TRACE, +                        "opening directory %s on subvolume %s (source)", +                        local->loc.path, priv->children[source]->name); + +                /* open source */ +                STACK_WIND_COOKIE (frame, afr_sh_entry_opendir_cbk, +                                   (void *) (long) source, +                                   priv->children[source], +                                   priv->children[source]->fops->opendir, +                                   &local->loc, fd, NULL); +                call_count--; +        } + +        /* open sinks */ +        for (i = 0; i < priv->child_count; i++) { +                if (sources[i] || !local->child_up[i]) +                        continue; + +                gf_log (this->name, GF_LOG_TRACE, +                        "opening directory %s on subvolume %s (sink)", +                        local->loc.path, priv->children[i]->name); + +                STACK_WIND_COOKIE (frame, afr_sh_entry_opendir_cbk, +                                   (void *) (long) i, +                                   priv->children[i], +                                   priv->children[i]->fops->opendir, +                                   &local->loc, fd, NULL); + +                /* stop once every expected opendir has been wound */ +                if (!--call_count) +                        break; +        } + +        return 0; +} + +/* Mark sources and sinks, then start the sync through afr_sh_entry_open unless there is nothing to heal to (no active sinks, or no source and fewer than two sinks). */ +int +afr_sh_entry_sync_prepare (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t     *local = NULL; +        afr_self_heal_t *sh = NULL; +        afr_private_t   *priv = NULL; +        int              source = 0; + +        local = frame->local; +        sh = &local->self_heal; +        priv = this->private; + +        source = sh->source; + +        afr_sh_mark_source_sinks (frame, this); +        if (source 
!= -1) +                sh->success[source] = 1; +        /* Nothing to sync to, or (second check) no source and a single sink to merge: finish without healing. */ +        if (sh->active_sinks == 0) { +                gf_log (this->name, GF_LOG_TRACE, +                        "no active sinks for self-heal on dir %s", +                        local->loc.path); +                afr_sh_entry_finish (frame, this); +                return 0; +        } +        if (source == -1 && sh->active_sinks < 2) { +                gf_log (this->name, GF_LOG_TRACE, +                        "cannot sync with 0 sources and 1 sink on dir %s", +                        local->loc.path); +                afr_sh_entry_finish (frame, this); +                return 0; +        } +        /* source == -1 from here on means a conservative merge of all entries */ +        if (source != -1) +                gf_log (this->name, GF_LOG_DEBUG, +                        "self-healing directory %s from subvolume %s to " +                        "%d other", +                        local->loc.path, priv->children[source]->name, +                        sh->active_sinks); +        else +                gf_log (this->name, GF_LOG_DEBUG, +                        "no active sources for %s found. 
" +                        "merging all entries as a conservative decision", +                        local->loc.path); + +        sh->actual_sh_started = _gf_true; +        afr_set_self_heal_status (sh, AFR_SELF_HEAL_SYNC_BEGIN); +        afr_sh_entry_open (frame, this); + +        return 0; +} + + +void +afr_sh_entry_fix (call_frame_t *frame, xlator_t *this, +                  int32_t op_ret, int32_t op_errno) +{ +        afr_local_t     *local = NULL; +        afr_self_heal_t *sh = NULL; +        afr_private_t   *priv = NULL; +        int              source = 0; +        int              nsources = 0; +        int32_t          subvol_status = 0; + +        local = frame->local; +        sh = &local->self_heal; +        priv = this->private; + +        if (op_ret < 0) { +                afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); +                afr_sh_set_error (sh, op_errno); +                afr_sh_entry_finish (frame, this); +                goto out; +        } + +        if (sh->forced_merge) { +                sh->source = -1; +                goto heal; +        } + +        nsources = afr_build_sources (this, sh->xattr, sh->buf, +                                      sh->pending_matrix, sh->sources, +                                      sh->success_children, +                                      AFR_ENTRY_TRANSACTION, &subvol_status, +                                      _gf_true); +        if ((subvol_status & ALL_FOOLS) || +            (subvol_status & SPLIT_BRAIN)) { +                gf_log (this->name, GF_LOG_INFO, "%s: Performing conservative " +                        "merge", local->loc.path); +                source = -1; +                memset (sh->sources, 0, +                        sizeof (*sh->sources) * priv->child_count); +        } else if (nsources == 0) { +                gf_log (this->name, GF_LOG_TRACE, +                        "No self-heal needed for %s", +                        local->loc.path); + +                
afr_sh_entry_finish (frame, this); +                return; +        } else { +                source = afr_sh_select_source (sh->sources, priv->child_count); +        } +        /* -1 here selects the conservative merge; otherwise this is the chosen source child */ +        sh->source = source; + +        afr_reset_children (sh->fresh_children, priv->child_count); +        afr_get_fresh_children (sh->success_children, sh->sources, +                                sh->fresh_children, priv->child_count); +        if (sh->source >= 0) +                afr_inode_set_read_ctx (this, sh->inode, sh->source, +                                        sh->fresh_children); +/* a forced merge (sh->forced_merge) jumps straight here with sh->source == -1 */ +heal: +        afr_sh_entry_sync_prepare (frame, this); +out: +        return; +} +/* Continuation after the non-blocking entrylk attempt. On lock failure the self-heal is marked failed and entry heal ends; otherwise a lookup is started whose result is handled by afr_sh_entry_fix. */ +int +afr_sh_post_nonblocking_entry_cbk (call_frame_t *frame, xlator_t *this) +{ +        afr_internal_lock_t *int_lock = NULL; +        afr_local_t         *local    = NULL; +        afr_self_heal_t     *sh       = NULL; + +        local    = frame->local; +        int_lock = &local->internal_lock; +        sh       = &local->self_heal; + +        if (int_lock->lock_op_ret < 0) { +                gf_log (this->name, GF_LOG_ERROR, "Non Blocking entrylks " +                        "failed for %s.", local->loc.path); +                afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); +                afr_sh_entry_done (frame, this); +        } else { + +                gf_log (this->name, GF_LOG_DEBUG, "Non Blocking entrylks done " +                        "for %s. 
Proceeding to FOP", local->loc.path); +                afr_sh_common_lookup (frame, this, &local->loc, +                                      afr_sh_entry_fix, NULL, +                                      AFR_LOOKUP_FAIL_CONFLICTS | +                                      AFR_LOOKUP_FAIL_MISSING_GFIDS, +                                      NULL); +        } + +        return 0; +} + +int +afr_self_heal_entry (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t     *local = NULL; +        afr_private_t   *priv = NULL; +        afr_self_heal_t *sh = NULL; + +        priv = this->private; +        local = frame->local; +        sh = &local->self_heal; + +        sh->sh_type_in_action = AFR_SELF_HEAL_ENTRY; + +        if (local->self_heal.do_entry_self_heal && priv->entry_self_heal) { +                afr_set_self_heal_status (sh, AFR_SELF_HEAL_STARTED); +                afr_sh_entrylk (frame, this, &local->loc, NULL, +                                afr_sh_post_nonblocking_entry_cbk); +        } else { +                gf_log (this->name, GF_LOG_TRACE, +                        "proceeding to completion on %s", +                        local->loc.path); +                afr_sh_entry_done (frame, this); +        } + +        return 0; +} diff --git a/xlators/cluster/afr-v1/src/afr-self-heal-metadata.c b/xlators/cluster/afr-v1/src/afr-self-heal-metadata.c new file mode 100644 index 000000000..fd5da6cfd --- /dev/null +++ b/xlators/cluster/afr-v1/src/afr-self-heal-metadata.c @@ -0,0 +1,770 @@ +/* +  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> +  This file is part of GlusterFS. + +  This file is licensed to you under your choice of the GNU Lesser +  General Public License, version 3 or any later version (LGPLv3 or +  later), or the GNU General Public License, version 2 (GPLv2), in all +  cases as published by the Free Software Foundation. 
+*/ + +#include <libgen.h> +#include <unistd.h> +#include <fnmatch.h> +#include <sys/time.h> +#include <stdlib.h> +#include <signal.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "afr.h" +#include "dict.h" +#include "xlator.h" +#include "hashfn.h" +#include "logging.h" +#include "stack.h" +#include "list.h" +#include "call-stub.h" +#include "defaults.h" +#include "common-utils.h" +#include "compat-errno.h" +#include "compat.h" +#include "byte-order.h" + +#include "afr-transaction.h" +#include "afr-self-heal.h" +#include "afr-self-heal-common.h" + +/* End of the metadata phase: reset the self-heal state, then continue with entry self-heal for directories or data self-heal for anything else. */ +int +afr_sh_metadata_done (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t     *local = NULL; +        afr_self_heal_t *sh = NULL; + +        local = frame->local; +        sh = &local->self_heal; + +        afr_sh_reset (frame, this); +        if (IA_ISDIR (sh->type)) { +                gf_log (this->name, GF_LOG_DEBUG, +                        "proceeding to entry check on %s", +                        local->loc.path); +                afr_self_heal_entry (frame, this); +        } else { +                gf_log (this->name, GF_LOG_DEBUG, +                        "proceeding to data check on %s", +                        local->loc.path); +                afr_self_heal_data (frame, this); +        } + +        return 0; +} +/* Release the locks with afr_unlock; afr_sh_metadata_done is installed as the lock callback. */ +int +afr_sh_inode_unlock (call_frame_t *frame, xlator_t *this) +{ +        afr_internal_lock_t *int_lock = NULL; +        afr_local_t         *local    = NULL; + +        local    = frame->local; +        int_lock = &local->internal_lock; + +        int_lock->lock_cbk = afr_sh_metadata_done; +        afr_unlock (frame, this); + +        return 0; +} +/* Finish metadata self-heal by unlocking through afr_sh_inode_unlock. */ +int +afr_sh_metadata_finish (call_frame_t *frame, xlator_t *this) +{ +        afr_sh_inode_unlock (frame, this); + +        return 0; +} +/* Mark the self-heal as failed, then finish. */ +int +afr_sh_metadata_fail (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t         *local    = NULL; +        
afr_self_heal_t     *sh       = NULL; + +        local    = frame->local; +        sh       = &local->self_heal; + +        afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); +        afr_sh_metadata_finish (frame, this); +        return 0; +} +/* Reply to one pending-xattr erase; cookie carries the child index. When the source inode is neither a regular file nor a directory, the child is added to sh->fresh_children and, on the last reply, the inode read ctx is updated. The last reply finishes metadata self-heal. NOTE(review): op_ret is not examined here - confirm that is intended. */ +int +afr_sh_metadata_erase_pending_cbk (call_frame_t *frame, void *cookie, +                                   xlator_t *this, int32_t op_ret, +                                   int32_t op_errno, dict_t *xattr, dict_t *xdata) +{ +        afr_local_t     *local     = NULL; +        int             call_count = 0; +        long            i          = 0; +        afr_self_heal_t *sh        = NULL; +        afr_private_t   *priv      = NULL; + +        local = frame->local; +        priv  = this->private; +        sh = &local->self_heal; +        i = (long)cookie; + +        if ((!IA_ISREG (sh->buf[sh->source].ia_type)) && +            (!IA_ISDIR (sh->buf[sh->source].ia_type))) { +                afr_children_add_child (sh->fresh_children, i, +                                        priv->child_count); +        } +        call_count = afr_frame_return (frame); + +        if (call_count == 0) { +                if ((!IA_ISREG (sh->buf[sh->source].ia_type)) && +                    (!IA_ISDIR (sh->buf[sh->source].ia_type))) { +                        afr_inode_set_read_ctx (this, sh->inode, sh->source, +                                                sh->fresh_children); +                } +                afr_sh_metadata_finish (frame, this); +        } + +        return 0; +} +/* Erase the metadata pending xattrs through afr_sh_erase_pending, with afr_sh_metadata_erase_pending_cbk as the reply handler. */ +int +afr_sh_metadata_erase_pending (call_frame_t *frame, xlator_t *this) +{ +         afr_sh_erase_pending (frame, this, AFR_METADATA_TRANSACTION, +                               afr_sh_metadata_erase_pending_cbk, +                               afr_sh_metadata_finish); +         return 0; +} + +/* Common reply handler for the setattr and setxattr calls sent to a sink; cookie carries the child index. A failure clears sh->success[child]. After the last reply local->xattr_req is released and the pending xattrs are erased. */ +int +afr_sh_metadata_sync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                          int32_t op_ret, int32_t op_errno, dict_t 
*xdata) +{ +        afr_local_t     *local = NULL; +        afr_self_heal_t *sh = NULL; +        afr_private_t   *priv = NULL; +        int              call_count = 0; +        int              child_index = 0; + + +        local = frame->local; +        sh = &local->self_heal; +        priv = this->private; + +        child_index = (long) cookie; + +        LOCK (&frame->lock); +        { +                if (op_ret == -1) { +                        gf_log (this->name, GF_LOG_INFO, +                                "setting attributes failed for %s on %s (%s)", +                                local->loc.path, +                                priv->children[child_index]->name, +                                strerror (op_errno)); + +                        sh->success[child_index] = 0; +                } +        } +        UNLOCK (&frame->lock); + +        call_count = afr_frame_return (frame); +        /* last reply: drop the xattr dict reference and move on to erasing the pending xattrs */ +        if (call_count == 0) { +                if (local->xattr_req) { +                        dict_unref (local->xattr_req); +                        local->xattr_req = NULL; +                } +                afr_sh_metadata_erase_pending (frame, this); +        } + +        return 0; +} + +/* setattr reply: hand the result to afr_sh_metadata_sync_cbk. */ +int +afr_sh_metadata_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                             int32_t op_ret, int32_t op_errno, +                             struct iatt *preop, struct iatt *postop, dict_t *xdata) +{ +        afr_sh_metadata_sync_cbk (frame, cookie, this, op_ret, op_errno, xdata); + +        return 0; +} + +/* setxattr reply: hand the result to afr_sh_metadata_sync_cbk. */ +int +afr_sh_metadata_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                           int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ +        afr_sh_metadata_sync_cbk (frame, cookie, this, op_ret, op_errno, xdata); + +        return 0; +} +/* removexattr reply from a sink. On success setxattr is wound to the same child with local->xattr_req; on failure the error is reported through afr_sh_metadata_sync_cbk. */ +int +afr_sh_removexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                        int32_t op_ret, int32_t op_errno, +                        
dict_t *xdata) +{ +        int            i     = 0; +        afr_private_t *priv  = NULL; +        afr_local_t   *local = NULL; + +        priv = this->private; +        local = frame->local; + +        if (op_ret < 0) { +                afr_sh_metadata_sync_cbk (frame, cookie, +                                          this, -1, op_errno, xdata); +                goto out; +        } + +        i = (long) cookie; + +        STACK_WIND_COOKIE (frame, afr_sh_metadata_xattr_cbk, +                           (void *) (long) i, +                           priv->children[i], +                           priv->children[i]->fops->setxattr, +                           &local->loc, local->xattr_req, 0, NULL); + + out: +        return 0; +} +/* Remove GF_SELINUX_XATTR_KEY from xattr_dict. */ +inline void +afr_prune_special_keys (dict_t *xattr_dict) +{ +        dict_del (xattr_dict, GF_SELINUX_XATTR_KEY); +} +/* Remove the pending key of every child from xattr_dict. */ +inline void +afr_prune_pending_keys (dict_t *xattr_dict, afr_private_t *priv) +{ +        int i = 0; + +        for (; i < priv->child_count; i++) { +                dict_del (xattr_dict, priv->pending_key[i]); +        } +} +/* getxattr reply from a sink. The pending and special keys are pruned from the returned dict, which is then passed as xdata of a removexattr on the same child. A failure is reported through afr_sh_metadata_sync_cbk. */ +int +afr_sh_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                     int32_t op_ret, int32_t op_errno, dict_t *xattr, +                     dict_t *xdata) +{ +        int            i     = 0; +        afr_private_t *priv  = NULL; +        afr_local_t   *local = NULL; + +        priv = this->private; +        local = frame->local; + +        if (op_ret < 0) { +                afr_sh_metadata_sync_cbk (frame, cookie, +                                          this, -1, op_errno, xdata); +                goto out; +        } + +        afr_prune_pending_keys (xattr, priv); + +        afr_prune_special_keys (xattr); + +        i = (long) cookie; + +        /* send removexattr in bulk via xdata */ +        STACK_WIND_COOKIE (frame, afr_sh_removexattr_cbk, +                           cookie, +                           priv->children[i], +                           
priv->children[i]->fops->removexattr, +                           &local->loc, "", xattr); + + out: +        return 0; +} +/* Copy the attributes of the source child (times, uid/gid, type and protection bits) to every up child that is not a source with setattr. When xattr is non-NULL a getxattr is also wound per sink, which starts the removexattr/setxattr chain in the callbacks; two replies per sink are then expected. */ +int +afr_sh_metadata_sync (call_frame_t *frame, xlator_t *this, dict_t *xattr) +{ +        afr_local_t     *local = NULL; +        afr_self_heal_t *sh = NULL; +        afr_private_t   *priv = NULL; +        int              source = 0; +        int              active_sinks = 0; +        int              call_count = 0; +        int              i = 0; + +        struct iatt      stbuf = {0,}; +        int32_t          valid = 0; + +        local = frame->local; +        sh = &local->self_heal; +        priv = this->private; + +        source = sh->source; +        active_sinks = sh->active_sinks; + +        /* +         * 2 calls per sink - setattr, setxattr +         */ +        if (xattr) { +                call_count = active_sinks * 2; +                local->xattr_req = dict_ref (xattr); +        } else +                call_count = active_sinks; + +        local->call_count = call_count; +        /* stbuf carries the values taken from the source; valid selects which of them setattr applies */ +        stbuf.ia_atime = sh->buf[source].ia_atime; +        stbuf.ia_atime_nsec = sh->buf[source].ia_atime_nsec; +        stbuf.ia_mtime = sh->buf[source].ia_mtime; +        stbuf.ia_mtime_nsec = sh->buf[source].ia_mtime_nsec; + +        stbuf.ia_uid = sh->buf[source].ia_uid; +        stbuf.ia_gid = sh->buf[source].ia_gid; + +        stbuf.ia_type = sh->buf[source].ia_type; +        stbuf.ia_prot = sh->buf[source].ia_prot; + +        valid = GF_SET_ATTR_MODE  | +                GF_SET_ATTR_UID   | GF_SET_ATTR_GID | +                GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME; + +        for (i = 0; i < priv->child_count; i++) { +                if (call_count == 0) { +                        break; +                } +                if (sh->sources[i] || !local->child_up[i]) +                        continue; + +                gf_log (this->name, GF_LOG_DEBUG, +                        "self-healing metadata of %s from %s to %s", +                        
local->loc.path, priv->children[source]->name, +                        priv->children[i]->name); + +                STACK_WIND_COOKIE (frame, afr_sh_metadata_setattr_cbk, +                                   (void *) (long) i, +                                   priv->children[i], +                                   priv->children[i]->fops->setattr, +                                   &local->loc, &stbuf, valid, NULL); + +                call_count--; +                /* no source xattrs to apply: setattr is the only call for this sink */ +                if (!xattr) +                        continue; + +                STACK_WIND_COOKIE (frame, afr_sh_getxattr_cbk, +                                   (void *) (long) i, +                                   priv->children[i], +                                   priv->children[i]->fops->getxattr, +                                   &local->loc, NULL, NULL); +                call_count--; +        } + +        return 0; +} + +/* getxattr reply from the source child. On success the pending keys are pruned and the xattrs are handed to afr_sh_metadata_sync; on failure the sync goes ahead without xattrs. */ +int +afr_sh_metadata_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                              int32_t op_ret, int32_t op_errno, dict_t *xattr, +                              dict_t *xdata) +{ +        afr_local_t     *local = NULL; +        afr_self_heal_t *sh = NULL; +        afr_private_t   *priv = NULL; +        int              source = 0; + +        local = frame->local; +        sh = &local->self_heal; +        priv = this->private; + +        source = sh->source; + +        if (op_ret == -1) { +                gf_log (this->name, GF_LOG_DEBUG, +                        "getxattr of %s failed on subvolume %s (%s). 
proceeding without xattr", +                        local->loc.path, priv->children[source]->name, +                        strerror (op_errno)); + +                afr_sh_metadata_sync (frame, this, NULL); +        } else { +                afr_prune_pending_keys (xattr, priv); +                afr_sh_metadata_sync (frame, this, xattr); +        } + +        return 0; +} + +static void +afr_set_metadata_sh_info_str (afr_local_t *local, afr_self_heal_t *sh, +                              xlator_t *this) +{ +        afr_private_t    *priv = NULL; +        int              i = 0; +        char             num[1024] = {0}; +        size_t           len = 0; +        char             *string = NULL; +        size_t           off = 0; +        char             *source_child =  " from source %s to"; +        char             *format = " %s, "; +        char             *string_msg = " metadata self heal"; +        char             *pending_matrix_str = NULL; +        int              down_child_present = 0; +        int              unknown_child_present = 0; +        char             *down_subvol_1 = " down subvolume is "; +        char             *unknown_subvol_1 = " unknown subvolume is"; +        char             *down_subvol_2 = " down subvolumes are "; +        char             *unknown_subvol_2 = " unknown subvolumes are "; +        int              down_count = 0; +        int              unknown_count = 0; + +        priv = this->private; + +        pending_matrix_str = afr_get_pending_matrix_str (sh->pending_matrix, +                                                         this); + +        if (!pending_matrix_str) +                pending_matrix_str = ""; + +        len += snprintf (num, sizeof (num), "%s", string_msg); + +        for (i = 0; i < priv->child_count; i++) { +                if ((sh->source == i) && (local->child_up[i] == 1)) { +                        len += snprintf (num, sizeof (num), source_child, +                                         
priv->children[i]->name); +                } else if ((local->child_up[i] == 1) && (sh->sources[i] == 0)) { +                        len += snprintf (num, sizeof (num), format, +                                         priv->children[i]->name); +                } else if (local->child_up[i] == 0) { +                        len += snprintf (num, sizeof (num), format, +                                         priv->children[i]->name); +                        if (!down_child_present) +                                down_child_present = 1; +                        down_count++; +                } else if (local->child_up[i] == -1) { +                        len += snprintf (num, sizeof (num), format, +                                         priv->children[i]->name); +                        if (!unknown_child_present) +                                unknown_child_present = 1; +                        unknown_count++; +                } +        } + +        if (down_child_present) { +                if (down_count > 1) { +                        len += snprintf (num, sizeof (num), "%s", +                                         down_subvol_2); +                } else { +                        len += snprintf (num, sizeof (num), "%s", +                                         down_subvol_1); +                } +        } +        if (unknown_child_present) { +                if (unknown_count > 1) { +                        len += snprintf (num, sizeof (num), "%s", +                                         unknown_subvol_2); +                } else { +                        len += snprintf (num, sizeof (num), "%s", +                                         unknown_subvol_1); +                } +        } + +        /* num was only a scratch target: the snprintf return values summed +         * in len give the size of the text; add one byte for the NUL. */ +        len ++; + +        string = GF_CALLOC (len, sizeof (char), gf_common_mt_char); +        if (!string) { +                /* Allocation failed: release the pending-matrix string before +                 * bailing out. It points at a string literal when none was +                 * returned, hence the same test the normal exit path uses. */ +                if (strcmp (pending_matrix_str, "")) +                        GF_FREE (pending_matrix_str); +                return; +        } + +        /* second pass: write the pieces measured above into string */ +        off += snprintf (string + off, len - off, "%s", string_msg); +        for (i=0; i < priv->child_count; i++) { +      
          if ((sh->source == i) && (local->child_up[i] == 1)) +                        off += snprintf (string + off, len - off, source_child, +                                         priv->children[i]->name); +        } +        /* sinks: children that are up and not marked as sources */ +        for (i = 0; i < priv->child_count; i++) { +                if ((local->child_up[i] == 1)&& (sh->sources[i] == 0)) +                        off += snprintf (string + off, len - off, format, +                                         priv->children[i]->name); +        } + +        if (down_child_present) { +                if (down_count > 1) { +                        off += snprintf (string + off, len - off, "%s", +                                         down_subvol_2); +                } else { +                        off += snprintf (string + off, len - off, "%s", +                                         down_subvol_1); +                } +        } +        /* names of the children recorded as down (child_up == 0) */ +        for (i = 0; i < priv->child_count; i++) { +                if (local->child_up[i] == 0) +                        off += snprintf (string + off, len - off, format, +                                         priv->children[i]->name); +        } + +        if (unknown_child_present) { +                if (unknown_count > 1) { +                        off += snprintf (string + off, len - off, "%s", +                                 unknown_subvol_2); +                } else { +                        off += snprintf (string + off, len - off, "%s", +                                         unknown_subvol_1); +                } +        } +        /* names of the children whose state is unknown (child_up == -1) */ +        for (i = 0; i < priv->child_count; i++) { +                if (local->child_up[i] == -1) +                        off += snprintf (string + off, len - off, format, +                                         priv->children[i]->name); +        } +        /* the text stored in sh->metadata_sh_info is the summary followed by the pending matrix */ +        gf_asprintf (&sh->metadata_sh_info, "%s metadata %s,", string, +                     pending_matrix_str); + +        if (pending_matrix_str && strcmp (pending_matrix_str, "")) +            
    GF_FREE (pending_matrix_str); + +        if (string && strcmp (string, "")) +                GF_FREE (string); +} +/* Mark sources and sinks; when there is at least one active sink, record the summary string and fetch the xattrs of the source child, continuing in afr_sh_metadata_getxattr_cbk. Otherwise finish. */ +int +afr_sh_metadata_sync_prepare (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t     *local = NULL; +        afr_self_heal_t *sh = NULL; +        afr_private_t   *priv = NULL; +        int              source = 0; + +        local = frame->local; +        sh = &local->self_heal; +        priv = this->private; + +        source = sh->source; + +        afr_sh_mark_source_sinks (frame, this); +        if (sh->active_sinks == 0) { +                gf_log (this->name, GF_LOG_DEBUG, +                        "no active sinks for performing self-heal on file %s", +                        local->loc.path); +                afr_sh_metadata_finish (frame, this); +                return 0; +        } + +        gf_log (this->name, GF_LOG_TRACE, +                "syncing metadata of %s from subvolume %s to %d active sinks", +                local->loc.path, priv->children[source]->name, +                sh->active_sinks); + +        sh->actual_sh_started = _gf_true; +        afr_set_self_heal_status (sh, AFR_SELF_HEAL_SYNC_BEGIN); +        afr_set_metadata_sh_info_str (local, sh, this); +        STACK_WIND (frame, afr_sh_metadata_getxattr_cbk, +                    priv->children[source], +                    priv->children[source]->fops->getxattr, +                    &local->loc, NULL, NULL); + +        return 0; +} + +/* Lookup completion handler for metadata self-heal (installed by afr_sh_metadata_post_nonblocking_inodelk_cbk): builds the source list, falls back to the favorite child when the sources conflict, selects the source and starts the sync when metadata self-heal is enabled. */ +void +afr_sh_metadata_fix (call_frame_t *frame, xlator_t *this, +                     int32_t op_ret, int32_t op_errno) +{ +        afr_local_t     *local = NULL; +        afr_self_heal_t *sh = NULL; +        afr_private_t   *priv = NULL; +        int              nsources = 0; +        int              source = 0; +        int              i = 0; + +        local = frame->local; +        sh = &local->self_heal; +        priv = this->private; + +        if (op_ret < 0) { +                afr_set_self_heal_status (sh, 
AFR_SELF_HEAL_FAILED); +                afr_sh_set_error (sh, op_errno); +                afr_sh_metadata_finish (frame, this); +                goto out; +        } +        nsources = afr_build_sources (this, sh->xattr, sh->buf, +                                      sh->pending_matrix, sh->sources, +                                      sh->success_children, +                                      AFR_METADATA_TRANSACTION, NULL, _gf_false); +        if ((nsources == -1) +            && (priv->favorite_child != -1) +            && (sh->child_errno[priv->favorite_child] == 0)) { + +                gf_log (this->name, GF_LOG_WARNING, +                        "Picking favorite child %s as authentic source to resolve conflicting metadata of %s", +                        priv->children[priv->favorite_child]->name, +                        local->loc.path); + +                sh->sources[priv->favorite_child] = 1; + +                nsources = afr_sh_source_count (sh->sources, +                                                priv->child_count); +        } +        /* still no source: log the split-brain, record it on the inode and fail */ +        if (nsources == -1) { +                afr_sh_print_split_brain_log (sh->pending_matrix, this, +                                              local->loc.path); +                afr_set_split_brain (this, sh->inode, SPB, DONT_KNOW); +                afr_sh_metadata_fail (frame, this); +                goto out; +        } +        /* not a split-brain: record NO_SPB on the inode before going on */ +        afr_set_split_brain (this, sh->inode, NO_SPB, DONT_KNOW); +        if (nsources == 0) { +                gf_log (this->name, GF_LOG_TRACE, +                        "No self-heal needed for %s", +                        local->loc.path); + +                afr_sh_metadata_finish (frame, this); +                goto out; +        } + +        source = afr_sh_select_source (sh->sources, priv->child_count); + +        if (source == -1) { +                gf_log (this->name, GF_LOG_DEBUG, +                        "No active sources found."); + +                afr_sh_metadata_finish 
(frame, this); +                goto out; +        } + +        sh->source = source; + +        /* detect changes not visible through pending flags -- JIC */ +        for (i = 0; i < priv->child_count; i++) { +                if (i == source || sh->child_errno[i]) +                        continue; + +                if (PERMISSION_DIFFERS (&sh->buf[i], &sh->buf[source])) +                        sh->sources[i] = 0; + +                if (OWNERSHIP_DIFFERS (&sh->buf[i], &sh->buf[source])) +                        sh->sources[i] = 0; +        } +        /* for inodes that are neither regular files nor directories, refresh the read ctx from the fresh children right away */ +        if ((!IA_ISREG (sh->buf[source].ia_type)) && +            (!IA_ISDIR (sh->buf[source].ia_type))) { +                afr_reset_children (sh->fresh_children, priv->child_count); +                afr_get_fresh_children (sh->success_children, sh->sources, +                                        sh->fresh_children, priv->child_count); +                afr_inode_set_read_ctx (this, sh->inode, sh->source, +                                        sh->fresh_children); +        } + +        if (sh->do_metadata_self_heal && priv->metadata_self_heal) +                afr_sh_metadata_sync_prepare (frame, this); +        else +                afr_sh_metadata_finish (frame, this); +out: +        return; +} +/* Continuation after the non-blocking inodelk attempt. Without the lock the metadata phase is skipped via afr_sh_metadata_done; with it a lookup is started whose result is handled by afr_sh_metadata_fix. */ +int +afr_sh_metadata_post_nonblocking_inodelk_cbk (call_frame_t *frame, +                                              xlator_t *this) +{ +        afr_internal_lock_t *int_lock = NULL; +        afr_local_t         *local    = NULL; + +        local    = frame->local; +        int_lock = &local->internal_lock; + +        if (int_lock->lock_op_ret < 0) { +                gf_log (this->name, GF_LOG_DEBUG, "Non Blocking metadata " +                        "inodelks failed for %s.", local->loc.path); +                gf_log (this->name, GF_LOG_DEBUG, "Metadata self-heal " +                        "failed for %s.", local->loc.path); +                afr_sh_metadata_done (frame, this); +        } else { + +                gf_log 
(this->name, GF_LOG_DEBUG, "Non Blocking metadata " +                        "inodelks done for %s. Proceeding to FOP", +                        local->loc.path); +                afr_sh_common_lookup (frame, this, &local->loc, +                                      afr_sh_metadata_fix, NULL, +                                      AFR_LOOKUP_FAIL_CONFLICTS | +                                      AFR_LOOKUP_FAIL_MISSING_GFIDS, +                                      NULL); +        } + +        return 0; +} + +int +afr_sh_metadata_lock (call_frame_t *frame, xlator_t *this) +{ +        afr_internal_lock_t *int_lock = NULL; +        afr_inodelk_t       *inodelk  = NULL; +        afr_local_t         *local    = NULL; + +        local    = frame->local; +        int_lock = &local->internal_lock; + +        int_lock->domain           = this->name; +        inodelk = afr_get_inodelk (int_lock, int_lock->domain); +        int_lock->transaction_lk_type = AFR_SELFHEAL_LK; +        int_lock->selfheal_lk_type    = AFR_METADATA_SELF_HEAL_LK; + +        afr_set_lock_number (frame, this); + +        inodelk->flock.l_start = LLONG_MAX - 1; +        inodelk->flock.l_len   = 0; +        inodelk->flock.l_type  = F_WRLCK; +        int_lock->lock_cbk         = afr_sh_metadata_post_nonblocking_inodelk_cbk; + +        afr_nonblocking_inodelk (frame, this); + +        return 0; +} + +gf_boolean_t +afr_can_start_metadata_self_heal (afr_self_heal_t *sh, afr_private_t *priv) +{ +        if (sh->force_confirm_spb) +                return _gf_true; +        if (sh->do_metadata_self_heal && priv->metadata_self_heal) +                return _gf_true; +        return _gf_false; +} + +int +afr_self_heal_metadata (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t   *local = NULL; +        afr_private_t *priv = this->private; +        afr_self_heal_t *sh = &local->self_heal; + +        local = frame->local; +        sh = &local->self_heal; +        sh->sh_type_in_action = 
AFR_SELF_HEAL_METADATA; + +        if (afr_can_start_metadata_self_heal (sh, priv)) { +                afr_set_self_heal_status (sh, AFR_SELF_HEAL_STARTED); +                afr_sh_metadata_lock (frame, this); +        } else { +                afr_sh_metadata_done (frame, this); +        } + +        return 0; +} diff --git a/xlators/cluster/afr-v1/src/afr-self-heal.h b/xlators/cluster/afr-v1/src/afr-self-heal.h new file mode 100644 index 000000000..7c9bc8111 --- /dev/null +++ b/xlators/cluster/afr-v1/src/afr-self-heal.h @@ -0,0 +1,43 @@ +/* +  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> +  This file is part of GlusterFS. + +  This file is licensed to you under your choice of the GNU Lesser +  General Public License, version 3 or any later version (LGPLv3 or +  later), or the GNU General Public License, version 2 (GPLv2), in all +  cases as published by the Free Software Foundation. +*/ + +#ifndef __AFR_SELF_HEAL_H__ +#define __AFR_SELF_HEAL_H__ + +#include <sys/stat.h> + +#define FILETYPE_DIFFERS(buf1,buf2) ((buf1)->ia_type != (buf2)->ia_type) +#define PERMISSION_DIFFERS(buf1,buf2) (st_mode_from_ia ((buf1)->ia_prot, (buf1)->ia_type) != st_mode_from_ia ((buf2)->ia_prot, (buf2)->ia_type)) +#define OWNERSHIP_DIFFERS(buf1,buf2) (((buf1)->ia_uid != (buf2)->ia_uid) || ((buf1)->ia_gid != (buf2)->ia_gid)) +#define SIZE_DIFFERS(buf1,buf2) ((buf1)->ia_size != (buf2)->ia_size) + +#define SIZE_GREATER(buf1,buf2) ((buf1)->ia_size > (buf2)->ia_size) + +int +afr_self_heal_entry (call_frame_t *frame, xlator_t *this); + +int +afr_self_heal_data (call_frame_t *frame, xlator_t *this); + +int +afr_self_heal_metadata (call_frame_t *frame, xlator_t *this); + +int +afr_self_heal_get_source (xlator_t *this, afr_local_t *local, dict_t **xattr); + +int +afr_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode); + +int +afr_lookup_select_read_child_by_txn_type (xlator_t *this, afr_local_t *local, +                                          dict_t **xattr, +       
                                   afr_transaction_type txn_type, +                                          uuid_t gfid); +#endif /* __AFR_SELF_HEAL_H__ */ diff --git a/xlators/cluster/afr-v1/src/afr-self-heald.c b/xlators/cluster/afr-v1/src/afr-self-heald.c new file mode 100644 index 000000000..9e5c1b3e7 --- /dev/null +++ b/xlators/cluster/afr-v1/src/afr-self-heald.c @@ -0,0 +1,1835 @@ +/* +  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> +  This file is part of GlusterFS. + +  This file is licensed to you under your choice of the GNU Lesser +  General Public License, version 3 or any later version (LGPLv3 or +  later), or the GNU General Public License, version 2 (GPLv2), in all +  cases as published by the Free Software Foundation. +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif +#include "afr.h" +#include "syncop.h" +#include "afr-self-heald.h" +#include "afr-self-heal-common.h" +#include "protocol-common.h" +#include "event-history.h" + +typedef enum { +        STOP_CRAWL_ON_SINGLE_SUBVOL = 1, +        STOP_INDEX_CRAWL_ON_PENDING_FULL_CRAWL = 2 +} afr_crawl_flags_t; + +typedef enum { +        HEAL = 1, +        INFO, +        STATISTICS_TO_BE_HEALED, +} shd_crawl_op; + +typedef struct shd_dump { +        dict_t   *dict; +        xlator_t *this; +        int      child; +} shd_dump_t; + +typedef struct shd_event_ { +        int     child; +        char    *path; +} shd_event_t; + +typedef struct shd_pos_ { +        int     child; +        xlator_t *this; +        afr_child_pos_t pos; +} shd_pos_t; + +typedef int +(*afr_crawl_done_cbk_t)  (int ret, call_frame_t *sync_frame, void *crawl_data); + +void +afr_start_crawl (xlator_t *this, int idx, afr_crawl_type_t crawl, +                 process_entry_cbk_t process_entry, void *op_data, +                 gf_boolean_t exclusive, int crawl_flags, +                 afr_crawl_done_cbk_t crawl_done); + +static int +_crawl_directory (fd_t *fd, loc_t *loc, afr_crawl_data_t 
*crawl_data); + +/* For calling straight through (e.g. already in a synctask). */ +int +afr_find_child_position (xlator_t *this, int child, afr_child_pos_t *pos); + +/* For deferring through a new synctask. */ +int +afr_syncop_find_child_position (void *data); + +static int +_loc_assign_gfid_path (loc_t *loc) +{ +        int  ret = -1; +        char gfid_path[64] = {0}; + +        if (loc->inode && !uuid_is_null (loc->inode->gfid)) { +                ret = inode_path (loc->inode, NULL, (char**)&loc->path); +        } else if (!uuid_is_null (loc->gfid)) { +                snprintf (gfid_path, sizeof (gfid_path), "<gfid:%s>", +                          uuid_utoa (loc->gfid)); +                loc->path = gf_strdup (gfid_path); +                if (loc->path) +                        ret = 0; +        } +        return ret; +} + +void +_destroy_crawl_event_data (void *data) +{ +        shd_crawl_event_t        *crawl_event = NULL; + +        if (!data) +                goto out; + +        crawl_event = (shd_crawl_event_t *)data; +        GF_FREE (crawl_event->start_time_str); +        GF_FREE (crawl_event->end_time_str); + +out: +        return; +} + +void +_destroy_shd_event_data (void *data) +{ +        shd_event_t             *event = NULL; +        if (!data) +                goto out; +        event = (shd_event_t*)data; +        GF_FREE (event->path); +out: +        return; +} +void +shd_cleanup_event (void *event) +{ +        shd_event_t *shd_event = event; + +        if (!shd_event) +                goto out; +        GF_FREE (shd_event->path); +        GF_FREE (shd_event); +out: +        return; +} + +int +afr_get_local_child (afr_self_heald_t *shd, unsigned int child_count) +{ +        int i = 0; +        int ret = -1; +        for (i = 0; i < child_count; i++) { +                if (shd->pos[i] == AFR_POS_LOCAL) { +                        ret = i; +                        break; +                } +        } +        return ret; +} + +static int 
+_build_index_loc (xlator_t *this, loc_t *loc, char *name, loc_t *parent) +{ +        int             ret = 0; + +        uuid_copy (loc->pargfid, parent->inode->gfid); +        loc->path = ""; +        loc->name = name; +        loc->parent = inode_ref (parent->inode); +        if (!loc->parent) { +                loc->path = NULL; +                loc_wipe (loc); +                ret = -1; +        } +        return ret; +} + +int +_add_crawl_stats_to_dict (xlator_t *this, dict_t *output, int child, +                          shd_crawl_event_t *shd_event, struct timeval *tv) +{ +        int             ret = 0; +        uint64_t        count = 0; +        char            key[256] = {0}; +        int             xl_id = 0; +        uint64_t        healed_count = 0; +        uint64_t        split_brain_count = 0; +        uint64_t        heal_failed_count = 0; +        char            *start_time_str = NULL; +        char            *end_time_str = NULL; +        char            *crawl_type = NULL; +        int             progress = -1; + +        healed_count = shd_event->healed_count; +        split_brain_count = shd_event->split_brain_count; +        heal_failed_count = shd_event->heal_failed_count; +        start_time_str = shd_event->start_time_str; +        end_time_str = shd_event->end_time_str; +        crawl_type = shd_event->crawl_type; + +        if (!start_time_str) { +                ret = -1; +                goto out; +        } + + +        ret = dict_get_int32 (output, this->name, &xl_id); +        if (ret) { +                gf_log (this->name, GF_LOG_ERROR, "xl does not have id"); +                goto out; +        } + +        snprintf (key, sizeof (key), "statistics-%d-%d-count", xl_id, child); +        ret = dict_get_uint64 (output, key, &count); + +        snprintf (key, sizeof (key), "statistics_healed_cnt-%d-%d-%"PRIu64, +                  xl_id, child, count); +        ret = dict_set_uint64(output, key, healed_count); +        if (ret) { 
+                gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_" +                        "healed_count to outout"); +                goto out; +         } +        snprintf (key, sizeof (key), "statistics_sb_cnt-%d-%d-%"PRIu64, +                  xl_id, child, count); +        ret = dict_set_uint64 (output, key, split_brain_count); +         if (ret) { +                gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_" +                        "split_brain_count to outout"); +                goto out; +        } +        snprintf (key, sizeof (key), "statistics_crawl_type-%d-%d-%"PRIu64, +                  xl_id, child, count); +        ret = dict_set_dynstr (output, key, gf_strdup (crawl_type)); +        if (ret) { +                gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_" +                        "crawl_type to output"); +                goto out; +        } +        snprintf (key, sizeof (key), "statistics_heal_failed_cnt-%d-%d-%"PRIu64, +                  xl_id, child, count); +        ret = dict_set_uint64 (output, key, heal_failed_count); +         if (ret) { +                gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_" +                        "healed_failed_count to outout"); +                goto out; +        } +        snprintf (key, sizeof (key), "statistics_strt_time-%d-%d-%"PRIu64, +                  xl_id, child, count); +        ret = dict_set_dynstr (output, key, gf_strdup(start_time_str)); + +         if (ret) { +                gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_" +                        "crawl_start_time to outout"); +                goto out; +        } + +        snprintf (key, sizeof (key), "statistics_end_time-%d-%d-%"PRIu64, +                  xl_id, child, count); + +        if (!end_time_str) +                end_time_str = "Could not determine the end time"; +        ret = dict_set_dynstr (output, key, gf_strdup(end_time_str)); +         if (ret) { + 
               gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_" +                        "crawl_end_time to outout"); +                goto out; +        } +        snprintf (key, sizeof (key), "statistics_inprogress-%d-%d-%"PRIu64, +                  xl_id, child, count); + +        if (shd_event->crawl_inprogress == _gf_true) +                progress = 1; +        else +                progress = 0; + +        ret = dict_set_int32 (output, key, progress); +         if (ret) { +                gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_" +                        "inprogress to outout"); +                goto out; +        } + +         snprintf (key, sizeof (key), "statistics-%d-%d-count",xl_id, child); +         ret = dict_set_uint64 (output, key, count + 1); +         if (ret) { +                gf_log (this->name, GF_LOG_ERROR, "Could not increment the " +                        "counter."); +                goto out; +         } +out: +        return ret; +} + +int +_add_path_to_dict (xlator_t *this, dict_t *output, int child, char *path, +                   struct timeval *tv, gf_boolean_t dyn) +{ +        //subkey not used for now +        int             ret = -1; +        uint64_t        count = 0; +        char            key[256] = {0}; +        int             xl_id = 0; + +        ret = dict_get_int32 (output, this->name, &xl_id); +        if (ret) { +                gf_log (this->name, GF_LOG_ERROR, "xl does not have id"); +                goto out; +        } + +        snprintf (key, sizeof (key), "%d-%d-count", xl_id, child); +        ret = dict_get_uint64 (output, key, &count); + +        snprintf (key, sizeof (key), "%d-%d-%"PRIu64, xl_id, child, count); +        if (dyn) +                ret = dict_set_dynstr (output, key, path); +        else +                ret = dict_set_str (output, key, path); +        if (ret) { +                gf_log (this->name, GF_LOG_ERROR, "%s: Could not add to output", +               
         path); +                goto out; +        } + +        if (!tv) +                goto inc_count; +        snprintf (key, sizeof (key), "%d-%d-%"PRIu64"-time", xl_id, +                  child, count); +        ret = dict_set_uint32 (output, key, tv->tv_sec); +        if (ret) { +                gf_log (this->name, GF_LOG_ERROR, "%s: Could not set time", +                        path); +                goto out; +        } + +inc_count: +        snprintf (key, sizeof (key), "%d-%d-count", xl_id, child); +        ret = dict_set_uint64 (output, key, count + 1); +        if (ret) { +                gf_log (this->name, GF_LOG_ERROR, "Could not increment count"); +                goto out; +        } +        ret = 0; +out: +        return ret; +} + +int +_get_path_from_gfid_loc (xlator_t *this, xlator_t *readdir_xl, loc_t *child, +                         char **fpath, gf_boolean_t *missing) +{ +        dict_t          *xattr = NULL; +        char            *path = NULL; +        int             ret = -1; + +        ret = syncop_getxattr (readdir_xl, child, &xattr, GFID_TO_PATH_KEY); +        if (ret < 0) { +                if ((-ret == ENOENT || -ret == ESTALE) && missing) +                        *missing = _gf_true; +                ret = -1; +                goto out; +        } +        ret = dict_get_str (xattr, GFID_TO_PATH_KEY, &path); +        if (ret) { +                gf_log (this->name, GF_LOG_ERROR, "Failed to get path for " +                        "gfid %s", uuid_utoa (child->gfid)); +                goto out; +        } +        path = gf_strdup (path); +        if (!path) { +                ret = -1; +                goto out; +        } +        ret = 0; +out: +        if (!ret) +                *fpath = path; +        if (xattr) +                dict_unref (xattr); +        return ret; +} + +int +_add_event_to_dict (circular_buffer_t *cb, void *data) +{ +        int               ret = 0; +        shd_dump_t        *dump_data = NULL; +      
  shd_event_t       *shd_event = NULL; + +        dump_data = data; +        shd_event = cb->data; +        if (shd_event->child != dump_data->child) +                goto out; +        ret = _add_path_to_dict (dump_data->this, dump_data->dict, +                                 dump_data->child, shd_event->path, &cb->tv, +                                 _gf_false); +out: +        return ret; +} + +int +_add_crawl_event_statistics_to_dict (circular_buffer_t *cb, void *data) +{ +        int               ret = 0; +        shd_dump_t        *dump_data = NULL; +        shd_crawl_event_t *shd_event = NULL; + +        dump_data = data; +        shd_event = cb->data; +        ret = _add_crawl_stats_to_dict (dump_data->this, dump_data->dict, +                                        dump_data->child, shd_event, &cb->tv); +        return ret; +} + +int +_add_eh_to_dict (xlator_t *this, eh_t *eh, dict_t *dict, int child) +{ +        shd_dump_t dump_data = {0}; + +        dump_data.this = this; +        dump_data.dict = dict; +        dump_data.child = child; +        eh_dump (eh, &dump_data, _add_event_to_dict); +        return 0; +} + + +int +_add_statistics_to_dict (xlator_t *this, dict_t *dict, int child) +{ +        shd_dump_t              dump_data = {0}; +        afr_private_t           *priv  = NULL; +        afr_self_heald_t        *shd = NULL; + +        priv = this->private; +        shd = &priv->shd; + +        dump_data.this = this; +        dump_data.dict = dict; +        dump_data.child = child; +        eh_dump (shd->statistics[child], &dump_data, +                 _add_crawl_event_statistics_to_dict); +        return 0; + +} + +void +_remove_stale_index (xlator_t *this, xlator_t *readdir_xl, +                     loc_t *parent, char *fname) +{ +        int              ret = 0; +        loc_t            index_loc = {0}; + +        ret = _build_index_loc (this, &index_loc, fname, parent); +        if (ret) +                goto out; +        gf_log 
(this->name, GF_LOG_DEBUG, "Removing stale index " +                "for %s on %s", index_loc.name, readdir_xl->name); +        ret = syncop_unlink (readdir_xl, &index_loc); +        if((ret < 0) && (-ret != ENOENT)) { +                gf_log(this->name, GF_LOG_ERROR, "%s: Failed to remove index " +                       "on %s - %s",index_loc.name, readdir_xl->name, +                       strerror (-ret)); +        } +        index_loc.path = NULL; +        loc_wipe (&index_loc); +out: +        return; +} + +int +_count_hard_links_under_base_indices_dir (xlator_t *this, +                                           afr_crawl_data_t *crawl_data, +                                           gf_dirent_t *entry, loc_t *childloc, +                                           loc_t *parentloc, struct iatt *iattr) +{ +        xlator_t                *readdir_xl = crawl_data->readdir_xl; +        struct iatt             parent = {0}; +        int                     ret = 0; +        dict_t                  *output = NULL; +        int                     xl_id =  0; +        char                    key[256] = {0}; +        int                     child  = -1; +        uint64_t                     hardlinks = 0; + +        output = crawl_data->op_data; +        child = crawl_data->child; + +        ret = syncop_lookup (readdir_xl, childloc, NULL, iattr, NULL, &parent); +        if (ret) { +                ret = -1; +                goto out; +        } + +        ret = dict_get_int32 (output, this->name, &xl_id); +        if (ret) +                goto out; + +        snprintf (key, sizeof (key), "%d-%d-hardlinks", xl_id, child); +        ret =  dict_get_uint64 (output, key, &hardlinks); + +        /*Removing the count of base_entry under indices/base_indicies and +         * entry under indices/xattrop */ +        hardlinks = hardlinks + iattr->ia_nlink - 2; +        ret = dict_set_uint64 (output, key, hardlinks); +        if (ret) +                goto out; + +out: +        
return ret; +} + +int +_add_summary_to_dict (xlator_t *this, afr_crawl_data_t *crawl_data, +                      gf_dirent_t *entry, +                      loc_t *childloc, loc_t *parentloc, struct iatt *iattr) +{ +        dict_t          *output = NULL; +        xlator_t        *readdir_xl = NULL; +        int             ret = -1; +        char            *path = NULL; +        gf_boolean_t    missing = _gf_false; +        char            gfid_str[64] = {0}; + +        if (uuid_is_null (childloc->gfid)) +                goto out; + +        output = crawl_data->op_data; +        readdir_xl = crawl_data->readdir_xl; + +        ret = _get_path_from_gfid_loc (this, readdir_xl, childloc, &path, +                                       &missing); +        if (ret == 0) { +                ret = _add_path_to_dict (this, output, crawl_data->child, path, +                                         NULL, _gf_true); +        } else if (missing) { +                _remove_stale_index (this, readdir_xl, parentloc, +                                     uuid_utoa_r (childloc->gfid, gfid_str)); +        } + +out: +        if (ret && path) +                GF_FREE (path); +        return ret; +} + +void +_crawl_post_sh_action (xlator_t *this, loc_t *parent, loc_t *child, +                       int32_t op_ret, int32_t op_errno, dict_t *xattr_rsp, +                       afr_crawl_data_t *crawl_data) +{ +        int                ret = 0; +        afr_private_t      *priv = NULL; +        afr_self_heald_t   *shd = NULL; +        eh_t               *eh = NULL; +        char               *path = NULL; +        char               gfid_str[64] = {0}; +        shd_event_t        *event = NULL; +        int32_t            sh_failed = 0; +        gf_boolean_t       split_brain = 0; +        int32_t            actual_sh_done = 0; +        shd_crawl_event_t  **shd_crawl_event = NULL; + +        priv = this->private; +        shd  = &priv->shd; +        if (crawl_data->crawl == INDEX) { +   
             if ((op_ret < 0) && (op_errno == ENOENT)) { +                        _remove_stale_index (this, crawl_data->readdir_xl, +                                             parent, uuid_utoa_r (child->gfid, +                                                                  gfid_str)); +                        goto out; +                } +                ret = _get_path_from_gfid_loc (this, crawl_data->readdir_xl, +                                               child, &path, NULL); +                if (ret) +                        goto out; +        } else { +                path = gf_strdup (child->path); +                if (!path) { +                        ret = -1; +                        goto out; +                } +        } + +        if (xattr_rsp) { +                ret = dict_get_int32 (xattr_rsp, "sh-failed", &sh_failed); +                ret = dict_get_int32 (xattr_rsp, "actual-sh-done", &actual_sh_done); +        } + +        shd_crawl_event = (shd_crawl_event_t**)(shd->crawl_events); + +        split_brain = afr_is_split_brain (this, child->inode); +        if ((op_ret < 0 && op_errno == EIO) || split_brain) { +                eh = shd->split_brain; +                shd_crawl_event[crawl_data->child]->split_brain_count += 1; +        } else if ((op_ret < 0) || sh_failed) { +                eh = shd->heal_failed; +                shd_crawl_event[crawl_data->child]->heal_failed_count += 1; +        } else if (actual_sh_done == 1) { +                eh = shd->healed; +                shd_crawl_event[crawl_data->child]->healed_count += 1; +        } +        ret = -1; + +        if (eh != NULL) { +                event = GF_CALLOC (1, sizeof (*event), gf_afr_mt_shd_event_t); +                if (!event) +                        goto out; +                event->child = crawl_data->child; +                event->path = path; + +                ret = eh_save_history (eh, event); +                if (ret < 0) { +                        gf_log 
(this->name, GF_LOG_ERROR, "%s:Failed to save " +                                "to event history, (%d, %s)", path, op_ret, +                                strerror (op_errno)); + +                        goto out; +                } +        } else { +                gf_log (this->name, GF_LOG_DEBUG, "%s:Self heal already done ", +                        path); + +        } +        ret = 0; +out: +        if (ret && path) +                GF_FREE (path); +        return; +} + +int +_link_inode_update_loc (xlator_t *this, loc_t *loc, struct iatt *iattr) +{ +        inode_t       *link_inode = NULL; +        int           ret = -1; + +        link_inode = inode_link (loc->inode, NULL, NULL, iattr); +        if (link_inode == NULL) { +                gf_log (this->name, GF_LOG_ERROR, "inode link failed " +                        "on the inode (%s)", uuid_utoa (iattr->ia_gfid)); +                goto out; +        } +        inode_unref (loc->inode); +        loc->inode = link_inode; +        ret = 0; +out: +        return ret; +} + +int +_self_heal_entry (xlator_t *this, afr_crawl_data_t *crawl_data, gf_dirent_t *entry, +                  loc_t *child, loc_t *parent, struct iatt *iattr) +{ +        struct iatt      parentbuf = {0}; +        int              ret = 0; +        dict_t           *xattr_rsp = NULL; +        dict_t           *xattr_req = NULL; + +        xattr_req = dict_new (); +        if (!xattr_req) { +                errno = ENOMEM; +                ret = -1; +                goto out; +        } + +        ret = dict_set_int32 (xattr_req, "attempt-self-heal", 1); + +        gf_log (this->name, GF_LOG_DEBUG, "lookup %s", child->path); + +        ret = syncop_lookup (this, child, xattr_req, +                             iattr, &xattr_rsp, &parentbuf); +        _crawl_post_sh_action (this, parent, child, ret, -ret, xattr_rsp, +                               crawl_data); +        if (ret < 0) +                ret = -1; +        if (xattr_rsp) +        
        dict_unref (xattr_rsp); +        if (ret == 0) +                ret = _link_inode_update_loc (this, child, iattr); + +out: +        if (xattr_req) +                dict_unref(xattr_req); +        return ret; +} + +static int +afr_crawl_done  (int ret, call_frame_t *sync_frame, void *data) +{ +        GF_FREE (data); +        STACK_DESTROY (sync_frame->root); +        return 0; +} + +int +_get_heal_op_flags (shd_crawl_op op, afr_crawl_type_t crawl) +{ +        int crawl_flags = 0; + +        if (HEAL == op) { +                crawl_flags |= STOP_CRAWL_ON_SINGLE_SUBVOL; + +                if (crawl == INDEX) +                        crawl_flags |= STOP_INDEX_CRAWL_ON_PENDING_FULL_CRAWL; +        } + +        return crawl_flags; +} + +void +_do_self_heal_on_subvol (xlator_t *this, int child, afr_crawl_type_t crawl) +{ +        afr_start_crawl (this, child, crawl, _self_heal_entry, +                         NULL, _gf_true, _get_heal_op_flags (HEAL, crawl), +                         afr_crawl_done); +} + +gf_boolean_t +_crawl_proceed (xlator_t *this, int child, int crawl_flags, char **reason) +{ +        afr_private_t           *priv = NULL; +        afr_self_heald_t        *shd = NULL; +        gf_boolean_t            proceed = _gf_false; +        char                    *msg = NULL; + +        priv = this->private; +        shd  = &priv->shd; +        if (!shd->enabled) { +                msg = "Self-heal daemon is not enabled"; +                gf_log (this->name, GF_LOG_DEBUG, "%s", msg); +                goto out; +        } + +        if (!priv->child_up[child]) { +                gf_log (this->name, GF_LOG_DEBUG, "Stopping crawl for %s , " +                        "subvol went down", priv->children[child]->name); +                msg = "Brick is Not connected"; +                goto out; +        } + +        if (crawl_flags & STOP_CRAWL_ON_SINGLE_SUBVOL) { +                if (afr_up_children_count (priv->child_up, +                                       
    priv->child_count) < 2) { +                        gf_log (this->name, GF_LOG_DEBUG, "Stopping crawl as " +                                "< 2 children are up"); +                        msg = "< 2 bricks in replica are running"; +                        goto out; +                } +        } + +        if (crawl_flags & STOP_INDEX_CRAWL_ON_PENDING_FULL_CRAWL) { +                if (shd->pending[child] == FULL) { +                        gf_log (this->name, GF_LOG_INFO, "Stopping index " +                                "self-heal as Full self-heal is pending on %s", +                                priv->children[child]->name); +                        msg = "Full crawl is pending"; +                        goto out; +                } +        } + +        proceed = _gf_true; +out: +        if (reason) +                *reason = msg; +        return proceed; +} + +int +_do_crawl_op_on_local_subvols (xlator_t *this, afr_crawl_type_t crawl, +                               shd_crawl_op op, dict_t *output) +{ +        afr_private_t       *priv = NULL; +        char                *status = NULL; +        char                *subkey = NULL; +        char                key[256] = {0}; +        shd_pos_t           pos_data = {0}; +        int                 op_ret = -1; +        int                 xl_id = -1; +        int                 i = 0; +        int                 ret = 0; +        int                 crawl_flags = 0; + +        priv = this->private; +        crawl_flags = _get_heal_op_flags (op, crawl); + +        if (output) { +                ret = dict_get_int32 (output, this->name, &xl_id); +                if (ret) { +                        gf_log (this->name, GF_LOG_ERROR, "Invalid input, " +                                "translator-id is not available"); +                        goto out; +                } +        } +        pos_data.this = this; +        subkey = "status"; +        for (i = 0; i < priv->child_count; i++) { +               
 if (_crawl_proceed (this, i, crawl_flags, &status)) { +                        pos_data.child = i; +                        /* +                         * We're already in a synctask in this case, so we +                         * don't need to defer through a second (and in fact +                         * that can cause deadlock).  Just call straight +                         * through instead. +                         */ +                        ret = afr_find_child_position(pos_data.this, +                                                      pos_data.child, +                                                      &pos_data.pos); +                        if (ret) { +                                status = "Not able to find brick location"; +                        } else if (pos_data.pos == AFR_POS_REMOTE) { +                                status = "brick is remote"; +                        } else { +                                op_ret = 0; +                                if (op == HEAL) { +                                        status = "Started self-heal"; +                                        _do_self_heal_on_subvol (this, i, +                                                                 crawl); +                                } else if (output && (op == INFO)) { +                                        status = ""; +                                        afr_start_crawl (this, i, INDEX, +                                                         _add_summary_to_dict, +                                                         output, _gf_false, 0, +                                                         NULL); +                                } else if (output && +                                           (op == STATISTICS_TO_BE_HEALED)) { +                                            status = ""; +                                            afr_start_crawl (this, i, +                                                             
INDEX_TO_BE_HEALED, +                                       _count_hard_links_under_base_indices_dir, +                                                             output, _gf_false, +                                                             0, NULL); +                                } +                        } +                        if (output) { +                                snprintf (key, sizeof (key), "%d-%d-%s", xl_id, +                                          i, subkey); +                                ret = dict_set_str (output, key, status); +                        } +                        if (!op_ret && (crawl == FULL)) +                                break; +                } +                if (output) { +                        snprintf (key, sizeof (key), "%d-%d-%s", xl_id, i, +                                  subkey); +                        ret = dict_set_str (output, key, status); +                } +        } +out: +        return op_ret; +}

/*
 * Run the HEAL operation with the given crawl type through
 * _do_crawl_op_on_local_subvols() and hand back its result.
 */
int
_do_self_heal_on_local_subvols (xlator_t *this, afr_crawl_type_t crawl,
                                dict_t *output)
{
        int op_ret = 0;

        op_ret = _do_crawl_op_on_local_subvols (this, crawl, HEAL, output);

        return op_ret;
}

/*
 * Run the INFO operation as an INDEX crawl through
 * _do_crawl_op_on_local_subvols() and hand back its result.
 */
int
_get_index_summary_on_local_subvols (xlator_t *this, dict_t *output)
{
        int op_ret = 0;

        op_ret = _do_crawl_op_on_local_subvols (this, INDEX, INFO, output);

        return op_ret;
}

/*
 * Call _add_statistics_to_dict() for every child whose recorded position
 * (shd->pos[]) is AFR_POS_LOCAL; other children are skipped.
 */
void
afr_fill_completed_crawl_statistics_to_dict (xlator_t *this, dict_t *dict)
{
        afr_private_t    *priv = this->private;
        afr_self_heald_t *shd = &priv->shd;
        int              child = 0;

        for (child = 0; child < priv->child_count; child++) {
                if (shd->pos[child] == AFR_POS_LOCAL)
                        _add_statistics_to_dict (this, dict, child);
        }
}

/*
 * Return a crawl event to its pristine state.  Only start_time_str is
 * released; end_time_str and crawl_type are merely cleared, never freed.
 */
static void
reset_crawl_event (shd_crawl_event_t *crawl_event)
{
        GF_FREE (crawl_event->start_time_str);
        crawl_event->start_time_str = NULL;
        crawl_event->end_time_str = NULL;
        crawl_event->crawl_type = NULL;

        crawl_event->healed_count = 0;
        crawl_event->split_brain_count = 0;
        crawl_event->heal_failed_count = 0;
        crawl_event->crawl_inprogress = _gf_false;
}

/*
 * Copy @src into @dst as an "in progress" snapshot: the counters and
 * crawl_type are copied, start_time_str is duplicated on the heap, and
 * end_time_str is pointed at a fixed string literal (so it must not be
 * freed).
 */
static void
afr_copy_crawl_event_struct (shd_crawl_event_t *src, shd_crawl_event_t *dst)
{
        dst->crawl_inprogress = _gf_true;
        dst->crawl_type = src->crawl_type;
        dst->start_time_str = gf_strdup (src->start_time_str);
        dst->end_time_str = "Crawl is already in progress";

        dst->healed_count = src->healed_count;
        dst->split_brain_count = src->split_brain_count;
        dst->heal_failed_count = src->heal_failed_count;
}

/*
 * For every AFR_POS_LOCAL child that currently has a crawl event, copy that
 * event into a scratch structure and pass it to _add_crawl_stats_to_dict().
 * The whole walk is done with priv->lock held.
 *
 * Returns 0, or -1 if the scratch structure cannot be allocated.
 */
static int
afr_fill_crawl_statistics_of_running_crawl(xlator_t *this, dict_t *dict)
{
        afr_private_t     *priv = this->private;
        afr_self_heald_t  *shd = &priv->shd;
        shd_crawl_event_t *snapshot = NULL;
        int               child = 0;

        snapshot = GF_CALLOC (1, sizeof (shd_crawl_event_t),
                              gf_afr_mt_shd_crawl_event_t);
        if (!snapshot)
                return -1;

        LOCK (&priv->lock);
        {
                for (child = 0; child < priv->child_count; child++) {
                        if (shd->pos[child] != AFR_POS_LOCAL)
                                continue;

                        /* drop whatever the previous child left behind */
                        reset_crawl_event (snapshot);

                        if (shd->crawl_events[child]) {
                                afr_copy_crawl_event_struct (
                                                shd->crawl_events[child],
                                                snapshot);
                                _add_crawl_stats_to_dict (this, dict, child,
                                                          snapshot, NULL);
                        }
                }
        }
        UNLOCK (&priv->lock);

        /* releases the last duplicated start_time_str */
        reset_crawl_event (snapshot);
        GF_FREE (snapshot);

        return 0;
}

/*
 * Add the statistics of completed crawls, then those of running crawls, to
 * @dict.  The return value is that of the running-crawl step.
 */
static int
_add_local_subvols_crawl_statistics_to_dict (xlator_t *this, dict_t *dict)
{
        afr_fill_completed_crawl_statistics_to_dict (this, dict);

        return afr_fill_crawl_statistics_of_running_crawl (this, dict);
}

/*
 * Call _add_eh_to_dict() with @eh for every child whose recorded position is
 * AFR_POS_LOCAL.  Always returns 0.
 */
int
_add_local_subvols_eh_to_dict (xlator_t *this, eh_t *eh, dict_t *dict)
{
        afr_private_t    *priv = this->private;
        afr_self_heald_t *shd = &priv->shd;
        int              child = 0;

        for (child = 0; child < priv->child_count; child++) {
                if (shd->pos[child] == AFR_POS_LOCAL)
                        _add_eh_to_dict (this, eh, dict, child);
        }

        return 0;
}

/*
 * Dispatch the operation named by the "xl-op" key of @input.
 *
 * The id stored in @input under this->name is placed in @output under the
 * same key while the operation runs and is removed again before returning,
 * on every path.  An unrecognised operation is logged and leaves the return
 * value at 0.
 */
int
afr_xl_op (xlator_t *this, dict_t *input, dict_t *output)
{
        afr_private_t    *priv = this->private;
        afr_self_heald_t *shd = &priv->shd;
        gf_xl_afr_op_t   op = GF_AFR_OP_INVALID;
        int              xl_id = 0;
        int              ret = 0;

        ret = dict_get_int32 (input, "xl-op", (int32_t*)&op);
        if (!ret)
                ret = dict_get_int32 (input, this->name, &xl_id);
        if (!ret)
                ret = dict_set_int32 (output, this->name, xl_id);
        if (ret)
                goto out;

        switch (op) {
        case GF_AFR_OP_HEAL_INDEX:
                ret = _do_self_heal_on_local_subvols (this, INDEX, output);
                break;
        case GF_AFR_OP_HEAL_FULL:
                ret = _do_self_heal_on_local_subvols (this, FULL, output);
                break;
        case GF_AFR_OP_INDEX_SUMMARY:
                /* the summary result is deliberately not propagated */
                (void)_get_index_summary_on_local_subvols (this, output);
                ret = 0;
                break;
        case GF_AFR_OP_HEALED_FILES:
                ret = _add_local_subvols_eh_to_dict (this, shd->healed,
                                                     output);
                break;
        case GF_AFR_OP_HEAL_FAILED_FILES:
                ret = _add_local_subvols_eh_to_dict (this, shd->heal_failed,
                                                     output);
                break;
        case GF_AFR_OP_SPLIT_BRAIN_FILES:
                ret = _add_local_subvols_eh_to_dict (this, shd->split_brain,
                                                     output);
                break;
        case GF_AFR_OP_STATISTICS:
                ret = _add_local_subvols_crawl_statistics_to_dict (this,
                                                                   output);
                break;
        case GF_AFR_OP_STATISTICS_HEAL_COUNT:
        case GF_AFR_OP_STATISTICS_HEAL_COUNT_PER_REPLICA:
                ret = _do_crawl_op_on_local_subvols (this, INDEX_TO_BE_HEALED,
                                                     STATISTICS_TO_BE_HEALED,
                                                     output);
                break;
        default:
                gf_log (this->name, GF_LOG_ERROR, "Unknown set op %d", op);
                break;
        }
out:
        dict_del (output, this->name);
        return ret;
}

+void +afr_poll_self_heal (void *data) +{ +        afr_private_t    *priv = NULL; +        afr_self_heald_t *shd = NULL; +        struct timespec  timeout = {0}; +        xlator_t         *this = NULL; +        long             child = (long)data; +        gf_timer_t       *old_timer = NULL; +        gf_timer_t       *new_timer = NULL; +        shd_pos_t        pos_data = {0}; +        int              ret = 0; + +        this = THIS; +        priv = this->private; +        shd = &priv->shd; + +        if (shd->pos[child] == AFR_POS_UNKNOWN) { +                pos_data.this = this; +                pos_data.child = child; +                ret = synctask_new (this->ctx->env, +                                    
afr_syncop_find_child_position, +                                    NULL, NULL, &pos_data); +                if (!ret) +                        shd->pos[child] = pos_data.pos; +        } +        if (shd->enabled && (shd->pos[child] == AFR_POS_LOCAL)) +                _do_self_heal_on_subvol (this, child, INDEX); +        timeout.tv_sec = shd->timeout; +        timeout.tv_nsec = 0; +        //notify and previous timer should be synchronized. +        LOCK (&priv->lock); +        { +                old_timer = shd->timer[child]; +                if (shd->pos[child] == AFR_POS_REMOTE) +                        goto unlock; +                shd->timer[child] = gf_timer_call_after (this->ctx, timeout, +                                                         afr_poll_self_heal, +                                                         data); +                new_timer = shd->timer[child]; +        } +unlock: +        UNLOCK (&priv->lock); + +        if (old_timer) +                gf_timer_call_cancel (this->ctx, old_timer); +        if (!new_timer && (shd->pos[child] != AFR_POS_REMOTE)) { +                gf_log (this->name, GF_LOG_WARNING, +                        "Could not create self-heal polling timer for %s", +                        priv->children[child]->name); +        } +        return; +}

/*
 * Completion callback for the position lookup started by
 * afr_proactive_self_heal().
 *
 * @ret:        result of the lookup task; when non-zero nothing is recorded.
 * @sync_frame: not used by this function.
 * @data:       heap-allocated shd_pos_t filled in by the lookup; it is freed
 *              here on every path.
 *
 * On success the position is stored in shd->pos[], afr_poll_self_heal() is
 * invoked for the child unless it is remote, and an INDEX heal is started
 * through _do_self_heal_on_local_subvols().  Always returns 0.
 */
static int
afr_handle_child_up  (int ret, call_frame_t *sync_frame, void *data)
{
        afr_self_heald_t *shd = NULL;
        shd_pos_t        *pos_data = data;
        afr_private_t    *priv = NULL;

        if (ret)
                goto out;

        priv = pos_data->this->private;
        shd = &priv->shd;
        shd->pos[pos_data->child] = pos_data->pos;
        if (pos_data->pos != AFR_POS_REMOTE)
                afr_poll_self_heal ((void*)(long)pos_data->child);
        _do_self_heal_on_local_subvols (THIS, INDEX, NULL);
out:
        GF_FREE (data);
        return 0;
}

/*
 * Re-evaluate where the child lives and, via afr_handle_child_up(), start
 * healing on it.
 *
 * @data: the child index, passed as a pointer-sized integer.
 */
void
afr_proactive_self_heal (void *data)
{
        xlator_t  *this = THIS;
        long      child = (long)data;
        shd_pos_t *pos_data = NULL;
        int       ret = 0;

        /* Position of brick could have changed and it could be local now.
         * Compute the position again. */
        pos_data = GF_CALLOC (1, sizeof (*pos_data), gf_afr_mt_pos_data_t);
        if (!pos_data)
                return;

        pos_data->this = this;
        pos_data->child = child;
        ret = synctask_new (this->ctx->env, afr_syncop_find_child_position,
                            afr_handle_child_up, NULL, pos_data);
        if (ret) {
                /*
                 * afr_handle_child_up() is the only other place that frees
                 * pos_data.  NOTE(review): this assumes a non-zero return
                 * from synctask_new() with a callback means the task was
                 * never scheduled, so the callback will not run - confirm
                 * against the syncop implementation.
                 */
                GF_FREE (pos_data);
        }
}

/*
 * Extract the text between the first and the last ':' of @pathinfo into
 * @hostname.
 *
 * @pathinfo: string to parse; may be NULL.
 * @hostname: destination buffer; zero-filled before the copy, so the result
 *            is always NUL-terminated.
 * @size:     capacity of @hostname in bytes.
 *
 * Returns 0 on success.  Returns -1, leaving @hostname untouched, when
 * @pathinfo is NULL, contains fewer than two ':' characters, or when the
 * extracted text (plus terminator) does not fit in @size bytes.
 */
static int
get_pathinfo_host (char *pathinfo, char *hostname, size_t size)
{
        char    *start = NULL;
        char    *end = NULL;
        size_t  len = 0;
        int     ret  = -1;

        if (!pathinfo || !hostname || !size)
                goto out;

        start = strchr (pathinfo, ':');
        if (!start)
                goto out;
        end = strrchr (pathinfo, ':');
        if (start == end)
                goto out;

        start++;                        /* step over the leading ':' */
        len = (size_t)(end - start);
        if (len >= size)                /* would overflow the destination */
                goto out;

        memset (hostname, 0, size);
        memcpy (hostname, start, len);
        ret = 0;
out:
        return ret;
}

+int +afr_local_pathinfo (char *pathinfo, gf_boolean_t *local) +{ +        int             ret   = 0; +        char            pathinfohost[1024] = {0}; +        char            localhost[1024] = {0}; +        xlator_t        *this = THIS; + +        *local = _gf_false; +        ret = get_pathinfo_host (pathinfo, pathinfohost, sizeof (pathinfohost)); +        if (ret) { +                gf_log (this->name, GF_LOG_ERROR, "Invalid pathinfo: %s", +                        pathinfo); +                goto out; +        } + +        ret = gethostname (localhost, sizeof (localhost)); +        if (ret) { +                gf_log (this->name, GF_LOG_ERROR, "gethostname() failed, " +                 
       "reason: %s", strerror (errno)); +                goto out; +        } + +        if (!strcmp (localhost, pathinfohost)) +                *local = _gf_true; +out: +        return ret; +} + +int +afr_crawl_build_start_loc (xlator_t *this, afr_crawl_data_t *crawl_data, +                           loc_t *dirloc) +{ +        afr_private_t *priv = NULL; +        dict_t        *xattr = NULL; +        void          *index_gfid = NULL; +        void          *base_indices_holder_vgfid = NULL; +        loc_t         rootloc = {0}; +        struct iatt   iattr = {0}; +        struct iatt   parent = {0}; +        int           ret = 0; +        xlator_t      *readdir_xl = crawl_data->readdir_xl; + +        priv = this->private; +        if (crawl_data->crawl == FULL) { +                afr_build_root_loc (this, dirloc); +        } else if (crawl_data->crawl == INDEX) { +                afr_build_root_loc (this, &rootloc); +                ret = syncop_getxattr (readdir_xl, &rootloc, &xattr, +                                       GF_XATTROP_INDEX_GFID); +                if (ret < 0) { +                        ret = -1; +                        goto out; +                } +                ret = dict_get_ptr (xattr, GF_XATTROP_INDEX_GFID, &index_gfid); +                if (ret < 0) { +                        gf_log (this->name, GF_LOG_ERROR, "failed to get index " +                                "dir gfid on %s", readdir_xl->name); +                        goto out; +                } +                if (!index_gfid) { +                        gf_log (this->name, GF_LOG_ERROR, "index gfid empty " +                                "on %s", readdir_xl->name); +                        ret = -1; +                        goto out; +                } +                uuid_copy (dirloc->gfid, index_gfid); +                dirloc->path = ""; +                dirloc->inode = inode_new (priv->root_inode->table); +                ret = syncop_lookup (readdir_xl, dirloc, NULL, +   
                                  &iattr, NULL, &parent); +                if (ret < 0) { +                        if (-ret != ENOENT) { +                                gf_log (this->name, GF_LOG_ERROR, "lookup " +                                        "failed on index dir on %s - (%s)", +                                        readdir_xl->name, strerror (-ret)); +                        } +                        ret = -1; +                        goto out; +                } +                ret = _link_inode_update_loc (this, dirloc, &iattr); +                if (ret) +                        goto out; +        } else if (crawl_data->crawl == INDEX_TO_BE_HEALED) { +                afr_build_root_loc (this, &rootloc); +                ret = syncop_getxattr (readdir_xl, &rootloc, &xattr, +                                       GF_BASE_INDICES_HOLDER_GFID); +                if (ret < 0) { +                        ret = -1; +                        goto out; +                } +                ret = dict_get_ptr (xattr, GF_BASE_INDICES_HOLDER_GFID, +                                    &base_indices_holder_vgfid); +                if (ret < 0) { +                        gf_log (this->name, GF_LOG_ERROR, "index gfid empty " +                                "on %s", readdir_xl->name); +                        ret = -1; +                        goto out; +                } +                if (!base_indices_holder_vgfid) { +                        gf_log (this->name, GF_LOG_ERROR, "Base indices holder" +                                "virtual gfid is null on %s", readdir_xl->name); +                        ret = -1; +                        goto out; +                } +                uuid_copy (dirloc->gfid,  base_indices_holder_vgfid); +                dirloc->path = ""; +                dirloc->inode = inode_new (priv->root_inode->table); +                ret = syncop_lookup (readdir_xl, dirloc, NULL, &iattr, NULL, +                                     &parent); 
+                if (ret < 0) { +                        if (-ret != ENOENT) { +                                gf_log (this->name, GF_LOG_ERROR, "lookup " +                                        "failed for base_indices_holder dir" +                                        " on %s - (%s)", readdir_xl->name, +                                        strerror (-ret)); + +                        } else { +                                gf_log (this->name, GF_LOG_ERROR, "base_indices" +                                        "_holder is not yet created."); +                        } +                        ret = -1; +                        goto out; +                } +                ret = _link_inode_update_loc (this, dirloc, &iattr); +                if (ret) +                        goto out; +        } +        ret = 0; +out: +        if (xattr) +                dict_unref (xattr); +        loc_wipe (&rootloc); +        return ret; +}

/*
 * Obtain an fd for the directory described by @dirloc.
 *
 * A FULL crawl creates a new fd and opens the directory on
 * crawl_data->readdir_xl; every other crawl type uses an anonymous fd on
 * the inode.
 *
 * @dirfd: receives the fd on success; left untouched on failure.
 *
 * Returns 0 on success and -1 on failure.  On failure no fd reference is
 * left behind for the caller to release.
 */
int
afr_crawl_opendir (xlator_t *this, afr_crawl_data_t *crawl_data, fd_t **dirfd,
                   loc_t *dirloc)
{
        fd_t          *fd   = NULL;
        int           ret = -1;

        if (crawl_data->crawl == FULL) {
                fd = fd_create (dirloc->inode, crawl_data->pid);
                if (!fd) {
                        gf_log (this->name, GF_LOG_ERROR,
                                "Failed to create fd for %s", dirloc->path);
                        goto out;
                }

                ret = syncop_opendir (crawl_data->readdir_xl, dirloc, fd);
                if (ret < 0) {
                        gf_log (this->name, GF_LOG_ERROR,
                                "opendir failed on %s", dirloc->path);
                        ret = -1;
                        goto out;
                }
        } else {
                fd = fd_anonymous (dirloc->inode);
                if (!fd) {
                        /* do not report success with nothing to read from */
                        gf_log (this->name, GF_LOG_ERROR,
                                "Failed to get anonymous fd for %s",
                                uuid_utoa (dirloc->gfid));
                        goto out;
                }
        }
        ret = 0;
out:
        if (ret) {
                /* callers only unref what they receive through *dirfd */
                if (fd)
                        fd_unref (fd);
        } else {
                *dirfd = fd;
        }
        return ret;
}

/*
 * Pick the xlator to read directories from: this xlator itself for a FULL
 * crawl, otherwise the child selected by crawl_data->child.
 */
xlator_t*
afr_crawl_readdir_xl_get (xlator_t *this, afr_crawl_data_t *crawl_data)
{
        afr_private_t *priv = this->private;

        if (crawl_data->crawl == FULL)
                return this;

        return priv->children[crawl_data->child];
}

/*
 * Fill @child with the location of directory entry @entry found under
 * @parent, according to the crawl type:
 *
 *  - FULL:               delegated to afr_build_child_loc().
 *  - INDEX_TO_BE_HEALED: built by _build_index_loc(), then given a fresh
 *                        inode and a NULL path.
 *  - anything else:      a fresh inode, with the gfid parsed from the entry
 *                        name and the path assigned by
 *                        _loc_assign_gfid_path().
 *
 * Returns 0 on success, non-zero on failure (including failure to allocate
 * the inode).
 */
int
afr_crawl_build_child_loc (xlator_t *this, loc_t *child, loc_t *parent,
                           gf_dirent_t *entry, afr_crawl_data_t *crawl_data)
{
        int           ret = -1;
        afr_private_t *priv = NULL;

        priv = this->private;
        if (crawl_data->crawl == FULL) {
                ret = afr_build_child_loc (this, child, parent, entry->d_name);
        } else if (crawl_data->crawl == INDEX_TO_BE_HEALED) {
                ret = _build_index_loc (this, child, entry->d_name, parent);
                if (ret)
                        goto out;
                child->inode = inode_new (priv->root_inode->table);
                if (!child->inode) {
                        ret = -1;
                        goto out;
                }
                /* NOTE(review): whether _build_index_loc() left an owned
                 * string in child->path is not visible here - confirm this
                 * does not drop an allocation. */
                child->path = NULL;
        } else {
                child->inode = inode_new (priv->root_inode->table);
                if (!child->inode)
                        goto out;       /* ret is still -1 */
                uuid_parse (entry->d_name, child->gfid);
                ret = _loc_assign_gfid_path (child);
        }
out:
        return ret;
}

+static int +_process_entries (xlator_t *this, loc_t *parentloc, gf_dirent_t *entries, +                  off_t *offset, afr_crawl_data_t *crawl_data) +{ +        gf_dirent_t      *entry = NULL; +        gf_dirent_t      *tmp = NULL; +        int              ret = 0; +        loc_t            entry_loc = {0}; +        fd_t             *fd = NULL; +        struct iatt      iattr = {0}; + +        list_for_each_entry_safe (entry, tmp, &entries->list, list) { +                if (!_crawl_proceed (this, 
crawl_data->child, +                                     crawl_data->crawl_flags, NULL)) { +                        ret = -1; +                        goto out; +                } +                *offset = entry->d_off; +                if (IS_ENTRY_CWD (entry->d_name) || +                    IS_ENTRY_PARENT (entry->d_name)) +                        continue; +                if ((crawl_data->crawl == FULL) && +                     uuid_is_null (entry->d_stat.ia_gfid)) { +                        gf_log (this->name, GF_LOG_WARNING, "%s/%s: No " +                                "gfid present skipping", +                                parentloc->path, entry->d_name); +                        continue; +                } + +                loc_wipe (&entry_loc); +                ret = afr_crawl_build_child_loc (this, &entry_loc, parentloc, +                                                 entry, crawl_data); +                if (ret) +                        goto out; + +                ret = crawl_data->process_entry (this, crawl_data, entry, +                                                 &entry_loc, parentloc, &iattr); + +                if (crawl_data->crawl == INDEX_TO_BE_HEALED && ret) { +                       goto out; +                } else if (ret) { +                        continue; +                } + +                if ((crawl_data->crawl == INDEX) || +                    (crawl_data->crawl == INDEX_TO_BE_HEALED)) +                        continue; + +                if (!IA_ISDIR (iattr.ia_type)) +                        continue; +                fd = NULL; +                ret = afr_crawl_opendir (this, crawl_data, &fd, &entry_loc); +                if (ret) +                        continue; +                ret = _crawl_directory (fd, &entry_loc, crawl_data); +                if (fd) +                        fd_unref (fd); +        } +        ret = 0; +out: +        if ((crawl_data->crawl == INDEX_TO_BE_HEALED)  && ret) { +                
gf_log (this->name, GF_LOG_ERROR,"Failed to get the hardlink " +                        "count"); +        } +        loc_wipe (&entry_loc); +        return ret; +} + +static int +_crawl_directory (fd_t *fd, loc_t *loc, afr_crawl_data_t *crawl_data) +{ +        xlator_t        *this = NULL; +        off_t           offset   = 0; +        gf_dirent_t     entries; +        int             ret = 0; +        gf_boolean_t    free_entries = _gf_false; +        xlator_t        *readdir_xl = crawl_data->readdir_xl; + +        INIT_LIST_HEAD (&entries.list); +        this = THIS; + +        GF_ASSERT (loc->inode); + +        if (crawl_data->crawl == FULL) +                gf_log (this->name, GF_LOG_DEBUG, "crawling %s", loc->path); +        else +                gf_log (this->name, GF_LOG_DEBUG, "crawling INDEX %s", +                        uuid_utoa (loc->gfid)); + +        while (1) { +                if (crawl_data->crawl == FULL) +                        ret = syncop_readdirp (readdir_xl, fd, 131072, offset, +                                               NULL, &entries); +                else +                        ret = syncop_readdir (readdir_xl, fd, 131072, offset, +                                              &entries); +                if (ret < 0) { +                        ret = -1; +                        break; +                } else if (ret == 0) { +                        break; +                } + +                ret = 0; +                free_entries = _gf_true; + +                if (!_crawl_proceed (this, crawl_data->child, +                                     crawl_data->crawl_flags, NULL)) { +                        ret = -1; +                        goto out; +                } +                if (list_empty (&entries.list)) +                        goto out; + +                ret = _process_entries (this, loc, &entries, &offset, +                                        crawl_data); +                if ((ret < 0) && (crawl_data->crawl == 
INDEX_TO_BE_HEALED)) { +                        goto out; +                } +                gf_dirent_free (&entries); +                free_entries = _gf_false; +        } +        ret = 0; +out: +        if (free_entries) +                gf_dirent_free (&entries); +        return ret; +} + +static char* +position_str_get (afr_child_pos_t pos) +{ +        switch (pos) { +        case AFR_POS_UNKNOWN: +                return "unknown"; +        case AFR_POS_LOCAL: +                return "local"; +        case AFR_POS_REMOTE: +                return "remote"; +        } +        return NULL; +} + +int +afr_find_child_position (xlator_t *this, int child, afr_child_pos_t *pos) +{ +        afr_private_t    *priv = NULL; +        afr_self_heald_t *shd  = NULL; +        dict_t           *xattr_rsp = NULL; +        loc_t            loc = {0}; +        int              ret = 0; +        char             *node_uuid = NULL; + +        priv = this->private; +        shd  = &priv->shd; + +        afr_build_root_loc (this, &loc); + +        ret = syncop_getxattr (priv->children[child], &loc, &xattr_rsp, +                               GF_XATTR_NODE_UUID_KEY); +        if (ret < 0) { +                gf_log (this->name, GF_LOG_ERROR, "getxattr failed on %s - " +                        "(%s)", priv->children[child]->name, strerror (-ret)); +                ret = -1; +                goto out; +        } + +        ret = dict_get_str (xattr_rsp, GF_XATTR_NODE_UUID_KEY, &node_uuid); +        if (ret) { +                gf_log (this->name, GF_LOG_ERROR, "node-uuid key not found on " +                        "child %s", priv->children[child]->name); +                goto out; +        } + +        if (!strcmp (node_uuid, shd->node_uuid)) +                *pos = AFR_POS_LOCAL; +        else +                *pos = AFR_POS_REMOTE; + +        gf_log (this->name, GF_LOG_DEBUG, "child %s is %s", +                priv->children[child]->name, position_str_get (*pos)); +out: +        if 
(ret) +                *pos = AFR_POS_UNKNOWN; +        loc_wipe (&loc); +        return ret; +} + +int +afr_syncop_find_child_position (void *data) +{ +        shd_pos_t *pos_data = data; +        int       ret = 0; + +        ret = afr_find_child_position (pos_data->this, pos_data->child, +                                       &pos_data->pos); +        return ret; +} + +static int +afr_dir_crawl (void *data) +{ +        xlator_t            *this = NULL; +        int                 ret = -1; +        xlator_t            *readdir_xl = NULL; +        fd_t                *fd = NULL; +        loc_t               dirloc = {0}; +        afr_crawl_data_t    *crawl_data = data; + +        this = THIS; + +        if (!_crawl_proceed (this, crawl_data->child, crawl_data->crawl_flags, +                             NULL)) +                goto out; + +        readdir_xl = afr_crawl_readdir_xl_get (this, crawl_data); +        if (!readdir_xl) +                goto out; +        crawl_data->readdir_xl = readdir_xl; + +        ret = afr_crawl_build_start_loc (this, crawl_data, &dirloc); +        if (ret) +                goto out; + +        ret = afr_crawl_opendir (this, crawl_data, &fd, &dirloc); +        if (ret) { +                if (crawl_data->crawl == INDEX_TO_BE_HEALED) { +                        gf_log (this->name, GF_LOG_ERROR, "Failed to open base_" +                                "indices_holder"); +                } +                goto out; +        } + +        ret = _crawl_directory (fd, &dirloc, crawl_data); +        if (ret) +                gf_log (this->name, GF_LOG_ERROR, "Crawl failed on %s", +                        readdir_xl->name); +        else +                gf_log (this->name, GF_LOG_DEBUG, "Crawl completed " +                        "on %s", readdir_xl->name); +        if (crawl_data->crawl == INDEX) +                dirloc.path = NULL; +out: +        if (fd) +                fd_unref (fd); +        if ((crawl_data->crawl == INDEX) || +       
     (crawl_data->crawl == INDEX_TO_BE_HEALED )) +                dirloc.path = NULL; +        loc_wipe (&dirloc); +        return ret; +} + +char * +get_crawl_type_in_string (afr_crawl_type_t crawl) +{ +        char    *index = "INDEX"; +        char    *full  = "FULL"; +        char    *crawl_type = NULL; + +        if (crawl == INDEX){ +                crawl_type = index; +        } else if (crawl == FULL) { +                crawl_type = full; +        } + +        return  crawl_type; +} + +static int +afr_allocate_crawl_event (xlator_t *this, int child, afr_crawl_type_t crawl) +{ +        afr_private_t           *priv = NULL; +        afr_self_heald_t        *shd = NULL; +        int                     ret = 0; +        shd_crawl_event_t       *crawl_event = NULL; +        time_t                  get_time = 0; + +        priv = this->private; +        shd = &priv->shd; + +        crawl_event = GF_CALLOC (sizeof (shd_crawl_event_t), 1, +                                 gf_afr_mt_shd_crawl_event_t); +        if (!crawl_event) { +                ret = -1; +                goto out; +        } + +        get_time =  time(NULL); +        if (get_time == ((time_t)-1)) { +                 ret = -1; +                goto out; +        } + +        crawl_event->start_time_str = gf_strdup (ctime(&get_time)); + +        crawl_event->crawl_type = get_crawl_type_in_string (crawl); +        if (!crawl_event->crawl_type) { +                ret = -1; +                goto out; +        } +        LOCK (&priv->lock); +        { +                shd->crawl_events[child] = crawl_event; +        } +        UNLOCK (&priv->lock); +        ret = 0; +out: +        return ret; + +} + +static int +afr_put_crawl_event_in_eh (xlator_t *this, int child) +{ +        afr_private_t           *priv = NULL; +        afr_self_heald_t        *shd = NULL; +        int                     ret = 0; +        time_t                  get_time = 0; +        shd_crawl_event_t       **crawl_event = NULL; 
+ +        priv = this->private; +        shd = &priv->shd; + +        get_time = time(NULL); +        if (get_time == ((time_t)-1)) { +                ret = -1; +                goto out; +        } +        crawl_event = (shd_crawl_event_t**)shd->crawl_events; +        LOCK (&priv->lock); +        { +                crawl_event[child]->end_time_str = gf_strdup (ctime(&get_time)); +                ret = eh_save_history (shd->statistics[child], +                                       crawl_event[child]); +                crawl_event[child] = NULL; +        } +        UNLOCK (&priv->lock); +out: +        return ret; +}

/*
 * synctask entry point that allows only one crawl per child at a time.
 *
 * If a crawl is already running on the child, the requested type is noted
 * in shd->pending[child] (unless a FULL crawl is already pending) and the
 * function returns at once.  Otherwise it marks the child as in progress
 * and keeps crawling - picking up whatever type became pending in the
 * meantime - until nothing is pending.
 *
 * Returns 0 when it backs off or finishes, or the error from allocating or
 * recording a crawl event.  The in-progress mark is cleared on every exit
 * taken after it was set.
 */
static int
afr_dir_exclusive_crawl (void *data)
{
        afr_private_t    *priv = NULL;
        afr_self_heald_t *shd = NULL;
        gf_boolean_t     crawl = _gf_false;
        int              ret = 0;
        int              child = -1;
        xlator_t         *this = NULL;
        afr_crawl_data_t *crawl_data = data;

        this = THIS;
        priv = this->private;
        shd = &priv->shd;
        child = crawl_data->child;

        LOCK (&priv->lock);
        {
                if (shd->inprogress[child]) {
                        if (shd->pending[child] != FULL)
                                shd->pending[child] = crawl_data->crawl;
                } else {
                        shd->inprogress[child] = _gf_true;
                        crawl = _gf_true;
                }
        }
        UNLOCK (&priv->lock);

        if (!crawl) {
                gf_log (this->name, GF_LOG_INFO, "Another crawl is in progress "
                        "for %s while attempting %s heal on %s",
                        priv->children[child]->name,
                        get_crawl_type_in_string (crawl_data->crawl),
                        priv->children[child]->name);
                goto out;
        }

        while (crawl) {
                ret = afr_allocate_crawl_event (this, child, crawl_data->crawl);
                if (ret)
                        break;
                afr_dir_crawl (data);

                ret = afr_put_crawl_event_in_eh (this, child);
                if (ret < 0)
                        break;

                LOCK (&priv->lock);
                {
                        if (shd->pending[child] != NONE) {
                                crawl_data->crawl = shd->pending[child];
                                shd->pending[child] = NONE;
                        } else {
                                shd->inprogress[child] = _gf_false;
                                crawl = _gf_false;
                        }
                }
                UNLOCK (&priv->lock);
        }

        if (crawl) {
                /*
                 * Left the loop on an error while still owning the crawl.
                 * Give it up, otherwise every later request for this child
                 * would be turned away as "in progress".
                 */
                LOCK (&priv->lock);
                {
                        shd->inprogress[child] = _gf_false;
                }
                UNLOCK (&priv->lock);
        }
out:
        return ret;
}

/*
 * Start a crawl of type @crawl on child @idx as a synctask.
 *
 * @process_entry: invoked by the crawl for each directory entry.
 * @op_data:       opaque pointer stored in the crawl data for the callback.
 * @exclusive:     run through afr_dir_exclusive_crawl() instead of
 *                 afr_dir_crawl().
 * @crawl_flags:   stored in the crawl data.
 * @crawl_done:    completion callback handed to synctask_new() together
 *                 with the frame and the crawl data.
 *
 * Failures are logged (or silently dropped for allocation failures); there
 * is no return value.
 */
void
afr_start_crawl (xlator_t *this, int idx, afr_crawl_type_t crawl,
                 process_entry_cbk_t process_entry, void *op_data,
                 gf_boolean_t exclusive, int crawl_flags,
                 afr_crawl_done_cbk_t crawl_done)
{
        afr_private_t              *priv = NULL;
        call_frame_t               *frame = NULL;
        afr_crawl_data_t           *crawl_data = NULL;
        int                        ret = 0;
        int (*crawler) (void*) = NULL;

        priv = this->private;

        /* Allocate the plain memory first: if the frame then cannot be
         * created, undoing this is a simple GF_FREE and no frame is ever
         * left behind. */
        crawl_data = GF_CALLOC (1, sizeof (*crawl_data),
                                gf_afr_mt_crawl_data_t);
        if (!crawl_data)
                goto out;

        frame = create_frame (this, this->ctx->pool);
        if (!frame) {
                GF_FREE (crawl_data);
                goto out;
        }

        afr_set_lk_owner (frame, this, frame->root);
        afr_set_low_priority (frame);

        crawl_data->process_entry = process_entry;
        crawl_data->child = idx;
        crawl_data->pid = frame->root->pid;
        crawl_data->crawl = crawl;
        crawl_data->op_data = op_data;
        crawl_data->crawl_flags = crawl_flags;
        gf_log (this->name, GF_LOG_DEBUG, "starting crawl %d for %s",
                crawl_data->crawl, priv->children[idx]->name);

        if (exclusive)
                crawler = afr_dir_exclusive_crawl;
        else
                crawler = afr_dir_crawl;
        /* NOTE(review): frame and crawl_data are not released in this
         * function after this call; presumably crawl_done does it - confirm
         * what happens when crawl_done is NULL or synctask_new() fails. */
        ret = synctask_new (this->ctx->env, crawler,
                            crawl_done, frame, crawl_data);
        if (ret)
                gf_log (this->name, GF_LOG_ERROR, "afr crawl failed for child"
                        " %d with ret %d", idx, ret);
out:
        return;
}

/*
 * Fill @loc so that it describes the volume root: path "/" (heap copy),
 * an empty name (string literal), a new reference on priv->root_inode and
 * that inode's gfid.
 */
void
afr_build_root_loc (xlator_t *this, loc_t *loc)
{
        afr_private_t   *priv = NULL;

        priv = this->private;
        loc->path = gf_strdup ("/");
        loc->name = "";
        loc->inode = inode_ref (priv->root_inode);
        uuid_copy (loc->gfid, loc->inode->gfid);
}

/*
 * Store the root gfid (all bytes zero except the last, which is 1) in
 * @dict through afr_set_dict_gfid() and return its result.
 */
int
afr_set_root_gfid (dict_t *dict)
{
        uuid_t gfid;
        int ret = 0;

        memset (gfid, 0, sizeof (gfid));
        gfid[sizeof (gfid) - 1] = 1;

        ret = afr_set_dict_gfid (dict, gfid);

        return ret;
}

diff --git a/xlators/cluster/afr-v1/src/afr-self-heald.h b/xlators/cluster/afr-v1/src/afr-self-heald.h new file mode 100644 index 000000000..e0c083754 --- /dev/null +++ b/xlators/cluster/afr-v1/src/afr-self-heald.h @@ -0,0 +1,65 @@ +/* +  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> +  This file is part of GlusterFS. + +  This file is licensed to you under your choice of the GNU Lesser +  General Public License, version 3 or any later version (LGPLv3 or +  later), or the GNU General Public License, version 2 (GPLv2), in all +  cases as published by the Free Software Foundation. 
+*/ + +#ifndef __AFR_SELF_HEALD_H__ +#define __AFR_SELF_HEALD_H__ +#include "xlator.h" + +#define IS_ROOT_PATH(path) (!strcmp (path, "/")) +#define IS_ENTRY_CWD(entry) (!strcmp (entry, ".")) +#define IS_ENTRY_PARENT(entry) (!strcmp (entry, "..")) +#define AFR_ALL_CHILDREN -1 + +typedef struct afr_crawl_data_ { +        int                 child; +        pid_t               pid; +        afr_crawl_type_t    crawl; +        xlator_t            *readdir_xl; +        void                *op_data; +        int                 crawl_flags; +        int (*process_entry) (xlator_t *this, struct afr_crawl_data_ *crawl_data, +                              gf_dirent_t *entry, loc_t *child, loc_t *parent, +                              struct iatt *iattr); +} afr_crawl_data_t; + +typedef struct crawl_event_stats_ { +        uint64_t healed_count; +        uint64_t split_brain_count; +        uint64_t heal_failed_count; +        char     *start_time_str; +        char     *end_time_str; +        char     *crawl_type; +        gf_boolean_t crawl_inprogress; +} shd_crawl_event_t; + +void _destroy_crawl_event_data (void *data); +void _destroy_shd_event_data (void *data); + +typedef int (*process_entry_cbk_t) (xlator_t *this, afr_crawl_data_t *crawl_data, +                              gf_dirent_t *entry, loc_t *child, loc_t *parent, +                              struct iatt *iattr); + +void afr_build_root_loc (xlator_t *this, loc_t *loc); + +int afr_set_root_gfid (dict_t *dict); + +void +afr_proactive_self_heal (void *data); + +int +afr_xl_op (xlator_t *this, dict_t *input, dict_t *output); + +/* + * In addition to its self-heal use, this is used to find a local default + * read_child. 
+ */ +int +afr_local_pathinfo (char *pathinfo, gf_boolean_t *local); +#endif /* __AFR_SELF_HEALD_H__ */ diff --git a/xlators/cluster/afr-v1/src/afr-transaction.c b/xlators/cluster/afr-v1/src/afr-transaction.c new file mode 100644 index 000000000..47a3481bc --- /dev/null +++ b/xlators/cluster/afr-v1/src/afr-transaction.c @@ -0,0 +1,1963 @@ +/* +  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> +  This file is part of GlusterFS. + +  This file is licensed to you under your choice of the GNU Lesser +  General Public License, version 3 or any later version (LGPLv3 or +  later), or the GNU General Public License, version 2 (GPLv2), in all +  cases as published by the Free Software Foundation. +*/ + +#include "dict.h" +#include "byte-order.h" +#include "common-utils.h" +#include "timer.h" + +#include "afr.h" +#include "afr-transaction.h" + +#include <signal.h> + + +#define LOCKED_NO       0x0        /* no lock held */ +#define LOCKED_YES      0x1        /* for DATA, METADATA, ENTRY and higher_path +                                      of RENAME */ +#define LOCKED_LOWER    0x2        /* for lower_path of RENAME */ + +afr_fd_ctx_t * +__afr_fd_ctx_get (fd_t *fd, xlator_t *this) +{ +        uint64_t       ctx = 0; +        int            ret = 0; +        afr_fd_ctx_t  *fd_ctx = NULL; +        int            i = 0; +        afr_private_t *priv = NULL; + +        priv = this->private; + +        ret = __fd_ctx_get (fd, this, &ctx); + +        if (ret < 0 && fd_is_anonymous (fd)) { +                ret = __afr_fd_ctx_set (this, fd); +                if (ret < 0) +                        goto out; + +                ret = __fd_ctx_get (fd, this, &ctx); +                if (ret < 0) +                        goto out; + +                fd_ctx = (afr_fd_ctx_t *)(long) ctx; +                for (i = 0; i < priv->child_count; i++) +                        fd_ctx->opened_on[i] = AFR_FD_OPENED; +        } + +        fd_ctx = (afr_fd_ctx_t *)(long) ctx; +out: +        
return fd_ctx; +} + + +afr_fd_ctx_t * +afr_fd_ctx_get (fd_t *fd, xlator_t *this) +{ +        afr_fd_ctx_t  *fd_ctx = NULL; + +        LOCK(&fd->lock); +        { +                fd_ctx = __afr_fd_ctx_get (fd, this); +        } +        UNLOCK(&fd->lock); + +        return fd_ctx; +} + + +static void +afr_save_lk_owner (call_frame_t *frame) +{ +        afr_local_t * local = NULL; + +        local = frame->local; + +        local->saved_lk_owner = frame->root->lk_owner; +} + + +static void +afr_restore_lk_owner (call_frame_t *frame) +{ +        afr_local_t * local = NULL; + +        local = frame->local; + +        frame->root->lk_owner = local->saved_lk_owner; +} + +static void +__mark_all_pending (int32_t *pending[], int child_count, +                    afr_transaction_type type) +{ +        int i = 0; +        int j = 0; + +        for (i = 0; i < child_count; i++) { +                j = afr_index_for_transaction_type (type); +                pending[i][j] = hton32 (1); +        } +} + + +static void +__mark_child_dead (int32_t *pending[], int child_count, int child, +                   afr_transaction_type type) +{ +        int j = 0; + +        j = afr_index_for_transaction_type (type); + +        pending[child][j] = 0; +} + + +static void +__mark_pre_op_done_on_fd (call_frame_t *frame, xlator_t *this, int child_index) +{ +        afr_local_t   *local = NULL; +        afr_fd_ctx_t  *fd_ctx = NULL; + +        local = frame->local; + +        if (!local->fd) +                return; + +        fd_ctx = afr_fd_ctx_get (local->fd, this); + +        if (!fd_ctx) +                goto out; + +        LOCK (&local->fd->lock); +        { +                if (local->transaction.type == AFR_DATA_TRANSACTION) +                        fd_ctx->pre_op_done[child_index]++; +        } +        UNLOCK (&local->fd->lock); +out: +        return; +} + +static void +__mark_non_participant_children (int32_t *pending[], int child_count, +                                 unsigned 
/*
 * Fill in a default errno for every child that has none yet.
 *
 * Each of the first `child_count` slots of `child_errno` that is still 0
 * is set to ENOTCONN; slots already holding an errno are left untouched.
 * A NULL array is ignored.
 */
void
_set_all_child_errno (int *child_errno, unsigned int child_count)
{
        /* same type as child_count: avoids a signed/unsigned comparison */
        unsigned int    i = 0;

        if (!child_errno)
                return;

        for (i = 0; i < child_count; i++) {
                if (child_errno[i] == 0)
                        child_errno[i] = ENOTCONN;
        }
}
If it was not a write, +           then the presumption of the optimization (of +           optimizing for successive write operations) +           fails. +        */ +        if (fd) +                afr_delayed_changelog_wake_up (this, fd); +        local->transaction.fop (frame, this); +} + + +static int +__changelog_enabled (afr_private_t *priv, afr_transaction_type type) +{ +        int ret = 0; + +        switch (type) { +        case AFR_DATA_TRANSACTION: +                if (priv->data_change_log) +                        ret = 1; + +                break; + +        case AFR_METADATA_TRANSACTION: +                if (priv->metadata_change_log) +                        ret = 1; + +                break; + +        case AFR_ENTRY_TRANSACTION: +        case AFR_ENTRY_RENAME_TRANSACTION: +                if (priv->entry_change_log) +                        ret = 1; + +                break; +        } + +        return ret; +} + + +static int +__fop_changelog_needed (call_frame_t *frame, xlator_t *this) +{ +        afr_private_t * priv  = NULL; +        afr_local_t   * local = NULL; +        int op_ret = 0; +        afr_transaction_type type = -1; + +        priv  = this->private; +        local = frame->local; +        type  = local->transaction.type; + +        if (__changelog_enabled (priv, type)) { +                switch (local->op) { + +                case GF_FOP_WRITE: +                case GF_FOP_FTRUNCATE: +                        op_ret = 1; +                        break; + +                case GF_FOP_FLUSH: +                        op_ret = 0; +                        break; + +                default: +                        op_ret = 1; +                } +        } + +        return op_ret; +} + +int +afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending, +                      int child, afr_xattrop_type_t op) +{ +        int i = 0; +        int ret = 0; + +        if (op == LOCAL_FIRST) { +                ret = 
dict_set_static_bin (xattr, priv->pending_key[child], +                                           pending[child], +                                   AFR_NUM_CHANGE_LOGS * sizeof (int32_t)); +                if (ret) +                        goto out; +        } +        for (i = 0; i < priv->child_count; i++) { +                if (i == child) +                        continue; +                ret = dict_set_static_bin (xattr, priv->pending_key[i], +                                           pending[i], +                                   AFR_NUM_CHANGE_LOGS * sizeof (int32_t)); +                /* 3 = data+metadata+entry */ + +                if (ret < 0) +                        goto out; +        } +        if (op == LOCAL_LAST) { +                ret = dict_set_static_bin (xattr, priv->pending_key[child], +                                           pending[child], +                                   AFR_NUM_CHANGE_LOGS * sizeof (int32_t)); +                if (ret) +                        goto out; +        } +out: +        return ret; +} + +int +afr_lock_server_count (afr_private_t *priv, afr_transaction_type type) +{ +        int ret = 0; + +        switch (type) { +        case AFR_DATA_TRANSACTION: +                ret = priv->child_count; +                break; + +        case AFR_METADATA_TRANSACTION: +                ret = priv->child_count; +                break; + +        case AFR_ENTRY_TRANSACTION: +        case AFR_ENTRY_RENAME_TRANSACTION: +                ret = priv->child_count; +                break; +        } + +        return ret; +} + +/* {{{ pending */ + +int32_t +afr_changelog_post_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                           int32_t op_ret, int32_t op_errno, dict_t *xattr, +                           dict_t *xdata) +{ +        afr_internal_lock_t *int_lock = NULL; +        afr_private_t       *priv     = NULL; +        afr_local_t         *local    = NULL; +        int                  
call_count = -1; + +        priv     = this->private; +        local    = frame->local; +        int_lock = &local->internal_lock; + +        LOCK (&frame->lock); +        { +                call_count = --local->call_count; +        } +        UNLOCK (&frame->lock); + +        if (call_count == 0) { +                if (local->transaction.resume_stub) { +			call_resume (local->transaction.resume_stub); +                        local->transaction.resume_stub = NULL; +                } + +                if (afr_lock_server_count (priv, local->transaction.type) == 0) { +                        local->transaction.done (frame, this); +                } else { +                        int_lock->lock_cbk = local->transaction.done; +                        afr_unlock (frame, this); +                } +        } + +        return 0; +} + + +void +afr_transaction_rm_stale_children (call_frame_t *frame, xlator_t *this, +                                   inode_t *inode, afr_transaction_type type) +{ +        int             i = -1; +        int             count = 0; +        int             read_child = -1; +        afr_private_t   *priv = NULL; +        afr_local_t     *local = NULL; +        int             **pending = NULL; +        int             idx = 0; +        int32_t         *stale_children = NULL; +        int32_t         *fresh_children = NULL; +        gf_boolean_t    rm_stale_children = _gf_false; + +        idx = afr_index_for_transaction_type (type); + +        priv = this->private; +        local = frame->local; +        pending = local->pending; + +        if (local->op_ret < 0) +                goto out; +        fresh_children = local->fresh_children; +        read_child = afr_inode_get_read_ctx (this, inode, fresh_children); +        if (read_child < 0) { +                gf_log (this->name, GF_LOG_DEBUG, "Possible split-brain " +                        "for %s", uuid_utoa (inode->gfid)); +                goto out; +        } + +        for (i = 0; i < 
priv->child_count; i++) { +                if (!afr_is_child_present (fresh_children, +                                           priv->child_count, i)) +                        continue; +                if (pending[i][idx]) +                        continue; +                /* child is down or op failed on it */ +                if (!stale_children) +                        stale_children = afr_children_create (priv->child_count); +                if (!stale_children) +                        goto out; + +                rm_stale_children = _gf_true; +                stale_children[count++] = i; +                gf_log (this->name, GF_LOG_DEBUG, "Removing stale child " +                        "%d for %s", i, uuid_utoa (inode->gfid)); +        } + +        if (!rm_stale_children) +                goto out; + +        afr_inode_rm_stale_children (this, inode, stale_children); +out: +        GF_FREE (stale_children); +        return; +} + +afr_inodelk_t* +afr_get_inodelk (afr_internal_lock_t *int_lock, char *dom) +{ +        afr_inodelk_t *inodelk = NULL; +        int           i = 0; + +        for (i = 0; int_lock->inodelk[i].domain; i++) { +                inodelk = &int_lock->inodelk[i]; +                if (strcmp (dom, inodelk->domain) == 0) +                        return inodelk; +        } +        return NULL; +} + +unsigned char* +afr_locked_nodes_get (afr_transaction_type type, afr_internal_lock_t *int_lock) +{ +        unsigned char *locked_nodes = NULL; +        afr_inodelk_t *inodelk = NULL; +        switch (type) { +        case AFR_DATA_TRANSACTION: +        case AFR_METADATA_TRANSACTION: +                inodelk = afr_get_inodelk (int_lock, int_lock->domain); +                locked_nodes = inodelk->locked_nodes; +        break; + +        case AFR_ENTRY_TRANSACTION: +        case AFR_ENTRY_RENAME_TRANSACTION: +                /*Because same set of subvols participate in all lockee +                 * entities*/ +                locked_nodes = 
int_lock->lockee[0].locked_nodes; +        break; +        } +        return locked_nodes; +} + +int +afr_changelog_pre_op_call_count (afr_transaction_type type, +                                 afr_internal_lock_t *int_lock, +                                 unsigned int child_count) +{ +        int           call_count = 0; +        unsigned char *locked_nodes = NULL; + +        locked_nodes = afr_locked_nodes_get (type, int_lock); +        GF_ASSERT (locked_nodes); + +        call_count = afr_locked_children_count (locked_nodes, child_count); +        if (type == AFR_ENTRY_RENAME_TRANSACTION) +                call_count *= 2; + +        return call_count; +} + +int +afr_changelog_post_op_call_count (afr_transaction_type type, +                                  unsigned char *pre_op, +                                  unsigned int child_count) +{ +        int           call_count = 0; + +        call_count = afr_pre_op_done_children_count (pre_op, child_count); +        if (type == AFR_ENTRY_RENAME_TRANSACTION) +                call_count *= 2; + +        return call_count; +} + +void +afr_compute_txn_changelog (afr_local_t *local, afr_private_t *priv) +{ +        int             i = 0; +        int             index = 0; +        int32_t         postop = 0; +        int32_t         preop = 1; +        int32_t         **txn_changelog = NULL; + +        txn_changelog = local->transaction.txn_changelog; +        index = afr_index_for_transaction_type (local->transaction.type); +        for (i = 0; i < priv->child_count; i++) { +                postop = ntoh32 (local->pending[i][index]); +                txn_changelog[i][index] = hton32 (postop + preop); +        } +} + +afr_xattrop_type_t +afr_get_postop_xattrop_type (int32_t **pending, int optimized, int child, +                             afr_transaction_type type) +{ +        int                     index = 0; +        afr_xattrop_type_t      op = LOCAL_LAST; + +        index = afr_index_for_transaction_type 
(type); +        if (optimized && !pending[child][index]) +                op = LOCAL_FIRST; +        return op; +} + +void +afr_set_postop_dict (afr_local_t *local, xlator_t *this, dict_t *xattr, +                     int optimized, int child) +{ +        int32_t                 **txn_changelog = NULL; +        int32_t                 **changelog = NULL; +        afr_private_t           *priv = NULL; +        int                     ret = 0; +        afr_xattrop_type_t      op = LOCAL_LAST; + +        priv = this->private; +        txn_changelog = local->transaction.txn_changelog; +        op = afr_get_postop_xattrop_type (local->pending, optimized, child, +                                          local->transaction.type); +        if (optimized) +                changelog = txn_changelog; +        else +                changelog = local->pending; +        ret = afr_set_pending_dict (priv, xattr, changelog, child, op); +        if (ret < 0) +                gf_log (this->name, GF_LOG_INFO, +                        "failed to set pending entry"); +} + + +gf_boolean_t +afr_txn_nothing_failed (call_frame_t *frame, xlator_t *this) +{ +        afr_private_t *priv = NULL; +        afr_local_t *local = NULL; +        int index = -1; +        int i = 0; + +        local = frame->local; +        priv = this->private; + +        index = afr_index_for_transaction_type (local->transaction.type); + +        for (i = 0; i < priv->child_count; i++) { +                if (local->pending[i][index] == 0) +                        return _gf_false; +        } + +        return _gf_true; +} + +static void +afr_dir_fop_handle_all_fop_failures (call_frame_t *frame) +{ +        xlator_t        *this = NULL; +        afr_local_t     *local = NULL; +        afr_private_t   *priv = NULL; + +        this = frame->this; +        local = frame->local; +        priv = this->private; + +        if ((local->transaction.type != AFR_ENTRY_TRANSACTION) && +            (local->transaction.type != 
AFR_ENTRY_RENAME_TRANSACTION)) +                return; + +        if (local->op_ret >= 0) +                goto out; + +        __mark_all_success (local->pending, priv->child_count, +                            local->transaction.type); +out: +        return; +} + +static void +afr_data_handle_quota_errors (call_frame_t *frame, xlator_t *this) +{ +        int     i = 0; +        afr_private_t *priv = NULL; +        afr_local_t   *local = NULL; +        gf_boolean_t  all_quota_failures = _gf_false; + +        local = frame->local; +        priv  = this->private; +        if (local->transaction.type != AFR_DATA_TRANSACTION) +                return; +        /* +         * Idea is to not leave the file in FOOL-FOOL scenario in case on +         * all the bricks data transaction failed with EDQUOT to avoid +         * increasing un-necessary load of self-heals in the system. +         */ +        all_quota_failures = _gf_true; +        for (i = 0; i < priv->child_count; i++) { +                if (local->transaction.pre_op[i] && +                    (local->child_errno[i] != EDQUOT)) { +                        all_quota_failures = _gf_false; +                        break; +                } +        } +        if (all_quota_failures) +                __mark_all_success (local->pending, priv->child_count, +                                    local->transaction.type); +} + +int +afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this) +{ +        afr_private_t * priv = this->private; +        afr_internal_lock_t *int_lock = NULL; +        int i          = 0; +        int call_count = 0; + +        afr_local_t *  local = NULL; +        afr_fd_ctx_t  *fdctx = NULL; +        dict_t        **xattr = NULL; +        int            piggyback = 0; +        int            nothing_failed = 1; + +        local    = frame->local; +        int_lock = &local->internal_lock; + +        __mark_non_participant_children (local->pending, priv->child_count, +                       
                  local->transaction.pre_op, +                                         local->transaction.type); + +        afr_data_handle_quota_errors (frame, this); +        afr_dir_fop_handle_all_fop_failures (frame); + +        if (local->fd) +                afr_transaction_rm_stale_children (frame, this, +                                                   local->fd->inode, +                                                   local->transaction.type); + +        xattr = alloca (priv->child_count * sizeof (*xattr)); +        memset (xattr, 0, (priv->child_count * sizeof (*xattr))); +        for (i = 0; i < priv->child_count; i++) { +                xattr[i] = dict_new (); +        } + +        call_count = afr_changelog_post_op_call_count (local->transaction.type, +                                                       local->transaction.pre_op, +                                                       priv->child_count); +        local->call_count = call_count; + +        if (local->fd) +                fdctx = afr_fd_ctx_get (local->fd, this); + +        if (call_count == 0) { +                /* no child is up */ +                int_lock->lock_cbk = local->transaction.done; +                afr_unlock (frame, this); +                goto out; +        } + +        nothing_failed = afr_txn_nothing_failed (frame, this); + +        afr_compute_txn_changelog (local , priv); + +        for (i = 0; i < priv->child_count; i++) { +                if (!local->transaction.pre_op[i]) +                        continue; + +                if (local->transaction.type != AFR_DATA_TRANSACTION) +                        afr_set_postop_dict (local, this, xattr[i], +                                             local->optimistic_change_log, i); +                switch (local->transaction.type) { +                case AFR_DATA_TRANSACTION: +                { +                        if (!fdctx) { +                                afr_set_postop_dict (local, this, xattr[i], +        
                                             0, i); +                                STACK_WIND (frame, afr_changelog_post_op_cbk, +                                            priv->children[i], +                                            priv->children[i]->fops->xattrop, +                                            &local->loc, +                                            GF_XATTROP_ADD_ARRAY, xattr[i], +                                            NULL); +                                break; +                        } + +                        /* local->transaction.postop_piggybacked[] was +                           precomputed in is_piggyback_postop() when called from +                           afr_changelog_post_op_safe() +                        */ + +                        piggyback = 0; +                        if (local->transaction.postop_piggybacked[i]) +                                piggyback = 1; + +                        afr_set_postop_dict (local, this, xattr[i], +                                             piggyback, i); + +                        if (nothing_failed && piggyback) { +                                afr_changelog_post_op_cbk (frame, (void *)(long)i, +                                                           this, 1, 0, xattr[i], NULL); +                        } else { +                                STACK_WIND_COOKIE (frame, +                                                   afr_changelog_post_op_cbk, +                                                   (void *) (long) i, +                                                   priv->children[i], +                                                   priv->children[i]->fops->fxattrop, +                                                   local->fd, +                                                   GF_XATTROP_ADD_ARRAY, xattr[i], +                                                   NULL); +                        } +                } +                break; +                case 
AFR_METADATA_TRANSACTION: +                { +                        if (nothing_failed && local->optimistic_change_log) { +                                afr_changelog_post_op_cbk (frame, (void *)(long)i, +                                                           this, 1, 0, xattr[i], +                                                           NULL); +                                break; +                        } + +                        if (local->fd) +                                STACK_WIND (frame, afr_changelog_post_op_cbk, +                                            priv->children[i], +                                            priv->children[i]->fops->fxattrop, +                                            local->fd, +                                            GF_XATTROP_ADD_ARRAY, xattr[i], +                                            NULL); +                        else +                                STACK_WIND (frame, afr_changelog_post_op_cbk, +                                            priv->children[i], +                                            priv->children[i]->fops->xattrop, +                                            &local->loc, +                                            GF_XATTROP_ADD_ARRAY, xattr[i], +                                            NULL); +                } +                break; + +                case AFR_ENTRY_RENAME_TRANSACTION: +                { +                        if (nothing_failed && local->optimistic_change_log) { +                                afr_changelog_post_op_cbk (frame, (void *)(long)i, +                                                           this, 1, 0, xattr[i], +                                                           NULL); +                        } else { +                                STACK_WIND_COOKIE (frame, afr_changelog_post_op_cbk, +                                                   (void *) (long) i, +                                                   
priv->children[i], +                                                   priv->children[i]->fops->xattrop, +                                                   &local->transaction.new_parent_loc, +                                                   GF_XATTROP_ADD_ARRAY, xattr[i], +                                                   NULL); +                        } +                        call_count--; +                } + +                /* +                  set it again because previous stack_wind +                  might have already returned (think of case +                  where subvolume is posix) and would have +                  used the dict as placeholder for return +                  value +                */ + +                afr_set_postop_dict (local, this, xattr[i], +                                     local->optimistic_change_log, i); + +                /* fall through */ + +                case AFR_ENTRY_TRANSACTION: +                { +                        if (nothing_failed && local->optimistic_change_log) { +                                afr_changelog_post_op_cbk (frame, (void *)(long)i, +                                                           this, 1, 0, xattr[i], +                                                           NULL); +                                break; +                        } + +                        if (local->fd) +                                STACK_WIND (frame, afr_changelog_post_op_cbk, +                                            priv->children[i], +                                            priv->children[i]->fops->fxattrop, +                                            local->fd, +                                            GF_XATTROP_ADD_ARRAY, xattr[i], +                                            NULL); +                        else +                                STACK_WIND (frame, afr_changelog_post_op_cbk, +                                            priv->children[i], +                             
               priv->children[i]->fops->xattrop, +                                            &local->transaction.parent_loc, +                                            GF_XATTROP_ADD_ARRAY, xattr[i], +                                            NULL); +                } +                break; +                } + +                if (!--call_count) +                        break; +        } + +out: +        for (i = 0; i < priv->child_count; i++) { +                dict_unref (xattr[i]); +        } + +        return 0; +} + + +int32_t +afr_changelog_pre_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                          int32_t op_ret, int32_t op_errno, dict_t *xattr, +                          dict_t *xdata) +{ +        afr_local_t *   local = NULL; +        afr_private_t * priv  = this->private; +        int call_count  = -1; +        int child_index = (long) cookie; + +        local = frame->local; + +        LOCK (&frame->lock); +        { +                switch (op_ret) { +                case 0: +                        __mark_pre_op_done_on_fd (frame, this, child_index); +                        //fallthrough we need to mark the pre_op +                case 1: +                        local->transaction.pre_op[child_index] = 1; +                        /* special op_ret for piggyback */ +                        break; +                case -1: +                        if (op_errno == ENOTSUP) { +                                gf_log (this->name, GF_LOG_ERROR, +                                        "xattrop not supported by %s", +                                        priv->children[child_index]->name); +                                local->op_ret = -1; + +                        } else if (!child_went_down (op_ret, op_errno)) { +                                gf_log (this->name, GF_LOG_ERROR, +                                        "xattrop failed on child %s: %s", +                                        
priv->children[child_index]->name, +                                        strerror (op_errno)); +                        } +                        local->op_errno = op_errno; +                        break; +                } + +                call_count = --local->call_count; +        } +        UNLOCK (&frame->lock); + +        if (call_count == 0) { +                if ((local->op_ret == -1) && +                    (local->op_errno == ENOTSUP)) { +                        local->transaction.resume (frame, this); +                } else { +                        afr_transaction_perform_fop (frame, this); +                } +        } + +        return 0; +} + +int +afr_changelog_pre_op (call_frame_t *frame, xlator_t *this) +{ +        afr_private_t * priv = this->private; +        int i = 0; +        int ret = 0; +        int call_count = 0; +        dict_t **xattr = NULL; +        afr_fd_ctx_t *fdctx = NULL; +        afr_local_t *local = NULL; +        int          piggyback = 0; +        afr_internal_lock_t *int_lock = NULL; +        unsigned char       *locked_nodes = NULL; + +        local = frame->local; +        int_lock = &local->internal_lock; + +        xattr = alloca (priv->child_count * sizeof (*xattr)); +        memset (xattr, 0, (priv->child_count * sizeof (*xattr))); + +        for (i = 0; i < priv->child_count; i++) { +                xattr[i] = dict_new (); +        } + +        call_count = afr_changelog_pre_op_call_count (local->transaction.type, +                                                      int_lock, +                                                      priv->child_count); +        if (call_count == 0) { +                local->internal_lock.lock_cbk = +                        local->transaction.done; +                afr_unlock (frame, this); +                goto out; +        } + +        local->call_count = call_count; + +        __mark_all_pending (local->pending, priv->child_count, +                            
local->transaction.type); + +        if (local->fd) +                fdctx = afr_fd_ctx_get (local->fd, this); + +        locked_nodes = afr_locked_nodes_get (local->transaction.type, int_lock); +        for (i = 0; i < priv->child_count; i++) { +                if (!locked_nodes[i]) +                        continue; +                ret = afr_set_pending_dict (priv, xattr[i], local->pending, +                                            i, LOCAL_FIRST); + +                if (ret < 0) +                        gf_log (this->name, GF_LOG_INFO, +                                "failed to set pending entry"); + + +                switch (local->transaction.type) { +                case AFR_DATA_TRANSACTION: +                { +                        if (!fdctx) { +                                STACK_WIND_COOKIE (frame, +                                                   afr_changelog_pre_op_cbk, +                                                   (void *) (long) i, +                                                   priv->children[i], +                                                   priv->children[i]->fops->xattrop, +                                                   &(local->loc), +                                                   GF_XATTROP_ADD_ARRAY, xattr[i], +                                                   NULL); +                                break; +                        } + +                        LOCK (&local->fd->lock); +                        { +                                piggyback = 0; +                                if (fdctx->pre_op_done[i]) { +                                        fdctx->pre_op_piggyback[i]++; +                                        piggyback = 1; +                                        fdctx->hit++; +                                } else { +                                        fdctx->miss++; +                                } +                        } +                        UNLOCK (&local->fd->lock); + 
+                        afr_set_delayed_post_op (frame, this); + +                        if (piggyback) +                                afr_changelog_pre_op_cbk (frame, (void *)(long)i, +                                                          this, 1, 0, xattr[i], +                                                          NULL); +                        else +                                STACK_WIND_COOKIE (frame, +                                                   afr_changelog_pre_op_cbk, +                                                   (void *) (long) i, +                                                   priv->children[i], +                                                   priv->children[i]->fops->fxattrop, +                                                   local->fd, +                                                   GF_XATTROP_ADD_ARRAY, xattr[i], +                                                   NULL); +                } +                break; +                case AFR_METADATA_TRANSACTION: +                { +                        if (local->optimistic_change_log) { +                                afr_changelog_pre_op_cbk (frame, (void *)(long)i, +                                                          this, 1, 0, xattr[i], +                                                          NULL); +                                break; +                        } + +                        if (local->fd) +                                STACK_WIND_COOKIE (frame, +                                                   afr_changelog_pre_op_cbk, +                                                   (void *) (long) i, +                                                   priv->children[i], +                                                   priv->children[i]->fops->fxattrop, +                                                   local->fd, +                                                   GF_XATTROP_ADD_ARRAY, xattr[i], +                                      
             NULL); +                        else +                                STACK_WIND_COOKIE (frame, +                                                   afr_changelog_pre_op_cbk, +                                                   (void *) (long) i, +                                                   priv->children[i], +                                                   priv->children[i]->fops->xattrop, +                                                   &(local->loc), +                                                   GF_XATTROP_ADD_ARRAY, xattr[i], +                                                   NULL); +                } +                break; + +                case AFR_ENTRY_RENAME_TRANSACTION: +                { +                        if (local->optimistic_change_log) { +                                afr_changelog_pre_op_cbk (frame, (void *)(long)i, +                                                          this, 1, 0, xattr[i], +                                                          NULL); +                        } else { +                                STACK_WIND_COOKIE (frame, +                                                   afr_changelog_pre_op_cbk, +                                                   (void *) (long) i, +                                                   priv->children[i], +                                                   priv->children[i]->fops->xattrop, +                                                   &local->transaction.new_parent_loc, +                                                   GF_XATTROP_ADD_ARRAY, xattr[i], +                                                   NULL); +                        } + +                        call_count--; +                } + + +                /* +                  set it again because previous stack_wind +                  might have already returned (think of case +                  where subvolume is posix) and would have +                  used the dict as 
placeholder for return +                  value +                */ + +                ret = afr_set_pending_dict (priv, xattr[i], local->pending, +                                            i, LOCAL_FIRST); + +                if (ret < 0) +                        gf_log (this->name, GF_LOG_INFO, +                                "failed to set pending entry"); + +                /* fall through */ + +                case AFR_ENTRY_TRANSACTION: +                { +                        if (local->optimistic_change_log) { +                                afr_changelog_pre_op_cbk (frame, (void *)(long)i, +                                                          this, 1, 0, xattr[i], +                                                          NULL); +                                break; +                        } + +                        if (local->fd) +                                STACK_WIND_COOKIE (frame, +                                                   afr_changelog_pre_op_cbk, +                                                   (void *) (long) i, +                                                   priv->children[i], +                                                   priv->children[i]->fops->fxattrop, +                                                   local->fd, +                                                   GF_XATTROP_ADD_ARRAY, xattr[i], +                                                   NULL); +                        else +                                STACK_WIND_COOKIE (frame, +                                                   afr_changelog_pre_op_cbk, +                                                   (void *) (long) i, +                                                   priv->children[i], +                                                   priv->children[i]->fops->xattrop, +                                                   &local->transaction.parent_loc, +                                                   GF_XATTROP_ADD_ARRAY, 
xattr[i], +                                                   NULL); +                } +                break; +                } + +                if (!--call_count) +                        break; +        } +out: +        for (i = 0; i < priv->child_count; i++) { +                dict_unref (xattr[i]); +        } + +        return 0; +} + + +int +afr_post_blocking_inodelk_cbk (call_frame_t *frame, xlator_t *this) +{ +        afr_internal_lock_t *int_lock = NULL; +        afr_local_t         *local    = NULL; + +        local    = frame->local; +        int_lock = &local->internal_lock; + +        if (int_lock->lock_op_ret < 0) { +                gf_log (this->name, GF_LOG_INFO, +                        "Blocking inodelks failed."); +                local->transaction.done (frame, this); +        } else { + +                gf_log (this->name, GF_LOG_DEBUG, +                        "Blocking inodelks done. Proceeding to FOP"); +                afr_internal_lock_finish (frame, this); +        } + +        return 0; +} + + +int +afr_post_nonblocking_inodelk_cbk (call_frame_t *frame, xlator_t *this) +{ +        afr_internal_lock_t *int_lock = NULL; +        afr_local_t         *local    = NULL; + +        local    = frame->local; +        int_lock = &local->internal_lock; + +        /* Initiate blocking locks if non-blocking has failed */ +        if (int_lock->lock_op_ret < 0) { +                gf_log (this->name, GF_LOG_DEBUG, +                        "Non blocking inodelks failed. Proceeding to blocking"); +                int_lock->lock_cbk = afr_post_blocking_inodelk_cbk; +                afr_blocking_lock (frame, this); +        } else { + +                gf_log (this->name, GF_LOG_DEBUG, +                        "Non blocking inodelks done. 
Proceeding to FOP"); +                afr_internal_lock_finish (frame, this); +        } + +        return 0; +} + + +int +afr_post_blocking_entrylk_cbk (call_frame_t *frame, xlator_t *this) +{ +        afr_internal_lock_t *int_lock = NULL; +        afr_local_t         *local    = NULL; + +        local    = frame->local; +        int_lock = &local->internal_lock; + +        if (int_lock->lock_op_ret < 0) { +                gf_log (this->name, GF_LOG_INFO, +                        "Blocking entrylks failed."); +                local->transaction.done (frame, this); +        } else { + +                gf_log (this->name, GF_LOG_DEBUG, +                        "Blocking entrylks done. Proceeding to FOP"); +                afr_internal_lock_finish (frame, this); +        } + +        return 0; +} + + +int +afr_post_nonblocking_entrylk_cbk (call_frame_t *frame, xlator_t *this) +{ +        afr_internal_lock_t *int_lock = NULL; +        afr_local_t         *local    = NULL; + +        local = frame->local; +        int_lock = &local->internal_lock; + +        /* Initiate blocking locks if non-blocking has failed */ +        if (int_lock->lock_op_ret < 0) { +                gf_log (this->name, GF_LOG_DEBUG, +                        "Non blocking entrylks failed. Proceeding to blocking"); +                int_lock->lock_cbk = afr_post_blocking_entrylk_cbk; +                afr_blocking_lock (frame, this); +        } else { + +                gf_log (this->name, GF_LOG_DEBUG, +                        "Non blocking entrylks done. 
Proceeding to FOP"); +                afr_internal_lock_finish (frame, this); +        } + +        return 0; +} + + +int +afr_post_blocking_rename_cbk (call_frame_t *frame, xlator_t *this) +{ +        afr_internal_lock_t *int_lock = NULL; +        afr_local_t         *local    = NULL; + +        local    = frame->local; +        int_lock = &local->internal_lock; + +        if (int_lock->lock_op_ret < 0) { +                gf_log (this->name, GF_LOG_INFO, +                        "Blocking entrylks failed."); +                local->transaction.done (frame, this); +        } else { + +                gf_log (this->name, GF_LOG_DEBUG, +                        "Blocking entrylks done. Proceeding to FOP"); +                afr_internal_lock_finish (frame, this); +        } +        return 0; +} + + +int +afr_post_lower_unlock_cbk (call_frame_t *frame, xlator_t *this) +{ +        afr_internal_lock_t *int_lock = NULL; +        afr_local_t         *local    = NULL; + +        local    = frame->local; +        int_lock = &local->internal_lock; + +        GF_ASSERT (!int_lock->higher_locked); + +        int_lock->lock_cbk = afr_post_blocking_rename_cbk; +        afr_blocking_lock (frame, this); + +        return 0; +} + + +int +afr_set_transaction_flock (afr_local_t *local) +{ +        afr_internal_lock_t *int_lock = NULL; +        afr_inodelk_t       *inodelk  = NULL; + +        int_lock = &local->internal_lock; +        inodelk = afr_get_inodelk (int_lock, int_lock->domain); + +        inodelk->flock.l_len   = local->transaction.len; +        inodelk->flock.l_start = local->transaction.start; +        inodelk->flock.l_type  = F_WRLCK; + +        return 0; +} + +int +afr_lock_rec (call_frame_t *frame, xlator_t *this) +{ +        afr_internal_lock_t *int_lock = NULL; +        afr_local_t         *local    = NULL; + +        local    = frame->local; +        int_lock = &local->internal_lock; + +        int_lock->transaction_lk_type = AFR_TRANSACTION_LK; +        
int_lock->domain = this->name; + +        switch (local->transaction.type) { +        case AFR_DATA_TRANSACTION: +        case AFR_METADATA_TRANSACTION: +                afr_set_transaction_flock (local); + +                int_lock->lock_cbk = afr_post_nonblocking_inodelk_cbk; + +                afr_nonblocking_inodelk (frame, this); +                break; + +        case AFR_ENTRY_RENAME_TRANSACTION: + +                int_lock->lock_cbk = afr_post_nonblocking_entrylk_cbk; +                afr_nonblocking_entrylk (frame, this); +                break; + +        case AFR_ENTRY_TRANSACTION: +                int_lock->lk_basename = local->transaction.basename; +                if (&local->transaction.parent_loc) +                        int_lock->lk_loc = &local->transaction.parent_loc; +                else +                        GF_ASSERT (local->fd); + +                int_lock->lock_cbk = afr_post_nonblocking_entrylk_cbk; +                afr_nonblocking_entrylk (frame, this); +                break; +        } + +        return 0; +} + + +int +afr_lock (call_frame_t *frame, xlator_t *this) +{ +        afr_set_lock_number (frame, this); + +        return afr_lock_rec (frame, this); +} + + +/* }}} */ + +int +afr_internal_lock_finish (call_frame_t *frame, xlator_t *this) +{ +        if (__fop_changelog_needed (frame, this)) { +                afr_changelog_pre_op (frame, this); +        } else { +                afr_transaction_perform_fop (frame, this); +        } + +        return 0; +} + + +void +afr_set_delayed_post_op (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t    *local = NULL; +        afr_private_t  *priv = NULL; + +        /* call this function from any of the related optimizations +           which benefit from delaying post op are enabled, namely: + +           - changelog piggybacking +           - eager locking +        */ + +        priv = this->private; +        if (!priv) +                return; + +        if 
(!priv->post_op_delay_secs) +                return; + +        local = frame->local; +        if (!local->transaction.eager_lock_on) +                return; + +        if (!local) +                return; + +        if (!local->fd) +                return; + +        if (local->op == GF_FOP_WRITE) +                local->delayed_post_op = _gf_true; +} + +gf_boolean_t +afr_are_multiple_fds_opened (inode_t *inode, xlator_t *this) +{ +        afr_inode_ctx_t *ictx = NULL; + +        if (!inode) { +                /* If false is returned, it may keep on taking eager-lock +                 * which may lead to starvation, so return true to avoid that. +                 */ +                gf_log_callingfn (this->name, GF_LOG_ERROR, "Invalid inode"); +                return _gf_true; +        } +        /* Lets say mount1 has eager-lock(full-lock) and after the eager-lock +         * is taken mount2 opened the same file, it won't be able to +         * perform any data operations until mount1 releases eager-lock. 
+         * To avoid such scenario do not enable eager-lock for this transaction +         * if open-fd-count is > 1 +         */ + +        ictx = afr_inode_ctx_get (inode, this); +        if (!ictx) +                return _gf_true; + +        if (ictx->open_fd_count > 1) +                return _gf_true; + +        return _gf_false; +} + +gf_boolean_t +afr_any_fops_failed (afr_local_t *local, afr_private_t *priv) +{ +        if (local->success_count != priv->child_count) +                return _gf_true; +        return _gf_false; +} + +gf_boolean_t +is_afr_delayed_changelog_post_op_needed (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t      *local = NULL; +        gf_boolean_t      res = _gf_false; +        afr_private_t    *priv  = NULL; + +        priv  = this->private; + +        local = frame->local; +        if (!local) +                goto out; + +        if (!local->delayed_post_op) +                goto out; + +        //Mark pending changelog ASAP +        if (afr_any_fops_failed (local, priv)) +                goto out; + +        if (local->fd && afr_are_multiple_fds_opened (local->fd->inode, this)) +                goto out; + +        res = _gf_true; +out: +        return res; +} + + +void +afr_delayed_changelog_post_op (xlator_t *this, call_frame_t *frame, fd_t *fd, +                               call_stub_t *stub); + +void +afr_delayed_changelog_wake_up_cbk (void *data) +{ +        fd_t           *fd = NULL; + +        fd = data; + +        afr_delayed_changelog_wake_up (THIS, fd); +} + + +/* +  Check if the frame is destined to get optimized away +  with changelog piggybacking +*/ +static gf_boolean_t +is_piggyback_post_op (call_frame_t *frame, fd_t *fd) +{ +        afr_fd_ctx_t *fdctx = NULL; +        afr_local_t *local = NULL; +        gf_boolean_t piggyback = _gf_true; +        afr_private_t *priv = NULL; +        int i = 0; + +        priv = frame->this->private; +        local = frame->local; +        fdctx = afr_fd_ctx_get 
(fd, frame->this); + +        LOCK(&fd->lock); +        { +                piggyback = _gf_true; + +                for (i = 0; i < priv->child_count; i++) { +                        if (!local->transaction.pre_op[i]) +                                continue; +                        if (fdctx->pre_op_piggyback[i]) { +                                fdctx->pre_op_piggyback[i]--; +                                local->transaction.postop_piggybacked[i] = 1; +                        } else { +                                /* For at least _one_ subvolume we cannot +                                   piggyback on the changelog, and have to +                                   perform a hard POST-OP and therefore fsync +                                   if necesssary +                                */ +                                piggyback = _gf_false; +                                GF_ASSERT (fdctx->pre_op_done[i]); +                                fdctx->pre_op_done[i]--; +                        } +                } +        } +        UNLOCK(&fd->lock); + +        if (!afr_txn_nothing_failed (frame, frame->this)) { +                /* something failed in this transaction, +                   we will be performing a hard post-op +                */ +                return _gf_false; +        } + +        return piggyback; +} + + +/* SET operation */ +int +afr_fd_report_unstable_write (xlator_t *this, fd_t *fd) +{ +        afr_fd_ctx_t *fdctx = NULL; + +        fdctx = afr_fd_ctx_get (fd, this); + +        LOCK(&fd->lock); +        { +                fdctx->witnessed_unstable_write = _gf_true; +        } +        UNLOCK(&fd->lock); + +        return 0; +} + +/* TEST and CLEAR operation */ +gf_boolean_t +afr_fd_has_witnessed_unstable_write (xlator_t *this, fd_t *fd) +{ +        afr_fd_ctx_t *fdctx = NULL; +        gf_boolean_t witness = _gf_false; + +	fdctx = afr_fd_ctx_get (fd, this); +        if (!fdctx) +                return _gf_true; + +        
LOCK(&fd->lock); +        { +                if (fdctx->witnessed_unstable_write) { +                        witness = _gf_true; +                        fdctx->witnessed_unstable_write = _gf_false; +                } +        } +        UNLOCK (&fd->lock); + +        return witness; +} + + +int +afr_changelog_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                         int op_ret, int op_errno, struct iatt *pre, +                         struct iatt *post, dict_t *xdata) +{ +        afr_private_t *priv = NULL; +        int child_index = (long) cookie; +        int call_count = -1; +        afr_local_t *local = NULL; + +        priv = this->private; +        local = frame->local; + +        if (afr_fop_failed (op_ret, op_errno)) { +                /* Failure of fsync() is as good as failure of previous +                   write(). So treat it like one. +                */ +                gf_log (this->name, GF_LOG_WARNING, +                        "fsync(%s) failed on subvolume %s. 
Transaction was %s", +                        uuid_utoa (local->fd->inode->gfid), +                        priv->children[child_index]->name, +                        gf_fop_list[local->op]); + +                afr_transaction_fop_failed (frame, this, child_index); +        } + +        call_count = afr_frame_return (frame); + +        if (call_count == 0) +                afr_changelog_post_op_now (frame, this); + +        return 0; +} + + +int +afr_changelog_fsync (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t *local = NULL; +        int i = 0; +        int call_count = 0; +        afr_private_t *priv = NULL; +        dict_t *xdata = NULL; +        GF_UNUSED int ret = -1; + +        local = frame->local; +        priv = this->private; + +        call_count = afr_pre_op_done_children_count (local->transaction.pre_op, +                                                     priv->child_count); + +        if (!call_count) { +                /* will go straight to unlock */ +                afr_changelog_post_op_now (frame, this); +                return 0; +        } + +        local->call_count = call_count; + +	xdata = dict_new(); +	if (xdata) +		ret = dict_set_int32 (xdata, "batch-fsync", 1); + +        for (i = 0; i < priv->child_count; i++) { +                if (!local->transaction.pre_op[i]) +                        continue; + +                STACK_WIND_COOKIE (frame, afr_changelog_fsync_cbk, +                                (void *) (long) i, priv->children[i], +                                priv->children[i]->fops->fsync, local->fd, +                                1, xdata); +                if (!--call_count) +                        break; +        } + +	if (xdata) +		dict_unref (xdata); + +        return 0; +} + + +        int +afr_changelog_post_op_safe (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t    *local = NULL; +        afr_private_t  *priv = NULL; + +	local = frame->local; +        priv = this->private; + +        if 
(!local->fd || local->transaction.type != AFR_DATA_TRANSACTION) { +                afr_changelog_post_op_now (frame, this); +                return 0; +        } + +        if (is_piggyback_post_op (frame, local->fd)) { +                /* just detected that this post-op is about to +                   be optimized away as a new write() has +                   already piggybacked on this frame's changelog. +                   */ +                afr_changelog_post_op_now (frame, this); +                return 0; +        } + +        /* Calling afr_changelog_post_op_now() now will result in +           issuing ->[f]xattrop(). + +           Performing a hard POST-OP (->[f]xattrop() FOP) is a more +           responsible operation that what it might appear on the surface. + +           The changelog of a file (in the xattr of the file on the server) +           stores information (pending count) about the state of the file +           on the OTHER server. This changelog is blindly trusted, and must +           therefore be updated in such a way it remains trustworthy. This +           implies that decrementing the pending count (essentially "clearing +           the dirty flag") must be done STRICTLY after we are sure that the +           operation on the other server has reached stable storage. + +           While the backend filesystem on that server will eventually flush +           it to stable storage, we (being in userspace) have no mechanism +           to get notified when the write became "stable". + +           This means we need take matter into our own hands and issue an +           fsync() EVEN IF THE APPLICATION WAS PERFORMING UNSTABLE WRITES, +           and get an acknowledgement for it. And we need to wait for the +           fsync() acknowledgement before initiating the hard POST-OP. 
+ +           However if the FD itself was opened in O_SYNC or O_DSYNC then +           we are already guaranteed that the writes were made stable as +           part of the FOP itself. The same holds true for NFS stable +           writes which happen on an anonymous FD with O_DSYNC or O_SYNC +           flag set in the writev() @flags param. For all other write types, +           mark a flag in the fdctx whenever an unstable write is witnessed. +           */ + +        if (!afr_fd_has_witnessed_unstable_write (this, local->fd)) { +                afr_changelog_post_op_now (frame, this); +                return 0; +        } + +        /* Check whether users want durability and perform fsync/post-op +         * accordingly. +         */ +        if (priv->ensure_durability) { +                /* Time to fsync() */ +                afr_changelog_fsync (frame, this); +        } else { +                afr_changelog_post_op_now (frame, this); +        } + +        return 0; +} + + +void +afr_delayed_changelog_post_op (xlator_t *this, call_frame_t *frame, fd_t *fd, +                               call_stub_t *stub) +{ +	afr_fd_ctx_t      *fd_ctx = NULL; +	call_frame_t      *prev_frame = NULL; +	struct timespec    delta = {0, }; +	afr_private_t     *priv = NULL; +	afr_local_t       *local = NULL; + +	priv = this->private; + +	fd_ctx = afr_fd_ctx_get (fd, this); +	if (!fd_ctx) +                goto out; + +	delta.tv_sec = priv->post_op_delay_secs; +	delta.tv_nsec = 0; + +	pthread_mutex_lock (&fd_ctx->delay_lock); +	{ +		prev_frame = fd_ctx->delay_frame; +		fd_ctx->delay_frame = NULL; +		if (fd_ctx->delay_timer) +			gf_timer_call_cancel (this->ctx, fd_ctx->delay_timer); +		fd_ctx->delay_timer = NULL; +		if (!frame) +			goto unlock; +		fd_ctx->delay_timer = gf_timer_call_after (this->ctx, delta, +							   afr_delayed_changelog_wake_up_cbk, +							   fd); +		fd_ctx->delay_frame = frame; +	} +unlock: +	pthread_mutex_unlock (&fd_ctx->delay_lock); + +out: +	if (prev_frame) 
{ +		local = prev_frame->local; +		local->transaction.resume_stub = stub; +		afr_changelog_post_op_safe (prev_frame, this); +	} else if (stub) { +		call_resume (stub); +	} +} + + +void +afr_changelog_post_op (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t  *local = NULL; + +        local = frame->local; + +        if (is_afr_delayed_changelog_post_op_needed (frame, this)) +                afr_delayed_changelog_post_op (this, frame, local->fd, NULL); +        else +                afr_changelog_post_op_safe (frame, this); +} + + + +/* Wake up the sleeping/delayed post-op, and also register +   a stub to have it resumed after this transaction +   completely finishes. + +   The @stub gets saved in @local and gets resumed in +   afr_local_cleanup() +   */ +void +afr_delayed_changelog_wake_resume (xlator_t *this, fd_t *fd, call_stub_t *stub) +{ +        afr_delayed_changelog_post_op (this, NULL, fd, stub); +} + + +void +afr_delayed_changelog_wake_up (xlator_t *this, fd_t *fd) +{ +        afr_delayed_changelog_post_op (this, NULL, fd, NULL); +} + + +        int +afr_transaction_resume (call_frame_t *frame, xlator_t *this) +{ +        afr_internal_lock_t *int_lock = NULL; +        afr_local_t         *local    = NULL; +        afr_private_t       *priv     = NULL; + +        local    = frame->local; +        int_lock = &local->internal_lock; +        priv     = this->private; + +        if (local->transaction.eager_lock_on) { +                /* We don't need to retain "local" in the +                   fd list anymore, writes to all subvols +                   are finished by now */ +                LOCK (&local->fd->lock); +                { +                        list_del_init (&local->transaction.eager_locked); +                } +                UNLOCK (&local->fd->lock); +        } + +        afr_restore_lk_owner (frame); + +        if (__fop_changelog_needed (frame, this)) { +                afr_changelog_post_op (frame, this); +        } else { +   
             if (afr_lock_server_count (priv, local->transaction.type) == 0) { +                        local->transaction.done (frame, this); +                } else { +                        int_lock->lock_cbk = local->transaction.done; +                        afr_unlock (frame, this); +                } +        } + +        return 0; +} + + +/** + * afr_transaction_fop_failed - inform that an fop failed + */ + +void +afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this, +                            int child_index) +{ +        afr_local_t *   local = NULL; +        afr_private_t * priv  = NULL; + +        local = frame->local; +        priv  = this->private; + +        __mark_child_dead (local->pending, priv->child_count, +                        child_index, local->transaction.type); +} + + + +        static gf_boolean_t +afr_locals_overlap (afr_local_t *local1, afr_local_t *local2) +{ +        uint64_t start1 = local1->transaction.start; +        uint64_t start2 = local2->transaction.start; +        uint64_t end1 = 0; +        uint64_t end2 = 0; + +        if (local1->transaction.len) +                end1 = start1 + local1->transaction.len - 1; +        else +                end1 = ULLONG_MAX; + +        if (local2->transaction.len) +                end2 = start2 + local2->transaction.len - 1; +        else +                end2 = ULLONG_MAX; + +        return ((end1 >= start2) && (end2 >= start1)); +} + +void +afr_transaction_eager_lock_init (afr_local_t *local, xlator_t *this) +{ +        afr_private_t *priv = NULL; +        afr_fd_ctx_t  *fdctx = NULL; +        afr_local_t   *each = NULL; + +        priv = this->private; + +        if (!local->fd) +                return; + +        if (local->transaction.type != AFR_DATA_TRANSACTION) +                return; + +        if (!priv->eager_lock) +                return; + +        fdctx = afr_fd_ctx_get (local->fd, this); +        if (!fdctx) +                return; + +        if 
(afr_are_multiple_fds_opened (local->fd->inode, this)) +                return; +        /* +         * Once full file lock is acquired in eager-lock phase, overlapping +         * writes do not compete for inode-locks, instead are transferred to the +         * next writes. Because of this overlapping writes are not ordered. +         * This can cause inconsistencies in replication. +         * Example: +         * Two overlapping writes w1, w2 are sent in parallel on same fd +         * in two threads t1, t2. +         * Both threads can execute afr_writev_wind in the following manner. +         * t1 winds w1 on brick-0 +         * t2 winds w2 on brick-0 +         * t2 winds w2 on brick-1 +         * t1 winds w1 on brick-1 +         * +         * This check makes sure the locks are not transferred for +         * overlapping writes. +         */ +        LOCK (&local->fd->lock); +        { +                list_for_each_entry (each, &fdctx->eager_locked, +                                     transaction.eager_locked) { +                        if (afr_locals_overlap (each, local)) { +                                local->transaction.eager_lock_on = _gf_false; +                                goto unlock; +                        } +                } + +                local->transaction.eager_lock_on = _gf_true; +                list_add_tail (&local->transaction.eager_locked, +                               &fdctx->eager_locked); +        } +unlock: +        UNLOCK (&local->fd->lock); +} + + +int +afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type) +{ +        afr_local_t *   local = NULL; +        afr_private_t * priv  = NULL; +        fd_t            *fd   = NULL; +        int             ret   = -1; + +        local = frame->local; +        priv  = this->private; + +        local->transaction.resume = afr_transaction_resume; +        local->transaction.type   = type; + +        ret = afr_transaction_local_init (local, this); +    
    if (ret < 0) +            goto out; + +        afr_transaction_eager_lock_init (local, this); + +        if (local->fd && local->transaction.eager_lock_on) +                afr_set_lk_owner (frame, this, local->fd); +        else +                afr_set_lk_owner (frame, this, frame->root); + +        if (!local->transaction.eager_lock_on && local->loc.inode) { +                fd = fd_lookup (local->loc.inode, frame->root->pid); +                if (fd == NULL) +                        fd = fd_lookup_anonymous (local->loc.inode); + +                if (fd) { +                        afr_delayed_changelog_wake_up (this, fd); +                        fd_unref (fd); +                } +        } + +        if (afr_lock_server_count (priv, local->transaction.type) == 0) { +                afr_internal_lock_finish (frame, this); +        } else { +                afr_lock (frame, this); +        } +        ret = 0; +out: +        return ret; +} diff --git a/xlators/cluster/afr-v1/src/afr-transaction.h b/xlators/cluster/afr-v1/src/afr-transaction.h new file mode 100644 index 000000000..fa626fd0d --- /dev/null +++ b/xlators/cluster/afr-v1/src/afr-transaction.h @@ -0,0 +1,51 @@ +/* +  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> +  This file is part of GlusterFS. + +  This file is licensed to you under your choice of the GNU Lesser +  General Public License, version 3 or any later version (LGPLv3 or +  later), or the GNU General Public License, version 2 (GPLv2), in all +  cases as published by the Free Software Foundation. 
+*/ + +#ifndef __TRANSACTION_H__ +#define __TRANSACTION_H__ + +typedef enum { +        LOCAL_FIRST = 1, +        LOCAL_LAST = 2 +} afr_xattrop_type_t; + +void +afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this, +			    int child_index); + +int +afr_lock_server_count (afr_private_t *priv, afr_transaction_type type); + +afr_inodelk_t* +afr_get_inodelk (afr_internal_lock_t *int_lock, char *dom); + +int32_t +afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type); + +afr_fd_ctx_t * +afr_fd_ctx_get (fd_t *fd, xlator_t *this); +int +afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending, +                      int child, afr_xattrop_type_t op); +void +afr_set_delayed_post_op (call_frame_t *frame, xlator_t *this); + +void +afr_delayed_changelog_wake_up (xlator_t *this, fd_t *fd); + +void +__mark_all_success (int32_t *pending[], int child_count, +                    afr_transaction_type type); +gf_boolean_t +afr_any_fops_failed (afr_local_t *local, afr_private_t *priv); + +gf_boolean_t +afr_txn_nothing_failed (call_frame_t *frame, xlator_t *this); +#endif /* __TRANSACTION_H__ */ diff --git a/xlators/cluster/afr-v1/src/afr.c b/xlators/cluster/afr-v1/src/afr.c new file mode 100644 index 000000000..c26453807 --- /dev/null +++ b/xlators/cluster/afr-v1/src/afr.c @@ -0,0 +1,793 @@ +/* +  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> +  This file is part of GlusterFS. + +  This file is licensed to you under your choice of the GNU Lesser +  General Public License, version 3 or any later version (LGPLv3 or +  later), or the GNU General Public License, version 2 (GPLv2), in all +  cases as published by the Free Software Foundation. 
+*/ + +#include <libgen.h> +#include <unistd.h> +#include <fnmatch.h> +#include <sys/time.h> +#include <stdlib.h> +#include <signal.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif +#include "afr-common.c" + +#define SHD_INODE_LRU_LIMIT          2048 +#define AFR_EH_HEALED_LIMIT          1024 +#define AFR_EH_HEAL_FAIL_LIMIT       1024 +#define AFR_EH_SPLIT_BRAIN_LIMIT     1024 + +struct volume_options options[]; + +int32_t +notify (xlator_t *this, int32_t event, +        void *data, ...) +{ +        int ret = -1; +        va_list         ap; +        void *data2 = NULL; + +        va_start (ap, data); +        data2 = va_arg (ap, dict_t*); +        va_end (ap); +        ret = afr_notify (this, event, data, data2); + +        return ret; +} + +int32_t +mem_acct_init (xlator_t *this) +{ +        int     ret = -1; + +        if (!this) +                return ret; + +        ret = xlator_mem_acct_init (this, gf_afr_mt_end + 1); + +        if (ret != 0) { +                gf_log(this->name, GF_LOG_ERROR, "Memory accounting init" +                       "failed"); +                return ret; +        } + +        return ret; +} + + +int +xlator_subvolume_index (xlator_t *this, xlator_t *subvol) +{ +        int index = -1; +        int i = 0; +        xlator_list_t *list = NULL; + +        list = this->children; + +        while (list) { +                if (subvol == list->xlator || +                    strcmp (subvol->name, list->xlator->name) == 0) { +                        index = i; +                        break; +                } +                list = list->next; +                i++; +        } + +        return index; +} + +void +fix_quorum_options (xlator_t *this, afr_private_t *priv, char *qtype) +{ +        if (priv->quorum_count && strcmp(qtype,"fixed")) { +                gf_log(this->name,GF_LOG_WARNING, +                       "quorum-type %s overriding quorum-count %u", +                       qtype, priv->quorum_count); +     
   } +        if (!strcmp(qtype,"none")) { +                priv->quorum_count = 0; +        } +        else if (!strcmp(qtype,"auto")) { +                priv->quorum_count = AFR_QUORUM_AUTO; +        } +} + +int +reconfigure (xlator_t *this, dict_t *options) +{ +        afr_private_t *priv        = NULL; +        xlator_t      *read_subvol = NULL; +        int            read_subvol_index = -1; +        int            ret         = -1; +        int            index       = -1; +        char          *qtype       = NULL; + +        priv = this->private; + +        GF_OPTION_RECONF ("background-self-heal-count", +                          priv->background_self_heal_count, options, uint32, +                          out); + +        GF_OPTION_RECONF ("metadata-self-heal", +                          priv->metadata_self_heal, options, bool, out); + +        GF_OPTION_RECONF ("data-self-heal", priv->data_self_heal, options, str, +                          out); + +        GF_OPTION_RECONF ("entry-self-heal", priv->entry_self_heal, options, +                          bool, out); + +        GF_OPTION_RECONF ("strict-readdir", priv->strict_readdir, options, bool, +                          out); + +        GF_OPTION_RECONF ("data-self-heal-window-size", +                          priv->data_self_heal_window_size, options, +                          uint32, out); + +        GF_OPTION_RECONF ("data-change-log", priv->data_change_log, options, +                          bool, out); + +        GF_OPTION_RECONF ("metadata-change-log", +                          priv->metadata_change_log, options, bool, out); + +        GF_OPTION_RECONF ("entry-change-log", priv->entry_change_log, options, +                          bool, out); + +        GF_OPTION_RECONF ("data-self-heal-algorithm", +                          priv->data_self_heal_algorithm, options, str, out); + +        GF_OPTION_RECONF ("self-heal-daemon", priv->shd.enabled, options, bool, out); + +        GF_OPTION_RECONF 
("read-subvolume", read_subvol, options, xlator, out); + +        GF_OPTION_RECONF ("read-hash-mode", priv->hash_mode, +                          options, uint32, out); + +        if (read_subvol) { +                index = xlator_subvolume_index (this, read_subvol); +                if (index == -1) { +                        gf_log (this->name, GF_LOG_ERROR, "%s not a subvolume", +                                read_subvol->name); +                        goto out; +                } +                priv->read_child = index; +        } + +        GF_OPTION_RECONF ("read-subvolume-index",read_subvol_index, options,int32,out); + +        if (read_subvol_index >-1) { +                index=read_subvol_index; +                if (index >= priv->child_count) { +                        gf_log (this->name, GF_LOG_ERROR, "%d not a subvolume-index", +                                index); +                        goto out; +                } +                priv->read_child = index; +        } + +        GF_OPTION_RECONF ("eager-lock", priv->eager_lock, options, bool, out); +        GF_OPTION_RECONF ("quorum-type", qtype, options, str, out); +        GF_OPTION_RECONF ("quorum-count", priv->quorum_count, options, +                          uint32, out); +        fix_quorum_options(this,priv,qtype); +        GF_OPTION_RECONF ("heal-timeout", priv->shd.timeout, options, +                          int32, out); + +	GF_OPTION_RECONF ("post-op-delay-secs", priv->post_op_delay_secs, options, +			  uint32, out); + +        GF_OPTION_RECONF (AFR_SH_READDIR_SIZE_KEY, priv->sh_readdir_size, +                          options, size, out); +        /* Reset this so we re-discover in case the topology changed.  
*/ +        GF_OPTION_RECONF ("readdir-failover", priv->readdir_failover, options, +                          bool, out); +        GF_OPTION_RECONF ("ensure-durability", priv->ensure_durability, options, +                          bool, out); +        priv->did_discovery = _gf_false; + +        ret = 0; +out: +        return ret; + +} + + +static const char *favorite_child_warning_str = "You have specified subvolume '%s' " +        "as the 'favorite child'. This means that if a discrepancy in the content " +        "or attributes (ownership, permission, etc.) of a file is detected among " +        "the subvolumes, the file on '%s' will be considered the definitive " +        "version and its contents will OVERWRITE the contents of the file on other " +        "subvolumes. All versions of the file except that on '%s' " +        "WILL BE LOST."; + + +int32_t +init (xlator_t *this) +{ +        afr_private_t *priv        = NULL; +        int            child_count = 0; +        xlator_list_t *trav        = NULL; +        int            i           = 0; +        int            ret         = -1; +        GF_UNUSED int  op_errno    = 0; +        xlator_t      *read_subvol = NULL; +        int            read_subvol_index = -1; +        xlator_t      *fav_child   = NULL; +        char          *qtype       = NULL; + +        if (!this->children) { +                gf_log (this->name, GF_LOG_ERROR, +                        "replicate translator needs more than one " +                        "subvolume defined."); +                return -1; +        } + +        if (!this->parents) { +                gf_log (this->name, GF_LOG_WARNING, +                        "Volume is dangling."); +        } + +	this->private = GF_CALLOC (1, sizeof (afr_private_t), +                                   gf_afr_mt_afr_private_t); +        if (!this->private) +                goto out; + +        priv = this->private; +        LOCK_INIT (&priv->lock); +        LOCK_INIT 
(&priv->read_child_lock); +        //lock recovery is not done in afr +        pthread_mutex_init (&priv->mutex, NULL); +        INIT_LIST_HEAD (&priv->saved_fds); + +        child_count = xlator_subvolume_count (this); + +        priv->child_count = child_count; + +        priv->read_child = -1; + +        GF_OPTION_INIT ("read-subvolume", read_subvol, xlator, out); +        if (read_subvol) { +                priv->read_child = xlator_subvolume_index (this, read_subvol); +                if (priv->read_child == -1) { +                        gf_log (this->name, GF_LOG_ERROR, "%s not a subvolume", +                                read_subvol->name); +                        goto out; +                } +        } +        GF_OPTION_INIT ("read-subvolume-index",read_subvol_index,int32,out); +        if (read_subvol_index > -1) { +                if (read_subvol_index >= priv->child_count) { +                        gf_log (this->name, GF_LOG_ERROR, "%d not a subvolume-index", +                                read_subvol_index); +                        goto out; +                } +                priv->read_child = read_subvol_index; +        } +        GF_OPTION_INIT ("choose-local", priv->choose_local, bool, out); + +        GF_OPTION_INIT ("read-hash-mode", priv->hash_mode, uint32, out); + +        priv->favorite_child = -1; +        GF_OPTION_INIT ("favorite-child", fav_child, xlator, out); +        if (fav_child) { +                priv->favorite_child = xlator_subvolume_index (this, fav_child); +                if (priv->favorite_child == -1) { +                        gf_log (this->name, GF_LOG_ERROR, "%s not a subvolume", +                                fav_child->name); +                        goto out; +                } +                gf_log (this->name, GF_LOG_WARNING, +                        favorite_child_warning_str, fav_child->name, +                        fav_child->name, fav_child->name); +        } + + +        GF_OPTION_INIT 
("background-self-heal-count", +                        priv->background_self_heal_count, uint32, out); + +        GF_OPTION_INIT ("data-self-heal", priv->data_self_heal, str, out); + +        GF_OPTION_INIT ("data-self-heal-algorithm", +                        priv->data_self_heal_algorithm, str, out); + +        GF_OPTION_INIT ("data-self-heal-window-size", +                        priv->data_self_heal_window_size, uint32, out); + +        GF_OPTION_INIT ("metadata-self-heal", priv->metadata_self_heal, bool, +                        out); + +        GF_OPTION_INIT ("entry-self-heal", priv->entry_self_heal, bool, out); + +        GF_OPTION_INIT ("self-heal-daemon", priv->shd.enabled, bool, out); + +        GF_OPTION_INIT ("iam-self-heal-daemon", priv->shd.iamshd, bool, out); + +        GF_OPTION_INIT ("data-change-log", priv->data_change_log, bool, out); + +        GF_OPTION_INIT ("metadata-change-log", priv->metadata_change_log, bool, +                        out); + +        GF_OPTION_INIT ("entry-change-log", priv->entry_change_log, bool, out); + +        GF_OPTION_INIT ("optimistic-change-log", priv->optimistic_change_log, +                        bool, out); + +        GF_OPTION_INIT ("inodelk-trace", priv->inodelk_trace, bool, out); + +        GF_OPTION_INIT ("entrylk-trace", priv->entrylk_trace, bool, out); + +        GF_OPTION_INIT ("strict-readdir", priv->strict_readdir, bool, out); + +        GF_OPTION_INIT ("eager-lock", priv->eager_lock, bool, out); +        GF_OPTION_INIT ("quorum-type", qtype, str, out); +        GF_OPTION_INIT ("quorum-count", priv->quorum_count, uint32, out); +        GF_OPTION_INIT (AFR_SH_READDIR_SIZE_KEY, priv->sh_readdir_size, size, +                        out); +        fix_quorum_options(this,priv,qtype); + +	GF_OPTION_INIT ("post-op-delay-secs", priv->post_op_delay_secs, uint32, out); +        GF_OPTION_INIT ("readdir-failover", priv->readdir_failover, bool, out); +        GF_OPTION_INIT ("ensure-durability", 
priv->ensure_durability, bool, +                        out); + +        priv->wait_count = 1; + +        priv->child_up = GF_CALLOC (sizeof (unsigned char), child_count, +                                    gf_afr_mt_char); +        if (!priv->child_up) { +                ret = -ENOMEM; +                goto out; +        } + +        for (i = 0; i < child_count; i++) +                priv->child_up[i] = -1; /* start with unknown state. +                                           this initialization needed +                                           for afr_notify() to work +                                           reliably +                                        */ + +        priv->children = GF_CALLOC (sizeof (xlator_t *), child_count, +                                    gf_afr_mt_xlator_t); +        if (!priv->children) { +                ret = -ENOMEM; +                goto out; +        } + +        priv->pending_key = GF_CALLOC (sizeof (*priv->pending_key), +                                       child_count, +                                       gf_afr_mt_char); +        if (!priv->pending_key) { +                ret = -ENOMEM; +                goto out; +        } + +        trav = this->children; +        i = 0; +        while (i < child_count) { +                priv->children[i] = trav->xlator; + +                ret = gf_asprintf (&priv->pending_key[i], "%s.%s", +                                   AFR_XATTR_PREFIX, +                                   trav->xlator->name); +                if (-1 == ret) { +                        ret = -ENOMEM; +                        goto out; +                } + +                trav = trav->next; +                i++; +        } + +        ret = gf_asprintf (&priv->sh_domain, AFR_SH_DATA_DOMAIN_FMT, +                           this->name); +        if (-1 == ret) { +                ret = -ENOMEM; +                goto out; +        } + +        priv->last_event = GF_CALLOC (child_count, sizeof 
(*priv->last_event), +                                      gf_afr_mt_int32_t); +        if (!priv->last_event) { +                ret = -ENOMEM; +                goto out; +        } + +        /* keep more local here as we may need them for self-heal etc */ +        this->local_pool = mem_pool_new (afr_local_t, 512); +        if (!this->local_pool) { +                ret = -1; +                gf_log (this->name, GF_LOG_ERROR, +                        "failed to create local_t's memory pool"); +                goto out; +        } + +        priv->first_lookup = 1; +        priv->root_inode = NULL; + +        if (!priv->shd.iamshd) { +                ret = 0; +                goto out; +        } + +        ret = -ENOMEM; +        priv->shd.pos = GF_CALLOC (sizeof (*priv->shd.pos), child_count, +                                   gf_afr_mt_brick_pos_t); +        if (!priv->shd.pos) +                goto out; + +        priv->shd.pending = GF_CALLOC (sizeof (*priv->shd.pending), child_count, +                                       gf_afr_mt_int32_t); +        if (!priv->shd.pending) +                goto out; + +        priv->shd.inprogress = GF_CALLOC (sizeof (*priv->shd.inprogress), +                                          child_count, gf_afr_mt_shd_bool_t); +        if (!priv->shd.inprogress) +                goto out; +        priv->shd.timer = GF_CALLOC (sizeof (*priv->shd.timer), child_count, +                                     gf_afr_mt_shd_timer_t); +        if (!priv->shd.timer) +                goto out; + +        priv->shd.healed = eh_new (AFR_EH_HEALED_LIMIT, _gf_false, +                                   _destroy_shd_event_data); +        if (!priv->shd.healed) +                goto out; + +        priv->shd.heal_failed = eh_new (AFR_EH_HEAL_FAIL_LIMIT, _gf_false, +                                        _destroy_shd_event_data); +        if (!priv->shd.heal_failed) +                goto out; + +        priv->shd.split_brain = eh_new 
(AFR_EH_SPLIT_BRAIN_LIMIT, _gf_false, +                                        _destroy_shd_event_data); +        if (!priv->shd.split_brain) +                goto out; + +        this->itable = inode_table_new (SHD_INODE_LRU_LIMIT, this); +        if (!this->itable) +                goto out; +        priv->root_inode = inode_ref (this->itable->root); +        GF_OPTION_INIT ("node-uuid", priv->shd.node_uuid, str, out); +        GF_OPTION_INIT ("heal-timeout", priv->shd.timeout, int32, out); +        ret = afr_initialise_statistics (this); +        if (ret) +                goto out; +        ret = 0; +out: +        return ret; +} + + +int +fini (xlator_t *this) +{ +        afr_private_t *priv = NULL; + +        priv = this->private; +        this->private = NULL; +        afr_priv_destroy (priv); +        if (this->itable);//I dont see any destroy func + +        return 0; +} + + +struct xlator_fops fops = { +        .lookup      = afr_lookup, +        .open        = afr_open, +        .lk          = afr_lk, +        .flush       = afr_flush, +        .statfs      = afr_statfs, +        .fsync       = afr_fsync, +        .fsyncdir    = afr_fsyncdir, +        .xattrop     = afr_xattrop, +        .fxattrop    = afr_fxattrop, +        .inodelk     = afr_inodelk, +        .finodelk    = afr_finodelk, +        .entrylk     = afr_entrylk, +        .fentrylk    = afr_fentrylk, +	.fallocate   = afr_fallocate, +	.discard     = afr_discard, +        .zerofill    = afr_zerofill, + +        /* inode read */ +        .access      = afr_access, +        .stat        = afr_stat, +        .fstat       = afr_fstat, +        .readlink    = afr_readlink, +        .getxattr    = afr_getxattr, +        .fgetxattr   = afr_fgetxattr, +        .readv       = afr_readv, + +        /* inode write */ +        .writev      = afr_writev, +        .truncate    = afr_truncate, +        .ftruncate   = afr_ftruncate, +        .setxattr    = afr_setxattr, +        .fsetxattr   = afr_fsetxattr, +  
      .setattr     = afr_setattr, +        .fsetattr    = afr_fsetattr, +        .removexattr = afr_removexattr, +        .fremovexattr = afr_fremovexattr, + +        /* dir read */ +        .opendir     = afr_opendir, +        .readdir     = afr_readdir, +        .readdirp    = afr_readdirp, + +        /* dir write */ +        .create      = afr_create, +        .mknod       = afr_mknod, +        .mkdir       = afr_mkdir, +        .unlink      = afr_unlink, +        .rmdir       = afr_rmdir, +        .link        = afr_link, +        .symlink     = afr_symlink, +        .rename      = afr_rename, +}; + + +struct xlator_dumpops dumpops = { +        .priv       = afr_priv_dump, +}; + + +struct xlator_cbks cbks = { +        .release     = afr_release, +        .releasedir  = afr_releasedir, +        .forget      = afr_forget, +}; + + +struct volume_options options[] = { +        { .key  = {"read-subvolume" }, +          .type = GF_OPTION_TYPE_XLATOR, +          .description = "inode-read fops happen only on one of the bricks in " +                         "replicate. Afr will prefer the one specified using " +                         "this option if it is not stale. Option value must be " +                         "one of the xlator names of the children. " +                         "Ex: <volname>-client-0 till " +                         "<volname>-client-<number-of-bricks - 1>" +        }, +        { .key  = {"read-subvolume-index" }, +          .type = GF_OPTION_TYPE_INT, +          .default_value = "-1", +          .description = "inode-read fops happen only on one of the bricks in " +                         "replicate. AFR will prefer the one specified using " +                         "this option if it is not stale. 
allowed options" +                         " include -1 till replica-count - 1" +        }, +        { .key = {"read-hash-mode" }, +          .type = GF_OPTION_TYPE_INT, +          .min = 0, +          .max = 2, +          .default_value = "0", +          .description = "inode-read fops happen only on one of the bricks in " +                         "replicate. AFR will prefer the one computed using " +                         "the method specified using this option" +                         "0 = first responder, " +                         "1 = hash by GFID of file (all clients use " +                                                    "same subvolume), " +                         "2 = hash by GFID of file and client PID", +        }, +        { .key  = {"choose-local" }, +          .type = GF_OPTION_TYPE_BOOL, +          .default_value = "true", +          .description = "Choose a local subvolume (i.e. Brick) to read from" +                         " if read-subvolume is not explicitly set.", +        }, +        { .key  = {"favorite-child"}, +          .type = GF_OPTION_TYPE_XLATOR, +          .description = "If a split-brain happens choose subvol/brick set by " +                         "this option as source." +        }, +        { .key  = {"background-self-heal-count"}, +          .type = GF_OPTION_TYPE_INT, +          .min  = 0, +          .default_value = "16", +          .validate = GF_OPT_VALIDATE_MIN, +          .description = "This specifies the number of self-heals that can be " +                         " performed in background without blocking the fop" +        }, +        { .key  = {"data-self-heal"}, +          .type = GF_OPTION_TYPE_STR, +          .value = {"1", "on", "yes", "true", "enable", +                    "0", "off", "no", "false", "disable", +                    "open"}, +          .default_value = "on", +          .description   = "Using this option we can enable/disable data " +                           "self-heal on the file. 
\"open\" means data " +                           "self-heal action will only be triggered by file " +                           "open operations." +        }, +        { .key  = {"data-self-heal-algorithm"}, +          .type = GF_OPTION_TYPE_STR, +          .description   = "Select between \"full\", \"diff\". The " +                           "\"full\" algorithm copies the entire file from " +                           "source to sink. The \"diff\" algorithm copies to " +                           "sink only those blocks whose checksums don't match " +                           "with those of source. If no option is configured " +                           "the option is chosen dynamically as follows: " +                           "If the file does not exist on one of the sinks " +                           "or empty file exists or if the source file size is " +                           "about the same as page size the entire file will " +                           "be read and written i.e \"full\" algo, " +                           "otherwise \"diff\" algo is chosen.", +          .value = { "diff", "full"} +        }, +        { .key  = {"data-self-heal-window-size"}, +          .type = GF_OPTION_TYPE_INT, +          .min  = 1, +          .max  = 1024, +          .default_value = "1", +          .description = "Maximum number blocks per file for which self-heal " +                         "process would be applied simultaneously." +        }, +        { .key  = {"metadata-self-heal"}, +          .type = GF_OPTION_TYPE_BOOL, +          .default_value = "on", +          .description = "Using this option we can enable/disable metadata " +                         "i.e. Permissions, ownerships, xattrs self-heal on " +                         "the file/directory." 
+        }, +        { .key  = {"entry-self-heal"}, +          .type = GF_OPTION_TYPE_BOOL, +          .default_value = "on", +          .description = "Using this option we can enable/disable entry " +                         "self-heal on the directory." +        }, +        { .key  = {"data-change-log"}, +          .type = GF_OPTION_TYPE_BOOL, +          .default_value = "on", +          .description = "Data fops like write/truncate will not perform " +                         "pre/post fop changelog operations in afr transaction " +                         "if this option is disabled" +        }, +        { .key  = {"metadata-change-log"}, +          .type = GF_OPTION_TYPE_BOOL, +          .default_value = "on", +          .description = "Metadata fops like setattr/setxattr will not perform " +                         "pre/post fop changelog operations in afr transaction " +                         "if this option is disabled" +        }, +        { .key  = {"entry-change-log"}, +          .type = GF_OPTION_TYPE_BOOL, +          .default_value = "on", +          .description = "Entry fops like create/unlink will not perform " +                         "pre/post fop changelog operations in afr transaction " +                         "if this option is disabled" +        }, +        { .key  = {"optimistic-change-log"}, +          .type = GF_OPTION_TYPE_BOOL, +          .default_value = "on", +          .description = "Entry/Metadata fops will not perform " +                         "pre fop changelog operations in afr transaction " +                         "if this option is enabled." 
+        }, +        { .key  = {"strict-readdir"}, +          .type = GF_OPTION_TYPE_BOOL, +          .default_value = "off", +        }, +        { .key = {"inodelk-trace"}, +          .type = GF_OPTION_TYPE_BOOL, +          .default_value = "off", +          .description = "Enabling this option logs inode lock/unlocks" +        }, +        { .key = {"entrylk-trace"}, +          .type = GF_OPTION_TYPE_BOOL, +          .default_value = "off", +          .description = "Enabling this option logs entry lock/unlocks" +        }, +        { .key = {"eager-lock"}, +          .type = GF_OPTION_TYPE_BOOL, +          .default_value = "on", +          .description = "Lock phase of a transaction has two sub-phases. " +                         "First is an attempt to acquire locks in parallel by " +                         "broadcasting non-blocking lock requests. If lock " +                         "acquisition fails on any server, then the held locks " +                         "are unlocked and revert to a blocking locked mode " +                         "sequentially on one server after another.  If this " +                         "option is enabled the initial broadcasting lock " +                         "request attempt to acquire lock on the entire file. " +                         "If this fails, we revert back to the sequential " +                         "\"regional\" blocking lock as before. In the case " +                         "where such an \"eager\" lock is granted in the " +                         "non-blocking phase, it gives rise to an opportunity " +                         "for optimization. i.e, if the next write transaction " +                         "on the same FD arrives before the unlock phase of " +                         "the first transaction, it \"takes over\" the full " +                         "file lock. 
Similarly if yet another data transaction " +                         "arrives before the unlock phase of the \"optimized\" " +                         "transaction, that in turn \"takes over\" the lock as " +                         "well. The actual unlock now happens at the end of " +                         "the last \"optimized\" transaction." + +        }, +        { .key = {"self-heal-daemon"}, +          .type = GF_OPTION_TYPE_BOOL, +          .default_value = "on", +          .description = "This option applies to only self-heal-daemon. " +                         "Index directory crawl and automatic healing of files " +                         "will not be performed if this option is turned off." +        }, +        { .key = {"iam-self-heal-daemon"}, +          .type = GF_OPTION_TYPE_BOOL, +          .default_value = "off", +          .description = "This option differentiates if the replicate " +                         "translator is running as part of self-heal-daemon " +                         "or not." +        }, +        { .key = {"quorum-type"}, +          .type = GF_OPTION_TYPE_STR, +          .value = { "none", "auto", "fixed"}, +          .default_value = "none", +          .description = "If value is \"fixed\" only allow writes if " +                         "quorum-count bricks are present.  If value is " +                         "\"auto\" only allow writes if more than half of " +                         "bricks, or exactly half including the first, are " +                         "present.", +        }, +        { .key = {"quorum-count"}, +          .type = GF_OPTION_TYPE_INT, +          .min = 1, +          .max = INT_MAX, +          .default_value = 0, +          .description = "If quorum-type is \"fixed\" only allow writes if " +                         "this many bricks or present.  
Other quorum types " +                         "will OVERWRITE this value.", +        }, +        { .key  = {"node-uuid"}, +          .type = GF_OPTION_TYPE_STR, +          .description = "Local glusterd uuid string, used in starting " +                         "self-heal-daemon so that it can crawl only on " +                         "local index directories.", +        }, +        { .key  = {"heal-timeout"}, +          .type = GF_OPTION_TYPE_INT, +          .min  = 60, +          .max  = INT_MAX, +          .default_value = "600", +          .description = "time interval for checking the need to self-heal " +                         "in self-heal-daemon" +        }, +        { .key  = {"post-op-delay-secs"}, +          .type = GF_OPTION_TYPE_INT, +          .min  = 0, +          .max  = INT_MAX, +          .default_value = "1", +          .description = "Time interval induced artificially before " +	                 "post-operation phase of the transaction to " +                         "enhance overlap of adjacent write operations.", +        }, +        { .key = {AFR_SH_READDIR_SIZE_KEY}, +          .type = GF_OPTION_TYPE_SIZET, +          .description = "readdirp size for performing entry self-heal", +          .min = 1024, +          .max = 131072, +          .default_value = "1KB", +        }, +        { .key = {"readdir-failover"}, +          .type = GF_OPTION_TYPE_BOOL, +          .description = "readdir(p) will not failover if this option is off", +          .default_value = "on", +        }, +        { .key = {"ensure-durability"}, +          .type = GF_OPTION_TYPE_BOOL, +          .description = "Afr performs fsyncs for transactions if this " +                         "option is on to make sure the changelogs/data is " +                         "written to the disk", +          .default_value = "on", +        }, +        { .key  = {NULL} }, +}; diff --git a/xlators/cluster/afr-v1/src/afr.h b/xlators/cluster/afr-v1/src/afr.h new file mode 100644 index 
000000000..9196a1f27 --- /dev/null +++ b/xlators/cluster/afr-v1/src/afr.h @@ -0,0 +1,1215 @@ +/* +  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> +  This file is part of GlusterFS. + +  This file is licensed to you under your choice of the GNU Lesser +  General Public License, version 3 or any later version (LGPLv3 or +  later), or the GNU General Public License, version 2 (GPLv2), in all +  cases as published by the Free Software Foundation. +*/ + + +#ifndef __AFR_H__ +#define __AFR_H__ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "call-stub.h" +#include "compat-errno.h" +#include "afr-mem-types.h" +#include "afr-self-heal-algorithm.h" + +#include "libxlator.h" +#include "timer.h" + +#define AFR_XATTR_PREFIX "trusted.afr" +#define AFR_PATHINFO_HEADER "REPLICATE:" +#define AFR_SH_READDIR_SIZE_KEY "self-heal-readdir-size" +#define AFR_SH_DATA_DOMAIN_FMT "%s:self-heal" + +#define AFR_LOCKEE_COUNT_MAX    3 +#define AFR_DOM_COUNT_MAX    3 + +#define afr_inode_missing(op_errno) (op_errno == ENOENT || op_errno == ESTALE) + +struct _pump_private; + +typedef int (*afr_expunge_done_cbk_t) (call_frame_t *frame, xlator_t *this, +                                       int child, int32_t op_error, +                                       int32_t op_errno); + +typedef int (*afr_impunge_done_cbk_t) (call_frame_t *frame, xlator_t *this, +                                       int32_t op_error, int32_t op_errno); +typedef int (*afr_post_remove_call_t) (call_frame_t *frame, xlator_t *this); + +typedef int (*afr_lock_cbk_t) (call_frame_t *frame, xlator_t *this); +typedef void (*afr_lookup_done_cbk_t) (call_frame_t *frame, xlator_t *this, +                                      int32_t op_ret, int32_t op_errno); + +typedef enum { +        AFR_POS_UNKNOWN, +        AFR_POS_LOCAL, +        AFR_POS_REMOTE +} afr_child_pos_t; + +typedef enum { +        SPLIT_BRAIN = 1, +        ALL_FOOLS = 2 +} afr_subvol_status_t; + +typedef enum { +      
  AFR_INODE_SET_READ_CTX = 1, +        AFR_INODE_RM_STALE_CHILDREN, +        AFR_INODE_SET_OPENDIR_DONE, +        AFR_INODE_GET_READ_CTX, +        AFR_INODE_GET_OPENDIR_DONE, +} afr_inode_op_t; + +typedef struct afr_inode_params_ { +        afr_inode_op_t op; +        union { +                gf_boolean_t value; +                struct { +                        int32_t read_child; +                        int32_t *children; +                } read_ctx; +        } u; +} afr_inode_params_t; + +typedef enum afr_spb_state { +        DONT_KNOW, +        SPB, +        NO_SPB +} afr_spb_state_t; + +typedef struct afr_inode_ctx_ { +        uint64_t masks; +        int32_t  *fresh_children;//increasing order of latency +        afr_spb_state_t mdata_spb; +        afr_spb_state_t data_spb; +        uint32_t        open_fd_count; +} afr_inode_ctx_t; + +typedef enum { +        NONE, +        INDEX, +        INDEX_TO_BE_HEALED, +        FULL, +} afr_crawl_type_t; + +typedef struct afr_self_heald_ { +        gf_boolean_t            enabled; +        gf_boolean_t            iamshd; +        afr_crawl_type_t        *pending; +        gf_boolean_t            *inprogress; +        afr_child_pos_t         *pos; +        gf_timer_t              **timer; +        eh_t                    *healed; +        eh_t                    *heal_failed; +        eh_t                    *split_brain; +        eh_t                    **statistics; +        void                    **crawl_events; +        char                    *node_uuid; +        int                     timeout; +} afr_self_heald_t; + +typedef struct _afr_private { +        gf_lock_t lock;               /* to guard access to child_count, etc */ +        unsigned int child_count;     /* total number of children   */ + +        unsigned int read_child_rr;   /* round-robin index of the read_child */ +        gf_lock_t read_child_lock;    /* lock to protect above */ + +        xlator_t **children; + +        int first_lookup; +       
 inode_t *root_inode; + +        unsigned char *child_up; + +        char **pending_key; + +        char         *data_self_heal;              /* on/off/open */ +        char *       data_self_heal_algorithm;    /* name of algorithm */ +        unsigned int data_self_heal_window_size;  /* max number of pipelined +                                                     read/writes */ + +        unsigned int background_self_heal_count; +        unsigned int background_self_heals_started; +        gf_boolean_t metadata_self_heal;   /* on/off */ +        gf_boolean_t entry_self_heal;      /* on/off */ + +        gf_boolean_t data_change_log;       /* on/off */ +        gf_boolean_t metadata_change_log;   /* on/off */ +        gf_boolean_t entry_change_log;      /* on/off */ + +        int read_child;               /* read-subvolume */ +        unsigned int hash_mode;       /* for when read_child is not set */ +        int favorite_child;  /* subvolume to be preferred in resolving +                                         split-brain cases */ + +        gf_boolean_t inodelk_trace; +        gf_boolean_t entrylk_trace; + +        gf_boolean_t strict_readdir; + +        unsigned int wait_count;      /* # of servers to wait for success */ + +        uint64_t up_count;      /* number of CHILD_UPs we have seen */ +        uint64_t down_count;    /* number of CHILD_DOWNs we have seen */ + +        struct _pump_private *pump_private; /* Set if we are loaded as pump */ +        int                   use_afr_in_pump; + +        pthread_mutex_t  mutex; +        struct list_head saved_fds;   /* list of fds on which locks have succeeded */ +        gf_boolean_t      optimistic_change_log; +        gf_boolean_t      eager_lock; +	uint32_t          post_op_delay_secs; +        unsigned int      quorum_count; + +        char                   vol_uuid[UUID_SIZE + 1]; +        int32_t                *last_event; +        afr_self_heald_t       shd; +        gf_boolean_t           
choose_local; +        gf_boolean_t           did_discovery; +        gf_boolean_t           readdir_failover; +        uint64_t               sh_readdir_size; +        gf_boolean_t           ensure_durability; +        char                   *sh_domain; +} afr_private_t; + +typedef enum { +        AFR_SELF_HEAL_NOT_ATTEMPTED, +        AFR_SELF_HEAL_STARTED, +        AFR_SELF_HEAL_FAILED, +        AFR_SELF_HEAL_SYNC_BEGIN, +} afr_self_heal_status; + +typedef struct { +        afr_self_heal_status gfid_or_missing_entry_self_heal; +        afr_self_heal_status metadata_self_heal; +        afr_self_heal_status data_self_heal; +        afr_self_heal_status entry_self_heal; +} afr_sh_status_for_all_type; + +typedef enum { +        AFR_SELF_HEAL_ENTRY, +        AFR_SELF_HEAL_METADATA, +        AFR_SELF_HEAL_DATA, +        AFR_SELF_HEAL_GFID_OR_MISSING_ENTRY, +        AFR_SELF_HEAL_INVALID = -1, +} afr_self_heal_type; + +typedef enum { +        AFR_CHECK_ALL, +        AFR_CHECK_SPECIFIC, +} afr_sh_fail_check_type; + +struct afr_self_heal_ { +        /* External interface: These are variables (some optional) that +           are set by whoever has triggered self-heal */ + +        gf_boolean_t do_data_self_heal; +        gf_boolean_t do_metadata_self_heal; +        gf_boolean_t do_entry_self_heal; +        gf_boolean_t do_gfid_self_heal; +        gf_boolean_t do_missing_entry_self_heal; +        gf_boolean_t force_confirm_spb; /* Check for split-brains even when +                                           self-heal is turned off */ + +        gf_boolean_t forced_merge;        /* Is this a self-heal triggered to +                                             forcibly merge the directories? 
*/ + +        gf_boolean_t background;          /* do self-heal in background +                                             if possible */ +        ia_type_t type;                   /* st_mode of the entry we're doing +                                             self-heal on */ +        inode_t   *inode;                 /* inode on which the self-heal is +                                             performed on */ +        uuid_t  sh_gfid_req;                 /* gfid self-heal needs to be done +                                             with this gfid if it is not null */ + +        /* Function to call to unwind. If self-heal is being done in the +           background, this function will be called as soon as possible. */ + +        int (*unwind) (call_frame_t *frame, xlator_t *this, int32_t op_ret, +                       int32_t op_errno, int32_t sh_failed); + +        /* End of external interface members */ + + +        /* array of stat's, one for each child */ +        struct iatt *buf; +        struct iatt *parentbufs; +        struct iatt parentbuf; +        struct iatt entrybuf; + +        afr_expunge_done_cbk_t expunge_done; +        afr_impunge_done_cbk_t impunge_done; + +        /* array of xattr's, one for each child */ +        dict_t **xattr; + +        /* array containing if the lookups succeeded in the order of response +         */ +        int32_t *success_children; +        int     success_count; +        /* array containing the fresh children found in the self-heal process */ +        int32_t *fresh_children; +        /* array containing the fresh children found in the parent lookup */ +        int32_t *fresh_parent_dirs; +        /* array of errno's, one for each child */ +        int *child_errno; +        /*loc used for lookup*/ +        loc_t lookup_loc; +        int32_t lookup_flags; +        afr_lookup_done_cbk_t lookup_done; + +        int32_t **pending_matrix; +        int32_t **delta_matrix; + +        int32_t op_ret; +        
int32_t op_errno; + +        int *sources; +        int source; +        int active_source; +        int active_sinks; +        unsigned char *success; +        unsigned char *locked_nodes; +        int lock_count; + +        const char *linkname; +        gf_boolean_t entries_skipped; + +        gf_boolean_t actual_sh_started; +        gf_boolean_t sync_done; +        gf_boolean_t data_lock_held; +        gf_boolean_t sh_dom_lock_held; +        gf_boolean_t eof_reached; +        fd_t  *healing_fd; +        int   file_has_holes; +        blksize_t block_size; +        off_t file_size; +        off_t offset; +        unsigned char *write_needed; +        uint8_t *checksum; +        afr_post_remove_call_t post_remove_call; + +        char    *data_sh_info; +        char    *metadata_sh_info; + +        loc_t parent_loc; +        call_frame_t *orig_frame; +        call_frame_t *old_loop_frame; +        gf_boolean_t unwound; + +        afr_sh_algo_private_t *private; +        afr_sh_status_for_all_type  afr_all_sh_status; +        afr_self_heal_type       sh_type_in_action; + +        struct afr_sh_algorithm  *algo; +        afr_lock_cbk_t data_lock_success_handler; +        afr_lock_cbk_t data_lock_failure_handler; +	gf_boolean_t data_lock_block; +        int (*completion_cbk) (call_frame_t *frame, xlator_t *this); +        int (*sh_data_algo_start) (call_frame_t *frame, xlator_t *this); +        int (*algo_completion_cbk) (call_frame_t *frame, xlator_t *this); +        int (*algo_abort_cbk) (call_frame_t *frame, xlator_t *this); +        void (*gfid_sh_success_cbk) (call_frame_t *sh_frame, xlator_t *this); + +        call_frame_t *sh_frame; +}; + +typedef struct afr_self_heal_ afr_self_heal_t; + +typedef enum { +        AFR_DATA_TRANSACTION,          /* truncate, write, ... */ +        AFR_METADATA_TRANSACTION,      /* chmod, chown, ... */ +        AFR_ENTRY_TRANSACTION,         /* create, rmdir, ... 
*/ +        AFR_ENTRY_RENAME_TRANSACTION,  /* rename */ +} afr_transaction_type; + +typedef enum { +        AFR_TRANSACTION_LK, +        AFR_SELFHEAL_LK, +} transaction_lk_type_t; + +typedef enum { +        AFR_LOCK_OP, +        AFR_UNLOCK_OP, +} afr_lock_op_type_t; + +typedef enum { +        AFR_DATA_SELF_HEAL_LK, +        AFR_METADATA_SELF_HEAL_LK, +        AFR_ENTRY_SELF_HEAL_LK, +}selfheal_lk_type_t; + +typedef enum { +        AFR_INODELK_TRANSACTION, +        AFR_INODELK_NB_TRANSACTION, +        AFR_ENTRYLK_TRANSACTION, +        AFR_ENTRYLK_NB_TRANSACTION, +        AFR_INODELK_SELFHEAL, +        AFR_INODELK_NB_SELFHEAL, +        AFR_ENTRYLK_SELFHEAL, +        AFR_ENTRYLK_NB_SELFHEAL, +} afr_lock_call_type_t; + +/* +  xattr format: trusted.afr.volume = [x y z] +  x - data pending +  y - metadata pending +  z - entry pending +*/ + +static inline int +afr_index_for_transaction_type (afr_transaction_type type) +{ +        switch (type) { + +        case AFR_DATA_TRANSACTION: +                return 0; + +        case AFR_METADATA_TRANSACTION: +                return 1; + +        case AFR_ENTRY_TRANSACTION: +        case AFR_ENTRY_RENAME_TRANSACTION: +                return 2; +        } + +        return -1;  /* make gcc happy */ +} + +typedef struct { +        loc_t                   loc; +        char                    *basename; +        unsigned char           *locked_nodes; +        int                     locked_count; + +} afr_entry_lockee_t; + +int +afr_entry_lockee_cmp (const void *l1, const void *l2); + +typedef struct { +        char    *domain; /* Domain on which inodelk is taken */ +        struct gf_flock flock; +        unsigned char *locked_nodes; +        int32_t lock_count; +} afr_inodelk_t; + +typedef struct { +        loc_t *lk_loc; + +        int                     lockee_count; +        afr_entry_lockee_t      lockee[AFR_LOCKEE_COUNT_MAX]; + +        afr_inodelk_t       inodelk[AFR_DOM_COUNT_MAX]; +        const char *lk_basename; +        
const char *lower_basename; +        const char *higher_basename; +        char lower_locked; +        char higher_locked; + +        unsigned char *locked_nodes; +        unsigned char *lower_locked_nodes; + +        selfheal_lk_type_t selfheal_lk_type; +        transaction_lk_type_t transaction_lk_type; + +        int32_t lock_count; +        int32_t entrylk_lock_count; + +        uint64_t lock_number; +        int32_t lk_call_count; +        int32_t lk_expected_count; +        int32_t lk_attempted_count; + +        int32_t lock_op_ret; +        int32_t lock_op_errno; +        afr_lock_cbk_t lock_cbk; +        char *domain; /* Domain on which inode/entry lock/unlock in progress.*/ +} afr_internal_lock_t; + +typedef struct _afr_locked_fd { +        fd_t  *fd; +        struct list_head list; +} afr_locked_fd_t; + +struct afr_reply { +	int	valid; +	int32_t	op_ret; +	int32_t	op_errno; +}; + +typedef struct _afr_local { +        int     uid; +        int     gid; +        unsigned int call_count; +        unsigned int success_count; +        unsigned int enoent_count; +        uint32_t     open_fd_count; +        gf_boolean_t update_open_fd_count; + + +        unsigned int unhealable; + +        unsigned int read_child_index; +        unsigned char read_child_returned; +        unsigned int first_up_child; + +	gf_lkowner_t  saved_lk_owner; + +        int32_t op_ret; +        int32_t op_errno; + +        int32_t **pending; + +        loc_t loc; +        loc_t newloc; + +        fd_t *fd; + +        glusterfs_fop_t fop; + +        unsigned char *child_up; +        int32_t       *fresh_children; //in the order of response + +        int32_t *child_errno; + +        dict_t  *xattr_req; + +        int32_t  inodelk_count; +        int32_t  entrylk_count; + +        afr_internal_lock_t internal_lock; + +        afr_locked_fd_t *locked_fd; +        int32_t          source_child; +        int32_t          lock_recovery_child; + +        dict_t  *dict; +        int      
optimistic_change_log; +	gf_boolean_t      delayed_post_op; + + +	/* Is the current writev() going to perform a stable write? +	   i.e, is fd->flags or @flags writev param have O_SYNC or +	   O_DSYNC? +	*/ +        gf_boolean_t      stable_write; + +        /* This write appended to the file. Nnot necessarily O_APPEND, +           just means the offset of write was at the end of file. +        */ +        gf_boolean_t      append_write; + +        int attempt_self_heal; +        int foreground_self_heal; + + +        /* This struct contains the arguments for the "continuation" +           (scheme-like) of fops +        */ + +        int   op; +        struct { +                struct { +                        unsigned char buf_set; +                        struct statvfs buf; +                } statfs; + +                struct { +                        uint32_t parent_entrylk; +                        uuid_t  gfid_req; +                        inode_t *inode; +                        struct iatt buf; +                        struct iatt postparent; +                        dict_t **xattrs; +                        dict_t *xattr; +                        struct iatt *postparents; +                        struct iatt *bufs; +                        int32_t read_child; +                        int32_t *sources; +                        int32_t *success_children; +                        int32_t **pending_matrix; +                        gf_boolean_t fresh_lookup; +                        gf_boolean_t possible_spb; +                } lookup; + +                struct { +                        int32_t flags; +                } open; + +                struct { +                        int32_t cmd; +                        struct gf_flock user_flock; +                        struct gf_flock ret_flock; +                        unsigned char *locked_nodes; +                } lk; + +                /* inode read */ + +                struct { +                        
int32_t mask; +                        int last_index;  /* index of the child we tried previously */ +                } access; + +                struct { +                        int last_index; +                } stat; + +                struct { +                        int last_index; +                } fstat; + +                struct { +                        size_t size; +                        int last_index; +                } readlink; + +                struct { +                        char *name; +                        int last_index; +                        long xattr_len; +                } getxattr; + +                struct { +                        size_t size; +                        off_t offset; +                        int last_index; +                        uint32_t flags; +                } readv; + +                /* dir read */ + +                struct { +                        int success_count; +                        int32_t op_ret; +                        int32_t op_errno; + +                        uint32_t *checksum; +                } opendir; + +                struct { +                        int32_t op_ret; +                        int32_t op_errno; +                        size_t size; +                        off_t offset; +                        dict_t *dict; +                        gf_boolean_t failed; +                        int last_index; +                } readdir; +                /* inode write */ + +                struct { +                        struct iatt prebuf; +                        struct iatt postbuf; +                } inode_wfop; //common structure for all inode-write-fops + +                struct { +                        int32_t op_ret; + +                        struct iovec *vector; +                        struct iobref *iobref; +                        int32_t count; +                        off_t offset; +                        uint32_t flags; +                } writev; + +     
           struct { +                        off_t offset; +                } truncate; + +                struct { +                        off_t offset; +                } ftruncate; + +                struct { +                        struct iatt in_buf; +                        int32_t valid; +                } setattr; + +                struct { +                        struct iatt in_buf; +                        int32_t valid; +                } fsetattr; + +                struct { +                        dict_t *dict; +                        int32_t flags; +                } setxattr; + +                struct { +                        dict_t *dict; +                        int32_t flags; +                } fsetxattr; + +                struct { +                        char *name; +                } removexattr; + +                struct { +                        dict_t *xattr; +                } xattrop; + +                struct { +                        dict_t *xattr; +                } fxattrop; + +                /* dir write */ + +                struct { +                        inode_t *inode; +                        struct iatt buf; +                        struct iatt preparent; +                        struct iatt postparent; +                        struct iatt prenewparent; +                        struct iatt postnewparent; +                } dir_fop; //common structure for all dir fops + +                struct { +                        fd_t *fd; +                        dict_t *params; +                        int32_t flags; +                        mode_t mode; +                } create; + +                struct { +                        dev_t dev; +                        mode_t mode; +                        dict_t *params; +                } mknod; + +                struct { +                        int32_t mode; +                        dict_t *params; +                } mkdir; + +                struct { +                  
      int flags; +                } rmdir; + +                struct { +                        dict_t *params; +                        char *linkpath; +                } symlink; + +		struct { +			int32_t mode; +			off_t offset; +			size_t len; +		} fallocate; + +		struct { +			off_t offset; +			size_t len; +		} discard; + +                struct { +                        off_t offset; +                        off_t len; +                        struct iatt prebuf; +                        struct iatt postbuf; +                } zerofill; + + +        } cont; + +        struct { +                off_t start, len; + +                gf_boolean_t    eager_lock_on; +                int *eager_lock; + +                char *basename; +                char *new_basename; + +                loc_t parent_loc; +                loc_t new_parent_loc; + +                afr_transaction_type type; + +		/* pre-compute the post piggyback status before +		   entering POST-OP phase +		*/ +		int              *postop_piggybacked; + +		/* stub to resume on destruction +		   of the transaction frame */ +		call_stub_t      *resume_stub; + +		struct list_head  eager_locked; + +                int32_t         **txn_changelog;//changelog after pre+post ops +                unsigned char   *pre_op; + +                call_frame_t *main_frame; + +                int (*fop) (call_frame_t *frame, xlator_t *this); + +                int (*done) (call_frame_t *frame, xlator_t *this); + +                int (*resume) (call_frame_t *frame, xlator_t *this); + +                int (*unwind) (call_frame_t *frame, xlator_t *this); + +                /* post-op hook */ +        } transaction; + +        afr_self_heal_t self_heal; + +        struct marker_str     marker; + +        /* extra data for fops */ +        dict_t         *xdata_req; +        dict_t         *xdata_rsp; + +        mode_t          umask; +        int             xflag; +        gf_boolean_t    do_discovery; +	struct afr_reply 
*replies; +} afr_local_t; + +typedef enum { +        AFR_FD_NOT_OPENED, +        AFR_FD_OPENED, +        AFR_FD_OPENING +} afr_fd_open_status_t; + +typedef struct { +        unsigned int *pre_op_done; +        afr_fd_open_status_t *opened_on; /* which subvolumes the fd is open on */ +        unsigned int *pre_op_piggyback; + +        unsigned int *lock_piggyback; +        unsigned int *lock_acquired; + +        int flags; +        uint64_t up_count;   /* number of CHILD_UPs this fd has seen */ +        uint64_t down_count; /* number of CHILD_DOWNs this fd has seen */ + +        int32_t last_tried; + +        int  hit, miss; +        gf_boolean_t failed_over; +        struct list_head entries; /* needed for readdir failover */ + +        unsigned char *locked_on; /* which subvolumes locks have been successful */ + +	/* used for delayed-post-op optimization */ +	pthread_mutex_t    delay_lock; +	gf_timer_t        *delay_timer; +	call_frame_t      *delay_frame; +        int               call_child; + +	/* set if any write on this fd was a non stable write +	   (i.e, without O_SYNC or O_DSYNC) +	*/ +	gf_boolean_t      witnessed_unstable_write; + +	/* list of frames currently in progress */ +	struct list_head  eager_locked; +} afr_fd_ctx_t; + + +/* try alloc and if it fails, goto label */ +#define AFR_LOCAL_ALLOC_OR_GOTO(var, label) do {                    \ +                var = mem_get0 (THIS->local_pool);                  \ +                if (!var) {                                         \ +                        gf_log (this->name, GF_LOG_ERROR,           \ +                                "out of memory :(");                \ +                        op_errno = ENOMEM;                          \ +                        goto label;                                 \ +                }                                                   \ +        } while (0); + + +/* did a call fail due to a child failing? 
*/ +#define child_went_down(op_ret, op_errno) (((op_ret) < 0) &&            \ +                                           ((op_errno == ENOTCONN) ||   \ +                                            (op_errno == EBADFD))) + +#define afr_fop_failed(op_ret, op_errno) ((op_ret) == -1) + +/* have we tried all children? */ +#define all_tried(i, count)  ((i) == (count) - 1) + +int32_t +afr_set_dict_gfid (dict_t *dict, uuid_t gfid); + +int +pump_command_reply (call_frame_t *frame, xlator_t *this); + +int32_t +afr_notify (xlator_t *this, int32_t event, void *data, void *data2); + +int +afr_init_entry_lockee (afr_entry_lockee_t *lockee, afr_local_t *local, +                       loc_t *loc, char *basename, int child_count); + +void +afr_entry_lockee_cleanup (afr_internal_lock_t *int_lock); + +int +afr_attempt_lock_recovery (xlator_t *this, int32_t child_index); + +int +afr_save_locked_fd (xlator_t *this, fd_t *fd); + +int +afr_mark_locked_nodes (xlator_t *this, fd_t *fd, +                       unsigned char *locked_nodes); + +void +afr_set_lk_owner (call_frame_t *frame, xlator_t *this, void *lk_owner); + +int +afr_set_lock_number (call_frame_t *frame, xlator_t *this); + + +loc_t * +lower_path (loc_t *l1, const char *b1, loc_t *l2, const char *b2); + +int32_t +afr_unlock (call_frame_t *frame, xlator_t *this); + +int +afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this); + +int +afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this); + +int +afr_blocking_lock (call_frame_t *frame, xlator_t *this); + +int +afr_internal_lock_finish (call_frame_t *frame, xlator_t *this); + +int +afr_lk_transfer_datalock (call_frame_t *dst, call_frame_t *src, char *dom, +                          unsigned int child_count); + +int pump_start (call_frame_t *frame, xlator_t *this); + +int +__afr_fd_ctx_set (xlator_t *this, fd_t *fd); + +int +afr_fd_ctx_set (xlator_t *this, fd_t *fd); + +int32_t +afr_inode_get_read_ctx (xlator_t *this, inode_t *inode, int32_t *fresh_children); + 
+void +afr_inode_set_read_ctx (xlator_t *this, inode_t *inode, int32_t read_child, +                        int32_t *fresh_children); + +int +afr_build_parent_loc (loc_t *parent, loc_t *child, int32_t *op_errno); + +unsigned int +afr_up_children_count (unsigned char *child_up, unsigned int child_count); + +unsigned int +afr_locked_children_count (unsigned char *children, unsigned int child_count); + +unsigned int +afr_pre_op_done_children_count (unsigned char *pre_op, +                                unsigned int child_count); + +gf_boolean_t +afr_is_fresh_lookup (loc_t *loc, xlator_t *this); + +void +afr_update_loc_gfids (loc_t *loc, struct iatt *buf, struct iatt *postparent); + +int +afr_locked_nodes_count (unsigned char *locked_nodes, int child_count); + +void +afr_local_cleanup (afr_local_t *local, xlator_t *this); + +int +afr_frame_return (call_frame_t *frame); + +gf_boolean_t +afr_is_split_brain (xlator_t *this, inode_t *inode); + +void +afr_set_split_brain (xlator_t *this, inode_t *inode, afr_spb_state_t mdata_spb, +                     afr_spb_state_t data_spb); + +int +afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, +          fd_t *fd, dict_t *xdata); + +void +afr_set_opendir_done (xlator_t *this, inode_t *inode); + +gf_boolean_t +afr_is_opendir_done (xlator_t *this, inode_t *inode); + +void +afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this); + +int +afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd); + +int +afr_launch_openfd_self_heal (call_frame_t *frame, xlator_t *this, fd_t *fd); + +#define AFR_STACK_UNWIND(fop, frame, params ...)                
\ +        do {                                                    \ +                afr_local_t *__local = NULL;                    \ +                xlator_t    *__this = NULL;                     \ +                if (frame) {                                    \ +                        __local = frame->local;                 \ +                        __this = frame->this;                   \ +                        frame->local = NULL;                    \ +                }                                               \ +                STACK_UNWIND_STRICT (fop, frame, params);       \ +                if (__local) {                                  \ +                        afr_local_cleanup (__local, __this);    \ +                        mem_put (__local);                      \ +                }                                               \ +        } while (0) + +#define AFR_STACK_DESTROY(frame)                                \ +        do {                                                    \ +                afr_local_t *__local = NULL;                    \ +                xlator_t    *__this = NULL;                     \ +                __local = frame->local;                         \ +                __this = frame->this;                           \ +                frame->local = NULL;                            \ +                STACK_DESTROY (frame->root);                    \ +                if (__local) {                                  \ +                        afr_local_cleanup (__local, __this);    \ +                        mem_put (__local);                      \ +                }                                               \ +        } while (0); + +#define AFR_NUM_CHANGE_LOGS            3 /*data + metadata + entry*/ +/* allocate and return a string that is the basename of argument */ +static inline char * +AFR_BASENAME (const char *str) +{ +        char *__tmp_str = NULL; +        char *__basename_str = NULL; +        
__tmp_str = gf_strdup (str); +        __basename_str = gf_strdup (basename (__tmp_str)); +        GF_FREE (__tmp_str); +        return __basename_str; +} + +int +afr_transaction_local_init (afr_local_t *local, xlator_t *this); + +int32_t +afr_marker_getxattr (call_frame_t *frame, xlator_t *this, +                     loc_t *loc, const char *name,afr_local_t *local, afr_private_t *priv ); + +int32_t * +afr_children_create (int32_t child_count); + +int +afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno); + +int +afr_internal_lock_init (afr_internal_lock_t *lk, size_t child_count, +                        transaction_lk_type_t lk_type); + +int +afr_first_up_child (unsigned char *child_up, size_t child_count); + +int +afr_select_read_child_from_policy (int32_t *fresh_children, int32_t child_count, +                                   int32_t prev_read_child, +                                   int32_t config_read_child, int32_t *sources, +                                   unsigned int hmode, uuid_t gfid); + +void +afr_set_read_ctx_from_policy (xlator_t *this, inode_t *inode, +                              int32_t *fresh_children, int32_t prev_read_child, +                              int32_t config_read_child, uuid_t gfid); + +int32_t +afr_get_call_child (xlator_t *this, unsigned char *child_up, int32_t read_child, +                    int32_t *fresh_children, +                    int32_t *call_child, int32_t *last_index); + +int32_t +afr_next_call_child (int32_t *fresh_children, unsigned char *child_up, +                     size_t child_count, int32_t *last_index, +                     int32_t read_child); +void +afr_get_fresh_children (int32_t *success_children, int32_t *sources, +                        int32_t *children, unsigned int child_count); +void +afr_children_add_child (int32_t *children, int32_t child, +                              int32_t child_count); +void +afr_children_rm_child (int32_t *children, int32_t child, +            
                 int32_t child_count); +void +afr_reset_children (int32_t *children, int32_t child_count); +int32_t +afr_most_important_error(int32_t old_errno, int32_t new_errno, +			 gf_boolean_t eio); +int +afr_errno_count (int32_t *children, int *child_errno, +                 unsigned int child_count, int32_t op_errno); +int +afr_get_children_count (int32_t *children, unsigned int child_count); +gf_boolean_t +afr_is_child_present (int32_t *success_children, int32_t child_count, +                      int32_t child); +void +afr_update_gfid_from_iatts (uuid_t uuid, struct iatt *bufs, +                            int32_t *success_children, +                            unsigned int child_count); +void +afr_reset_xattr (dict_t **xattr, unsigned int child_count); +gf_boolean_t +afr_conflicting_iattrs (struct iatt *bufs, int32_t *success_children, +                        unsigned int child_count, const char *path, +                        const char *xlator_name); +unsigned int +afr_gfid_missing_count (const char *xlator_name, int32_t *children, +                        struct iatt *bufs, unsigned int child_count, +                        const char *path); +void +afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req, const char *path); +void +afr_children_copy (int32_t *dst, int32_t *src, unsigned int child_count); +afr_transaction_type +afr_transaction_type_get (ia_type_t ia_type); +int32_t +afr_resultant_errno_get (int32_t *children, +                         int *child_errno, unsigned int child_count); +void +afr_inode_rm_stale_children (xlator_t *this, inode_t *inode, +                             int32_t *stale_children); +void +afr_launch_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode, +                      gf_boolean_t background, ia_type_t ia_type, char *reason, +                      void (*gfid_sh_success_cbk) (call_frame_t *sh_frame, +                                                   xlator_t *this), +                      int 
(*unwind) (call_frame_t *frame, xlator_t *this, +                                     int32_t op_ret, int32_t op_errno, +                                     int32_t sh_failed)); +void +afr_fix_open (xlator_t *this, fd_t *fd, size_t need_open_count, int *need_open); + +void +afr_open_fd_fix (fd_t *fd, xlator_t *this); +int +afr_set_elem_count_get (unsigned char *elems, int child_count); + +afr_fd_ctx_t * +afr_fd_ctx_get (fd_t *fd, xlator_t *this); + +gf_boolean_t +afr_open_only_data_self_heal (char *data_self_heal); + +gf_boolean_t +afr_data_self_heal_enabled (char *data_self_heal); + +void +afr_set_low_priority (call_frame_t *frame); +int +afr_child_fd_ctx_set (xlator_t *this, fd_t *fd, int32_t child, +                      int flags); + +gf_boolean_t +afr_have_quorum (char *logname, afr_private_t *priv); + +void +afr_matrix_cleanup (int32_t **pending, unsigned int m); + +int32_t** +afr_matrix_create (unsigned int m, unsigned int n); + +gf_boolean_t +afr_is_errno_set (int *child_errno, int child); + +gf_boolean_t +afr_is_errno_unset (int *child_errno, int child); + +gf_boolean_t +afr_is_fd_fixable (fd_t *fd); + +void +afr_prepare_new_entry_pending_matrix (int32_t **pending, +                                      gf_boolean_t (*is_pending) (int *, int), +                                      int *ctx, struct iatt *buf, +                                      unsigned int child_count); +void +afr_xattr_array_destroy (dict_t **xattr, unsigned int child_count); +/* + * Special value indicating we should use the "auto" quorum method instead of + * a fixed value (including zero to turn off quorum enforcement). + */ +#define AFR_QUORUM_AUTO INT_MAX + +/* + * Having this as a macro will make debugging a bit weirder, but does reduce + * the probability of functions handling this check inconsistently. 
+ */ +#define QUORUM_CHECK(_func,_label) do {                                  \ +        if (priv->quorum_count && !afr_have_quorum(this->name,priv)) { \ +                gf_log(this->name,GF_LOG_WARNING,                        \ +                       "failing "#_func" due to lack of quorum");        \ +                op_errno = EROFS;                                        \ +                goto _label;                                             \ +        }                                                                \ +} while (0); + + +#define AFR_SBRAIN_MSG "Failed on %s as split-brain is seen. Returning EIO." + +#define AFR_SBRAIN_CHECK_FD(fd, label) do {                              \ +        if (fd->inode && afr_is_split_brain (this, fd->inode)) {        \ +                op_errno = EIO;                                         \ +                gf_log (this->name, GF_LOG_WARNING,                     \ +                        AFR_SBRAIN_MSG ,uuid_utoa (fd->inode->gfid));   \ +                goto label;                                             \ +        }                                                               \ +} while (0) + +#define AFR_SBRAIN_CHECK_LOC(loc, label) do {                           \ +        if (loc->inode && afr_is_split_brain (this, loc->inode)) {      \ +                op_errno = EIO;                                         \ +                loc_path (loc, NULL);                                   \ +                gf_log (this->name, GF_LOG_WARNING,                     \ +                        AFR_SBRAIN_MSG , loc->path);                    \ +                goto label;                                             \ +        }                                                               \ +} while (0) + +int +afr_fd_report_unstable_write (xlator_t *this, fd_t *fd); + +gf_boolean_t +afr_fd_has_witnessed_unstable_write (xlator_t *this, fd_t *fd); + +void +afr_delayed_changelog_wake_resume (xlator_t *this, fd_t 
*fd, call_stub_t *stub); + +int +afr_inodelk_init (afr_inodelk_t *lk, char *dom, size_t child_count); + +void +afr_handle_open_fd_count (call_frame_t *frame, xlator_t *this); + +afr_inode_ctx_t* +afr_inode_ctx_get (inode_t *inode, xlator_t *this); + +#endif /* __AFR_H__ */ diff --git a/xlators/cluster/afr-v1/src/pump.c b/xlators/cluster/afr-v1/src/pump.c new file mode 100644 index 000000000..987696e55 --- /dev/null +++ b/xlators/cluster/afr-v1/src/pump.c @@ -0,0 +1,2663 @@ +/* +  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> +  This file is part of GlusterFS. + +  This file is licensed to you under your choice of the GNU Lesser +  General Public License, version 3 or any later version (LGPLv3 or +  later), or the GNU General Public License, version 2 (GPLv2), in all +  cases as published by the Free Software Foundation. +*/ + +#include <unistd.h> +#include <sys/time.h> +#include <stdlib.h> +#include <fnmatch.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "afr-common.c" +#include "defaults.c" +#include "glusterfs.h" + +static uint64_t pump_pid = 0; +static inline void +pump_fill_loc_info (loc_t *loc, struct iatt *iatt, struct iatt *parent) +{ +        afr_update_loc_gfids (loc, iatt, parent); +        uuid_copy (loc->inode->gfid, iatt->ia_gfid); +} + +static int +pump_mark_start_pending (xlator_t *this) +{ +        afr_private_t  *priv      = NULL; +        pump_private_t *pump_priv = NULL; + +        priv      = this->private; +        pump_priv = priv->pump_private; + +        pump_priv->pump_start_pending = 1; + +        return 0; +} + +static int +is_pump_start_pending (xlator_t *this) +{ +        afr_private_t  *priv      = NULL; +        pump_private_t *pump_priv = NULL; + +        priv      = this->private; +        pump_priv = priv->pump_private; + +        return (pump_priv->pump_start_pending); +} + +static int +pump_remove_start_pending (xlator_t *this) +{ +        afr_private_t  *priv      = NULL; +    
    pump_private_t *pump_priv = NULL; + +        priv      = this->private; +        pump_priv = priv->pump_private; + +        pump_priv->pump_start_pending = 0; + +        return 0; +} + +static pump_state_t +pump_get_state () +{ +        xlator_t *this = NULL; +        afr_private_t *priv = NULL; +        pump_private_t *pump_priv = NULL; + +        pump_state_t ret; + +        this = THIS; +        priv = this->private; +        pump_priv = priv->pump_private; + +        LOCK (&pump_priv->pump_state_lock); +        { +                ret = pump_priv->pump_state; +        } +        UNLOCK (&pump_priv->pump_state_lock); + +        return ret; +} + +int +pump_change_state (xlator_t *this, pump_state_t state) +{ +        afr_private_t *priv = NULL; +        pump_private_t *pump_priv = NULL; + +        pump_state_t state_old; +        pump_state_t state_new; + + +        priv = this->private; +        pump_priv = priv->pump_private; + +        GF_ASSERT (pump_priv); + +        LOCK (&pump_priv->pump_state_lock); +        { +                state_old = pump_priv->pump_state; +                state_new = state; + +                pump_priv->pump_state = state; + +        } +        UNLOCK (&pump_priv->pump_state_lock); + +        gf_log (this->name, GF_LOG_DEBUG, +                "Pump changing state from %d to %d", +                state_old, +                state_new); + +        return  0; +} + +static int +pump_set_resume_path (xlator_t *this, const char *path) +{ +        int ret = 0; + +        afr_private_t *priv = NULL; +        pump_private_t *pump_priv = NULL; + +        priv = this->private; +        pump_priv = priv->pump_private; + +        GF_ASSERT (pump_priv); + +        LOCK (&pump_priv->resume_path_lock); +        { +                strncpy (pump_priv->resume_path, path, strlen (path) + 1); +        } +        UNLOCK (&pump_priv->resume_path_lock); + +        return ret; +} + +static int +pump_save_path (xlator_t *this, const char *path) +{ +       
 afr_private_t *priv = NULL; +        pump_state_t state; +        dict_t *dict = NULL; +        loc_t  loc = {0}; +        int dict_ret = 0; +        int ret = -1; + +        state = pump_get_state (); +        if (state == PUMP_STATE_RESUME) +                return 0; + +        priv = this->private; + +        GF_ASSERT (priv->root_inode); + +        afr_build_root_loc (this, &loc); + +        dict = dict_new (); +        dict_ret = dict_set_str (dict, PUMP_PATH, (char *)path); +        if (dict_ret) +                gf_log (this->name, GF_LOG_WARNING, +                        "%s: failed to set the key %s", path, PUMP_PATH); + +        ret = syncop_setxattr (PUMP_SOURCE_CHILD (this), &loc, dict, 0); + +        if (ret < 0) { +                gf_log (this->name, GF_LOG_INFO, +                        "setxattr failed - could not save path=%s", path); +        } else { +                gf_log (this->name, GF_LOG_DEBUG, +                        "setxattr succeeded - saved path=%s", path); +        } + +        dict_unref (dict); + +        loc_wipe (&loc); +        return 0; +} + +static int +pump_check_and_update_status (xlator_t *this) +{ +        pump_state_t state; +        int ret = -1; + +        state = pump_get_state (); + +        switch (state) { + +        case PUMP_STATE_RESUME: +        case PUMP_STATE_RUNNING: +        { +                ret = 0; +                break; +        } +        case PUMP_STATE_PAUSE: +        { +                ret = -1; +                break; +        } +        case PUMP_STATE_ABORT: +        { +                pump_save_path (this, "/"); +                ret = -1; +                break; +        } +        default: +        { +                gf_log (this->name, GF_LOG_DEBUG, +                        "Unknown pump state"); +                ret = -1; +                break; +        } + +        } + +        return ret; +} + +static const char * +pump_get_resume_path (xlator_t *this) +{ +        afr_private_t *priv = 
NULL; +        pump_private_t *pump_priv = NULL; + +        const char *resume_path = NULL; + +        priv = this->private; +        pump_priv = priv->pump_private; + +        resume_path = pump_priv->resume_path; + +        return resume_path; +} + +static int +pump_update_resume_state (xlator_t *this, const char *path) +{ +        pump_state_t state; +        const char *resume_path = NULL; + +        state = pump_get_state (); + +        if (state == PUMP_STATE_RESUME) { +                resume_path = pump_get_resume_path (this); +                if (strcmp (resume_path, "/") == 0) { +                        gf_log (this->name, GF_LOG_DEBUG, +                                "Reached the resume path (/). Proceeding to change state" +                                " to running"); +                        pump_change_state (this, PUMP_STATE_RUNNING); +                } else if (strcmp (resume_path, path) == 0) { +                        gf_log (this->name, GF_LOG_DEBUG, +                                "Reached the resume path. 
Proceeding to change state" +                                " to running"); +                        pump_change_state (this, PUMP_STATE_RUNNING); +                } else { +                        gf_log (this->name, GF_LOG_DEBUG, +                                "Not yet hit the resume path:res-path=%s,path=%s", +                                resume_path, path); +                } +        } + +        return 0; +} + +static gf_boolean_t +is_pump_traversal_allowed (xlator_t *this, const char *path) +{ +        pump_state_t state; +        const char *resume_path = NULL; +        gf_boolean_t ret = _gf_true; + +        state = pump_get_state (); + +        if (state == PUMP_STATE_RESUME) { +                resume_path = pump_get_resume_path (this); +                if (strstr (resume_path, path)) { +                        gf_log (this->name, GF_LOG_DEBUG, +                                "On the right path to resumption path"); +                        ret = _gf_true; +                } else { +                        gf_log (this->name, GF_LOG_DEBUG, +                                "Not the right path to resuming=> ignoring traverse"); +                        ret = _gf_false; +                } +        } + +        return ret; +} + +static int +pump_save_file_stats (xlator_t *this, const char *path) +{ +        afr_private_t  *priv        = NULL; +        pump_private_t *pump_priv   = NULL; + +        priv      = this->private; +        pump_priv = priv->pump_private; + +        LOCK (&pump_priv->resume_path_lock); +        { +                pump_priv->number_files_pumped++; + +                strncpy (pump_priv->current_file, path, +                         PATH_MAX); +        } +        UNLOCK (&pump_priv->resume_path_lock); + +        return 0; +} + +static int +gf_pump_traverse_directory (loc_t *loc) +{ +        xlator_t        *this              = NULL; +        fd_t            *fd                = NULL; +        off_t           offset             = 
0; +        loc_t           entry_loc          = {0}; +        gf_dirent_t     *entry             = NULL; +        gf_dirent_t     *tmp               = NULL; +        gf_dirent_t     entries; +	struct iatt     iatt               = {0}; +        struct iatt     parent             = {0}; +	dict_t          *xattr_rsp         = NULL; +        int             ret                = 0; +        gf_boolean_t    is_directory_empty = _gf_true; +        gf_boolean_t    free_entries       = _gf_false; + +        INIT_LIST_HEAD (&entries.list); +        this = THIS; + +        GF_ASSERT (loc->inode); + +	fd = fd_create (loc->inode, pump_pid); +        if (!fd) { +                gf_log (this->name, GF_LOG_ERROR, +                        "Failed to create fd for %s", loc->path); +                goto out; +        } + +        ret = syncop_opendir (this, loc, fd); +        if (ret < 0) { +                gf_log (this->name, GF_LOG_DEBUG, +                        "opendir failed on %s", loc->path); +                goto out; +        } + +        gf_log (this->name, GF_LOG_TRACE, +                "pump opendir on %s returned=%d", +                loc->path, ret); + +        while (syncop_readdirp (this, fd, 131072, offset, NULL, &entries)) { +                free_entries = _gf_true; + +                if (list_empty (&entries.list)) { +                        gf_log (this->name, GF_LOG_TRACE, +                                "no more entries in directory"); +                        goto out; +                } + +                list_for_each_entry_safe (entry, tmp, &entries.list, list) { +                        gf_log (this->name, GF_LOG_DEBUG, +                                "found readdir entry=%s", entry->d_name); + +                        offset = entry->d_off; +                        if (uuid_is_null (entry->d_stat.ia_gfid)) { +                                gf_log (this->name, GF_LOG_WARNING, "%s/%s: No " +                                        "gfid present 
skipping", +                                        loc->path, entry->d_name); +                                continue; +                        } +                        loc_wipe (&entry_loc); +                        ret = afr_build_child_loc (this, &entry_loc, loc, +                                                   entry->d_name); +                        if (ret) +                                goto out; + +                        if (!IS_ENTRY_CWD (entry->d_name) && +                            !IS_ENTRY_PARENT (entry->d_name)) { + +                                    is_directory_empty = _gf_false; +                                    gf_log (this->name, GF_LOG_DEBUG, +                                            "lookup %s => %"PRId64, +                                            entry_loc.path, +                                            iatt.ia_ino); + +                                    ret = syncop_lookup (this, &entry_loc, NULL, +                                                         &iatt, &xattr_rsp, &parent); + +                                    if (ret) { +                                            gf_log (this->name, GF_LOG_ERROR, +                                                    "%s: lookup failed", +                                                    entry_loc.path); +                                            continue; +                                    } +                                    pump_fill_loc_info (&entry_loc, &iatt, +                                                       &parent); + +                                    pump_update_resume_state (this, entry_loc.path); + +                                    pump_save_path (this, entry_loc.path); +                                    pump_save_file_stats (this, entry_loc.path); + +                                    ret = pump_check_and_update_status (this); +                                    if (ret < 0) { +                                            gf_log 
(this->name, GF_LOG_DEBUG, +                                                    "Pump beginning to exit out"); +                                            goto out; +                                    } + +                                    if (IA_ISDIR (iatt.ia_type)) { +                                            if (is_pump_traversal_allowed (this, entry_loc.path)) { +                                                    gf_log (this->name, GF_LOG_TRACE, +                                                            "entering dir=%s", +                                                            entry->d_name); +                                                    gf_pump_traverse_directory (&entry_loc); +                                            } +                                    } +                        } +                } + +                gf_dirent_free (&entries); +                free_entries = _gf_false; +                gf_log (this->name, GF_LOG_TRACE, +                        "offset incremented to %d", +                        (int32_t ) offset); + +        } + +        ret = syncop_close (fd); +        if (ret < 0) +                gf_log (this->name, GF_LOG_DEBUG, "closing the fd failed"); + +        if (is_directory_empty && IS_ROOT_PATH (loc->path)) { +               pump_change_state (this, PUMP_STATE_RUNNING); +               gf_log (this->name, GF_LOG_INFO, "Empty source brick. 
" +                                "Nothing to be done."); +        } + +out: +        if (entry_loc.path) +                loc_wipe (&entry_loc); +        if (free_entries) +                gf_dirent_free (&entries); +        return 0; +} + +static int +pump_update_resume_path (xlator_t *this) +{ +        const char *resume_path = NULL; + +        resume_path = pump_get_resume_path (this); + +        if (resume_path) { +                gf_log (this->name, GF_LOG_DEBUG, +                        "Found a path to resume from: %s", +                        resume_path); + +        }else { +                gf_log (this->name, GF_LOG_DEBUG, +                        "Did not find a path=> setting to '/'"); +                pump_set_resume_path (this, "/"); +        } + +        pump_change_state (this, PUMP_STATE_RESUME); + +        return 0; +} + +static int32_t +pump_xattr_cleaner (call_frame_t *frame, void *cookie, xlator_t *this, +                    int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ +        afr_private_t  *priv      = NULL; +        loc_t           loc       = {0}; +        int             i         = 0; +        int             ret       = 0; +        int             source    = 0; +        int             sink      = 1; + +        priv      = this->private; + +        afr_build_root_loc (this, &loc); + +        ret = syncop_removexattr (priv->children[source], &loc, +				  PUMP_PATH, 0); + +        ret = syncop_removexattr (priv->children[sink], &loc, +                                  PUMP_SINK_COMPLETE, 0); + +        for (i = 0; i < priv->child_count; i++) { +                ret = syncop_removexattr (priv->children[i], &loc, +                                          PUMP_SOURCE_COMPLETE, 0); +                if (ret) { +                        gf_log (this->name, GF_LOG_DEBUG, "removexattr " +                                "failed with %s", strerror (-ret)); +                } +        } + +        loc_wipe (&loc); +        return 
pump_command_reply (frame, this); +} + +static int +pump_complete_migration (xlator_t *this) +{ +        afr_private_t *priv = NULL; +        pump_private_t *pump_priv = NULL; +        dict_t *dict = NULL; +        pump_state_t state; +        loc_t  loc = {0}; +        int dict_ret = 0; +        int ret = -1; + +        priv = this->private; +        pump_priv = priv->pump_private; + +        GF_ASSERT (priv->root_inode); + +        afr_build_root_loc (this, &loc); + +        dict = dict_new (); + +        state = pump_get_state (); +        if (state == PUMP_STATE_RUNNING) { +                gf_log (this->name, GF_LOG_DEBUG, +                        "Pump finished pumping"); + +                pump_priv->pump_finished = _gf_true; + +                dict_ret = dict_set_str (dict, PUMP_SOURCE_COMPLETE, "jargon"); +                if (dict_ret) +                        gf_log (this->name, GF_LOG_WARNING, +                                "%s: failed to set the key %s", +                                loc.path, PUMP_SOURCE_COMPLETE); + +                ret = syncop_setxattr (PUMP_SOURCE_CHILD (this), &loc, dict, 0); +                if (ret < 0) { +                        gf_log (this->name, GF_LOG_DEBUG, +                                "setxattr failed - while  notifying source complete"); +                } +                dict_ret = dict_set_str (dict, PUMP_SINK_COMPLETE, "jargon"); +                if (dict_ret) +                        gf_log (this->name, GF_LOG_WARNING, +                                "%s: failed to set the key %s", +                                loc.path, PUMP_SINK_COMPLETE); + +                ret = syncop_setxattr (PUMP_SINK_CHILD (this), &loc, dict, 0); +                if (ret < 0) { +                        gf_log (this->name, GF_LOG_DEBUG, +                                "setxattr failed - while notifying sink complete"); +                } + +                pump_save_path (this, "/"); + +        } else if (state == 
PUMP_STATE_ABORT) { +                gf_log (this->name, GF_LOG_DEBUG, "Starting cleanup " +                        "of pump internal xattrs"); +                call_resume (pump_priv->cleaner); +        } + +        loc_wipe (&loc); +        return 0; +} + +static int +pump_lookup_sink (loc_t *loc) +{ +        xlator_t *this = NULL; +	struct iatt iatt, parent; +	dict_t *xattr_rsp; +        dict_t *xattr_req = NULL; +        int ret = 0; + +        this = THIS; + +        xattr_req = dict_new (); + +        ret = afr_set_root_gfid (xattr_req); +        if (ret) +                goto out; + +        ret = syncop_lookup (PUMP_SINK_CHILD (this), loc, +                             xattr_req, &iatt, &xattr_rsp, &parent); + +        if (ret) { +                gf_log (this->name, GF_LOG_DEBUG, +                        "Lookup on sink child failed"); +                ret = -1; +                goto out; +        } + +out: +        if (xattr_req) +                dict_unref (xattr_req); + +        return ret; +} + +static int +pump_task (void *data) +{ +	xlator_t *this = NULL; +        afr_private_t *priv = NULL; + + +        loc_t loc = {0}; +	struct iatt iatt, parent; +	dict_t *xattr_rsp = NULL; +        dict_t *xattr_req = NULL; + +        int ret = -1; + +        this = THIS; +        priv = this->private; + +        GF_ASSERT (priv->root_inode); + +        afr_build_root_loc (this, &loc); +        xattr_req = dict_new (); +        if (!xattr_req) { +                gf_log (this->name, GF_LOG_DEBUG, +                        "Out of memory"); +                ret = -1; +                goto out; +        } + +        afr_set_root_gfid (xattr_req); +        ret = syncop_lookup (this, &loc, xattr_req, +                             &iatt, &xattr_rsp, &parent); + +        gf_log (this->name, GF_LOG_TRACE, +                "lookup: path=%s gfid=%s", +                loc.path, uuid_utoa (loc.inode->gfid)); + +        ret = pump_check_and_update_status (this); +        if (ret 
< 0) { +                goto out; +        } + +        pump_update_resume_path (this); + +        afr_set_root_gfid (xattr_req); +        ret = pump_lookup_sink (&loc); +        if (ret) { +                pump_update_resume_path (this); +                goto out; +        } + +        gf_pump_traverse_directory (&loc); + +        pump_complete_migration (this); +out: +        if (xattr_req) +                dict_unref (xattr_req); + +        loc_wipe (&loc); +	return 0; +} + + +static int +pump_task_completion (int ret, call_frame_t *sync_frame, void *data) +{ +        xlator_t *this = NULL; +        afr_private_t *priv = NULL; + +        this = THIS; + +        priv = this->private; + +        inode_unref (priv->root_inode); +        STACK_DESTROY (sync_frame->root); + +        gf_log (this->name, GF_LOG_DEBUG, +                "Pump xlator exiting"); +	return 0; +} + +int +pump_start (call_frame_t *pump_frame, xlator_t *this) +{ +	afr_private_t *priv = NULL; +	pump_private_t *pump_priv = NULL; + +	int ret = -1; + +	priv = this->private; +        pump_priv = priv->pump_private; + +        afr_set_lk_owner (pump_frame, this, pump_frame->root); +	pump_pid = (uint64_t) (unsigned long)pump_frame->root; + +	ret = synctask_new (pump_priv->env, pump_task, +                            pump_task_completion, +                            pump_frame, NULL); +        if (ret == -1) { +                gf_log (this->name, GF_LOG_ERROR, +                        "starting pump failed"); +                pump_change_state (this, PUMP_STATE_ABORT); +                goto out; +        } + +        gf_log (this->name, GF_LOG_DEBUG, +                "setting pump as started lk_owner: %s %"PRIu64, +                lkowner_utoa (&pump_frame->root->lk_owner), pump_pid); + +        priv->use_afr_in_pump = 1; +out: +	return ret; +} + +static int +pump_start_synctask (xlator_t *this) +{ +        call_frame_t *frame = NULL; +        int ret = 0; + +        frame = create_frame (this, 
this->ctx->pool); +        if (!frame) { +                gf_log (this->name, GF_LOG_ERROR, +                        "Out of memory"); +                ret = -1; +                goto out; +        } + +        pump_change_state (this, PUMP_STATE_RUNNING); + +        ret = pump_start (frame, this); + +out: +        return ret; +} + +int32_t +pump_cmd_start_setxattr_cbk (call_frame_t *frame, +                             void *cookie, +                             xlator_t *this, +                             int32_t op_ret, +                             int32_t op_errno, dict_t *xdata) + +{ +        call_frame_t *prev = NULL; +        afr_local_t *local = NULL; +        int ret = 0; + +        local = frame->local; + +        if (op_ret < 0) { +                gf_log (this->name, GF_LOG_ERROR, +                        "Could not initiate destination " +                        "brick connect"); +                ret = op_ret; +                goto out; +        } + +        gf_log (this->name, GF_LOG_DEBUG, +                "Successfully initiated destination " +                "brick connect"); + +        pump_mark_start_pending (this); + +        /* send the PARENT_UP as pump is ready now */ +        prev = cookie; +        if (prev && prev->this) +                prev->this->notify (prev->this, GF_EVENT_PARENT_UP, this); + +out: +        local->op_ret = ret; +        pump_command_reply (frame, this); + +        return 0; +} + +static int +pump_initiate_sink_connect (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t   *local     = NULL; +        afr_private_t *priv      = NULL; +        dict_t        *dict      = NULL; +        data_t        *data      = NULL; +        char          *clnt_cmd  = NULL; +        loc_t loc = {0}; + +        int ret = 0; + +        priv  = this->private; +        local = frame->local; + +        GF_ASSERT (priv->root_inode); + +        afr_build_root_loc (this, &loc); + +        data = data_ref (dict_get (local->dict, 
RB_PUMP_CMD_START)); +        if (!data) { +                ret = -1; +                gf_log (this->name, GF_LOG_ERROR, +                        "Could not get destination brick value"); +                goto out; +        } + +        dict = dict_new (); +        if (!dict) { +                ret = -1; +                goto out; +        } + +        clnt_cmd = GF_CALLOC (1, data->len+1, gf_common_mt_char); +        if (!clnt_cmd) { +                ret = -1; +                goto out; +        } + +        memcpy (clnt_cmd, data->data, data->len); +        clnt_cmd[data->len] = '\0'; +        gf_log (this->name, GF_LOG_DEBUG, "Got destination brick %s\n", +                        clnt_cmd); + +        ret = dict_set_dynstr (dict, CLIENT_CMD_CONNECT, clnt_cmd); +        if (ret < 0) { +                gf_log (this->name, GF_LOG_ERROR, +                        "Could not inititiate destination brick " +                        "connect"); +                goto out; +        } + +	STACK_WIND (frame, +		    pump_cmd_start_setxattr_cbk, +		    PUMP_SINK_CHILD(this), +		    PUMP_SINK_CHILD(this)->fops->setxattr, +		    &loc, +		    dict, +		    0, NULL); + +        ret = 0; + +out: +        if (dict) +                dict_unref (dict); + +        if (data) +                data_unref (data); + +        if (ret && clnt_cmd) +                GF_FREE (clnt_cmd); + +        loc_wipe (&loc); +        return ret; +} + +static int +is_pump_aborted (xlator_t *this) +{ +        pump_state_t state; + +        state = pump_get_state (); + +        return ((state == PUMP_STATE_ABORT)); +} + +int32_t +pump_cmd_start_getxattr_cbk (call_frame_t *frame, +                             void *cookie, +                             xlator_t *this, +                             int32_t op_ret, +                             int32_t op_errno, +                             dict_t *dict, dict_t *xdata) +{ +        afr_local_t *local = NULL; +        char *path = NULL; + +        pump_state_t 
state; +        int ret = 0; +        int need_unwind = 0; +        int dict_ret = -1; + +        local = frame->local; + +        if (op_ret < 0) { +                gf_log (this->name, GF_LOG_DEBUG, +                        "getxattr failed - changing pump " +                        "state to RUNNING with '/'"); +                path = "/"; +                ret = op_ret; +        } else { +                gf_log (this->name, GF_LOG_TRACE, +                        "getxattr succeeded"); + +                dict_ret =  dict_get_str (dict, PUMP_PATH, &path); +                if (dict_ret < 0) +                        path = "/"; +        } + +        state = pump_get_state (); +        if ((state == PUMP_STATE_RUNNING) || +            (state == PUMP_STATE_RESUME)) { +                gf_log (this->name, GF_LOG_ERROR, +                        "Pump is already started"); +                ret = -1; +                goto out; +        } + +        pump_set_resume_path (this, path); + +        if (is_pump_aborted (this)) +                /* We're re-starting pump afresh */ +                ret = pump_initiate_sink_connect (frame, this); +        else { +                /* We're re-starting pump from a previous +                   pause */ +                gf_log (this->name, GF_LOG_DEBUG, +                        "about to start synctask"); +                ret = pump_start_synctask (this); +                need_unwind = 1; +        } + +out: +        if ((ret < 0) || (need_unwind == 1)) { +                local->op_ret = ret; +                pump_command_reply (frame, this); +        } +	return 0; +} + +int +pump_execute_status (call_frame_t *frame, xlator_t *this) +{ +        afr_private_t *priv = NULL; +        pump_private_t *pump_priv = NULL; + +        uint64_t number_files = 0; + +        char filename[PATH_MAX]; +        char summary[PATH_MAX+256]; +        char *dict_str = NULL; + +        int32_t op_ret = 0; +        int32_t op_errno = 0; + +        dict_t *dict 
= NULL; +        int ret = -1; + +        priv = this->private; +        pump_priv = priv->pump_private; + +        LOCK (&pump_priv->resume_path_lock); +        { +                number_files  = pump_priv->number_files_pumped; +                strncpy (filename, pump_priv->current_file, PATH_MAX); +        } +        UNLOCK (&pump_priv->resume_path_lock); + +        dict_str     = GF_CALLOC (1, PATH_MAX + 256, gf_afr_mt_char); +        if (!dict_str) { +                gf_log (this->name, GF_LOG_ERROR, +                        "Out of memory"); +                op_ret = -1; +                op_errno = ENOMEM; +                goto out; +        } + +        if (pump_priv->pump_finished) { +                snprintf (summary, PATH_MAX+256, +                          "no_of_files=%"PRIu64, number_files); +        } else { +                snprintf (summary, PATH_MAX+256, +                          "no_of_files=%"PRIu64":current_file=%s", +                          number_files, filename); +        } +        snprintf (dict_str, PATH_MAX+256, "status=%d:%s", +                  (pump_priv->pump_finished)?1:0, summary); + +        dict = dict_new (); + +        ret = dict_set_dynstr (dict, RB_PUMP_CMD_STATUS, dict_str); +        if (ret < 0) { +                gf_log (this->name, GF_LOG_DEBUG, +                        "dict_set_dynstr returned negative value"); +        } else { +                dict_str = NULL; +        } + +        op_ret = 0; + +out: + +        AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict, NULL); + +        if (dict) +                dict_unref (dict); + +        GF_FREE (dict_str); + +        return 0; +} + +int +pump_execute_pause (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t *local = NULL; + +        local = frame->local; + +        pump_change_state (this, PUMP_STATE_PAUSE); + +        local->op_ret = 0; +        pump_command_reply (frame, this); + +        return 0; +} + +int +pump_execute_start (call_frame_t 
*frame, xlator_t *this) +{ +        afr_private_t *priv = NULL; +        afr_local_t   *local = NULL; + +        int ret = 0; +        loc_t loc = {0}; + +        priv = this->private; +        local = frame->local; + +        if (!priv->root_inode) { +                gf_log (this->name, GF_LOG_ERROR, +                        "Pump xlator cannot be started without an initial " +                        "lookup"); +                ret = -1; +                goto out; +        } + +        GF_ASSERT (priv->root_inode); + +        afr_build_root_loc (this, &loc); + +	STACK_WIND (frame, +		    pump_cmd_start_getxattr_cbk, +		    PUMP_SOURCE_CHILD(this), +		    PUMP_SOURCE_CHILD(this)->fops->getxattr, +		    &loc, +		    PUMP_PATH, NULL); + +        ret = 0; + +out: +        if (ret < 0) { +                local->op_ret = ret; +                pump_command_reply (frame, this); +        } + +        loc_wipe (&loc); +	return 0; +} + +static int +pump_cleanup_helper (void *data) { +        call_frame_t *frame = data; + +        pump_xattr_cleaner (frame, 0, frame->this, 0, 0, NULL); + +        return 0; +} + +static int +pump_cleanup_done (int ret, call_frame_t *sync_frame, void *data) +{ +        STACK_DESTROY (sync_frame->root); + +        return 0; +} + +int +pump_execute_commit (call_frame_t *frame, xlator_t *this) +{ +        afr_private_t  *priv       = NULL; +        pump_private_t *pump_priv  = NULL; +        afr_local_t    *local      = NULL; +        call_frame_t   *sync_frame = NULL; +        int             ret        = 0; + +        priv      = this->private; +        pump_priv = priv->pump_private; +        local     = frame->local; + +        local->op_ret = 0; +        if (pump_priv->pump_finished) { +                pump_change_state (this, PUMP_STATE_COMMIT); +                sync_frame = create_frame (this, this->ctx->pool); +                ret = synctask_new (pump_priv->env, pump_cleanup_helper, +                                    pump_cleanup_done, 
sync_frame, frame); +                if (ret) { +                        gf_log (this->name, GF_LOG_DEBUG, "Couldn't create " +                                "synctask for cleaning up xattrs."); +                } + +        } else { +                gf_log (this->name, GF_LOG_ERROR, "Commit can't proceed. " +                        "Migration in progress"); +                local->op_ret = -1; +                local->op_errno = EINPROGRESS; +                pump_command_reply (frame, this); +        } + +        return 0; +} +int +pump_execute_abort (call_frame_t *frame, xlator_t *this) +{ +        afr_private_t  *priv       = NULL; +        pump_private_t *pump_priv  = NULL; +        afr_local_t    *local      = NULL; +        call_frame_t   *sync_frame = NULL; +        int             ret        = 0; + +        priv      = this->private; +        pump_priv = priv->pump_private; +        local     = frame->local; + +        pump_change_state (this, PUMP_STATE_ABORT); + +        LOCK (&pump_priv->resume_path_lock); +        { +                pump_priv->number_files_pumped = 0; +                pump_priv->current_file[0] = '\0'; +        } +        UNLOCK (&pump_priv->resume_path_lock); + +        local->op_ret = 0; +        if (pump_priv->pump_finished) { +                sync_frame = create_frame (this, this->ctx->pool); +                ret = synctask_new (pump_priv->env, pump_cleanup_helper, +                                    pump_cleanup_done, sync_frame, frame); +                if (ret) { +                        gf_log (this->name, GF_LOG_DEBUG, "Couldn't create " +                                "synctask for cleaning up xattrs."); +                } + +        } else { +                pump_priv->cleaner = fop_setxattr_cbk_stub (frame, +                                                            pump_xattr_cleaner, +                                                            0, 0, NULL); +        } + +        return 0; +} + +gf_boolean_t 
+pump_command_status (xlator_t *this, dict_t *dict) +{ +        char *cmd = NULL; +        int dict_ret = -1; +        int ret = _gf_true; + +        dict_ret = dict_get_str (dict, RB_PUMP_CMD_STATUS, &cmd); +        if (dict_ret < 0) { +                gf_log (this->name, GF_LOG_DEBUG, +                        "Not a pump status command"); +                ret = _gf_false; +                goto out; +        } + +        gf_log (this->name, GF_LOG_DEBUG, +                "Hit a pump command - status"); +        ret = _gf_true; + +out: +        return ret; + +} + +gf_boolean_t +pump_command_pause (xlator_t *this, dict_t *dict) +{ +        char *cmd = NULL; +        int dict_ret = -1; +        int ret = _gf_true; + +        dict_ret = dict_get_str (dict, RB_PUMP_CMD_PAUSE, &cmd); +        if (dict_ret < 0) { +                gf_log (this->name, GF_LOG_DEBUG, +                        "Not a pump pause command"); +                ret = _gf_false; +                goto out; +        } + +        gf_log (this->name, GF_LOG_DEBUG, +                "Hit a pump command - pause"); +        ret = _gf_true; + +out: +        return ret; + +} + +gf_boolean_t +pump_command_commit (xlator_t *this, dict_t *dict) +{ +        char *cmd = NULL; +        int dict_ret = -1; +        int ret = _gf_true; + +        dict_ret = dict_get_str (dict, RB_PUMP_CMD_COMMIT, &cmd); +        if (dict_ret < 0) { +                gf_log (this->name, GF_LOG_DEBUG, +                        "Not a pump commit command"); +                ret = _gf_false; +                goto out; +        } + +        gf_log (this->name, GF_LOG_DEBUG, +                "Hit a pump command - commit"); +        ret = _gf_true; + +out: +        return ret; + +} + +gf_boolean_t +pump_command_abort (xlator_t *this, dict_t *dict) +{ +        char *cmd = NULL; +        int dict_ret = -1; +        int ret = _gf_true; + +        dict_ret = dict_get_str (dict, RB_PUMP_CMD_ABORT, &cmd); +        if (dict_ret < 0) { +                
gf_log (this->name, GF_LOG_DEBUG, +                        "Not a pump abort command"); +                ret = _gf_false; +                goto out; +        } + +        gf_log (this->name, GF_LOG_DEBUG, +                "Hit a pump command - abort"); +        ret = _gf_true; + +out: +        return ret; + +} + +gf_boolean_t +pump_command_start (xlator_t *this, dict_t *dict) +{ +        char *cmd = NULL; +        int dict_ret = -1; +        int ret = _gf_true; + +        dict_ret = dict_get_str (dict, RB_PUMP_CMD_START, &cmd); +        if (dict_ret < 0) { +                gf_log (this->name, GF_LOG_DEBUG, +                        "Not a pump start command"); +                ret = _gf_false; +                goto out; +        } + +        gf_log (this->name, GF_LOG_DEBUG, +                "Hit a pump command - start"); +        ret = _gf_true; + +out: +        return ret; + +} + +struct _xattr_key { +        char *key; +        struct list_head list; +}; + +static int +__gather_xattr_keys (dict_t *dict, char *key, data_t *value, +                     void *data) +{ +        struct list_head *  list  = data; +        struct _xattr_key * xkey  = NULL; + +        if (!strncmp (key, AFR_XATTR_PREFIX, +                      strlen (AFR_XATTR_PREFIX))) { + +                xkey = GF_CALLOC (1, sizeof (*xkey), gf_afr_mt_xattr_key); +                if (!xkey) +                        return -1; + +                xkey->key = key; +                INIT_LIST_HEAD (&xkey->list); + +                list_add_tail (&xkey->list, list); +        } +        return 0; +} + +static void +__filter_xattrs (dict_t *dict) +{ +        struct list_head keys; + +        struct _xattr_key *key; +        struct _xattr_key *tmp; + +        INIT_LIST_HEAD (&keys); + +        dict_foreach (dict, __gather_xattr_keys, +                      (void *) &keys); + +        list_for_each_entry_safe (key, tmp, &keys, list) { +                dict_del (dict, key->key); + +                list_del_init 
(&key->list); + +                GF_FREE (key); +        } +} + +int32_t +pump_getxattr_cbk (call_frame_t *frame, void *cookie, +		  xlator_t *this, int32_t op_ret, int32_t op_errno, +		  dict_t *dict, dict_t *xdata) +{ +	afr_private_t   *priv           = NULL; +	afr_local_t     *local          = NULL; +	xlator_t        **children      = NULL; +	int             unwind          = 1; +        int32_t         *last_index     = NULL; +        int32_t         next_call_child = -1; +        int32_t         read_child      = -1; +        int32_t         *fresh_children = NULL; + + +	priv     = this->private; +	children = priv->children; + +	local = frame->local; + +        read_child = (long) cookie; + +	if (op_ret == -1) { +		last_index = &local->cont.getxattr.last_index; +                fresh_children = local->fresh_children; +                next_call_child = afr_next_call_child (fresh_children, +                                                       local->child_up, +                                                       priv->child_count, +                                                       last_index, read_child); +                if (next_call_child < 0) +                        goto out; + +		unwind = 0; +		STACK_WIND_COOKIE (frame, pump_getxattr_cbk, +				   (void *) (long) read_child, +				   children[next_call_child], +				   children[next_call_child]->fops->getxattr, +				   &local->loc, +				   local->cont.getxattr.name, NULL); +	} + +out: +	if (unwind) { +                if (op_ret >= 0 && dict) +                        __filter_xattrs (dict); + +		AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict, NULL); +	} + +	return 0; +} + +int32_t +pump_getxattr (call_frame_t *frame, xlator_t *this, +	      loc_t *loc, const char *name, dict_t *xdata) +{ +	afr_private_t *   priv       = NULL; +	xlator_t **       children   = NULL; +	int               call_child = 0; +	afr_local_t       *local     = NULL; +	int32_t           ret     = -1; +	int32_t           
op_errno   = 0; +        uint64_t          read_child = 0; + + +	VALIDATE_OR_GOTO (frame, out); +	VALIDATE_OR_GOTO (this, out); +	VALIDATE_OR_GOTO (this->private, out); + +	priv     = this->private; +	VALIDATE_OR_GOTO (priv->children, out); + +	children = priv->children; +        if (!priv->use_afr_in_pump) { +                STACK_WIND (frame, default_getxattr_cbk, +                            FIRST_CHILD (this), +                            (FIRST_CHILD (this))->fops->getxattr, +                            loc, name, xdata); +                return 0; +        } + + +	AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); +	local = frame->local; + +        ret = afr_local_init (local, priv, &op_errno); +        if (ret < 0) +                goto out; + +        if (name) { +                if (!strncmp (name, AFR_XATTR_PREFIX, +                              strlen (AFR_XATTR_PREFIX))) { + +                        op_errno = ENODATA; +                        goto out; +                } + +                if (!strcmp (name, RB_PUMP_CMD_STATUS)) { +                        gf_log (this->name, GF_LOG_DEBUG, +                                "Hit pump command - status"); +                        pump_execute_status (frame, this); +                        ret = 0; +                        goto out; +                } +        } + +        local->fresh_children = GF_CALLOC (priv->child_count, +                                          sizeof (*local->fresh_children), +                                          gf_afr_mt_int32_t); +        if (!local->fresh_children) { +                ret = -1; +                op_errno = ENOMEM; +                goto out; +        } + +        read_child = afr_inode_get_read_ctx (this, loc->inode, local->fresh_children); +        ret = afr_get_call_child (this, local->child_up, read_child, +                                     local->fresh_children, +                                     &call_child, +                                     
&local->cont.getxattr.last_index); +        if (ret < 0) { +                op_errno = -ret; +                goto out; +        } +	loc_copy (&local->loc, loc); +	if (name) +	  local->cont.getxattr.name       = gf_strdup (name); + +	STACK_WIND_COOKIE (frame, pump_getxattr_cbk, +			   (void *) (long) call_child, +			   children[call_child], children[call_child]->fops->getxattr, +			   loc, name, xdata); + +	ret = 0; +out: +	if (ret < 0) +		AFR_STACK_UNWIND (getxattr, frame, -1, op_errno, NULL, NULL); +	return 0; +} + +static int +afr_setxattr_unwind (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t *   local = NULL; +	call_frame_t   *main_frame = NULL; + +	local = frame->local; + +	LOCK (&frame->lock); +	{ +		if (local->transaction.main_frame) +			main_frame = local->transaction.main_frame; +		local->transaction.main_frame = NULL; +	} +	UNLOCK (&frame->lock); + +	if (main_frame) { +		AFR_STACK_UNWIND (setxattr, main_frame, +                                  local->op_ret, local->op_errno, NULL); +	} +	return 0; +} + +static int +afr_setxattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +		       int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ +	afr_local_t *   local = NULL; +	afr_private_t * priv  = NULL; + +	int call_count  = -1; +	int need_unwind = 0; + +	local = frame->local; +	priv = this->private; + +	LOCK (&frame->lock); +	{ +		if (op_ret != -1) { +			if (local->success_count == 0) { +				local->op_ret = op_ret; +			} +			local->success_count++; + +			if (local->success_count == priv->child_count) { +				need_unwind = 1; +			} +		} + +		local->op_errno = op_errno; +	} +	UNLOCK (&frame->lock); + +	if (need_unwind) +		local->transaction.unwind (frame, this); + +	call_count = afr_frame_return (frame); + +	if (call_count == 0) { +		local->transaction.resume (frame, this); +	} + +	return 0; +} + +static int +afr_setxattr_wind (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t *local = NULL; +	afr_private_t *priv = NULL; + +	int 
call_count = -1; +	int i = 0; + +	local = frame->local; +	priv = this->private; + +	call_count = afr_up_children_count (local->child_up, priv->child_count); + +	if (call_count == 0) { +		local->transaction.resume (frame, this); +		return 0; +	} + +	local->call_count = call_count; + +	for (i = 0; i < priv->child_count; i++) { +		if (local->child_up[i]) { +			STACK_WIND_COOKIE (frame, afr_setxattr_wind_cbk, +					   (void *) (long) i, +					   priv->children[i], +					   priv->children[i]->fops->setxattr, +					   &local->loc, +					   local->cont.setxattr.dict, +					   local->cont.setxattr.flags, NULL); + +			if (!--call_count) +				break; +		} +	} + +	return 0; +} + + +static int +afr_setxattr_done (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t * local = frame->local; + +	local->transaction.unwind (frame, this); + +	AFR_STACK_DESTROY (frame); + +	return 0; +} + +int32_t +pump_setxattr_cbk (call_frame_t *frame, +		      void *cookie, +		      xlator_t *this, +		      int32_t op_ret, +		      int32_t op_errno, dict_t *xdata) +{ +	AFR_STACK_UNWIND (setxattr, frame, op_ret, op_errno, xdata); +	return 0; +} + +int +pump_command_reply (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t *local = NULL; + +        local = frame->local; + +        if (local->op_ret < 0) +                gf_log (this->name, GF_LOG_INFO, +                        "Command failed"); +        else +                gf_log (this->name, GF_LOG_INFO, +                        "Command succeeded"); + +        AFR_STACK_UNWIND (setxattr, +                          frame, +                          local->op_ret, +                          local->op_errno, NULL); + +        return 0; +} + +int +pump_parse_command (call_frame_t *frame, xlator_t *this, +                    afr_local_t *local, dict_t *dict) +{ + +        int ret = -1; + +        if (pump_command_start (this, dict)) { +                frame->local = local; +                local->dict = dict_ref (dict); +                
ret = pump_execute_start (frame, this); + +        } else if (pump_command_pause (this, dict)) { +                frame->local = local; +                local->dict = dict_ref (dict); +                ret = pump_execute_pause (frame, this); + +        } else if (pump_command_abort (this, dict)) { +                frame->local = local; +                local->dict = dict_ref (dict); +                ret = pump_execute_abort (frame, this); + +        } else if (pump_command_commit (this, dict)) { +                frame->local = local; +                local->dict = dict_ref (dict); +                ret = pump_execute_commit (frame, this); +        } +        return ret; +} + +int +pump_setxattr (call_frame_t *frame, xlator_t *this, +               loc_t *loc, dict_t *dict, int32_t flags, dict_t *xdata) +{ +	afr_private_t * priv  = NULL; +	afr_local_t   * local = NULL; +	call_frame_t   *transaction_frame = NULL; +	int ret = -1; +	int op_errno = 0; + +	VALIDATE_OR_GOTO (frame, out); +	VALIDATE_OR_GOTO (this, out); +	VALIDATE_OR_GOTO (this->private, out); + +        GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.pump*", dict, +                                   op_errno, out); + +	priv = this->private; +        if (!priv->use_afr_in_pump) { +                STACK_WIND (frame, default_setxattr_cbk, +                            FIRST_CHILD (this), +                            (FIRST_CHILD (this))->fops->setxattr, +                            loc, dict, flags, xdata); +                return 0; +        } + + +	AFR_LOCAL_ALLOC_OR_GOTO (local, out); + +	ret = afr_local_init (local, priv, &op_errno); +	if (ret < 0) { +                afr_local_cleanup (local, this); +                mem_put (local); +		goto out; +        } + +        ret = pump_parse_command (frame, this, +                                  local, dict); +        if (ret >= 0) { +                ret = 0; +                goto out; +        } + +	transaction_frame = copy_frame (frame); +	if (!transaction_frame) 
{ +		gf_log (this->name, GF_LOG_ERROR, +			"Out of memory."); +                op_errno = ENOMEM; +                ret = -1; +                afr_local_cleanup (local, this); +		goto out; +	} + +	transaction_frame->local = local; + +	local->op_ret = -1; + +	local->cont.setxattr.dict  = dict_ref (dict); +	local->cont.setxattr.flags = flags; + +	local->transaction.fop    = afr_setxattr_wind; +	local->transaction.done   = afr_setxattr_done; +	local->transaction.unwind = afr_setxattr_unwind; + +	loc_copy (&local->loc, loc); + +	local->transaction.main_frame = frame; +	local->transaction.start   = LLONG_MAX - 1; +	local->transaction.len     = 0; + +	afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + +	ret = 0; +out: +	if (ret < 0) { +		if (transaction_frame) +			AFR_STACK_DESTROY (transaction_frame); +		AFR_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL); +	} + +	return 0; +} + +/* Defaults */ +static int32_t +pump_lookup (call_frame_t *frame, +             xlator_t *this, +             loc_t *loc, +             dict_t *xattr_req) +{ +	afr_private_t *priv  = NULL; +	priv = this->private; +        if (!priv->use_afr_in_pump) { +                STACK_WIND (frame, +                            default_lookup_cbk, +                            FIRST_CHILD(this), +                            FIRST_CHILD(this)->fops->lookup, +                            loc, +                            xattr_req); +                return 0; +        } + +        afr_lookup (frame, this, loc, xattr_req); +        return 0; +} + + +static int32_t +pump_truncate (call_frame_t *frame, +               xlator_t *this, +               loc_t *loc, +               off_t offset, dict_t *xdata) +{ +        afr_private_t *priv  = NULL; +	priv = this->private; +        if (!priv->use_afr_in_pump) { +                STACK_WIND (frame, +                            default_truncate_cbk, +                            FIRST_CHILD(this), +                            
FIRST_CHILD(this)->fops->truncate, +                            loc, +                            offset, xdata); +                return 0; +        } + +        afr_truncate (frame, this, loc, offset, xdata); +        return 0; +} + + +static int32_t +pump_ftruncate (call_frame_t *frame, +                xlator_t *this, +                fd_t *fd, +                off_t offset, dict_t *xdata) +{ +        afr_private_t *priv  = NULL; +	priv = this->private; +        if (!priv->use_afr_in_pump) { +                STACK_WIND (frame, +                            default_ftruncate_cbk, +                            FIRST_CHILD(this), +                            FIRST_CHILD(this)->fops->ftruncate, +                            fd, +                            offset, xdata); +                return 0; +        } + +        afr_ftruncate (frame, this, fd, offset, xdata); +        return 0; +} + + + + +int +pump_mknod (call_frame_t *frame, xlator_t *this, +            loc_t *loc, mode_t mode, dev_t rdev, mode_t umask, dict_t *xdata) +{ +        afr_private_t *priv  = NULL; +	priv = this->private; +        if (!priv->use_afr_in_pump) { +                STACK_WIND (frame, default_mknod_cbk, +                            FIRST_CHILD(this), +                            FIRST_CHILD(this)->fops->mknod, +                            loc, mode, rdev, umask, xdata); +                return 0; +        } +        afr_mknod (frame, this, loc, mode, rdev, umask, xdata); +        return 0; + +} + + + +int +pump_mkdir (call_frame_t *frame, xlator_t *this, +            loc_t *loc, mode_t mode, mode_t umask, dict_t *xdata) +{ +        afr_private_t *priv  = NULL; +	priv = this->private; +        if (!priv->use_afr_in_pump) { +                STACK_WIND (frame, default_mkdir_cbk, +                            FIRST_CHILD(this), +                            FIRST_CHILD(this)->fops->mkdir, +                            loc, mode, umask, xdata); +                return 0; +        } +        
afr_mkdir (frame, this, loc, mode, umask, xdata); +        return 0; + +} + + +static int32_t +pump_unlink (call_frame_t *frame, +             xlator_t *this, +             loc_t *loc, int xflag, dict_t *xdata) +{ +        afr_private_t *priv  = NULL; +	priv = this->private; +        if (!priv->use_afr_in_pump) { +                STACK_WIND (frame, +                            default_unlink_cbk, +                            FIRST_CHILD(this), +                            FIRST_CHILD(this)->fops->unlink, +                            loc, xflag, xdata); +                return 0; +        } +        afr_unlink (frame, this, loc, xflag, xdata); +        return 0; + +} + + +static int +pump_rmdir (call_frame_t *frame, xlator_t *this, +            loc_t *loc, int flags, dict_t *xdata) +{ +        afr_private_t *priv  = NULL; + +	priv = this->private; + +        if (!priv->use_afr_in_pump) { +                STACK_WIND (frame, default_rmdir_cbk, +                            FIRST_CHILD(this), +                            FIRST_CHILD(this)->fops->rmdir, +                            loc, flags, xdata); +                return 0; +        } + +        afr_rmdir (frame, this, loc, flags, xdata); +        return 0; + +} + + + +int +pump_symlink (call_frame_t *frame, xlator_t *this, +              const char *linkpath, loc_t *loc, mode_t umask, dict_t *xdata) +{ +        afr_private_t *priv  = NULL; +	priv = this->private; +        if (!priv->use_afr_in_pump) { +                STACK_WIND (frame, default_symlink_cbk, +                            FIRST_CHILD(this), +                            FIRST_CHILD(this)->fops->symlink, +                            linkpath, loc, umask, xdata); +                return 0; +        } +        afr_symlink (frame, this, linkpath, loc, umask, xdata); +        return 0; + +} + + +static int32_t +pump_rename (call_frame_t *frame, +             xlator_t *this, +             loc_t *oldloc, +             loc_t *newloc, dict_t *xdata) +{ +        
afr_private_t *priv  = NULL; +	priv = this->private; +        if (!priv->use_afr_in_pump) { +                STACK_WIND (frame, +                            default_rename_cbk, +                            FIRST_CHILD(this), +                            FIRST_CHILD(this)->fops->rename, +                            oldloc, newloc, xdata); +                return 0; +        } +        afr_rename (frame, this, oldloc, newloc, xdata); +        return 0; + +} + + +static int32_t +pump_link (call_frame_t *frame, +           xlator_t *this, +           loc_t *oldloc, +           loc_t *newloc, dict_t *xdata) +{ +        afr_private_t *priv  = NULL; +	priv = this->private; +        if (!priv->use_afr_in_pump) { +                STACK_WIND (frame, +                            default_link_cbk, +                            FIRST_CHILD(this), +                            FIRST_CHILD(this)->fops->link, +                            oldloc, newloc, xdata); +                return 0; +        } +        afr_link (frame, this, oldloc, newloc, xdata); +        return 0; + +} + + +static int32_t +pump_create (call_frame_t *frame, xlator_t *this, +             loc_t *loc, int32_t flags, mode_t mode, +             mode_t umask, fd_t *fd, dict_t *xdata) +{ +        afr_private_t *priv  = NULL; +	priv = this->private; +        if (!priv->use_afr_in_pump) { +                STACK_WIND (frame, default_create_cbk, +                            FIRST_CHILD(this), +                            FIRST_CHILD(this)->fops->create, +                            loc, flags, mode, umask, fd, xdata); +                return 0; +        } +        afr_create (frame, this, loc, flags, mode, umask, fd, xdata); +        return 0; + +} + + +static int32_t +pump_open (call_frame_t *frame, +           xlator_t *this, +           loc_t *loc, +           int32_t flags, fd_t *fd, dict_t *xdata) +{ +        afr_private_t *priv  = NULL; +	priv = this->private; +        if (!priv->use_afr_in_pump) { +               
 STACK_WIND (frame, +                            default_open_cbk, +                            FIRST_CHILD(this), +                            FIRST_CHILD(this)->fops->open, +                            loc, flags, fd, xdata); +                return 0; +        } +        afr_open (frame, this, loc, flags, fd, xdata); +        return 0; + +} + + +static int32_t +pump_writev (call_frame_t *frame, +             xlator_t *this, +             fd_t *fd, +             struct iovec *vector, +             int32_t count, +             off_t off, uint32_t flags, +             struct iobref *iobref, dict_t *xdata) +{ +        afr_private_t *priv  = NULL; +	priv = this->private; +        if (!priv->use_afr_in_pump) { +                STACK_WIND (frame, +                            default_writev_cbk, +                            FIRST_CHILD(this), +                            FIRST_CHILD(this)->fops->writev, +                            fd, +                            vector, +                            count, +                            off, flags, +                            iobref, xdata); +                return 0; +        } + +        afr_writev (frame, this, fd, vector, count, off, flags, iobref, xdata); +        return 0; +} + + +static int32_t +pump_flush (call_frame_t *frame, +            xlator_t *this, +            fd_t *fd, dict_t *xdata) +{ +        afr_private_t *priv  = NULL; +	priv = this->private; +        if (!priv->use_afr_in_pump) { +                STACK_WIND (frame, +                            default_flush_cbk, +                            FIRST_CHILD(this), +                            FIRST_CHILD(this)->fops->flush, +                            fd, xdata); +                return 0; +        } +        afr_flush (frame, this, fd, xdata); +        return 0; + +} + + +static int32_t +pump_fsync (call_frame_t *frame, +            xlator_t *this, +            fd_t *fd, +            int32_t flags, dict_t *xdata) +{ +        afr_private_t *priv  = 
NULL; +	priv = this->private; +        if (!priv->use_afr_in_pump) { +                STACK_WIND (frame, +                            default_fsync_cbk, +                            FIRST_CHILD(this), +                            FIRST_CHILD(this)->fops->fsync, +                            fd, +                            flags, xdata); +                return 0; +        } +        afr_fsync (frame, this, fd, flags, xdata); +        return 0; + +} + + +static int32_t +pump_opendir (call_frame_t *frame, +              xlator_t *this, +              loc_t *loc, fd_t *fd, dict_t *xdata) +{ +        afr_private_t *priv  = NULL; +	priv = this->private; +        if (!priv->use_afr_in_pump) { +                STACK_WIND (frame, +                            default_opendir_cbk, +                            FIRST_CHILD(this), +                            FIRST_CHILD(this)->fops->opendir, +                            loc, fd, xdata); +                return 0; +        } +        afr_opendir (frame, this, loc, fd, xdata); +        return 0; + +} + + +static int32_t +pump_fsyncdir (call_frame_t *frame, +               xlator_t *this, +               fd_t *fd, +               int32_t flags, dict_t *xdata) +{ +        afr_private_t *priv  = NULL; +	priv = this->private; +        if (!priv->use_afr_in_pump) { +                STACK_WIND (frame, +                            default_fsyncdir_cbk, +                            FIRST_CHILD(this), +                            FIRST_CHILD(this)->fops->fsyncdir, +                            fd, +                            flags, xdata); +                return 0; +        } +        afr_fsyncdir (frame, this, fd, flags, xdata); +        return 0; + +} + + +static int32_t +pump_xattrop (call_frame_t *frame, +              xlator_t *this, +              loc_t *loc, +              gf_xattrop_flags_t flags, +              dict_t *dict, dict_t *xdata) +{ +        afr_private_t *priv  = NULL; +	priv = this->private; +        if 
(!priv->use_afr_in_pump) { +                STACK_WIND (frame, +                            default_xattrop_cbk, +                            FIRST_CHILD(this), +                            FIRST_CHILD(this)->fops->xattrop, +                            loc, +                            flags, +                            dict, xdata); +                return 0; +        } +        afr_xattrop (frame, this, loc, flags, dict, xdata); +        return 0; + +} + +static int32_t +pump_fxattrop (call_frame_t *frame, +               xlator_t *this, +               fd_t *fd, +               gf_xattrop_flags_t flags, +               dict_t *dict, dict_t *xdata) +{ +        afr_private_t *priv  = NULL; +	priv = this->private; +        if (!priv->use_afr_in_pump) { +                STACK_WIND (frame, +                            default_fxattrop_cbk, +                            FIRST_CHILD(this), +                            FIRST_CHILD(this)->fops->fxattrop, +                            fd, +                            flags, +                            dict, xdata); +                return 0; +        } +        afr_fxattrop (frame, this, fd, flags, dict, xdata); +        return 0; + +} + + +static int32_t +pump_removexattr (call_frame_t *frame, +                  xlator_t *this, +                  loc_t *loc, +                  const char *name, dict_t *xdata) +{ +        afr_private_t *priv     = NULL; +        int            op_errno = -1; + +        VALIDATE_OR_GOTO (this, out); + +        GF_IF_NATIVE_XATTR_GOTO ("trusted.glusterfs.pump*", +                                 name, op_errno, out); + +        op_errno = 0; +	priv = this->private; +        if (!priv->use_afr_in_pump) { +                STACK_WIND (frame, +                            default_removexattr_cbk, +                            FIRST_CHILD(this), +                            FIRST_CHILD(this)->fops->removexattr, +                            loc, +                            name, xdata); +          
      return 0; +        } +        afr_removexattr (frame, this, loc, name, xdata); + + out: +        if (op_errno) +                AFR_STACK_UNWIND (removexattr, frame, -1, op_errno, NULL); +        return 0; + +} + + + +static int32_t +pump_readdir (call_frame_t *frame, +              xlator_t *this, +              fd_t *fd, +              size_t size, +              off_t off, dict_t *xdata) +{ +        afr_private_t *priv  = NULL; +	priv = this->private; +        if (!priv->use_afr_in_pump) { +                STACK_WIND (frame, +                            default_readdir_cbk, +                            FIRST_CHILD(this), +                            FIRST_CHILD(this)->fops->readdir, +                            fd, size, off, xdata); +                return 0; +        } +        afr_readdir (frame, this, fd, size, off, xdata); +        return 0; + +} + + +static int32_t +pump_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, +               size_t size, off_t off, dict_t *dict) +{ +        afr_private_t *priv  = NULL; +	priv = this->private; +        if (!priv->use_afr_in_pump) { +                STACK_WIND (frame, +                            default_readdirp_cbk, +                            FIRST_CHILD(this), +                            FIRST_CHILD(this)->fops->readdirp, +                            fd, size, off, dict); +                return 0; +        } +        afr_readdirp (frame, this, fd, size, off, dict); +        return 0; + +} + + + +static int32_t +pump_releasedir (xlator_t *this, +                 fd_t *fd) +{ +        afr_private_t *priv  = NULL; +	priv = this->private; +        if (priv->use_afr_in_pump) +                afr_releasedir (this, fd); +	return 0; + +} + +static int32_t +pump_release (xlator_t *this, +              fd_t *fd) +{ +        afr_private_t *priv  = NULL; +	priv = this->private; +        if (priv->use_afr_in_pump) +                afr_release (this, fd); +	return 0; + +} + +static int32_t +pump_forget 
(xlator_t *this, inode_t *inode) +{ +        afr_private_t  *priv  = NULL; + +        priv = this->private; +        if (priv->use_afr_in_pump) +                afr_forget (this, inode); + +        return 0; +} + +static int32_t +pump_setattr (call_frame_t *frame, +              xlator_t *this, +              loc_t *loc, +              struct iatt *stbuf, +              int32_t valid, dict_t *xdata) +{ +        afr_private_t *priv  = NULL; +	priv = this->private; +        if (!priv->use_afr_in_pump) { +                STACK_WIND (frame, +                            default_setattr_cbk, +                            FIRST_CHILD (this), +                            FIRST_CHILD (this)->fops->setattr, +                            loc, stbuf, valid, xdata); +                return 0; +        } +        afr_setattr (frame, this, loc, stbuf, valid, xdata); +        return 0; + +} + + +static int32_t +pump_fsetattr (call_frame_t *frame, +               xlator_t *this, +               fd_t *fd, +               struct iatt *stbuf, +               int32_t valid, dict_t *xdata) +{ +        afr_private_t *priv  = NULL; +	priv = this->private; +        if (!priv->use_afr_in_pump) { +                STACK_WIND (frame, +                            default_fsetattr_cbk, +                            FIRST_CHILD (this), +                            FIRST_CHILD (this)->fops->fsetattr, +                            fd, stbuf, valid, xdata); +                return 0; +        } +        afr_fsetattr (frame, this, fd, stbuf, valid, xdata); +        return 0; + +} + + +/* End of defaults */ + + +int32_t +mem_acct_init (xlator_t *this) +{ +        int     ret = -1; + +        if (!this) +                return ret; + +        ret = xlator_mem_acct_init (this, gf_afr_mt_end + 1); + +        if (ret != 0) { +                gf_log(this->name, GF_LOG_ERROR, "Memory accounting init" +                                "failed"); +                return ret; +        } + +        return ret; +} + 
+static int +is_xlator_pump_sink (xlator_t *child) +{ +        return (child == PUMP_SINK_CHILD(THIS)); +} + +static int +is_xlator_pump_source (xlator_t *child) +{ +        return (child == PUMP_SOURCE_CHILD(THIS)); +} + +int32_t +notify (xlator_t *this, int32_t event, +	void *data, ...) +{ +        int ret = -1; +        xlator_t *child_xl = NULL; + +        child_xl = (xlator_t *) data; + +        ret = afr_notify (this, event, data, NULL); + +	switch (event) { +	case GF_EVENT_CHILD_DOWN: +                if (is_xlator_pump_source (child_xl)) +                        pump_change_state (this, PUMP_STATE_ABORT); +                break; + +        case GF_EVENT_CHILD_UP: +                if (is_xlator_pump_sink (child_xl)) +                        if (is_pump_start_pending (this)) { +                                gf_log (this->name, GF_LOG_DEBUG, +                                        "about to start synctask"); +                                ret = pump_start_synctask (this); +                                if (ret < 0) +                                        gf_log (this->name, GF_LOG_DEBUG, +                                                "Could not start pump " +                                                "synctask"); +                                else +                                        pump_remove_start_pending (this); +                        } +        } + +        return ret; +} + +int32_t +init (xlator_t *this) +{ +	afr_private_t * priv        = NULL; +        pump_private_t *pump_priv   = NULL; +	int             child_count = 0; +	xlator_list_t * trav        = NULL; +	int             i           = 0; +	int             ret         = -1; +	GF_UNUSED int   op_errno    = 0; + +        int source_child = 0; + +	if (!this->children) { +		gf_log (this->name, GF_LOG_ERROR, +			"pump translator needs a source and sink" +                        "subvolumes defined."); +		return -1; +	} + +	if (!this->parents) { +		gf_log (this->name, 
GF_LOG_WARNING, +			"Volume is dangling."); +	} + +	priv = GF_CALLOC (1, sizeof (afr_private_t), gf_afr_mt_afr_private_t); +        if (!priv) +                goto out; + +        LOCK_INIT (&priv->lock); +        LOCK_INIT (&priv->read_child_lock); +        //lock recovery is not done in afr +        pthread_mutex_init (&priv->mutex, NULL); +        INIT_LIST_HEAD (&priv->saved_fds); + +        child_count = xlator_subvolume_count (this); +        if (child_count != 2) { +                gf_log (this->name, GF_LOG_ERROR, +                        "There should be exactly 2 children - one source " +                        "and one sink"); +                return -1; +        } +	priv->child_count = child_count; + +        priv->read_child = source_child; +        priv->favorite_child = source_child; +        priv->background_self_heal_count = 0; + +	priv->data_self_heal     = "on"; +	priv->metadata_self_heal = 1; +	priv->entry_self_heal    = 1; + +        priv->data_self_heal_window_size = 16; + +	priv->data_change_log     = 1; +	priv->metadata_change_log = 1; +	priv->entry_change_log    = 1; +        priv->use_afr_in_pump = 1; +        priv->sh_readdir_size = 65536; + +	/* Locking options */ + +        /* Lock server count infact does not matter. Locks are held +           on all subvolumes, in this case being the source +           and the sink. 
+        */ + +	priv->strict_readdir = _gf_false; +	priv->wait_count = 1; +	priv->child_up = GF_CALLOC (sizeof (unsigned char), child_count, +                                 gf_afr_mt_char); +	if (!priv->child_up) { +		gf_log (this->name, GF_LOG_ERROR, +			"Out of memory."); +		op_errno = ENOMEM; +		goto out; +	} + +	priv->children = GF_CALLOC (sizeof (xlator_t *), child_count, +                                 gf_afr_mt_xlator_t); +	if (!priv->children) { +		gf_log (this->name, GF_LOG_ERROR, +			"Out of memory."); +		op_errno = ENOMEM; +		goto out; +	} + +        priv->pending_key = GF_CALLOC (sizeof (*priv->pending_key), +                                       child_count, +                                       gf_afr_mt_char); +        if (!priv->pending_key) { +                gf_log (this->name, GF_LOG_ERROR, +                        "Out of memory."); +                op_errno = ENOMEM; +                goto out; +        } + +	trav = this->children; +	i = 0; +	while (i < child_count) { +		priv->children[i] = trav->xlator; + +                ret = gf_asprintf (&priv->pending_key[i], "%s.%s", +                                   AFR_XATTR_PREFIX, +                                   trav->xlator->name); +                if (-1 == ret) { +                        gf_log (this->name, GF_LOG_ERROR, +                                "asprintf failed to set pending key"); +                        op_errno = ENOMEM; +                        goto out; +                } + +		trav = trav->next; +		i++; +	} + +        ret = gf_asprintf (&priv->sh_domain, "%s-self-heal", this->name); +        if (-1 == ret) { +                op_errno = ENOMEM; +                goto out; +        } + +        priv->first_lookup = 1; +        priv->root_inode = NULL; + +        priv->last_event = GF_CALLOC (child_count, sizeof (*priv->last_event), +                                      gf_afr_mt_int32_t); +        if (!priv->last_event) { +                ret = -ENOMEM; +                
goto out; +        } + +	pump_priv = GF_CALLOC (1, sizeof (*pump_priv), +                            gf_afr_mt_pump_priv); +	if (!pump_priv) { +		gf_log (this->name, GF_LOG_ERROR, +			"Out of memory"); +                op_errno = ENOMEM; +		goto out; +	} + +        LOCK_INIT (&pump_priv->resume_path_lock); +        LOCK_INIT (&pump_priv->pump_state_lock); + +        pump_priv->resume_path = GF_CALLOC (1, PATH_MAX, +                                            gf_afr_mt_char); +        if (!pump_priv->resume_path) { +                gf_log (this->name, GF_LOG_ERROR, "Out of memory"); +                ret = -1; +                goto out; +        } + +	pump_priv->env = this->ctx->env; +        if (!pump_priv->env) { +                gf_log (this->name, GF_LOG_ERROR, +                        "Could not create new sync-environment"); +                ret = -1; +                goto out; +        } + +        /* keep more local here as we may need them for self-heal etc */ +        this->local_pool = mem_pool_new (afr_local_t, 128); +        if (!this->local_pool) { +                ret = -1; +                gf_log (this->name, GF_LOG_ERROR, +                        "failed to create local_t's memory pool"); +                goto out; +        } + +	priv->pump_private = pump_priv; +        pump_priv = NULL; + +        this->private = priv; +        priv = NULL; + +        pump_change_state (this, PUMP_STATE_ABORT); + +	ret = 0; +out: + +        if (pump_priv) { +                GF_FREE (pump_priv->resume_path); +                LOCK_DESTROY (&pump_priv->resume_path_lock); +                LOCK_DESTROY (&pump_priv->pump_state_lock); +                GF_FREE (pump_priv); +        } + +        if (priv) { +                GF_FREE (priv->child_up); +                GF_FREE (priv->children); +                GF_FREE (priv->pending_key); +                GF_FREE (priv->last_event); +                LOCK_DESTROY (&priv->lock); +                LOCK_DESTROY 
(&priv->read_child_lock); +                GF_FREE (priv); +        } + +	return ret; +} + +int +fini (xlator_t *this) +{ +        afr_private_t * priv        = NULL; +        pump_private_t *pump_priv = NULL; + +        priv      = this->private; +        this->private = NULL; +        if (!priv) +                goto out; + +        pump_priv = priv->pump_private; +        if (!pump_priv) +                goto afr_priv; + +        GF_FREE (pump_priv->resume_path); +        LOCK_DESTROY (&pump_priv->resume_path_lock); +        LOCK_DESTROY (&pump_priv->pump_state_lock); +        GF_FREE (pump_priv); +afr_priv: +        afr_priv_destroy (priv); +out: +	return 0; +} + + +struct xlator_fops fops = { +	.lookup      = pump_lookup, +	.open        = pump_open, +	.flush       = pump_flush, +	.fsync       = pump_fsync, +	.fsyncdir    = pump_fsyncdir, +	.xattrop     = pump_xattrop, +	.fxattrop    = pump_fxattrop, +        .getxattr    = pump_getxattr, + +	/* inode write */ +	.writev      = pump_writev, +	.truncate    = pump_truncate, +	.ftruncate   = pump_ftruncate, +	.setxattr    = pump_setxattr, +        .setattr     = pump_setattr, +	.fsetattr    = pump_fsetattr, +	.removexattr = pump_removexattr, + +	/* dir read */ +	.opendir     = pump_opendir, +	.readdir     = pump_readdir, +	.readdirp    = pump_readdirp, + +	/* dir write */ +	.create      = pump_create, +	.mknod       = pump_mknod, +	.mkdir       = pump_mkdir, +	.unlink      = pump_unlink, +	.rmdir       = pump_rmdir, +	.link        = pump_link, +	.symlink     = pump_symlink, +	.rename      = pump_rename, +}; + +struct xlator_dumpops dumpops = { +        .priv       = afr_priv_dump, +}; + + +struct xlator_cbks cbks = { +	.release     = pump_release, +	.releasedir  = pump_releasedir, +        .forget      = pump_forget, +}; + +struct volume_options options[] = { +	{ .key  = {NULL} }, +}; diff --git a/xlators/cluster/afr-v1/src/pump.h b/xlators/cluster/afr-v1/src/pump.h new file mode 100644 index 000000000..bc4c31a78 
--- /dev/null +++ b/xlators/cluster/afr-v1/src/pump.h @@ -0,0 +1,78 @@ +/* +  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> +  This file is part of GlusterFS. + +  This file is licensed to you under your choice of the GNU Lesser +  General Public License, version 3 or any later version (LGPLv3 or +  later), or the GNU General Public License, version 2 (GPLv2), in all +  cases as published by the Free Software Foundation. +*/ + +#ifndef __PUMP_H__ +#define __PUMP_H__ + +#include "syncop.h" + +/* FIXME: Needs to be defined in a common file */ +#define CLIENT_CMD_CONNECT "trusted.glusterfs.client-connect" +#define CLIENT_CMD_DISCONNECT "trusted.glusterfs.client-disconnect" + +#define PUMP_SOURCE_COMPLETE "trusted.glusterfs.pump-source-complete" +#define PUMP_SINK_COMPLETE "trusted.glusterfs.pump-sink-complete" + +#define PUMP_PATH "trusted.glusterfs.pump-path" + +#define PUMP_SOURCE_CHILD(xl) (xl->children->xlator) +#define PUMP_SINK_CHILD(xl) (xl->children->next->xlator) + +typedef enum { +        PUMP_STATE_RUNNING,             /* Pump is running and migrating files */ +        PUMP_STATE_RESUME,              /* Pump is resuming from a previous pause */ +        PUMP_STATE_PAUSE,               /* Pump is paused */ +        PUMP_STATE_ABORT,               /* Pump is aborted */ +        PUMP_STATE_COMMIT,              /* Pump is commited */ +} pump_state_t; + +typedef struct _pump_private { +	struct syncenv *env;            /* The env pointer to the pump synctask */ +        char *resume_path;              /* path to resume from the last pause */ +        gf_lock_t resume_path_lock;     /* Synchronize resume_path changes */ +        gf_lock_t pump_state_lock;      /* Synchronize pump_state changes */ +        pump_state_t pump_state;        /* State of pump */ +        char current_file[PATH_MAX];    /* Current file being pumped */ +        uint64_t number_files_pumped;   /* Number of files pumped */ +        gf_boolean_t pump_finished;     /* Boolean 
to indicate pump termination */ +        char pump_start_pending;        /* Boolean to mark start pending until +                                           CHILD_UP */ +        call_stub_t *cleaner; +} pump_private_t; + +void +build_root_loc (inode_t *inode, loc_t *loc); +int pump_start (call_frame_t *frame, xlator_t *this); + +gf_boolean_t +pump_command_start (xlator_t *this, dict_t *dict); + +int +pump_execute_start (call_frame_t *frame, xlator_t *this); + +gf_boolean_t +pump_command_pause (xlator_t *this, dict_t *dict); + +int +pump_execute_pause (call_frame_t *frame, xlator_t *this); + +gf_boolean_t +pump_command_abort (xlator_t *this, dict_t *dict); + +int +pump_execute_abort (call_frame_t *frame, xlator_t *this); + +gf_boolean_t +pump_command_status (xlator_t *this, dict_t *dict); + +int +pump_execute_status (call_frame_t *frame, xlator_t *this); + +#endif /* __PUMP_H__ */ diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c index 3868fc38f..3055f4615 100644 --- a/xlators/cluster/dht/src/dht-common.c +++ b/xlators/cluster/dht/src/dht-common.c @@ -3120,7 +3120,7 @@ dht_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,                  /* making sure we set the inode ctx right with layout,                     currently possible only for non-directories, so for                     directories don't set entry inodes */ -                if (!IA_ISDIR(entry->d_stat.ia_type) && orig_entry->inode) { +                if (!IA_ISDIR(entry->d_stat.ia_type)) {                          ret = dht_layout_preset (this, prev->this,                                                   orig_entry->inode);                          if (ret) diff --git a/xlators/cluster/stripe/src/stripe.c b/xlators/cluster/stripe/src/stripe.c index 79e80b513..32d53e8e6 100644 --- a/xlators/cluster/stripe/src/stripe.c +++ b/xlators/cluster/stripe/src/stripe.c @@ -4886,7 +4886,7 @@ unlock:                  if (!local_entry)                  
        break; -                if (!IA_ISREG (local_entry->d_stat.ia_type) || !local_entry->inode) { +                if (!IA_ISREG (local_entry->d_stat.ia_type)) {                          LOCK (&frame->lock);                          {                                  local->wind_count--; diff --git a/xlators/features/index/src/index.c b/xlators/features/index/src/index.c index 5c1c65fbd..5edfeda8f 100644 --- a/xlators/features/index/src/index.c +++ b/xlators/features/index/src/index.c @@ -15,9 +15,11 @@  #include "index.h"  #include "options.h"  #include "glusterfs3-xdr.h" +#include "syncop.h"  #include "syscall.h"  #define XATTROP_SUBDIR "xattrop" +#define BASE_INDICES_HOLDER_SUBDIR "base_indices_holder"  call_stub_t *  __index_dequeue (struct list_head *callstubs) @@ -243,20 +245,40 @@ check_delete_stale_index_file (xlator_t *this, char *filename)  {          int             ret = 0;          struct stat     st = {0}; +        struct stat     base_index_st = {0};          char            filepath[PATH_MAX] = {0}; +        char            filepath_under_base_indices_holder[PATH_MAX] = {0};          index_priv_t    *priv = NULL;          priv = this->private; +        if (priv->to_be_healed_states != synced_state) +                return; +          make_file_path (priv->index_basepath, XATTROP_SUBDIR,                          filename, filepath, sizeof (filepath)); + +        make_file_path (priv->index_basepath, BASE_INDICES_HOLDER_SUBDIR, +                        filename, filepath_under_base_indices_holder, +                        sizeof (filepath_under_base_indices_holder)); + + +        ret = stat (filepath_under_base_indices_holder, &base_index_st); +        if (ret) { +                gf_log (THIS->name, GF_LOG_ERROR, "Base index is not created" +                        " under index/base_indices_holder"); +                return; +        } +          ret = stat (filepath, &st); -        if (!ret && st.st_nlink == 1) +        if (!ret && st.st_nlink 
== 2) {                  unlink (filepath); +                unlink (filepath_under_base_indices_holder); +        }  }  static int  index_fill_readdir (fd_t *fd, DIR *dir, off_t off, -                    size_t size, gf_dirent_t *entries) +                    size_t size, gf_dirent_t *entries, readdir_directory type)  {          off_t     in_case = -1;          size_t    filled = 0; @@ -299,7 +321,8 @@ index_fill_readdir (fd_t *fd, DIR *dir, off_t off,                  }                  if (!strncmp (entry->d_name, XATTROP_SUBDIR"-", -                              strlen (XATTROP_SUBDIR"-"))) { +                              strlen (XATTROP_SUBDIR"-")) && +                              (type == INDEX_XATTROP)) {                          check_delete_stale_index_file (this, entry->d_name);                          continue;                  } @@ -338,16 +361,192 @@ out:  }  int +sync_base_indices (void *index_priv) +{ +        index_priv_t    *priv = NULL; +        DIR             *dir_base_holder  = NULL; +        DIR             *xattrop_dir = NULL; +        struct dirent   *entry = NULL; +        char            base_indices_holder[PATH_MAX] = {0}; +        char            xattrop_directory[PATH_MAX] = {0}; +        char            base_index_path[PATH_MAX] = {0}; +        char            xattrop_index_path[PATH_MAX] = {0}; +        int32_t         op_errno = 0; +        int             ret = 0; + +        priv = index_priv; + +        snprintf (base_indices_holder, PATH_MAX, "%s/%s", priv->index_basepath, +                  BASE_INDICES_HOLDER_SUBDIR); +        snprintf (xattrop_directory, PATH_MAX, "%s/%s", priv->index_basepath, +                  XATTROP_SUBDIR); + +        if ((dir_base_holder = opendir(base_indices_holder)) == NULL) { +                op_errno = errno; +                ret = -1; +                goto out; +        } +        if ((xattrop_dir = opendir (xattrop_directory)) == NULL) { +                op_errno = errno; +                ret = 
-1; +                (void) closedir (dir_base_holder); +                goto out; +        } + +        priv->to_be_healed_states = sync_started; +        while ((entry = readdir(xattrop_dir)) != NULL) { +                if (!strcmp (entry->d_name, ".") || +                    !strcmp (entry->d_name, "..")) { +                        continue; +                } +                if (strncmp (entry->d_name, XATTROP_SUBDIR"-", +                    strlen (XATTROP_SUBDIR"-"))) { +                        continue; +                } +                if (!strncmp (entry->d_name, XATTROP_SUBDIR"-", +                    strlen (XATTROP_SUBDIR"-"))) { + +                    snprintf (xattrop_index_path, PATH_MAX, "%s/%s", +                              xattrop_directory, entry->d_name); + +                    snprintf (base_index_path, PATH_MAX, "%s/%s", +                              base_indices_holder, entry->d_name); + +                    ret = sys_link (xattrop_index_path, base_index_path); + +                    if (ret && errno != EEXIST) { +                        op_errno = errno; +                        (void) closedir (dir_base_holder); +                        (void) closedir (xattrop_dir); +                        goto out; +                    } + +                } +        } +        ret = closedir (xattrop_dir); +        if (ret) { +                op_errno = errno; +                (void) closedir (dir_base_holder); +                goto out; +        } +        ret = closedir (dir_base_holder); +        if (ret) { +                op_errno = errno; +                goto out; +        } + +        ret = 0; +out: +        errno = op_errno; +        return ret; + +} + +int +base_indices_syncing_done (int ret, call_frame_t *frame, void *data) +{ +        index_priv_t         *priv = NULL; +        priv = data; + +        if (!priv) +                goto out; + +        if (ret) { +                priv->to_be_healed_states = sync_not_started; +        } 
else { +                priv->to_be_healed_states = synced_state; +        } + +        STACK_DESTROY (frame->root); + +out: +        return 0; +} + +int +sync_base_indices_from_xattrop (xlator_t *this) +{ + +        index_priv_t          *priv = NULL; +        char                  base_indices_holder[PATH_MAX] =  {0}; +        int                   ret = 0; +        struct stat           st = {0}; +        DIR                   *dir = NULL; +        struct dirent         *entry = NULL; +        call_frame_t          *frame = NULL; + +        priv = this->private; + +        if (priv->to_be_healed_states != sync_not_started) { +                ret = -1; +                goto out; +        } + +        snprintf (base_indices_holder, PATH_MAX, "%s/%s", priv->index_basepath, +                  BASE_INDICES_HOLDER_SUBDIR); + +        ret = stat (base_indices_holder, &st); + +        if (ret && (errno != ENOENT)) { +                goto out; +        } else if (errno == ENOENT) { +                ret = index_dir_create (this, BASE_INDICES_HOLDER_SUBDIR); +                if (ret) +                        goto out; +        } else { +                if ((dir = opendir (base_indices_holder)) == NULL) { +                        ret = -1; +                        goto out; +                } +                while ((entry = readdir (dir)) != NULL) { +                        if (!strcmp (entry->d_name, ".") || +                            !strcmp (entry->d_name,"..")) { +                                continue; +                        } +                        ret = unlink (entry->d_name); +                        if (ret) { +                                closedir (dir); +                                goto out; +                        } +                } +                closedir (dir); +        } + +        /*At this point of time we have index/base_indicies_holder directory +         *is with no entries*/ + +        frame = create_frame (this, this->ctx->pool); + 
       if (!frame) { +                ret = -1; +                goto out; +        } +        set_lk_owner_from_ptr (&frame->root->lk_owner, frame->root); + +        frame->root->pid = LOW_PRIO_PROC_PID; + +        ret = synctask_new (this->ctx->env, sync_base_indices, +                            base_indices_syncing_done,frame, priv); + + + +out: +        return ret; + +} + +int  index_add (xlator_t *this, uuid_t gfid, const char *subdir)  {          int32_t           op_errno = 0;          char              gfid_path[PATH_MAX] = {0};          char              index_path[PATH_MAX] = {0}; +        char              base_path[PATH_MAX] = {0};          int               ret = 0;          uuid_t            index = {0};          index_priv_t      *priv = NULL;          struct stat       st = {0};          int               fd = 0; +        int               index_created = 0;          priv = this->private;          GF_ASSERT_AND_GOTO_WITH_ERROR (this->name, !uuid_is_null (gfid), @@ -362,12 +561,15 @@ index_add (xlator_t *this, uuid_t gfid, const char *subdir)          index_get_index (priv, index);          make_index_path (priv->index_basepath, subdir,                           index, index_path, sizeof (index_path)); +          ret = sys_link (index_path, gfid_path);          if (!ret || (errno == EEXIST))  {                  ret = 0; +                index_created = 1;                  goto out;          } +          op_errno = errno;          if (op_errno == ENOENT) {                  ret = index_dir_create (this, subdir); @@ -399,10 +601,36 @@ index_add (xlator_t *this, uuid_t gfid, const char *subdir)                          "add to index (%s)", uuid_utoa (gfid),                          strerror (errno));                  goto out; +        } else { +                index_created = 1; +        } + +        if (priv->to_be_healed_states != sync_not_started) { +                 make_index_path (priv->index_basepath, +                                  
GF_BASE_INDICES_HOLDER_GFID, +                                  index, base_path, sizeof (base_path)); +                 ret = sys_link (index_path, base_path); +                 if (ret) +                         goto out;          }          ret = 0;  out: +        /*If base_indices_holder is not created: create and sync +         *If directory is present: delete contents and start syncing +         *If syncing is in progress :No need to do any thing +         *If syncing is done: No need to do anything*/ +        if (!ret) { +                switch (priv->to_be_healed_states) { +                        case sync_not_started: +                                ret = sync_base_indices_from_xattrop (this); +                                break; +                        case sync_started: +                        case synced_state: +                                /*No need to do anything*/ +                                break; +                } +        }          return ret;  } @@ -738,41 +966,6 @@ out:          return 0;  } -uint64_t -index_entry_count (xlator_t *this, char *subdir) -{ -	index_priv_t *priv = NULL; -	char index_dir[PATH_MAX]; -	DIR *dirp = NULL; -	uint64_t count = 0; -	struct dirent buf; -	struct dirent *entry = NULL; - -	priv = this->private; - -	make_index_dir_path (priv->index_basepath, subdir, -			     index_dir, sizeof (index_dir)); - -	dirp = opendir (index_dir); -	if (!dirp) -		return 0; - -	while (readdir_r (dirp, &buf, &entry) == 0) { -		if (!entry) -			break; -		if (!strcmp (entry->d_name, ".") || -		    !strcmp (entry->d_name, "..")) -			continue; -                if (!strncmp (entry->d_name, subdir, strlen (subdir))) -			continue; -		count++; -	} -	closedir (dirp); - -	return count; -} - -  int32_t  index_getxattr_wrapper (call_frame_t *frame, xlator_t *this,                          loc_t *loc, const char *name, dict_t *xdata) @@ -780,7 +973,6 @@ index_getxattr_wrapper (call_frame_t *frame, xlator_t *this,          index_priv_t    
*priv = NULL;          dict_t          *xattr = NULL;          int             ret = 0; -	uint64_t        count = 0;          priv = this->private; @@ -790,26 +982,24 @@ index_getxattr_wrapper (call_frame_t *frame, xlator_t *this,                  goto done;          } -	if (strcmp (name, GF_XATTROP_INDEX_GFID) == 0) { -		ret = dict_set_static_bin (xattr, (char*)name, priv->xattrop_vgfid, -					   sizeof (priv->xattrop_vgfid)); -		if (ret) { -			ret = -ENOMEM; -			gf_log (this->name, GF_LOG_ERROR, "xattrop index " -				"gfid set failed"); -			goto done; -		} -	} else if (strcmp (name, GF_XATTROP_INDEX_COUNT) == 0) { -		count = index_entry_count (this, XATTROP_SUBDIR); - -		ret = dict_set_uint64 (xattr, (char *)name, count); -		if (ret) { -			ret = -ENOMEM; -			gf_log (this->name, GF_LOG_ERROR, "xattrop index " -				"count set failed"); -			goto done; -		} -	} +        if (!strcmp (name, GF_XATTROP_INDEX_GFID)) { + +                ret = dict_set_static_bin (xattr, (char*)name, +                                           priv->xattrop_vgfid, +                                           sizeof (priv->xattrop_vgfid)); + +        } else if (!strcmp (name, GF_BASE_INDICES_HOLDER_GFID)) { + +                ret = dict_set_static_bin (xattr, (char*)name, +                                           priv->base_indices_holder_vgfid, +                                      sizeof (priv->base_indices_holder_vgfid)); +        } +        if (ret) { +                ret = -ENOMEM; +                gf_log (THIS->name, GF_LOG_ERROR, "xattrop index " +                        "gfid set failed"); +                goto done; +        }  done:          if (ret)                  STACK_UNWIND_STRICT (getxattr, frame, -1, -ret, xattr, xdata); @@ -847,6 +1037,15 @@ index_lookup_wrapper (call_frame_t *frame, xlator_t *this,          } else if (!uuid_compare (loc->pargfid, priv->xattrop_vgfid)) {                  make_file_path (priv->index_basepath, XATTROP_SUBDIR,                              
    loc->name, path, sizeof (path)); +        } else if (!uuid_compare (loc->gfid,priv->base_indices_holder_vgfid)){ +                make_index_dir_path (priv->index_basepath, +                                     BASE_INDICES_HOLDER_SUBDIR, path, +                                    sizeof (path)); +                is_dir = _gf_true; +        } else if (!uuid_compare (loc->pargfid, priv->base_indices_holder_vgfid)) { +                make_file_path (priv->index_basepath, +                                BASE_INDICES_HOLDER_SUBDIR,loc->name, path, +                                sizeof (path));          }          ret = lstat (path, &lstatbuf); @@ -868,10 +1067,14 @@ index_lookup_wrapper (call_frame_t *frame, xlator_t *this,          }          iatt_from_stat (&stbuf, &lstatbuf); -        if (is_dir) +        if (is_dir && !uuid_compare (loc->gfid, priv->xattrop_vgfid)) {                  uuid_copy (stbuf.ia_gfid, priv->xattrop_vgfid); -        else +        } else if (is_dir && +                !uuid_compare (loc->gfid, priv->base_indices_holder_vgfid)) { +                uuid_copy (stbuf.ia_gfid, priv->base_indices_holder_vgfid); +        } else {                  uuid_generate (stbuf.ia_gfid); +        }          stbuf.ia_ino = -1;          op_ret = 0;  done: @@ -883,6 +1086,44 @@ done:  }  int32_t +base_indices_readdir_wrapper (call_frame_t *frame, xlator_t *this, +                              fd_t *fd, size_t size, off_t off, dict_t *xdata) +{ +        index_priv_t    *priv = NULL; +        char            base_indices_holder[PATH_MAX] = {0}; +        DIR             *dir = NULL; +        int32_t         op_ret = -1; +        int32_t         op_errno = 0; +        int             count = 0; +        gf_dirent_t     entries; + +        priv = this->private; + +        make_index_dir_path (priv->index_basepath, BASE_INDICES_HOLDER_SUBDIR, +                             base_indices_holder, sizeof (base_indices_holder)); + +        dir = opendir 
(base_indices_holder); +        if (!dir) { +                op_errno = EINVAL; +                goto done; +        } + + +        INIT_LIST_HEAD (&entries.list); + +        count = index_fill_readdir (fd, dir, off, size, &entries, +                                    BASE_INDICES_HOLDER); +        /* pick ENOENT to indicate EOF */ +        op_errno = errno; +        op_ret = count; +        closedir (dir); +done: +        STACK_UNWIND_STRICT (readdir, frame, op_ret, op_errno, &entries, xdata); +        gf_dirent_free (&entries); +        return 0; +} + +int32_t  index_readdir_wrapper (call_frame_t *frame, xlator_t *this,                         fd_t *fd, size_t size, off_t off, dict_t *xdata)  { @@ -913,7 +1154,8 @@ index_readdir_wrapper (call_frame_t *frame, xlator_t *this,                  goto done;          } -        count = index_fill_readdir (fd, dir, off, size, &entries); +        count = index_fill_readdir (fd, dir, off, size, &entries, +                                    INDEX_XATTROP);          /* pick ENOENT to indicate EOF */          op_errno = errno; @@ -979,12 +1221,11 @@ index_getxattr (call_frame_t *frame, xlator_t *this,                  loc_t *loc, const char *name, dict_t *xdata)  {          call_stub_t     *stub = NULL; -        index_priv_t    *priv = NULL; - -        priv = this->private; -        if (!name || (strcmp (GF_XATTROP_INDEX_GFID, name) && -		      strcmp (GF_XATTROP_INDEX_COUNT, name))) +        if (!name) +                goto out; +        if (strcmp (GF_XATTROP_INDEX_GFID, name) && +            strcmp (GF_BASE_INDICES_HOLDER_GFID, name))                  goto out;          stub = fop_getxattr_stub (frame, index_getxattr_wrapper, loc, name, @@ -1011,7 +1252,9 @@ index_lookup (call_frame_t *frame, xlator_t *this,          priv = this->private;          if (uuid_compare (loc->gfid, priv->xattrop_vgfid) && -            uuid_compare (loc->pargfid, priv->xattrop_vgfid)) +            uuid_compare (loc->pargfid, 
priv->xattrop_vgfid) && +            uuid_compare (loc->gfid, priv->base_indices_holder_vgfid) && +            uuid_compare (loc->pargfid, priv->base_indices_holder_vgfid))                  goto normal;          stub = fop_lookup_stub (frame, index_lookup_wrapper, loc, xattr_req); @@ -1037,10 +1280,19 @@ index_readdir (call_frame_t *frame, xlator_t *this,          index_priv_t    *priv = NULL;          priv = this->private; -        if (uuid_compare (fd->inode->gfid, priv->xattrop_vgfid)) +        if (uuid_compare (fd->inode->gfid, priv->xattrop_vgfid) && +            uuid_compare (fd->inode->gfid, priv->base_indices_holder_vgfid))                  goto out; -        stub = fop_readdir_stub (frame, index_readdir_wrapper, fd, size, off, -                                 xdata); + +        if (!uuid_compare (fd->inode->gfid, priv->xattrop_vgfid)) { +                stub = fop_readdir_stub (frame, index_readdir_wrapper, fd, size,  +                                         off, xdata); +        } else if (!uuid_compare (fd->inode->gfid, +                                  priv->base_indices_holder_vgfid)) { +                stub = fop_readdir_stub (frame, base_indices_readdir_wrapper, +                                         fd, size, off, xdata); +        } +          if (!stub) {                  STACK_UNWIND_STRICT (readdir, frame, -1, ENOMEM, NULL, NULL);                  return 0; @@ -1144,6 +1396,9 @@ init (xlator_t *this)          GF_OPTION_INIT ("index-base", priv->index_basepath, path, out);          uuid_generate (priv->index);          uuid_generate (priv->xattrop_vgfid); +        /*base_indices_holder is a directory which contains hard links to +         * all base indices inside indices/xattrop directory*/ +        uuid_generate (priv->base_indices_holder_vgfid);          INIT_LIST_HEAD (&priv->callstubs);          this->private = priv; @@ -1160,6 +1415,7 @@ init (xlator_t *this)          }          ret = 0; +  out:          if (ret) {                  if 
(cond_inited) diff --git a/xlators/features/index/src/index.h b/xlators/features/index/src/index.h index 661dcdbc4..d6dcb1c23 100644 --- a/xlators/features/index/src/index.h +++ b/xlators/features/index/src/index.h @@ -36,14 +36,28 @@ typedef struct index_fd_ctx {          DIR *dir;  } index_fd_ctx_t; +typedef enum { +        sync_not_started, +        sync_started, +        synced_state, +} to_be_healed_states_t; + +typedef enum { +        INDEX_XATTROP, +        BASE_INDICES_HOLDER, +} readdir_directory; +  typedef struct index_priv {          char *index_basepath;          uuid_t index;          gf_lock_t lock;          uuid_t xattrop_vgfid;//virtual gfid of the xattrop index dir +        uuid_t base_indices_holder_vgfid; //virtual gfid of the +                                          //to_be_healed_xattrop directory          struct list_head callstubs;          pthread_mutex_t mutex;          pthread_cond_t  cond; +        to_be_healed_states_t to_be_healed_states;  } index_priv_t;  #define INDEX_STACK_UNWIND(fop, frame, params ...)      \  | 
