summaryrefslogtreecommitdiffstats
path: root/tests/00-geo-rep
diff options
context:
space:
mode:
authorKotresh HR <khiremat@redhat.com>2019-05-08 11:26:06 +0530
committerSunny Kumar <sunkumar@redhat.com>2019-05-13 11:06:41 +0000
commitf0d3690e8916cfb5e10a0df2e9721a0fd079bfce (patch)
tree1c907b441761cca76f2a10e8e827f6b6fbd353ec /tests/00-geo-rep
parente2adc9dc66dc46519007790ecd7dd57642dff0fd (diff)
geo-rep: Fix sync hang with tarssh
Problem: Geo-rep sync hangs when tarssh is used as sync engine at heavy workload. Analysis and Root cause: It's found out that the tar process was hung. When debugged further, it's found out that stderr buffer of tar process on master was full i.e., 64k. When the buffer was copied to a file from /proc/pid/fd/2, the hang is resolved. This can happen when files picked by tar process to sync doesn't exist on master anymore. If this count increases around 1k, the stderr buffer is filled up. Fix: The tar process is executed using Popen with stderr as PIPE. The final execution is something like below. tar | ssh <args> root@slave tar --overwrite -xf - -C <path> It was waiting on ssh process first using communicate() and then tar. Note that communicate() reads stdout and stderr. So when stderr of tar process is filled up, there is no one to read until untar via ssh is completed. This can't happen and leads to deadlock. Hence we should be waiting on both process parallely, so that stderr is read on both processes. Change-Id: I609c7cc5c07e210c504771115b4d551a2e891adf fixes: bz#1707728 Signed-off-by: Kotresh HR <khiremat@redhat.com>
Diffstat (limited to 'tests/00-geo-rep')
-rw-r--r--tests/00-geo-rep/georep-stderr-hang.t128
1 files changed, 128 insertions, 0 deletions
diff --git a/tests/00-geo-rep/georep-stderr-hang.t b/tests/00-geo-rep/georep-stderr-hang.t
new file mode 100644
index 0000000..496f0e6
--- /dev/null
+++ b/tests/00-geo-rep/georep-stderr-hang.t
@@ -0,0 +1,128 @@
+#!/bin/bash
+
+. $(dirname $0)/../include.rc
+. $(dirname $0)/../volume.rc
+. $(dirname $0)/../geo-rep.rc
+. $(dirname $0)/../env.rc
+
+SCRIPT_TIMEOUT=500
+
+AREQUAL_PATH=$(dirname $0)/../utils
+test "`uname -s`" != "Linux" && {
+ CFLAGS="$CFLAGS -lintl";
+}
+build_tester $AREQUAL_PATH/arequal-checksum.c $CFLAGS
+
+### Basic Tests with Distribute Replicate volumes
+
+##Cleanup and start glusterd
+cleanup;
+TEST glusterd;
+TEST pidof glusterd
+
+
+##Variables
+GEOREP_CLI="$CLI volume geo-replication"
+master=$GMV0
+SH0="127.0.0.1"
+slave=${SH0}::${GSV0}
+num_active=2
+num_passive=2
+master_mnt=$M0
+slave_mnt=$M1
+
+############################################################
+#SETUP VOLUMES AND GEO-REPLICATION
+############################################################
+
+##create_and_start_master_volume
+TEST $CLI volume create $GMV0 $H0:$B0/${GMV0}1;
+TEST $CLI volume start $GMV0
+
+##create_and_start_slave_volume
+TEST $CLI volume create $GSV0 $H0:$B0/${GSV0}1;
+TEST $CLI volume start $GSV0
+TEST $CLI volume set $GSV0 performance.stat-prefetch off
+TEST $CLI volume set $GSV0 performance.quick-read off
+TEST $CLI volume set $GSV0 performance.readdir-ahead off
+TEST $CLI volume set $GSV0 performance.read-ahead off
+
+##Mount master
+TEST glusterfs -s $H0 --volfile-id $GMV0 $M0
+
+##Mount slave
+TEST glusterfs -s $H0 --volfile-id $GSV0 $M1
+
+############################################################
+#BASIC GEO-REPLICATION TESTS
+############################################################
+
+TEST create_georep_session $master $slave
+EXPECT_WITHIN $GEO_REP_TIMEOUT 1 check_status_num_rows "Created"
+
+#Config gluster-command-dir
+TEST $GEOREP_CLI $master $slave config gluster-command-dir ${GLUSTER_CMD_DIR}
+
+#Config gluster-command-dir
+TEST $GEOREP_CLI $master $slave config slave-gluster-command-dir ${GLUSTER_CMD_DIR}
+
+#Set changelog roll-over time to 45 secs
+TEST $CLI volume set $GMV0 changelog.rollover-time 45
+
+#Wait for common secret pem file to be created
+EXPECT_WITHIN $GEO_REP_TIMEOUT 0 check_common_secret_file
+
+#Verify the keys are distributed
+EXPECT_WITHIN $GEO_REP_TIMEOUT 0 check_keys_distributed
+
+#Set sync-jobs to 1
+TEST $GEOREP_CLI $master $slave config sync-jobs 1
+
+#Start_georep
+TEST $GEOREP_CLI $master $slave start
+
+touch $M0
+EXPECT_WITHIN $GEO_REP_TIMEOUT 1 check_status_num_rows "Active"
+EXPECT_WITHIN $GEO_REP_TIMEOUT 1 check_status_num_rows "Changelog Crawl"
+
+#Check History Crawl.
+TEST $GEOREP_CLI $master $slave stop
+TEST create_data_hang "rsync_hang"
+TEST create_data "history_rsync"
+TEST $GEOREP_CLI $master $slave start
+EXPECT_WITHIN $GEO_REP_TIMEOUT 1 check_status_num_rows "Active"
+
+#Verify arequal for whole volume
+EXPECT_WITHIN $GEO_REP_TIMEOUT "x0" arequal_checksum ${master_mnt} ${slave_mnt}
+
+#Stop Geo-rep
+TEST $GEOREP_CLI $master $slave stop
+
+#Config tarssh as sync-engine
+TEST $GEOREP_CLI $master $slave config sync-method tarssh
+
+#Create tarssh hang data
+TEST create_data_hang "tarssh_hang"
+TEST create_data "history_tar"
+
+TEST $GEOREP_CLI $master $slave start
+EXPECT_WITHIN $GEO_REP_TIMEOUT 1 check_status_num_rows "Active"
+
+#Verify arequal for whole volume
+EXPECT_WITHIN $GEO_REP_TIMEOUT "x0" arequal_checksum ${master_mnt} ${slave_mnt}
+
+#Stop Geo-rep
+TEST $GEOREP_CLI $master $slave stop
+
+#Delete Geo-rep
+TEST $GEOREP_CLI $master $slave delete
+
+#Cleanup are-equal binary
+TEST rm $AREQUAL_PATH/arequal-checksum
+
+#Cleanup authorized keys
+sed -i '/^command=.*SSH_ORIGINAL_COMMAND#.*/d' ~/.ssh/authorized_keys
+sed -i '/^command=.*gsyncd.*/d' ~/.ssh/authorized_keys
+
+cleanup;
+#G_TESTDEF_TEST_STATUS_NETBSD7=BAD_TEST,BUG=000000