From b475551c66b86863cc93ebdfa6daeeab67bdbd9e Mon Sep 17 00:00:00 2001 From: Mohammed Rafi KC Date: Mon, 6 May 2019 23:35:08 +0530 Subject: shd/glusterd: Serialize shd manager to prevent race condition At the time of a glusterd restart, while doing a handshake there is a possibility that multiple shd manager might get executed. Because of this, there is a chance that multiple shd get spawned during a glusterd restart Change-Id: Ie20798441e07d7d7a93b7d38dfb924cea178a920 fixes: bz#1707081 Signed-off-by: Mohammed Rafi KC --- .../serialize-shd-manager-glusterd-restart.t | 54 ++++++++++++++++++++++ xlators/mgmt/glusterd/src/glusterd-shd-svc.c | 14 ++++++ xlators/mgmt/glusterd/src/glusterd.c | 1 + xlators/mgmt/glusterd/src/glusterd.h | 3 ++ 4 files changed, 72 insertions(+) create mode 100644 tests/bugs/glusterd/serialize-shd-manager-glusterd-restart.t diff --git a/tests/bugs/glusterd/serialize-shd-manager-glusterd-restart.t b/tests/bugs/glusterd/serialize-shd-manager-glusterd-restart.t new file mode 100644 index 00000000000..3a27c2a9d1b --- /dev/null +++ b/tests/bugs/glusterd/serialize-shd-manager-glusterd-restart.t @@ -0,0 +1,54 @@ +#! /bin/bash + +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../cluster.rc + +function check_peers { +count=`$CLI_1 peer status | grep 'Peer in Cluster (Connected)' | wc -l` +echo $count +} + +function check_shd { +ps aux | grep $1 | grep glustershd | wc -l +} + +cleanup + + +TEST launch_cluster 6 + +TESTS_EXPECTED_IN_LOOP=25 +for i in $(seq 2 6); do + hostname="H$i" + TEST $CLI_1 peer probe ${!hostname} +done + + +EXPECT_WITHIN $PROBE_TIMEOUT 5 check_peers; +for i in $(seq 1 5); do + + TEST $CLI_1 volume create ${V0}_$i replica 3 $H1:$B1/${V0}_$i $H2:$B2/${V0}_$i $H3:$B3/${V0}_$i $H4:$B4/${V0}_$i $H5:$B5/${V0}_$i $H6:$B6/${V0}_$i + TEST $CLI_1 volume start ${V0}_$i force + +done + +#kill a node +TEST kill_node 3 + +TEST $glusterd_3; +EXPECT_WITHIN $PROBE_TIMEOUT 5 check_peers + +EXPECT_WITHIN $PROCESS_UP_TIMEOUT 1 check_shd $H3 + +for i in $(seq 1 5); do + + TEST $CLI_1 volume stop ${V0}_$i + TEST $CLI_1 volume delete ${V0}_$i + +done + +for i in $(seq 1 6); do + hostname="H$i" + EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT 0 check_shd ${!hostname} +done +cleanup diff --git a/xlators/mgmt/glusterd/src/glusterd-shd-svc.c b/xlators/mgmt/glusterd/src/glusterd-shd-svc.c index 2edc2738954..598c0823586 100644 --- a/xlators/mgmt/glusterd/src/glusterd-shd-svc.c +++ b/xlators/mgmt/glusterd/src/glusterd-shd-svc.c @@ -254,14 +254,26 @@ glusterd_shdsvc_manager(glusterd_svc_t *svc, void *data, int flags) { int ret = -1; glusterd_volinfo_t *volinfo = NULL; + glusterd_conf_t *conf = NULL; + gf_boolean_t shd_restart = _gf_false; + conf = THIS->private; volinfo = data; + GF_VALIDATE_OR_GOTO("glusterd", conf, out); GF_VALIDATE_OR_GOTO("glusterd", svc, out); GF_VALIDATE_OR_GOTO("glusterd", volinfo, out); if (volinfo) glusterd_volinfo_ref(volinfo); + while (conf->restart_shd) { + synclock_unlock(&conf->big_lock); + sleep(2); + synclock_lock(&conf->big_lock); + } + conf->restart_shd = _gf_true; + shd_restart = _gf_true; + ret = glusterd_shdsvc_create_volfile(volinfo); if (ret) goto out; @@ -310,6 +322,8 @@ glusterd_shdsvc_manager(glusterd_svc_t *svc, void *data, int flags) } } out: + if (shd_restart) + conf->restart_shd = _gf_false; if (volinfo) glusterd_volinfo_unref(volinfo); if (ret) diff --git a/xlators/mgmt/glusterd/src/glusterd.c b/xlators/mgmt/glusterd/src/glusterd.c index c0973cb2bea..6d7dd4a82d5 100644 --- a/xlators/mgmt/glusterd/src/glusterd.c +++ b/xlators/mgmt/glusterd/src/glusterd.c @@ -1819,6 +1819,7 @@ init(xlator_t *this) conf->rpc = rpc; conf->uds_rpc = uds_rpc; conf->gfs_mgmt = &gd_brick_prog; + conf->restart_shd = _gf_false; this->private = conf; /* conf->workdir and conf->rundir are smaller than PATH_MAX; gcc's * snprintf checking will throw an error here if sprintf is used. diff --git a/xlators/mgmt/glusterd/src/glusterd.h b/xlators/mgmt/glusterd/src/glusterd.h index 5b9fed6c80d..ca8e82afb47 100644 --- a/xlators/mgmt/glusterd/src/glusterd.h +++ b/xlators/mgmt/glusterd/src/glusterd.h @@ -219,6 +219,9 @@ typedef struct { gf_atomic_t blockers; uint32_t mgmt_v3_lock_timeout; gf_boolean_t restart_bricks; + gf_boolean_t restart_shd; /* This flag prevents running two shd manager + simultaneously + */ pthread_mutex_t attach_lock; /* Lock can be per process or a common one */ pthread_mutex_t volume_lock; /* We release the big_lock from lot of places which might lead the modification of volinfo -- cgit