mgmt/shd: Implement multiplexing in self heal daemon

Problem: Shd daemon is per node, which means they create a graph with all volumes on it. While this is a great for utilizing resources, it is so good in terms of performance and managebility. Because self-heal daemons doesn't have capability to automatically reconfigure their graphs. So each time when any configurations changes happens to the volumes(replicate/disperse), we need to restart shd to bring the changes into the graph. Because of this all on going heal for all other volumes has to be stopped in the middle, and need to restart all over again. Solution: This changes makes shd as a per volume daemon, so that the graph will be generated for each volumes. When we want to start/reconfigure shd for a volume, we first search for an existing shd running on the node, if there is none, we will start a new process. If already a daemon is running for shd, then we will simply detach a graph for a volume and reatach the updated graph for the volume. This won't touch any of the on going operations for any other volumes on the shd daemon. Example of an shd graph when it is per volume graph ----------------------- | debug-iostat | ----------------------- / | \ / | \ --------- --------- ---------- | AFR-1 | | AFR-2 | | AFR-3 | -------- --------- ---------- A running shd daemon with 3 volumes will be like--> graph ----------------------- | debug-iostat | ----------------------- / | \ / | \ ------------ ------------ ------------ | volume-1 | | volume-2 | | volume-3 | ------------ ------------ ------------ Change-Id: Idcb2698be3eeb95beaac47125565c93370afbd99 fixes: bz#1659708 Signed-off-by: Mohammed Rafi KC <rkavunga@redhat.com>
author: Mohammed Rafi KC <rkavunga@redhat.com> 2019-02-25 10:05:32 +0530
committer: Amar Tumballi <amarts@redhat.com> 2019-04-01 03:44:23 +0000
commit: bc3694d7cfc868a2ed6344ea123faf19fce28d13 (patch)
tree: 51764aa4445462081273444d5ff2499b1e5375f7 /tests/basic
parent: 92ae26ae8039847e38c738ef98835a14be9d4296 (diff)
1 files changed, 29 insertions, 20 deletions
diff --git a/tests/basic/glusterd/heald.t b/tests/basic/glusterd/heald.t
index ca112ad0b75..7dae3c3f0fb 100644
--- a/tests/basic/glusterd/heald.t
+++ b/tests/basic/glusterd/heald.t
@@ -7,11 +7,16 @@
 # Covers enable/disable at the moment. Will be enhanced later to include
 # the other commands as well.
 
+function is_pid_running {
+    local pid=$1
+    num=`ps auxww | grep glustershd | grep $pid | grep -v grep | wc -l`
+    echo $num
+}
+
 cleanup;
 TEST glusterd
 TEST pidof glusterd
 
-volfile=$(gluster system:: getwd)"/glustershd/glustershd-server.vol"
 #Commands should fail when volume doesn't exist
 TEST ! $CLI volume heal non-existent-volume enable
 TEST ! $CLI volume heal non-existent-volume disable
@@ -20,51 +25,55 @@ TEST ! $CLI volume heal non-existent-volume disable
 # volumes
 TEST $CLI volume create dist $H0:$B0/dist
 TEST $CLI volume start dist
-TEST "[ -z $(get_shd_process_pid)]"
+TEST "[ -z $(get_shd_process_pid dist)]"
 TEST ! $CLI volume heal dist enable
 TEST ! $CLI volume heal dist disable
 
 # Commands should work on replicate/disperse volume.
 TEST $CLI volume create r2 replica 2 $H0:$B0/r2_0 $H0:$B0/r2_1
-TEST "[ -z $(get_shd_process_pid)]"
+TEST "[ -z $(get_shd_process_pid r2)]"
 TEST $CLI volume start r2
-EXPECT_WITHIN $PROCESS_UP_TIMEOUT "[0-9][0-9]*" get_shd_process_pid
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "[0-9][0-9]*" get_shd_process_pid r2
 TEST $CLI volume heal r2 enable
 EXPECT "enable" volume_option r2 "cluster.self-heal-daemon"
-EXPECT "enable" volgen_volume_option $volfile r2-replicate-0 cluster replicate self-heal-daemon
-EXPECT_WITHIN $PROCESS_UP_TIMEOUT "[0-9][0-9]*" get_shd_process_pid
+volfiler2=$(gluster system:: getwd)"/vols/r2/r2-shd.vol"
+EXPECT "enable" volgen_volume_option $volfiler2 r2-replicate-0 cluster replicate self-heal-daemon
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "[0-9][0-9]*" get_shd_process_pid r2
+pid=$( get_shd_process_pid r2 )
 TEST $CLI volume heal r2 disable
 EXPECT "disable" volume_option r2 "cluster.self-heal-daemon"
-EXPECT "disable" volgen_volume_option $volfile r2-replicate-0 cluster replicate self-heal-daemon
-EXPECT_WITHIN $PROCESS_UP_TIMEOUT "[0-9][0-9]*" get_shd_process_pid
+EXPECT "disable" volgen_volume_option $volfiler2 r2-replicate-0 cluster replicate self-heal-daemon
+EXPECT "1" is_pid_running $pid
 
 # Commands should work on disperse volume.
 TEST $CLI volume create ec2 disperse 3 redundancy 1 $H0:$B0/ec2_0 $H0:$B0/ec2_1 $H0:$B0/ec2_2
 TEST $CLI volume start ec2
-EXPECT_WITHIN $PROCESS_UP_TIMEOUT "[0-9][0-9]*" get_shd_process_pid
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "[0-9][0-9]*" get_shd_process_pid ec2
 TEST $CLI volume heal ec2 enable
 EXPECT "enable" volume_option ec2 "cluster.disperse-self-heal-daemon"
-EXPECT "enable" volgen_volume_option $volfile ec2-disperse-0 cluster disperse self-heal-daemon
-EXPECT_WITHIN $PROCESS_UP_TIMEOUT "[0-9][0-9]*" get_shd_process_pid
+volfileec2=$(gluster system:: getwd)"/vols/ec2/ec2-shd.vol"
+EXPECT "enable" volgen_volume_option $volfileec2 ec2-disperse-0 cluster disperse self-heal-daemon
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "[0-9][0-9]*" get_shd_process_pid ec2
+pid=$(get_shd_process_pid ec2)
 TEST $CLI volume heal ec2 disable
 EXPECT "disable" volume_option ec2 "cluster.disperse-self-heal-daemon"
-EXPECT "disable" volgen_volume_option $volfile ec2-disperse-0 cluster disperse self-heal-daemon
-EXPECT_WITHIN $PROCESS_UP_TIMEOUT "[0-9][0-9]*" get_shd_process_pid
+EXPECT "disable" volgen_volume_option $volfileec2 ec2-disperse-0 cluster disperse self-heal-daemon
+EXPECT "1" is_pid_running $pid
 
 #Check that shd graph is rewritten correctly on volume stop/start
-EXPECT "Y" volgen_volume_exists $volfile ec2-disperse-0 cluster disperse
-EXPECT "Y" volgen_volume_exists $volfile r2-replicate-0 cluster replicate
+EXPECT "Y" volgen_volume_exists $volfileec2 ec2-disperse-0 cluster disperse
+
+EXPECT "Y" volgen_volume_exists $volfiler2 r2-replicate-0 cluster replicate
 TEST $CLI volume stop r2
-EXPECT "Y" volgen_volume_exists $volfile ec2-disperse-0 cluster disperse
-EXPECT "N" volgen_volume_exists $volfile r2-replicate-0 cluster replicate
+EXPECT "Y" volgen_volume_exists $volfileec2 ec2-disperse-0 cluster disperse
 TEST $CLI volume stop ec2
 # When both the volumes are stopped glustershd volfile is not modified just the
 # process is stopped
-TEST "[ -z $(get_shd_process_pid) ]"
+TEST "[ -z $(get_shd_process_pid dist) ]"
+TEST "[ -z $(get_shd_process_pid ec2) ]"
 
 TEST $CLI volume start r2
-EXPECT "N" volgen_volume_exists $volfile ec2-disperse-0 cluster disperse
-EXPECT "Y" volgen_volume_exists $volfile r2-replicate-0 cluster replicate
+EXPECT "Y" volgen_volume_exists $volfiler2 r2-replicate-0 cluster replicate
 
 TEST $CLI volume set r2 self-heal-daemon on
 TEST $CLI volume set r2 cluster.self-heal-daemon off
author	Mohammed Rafi KC <rkavunga@redhat.com>	2019-02-25 10:05:32 +0530
committer	Amar Tumballi <amarts@redhat.com>	2019-04-01 03:44:23 +0000
commit	bc3694d7cfc868a2ed6344ea123faf19fce28d13 (patch)
tree	51764aa4445462081273444d5ff2499b1e5375f7 /tests/basic
parent	92ae26ae8039847e38c738ef98835a14be9d4296 (diff)