Diffstat (limited to 'tests/functional/afr')
-rw-r--r-- [-rwxr-xr-x]  tests/functional/afr/heal/test_self_heal_daemon_process.py | 239
1 file changed, 232 insertions(+), 7 deletions(-)
diff --git a/tests/functional/afr/heal/test_self_heal_daemon_process.py b/tests/functional/afr/heal/test_self_heal_daemon_process.py
index 1a9fa0987..0c340225f 100755..100644
--- a/tests/functional/afr/heal/test_self_heal_daemon_process.py
+++ b/tests/functional/afr/heal/test_self_heal_daemon_process.py
@@ -29,17 +29,17 @@ from glustolibs.gluster.volume_libs import (
from glustolibs.gluster.rebalance_ops import (rebalance_start,
wait_for_rebalance_to_complete,
rebalance_status)
-from glustolibs.gluster.brick_libs import (get_all_bricks,
- bring_bricks_offline,
- bring_bricks_online,
- are_bricks_online,
- select_bricks_to_bring_offline,
- are_bricks_offline)
+from glustolibs.gluster.brick_libs import (
+ get_all_bricks, bring_bricks_offline, bring_bricks_online,
+ are_bricks_online, select_bricks_to_bring_offline, are_bricks_offline,
+ select_volume_bricks_to_bring_offline, get_online_bricks_list)
from glustolibs.gluster.brick_ops import replace_brick
from glustolibs.gluster.heal_libs import (get_self_heal_daemon_pid,
do_bricks_exist_in_shd_volfile,
is_shd_daemonized,
- are_all_self_heal_daemons_are_online)
+ are_all_self_heal_daemons_are_online,
+ monitor_heal_completion)
+from glustolibs.gluster.heal_ops import get_heal_info_summary
from glustolibs.gluster.volume_ops import (volume_stop, volume_start)
from glustolibs.gluster.gluster_init import restart_glusterd
from glustolibs.io.utils import validate_io_procs
@@ -811,3 +811,228 @@ class ImpactOfReplaceBrickForGlustershdTests(GlusterBaseClass):
"replacing bricks. Please check log file "
"for details"))
g.log.info("Successfully parsed %s file", self.glustershd)
+
+
+@runs_on([['replicated', 'distributed-replicated'],
+ ['glusterfs', 'nfs', 'cifs']])
+class SelfHealDaemonProcessTestsWithHealing(GlusterBaseClass):
+ """
+    SelfHealDaemonProcessTestsWithHealing contains tests that verify the
+    self-heal daemon process while healing is in progress.
+ """
+ @classmethod
+ def setUpClass(cls):
+ """
+        Set up the volume, mount it, and initialize the variables
+        used by the tests
+ """
+
+ # calling GlusterBaseClass setUpClass
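+        # (.im_func retrieves the plain function behind the classmethod so
+        # it can be called with this subclass - a Python 2 idiom)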
+ GlusterBaseClass.setUpClass.im_func(cls)
+
+ # Setup Volume and Mount Volume
+ g.log.info("Starting to Setup Volume and Mount Volume")
+ ret = cls.setup_volume_and_mount_volume(mounts=cls.mounts)
+ if not ret:
+ raise ExecutionError("Failed to Setup_Volume and Mount_Volume")
+ g.log.info("Successful in Setup Volume and Mount Volume")
+
+        # Verify glustershd process releases its parent process
+ g.log.info("Verifying Self Heal Daemon process is daemonized")
+ ret = is_shd_daemonized(cls.servers)
+ if not ret:
+ raise ExecutionError("Self Heal Daemon process was still"
+ " holding parent process.")
+ g.log.info("Self Heal Daemon processes are online")
+
+        # upload IO script to clients
+        cls.script_upload_path = ("/usr/share/glustolibs/io/scripts/"
+                                  "file_dir_ops.py")
+        ret = upload_scripts(cls.clients, cls.script_upload_path)
+        if not ret:
+            raise ExecutionError("Failed to upload IO scripts to clients")
+
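+        # glustershd server volfile generated and maintained by glusterd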
+ cls.GLUSTERSHD = "/var/lib/glusterd/glustershd/glustershd-server.vol"
+
+ @classmethod
+ def tearDownClass(cls):
+ """
+ Clean up the volume and umount volume from client
+ """
+
+        # unmount the volume and clean it up
+ g.log.info("Starting to Unmount Volume and Cleanup Volume")
+ ret = cls.unmount_volume_and_cleanup_volume(mounts=cls.mounts)
+ if not ret:
+ raise ExecutionError("Failed to Unmount Volume and Cleanup Volume")
+ g.log.info("Successful in Unmount Volume and Cleanup Volume")
+
+ # calling GlusterBaseClass tearDownClass
+ GlusterBaseClass.tearDownClass.im_func(cls)
+
+ def test_existing_glustershd_should_take_care_of_self_healing(self):
+ """
+        Test script which verifies that the existing glustershd takes
+        care of self-healing
+
+        * Create and start a Replicate volume
+        * Check the glustershd processes - note the pids
+        * Bring down one brick (say brick1) without affecting the cluster
+        * Create 5000 files on the volume
+        * Bring up brick1 which was killed in the previous step
+        * Check the heal info - proactive self-healing should start
+        * Bring down brick1 again
+        * Wait for 60 sec and bring up brick1
+        * Check the glustershd processes - pids should be different
+        * Monitor the heal till it completes
+
+ """
+ # pylint: disable=too-many-locals,too-many-lines,too-many-statements
+ nodes = self.servers
+
+ # check the self-heal daemon process
+ g.log.info("Starting to get self-heal daemon process on "
+ "nodes %s", nodes)
+ ret, pids = get_self_heal_daemon_pid(nodes)
+ self.assertTrue(ret, ("Either No self heal daemon process found or "
+ "more than One self heal daemon process "
+ "found : %s" % pids))
+ g.log.info("Successful in getting Single self heal daemon process"
+ " on all nodes %s", nodes)
+ glustershd_pids = pids
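+        # baseline pids, compared later against the pids collected after
+        # the bricks are brought back online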
+
+ # select the bricks to bring offline
+ g.log.info("Selecting bricks to brought offline for volume %s",
+ self.volname)
+ bricks_to_bring_offline = \
+ select_volume_bricks_to_bring_offline(self.mnode,
+ self.volname)
+ g.log.info("Brick List to bring offline : %s",
+ bricks_to_bring_offline)
+
+ # Bring down the selected bricks
+ g.log.info("Going to bring down the brick process "
+ "for %s", bricks_to_bring_offline)
+ ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
+ self.assertTrue(ret, ("Failed to bring down the bricks. Please "
+ "check the log file for more details."))
+ g.log.info("Brought down the brick process "
+ "for %s succesfully", bricks_to_bring_offline)
+
+ # get the bricks which are running
+ g.log.info("getting the brick list which are online")
+ online_bricks = get_online_bricks_list(self.mnode, self.volname)
+ g.log.info("Online Bricks for volume %s : %s",
+ self.volname, online_bricks)
+
+        # write 5000 files of 1 MB each to the mounts
+ g.log.info("Starting IO on all mounts...")
+ g.log.info("mounts: %s", self.mounts)
+ all_mounts_procs = []
+ for mount_obj in self.mounts:
+ cmd = "for i in `seq 1 5000`;do dd if=/dev/urandom " \
+ "of=%s/file_$i bs=1M count=1;done" % mount_obj.mountpoint
+ g.log.info(cmd)
+ proc = g.run_async(mount_obj.client_system, cmd,
+ user=mount_obj.user)
+ all_mounts_procs.append(proc)
+
+ # Validate IO
+ g.log.info("Validating IO on mounts.....")
+ ret = validate_io_procs(all_mounts_procs, self.mounts)
+ self.assertTrue(ret, "IO failed on some of the clients")
+ g.log.info("IO is successful on all mounts")
+
+ # check the heal info
+ g.log.info("Get the pending heal info for the volume %s",
+ self.volname)
+ heal_info = get_heal_info_summary(self.mnode, self.volname)
+ g.log.info("Successfully got heal info for the volume %s",
+ self.volname)
+ g.log.info("Heal Info for volume %s : %s", self.volname, heal_info)
+
+ # Bring bricks online
+ g.log.info("Bring bricks: %s online", bricks_to_bring_offline)
+ ret = bring_bricks_online(self.mnode, self.volname,
+ bricks_to_bring_offline, 'glusterd_restart')
+ self.assertTrue(ret, ("Failed to bring bricks: %s online"
+ % bricks_to_bring_offline))
+ g.log.info("Successfully brought all bricks: %s online",
+ bricks_to_bring_offline)
+
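+        # NOTE: the 'glusterd_restart' method brings the bricks back by
+        # restarting glusterd, which is also expected to respawn glustershd
+        # with new pids (verified at the end of this test)
+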
+        # wait for 90 sec to give self-healing time to start
+ time.sleep(90)
+
+ # check the heal info
+ g.log.info("Get the pending heal info for the volume %s",
+ self.volname)
+ heal_info_after_brick_online = get_heal_info_summary(self.mnode,
+ self.volname)
+ g.log.info("Successfully got heal info for the volume %s",
+ self.volname)
+ g.log.info("Heal Info for volume %s : %s",
+ self.volname, heal_info_after_brick_online)
+
+        # check that the number of pending heal entries has decreased
+ flag = False
+ for brick in online_bricks:
+ if int(heal_info_after_brick_online[brick]['numberOfEntries'])\
+ < int(heal_info[brick]['numberOfEntries']):
+ flag = True
+ break
+
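+        # a drop in pending entries on any online brick shows glustershd
+        # started healing on its own, without an explicit heal command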
+        self.assertTrue(flag, "Proactive self-heal has not started")
+        g.log.info("Proactive self-heal has started")
+
+ # bring down bricks again
+ g.log.info("Going to bring down the brick process "
+ "for %s", bricks_to_bring_offline)
+ ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
+ self.assertTrue(ret, ("Failed to bring down the bricks. Please "
+ "check the log file for more details."))
+ g.log.info("Brought down the brick process "
+ "for %s succesfully", bricks_to_bring_offline)
+
+        # wait for 60 sec and bring up the brick again
+ time.sleep(60)
+ g.log.info("Bring bricks: %s online", bricks_to_bring_offline)
+ ret = bring_bricks_online(self.mnode, self.volname,
+ bricks_to_bring_offline, 'glusterd_restart')
+ self.assertTrue(ret, ("Failed to bring bricks: %s online"
+ % bricks_to_bring_offline))
+ g.log.info("Successfully brought all bricks: %s online",
+ bricks_to_bring_offline)
+
+        # Verify glustershd process releases its parent process
+ ret = is_shd_daemonized(nodes)
+ self.assertTrue(ret, ("Either No self heal daemon process found or "
+ "more than One self heal daemon process found"))
+
+ # check the self-heal daemon process
+ g.log.info("Starting to get self-heal daemon process on "
+ "nodes %s", nodes)
+ ret, pids = get_self_heal_daemon_pid(nodes)
+ self.assertTrue(ret, ("Either No self heal daemon process found or "
+ "more than One self heal daemon process "
+ "found : %s" % pids))
+ g.log.info("Successful in getting Single self heal daemon process"
+ " on all nodes %s", nodes)
+ glustershd_pids_after_bricks_online = pids
+
+ # compare the glustershd pids
+        self.assertNotEqual(glustershd_pids,
+                            glustershd_pids_after_bricks_online,
+                            ("self-heal daemon pids are the same before "
+                             "and after bringing the bricks online"))
+        g.log.info("EXPECTED : self-heal daemon pids are different before "
+                   "and after bringing the bricks online")
+
+ # wait for heal to complete
+ g.log.info("Monitoring the heal.....")
+ ret = monitor_heal_completion(self.mnode, self.volname)
+ self.assertTrue(ret, ("Heal is not completed on volume %s"
+ % self.volname))
+ g.log.info("Heal Completed on volume %s", self.volname)