From 603413401ce5ebbf6d8d615095ff564f20d8f9aa Mon Sep 17 00:00:00 2001
From: Vijay Avuthu
Date: Sun, 28 Jan 2018 22:05:20 +0530
Subject: Adding Test Case : test_existing_glustershd_should_take_care_of_self_healing

Description: Test script which verifies that the existing glustershd
takes care of self healing

* Create and start the Replicate volume
* Check the glustershd processes - note the pids
* Bring down one brick (say brick1) without affecting the cluster
* Create 5000 files on the volume
* Bring up brick1, which was killed in the previous step
* Check the heal info - proactive self healing should start
* Bring down brick1 again
* Wait for 60 sec and bring brick1 back up
* Check the glustershd processes - pids should be different
* Monitor the heal till it completes

Change-Id: Ib044ec60214171f136cc4c2f9225b8fe62e6214d
Signed-off-by: Vijay Avuthu
---
 .../afr/heal/test_self_heal_daemon_process.py | 239 ++++++++++++++++++++-
 1 file changed, 232 insertions(+), 7 deletions(-)
 mode change 100755 => 100644 tests/functional/afr/heal/test_self_heal_daemon_process.py

diff --git a/tests/functional/afr/heal/test_self_heal_daemon_process.py b/tests/functional/afr/heal/test_self_heal_daemon_process.py
old mode 100755
new mode 100644
index 1a9fa0987..0c340225f
--- a/tests/functional/afr/heal/test_self_heal_daemon_process.py
+++ b/tests/functional/afr/heal/test_self_heal_daemon_process.py
@@ -29,17 +29,17 @@ from glustolibs.gluster.volume_libs import (
 from glustolibs.gluster.rebalance_ops import (rebalance_start,
                                               wait_for_rebalance_to_complete,
                                               rebalance_status)
-from glustolibs.gluster.brick_libs import (get_all_bricks,
-                                           bring_bricks_offline,
-                                           bring_bricks_online,
-                                           are_bricks_online,
-                                           select_bricks_to_bring_offline,
-                                           are_bricks_offline)
+from glustolibs.gluster.brick_libs import (
+    get_all_bricks, bring_bricks_offline, bring_bricks_online,
+    are_bricks_online, select_bricks_to_bring_offline, are_bricks_offline,
+    select_volume_bricks_to_bring_offline, get_online_bricks_list)
 from glustolibs.gluster.brick_ops import replace_brick
 from glustolibs.gluster.heal_libs import (get_self_heal_daemon_pid,
                                           do_bricks_exist_in_shd_volfile,
                                           is_shd_daemonized,
-                                          are_all_self_heal_daemons_are_online)
+                                          are_all_self_heal_daemons_are_online,
+                                          monitor_heal_completion)
+from glustolibs.gluster.heal_ops import get_heal_info_summary
 from glustolibs.gluster.volume_ops import (volume_stop, volume_start)
 from glustolibs.gluster.gluster_init import restart_glusterd
 from glustolibs.io.utils import validate_io_procs
@@ -811,3 +811,228 @@ class ImpactOfReplaceBrickForGlustershdTests(GlusterBaseClass):
                               "replacing bricks. Please check log file "
                               "for details"))
         g.log.info("Successfully parsed %s file", self.glustershd)
+
+
+@runs_on([['replicated', 'distributed-replicated'],
+          ['glusterfs', 'nfs', 'cifs']])
+class SelfHealDaemonProcessTestsWithHealing(GlusterBaseClass):
+    """
+    SelfHealDaemonProcessTestsWithHealing contains tests which verify the
+    self-heal daemon process with healing.
+    """
+    @classmethod
+    def setUpClass(cls):
+        """
+        setup volume, mount volume and initialize necessary variables
+        which are used in tests
+        """
+
+        # calling GlusterBaseClass setUpClass
+        GlusterBaseClass.setUpClass.im_func(cls)
+
+        # Setup Volume and Mount Volume
+        g.log.info("Starting to Setup Volume and Mount Volume")
+        ret = cls.setup_volume_and_mount_volume(mounts=cls.mounts)
+        if not ret:
+            raise ExecutionError("Failed to Setup_Volume and Mount_Volume")
+        g.log.info("Successful in Setup Volume and Mount Volume")
+
+        # Verify glustershd process releases its parent process
+        g.log.info("Verifying Self Heal Daemon process is daemonized")
+        ret = is_shd_daemonized(cls.servers)
+        if not ret:
+            raise ExecutionError("Self Heal Daemon process was still"
+                                 " holding parent process.")
+        g.log.info("Self Heal Daemon processes are online")
+
+        # upload script
+        script_abs_path = "/usr/share/glustolibs/io/scripts/file_dir_ops.py"
+        cls.script_upload_path = "/usr/share/glustolibs/io/scripts/" \
+                                 "file_dir_ops.py"
+
+        ret = upload_scripts(cls.clients, script_abs_path)
+        if not ret:
+            raise ExecutionError("Failed to upload IO scripts to clients")
+
+        cls.GLUSTERSHD = "/var/lib/glusterd/glustershd/glustershd-server.vol"
+
+    @classmethod
+    def tearDownClass(cls):
+        """
+        Clean up the volume and unmount volume from client
+        """
+
+        # unmount and cleanup the volume
+        g.log.info("Starting to Unmount Volume and Cleanup Volume")
+        ret = cls.unmount_volume_and_cleanup_volume(mounts=cls.mounts)
+        if not ret:
+            raise ExecutionError("Failed to Unmount Volume and Cleanup Volume")
+        g.log.info("Successful in Unmount Volume and Cleanup Volume")
+
+        # calling GlusterBaseClass tearDownClass
+        GlusterBaseClass.tearDownClass.im_func(cls)
+
+    def test_existing_glustershd_should_take_care_of_self_healing(self):
+        """
+        Test script which verifies that the existing glustershd should take
+        care of self healing
+
+        * Create and start the Replicate volume
+        * Check the glustershd processes - note the pids
+        * Bring down one brick (say brick1) without affecting
+          the cluster
+        * Create 5000 files on the volume
+        * Bring up brick1, which was killed in the previous step
+        * Check the heal info - proactive self healing should start
+        * Bring down brick1 again
+        * Wait for 60 sec and bring brick1 back up
+        * Check the glustershd processes - pids should be different
+        * Monitor the heal till it completes
+
+        """
+        # pylint: disable=too-many-locals,too-many-lines,too-many-statements
+        nodes = self.servers
+
+        # check the self-heal daemon process
+        g.log.info("Starting to get self-heal daemon process on "
+                   "nodes %s", nodes)
+        ret, pids = get_self_heal_daemon_pid(nodes)
+        self.assertTrue(ret, ("Either No self heal daemon process found or "
+                              "more than One self heal daemon process "
+                              "found : %s" % pids))
+        g.log.info("Successful in getting Single self heal daemon process"
+                   " on all nodes %s", nodes)
+        glustershd_pids = pids
+
+        # select the bricks to bring offline
+        g.log.info("Selecting bricks to bring offline for volume %s",
+                   self.volname)
+        bricks_to_bring_offline = \
+            select_volume_bricks_to_bring_offline(self.mnode,
+                                                  self.volname)
+        g.log.info("Brick List to bring offline : %s",
+                   bricks_to_bring_offline)
+
+        # Bring down the selected bricks
+        g.log.info("Going to bring down the brick process "
+                   "for %s", bricks_to_bring_offline)
+        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
+        self.assertTrue(ret, ("Failed to bring down the bricks. Please "
+                              "check the log file for more details."))
+        g.log.info("Brought down the brick process "
+                   "for %s successfully", bricks_to_bring_offline)
+
+        # get the bricks which are running
+        g.log.info("Getting the list of bricks which are online")
+        online_bricks = get_online_bricks_list(self.mnode, self.volname)
+        g.log.info("Online Bricks for volume %s : %s",
+                   self.volname, online_bricks)
+
+        # write 1MB files to the mounts
+        g.log.info("Starting IO on all mounts...")
+        g.log.info("mounts: %s", self.mounts)
+        all_mounts_procs = []
+        for mount_obj in self.mounts:
+            cmd = "for i in `seq 1 5000`;do dd if=/dev/urandom " \
+                  "of=%s/file_$i bs=1M count=1;done" % mount_obj.mountpoint
+            g.log.info(cmd)
+            proc = g.run_async(mount_obj.client_system, cmd,
+                               user=mount_obj.user)
+            all_mounts_procs.append(proc)
+
+        # Validate IO
+        g.log.info("Validating IO on mounts.....")
+        ret = validate_io_procs(all_mounts_procs, self.mounts)
+        self.assertTrue(ret, "IO failed on some of the clients")
+        g.log.info("IO is successful on all mounts")
+
+        # check the heal info
+        g.log.info("Get the pending heal info for the volume %s",
+                   self.volname)
+        heal_info = get_heal_info_summary(self.mnode, self.volname)
+        g.log.info("Successfully got heal info for the volume %s",
+                   self.volname)
+        g.log.info("Heal Info for volume %s : %s", self.volname, heal_info)
+
+        # Bring bricks online
+        g.log.info("Bring bricks: %s online", bricks_to_bring_offline)
+        ret = bring_bricks_online(self.mnode, self.volname,
+                                  bricks_to_bring_offline, 'glusterd_restart')
+        self.assertTrue(ret, ("Failed to bring bricks: %s online"
+                              % bricks_to_bring_offline))
+        g.log.info("Successfully brought all bricks: %s online",
+                   bricks_to_bring_offline)
+
+        # Wait for 90 sec to start self healing
+        time.sleep(90)
+
+        # check the heal info
+        g.log.info("Get the pending heal info for the volume %s",
+                   self.volname)
+        heal_info_after_brick_online = get_heal_info_summary(self.mnode,
+                                                             self.volname)
+        g.log.info("Successfully got heal info for the volume %s",
+                   self.volname)
+        g.log.info("Heal Info for volume %s : %s",
+                   self.volname, heal_info_after_brick_online)
+
+        # check that the pending heal count has decreased
+        flag = False
+        for brick in online_bricks:
+            if int(heal_info_after_brick_online[brick]['numberOfEntries'])\
+                    < int(heal_info[brick]['numberOfEntries']):
+                flag = True
+                break
+
+        self.assertTrue(flag, ("Pro-active self heal did not start"))
+        g.log.info("Pro-active self heal has started")
+
+        # bring down bricks again
+        g.log.info("Going to bring down the brick process "
+                   "for %s", bricks_to_bring_offline)
+        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
+        self.assertTrue(ret, ("Failed to bring down the bricks. Please "
+                              "check the log file for more details."))
+        g.log.info("Brought down the brick process "
+                   "for %s successfully", bricks_to_bring_offline)
+
+        # wait for 60 sec and bring up the brick again
+        time.sleep(60)
+        g.log.info("Bring bricks: %s online", bricks_to_bring_offline)
+        ret = bring_bricks_online(self.mnode, self.volname,
+                                  bricks_to_bring_offline, 'glusterd_restart')
+        self.assertTrue(ret, ("Failed to bring bricks: %s online"
+                              % bricks_to_bring_offline))
+        g.log.info("Successfully brought all bricks: %s online",
+                   bricks_to_bring_offline)
+
+        # Verify glustershd process releases its parent process
+        ret = is_shd_daemonized(nodes)
+        self.assertTrue(ret, ("Either No self heal daemon process found or "
+                              "more than One self heal daemon process found"))
+
+        # check the self-heal daemon process
+        g.log.info("Starting to get self-heal daemon process on "
+                   "nodes %s", nodes)
+        ret, pids = get_self_heal_daemon_pid(nodes)
+        self.assertTrue(ret, ("Either No self heal daemon process found or "
+                              "more than One self heal daemon process "
+                              "found : %s" % pids))
+        g.log.info("Successful in getting Single self heal daemon process"
+                   " on all nodes %s", nodes)
+        glustershd_pids_after_bricks_online = pids
+
+        # compare the glustershd pids
+        self.assertNotEqual(glustershd_pids,
+                            glustershd_pids_after_bricks_online,
+                            ("self heal daemon processes are the same before "
+                             "and after bringing up bricks online"))
+        g.log.info("EXPECTED : self heal daemon processes are different "
+                   "before and after bringing up bricks online")
+
+        # wait for heal to complete
+        g.log.info("Monitoring the heal.....")
+        ret = monitor_heal_completion(self.mnode, self.volname)
+        self.assertTrue(ret, ("Heal is not completed on volume %s"
+                              % self.volname))
+        g.log.info("Heal Completed on volume %s", self.volname)
--
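Note: glusto-tests cases such as the one added here are normally driven through the glusto runner's pytest integration rather than invoked directly. A minimal invocation sketch is shown below; the config file name is illustrative and must point at a config describing your own servers, clients and mounts:

    glusto -c 'gluster_tests_config.yml' --pytest='-v -x tests/functional/afr/heal/test_self_heal_daemon_process.py'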