diff options
| author | Vijay Avuthu <vavuthu@redhat.com> | 2018-01-28 22:05:20 +0530 | 
|---|---|---|
| committer | Jonathan Holloway <jholloway@redhat.com> | 2018-05-04 14:31:29 +0000 | 
| commit | 603413401ce5ebbf6d8d615095ff564f20d8f9aa (patch) | |
| tree | 22c297956d5dd53e532652cac9e0f446d6e51e89 | |
| parent | 72812c9f72c69fa8083b7f79c49507a2df9817cf (diff) | |
Adding Test Case : test_existing_glustershd_should_take_care_of_self_healing
Description:
Test script which verifies that the existing glustershd takes
care of self-healing
* Create and start the Replicate volume
* Check the glustershd processes - Note the pids
* Bring down one brick (let's say brick1) without affecting the cluster
* Create 5000 files on volume
* Bring up brick1, which was killed in the previous step
* Check the heal info - proactive self-healing should start
* Bring down brick1 again
* Wait for 60 sec and bring brick1 back up
* Check the glustershd processes - pids should be different
* Monitor the heal until it is complete
Change-Id: Ib044ec60214171f136cc4c2f9225b8fe62e6214d
Signed-off-by: Vijay Avuthu <vavuthu@redhat.com>
| -rw-r--r--[-rwxr-xr-x] | tests/functional/afr/heal/test_self_heal_daemon_process.py | 239 | 
1 files changed, 232 insertions, 7 deletions
| diff --git a/tests/functional/afr/heal/test_self_heal_daemon_process.py b/tests/functional/afr/heal/test_self_heal_daemon_process.py index 1a9fa0987..0c340225f 100755..100644 --- a/tests/functional/afr/heal/test_self_heal_daemon_process.py +++ b/tests/functional/afr/heal/test_self_heal_daemon_process.py @@ -29,17 +29,17 @@ from glustolibs.gluster.volume_libs import (  from glustolibs.gluster.rebalance_ops import (rebalance_start,                                                wait_for_rebalance_to_complete,                                                rebalance_status) -from glustolibs.gluster.brick_libs import (get_all_bricks, -                                           bring_bricks_offline, -                                           bring_bricks_online, -                                           are_bricks_online, -                                           select_bricks_to_bring_offline, -                                           are_bricks_offline) +from glustolibs.gluster.brick_libs import ( +    get_all_bricks, bring_bricks_offline, bring_bricks_online, +    are_bricks_online, select_bricks_to_bring_offline, are_bricks_offline, +    select_volume_bricks_to_bring_offline, get_online_bricks_list)  from glustolibs.gluster.brick_ops import replace_brick  from glustolibs.gluster.heal_libs import (get_self_heal_daemon_pid,                                            do_bricks_exist_in_shd_volfile,                                            is_shd_daemonized, -                                          are_all_self_heal_daemons_are_online) +                                          are_all_self_heal_daemons_are_online, +                                          monitor_heal_completion) +from glustolibs.gluster.heal_ops import get_heal_info_summary  from glustolibs.gluster.volume_ops import (volume_stop, volume_start)  from glustolibs.gluster.gluster_init import restart_glusterd  from glustolibs.io.utils import validate_io_procs @@ -811,3 +811,228 @@ class 
ImpactOfReplaceBrickForGlustershdTests(GlusterBaseClass):                                "replacing bricks. Please check log file "                                "for details"))          g.log.info("Successfully parsed %s file", self.glustershd) + + +@runs_on([['replicated', 'distributed-replicated'], +          ['glusterfs', 'nfs', 'cifs']]) +class SelfHealDaemonProcessTestsWithHealing(GlusterBaseClass): +    """ +    SelfHealDaemonProcessTestsWithHealing contains tests which verifies the +    self-heal daemon process with healing. +    """ +    @classmethod +    def setUpClass(cls): +        """ +        setup volume, mount volume and initialize necessary variables +        which is used in tests +        """ + +        # calling GlusterBaseClass setUpClass +        GlusterBaseClass.setUpClass.im_func(cls) + +        # Setup Volume and Mount Volume +        g.log.info("Starting to Setup Volume and Mount Volume") +        ret = cls.setup_volume_and_mount_volume(mounts=cls.mounts) +        if not ret: +            raise ExecutionError("Failed to Setup_Volume and Mount_Volume") +        g.log.info("Successful in Setup Volume and Mount Volume") + +        # Verfiy glustershd process releases its parent process +        g.log.info("Verifying Self Heal Daemon process is daemonized") +        ret = is_shd_daemonized(cls.servers) +        if not ret: +            raise ExecutionError("Self Heal Daemon process was still" +                                 " holding parent process.") +        g.log.info("Self Heal Daemon processes are online") + +        # upload script +        script_abs_path = "/usr/share/glustolibs/io/scripts/file_dir_ops.py" +        cls.script_upload_path = "/usr/share/glustolibs/io/scripts/" \ +                                 "file_dir_ops.py" + +        ret = upload_scripts(cls.clients, script_abs_path) +        if not ret: +            raise ExecutionError("Failed to upload IO scripts to clients") + +        cls.GLUSTERSHD = 
"/var/lib/glusterd/glustershd/glustershd-server.vol" + +    @classmethod +    def tearDownClass(cls): +        """ +        Clean up the volume and umount volume from client +        """ + +        # stopping the volume +        g.log.info("Starting to Unmount Volume and Cleanup Volume") +        ret = cls.unmount_volume_and_cleanup_volume(mounts=cls.mounts) +        if not ret: +            raise ExecutionError("Failed to Unmount Volume and Cleanup Volume") +        g.log.info("Successful in Unmount Volume and Cleanup Volume") + +        # calling GlusterBaseClass tearDownClass +        GlusterBaseClass.tearDownClass.im_func(cls) + +    def test_existing_glustershd_should_take_care_of_self_healing(self): +        """ +        Test Script which verifies that the existing glustershd should take +        care of self healing + +        * Create and start the Replicate volume +        * Check the glustershd processes - Note the pids +        * Bring down the One brick ( lets say brick1)  without affecting +          the cluster +        * Create 5000 files on volume +        * bring the brick1 up which was killed in previous steps +        * check the heal info - proactive self healing should start +        * Bring down brick1 again +        * wait for 60 sec and brought up the brick1 +        * Check the glustershd processes - pids should be different +        * Monitor the heal till its complete + +        """ +        # pylint: disable=too-many-locals,too-many-lines,too-many-statements +        nodes = self.servers + +        # check the self-heal daemon process +        g.log.info("Starting to get self-heal daemon process on " +                   "nodes %s", nodes) +        ret, pids = get_self_heal_daemon_pid(nodes) +        self.assertTrue(ret, ("Either No self heal daemon process found or " +                              "more than One self heal daemon process " +                              "found : %s" % pids)) +        g.log.info("Successful in getting 
Single self heal daemon process" +                   " on all nodes %s", nodes) +        glustershd_pids = pids + +        # select the bricks to bring offline +        g.log.info("Selecting bricks to brought offline for volume %s", +                   self.volname) +        bricks_to_bring_offline = \ +            select_volume_bricks_to_bring_offline(self.mnode, +                                                  self.volname) +        g.log.info("Brick List to bring offline : %s", +                   bricks_to_bring_offline) + +        # Bring down the selected bricks +        g.log.info("Going to bring down the brick process " +                   "for %s", bricks_to_bring_offline) +        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline) +        self.assertTrue(ret, ("Failed to bring down the bricks. Please " +                              "check the log file for more details.")) +        g.log.info("Brought down the brick process " +                   "for %s succesfully", bricks_to_bring_offline) + +        # get the bricks which are running +        g.log.info("getting the brick list which are online") +        online_bricks = get_online_bricks_list(self.mnode, self.volname) +        g.log.info("Online Bricks for volume %s : %s", +                   self.volname, online_bricks) + +        # write 1MB files to the mounts +        g.log.info("Starting IO on all mounts...") +        g.log.info("mounts: %s", self.mounts) +        all_mounts_procs = [] +        for mount_obj in self.mounts: +            cmd = "for i in `seq 1 5000`;do dd if=/dev/urandom " \ +                  "of=%s/file_$i bs=1M count=1;done" % mount_obj.mountpoint +            g.log.info(cmd) +            proc = g.run_async(mount_obj.client_system, cmd, +                               user=mount_obj.user) +            all_mounts_procs.append(proc) + +        # Validate IO +        g.log.info("Validating IO on mounts.....") +        ret = validate_io_procs(all_mounts_procs, 
self.mounts) +        self.assertTrue(ret, "IO failed on some of the clients") +        g.log.info("IO is successful on all mounts") + +        # check the heal info +        g.log.info("Get the pending heal info for the volume %s", +                   self.volname) +        heal_info = get_heal_info_summary(self.mnode, self.volname) +        g.log.info("Successfully got heal info for the volume %s", +                   self.volname) +        g.log.info("Heal Info for volume %s : %s", self.volname, heal_info) + +        # Bring bricks online +        g.log.info("Bring bricks: %s online", bricks_to_bring_offline) +        ret = bring_bricks_online(self.mnode, self.volname, +                                  bricks_to_bring_offline, 'glusterd_restart') +        self.assertTrue(ret, ("Failed to bring bricks: %s online" +                              % bricks_to_bring_offline)) +        g.log.info("Successfully brought all bricks: %s online", +                   bricks_to_bring_offline) + +        # Wait for 90 sec to start self healing +        time.sleep(90) + +        # check the heal info +        g.log.info("Get the pending heal info for the volume %s", +                   self.volname) +        heal_info_after_brick_online = get_heal_info_summary(self.mnode, +                                                             self.volname) +        g.log.info("Successfully got heal info for the volume %s", +                   self.volname) +        g.log.info("Heal Info for volume %s : %s", +                   self.volname, heal_info_after_brick_online) + +        # check heal pending is decreased +        flag = False +        for brick in online_bricks: +            if int(heal_info_after_brick_online[brick]['numberOfEntries'])\ +                    < int(heal_info[brick]['numberOfEntries']): +                flag = True +                break + +        self.assertTrue(flag, ("Pro-active self heal is not started")) +        g.log.info("Pro-active self heal is 
started") + +        # bring down bricks again +        g.log.info("Going to bring down the brick process " +                   "for %s", bricks_to_bring_offline) +        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline) +        self.assertTrue(ret, ("Failed to bring down the bricks. Please " +                              "check the log file for more details.")) +        g.log.info("Brought down the brick process " +                   "for %s succesfully", bricks_to_bring_offline) + +        # wait for 60 sec and brought up the brick agian +        time.sleep(60) +        g.log.info("Bring bricks: %s online", bricks_to_bring_offline) +        ret = bring_bricks_online(self.mnode, self.volname, +                                  bricks_to_bring_offline, 'glusterd_restart') +        self.assertTrue(ret, ("Failed to bring bricks: %s online" +                              % bricks_to_bring_offline)) +        g.log.info("Successfully brought all bricks: %s online", +                   bricks_to_bring_offline) + +        # Verfiy glustershd process releases its parent process +        ret = is_shd_daemonized(nodes) +        self.assertTrue(ret, ("Either No self heal daemon process found or " +                              "more than One self heal daemon process found")) + +        # check the self-heal daemon process +        g.log.info("Starting to get self-heal daemon process on " +                   "nodes %s", nodes) +        ret, pids = get_self_heal_daemon_pid(nodes) +        self.assertTrue(ret, ("Either No self heal daemon process found or " +                              "more than One self heal daemon process " +                              "found : %s" % pids)) +        g.log.info("Successful in getting Single self heal daemon process" +                   " on all nodes %s", nodes) +        glustershd_pids_after_bricks_online = pids + +        # compare the glustershd pids +        self.assertNotEqual(glustershd_pids, +                      
      glustershd_pids_after_bricks_online, +                            ("self heal daemon process are same before and " +                             "after bringing up bricks online")) +        g.log.info("EXPECTED : self heal daemon process are different before " +                   "and after bringing up bricks online") + +        # wait for heal to complete +        g.log.info("Monitoring the heal.....") +        ret = monitor_heal_completion(self.mnode, self.volname) +        self.assertTrue(ret, ("Heal is not completed on volume %s" +                              % self.volname)) +        g.log.info("Heal Completed on volume %s", self.volname) | 
