[Test] Resolve and validate gfid split files

Three Scenarios:
- Simulate gfid split brain files under a directory
- Resolve gfid splits using `source-brick`, `bigger-file` and `latest-mtime` methods
- Validate all the files are healed and data is consistent

Change-Id: I8b143f341c0db2f32086ecb6878cbfe3bdb247ce
Signed-off-by: Leela Venkaiah G <lgangava@redhat.com>
author: Leela Venkaiah G <lgangava@redhat.com> 2020-09-04 19:01:31 +0530
committer: Leela Venkaiah G <lgangava@redhat.com> 2020-09-08 10:46:47 +0530
commit: c79453974f5602830f475fd5f133d5771ce81f23 (patch)
tree: 92db0462212a1ce9b62ce1d787311a4c4537a469 /tests/functional/afr/test_gfid_split_brain_resolution.py
parent: b418f87f90671eed7158d7f7707c1a6c74b7c5ec (diff)
1 file changed, 200 insertions, 228 deletions
diff --git a/tests/functional/afr/test_gfid_split_brain_resolution.py b/tests/functional/afr/test_gfid_split_brain_resolution.py
index 8d8317a01..6e74376fc 100644
--- a/tests/functional/afr/test_gfid_split_brain_resolution.py
+++ b/tests/functional/afr/test_gfid_split_brain_resolution.py
@@ -1,4 +1,4 @@
-#  Copyright (C) 2017-2018  Red Hat, Inc. <http://www.redhat.com>
+#  Copyright (C) 2017-2020 Red Hat, Inc. <http://www.redhat.com>
 #
 #  This program is free software; you can redistribute it and/or modify
 #  it under the terms of the GNU General Public License as published by
@@ -14,260 +14,232 @@
 #  with this program; if not, write to the Free Software Foundation, Inc.,
 #  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 
+from random import choice
+
 from glusto.core import Glusto as g
-from glustolibs.gluster.gluster_base_class import (GlusterBaseClass, runs_on)
-from glustolibs.gluster.exceptions import ExecutionError
-from glustolibs.gluster.volume_libs import get_subvols
+
 from glustolibs.gluster.brick_libs import (bring_bricks_offline,
-                                           bring_bricks_online,
-                                           are_bricks_offline,
-                                           wait_for_bricks_to_be_online,
-                                           get_all_bricks)
-from glustolibs.gluster.volume_ops import set_volume_options
+                                           bring_bricks_online)
+from glustolibs.gluster.exceptions import ExecutionError
+from glustolibs.gluster.gluster_base_class import GlusterBaseClass, runs_on
 from glustolibs.gluster.glusterdir import mkdir
-from glustolibs.gluster.heal_ops import (enable_self_heal_daemon,
-                                         trigger_heal)
 from glustolibs.gluster.heal_libs import (
-    is_volume_in_split_brain,
-    is_heal_complete,
-    wait_for_self_heal_daemons_to_be_online,
-    monitor_heal_completion)
-from glustolibs.gluster.glusterfile import GlusterFile
+    is_volume_in_split_brain, monitor_heal_completion,
+    wait_for_self_heal_daemons_to_be_online)
+from glustolibs.gluster.heal_ops import (enable_self_heal_daemon, trigger_heal,
+                                         trigger_heal_full)
+from glustolibs.gluster.lib_utils import collect_bricks_arequal, list_files
+from glustolibs.gluster.volume_libs import get_subvols
+from glustolibs.gluster.volume_ops import set_volume_options
 
 
-@runs_on([['replicated', 'distributed-replicated'],
-          ['glusterfs']])
+# pylint: disable=stop-iteration-return, too-many-locals, too-many-statements
+@runs_on([[
+    'replicated', 'distributed-replicated', 'arbiter', 'distributed-arbiter'
+], ['glusterfs']])
 class TestSelfHeal(GlusterBaseClass):
-    """
-    Description:
-        Test cases related to
-        healing in default configuration of the volume
-    """
-
-    @classmethod
-    def setUpClass(cls):
-        # Calling GlusterBaseClass setUpClass
-        cls.get_super_method(cls, 'setUpClass')()
-
-        # Override replica count to be 3
-        if cls.volume_type == "replicated":
-            cls.volume['voltype'] = {
-                'type': 'replicated',
-                'replica_count': 3,
-                'transport': 'tcp'}
-
-        if cls.volume_type == "distributed-replicated":
-            cls.volume['voltype'] = {
-                'type': 'distributed-replicated',
-                'dist_count': 2,
-                'replica_count': 3,
-                'transport': 'tcp'}
-
     def setUp(self):
-        # Calling GlusterBaseClass setUp
         self.get_super_method(self, 'setUp')()
 
-        # Setup Volume and Mount Volume
-        g.log.info("Starting to Setup Volume and Mount Volume")
-        ret = self.setup_volume_and_mount_volume(mounts=self.mounts,
-                                                 volume_create_force=False)
-        if not ret:
-            raise ExecutionError("Failed to Setup_Volume and Mount_Volume")
-        g.log.info("Successful in Setup Volume and Mount Volume")
+        # A single mount is enough for the test
+        self.mounts = self.mounts[0::-1]
 
-        self.bricks_list = get_all_bricks(self.mnode, self.volname)
+        if not self.setup_volume_and_mount_volume(mounts=self.mounts):
+            raise ExecutionError('Failed to setup and mount '
+                                 '{}'.format(self.volname))
 
     def tearDown(self):
-        """
-        If test method failed before validating IO, tearDown waits for the
-        IO's to complete and checks for the IO exit status
-
-        Cleanup and umount volume
-        """
-        # Cleanup and umount volume
-        g.log.info("Starting to Unmount Volume and Cleanup Volume")
-        ret = self.unmount_volume_and_cleanup_volume(mounts=self.mounts)
-        if not ret:
-            raise ExecutionError("Failed to umount the vol & cleanup Volume")
-        g.log.info("Successful in umounting the volume and Cleanup")
-
-        # Calling GlusterBaseClass teardown
+        if not self.unmount_volume_and_cleanup_volume(mounts=self.mounts):
+            raise ExecutionError('Not able to unmount and cleanup '
+                                 '{}'.format(self.volname))
         self.get_super_method(self, 'tearDown')()
 
-    def toggle_bricks_and_perform_io(self, file_list, brick_list):
+    @staticmethod
+    def _get_two_bricks(subvols, arbiter):
+        """
+        Yields two bricks per subvol; handles pure/dist replicate and arbiter
+        """
+        # Get an iterator for py2/3 compatibility
+        brick_iter = iter(zip(*subvols))
+        prev_brick = next(brick_iter)
+        first_brick = prev_brick
+
+        for index, curr_brick in enumerate(brick_iter, 1):
+            # `yield` should contain arbiter brick for arbiter type vols
+            if not (index == 1 and arbiter):
+                yield prev_brick + curr_brick
+            prev_brick = curr_brick
+        # At the end yield first and last brick from a subvol
+        yield prev_brick + first_brick
+
+    def _get_files_in_brick(self, brick_path, dir_path):
         """
-        Kills bricks, does I/O and brings the brick back up.
+        Returns files in format of `dir_path/file_name` from the given brick
+        path
         """
-        # Bring down bricks.
-        g.log.info("Going to bring down the brick process for %s", brick_list)
-        ret = bring_bricks_offline(self.volname, brick_list)
-        self.assertTrue(ret, ("Failed to bring down the bricks. Please "
-                              "check the log file for more details."))
-        g.log.info("Brought down the brick process "
-                   "for %s successfully", brick_list)
-        ret = are_bricks_offline(self.mnode, self.volname, brick_list)
-        self.assertTrue(ret, 'Bricks %s are not offline' % brick_list)
-
-        # Perform I/O
-        for filename in file_list:
-            fpath = self.mounts[0].mountpoint + "/test_gfid_split_brain/" + \
-                    filename
-            cmd = ("dd if=/dev/urandom of=%s bs=1024 count=1" % fpath)
-            ret, _, _ = g.run(self.clients[0], cmd)
-            self.assertEqual(ret, 0, "Creating %s failed" % fpath)
-
-        # Bring up bricks
-        ret = bring_bricks_online(self.mnode, self.volname, brick_list)
-        self.assertTrue(ret, 'Failed to bring brick %s online' % brick_list)
-        g.log.info('Bringing brick %s online is successful', brick_list)
-
-        # Waiting for bricks to come online
-        g.log.info("Waiting for brick process to come online")
-        timeout = 30
-        ret = wait_for_bricks_to_be_online(self.mnode, self.volname, timeout)
-        self.assertTrue(ret, "bricks didn't come online after adding bricks")
-        g.log.info("Bricks are online")
-
-    def resolve_gfid_split_brain(self, filename, source_brick):
+        node, path = brick_path.split(':')
+        files = list_files(node, path, dir_path)
+        self.assertIsNotNone(
+            files, 'Unable to get list of files from {}'.format(brick_path))
+
+        files = [file_name.rsplit('/', 1)[-1] for file_name in files]
+        return [
+            each_file for each_file in files
+            if each_file in ('file1', 'file2', 'file3')
+        ]
+
+    def _run_cmd_and_assert(self, cmd):
         """
-        resolves gfid split-brain on files using source-brick option
+        Run `cmd` on `mnode` and assert for success
         """
-        node, _ = source_brick.split(':')
-        command = ("gluster volume heal " + self.volname + " split-brain "
-                   "source-brick " + source_brick + " " + filename)
-        ret, _, _ = g.run(node, command)
-        self.assertEqual(ret, 0, "command execution not successful")
+        ret, _, err = g.run(self.mnode, cmd)
+        self.assertEqual(ret, 0, '`{}` failed with {}'.format(cmd, err))
 
     def test_gfid_split_brain_resolution(self):
         """
-        - create gfid split-brain of files and resolves them using source-brick
-          option of the CLI.
+        Description: Simulate gfid split brain on multiple files in a dir and
+        resolve it via `bigger-file`, `latest-mtime` and `source-brick` methods
+
+        Steps:
+        - Create and mount a replicated volume, create a dir and ~10 data files
+        - Simulate gfid splits in 9 of the files
+        - Resolve each set of 3 files using `bigger-file`, `latest-mtime`
+          and `source-brick` split-brain resolution methods
+        - Trigger and monitor for heal completion
+        - Validate all the files are healed and arequal matches for bricks in
+          subvols
         """
-
-        # pylint: disable=too-many-statements
-        # pylint: disable=too-many-locals
-
-        # Disable all self-heals and client-quorum
-        options = {"self-heal-daemon": "off",
-                   "data-self-heal": "off",
-                   "metadata-self-heal": "off",
-                   "entry-self-heal": "off",
-                   "cluster.quorum-type": "none"}
-        g.log.info("setting volume options %s", options)
-        ret = set_volume_options(self.mnode, self.volname, options)
-        self.assertTrue(ret, ("Unable to set volume option %s for "
-                              "volume %s" % (options, self.volname)))
-        g.log.info("Successfully set %s for volume %s", options, self.volname)
-
-        # Create dir inside which I/O will be performed.
-        ret = mkdir(self.mounts[0].client_system, "%s/test_gfid_split_brain"
-                    % self.mounts[0].mountpoint)
-        self.assertTrue(ret, "mkdir failed")
-
-        # get the subvolumes
-        g.log.info("Starting to get sub-volumes for volume %s", self.volname)
-        subvols_dict = get_subvols(self.mnode, self.volname)
-        num_subvols = len(subvols_dict['volume_subvols'])
-        g.log.info("Number of subvolumes in volume %s:", num_subvols)
-
-        # Toggle bricks and perform I/O
-        file_list = ["file1.txt", "file2.txt", "file3.txt", "file4.txt",
-                     "file5.txt", "file6.txt", "file7.txt", "file8.txt",
-                     "file9.txt", "file10.txt"]
-        brick_index = 0
-        offline_bricks = []
-        for _ in range(0, 3):
-            for i in range(0, num_subvols):
-                subvol_brick_list = subvols_dict['volume_subvols'][i]
-                offline_bricks.append(subvol_brick_list[brick_index % 3])
-                offline_bricks.append(subvol_brick_list[(brick_index+1) % 3])
-            self.toggle_bricks_and_perform_io(file_list, offline_bricks)
-            brick_index += 1
-            offline_bricks[:] = []
-
-        # Enable shd
-        g.log.info("enabling the self heal daemon")
+        io_cmd = 'cat /dev/urandom | tr -dc [:space:][:print:] | head -c '
+        client, m_point = (self.mounts[0].client_system,
+                           self.mounts[0].mountpoint)
+        arbiter = self.volume_type.find('arbiter') >= 0
+
+        # Disable self-heal daemon and set `quorum-type` option to `none`
+        ret = set_volume_options(self.mnode, self.volname, {
+            'self-heal-daemon': 'off',
+            'cluster.quorum-type': 'none'
+        })
+        self.assertTrue(
+            ret, 'Not able to disable `quorum-type` and '
+            '`self-heal` daemon volume options')
+
+        # Create required dir and files from the mount
+        split_dir = 'gfid_split_dir'
+        file_io = ('cd %s; for i in {1..10}; do ' + io_cmd +
+                   ' 1M > %s/file$i; done;')
+        ret = mkdir(client, '{}/{}'.format(m_point, split_dir))
+        self.assertTrue(ret, 'Unable to create a directory from mount point')
+        ret, _, _ = g.run(client, file_io % (m_point, split_dir))
+
+        # `file{4,5,6}` are re-created every time to be used in `bigger-file`
+        # resolution method
+        cmd = 'rm -rf {0}/file{1} && {2} {3}M > {0}/file{1}'
+        split_cmds = {
+            1:
+            ';'.join(cmd.format(split_dir, i, io_cmd, 2) for i in range(1, 7)),
+            2:
+            ';'.join(cmd.format(split_dir, i, io_cmd, 3) for i in range(4, 7)),
+            3: ';'.join(
+                cmd.format(split_dir, i, io_cmd, 1) for i in range(4, 10)),
+            4: ';'.join(
+                cmd.format(split_dir, i, io_cmd, 1) for i in range(7, 10)),
+        }
+
+        # Get subvols and simulate entry split brain
+        subvols = get_subvols(self.mnode, self.volname)['volume_subvols']
+        self.assertTrue(subvols, 'Not able to get list of subvols')
+        msg = ('Unable to bring files under {} dir to entry split brain while '
+               '{} are down')
+        for index, bricks in enumerate(self._get_two_bricks(subvols, arbiter),
+                                       1):
+            # Bring down two bricks from each subvol
+            ret = bring_bricks_offline(self.volname, list(bricks))
+            self.assertTrue(ret, 'Unable to bring {} offline'.format(bricks))
+
+            ret, _, _ = g.run(client,
+                              'cd {}; {}'.format(m_point, split_cmds[index]))
+            self.assertEqual(ret, 0, msg.format(split_dir, bricks))
+
+            # Arbiter volumes bring bricks down only twice, so the remaining
+            # files are brought into split brain here for `latest-mtime` heal
+            if arbiter and index == 2:
+                ret, _, _ = g.run(client,
+                                  'cd {}; {}'.format(m_point, split_cmds[4]))
+                self.assertEqual(ret, 0, msg.format(split_dir, bricks))
+
+            # Bring offline bricks online
+            ret = bring_bricks_online(
+                self.mnode,
+                self.volname,
+                bricks,
+                bring_bricks_online_methods='volume_start_force')
+            self.assertTrue(ret, 'Unable to bring {} online'.format(bricks))
+
+        # Enable self-heal daemon, trigger heal and assert volume is in split
+        # brain condition
         ret = enable_self_heal_daemon(self.mnode, self.volname)
-        self.assertTrue(ret, "failed to enable self heal daemon")
-        g.log.info("Successfully enabled the self heal daemon")
-
-        # Wait for self heal processes to come online
-        g.log.info("Wait for selfheal process to come online")
-        timeout = 300
-        ret = wait_for_self_heal_daemons_to_be_online(self.mnode, self.volname,
-                                                      timeout)
-        self.assertTrue(ret, "Self-heal process are not online")
-        g.log.info("All self heal process are online")
-
-        # Trigger heal
+        self.assertTrue(ret, 'Failed to enable self heal daemon')
+
+        ret = wait_for_self_heal_daemons_to_be_online(self.mnode, self.volname)
+        self.assertTrue(ret, 'Not all self heal daemons are online')
+
         ret = trigger_heal(self.mnode, self.volname)
-        self.assertTrue(ret, 'Starting heal failed')
-        g.log.info('Index heal launched')
+        self.assertTrue(ret, 'Unable to trigger index heal on the volume')
 
-        # checking if file is in split-brain
         ret = is_volume_in_split_brain(self.mnode, self.volname)
-        self.assertTrue(ret, "Files are not in split-brain as expected.")
-        g.log.info("Files are still in split-brain")
-
-        # First brick of each replica will be used as source-brick
-        first_brick_list = []
-        for i in range(0, num_subvols):
-            subvol_brick_list = subvols_dict['volume_subvols'][i]
-            brick = subvol_brick_list[0]
-            first_brick_list.append(brick)
-
-        # Find which dht subvols the 10 files are present in and trigger heal
-        for filename in file_list:
-            fpath = self.mounts[0].mountpoint + "/test_gfid_split_brain/" + \
-                    filename
-            gfile = GlusterFile(self.clients[0], fpath)
-            for brick in first_brick_list:
-                _, brick_path = brick.split(':')
-                match = [brick for item in gfile.hashed_bricks if brick_path
-                         in item]
-                if match:
-                    self.resolve_gfid_split_brain("/test_gfid_split_brain/" +
-                                                  filename, brick)
-
-        # Trigger heal to complete pending data/metadata heals
-        ret = trigger_heal(self.mnode, self.volname)
-        self.assertTrue(ret, 'Starting heal failed')
-        g.log.info('Index heal launched')
+        self.assertTrue(ret, 'Volume should be in split brain condition')
+
+        # Select source brick and take note of files in source brick
+        stop = len(subvols[0]) - 1 if arbiter else len(subvols[0])
+        source_bricks = [choice(subvol[0:stop]) for subvol in subvols]
+        files = [
+            self._get_files_in_brick(path, split_dir) for path in source_bricks
+        ]
+
+        # Resolve `file1, file2, file3` gfid split files using `source-brick`
+        cmd = ('gluster volume heal ' + self.volname + ' split-brain '
+               'source-brick {} /' + split_dir + '/{}')
+        for index, source_brick in enumerate(source_bricks):
+            for each_file in files[index]:
+                run_cmd = cmd.format(source_brick, each_file)
+                self._run_cmd_and_assert(run_cmd)
+
+        # Resolve `file4, file5, file6` gfid split files using `bigger-file`
+        cmd = ('gluster volume heal ' + self.volname +
+               ' split-brain bigger-file /' + split_dir + '/{}')
+        for each_file in ('file4', 'file5', 'file6'):
+            run_cmd = cmd.format(each_file)
+            self._run_cmd_and_assert(run_cmd)
+
+        # Resolve `file7, file8, file9` gfid split files using `latest-mtime`
+        cmd = ('gluster volume heal ' + self.volname +
+               ' split-brain latest-mtime /' + split_dir + '/{}')
+        for each_file in ('file7', 'file8', 'file9'):
+            run_cmd = cmd.format(each_file)
+            self._run_cmd_and_assert(run_cmd)
+
+        # Unless `shd` is triggered manually/automatically files will still
+        # appear in `heal info`
+        ret = trigger_heal_full(self.mnode, self.volname)
+        self.assertTrue(ret, 'Unable to trigger full self heal')
 
         # Monitor heal completion
         ret = monitor_heal_completion(self.mnode, self.volname)
-        self.assertTrue(ret, 'Heal has not yet completed')
-
-        # Check if heal is completed
-        ret = is_heal_complete(self.mnode, self.volname)
-        self.assertTrue(ret, 'Heal is not complete')
-        g.log.info('Heal is completed successfully')
-
-        # Get arequals and compare
-        for i in range(0, num_subvols):
-            # Get arequal for first brick
-            subvol_brick_list = subvols_dict['volume_subvols'][i]
-            node, brick_path = subvol_brick_list[0].split(':')
-            command = ('arequal-checksum -p %s '
-                       '-i .glusterfs -i .landfill -i .trashcan'
-                       % brick_path)
-            ret, arequal, _ = g.run(node, command)
-            first_brick_total = arequal.splitlines()[-1].split(':')[-1]
-
-            # Get arequal for every brick and compare with first brick
-            for brick in subvol_brick_list[1:]:
-                node, brick_path = brick.split(':')
-                command = ('arequal-checksum -p %s '
-                           '-i .glusterfs -i .landfill -i .trashcan'
-                           % brick_path)
-                ret, brick_arequal, _ = g.run(node, command)
-                self.assertFalse(ret,
-                                 'Failed to get arequal on brick %s'
-                                 % brick)
-                g.log.info('Getting arequal for %s is successful', brick)
-                brick_total = brick_arequal.splitlines()[-1].split(':')[-1]
-
-                self.assertEqual(first_brick_total, brick_total,
-                                 'Arequals for subvol and %s are not equal'
-                                 % brick)
-                g.log.info('Arequals for subvol and %s are equal', brick)
+        self.assertTrue(
+            ret, 'All files in volume should be healed after healing files via'
+            ' `source-brick`, `bigger-file`, `latest-mtime` methods manually')
+
+        # Validate normal file `file10` and healed files don't differ in
+        # subvols via an `arequal`
+        for subvol in subvols:
+            # Disregard last brick if volume is of arbiter type
+            ret, arequal = collect_bricks_arequal(subvol[0:stop])
+            self.assertTrue(
+                ret, 'Unable to get `arequal` checksum on '
+                '{}'.format(subvol[0:stop]))
+            self.assertEqual(
+                len(set(arequal)), 1, 'Mismatch of `arequal` '
+                'checksum among {} is identified'.format(subvol[0:stop]))
+
+        g.log.info('Pass: Resolution of gfid split-brain via `source-brick`, '
+                   '`bigger-file` and `latest-mtime` methods is complete')
author	Leela Venkaiah G <lgangava@redhat.com>	2020-09-04 19:01:31 +0530
committer	Leela Venkaiah G <lgangava@redhat.com>	2020-09-08 10:46:47 +0530
commit	c79453974f5602830f475fd5f133d5771ce81f23 (patch)
tree	92db0462212a1ce9b62ce1d787311a4c4537a469 /tests/functional/afr/test_gfid_split_brain_resolution.py
parent	b418f87f90671eed7158d7f7707c1a6c74b7c5ec (diff)