From c79453974f5602830f475fd5f133d5771ce81f23 Mon Sep 17 00:00:00 2001
From: Leela Venkaiah G
Date: Fri, 4 Sep 2020 19:01:31 +0530
Subject: [Test] Resolve and validate gfid split files

Three Scenarios:
- Simulate gfid split brain files under a directory
- Resolve gfid splits using `source-brick`, `bigger-file` and `latest-mtime`
  methods
- Validate all the files are healed and data is consistent

Change-Id: I8b143f341c0db2f32086ecb6878cbfe3bdb247ce
Signed-off-by: Leela Venkaiah G
---
 .../afr/test_gfid_split_brain_resolution.py | 428 ++++++++++-----------
 1 file changed, 200 insertions(+), 228 deletions(-)

diff --git a/tests/functional/afr/test_gfid_split_brain_resolution.py b/tests/functional/afr/test_gfid_split_brain_resolution.py
index 8d8317a01..6e74376fc 100644
--- a/tests/functional/afr/test_gfid_split_brain_resolution.py
+++ b/tests/functional/afr/test_gfid_split_brain_resolution.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2017-2018 Red Hat, Inc.
+# Copyright (C) 2017-2020 Red Hat, Inc.
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -14,260 +14,232 @@
 # with this program; if not, write to the Free Software Foundation, Inc.,
 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 
+from random import choice
+
 from glusto.core import Glusto as g
-from glustolibs.gluster.gluster_base_class import (GlusterBaseClass, runs_on)
-from glustolibs.gluster.exceptions import ExecutionError
-from glustolibs.gluster.volume_libs import get_subvols
+
 from glustolibs.gluster.brick_libs import (bring_bricks_offline,
-                                           bring_bricks_online,
-                                           are_bricks_offline,
-                                           wait_for_bricks_to_be_online,
-                                           get_all_bricks)
-from glustolibs.gluster.volume_ops import set_volume_options
+                                           bring_bricks_online)
+from glustolibs.gluster.exceptions import ExecutionError
+from glustolibs.gluster.gluster_base_class import GlusterBaseClass, runs_on
 from glustolibs.gluster.glusterdir import mkdir
-from glustolibs.gluster.heal_ops import (enable_self_heal_daemon,
-                                         trigger_heal)
 from glustolibs.gluster.heal_libs import (
-    is_volume_in_split_brain,
-    is_heal_complete,
-    wait_for_self_heal_daemons_to_be_online,
-    monitor_heal_completion)
-from glustolibs.gluster.glusterfile import GlusterFile
+    is_volume_in_split_brain, monitor_heal_completion,
+    wait_for_self_heal_daemons_to_be_online)
+from glustolibs.gluster.heal_ops import (enable_self_heal_daemon, trigger_heal,
+                                         trigger_heal_full)
+from glustolibs.gluster.lib_utils import collect_bricks_arequal, list_files
+from glustolibs.gluster.volume_libs import get_subvols
+from glustolibs.gluster.volume_ops import set_volume_options
 
 
-@runs_on([['replicated', 'distributed-replicated'],
-          ['glusterfs']])
+# pylint: disable=stop-iteration-return, too-many-locals, too-many-statements
+@runs_on([[
+    'replicated', 'distributed-replicated', 'arbiter', 'distributed-arbiter'
+], ['glusterfs']])
 class TestSelfHeal(GlusterBaseClass):
-    """
-    Description:
-        Test cases related to
-        healing in default configuration of the volume
-    """
-
-    @classmethod
-    def setUpClass(cls):
-        # Calling GlusterBaseClass setUpClass
-        cls.get_super_method(cls, 'setUpClass')()
-
-        # Override replica count to be 3
-        if cls.volume_type == "replicated":
-            cls.volume['voltype'] = {
-                'type': 'replicated',
-                'replica_count': 3,
-                'transport': 'tcp'}
-
-        if cls.volume_type == "distributed-replicated":
-            cls.volume['voltype'] = {
-                'type': 'distributed-replicated',
-                'dist_count': 2,
-                'replica_count': 3,
-                'transport': 'tcp'}
-
     def setUp(self):
-        # Calling GlusterBaseClass setUp
         self.get_super_method(self, 'setUp')()
 
-        # Setup Volume and Mount Volume
-        g.log.info("Starting to Setup Volume and Mount Volume")
-        ret = self.setup_volume_and_mount_volume(mounts=self.mounts,
-                                                 volume_create_force=False)
-        if not ret:
-            raise ExecutionError("Failed to Setup_Volume and Mount_Volume")
-        g.log.info("Successful in Setup Volume and Mount Volume")
+        # A single mount is enough for the test
+        self.mounts = self.mounts[0::-1]
 
-        self.bricks_list = get_all_bricks(self.mnode, self.volname)
+        if not self.setup_volume_and_mount_volume(mounts=self.mounts):
+            raise ExecutionError('Failed to setup and mount '
+                                 '{}'.format(self.volname))
 
     def tearDown(self):
-        """
-        If test method failed before validating IO, tearDown waits for the
-        IO's to complete and checks for the IO exit status
-
-        Cleanup and umount volume
-        """
-        # Cleanup and umount volume
-        g.log.info("Starting to Unmount Volume and Cleanup Volume")
-        ret = self.unmount_volume_and_cleanup_volume(mounts=self.mounts)
-        if not ret:
-            raise ExecutionError("Failed to umount the vol & cleanup Volume")
-        g.log.info("Successful in umounting the volume and Cleanup")
-
-        # Calling GlusterBaseClass teardown
+        if not self.unmount_volume_and_cleanup_volume(mounts=self.mounts):
+            raise ExecutionError('Not able to unmount and cleanup '
+                                 '{}'.format(self.volname))
         self.get_super_method(self, 'tearDown')()
 
-    def toggle_bricks_and_perform_io(self, file_list, brick_list):
+    @staticmethod
+    def _get_two_bricks(subvols, arbiter):
+        """
+        Yields two bricks from each subvol for dist/pure X arb/repl volumes
+        """
+        # Get an iterator for py2/3 compatibility
+        brick_iter = iter(zip(*subvols))
+        prev_brick = next(brick_iter)
+        first_brick = prev_brick
+
+        for index, curr_brick in enumerate(brick_iter, 1):
+            # `yield` should contain arbiter brick for arbiter type vols
+            if not (index == 1 and arbiter):
+                yield prev_brick + curr_brick
+            prev_brick = curr_brick
+        # At the end yield first and last brick from a subvol
+        yield prev_brick + first_brick
+
+    def _get_files_in_brick(self, brick_path, dir_path):
         """
-        Kills bricks, does I/O and brings the brick back up.
+        Returns files in format of `dir_path/file_name` from the given brick
+        path
        """
-        # Bring down bricks.
-        g.log.info("Going to bring down the brick process for %s", brick_list)
-        ret = bring_bricks_offline(self.volname, brick_list)
-        self.assertTrue(ret, ("Failed to bring down the bricks. Please "
Please " - "check the log file for more details.")) - g.log.info("Brought down the brick process " - "for %s successfully", brick_list) - ret = are_bricks_offline(self.mnode, self.volname, brick_list) - self.assertTrue(ret, 'Bricks %s are not offline' % brick_list) - - # Perform I/O - for filename in file_list: - fpath = self.mounts[0].mountpoint + "/test_gfid_split_brain/" + \ - filename - cmd = ("dd if=/dev/urandom of=%s bs=1024 count=1" % fpath) - ret, _, _ = g.run(self.clients[0], cmd) - self.assertEqual(ret, 0, "Creating %s failed" % fpath) - - # Bring up bricks - ret = bring_bricks_online(self.mnode, self.volname, brick_list) - self.assertTrue(ret, 'Failed to bring brick %s online' % brick_list) - g.log.info('Bringing brick %s online is successful', brick_list) - - # Waiting for bricks to come online - g.log.info("Waiting for brick process to come online") - timeout = 30 - ret = wait_for_bricks_to_be_online(self.mnode, self.volname, timeout) - self.assertTrue(ret, "bricks didn't come online after adding bricks") - g.log.info("Bricks are online") - - def resolve_gfid_split_brain(self, filename, source_brick): + node, path = brick_path.split(':') + files = list_files(node, path, dir_path) + self.assertIsNotNone( + files, 'Unable to get list of files from {}'.format(brick_path)) + + files = [file_name.rsplit('/', 1)[-1] for file_name in files] + return [ + each_file for each_file in files + if each_file in ('file1', 'file2', 'file3') + ] + + def _run_cmd_and_assert(self, cmd): """ - resolves gfid split-brain on files using source-brick option + Run `cmd` on `mnode` and assert for success """ - node, _ = source_brick.split(':') - command = ("gluster volume heal " + self.volname + " split-brain " - "source-brick " + source_brick + " " + filename) - ret, _, _ = g.run(node, command) - self.assertEqual(ret, 0, "command execution not successful") + ret, _, err = g.run(self.mnode, cmd) + self.assertEqual(ret, 0, '`{}` failed with {}'.format(cmd, err)) def test_gfid_split_brain_resolution(self): """ - - create gfid split-brain of files and resolves them using source-brick - option of the CLI. + Description: Simulates gfid split brain on multiple files in a dir and + resolve them via `bigger-file`, `mtime` and `source-brick` methods + + Steps: + - Create and mount a replicated volume, create a dir and ~10 data files + - Simulate gfid splits in 9 of the files + - Resolve each 3 set of files using `bigger-file`, `mtime` and + `source-bricks` split-brain resoultion methods + - Trigger and monitor for heal completion + - Validate all the files are healed and arequal matches for bricks in + subvols """ - - # pylint: disable=too-many-statements - # pylint: disable=too-many-locals - - # Disable all self-heals and client-quorum - options = {"self-heal-daemon": "off", - "data-self-heal": "off", - "metadata-self-heal": "off", - "entry-self-heal": "off", - "cluster.quorum-type": "none"} - g.log.info("setting volume options %s", options) - ret = set_volume_options(self.mnode, self.volname, options) - self.assertTrue(ret, ("Unable to set volume option %s for " - "volume %s" % (options, self.volname))) - g.log.info("Successfully set %s for volume %s", options, self.volname) - - # Create dir inside which I/O will be performed. 
-        ret = mkdir(self.mounts[0].client_system, "%s/test_gfid_split_brain"
-                    % self.mounts[0].mountpoint)
-        self.assertTrue(ret, "mkdir failed")
-
-        # get the subvolumes
-        g.log.info("Starting to get sub-volumes for volume %s", self.volname)
-        subvols_dict = get_subvols(self.mnode, self.volname)
-        num_subvols = len(subvols_dict['volume_subvols'])
-        g.log.info("Number of subvolumes in volume %s:", num_subvols)
-
-        # Toggle bricks and perform I/O
-        file_list = ["file1.txt", "file2.txt", "file3.txt", "file4.txt",
-                     "file5.txt", "file6.txt", "file7.txt", "file8.txt",
-                     "file9.txt", "file10.txt"]
-        brick_index = 0
-        offline_bricks = []
-        for _ in range(0, 3):
-            for i in range(0, num_subvols):
-                subvol_brick_list = subvols_dict['volume_subvols'][i]
-                offline_bricks.append(subvol_brick_list[brick_index % 3])
-                offline_bricks.append(subvol_brick_list[(brick_index+1) % 3])
-            self.toggle_bricks_and_perform_io(file_list, offline_bricks)
-            brick_index += 1
-            offline_bricks[:] = []
-
-        # Enable shd
-        g.log.info("enabling the self heal daemon")
+        io_cmd = 'cat /dev/urandom | tr -dc [:space:][:print:] | head -c '
+        client, m_point = (self.mounts[0].client_system,
+                           self.mounts[0].mountpoint)
+        arbiter = self.volume_type.find('arbiter') >= 0
+
+        # Disable self-heal daemon and set `quorum-type` option to `none`
+        ret = set_volume_options(self.mnode, self.volname, {
+            'self-heal-daemon': 'off',
+            'cluster.quorum-type': 'none'
+        })
+        self.assertTrue(
+            ret, 'Not able to disable `quorum-type` and '
+            '`self-heal` daemon volume options')
+
+        # Create required dir and files from the mount
+        split_dir = 'gfid_split_dir'
+        file_io = ('cd %s; for i in {1..10}; do ' + io_cmd +
+                   ' 1M > %s/file$i; done;')
+        ret = mkdir(client, '{}/{}'.format(m_point, split_dir))
+        self.assertTrue(ret, 'Unable to create a directory from mount point')
+        ret, _, _ = g.run(client, file_io % (m_point, split_dir))
+
+        # `file{4,5,6}` are re-created every time to be used in `bigger-file`
+        # resolution method
+        cmd = 'rm -rf {0}/file{1} && {2} {3}M > {0}/file{1}'
+        split_cmds = {
+            1:
+            ';'.join(cmd.format(split_dir, i, io_cmd, 2) for i in range(1, 7)),
+            2:
+            ';'.join(cmd.format(split_dir, i, io_cmd, 3) for i in range(4, 7)),
+            3: ';'.join(
+                cmd.format(split_dir, i, io_cmd, 1) for i in range(4, 10)),
+            4: ';'.join(
+                cmd.format(split_dir, i, io_cmd, 1) for i in range(7, 10)),
+        }
+
+        # Get subvols and simulate entry split brain
+        subvols = get_subvols(self.mnode, self.volname)['volume_subvols']
+        self.assertTrue(subvols, 'Not able to get list of subvols')
+        msg = ('Unable to bring files under {} dir to entry split brain while '
+               '{} are down')
+        for index, bricks in enumerate(self._get_two_bricks(subvols, arbiter),
+                                       1):
+            # Bring down two bricks from each subvol
+            ret = bring_bricks_offline(self.volname, list(bricks))
+            self.assertTrue(ret, 'Unable to bring {} offline'.format(bricks))
+
+            ret, _, _ = g.run(client,
+                              'cd {}; {}'.format(m_point, split_cmds[index]))
+            self.assertEqual(ret, 0, msg.format(split_dir, bricks))
+
+            # Bricks are brought down only twice for arbiter volumes, so bring
+            # the remaining files into split brain here for `latest-mtime` heal
+            if arbiter and index == 2:
+                ret, _, _ = g.run(client,
+                                  'cd {}; {}'.format(m_point, split_cmds[4]))
+                self.assertEqual(ret, 0, msg.format(split_dir, bricks))
+
+            # Bring offline bricks online
+            ret = bring_bricks_online(
+                self.mnode,
+                self.volname,
+                bricks,
+                bring_bricks_online_methods='volume_start_force')
+            self.assertTrue(ret, 'Unable to bring {} online'.format(bricks))
+
+
+        # Enable self-heal daemon, trigger heal and assert volume is in split
+        # brain condition
         ret = enable_self_heal_daemon(self.mnode, self.volname)
-        self.assertTrue(ret, "failed to enable self heal daemon")
-        g.log.info("Successfully enabled the self heal daemon")
-
-        # Wait for self heal processes to come online
-        g.log.info("Wait for selfheal process to come online")
-        timeout = 300
-        ret = wait_for_self_heal_daemons_to_be_online(self.mnode, self.volname,
-                                                      timeout)
-        self.assertTrue(ret, "Self-heal process are not online")
-        g.log.info("All self heal process are online")
-
-        # Trigger heal
+        self.assertTrue(ret, 'Failed to enable self heal daemon')
+
+        ret = wait_for_self_heal_daemons_to_be_online(self.mnode, self.volname)
+        self.assertTrue(ret, 'Not all self heal daemons are online')
+
         ret = trigger_heal(self.mnode, self.volname)
-        self.assertTrue(ret, 'Starting heal failed')
-        g.log.info('Index heal launched')
+        self.assertTrue(ret, 'Unable to trigger index heal on the volume')
 
-        # checking if file is in split-brain
         ret = is_volume_in_split_brain(self.mnode, self.volname)
-        self.assertTrue(ret, "Files are not in split-brain as expected.")
-        g.log.info("Files are still in split-brain")
-
-        # First brick of each replica will be used as source-brick
-        first_brick_list = []
-        for i in range(0, num_subvols):
-            subvol_brick_list = subvols_dict['volume_subvols'][i]
-            brick = subvol_brick_list[0]
-            first_brick_list.append(brick)
-
-        # Find which dht subvols the 10 files are present in and trigger heal
-        for filename in file_list:
-            fpath = self.mounts[0].mountpoint + "/test_gfid_split_brain/" + \
-                    filename
-            gfile = GlusterFile(self.clients[0], fpath)
-            for brick in first_brick_list:
-                _, brick_path = brick.split(':')
-                match = [brick for item in gfile.hashed_bricks if brick_path
-                         in item]
-                if match:
-                    self.resolve_gfid_split_brain("/test_gfid_split_brain/" +
-                                                  filename, brick)
-
-        # Trigger heal to complete pending data/metadata heals
-        ret = trigger_heal(self.mnode, self.volname)
-        self.assertTrue(ret, 'Starting heal failed')
-        g.log.info('Index heal launched')
+        self.assertTrue(ret, 'Volume should be in split brain condition')
+
+        # Select source brick and take note of files in source brick
+        stop = len(subvols[0]) - 1 if arbiter else len(subvols[0])
+        source_bricks = [choice(subvol[0:stop]) for subvol in subvols]
+        files = [
+            self._get_files_in_brick(path, split_dir) for path in source_bricks
+        ]
+
+        # Resolve `file1, file2, file3` gfid split files using `source-brick`
+        cmd = ('gluster volume heal ' + self.volname + ' split-brain '
+               'source-brick {} /' + split_dir + '/{}')
+        for index, source_brick in enumerate(source_bricks):
+            for each_file in files[index]:
+                run_cmd = cmd.format(source_brick, each_file)
+                self._run_cmd_and_assert(run_cmd)
+
+        # Resolve `file4, file5, file6` gfid split files using `bigger-file`
+        cmd = ('gluster volume heal ' + self.volname +
+               ' split-brain bigger-file /' + split_dir + '/{}')
+        for each_file in ('file4', 'file5', 'file6'):
+            run_cmd = cmd.format(each_file)
+            self._run_cmd_and_assert(run_cmd)
+
+        # Resolve `file7, file8, file9` gfid split files using `latest-mtime`
+        cmd = ('gluster volume heal ' + self.volname +
+               ' split-brain latest-mtime /' + split_dir + '/{}')
+        for each_file in ('file7', 'file8', 'file9'):
+            run_cmd = cmd.format(each_file)
+            self._run_cmd_and_assert(run_cmd)
+
+        # Unless `shd` is triggered manually/automatically files will still
+        # appear in `heal info`
+        ret = trigger_heal_full(self.mnode, self.volname)
+        self.assertTrue(ret, 'Unable to trigger full self heal')
 
         # Monitor heal completion
         ret = monitor_heal_completion(self.mnode, self.volname)
-        self.assertTrue(ret, 'Heal has not yet completed')
-
-        # Check if heal is completed
-        ret = is_heal_complete(self.mnode, self.volname)
-        self.assertTrue(ret, 'Heal is not complete')
-        g.log.info('Heal is completed successfully')
-
-        # Get arequals and compare
-        for i in range(0, num_subvols):
-            # Get arequal for first brick
-            subvol_brick_list = subvols_dict['volume_subvols'][i]
-            node, brick_path = subvol_brick_list[0].split(':')
-            command = ('arequal-checksum -p %s '
-                       '-i .glusterfs -i .landfill -i .trashcan'
-                       % brick_path)
-            ret, arequal, _ = g.run(node, command)
-            first_brick_total = arequal.splitlines()[-1].split(':')[-1]
-
-            # Get arequal for every brick and compare with first brick
-            for brick in subvol_brick_list[1:]:
-                node, brick_path = brick.split(':')
-                command = ('arequal-checksum -p %s '
-                           '-i .glusterfs -i .landfill -i .trashcan'
-                           % brick_path)
-                ret, brick_arequal, _ = g.run(node, command)
-                self.assertFalse(ret,
-                                 'Failed to get arequal on brick %s'
-                                 % brick)
-                g.log.info('Getting arequal for %s is successful', brick)
-                brick_total = brick_arequal.splitlines()[-1].split(':')[-1]
-
-                self.assertEqual(first_brick_total, brick_total,
-                                 'Arequals for subvol and %s are not equal'
-                                 % brick)
-                g.log.info('Arequals for subvol and %s are equal', brick)
+        self.assertTrue(
+            ret, 'All files in volume should be healed after healing files via'
+            ' `source-brick`, `bigger-file`, `latest-mtime` methods manually')
+
+        # Validate normal file `file10` and healed files don't differ in
+        # subvols via an `arequal`
+        for subvol in subvols:
+            # Disregard last brick if volume is of arbiter type
+            ret, arequal = collect_bricks_arequal(subvol[0:stop])
+            self.assertTrue(
+                ret, 'Unable to get `arequal` checksum on '
+                '{}'.format(subvol[0:stop]))
+            self.assertEqual(
+                len(set(arequal)), 1, 'Mismatch of `arequal` '
+                'checksum among {} is identified'.format(subvol[0:stop]))
+
+        g.log.info('Pass: Resolution of gfid split-brain via `source-brick`, '
+                   '`bigger-file` and `latest-mtime` methods is complete')
--