From b47b449595cd179f4f6a00a10f0e1078287f6171 Mon Sep 17 00:00:00 2001
From: Leela Venkaiah G
Date: Fri, 19 Jun 2020 17:11:13 +0530
Subject: [Test] IO continuity on brick down in EC Volume

Test Steps:
- Create, start and mount an EC volume on two clients
- Create multiple files and directories, including all file types, in one
  directory from client 1
- Take an arequal checksum of the above data
- Create another directory and pump different fops from client 2
- Fail and bring up redundant bricks in a cyclic fashion in all of the
  subvols, maintaining a minimum delay between each operation
- In every cycle, create a new dir while a brick is down and wait for heal
- Validate that heal info on the volume completes within 5s while a brick
  is down
- Validate arequal of the static directory while bricks are offline

Change-Id: Ied5e0787eef786e5af7ea70191f5521b9d5e34f6
Signed-off-by: Leela Venkaiah G
---
 tests/functional/disperse/test_ec_io_continuity.py | 215 +++++++++++++++++++++
 1 file changed, 215 insertions(+)
 create mode 100644 tests/functional/disperse/test_ec_io_continuity.py

diff --git a/tests/functional/disperse/test_ec_io_continuity.py b/tests/functional/disperse/test_ec_io_continuity.py
new file mode 100644
index 000000000..2a1510ce0
--- /dev/null
+++ b/tests/functional/disperse/test_ec_io_continuity.py
@@ -0,0 +1,215 @@
+# Copyright (C) 2020 Red Hat, Inc.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+from datetime import datetime, timedelta
+from time import sleep
+
+from glusto.core import Glusto as g
+
+from glustolibs.gluster.brick_libs import bring_bricks_offline
+from glustolibs.gluster.exceptions import ExecutionError
+from glustolibs.gluster.gluster_base_class import GlusterBaseClass, runs_on
+from glustolibs.gluster.glusterdir import mkdir
+from glustolibs.gluster.glusterfile import create_link_file
+from glustolibs.gluster.heal_libs import monitor_heal_completion
+from glustolibs.gluster.heal_ops import heal_info
+from glustolibs.gluster.volume_libs import get_subvols, volume_start
+from glustolibs.io.utils import (collect_mounts_arequal, validate_io_procs,
+                                 wait_for_io_to_complete)
+from glustolibs.misc.misc_libs import upload_scripts
+
+
+@runs_on([['dispersed', 'distributed-dispersed'], ['glusterfs', 'nfs']])
+class TestIOsOnECVolume(GlusterBaseClass):
+    @classmethod
+    def setUpClass(cls):
+        cls.get_super_method(cls, 'setUpClass')()
+        cls.script_path = '/usr/share/glustolibs/io/scripts'
+        for file_ops in ('file_dir_ops.py', 'fd_writes.py'):
+            ret = upload_scripts(cls.clients,
+                                 '{}/{}'.format(cls.script_path, file_ops))
+            if not ret:
+                raise ExecutionError('Failed to upload IO scripts to clients')
+
+    def setUp(self):
+        self.get_super_method(self, 'setUp')()
+        ret = self.setup_volume_and_mount_volume(mounts=self.mounts)
+        self.all_mounts_procs = []
+        if not ret:
+            raise ExecutionError('Failed to setup and mount volume')
+
+    def tearDown(self):
+        if self.all_mounts_procs:
+            ret = wait_for_io_to_complete(self.all_mounts_procs,
+                                          [self.mounts[1]] *
+                                          len(self.all_mounts_procs))
+            if not ret:
+                raise ExecutionError('Wait for IO completion failed on some '
+                                     'of the clients')
+        ret = self.unmount_volume_and_cleanup_volume(mounts=self.mounts)
+        if not ret:
+            raise ExecutionError("Not able to unmount and cleanup volume")
+        self.get_super_method(self, 'tearDown')()
+
+    def _bring_bricks_online_and_monitor_heal(self, bricks):
+        """Bring the bricks online and monitor heal until completion"""
+        ret, _, _ = volume_start(self.mnode, self.volname, force=True)
+        self.assertEqual(ret, 0, 'Not able to force start volume')
+        ret = monitor_heal_completion(self.mnode,
+                                      self.volname,
+                                      bricks=list(bricks))
+        self.assertTrue(ret, 'Heal is not complete for {}'.format(bricks))
+
+    # pylint: disable=too-many-locals
+    def test_io_with_cyclic_brick_down(self):
+        """
+        Description: To check the heal process on an EC volume when bricks
+        are brought down in a cyclic fashion
+        Steps:
+        - Create, start and mount an EC volume on two clients
+        - Create multiple files and directories, including all file types,
+          in one directory from client 1
+        - Take an arequal checksum of the above data
+        - Create another directory and pump different fops from client 2
+        - Fail and bring up redundant bricks in a cyclic fashion in all of
+          the subvols, maintaining a minimum delay between each operation
+        - In every cycle, create a new dir while a brick is down and wait
+          for heal
+        - Validate that heal info on the volume completes within 5s while
+          a brick is down
+        - Validate arequal of the static directory while bricks are offline
+        """
+
+        # Create a directory structure on the mount from client 1
+        mount_obj = self.mounts[0]
+        cmd = ('/usr/bin/env python {}/file_dir_ops.py '
+               'create_deep_dirs_with_files --dir-depth 3 '
+               '--max-num-of-dirs 5 --fixed-file-size 10k '
+               '--num-of-files 9 {}'.format(self.script_path,
+                                            mount_obj.mountpoint))
+        ret, _, _ = g.run(mount_obj.client_system, cmd)
+        self.assertEqual(ret, 0, 'Not able to create directory structure')
+        dir_name = 'user1'
+        for i in range(5):
+            ret = create_link_file(
+                mount_obj.client_system,
+                '{}/{}/testfile{}.txt'.format(mount_obj.mountpoint, dir_name,
+                                              i),
+                '{}/{}/testfile{}_sl.txt'.format(mount_obj.mountpoint,
+                                                 dir_name, i),
+                soft=True)
+            self.assertTrue(ret, 'Not able to create soft links')
+        for i in range(5, 9):
+            ret = create_link_file(
+                mount_obj.client_system,
+                '{}/{}/testfile{}.txt'.format(mount_obj.mountpoint, dir_name,
+                                              i),
+                '{}/{}/testfile{}_hl.txt'.format(mount_obj.mountpoint,
+                                                 dir_name, i))
+            self.assertTrue(ret, 'Not able to create hard links')
+        g.log.info('Successfully created directory structure consisting of '
+                   'all file types on mount')
+
+        # Take note of the arequal checksum
+        ret, exp_arequal = collect_mounts_arequal(mount_obj, path=dir_name)
+        self.assertTrue(ret, 'Failed to get arequal checksum on mount')
+
+        # Get all the subvols in the volume
+        subvols = get_subvols(self.mnode, self.volname)
+        self.assertTrue(subvols.get('volume_subvols'), 'Not able to get '
+                        'subvols of the volume')
+
+        # Create a dir, pump IO in that dir, offline b1, wait for IO and
+        # online b1, wait for heal of b1, bring b2 offline...
+        m_point, m_client = (self.mounts[1].mountpoint,
+                             self.mounts[1].client_system)
+        cur_off_bricks = ''
+        for count, off_brick in enumerate(zip(*subvols.get('volume_subvols')),
+                                          start=1):
+
+            # Bring offline bricks online by force starting the volume
+            if cur_off_bricks:
+                self._bring_bricks_online_and_monitor_heal(cur_off_bricks)
+
+            # Create a dir for running IO
+            ret = mkdir(m_client, '{}/dir{}'.format(m_point, count))
+            self.assertTrue(
+                ret, 'Not able to create directory for '
+                'starting IO before offline of brick')
+
+            # Start IO in the newly created directory
+            cmd = ('/usr/bin/env python {}/fd_writes.py -n 10 -t 480 -d 5 -c '
+                   '16 --dir {}/dir{}'.format(self.script_path, m_point,
+                                              count))
+            proc = g.run_async(m_client, cmd)
+            self.all_mounts_procs.append(proc)
+
+            # Wait for IO to partially fill the dir
+            sleep(10)
+
+            # Bring a single brick offline from each of the subvols
+            ret = bring_bricks_offline(self.volname, list(off_brick))
+            self.assertTrue(ret,
+                            'Not able to bring {} offline'.format(off_brick))
+
+            # Validate heal info query completes in < 5s with bricks offline
+            start_time = datetime.now().replace(microsecond=0)
+            ret, _, _ = heal_info(self.mnode, self.volname)
+            end_time = datetime.now().replace(microsecond=0)
+            self.assertEqual(
+                ret, 0, 'Not able to query heal info status '
+                'of volume when a brick is offline')
+            self.assertLess(
+                end_time - start_time, timedelta(seconds=5),
+                'Query of heal info of volume when a brick is '
+                'offline is taking more than 5 seconds')
+
+            # Wait for some more IO to fill the dir
+            sleep(10)
+
+            # Validate arequal on the initial static dir
+            ret, act_arequal = collect_mounts_arequal(mount_obj,
+                                                      path=dir_name)
+            self.assertTrue(
+                ret, 'Failed to get arequal checksum on bringing '
+                'a brick offline')
+            self.assertEqual(
+                exp_arequal, act_arequal, 'Mismatch of arequal '
+                'checksum before and after killing a brick')
+
+            cur_off_bricks = off_brick
+
+        # Take note of the epoch time on the mount client
+        ret, prev_ctime, _ = g.run(m_client, 'date +%s')
+        self.assertEqual(ret, 0, 'Not able to get epoch time from client')
+
+        self._bring_bricks_online_and_monitor_heal(cur_off_bricks)
+
+        # Validate IO was happening during brick operations by comparing
+        # the ctime of the most recent file to the recorded epoch time
+        ret = validate_io_procs(self.all_mounts_procs,
+                                [self.mounts[1]] * len(self.all_mounts_procs))
+        self.assertTrue(ret,
+                        'Not able to validate completion of IO on mounts')
+        self.all_mounts_procs *= 0    # don't validate IO in tearDown
+        ret, curr_ctime, _ = g.run(
+            m_client, "find {} -type f -printf '%C@\n' | "
+            'sort -r | head -n 1'.format(m_point))
+        self.assertEqual(
+            ret, 0, 'Not able to get ctime of last edited file from mount')
+        self.assertGreater(
+            float(curr_ctime), float(prev_ctime), 'Not able '
+            'to validate IO was happening during brick operations')
+
+        g.log.info('Completed IO continuity test on EC volume successfully')
--
cgit
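
A few illustrative sketches follow; they are not part of the patch, and
every hostname, path and stand-in command in them is an assumption made
for the example.

The brick-cycling pattern above hinges on zip(*subvols): it transposes
the per-subvol brick lists so each cycle offlines exactly one brick from
every subvol, which a dispersed volume's redundancy can absorb. A minimal
sketch, with hardcoded lists standing in for the
get_subvols(...)['volume_subvols'] output:

# Stand-in for get_subvols(mnode, volname)['volume_subvols']; a real
# run queries the volume instead of hardcoding bricks.
subvols = [
    ['host1:/bricks/b0', 'host1:/bricks/b1', 'host1:/bricks/b2'],
    ['host2:/bricks/b0', 'host2:/bricks/b1', 'host2:/bricks/b2'],
]

# zip(*subvols) transposes the lists: cycle 1 yields the first brick of
# every subvol, cycle 2 the second, and so on, so no subvol ever has
# more than one brick down at a time.
for count, off_bricks in enumerate(zip(*subvols), start=1):
    print('Cycle {}: bring {} offline'.format(count, list(off_bricks)))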
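
The heal-info check is a plain wall-clock bound: record a timestamp, run
the query, and require the delta to stay under five seconds. The same
pattern in a self-contained form, with subprocess.call on a POSIX 'true'
binary as a stand-in for heal_info(mnode, volname):

from datetime import datetime, timedelta
from subprocess import call

# Stand-in for heal_info(mnode, volname); any command whose latency
# needs an upper bound fits the same pattern.
start_time = datetime.now().replace(microsecond=0)
ret = call(['true'])
end_time = datetime.now().replace(microsecond=0)

assert ret == 0, 'stand-in for heal info query failed'
assert end_time - start_time < timedelta(seconds=5), \
    'query took 5 seconds or more'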
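
The closing IO validation records an epoch with 'date +%s' before the
last bricks are brought back online, then requires the newest
regular-file ctime under the mount to be newer than that epoch. Note
that find evaluates its expression left to right, so '-type f' must
precede '-printf' or ctimes are printed for directories as well. A local
sketch under an illustrative directory, with a touch standing in for the
client IO:

import subprocess
from time import sleep, time

demo_dir = '/tmp/ctime_demo'  # illustrative; the mount point in the test
subprocess.check_call(['mkdir', '-p', demo_dir])

# Record the epoch before the IO we want to prove happened.
prev_ctime = time()
sleep(1)

# Stand-in for the fd_writes.py IO that runs while bricks cycle.
subprocess.check_call(['touch', '{}/newfile'.format(demo_dir)])

# Newest regular-file ctime under the directory; '-type f' filters
# entries before '-printf' prints them.
cmd = ("find {} -type f -printf '%C@\\n' | sort -r | head -n 1"
       .format(demo_dir))
curr_ctime = float(subprocess.check_output(cmd, shell=True).decode())

# IO happened during the window only if some file changed after the
# recorded epoch.
assert curr_ctime > prev_ctime, 'no file changed during the window'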