From bc4004c19c39c1fd74b900bdcd1d91fe28a9d447 Mon Sep 17 00:00:00 2001 From: Ravishankar N Date: Thu, 15 Oct 2020 15:17:45 +0530 Subject: check that heal info does not hang Check that when there are pending heals and healing and I/O are going on, heal info completes successfully. Change-Id: I7b00c5b6446d6ec722c1c48a50e5293272df0fdf Signed-off-by: Ravishankar N --- .../functional/afr/heal/test_heal_info_no_hang.py | 162 +++++++++++++++++++++ 1 file changed, 162 insertions(+) create mode 100644 tests/functional/afr/heal/test_heal_info_no_hang.py diff --git a/tests/functional/afr/heal/test_heal_info_no_hang.py b/tests/functional/afr/heal/test_heal_info_no_hang.py new file mode 100644 index 000000000..82f8b0598 --- /dev/null +++ b/tests/functional/afr/heal/test_heal_info_no_hang.py @@ -0,0 +1,162 @@ +# Copyright (C) 2020 Red Hat, Inc. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along` +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-131 USA. + +""" +Description: + heal info completes when there is ongoing I/O and a lot of pending heals. +""" +import random +from glusto.core import Glusto as g + +from glustolibs.gluster.gluster_base_class import (GlusterBaseClass, runs_on) +from glustolibs.gluster.exceptions import ExecutionError +from glustolibs.gluster.volume_libs import get_subvols +from glustolibs.gluster.brick_libs import (bring_bricks_offline, + bring_bricks_online, + are_bricks_offline, + get_all_bricks) +from glustolibs.gluster.heal_ops import trigger_heal +from glustolibs.io.utils import run_linux_untar +from glustolibs.gluster.glusterdir import mkdir + + +@runs_on([['distributed-replicated'], + ['glusterfs']]) +class TestHealInfoNoHang(GlusterBaseClass): + + def setUp(self): + self.get_super_method(self, 'setUp')() + + self.is_io_running = False + + ret = self.setup_volume_and_mount_volume(mounts=self.mounts, + volume_create_force=False) + if not ret: + raise ExecutionError("Failed to Setup_Volume and Mount_Volume") + g.log.info("Successful in Setup Volume and Mount Volume") + + self.bricks_list = get_all_bricks(self.mnode, self.volname) + self.subvols = get_subvols(self.mnode, self.volname)['volume_subvols'] + + def tearDown(self): + if self.is_io_running: + if not self._wait_for_untar_completion(): + g.log.error("I/O failed to stop on clients") + + ret = self.unmount_volume_and_cleanup_volume(mounts=self.mounts) + if not ret: + raise ExecutionError("Failed to umount the vol & cleanup Volume") + g.log.info("Successful in umounting the volume and Cleanup") + + self.get_super_method(self, 'tearDown')() + + def _wait_for_untar_completion(self): + """Wait for the kernel untar to complete""" + has_process_stopped = [] + for proc in self.list_of_io_processes: + try: + ret, _, _ = proc.async_communicate() + if not ret: + has_process_stopped.append(False) + has_process_stopped.append(True) + except ValueError: + has_process_stopped.append(True) + return all(has_process_stopped) + + def _does_heal_info_complete_within_timeout(self): + """Check if heal info CLI completes within a specific timeout""" + # We are just assuming 1 entry takes one second to process, which is + # a very high number but some estimate is better than a random magic + # value for timeout. + timeout = self.num_entries * 1 + + # heal_info_data = get_heal_info(self.mnode, self.volname) + cmd = "timeout %s gluster volume heal %s info" % (timeout, + self.volname) + ret, _, _ = g.run(self.mnode, cmd) + if ret: + return False + return True + + def test_heal_info_no_hang(self): + """ + Testcase steps: + 1. Start kernel untar on the mount + 2. While untar is going on, kill a brick of the replica. + 3. Wait for the untar to be over, resulting in pending heals. + 4. Get the approx. number of pending heals and save it + 5. Bring the brick back online. + 6. Trigger heal + 7. Run more I/Os with dd command + 8. Run heal info command and check that it completes successfully under + a timeout that is based on the no. of heals in step 4. + """ + self.list_of_io_processes = [] + self.linux_untar_dir = "{}/{}".format(self.mounts[0].mountpoint, + "linuxuntar") + ret = mkdir(self.clients[0], self.linux_untar_dir) + self.assertTrue(ret, "Failed to create dir linuxuntar for untar") + + # Start linux untar on dir linuxuntar + ret = run_linux_untar(self.clients[0], self.mounts[0].mountpoint, + dirs=tuple(['linuxuntar'])) + self.list_of_io_processes += ret + self.is_io_running = True + + # Kill brick resulting in heal backlog. + brick_to_bring_offline = random.choice(self.bricks_list) + ret = bring_bricks_offline(self.volname, brick_to_bring_offline) + self.assertTrue(ret, 'Failed to bring bricks %s offline' + % brick_to_bring_offline) + ret = are_bricks_offline(self.mnode, self.volname, + [brick_to_bring_offline]) + self.assertTrue(ret, 'Bricks %s are not offline' + % brick_to_bring_offline) + g.log.info('Bringing bricks %s offline is successful', + brick_to_bring_offline) + + ret = self._wait_for_untar_completion() + self.assertFalse(ret, "IO didn't complete or failed on client") + self.is_io_running = False + + # Get approx. no. of entries to be healed. + cmd = ("gluster volume heal %s statistics heal-count | grep Number " + "| awk '{sum+=$4} END {print sum/2}'" % self.volname) + ret, self.num_entries, _ = g.run(self.mnode, cmd) + self.assertEqual(ret, 0, "Failed to get heal-count statistics") + + # Restart the down bricks + ret = bring_bricks_online(self.mnode, self.volname, + brick_to_bring_offline) + self.assertTrue(ret, 'Failed to bring brick %s online' % + brick_to_bring_offline) + g.log.info('Bringing brick %s online is successful', + brick_to_bring_offline) + # Trigger heal + ret = trigger_heal(self.mnode, self.volname) + self.assertTrue(ret, 'Starting heal failed') + g.log.info('Index heal launched') + + # Run more I/O + cmd = ("for i in `seq 1 10`; do dd if=/dev/urandom of=%s/file_$i " + "bs=1M count=100; done" % self.mounts[0].mountpoint) + ret = g.run_async(self.mounts[0].client_system, cmd, + user=self.mounts[0].user) + + # Get heal info + ret = self._does_heal_info_complete_within_timeout() + self.assertTrue(ret, 'Heal info timed out') + g.log.info('Heal info completed succesfully') -- cgit