30 files changed, 3745 insertions, 118 deletions
diff --git a/glustolibs-gluster/glustolibs/gluster/dht_test_utils.py b/glustolibs-gluster/glustolibs/gluster/dht_test_utils.py index d205211bb..11f2eda62 100644 --- a/glustolibs-gluster/glustolibs/gluster/dht_test_utils.py +++ b/glustolibs-gluster/glustolibs/gluster/dht_test_utils.py @@ -456,3 +456,5 @@ def is_layout_complete(mnode, volname, dirpath): return False elif hash_difference < 1: g.log.error("Layout has overlaps") + + return True diff --git a/glustolibs-gluster/glustolibs/gluster/gluster_base_class.py b/glustolibs-gluster/glustolibs/gluster/gluster_base_class.py index b2fdc439d..65061cb13 100755 --- a/glustolibs-gluster/glustolibs/gluster/gluster_base_class.py +++ b/glustolibs-gluster/glustolibs/gluster/gluster_base_class.py @@ -1,4 +1,4 @@ -# Copyright (C) 2018-2020 Red Hat, Inc. <http://www.redhat.com> +# Copyright (C) 2018-2021 Red Hat, Inc. <http://www.redhat.com> # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -474,6 +474,9 @@ class GlusterBaseClass(TestCase): """ g.log.info("Starting to mount volume %s", cls.volname) for mount_obj in mounts: + # For nfs-ganesha, mount is done via vip + if cls.enable_nfs_ganesha: + mount_obj.server_system = cls.vips[0] g.log.info("Mounting volume '%s:%s' on '%s:%s'", mount_obj.server_system, mount_obj.volname, mount_obj.client_system, mount_obj.mountpoint) @@ -1057,6 +1060,7 @@ class GlusterBaseClass(TestCase): ret = setup_nfs_ganesha(cls) if not ret: raise ExecutionError("Failed to setup nfs ganesha") + g.log.info("Successful in setting up NFS Ganesha Cluster") msg = "Setupclass: %s : %s" % (cls.__name__, cls.glustotest_run_id) g.log.info(msg) diff --git a/glustolibs-gluster/glustolibs/gluster/heal_libs.py b/glustolibs-gluster/glustolibs/gluster/heal_libs.py index 91d720e41..4a551cd48 100755 --- a/glustolibs-gluster/glustolibs/gluster/heal_libs.py +++ b/glustolibs-gluster/glustolibs/gluster/heal_libs.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -# Copyright (C) 2016-2020 Red Hat, Inc. <http://www.redhat.com> +# Copyright (C) 2016-2021 Red Hat, Inc. 
<http://www.redhat.com> # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -551,3 +551,41 @@ def is_shd_daemon_running(mnode, node, volname): return True except KeyError: return False + + +def enable_granular_heal(mnode, volname): + """Enable granular heal on a given volume + + Args: + mnode(str): Node on which command has to be exectued + volname(str): Name of the volume on which granular heal is to be enabled + + Returns: + bool: True if granular heal is enabled successfully else False + """ + cmd = "gluster volume heal {} granular-entry-heal enable".format(volname) + ret, _, _ = g.run(mnode, cmd) + if ret: + g.log.error('Unable to enable granular-entry-heal on volume %s', + volname) + return False + return True + + +def disable_granular_heal(mnode, volname): + """Diable granular heal on a given volume + + Args: + mnode(str): Node on which command will be exectued + volname(str): Name of the volume on which granular heal is to be disabled + + Returns: + bool: True if granular heal is disabled successfully else False + """ + cmd = "gluster volume heal {} granular-entry-heal disable".format(volname) + ret, _, _ = g.run(mnode, cmd) + if ret: + g.log.error('Unable to disable granular-entry-heal on volume %s', + volname) + return False + return True diff --git a/glustolibs-gluster/glustolibs/gluster/lib_utils.py b/glustolibs-gluster/glustolibs/gluster/lib_utils.py index aa919e344..b04976b1c 100755 --- a/glustolibs-gluster/glustolibs/gluster/lib_utils.py +++ b/glustolibs-gluster/glustolibs/gluster/lib_utils.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -# Copyright (C) 2015-2020 Red Hat, Inc. <http://www.redhat.com> +# Copyright (C) 2015-2021 Red Hat, Inc. <http://www.redhat.com> # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -1016,6 +1016,30 @@ def group_add(servers, groupname): return True +def group_del(servers, groupname): + """ + Deletes a group in all the servers. + + Args: + servers(list|str): Nodes on which cmd is to be executed. + groupname(str): Name of the group to be removed. + + Return always True + """ + if not isinstance(servers, list): + servers = [servers] + + cmd = "groupdel %s" % groupname + results = g.run_parallel(servers, cmd) + + for server, ret_value in list(results.items()): + retcode, _, err = ret_value + if retcode != 0 and "does not exist" in err: + g.log.error("Group %s on server %s already removed", + groupname, server) + return True + + def ssh_keygen(mnode): """ Creates a pair of ssh private and public key if not present diff --git a/glustolibs-gluster/glustolibs/gluster/nfs_ganesha_ops.py b/glustolibs-gluster/glustolibs/gluster/nfs_ganesha_ops.py index ec4ca176b..d8486c7d2 100755 --- a/glustolibs-gluster/glustolibs/gluster/nfs_ganesha_ops.py +++ b/glustolibs-gluster/glustolibs/gluster/nfs_ganesha_ops.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -# Copyright (C) 2016-2017 Red Hat, Inc. <http://www.redhat.com> +# Copyright (C) 2016-2021 Red Hat, Inc. 
<http://www.redhat.com> # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -50,17 +50,33 @@ def teardown_nfs_ganesha_cluster(servers, force=False): Example: teardown_nfs_ganesha_cluster(servers) """ + # Copy ganesha.conf before proceeding to clean up + for server in servers: + cmd = "cp /etc/ganesha/ganesha.conf ganesha.conf" + ret, _, _ = g.run(server, cmd) + if ret: + g.log.error("Failed to copy ganesha.conf") + if force: g.log.info("Executing force cleanup...") + cleanup_ops = ['--teardown', '--cleanup'] for server in servers: - cmd = ("/usr/libexec/ganesha/ganesha-ha.sh --teardown " - "/var/run/gluster/shared_storage/nfs-ganesha") - _, _, _ = g.run(server, cmd) - cmd = ("/usr/libexec/ganesha/ganesha-ha.sh --cleanup /var/run/" - "gluster/shared_storage/nfs-ganesha") - _, _, _ = g.run(server, cmd) + # Perform teardown and cleanup + for op in cleanup_ops: + cmd = ("/usr/libexec/ganesha/ganesha-ha.sh {} /var/run/" + "gluster/shared_storage/nfs-ganesha".format(op)) + _, _, _ = g.run(server, cmd) + + # Stop nfs ganesha service _, _, _ = stop_nfs_ganesha_service(server) + + # Clean shared storage, ganesha.conf, and replace with backup + for cmd in ("rm -rf /var/run/gluster/shared_storage/*", + "rm -rf /etc/ganesha/ganesha.conf", + "cp ganesha.conf /etc/ganesha/ganesha.conf"): + _, _, _ = g.run(server, cmd) return True + ret, _, _ = disable_nfs_ganesha(servers[0]) if ret != 0: g.log.error("Nfs-ganesha disable failed") @@ -755,6 +771,22 @@ def create_nfs_ganesha_cluster(servers, vips): # Create backup of ganesha-ha.conf file in ganesha_mnode g.upload(ganesha_mnode, tmp_ha_conf, '/etc/ganesha/') + # setsebool ganesha_use_fusefs on + cmd = "setsebool ganesha_use_fusefs on" + for server in servers: + ret, _, _ = g.run(server, cmd) + if ret: + g.log.error("Failed to 'setsebool ganesha_use_fusefs on' on %", + server) + return False + + # Verify ganesha_use_fusefs is on + _, out, _ = g.run(server, "getsebool ganesha_use_fusefs") + if "ganesha_use_fusefs --> on" not in out: + g.log.error("Failed to 'setsebool ganesha_use_fusefs on' on %", + server) + return False + # Enabling ganesha g.log.info("Enable nfs-ganesha") ret, _, _ = enable_nfs_ganesha(ganesha_mnode) @@ -768,6 +800,31 @@ def create_nfs_ganesha_cluster(servers, vips): # pcs status output _, _, _ = g.run(ganesha_mnode, "pcs status") + # pacemaker status output + _, _, _ = g.run(ganesha_mnode, "systemctl status pacemaker") + + return True + + +def enable_firewall(servers): + """Enables Firewall if not enabled already + Args: + servers(list): Hostname of ganesha nodes + Returns: + Status (bool) : True/False based on the status of firewall enable + """ + + cmd = "systemctl status firewalld | grep Active" + for server in servers: + ret, out, _ = g.run(server, cmd) + if 'inactive' in out: + g.log.info("Firewalld is not running. 
Enabling Firewalld") + for command in ("enable", "start"): + ret, out, _ = g.run(server, + "systemctl {} firewalld".format(command)) + if ret: + g.log.error("Failed to enable Firewalld on %s", server) + return False return True @@ -781,9 +838,11 @@ def ganesha_server_firewall_settings(servers): True(bool): If successfully set the firewall settings False(bool): If failed to do firewall settings """ + if not enable_firewall(servers): + return False + services = ['nfs', 'rpc-bind', 'high-availability', 'nlm', 'mountd', 'rquota'] - ret = add_services_to_firewall(servers, services, True) if not ret: g.log.error("Failed to set firewall zone permanently on ganesha nodes") diff --git a/glustolibs-misc/glustolibs/misc/misc_libs.py b/glustolibs-misc/glustolibs/misc/misc_libs.py index 109dc9621..9f9225929 100755 --- a/glustolibs-misc/glustolibs/misc/misc_libs.py +++ b/glustolibs-misc/glustolibs/misc/misc_libs.py @@ -21,7 +21,7 @@ import sys import time from glusto.core import Glusto as g -from glustolibs.gluster.lib_utils import is_rhel6 +from glustolibs.gluster.lib_utils import is_rhel6, is_rhel7 def create_dirs(list_of_nodes, list_of_dir_paths): @@ -666,3 +666,25 @@ def kill_process(mnode, process_ids='', process_names=''): g.log.error("Failed to kill process with pid %s" % str(pid)) return False return True + + +def bring_down_network_interface(mnode, timeout=150): + """Brings the network interface down for a defined time + + Args: + mnode (str): Node at which the interface has to be bought down + timeout (int): Time duration (in secs) for which network has to + be down + + Returns: + network_status(object): Returns a process object + + Example: + >>> bring_down_network_interface("10.70.43.68", timout=100) + """ + interface = "eth0" if is_rhel7(mnode) else "ens3" + cmd = "ifconfig {0} down\nsleep {1}\nifconfig {0} up".format(interface, + timeout) + _, _, _ = g.run(mnode, "echo \"{}\"> 'test.sh'".format(cmd)) + network_status = g.run_async(mnode, "sh test.sh") + return network_status diff --git a/tests/functional/afr/heal/test_no_glustershd_with_distribute.py b/tests/functional/afr/heal/test_no_glustershd_with_distribute.py index d2b43bfe3..bbefe0cff 100644 --- a/tests/functional/afr/heal/test_no_glustershd_with_distribute.py +++ b/tests/functional/afr/heal/test_no_glustershd_with_distribute.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2018 Red Hat, Inc. <http://www.redhat.com> +# Copyright (C) 2017-2021 Red Hat, Inc. <http://www.redhat.com> # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -58,7 +58,7 @@ class SelfHealDaemonProcessTestsWithMultipleVolumes(GlusterBaseClass): for volume_config in cls.volume_configs: ret = setup_volume(mnode=cls.mnode, all_servers_info=cls.all_servers_info, - volume_config=volume_config) + volume_config=volume_config, multi_vol=True) volname = volume_config['name'] if not ret: raise ExecutionError("Failed to setup Volume" diff --git a/tests/functional/afr/heal/test_self_heal_with_meta_data_entry_and_files_removed.py b/tests/functional/afr/heal/test_self_heal_with_meta_data_entry_and_files_removed.py new file mode 100644 index 000000000..37bd2ec52 --- /dev/null +++ b/tests/functional/afr/heal/test_self_heal_with_meta_data_entry_and_files_removed.py @@ -0,0 +1,600 @@ +# Copyright (C) 2021 Red Hat, Inc. 
<http://www.redhat.com> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along` +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +from glusto.core import Glusto as g +from glustolibs.gluster.gluster_base_class import GlusterBaseClass, runs_on +from glustolibs.gluster.brick_libs import (bring_bricks_offline, + bring_bricks_online, + are_bricks_offline, + are_bricks_online, + get_all_bricks) +from glustolibs.gluster.glusterdir import mkdir +from glustolibs.gluster.exceptions import ExecutionError +from glustolibs.gluster.heal_libs import (monitor_heal_completion, + is_volume_in_split_brain, + is_heal_complete, + enable_granular_heal, + disable_granular_heal) +from glustolibs.gluster.lib_utils import (add_user, del_user, group_del, + group_add, collect_bricks_arequal) +from glustolibs.gluster.volume_ops import get_volume_options +from glustolibs.gluster.volume_libs import get_subvols +from glustolibs.io.utils import collect_mounts_arequal + + +@runs_on([['distributed-replicated', 'replicated'], ['glusterfs']]) +class TestHealWithLinkFiles(GlusterBaseClass): + + def setUp(self): + + self.get_super_method(self, 'setUp')() + + self.first_client = self.mounts[0].client_system + self.mountpoint = self.mounts[0].mountpoint + self.user_group_created = False + + # If test case running is test_self_heal_meta_data + # create user and group + test_name_splitted = self.id().split('.') + test_id = test_name_splitted[len(test_name_splitted) - 1] + if test_id == 'test_self_heal_meta_data': + + # Create non-root group + if not group_add(self.first_client, 'qa_all'): + raise ExecutionError("Failed to create group qa_all") + + # Create non-root users + self.users = ('qa_func', 'qa_system', 'qa_perf') + for user in self.users: + if not add_user(self.first_client, user, group='qa_all'): + raise ExecutionError("Failed to create user {}" + .format(user)) + + self.user_group_created = True + g.log.info("Successfully created all users.") + + # Setup Volume + if not self.setup_volume_and_mount_volume([self.mounts[0]]): + raise ExecutionError("Failed to setup and mount volume") + + def tearDown(self): + + # Delete non-root users and group if created + if self.user_group_created: + + # Delete non-root users + for user in self.users: + del_user(self.first_client, user) + g.log.info("Successfully deleted all users") + + # Delete non-root group + group_del(self.first_client, 'qa_all') + + if not self.unmount_volume_and_cleanup_volume([self.mounts[0]]): + raise ExecutionError("Failed to cleanup Volume") + + # Calling GlusterBaseClass tearDown + self.get_super_method(self, 'tearDown')() + + def _set_granular_heal_to_on_or_off(self, enabled=False): + """Set granular heal to ON or OFF""" + granular = get_volume_options(self.mnode, self.volname, + 'granular-entry-heal') + if enabled: + if granular['cluster.granular-entry-heal'] != 'on': + ret = enable_granular_heal(self.mnode, self.volname) + self.assertTrue(ret, + "Unable to set 
granular-entry-heal to on") + else: + if granular['cluster.granular-entry-heal'] == 'on': + ret = disable_granular_heal(self.mnode, self.volname) + self.assertTrue(ret, + "Unable to set granular-entry-heal to off") + + def _run_cmd(self, io_cmd, err_msg): + """Run cmd and show error message if it fails""" + cmd = ("cd {}/test_self_heal;{}".format(self.mountpoint, io_cmd)) + ret, _, _ = g.run(self.first_client, cmd) + self.assertFalse(ret, err_msg) + + def _create_files_and_dirs_on_mount_point(self, index, second_set=False): + """A function to create files and dirs on mount point""" + # Create a parent directory test_self_heal on mount point + if not second_set: + ret = mkdir(self.first_client, '{}/{}'.format( + self.mountpoint, 'test_self_heal')) + self.assertTrue(ret, "Failed to create dir test_self_heal") + + # Create dirctories and files inside directory test_self_heal + io_cmd = ("for i in `seq 1 50`; do mkdir dir.$i; dd if=/dev/random" + " of=file.$i count=1K bs=$i; done", + + "for i in `seq 1 100`; do mkdir dir.$i; for j in `seq 1 5`;" + " do dd if=/dev/random of=dir.$i/file.$j bs=1K count=$j" + ";done;done", + + "for i in `seq 1 10`; do mkdir l1_dir.$i; for j in `seq " + "1 5`; do mkdir l1_dir.$i/l2_dir.$j; for k in `seq 1 10`;" + " do dd if=/dev/random of=l1_dir.$i/l2_dir.$j/test.$k" + " bs=1k count=$k; done; done; done;", + + "for i in `seq 51 100`; do mkdir new_dir.$i; for j in `seq" + " 1 10`; do dd if=/dev/random of=new_dir.$i/new_file.$j " + "bs=1K count=$j; done; dd if=/dev/random of=new_file.$i" + " count=1K bs=$i; done ;") + self._run_cmd( + io_cmd[index], "Failed to create dirs and files inside") + + def _delete_files_and_dirs(self): + """Delete files and dirs from mount point""" + io_cmd = ("for i in `seq 1 50`; do rm -rf dir.$i; rm -f file.$i;done") + self._run_cmd(io_cmd, "Failed to delete dirs and files") + + def _rename_files_and_dirs(self): + """Rename files and dirs from mount point""" + io_cmd = ("for i in `seq 51 100`; do mv new_file.$i renamed_file.$i;" + " for j in `seq 1 10`; do mv new_dir.$i/new_file.$j " + "new_dir.$i/renamed_file.$j ; done ; mv new_dir.$i " + "renamed_dir.$i; done;") + self._run_cmd(io_cmd, "Failed to rename dirs and files") + + def _change_meta_deta_of_dirs_and_files(self): + """Change meta data of dirs and files""" + cmds = ( + # Change permission + "for i in `seq 1 100`; do chmod 555 dir.$i; done; " + "for i in `seq 1 50`; do for j in `seq 1 5`; do chmod 666 " + "dir.$i/file.$j; done; done; for i in `seq 51 100`; do for " + "j in `seq 1 5`;do chmod 444 dir.$i/file.$j; done; done;", + + # Change ownership + "for i in `seq 1 35`; do chown -R qa_func dir.$i; done; " + "for i in `seq 36 70`; do chown -R qa_system dir.$i; done; " + "for i in `seq 71 100`; do chown -R qa_perf dir.$i; done;", + + # Change group + "for i in `seq 1 100`; do chgrp -R qa_all dir.$i; done;") + + for io_cmd in cmds: + self._run_cmd(io_cmd, + "Failed to change meta data on dirs and files") + g.log.info("Successfully changed meta data on dirs and files") + + def _verify_meta_data_of_files_and_dirs(self): + """Verify meta data of files and dirs""" + cmds = ( + # Verify permissions + "for i in `seq 1 50`; do stat -c %a dir.$i | grep -F \"555\";" + " if [ $? -ne 0 ]; then exit 1; fi; for j in `seq 1 5` ; do " + "stat -c %a dir.$i/file.$j | grep -F \"666\"; if [ $? -ne 0 ]" + "; then exit 1; fi; done; done; for i in `seq 51 100`; do " + "stat -c %a dir.$i | grep -F \"555\";if [ $? 
-ne 0 ]; then " + "exit 1; fi; for j in `seq 1 5`; do stat -c %a dir.$i/file.$j" + " | grep -F \"444\"; if [ $? -ne 0 ]; then exit 1; fi; done;" + "done;", + + # Verify ownership + "for i in `seq 1 35`; do stat -c %U dir.$i | grep -F " + "\"qa_func\"; if [ $? -ne 0 ]; then exit 1; fi; for j in " + "`seq 1 5`; do stat -c %U dir.$i/file.$j | grep -F " + "\"qa_func\"; if [ $? -ne 0 ]; then exit 1; fi; done; done;" + " for i in `seq 36 70` ; do stat -c %U dir.$i | grep -F " + "\"qa_system\"; if [ $? -ne 0 ]; then exit 1; fi; for j in " + "`seq 1 5`; do stat -c %U dir.$i/file.$j | grep -F " + "\"qa_system\"; if [ $? -ne 0 ]; then exit 1; fi; done; done;" + " for i in `seq 71 100` ; do stat -c %U dir.$i | grep -F " + "\"qa_perf\"; if [ $? -ne 0 ]; then exit 1; fi; for j in " + "`seq 1 5`; do stat -c %U dir.$i/file.$j | grep -F " + "\"qa_perf\"; if [ $? -ne 0 ]; then exit 1; fi; done; done;", + + # Verify group + "for i in `seq 1 100`; do stat -c %G dir.$i | grep -F " + "\"qa_all\"; if [ $? -ne 0 ]; then exit 1; fi; for j in " + "`seq 1 5`; do stat -c %G dir.$i/file.$j | grep -F " + "\"qa_all\"; if [ $? -ne 0 ]; then exit 1; fi; done; done;") + + for io_cmd in cmds: + self._run_cmd(io_cmd, "Meta data of dirs and files not proper") + + def _set_and_remove_extended_attributes(self, remove=False): + """Set and remove extended attributes""" + # Command to set extended attribute to files and dirs + io_cmd = ("for i in `seq 1 100`; do setfattr -n trusted.name -v " + "testing_xattr_selfheal_on_dirs dir.$i; for j in `seq 1 " + "5`;do setfattr -n trusted.name -v " + "testing_xattr_selfheal_on_files dir.$i/file.$j; done; " + "done;") + err_msg = "Failed to set extended attributes to files and dirs" + if remove: + # Command to remove extended attribute set on files and dirs + io_cmd = ("for i in `seq 1 100`; do setfattr -x trusted.name " + "dir.$i; for j in `seq 1 5`; do setfattr -x " + "trusted.name dir.$i/file.$j ; done ; done ;") + err_msg = "Failed to remove extended attributes to files and dirs" + + self._run_cmd(io_cmd, err_msg) + + def _verify_if_extended_attributes_are_proper(self, remove=False): + """Verify if extended attributes are set or remove properly""" + io_cmd = ("for i in `seq 1 100`; do getfattr -n trusted.name -e text " + "dir.$i | grep -F 'testing_xattr_selfheal_on_dirs'; if [ $? " + "-ne 0 ]; then exit 1 ; fi ; for j in `seq 1 5` ; do " + "getfattr -n trusted.name -e text dir.$i/file.$j | grep -F " + "'testing_xattr_selfheal_on_files'; if [ $? -ne 0 ]; then " + "exit 1; fi; done; done;") + err_msg = "Extended attributes on files and dirs are not proper" + if remove: + io_cmd = ("for i in `seq 1 100`; do getfattr -n trusted.name -e " + "text dir.$i; if [ $? -eq 0 ]; then exit 1; fi; for j in" + " `seq 1 5`; do getfattr -n trusted.name -e text " + "dir.$i/file.$j; if [ $? 
-eq 0]; then exit 1; fi; done; " + "done;") + err_msg = "Extended attributes set to files and dirs not removed" + self._run_cmd(io_cmd, err_msg) + + def _remove_files_and_create_dirs_with_the_same_name(self): + """Remove files and create dirs with the same name""" + io_cmd = ("for i in `seq 1 10`; do for j in `seq 1 5`; do for k in " + "`seq 1 10`; do rm -f l1_dir.$i/l2_dir.$j/test.$k; mkdir " + "l1_dir.$i/l2_dir.$j/test.$k; done; done; done;") + self._run_cmd(io_cmd, + "Failed to remove files and create dirs with same name") + + def _verify_if_dirs_are_proper_or_not(self): + """Verify if dirs are proper or not""" + io_cmd = ("for i in `seq 1 10`; do for j in `seq 1 5`; do for k in " + "`seq 1 10`; do stat -c %F l1_dir.$i/l2_dir.$j/test.$k | " + "grep -F 'directory'; if [ $? -ne 0 ]; then exit 1; fi; " + "done; done; done;") + self._run_cmd(io_cmd, "Dirs created instead of files aren't proper") + + def _bring_bricks_offline(self): + """Brings bricks offline and confirms if they are offline""" + # Select bricks to bring offline from a replica set + subvols_dict = get_subvols(self.mnode, self.volname) + subvols = subvols_dict['volume_subvols'] + self.bricks_to_bring_offline = [] + for subvol in subvols: + self.bricks_to_bring_offline.append(subvol[0]) + + # Bring bricks offline + ret = bring_bricks_offline(self.volname, self.bricks_to_bring_offline) + self.assertTrue(ret, 'Failed to bring bricks %s offline' % + self.bricks_to_bring_offline) + + ret = are_bricks_offline(self.mnode, self.volname, + self.bricks_to_bring_offline) + self.assertTrue(ret, 'Bricks %s are not offline' + % self.bricks_to_bring_offline) + g.log.info('Bringing bricks %s offline is successful', + self.bricks_to_bring_offline) + + def _restart_volume_and_bring_all_offline_bricks_online(self): + """Restart volume and bring all offline bricks online""" + ret = bring_bricks_online(self.mnode, self.volname, + self.bricks_to_bring_offline, + bring_bricks_online_methods=[ + 'volume_start_force']) + self.assertTrue(ret, 'Failed to bring bricks %s online' % + self.bricks_to_bring_offline) + + # Check if bricks are back online or not + ret = are_bricks_online(self.mnode, self.volname, + self.bricks_to_bring_offline) + self.assertTrue(ret, 'Bricks not online %s even after restart' % + self.bricks_to_bring_offline) + + g.log.info('Bringing bricks %s online is successful', + self.bricks_to_bring_offline) + + def _check_arequal_on_bricks_with_a_specific_arequal(self, arequal, + brick_list): + """ + Compare an inital arequal checksum with bricks from a given brick list + """ + init_val = arequal[0].splitlines()[-1].split(':')[-1] + ret, arequals = collect_bricks_arequal(brick_list) + self.assertTrue(ret, 'Failed to get arequal on bricks') + for brick_arequal in arequals: + brick_total = brick_arequal.splitlines()[-1].split(':')[-1] + self.assertEqual(init_val, brick_total, 'Arequals not matching') + + @staticmethod + def _add_dir_path_to_brick_list(brick_list): + """Add test_self_heal at the end of brick path""" + dir_brick_list = [] + for brick in brick_list: + dir_brick_list.append('{}/{}'.format(brick, 'test_self_heal')) + return dir_brick_list + + def _check_arequal_checksum_for_the_volume(self): + """ + Check if arequals of mount point and bricks are + are the same. 
+ """ + if self.volume_type == "replicated": + # Check arequals for "replicated" + brick_list = get_all_bricks(self.mnode, self.volname) + dir_brick_list = self._add_dir_path_to_brick_list(brick_list) + + # Get arequal before getting bricks offline + work_dir = '{}/test_self_heal'.format(self.mountpoint) + ret, arequals = collect_mounts_arequal([self.mounts[0]], + path=work_dir) + self.assertTrue(ret, 'Failed to get arequal') + g.log.info('Getting arequal before getting bricks offline ' + 'is successful') + + # Get arequal on bricks and compare with mount_point_total + self._check_arequal_on_bricks_with_a_specific_arequal( + arequals, dir_brick_list) + + # Check arequals for "distributed-replicated" + if self.volume_type == "distributed-replicated": + # Get the subvolumes + subvols_dict = get_subvols(self.mnode, self.volname) + num_subvols = len(subvols_dict['volume_subvols']) + + # Get arequals and compare + for i in range(0, num_subvols): + # Get arequal for first brick + brick_list = subvols_dict['volume_subvols'][i] + dir_brick_list = self._add_dir_path_to_brick_list(brick_list) + ret, arequals = collect_bricks_arequal([dir_brick_list[0]]) + self.assertTrue(ret, 'Failed to get arequal on first brick') + + # Get arequal for every brick and compare with first brick + self._check_arequal_on_bricks_with_a_specific_arequal( + arequals, dir_brick_list) + + def _check_heal_is_completed_and_not_in_split_brain(self): + """Check if heal is completed and volume not in split brain""" + # Check if heal is completed + ret = is_heal_complete(self.mnode, self.volname) + self.assertTrue(ret, 'Heal is not complete') + g.log.info('Heal is completed successfully') + + # Check if volume is in split brian or not + ret = is_volume_in_split_brain(self.mnode, self.volname) + self.assertFalse(ret, 'Volume is in split-brain state') + g.log.info('Volume is not in split-brain state') + + def _check_if_there_are_files_and_dirs_to_be_healed(self): + """Check if there are files and dirs to be healed""" + ret = is_heal_complete(self.mnode, self.volname) + self.assertFalse(ret, 'Heal is completed') + g.log.info('Heal is pending') + + def _wait_for_heal_is_completed(self): + """Check if heal is completed""" + ret = monitor_heal_completion(self.mnode, self.volname, + timeout_period=3600) + self.assertTrue(ret, 'Heal has not yet completed') + + def _check_heal_status_restart_vol_wait_and_check_data(self): + """ + Perform repatative steps mentioned below: + 1 Check if heal info is showing all the files and dirs to be healed + 2 Bring back all brick processes which were killed + 3 Wait for heal to complete on the volume + 4 Check if heal is complete and check if volume is in split brain + 5 Collect and compare arequal-checksum according to the volume type + for bricks + """ + # Check if heal info is showing all the files and dirs to be healed + self._check_if_there_are_files_and_dirs_to_be_healed() + + # Bring back all brick processes which were killed + self._restart_volume_and_bring_all_offline_bricks_online() + + # Wait for heal to complete on the volume + self._wait_for_heal_is_completed() + + # Check if heal is complete and check if volume is in split brain + self._check_heal_is_completed_and_not_in_split_brain() + + # Collect and compare arequal-checksum according to the volume type + # for bricks + self._check_arequal_checksum_for_the_volume() + + def _run_test_self_heal_entry_heal(self): + """Run steps of test_self_heal_entry_heal""" + # Create a directory and create files and directories inside it on + # mount 
point + self._create_files_and_dirs_on_mount_point(0) + + # Collect and compare arequal-checksum according to the volume type + # for bricks + self._check_arequal_checksum_for_the_volume() + + # Bring down brick processes accoding to the volume type + self._bring_bricks_offline() + + # Create a new set of files and directories on mount point + self._create_files_and_dirs_on_mount_point(3, second_set=True) + + self._check_heal_status_restart_vol_wait_and_check_data() + + # Bring down brick processes accoding to the volume type + self._bring_bricks_offline() + + # Delete files and directories from mount point + self._delete_files_and_dirs() + + self._check_heal_status_restart_vol_wait_and_check_data() + + # Bring down brick processes accoding to the volume type + self._bring_bricks_offline() + + # Rename the existing files and dirs + self._rename_files_and_dirs() + + self._check_heal_status_restart_vol_wait_and_check_data() + + def test_self_heal_entry_heal(self): + """ + Test case: + 1. Create a volume, start it and mount it. + 2. Create a directory and create files and directories inside it + on mount point. + 3. Collect and compare arequal-checksum according to the volume type + for bricks. + 4. Bring down brick processes accoding to the volume type. + 5. Create a new set of files and directories on mount point. + 6. Check if heal info is showing all the files and dirs to be healed. + 7. Bring back all brick processes which were killed. + 8. Wait for heal to complete on the volume. + 9. Check if heal is complete and check if volume is in split brain. + 10. Collect and compare arequal-checksum according to the volume type + for bricks. + 11. Bring down brick processes accoding to the volume type. + 12. Delete files and directories from mount point. + 13. Check if heal info is showing all the files and dirs to be healed. + 14. Bring back all brick processes which were killed. + 15. Wait for heal to complete on the volume. + 16. Check if heal is complete and check if volume is in split brain. + 17. Collect and compare arequal-checksum according to the volume type + for bricks. + 18. Bring down brick processes accoding to the volume type. + 19. Rename the existing files and dirs. + 20. Check if heal info is showing all the files and dirs to be healed. + 21. Bring back all brick processes which were killed. + 22. Wait for heal to complete on the volume. + 23. Check if heal is complete and check if volume is in split brain. + 24. Collect and compare arequal-checksum according to the volume type + for bricks. + + Note: + Do this test with both Granular-entry-heal set enable and disable. + """ + for value in (False, True): + if value: + # Cleanup old data from mount point + ret, _, _ = g.run(self.first_client, + 'rm -rf {}/*'.format(self.mountpoint)) + self.assertFalse(ret, 'Failed to cleanup mount point') + g.log.info("Testing with granular heal set to enabled") + self._set_granular_heal_to_on_or_off(enabled=value) + self._run_test_self_heal_entry_heal() + + def test_self_heal_meta_data(self): + """ + Test case: + 1. Create a volume, start it and mount it. + 2. Create a directory and create files and directories inside it + on mount point. + 3. Collect and compare arequal-checksum according to the volume type + for bricks. + 4. Bring down brick processes accoding to the volume type. + 5. Change the meta data of files and dirs. + 6. Check if heal info is showing all the files and dirs to be healed. + 7. Bring back all brick processes which were killed. + 8. 
Wait for heal to complete on the volume. + 9. Check if heal is complete and check if volume is in split brain. + 10. Collect and compare arequal-checksum according to the volume type + for bricks. + 11. Verify if the meta data of files and dirs. + 12. Bring down brick processes accoding to the volume type. + 13. Set extended attributes on the files and dirs. + 14. Verify if the extended attributes are set properly or not. + 15. Check if heal info is showing all the files and dirs to be healed. + 16. Bring back all brick processes which were killed. + 17. Wait for heal to complete on the volume. + 18. Check if heal is complete and check if volume is in split brain. + 19. Collect and compare arequal-checksum according to the volume type + for bricks. + 20. Verify if extended attributes are consitent or not. + 21. Bring down brick processes accoding to the volume type + 22. Remove extended attributes on the files and dirs. + 23. Verify if extended attributes were removed properly. + 24. Check if heal info is showing all the files and dirs to be healed. + 25. Bring back all brick processes which were killed. + 26. Wait for heal to complete on the volume. + 27. Check if heal is complete and check if volume is in split brain. + 28. Collect and compare arequal-checksum according to the volume type + for bricks. + 29. Verify if extended attributes are removed or not. + """ + # Create a directory and create files and directories inside it + # on mount point + self._create_files_and_dirs_on_mount_point(1) + + # Collect and compare arequal-checksum according to the volume type + # for bricks + self._check_arequal_checksum_for_the_volume() + + # Bring down brick processes accoding to the volume type + self._bring_bricks_offline() + + # Change the meta data of files and dirs + self._change_meta_deta_of_dirs_and_files() + + self._check_heal_status_restart_vol_wait_and_check_data() + + # Verify if the meta data of files and dirs + self._verify_meta_data_of_files_and_dirs() + + for value in (False, True): + # Bring down brick processes accoding to the volume type + self._bring_bricks_offline() + + # Set or remove extended attributes on the files and dirs + self._set_and_remove_extended_attributes(remove=value) + + # Verify if the extended attributes are set properly or not + self._verify_if_extended_attributes_are_proper(remove=value) + + self._check_heal_status_restart_vol_wait_and_check_data() + + # Verify if extended attributes are consitent or not + self._verify_if_extended_attributes_are_proper(remove=value) + + def test_self_heal_of_dir_with_files_removed(self): + """ + Test case: + 1. Create a volume, start it and mount it. + 2. Create a directory and create files and directories inside it + on mount point. + 3. Collect and compare arequal-checksum according to the volume type + for bricks. + 4. Bring down brick processes accoding to the volume type. + 5. Remove all files and create dir which have name of files. + 6. Check if heal info is showing all the files and dirs to be healed. + 7. Bring back all brick processes which were killed. + 8. Wait for heal to complete on the volume. + 9. Check if heal is complete and check if volume is in split brain. + 10. Collect and compare arequal-checksum according to the volume type + for bricks. + 11. Verify if dirs are healed properly or not. 
+ """ + # Create a directory and create files and directories inside it + # on mount point + self._create_files_and_dirs_on_mount_point(2) + + # Collect and compare arequal-checksum according to the volume type + # for bricks + self._check_arequal_checksum_for_the_volume() + + # Bring down brick processes accoding to the volume type + self._bring_bricks_offline() + + # Remove all files and create dir which have name of files + self._remove_files_and_create_dirs_with_the_same_name() + + self._check_heal_status_restart_vol_wait_and_check_data() + + # Verify if dirs are healed properly or not + self._verify_if_dirs_are_proper_or_not() diff --git a/tests/functional/afr/heal/test_server_side_healing_happens_only_when_glustershd_running.py b/tests/functional/afr/heal/test_server_side_healing_happens_only_when_glustershd_running.py index 43b4f4edf..a449e396f 100644 --- a/tests/functional/afr/heal/test_server_side_healing_happens_only_when_glustershd_running.py +++ b/tests/functional/afr/heal/test_server_side_healing_happens_only_when_glustershd_running.py @@ -1,4 +1,4 @@ -# Copyright (C) 2020 Red Hat, Inc. <http://www.redhat.com> +# Copyright (C) 2021 Red Hat, Inc. <http://www.redhat.com> # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -26,12 +26,14 @@ from glustolibs.gluster.brick_libs import ( select_volume_bricks_to_bring_offline, get_online_bricks_list) from glustolibs.gluster.heal_libs import ( get_self_heal_daemon_pid, is_shd_daemonized, - monitor_heal_completion, bring_self_heal_daemon_process_offline) + monitor_heal_completion, bring_self_heal_daemon_process_offline, + disable_granular_heal) from glustolibs.gluster.heal_ops import (get_heal_info_summary, trigger_heal_full) from glustolibs.io.utils import validate_io_procs from glustolibs.misc.misc_libs import upload_scripts -from glustolibs.gluster.volume_ops import set_volume_options +from glustolibs.gluster.volume_ops import (set_volume_options, + get_volume_options) from glustolibs.gluster.mount_ops import mount_volume, umount_volume @@ -99,6 +101,15 @@ class SelfHealDaemonProcessTestsWithSingleVolume(GlusterBaseClass): * heal should complete successfully """ # pylint: disable=too-many-locals,too-many-statements,too-many-lines + + # Disable granular heal if not disabled already + granular = get_volume_options(self.mnode, self.volname, + 'granular-entry-heal') + if granular['cluster.granular-entry-heal'] == 'on': + ret = disable_granular_heal(self.mnode, self.volname) + self.assertTrue(ret, + "Unable to set granular-entry-heal to on") + # Setting Volume options options = {"metadata-self-heal": "on", "entry-self-heal": "on", @@ -131,7 +142,7 @@ class SelfHealDaemonProcessTestsWithSingleVolume(GlusterBaseClass): all_mounts_procs, num_files_to_write = [], 100 for mount_obj in self.mounts: cmd = ("/usr/bin/env python %s create_files " - "-f %s --base-file-name file %s" % (self.script_upload_path, + "-f %d --base-file-name file %s" % (self.script_upload_path, num_files_to_write, mount_obj.mountpoint)) proc = g.run_async(mount_obj.client_system, cmd, @@ -221,8 +232,8 @@ class SelfHealDaemonProcessTestsWithSingleVolume(GlusterBaseClass): all_mounts_procs = [] for mount_obj in self.mounts: - cmd = ("/usr/bin/env python %s read %s" - % (self.script_upload_path, mount_obj.mountpoint)) + cmd = ("cd %s;for i in `seq 1 5`; do ls -l;cat *; stat *; sleep 5;" + " done " % (mount_obj.mountpoint)) proc = g.run_async(mount_obj.client_system, cmd, 
user=mount_obj.user) all_mounts_procs.append(proc) diff --git a/tests/functional/afr/test_add_brick_followed_by_remove_brick.py b/tests/functional/afr/test_add_brick_followed_by_remove_brick.py new file mode 100644 index 000000000..a653b792d --- /dev/null +++ b/tests/functional/afr/test_add_brick_followed_by_remove_brick.py @@ -0,0 +1,170 @@ +# Copyright (C) 2021 Red Hat, Inc. <http://www.redhat.com> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along` +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +from glusto.core import Glusto as g +from glustolibs.gluster.gluster_base_class import GlusterBaseClass, runs_on +from glustolibs.gluster.exceptions import ExecutionError +from glustolibs.gluster.brick_libs import get_all_bricks +from glustolibs.gluster.dht_test_utils import is_layout_complete +from glustolibs.gluster.glusterfile import (file_exists, + occurences_of_pattern_in_file) +from glustolibs.gluster.rebalance_ops import (rebalance_start, + wait_for_rebalance_to_complete) +from glustolibs.gluster.volume_libs import expand_volume, shrink_volume +from glustolibs.io.utils import (validate_io_procs, wait_for_io_to_complete) +from glustolibs.misc.misc_libs import upload_scripts + + +@runs_on([['replicated'], ['glusterfs']]) +class TestAddBrickFollowedByRemoveBrick(GlusterBaseClass): + + @classmethod + def setUpClass(cls): + cls.get_super_method(cls, 'setUpClass')() + + cls.first_client = cls.mounts[0].client_system + cls.mountpoint = cls.mounts[0].mountpoint + cls.is_io_running = False + + # Upload IO scripts for running IO on mounts + cls.script_upload_path = ("/usr/share/glustolibs/io/scripts/" + "file_dir_ops.py") + if not file_exists(cls.first_client, cls.script_upload_path): + if not upload_scripts(cls.first_client, cls.script_upload_path): + raise ExecutionError( + "Failed to upload IO scripts to client %s" + % cls.first_client) + + def setUp(self): + + self.get_super_method(self, 'setUp')() + + # Setup Volume + if not self.setup_volume_and_mount_volume([self.mounts[0]]): + raise ExecutionError("Failed to setup and mount volume") + + def tearDown(self): + + if self.is_io_running: + if not wait_for_io_to_complete(self.all_mounts_procs, + [self.mounts[0]]): + raise ExecutionError("IO failed on some of the clients") + + if not self.unmount_volume_and_cleanup_volume([self.mounts[0]]): + raise ExecutionError("Failed to cleanup Volume") + + # Calling GlusterBaseClass tearDown + self.get_super_method(self, 'tearDown')() + + def _check_layout_of_bricks(self): + """Check the layout of bricks""" + ret = is_layout_complete(self.mnode, self.volname, "/") + self.assertTrue(ret, ("Volume %s: Layout is not complete", + self.volname)) + g.log.info("Volume %s: Layout is complete", self.volname) + + def _add_brick_and_wait_for_rebalance_to_complete(self): + """Add brick and wait for rebalance to complete""" + + # Add brick to volume + ret = expand_volume(self.mnode, self.volname, self.servers, + 
self.all_servers_info) + self.assertTrue(ret, "Failed to add brick on volume %s" + % self.volname) + + # Trigger rebalance and wait for it to complete + ret, _, _ = rebalance_start(self.mnode, self.volname, + force=True) + self.assertEqual(ret, 0, "Failed to start rebalance on the volume %s" + % self.volname) + + # Wait for rebalance to complete + ret = wait_for_rebalance_to_complete(self.mnode, self.volname, + timeout=1200) + self.assertTrue(ret, "Rebalance is not yet complete on the volume " + "%s" % self.volname) + g.log.info("Rebalance successfully completed") + + self._check_layout_of_bricks() + + def _remove_brick_from_volume(self): + """Remove bricks from volume""" + # Remove bricks from the volume + ret = shrink_volume(self.mnode, self.volname, rebalance_timeout=2000) + self.assertTrue(ret, "Failed to remove-brick from volume") + g.log.info("Remove-brick rebalance successful") + + def test_add_brick_followed_by_remove_brick(self): + """ + Test case: + 1. Create a volume, start it and mount it to a client. + 2. Start I/O on volume. + 3. Add brick and trigger rebalance, wait for rebalance to complete. + (The volume which was 1x3 should now be 2x3) + 4. Add brick and trigger rebalance, wait for rebalance to complete. + (The volume which was 2x3 should now be 3x3) + 5. Remove brick from volume such that it becomes a 2x3. + 6. Remove brick from volume such that it becomes a 1x3. + 7. Wait for I/O to complete and check for any input/output errors in + both client and rebalance logs. + """ + # Start I/O on mount point + self.all_mounts_procs = [] + cmd = ("/usr/bin/env python {} create_deep_dirs_with_files " + "--dirname-start-num {} --dir-depth 5 --dir-length 5 " + "--max-num-of-dirs 5 --num-of-files 5 {}" + .format(self.script_upload_path, 10, self.mountpoint)) + proc = g.run_async(self.first_client, cmd) + self.all_mounts_procs.append(proc) + self.is_io_running = True + + # Convert 1x3 to 2x3 and then convert 2x3 to 3x3 + for _ in range(0, 2): + self._add_brick_and_wait_for_rebalance_to_complete() + + # Convert 3x3 to 2x3 and then convert 2x3 to 1x3 + for _ in range(0, 2): + self._remove_brick_from_volume() + + # Validate I/O processes running on the nodes + ret = validate_io_procs(self.all_mounts_procs, [self.mounts[0]]) + self.is_io_running = False + self.assertTrue(ret, "IO failed on some of the clients") + g.log.info("IO on all mounts: Complete") + + # Check for Input/output errors in rebalance logs + particiapting_nodes = [] + for brick in get_all_bricks(self.mnode, self.volname): + node, _ = brick.split(':') + particiapting_nodes.append(node) + + for server in particiapting_nodes: + ret = occurences_of_pattern_in_file( + server, "Input/output error", + "/var/log/glusterfs/{}-rebalance.log".format(self.volname)) + self.assertEqual(ret, 0, + "[Input/output error] present in rebalance log" + " file") + + # Check for Input/output errors in client logs + ret = occurences_of_pattern_in_file( + self.first_client, "Input/output error", + "/var/log/glusterfs/mnt-{}_{}.log".format(self.volname, + self.mount_type)) + self.assertEqual(ret, 0, + "[Input/output error] present in client log file") + g.log.info("Expanding and shrinking volume successful and no I/O " + "errors see in rebalance and client logs") diff --git a/tests/functional/afr/test_afr_cli_no_splitbrain_resolution.py b/tests/functional/afr/test_afr_cli_no_splitbrain_resolution.py index ad6f336a5..1acd11faa 100644 --- a/tests/functional/afr/test_afr_cli_no_splitbrain_resolution.py +++ 
b/tests/functional/afr/test_afr_cli_no_splitbrain_resolution.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2020 Red Hat, Inc. <http://www.redhat.com> +# Copyright (C) 2017-2021 Red Hat, Inc. <http://www.redhat.com> # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -112,17 +112,16 @@ class TestSelfHeal(GlusterBaseClass): g.log.info("creating 5 files from mount point") all_mounts_procs = [] - for mount_obj in self.mounts: - cmd = ("/usr/bin/env python %s create_files -f 5 " - "--base-file-name test_file --fixed-file-size 1k %s" % ( - self.script_upload_path, - mount_obj.mountpoint)) - proc = g.run_async(mount_obj.client_system, cmd, - user=mount_obj.user) - all_mounts_procs.append(proc) + cmd = ("/usr/bin/env python %s create_files -f 5 " + "--base-file-name test_file --fixed-file-size 1k %s" % ( + self.script_upload_path, self.mounts[0].mountpoint)) + proc = g.run_async(self.mounts[0].client_system, cmd, + user=self.mounts[0].user) + all_mounts_procs.append(proc) + # Validate I/O g.log.info("Wait for IO to complete and validate IO.....") - ret = validate_io_procs(all_mounts_procs, self.mounts) + ret = validate_io_procs(all_mounts_procs, [self.mounts[0]]) self.assertTrue(ret, "IO failed on some of the clients") g.log.info("IO is successful on all mounts") g.log.info("Successfully created a file from mount point") @@ -149,17 +148,16 @@ class TestSelfHeal(GlusterBaseClass): g.log.info("creating 5 new files of same name from mount point") all_mounts_procs = [] - for mount_obj in self.mounts: - cmd = ("/usr/bin/env python %s create_files -f 5 " - "--base-file-name test_file --fixed-file-size 10k %s" % ( - self.script_upload_path, - mount_obj.mountpoint)) - proc = g.run_async(mount_obj.client_system, cmd, - user=mount_obj.user) - all_mounts_procs.append(proc) + cmd = ("/usr/bin/env python %s create_files -f 5 " + "--base-file-name test_file --fixed-file-size 10k %s" % ( + self.script_upload_path, self.mounts[0].mountpoint)) + proc = g.run_async(self.mounts[0].client_system, cmd, + user=self.mounts[0].user) + all_mounts_procs.append(proc) + # Validate I/O g.log.info("Wait for IO to complete and validate IO.....") - ret = validate_io_procs(all_mounts_procs, self.mounts) + ret = validate_io_procs(all_mounts_procs, [self.mounts[0]]) self.assertTrue(ret, "IO failed on some of the clients") g.log.info("IO is successful on all mounts") g.log.info("Successfully created a new file of same name " @@ -225,10 +223,11 @@ class TestSelfHeal(GlusterBaseClass): fpath = (self.mounts[0].mountpoint + '/test_file' + str(fcount) + '.txt') status = get_fattr(self.mounts[0].client_system, - fpath, 'replica.split-brain-status') + fpath, 'replica.split-brain-status', + encode="text") compare_string = ("The file is not under data or metadata " "split-brain") - self.assertEqual(status.rstrip('\x00'), compare_string, + self.assertEqual(status, compare_string, "file test_file%s is under" " split-brain" % str(fcount)) g.log.info("none of the files are under split-brain") diff --git a/tests/functional/afr/test_default_granular_entry_heal.py b/tests/functional/afr/test_default_granular_entry_heal.py new file mode 100644 index 000000000..91ca25907 --- /dev/null +++ b/tests/functional/afr/test_default_granular_entry_heal.py @@ -0,0 +1,235 @@ +# Copyright (C) 2021 Red Hat, Inc. 
<http://www.redhat.com> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along` +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +from random import choice + +from glusto.core import Glusto as g +from glustolibs.gluster.gluster_base_class import GlusterBaseClass, runs_on +from glustolibs.gluster.exceptions import ExecutionError +from glustolibs.gluster.brick_libs import (bring_bricks_offline, + bring_bricks_online, + are_bricks_offline, + are_bricks_online, get_all_bricks) +from glustolibs.gluster.glusterfile import occurences_of_pattern_in_file +from glustolibs.gluster.heal_libs import (monitor_heal_completion, + is_heal_complete) +from glustolibs.gluster.lib_utils import collect_bricks_arequal +from glustolibs.gluster.volume_libs import get_subvols +from glustolibs.gluster.volume_ops import get_volume_options +from glustolibs.io.utils import collect_mounts_arequal + + +@runs_on([['distributed-replicated', 'replicated', + 'arbiter', 'distributed-arbiter'], ['glusterfs']]) +class TestDefaultGranularEntryHeal(GlusterBaseClass): + + def setUp(self): + + self.get_super_method(self, 'setUp')() + + self.first_client = self.mounts[0].client_system + self.mountpoint = self.mounts[0].mountpoint + + # Setup Volume + if not self.setup_volume_and_mount_volume([self.mounts[0]]): + raise ExecutionError("Failed to setup and mount volume") + + def tearDown(self): + + if not self.unmount_volume_and_cleanup_volume([self.mounts[0]]): + raise ExecutionError("Failed to cleanup Volume") + + # Calling GlusterBaseClass tearDown + self.get_super_method(self, 'tearDown')() + + def _bring_bricks_offline(self): + """Brings bricks offline and confirms if they are offline""" + # Select bricks to bring offline from a replica set + subvols_dict = get_subvols(self.mnode, self.volname) + subvols = subvols_dict['volume_subvols'] + self.bricks_to_bring_offline = [] + for subvol in subvols: + self.bricks_to_bring_offline.append(choice(subvol)) + + # Bring bricks offline + ret = bring_bricks_offline(self.volname, self.bricks_to_bring_offline) + self.assertTrue(ret, 'Failed to bring bricks %s offline' % + self.bricks_to_bring_offline) + + ret = are_bricks_offline(self.mnode, self.volname, + self.bricks_to_bring_offline) + self.assertTrue(ret, 'Bricks %s are not offline' + % self.bricks_to_bring_offline) + g.log.info('Bringing bricks %s offline is successful', + self.bricks_to_bring_offline) + + def _restart_volume_and_bring_all_offline_bricks_online(self): + """Restart volume and bring all offline bricks online""" + + ret = is_heal_complete(self.mnode, self.volname) + self.assertFalse(ret, 'Heal is completed') + g.log.info('Heal is pending') + + ret = bring_bricks_online(self.mnode, self.volname, + self.bricks_to_bring_offline, + bring_bricks_online_methods=[ + 'volume_start_force']) + self.assertTrue(ret, 'Failed to bring bricks %s online' % + self.bricks_to_bring_offline) + + # Check if bricks are back online or not + ret = 
are_bricks_online(self.mnode, self.volname, + self.bricks_to_bring_offline) + self.assertTrue(ret, 'Bricks not online %s even after restart' % + self.bricks_to_bring_offline) + + g.log.info('Bringing bricks %s online is successful', + self.bricks_to_bring_offline) + + def _wait_for_heal_to_completed(self): + """Check if heal is completed""" + ret = monitor_heal_completion(self.mnode, self.volname, + timeout_period=3600) + self.assertTrue(ret, 'Heal has not yet completed') + + def _check_arequal_on_bricks_with_a_specific_arequal(self, arequal, + brick_list): + """ + Compare an inital arequal checksum with bricks from a given brick list + """ + init_val = arequal[0].splitlines()[-1].split(':')[-1] + ret, arequals = collect_bricks_arequal(brick_list) + self.assertTrue(ret, 'Failed to get arequal on bricks') + for brick_arequal in arequals: + brick_total = brick_arequal.splitlines()[-1].split(':')[-1] + self.assertEqual(init_val, brick_total, 'Arequals not matching') + + @staticmethod + def _add_dir_path_to_brick_list(brick_list): + """Add test_self_heal at the end of brick path""" + dir_brick_list = [] + for brick in brick_list: + dir_brick_list.append('{}/{}'.format(brick, 'mydir')) + return dir_brick_list + + def _check_arequal_checksum_for_the_volume(self): + """ + Check if arequals of mount point and bricks are + are the same. + """ + if self.volume_type == "replicated": + # Check arequals for "replicated" + brick_list = get_all_bricks(self.mnode, self.volname) + dir_brick_list = self._add_dir_path_to_brick_list(brick_list) + + # Get arequal before getting bricks offline + work_dir = '{}/mydir'.format(self.mountpoint) + ret, arequals = collect_mounts_arequal([self.mounts[0]], + path=work_dir) + self.assertTrue(ret, 'Failed to get arequal') + g.log.info('Getting arequal before getting bricks offline ' + 'is successful') + + # Get arequal on bricks and compare with mount_point_total + self._check_arequal_on_bricks_with_a_specific_arequal( + arequals, dir_brick_list) + + # Check arequals for "distributed-replicated" + if self.volume_type == "distributed-replicated": + # Get the subvolumes + subvols_dict = get_subvols(self.mnode, self.volname) + num_subvols = len(subvols_dict['volume_subvols']) + + # Get arequals and compare + for i in range(0, num_subvols): + # Get arequal for first brick + brick_list = subvols_dict['volume_subvols'][i] + dir_brick_list = self._add_dir_path_to_brick_list(brick_list) + ret, arequals = collect_bricks_arequal([dir_brick_list[0]]) + self.assertTrue(ret, 'Failed to get arequal on first brick') + + # Get arequal for every brick and compare with first brick + self._check_arequal_on_bricks_with_a_specific_arequal( + arequals, dir_brick_list) + + def test_default_granular_entry_heal(self): + """ + Test case: + 1. Create a cluster. + 2. Create volume start it and mount it. + 3. Check if cluster.granular-entry-heal is ON by default or not. + 4. Check /var/lib/glusterd/<volname>/info for + cluster.granular-entry-heal=on. + 5. Check if option granular-entry-heal is present in the + volume graph or not. + 6. Kill one or two bricks of the volume depending on volume type. + 7. Create all types of files on the volume like text files, hidden + files, link files, dirs, char device, block device and so on. + 8. Bring back the killed brick by restarting the volume. + 9. Wait for heal to complete. + 10. Check arequal-checksum of all the bricks and see if it's proper or + not. 
+ """ + # Check if cluster.granular-entry-heal is ON by default or not + ret = get_volume_options(self.mnode, self.volname, + 'granular-entry-heal') + self.assertEqual(ret['cluster.granular-entry-heal'], 'on', + "Value of cluster.granular-entry-heal not on " + "by default") + + # Check var/lib/glusterd/<volname>/info for + # cluster.granular-entry-heal=on + ret = occurences_of_pattern_in_file(self.mnode, + 'cluster.granular-entry-heal=on', + '/var/lib/glusterd/vols/{}/info' + .format(self.volname)) + self.assertEqual(ret, 1, "Failed get cluster.granular-entry-heal=on in" + " info file") + + # Check if option granular-entry-heal is present in the + # volume graph or not + ret = occurences_of_pattern_in_file(self.first_client, + 'option granular-entry-heal on', + "/var/log/glusterfs/mnt-{}_{}.log" + .format(self.volname, + self.mount_type)) + self.assertTrue(ret > 0, + "Failed to find granular-entry-heal in volume graph") + g.log.info("granular-entry-heal properly set to ON by default") + + # Kill one or two bricks of the volume depending on volume type + self._bring_bricks_offline() + + # Create all types of files on the volume like text files, hidden + # files, link files, dirs, char device, block device and so on + cmd = ("cd {};mkdir mydir;cd mydir;mkdir dir;mkdir .hiddendir;" + "touch file;touch .hiddenfile;mknod blockfile b 1 5;" + "mknod charfile b 1 5; mkfifo pipefile;touch fileforhardlink;" + "touch fileforsoftlink;ln fileforhardlink hardlinkfile;" + "ln -s fileforsoftlink softlinkfile".format(self.mountpoint)) + ret, _, _ = g.run(self.first_client, cmd) + self.assertFalse(ret, "Failed to create files of all types") + + # Bring back the killed brick by restarting the volume Bricks should + # be online again + self._restart_volume_and_bring_all_offline_bricks_online() + + # Wait for heal to complete + self._wait_for_heal_to_completed() + + # Check arequal-checksum of all the bricks and see if it's proper or + # not + self._check_arequal_checksum_for_the_volume() diff --git a/tests/functional/afr/test_self_heal_with_expand_volume.py b/tests/functional/afr/test_self_heal_with_expand_volume.py new file mode 100644 index 000000000..d5b6d5d43 --- /dev/null +++ b/tests/functional/afr/test_self_heal_with_expand_volume.py @@ -0,0 +1,221 @@ +# Copyright (C) 2021 Red Hat, Inc. <http://www.redhat.com> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along` +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ +from random import choice + +from glusto.core import Glusto as g +from glustolibs.gluster.gluster_base_class import GlusterBaseClass, runs_on +from glustolibs.gluster.exceptions import ExecutionError +from glustolibs.gluster.brick_libs import (bring_bricks_offline, + bring_bricks_online, + are_bricks_offline, + are_bricks_online, get_all_bricks) +from glustolibs.gluster.glusterfile import (set_file_permissions, + occurences_of_pattern_in_file) +from glustolibs.gluster.heal_libs import (monitor_heal_completion, + is_heal_complete) +from glustolibs.gluster.rebalance_ops import ( + rebalance_start, wait_for_rebalance_to_complete) +from glustolibs.gluster.lib_utils import (add_user, del_user) +from glustolibs.gluster.volume_libs import (get_subvols, expand_volume) + + +@runs_on([['distributed-replicated'], ['glusterfs']]) +class TestHealWithExpandVolume(GlusterBaseClass): + + def setUp(self): + + self.get_super_method(self, 'setUp')() + + self.first_client = self.mounts[0].client_system + self.mountpoint = self.mounts[0].mountpoint + + # Create non-root users + self.users = ('qa_user', 'qa_admin') + for user in self.users: + if not add_user(self.first_client, user): + raise ExecutionError("Failed to create non-root user {}" + .format(user)) + g.log.info("Successfully created non-root users") + + # Setup Volume + if not self.setup_volume_and_mount_volume([self.mounts[0]]): + raise ExecutionError("Failed to setup and mount volume") + + def tearDown(self): + + # Delete non-root users + for user in self.users: + del_user(self.first_client, user) + ret, _, _ = g.run(self.first_client, + "rm -rf /home/{}".format(user)) + if ret: + raise ExecutionError("Failed to remove home dir of " + "non-root user") + g.log.info("Successfully deleted all users") + + if not self.unmount_volume_and_cleanup_volume([self.mounts[0]]): + raise ExecutionError("Failed to cleanup Volume") + + # Calling GlusterBaseClass tearDown + self.get_super_method(self, 'tearDown')() + + def _bring_bricks_offline(self): + """Brings bricks offline and confirms if they are offline""" + # Select bricks to bring offline from a replica set + subvols_dict = get_subvols(self.mnode, self.volname) + subvols = subvols_dict['volume_subvols'] + self.bricks_to_bring_offline = [] + self.bricks_to_bring_offline.append(choice(subvols[0])) + + # Bring bricks offline + ret = bring_bricks_offline(self.volname, self.bricks_to_bring_offline) + self.assertTrue(ret, 'Failed to bring bricks %s offline' % + self.bricks_to_bring_offline) + + ret = are_bricks_offline(self.mnode, self.volname, + self.bricks_to_bring_offline) + self.assertTrue(ret, 'Bricks %s are not offline' + % self.bricks_to_bring_offline) + g.log.info('Bringing bricks %s offline is successful', + self.bricks_to_bring_offline) + + def _restart_volume_and_bring_all_offline_bricks_online(self): + """Restart volume and bring all offline bricks online""" + ret = bring_bricks_online(self.mnode, self.volname, + self.bricks_to_bring_offline, + bring_bricks_online_methods=[ + 'volume_start_force']) + self.assertTrue(ret, 'Failed to bring bricks %s online' % + self.bricks_to_bring_offline) + + # Check if bricks are back online or not + ret = are_bricks_online(self.mnode, self.volname, + self.bricks_to_bring_offline) + self.assertTrue(ret, 'Bricks not online %s even after restart' % + self.bricks_to_bring_offline) + + g.log.info('Bringing bricks %s online is successful', + self.bricks_to_bring_offline) + + def _wait_for_heal_to_completed(self): + """Check if heal is completed""" + ret = 
monitor_heal_completion(self.mnode, self.volname, + timeout_period=3600) + self.assertTrue(ret, 'Heal has not yet completed') + + def _check_if_there_are_files_to_be_healed(self): + """Check if there are files and dirs to be healed""" + ret = is_heal_complete(self.mnode, self.volname) + self.assertFalse(ret, 'Heal is completed') + g.log.info('Heal is pending') + + def _expand_volume_and_wait_for_rebalance_to_complete(self): + """Expand volume and wait for rebalance to complete""" + # Add brick to volume + ret = expand_volume(self.mnode, self.volname, self.servers, + self.all_servers_info) + self.assertTrue(ret, "Failed to add brick on volume %s" + % self.volname) + + # Trigger rebalance and wait for it to complete + ret, _, _ = rebalance_start(self.mnode, self.volname, + force=True) + self.assertEqual(ret, 0, "Failed to start rebalance on the volume %s" + % self.volname) + + # Wait for rebalance to complete + ret = wait_for_rebalance_to_complete(self.mnode, self.volname, + timeout=6000) + self.assertTrue(ret, "Rebalance is not yet complete on the volume " + "%s" % self.volname) + g.log.info("Rebalance successfully completed") + + def test_self_heal_and_add_brick_with_data_from_diff_users(self): + """ + Test case: + 1. Created a 2X3 volume. + 2. Mount the volume using FUSE and give 777 permissions to the mount. + 3. Added a new user. + 4. Login as new user and created 100 files from the new user: + for i in {1..100};do dd if=/dev/urandom of=$i bs=1024 count=1;done + 5. Kill a brick which is part of the volume. + 6. On the mount, login as root user and create 1000 files: + for i in {1..1000};do dd if=/dev/urandom of=f$i bs=10M count=1;done + 7. On the mount, login as new user, and copy existing data to + the mount. + 8. Start volume using force. + 9. While heal is in progress, add-brick and start rebalance. + 10. Wait for rebalance and heal to complete, + 11. Check for MSGID: 108008 errors in rebalance logs. 
+ """ + # Change permissions of mount point to 777 + ret = set_file_permissions(self.first_client, self.mountpoint, + '-R 777') + self.assertTrue(ret, "Unable to change mount point permissions") + g.log.info("Mount point permissions set to 777") + + # Create 100 files from non-root user + cmd = ("su -l %s -c 'cd %s; for i in {1..100};do dd if=/dev/urandom " + "of=nonrootfile$i bs=1024 count=1; done'" % (self.users[0], + self.mountpoint)) + ret, _, _ = g.run(self.first_client, cmd) + self.assertFalse(ret, "Failed to create files from non-root user") + + # Kill one brick which is part of the volume + self._bring_bricks_offline() + + # Create 1000 files from root user + cmd = ("cd %s; for i in {1..1000};do dd if=/dev/urandom of=rootfile$i" + " bs=10M count=1;done" % self.mountpoint) + ret, _, _ = g.run(self.first_client, cmd) + self.assertFalse(ret, "Failed to creare files from root user") + + # On the mount, login as new user, and copy existing data to + # the mount + cmd = ("su -l %s -c 'wget https://cdn.kernel.org/pub/linux/kernel/" + "v5.x/linux-5.4.54.tar.xz; tar -xvf linux-5.4.54.tar.xz;" + "cd %s; cp -r ~/ .;'" % (self.users[1], self.mountpoint)) + ret, _, _ = g.run(self.first_client, cmd) + self.assertFalse(ret, "Failed to copy files from non-root user") + + # Check if there are files to be healed + self._check_if_there_are_files_to_be_healed() + + # Start the vol using force + self._restart_volume_and_bring_all_offline_bricks_online() + + # Add bricks to volume and wait for heal to complete + self._expand_volume_and_wait_for_rebalance_to_complete() + + # Wait for heal to complete + self._wait_for_heal_to_completed() + + # Check for MSGID: 108008 errors in rebalance logs + particiapting_nodes = [] + for brick in get_all_bricks(self.mnode, self.volname): + node, _ = brick.split(':') + particiapting_nodes.append(node) + + for server in particiapting_nodes: + ret = occurences_of_pattern_in_file( + server, "MSGID: 108008", + "/var/log/glusterfs/{}-rebalance.log".format(self.volname)) + self.assertEqual(ret, 0, + "[Input/output error] present in rebalance log" + " file") + g.log.info("Expanding volume successful and no MSGID: 108008 " + "errors see in rebalance logs") diff --git a/tests/functional/afr/test_split_brain_with_hard_link_file.py b/tests/functional/afr/test_split_brain_with_hard_link_file.py new file mode 100644 index 000000000..a8248fb72 --- /dev/null +++ b/tests/functional/afr/test_split_brain_with_hard_link_file.py @@ -0,0 +1,175 @@ +# Copyright (C) 2021 Red Hat, Inc. <http://www.redhat.com> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ +# pylint: disable=too-many-statements, too-many-locals, unused-variable +from glusto.core import Glusto as g + +from glustolibs.gluster.exceptions import ExecutionError +from glustolibs.gluster.gluster_base_class import GlusterBaseClass, runs_on +from glustolibs.gluster.brick_libs import (get_all_bricks, + bring_bricks_offline, + bring_bricks_online, + are_bricks_offline) +from glustolibs.gluster.heal_ops import trigger_heal +from glustolibs.gluster.heal_libs import (is_volume_in_split_brain, + monitor_heal_completion, + is_heal_complete) + +from glustolibs.gluster.volume_ops import set_volume_options +from glustolibs.gluster.glusterfile import create_link_file + + +@runs_on([['distributed-replicated'], ['glusterfs']]) +class TestSelfHeal(GlusterBaseClass): + + @classmethod + def setUpClass(cls): + + # Calling GlusterBaseClass setUpClass + cls.get_super_method(cls, 'setUpClass')() + + # Override Volumes + if cls.volume_type == "distributed-replicated": + # Define x3 distributed-replicated volume + cls.volume['voltype'] = { + 'type': 'distributed-replicated', + 'dist_count': 2, + 'replica_count': 3, + 'transport': 'tcp'} + + # Setup Volume and Mount Volume + ret = cls.setup_volume_and_mount_volume(cls.mounts) + if not ret: + raise ExecutionError("Failed to Setup_Volume and Mount_Volume") + g.log.info("Successful in Setup Volume and Mount Volume") + + @classmethod + def tearDownClass(cls): + + # Cleanup Volume + ret = cls.unmount_volume_and_cleanup_volume(cls.mounts) + if not ret: + raise ExecutionError("Failed to create volume") + g.log.info("Successful in cleaning up Volume %s", cls.volname) + + cls.get_super_method(cls, 'tearDownClass')() + + def _test_brick_down_with_file_rename(self, pfile, rfile, brick): + # Bring brick offline + g.log.info('Bringing brick %s offline', brick) + ret = bring_bricks_offline(self.volname, brick) + self.assertTrue(ret, 'Failed to bring brick %s offline' + % brick) + + ret = are_bricks_offline(self.mnode, self.volname, + [brick]) + self.assertTrue(ret, 'Brick %s is not offline' + % brick) + g.log.info('Bringing brick %s offline is successful', + brick) + + # Rename file + cmd = ("mv %s/%s %s/%s" + % (self.mounts[0].mountpoint, pfile, + self.mounts[0].mountpoint, rfile)) + ret, _, _ = g.run(self.clients[0], cmd) + self.assertEqual(ret, 0, "rename of file failed") + + # Bring brick back online + g.log.info('Bringing brick %s online', brick) + ret = bring_bricks_online(self.mnode, self.volname, + brick) + self.assertTrue(ret, 'Failed to bring brick %s online' % + brick) + g.log.info('Bringing brick %s online is successful', brick) + + def test_afr_heal_with_brickdown_hardlink(self): + """ + Steps: + 1. Create 2 * 3 distribute replicate volume and disable all heals + 2. Create a file and 3 hardlinks to it from fuse mount. + 3. Kill brick4, rename HLINK1 to an appropriate name so that + it gets hashed to replicate-1 + 4. Likewise rename HLINK3 and HLINK7 as well, killing brick5 and brick6 + respectively each time. i.e. a different brick of the 2nd + replica is down each time. + 5. Now enable shd and let selfheals complete. + 6. Heal should complete without split-brains. 
+ """ + bricks_list = get_all_bricks(self.mnode, self.volname) + options = {"metadata-self-heal": "off", + "entry-self-heal": "off", + "data-self-heal": "off", + "self-heal-daemon": "off"} + g.log.info("setting options %s", options) + ret = set_volume_options(self.mnode, self.volname, options) + self.assertTrue(ret, ("Unable to set volume option %s for" + "volume %s" % (options, self.volname))) + g.log.info("Successfully set %s for volume %s", options, self.volname) + + cmd = ("touch %s/FILE" % self.mounts[0].mountpoint) + ret, _, _ = g.run(self.clients[0], cmd) + self.assertEqual(ret, 0, "file creation failed") + + # Creating a hardlink for the file created + for i in range(1, 4): + ret = create_link_file(self.clients[0], + '{}/FILE'.format(self.mounts[0].mountpoint), + '{}/HLINK{}'.format + (self.mounts[0].mountpoint, i)) + self.assertTrue(ret, "Unable to create hard link file ") + + # Bring brick3 offline,Rename file HLINK1,and bring back brick3 online + self._test_brick_down_with_file_rename("HLINK1", "NEW-HLINK1", + bricks_list[3]) + + # Bring brick4 offline,Rename file HLINK2,and bring back brick4 online + self._test_brick_down_with_file_rename("HLINK2", "NEW-HLINK2", + bricks_list[4]) + + # Bring brick5 offline,Rename file HLINK3,and bring back brick5 online + self._test_brick_down_with_file_rename("HLINK3", "NEW-HLINK3", + bricks_list[5]) + + # Setting options + options = {"self-heal-daemon": "on"} + ret = set_volume_options(self.mnode, self.volname, options) + self.assertTrue(ret, 'Failed to set options %s' % options) + g.log.info("Option 'self-heal-daemon' is set to 'on' successfully") + + # Start healing + ret = trigger_heal(self.mnode, self.volname) + self.assertTrue(ret, 'Heal is not started') + g.log.info('Healing is started') + + # Monitor heal completion + ret = monitor_heal_completion(self.mnode, self.volname) + self.assertTrue(ret, 'Heal has not yet completed') + + # Check if heal is completed + ret = is_heal_complete(self.mnode, self.volname) + self.assertTrue(ret, 'Heal is not complete') + g.log.info('Heal is completed successfully') + + # Check for split-brain + ret = is_volume_in_split_brain(self.mnode, self.volname) + self.assertFalse(ret, 'Volume is in split-brain state') + g.log.info('Volume is not in split-brain state') + + # Check data on mount point + cmd = ("ls %s" % (self.mounts[0].mountpoint)) + ret, _, _ = g.run(self.clients[0], cmd) + self.assertEqual(ret, 0, "failed to fetch data from mount point") diff --git a/tests/functional/afr/test_split_brain_with_node_reboot.py b/tests/functional/afr/test_split_brain_with_node_reboot.py new file mode 100644 index 000000000..9b630ba75 --- /dev/null +++ b/tests/functional/afr/test_split_brain_with_node_reboot.py @@ -0,0 +1,149 @@ +# Copyright (C) 2021 Red Hat, Inc. <http://www.redhat.com> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ +# pylint: disable=too-many-statements, too-many-locals +from unittest import SkipTest +from glusto.core import Glusto as g +from glustolibs.gluster.exceptions import ExecutionError +from glustolibs.gluster.gluster_base_class import GlusterBaseClass, runs_on +from glustolibs.misc.misc_libs import upload_scripts +from glustolibs.gluster.glusterdir import mkdir +from glustolibs.gluster.heal_libs import (monitor_heal_completion, + is_heal_complete) +from glustolibs.io.utils import (run_linux_untar, run_crefi, + wait_for_io_to_complete) + + +@runs_on([['replicated', 'distributed-replicated'], ['glusterfs']]) +class TestSelfHeal(GlusterBaseClass): + + @classmethod + def setUpClass(cls): + + # Calling GlusterBaseClass setUpClass + cls.get_super_method(cls, 'setUpClass')() + + # Check for availability of atleast 3 clients + if len(cls.clients) < 3: + raise SkipTest("This test requires atleast 3 clients") + + # Upload io scripts for running IO on mounts + cls.script_upload_path = ("/usr/share/glustolibs/io/scripts/" + "file_dir_ops.py") + ret = upload_scripts(cls.clients, cls.script_upload_path) + if not ret: + raise ExecutionError("Failed to upload IO scripts " + "to clients %s" % cls.clients) + g.log.info("Successfully uploaded IO scripts to clients %s", + cls.clients) + + # Setup Volume and Mount Volume + ret = cls.setup_volume_and_mount_volume(cls.mounts, True) + if not ret: + raise ExecutionError("Failed to Setup_Volume and Mount_Volume") + g.log.info("Successful in Setup Volume and Mount Volume") + + cls.list_of_io_processes = [] + cls.is_io_running = False + + def tearDown(self): + + # If I/O processes are running wait from them to complete + if self.is_io_running: + if not wait_for_io_to_complete(self.list_of_io_processes, + self.mounts): + raise ExecutionError("Failed to wait for I/O to complete") + + # Unmounting and cleaning volume + ret = self.unmount_volume_and_cleanup_volume([self.mounts[0]]) + if not ret: + raise ExecutionError("Unable to delete volume %s" % self.volname) + + self.get_super_method(self, 'tearDown')() + + def test_afr_node_reboot_self_heal(self): + """ + Steps: + 1. Create *3 replica volume + 2. Mount the volume on 3 clients + 3. Run following workload from clients + Client 1: Linux Untars + Client 2: Lookups ls + Client 3: Lookups du + 4. Create a directory on mount point + 5. Create deep dirs and file in the directory created at step 4 + 6. Perform node reboot + 7. Check for heal status + 8. Reboot another node + 9. 
Check for heal status + """ + + # Create a dir to start untar + self.linux_untar_dir = "{}/{}".format(self.mounts[0].mountpoint, + "linuxuntar") + ret = mkdir(self.clients[0], self.linux_untar_dir) + self.assertTrue(ret, "Failed to create dir linuxuntar for untar") + + # Start linux untar on dir linuxuntar from client 1 + ret = run_linux_untar(self.clients[0], self.mounts[0].mountpoint, + dirs=tuple(['linuxuntar'])) + self.list_of_io_processes += ret + self.is_io_running = True + + # Run lookup operation ls from client 2 + cmd = ("cd {}; for i in `seq 1 1000000`;do du -sh; done" + .format(self.mounts[1].mountpoint)) + ret = g.run_async(self.mounts[1].client_system, cmd) + self.list_of_io_processes += [ret] + + # Run lookup operation du from client 3 + cmd = ("cd {}; for i in `seq 1 1000000`;do ls -laRt; done" + .format(self.mounts[2].mountpoint)) + ret = g.run_async(self.mounts[2].client_system, cmd) + self.list_of_io_processes += [ret] + + # Create a dir to start crefi tool + self.linux_untar_dir = "{}/{}".format(self.mounts[3].mountpoint, + "crefi") + ret = mkdir(self.clients[3], self.linux_untar_dir) + self.assertTrue(ret, "Failed to create dir for crefi") + + # Create deep dirs and files on mount point from client 4 + list_of_fops = ("create", "rename", "chmod", "chown", "chgrp", + "hardlink", "truncate", "setxattr") + for fops in list_of_fops: + ret = run_crefi(self.clients[3], + self.linux_untar_dir, 10, 3, 3, thread=4, + random_size=True, fop=fops, minfs=0, + maxfs=102400, multi=True, random_filename=True) + self.assertTrue(ret, "crefi failed during {}".format(fops)) + g.log.info("crefi PASSED FOR fop %s", fops) + g.log.info("IOs were successful using crefi") + + for server_num in (1, 2): + # Perform node reboot for servers + g.log.info("Rebooting %s", self.servers[server_num]) + ret = g.run_async(self.servers[server_num], "reboot") + self.assertTrue(ret, 'Failed to reboot node') + + # Monitor heal completion + ret = monitor_heal_completion(self.mnode, self.volname) + self.assertTrue(ret, 'Heal has not yet completed') + + # Check if heal is completed + ret = is_heal_complete(self.mnode, self.volname) + self.assertTrue(ret, 'Heal is not complete') + g.log.info('Heal is completed successfully') diff --git a/tests/functional/arbiter/test_brick_down_cyclic.py b/tests/functional/arbiter/test_brick_down_cyclic.py new file mode 100644 index 000000000..8639a4dc5 --- /dev/null +++ b/tests/functional/arbiter/test_brick_down_cyclic.py @@ -0,0 +1,140 @@ +# Copyright (C) 2021 Red Hat, Inc. <http://www.redhat.com> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
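One thing worth calling out in test_afr_node_reboot_self_heal above is that the reboots are fired asynchronously and the test moves straight on to heal monitoring; if a later step needed the rebooted server back, it could be polled first. The helper below is only a sketch, not glustolibs API, and the timeout values are arbitrary:

import time

from glusto.core import Glusto as g


def wait_for_node_to_respond(mnode, node, timeout=300, interval=10):
    """Ping 'node' from 'mnode' until it answers or 'timeout' seconds pass."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        ret, _, _ = g.run(mnode, "ping -c 1 -W 2 {}".format(node))
        if ret == 0:
            return True
        time.sleep(interval)
    return False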
+ +# pylint: disable=too-many-statements, too-many-locals +import time +from glusto.core import Glusto as g +from glustolibs.gluster.exceptions import ExecutionError +from glustolibs.gluster.gluster_base_class import GlusterBaseClass, runs_on +from glustolibs.gluster.heal_ops import trigger_heal +from glustolibs.gluster.heal_libs import (is_volume_in_split_brain, + is_heal_complete) +from glustolibs.gluster.brick_libs import (bring_bricks_offline, + bring_bricks_online, + are_bricks_offline, + get_all_bricks, + are_bricks_online) +from glustolibs.gluster.heal_libs import ( + monitor_heal_completion, are_all_self_heal_daemons_are_online) + + +@runs_on([['arbiter', 'distributed-arbiter'], ['glusterfs']]) +class TestBrickDownHeal(GlusterBaseClass): + + @classmethod + def setUpClass(cls): + + # Calling GlusterBaseClass setUpClass + cls.get_super_method(cls, 'setUpClass')() + + # Setup Volume and Mount Volume + ret = cls.setup_volume_and_mount_volume(cls.mounts, True) + if not ret: + raise ExecutionError("Failed to Setup_Volume and Mount_Volume") + + @classmethod + def tearDownClass(cls): + """ + Cleanup Volume + """ + ret = cls.unmount_volume_and_cleanup_volume(cls.mounts) + if not ret: + raise ExecutionError("Failed to create volume") + + cls.get_super_method(cls, 'tearDownClass')() + + def test_brick_down_heal(self): + """ + - Run IO's from client on a single file + - Now bring down bricks in cyclic order + - kill brick 1, sleep for 5 seconds, bring brick 1 up, wait for 10s + - Now repeat step3 for brick2 and brick 3 + - Repeat the cycle a few times + - Trigger heal, check for split brain using command + """ + # Write IO's + self.all_mounts_procs = [] + cmd = ("for i in `seq 1 10`;" + "do dd if=/dev/urandom of=%s/file$i bs=1K count=1;" + "done" % self.mounts[0].mountpoint) + proc = g.run_async(self.mounts[0].client_system, cmd) + self.all_mounts_procs.append(proc) + + # Killing bricks in cyclic order + bricks_list = get_all_bricks(self.mnode, self.volname) + + # Total number of cyclic brick-down cycles to be executed + number_of_cycles = 0 + while number_of_cycles < 3: + number_of_cycles += 1 + for brick in bricks_list: + # Bring brick offline + g.log.info('Bringing bricks %s offline', brick) + ret = bring_bricks_offline(self.volname, [brick]) + self.assertTrue(ret, ("Failed to bring bricks %s offline" + % brick)) + + ret = are_bricks_offline(self.mnode, self.volname, [brick]) + self.assertTrue(ret, 'Bricks %s are not offline' % brick) + g.log.info('Bringing bricks %s offline is successful', brick) + + # Introducing 5 second sleep when brick is down + g.log.info("Waiting for 5 seconds, with ongoing IO while " + "brick %s is offline", brick) + ret = time.sleep(5) + + # Bring brick online + g.log.info('Bringing bricks %s online', brick) + ret = bring_bricks_online(self.mnode, self.volname, [brick]) + self.assertTrue(ret, ("Failed to bring bricks %s online " + % brick)) + g.log.info('Bricks %s are online', brick) + + # Introducing 10 second sleep when brick is up + g.log.info("Waiting for 10 seconds,when " + "brick %s is online", brick) + ret = time.sleep(10) + + # Check if bricks are online + ret = are_bricks_online(self.mnode, self.volname, bricks_list) + self.assertTrue(ret, 'Bricks %s are not online' % bricks_list) + g.log.info('Bricks %s are online', bricks_list) + + # Check daemons + g.log.info('Checking daemons...') + ret = are_all_self_heal_daemons_are_online(self.mnode, + self.volname) + self.assertTrue(ret, ("Some of the self-heal Daemons are " + "offline")) + g.log.info('All 
self-heal Daemons are online') + + # Trigger self heal + ret = trigger_heal(self.mnode, self.volname) + self.assertTrue(ret, 'Unable to trigger heal on volume') + + # Monitor heal completion + ret = monitor_heal_completion(self.mnode, self.volname) + self.assertTrue(ret, 'Heal has not yet completed') + + # Check if heal is completed + ret = is_heal_complete(self.mnode, self.volname) + self.assertTrue(ret, 'Heal is not complete') + g.log.info('Heal is completed successfully') + + # Check for split-brain + ret = is_volume_in_split_brain(self.mnode, self.volname) + self.assertFalse(ret, 'Volume is in split-brain state') + g.log.info('Volume is not in split-brain state') diff --git a/tests/functional/arbiter/test_verify_metadata_and_data_heal.py b/tests/functional/arbiter/test_verify_metadata_and_data_heal.py new file mode 100644 index 000000000..d48e36e73 --- /dev/null +++ b/tests/functional/arbiter/test_verify_metadata_and_data_heal.py @@ -0,0 +1,297 @@ +# Copyright (C) 2021 Red Hat, Inc. <http://www.redhat.com> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +from glusto.core import Glusto as g + +from glustolibs.gluster.brick_libs import (bring_bricks_offline, + bring_bricks_online, + get_online_bricks_list) +from glustolibs.gluster.exceptions import ExecutionError +from glustolibs.gluster.gluster_base_class import GlusterBaseClass, runs_on +from glustolibs.gluster.glusterdir import mkdir +from glustolibs.gluster.heal_libs import ( + is_heal_complete, is_volume_in_split_brain, monitor_heal_completion, + wait_for_self_heal_daemons_to_be_online) +from glustolibs.gluster.heal_ops import (disable_self_heal_daemon, + enable_self_heal_daemon, trigger_heal) +from glustolibs.gluster.lib_utils import (add_user, collect_bricks_arequal, + del_user, group_add, group_del) +from glustolibs.gluster.volume_libs import get_subvols +from glustolibs.io.utils import list_all_files_and_dirs_mounts + + +@runs_on([['arbiter', 'replicated'], ['glusterfs']]) +class TestMetadataAndDataHeal(GlusterBaseClass): + '''Description: Verify shd heals files after performing metadata and data + operations while a brick was down''' + def _dac_helper(self, host, option): + '''Helper for creating, deleting users and groups''' + + # Permission/Ownership changes required only for `test_metadata..` + # tests, using random group and usernames + if 'metadata' not in self.test_dir: + return + + if option == 'create': + # Groups + for group in ('qa_func', 'qa_system'): + if not group_add(host, group): + raise ExecutionError('Unable to {} group {} on ' + '{}'.format(option, group, host)) + + # User + if not add_user(host, 'qa_all', group='qa_func'): + raise ExecutionError('Unable to {} user {} under {} on ' + '{}'.format(option, 'qa_all', 'qa_func', + host)) + elif option == 'delete': + # Groups + for group in ('qa_func', 'qa_system'): + if not group_del(host, group): + raise 
ExecutionError('Unable to {} group {} on ' + '{}'.format(option, group, host)) + + # User + if not del_user(host, 'qa_all'): + raise ExecutionError('Unable to {} user on {}'.format( + option, host)) + + def setUp(self): + self.get_super_method(self, 'setUp')() + + # A single mount is enough for all the tests + self.mounts = self.mounts[0:1] + self.client = self.mounts[0].client_system + + # Use testcase name as test directory + self.test_dir = self.id().split('.')[-1] + self.fqpath = self.mounts[0].mountpoint + '/' + self.test_dir + + if not self.setup_volume_and_mount_volume(mounts=self.mounts): + raise ExecutionError('Failed to setup and mount ' + '{}'.format(self.volname)) + + # Crete group and user names required for the test + self._dac_helper(host=self.client, option='create') + + def tearDown(self): + # Delete group and user names created as part of setup + self._dac_helper(host=self.client, option='delete') + + if not self.unmount_volume_and_cleanup_volume(mounts=self.mounts): + raise ExecutionError('Not able to unmount and cleanup ' + '{}'.format(self.volname)) + + self.get_super_method(self, 'tearDown')() + + def _perform_io_and_disable_self_heal(self): + '''Refactor of steps common to all tests: Perform IO, disable heal''' + ret = mkdir(self.client, self.fqpath) + self.assertTrue(ret, + 'Directory creation failed on {}'.format(self.client)) + self.io_cmd = 'cat /dev/urandom | tr -dc [:space:][:print:] | head -c ' + # Create 6 dir's, 6 files and 6 files in each subdir with 10K data + file_io = ('''cd {0}; for i in `seq 1 6`; + do mkdir dir.$i; {1} 10K > file.$i; + for j in `seq 1 6`; + do {1} 10K > dir.$i/file.$j; done; + done;'''.format(self.fqpath, self.io_cmd)) + ret, _, err = g.run(self.client, file_io) + self.assertEqual(ret, 0, 'Unable to create directories and data files') + self.assertFalse(err, '{0} failed with {1}'.format(file_io, err)) + + # Disable self heal deamon + self.assertTrue(disable_self_heal_daemon(self.mnode, self.volname), + 'Disabling self-heal-daemon falied') + + def _perform_brick_ops_and_enable_self_heal(self, op_type): + '''Refactor of steps common to all tests: Brick down and perform + metadata/data operations''' + # First brick in the subvol will always be online and used for self + # heal, so make keys match brick index + self.op_cmd = { + # Metadata Operations (owner and permission changes) + 'metadata': { + 2: + '''cd {0}; for i in `seq 1 3`; do chown -R qa_all:qa_func \ + dir.$i file.$i; chmod -R 555 dir.$i file.$i; done;''', + 3: + '''cd {0}; for i in `seq 1 3`; do chown -R :qa_system \ + dir.$i file.$i; chmod -R 777 dir.$i file.$i; done;''', + # 4 - Will be used for final data consistency check + 4: + '''cd {0}; for i in `seq 1 6`; do chown -R qa_all:qa_system \ + dir.$i file.$i; chmod -R 777 dir.$i file.$i; done;''', + }, + # Data Operations (append data to the files) + 'data': { + 2: + '''cd {0}; for i in `seq 1 3`; + do {1} 2K >> file.$i; + for j in `seq 1 3`; + do {1} 2K >> dir.$i/file.$j; done; + done;''', + 3: + '''cd {0}; for i in `seq 1 3`; + do {1} 3K >> file.$i; + for j in `seq 1 3`; + do {1} 3K >> dir.$i/file.$j; done; + done;''', + # 4 - Will be used for final data consistency check + 4: + '''cd {0}; for i in `seq 1 6`; + do {1} 4K >> file.$i; + for j in `seq 1 6`; + do {1} 4K >> dir.$i/file.$j; done; + done;''', + }, + } + bricks = get_online_bricks_list(self.mnode, self.volname) + self.assertIsNotNone(bricks, + 'Not able to get list of bricks in the volume') + + # Make first brick always online and start operations from second 
brick + for index, brick in enumerate(bricks[1:], start=2): + + # Bring brick offline + ret = bring_bricks_offline(self.volname, brick) + self.assertTrue(ret, 'Unable to bring {} offline'.format(bricks)) + + # Perform metadata/data operation + cmd = self.op_cmd[op_type][index].format(self.fqpath, self.io_cmd) + ret, _, err = g.run(self.client, cmd) + self.assertEqual(ret, 0, '{0} failed with {1}'.format(cmd, err)) + self.assertFalse(err, '{0} failed with {1}'.format(cmd, err)) + + # Bring brick online + ret = bring_bricks_online( + self.mnode, + self.volname, + brick, + bring_bricks_online_methods='volume_start_force') + + # Assert metadata/data operations resulted in pending heals + self.assertFalse(is_heal_complete(self.mnode, self.volname)) + + # Enable and wait self heal daemon to be online + self.assertTrue(enable_self_heal_daemon(self.mnode, self.volname), + 'Enabling self heal daemon failed') + self.assertTrue( + wait_for_self_heal_daemons_to_be_online(self.mnode, self.volname), + 'Not all self heal daemons are online') + + def _validate_heal_completion_and_arequal(self, op_type): + '''Refactor of steps common to all tests: Validate heal from heal + commands, verify arequal, perform IO and verify arequal after IO''' + + # Validate heal completion + self.assertTrue(monitor_heal_completion(self.mnode, self.volname), + 'Self heal is not completed within timeout') + self.assertFalse( + is_volume_in_split_brain(self.mnode, self.volname), + 'Volume is in split brain even after heal completion') + + subvols = get_subvols(self.mnode, self.volname)['volume_subvols'] + self.assertTrue(subvols, 'Not able to get list of subvols') + arbiter = self.volume_type.find('arbiter') >= 0 + stop = len(subvols[0]) - 1 if arbiter else len(subvols[0]) + + # Validate arequal + self._validate_arequal_and_perform_lookup(subvols, stop) + + # Perform some additional metadata/data operations + cmd = self.op_cmd[op_type][4].format(self.fqpath, self.io_cmd) + ret, _, err = g.run(self.client, cmd) + self.assertEqual(ret, 0, '{0} failed with {1}'.format(cmd, err)) + self.assertFalse(err, '{0} failed with {1}'.format(cmd, err)) + + # Validate arequal after additional operations + self._validate_arequal_and_perform_lookup(subvols, stop) + + def _validate_arequal_and_perform_lookup(self, subvols, stop): + '''Refactor of steps common to all tests: Validate arequal from bricks + backend and perform a lookup of all files from mount''' + for subvol in subvols: + ret, arequal = collect_bricks_arequal(subvol[0:stop]) + self.assertTrue( + ret, 'Unable to get `arequal` checksum on ' + '{}'.format(subvol[0:stop])) + self.assertEqual( + len(set(arequal)), 1, 'Mismatch of `arequal` ' + 'checksum among {} is identified'.format(subvol[0:stop])) + + # Perform a lookup of all files and directories on mounts + self.assertTrue(list_all_files_and_dirs_mounts(self.mounts), + 'Failed to list all files and dirs from mount') + + def test_metadata_heal_from_shd(self): + '''Description: Verify files heal after switching on `self-heal-daemon` + when metadata operations are performed while a brick was down + + Steps: + 1. Create, mount and run IO on volume + 2. Set `self-heal-daemon` to `off`, cyclic brick down and perform + metadata operations + 3. Set `self-heal-daemon` to `on` and wait for heal completion + 4. 
Validate areequal checksum on backend bricks + ''' + op_type = 'metadata' + self._perform_io_and_disable_self_heal() + self._perform_brick_ops_and_enable_self_heal(op_type=op_type) + self._validate_heal_completion_and_arequal(op_type=op_type) + g.log.info('Pass: Verification of metadata heal after switching on ' + '`self heal daemon` is complete') + + def test_metadata_heal_from_heal_cmd(self): + '''Description: Verify files heal after triggering heal command when + metadata operations are performed while a brick was down + + Steps: + 1. Create, mount and run IO on volume + 2. Set `self-heal-daemon` to `off`, cyclic brick down and perform + metadata operations + 3. Set `self-heal-daemon` to `on`, invoke `gluster vol <vol> heal` + 4. Validate areequal checksum on backend bricks + ''' + op_type = 'metadata' + self._perform_io_and_disable_self_heal() + self._perform_brick_ops_and_enable_self_heal(op_type=op_type) + + # Invoke `glfsheal` + self.assertTrue(trigger_heal(self.mnode, self.volname), + 'Unable to trigger index heal on the volume') + + self._validate_heal_completion_and_arequal(op_type=op_type) + g.log.info( + 'Pass: Verification of metadata heal via `glfsheal` is complete') + + def test_data_heal_from_shd(self): + '''Description: Verify files heal after triggering heal command when + data operations are performed while a brick was down + + Steps: + 1. Create, mount and run IO on volume + 2. Set `self-heal-daemon` to `off`, cyclic brick down and perform data + operations + 3. Set `self-heal-daemon` to `on` and wait for heal completion + 4. Validate areequal checksum on backend bricks + ''' + op_type = 'data' + self._perform_io_and_disable_self_heal() + self._perform_brick_ops_and_enable_self_heal(op_type=op_type) + self._validate_heal_completion_and_arequal(op_type=op_type) + g.log.info('Pass: Verification of data heal after switching on ' + '`self heal daemon` is complete') diff --git a/tests/functional/authentication/test_auth_allow_with_brick_down.py b/tests/functional/authentication/test_auth_allow_with_brick_down.py new file mode 100644 index 000000000..8fe365aed --- /dev/null +++ b/tests/functional/authentication/test_auth_allow_with_brick_down.py @@ -0,0 +1,171 @@ +# Copyright (C) 2021 Red Hat, Inc. <http://www.redhat.com> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
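The `stop` index computed in _validate_heal_completion_and_arequal above exists because an arbiter brick stores only file metadata, so its arequal checksum is never expected to match the data bricks. The snippet below just illustrates that selection rule; the brick paths are made up:

def bricks_to_compare(subvol, is_arbiter):
    """Return the bricks whose arequal checksums should be identical."""
    # On arbiter volumes the last brick of each replica set is the arbiter
    # and is left out of the data comparison.
    return subvol[:-1] if is_arbiter else subvol


# Example: a replica-3 arbiter subvolume keeps only its two data bricks.
subvol = ["server1:/bricks/b1", "server2:/bricks/b2", "server3:/bricks/arb"]
assert bricks_to_compare(subvol, is_arbiter=True) == subvol[:2]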
+ +""" Description: + Test cases in this module tests the authentication allow feature +""" +from glusto.core import Glusto as g +from glustolibs.gluster.gluster_base_class import (GlusterBaseClass, + runs_on) +from glustolibs.gluster.exceptions import ExecutionError +from glustolibs.gluster.auth_ops import set_auth_allow +from glustolibs.gluster.brick_libs import (get_all_bricks, + bring_bricks_offline, + bring_bricks_online, + are_bricks_offline) +from glustolibs.gluster.heal_ops import trigger_heal +from glustolibs.gluster.heal_libs import (monitor_heal_completion, + is_heal_complete) + + +@runs_on([['distributed-replicated', 'distributed-dispersed'], ['glusterfs']]) +class FuseAuthAllow(GlusterBaseClass): + """ + Tests to verify auth.allow feature on fuse mount. + """ + @classmethod + def setUpClass(cls): + """ + Create and start volume + """ + cls.get_super_method(cls, 'setUpClass')() + # Create and start volume + ret = cls.setup_volume() + if not ret: + raise ExecutionError("Failed to setup " + "and start volume %s" % cls.volname) + + def _authenticated_mount(self, mount_obj): + """ + Mount volume on authenticated client + + Args: + mount_obj(obj): Object of GlusterMount class + """ + # Mount volume + ret = mount_obj.mount() + self.assertTrue(ret, ("Failed to mount %s on client %s" % + (mount_obj.volname, + mount_obj.client_system))) + g.log.info("Successfully mounted %s on client %s", mount_obj.volname, + mount_obj.client_system) + + # Verify mount + ret = mount_obj.is_mounted() + self.assertTrue(ret, ("%s is not mounted on client %s" + % (mount_obj.volname, mount_obj.client_system))) + g.log.info("Verified: %s is mounted on client %s", + mount_obj.volname, mount_obj.client_system) + + def _brick_down_heal(self): + # Create files on mount point using dd command + cmd = ('cd %s;for i in {1..10};' + 'do dd if=/dev/urandom bs=1024 count=1 of=file$i;done;' + % (self.mounts[0].mountpoint)) + ret, _, _ = g.run(self.mounts[0].client_system, cmd) + self.assertEqual(ret, 0, "Failed to createfiles on mountpoint") + g.log.info("Successfully created files on mountpoint") + + # Bring brick1 offline + bricks_list = get_all_bricks(self.mnode, self.volname) + ret = bring_bricks_offline(self.volname, bricks_list[1]) + self.assertTrue(ret, 'Failed to bring brick1 offline') + ret = are_bricks_offline(self.mnode, self.volname, + [bricks_list[1]]) + self.assertTrue(ret, 'Brick1 is not offline') + g.log.info('Bringing brick1 offline is successful') + + # Bring brick1 back online + ret = bring_bricks_online(self.mnode, self.volname, + [bricks_list[1]]) + self.assertTrue(ret, 'Failed to bring brick1 online') + g.log.info('Bringing brick1 online is successful') + + # Start healing + ret = trigger_heal(self.mnode, self.volname) + self.assertTrue(ret, 'Heal is not started') + g.log.info('Healing is started') + + # Monitor heal completion + ret = monitor_heal_completion(self.mnode, self.volname) + self.assertTrue(ret, 'Heal has not yet completed') + + # Check if heal is completed + ret = is_heal_complete(self.mnode, self.volname) + self.assertTrue(ret, 'Heal is not complete') + g.log.info('Heal is completed successfully') + + def test_auth_allow_with_heal(self): + """ + Validating the FUSE authentication volume options with Heal. + Steps: + 1. Setup and start volume + 2. Set auth.allow on volume for client1 using ip of client1 + 3. Mount volume on client1. + 4. Create files on mount point using dd command + 5. Bring down one brick of the volume + 6. 
Bring the brick back up after few seconds using + "gluster volume start force" + 7. Start volume heal by using gluster volume heal + 8. See the heal status using gluster volume heal info + 9. Set auth.allow on volume for client1 using hostname of client1. + 10. Repeat steps from 3 to 9 + """ + # Setting authentication on volume for client1 using ip + auth_dict = {'all': [self.mounts[0].client_system]} + ret = set_auth_allow(self.volname, self.mnode, auth_dict) + self.assertTrue(ret, "Failed to set authentication") + + # Mounting volume on client1 + self._authenticated_mount(self.mounts[0]) + + # Create files,bring brick down and check heal + self._brick_down_heal() + + # Unmount volume from client1 + ret = self.mounts[0].unmount() + self.assertTrue(ret, ("Failed to unmount volume %s from client %s" + % (self.volname, self.mounts[0].client_system))) + + # Obtain hostname of client1 + ret, hostname_client1, _ = g.run(self.mounts[0].client_system, + "hostname") + self.assertEqual(ret, 0, ("Failed to obtain hostname of client %s" + % self.mounts[0].client_system)) + g.log.info("Obtained hostname of client. IP- %s, hostname- %s", + self.mounts[0].client_system, hostname_client1.strip()) + + # Setting authentication on volume for client1 using hostname + auth_dict = {'all': [hostname_client1.strip()]} + ret = set_auth_allow(self.volname, self.mnode, auth_dict) + self.assertTrue(ret, "Failed to set authentication") + + # Mounting volume on client1 + self._authenticated_mount(self.mounts[0]) + + # Create files,bring brick down and check heal + self._brick_down_heal() + + def tearDown(self): + """ + Cleanup volume + """ + ret = self.cleanup_volume() + if not ret: + raise ExecutionError("Failed to cleanup volume.") + + # Calling GlusterBaseClass tearDown + self.get_super_method(self, 'tearDown')() diff --git a/tests/functional/bvt/test_cvt.py b/tests/functional/bvt/test_cvt.py index dea251256..f8cb4f2ba 100644 --- a/tests/functional/bvt/test_cvt.py +++ b/tests/functional/bvt/test_cvt.py @@ -41,15 +41,13 @@ from glustolibs.gluster.volume_libs import ( from glustolibs.gluster.volume_libs import ( log_volume_info_and_status, expand_volume, shrink_volume, replace_brick_from_volume, wait_for_volume_process_to_be_online) -from glustolibs.gluster.glusterfile import get_fattr_list from glustolibs.gluster.rebalance_ops import (rebalance_start, wait_for_rebalance_to_complete, rebalance_status) from glustolibs.gluster.brick_libs import (select_bricks_to_bring_offline, bring_bricks_offline, bring_bricks_online, - are_bricks_offline, - get_all_bricks) + are_bricks_offline) from glustolibs.gluster.heal_libs import monitor_heal_completion from glustolibs.gluster.quota_ops import (quota_enable, quota_disable, quota_limit_usage, @@ -286,39 +284,9 @@ class TestGlusterShrinkVolumeSanity(GlusterBasicFeaturesSanityBaseClass): g.log.info("Successful in logging volume info and status of volume %s", self.volname) - # Temporary code: - # Additional checks to gather infomartion from all - # servers for Bug 1810901 and setting log level to debug. 
- if self.volume_type == 'distributed-dispersed': - for brick_path in get_all_bricks(self.mnode, self.volname): - node, path = brick_path.split(':') - ret, out, _ = g.run(node, 'find {}/'.format(path)) - g.log.info(out) - for filedir in out.split('\n'): - ret, out, _ = g.run(node, 'ls -l {}'.format(filedir)) - g.log.info("Return value for ls -l command: %s", ret) - g.log.info(out) - ret = get_fattr_list(node, filedir, encode_hex=True) - g.log.info(ret) - # Shrinking volume by removing bricks from volume when IO in progress ret = shrink_volume(self.mnode, self.volname) - # Temporary code: - # Additional checks to gather infomartion from all - # servers for Bug 1810901. - if not ret and self.volume_type == 'distributed-dispersed': - for brick_path in get_all_bricks(self.mnode, self.volname): - node, path = brick_path.split(':') - ret, out, _ = g.run(node, 'find {}/'.format(path)) - g.log.info(out) - for filedir in out.split('\n'): - ret, out, _ = g.run(node, 'ls -l {}'.format(filedir)) - g.log.info("Return value for ls -l command: %s", ret) - g.log.info(out) - ret = get_fattr_list(node, filedir, encode_hex=True) - g.log.info(ret) - self.assertTrue(ret, ("Failed to shrink the volume when IO in " "progress on volume %s", self.volname)) g.log.info("Shrinking volume when IO in progress is successful on " diff --git a/tests/functional/dht/test_rebalance_multiple_expansions.py b/tests/functional/dht/test_rebalance_multiple_expansions.py new file mode 100644 index 000000000..e96d88d56 --- /dev/null +++ b/tests/functional/dht/test_rebalance_multiple_expansions.py @@ -0,0 +1,100 @@ +# Copyright (C) 2021 Red Hat, Inc. <http://www.redhat.com> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along` +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +from glusto.core import Glusto as g +from glustolibs.gluster.gluster_base_class import GlusterBaseClass, runs_on +from glustolibs.gluster.exceptions import ExecutionError +from glustolibs.gluster.rebalance_ops import ( + rebalance_start, wait_for_rebalance_to_complete) +from glustolibs.gluster.volume_libs import expand_volume +from glustolibs.io.utils import collect_mounts_arequal + + +@runs_on([['distributed', 'distributed-replicated'], + ['glusterfs']]) +class TestRebalanceMultipleExpansions(GlusterBaseClass): + + def setUp(self): + + self.get_super_method(self, 'setUp')() + + # Setup Volume + if not self.setup_volume_and_mount_volume([self.mounts[0]]): + raise ExecutionError("Failed to Setup and mount volume") + + self.first_client = self.mounts[0].client_system + + def tearDown(self): + + # Unmount and clean volume + if not self.unmount_volume_and_cleanup_volume([self.mounts[0]]): + raise ExecutionError("Failed to Cleanup Volume") + + # Calling GlusterBaseClass tearDown + self.get_super_method(self, 'tearDown')() + + def test_rebalance_multiple_expansions(self): + """ + Test case: + 1. Create a volume, start it and mount it + 2. Create some file on mountpoint + 3. 
Collect arequal checksum on mount point pre-rebalance + 4. Do the following 3 times: + 5. Expand the volume + 6. Start rebalance and wait for it to finish + 7. Collect arequal checksum on mount point post-rebalance + and compare with value from step 3 + """ + + # Create some file on mountpoint + cmd = ("cd %s; for i in {1..500} ; do " + "dd if=/dev/urandom of=file$i bs=10M count=1; done" + % self.mounts[0].mountpoint) + ret, _, _ = g.run(self.first_client, cmd) + self.assertEqual(ret, 0, "IO failed on volume %s" + % self.volname) + + # Collect arequal checksum before rebalance + arequal_checksum_before = collect_mounts_arequal(self.mounts[0]) + + for _ in range(3): + # Add brick to volume + ret = expand_volume(self.mnode, self.volname, self.servers, + self.all_servers_info) + self.assertTrue(ret, "Failed to add brick on volume %s" + % self.volname) + + # Trigger rebalance and wait for it to complete + ret, _, _ = rebalance_start(self.mnode, self.volname, + force=True) + self.assertEqual(ret, 0, "Failed to start rebalance on " + "volume %s" % self.volname) + + # Wait for rebalance to complete + ret = wait_for_rebalance_to_complete(self.mnode, self.volname, + timeout=1200) + self.assertTrue(ret, "Rebalance is not yet complete on the volume " + "%s" % self.volname) + g.log.info("Rebalance successfully completed") + + # Collect arequal checksum after rebalance + arequal_checksum_after = collect_mounts_arequal(self.mounts[0]) + + # Check for data loss by comparing arequal before and after + # rebalance + self.assertEqual(arequal_checksum_before, arequal_checksum_after, + "arequal checksum is NOT MATCHNG") + g.log.info("arequal checksum is SAME") diff --git a/tests/functional/dht/test_rebalance_multiple_shrinks.py b/tests/functional/dht/test_rebalance_multiple_shrinks.py new file mode 100644 index 000000000..a95cdf141 --- /dev/null +++ b/tests/functional/dht/test_rebalance_multiple_shrinks.py @@ -0,0 +1,87 @@ +# Copyright (C) 2021 Red Hat, Inc. <http://www.redhat.com> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along` +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
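test_rebalance_multiple_expansions above relies on wait_for_rebalance_to_complete(); the same signal can also be read from the CLI status output. The sketch below is illustrative only and treats the presence of the word 'completed' as a loose success marker rather than a strict parse:

from glusto.core import Glusto as g


def rebalance_reported_complete(mnode, volname):
    """Return True if 'gluster volume rebalance <vol> status' reports completion."""
    cmd = "gluster volume rebalance {} status".format(volname)
    ret, out, _ = g.run(mnode, cmd)
    return ret == 0 and "completed" in out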
+ +from glusto.core import Glusto as g +from glustolibs.gluster.gluster_base_class import GlusterBaseClass, runs_on +from glustolibs.gluster.exceptions import ExecutionError +from glustolibs.gluster.volume_libs import shrink_volume +from glustolibs.io.utils import collect_mounts_arequal + + +@runs_on([['distributed'], ['glusterfs']]) +class TestRebalanceMultipleShrinks(GlusterBaseClass): + + def setUp(self): + + self.get_super_method(self, 'setUp')() + + # Changing dist_count to 6 + self.volume['voltype']['dist_count'] = 6 + + # Setup Volume + if not self.setup_volume_and_mount_volume([self.mounts[0]]): + raise ExecutionError("Failed to Setup and mount volume") + + self.first_client = self.mounts[0].client_system + + def tearDown(self): + + # Unmount and clean volume + if not self.unmount_volume_and_cleanup_volume([self.mounts[0]]): + raise ExecutionError("Failed to Cleanup Volume") + + # Calling GlusterBaseClass tearDown + self.get_super_method(self, 'tearDown')() + + def test_rebalance_multiple_shrinks(self): + """ + Test case: + 1. Modify the distribution count of a volume + 2. Create a volume, start it and mount it + 3. Create some file on mountpoint + 4. Collect arequal checksum on mount point pre-rebalance + 5. Do the following 3 times: + 6. Shrink the volume + 7. Collect arequal checksum on mount point post-rebalance + and compare with value from step 4 + """ + + # Create some file on mountpoint + cmd = ("cd %s; for i in {1..500} ; do " + "dd if=/dev/urandom of=file$i bs=10M count=1; done" + % self.mounts[0].mountpoint) + ret, _, _ = g.run(self.first_client, cmd) + self.assertEqual(ret, 0, "IO failed on volume %s" + % self.volname) + + # Collect arequal checksum before rebalance + arequal_checksum_before = collect_mounts_arequal(self.mounts[0]) + + for _ in range(3): + # Shrink volume + ret = shrink_volume(self.mnode, self.volname, + rebalance_timeout=16000) + self.assertTrue(ret, "Failed to remove-brick from volume") + g.log.info("Remove-brick rebalance successful") + + # Collect arequal checksum after rebalance + arequal_checksum_after = collect_mounts_arequal(self.mounts[0]) + + # Check for data loss by comparing arequal before and after + # rebalance + self.assertEqual(arequal_checksum_before, arequal_checksum_after, + "arequal checksum is NOT MATCHNG") + g.log.info("arequal checksum is SAME") diff --git a/tests/functional/dht/test_rebalance_nested_dir.py b/tests/functional/dht/test_rebalance_nested_dir.py new file mode 100644 index 000000000..77f099ad3 --- /dev/null +++ b/tests/functional/dht/test_rebalance_nested_dir.py @@ -0,0 +1,99 @@ +# Copyright (C) 2020 Red Hat, Inc. <http://www.redhat.com> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along` +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ +from glusto.core import Glusto as g +from glustolibs.gluster.gluster_base_class import GlusterBaseClass, runs_on +from glustolibs.gluster.exceptions import ExecutionError +from glustolibs.gluster.rebalance_ops import ( + rebalance_start, wait_for_rebalance_to_complete) +from glustolibs.gluster.volume_libs import expand_volume +from glustolibs.io.utils import collect_mounts_arequal + + +@runs_on([['distributed', 'distributed-replicated'], + ['glusterfs']]) +class TestRebalanceNestedDir(GlusterBaseClass): + + def setUp(self): + + self.get_super_method(self, 'setUp')() + + # Setup Volume + if not self.setup_volume_and_mount_volume([self.mounts[0]]): + raise ExecutionError("Failed to Setup and mount volume") + + self.first_client = self.mounts[0].client_system + + def tearDown(self): + + # Unmount and clean volume + if not self.unmount_volume_and_cleanup_volume([self.mounts[0]]): + raise ExecutionError("Failed to Cleanup Volume") + + # Calling GlusterBaseClass tearDown + self.get_super_method(self, 'tearDown')() + + def test_rebalance_nested_dir(self): + """ + Test case: + 1. Create a volume, start it and mount it + 2. On mount point, create a large nested dir structure with + files in the inner-most dir + 3. Collect arequal checksum on mount point pre-rebalance + 4. Expand the volume + 5. Start rebalance and wait for it to finish + 6. Collect arequal checksum on mount point post-rebalance + and compare wth value from step 3 + """ + + # create a large nested dir structure with files in the inner-most dir + cmd = ("cd %s; for i in {1..100} ; do mkdir $i; cd $i; done;" + "for j in {1..100} ; do " + "dd if=/dev/urandom of=file$j bs=10M count=1; done" + % self.mounts[0].mountpoint) + ret, _, _ = g.run(self.first_client, cmd) + self.assertEqual(ret, 0, "IO failed on volume %s" + % self.volname) + + # Collect arequal checksum before rebalance + arequal_checksum_before = collect_mounts_arequal(self.mounts[0]) + + # Add brick to volume + ret = expand_volume(self.mnode, self.volname, self.servers, + self.all_servers_info) + self.assertTrue(ret, "Failed to add brick on volume %s" + % self.volname) + + # Trigger rebalance and wait for it to complete + ret, _, _ = rebalance_start(self.mnode, self.volname, + force=True) + self.assertEqual(ret, 0, "Failed to start rebalance on the volume %s" + % self.volname) + + # Wait for rebalance to complete + ret = wait_for_rebalance_to_complete(self.mnode, self.volname, + timeout=1200) + self.assertTrue(ret, "Rebalance is not yet complete on the volume " + "%s" % self.volname) + g.log.info("Rebalance successfully completed") + + # Collect arequal checksum after rebalance + arequal_checksum_after = collect_mounts_arequal(self.mounts[0]) + + # Check for data loss by comparing arequal before and after rebalance + self.assertEqual(arequal_checksum_before, arequal_checksum_after, + "arequal checksum is NOT MATCHNG") + g.log.info("arequal checksum is SAME") diff --git a/tests/functional/dht/test_verify_permissions_on_root_dir_when_brick_down.py b/tests/functional/dht/test_verify_permissions_on_root_dir_when_brick_down.py new file mode 100644 index 000000000..f6228c122 --- /dev/null +++ b/tests/functional/dht/test_verify_permissions_on_root_dir_when_brick_down.py @@ -0,0 +1,134 @@ +# Copyright (C) 2021 Red Hat, Inc. 
<http://www.redhat.com> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +from glusto.core import Glusto as g + +from glustolibs.gluster.exceptions import ExecutionError +from glustolibs.gluster.gluster_base_class import GlusterBaseClass, runs_on +from glustolibs.gluster.glusterfile import set_file_permissions +from glustolibs.gluster.brick_libs import (get_all_bricks, + bring_bricks_offline, + bring_bricks_online) + + +@runs_on([['distributed', 'distributed-replicated', 'distributed-dispersed', + 'distributed-arbiter'], + ['glusterfs']]) +class TestVerifyPermissionChanges(GlusterBaseClass): + def setUp(self): + """ + Setup and mount volume + """ + self.get_super_method(self, 'setUp')() + + # Setup Volume + if not self.setup_volume_and_mount_volume(mounts=[self.mounts[0]]): + raise ExecutionError("Failed to Setup and Mount Volume") + + def _set_root_dir_permission(self, permission): + """ Sets the root dir permission to the given value""" + m_point = self.mounts[0].mountpoint + ret = set_file_permissions(self.clients[0], m_point, permission) + self.assertTrue(ret, "Failed to set root dir permissions") + + def _get_dir_permissions(self, host, directory): + """ Returns dir permissions""" + cmd = 'stat -c "%a" {}'.format(directory) + ret, out, _ = g.run(host, cmd) + self.assertEqual(ret, 0, "Failed to get permission on {}".format(host)) + return out.strip() + + def _get_root_dir_permission(self, expected=None): + """ Returns the root dir permission """ + permission = self._get_dir_permissions(self.mounts[0].client_system, + self.mounts[0].mountpoint) + if not expected: + return permission.strip() + self.assertEqual(permission, expected, "The permissions don't match") + return True + + def _bring_a_brick_offline(self): + """ Brings down a brick from the volume""" + brick_to_kill = get_all_bricks(self.mnode, self.volname)[-1] + ret = bring_bricks_offline(self.volname, brick_to_kill) + self.assertTrue(ret, "Failed to bring brick offline") + return brick_to_kill + + def _bring_back_brick_online(self, brick): + """ Brings the offline brick back online""" + ret = bring_bricks_online(self.mnode, self.volname, brick) + self.assertTrue(ret, "Failed to bring brick online") + + def _verify_mount_dir_and_brick_dir_permissions(self, expected, + down_brick=None): + """ Verifies the mount directory and brick dir permissions are the same""" + # Get root dir permission and verify + self._get_root_dir_permission(expected) + + # Verify brick dir permission + brick_list = get_all_bricks(self.mnode, self.volname) + for brick in brick_list: + brick_node, brick_path = brick.split(":") + if down_brick and down_brick.split(":")[-1] != brick_path: + actual_perm = self._get_dir_permissions(brick_node, + brick_path) + self.assertEqual(actual_perm, expected, + "The permissions are not the same") + + def test_verify_root_dir_permission_changes(self): + """ + 1.
create pure dist volume + 2. mount on client + 3. Checked default permission (should be 755) + 4. Change the permission to 444 and verify + 5. Kill a brick + 6. Change root permission to 755 + 7. Verify permission changes on all bricks, except down brick + 8. Bring back the brick and verify the changes are reflected + """ + + # Verify the default permission on root dir is 755 + self._verify_mount_dir_and_brick_dir_permissions("755") + + # Change root permission to 444 + self._set_root_dir_permission("444") + + # Verify the changes were successful + self._verify_mount_dir_and_brick_dir_permissions("444") + + # Kill a brick + offline_brick = self._bring_a_brick_offline() + + # Change root permission to 755 + self._set_root_dir_permission("755") + + # Verify the permission changed to 755 on mount and brick dirs + self._verify_mount_dir_and_brick_dir_permissions("755", offline_brick) + + # Bring brick online + self._bring_back_brick_online(offline_brick) + + # Verify the permission changed to 755 on mount and brick dirs + self._verify_mount_dir_and_brick_dir_permissions("755") + + def tearDown(self): + # Unmount and cleanup original volume + if not self.unmount_volume_and_cleanup_volume(mounts=[self.mounts[0]]): + raise ExecutionError("Failed to umount the vol & cleanup Volume") + g.log.info("Successful in umounting the volume and Cleanup") + + # Calling GlusterBaseClass tearDown + self.get_super_method(self, 'tearDown')() diff --git a/tests/functional/glusterd/test_glusterd_memory_consumption_increase.py b/tests/functional/glusterd/test_glusterd_memory_consumption_increase.py new file mode 100644 index 000000000..92c48da6f --- /dev/null +++ b/tests/functional/glusterd/test_glusterd_memory_consumption_increase.py @@ -0,0 +1,207 @@ +# Copyright (C) 2021 Red Hat, Inc. <http://www.redhat.com> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
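The permission checks in the test above boil down to comparing the octal mode reported by stat on the mount point and on each brick; a rough standalone sketch of that check (assuming glusto's g.run and passwordless SSH to the target host) is:

from glusto.core import Glusto as g

def dir_mode(host, path):
    # Return the octal permission string (e.g. '755') of a directory,
    # or None if the stat command fails on the remote host.
    ret, out, _ = g.run(host, 'stat -c "%a" {}'.format(path))
    return out.strip() if ret == 0 else None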
+ +""" Description: + Increase in glusterd memory consumption on repetetive operations + for 100 volumes +""" + +from glusto.core import Glusto as g +from glustolibs.gluster.exceptions import ExecutionError +from glustolibs.gluster.gluster_base_class import GlusterBaseClass +from glustolibs.gluster.volume_ops import (volume_stop, volume_delete, + get_volume_list, + volume_start) +from glustolibs.gluster.gluster_init import (restart_glusterd, + wait_for_glusterd_to_start) +from glustolibs.gluster.volume_libs import (bulk_volume_creation, + cleanup_volume) +from glustolibs.gluster.volume_ops import set_volume_options + + +class TestGlusterMemoryConsumptionIncrease(GlusterBaseClass): + def tearDown(self): + # Clean up all volumes + if self.volume_present: + vol_list = get_volume_list(self.mnode) + if vol_list is None: + raise ExecutionError("Failed to get the volume list") + + for volume in vol_list: + ret = cleanup_volume(self.mnode, volume) + if not ret: + raise ExecutionError("Unable to delete volume %s" % volume) + g.log.info("Volume deleted successfully : %s", volume) + + # Disable multiplex + ret = set_volume_options(self.mnode, 'all', + {'cluster.brick-multiplex': 'disable'}) + self.assertTrue(ret, "Failed to enable brick-multiplex" + " for the cluster") + + # Calling baseclass tearDown method + self.get_super_method(self, 'tearDown')() + + def _volume_operations_in_loop(self): + """ Create, start, stop and delete 100 volumes in a loop """ + # Create and start 100 volumes in a loop + self.volume_config = { + 'name': 'volume-', + 'servers': self.servers, + 'voltype': {'type': 'distributed-replicated', + 'dist_count': 2, + 'replica_count': 3}, + } + + ret = bulk_volume_creation(self.mnode, 100, self.all_servers_info, + self.volume_config, "", False, True) + self.assertTrue(ret, "Failed to create volumes") + + self.volume_present = True + + g.log.info("Successfully created all the volumes") + + # Start 100 volumes in loop + for i in range(100): + self.volname = "volume-%d" % i + ret, _, _ = volume_start(self.mnode, self.volname) + self.assertEqual(ret, 0, "Failed to start volume: %s" + % self.volname) + + g.log.info("Successfully started all the volumes") + + # Stop 100 volumes in loop + for i in range(100): + self.volname = "volume-%d" % i + ret, _, _ = volume_stop(self.mnode, self.volname) + self.assertEqual(ret, 0, "Failed to stop volume: %s" + % self.volname) + + g.log.info("Successfully stopped all the volumes") + + # Delete 100 volumes in loop + for i in range(100): + self.volname = "volume-%d" % i + ret = volume_delete(self.mnode, self.volname) + self.assertTrue(ret, "Failed to delete volume: %s" + % self.volname) + + self.volume_present = False + + g.log.info("Successfully deleted all the volumes") + + def _memory_consumption_for_all_nodes(self, pid_list): + """Fetch the memory consumption by glusterd process for + all the nodes + """ + memory_consumed_list = [] + for i, server in enumerate(self.servers): + # Get the memory consumption of glusterd in each node + cmd = "top -b -n 1 -p %d | awk 'FNR==8 {print $6}'" % pid_list[i] + ret, mem, _ = g.run(server, cmd) + self.assertEqual(ret, 0, "Failed to get the memory usage of" + " glusterd process") + mem = int(mem)//1024 + memory_consumed_list.append(mem) + + return memory_consumed_list + + def test_glusterd_memory_consumption_increase(self): + """ + Test Case: + 1) Enable brick-multiplex and set max-bricks-per-process to 3 in + the cluster + 2) Get the glusterd memory consumption + 3) Perform create,start,stop,delete operation for 
100 volumes + 4) Check glusterd memory consumption, it should not increase by + more than 50MB + 5) Repeat steps 3-4 for two more times + 6) Check glusterd memory consumption, it should not increase by + more than 10MB + """ + # pylint: disable=too-many-locals + # Restarting glusterd to refresh its memory consumption + ret = restart_glusterd(self.servers) + self.assertTrue(ret, "Restarting glusterd failed") + + # Check if glusterd is running post restart + ret = wait_for_glusterd_to_start(self.servers) + self.assertTrue(ret, "Glusterd service is not running post restart") + + # Enable brick-multiplex, set max-bricks-per-process to 3 in cluster + for key, value in (('cluster.brick-multiplex', 'enable'), + ('cluster.max-bricks-per-process', '3')): + ret = set_volume_options(self.mnode, 'all', {key: value}) + self.assertTrue(ret, "Failed to set {} to {} " + " for the cluster".format(key, value)) + + # Get the pidof of glusterd process + pid_list = [] + for server in self.servers: + # Get the pidof of glusterd process + cmd = "pidof glusterd" + ret, pid, _ = g.run(server, cmd) + self.assertEqual(ret, 0, "Failed to get the pid of glusterd") + pid = int(pid) + pid_list.append(pid) + + # Fetch the list of memory consumed in all the nodes + mem_consumed_list = self._memory_consumption_for_all_nodes(pid_list) + + # Perform volume operations for 100 volumes for first time + self._volume_operations_in_loop() + + # Fetch the list of memory consumed in all the nodes after 1 iteration + mem_consumed_list_1 = self._memory_consumption_for_all_nodes(pid_list) + + for i, mem in enumerate(mem_consumed_list_1): + condition_met = False + if mem - mem_consumed_list[i] <= 50: + condition_met = True + + self.assertTrue(condition_met, "Unexpected: Memory consumption of" + " glusterd increased more than the" + " expected value") + + # Perform volume operations for 100 volumes for second time + self._volume_operations_in_loop() + + # Fetch the list of memory consumed in all the nodes after 2 iterations + mem_consumed_list_2 = self._memory_consumption_for_all_nodes(pid_list) + + for i, mem in enumerate(mem_consumed_list_2): + condition_met = False + if mem - mem_consumed_list_1[i] <= 10: + condition_met = True + + self.assertTrue(condition_met, "Unexpected: Memory consumption of" + " glusterd increased more than the" + " expected value") + + # Perform volume operations for 100 volumes for third time + self._volume_operations_in_loop() + + # Fetch the list of memory consumed in all the nodes after 3 iterations + mem_consumed_list_3 = self._memory_consumption_for_all_nodes(pid_list) + + for i, mem in enumerate(mem_consumed_list_3): + condition_met = False + if mem - mem_consumed_list_2[i] <= 10: + condition_met = True + + self.assertTrue(condition_met, "Unexpected: Memory consumption of" + " glusterd increased more than the" + " expected value") diff --git a/tests/functional/glusterd/test_probe_glusterd_down.py b/tests/functional/glusterd/test_probe_glusterd_down.py index 3705904a9..c851bf104 100644 --- a/tests/functional/glusterd/test_probe_glusterd_down.py +++ b/tests/functional/glusterd/test_probe_glusterd_down.py @@ -1,4 +1,4 @@ -# Copyright (C) 2020 Red Hat, Inc. <http://www.redhat.com> +# Copyright (C) 2020-2021 Red Hat, Inc.
<http://www.redhat.com> # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -14,17 +14,14 @@ # with this program; if not, write to the Free Software Foundation, Inc., # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. -from time import sleep - from glusto.core import Glusto as g from glustolibs.gluster.gluster_base_class import GlusterBaseClass from glustolibs.gluster.exceptions import ExecutionError from glustolibs.gluster.peer_ops import peer_probe from glustolibs.gluster.lib_utils import is_core_file_created from glustolibs.gluster.peer_ops import peer_detach, is_peer_connected -from glustolibs.gluster.gluster_init import (stop_glusterd, start_glusterd, - wait_for_glusterd_to_start) -from glustolibs.misc.misc_libs import are_nodes_online +from glustolibs.gluster.gluster_init import stop_glusterd, start_glusterd +from glustolibs.misc.misc_libs import bring_down_network_interface class PeerProbeWhenGlusterdDown(GlusterBaseClass): @@ -57,7 +54,7 @@ class PeerProbeWhenGlusterdDown(GlusterBaseClass): ret, test_timestamp, _ = g.run_local('date +%s') test_timestamp = test_timestamp.strip() - # detach one of the nodes which is part of the cluster + # Detach one of the nodes which is part of the cluster g.log.info("detaching server %s ", self.servers[1]) ret, _, err = peer_detach(self.mnode, self.servers[1]) msg = 'peer detach: failed: %s is not part of cluster\n' \ @@ -66,12 +63,12 @@ class PeerProbeWhenGlusterdDown(GlusterBaseClass): self.assertEqual(err, msg, "Failed to detach %s " % (self.servers[1])) - # bring down glusterd of the server which has been detached + # Bring down glusterd of the server which has been detached g.log.info("Stopping glusterd on %s ", self.servers[1]) ret = stop_glusterd(self.servers[1]) self.assertTrue(ret, "Fail to stop glusterd on %s " % self.servers[1]) - # trying to peer probe the node whose glusterd was stopped using its IP + # Trying to peer probe the node whose glusterd was stopped using IP g.log.info("Peer probing %s when glusterd down ", self.servers[1]) ret, _, err = peer_probe(self.mnode, self.servers[1]) self.assertNotEqual(ret, 0, "Peer probe should not pass when " @@ -79,7 +76,7 @@ class PeerProbeWhenGlusterdDown(GlusterBaseClass): self.assertEqual(err, "peer probe: failed: Probe returned with " "Transport endpoint is not connected\n") - # trying to peer probe the same node with hostname + # Trying to peer probe the same node with hostname g.log.info("Peer probing node %s using hostname with glusterd down ", self.servers[1]) hostname = g.run(self.servers[1], "hostname") @@ -89,27 +86,24 @@ class PeerProbeWhenGlusterdDown(GlusterBaseClass): self.assertEqual(err, "peer probe: failed: Probe returned with" " Transport endpoint is not connected\n") - # start glusterd again for the next set of test steps + # Start glusterd again for the next set of test steps g.log.info("starting glusterd on %s ", self.servers[1]) ret = start_glusterd(self.servers[1]) self.assertTrue(ret, "glusterd couldn't start successfully on %s" % self.servers[1]) - # reboot a server and then trying to peer probe at the time of reboot - g.log.info("Rebooting %s and checking peer probe", self.servers[1]) - reboot = g.run_async(self.servers[1], "reboot") - - # Mandatory sleep for 3 seconds to make sure node is in halted state - sleep(3) + # Bring down the network for sometime + network_status = bring_down_network_interface(self.servers[1], 150) # Peer probing the node using IP when 
it is still not online - g.log.info("Peer probing node %s which has been issued a reboot ", + g.log.info("Peer probing node %s when network is down", self.servers[1]) ret, _, err = peer_probe(self.mnode, self.servers[1]) self.assertNotEqual(ret, 0, "Peer probe passed when it was expected to" " fail") - self.assertEqual(err, "peer probe: failed: Probe returned with " - "Transport endpoint is not connected\n") + self.assertEqual(err.split("\n")[0], "peer probe: failed: Probe " + "returned with Transport endpoint" + " is not connected") # Peer probing the node using hostname when it is still not online g.log.info("Peer probing node %s using hostname which is still " @@ -118,35 +112,21 @@ class PeerProbeWhenGlusterdDown(GlusterBaseClass): ret, _, err = peer_probe(self.mnode, hostname[1].strip()) self.assertNotEqual(ret, 0, "Peer probe should not pass when node " "has not come online") - self.assertEqual(err, "peer probe: failed: Probe returned with " - "Transport endpoint is not connected\n") + self.assertEqual(err.split("\n")[0], "peer probe: failed: Probe " + "returned with Transport endpoint" + " is not connected") + + ret, _, _ = network_status.async_communicate() + if ret != 0: + g.log.error("Failed to perform network interface ops") - ret, _, _ = reboot.async_communicate() - self.assertEqual(ret, 255, "reboot failed") - - # Validate if rebooted node is online or not - count = 0 - while count < 40: - sleep(15) - ret, _ = are_nodes_online(self.servers[1]) - if ret: - g.log.info("Node %s is online", self.servers[1]) - break - count += 1 - self.assertTrue(ret, "Node in test not yet online") - - # check if glusterd is running post reboot - ret = wait_for_glusterd_to_start(self.servers[1], - glusterd_start_wait_timeout=120) - self.assertTrue(ret, "Glusterd service is not running post reboot") - - # peer probe the node must pass + # Peer probe the node must pass g.log.info("peer probing node %s", self.servers[1]) ret, _, err = peer_probe(self.mnode, self.servers[1]) self.assertEqual(ret, 0, "Peer probe has failed unexpectedly with " "%s " % err) - # checking if core file created in "/", "/tmp" and "/var/log/core" + # Checking if core file created in "/", "/tmp" and "/var/log/core" ret = is_core_file_created(self.servers, test_timestamp) self.assertTrue(ret, "core file found") diff --git a/tests/functional/glusterd/test_verify_df_output.py b/tests/functional/glusterd/test_verify_df_output.py new file mode 100644 index 000000000..4eac9193b --- /dev/null +++ b/tests/functional/glusterd/test_verify_df_output.py @@ -0,0 +1,171 @@ +# Copyright (C) 2021 Red Hat, Inc. <http://www.redhat.com> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
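The reworked probe test above swaps the reboot for a temporary network outage; a condensed sketch of that flow (assuming, as the diff implies, that bring_down_network_interface returns an async handle with async_communicate()) is:

from glustolibs.gluster.peer_ops import peer_probe
from glustolibs.misc.misc_libs import bring_down_network_interface

def probe_fails_while_interface_down(mnode, peer, timeout=150):
    # Take the peer's interface down for 'timeout' seconds in the background.
    proc = bring_down_network_interface(peer, timeout)
    ret, _, err = peer_probe(mnode, peer)
    # The probe is expected to fail while the peer's interface is down.
    failed = ret != 0 and err.split("\n")[0].startswith("peer probe: failed")
    proc.async_communicate()  # wait for the interface to come back up
    return failed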
+ + +from glusto.core import Glusto as g +from glustolibs.gluster.gluster_base_class import (GlusterBaseClass, + runs_on) +from glustolibs.gluster.heal_libs import monitor_heal_completion +from glustolibs.io.utils import validate_io_procs +from glustolibs.misc.misc_libs import upload_scripts +from glustolibs.gluster.exceptions import ExecutionError +from glustolibs.gluster.volume_libs import (replace_brick_from_volume, + shrink_volume, expand_volume) +from glustolibs.gluster.brick_libs import get_all_bricks + + +@runs_on([['distributed-dispersed', 'distributed-replicated', + 'distributed-arbiter', 'dispersed', 'replicated', + 'arbiter'], + ['glusterfs']]) +class VerifyDFWithReplaceBrick(GlusterBaseClass): + + @classmethod + def setUpClass(cls): + # Calling GlusterBaseClass setUpClass + cls.get_super_method(cls, 'setUpClass')() + + # Upload io scripts for running IO on mounts + cls.script_upload_path = ("/usr/share/glustolibs/io/scripts/" + "file_dir_ops.py") + if not upload_scripts(cls.clients, [cls.script_upload_path]): + raise ExecutionError("Failed to upload IO scripts to clients %s" + % cls.clients) + g.log.info("Successfully uploaded IO scripts to clients %s", + cls.clients) + + def setUp(self): + # Calling GlusterBaseClass setUp + self.get_super_method(self, 'setUp')() + + # Setup Volume and Mount Volume + if not self.setup_volume_and_mount_volume(mounts=self.mounts): + raise ExecutionError("Failed to Setup_Volume and Mount_Volume") + g.log.info("Successful in Setup Volume and Mount Volume") + + def _perform_io_and_validate(self): + """ Performs IO on the mount points and validates it""" + all_mounts_procs, count = [], 1 + for mount_obj in self.mounts: + cmd = ("/usr/bin/env python %s create_deep_dirs_with_files " + "--dirname-start-num %d --dir-depth 2 " + "--dir-length 3 --max-num-of-dirs 3 " + "--num-of-files 2 %s" % ( + self.script_upload_path, count, + mount_obj.mountpoint)) + proc = g.run_async(mount_obj.client_system, cmd, + user=mount_obj.user) + all_mounts_procs.append(proc) + count = count + 10 + + # Validating IO's on mount point and waiting to complete + ret = validate_io_procs(all_mounts_procs, self.mounts) + self.assertTrue(ret, "IO failed on some of the clients") + g.log.info("Successfully validated IO's") + + def _replace_bricks_and_wait_for_heal_completion(self): + """ Replaces all the bricks and waits for the heal to complete""" + existing_bricks = get_all_bricks(self.mnode, self.volname) + for brick_to_replace in existing_bricks: + ret = replace_brick_from_volume(self.mnode, self.volname, + self.servers, + self.all_servers_info, + src_brick=brick_to_replace) + self.assertTrue(ret, + "Replace of %s failed" % brick_to_replace) + g.log.info("Replace of brick %s successful for volume %s", + brick_to_replace, self.volname) + + # Monitor heal completion + ret = monitor_heal_completion(self.mnode, self.volname) + self.assertTrue(ret, 'Heal has not yet completed') + g.log.info('Heal has completed successfully') + + def _get_mount_size_from_df_h_output(self): + """ Extracts the mount size from the df -h output""" + + split_cmd = " | awk '{split($0,a,\" \");print a[2]}' | sed 's/.$//'" + cmd = ("cd {};df -h | grep {} {}".format(self.mounts[0].mountpoint, + self.volname, split_cmd)) + ret, mount_size, _ = g.run(self.clients[0], cmd) + self.assertEqual(ret, 0, "Failed to extract mount size") + return float(mount_size.split("\n")[0]) + + def test_verify_df_output_when_brick_replaced(self): + """ + - Take the output of df -h. + - Replace any one brick for the volumes. 
+ - Wait till the heal is completed + - Repeat steps 1, 2 and 3 for all bricks for all volumes. + - Check if there are any inconsistencies in the output of df -h + - Remove bricks from volume and check output of df -h + - Add bricks to volume and check output of df -h + """ + + # Perform some IO on the mount point + self._perform_io_and_validate() + + # Get the mount size from df -h output + initial_mount_size = self._get_mount_size_from_df_h_output() + + # Replace all the bricks and wait till the heal completes + self._replace_bricks_and_wait_for_heal_completion() + + # Get df -h output after brick replace + mount_size_after_replace = self._get_mount_size_from_df_h_output() + + # Verify the mount point size remains the same after brick replace + self.assertEqual(initial_mount_size, mount_size_after_replace, + "The mount sizes before and after replace bricks " + "are not same") + + # Add bricks + ret = expand_volume(self.mnode, self.volname, self.servers, + self.all_servers_info, force=True) + self.assertTrue(ret, "Failed to add-brick to volume") + + # Get df -h output after volume expand + mount_size_after_expand = self._get_mount_size_from_df_h_output() + + # Verify df -h output returns greater value + self.assertGreater(mount_size_after_expand, initial_mount_size, + "The mount size has not increased after expanding") + + # Remove bricks + ret = shrink_volume(self.mnode, self.volname, force=True) + self.assertTrue(ret, ("Remove brick operation failed on " + "%s", self.volname)) + g.log.info("Remove brick operation is successful on " + "volume %s", self.volname) + + # Get df -h output after volume shrink + mount_size_after_shrink = self._get_mount_size_from_df_h_output() + + # Verify the df -h output returns smaller value + self.assertGreater(mount_size_after_expand, mount_size_after_shrink, + "The mount size has not reduced after shrinking") + + def tearDown(self): + """ + Cleanup and umount volume + """ + # Cleanup and umount volume + if not self.unmount_volume_and_cleanup_volume(mounts=self.mounts): + raise ExecutionError("Failed to umount the vol & cleanup Volume") + g.log.info("Successful in umounting the volume and Cleanup") + + # Calling GlusterBaseClass teardown + self.get_super_method(self, 'tearDown')() diff --git a/tests/functional/glusterfind/test_glusterfind_when_brick_down.py b/tests/functional/glusterfind/test_glusterfind_when_brick_down.py new file mode 100644 index 000000000..de1ebaf23 --- /dev/null +++ b/tests/functional/glusterfind/test_glusterfind_when_brick_down.py @@ -0,0 +1,219 @@ +# Copyright (C) 2020 Red Hat, Inc. <http://www.redhat.com> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY :or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
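The df-based size check above shells out to an awk/sed pipeline; a hypothetical pure-Python alternative (the command flags and parsing here are assumptions, not what the test uses) could read the size column directly:

from glusto.core import Glusto as g

def mount_size(client, mountpoint):
    # 'df -h --output=size' prints a header line followed by the size,
    # e.g. ' 30G'; strip the unit suffix and return the number as a float.
    ret, out, _ = g.run(client, "df -h --output=size {}".format(mountpoint))
    if ret != 0:
        return None
    return float(out.splitlines()[1].strip().rstrip("KMGT"))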
+ +""" +Description: + Test Glusterfind when brick is down +""" + +from random import choice +from time import sleep +from glusto.core import Glusto as g +from glustolibs.gluster.exceptions import ExecutionError +from glustolibs.gluster.gluster_base_class import GlusterBaseClass, runs_on +from glustolibs.gluster.volume_ops import set_volume_options +from glustolibs.gluster.peer_ops import wait_for_peers_to_connect +from glustolibs.gluster.lib_utils import list_files +from glustolibs.gluster.volume_libs import volume_start +from glustolibs.gluster.glusterfile import ( + file_exists, + remove_file, + check_if_pattern_in_file) +from glustolibs.gluster.glusterfind_ops import ( + gfind_create, + gfind_list, + gfind_pre, + gfind_post, + gfind_delete) +from glustolibs.gluster.brick_libs import ( + get_all_bricks, + bring_bricks_offline) + + +@runs_on([["replicated", "distributed-replicated", "dispersed", + "distributed", "distributed-dispersed"], + ["glusterfs"]]) +class TestGlusterFindBrickDown(GlusterBaseClass): + """ + Test glusterfind operation when a brick is down. + """ + + def setUp(self): + """ + setup volume and mount volume + Initiate necessary variables + """ + + # calling GlusterBaseClass setUp + self.get_super_method(self, 'setUp')() + + self.file_limit = 0 + + # Setup Volume and Mount Volume + ret = self.setup_volume_and_mount_volume(mounts=self.mounts) + if not ret: + raise ExecutionError("Failed to Setup_Volume %s" % self.volname) + g.log.info("Successful in Setup Volume %s", self.volname) + self.session = "test-session-%s" % self.volname + self.outfiles = [("/tmp/test-outfile-%s-%s.txt" + % (self.volname, i))for i in range(0, 2)] + + # Set the changelog rollover-time to 1 second + # This needs to be done in order for glusterfind to keep checking + # for changes in the mount point + option = {'changelog.rollover-time': '1'} + ret = set_volume_options(self.mnode, self.volname, option) + if not ret: + raise ExecutionError("Failed to set the volume option %s for %s" + % (option, self.volname)) + g.log.info("Successfully set the volume option for the volume %s", + self.volname) + + def _perform_io_and_validate_presence_of_files(self): + """ + Function to perform the IO and validate the presence of files. 
+ """ + self.file_limit += 10 + # Starting IO on the mounts + cmd = ("cd %s ; touch file{%d..%d}" % (self.mounts[0].mountpoint, + self.file_limit-10, + self.file_limit)) + + ret, _, _ = g.run(self.mounts[0].client_system, cmd) + self.assertEqual(ret, 0, "Failed to create files on mountpoint") + g.log.info("Files created successfully on mountpoint") + + # Gather the list of files from the mount point + files = list_files(self.mounts[0].client_system, + self.mounts[0].mountpoint) + self.assertIsNotNone(files, "Failed to get the list of files") + g.log.info("Successfully gathered the list of files from mount point") + + # Check if the files exist + for filename in files: + ret = file_exists(self.mounts[0].client_system, filename) + self.assertTrue(ret, ("Unexpected: File '%s' does not exist" + % filename)) + g.log.info("Successfully validated existence of '%s'", filename) + + def _perform_glusterfind_pre_and_validate_outfile(self): + """ + Function to perform glusterfind pre and validate outfile + """ + # Perform glusterfind pre for the session + ret, _, _ = gfind_pre(self.mnode, self.volname, self.session, + self.outfiles[0], full=True, noencode=True, + debug=True) + self.assertEqual(ret, 0, ("Failed to perform glusterfind pre")) + g.log.info("Successfully performed glusterfind pre") + + # Check if the outfile exists + ret = file_exists(self.mnode, self.outfiles[0]) + self.assertTrue(ret, ("Unexpected: File '%s' does not exist" + % self.outfiles[0])) + g.log.info("Successfully validated existence of '%s'", + self.outfiles[0]) + + # Check if all the files are listed in the outfile + for i in range(1, self.file_limit+1): + ret = check_if_pattern_in_file(self.mnode, "file%s" % i, + self.outfiles[0]) + self.assertEqual(ret, 0, ("File 'file%s' not listed in %s" + % (i, self.outfiles[0]))) + g.log.info("File 'file%s' listed in %s", i, self.outfiles[0]) + + def test_gfind_when_brick_down(self): + """ + Verifying the glusterfind functionality when a brick is down. + + 1. Create a volume + 2. Create a session on the volume + 3. Create various files from mount point + 4. Bring down brick process on one of the node + 5. Perform glusterfind pre + 6. Perform glusterfind post + 7. Check the contents of outfile + """ + + # pylint: disable=too-many-statements + # Create a session for the volume + ret, _, _ = gfind_create(self.mnode, self.volname, self.session) + self.assertEqual(ret, 0, ("Unexpected: Creation of a session for the " + "volume %s failed" % self.volname)) + g.log.info("Successfully created a session for the volume %s", + self.volname) + + # Perform glusterfind list to check if session exists + _, out, _ = gfind_list(self.mnode, volname=self.volname, + sessname=self.session) + self.assertNotEqual(out, "No sessions found.", + "Failed to list the glusterfind session") + g.log.info("Successfully listed the glusterfind session") + + self._perform_io_and_validate_presence_of_files() + + # Wait for changelog to get updated + sleep(2) + + # Bring one of the brick down. 
+ brick_list = get_all_bricks(self.mnode, self.volname) + ret = bring_bricks_offline(self.volname, choice(brick_list)) + self.assertTrue(ret, "Failed to bring down the brick.") + g.log.info("Successfully brought down one brick.") + + self._perform_glusterfind_pre_and_validate_outfile() + + # Perform glusterfind post for the session + ret, _, _ = gfind_post(self.mnode, self.volname, self.session) + self.assertEqual(ret, 0, ("Failed to perform glusterfind post")) + g.log.info("Successfully performed glusterfind post") + + # Bring the brick process up. + ret = volume_start(self.mnode, self.volname, force=True) + self.assertTrue(ret, "Failed to start the volume.") + g.log.info("Successfully started the volume.") + + def tearDown(self): + """ + tearDown for every test + Clean up and unmount the volume + """ + # calling GlusterBaseClass tearDown + self.get_super_method(self, 'tearDown')() + + # Delete the glusterfind sessions + ret, _, _ = gfind_delete(self.mnode, self.volname, self.session) + if ret: + raise ExecutionError("Failed to delete session %s" % self.session) + g.log.info("Successfully deleted session %s", self.session) + + # Remove the outfiles created during 'glusterfind pre' + for out in self.outfiles: + ret = remove_file(self.mnode, out, force=True) + if not ret: + raise ExecutionError("Failed to remove the outfile %s" % out) + g.log.info("Successfully removed the outfiles") + + # Wait for the peers to be connected. + ret = wait_for_peers_to_connect(self.mnode, self.servers, 100) + if not ret: + raise ExecutionError("Peers are not in connected state.") + + # Cleanup the volume + ret = self.unmount_volume_and_cleanup_volume(mounts=self.mounts) + if not ret: + raise ExecutionError("Failed to Cleanup Volume") + g.log.info("Successful in Cleanup Volume") diff --git a/tests/functional/resource_leak/test_memory_leaks_with_files_delete.py b/tests/functional/resource_leak/test_memory_leaks_with_files_delete.py new file mode 100644 index 000000000..ab29fdbe7 --- /dev/null +++ b/tests/functional/resource_leak/test_memory_leaks_with_files_delete.py @@ -0,0 +1,113 @@ +# Copyright (C) 2020 Red Hat, Inc. <http://www.redhat.com> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
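The glusterfind test above picks and kills a random brick inline; the same step, pulled into a small helper sketch built on the glustolibs calls it already imports (the helper name is an assumption), is:

from random import choice
from glustolibs.gluster.brick_libs import get_all_bricks, bring_bricks_offline

def bring_random_brick_offline(mnode, volname):
    # Pick one brick of the volume at random and kill its brick process.
    bricks = get_all_bricks(mnode, volname)
    if not bricks:
        return None
    victim = choice(bricks)
    return victim if bring_bricks_offline(volname, victim) else None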
+ +from glusto.core import Glusto as g +from glustolibs.gluster.gluster_base_class import GlusterBaseClass, runs_on +from glustolibs.gluster.exceptions import ExecutionError +from glustolibs.gluster.glusterdir import get_dir_contents +from glustolibs.io.memory_and_cpu_utils import ( + wait_for_logging_processes_to_stop) +from glustolibs.gluster.brick_libs import get_all_bricks + + +@runs_on([['arbiter', 'distributed-arbiter'], ['glusterfs']]) +class TestMemoryLeakWithRm(GlusterBaseClass): + + def setUp(self): + + self.get_super_method(self, 'setUp')() + + # Set test_id for get gathering + self.test_id = self.id() + + # Set I/O flag to false + self.is_io_running = False + + # Creating Volume and mounting the volume + ret = self.setup_volume_and_mount_volume(self.mounts) + if not ret: + raise ExecutionError("Volume creation or mount failed: %s" + % self.volname) + + def tearDown(self): + + # Unmounting and cleaning volume + ret = self.unmount_volume_and_cleanup_volume(self.mounts) + if not ret: + raise ExecutionError("Unable to delete volume %s" % self.volname) + + self.get_super_method(self, 'tearDown')() + + def test_memory_leak_with_rm(self): + """ + Test case: + 1. Create a volume, start it and mount it. + 2. Create 10,000 files each of size 200K + 3. Delete the files created at step 2 + 4. Check if the files are deleted from backend + 5. Check if there are any memory leaks and OOM killers. + """ + # Start monitoring resource usage on servers and clients + monitor_proc_dict = self.start_memory_and_cpu_usage_logging( + self.test_id, count=30) + self.assertIsNotNone(monitor_proc_dict, + "Failed to start monitoring on servers and " + "clients") + # Create files on mount point + cmd = ('cd %s;for i in {1..10000};' + 'do dd if=/dev/urandom bs=200K count=1 of=file$i;done;' + 'rm -rf %s/file*' + % (self.mounts[0].mountpoint, self.mounts[0].mountpoint)) + ret, _, _ = g.run(self.mounts[0].client_system, cmd) + self.assertEqual(ret, 0, "Failed to create and delete files on" + " mountpoint") + g.log.info("Successfully created and removed files on mountpoint") + + # Delete files from mount point and check if all files + # are deleted or not from mount point as well as backend bricks. 
+ ret, _, _ = g.run(self.clients[0], + "rm -rf {}/*".format(self.mounts[0].mountpoint)) + self.assertFalse(ret, "rm -rf * failed on mount point") + + ret = get_dir_contents(self.clients[0], + "{}/".format(self.mounts[0].mountpoint)) + self.assertEqual(ret, [], "Unexpected: Files and directories still " + "seen from mount point") + + for brick in get_all_bricks(self.mnode, self.volname): + node, brick_path = brick.split(":") + ret = get_dir_contents(node, "{}/".format(brick_path)) + self.assertEqual(ret, [], "Unexpected: Files and dirs still seen " + "on brick %s on node %s" % (brick_path, node)) + g.log.info("rm -rf * on mount point successful") + + # Wait for monitoring processes to complete + ret = wait_for_logging_processes_to_stop(monitor_proc_dict, + cluster=True) + self.assertTrue(ret, + "ERROR: Failed to stop monitoring processes") + + # Check if there are any memory leaks and OOM killers + ret = self.check_for_memory_leaks_and_oom_kills_on_servers( + self.test_id) + self.assertFalse(ret, + "Memory leak and OOM kills check failed on servers") + + ret = self.check_for_memory_leaks_and_oom_kills_on_clients( + self.test_id) + self.assertFalse(ret, + "Memory leak and OOM kills check failed on clients") + g.log.info("No memory leaks or OOM kills found on serves and clients") diff --git a/tests/functional/resource_leak/test_verify_gluster_memleak_with_management_encryption.py b/tests/functional/resource_leak/test_verify_gluster_memleak_with_management_encryption.py new file mode 100644 index 000000000..25f8325df --- /dev/null +++ b/tests/functional/resource_leak/test_verify_gluster_memleak_with_management_encryption.py @@ -0,0 +1,231 @@ +# Copyright (C) 2021 Red Hat, Inc. <http://www.redhat.com> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
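The rm -rf test above verifies the delete reached the backend by listing every brick root; the same check, as a reusable sketch over the helpers the test imports (it mirrors the test's assumption that an empty brick root lists no entries), is:

from glustolibs.gluster.brick_libs import get_all_bricks
from glustolibs.gluster.glusterdir import get_dir_contents

def bricks_are_empty(mnode, volname):
    # True only if no user-visible entries remain on any brick of the volume.
    for brick in get_all_bricks(mnode, volname):
        node, path = brick.split(":")
        if get_dir_contents(node, "{}/".format(path)):
            return False
    return True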
+ + +from datetime import datetime, timedelta +from glusto.core import Glusto as g +from glustolibs.gluster.gluster_base_class import GlusterBaseClass, runs_on +from glustolibs.gluster.lib_utils import get_usable_size_per_disk +from glustolibs.gluster.volume_libs import (get_subvols, bulk_volume_creation, + volume_stop, volume_start, + set_volume_options) +from glustolibs.io.memory_and_cpu_utils import ( + wait_for_logging_processes_to_stop) +from glustolibs.gluster.brick_libs import get_all_bricks +from glustolibs.gluster.exceptions import ExecutionError +from glustolibs.io.utils import validate_io_procs +from glustolibs.gluster.brickmux_ops import (enable_brick_mux, + disable_brick_mux, + is_brick_mux_enabled) +from glustolibs.gluster.mount_ops import mount_volume, umount_volume + + +@runs_on([['distributed-replicated'], ['glusterfs']]) +class TestMemLeakAfterMgmntEncrypEnabled(GlusterBaseClass): + + def setUp(self): + """ + Setup and mount volume or raise ExecutionError + """ + self.get_super_method(self, 'setUp')() + self.test_id = self.id() + # Setup Volume + self.volume['dist_count'] = 2 + self.volume['replica_count'] = 3 + + ret = self.setup_volume_and_mount_volume([self.mounts[0]]) + if not ret: + raise ExecutionError("Failed to Setup and Mount Volume") + + # Disable I/O encryption + self._disable_io_encryption() + + def tearDown(self): + # Disable brick_mux + if is_brick_mux_enabled(self.mnode): + ret = disable_brick_mux(self.mnode) + self.assertTrue(ret, "Failed to disable brick multiplex") + g.log.info("Disabled brick multiplex") + + # Unmount and cleanup original volume + ret = self.unmount_volume_and_cleanup_volume(mounts=[self.mounts[0]]) + if not ret: + raise ExecutionError("Failed to umount the vol & cleanup Volume") + g.log.info("Successful in umounting the volume and Cleanup") + + # Calling GlusterBaseClass tearDown + self.get_super_method(self, 'tearDown')() + + def _run_io(self): + """ Run IO and fill the volume up to ~88%""" + bricks = get_all_bricks(self.mnode, self.volname) + usable_size = int(get_usable_size_per_disk(bricks[0]) * 0.88) + + self.procs = [] + counter = 1 + for _ in get_subvols(self.mnode, self.volname)['volume_subvols']: + filename = "{}/test_file_{}".format(self.mounts[0].mountpoint, + str(counter)) + proc = g.run_async(self.mounts[0].client_system, + "fallocate -l {}G {}".format(usable_size, + filename)) + self.procs.append(proc) + counter += 1 + + def _perform_gluster_v_heal_for_12_hrs(self): + """ Run 'gluster v heal info' for 12 hours""" + # Perform gluster heal info for 12 hours + end_time = datetime.now() + timedelta(hours=12) + while True: + curr_time = datetime.now() + cmd = "gluster volume heal %s info" % self.volname + ret, _, _ = g.run(self.mnode, cmd) + self.assertEqual(ret, 0, "Failed to execute heal info cmd") + if curr_time > end_time: + g.log.info("Successfully ran for 12 hours.
Checking for " + "memory leaks") + break + + def _verify_memory_leak(self): + """ Verify memory leak is found """ + + ret = self.check_for_memory_leaks_and_oom_kills_on_servers( + self.test_id) + self.assertFalse(ret, + "Memory leak and OOM kills check failed on servers") + + ret = self.check_for_memory_leaks_and_oom_kills_on_clients( + self.test_id) + self.assertFalse(ret, + "Memory leak and OOM kills check failed on clients") + + def _disable_io_encryption(self): + """ Disables IO encryption """ + # UnMount Volume + g.log.info("Starting to Unmount Volume %s", self.volname) + ret, _, _ = umount_volume(self.mounts[0].client_system, + self.mounts[0].mountpoint, + mtype=self.mount_type) + self.assertEqual(ret, 0, "Failed to Unmount volume") + + # Stop Volume + ret, _, _ = volume_stop(self.mnode, self.volname) + self.assertEqual(ret, 0, "Failed to Stop volume") + + # Disable server and client SSL usage + options = {"server.ssl": "off", + "client.ssl": "off"} + ret = set_volume_options(self.mnode, self.volname, options) + self.assertTrue(ret, "Failed to set volume options") + + # Start Volume + ret, _, _ = volume_start(self.mnode, self.volname) + self.assertEqual(ret, 0, "Failed to Start volume") + + # Mount Volume + ret, _, _ = mount_volume(self.volname, mtype=self.mount_type, + mpoint=self.mounts[0].mountpoint, + mserver=self.mnode, + mclient=self.mounts[0].client_system) + self.assertEqual(ret, 0, "Failed to mount the volume back") + + def test_mem_leak_on_gluster_procs_with_management_encrpytion(self): + """ + Steps: + 1) Enable management encryption on the cluster. + 2) Create a 2X3 volume. + 3) Mount the volume using FUSE on a client node. + 4) Start doing IO on the mount (ran IO till the volume is ~88% full) + 5) Simultaneously start collecting the memory usage for + 'glusterfsd' process. + 6) Issue the command "# gluster v heal <volname> info" continuously + in a loop. 
+ # Run IO + self._run_io() + + # Start monitoring resource usage on servers and clients + # default interval = 60 sec, count = 780 (60 * 12) => for 12 hrs + monitor_proc_dict = self.start_memory_and_cpu_usage_logging( + self.test_id, count=780) + self.assertIsNotNone(monitor_proc_dict, + "Failed to start monitoring on servers and " + "clients") + + ret = validate_io_procs(self.procs, self.mounts) + self.assertTrue(ret, "IO Failed") + + self._perform_gluster_v_heal_for_12_hrs() + + # Wait for monitoring processes to complete + ret = wait_for_logging_processes_to_stop(monitor_proc_dict, + cluster=True) + self.assertTrue(ret, "ERROR: Failed to stop monitoring processes") + + # Check if there are any memory leaks and OOM killers + self._verify_memory_leak() + g.log.info("No memory leaks/OOM kills found on servers and clients") + + def test_mem_leak_on_gluster_procs_with_brick_multiplex(self): + """ + Steps: + 1) Enable cluster.brick-multiplex + 2) Enable SSL on management layer + 3) Start creating volumes + 4) Mount a volume and start I/O + 5) Monitor the memory consumption by glusterd process + """ + + # Enable cluster.brick-multiplex + ret = enable_brick_mux(self.mnode) + self.assertTrue(ret, "Failed to enable brick-multiplex") + + # Verify the operation + ret = is_brick_mux_enabled(self.mnode) + self.assertTrue(ret, "Brick mux enable op not successful") + + # Create a few volumes + self.volume['replica_count'] = 3 + ret = bulk_volume_creation(self.mnode, 20, self.all_servers_info, + self.volume, is_force=True) + + self.assertTrue(ret, "Failed to create bulk volume") + + # Run IO + self._run_io() + + # Start memory usage logging + monitor_proc_dict = self.start_memory_and_cpu_usage_logging( + self.test_id, count=60) + self.assertIsNotNone(monitor_proc_dict, + "Failed to start monitoring on servers and " + "clients") + + ret = validate_io_procs(self.procs, self.mounts) + self.assertTrue(ret, "IO Failed") + + # Wait for monitoring processes to complete + ret = wait_for_logging_processes_to_stop(monitor_proc_dict, + cluster=True) + self.assertTrue(ret, "ERROR: Failed to stop monitoring processes") + + # Check if there are any memory leaks and OOM killers + self._verify_memory_leak() + g.log.info("No memory leaks/OOM kills found on servers and clients") + + # Disable Brick multiplex + ret = disable_brick_mux(self.mnode) + self.assertTrue(ret, "Failed to disable brick multiplex") diff --git a/tools/generate_glusto_config/glusto_config_template.jinja b/tools/generate_glusto_config/glusto_config_template.jinja index 79a3d57e2..3146586d8 100644 --- a/tools/generate_glusto_config/glusto_config_template.jinja +++ b/tools/generate_glusto_config/glusto_config_template.jinja @@ -17,8 +17,9 @@ clients:{% for client in clients %} # Note: Use the same Hostname/IP used in the above 'servers' section. servers_info:{% for server_item in servers %} + {% set outer_loop = loop %} {% for server, value in server_item.items() %} - {{server}}: &server{{ loop.index }} + {{server}}: &server{{ outer_loop.index }} host: {{server}} devices: {{ value["devices"] }} brick_root: {{ value["brick_root"] }}{% endfor %}{% endfor %}
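The jinja fix in the last hunk works because 'loop' inside the inner for-loop shadows the outer loop counter, so the outer index has to be aliased before entering the inner loop; a small Python/Jinja2 sketch (the template text here is illustrative, not the real glusto_config_template.jinja) demonstrates the behaviour:

from jinja2 import Template

tmpl = Template(
    "{% for server_item in servers %}"
    "{% set outer_loop = loop %}"
    "{% for server, value in server_item.items() %}"
    "{{ server }}: &server{{ outer_loop.index }}\n"
    "{% endfor %}{% endfor %}")

# Prints 's1: &server1' and 's2: &server2'; with the inner 'loop.index' the
# anchor would be '&server1' both times, since each dict holds one server.
print(tmpl.render(servers=[{"s1": {}}, {"s2": {}}]))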