From 08faae06ab07b56b815aec5bfbfcf72d653e8055 Mon Sep 17 00:00:00 2001 From: kshithijiyer Date: Tue, 6 Oct 2020 09:05:44 +0530 Subject: [Test] Add 2 memory leak tests and fix library issues Scenarios added: ---------------- Test case: 1. Create a volume, start it and mount it. 2. Start I/O from mount point. 3. Check if there are any memory leaks and OOM killers. Test case: 1. Create a volume, start it and mount it. 2. Set features.cache-invalidation to ON. 3. Start I/O from mount point. 4. Run gluster volume heal command in a loop. 5. Check if there are any memory leaks and OOM killers on servers. Design change: -------------- - self.id() is moved into test class as calling cls.id() from the classmethods was hitting unbound method errors in the original logic. - Logic changed for checking leaks in the fuse client. - Fixed breakage in methods wherever needed. Change-Id: Icb600d833d0c08636b6002abb489342ea1f946d7 Signed-off-by: kshithijiyer --- .../glustolibs/gluster/gluster_base_class.py | 75 ++++++++----- .../glustolibs/io/memory_and_cpu_utils.py | 79 +++++++++----- tests/functional/resource_leak/__init__.py | 0 .../resource_leak/test_basic_memory_leaks.py | 120 +++++++++++++++++++++ ...emory_leak_in_shd_with_cache_invalidation_on.py | 117 ++++++++++++++++++++ 5 files changed, 337 insertions(+), 54 deletions(-) create mode 100644 tests/functional/resource_leak/__init__.py create mode 100644 tests/functional/resource_leak/test_basic_memory_leaks.py create mode 100644 tests/functional/resource_leak/test_memory_leak_in_shd_with_cache_invalidation_on.py diff --git a/glustolibs-gluster/glustolibs/gluster/gluster_base_class.py b/glustolibs-gluster/glustolibs/gluster/gluster_base_class.py index baec1be8a..3ce38a304 100755 --- a/glustolibs-gluster/glustolibs/gluster/gluster_base_class.py +++ b/glustolibs-gluster/glustolibs/gluster/gluster_base_class.py @@ -1107,9 +1107,13 @@ class GlusterBaseClass(TestCase): g.log.info("Teardown nfs ganesha cluster succeeded") @classmethod - def start_memory_and_cpu_usage_logging(cls, interval=60, 
count=100): + def start_memory_and_cpu_usage_logging(cls, test_id, interval=60, + count=100): """Upload logger script and start logging usage on cluster + Args: + test_id(str): ID of the test running fetched from self.id() + Kawrgs: interval(int): Time interval after which logs are to be collected (Default: 60) @@ -1137,16 +1141,18 @@ class GlusterBaseClass(TestCase): # Start logging on servers and clients proc_dict = log_memory_and_cpu_usage_on_cluster( - cls.servers, cls.clients, cls.id(), interval, count) + cls.servers, cls.clients, test_id, interval, count) return proc_dict @classmethod - def compute_and_print_usage_stats(cls, proc_dict, kill_proc=False): + def compute_and_print_usage_stats(cls, test_id, proc_dict, + kill_proc=False): """Compute and print CPU and memory usage statistics Args: proc_dict(dict):Dictionary of logging processes + test_id(str): ID of the test running fetched from self.id() Kwargs: kill_proc(bool): Kill logging process if true else wait @@ -1172,21 +1178,25 @@ class GlusterBaseClass(TestCase): g.log.error("Processes didn't complete still running.") # Compute and print stats for servers - ret = compute_data_usage_stats_on_servers(cls.servers, cls.id()) + ret = compute_data_usage_stats_on_servers(cls.servers, test_id) g.log.info('*' * 50) g.log.info(ret) # TODO: Make logged message more structured g.log.info('*' * 50) # Compute and print stats for clients - ret = compute_data_usage_stats_on_clients(cls.clients, cls.id()) + ret = compute_data_usage_stats_on_clients(cls.clients, test_id) g.log.info('*' * 50) g.log.info(ret) # TODO: Make logged message more structured g.log.info('*' * 50) @classmethod - def check_for_memory_leaks_and_oom_kills_on_servers(cls, gain=30.0): + def check_for_memory_leaks_and_oom_kills_on_servers(cls, test_id, + gain=30.0): """Check for memory leaks and OOM kills on servers + Args: + test_id(str): ID of the test running fetched from self.id() + Kwargs: gain(float): Accepted amount of leak for a given testcase in 
MB (Default:30) @@ -1204,31 +1214,35 @@ class GlusterBaseClass(TestCase): check_for_oom_killers_on_servers) # Check for memory leaks on glusterd - if check_for_memory_leaks_in_glusterd(cls.servers, cls.id(), gain): + if check_for_memory_leaks_in_glusterd(cls.servers, test_id, gain): g.log.error("Memory leak on glusterd.") return True - # Check for memory leaks on shd - if check_for_memory_leaks_in_glusterfs(cls.servers, cls.id(), gain): - g.log.error("Memory leak on shd.") - return True + if cls.volume_type != "distributed": + # Check for memory leaks on shd + if check_for_memory_leaks_in_glusterfs(cls.servers, test_id, + gain): + g.log.error("Memory leak on shd.") + return True # Check for memory leaks on brick processes - if check_for_memory_leaks_in_glusterfsd(cls.servers, cls.id(), gain): + if check_for_memory_leaks_in_glusterfsd(cls.servers, test_id, gain): g.log.error("Memory leak on brick process.") return True # Check OOM kills on servers for all gluster server processes - ret = check_for_oom_killers_on_servers(cls.servers) - if not ret: + if check_for_oom_killers_on_servers(cls.servers): g.log.error('OOM kills present on servers.') return True return False @classmethod - def check_for_memory_leaks_and_oom_kills_on_clients(cls, gain=30): + def check_for_memory_leaks_and_oom_kills_on_clients(cls, test_id, gain=30): """Check for memory leaks and OOM kills on clients + Args: + test_id(str): ID of the test running fetched from self.id() + Kwargs: gain(float): Accepted amount of leak for a given testcase in MB (Default:30) @@ -1244,7 +1258,7 @@ class GlusterBaseClass(TestCase): check_for_oom_killers_on_clients) # Check for memory leak on glusterfs fuse process - if check_for_memory_leaks_in_glusterfs_fuse(cls.clients, cls.id(), + if check_for_memory_leaks_in_glusterfs_fuse(cls.clients, test_id, gain): g.log.error("Memory leaks observed on FUSE clients.") return True @@ -1256,9 +1270,12 @@ class GlusterBaseClass(TestCase): return False @classmethod - def 
check_for_cpu_usage_spikes_on_servers(cls, threshold=3): + def check_for_cpu_usage_spikes_on_servers(cls, test_id, threshold=3): """Check for CPU usage spikes on servers + Args: + test_id(str): ID of the test running fetched from self.id() + Kwargs: threshold(int): Accepted amount of instances of 100% CPU usage (Default:3) @@ -1274,21 +1291,22 @@ class GlusterBaseClass(TestCase): check_for_cpu_usage_spikes_on_glusterfsd) # Check for CPU usage spikes on glusterd - if check_for_cpu_usage_spikes_on_glusterd(cls.servers, cls.id(), + if check_for_cpu_usage_spikes_on_glusterd(cls.servers, test_id, threshold): g.log.error("CPU usage spikes observed more than threshold " "on glusterd.") return True - # Check for CPU usage spikes on shd - if check_for_cpu_usage_spikes_on_glusterfs(cls.servers, cls.id(), - threshold): - g.log.error("CPU usage spikes observed more than threshold " - "on shd.") - return True + if cls.volume_type != "distributed": + # Check for CPU usage spikes on shd + if check_for_cpu_usage_spikes_on_glusterfs(cls.servers, test_id, + threshold): + g.log.error("CPU usage spikes observed more than threshold " + "on shd.") + return True # Check for CPU usage spikes on brick processes - if check_for_cpu_usage_spikes_on_glusterfsd(cls.servers, cls.id(), + if check_for_cpu_usage_spikes_on_glusterfsd(cls.servers, test_id, threshold): g.log.error("CPU usage spikes observed more than threshold " "on shd.") @@ -1296,9 +1314,12 @@ class GlusterBaseClass(TestCase): return False @classmethod - def check_for_cpu_spikes_on_clients(cls, threshold=3): + def check_for_cpu_spikes_on_clients(cls, test_id, threshold=3): """Check for CPU usage spikes on clients + Args: + test_id(str): ID of the test running fetched from self.id() + Kwargs: threshold(int): Accepted amount of instances of 100% CPU usage (Default:3) @@ -1312,6 +1333,6 @@ class GlusterBaseClass(TestCase): check_for_cpu_usage_spikes_on_glusterfs_fuse) ret = check_for_cpu_usage_spikes_on_glusterfs_fuse(cls.clients, - 
cls.id(), + test_id, threshold) return ret diff --git a/glustolibs-io/glustolibs/io/memory_and_cpu_utils.py b/glustolibs-io/glustolibs/io/memory_and_cpu_utils.py index 3d105bf5e..4e1dadbd7 100644 --- a/glustolibs-io/glustolibs/io/memory_and_cpu_utils.py +++ b/glustolibs-io/glustolibs/io/memory_and_cpu_utils.py @@ -363,7 +363,7 @@ def compute_data_usage_stats_on_servers(nodes, test_name): # Generate a dataframe from the csv file dataframe = create_dataframe_from_csv(node, process, test_name) - if not dataframe: + if dataframe.empty: return {} data_dict[node][process] = {} @@ -424,7 +424,7 @@ def compute_data_usage_stats_on_clients(nodes, test_name): for node in nodes: data_dict[node] = {} dataframe = create_dataframe_from_csv(node, 'glusterfs', test_name) - if not dataframe: + if dataframe.empty: return {} data_dict[node]['glusterfs'] = {} @@ -436,7 +436,8 @@ def compute_data_usage_stats_on_clients(nodes, test_name): def _perform_three_point_check_for_memory_leak(dataframe, node, process, gain, volume_status=None, - volume=None): + volume=None, + vol_name=None): """Perform three point check Args: @@ -448,14 +449,16 @@ def _perform_three_point_check_for_memory_leak(dataframe, node, process, gain, kwargs: volume_status(dict): Volume status output on the give name volumne(str):Name of volume for which 3 point check has to be done + vol_name(str): Name of volume process according to volume status Returns: bool: True if memory leak instances are observed else False """ # Filter dataframe to be process wise if it's volume specific process if process in ('glusterfs', 'glusterfsd'): - pid = int(volume_status[volume][node][process]['pid']) - dataframe = dataframe[dataframe['Process ID'] == pid] + if process == 'glusterfs' and vol_name: + pid = int(volume_status[volume][node][vol_name]['pid']) + dataframe = dataframe[dataframe['Process ID'] == pid] # Compute usage gain throught the data frame memory_increments = list(dataframe['Memory Usage'].diff().dropna()) @@ -476,12 
+479,12 @@ def _perform_three_point_check_for_memory_leak(dataframe, node, process, gain, try: # Check if memory gain had decrease in the consecutive # entries, after 2 entry and betwen current and last entry - if all(memory_increments[instance+1] > + if all([memory_increments[instance+1] > memory_increments[instance], memory_increments[instance+2] > memory_increments[instance], (memory_increments[len(memory_increments)-1] > - memory_increments[instance])): + memory_increments[instance])]): return True except IndexError: @@ -490,7 +493,7 @@ def _perform_three_point_check_for_memory_leak(dataframe, node, process, gain, g.log.info('Instance at last log entry.') if process in ('glusterfs', 'glusterfsd'): cmd = ("ps u -p %s | awk 'NR>1 && $11~/%s$/{print " - "$6/1024}'" % (pid, process)) + " $6/1024}'" % (pid, process)) else: cmd = ("ps u -p `pgrep glusterd` | awk 'NR>1 && $11~/" "glusterd$/{print $6/1024}'") @@ -526,7 +529,7 @@ def check_for_memory_leaks_in_glusterd(nodes, test_name, gain=30.0): is_there_a_leak = [] for node in nodes: dataframe = create_dataframe_from_csv(node, 'glusterd', test_name) - if not dataframe: + if dataframe.empty: return False # Call 3 point check function @@ -562,7 +565,7 @@ def check_for_memory_leaks_in_glusterfs(nodes, test_name, gain=30.0): # Get the volume status on the node volume_status = get_volume_status(node) dataframe = create_dataframe_from_csv(node, 'glusterfs', test_name) - if not dataframe: + if dataframe.empty: return False for volume in volume_status.keys(): @@ -573,7 +576,8 @@ def check_for_memory_leaks_in_glusterfs(nodes, test_name, gain=30.0): # Call 3 point check function three_point_check = _perform_three_point_check_for_memory_leak( - dataframe, node, 'glusterfs', gain, volume_status, volume) + dataframe, node, 'glusterfs', gain, volume_status, volume, + 'Self-heal Daemon') if three_point_check: g.log.error("Memory leak observed on node %s in shd " "on volume %s", node, volume) @@ -604,7 +608,7 @@ def 
check_for_memory_leaks_in_glusterfsd(nodes, test_name, gain=30.0): # Get the volume status on the node volume_status = get_volume_status(node) dataframe = create_dataframe_from_csv(node, 'glusterfsd', test_name) - if not dataframe: + if dataframe.empty: return False for volume in volume_status.keys(): @@ -615,7 +619,8 @@ def check_for_memory_leaks_in_glusterfsd(nodes, test_name, gain=30.0): # Call 3 point check function three_point_check = _perform_three_point_check_for_memory_leak( - dataframe, node, 'glusterfsd', gain, volume_status, volume) + dataframe, node, 'glusterfsd', gain, volume_status, volume, + process) if three_point_check: g.log.error("Memory leak observed on node %s in brick " " process for brick %s on volume %s", node, @@ -637,7 +642,7 @@ def check_for_memory_leaks_in_glusterfs_fuse(nodes, test_name, gain=30.0): (Default:30) Returns: - bool: True if memory leak was obsevred else False + bool: True if memory leak was observed else False NOTE: This function should be executed when the volume is still mounted. @@ -646,7 +651,7 @@ def check_for_memory_leaks_in_glusterfs_fuse(nodes, test_name, gain=30.0): for node in nodes: # Get the volume status on the node dataframe = create_dataframe_from_csv(node, 'glusterfs', test_name) - if not dataframe: + if dataframe.empty: return False # Call 3 point check function @@ -655,7 +660,25 @@ def check_for_memory_leaks_in_glusterfs_fuse(nodes, test_name, gain=30.0): if three_point_check: g.log.error("Memory leak observed on node %s for client", node) - is_there_a_leak.append(three_point_check) + + # If I/O is constantly running on Clients the memory + # usage spikes up and stays at a point for long. 
+ last_entry = dataframe['Memory Usage'].iloc[-1] + cmd = ("ps u -p `pidof glusterfs` | " + "awk 'NR>1 && $11~/glusterfs$/{print" + " $6/1024}'") + ret, out, _ = g.run(node, cmd) + if ret: + g.log.error('Unable to run the command to fetch current ' + 'memory utilization.') + continue + + if float(out) != last_entry: + if float(out) > last_entry: + is_there_a_leak.append(True) + continue + + is_there_a_leak.append(False) return any(is_there_a_leak) @@ -671,9 +694,9 @@ def _check_for_oom_killers(nodes, process, oom_killer_list): """ cmd = ("grep -i 'killed process' /var/log/messages* " "| grep -w '{}'".format(process)) - ret = g.run_parallel(nodes, cmd) - for key in ret.keys(): - ret, out, _ = ret[key] + ret_codes = g.run_parallel(nodes, cmd) + for key in ret_codes.keys(): + ret, out, _ = ret_codes[key] if not ret: g.log.error('OOM killer observed on %s for %s', key, process) g.log.error(out) @@ -712,7 +735,8 @@ def check_for_oom_killers_on_clients(nodes): def _check_for_cpu_usage_spikes(dataframe, node, process, threshold, - volume_status=None, volume=None): + volume_status=None, volume=None, + vol_name=None): """Check for cpu spikes for a given process Args: @@ -724,13 +748,14 @@ def _check_for_cpu_usage_spikes(dataframe, node, process, threshold, kwargs: volume_status(dict): Volume status output on the give name volume(str):Name of volume for which check has to be done + vol_name(str): Name of volume process according to volume status Returns: bool: True if number of instances more than threshold else False """ # Filter dataframe to be process wise if it's volume specific process if process in ('glusterfs', 'glusterfsd'): - pid = int(volume_status[volume][node][process]['pid']) + pid = int(volume_status[volume][node][vol_name]['pid']) dataframe = dataframe[dataframe['Process ID'] == pid] # Check if usage is more than accepted amount of leak @@ -758,7 +783,7 @@ def check_for_cpu_usage_spikes_on_glusterd(nodes, test_name, threshold=3): is_there_a_spike = [] for node 
in nodes: dataframe = create_dataframe_from_csv(node, 'glusterd', test_name) - if not dataframe: + if dataframe.empty: return False # Call function to check for cpu spikes @@ -795,7 +820,7 @@ def check_for_cpu_usage_spikes_on_glusterfs(nodes, test_name, threshold=3): # Get the volume status on the node volume_status = get_volume_status(node) dataframe = create_dataframe_from_csv(node, 'glusterfs', test_name) - if not dataframe: + if dataframe.empty: return False for volume in volume_status.keys(): @@ -807,7 +832,7 @@ def check_for_cpu_usage_spikes_on_glusterfs(nodes, test_name, threshold=3): # Call function to check for cpu spikes cpu_spikes = _check_for_cpu_usage_spikes( dataframe, node, 'glusterfs', threshold, volume_status, - volume) + volume, 'Self-heal Daemon') if cpu_spikes: g.log.error("CPU usage spikes observed more than " "threshold %d on node %s on volume %s for shd", @@ -839,7 +864,7 @@ def check_for_cpu_usage_spikes_on_glusterfsd(nodes, test_name, threshold=3): # Get the volume status on the node volume_status = get_volume_status(node) dataframe = create_dataframe_from_csv(node, 'glusterfsd', test_name) - if not dataframe: + if dataframe.empty: return False for volume in volume_status.keys(): @@ -851,7 +876,7 @@ def check_for_cpu_usage_spikes_on_glusterfsd(nodes, test_name, threshold=3): # Call function to check for cpu spikes cpu_spikes = _check_for_cpu_usage_spikes( dataframe, node, 'glusterfsd', threshold, volume_status, - volume) + volume, process) if cpu_spikes: g.log.error("CPU usage spikes observed more than " "threshold %d on node %s on volume %s for " @@ -884,7 +909,7 @@ def check_for_cpu_usage_spikes_on_glusterfs_fuse(nodes, test_name, for node in nodes: # Get the volume status on the node dataframe = create_dataframe_from_csv(node, 'glusterfs', test_name) - if not dataframe: + if dataframe.empty: return False # Call function to check for cpu spikes diff --git a/tests/functional/resource_leak/__init__.py 
b/tests/functional/resource_leak/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/functional/resource_leak/test_basic_memory_leaks.py b/tests/functional/resource_leak/test_basic_memory_leaks.py new file mode 100644 index 000000000..46b2c0c6d --- /dev/null +++ b/tests/functional/resource_leak/test_basic_memory_leaks.py @@ -0,0 +1,120 @@ +# Copyright (C) 2020 Red Hat, Inc. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ +from glusto.core import Glusto as g +from glustolibs.gluster.gluster_base_class import GlusterBaseClass, runs_on +from glustolibs.gluster.exceptions import ExecutionError +from glustolibs.gluster.glusterdir import mkdir +from glustolibs.io.utils import (run_linux_untar, validate_io_procs, + wait_for_io_to_complete) +from glustolibs.io.memory_and_cpu_utils import ( + wait_for_logging_processes_to_stop) + + +@runs_on([['distributed-replicated', 'distributed-arbiter', + 'distributed-dispersed', 'distributed', 'replicated', + 'arbiter', 'dispersed'], ['glusterfs']]) +class TestBasicMemoryleak(GlusterBaseClass): + + def setUp(self): + + self.get_super_method(self, 'setUp')() + + # Set test_id for get gathering + self.test_id = self.id() + + # Set I/O flag to false + self.is_io_running = False + + # Creating Volume and mounting the volume + ret = self.setup_volume_and_mount_volume(self.mounts) + if not ret: + raise ExecutionError("Volume creation or mount failed: %s" + % self.volname) + + def tearDown(self): + + # Wait for I/O to complete + if self.is_io_running: + if wait_for_io_to_complete(self.list_of_io_processes, + self.mounts): + raise ExecutionError("Failed to wait for I/O to complete") + + # Unmounting and cleaning volume + ret = self.unmount_volume_and_cleanup_volume(self.mounts) + if not ret: + raise ExecutionError("Unable to delete volume %s" % self.volname) + + self.get_super_method(self, 'tearDown')() + + def test_basic_memory_leak(self): + """ + Test case: + 1. Create a volume, start it and mount it. + 2. Start I/O from mount point. + 3. Check if there are any memory leaks and OOM killers. 
+ """ + # Start monitoring resource usage on servers and clients + monitor_proc_dict = self.start_memory_and_cpu_usage_logging( + self.test_id, count=30) + self.assertIsNotNone(monitor_proc_dict, + "Failed to start monitoring on servers and " + "clients") + + # Create a dir to start untar + self.linux_untar_dir = "{}/{}".format(self.mounts[1].mountpoint, + "linuxuntar") + ret = mkdir(self.mounts[1].client_system, self.linux_untar_dir) + self.assertTrue(ret, "Failed to create dir linuxuntar for untar") + + # Start multiple I/O from mount points + self.list_of_io_processes = [] + cmd = ("cd {};for i in `seq 1 100`; do mkdir dir.$i ;" + "for j in `seq 1 1000`; do dd if=/dev/random " + "of=dir.$i/testfile.$j bs=1k count=10;done;done" + .format(self.mounts[0].mountpoint)) + ret = g.run_async(self.mounts[0].client_system, cmd) + self.list_of_io_processes = [ret] + + # Start linux untar on dir linuxuntar + ret = run_linux_untar(self.mounts[1].client_system, + self.mounts[1].mountpoint, + dirs=tuple(['linuxuntar'])) + self.list_of_io_processes += ret + self.is_io_running = True + + # Wait for I/O to complete and validate I/O on mount points + ret = validate_io_procs(self.list_of_io_processes, self.mounts) + self.assertTrue(ret, "I/O failed on mount point") + self.is_io_running = False + + # Wait for monitoring processes to complete + ret = wait_for_logging_processes_to_stop(monitor_proc_dict, + cluster=True) + self.assertTrue(ret, + "ERROR: Failed to stop monitoring processes") + + # Check if there are any memory leaks and OOM killers + ret = self.check_for_memory_leaks_and_oom_kills_on_servers( + self.test_id) + self.assertFalse(ret, + "Memory leak and OOM kills check failed on servers") + + ret = self.check_for_memory_leaks_and_oom_kills_on_clients( + self.test_id) + self.assertFalse(ret, + "Memory leak and OOM kills check failed on clients") + g.log.info("No memory leaks or OOM kills found on serves and clients") diff --git 
a/tests/functional/resource_leak/test_memory_leak_in_shd_with_cache_invalidation_on.py b/tests/functional/resource_leak/test_memory_leak_in_shd_with_cache_invalidation_on.py new file mode 100644 index 000000000..3a22a5068 --- /dev/null +++ b/tests/functional/resource_leak/test_memory_leak_in_shd_with_cache_invalidation_on.py @@ -0,0 +1,117 @@ +# Copyright (C) 2020 Red Hat, Inc. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ +from glusto.core import Glusto as g +from glustolibs.gluster.gluster_base_class import GlusterBaseClass, runs_on +from glustolibs.gluster.exceptions import ExecutionError +from glustolibs.gluster.heal_ops import trigger_heal +from glustolibs.gluster.volume_ops import set_volume_options +from glustolibs.io.utils import (validate_io_procs, wait_for_io_to_complete) +from glustolibs.io.memory_and_cpu_utils import ( + wait_for_logging_processes_to_stop) + + +@runs_on([['distributed-replicated', 'distributed-arbiter', + 'distributed-dispersed', 'replicated', + 'arbiter', 'dispersed'], ['glusterfs']]) +class TestMemoryLeakInShdWithCacheInvalidationOn(GlusterBaseClass): + + def setUp(self): + + self.get_super_method(self, 'setUp')() + + # Set test_id for get gathering + self.test_id = self.id() + + # Set I/O flag to false + self.is_io_running = False + + # Creating Volume and mounting the volume + ret = self.setup_volume_and_mount_volume([self.mounts[0]]) + if not ret: + raise ExecutionError("Volume creation or mount failed: %s" + % self.volname) + + def tearDown(self): + + # Wait for I/O to complete + if self.is_io_running: + if wait_for_io_to_complete(self.list_of_io_processes, + self.mounts[0]): + raise ExecutionError("Failed to wait for I/O to complete") + + # Unmounting and cleaning volume + ret = self.unmount_volume_and_cleanup_volume([self.mounts[0]]) + if not ret: + raise ExecutionError("Unable to delete volume %s" % self.volname) + + self.get_super_method(self, 'tearDown')() + + def test_memory_leak_in_shd_with_cache_invalidation_on(self): + """ + Test case: + 1. Create a volume, start it and mount it. + 2. Set features.cache-invalidation to ON. + 3. Start I/O from mount point. + 4. Run gluster volume heal command in a loop + 5. Check if there are any memory leaks and OOM killers on servers. 
+ """ + # Start monitoring resource usage on servers and clients + monitor_proc_dict = self.start_memory_and_cpu_usage_logging( + self.test_id, count=10) + self.assertIsNotNone(monitor_proc_dict, + "Failed to start monitoring on servers and" + " clients") + + # Set features.cache-invalidation to ON + ret = set_volume_options(self.mnode, self.volname, + {'features.cache-invalidation': 'on'}) + self.assertTrue(ret, "Failed to set features.cache-invalidation to ON") + g.log.info("Successfully set features.cache-invalidation to ON") + + # Start multiple I/O from mount points + self.list_of_io_processes = [] + cmd = ("cd {};for i in `seq 1 1000`;do echo 'abc' > myfile;done" + .format(self.mounts[0].mountpoint)) + ret = g.run_async(self.mounts[0].client_system, cmd) + self.list_of_io_processes = [ret] + self.is_io_running = True + + # Run gluster volume heal command in a loop for 100 iterations + for iteration in range(0, 100): + g.log.info("Running gluster volume heal command for %d time", + iteration) + ret = trigger_heal(self.mnode, self.volname) + self.assertTrue(ret, "Heal command triggered successfully") + g.log.info("Ran gluster volume heal command in a loop for " + "100 iterations.") + + # Wait for I/O to complete and validate I/O on mount points + ret = validate_io_procs(self.list_of_io_processes, self.mounts[0]) + self.assertTrue(ret, "I/O failed on mount point") + self.is_io_running = False + + # Wait for monitoring processes to complete + ret = wait_for_logging_processes_to_stop(monitor_proc_dict, + cluster=True) + self.assertTrue(ret, + "ERROR: Failed to stop monitoring processes") + + # Check if there are any memory leaks and OOM killers + ret = self.check_for_memory_leaks_and_oom_kills_on_servers( + self.test_id) + self.assertFalse(ret, + "Memory leak and OOM kills check failed on servers") -- cgit