diff options
Diffstat (limited to 'glustolibs-io/glustolibs/io')
-rw-r--r-- | glustolibs-io/glustolibs/io/memory_and_cpu_utils.py | 924 |
-rwxr-xr-x | glustolibs-io/glustolibs/io/utils.py | 389 |
2 files changed, 1253 insertions, 60 deletions
diff --git a/glustolibs-io/glustolibs/io/memory_and_cpu_utils.py b/glustolibs-io/glustolibs/io/memory_and_cpu_utils.py
new file mode 100644
index 000000000..4e1dadbd7
--- /dev/null
+++ b/glustolibs-io/glustolibs/io/memory_and_cpu_utils.py
@@ -0,0 +1,924 @@
+# Copyright (C) 2020 Red Hat, Inc. <http://www.redhat.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+from glusto.core import Glusto as g
+
+from glustolibs.gluster.volume_ops import get_volume_status
+from glustolibs.gluster.glusterfile import file_exists
+from glustolibs.misc.misc_libs import upload_scripts, kill_process
+
+import numpy as np
+import pandas as pd
+from statistics import mean, median
+
+
+def check_upload_memory_and_cpu_logger_script(servers):
+    """Check and upload memory_and_cpu_logger.py to servers if not present
+
+    Args:
+        servers(list): List of all servers where script has to be uploaded
+
+    Returns:
+        bool: True if script is uploaded successfully else False
+    """
+    script = "/usr/share/glustolibs/io/scripts/memory_and_cpu_logger.py"
+    is_present = []
+    for server in servers:
+        if not file_exists(server, script):
+            if not upload_scripts(server, script):
+                g.log.error("Unable to upload memory_and_cpu_logger.py on %s",
+                            server)
+                is_present.append(False)
+            else:
+                is_present.append(True)
+    return all(is_present)
+
+
+def _start_logging_processes(process, servers, test_name, interval, count):
+    """Start logging processes on all nodes for a given process
+
+    Args:
+        process(str): Name of the gluster process to be logged
+        servers(list): Servers on which CPU and memory usage has to be logged
+        test_name(str): Name of testcase for which logs are to be collected
+        interval(int): Time interval after which logs are to be collected
+        count(int): Number of samples to be captured
+
+    Returns:
+        list: A list of logging processes
+    """
+    cmd = ("/usr/bin/env python "
+           "/usr/share/glustolibs/io/scripts/memory_and_cpu_logger.py"
+           " -p %s -t %s -i %d -c %d" % (process, test_name,
+                                         interval, count))
+    logging_process = []
+    for server in servers:
+        proc = g.run_async(server, cmd)
+        logging_process.append(proc)
+    return logging_process
+
+
+def log_memory_and_cpu_usage_on_servers(servers, test_name, interval=60,
+                                        count=100):
+    """Log memory and CPU usage of gluster server processes
+
+    Args:
+        servers(list): Servers on which CPU and memory usage has to be logged
+        test_name(str): Name of the testcase for which logs are to be
+                        collected
+
+    Kwargs:
+        interval(int): Time interval after which logs are to be collected
+                       (Default:60)
+        count(int): Number of samples to be captured (Default:100)
+
+    Returns:
+        dict: Logging processes dict for all gluster server processes
+    """
+    logging_process_dict = {}
+    for proc_name in ('glusterd', 'glusterfs', 'glusterfsd'):
+        logging_procs = _start_logging_processes(
+            proc_name, servers, test_name, interval, count)
+        logging_process_dict[proc_name] = logging_procs
+    return logging_process_dict
+
+
+def log_memory_and_cpu_usage_on_clients(servers, test_name, interval=60,
+                                        count=100):
+    """Log memory and CPU usage of gluster client processes
+
+    Args:
+        servers(list): Clients on which CPU and memory usage has to be logged
+        test_name(str): Name of testcase for which logs are to be collected
+
+    Kwargs:
+        interval(int): Time interval after which logs are to be collected
+                       (Default:60)
+        count(int): Number of samples to be captured (Default:100)
+
+    Returns:
+        dict: Logging processes dict for all gluster client processes
+    """
+    logging_process_dict = {}
+    logging_procs = _start_logging_processes(
+        'glusterfs', servers, test_name, interval, count)
+    logging_process_dict['glusterfs'] = logging_procs
+    return logging_process_dict
+
+
+def log_memory_and_cpu_usage_on_cluster(servers, clients, test_name,
+                                        interval=60, count=100):
+    """Log memory and CPU usage on gluster cluster
+
+    Args:
+        servers(list): Servers on which memory and CPU usage is to be logged
+        clients(list): Clients on which memory and CPU usage is to be logged
+        test_name(str): Name of testcase for which logs are to be collected
+
+    Kwargs:
+        interval(int): Time interval after which logs are to be collected
+                       (Default:60)
+        count(int): Number of samples to be captured (Default:100)
+
+    Returns:
+        dict: Logging processes dict for all servers and clients
+    """
+    # Start logging on all servers
+    server_logging_processes = log_memory_and_cpu_usage_on_servers(
+        servers, test_name, interval, count)
+    if not server_logging_processes:
+        return {}
+
+    # Start logging on all clients
+    client_logging_processes = log_memory_and_cpu_usage_on_clients(
+        clients, test_name, interval, count)
+    if not client_logging_processes:
+        return {}
+
+    # Combine the dicts
+    logging_process_dict = {}
+    for node_type, proc_dict in (('server', server_logging_processes),
+                                 ('client', client_logging_processes)):
+        logging_process_dict[node_type] = {}
+        for proc in proc_dict:
+            logging_process_dict[node_type][proc] = (
+                proc_dict[proc])
+    return logging_process_dict
+
+
+def _process_wait_flag_append(proc, flag):
+    """Run async communicate and append True to the flag list"""
+    # If the process is already completed async_communicate()
+    # throws a ValueError
+    try:
+        proc.async_communicate()
+        flag.append(True)
+    except ValueError:
+        flag.append(True)
+
+
+def wait_for_logging_processes_to_stop(proc_dict, cluster=False):
+    """Wait for all given logging processes to stop
+
+    Args:
+        proc_dict(dict): Dictionary of all the active logging processes
+
+    Kwargs:
+        cluster(bool): True if proc_dict is for the entire cluster else False
+                       (Default:False)
+
+    Returns:
+        bool: True if processes are completed else False
+    """
+    flag = []
+    if cluster:
+        for sub_dict in proc_dict:
+            for proc_name in proc_dict[sub_dict]:
+                for proc in proc_dict[sub_dict][proc_name]:
+                    _process_wait_flag_append(proc, flag)
+    else:
+        for proc_name in proc_dict:
+            for proc in proc_dict[proc_name]:
+                _process_wait_flag_append(proc, flag)
+    return all(flag)
+
+
+def kill_all_logging_processes(proc_dict, nodes, cluster=False):
+    """Kill logging processes on all given nodes
+
+    Args:
+        proc_dict(dict): Dictionary of all active logging processes
+        nodes(list): List of nodes where logging has to be stopped
+
+    Kwargs:
+        cluster(bool): True if proc_dict is for a full cluster else False
+                       (Default:False)
+
+    Returns:
+        bool: True if processes are completed else False
+    """
+    # Kill all logging processes
+    for server in nodes:
+        if not kill_process(server, process_names='memory_and_cpu_logger.py'):
+            g.log.error("Unable to kill some of the processes at %s.", server)
+
+    # This will stop the async threads created by run_async() as the
+    # processes are already killed.
+    ret = wait_for_logging_processes_to_stop(proc_dict, cluster)
+    if ret:
+        return True
+    return False
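A minimal usage sketch for the logging helpers above (illustrative, not part of the change itself), assuming a test named 'test_example' and the usual servers/clients lists from the test config:

    from glustolibs.io.memory_and_cpu_utils import (
        check_upload_memory_and_cpu_logger_script,
        log_memory_and_cpu_usage_on_cluster,
        wait_for_logging_processes_to_stop)

    # Upload the logger script, start sampling, run the test I/O,
    # then wait for the loggers to finish on their own.
    check_upload_memory_and_cpu_logger_script(servers + clients)
    procs = log_memory_and_cpu_usage_on_cluster(
        servers, clients, 'test_example', interval=60, count=100)
    # ... run the actual test I/O here ...
    wait_for_logging_processes_to_stop(procs, cluster=True)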
g.log.error("Unable to kill some of the processes at %s.", server) + + # This will stop the async threads created by run_aysnc() as the proc is + # already killed. + ret = wait_for_logging_processes_to_stop(proc_dict, cluster) + if ret: + return True + return False + + +def create_dataframe_from_csv(node, proc_name, test_name): + """Creates a dataframe from a given process. + + Args: + node(str): Node from which csv is to be picked + proc_name(str): Name of process for which csv is to picked + test_name(str): Name of the testcase for which CSV + + Returns: + dataframe: Pandas dataframe if CSV file exits else None + """ + # Read the csv file generated by memory_and_cpu_logger.py + ret, raw_data, _ = g.run(node, "cat /root/{}.csv" + .format(proc_name)) + if ret: + return None + + # Split the complete dump to individual lines + data = raw_data.split("\r\n") + rows, flag = [], False + for line in data: + values = line.split(',') + if test_name == values[0]: + # Reset rows if it's the second instance + if flag: + rows = [] + flag = True + continue + + # Pick and append values which have complete entry + if flag and len(values) == 4: + rows.append(values) + + # Create a panda dataframe and set the type for columns + dataframe = pd.DataFrame(rows[1:], columns=rows[0]) + conversion_dict = {'Process ID': int, + 'CPU Usage': float, + 'Memory Usage': float} + dataframe = dataframe.astype(conversion_dict) + return dataframe + + +def _get_min_max_mean_median(entrylist): + """Get the mix, max. mean and median of a list + + Args: + entrylist(list): List of values to be used + + Returns: + dict:Result dict generate from list + """ + result = {} + result['Min'] = min(entrylist) + result['Max'] = max(entrylist) + result['Mean'] = mean(entrylist) + result['Median'] = median(entrylist) + return result + + +def _compute_min_max_mean_median(dataframe, data_dict, process, node, + volume=None, brick=None): + """Compute min, max, mean and median for a given process + + Args: + dataframe(panda dataframe): Panda data frame of the csv file + data_dict(dict): data dict to which info is to be added + process(str): Name of process for which data is to be computed + node(str): Node for which min, max, mean and median has to be computed + + Kwargs: + volume(str): Volume name of the volume for which data is to be computed + brick(str): Brick path of the brick for which data is to be computed + """ + if volume and process == 'glusterfs': + # Create subdict inside dict + data_dict[node][process][volume] = {} + for usage in ('CPU Usage', 'Memory Usage'): + # Create usage subdict + data_dict[node][process][volume][usage] = {} + + # Clean data and compute values + cleaned_usage = list(dataframe[usage].dropna()) + out = _get_min_max_mean_median(cleaned_usage) + + # Add values to data_dict + for key in ('Min', 'Max', 'Mean', 'Median'): + data_dict[node][process][volume][usage][key] = out[key] + + if volume and brick and process == 'glusterfsd': + # Create subdict inside dict + data_dict[node][process][volume] = {} + data_dict[node][process][volume][brick] = {} + for usage in ('CPU Usage', 'Memory Usage'): + # Create usage subdict + data_dict[node][process][volume][brick][usage] = {} + + # Clean data and compute values + cleaned_usage = list(dataframe[usage].dropna()) + out = _get_min_max_mean_median(cleaned_usage) + + # Add values to data_dict + for key in ('Min', 'Max', 'Mean', 'Median'): + data_dict[node][process][volume][brick][usage][key] = out[key] + + # Compute CPU Uage and Memory Usage for glusterd + else: + for usage in 
+
+
+def _get_min_max_mean_median(entrylist):
+    """Get the min, max, mean and median of a list
+
+    Args:
+        entrylist(list): List of values to be used
+
+    Returns:
+        dict: Result dict generated from the list
+    """
+    result = {}
+    result['Min'] = min(entrylist)
+    result['Max'] = max(entrylist)
+    result['Mean'] = mean(entrylist)
+    result['Median'] = median(entrylist)
+    return result
+
+
+def _compute_min_max_mean_median(dataframe, data_dict, process, node,
+                                 volume=None, brick=None):
+    """Compute min, max, mean and median for a given process
+
+    Args:
+        dataframe(pandas dataframe): Pandas dataframe of the csv file
+        data_dict(dict): Data dict to which info is to be added
+        process(str): Name of process for which data is to be computed
+        node(str): Node for which min, max, mean and median have to be
+                   computed
+
+    Kwargs:
+        volume(str): Volume name of the volume for which data is to be
+                     computed
+        brick(str): Brick path of the brick for which data is to be computed
+    """
+    if volume and process == 'glusterfs':
+        # Create subdict inside dict
+        data_dict[node][process][volume] = {}
+        for usage in ('CPU Usage', 'Memory Usage'):
+            # Create usage subdict
+            data_dict[node][process][volume][usage] = {}
+
+            # Clean data and compute values
+            cleaned_usage = list(dataframe[usage].dropna())
+            out = _get_min_max_mean_median(cleaned_usage)
+
+            # Add values to data_dict
+            for key in ('Min', 'Max', 'Mean', 'Median'):
+                data_dict[node][process][volume][usage][key] = out[key]
+
+    elif volume and brick and process == 'glusterfsd':
+        # Create subdict inside dict
+        data_dict[node][process][volume] = {}
+        data_dict[node][process][volume][brick] = {}
+        for usage in ('CPU Usage', 'Memory Usage'):
+            # Create usage subdict
+            data_dict[node][process][volume][brick][usage] = {}
+
+            # Clean data and compute values
+            cleaned_usage = list(dataframe[usage].dropna())
+            out = _get_min_max_mean_median(cleaned_usage)
+
+            # Add values to data_dict
+            for key in ('Min', 'Max', 'Mean', 'Median'):
+                data_dict[node][process][volume][brick][usage][key] = (
+                    out[key])
+
+    # Compute CPU Usage and Memory Usage for glusterd
+    else:
+        for usage in ('CPU Usage', 'Memory Usage'):
+            # Create usage subdict
+            data_dict[node][process][usage] = {}
+
+            # Clean data and compute values
+            cleaned_usage = list(dataframe[usage].dropna())
+            out = _get_min_max_mean_median(cleaned_usage)
+
+            # Add values to data_dict
+            for key in ('Min', 'Max', 'Mean', 'Median'):
+                data_dict[node][process][usage][key] = out[key]
+
+
+def compute_data_usage_stats_on_servers(nodes, test_name):
+    """Compute min, max, mean and median for servers
+
+    Args:
+        nodes(list): Servers whose data is to be used to compute min, max,
+                     mean and median
+        test_name(str): Name of testcase for which data has to be processed
+
+    Returns:
+        dict: dict of min, max, mean and median for a given process
+
+    NOTE:
+        This function has to be always run before cleanup.
+    """
+    data_dict = {}
+    for node in nodes:
+        # Get the volume status on the node
+        volume_status = get_volume_status(node)
+        data_dict[node] = {}
+        for process in ('glusterd', 'glusterfs', 'glusterfsd'):
+
+            # Generate a dataframe from the csv file
+            dataframe = create_dataframe_from_csv(node, process, test_name)
+            if dataframe is None or dataframe.empty:
+                return {}
+
+            data_dict[node][process] = {}
+            if process == 'glusterd':
+                # Checking if glusterd is restarted.
+                if len(set(dataframe['Process ID'])) > 1:
+                    data_dict[node][process]['is_restarted'] = True
+                else:
+                    data_dict[node][process]['is_restarted'] = False
+
+                # Call function to compute min, max, mean and median
+                _compute_min_max_mean_median(dataframe, data_dict, process,
+                                             node)
+                continue
+
+            # Map volumes to volume process
+            for volume in volume_status.keys():
+                for proc in volume_status[volume][node].keys():
+                    if (proc == 'Self-heal Daemon' and
+                            process == 'glusterfs'):
+                        # Fetch pid from volume status output and create a
+                        # dataframe with the entries of only that pid
+                        pid = volume_status[volume][node][proc]['pid']
+                        proc_dataframe = dataframe[
+                            dataframe['Process ID'] == pid]
+
+                        # Call function to compute min, max, mean
+                        # and median
+                        _compute_min_max_mean_median(
+                            proc_dataframe, data_dict, process, node, volume)
+
+                    if (proc.count('/') >= 2 and process == 'glusterfsd'):
+                        # Fetch pid from volume status output and create a
+                        # dataframe with the entries of only that pid
+                        pid = volume_status[volume][node][proc]['pid']
+                        proc_dataframe = dataframe[
+                            dataframe['Process ID'] == pid]
+
+                        # Call function to compute min, max, mean and median
+                        _compute_min_max_mean_median(
+                            proc_dataframe, data_dict, process, node, volume,
+                            proc)
+
+    return data_dict
+
+
+def compute_data_usage_stats_on_clients(nodes, test_name):
+    """Compute min, max, mean and median for clients
+
+    Args:
+        nodes(list): Clients whose data is to be used to compute min, max,
+                     mean and median
+        test_name(str): Name of the testcase for which data has to be
+                        processed
+
+    Returns:
+        dict: dict of min, max, mean and median for a given process
+    """
+    data_dict = {}
+    for node in nodes:
+        data_dict[node] = {}
+        dataframe = create_dataframe_from_csv(node, 'glusterfs', test_name)
+        if dataframe is None or dataframe.empty:
+            return {}
+
+        data_dict[node]['glusterfs'] = {}
+        # Call function to compute min, max, mean and median
+        _compute_min_max_mean_median(dataframe, data_dict, 'glusterfs', node)
+
+    return data_dict
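The returned dict is keyed node -> process -> (volume -> brick ->) usage -> statistic. A minimal sketch of reading it back (illustrative, assuming logging ran under the name 'test_example'):

    stats = compute_data_usage_stats_on_servers(servers, 'test_example')
    for node in stats:
        if stats[node]['glusterd']['is_restarted']:
            g.log.error("glusterd restarted on %s during the run", node)
        peak = stats[node]['glusterd']['Memory Usage']['Max']
        g.log.info("Peak glusterd memory on %s: %.1f MB", node, peak)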
+
+
+def _perform_three_point_check_for_memory_leak(dataframe, node, process, gain,
+                                               volume_status=None,
+                                               volume=None,
+                                               vol_name=None):
+    """Perform three point check
+
+    Args:
+        dataframe(pandas dataframe): Pandas dataframe of a given process
+        node(str): Node on which memory leak has to be checked
+        process(str): Name of process for which check has to be done
+        gain(float): Accepted amount of leak for a given testcase in MB
+
+    Kwargs:
+        volume_status(dict): Volume status output on the given node
+        volume(str): Name of volume for which three point check has to be
+                     done
+        vol_name(str): Name of volume process according to volume status
+
+    Returns:
+        bool: True if memory leak instances are observed else False
+    """
+    # Filter dataframe to be process wise if it's a volume specific process
+    if process in ('glusterfs', 'glusterfsd') and vol_name:
+        pid = int(volume_status[volume][node][vol_name]['pid'])
+        dataframe = dataframe[dataframe['Process ID'] == pid]
+
+    # Compute usage gain through the dataframe
+    memory_increments = list(dataframe['Memory Usage'].diff().dropna())
+
+    # Check if usage is more than accepted amount of leak
+    memory_leak_decision_array = np.where(
+        dataframe['Memory Usage'].diff().dropna() > gain, True, False)
+    instances_of_leak = np.where(memory_leak_decision_array)[0]
+
+    # If memory leak instances are present check whether usage reduced again
+    count_of_leak_instances = len(instances_of_leak)
+    if count_of_leak_instances > 0:
+        g.log.error('There are %s instances of memory leaks on node %s',
+                    count_of_leak_instances, node)
+        for instance in instances_of_leak:
+            # For instances near the end of the log the lookups below can
+            # throw an IndexError, which is handled separately.
+            try:
+                # Flag a leak if the next two increments and the final
+                # increment are all larger than the current one
+                if all([memory_increments[instance+1] >
+                        memory_increments[instance],
+                        memory_increments[instance+2] >
+                        memory_increments[instance],
+                        (memory_increments[len(memory_increments)-1] >
+                         memory_increments[instance])]):
+                    return True
+
+            except IndexError:
+                # In case of the last log entry rerun the command
+                # and check for a difference
+                g.log.info('Instance at last log entry.')
+                if process in ('glusterfs', 'glusterfsd'):
+                    cmd = ("ps u -p %s | awk 'NR>1 && $11~/%s$/{print "
+                           " $6/1024}'" % (pid, process))
+                else:
+                    cmd = ("ps u -p `pgrep glusterd` | awk 'NR>1 && $11~/"
+                           "glusterd$/{print $6/1024}'")
+                ret, out, _ = g.run(node, cmd)
+                if ret:
+                    g.log.error('Unable to run the command to fetch current '
+                                'memory utilization.')
+                    continue
+                usage_now = float(out.strip())
+                last_entry = dataframe['Memory Usage'].iloc[-1]
+
+                # Check if current memory usage is higher than the last entry
+                fresh_diff = last_entry - usage_now
+                if fresh_diff > gain and last_entry > fresh_diff:
+                    return True
+    return False
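To illustrate the increments the three point check works on (a toy series, not data from the commit): with gain=30.0 a five-sample memory column yields four increments, and only jumps above the gain are examined further:

    import pandas as pd
    usage = pd.Series([100.0, 150.0, 151.0, 300.0, 390.0])
    increments = list(usage.diff().dropna())  # [50.0, 1.0, 149.0, 90.0]
    suspects = [i for i, inc in enumerate(increments) if inc > 30.0]
    # suspects == [0, 2, 3]; each suspect is then compared against the
    # next two increments and the final increment before a leak is declared.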
+
+
+def check_for_memory_leaks_in_glusterd(nodes, test_name, gain=30.0):
+    """Check for memory leaks in glusterd
+
+    Args:
+        nodes(list): Servers on which memory leaks have to be checked
+        test_name(str): Name of testcase for which memory leaks have to be
+                        checked
+
+    Kwargs:
+        gain(float): Accepted amount of leak for a given testcase in MB
+                     (Default:30)
+
+    Returns:
+        bool: True if memory leak was observed else False
+    """
+    is_there_a_leak = []
+    for node in nodes:
+        dataframe = create_dataframe_from_csv(node, 'glusterd', test_name)
+        if dataframe is None or dataframe.empty:
+            return False
+
+        # Call the three point check function
+        three_point_check = _perform_three_point_check_for_memory_leak(
+            dataframe, node, 'glusterd', gain)
+        if three_point_check:
+            g.log.error("Memory leak observed on node %s in glusterd",
+                        node)
+        is_there_a_leak.append(three_point_check)
+
+    return any(is_there_a_leak)
+
+
+def check_for_memory_leaks_in_glusterfs(nodes, test_name, gain=30.0):
+    """Check for memory leaks in glusterfs
+
+    Args:
+        nodes(list): Servers on which memory leaks have to be checked
+        test_name(str): Name of testcase for which memory leaks have to be
+                        checked
+
+    Kwargs:
+        gain(float): Accepted amount of leak for a given testcase in MB
+                     (Default:30)
+
+    Returns:
+        bool: True if memory leak was observed else False
+
+    NOTE:
+        This function should be executed with the volumes present on the
+        cluster.
+    """
+    is_there_a_leak = []
+    for node in nodes:
+        # Get the volume status on the node
+        volume_status = get_volume_status(node)
+        dataframe = create_dataframe_from_csv(node, 'glusterfs', test_name)
+        if dataframe is None or dataframe.empty:
+            return False
+
+        for volume in volume_status.keys():
+            for process in volume_status[volume][node].keys():
+                # Skip if the process isn't the Self-heal Daemon
+                if process != 'Self-heal Daemon':
+                    continue
+
+                # Call the three point check function
+                three_point_check = (
+                    _perform_three_point_check_for_memory_leak(
+                        dataframe, node, 'glusterfs', gain, volume_status,
+                        volume, 'Self-heal Daemon'))
+                if three_point_check:
+                    g.log.error("Memory leak observed on node %s in shd "
+                                "on volume %s", node, volume)
+                is_there_a_leak.append(three_point_check)
+
+    return any(is_there_a_leak)
+
+
+def check_for_memory_leaks_in_glusterfsd(nodes, test_name, gain=30.0):
+    """Check for memory leaks in glusterfsd
+
+    Args:
+        nodes(list): Servers on which memory leaks have to be checked
+        test_name(str): Name of testcase for which memory leaks have to be
+                        checked
+
+    Kwargs:
+        gain(float): Accepted amount of leak for a given testcase in MB
+                     (Default:30)
+
+    Returns:
+        bool: True if memory leak was observed else False
+
+    NOTE:
+        This function should be executed with the volumes present on the
+        cluster.
+    """
+    is_there_a_leak = []
+    for node in nodes:
+        # Get the volume status on the node
+        volume_status = get_volume_status(node)
+        dataframe = create_dataframe_from_csv(node, 'glusterfsd', test_name)
+        if dataframe is None or dataframe.empty:
+            return False
+
+        for volume in volume_status.keys():
+            for process in volume_status[volume][node].keys():
+                # Skip if the process isn't a brick process
+                if not process.count('/'):
+                    continue
+
+                # Call the three point check function
+                three_point_check = (
+                    _perform_three_point_check_for_memory_leak(
+                        dataframe, node, 'glusterfsd', gain, volume_status,
+                        volume, process))
+                if three_point_check:
+                    g.log.error("Memory leak observed on node %s in brick "
+                                " process for brick %s on volume %s", node,
+                                process, volume)
+                is_there_a_leak.append(three_point_check)
+
+    return any(is_there_a_leak)
+ """ + is_there_a_leak = [] + for node in nodes: + # Get the volume status on the node + dataframe = create_dataframe_from_csv(node, 'glusterfs', test_name) + if dataframe.empty: + return False + + # Call 3 point check function + three_point_check = _perform_three_point_check_for_memory_leak( + dataframe, node, 'glusterfs', gain) + if three_point_check: + g.log.error("Memory leak observed on node %s for client", + node) + + # If I/O is constantly running on Clients the memory + # usage spikes up and stays at a point for long. + last_entry = dataframe['Memory Usage'].iloc[-1] + cmd = ("ps u -p `pidof glusterfs` | " + "awk 'NR>1 && $11~/glusterfs$/{print" + " $6/1024}'") + ret, out, _ = g.run(node, cmd) + if ret: + g.log.error('Unable to run the command to fetch current ' + 'memory utilization.') + continue + + if float(out) != last_entry: + if float(out) > last_entry: + is_there_a_leak.append(True) + continue + + is_there_a_leak.append(False) + + return any(is_there_a_leak) + + +def _check_for_oom_killers(nodes, process, oom_killer_list): + """Checks for OOM killers for a specific process + + Args: + nodes(list): Nodes on which OOM killers have to be checked + process(str): Process for which OOM killers have to be checked + oom_killer_list(list): A list in which the presence of + OOM killer has to be noted + """ + cmd = ("grep -i 'killed process' /var/log/messages* " + "| grep -w '{}'".format(process)) + ret_codes = g.run_parallel(nodes, cmd) + for key in ret_codes.keys(): + ret, out, _ = ret_codes[key] + if not ret: + g.log.error('OOM killer observed on %s for %s', key, process) + g.log.error(out) + oom_killer_list.append(True) + else: + oom_killer_list.append(False) + + +def check_for_oom_killers_on_servers(nodes): + """Check for OOM killers on servers + + Args: + nodes(list): Servers on which OOM kills have to be checked + + Returns: + bool: True if OOM killers are present on any server else False + """ + oom_killer_list = [] + for process in ('glusterfs', 'glusterfsd', 'glusterd'): + _check_for_oom_killers(nodes, process, oom_killer_list) + return any(oom_killer_list) + + +def check_for_oom_killers_on_clients(nodes): + """Check for OOM killers on clients + + Args: + nodes(list): Clients on which OOM kills have to be checked + + Returns: + bool: True if OOM killers are present on any client else false + """ + oom_killer_list = [] + _check_for_oom_killers(nodes, 'glusterfs', oom_killer_list) + return any(oom_killer_list) + + +def _check_for_cpu_usage_spikes(dataframe, node, process, threshold, + volume_status=None, volume=None, + vol_name=None): + """Check for cpu spikes for a given process + + Args: + dataframe(panda dataframe): Panda dataframe of a given process + node(str): Node on which cpu spikes has to be checked + process(str): Name of process for which check has to be done + threshold(int): Accepted amount of 100% CPU usage instances + + kwargs: + volume_status(dict): Volume status output on the give name + volume(str):Name of volume for which check has to be done + vol_name(str): Name of volume process according to volume status + + Returns: + bool: True if number of instances more than threshold else False + """ + # Filter dataframe to be process wise if it's volume specific process + if process in ('glusterfs', 'glusterfsd'): + pid = int(volume_status[volume][node][vol_name]['pid']) + dataframe = dataframe[dataframe['Process ID'] == pid] + + # Check if usage is more than accepted amount of leak + cpu_spike_decision_array = np.where( + dataframe['CPU Usage'].dropna() == 100.0, 
+
+
+def check_for_cpu_usage_spikes_on_glusterd(nodes, test_name, threshold=3):
+    """Check for CPU usage spikes on glusterd
+
+    Args:
+        nodes(list): Servers on which CPU usage spikes have to be checked
+        test_name(str): Name of testcase for which CPU usage has to be
+                        checked
+
+    Kwargs:
+        threshold(int): Accepted number of instances of 100% CPU usage
+                        (Default:3)
+
+    Returns:
+        bool: True if CPU spikes are more than threshold else False
+    """
+    is_there_a_spike = []
+    for node in nodes:
+        dataframe = create_dataframe_from_csv(node, 'glusterd', test_name)
+        if dataframe is None or dataframe.empty:
+            return False
+
+        # Call function to check for cpu spikes
+        cpu_spikes = _check_for_cpu_usage_spikes(
+            dataframe, node, 'glusterd', threshold)
+        if cpu_spikes:
+            g.log.error("CPU usage spikes observed more than "
+                        "threshold %d on node %s for glusterd",
+                        threshold, node)
+        is_there_a_spike.append(cpu_spikes)
+
+    return any(is_there_a_spike)
+
+
+def check_for_cpu_usage_spikes_on_glusterfs(nodes, test_name, threshold=3):
+    """Check for CPU usage spikes on glusterfs
+
+    Args:
+        nodes(list): Servers on which CPU usage spikes have to be checked
+        test_name(str): Name of testcase for which CPU usage has to be
+                        checked
+
+    Kwargs:
+        threshold(int): Accepted number of instances of 100% CPU usage
+                        (Default:3)
+
+    Returns:
+        bool: True if CPU spikes are more than threshold else False
+
+    NOTE:
+        This function should be executed with the volumes present on the
+        cluster.
+    """
+    is_there_a_spike = []
+    for node in nodes:
+        # Get the volume status on the node
+        volume_status = get_volume_status(node)
+        dataframe = create_dataframe_from_csv(node, 'glusterfs', test_name)
+        if dataframe is None or dataframe.empty:
+            return False
+
+        for volume in volume_status.keys():
+            for process in volume_status[volume][node].keys():
+                # Skip if the process isn't the Self-heal Daemon
+                if process != 'Self-heal Daemon':
+                    continue
+
+                # Call function to check for cpu spikes
+                cpu_spikes = _check_for_cpu_usage_spikes(
+                    dataframe, node, 'glusterfs', threshold, volume_status,
+                    volume, 'Self-heal Daemon')
+                if cpu_spikes:
+                    g.log.error("CPU usage spikes observed more than "
+                                "threshold %d on node %s on volume %s for "
+                                "shd", threshold, node, volume)
+                is_there_a_spike.append(cpu_spikes)
+
+    return any(is_there_a_spike)
+
+
+def check_for_cpu_usage_spikes_on_glusterfsd(nodes, test_name, threshold=3):
+    """Check for CPU usage spikes in glusterfsd
+
+    Args:
+        nodes(list): Servers on which CPU usage spikes have to be checked
+        test_name(str): Name of testcase for which CPU usage has to be
+                        checked
+
+    Kwargs:
+        threshold(int): Accepted number of instances of 100% CPU usage
+                        (Default:3)
+
+    Returns:
+        bool: True if CPU spikes are more than threshold else False
+
+    NOTE:
+        This function should be executed with the volumes present on the
+        cluster.
+    """
+    is_there_a_spike = []
+    for node in nodes:
+        # Get the volume status on the node
+        volume_status = get_volume_status(node)
+        dataframe = create_dataframe_from_csv(node, 'glusterfsd', test_name)
+        if dataframe is None or dataframe.empty:
+            return False
+
+        for volume in volume_status.keys():
+            for process in volume_status[volume][node].keys():
+                # Skip if the process isn't a brick process
+                if process in ('Self-heal Daemon', 'Quota Daemon'):
+                    continue
+
+                # Call function to check for cpu spikes
+                cpu_spikes = _check_for_cpu_usage_spikes(
+                    dataframe, node, 'glusterfsd', threshold, volume_status,
+                    volume, process)
+                if cpu_spikes:
+                    g.log.error("CPU usage spikes observed more than "
+                                "threshold %d on node %s on volume %s for "
+                                "brick process %s",
+                                threshold, node, volume, process)
+                is_there_a_spike.append(cpu_spikes)
+
+    return any(is_there_a_spike)
+ """ + is_there_a_spike = [] + for node in nodes: + # Get the volume status on the node + volume_status = get_volume_status(node) + dataframe = create_dataframe_from_csv(node, 'glusterfsd', test_name) + if dataframe.empty: + return False + + for volume in volume_status.keys(): + for process in volume_status[volume][node].keys(): + # Skiping if process isn't brick process + if process in ('Self-heal Daemon', 'Quota Daemon'): + continue + + # Call function to check for cpu spikes + cpu_spikes = _check_for_cpu_usage_spikes( + dataframe, node, 'glusterfsd', threshold, volume_status, + volume, process) + if cpu_spikes: + g.log.error("CPU usage spikes observed more than " + "threshold %d on node %s on volume %s for " + "brick process %s", + threshold, node, volume, process) + is_there_a_spike.append(cpu_spikes) + + return any(is_there_a_spike) + + +def check_for_cpu_usage_spikes_on_glusterfs_fuse(nodes, test_name, + threshold=3): + """Check for CPU usage spikes on glusterfs fuse + + Args: + nodes(list): Servers on which memory leaks have to be checked + test_name(str): Name of testcase for which memory leaks has to be checked + + Kwargs: + threshold(int): Accepted amount of instances of 100% CPU usage + (Default:3) + + Returns: + bool: True if CPU spikes are more than threshold else False + + NOTE: + This function should be executed when the volume is still mounted. + """ + is_there_a_spike = [] + for node in nodes: + # Get the volume status on the node + dataframe = create_dataframe_from_csv(node, 'glusterfs', test_name) + if dataframe.empty: + return False + + # Call function to check for cpu spikes + cpu_spikes = _check_for_cpu_usage_spikes( + dataframe, node, 'glusterfs', threshold) + if cpu_spikes: + g.log.error("CPU usage spikes observed more than " + "threshold %d on node %s for client", + threshold, node) + is_there_a_spike.append(cpu_spikes) + + return any(is_there_a_spike) diff --git a/glustolibs-io/glustolibs/io/utils.py b/glustolibs-io/glustolibs/io/utils.py index 52c2c7df0..16ee93f21 100755 --- a/glustolibs-io/glustolibs/io/utils.py +++ b/glustolibs-io/glustolibs/io/utils.py @@ -1,4 +1,4 @@ -# Copyright (C) 2015-2016 Red Hat, Inc. <http://www.redhat.com> +# Copyright (C) 2015-2020 Red Hat, Inc. <http://www.redhat.com> # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -17,20 +17,26 @@ """ Description: Helper library for io modules. """ +from multiprocessing import Pool import os import subprocess + from glusto.core import Glusto as g +from glustolibs.gluster.glusterfile import file_exists from glustolibs.gluster.mount_ops import GlusterMount -from multiprocessing import Pool from glustolibs.gluster.volume_libs import get_subvols +from glustolibs.misc.misc_libs import upload_scripts -def collect_mounts_arequal(mounts): +def collect_mounts_arequal(mounts, path=''): """Collects arequal from all the mounts Args: mounts (list): List of all GlusterMount objs. + Kwargs: + path (str): Path whose arequal is to be calculated. 
diff --git a/glustolibs-io/glustolibs/io/utils.py b/glustolibs-io/glustolibs/io/utils.py
index 52c2c7df0..16ee93f21 100755
--- a/glustolibs-io/glustolibs/io/utils.py
+++ b/glustolibs-io/glustolibs/io/utils.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2015-2016 Red Hat, Inc. <http://www.redhat.com>
+# Copyright (C) 2015-2020 Red Hat, Inc. <http://www.redhat.com>
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -17,20 +17,26 @@
 """
     Description: Helper library for io modules.
 """
+from multiprocessing import Pool
 import os
 import subprocess
+
 from glusto.core import Glusto as g
+from glustolibs.gluster.glusterfile import file_exists
 from glustolibs.gluster.mount_ops import GlusterMount
-from multiprocessing import Pool
 from glustolibs.gluster.volume_libs import get_subvols
+from glustolibs.misc.misc_libs import upload_scripts
 
 
-def collect_mounts_arequal(mounts):
+def collect_mounts_arequal(mounts, path=''):
     """Collects arequal from all the mounts
 
     Args:
         mounts (list): List of all GlusterMount objs.
 
+    Kwargs:
+        path (str): Path whose arequal is to be calculated.
+                    Defaults to root of mountpoint
+
     Returns:
         tuple(bool, list): On success returns
            (True, list of arequal-checksums of each mount)
@@ -45,11 +51,11 @@ def collect_mounts_arequal(mounts):
     g.log.info("Start collecting arequal-checksum from all mounts")
     all_mounts_procs = []
     for mount_obj in mounts:
+        total_path = os.path.join(mount_obj.mountpoint, path)
         g.log.info("arequal-checksum of mount %s:%s", mount_obj.client_system,
-                   mount_obj.mountpoint)
-        cmd = "arequal-checksum -p %s -i .trashcan" % mount_obj.mountpoint
-        proc = g.run_async(mount_obj.client_system, cmd,
-                           user=mount_obj.user)
+                   total_path)
+        cmd = "arequal-checksum -p %s -i .trashcan" % total_path
+        proc = g.run_async(mount_obj.client_system, cmd, user=mount_obj.user)
         all_mounts_procs.append(proc)
     all_mounts_arequal_checksums = []
     _rc = True
@@ -68,7 +74,7 @@ def collect_mounts_arequal(mounts):
 
 
 def log_mounts_info(mounts):
-    """Logs mount information like df, stat, ls
+    """Log mount information like df, stat, ls
 
     Args:
         mounts (list): List of all GlusterMount objs.
@@ -83,22 +89,22 @@ def log_mounts_info(mounts):
         # Mount Info
         g.log.info("Look For Mountpoint:\n")
         cmd = "mount | grep %s" % mount_obj.mountpoint
-        _, _, _ = g.run(mount_obj.client_system, cmd)
+        g.run(mount_obj.client_system, cmd)
 
         # Disk Space Usage
         g.log.info("Disk Space Usage Of Mountpoint:\n")
         cmd = "df -h %s" % mount_obj.mountpoint
-        _, _, _ = g.run(mount_obj.client_system, cmd)
+        g.run(mount_obj.client_system, cmd)
 
         # Long list the mountpoint
         g.log.info("List Mountpoint Entries:\n")
         cmd = "ls -ld %s" % mount_obj.mountpoint
-        _, _, _ = g.run(mount_obj.client_system, cmd)
+        g.run(mount_obj.client_system, cmd)
 
         # Stat mountpoint
         g.log.info("Mountpoint Status:\n")
         cmd = "stat %s" % mount_obj.mountpoint
-        _, _, _ = g.run(mount_obj.client_system, cmd)
+        g.run(mount_obj.client_system, cmd)
 
 
 def get_mounts_stat(mounts):
@@ -119,9 +125,8 @@ def get_mounts_stat(mounts):
     for mount_obj in mounts:
         g.log.info("Stat of mount %s:%s", mount_obj.client_system,
                    mount_obj.mountpoint)
-        cmd = ("find %s | xargs stat" % (mount_obj.mountpoint))
-        proc = g.run_async(mount_obj.client_system, cmd,
-                           user=mount_obj.user)
+        cmd = "find %s | xargs stat" % (mount_obj.mountpoint)
+        proc = g.run_async(mount_obj.client_system, cmd, user=mount_obj.user)
         all_mounts_procs.append(proc)
     _rc = True
     for i, proc in enumerate(all_mounts_procs):
@@ -157,7 +162,7 @@ def list_all_files_and_dirs_mounts(mounts):
     for mount_obj in mounts:
         g.log.info("Listing files and dirs on %s:%s",
                    mount_obj.client_system, mount_obj.mountpoint)
-        cmd = ("find %s | grep -ve '%s'" % (mount_obj.mountpoint, ignore_dirs))
+        cmd = "find %s | grep -ve '%s'" % (mount_obj.mountpoint, ignore_dirs)
         proc = g.run_async(mount_obj.client_system, cmd, user=mount_obj.user)
         all_mounts_procs.append(proc)
     _rc = True
@@ -194,7 +199,7 @@ def view_snaps_from_mount(mounts, snaps):
     for mount_obj in mounts:
         g.log.info("Viewing '.snaps' on %s:%s",
                    mount_obj.client_system, mount_obj.mountpoint)
-        cmd = ("ls -1 %s/.snaps" % mount_obj.mountpoint)
+        cmd = "ls -1 %s/.snaps" % mount_obj.mountpoint
         proc = g.run_async(mount_obj.client_system, cmd,
                            user=mount_obj.user)
@@ -229,7 +234,7 @@ def view_snaps_from_mount(mounts, snaps):
 
 
 def validate_io_procs(all_mounts_procs, mounts):
-    """Validates whether IO was successful or not
+    """Validate whether IO was successful or not.
 
     Args:
         all_mounts_procs (list): List of open connection descriptor as
@@ -316,19 +321,16 @@ def cleanup_mounts(mounts):
     for mount_obj in mounts:
         g.log.info("Cleaning up data from %s:%s", mount_obj.client_system,
                    mount_obj.mountpoint)
-        if (not mount_obj.mountpoint or
-                (os.path.realpath(os.path.abspath(mount_obj.mountpoint))
-                 == '/')):
+        if (not mount_obj.mountpoint or (os.path.realpath(os.path.abspath(
+                mount_obj.mountpoint)) == '/')):
             g.log.error("%s on %s is not a valid mount point",
                         mount_obj.mountpoint, mount_obj.client_system)
             continue
         cmd = "rm -rf %s/*" % (mount_obj.mountpoint)
-        proc = g.run_async(mount_obj.client_system, cmd,
-                           user=mount_obj.user)
+        proc = g.run_async(mount_obj.client_system, cmd, user=mount_obj.user)
         all_mounts_procs.append(proc)
         valid_mounts.append(mount_obj)
-    g.log.info("rm -rf on all clients is complete. Validating "
-               "deletion now...")
+    g.log.info("rm -rf on all clients is complete. Validating deletion now...")
 
     # Get cleanup status
     _rc_rmdir = True
@@ -355,8 +357,7 @@ def cleanup_mounts(mounts):
     for mount_obj in mounts:
         cmd = ("find %s -mindepth 1 | grep -ve '%s'" %
                (mount_obj.mountpoint, ignore_dirs))
-        proc = g.run_async(mount_obj.client_system, cmd,
-                           user=mount_obj.user)
+        proc = g.run_async(mount_obj.client_system, cmd, user=mount_obj.user)
         all_mounts_procs.append(proc)
 
     # Get cleanup status
@@ -383,8 +384,7 @@ def cleanup_mounts(mounts):
 
 
 def run_bonnie(servers, directory_to_run, username="root"):
-    """
-    Module to run bonnie test suite on the given servers.
+    """Run bonnie test suite on the given servers.
 
     Args:
         servers (list): servers in which tests to be run.
@@ -459,8 +459,7 @@ def run_bonnie(servers, directory_to_run, username="root"):
 
 
 def run_fio(servers, directory_to_run):
-    """
-    Module to run fio test suite on the given servers.
+    """Run fio test suite on the given servers.
 
    Args:
         servers (list): servers in which tests to be run.
@@ -536,15 +535,14 @@ def run_fio(servers, directory_to_run):
 
 
 def run_mixed_io(servers, io_tools, directory_to_run):
-    """
-    Module to run different io patterns on each given servers.
+    """Run different io patterns on each given servers.
 
     Args:
         servers (list): servers in which tests to be run.
         io_tools (list): different io tools. Currently fio, bonnie are
-        supported.
+                         supported.
         directory_to_run (list): directory path where tests will run for
-        each server.
+                                 each server.
 
     Returns:
         bool: True, if test passes in all servers, False otherwise
@@ -565,8 +563,7 @@ def run_mixed_io(servers, io_tools, directory_to_run):
     for items in zip(servers, io_tools):
         server_io_dict[items[0]] = items[1]
 
-    io_dict = {'fio': run_fio,
-               'bonnie': run_bonnie}
+    io_dict = {'fio': run_fio, 'bonnie': run_bonnie}
 
     func_list = []
     for index, server in enumerate(servers):
@@ -586,8 +583,7 @@ def run_mixed_io(servers, io_tools, directory_to_run):
 
 
 def is_io_procs_fail_with_rofs(self, all_mounts_procs, mounts):
-    """
-    Checks whether IO failed with Read-only file system error
+    """Check whether IO failed with Read-only file system error.
 
     Args:
         all_mounts_procs (list): List of open connection descriptor as
@@ -619,8 +615,8 @@ def is_io_procs_fail_with_rofs(self, all_mounts_procs, mounts):
             g.log.info("EXPECTED : IO Failed on %s:%s",
                        self.mounts[i].client_system,
                        self.mounts[i].mountpoint)
-            if ("Read-only file system" in err or
-                    "Read-only file system" in out):
+            if ("Read-only file system" in err
+                    or "Read-only file system" in out):
                 g.log.info("EXPECTED : Read-only file system in output")
                 io_results[proc] = True
             else:
@@ -637,8 +633,7 @@ def is_io_procs_fail_with_rofs(self, all_mounts_procs, mounts):
 
 
 def is_io_procs_fail_with_error(self, all_mounts_procs, mounts, mount_type):
-    """
-    Checks whether IO failed with connection error
+    """Check whether IO failed with connection error.
 
     Args:
         all_mounts_procs (list): List of open connection descriptor as
@@ -672,8 +667,8 @@ def is_io_procs_fail_with_error(self, all_mounts_procs, mounts, mount_type):
                        self.mounts[i].client_system,
                        self.mounts[i].mountpoint)
             if mount_type == "glusterfs":
-                if ("Transport endpoint is not connected" in err or
-                        "Transport endpoint is not connected" in out):
+                if ("Transport endpoint is not connected" in err
+                        or "Transport endpoint is not connected" in out):
                     g.log.info("EXPECTED : Transport endpoint is not connected"
                                " in output")
                     io_results[proc] = True
@@ -683,8 +678,7 @@ def is_io_procs_fail_with_error(self, all_mounts_procs, mounts, mount_type):
                                 "not found in output")
                     io_results[proc] = False
             if mount_type == "nfs":
-                if ("Input/output error" in err or
-                        "Input/output error" in out):
+                if "Input/output error" in err or "Input/output error" in out:
                     g.log.info("EXPECTED : Input/output error in output")
                     io_results[proc] = True
                 else:
@@ -702,8 +696,7 @@ def is_io_procs_fail_with_error(self, all_mounts_procs, mounts, mount_type):
 
 
 def compare_dir_structure_mount_with_brick(mnthost, mntloc, brick_list, type):
-    """ Compare directory structure from mount point with brick path along
-        with stat parameter
+    """Compare mount point dir structure with brick path along with stat
+    params.
 
     Args:
         mnthost (str): hostname or ip of mnt system
@@ -725,8 +718,8 @@ def compare_dir_structure_mount_with_brick(mnthost, mntloc, brick_list, type):
     if type == 2:
         statformat = '%A'
 
-    command = ("find %s -mindepth 1 -type d | xargs -r stat -c '%s'"
-               % (mntloc, statformat))
+    command = "find %s -mindepth 1 -type d | xargs -r stat -c '%s'" % (
+        mntloc, statformat)
     rcode, rout, _ = g.run(mnthost, command)
     all_dir_mnt_perm = rout.strip().split('\n')
 
@@ -736,7 +729,8 @@ def compare_dir_structure_mount_with_brick(mnthost, mntloc, brick_list, type):
                    "xargs -r stat -c '%s'" % (brick_path, statformat))
         rcode, rout, _ = g.run(brick_node, command)
         all_brick_dir_perm = rout.strip().split('\n')
-        retval = cmp(all_dir_mnt_perm, all_brick_dir_perm)
+        retval = (all_dir_mnt_perm > all_brick_dir_perm) - (
+            all_dir_mnt_perm < all_brick_dir_perm)
         if retval != 0:
             return False
 
@@ -769,8 +763,7 @@ def check_arequal_bricks_replicated(mnode, volname):
         subvol_brick_list = subvols_dict['volume_subvols'][i]
         node, brick_path = subvol_brick_list[0].split(':')
         command = ('arequal-checksum -p %s '
-                   '-i .glusterfs -i .landfill -i .trashcan'
-                   % brick_path)
+                   '-i .glusterfs -i .landfill -i .trashcan' % brick_path)
         ret, arequal, _ = g.run(node, command)
         if ret != 0:
             g.log.error("Failed to calculate arequal for first brick"
@@ -782,21 +775,297 @@ def check_arequal_bricks_replicated(mnode, volname):
         for brick in subvol_brick_list[1:]:
             node, brick_path = brick.split(':')
             command = ('arequal-checksum -p %s '
-                       '-i .glusterfs -i .landfill -i .trashcan'
-                       % brick_path)
+                       '-i .glusterfs -i .landfill -i .trashcan' % brick_path)
             ret, brick_arequal, _ = g.run(node, command)
             if ret != 0:
-                g.log.error('Failed to get arequal on brick %s'
-                            % brick)
+                g.log.error('Failed to get arequal on brick %s' % brick)
                 return False
             g.log.info('Getting arequal for %s is successful', brick)
             brick_total = brick_arequal.splitlines()[-1].split(':')[-1]
             # compare arequal of first brick of subvol with all brick other
             # bricks in subvol
             if first_brick_total != brick_total:
-                g.log.error('Arequals for subvol and %s are not equal'
-                            % brick)
+                g.log.error('Arequals for subvol and %s are not equal' % brick)
                 return False
             g.log.info('Arequals for subvol and %s are equal', brick)
     g.log.info('All arequals are equal for volume %s', volname)
     return True
+
+
+def run_crefi(client, mountpoint, number, breadth, depth, thread=5,
+              random_size=False, fop='create', filetype='text',
+              minfs=10, maxfs=500, single=False, multi=False, size=100,
+              interval=100, nameBytes=10, random_filename=True):
+    """Run crefi on a given mount point and generate I/O.
+
+    Args:
+        client(str): Client on which I/O has to be performed.
+        mountpoint(str): Mount point where the client is mounted.
+        number(int): Number of files to be created.
+        breadth(int): Number of directories in one level.
+        depth(int): Number of levels of directories.
+
+    Kwargs:
+        thread(int): Number of threads used to generate fop.
+        random_size(bool): Random size of the file between min and max.
+        fop(str): fop can be [create|rename|chmod|chown|chgrp|symlink|
+                  hardlink|truncate|setxattr]; this specifies the type of
+                  fop to be executed, by default it is create.
+        filetype(str): filetype can be [text|sparse|binary|tar]; this
+                       specifies the type of file, by default it is text.
+        minfs(int): If random_size is set to True then this value has to be
+                    altered to change the minimum file size. (Value is in KB)
+        maxfs(int): If random_size is set to True then this value has to be
+                    altered to change the maximum file size. (Value is in KB)
+        single(bool): Create files in a single directory.
+        multi(bool): Create files in sub-dir and sub-sub dir.
+        size(int): Size of the files to be created. (Value is in KB)
+        interval(int): Interval at which the number of files created is
+                       printed.
+        nameBytes(int): Number of bytes for filename. (Value is in Bytes)
+        random_filename(bool): It creates files with random names, if set to
+                               False it creates files with file name file1,
+                               file2 and so on.
+
+    Returns:
+        bool: True if I/O was successful, otherwise False.
+
+    NOTE:
+        To use this function it is a prerequisite to have crefi installed
+        on all the clients. Please use the below commands to install it:
+            $ pip install crefi
+            $ pip install pyxattr
+    """
+    # Checking value of fop.
+    list_of_fops = ["create", "rename", "chmod", "chown", "chgrp", "symlink",
+                    "hardlink", "truncate", "setxattr"]
+    if fop not in list_of_fops:
+        g.log.error("fop value is not valid.")
+        return False
+
+    # Checking value of filetype.
+    list_of_filetypes = ["text", "sparse", "binary", "tar"]
+    if filetype not in list_of_filetypes:
+        g.log.error("filetype is not a valid file type.")
+        return False
+
+    # Checking if single and multi both are set to true.
+    if single and multi:
+        g.log.error("single and multi both can't be true.")
+        return False
+
+    # Checking if file size and random size arguments are given together.
+    if (size > 100 or size < 100) and random_size:
+        g.log.error("Size and random size can't be used together.")
+        return False
+
+    # Checking if minfs is greater than or equal to maxfs.
+    if random_size and (minfs >= maxfs):
+        g.log.error("minfs shouldn't be greater than or equal to maxfs.")
+        return False
+
+    # Creating the basic command.
+    command = "crefi %s -n %s -b %s -d %s " % (
+        mountpoint, number, breadth, depth)
+
+    # Checking thread value and adding it if it isn't the default of 5.
+    if thread > 5 or thread < 5:
+        command = command + ("-T %s " % thread)
+
+    # Checking if random size is true or false.
+    if random_size:
+        command = command + "--random "
+        if minfs > 10 or minfs < 10:
+            command = command + ("--min %s " % minfs)
+        if maxfs > 500 or maxfs < 500:
+            command = command + ("--max %s " % maxfs)
+
+    # Checking fop and adding it if not create.
+    if fop != "create":
+        command = command + ("--fop %s " % fop)
+
+    # Checking if size is greater than or less than 100.
+    if size > 100 or size < 100:
+        command = command + ("--size %s " % size)
+
+    # Checking if single or multi is true.
+    if single:
+        command = command + "--single "
+    if multi:
+        command = command + "--multi "
+
+    # Checking if random_filename is false.
+    if not random_filename:
+        command = command + "-R "
+
+    # Checking if print interval is greater than or less than 100.
+    if interval > 100 or interval < 100:
+        command = command + ("-I %s " % interval)
+
+    # Checking if nameBytes is greater than or less than 10.
+    if nameBytes > 10 or nameBytes < 10:
+        command = command + ("-l %s " % nameBytes)
+
+    # Checking filetype and setting it if not text.
+    if filetype != "text":
+        command = command + ("-t %s " % filetype)
+
+    # Running the command on the client node.
+    ret, _, _ = g.run(client, command)
+    if ret:
+        g.log.error("Failed to run crefi on %s." % client)
+        return False
+    return True
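A minimal run_crefi sketch, creating 100 text files across a 5-wide, 3-deep directory tree (client name and mount path illustrative):

    if not run_crefi('client1.example.com', '/mnt/testvol', 100, 5, 3):
        g.log.error("crefi failed to generate I/O")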
+
+
+def run_cthon(mnode, volname, clients, dir_name):
+    """Run the cthon test suite.
+
+    Args:
+        mnode (str) : IP of the server exporting the gluster volume.
+        volname (str) : The volume name.
+        clients (list) : List of client machines where
+                         the test needs to be run.
+        dir_name (str) : Directory where the repo
+                         is cloned.
+
+    Returns:
+        bool : True if the cthon test passes successfully,
+               False otherwise.
+    """
+    param_list = ['-b', '-g', '-s', '-l']
+    vers_list = ['4.0', '4.1']
+
+    for client in clients:
+        g.log.info("Running tests on client %s" % client)
+        for vers in vers_list:
+            g.log.info("Running tests with NFS version %s" % vers)
+            for param in param_list:
+                # Initialising the test_type that will be run
+                if param == '-b':
+                    test_type = "Basic"
+                elif param == '-g':
+                    test_type = "General"
+                elif param == '-s':
+                    test_type = "Special"
+                else:
+                    test_type = "Lock"
+                g.log.info("Running %s test" % test_type)
+                cmd = "cd /root/%s; ./server %s -o vers=%s -p %s -N 1 %s;" % (
+                    dir_name, param, vers, volname, mnode)
+                ret, _, _ = g.run(client, cmd)
+                if ret:
+                    g.log.error("Error with %s test" % test_type)
+                    return False
+                else:
+                    g.log.info("%s test successfully passed" % test_type)
+    return True
+
+
+def upload_file_dir_ops(clients):
+    """Upload file_dir_ops.py to all the clients.
+
+    Args:
+        clients(list): List of client machines where we need to upload
+                       the script.
+
+    Returns:
+        bool: True if the script is uploaded successfully,
+              False otherwise.
+    """
+    g.log.info("Upload io scripts to clients %s for running IO on "
+               "mounts", clients)
+    file_dir_ops_path = ("/usr/share/glustolibs/io/scripts/"
+                         "file_dir_ops.py")
+
+    if not upload_scripts(clients, file_dir_ops_path):
+        g.log.error("Failed to upload IO scripts to clients %s" %
+                    clients)
+        return False
+
+    g.log.info("Successfully uploaded IO scripts to clients %s",
+               clients)
+    return True
+
+
+def open_file_fd(mountpoint, time, client, start_range=0,
+                 end_range=0):
+    """Open an FD for a file and write to the file.
+
+    Args:
+        mountpoint(str): The mount point where the FD of the file is to
+                         be opened.
+        time(int): The time to wait after opening an FD.
+        client(str): The client from which the FD is to be opened.
+
+    Kwargs:
+        start_range(int): The start range of the open FD.
+                          (Default: 0)
+        end_range(int): The end range of the open FD.
+                        (Default: 0)
+
+    Returns:
+        proc(object): Returns a process object
+
+    NOTE:
+        Before opening an FD, check the currently used fds on the
+        system, as only a limited number of fds can be opened on
+        a system at a given time for each process.
+    """
+    if not (start_range and end_range):
+        cmd = ("cd {}; exec 30<> file_openfd ; sleep {};"
+               "echo 'xyz' >&30".format(mountpoint, time))
+    else:
+        cmd = ('cd {}; for i in `seq {} {}`;'
+               ' do eval "exec $i<>file_openfd$i"; sleep {};'
+               ' echo "Write to open FD" >&$i; done'.format(
+                   mountpoint, start_range, end_range, time))
+    proc = g.run_async(client, cmd)
+    return proc
+
+
+def run_linux_untar(clients, mountpoint, dirs=('.',)):
+    """Run linux kernel untar on a given mount point
+
+    Args:
+        clients(str|list): Client nodes on which I/O
+                           has to be started.
+        mountpoint(str): Mount point where the volume is
+                         mounted.
+
+    Kwargs:
+        dirs(tuple): A tuple of dirs where untar has to
+                     be started. (Default:('.',))
+
+    Returns:
+        list: Returns a list of process objects else None
+    """
+    # Checking and converting clients to a list.
+    if not isinstance(clients, list):
+        clients = [clients]
+
+    list_of_procs = []
+    for client in clients:
+        # Download the linux tarball to /root, so that it can be
+        # reused in subsequent run_linux_untar() calls.
+ cmd = ("wget https://cdn.kernel.org/pub/linux/kernel/" + "v5.x/linux-5.4.54.tar.xz") + if not file_exists(client, '/root/linux-5.4.54.tar.xz'): + ret, _, _ = g.run(client, cmd) + if ret: + return None + + for directory in dirs: + # copy linux tar to dir + cmd = ("cp /root/linux-5.4.54.tar.xz {}/{}" + .format(mountpoint, directory)) + ret, _, _ = g.run(client, cmd) + if ret: + return None + # Start linux untar + cmd = ("cd {}/{};tar -xvf linux-5.4.54.tar.xz" + .format(mountpoint, directory)) + proc = g.run_async(client, cmd) + list_of_procs.append(proc) + + return list_of_procs |