3 files changed, 401 insertions, 43 deletions
diff --git a/tests/functional/glusterd/test_glusterd_memory_consumption_increase.py b/tests/functional/glusterd/test_glusterd_memory_consumption_increase.py
new file mode 100644
index 000000000..92c48da6f
--- /dev/null
+++ b/tests/functional/glusterd/test_glusterd_memory_consumption_increase.py
@@ -0,0 +1,207 @@
+#  Copyright (C) 2021 Red Hat, Inc. <http://www.redhat.com>
+#
+#  This program is free software; you can redistribute it and/or modify
+#  it under the terms of the GNU General Public License as published by
+#  the Free Software Foundation; either version 2 of the License, or
+#  any later version.
+#
+#  This program is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU General Public License for more details.
+#
+#  You should have received a copy of the GNU General Public License along
+#  with this program; if not, write to the Free Software Foundation, Inc.,
+#  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+""" Description:
+      Increase in glusterd memory consumption on repetitive operations
+      for 100 volumes
+"""
+
+from glusto.core import Glusto as g
+from glustolibs.gluster.exceptions import ExecutionError
+from glustolibs.gluster.gluster_base_class import GlusterBaseClass
+from glustolibs.gluster.volume_ops import (volume_stop, volume_delete,
+                                           get_volume_list,
+                                           volume_start)
+from glustolibs.gluster.gluster_init import (restart_glusterd,
+                                             wait_for_glusterd_to_start)
+from glustolibs.gluster.volume_libs import (bulk_volume_creation,
+                                            cleanup_volume)
+from glustolibs.gluster.volume_ops import set_volume_options
+
+
+class TestGlusterMemoryConsumptionIncrease(GlusterBaseClass):
+    def tearDown(self):
+        """Remove leftover volumes and disable brick-multiplex."""
+        # volume_present is unset if the test failed before creating volumes
+        if getattr(self, 'volume_present', False):
+            vol_list = get_volume_list(self.mnode)
+            if vol_list is None:
+                raise ExecutionError("Failed to get the volume list")
+
+            for volume in vol_list:
+                ret = cleanup_volume(self.mnode, volume)
+                if not ret:
+                    raise ExecutionError("Unable to delete volume %s" % volume)
+                g.log.info("Volume deleted successfully : %s", volume)
+
+        # Disable multiplex
+        ret = set_volume_options(self.mnode, 'all',
+                                 {'cluster.brick-multiplex': 'disable'})
+        self.assertTrue(ret, "Failed to disable brick-multiplex"
+                        " for the cluster")
+
+        self.get_super_method(self, 'tearDown')()
+
+    def _volume_operations_in_loop(self):
+        """Create 100 volumes in bulk, then start, stop and delete each one.
+
+        Sets self.volume_present while the volumes exist so that tearDown
+        knows whether any cleanup is needed.
+        """
+        # NOTE(review): assumes bulk_volume_creation names volumes volume-<n>
+        vol_names = ["volume-%d" % num for num in range(100)]
+        self.volume_config = {
+            'name': 'volume-',
+            'servers': self.servers,
+            'voltype': {'type': 'distributed-replicated',
+                        'dist_count': 2,
+                        'replica_count': 3},
+        }
+
+        created = bulk_volume_creation(self.mnode, 100, self.all_servers_info,
+                                       self.volume_config, "", False, True)
+        self.assertTrue(created, "Failed to create volumes")
+
+        self.volume_present = True
+        g.log.info("Successfully created all the volumes")
+
+        # Start every volume
+        for name in vol_names:
+            self.volname = name
+            rcode, _, _ = volume_start(self.mnode, self.volname)
+            self.assertEqual(rcode, 0, "Failed to start volume: %s"
+                             % self.volname)
+        g.log.info("Successfully started all the volumes")
+
+        # Stop every volume
+        for name in vol_names:
+            self.volname = name
+            rcode, _, _ = volume_stop(self.mnode, self.volname)
+            self.assertEqual(rcode, 0, "Failed to stop volume: %s"
+                             % self.volname)
+        g.log.info("Successfully stopped all the volumes")
+
+        # Delete every volume
+        for name in vol_names:
+            self.volname = name
+            deleted = volume_delete(self.mnode, self.volname)
+            self.assertTrue(deleted, "Failed to delete volume: %s"
+                            % self.volname)
+        self.volume_present = False
+        g.log.info("Successfully deleted all the volumes")
+
+    def _memory_consumption_for_all_nodes(self, pid_list):
+        """Return glusterd's resident memory, in MB, for every server.
+
+        pid_list holds one glusterd pid per entry of self.servers, in order.
+        """
+        memory_consumed_list = []
+        for server, pid in zip(self.servers, pid_list):
+            # 'ps -o rss=' prints only the RSS in KiB: no header rows to
+            # count and no unit suffixes, unlike parsing 'top' output
+            ret, mem, _ = g.run(server, "ps -o rss= -p %d" % pid)
+            self.assertEqual(ret, 0, "Failed to get the memory usage of"
+                             " glusterd process")
+            memory_consumed_list.append(int(mem) // 1024)
+
+        return memory_consumed_list
+
+    def test_glusterd_memory_consumption_increase(self):
+        """
+        Test Case:
+        1) Enable brick-multiplex and set max-bricks-per-process to 3 in
+           the cluster
+        2) Get the glusterd memory consumption
+        3) Perform create,start,stop,delete operation for 100 volumes
+        4) Check glusterd memory consumption, it should not increase by
+           more than 50MB
+        5) Repeat step 3 two more times
+        6) After each repetition glusterd memory consumption should not
+           increase by more than 10MB
+        """
+        # pylint: disable=too-many-locals
+        # Restarting glusterd to refresh its memory consumption
+        ret = restart_glusterd(self.servers)
+        self.assertTrue(ret, "Restarting glusterd failed")
+
+        # Check that glusterd is running again after the restart
+        ret = wait_for_glusterd_to_start(self.servers)
+        self.assertTrue(ret, "Glusterd service is not running post restart")
+
+        # Enable brick-multiplex, set max-bricks-per-process to 3 in cluster
+        for key, value in (('cluster.brick-multiplex', 'enable'),
+                           ('cluster.max-bricks-per-process', '3')):
+            ret = set_volume_options(self.mnode, 'all', {key: value})
+            self.assertTrue(ret, "Failed to set {} to {}"
+                            " for the cluster".format(key, value))
+
+        # Get the pid of the glusterd process on every node
+        pid_list = []
+        for server in self.servers:
+            # int() tolerates the trailing newline of the command output
+            cmd = "pidof glusterd"
+            ret, pid, _ = g.run(server, cmd)
+            self.assertEqual(ret, 0, "Failed to get the pid of glusterd")
+            pid = int(pid)
+            pid_list.append(pid)
+
+        # Fetch the list of memory consumed in all the nodes
+        mem_consumed_list = self._memory_consumption_for_all_nodes(pid_list)
+
+        # Perform volume operations for 100 volumes for first time
+        self._volume_operations_in_loop()
+
+        # Fetch the list of memory consumed in all the nodes after 1 iteration
+        mem_consumed_list_1 = self._memory_consumption_for_all_nodes(pid_list)
+
+        # The first cycle may grow glusterd by at most 50MB on any node
+        for node, before, after in zip(self.servers, mem_consumed_list,
+                                       mem_consumed_list_1):
+            self.assertLessEqual(
+                after - before, 50,
+                "Unexpected: Memory consumption of glusterd on %s "
+                "increased by %dMB, more than the expected 50MB"
+                % (node, after - before))
+
+        # Perform volume operations for 100 volumes for second time
+        self._volume_operations_in_loop()
+
+        # Fetch the list of memory consumed in all the nodes after 2 iterations
+        mem_consumed_list_2 = self._memory_consumption_for_all_nodes(pid_list)
+
+        # Later cycles may grow glusterd by at most 10MB on any node
+        for node, before, after in zip(self.servers, mem_consumed_list_1,
+                                       mem_consumed_list_2):
+            self.assertLessEqual(
+                after - before, 10,
+                "Unexpected: Memory consumption of glusterd on %s "
+                "increased by %dMB, more than the expected 10MB"
+                % (node, after - before))
+
+        # Perform volume operations for 100 volumes for third time
+        self._volume_operations_in_loop()
+
+        # Fetch the list of memory consumed in all the nodes after 3 iterations
+        mem_consumed_list_3 = self._memory_consumption_for_all_nodes(pid_list)
+
+        # Later cycles may grow glusterd by at most 10MB on any node
+        for node, before, after in zip(self.servers, mem_consumed_list_2,
+                                       mem_consumed_list_3):
+            self.assertLessEqual(
+                after - before, 10,
+                "Unexpected: Memory consumption of glusterd on %s "
+                "increased by %dMB, more than the expected 10MB"
+                % (node, after - before))
diff --git a/tests/functional/glusterd/test_probe_glusterd_down.py b/tests/functional/glusterd/test_probe_glusterd_down.py
index 3705904a9..c851bf104 100644
--- a/tests/functional/glusterd/test_probe_glusterd_down.py
+++ b/tests/functional/glusterd/test_probe_glusterd_down.py
@@ -1,4 +1,4 @@
-#  Copyright (C) 2020  Red Hat, Inc. <http://www.redhat.com>
+#  Copyright (C) 2020-2021 Red Hat, Inc. <http://www.redhat.com>
 #
 #  This program is free software; you can redistribute it and/or modify
 #  it under the terms of the GNU General Public License as published by
@@ -14,17 +14,14 @@
 #  with this program; if not, write to the Free Software Foundation, Inc.,
 #  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 
-from time import sleep
-
 from glusto.core import Glusto as g
 from glustolibs.gluster.gluster_base_class import GlusterBaseClass
 from glustolibs.gluster.exceptions import ExecutionError
 from glustolibs.gluster.peer_ops import peer_probe
 from glustolibs.gluster.lib_utils import is_core_file_created
 from glustolibs.gluster.peer_ops import peer_detach, is_peer_connected
-from glustolibs.gluster.gluster_init import (stop_glusterd, start_glusterd,
-                                             wait_for_glusterd_to_start)
-from glustolibs.misc.misc_libs import are_nodes_online
+from glustolibs.gluster.gluster_init import stop_glusterd, start_glusterd
+from glustolibs.misc.misc_libs import bring_down_network_interface
 
 
 class PeerProbeWhenGlusterdDown(GlusterBaseClass):
@@ -57,7 +54,7 @@ class PeerProbeWhenGlusterdDown(GlusterBaseClass):
         ret, test_timestamp, _ = g.run_local('date +%s')
         test_timestamp = test_timestamp.strip()
 
-        # detach one of the nodes which is part of the cluster
+        # Detach one of the nodes which is part of the cluster
         g.log.info("detaching server %s ", self.servers[1])
         ret, _, err = peer_detach(self.mnode, self.servers[1])
         msg = 'peer detach: failed: %s is not part of cluster\n' \
@@ -66,12 +63,12 @@ class PeerProbeWhenGlusterdDown(GlusterBaseClass):
             self.assertEqual(err, msg, "Failed to detach %s "
                              % (self.servers[1]))
 
-        # bring down glusterd of the server which has been detached
+        # Bring down glusterd of the server which has been detached
         g.log.info("Stopping glusterd on %s ", self.servers[1])
         ret = stop_glusterd(self.servers[1])
         self.assertTrue(ret, "Fail to stop glusterd on %s " % self.servers[1])
 
-        # trying to peer probe the node whose glusterd was stopped using its IP
+        # Trying to peer probe the node whose glusterd was stopped using IP
         g.log.info("Peer probing %s when glusterd down ", self.servers[1])
         ret, _, err = peer_probe(self.mnode, self.servers[1])
         self.assertNotEqual(ret, 0, "Peer probe should not pass when "
@@ -79,7 +76,7 @@ class PeerProbeWhenGlusterdDown(GlusterBaseClass):
         self.assertEqual(err, "peer probe: failed: Probe returned with "
                               "Transport endpoint is not connected\n")
 
-        # trying to peer probe the same node with hostname
+        # Trying to peer probe the same node with hostname
         g.log.info("Peer probing node %s using hostname with glusterd down ",
                    self.servers[1])
         hostname = g.run(self.servers[1], "hostname")
@@ -89,27 +86,24 @@ class PeerProbeWhenGlusterdDown(GlusterBaseClass):
         self.assertEqual(err, "peer probe: failed: Probe returned with"
                               " Transport endpoint is not connected\n")
 
-        # start glusterd again for the next set of test steps
+        # Start glusterd again for the next set of test steps
         g.log.info("starting glusterd on %s ", self.servers[1])
         ret = start_glusterd(self.servers[1])
         self.assertTrue(ret, "glusterd couldn't start successfully on %s"
                         % self.servers[1])
 
-        # reboot a server and then trying to peer probe at the time of reboot
-        g.log.info("Rebooting %s and checking peer probe", self.servers[1])
-        reboot = g.run_async(self.servers[1], "reboot")
-
-        # Mandatory sleep for 3 seconds to make sure node is in halted state
-        sleep(3)
+        # Bring down the network for some time
+        network_status = bring_down_network_interface(self.servers[1], 150)
 
         # Peer probing the node using IP when it is still not online
-        g.log.info("Peer probing node %s which has been issued a reboot ",
+        g.log.info("Peer probing node %s when network is down",
                    self.servers[1])
         ret, _, err = peer_probe(self.mnode, self.servers[1])
         self.assertNotEqual(ret, 0, "Peer probe passed when it was expected to"
                                     " fail")
-        self.assertEqual(err, "peer probe: failed: Probe returned with "
-                              "Transport endpoint is not connected\n")
+        self.assertEqual(err.split("\n")[0], "peer probe: failed: Probe "
+                                             "returned with Transport endpoint"
+                                             " is not connected")
 
         # Peer probing the node using hostname when it is still not online
         g.log.info("Peer probing node %s using hostname which is still "
@@ -118,35 +112,21 @@ class PeerProbeWhenGlusterdDown(GlusterBaseClass):
         ret, _, err = peer_probe(self.mnode, hostname[1].strip())
         self.assertNotEqual(ret, 0, "Peer probe should not pass when node "
                                     "has not come online")
-        self.assertEqual(err, "peer probe: failed: Probe returned with "
-                              "Transport endpoint is not connected\n")
+        self.assertEqual(err.split("\n")[0], "peer probe: failed: Probe "
+                                             "returned with Transport endpoint"
+                                             " is not connected")
+
+        ret, _, _ = network_status.async_communicate()
+        if ret != 0:
+            g.log.error("Failed to perform network interface ops")
 
-        ret, _, _ = reboot.async_communicate()
-        self.assertEqual(ret, 255, "reboot failed")
-
-        # Validate if rebooted node is online or not
-        count = 0
-        while count < 40:
-            sleep(15)
-            ret, _ = are_nodes_online(self.servers[1])
-            if ret:
-                g.log.info("Node %s is online", self.servers[1])
-                break
-            count += 1
-        self.assertTrue(ret, "Node in test not yet online")
-
-        # check if glusterd is running post reboot
-        ret = wait_for_glusterd_to_start(self.servers[1],
-                                         glusterd_start_wait_timeout=120)
-        self.assertTrue(ret, "Glusterd service is not running post reboot")
-
-        # peer probe the node must pass
+        # Peer probe the node must pass
         g.log.info("peer probing node %s", self.servers[1])
         ret, _, err = peer_probe(self.mnode, self.servers[1])
         self.assertEqual(ret, 0, "Peer probe has failed unexpectedly with "
                                  "%s " % err)
 
-        # checking if core file created in "/", "/tmp" and "/var/log/core"
+        # Checking if core file created in "/", "/tmp" and "/var/log/core"
         ret = is_core_file_created(self.servers, test_timestamp)
         self.assertTrue(ret, "core file found")
 
diff --git a/tests/functional/glusterd/test_verify_df_output.py b/tests/functional/glusterd/test_verify_df_output.py
new file mode 100644
index 000000000..4eac9193b
--- /dev/null
+++ b/tests/functional/glusterd/test_verify_df_output.py
@@ -0,0 +1,171 @@
+#  Copyright (C) 2021 Red Hat, Inc. <http://www.redhat.com>
+#
+#  This program is free software; you can redistribute it and/or modify
+#  it under the terms of the GNU General Public License as published by
+#  the Free Software Foundation; either version 2 of the License, or
+#  any later version.
+#
+#  This program is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU General Public License for more details.
+#
+#  You should have received a copy of the GNU General Public License along
+#  with this program; if not, write to the Free Software Foundation, Inc.,
+#  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+
+from glusto.core import Glusto as g
+from glustolibs.gluster.gluster_base_class import (GlusterBaseClass,
+                                                   runs_on)
+from glustolibs.gluster.heal_libs import monitor_heal_completion
+from glustolibs.io.utils import validate_io_procs
+from glustolibs.misc.misc_libs import upload_scripts
+from glustolibs.gluster.exceptions import ExecutionError
+from glustolibs.gluster.volume_libs import (replace_brick_from_volume,
+                                            shrink_volume, expand_volume)
+from glustolibs.gluster.brick_libs import get_all_bricks
+
+
+@runs_on([['distributed-dispersed', 'distributed-replicated',
+           'distributed-arbiter', 'dispersed', 'replicated',
+           'arbiter'],
+          ['glusterfs']])
+class VerifyDFWithReplaceBrick(GlusterBaseClass):
+
+    @classmethod
+    def setUpClass(cls):
+        """Run the base setUpClass and upload the IO script to clients."""
+        cls.get_super_method(cls, 'setUpClass')()
+        # Script that the test later runs on the mounts to generate IO
+        cls.script_upload_path = ("/usr/share/glustolibs/io/scripts/"
+                                  "file_dir_ops.py")
+        uploaded = upload_scripts(cls.clients, [cls.script_upload_path])
+        if not uploaded:
+            raise ExecutionError("Failed to upload IO scripts to clients %s"
+                                 % cls.clients)
+        g.log.info("Successfully uploaded IO scripts to clients %s",
+                   cls.clients)
+
+    def setUp(self):
+        """Run the base class setUp, then create and mount the volume."""
+        self.get_super_method(self, 'setUp')()
+
+        mounted = self.setup_volume_and_mount_volume(mounts=self.mounts)
+        if not mounted:
+            raise ExecutionError("Failed to Setup_Volume and Mount_Volume")
+        g.log.info("Successful in Setup Volume and Mount Volume")
+
+    def _perform_io_and_validate(self):
+        """Start IO on every mount point and validate that it succeeded."""
+        io_procs = []
+        # --dirname-start-num begins at 1 and advances by 10 per mount, so
+        # every mount is handed a different start number
+        for index, mount_obj in enumerate(self.mounts):
+            cmd = ("/usr/bin/env python %s create_deep_dirs_with_files "
+                   "--dirname-start-num %d --dir-depth 2 "
+                   "--dir-length 3 --max-num-of-dirs 3 "
+                   "--num-of-files 2 %s" % (
+                       self.script_upload_path, 1 + 10 * index,
+                       mount_obj.mountpoint))
+            io_procs.append(g.run_async(mount_obj.client_system, cmd,
+                                        user=mount_obj.user))
+
+        # Wait for the IO processes to finish and check their results
+        io_passed = validate_io_procs(io_procs, self.mounts)
+        self.assertTrue(io_passed, "IO failed on some of the clients")
+        g.log.info("Successfully validated IO's")
+
+    def _replace_bricks_and_wait_for_heal_completion(self):
+        """Replace every brick of the volume, healing after each replace."""
+        existing_bricks = get_all_bricks(self.mnode, self.volname)
+        # Without bricks the loop below would pass without testing anything
+        self.assertTrue(existing_bricks, "Failed to get the brick list")
+        for brick_to_replace in existing_bricks:
+            ret = replace_brick_from_volume(
+                self.mnode, self.volname, self.servers,
+                self.all_servers_info, src_brick=brick_to_replace)
+            self.assertTrue(ret, "Replace of %s failed" % brick_to_replace)
+            g.log.info("Replace of brick %s successful for volume %s",
+                       brick_to_replace, self.volname)
+
+            # Monitor heal completion
+            ret = monitor_heal_completion(self.mnode, self.volname)
+            self.assertTrue(ret, 'Heal has not yet completed')
+            g.log.info('Heal has completed successfully')
+
+    def _get_mount_size_from_df_h_output(self):
+        """Return the mount size from df -h, normalised to KiB by its unit."""
+        split_cmd = " | awk '{split($0,a,\" \");print a[2]}'"
+        cmd = ("cd {};df -h | grep {} {}".format(self.mounts[0].mountpoint,
+                                                 self.volname, split_cmd))
+        ret, mount_size, _ = g.run(self.clients[0], cmd)
+        self.assertEqual(ret, 0, "Failed to extract mount size")
+        size = mount_size.split("\n")[0]  # first grep match, e.g. "30G"
+        return float(size[:-1]) * 1024 ** "KMGTPEZY".index(size[-1])
+
+    def test_verify_df_output_when_brick_replaced(self):
+        """
+        - Perform IO on the mount point and note the size shown by df -h.
+        - Replace every brick of the volume, one at a time, waiting for
+          the heal to complete after each replace.
+        - Check that the size shown by df -h has not changed.
+        - Add bricks to the volume; df -h must show a bigger size.
+        - Remove bricks from the volume; df -h must show a smaller size
+          than after the add-brick.
+        """
+
+        # Perform some IO on the mount point
+        self._perform_io_and_validate()
+
+        # Get the mount size from df -h output
+        initial_mount_size = self._get_mount_size_from_df_h_output()
+
+        # Replace all the bricks and wait till the heal completes
+        self._replace_bricks_and_wait_for_heal_completion()
+
+        # Get df -h output after brick replace
+        mount_size_after_replace = self._get_mount_size_from_df_h_output()
+
+        # Verify the mount point size remains the same after brick replace
+        self.assertEqual(initial_mount_size, mount_size_after_replace,
+                         "The mount sizes before and after replace bricks "
+                         "are not same")
+
+        # Add bricks
+        ret = expand_volume(self.mnode, self.volname, self.servers,
+                            self.all_servers_info, force=True)
+        self.assertTrue(ret, "Failed to add-brick to volume")
+
+        # Get df -h output after volume expand
+        mount_size_after_expand = self._get_mount_size_from_df_h_output()
+
+        # Verify df -h output returns greater value
+        self.assertGreater(mount_size_after_expand, initial_mount_size,
+                           "The mount size has not increased after expanding")
+
+        # Remove bricks
+        ret = shrink_volume(self.mnode, self.volname, force=True)
+        self.assertTrue(ret, "Remove brick operation failed on "
+                             "%s" % self.volname)
+        g.log.info("Remove brick operation is successful on "
+                   "volume %s", self.volname)
+
+        # Get df -h output after volume shrink
+        mount_size_after_shrink = self._get_mount_size_from_df_h_output()
+
+        # Verify the df -h output returns smaller value
+        self.assertGreater(mount_size_after_expand, mount_size_after_shrink,
+                           "The mount size has not reduced after shrinking")
+
+    def tearDown(self):
+        """Unmount and delete the test volume, then run the base tearDown.
+
+        Raises ExecutionError when the unmount or the cleanup fails.
+        """
+        cleaned = self.unmount_volume_and_cleanup_volume(mounts=self.mounts)
+        if not cleaned:
+            raise ExecutionError("Failed to umount the vol & cleanup Volume")
+        g.log.info("Successful in umounting the volume and Cleanup")
+
+        self.get_super_method(self, 'tearDown')()