diff options
author | vamahaja <vamahaja@redhat.com> | 2018-10-04 17:30:11 +0530 |
---|---|---|
committer | vamahaja <vamahaja@redhat.com> | 2018-12-10 09:22:38 +0530 |
commit | 024f0f0bc9f8c969c8f5a3ec494cee0c019f1868 (patch) | |
tree | 8a000350e82a43e01e90b31d437cf3973d29f9ff /cns-libs | |
parent | 31492fa754bd77e583564d8356822500078d1e2c (diff) |
[CNS-1314][CNS-1285] Restart gluster block volumes and validate
Change-Id: Ib7e3125e5120a91fe431816b33be4d4e6f15078e
Signed-off-by: vamahaja <vamahaja@redhat.com>
Diffstat (limited to 'cns-libs')
-rw-r--r-- | cns-libs/cnslibs/common/gluster_ops.py | 296 | ||||
-rw-r--r-- | cns-libs/cnslibs/common/heketi_ops.py | 46 | ||||
-rw-r--r-- | cns-libs/cnslibs/common/openshift_ops.py | 47 |
3 files changed, 342 insertions, 47 deletions
import json
import re
import time

import six
from glusto.core import Glusto as g
from glustolibs.gluster.block_ops import block_list
from glustolibs.gluster.heal_libs import is_heal_complete
from glustolibs.gluster.volume_ops import (
    get_volume_list,
    get_volume_status,
    volume_start,
    volume_status,
    volume_stop,
)

from cnslibs.common import exceptions, podcmd, waiter
from cnslibs.common.heketi_ops import heketi_blockvolume_info
from cnslibs.common.openshift_ops import (
    oc_get_pods,
    oc_rsh,
    wait_for_process_to_kill_on_pod,
)


def _get_gluster_pod(gluster_pod, hostname=None):
    """Normalize *gluster_pod* into a ``podcmd.Pod`` object.

    Args:
        gluster_pod (podcmd.Pod | str): a ready ``podcmd.Pod`` object, or
            the name of a gluster pod.
        hostname (str): OCP master node hosting the gluster pod; required
            (and only used) when *gluster_pod* is given as a string.

    Returns:
        podcmd.Pod: object wrapping the pod name and its master node.

    Raises:
        exceptions.ExecutionError: when *gluster_pod* is a string but no
            hostname was supplied, or *gluster_pod* has an unsupported type.
    """
    if isinstance(gluster_pod, podcmd.Pod):
        return gluster_pod

    if isinstance(gluster_pod, six.string_types):
        if hostname:
            return podcmd.Pod(hostname, gluster_pod)
        raise exceptions.ExecutionError(
            "gluster pod is string '%s' but hostname '%s' not valid" % (
                gluster_pod, hostname)
        )

    raise exceptions.ExecutionError(
        "invalid gluster pod parameter '%s', '%s'" % (
            gluster_pod, type(gluster_pod))
    )


@podcmd.GlustoPod()
def wait_to_heal_complete(
        gluster_pod, hostname=None, timeout=300, wait_step=5):
    """Wait until self-heal reports complete for every gluster volume.

    Args:
        gluster_pod (podcmd.Pod | str): gluster pod object or gluster
            pod name.
        hostname (str): master node on which gluster pod exists.
        timeout (int): total seconds to wait; NOTE: one waiter is shared
            by all volumes, so this caps the overall wait, not the
            per-volume wait.
        wait_step (int): seconds between heal-status polls.

    Raises:
        AssertionError: volume list could not be fetched, or heal did not
            complete within *timeout*.
    """
    gluster_pod = _get_gluster_pod(gluster_pod, hostname)

    gluster_vol_list = get_volume_list(gluster_pod)
    if not gluster_vol_list:
        raise AssertionError("failed to get gluster volume list")

    _waiter = waiter.Waiter(timeout=timeout, interval=wait_step)
    for gluster_vol in gluster_vol_list:
        for w in _waiter:
            if is_heal_complete(gluster_pod, gluster_vol):
                break

        # checked per volume so a timeout on any volume fails fast
        if w.expired:
            err_msg = ("reached timeout waiting for all the gluster "
                       "volumes to reach the 'healed' state.")
            g.log.error(err_msg)
            raise AssertionError(err_msg)


@podcmd.GlustoPod()
def get_brick_pids(gluster_pod, block_hosting_vol, hostname=None):
    """Return brick process ids of a block hosting volume.

    Args:
        gluster_pod (podcmd.Pod | str): gluster pod object or gluster
            pod name.
        block_hosting_vol (str): block hosting volume name.
        hostname (str): master node on which gluster pod exists.

    Returns:
        dict: mapping of brick host -> brick pid, as reported by
        ``gluster volume status``.

    Raises:
        AssertionError: volume status unavailable, volume missing from the
            status output, or a brick is down (pid reported as -1).
    """
    gluster_pod = _get_gluster_pod(gluster_pod, hostname)

    gluster_volume_status = get_volume_status(gluster_pod, block_hosting_vol)
    if not gluster_volume_status:
        # BUG FIX: volume name and pod were swapped in the message
        raise AssertionError("failed to get volume status for gluster "
                             "volume '%s' on pod '%s'" % (
                                 block_hosting_vol, gluster_pod))

    gluster_volume_status = gluster_volume_status.get(block_hosting_vol)
    if not gluster_volume_status:
        # explicit raise instead of bare assert: assert is stripped
        # under ``python -O`` and this is runtime validation
        raise AssertionError(
            "gluster volume %s not present" % block_hosting_vol)

    pids = {}
    for parent_key, parent_val in gluster_volume_status.items():
        for child_key, child_val in parent_val.items():
            # only brick entries (paths under /var) carry a brick pid
            if not child_key.startswith("/var"):
                continue

            pid = child_val["pid"]
            # When a brick is down, the pid of the brick is returned
            # as -1, which is an unexpected situation, hence an error.
            if str(pid) == "-1":
                raise AssertionError("Something went wrong brick pid is -1")

            pids[parent_key] = pid

    return pids


@podcmd.GlustoPod()
def restart_brick_process(hostname, gluster_pod, block_hosting_vol):
    """Restart brick processes of a block hosting volume.

    Kills up to two brick processes, waits for them to die, then
    force-starts the volume so the bricks are respawned.

    Args:
        hostname (str): hostname on which gluster pod exists.
        gluster_pod (podcmd.Pod | str): gluster pod object or gluster
            pod name.
        block_hosting_vol (str): block hosting volume name.

    Raises:
        AssertionError: a kill command or the volume start failed.
    """
    pids = get_brick_pids(gluster_pod, block_hosting_vol, hostname)

    # using count variable to limit the max pod process kill to 2
    count = 0
    killed_process = {}
    pid_keys = pids.keys()
    oc_pods = oc_get_pods(hostname)
    for pod in oc_pods.keys():
        if not (oc_pods[pod]["ip"] in pid_keys and count <= 1):
            continue

        ret, out, err = oc_rsh(
            hostname, pod, "kill -9 %s" % pids[oc_pods[pod]["ip"]]
        )
        if ret != 0:
            err_msg = "failed to kill process id %s error: %s" % (
                pids[oc_pods[pod]["ip"]], err)
            g.log.error(err_msg)
            raise AssertionError(err_msg)

        killed_process[pod] = pids[oc_pods[pod]["ip"]]
        count += 1

    for pod, pid in killed_process.items():
        wait_for_process_to_kill_on_pod(pod, pid, hostname)

    ret, out, err = volume_start(gluster_pod, block_hosting_vol, force=True)
    if ret != 0:
        err_msg = "failed to start gluster volume %s on pod %s error: %s" % (
            block_hosting_vol, gluster_pod, err)
        g.log.error(err_msg)
        raise AssertionError(err_msg)


@podcmd.GlustoPod()
def restart_block_hosting_volume(
        gluster_pod, block_hosting_vol, sleep_time=120, hostname=None):
    """Restart (stop, wait, force-start) a block hosting volume.

    Args:
        gluster_pod (podcmd.Pod | str): gluster pod object or gluster
            pod name.
        block_hosting_vol (str): name of block hosting volume.
        sleep_time (int): seconds to sleep between stop and start so
            in-flight ios and pvc creation settle down.
        hostname (str): master node on which gluster pod exists.

    Raises:
        AssertionError: any of the volume stop/start/status calls failed.
    """
    gluster_pod = _get_gluster_pod(gluster_pod, hostname)

    gluster_volume_status = get_volume_status(gluster_pod, block_hosting_vol)
    if not gluster_volume_status:
        raise AssertionError("failed to get gluster volume status")

    g.log.info("Gluster volume %s status\n%s : " % (
        block_hosting_vol, gluster_volume_status)
    )

    ret, out, err = volume_stop(gluster_pod, block_hosting_vol)
    if ret != 0:
        err_msg = "failed to stop gluster volume %s on pod %s error: %s" % (
            block_hosting_vol, gluster_pod, err)
        g.log.error(err_msg)
        raise AssertionError(err_msg)

    # Explicit wait to stop ios and pvc creation (sleep_time seconds)
    time.sleep(sleep_time)

    ret, out, err = volume_start(gluster_pod, block_hosting_vol, force=True)
    if ret != 0:
        err_msg = "failed to start gluster volume %s on pod %s error: %s" % (
            block_hosting_vol, gluster_pod, err)
        g.log.error(err_msg)
        raise AssertionError(err_msg)

    ret, out, err = volume_status(gluster_pod, block_hosting_vol)
    if ret != 0:
        err_msg = ("failed to get status for gluster volume %s on pod %s "
                   "error: %s" % (block_hosting_vol, gluster_pod, err))
        g.log.error(err_msg)
        raise AssertionError(err_msg)


@podcmd.GlustoPod()
def match_heketi_and_gluster_block_volumes_by_prefix(
        gluster_pod, heketi_block_volumes, block_vol_prefix, hostname=None):
    """Match block volumes from heketi and gluster. This function can't
    be used for block volumes with custom prefixes

    Args:
        gluster_pod (podcmd.Pod | str): gluster pod object or gluster
            pod name.
        heketi_block_volumes (list): list of heketi block volumes with
            which gluster block volumes need to be matched; expected to
            be sorted, since it is compared against a sorted list.
        block_vol_prefix (str): block volume prefix by which the block
            volumes needs to be filtered (and stripped before compare).
        hostname (str): ocp master node on which oc command gets executed.

    Raises:
        AssertionError: the two block volume lists differ.
    """
    gluster_pod = _get_gluster_pod(gluster_pod, hostname)

    gluster_vol_list = get_volume_list(gluster_pod)

    gluster_vol_block_list = []
    # skip the first volume — presumably the heketi db volume, which
    # hosts no block volumes (TODO(review): confirm against deployment)
    for gluster_vol in gluster_vol_list[1:]:
        ret, out, err = block_list(gluster_pod, gluster_vol)
        try:
            if ret != 0 and json.loads(out)["RESULT"] == "FAIL":
                msg = "failed to get block volume list with error: %s" % err
                g.log.error(msg)
                raise AssertionError(msg)
        except Exception as e:
            # also covers non-JSON output from a failed block list call
            g.log.error(e)
            raise

        gluster_vol_block_list.extend([
            block_vol.replace(block_vol_prefix, "")
            for block_vol in json.loads(out)["blocks"]
            if block_vol.startswith(block_vol_prefix)
        ])

    # BUG FIX: cmp() was removed in Python 3; ``!=`` is equivalent here
    if sorted(gluster_vol_block_list) != heketi_block_volumes:
        err_msg = "Gluster and Heketi Block volume list match failed"
        err_msg += "\nGluster Volumes: %s, " % gluster_vol_block_list
        err_msg += "\nBlock volumes %s" % heketi_block_volumes
        err_msg += "\nDifference: %s" % (set(gluster_vol_block_list) ^
                                         set(heketi_block_volumes))
        raise AssertionError(err_msg)


@podcmd.GlustoPod()
def get_block_hosting_volume_name(heketi_client_node, heketi_server_url,
                                  block_volume, gluster_pod, hostname=None):
    """Returns block hosting volume name of given block volume

    Args:
        heketi_client_node (str): Node on which cmd has to be executed.
        heketi_server_url (str): Heketi server url
        block_volume (str): Block volume of which block hosting volume
                            returned
        gluster_pod (podcmd.Pod | str): Gluster pod object or gluster
                                        pod name
        hostname (str): OCP master node on which ocp commands get executed

    Returns:
        str: Name of the block hosting volume for given block volume, or
        ``None`` when the hosting volume id matches no gluster volume.
    """
    gluster_pod = _get_gluster_pod(gluster_pod, hostname)

    block_vol_info = heketi_blockvolume_info(
        heketi_client_node, heketi_server_url, block_volume
    )

    for line in block_vol_info.splitlines():
        block_hosting_vol_match = re.search(
            r"^Block Hosting Volume: (.*)$", line
        )
        if not block_hosting_vol_match:
            continue

        gluster_vol_list = get_volume_list(gluster_pod)
        for vol in gluster_vol_list:
            if block_hosting_vol_match.group(1).strip() in vol:
                return vol
gluster - - Args: - gluster_pod (podcmd | str): gluster pod class object has gluster - pod and ocp master node or gluster - pod name - heketi_block_volumes (list): list of heketi block volumes with - which gluster block volumes need to - be matched - block_vol_prefix (str): block volume prefix by which the block - volumes needs to be filtered - hostname (str): master node on which gluster pod exists - - """ - if isinstance(gluster_pod, podcmd.Pod): - g.log.info("Recieved gluster pod object using same") - elif isinstance(gluster_pod, six.string_types) and hostname: - g.log.info("Recieved gluster pod name and hostname") - gluster_pod = podcmd.Pod(hostname, gluster_pod) - else: - raise exceptions.ExecutionError("Invalid glsuter pod parameter") - - gluster_vol_list = get_volume_list(gluster_pod) - - gluster_vol_block_list = [] - for gluster_vol in gluster_vol_list[1:]: - ret, out, err = block_list(gluster_pod, gluster_vol) - gluster_vol_block_list.extend([ - block_vol.replace(block_vol_prefix, "") - for block_vol in json.loads(out)["blocks"] - if block_vol.startswith(block_vol_prefix) - ]) - - assert sorted(gluster_vol_block_list) == heketi_block_volumes, ( - "Gluster and Heketi Block volume list match failed") - - def get_heketi_metrics(heketi_client_node, heketi_server_url, prometheus_format=False): ''' Execute curl command to get metrics output diff --git a/cns-libs/cnslibs/common/openshift_ops.py b/cns-libs/cnslibs/common/openshift_ops.py index 7e000bc7..3a6f38b3 100644 --- a/cns-libs/cnslibs/common/openshift_ops.py +++ b/cns-libs/cnslibs/common/openshift_ops.py @@ -1422,7 +1422,12 @@ def match_pvc_and_pv(hostname, prefix): if pv[0].startswith(prefix) ]) - assert pvc_list == pv_list, "PVC and PV list match failed" + if cmp(pvc_list, pv_list) != 0: + err_msg = "PVC and PV list match failed" + err_msg += "\nPVC list: %s, " % pvc_list + err_msg += "\nPV list %s" % pv_list + err_msg += "\nDifference: %s" % (set(pvc_list) ^ set(pv_list)) + raise AssertionError(err_msg) 
def wait_for_process_to_kill_on_pod(
        pod, pid, hostname, timeout=60, interval=3):
    """Wait until the given process disappears from the pod.

    Polls ``ps`` on the pod and succeeds once *pid* is no longer listed;
    raises if the process is still present after *timeout* seconds.

    Args:
        pid (int | str): process id expected to be killed on pod
        pod (str): pod name on which process id was killed
        hostname (str): hostname on which pod is present
        timeout (int): max seconds to wait for the process to die
        interval (int): seconds between polls

    Raises:
        AssertionError: the ``ps`` lookup itself failed on the pod.
        exceptions.ExecutionError: process still alive after *timeout*.
    """
    killed_pid_cmd = "ps -eaf | grep %s | grep -v grep | awk '{print $2}'"
    # BUG FIX: honor the timeout/interval parameters; they were
    # previously ignored in favor of hard-coded 60/3.
    _waiter = waiter.Waiter(timeout=timeout, interval=interval)
    for w in _waiter:
        ret, out, err = oc_rsh(hostname, pod, killed_pid_cmd % pid)
        if ret != 0:
            err_msg = ("failed to get killed process id '%s' details "
                       "from pod '%s' err: %s" % (pid, pod, err))
            g.log.error(err_msg)
            raise AssertionError(err_msg)

        # BUG FIX: compare as strings so an int pid (per the docstring)
        # is detected correctly; previously ``out.strip() == pid`` could
        # never be True for an int and the wait ended immediately.
        if out.strip() != str(pid):
            g.log.info("brick process '%s' killed on pod '%s'" % (pid, pod))
            break

    if w.expired:
        error_msg = ("process id '%s' still exists on pod '%s' after waiting "
                     "for it '%s' seconds to get kill" % (pid, pod, timeout))
        g.log.error(error_msg)
        raise exceptions.ExecutionError(error_msg)