diff options
-rw-r--r-- | openshift-storage-libs/openshiftstoragelibs/openshift_ops.py | 15 | ||||
-rw-r--r-- | tests/functional/gluster_stability/test_gluster_block_stability.py | 91 |
2 files changed, 101 insertions, 5 deletions
diff --git a/openshift-storage-libs/openshiftstoragelibs/openshift_ops.py b/openshift-storage-libs/openshiftstoragelibs/openshift_ops.py index c8275c5c..24dcbfd1 100644 --- a/openshift-storage-libs/openshiftstoragelibs/openshift_ops.py +++ b/openshift-storage-libs/openshiftstoragelibs/openshift_ops.py @@ -696,7 +696,8 @@ def get_gluster_pod_names_by_pvc_name(ocp_node, pvc_name): return data -def cmd_run_on_gluster_pod_or_node(ocp_client_node, cmd, gluster_node=None): +def cmd_run_on_gluster_pod_or_node( + ocp_client_node, cmd, gluster_node=None, raise_on_error=True): """Run shell command on either Gluster PODs or Gluster nodes. Args: @@ -727,7 +728,9 @@ def cmd_run_on_gluster_pod_or_node(ocp_client_node, cmd, gluster_node=None): for gluster_pod_name in gluster_pod_names: try: pod_cmd = "oc exec %s -- %s" % (gluster_pod_name, cmd) - return command.cmd_run(pod_cmd, hostname=ocp_client_node) + return command.cmd_run( + pod_cmd, hostname=ocp_client_node, + raise_on_error=raise_on_error) except Exception as e: err = ("Failed to run '%s' command on '%s' Gluster POD. " "Error: %s\n" % (cmd, gluster_pod_name, e)) @@ -742,7 +745,8 @@ def cmd_run_on_gluster_pod_or_node(ocp_client_node, cmd, gluster_node=None): g_hosts = list(g.config.get("gluster_servers", {}).keys()) for g_host in g_hosts: try: - return command.cmd_run(cmd, hostname=g_host) + return command.cmd_run( + cmd, hostname=g_host, raise_on_error=raise_on_error) except Exception as e: err = ("Failed to run '%s' command on '%s' Gluster node. " "Error: %s\n" % (cmd, g_host, e)) @@ -1336,7 +1340,7 @@ def check_service_status_on_pod( def wait_for_service_status_on_gluster_pod_or_node( ocp_client, service, status, state, gluster_node, - timeout=180, wait_step=3): + raise_on_error=True, timeout=180, wait_step=3): """Wait for a service specific status on a Gluster POD or node. Args: @@ -1358,7 +1362,8 @@ def wait_for_service_status_on_gluster_pod_or_node( for w in waiter.Waiter(timeout, wait_step): out = cmd_run_on_gluster_pod_or_node( - ocp_client, SERVICE_STATUS % service, gluster_node) + ocp_client, SERVICE_STATUS % service, gluster_node, + raise_on_error=raise_on_error) for line in out.splitlines(): status_match = re.search(SERVICE_STATUS_REGEX, line) if (status_match and status_match.group(1) == status and diff --git a/tests/functional/gluster_stability/test_gluster_block_stability.py b/tests/functional/gluster_stability/test_gluster_block_stability.py index bea92138..0232c790 100644 --- a/tests/functional/gluster_stability/test_gluster_block_stability.py +++ b/tests/functional/gluster_stability/test_gluster_block_stability.py @@ -17,6 +17,7 @@ from openshiftstoragelibs.openshift_storage_libs import ( get_iscsi_session, get_mpath_name_from_device_name, ) +from openshiftstoragelibs.waiter import Waiter class TestGlusterBlockStability(GlusterBlockBaseClass): @@ -172,3 +173,93 @@ class TestGlusterBlockStability(GlusterBlockBaseClass): # Verify that active path is same as before mpath_dev_new = get_active_and_enabled_devices_from_mpath(node, mpath) self.assertEqual(mpath_dev['active'][0], mpath_dev_new['active'][0]) + + def test_target_side_failures_tcmu_runner_kill_when_ios_going_on(self): + """Run I/Os on block volume while tcmu-runner is stoped""" + self.create_and_wait_for_pvc() + + # Create app pod + dc_name, pod_name = self.create_dc_with_pvc(self.pvc_name) + + iqn, hacount, node = self.verify_iscsi_sessions_and_multipath( + self.pvc_name, dc_name) + + # Run I/O + cmd_run_io = 'dd if=/dev/urandom of=/mnt/%s bs=4k count=10000' + + devices = get_iscsi_block_devices_by_path(node, iqn) + mpath = get_mpath_name_from_device_name(node, list(devices.keys())[0]) + mpath_dev = get_active_and_enabled_devices_from_mpath(node, mpath) + + paths = mpath_dev['active'] + mpath_dev['enabled'] + + for path in paths: + node_ip = devices[path] + + # Stop tcmu-runner service + cmd_run_on_gluster_pod_or_node( + self.node, 'systemctl stop tcmu-runner', node_ip) + start_svc = ('systemctl start gluster-blockd gluster-block-target ' + 'tcmu-runner') + self.addCleanup( + cmd_run_on_gluster_pod_or_node, self.node, start_svc, node_ip) + + wait_for_pod_be_ready(self.node, pod_name, 6, 3) + + # Verify gluster-blockd gluster-block-target service is not running + wait_for_service_status_on_gluster_pod_or_node( + self.node, 'gluster-blockd', 'inactive', 'dead', node_ip, + raise_on_error=False) + wait_for_service_status_on_gluster_pod_or_node( + self.node, 'gluster-block-target', 'failed', + 'Result: exit-code', node_ip, raise_on_error=False) + wait_for_service_status_on_gluster_pod_or_node( + self.node, 'tcmu-runner', 'inactive', 'dead', node_ip, + raise_on_error=False) + + # Wait for path to be failed + for w in Waiter(120, 5): + out = cmd_run('multipath -ll %s | grep %s' % ( + mpath, path), node) + if 'failed faulty running' in out: + break + if w.expired: + self.assertIn( + 'failed faulty running', out, 'path %s of mpath %s is ' + 'still up and running. It should not be running ' + 'because tcmu-runner is down' % (path, mpath)) + + # Run I/O + wait_for_pod_be_ready(self.node, pod_name, 6, 3) + oc_rsh(self.node, pod_name, cmd_run_io % 'file2') + + # Start services + cmd_run_on_gluster_pod_or_node(self.node, start_svc, node_ip) + + # Verify services are running + wait_for_service_status_on_gluster_pod_or_node( + self.node, 'tcmu-runner', 'active', 'running', node_ip) + wait_for_service_status_on_gluster_pod_or_node( + self.node, 'gluster-block-target', 'active', 'exited', node_ip) + wait_for_service_status_on_gluster_pod_or_node( + self.node, 'gluster-blockd', 'active', 'running', node_ip) + + # Wait for path to come up + self.verify_all_paths_are_up_in_multipath(mpath, hacount, node) + + # Run I/O + wait_for_pod_be_ready(self.node, pod_name, 6, 3) + oc_rsh(self.node, pod_name, cmd_run_io % 'file3') + + # Verify it returns to the original active path + for w in Waiter(120, 5): + mpath_dev_new = get_active_and_enabled_devices_from_mpath( + node, mpath) + if mpath_dev['active'][0] == mpath_dev_new['active'][0]: + break + if w.expired: + self.assertEqual( + mpath_dev['active'][0], mpath_dev_new['active'][0]) + + # Verify that all the paths are up + self.verify_all_paths_are_up_in_multipath(mpath, hacount, node) |