from openshiftstoragelibs.baseclass import GlusterBlockBaseClass
from openshiftstoragelibs.command import cmd_run
from openshiftstoragelibs.exceptions import ConfigError
from openshiftstoragelibs.heketi_ops import (
    heketi_node_info,
    heketi_node_list,
)
from openshiftstoragelibs.node_ops import (
    find_vm_name_by_ip_or_hostname,
    power_off_vm_by_name,
    power_on_vm_by_name,
)
from openshiftstoragelibs.openshift_ops import (
    cmd_run_on_gluster_pod_or_node,
    get_ocp_gluster_pod_details,
    get_pod_name_from_dc,
    get_pv_name_from_pvc,
    oc_adm_manage_node,
    oc_delete,
    oc_get_custom_resource,
    oc_get_schedulable_nodes,
    oc_rsh,
    wait_for_pod_be_ready,
    wait_for_resource_absence,
    wait_for_service_status_on_gluster_pod_or_node,
)
from openshiftstoragelibs.openshift_storage_libs import (
    get_active_and_enabled_devices_from_mpath,
    get_iscsi_block_devices_by_path,
    get_iscsi_session,
    get_mpath_name_from_device_name,
)
from openshiftstoragelibs.waiter import Waiter


class TestGlusterBlockStability(GlusterBlockBaseClass):
    '''Class that contains gluster-block stability TCs'''

    def setUp(self):
        super(TestGlusterBlockStability, self).setUp()
        self.node = self.ocp_master_node[0]

    def initiator_side_failures(self):
        self.create_storage_class()
        self.create_and_wait_for_pvc()

        # Create app pod
        dc_name, pod_name = self.create_dc_with_pvc(self.pvc_name)

        iqn, _, node = self.verify_iscsi_sessions_and_multipath(
            self.pvc_name, dc_name)

        # Make node unschedulable where pod is running
        oc_adm_manage_node(
            self.node, '--schedulable=false', nodes=[node])

        # Make node schedulable again where pod is running
        self.addCleanup(
            oc_adm_manage_node, self.node, '--schedulable=true',
            nodes=[node])

        # Delete pod so it gets respun on any other node
        oc_delete(self.node, 'pod', pod_name)
        wait_for_resource_absence(self.node, 'pod', pod_name)

        # Wait for pod to come up
        pod_name = get_pod_name_from_dc(self.node, dc_name)
        wait_for_pod_be_ready(self.node, pod_name)

        # Get the iscsi session from the previous node to verify logout
        iscsi = get_iscsi_session(node, iqn, raise_on_error=False)
        self.assertFalse(iscsi)

        self.verify_iscsi_sessions_and_multipath(self.pvc_name, dc_name)

    def test_initiator_side_failures_initiator_and_target_on_different_node(
            self):

        nodes = oc_get_schedulable_nodes(self.node)

        # Get list of all gluster nodes
        cmd = ("oc get pods --no-headers -l glusterfs-node=pod "
               "-o=custom-columns=:.spec.nodeName")
        g_nodes = cmd_run(cmd, self.node)
        g_nodes = g_nodes.split('\n') if g_nodes else g_nodes

        # Skip test case if required schedulable node count is not met
        if len(set(nodes) - set(g_nodes)) < 2:
            self.skipTest("skipping test case because it needs at least two"
                          " nodes schedulable")

        # Make containerized gluster nodes unschedulable
        if g_nodes:
            # Make gluster nodes unschedulable
            oc_adm_manage_node(
                self.node, '--schedulable=false', nodes=g_nodes)

            # Make gluster nodes schedulable again
            self.addCleanup(
                oc_adm_manage_node, self.node, '--schedulable=true',
                nodes=g_nodes)

        self.initiator_side_failures()

    def test_initiator_side_failures_initiator_and_target_on_same_node(self):
        # Note: This test case is supported for containerized gluster only.
        nodes = oc_get_schedulable_nodes(self.node)

        # Get list of all gluster nodes
        cmd = ("oc get pods --no-headers -l glusterfs-node=pod "
               "-o=custom-columns=:.spec.nodeName")
        g_nodes = cmd_run(cmd, self.node)
        g_nodes = g_nodes.split('\n') if g_nodes else g_nodes

        # Get the list of nodes other than gluster nodes
        o_nodes = list((set(nodes) - set(g_nodes)))

        # Skip the test case if it is a CRS setup
        if not g_nodes:
            self.skipTest("skipping test case because it is not a "
                          "containerized gluster setup. "
                          "This test case is for containerized gluster only.")

        # Make other nodes unschedulable
        oc_adm_manage_node(
            self.node, '--schedulable=false', nodes=o_nodes)

        # Make other nodes schedulable again
        self.addCleanup(
            oc_adm_manage_node, self.node, '--schedulable=true',
            nodes=o_nodes)

        self.initiator_side_failures()

    def test_target_side_failures_gluster_blockd_kill_when_ios_going_on(self):
        """Run I/Os on block volume while gluster-blockd is stopped"""
        self.create_and_wait_for_pvc()

        # Create app pod
        dc_name, pod_name = self.create_dc_with_pvc(self.pvc_name)

        iqn, hacount, node = self.verify_iscsi_sessions_and_multipath(
            self.pvc_name, dc_name)

        cmd_run_io = 'dd if=/dev/urandom of=/mnt/%s bs=4k count=10000'

        # Get the paths
        devices = get_iscsi_block_devices_by_path(node, iqn)
        mpath = get_mpath_name_from_device_name(node, list(devices.keys())[0])
        mpath_dev = get_active_and_enabled_devices_from_mpath(node, mpath)
        paths = mpath_dev['active'] + mpath_dev['enabled']

        for path in paths:
            node_ip = devices[path]

            # Stop gluster-blockd service
            cmd_run_on_gluster_pod_or_node(
                self.node, 'systemctl stop gluster-blockd', node_ip)
            self.addCleanup(
                cmd_run_on_gluster_pod_or_node, self.node,
                'systemctl start gluster-blockd', node_ip)

            wait_for_pod_be_ready(self.node, pod_name, 6, 3)

            # Verify tcmu-runner and gluster-block-target services are running
            wait_for_service_status_on_gluster_pod_or_node(
                self.node, 'tcmu-runner', 'active', 'running', node_ip)
            wait_for_service_status_on_gluster_pod_or_node(
                self.node, 'gluster-block-target', 'active', 'exited',
                node_ip)

            self.verify_all_paths_are_up_in_multipath(mpath, hacount, node)

            # Run I/O
            oc_rsh(self.node, pod_name, cmd_run_io % 'file1')

            # Start service and verify status
            cmd_run_on_gluster_pod_or_node(
                self.node, 'systemctl start gluster-blockd', node_ip)
            wait_for_service_status_on_gluster_pod_or_node(
                self.node, 'gluster-blockd', 'active', 'running', node_ip)

            # Run I/O
            oc_rsh(self.node, pod_name, cmd_run_io % 'file2')

        # Verify that the active path is the same as before
        mpath_dev_new = get_active_and_enabled_devices_from_mpath(node, mpath)
        self.assertEqual(mpath_dev['active'][0], mpath_dev_new['active'][0])

    def test_target_side_failures_tcmu_runner_kill_when_ios_going_on(self):
        """Run I/Os on block volume while tcmu-runner is stopped"""
        self.create_and_wait_for_pvc()

        # Create app pod
        dc_name, pod_name = self.create_dc_with_pvc(self.pvc_name)

        iqn, hacount, node = self.verify_iscsi_sessions_and_multipath(
            self.pvc_name, dc_name)

        # Run I/O
        cmd_run_io = 'dd if=/dev/urandom of=/mnt/%s bs=4k count=10000'

        # Get the paths
        devices = get_iscsi_block_devices_by_path(node, iqn)
        mpath = get_mpath_name_from_device_name(node, list(devices.keys())[0])
        mpath_dev = get_active_and_enabled_devices_from_mpath(node, mpath)
        paths = mpath_dev['active'] + mpath_dev['enabled']

        for path in paths:
            node_ip = devices[path]

            # Stop tcmu-runner service
            cmd_run_on_gluster_pod_or_node(
                self.node, 'systemctl stop tcmu-runner', node_ip)
            start_svc = ('systemctl start gluster-blockd '
                         'gluster-block-target tcmu-runner')
            self.addCleanup(
                cmd_run_on_gluster_pod_or_node,
                self.node, start_svc, node_ip)

            wait_for_pod_be_ready(self.node, pod_name, 6, 3)

            # Verify gluster-blockd and gluster-block-target services are
            # not running
            wait_for_service_status_on_gluster_pod_or_node(
                self.node, 'gluster-blockd', 'inactive', 'dead', node_ip,
                raise_on_error=False)
            wait_for_service_status_on_gluster_pod_or_node(
                self.node, 'gluster-block-target', 'failed',
                'Result: exit-code', node_ip, raise_on_error=False)
            wait_for_service_status_on_gluster_pod_or_node(
                self.node, 'tcmu-runner', 'inactive', 'dead', node_ip,
                raise_on_error=False)

            # Wait for path to be failed
            for w in Waiter(120, 5):
                out = cmd_run('multipath -ll %s | grep %s' % (
                    mpath, path), node)
                if 'failed faulty running' in out:
                    break
            if w.expired:
                self.assertIn(
                    'failed faulty running', out, 'path %s of mpath %s is '
                    'still up and running. It should not be running '
                    'because tcmu-runner is down' % (path, mpath))

            # Run I/O
            wait_for_pod_be_ready(self.node, pod_name, 6, 3)
            oc_rsh(self.node, pod_name, cmd_run_io % 'file2')

            # Start services
            cmd_run_on_gluster_pod_or_node(self.node, start_svc, node_ip)

            # Verify services are running
            wait_for_service_status_on_gluster_pod_or_node(
                self.node, 'tcmu-runner', 'active', 'running', node_ip)
            wait_for_service_status_on_gluster_pod_or_node(
                self.node, 'gluster-block-target', 'active', 'exited',
                node_ip)
            wait_for_service_status_on_gluster_pod_or_node(
                self.node, 'gluster-blockd', 'active', 'running', node_ip)

            # Wait for paths to come up
            self.verify_all_paths_are_up_in_multipath(mpath, hacount, node)

            # Run I/O
            wait_for_pod_be_ready(self.node, pod_name, 6, 3)
            oc_rsh(self.node, pod_name, cmd_run_io % 'file3')

            # Verify it returns to the original active path
            for w in Waiter(120, 5):
                mpath_dev_new = get_active_and_enabled_devices_from_mpath(
                    node, mpath)
                if mpath_dev['active'][0] == mpath_dev_new['active'][0]:
                    break
            if w.expired:
                self.assertEqual(
                    mpath_dev['active'][0], mpath_dev_new['active'][0])

            # Verify that all the paths are up
            self.verify_all_paths_are_up_in_multipath(mpath, hacount, node)

    def test_initiator_side_failure_restart_pod_when_target_node_is_down(
            self):
        """Restart app pod when one gluster node is down"""
        # Skip test if the setup does not meet the requirements
        try:
            vm_name = find_vm_name_by_ip_or_hostname(self.node)
        except (NotImplementedError, ConfigError) as e:
            self.skipTest(e)

        # Get heketi node list
        h_nodes_ids = heketi_node_list(
            self.heketi_client_node, self.heketi_server_url)

        # Get the IPs and hostnames of gluster nodes from heketi
        h_nodes = {}
        for node in h_nodes_ids:
            info = heketi_node_info(
                self.heketi_client_node, self.heketi_server_url,
                node, json=True)
            h_nodes[info['hostnames']['storage'][0]] = (
                info['hostnames']['manage'][0])

        pvc_name = self.create_and_wait_for_pvc()
        pv_name = get_pv_name_from_pvc(self.node, pvc_name)

        # Create app pod
        dc_name, pod_name = self.create_dc_with_pvc(self.pvc_name)

        iqn, hacount, p_node = self.verify_iscsi_sessions_and_multipath(
            self.pvc_name, dc_name)

        # Get list of containerized gluster nodes
        g_nodes = get_ocp_gluster_pod_details(self.node)

        # Get target portals for the PVC
        targets = oc_get_custom_resource(
            self.node, 'pv',
            ':.spec.iscsi.portals,:.spec.iscsi.targetPortal', name=pv_name)
        targets = [item.strip('[').strip(']') for item in targets
                   if isinstance(item, str)]

        # Select hostname for powering off
        if h_nodes[targets[0]] == p_node:
            vm_hostname = h_nodes[targets[1]]
        else:
            vm_hostname = h_nodes[targets[0]]

        # Find VM name for powering it off
        vm_name = find_vm_name_by_ip_or_hostname(vm_hostname)

        # Make the node unschedulable if it is containerized glusterfs
        if g_nodes:
            oc_adm_manage_node(
                self.node, '--schedulable=false', nodes=[vm_hostname])
            self.addCleanup(
                oc_adm_manage_node, self.node, '--schedulable=true',
                nodes=[vm_hostname])

        # Power off gluster node
        power_off_vm_by_name(vm_name)
        self.addCleanup(power_on_vm_by_name, vm_name)

        # Delete pod so it gets respun
        oc_delete(self.node, 'pod', pod_name)
        wait_for_resource_absence(self.node, 'pod', pod_name)

        # Wait for pod to come up when 1 target node is down
        pod_name = get_pod_name_from_dc(self.node, dc_name)
        wait_for_pod_be_ready(self.node, pod_name, timeout=120, wait_step=5)