4 files changed, 247 insertions, 5 deletions
diff --git a/openshift-storage-libs/openshiftstoragelibs/baseclass.py b/openshift-storage-libs/openshiftstoragelibs/baseclass.py
index ebf5c77f..4ca48f0d 100644
--- a/openshift-storage-libs/openshiftstoragelibs/baseclass.py
+++ b/openshift-storage-libs/openshiftstoragelibs/baseclass.py
@@ -7,6 +7,7 @@ import six
 
 from openshiftstoragelibs import command
 from openshiftstoragelibs.exceptions import (
+    CloudProviderError,
     ConfigError,
     ExecutionError,
 )
@@ -26,6 +27,8 @@ from openshiftstoragelibs.heketi_ops import (
 from openshiftstoragelibs.node_ops import (
     node_add_iptables_rules,
     node_delete_iptables_rules,
+    power_off_vm_by_name,
+    power_on_vm_by_name,
 )
 from openshiftstoragelibs.openshift_ops import (
     get_block_provisioner,
@@ -41,9 +44,12 @@ from openshiftstoragelibs.openshift_ops import (
     oc_label,
     scale_dcs_pod_amount_and_wait,
     switch_oc_project,
+    wait_for_gluster_pod_be_ready_on_specific_node,
+    wait_for_ocp_node_be_ready,
     wait_for_pvcs_be_bound,
     wait_for_pods_be_ready,
     wait_for_resources_absence,
+    wait_for_service_status_on_gluster_pod_or_node,
 )
 from openshiftstoragelibs.openshift_storage_libs import (
     get_iscsi_block_devices_by_path,
@@ -549,6 +555,64 @@ class BaseClass(unittest.TestCase):
                 g.log.warn(msg)
         return super(BaseClass, cls).doClassCleanups()
 
+    def power_on_vm(self, vm_name):
+        try:
+            power_on_vm_by_name(vm_name)
+        except CloudProviderError as e:
+            # Try to power on the VM; if it fails because the VM is
+            # already powered on, swallow the error, otherwise re-raise.
+            if 'VM %s is already powered On' % vm_name not in six.text_type(e):
+                raise
+
+    def power_off_vm(self, vm_name):
+        self.addCleanup(self.power_on_vm, vm_name)
+        power_off_vm_by_name(vm_name)
+
+    def power_on_gluster_node_vm(
+            self, vm_name, gluster_hostname, timeout=300, wait_step=3):
+        # NOTE(Nitin Goyal): Same timeout is used for all functions.
+
+        # Bring up the target node
+        power_on_vm_by_name(vm_name)
+
+        # Wait for the gluster node and pod to be ready
+        if self.is_containerized_gluster():
+            wait_for_ocp_node_be_ready(
+                self.node, gluster_hostname,
+                timeout=timeout, wait_step=wait_step)
+            wait_for_gluster_pod_be_ready_on_specific_node(
+                self.node, gluster_hostname,
+                timeout=timeout, wait_step=wait_step)
+
+        # Wait for gluster services to be up
+        for service in ('glusterd', 'gluster-blockd'):
+            wait_for_service_status_on_gluster_pod_or_node(
+                self.node, service, 'active', 'running', gluster_hostname,
+                raise_on_error=False, timeout=timeout, wait_step=wait_step)
+
+    def power_off_gluster_node_vm(
+            self, vm_name, gluster_hostname, timeout=300, wait_step=3):
+        # NOTE(Nitin Goyal): Same timeout is used for all functions.
+
+        # Wait for gluster services to be up in cleanup
+        for service in ('gluster-blockd', 'glusterd'):
+            self.addCleanup(
+                wait_for_service_status_on_gluster_pod_or_node,
+                self.node, service, 'active', 'running', gluster_hostname,
+                raise_on_error=False, timeout=timeout, wait_step=wait_step)
+
+        # Wait for gluster pod to be up and node to be ready in cleanup
+        if self.is_containerized_gluster():
+            self.addCleanup(
+                wait_for_gluster_pod_be_ready_on_specific_node, self.node,
+                gluster_hostname, timeout=timeout, wait_step=wait_step)
+            self.addCleanup(
+                wait_for_ocp_node_be_ready, self.node, gluster_hostname,
+                timeout=timeout, wait_step=wait_step)
+
+        # Power off vm
+        self.power_off_vm(vm_name)
+
 
 class GlusterBlockBaseClass(BaseClass):
     """Base class for gluster-block test cases."""
diff --git a/openshift-storage-libs/openshiftstoragelibs/cloundproviders/vmware.py b/openshift-storage-libs/openshiftstoragelibs/cloundproviders/vmware.py
index fe69b532..1d4d4c38 100644
--- a/openshift-storage-libs/openshiftstoragelibs/cloundproviders/vmware.py
+++ b/openshift-storage-libs/openshiftstoragelibs/cloundproviders/vmware.py
@@ -198,6 +198,8 @@ class VmWare(object):
             g.log.error(msg)
             raise exceptions.CloudProviderError(msg)
 
+        # TODO(Nitin Goyal): Need to raise exact same below exception in other
+        # cloud providers as well in future e.g. AWS etc.
         if vm[0].summary.runtime.powerState == 'poweredOn':
             msg = 'VM %s is already powered On' % vm_name
             g.log.error(msg)
@@ -228,6 +230,8 @@
             g.log.error(msg)
             raise exceptions.CloudProviderError(msg)
 
+        # TODO(Nitin Goyal): Need to raise exact same below exception in other
+        # cloud providers as well in future e.g. AWS etc.
         if vm[0].summary.runtime.powerState == 'poweredOff':
            msg = 'VM %s is already powered Off' % vm_name
            g.log.error(msg)
diff --git a/openshift-storage-libs/openshiftstoragelibs/openshift_ops.py b/openshift-storage-libs/openshiftstoragelibs/openshift_ops.py
index a20e91e6..56b06297 100644
--- a/openshift-storage-libs/openshiftstoragelibs/openshift_ops.py
+++ b/openshift-storage-libs/openshiftstoragelibs/openshift_ops.py
@@ -467,7 +467,8 @@ def oc_delete(ocp_node, rtype, name, raise_on_absence=True):
     g.log.info('Deleted resource: %r %r', rtype, name)
 
 
-def oc_get_custom_resource(ocp_node, rtype, custom, name=None, selector=None):
+def oc_get_custom_resource(ocp_node, rtype, custom, name=None, selector=None,
+                           field_selector=None):
     """Get an OCP resource by custom column names.
 
     Args:
@@ -477,6 +478,8 @@ def oc_get_custom_resource(ocp_node, rtype, custom, name=None, selector=None):
         name (str|None): Name of the resource to fetch.
         selector (str|list|None): Column Name or list of column names
             select to.
+        field_selector (str|list|None): object field selector which
+            looks like the following: 'spec.nodeName=foo_node_1'
     Returns:
         list: List containting data about the resource custom column
     Raises:
@@ -489,15 +492,25 @@ def oc_get_custom_resource(ocp_node, rtype, custom, name=None, selector=None):
     """
     cmd = ['oc', 'get', rtype, '--no-headers']
 
-    cmd.append('-o=custom-columns=%s' % (
-        ','.join(custom) if isinstance(custom, list) else custom))
+    if name:
+        cmd.append(name)
 
     if selector:
         cmd.append('--selector %s' % (
             ','.join(selector) if isinstance(selector, list) else selector))
 
-    if name:
-        cmd.append(name)
+    if field_selector:
+        # NOTE(Nitin Goyal): Add field-selector parameters to custom columns
+        # because OCP 3.6 and 3.7 do not support it; filter in python later.
+        custom = ','.join(custom) if isinstance(custom, list) else custom
+        field_selector = (field_selector.split(',') if isinstance(
+            field_selector, six.string_types) else field_selector)
+
+        for fs in field_selector:
+            custom += ',:' + re.split('=|!=', fs)[0]
+
+    cmd.append('-o=custom-columns=%s' % (
+        ','.join(custom) if isinstance(custom, list) else custom))
 
     out = command.cmd_run(cmd, hostname=ocp_node)
 
@@ -508,7 +521,21 @@ def oc_get_custom_resource(ocp_node, rtype, custom, name=None, selector=None):
     for line in (out.strip()).split('\n'):
         out_list.append(
             list(filter(None, map(str.strip, line.split(' ')))))
+
+    if not field_selector:
+        return out_list
+
+    # Filter out field-selector parameters
+    for fs in field_selector[::-1]:
+        fs_value = re.split('=|!=', fs)[1]
+        found = fs.find('!=')
+        for out in out_list[:]:
+            # '!=' in fs and the value matches, so remove the row
+            if (found > 0 and out[-1] == fs_value
+                    # '=' in fs and the value does not match, so remove the row
+                    or found == -1 and out[-1] != fs_value):
+                out_list.remove(out)
+            out.pop()
+
     return out_list
 
 
 def get_block_provisioner(ocp_node):
@@ -802,6 +829,46 @@ def get_gluster_pod_names_by_pvc_name(
     return None
 
 
+def wait_for_gluster_pod_be_ready_on_specific_node(
+        ocp_client_node, gluster_hostname, selector='glusterfs=storage-pod',
+        timeout=300, wait_step=10):
+    """Wait for gluster pod to be ready on specific node.
+
+    Args:
+        ocp_client_node (str): Node to execute OCP commands on.
+        gluster_hostname (str): Name of node hosting gluster pod.
+        selector (str): Selector for gluster pod.
+
+    Returns:
+        None
+    """
+    g_pod_name = get_gluster_pod_name_for_specific_node(
+        ocp_client_node, gluster_hostname, selector=selector)
+    wait_for_pod_be_ready(
+        ocp_client_node, g_pod_name, timeout=timeout, wait_step=wait_step)
+
+
+def get_gluster_pod_name_for_specific_node(
+        ocp_client_node, gluster_hostname, selector='glusterfs=storage-pod'):
+    """Get gluster pod name on specific gluster node.
+
+    Args:
+        ocp_client_node (str): Node to execute OCP commands on.
+        gluster_hostname (str): Name of node hosting gluster pod.
+        selector (str): Selector for gluster pod.
+
+    Returns:
+        str: Name of the gluster pod
+    """
+    g_pod_name = oc_get_custom_resource(
+        ocp_client_node, 'pod', ':.metadata.name', selector=selector,
+        field_selector='spec.nodeName=%s' % gluster_hostname)
+    if not g_pod_name:
+        raise AssertionError(
+            'Gluster pod was not found for node %s' % gluster_hostname)
+    return g_pod_name[0][0]
+
+
 def cmd_run_on_gluster_pod_or_node(
         ocp_client_node, cmd, gluster_node=None, raise_on_error=True):
     """Run shell command on either Gluster PODs or Gluster nodes.
diff --git a/tests/functional/gluster_stability/test_gluster_block_stability.py b/tests/functional/gluster_stability/test_gluster_block_stability.py
index e0c43786..e81949f4 100644
--- a/tests/functional/gluster_stability/test_gluster_block_stability.py
+++ b/tests/functional/gluster_stability/test_gluster_block_stability.py
@@ -1138,3 +1138,110 @@ class TestGlusterBlockStability(GlusterBlockBaseClass):
         node_add_iptables_rules(path_node, chain, rules % tcmu_port)
         oc_rsh(self.node, pod_name, cmd_run_io % file1)
         self.verify_iscsi_sessions_and_multipath(self.pvc_name, dc_name)
+
+    def test_initiator_failures_reboot_initiator_node_when_target_node_is_down(
+            self):
+        """Restart the initiator node while a gluster node is down to make
+        sure path rediscovery happens.
+        """
+        # Skip test if not able to connect to Cloud Provider
+        try:
+            find_vm_name_by_ip_or_hostname(self.node)
+        except (NotImplementedError, ConfigError) as e:
+            self.skipTest(e)
+
+        ini_node = self.get_initiator_node_and_mark_other_nodes_unschedulable()
+
+        # Create 5 PVCs and DCs
+        pvcs = self.create_and_wait_for_pvcs(pvc_amount=5)
+        dcs = self.create_dcs_with_pvc(pvcs)
+
+        # Run I/O on app pods
+        _file, base_size, count = '/mnt/file', 4096, 1000
+        file_size = base_size * count
+        cmd_run_io = 'dd if=/dev/urandom of=%s bs=%s count=%s' % (
+            _file, base_size, count)
+        for _, pod_name in dcs.values():
+            oc_rsh(self.node, pod_name, cmd_run_io)
+
+        vol_info = {}
+        for pvc, (dc_name, _) in dcs.items():
+            # Get target portals
+            pv_name = get_pv_name_from_pvc(self.node, pvc)
+            targets = oc_get_custom_resource(
+                self.node, 'pv', ':.spec.iscsi.portals,'
+                ':.spec.iscsi.targetPortal', name=pv_name)
+            targets = [item.strip('[').strip(
+                ']') for item in targets if isinstance(item, six.string_types)]
+
+            iqn, hacount, _ = self.verify_iscsi_sessions_and_multipath(
+                pvc, dc_name)
+            vol_info[pvc] = (iqn, hacount, targets)
+
+        target = targets[0]
+
+        # Get the hostname of the target node from heketi
+        h_node_list = heketi_node_list(
+            self.heketi_client_node, self.heketi_server_url)
+        for node_id in h_node_list:
+            node_info = heketi_node_info(
+                self.heketi_client_node, self.heketi_server_url, node_id,
+                json=True)
+            if node_info['hostnames']['storage'][0] == target:
+                target_hostname = node_info['hostnames']['manage'][0]
+                break
+        self.assertTrue(target_hostname)
+
+        # Find the VM name and power it off
+        target_vm_name = find_vm_name_by_ip_or_hostname(target_hostname)
+        self.power_off_gluster_node_vm(target_vm_name, target_hostname)
+
+        # Sync I/O
+        for _, pod_name in dcs.values():
+            oc_rsh(self.node, pod_name, "/bin/sh -c 'cd /mnt && sync'")
+
+        # Reboot the initiator node where all the app pods are running
+        node_reboot_by_command(ini_node)
+        wait_for_ocp_node_be_ready(self.node, ini_node)
+
+        # Wait for pods to be ready after the node reboot
+        dc_names = [dc[0] for dc in dcs.values()]
+        pod_names = scale_dcs_pod_amount_and_wait(self.node, dc_names)
+
+        # Verify that one path is down, because one gluster node is down
+        for iqn, _, _ in vol_info.values():
+            devices = get_iscsi_block_devices_by_path(ini_node, iqn).keys()
+            mpath = get_mpath_name_from_device_name(ini_node, list(devices)[0])
+            with self.assertRaises(AssertionError):
+                self.verify_all_paths_are_up_in_multipath(
+                    mpath, hacount, ini_node, timeout=1)
+
+        # Power on the gluster node and wait for the services to be up
+        self.power_on_gluster_node_vm(target_vm_name, target_hostname)
+
+        # Delete pods so they rediscover paths while new pods are created
+        scale_dcs_pod_amount_and_wait(self.node, dc_names, pod_amount=0)
+        pod_names = scale_dcs_pod_amount_and_wait(self.node, dc_names)
+
+        # Verify file
+        for pvc, (dc_name, _) in dcs.items():
+            pod_name = pod_names[dc_name][0]
+            _, out, _ = oc_rsh(self.node, pod_name, 'ls -l %s' % _file)
+            msg = ("Expected size %s of file '%s' is not present "
+                   "in out '%s'" % (file_size, _file, out))
+            self.assertIn(six.text_type(file_size), out, msg)
+
+        # Verify all paths are up and running
+        for iqn, hacount, _ in vol_info.values():
+            devices = get_iscsi_block_devices_by_path(ini_node, iqn).keys()
+
+            # Get mpath names and verify that only one mpath is there
+            mpaths = set()
+            for device in devices:
+                mpaths.add(get_mpath_name_from_device_name(ini_node, device))
+            msg = ("Only one mpath was expected on Node %s, but got %s" % (
+                ini_node, mpaths))
+            self.assertEqual(1, len(mpaths), msg)
+
+            self.verify_all_paths_are_up_in_multipath(
+                list(mpaths)[0], hacount, ini_node, timeout=1)
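
The two sketches below are illustrative only; they are not part of the change above and use hypothetical names (test class, node hostname, example rows).

First, a minimal usage sketch of the new BaseClass power helpers, assuming a vSphere-backed setup and the existing find_vm_name_by_ip_or_hostname() helper from node_ops (the same helper the new test calls):

from openshiftstoragelibs.baseclass import GlusterBlockBaseClass
from openshiftstoragelibs.node_ops import find_vm_name_by_ip_or_hostname


class TestGlusterNodeOutageSketch(GlusterBlockBaseClass):
    """Hypothetical test showing the intended call order of the helpers."""

    def test_behaviour_while_gluster_node_is_down(self):
        target_hostname = 'node1.example.com'  # hypothetical gluster node
        vm_name = find_vm_name_by_ip_or_hostname(target_hostname)

        # Powers the VM off and registers cleanups that power it back on
        # and wait for the OCP node, the gluster pod and the gluster
        # services, so the node comes back even if the test fails early.
        self.power_off_gluster_node_vm(vm_name, target_hostname)

        # ... exercise I/O and multipath behaviour while the node is down ...

        # Bring the node back explicitly once the test needs it up again.
        self.power_on_gluster_node_vm(vm_name, target_hostname)

Second, the client-side filtering behind the new field_selector argument of oc_get_custom_resource is the least obvious part of the change: each selector temporarily contributes a trailing helper column, rows are kept or dropped by comparing that column with the selector value, and the helper column is stripped afterwards. A self-contained sketch of that idea in plain Python (hypothetical rows, not the library function itself):

import re

# Hypothetical 'oc get' rows: the last column is the helper column that
# was appended for the field selector (here, the node a pod runs on).
rows = [
    ['glusterfs-storage-8x2nd', 'node1.example.com'],
    ['glusterfs-storage-q7f4k', 'node2.example.com'],
]


def filter_rows(out_list, field_selectors):
    """Keep rows whose helper column satisfies every selector, then
    strip that helper column (mirrors the approach in the diff)."""
    for fs in field_selectors[::-1]:
        fs_value = re.split('=|!=', fs)[1]
        negated = '!=' in fs
        for row in out_list[:]:
            if (negated and row[-1] == fs_value
                    or not negated and row[-1] != fs_value):
                out_list.remove(row)
            row.pop()  # drop the helper column, kept or not
    return out_list


print(filter_rows(rows, ['spec.nodeName=node1.example.com']))
# -> [['glusterfs-storage-8x2nd']]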