# Reconstructed post-patch fragment of plugins/check_vol_server.py
# (nagios-server-addons), taken from commit
# f90d985d386f84d8c07fab91dca3365d7e9309b5.
#
# From: Nishanth Thomas
# Date: Mon, 26 May 2014 17:34:37 +0530
# Subject: nagios-server-addons: volume status based on volume type
#
# Added the volume type in the plugin output. The logic of determining
# the volume status changed based on the volume type. Added the volume
# type and the list of bricks down in the plugin output.
#
# Change-Id: Ib8d3111bdcc04264ec8bb6383fcb4fad97a17bab
# Bug-Url: https://bugzilla.redhat.com/show_bug.cgi?id=1096159
# Bug-Url: https://bugzilla.redhat.com/show_bug.cgi?id=1096169
# Signed-off-by: Nishanth Thomas
# Reviewed-on: http://review.gluster.org/7874
# Reviewed-by: Kanagaraj M
# Reviewed-by: Sahina Bose
# Tested-by: Nishanth Thomas
#
# NOTE(review): json, utils, livestatus, _executeRandomHost and
# _getVolStatusNRPECommand are not defined in this fragment; they are
# expected to come from the rest of the module.


def _getVolGeoRepStatusNRPECommand(volume):
    """Return the NRPE command line that checks geo-rep status of volume."""
    return ("check_vol_status -a %s %s" % (volume, 'geo-rep'))


def getReplicaSet(bricks, pair_index, rCount):
    """Return the pair_index-th replica set of a volume.

    bricks     - list of bricks in the volume, in volume order
    pair_index - 1-based index of the replica set to return
    rCount     - replica count, i.e. number of bricks per replica set

    The result is a slice of ``bricks``; an index past the end of the
    list yields an empty list.
    """
    start_index = (pair_index * rCount) - rCount
    return bricks[start_index:start_index + rCount]


def _getVolDetailNRPECommand(volume):
    """Return the NRPE command line that fetches the details of volume."""
    return ("discover_volume_info -a %s" % (volume))


def _isBrickDown(brick, brick_list_critical):
    """Tell whether brick appears in the list of critical bricks.

    Two entries describe the same brick when both the host uuid and the
    path part of the "address:path" string match.
    """
    return any(
        brick.get('uuid') == down.get('uuid') and
        brick.get('brick').split(':')[1] == down.get('brick').split(':')[1]
        for down in brick_list_critical)


def _hasReplicaSetDown(bricks, brick_list_critical, rCount):
    """Tell whether at least one complete replica set is down.

    bricks              - all bricks of the volume as dicts with the
                          keys 'brick' and 'uuid', in volume order
    brick_list_critical - bricks in CRITICAL state, same dict layout
    rCount              - replica count of the volume
    """
    if rCount <= 0:
        # Nothing can be grouped into replica sets; this also avoids a
        # division by zero below.
        return False
    # Floor division keeps the count an int on Python 2 and Python 3.
    noOfReplicas = len(bricks) // rCount
    for index in range(1, noOfReplicas + 1):
        replica_list = getReplicaSet(bricks, index, rCount)
        noOfBricksDown = sum(
            1 for brick in replica_list
            if _isBrickDown(brick, brick_list_critical))
        if noOfBricksDown == rCount:
            return True
    return False


def _getVolumeStatusOutput(hostgroup, volume):
    """Compute the Nagios status and message for a volume.

    hostgroup - host group whose hosts are queried
    volume    - name of the volume

    Returns a (status, output) tuple. When the volume status command
    itself does not report OK, its status and output are returned
    untouched. Otherwise the states of the volume's brick services are
    read from livestatus and combined with the volume type:

    * every brick CRITICAL                      -> CRITICAL
    * bricks OK and none CRITICAL               -> OK
    * some CRITICAL, DISTRIBUTE volume          -> CRITICAL
    * some CRITICAL, (DISTRIBUTED_)REPLICATE    -> WARNING, or CRITICAL
      when a whole replica set is down
    * some CRITICAL, any other volume type      -> WARNING
    """
    status, output = _executeRandomHost(hostgroup,
                                        _getVolStatusNRPECommand(volume))
    if status != utils.PluginStatusCode.OK:
        return status, output

    brick_details = json.loads(livestatus.readLiveStatusAsJSON(
        "GET services\n"
        "Columns: description state host_address host_name\n"
        "Filter: host_groups >= %s\n"
        "Filter: custom_variable_values >= %s\n"
        "Filter: description ~ Brick - \n"
        % (hostgroup, volume)))
    # The reply looks like:
    # [[u'Brick - /root/b3', 0, u'10.70.42.246', u'nishanth-rhs-2']]
    # Count the OK/CRITICAL bricks and collect the CRITICAL ones.
    bricks_ok = 0
    bricks_critical = 0
    brick_list_critical = []
    for brick_detail in brick_details:
        if brick_detail[1] == utils.PluginStatusCode.OK:
            bricks_ok += 1
        elif brick_detail[1] == utils.PluginStatusCode.CRITICAL:
            bricks_critical += 1
            # Look up the uuid of the host serving the critical brick.
            custom_vars = json.loads(livestatus.readLiveStatusAsJSON(
                "GET hosts\n"
                "Columns: custom_variables\n"
                "Filter: groups >= %s\n"
                "Filter: name = %s\n"
                % (hostgroup, brick_detail[3])))
            brick_path = brick_detail[0][brick_detail[0].find("/"):]
            brick_list_critical.append({
                'brick': brick_detail[2] + ":" + brick_path,
                'uuid': custom_vars[0][0]['HOST_UUID'],
            })

    # Get volume details
    nrpeStatus, nrpeOut = _executeRandomHost(
        hostgroup, _getVolDetailNRPECommand(volume))
    if nrpeStatus != utils.PluginStatusCode.OK:
        # The detail command failed, so its output is not expected to be
        # JSON; report the failure instead of crashing in json.loads().
        return nrpeStatus, nrpeOut
    volInfo = json.loads(nrpeOut)
    # Get the volume type
    vol_type = volInfo[volume]['type']
    down_bricks = ', '.join(item['brick'] for item in brick_list_critical)

    if bricks_ok == 0 and bricks_critical > 0:
        status = utils.PluginStatusCode.CRITICAL
        output = "CRITICAL: Volume : %s type - All bricks " \
                 "are down " % (vol_type)
    elif bricks_ok > 0 and bricks_critical == 0:
        status = utils.PluginStatusCode.OK
        output = "OK: Volume : %s type - All bricks " \
                 "are Up " % (vol_type)
    elif bricks_critical > 0:
        if vol_type == "DISTRIBUTE":
            status = utils.PluginStatusCode.CRITICAL
            output = "CRITICAL: Volume : %s type \n Brick(s) - <%s> " \
                     "is|are down " % (vol_type, down_bricks)
        elif vol_type in ("DISTRIBUTED_REPLICATE", "REPLICATE"):
            bricks = [
                {'brick': brick['brickaddress'] + ":" + brick['brickpath'],
                 'uuid': brick['hostUuid']}
                for brick in volInfo[volume]['bricks']]
            rCount = int(volInfo[volume]['replicaCount'])
            # The volume stays usable while every replica set still has
            # at least one brick up.
            if _hasReplicaSetDown(bricks, brick_list_critical, rCount):
                status = utils.PluginStatusCode.CRITICAL
                output = "CRITICAL: Volume : %s type \n Bricks " \
                         "- <%s> are down, along with one or more " \
                         "replica pairs" % (vol_type, down_bricks)
            else:
                status = utils.PluginStatusCode.WARNING
                output = "WARNING: Volume : %s type \n Brick(s) - <%s> " \
                         "is|are down, but replica pair(s) are up" % \
                         (vol_type, down_bricks)
        else:
            status = utils.PluginStatusCode.WARNING
            output = "WARNING: Volume : %s type \n Brick(s) - <%s> " \
                     "is|are down" % (vol_type, down_bricks)
    return status, output