summaryrefslogtreecommitdiffstats
path: root/plugins/check_vol_server.py
blob: e7dd150e77e313d6feb8e09b2dd6f9b1aeaba31a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
#!/usr/bin/python
import sys
import json
import random
import argparse
import livestatus
import time

from glusternagios import utils
import server_utils


def _getListHosts(hostgroup):
    """Return the names of the usable hosts of ``hostgroup``.

    A host is usable when livestatus reports it with state UP and its
    glusterd service check (see _getGlusterdStatus) is OK.

    hostgroup - name of the host group to query

    Returns a list of host names.  The list is empty when the query
    returns no row for the host group, so that callers can report
    UNKNOWN instead of failing with an IndexError.
    """
    rows = json.loads(livestatus.readLiveStatusAsJSON(
        "GET hostgroups\nColumns: members_with_state\n"
        "Filter: name = " + hostgroup + "\n"))
    # No row (or an empty row) means no host group matched the filter.
    if not rows or not rows[0]:
        return []
    # The only requested column holds the member entries; each entry
    # starts with the host name followed by the host state.
    members = rows[0][0]
    # Keep only those nodes which are UP and whose glusterd service is
    # running.  The host state is tested first so that the service is
    # queried for UP hosts only.
    return [member[0] for member in members
            if member[1] == utils.HostStatusCode.UP and
            _getGlusterdStatus(member[0]) == utils.PluginStatusCode.OK]


def _getGlusterdStatus(hostname):
    """Return the state of the "Gluster Management" service of a host.

    hostname - host name as known to livestatus

    Returns the service state reported by livestatus, or
    utils.PluginStatusCode.UNKNOWN when no such service is found for
    the host (previously this case raised an IndexError).
    """
    rows = json.loads(livestatus.readLiveStatusAsJSON(
        "GET services\nColumns: state\n"
        "Filter: description = Gluster Management\n"
        "Filter: host_name = " + hostname + "\n"))
    # An empty result means the service is not configured for the host.
    if not rows or not rows[0]:
        return utils.PluginStatusCode.UNKNOWN
    return rows[0][0]


def _getHostAddress(host):
    """Return the address livestatus reports for ``host``.

    host - display name of the host to look up

    The answer of the livestatus query is returned with trailing
    whitespace stripped.
    """
    query = "GET hosts\nColumns: address\n"
    query = query + "Filter: display_name = " + host + "\n"
    answer = livestatus.checkLiveStatus(query)
    return answer.rstrip()


def _getVolUtilizationNRPECommand(volume, warning, critical):
    """Build the NRPE command line for the volume utilization check.

    volume   - name of the volume
    warning  - warning threshold, converted with str()
    critical - critical threshold, converted with str()
    """
    parts = ["check_vol_utilization -a", volume,
             str(warning), str(critical)]
    return " ".join(parts)


def _getVolStatusNRPECommand(volume):
    """Build the NRPE command line for the 'info' volume status check."""
    check_kind = 'info'
    template = "check_vol_status -a %s %s"
    return template % (volume, check_kind)


def _getVolQuotaStatusNRPECommand(volume):
    """Build the NRPE command line for the 'quota' volume status check."""
    check_kind = 'quota'
    template = "check_vol_status -a %s %s"
    return template % (volume, check_kind)


def _getVolSelfHealStatusNRPECommand(volume):
    """Build the NRPE command line for the 'self-heal' volume status check."""
    check_kind = 'self-heal'
    template = "check_vol_status -a %s %s"
    return template % (volume, check_kind)


def _getVolGeoRepStatusNRPECommand(volume):
    """Build the NRPE command line for the 'geo-rep' volume status check."""
    check_kind = 'geo-rep'
    template = "check_vol_status -a %s %s"
    return template % (volume, check_kind)


def getReplicaSet(bricks, pair_index, rCount):
    """Return the bricks that form one replica set of a volume.

    bricks     - list of bricks in the volume
    pair_index - position of the wanted replica set, counted from 1
    rCount     - replica count, i.e. the number of bricks per set

    The result is the slice of ``bricks`` holding that set; it is
    shorter than rCount (possibly empty) when the list runs out.
    """
    end_index = pair_index * rCount
    begin_index = end_index - rCount
    return bricks[begin_index:begin_index + rCount]


def _getVolDetailNRPECommand(volume):
    """Build the NRPE command line that discovers the details of a volume."""
    template = "discover_volume_info -a %s"
    return template % volume


def _isAnyReplicaSetDown(bricks, rCount, brick_list_critical):
    """Tell whether all bricks of at least one replica set are down.

    bricks              - all bricks of the volume, in volume order
    rCount              - replica count of the volume
    brick_list_critical - the bricks whose service state is CRITICAL

    Both lists hold dicts with a 'brick' ("<address>:<path>") and a
    'uuid' key.  A brick counts as down when a critical entry has the
    same host uuid and the same brick path.
    """
    down = set((entry['uuid'], entry['brick'].split(':')[1])
               for entry in brick_list_critical)
    # Floor division keeps the number of sets an int on Python 3 too;
    # with true division range() would be handed a float.
    noOfReplicas = len(bricks) // rCount
    for index in range(1, noOfReplicas + 1):
        replica_list = getReplicaSet(bricks, index, rCount)
        noOfBricksDown = sum(
            1 for brick in replica_list
            if (brick['uuid'], brick['brick'].split(':')[1]) in down)
        if noOfBricksDown == rCount:
            return True
    return False


def _getVolumeStatusOutput(hostgroup, volume):
    """Compute the plugin status and output for the volume status check.

    hostgroup - host group (cluster) the volume belongs to
    volume    - name of the volume

    The volume status check is first run through NRPE on a host of the
    group; any result other than OK is returned unchanged.  Otherwise
    the states of the volume's "Brick - ..." services are read from
    livestatus and combined with the volume type:

    * all bricks critical             -> CRITICAL
    * no brick critical, some ok      -> OK
    * some ok, some critical
        - DISTRIBUTE                      -> CRITICAL
        - REPLICATE/DISTRIBUTED_REPLICATE -> WARNING, or CRITICAL when
          a complete replica set is down
        - any other type                  -> WARNING

    When no brick is ok or critical the NRPE result is returned as is.
    Returns a (status, output) tuple; the status is UNKNOWN when the
    volume details cannot be fetched.
    """
    status, output = _executeRandomHost(hostgroup,
                                        _getVolStatusNRPECommand(volume))
    if status != utils.PluginStatusCode.OK:
        return status, output

    brick_details = json.loads(livestatus.readLiveStatusAsJSON(
        "GET services\n"
        "Columns: description state host_address host_name\n"
        "Filter: host_groups >= %s\n"
        "Filter: custom_variable_values >= %s\n"
        "Filter: description ~ Brick - \n"
        % (hostgroup, volume)))
    # brick_details will be as below:
    # [[u'Brick - /root/b3', 0, u'10.70.42.246', u'nishanth-rhs-2']]
    # parse this to find the number of critical/ok bricks and the list
    # of critical bricks
    bricks_ok = 0
    bricks_critical = 0
    brick_list_critical = []
    for brick_detail in brick_details:
        if brick_detail[1] == utils.PluginStatusCode.OK:
            bricks_ok += 1
        elif brick_detail[1] == utils.PluginStatusCode.CRITICAL:
            bricks_critical += 1
            # look up the uuid of the host serving the critical brick
            custom_vars = json.loads(livestatus.readLiveStatusAsJSON(
                "GET hosts\n"
                "Columns: custom_variables\n"
                "Filter: groups >= %s\n"
                "Filter: name = %s\n"
                % (hostgroup, brick_detail[3])))
            brick_dict = {}
            # "<host address>:<brick path>"; the path is the part of
            # the service description starting at the first "/"
            brick_dict['brick'] = brick_detail[2] + ":" + \
                brick_detail[0][brick_detail[0].find("/"):]
            brick_dict['uuid'] = custom_vars[0][0]['HOST_UUID']
            brick_list_critical.append(brick_dict)

    # Get volume details
    nrpeStatus, nrpeOut = _executeRandomHost(
        hostgroup, _getVolDetailNRPECommand(volume))
    if nrpeStatus != utils.PluginStatusCode.OK:
        return utils.PluginStatusCode.UNKNOWN, nrpeOut
    volInfo = json.loads(nrpeOut)
    # Get the volume type
    vol_type = volInfo[volume]['type']
    # Comma separated list of the critical bricks, used in the messages
    down_bricks = ', '.join(entry['brick'] for entry in brick_list_critical)

    if bricks_ok == 0 and bricks_critical > 0:
        status = utils.PluginStatusCode.CRITICAL
        output = "CRITICAL: Volume : %s type - All bricks " \
                 "are down " % (vol_type)
    elif bricks_ok > 0 and bricks_critical == 0:
        status = utils.PluginStatusCode.OK
        output = "OK: Volume : %s type - All bricks " \
                 "are Up " % (vol_type)
    elif bricks_critical > 0:
        if vol_type == "DISTRIBUTE":
            status = utils.PluginStatusCode.CRITICAL
            output = "CRITICAL: Volume : %s type \n Brick(s) - <%s> " \
                     "is|are down " % (vol_type, down_bricks)
        elif vol_type in ("DISTRIBUTED_REPLICATE", "REPLICATE"):
            output = "WARNING: Volume : %s type \n Brick(s) - <%s> " \
                     "is|are down, but replica pair(s) are up" % \
                     (vol_type, down_bricks)
            status = utils.PluginStatusCode.WARNING
            bricks = [{'brick': brick['brickaddress'] + ":" +
                       brick['brickpath'], 'uuid': brick['hostUuid']}
                      for brick in volInfo[volume]['bricks']]
            # check whether the replica is up for the bricks
            # which are down
            rCount = int(volInfo[volume]['replicaCount'])
            if _isAnyReplicaSetDown(bricks, rCount, brick_list_critical):
                output = "CRITICAL: Volume : %s type \n Bricks " \
                         "- <%s> are down, along with one or more " \
                         "replica pairs" % (vol_type, down_bricks)
                status = utils.PluginStatusCode.CRITICAL
        else:
            output = "WARNING: Volume : %s type \n Brick(s) - <%s> " \
                     "is|are down" % (vol_type, down_bricks)
            status = utils.PluginStatusCode.WARNING
    return status, output


def _getVolumeQuotaStatusOutput(hostgroup, volume):
    """Return (status, output) for the quota check of ``volume``.

    hostgroup - host group (cluster) the volume belongs to
    volume    - name of the volume

    The current result of the "Volume Quota - <volume>" service is read
    from livestatus first.  If it is OK and its long plugin output
    contains "QUOTA: OK" that result is returned without polling;
    otherwise the quota check is run through NRPE on a host of the
    group.
    """
    # get current volume quota status
    table = livestatus.readLiveStatus("GET services\n"
                                      "Columns: state long_plugin_output\n"
                                      "Filter: description = "
                                      "Volume Quota - %s" % volume)
    servicestatus = utils.PluginStatusCode.UNKNOWN
    statusoutput = ''
    if len(table) > 0:
        servicetab = table[0]
        servicestatus = servicetab[0]
        statusoutput = servicetab[1]
    # NOTE(review): the state presumably arrives as a string (the code
    # has always cast it for the comparison).  Convert it once and
    # return the int, because a string status handed to exit() is
    # printed to stderr and turns into exit code 1.
    servicestatus = int(servicestatus)
    if (servicestatus == utils.PluginStatusCode.OK and
            "QUOTA: OK" in statusoutput):
        # if ok, don't poll
        return servicestatus, statusoutput
    return _executeRandomHost(hostgroup, _getVolQuotaStatusNRPECommand(volume))


def execNRPECommand(command):
    """Run ``command`` through utils.execCmd and return (status, output).

    command - complete command line as one string; it is split on
              whitespace before execution

    The third value returned by utils.execCmd is discarded.
    """
    argv = command.split()
    rc, out, _unused = utils.execCmd(argv, raw=True)
    return rc, out


def _executeRandomHost(hostgroup, command):
    """Run an NRPE command on a host of ``hostgroup``.

    hostgroup - host group whose usable hosts (see _getListHosts) are
                the candidates
    command   - NRPE command, appended to the NRPE base command

    A randomly chosen host is tried first.  If its result is UNKNOWN
    the remaining hosts are tried in turn until one gives a result
    other than UNKNOWN.  Returns (status, output) of the last attempt,
    or UNKNOWN with a message when the group has no usable host.

    The NRPE timeout is read from the module level ``args`` object.
    """
    locked_marker = "UNKNOWN: temporary error"

    def run_on(node):
        # Resolve the host address and send the NRPE command to it
        address = _getHostAddress(node)
        return execNRPECommand(
            server_utils.getNRPEBaseCommand(address, timeout=args.timeout)
            + command)

    candidates = _getListHosts(hostgroup)
    if not candidates:
        return (utils.PluginStatusCode.UNKNOWN,
                " UNKNOWN: No hosts(with state UP) found in the cluster")

    first_choice = random.choice(candidates)
    status, output = run_on(first_choice)
    if status != utils.PluginStatusCode.UNKNOWN:
        return status, output

    # The random host was not able to execute the command, so walk
    # through the hosts of the group until the command succeeds.
    # The host already tried is skipped, unless the failure was a
    # locked volume, in which case the same host may be tried again.
    if locked_marker not in output:
        candidates.remove(first_choice)
    for node in candidates:
        if locked_marker in output:
            # volume locked, so wait 2 seconds before trying again
            time.sleep(2)
        status, output = run_on(node)
        if status != utils.PluginStatusCode.UNKNOWN:
            return status, output
    return status, output


def showVolumeOutput(args):
    """Run the check selected by ``args.option``.

    args - parsed command line; the attributes option, hostgroup,
           volume, warning and critical are read

    Returns a (status, output) tuple.  A missing or unsupported option
    gives an UNKNOWN result; it used to fail with an UnboundLocalError
    because no NRPE command had been chosen.
    """
    option = args.option

    # 'status' and 'quota' have dedicated handlers
    if option == 'status':
        return _getVolumeStatusOutput(args.hostgroup, args.volume)
    if option == 'quota':
        return _getVolumeQuotaStatusOutput(args.hostgroup, args.volume)

    # the remaining checks are a single NRPE command each
    if option == 'utilization':
        command = _getVolUtilizationNRPECommand(
            args.volume, args.warning, args.critical)
    elif option == 'self-heal':
        command = _getVolSelfHealStatusNRPECommand(args.volume)
    elif option == 'geo-rep':
        command = _getVolGeoRepStatusNRPECommand(args.volume)
    else:
        return (utils.PluginStatusCode.UNKNOWN,
                "UNKNOWN: Unsupported or missing volume option: %s"
                % (option,))

    return _executeRandomHost(args.hostgroup, command)


def parse_input():
    """Parse and validate the command line of the plugin.

    Returns the argparse namespace with the attributes hostgroup,
    volume, warning (default 70), critical (default 90), option and
    timeout.

    When the critical threshold is not greater than the warning
    threshold a message is printed and the process exits with
    utils.PluginStatusCode.UNKNOWN.
    """
    parser = argparse.ArgumentParser(
        usage='%(prog)s [-h] <hostgroup>  <volume> -w <Warning>'
        ' -c <Critical> [-o|--option]')
    parser.add_argument(
        "hostgroup",
        help="Name of the hostgroup to which the volume belongs")
    parser.add_argument(
        "volume",
        help="Name of the volume being queried")
    parser.add_argument(
        "-w",
        "--warning",
        action="store",
        type=int,
        default=70,
        help="Warning Threshold in percentage")
    parser.add_argument(
        "-c",
        "--critical",
        action="store",
        type=int,
        default=90,
        help="Critical Threshold in percentage")
    parser.add_argument('-o', '--option',
                        action='store',
                        help='the volume option to check',
                        choices=['utilization',
                                 'status',
                                 'quota',
                                 'self-heal',
                                 'geo-rep'])
    parser.add_argument('-t', '--timeout',
                        action='store',
                        help='NRPE timeout')
    args = parser.parse_args()
    if args.critical <= args.warning:
        # Single-argument parenthesised form: prints the same text on
        # Python 2 and is valid syntax on Python 3 as well.
        print("UNKNOWN:Critical must be greater than Warning.")
        sys.exit(utils.PluginStatusCode.UNKNOWN)
    return args

if __name__ == '__main__':
    # Script entry point: parse the command line, run the selected
    # check, print the plugin output and exit with the plugin status.
    # ``args`` is deliberately a module level name: _executeRandomHost
    # reads args.timeout from it.
    args = parse_input()
    status, output = showVolumeOutput(args)
    print(output)
    # sys.exit instead of exit(): the latter is a helper added by the
    # site module and is not available when site is not loaded.
    sys.exit(status)