1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
|
#!/usr/bin/python
import sys
import json
import random
import argparse
import livestatus
import time
from glusternagios import utils
import server_utils
def _getListHosts(hostgroup):
    """Return the names of the usable hosts in ``hostgroup``.

    The members of the host group are read from livestatus
    (``members_with_state`` column).  A member is kept only when its host
    state equals ``utils.HostStatusCode.UP`` and its "Gluster Management"
    service (see ``_getGlusterdStatus``) reports
    ``utils.PluginStatusCode.OK``.

    Returns an empty list when the query yields no rows, so that callers
    can report "no hosts" instead of crashing with an IndexError.
    """
    result = json.loads(livestatus.readLiveStatusAsJSON(
        "GET hostgroups\nColumns: members_with_state\n"
        "Filter: name = " + hostgroup + "\n"))
    # No matching row (presumably an unknown host group): nothing to pick.
    if not result or not result[0]:
        return []
    members = result[0][0]
    # Each member row is indexed as [host name, host state, ...].
    return [member[0] for member in members
            if member[1] == utils.HostStatusCode.UP and
            _getGlusterdStatus(member[0]) == utils.PluginStatusCode.OK]
def _getGlusterdStatus(hostname):
    """Return the state of the "Gluster Management" service on ``hostname``.

    The state is read from livestatus.  When livestatus returns no row for
    that service on the host, ``utils.PluginStatusCode.UNKNOWN`` is
    returned instead of raising an IndexError.
    """
    rows = json.loads(livestatus.readLiveStatusAsJSON(
        "GET services\nColumns: state\n"
        "Filter: description = Gluster Management\n"
        "Filter: host_name = " + hostname + "\n"))
    if not rows or not rows[0]:
        # Service not configured/known for this host.
        return utils.PluginStatusCode.UNKNOWN
    return rows[0][0]
def _getHostAddress(host):
    """Look up the address of ``host`` in livestatus.

    The host is matched on its ``display_name``; trailing whitespace is
    stripped from the livestatus answer before it is returned.
    """
    query = ("GET hosts\nColumns: address\n"
             "Filter: display_name = " + host + "\n")
    answer = livestatus.checkLiveStatus(query)
    return answer.rstrip()
def _getVolUtilizationNRPECommand(volume, warning, critical):
return ("check_vol_utilization -a " + volume + " " +
str(warning) + " " + str(critical))
def _getVolStatusNRPECommand(volume):
return ("check_vol_status -a %s %s" % (volume, 'info'))
def _getVolQuotaStatusNRPECommand(volume):
return ("check_vol_status -a %s %s" % (volume, 'quota'))
def _getVolSelfHealStatusNRPECommand(volume):
return ("check_vol_status -a %s %s" % (volume, 'self-heal'))
def _getVolSelfHealInfoNRPECommand(volume):
return ("check_vol_status -a %s %s" % (volume, 'heal-info'))
def _getVolGeoRepStatusNRPECommand(volume):
return ("check_vol_status -a %s %s" % (volume, 'geo-rep'))
def _getQuorumStatusNRPECommand():
return ("check_quorum_status")
def getSubVolumeSet(bricks, pair_index, count):
    """Return the ``pair_index``-th group of ``count`` consecutive bricks.

    bricks     - list of bricks in the volume
    pair_index - 1-based number of the group (replica/disperse set) wanted
    count      - number of bricks per group (replica or disperse count)
    """
    stop = pair_index * count
    first = stop - count
    return bricks[first:first + count]
def _getVolDetailNRPECommand(volume):
return ("discover_volume_info -a %s" % (volume))
def _getDisperseOrReplicaStatus(vol_type, brick_list_critical,
                                volInfo, volume):
    """Grade a replicated/dispersed volume that has some bricks down.

    vol_type            - volume type string; it is searched for the
                          substrings "REPLICATE" and "DISPERSE"
    brick_list_critical - list of dicts with keys 'brick'
                          ("<address>:<path>") and 'uuid' describing the
                          bricks that are down
    volInfo             - volume details keyed by volume name; the entry
                          for ``volume`` provides 'bricks' and the
                          'replicaCount' / 'disperseCount' /
                          'redundancyCount' values
    volume              - name of the volume being checked

    The bricks of the volume are cut into consecutive groups of
    ``replicaCount`` (replicate) or ``disperseCount`` (disperse) bricks.
    The result is CRITICAL as soon as one group has all of its bricks down
    (replicate) or more than ``redundancyCount`` bricks down (disperse);
    otherwise it is WARNING.

    Returns a ``(status, output)`` tuple.
    """
    is_replicate = vol_type.find("REPLICATE") >= 0
    is_disperse = vol_type.find("DISPERSE") >= 0
    message_string = "replica pair" if is_replicate else "disperse pair"
    down_bricks = ', '.join(entry['brick'] for entry in brick_list_critical)

    output = "WARNING: Volume : %s type \n Brick(s) - <%s> " \
        "is|are down, but %s(s) are up" % \
        (vol_type, down_bricks, message_string)
    status = utils.PluginStatusCode.WARNING

    vol_details = volInfo[volume]
    bricks = [{'brick': brick['brickaddress'] + ":" + brick['brickpath'],
               'uuid': brick['hostUuid']}
              for brick in vol_details['bricks']]

    # check whether the replica/disperse pair is up for the bricks
    # which are down
    if is_replicate:
        count = int(vol_details['replicaCount'])
    else:
        count = int(vol_details['disperseCount'])
    # Only dispersed volumes carry a redundancy count; it is consulted
    # solely when is_disperse is true.
    redundancyCount = None
    if is_disperse:
        redundancyCount = int(vol_details['redundancyCount'])

    def _isDown(brick):
        # A brick is down when a critical brick has the same host uuid and
        # the same brick path (the part after the first ':').
        for brick_critical in brick_list_critical:
            if brick.get('uuid') == brick_critical.get('uuid') \
                    and brick.get('brick').split(':')[1] == \
                    brick_critical.get('brick').split(':')[1]:
                return True
        return False

    # Floor division keeps the group count an integer so range() accepts
    # it regardless of the active division semantics.
    noOfPairs = len(bricks) // count
    for index in range(1, noOfPairs + 1):
        brick_list = getSubVolumeSet(bricks, index, count)
        noOfBricksDown = sum(1 for brick in brick_list if _isDown(brick))
        if (is_replicate and noOfBricksDown == count) or \
                (is_disperse and noOfBricksDown > redundancyCount):
            output = "CRITICAL: Volume : %s type \n Bricks " \
                "- <%s> are down, along with one or more " \
                "%s(s)" % \
                (vol_type, down_bricks, message_string)
            status = utils.PluginStatusCode.CRITICAL
            break
    return status, output
def _getVolumeStatusOutput(hostgroup, volume):
    """Compute the status of ``volume`` from NRPE and its brick services.

    The volume status is first requested through NRPE on a host of
    ``hostgroup``.  Unless that check is OK its result is returned as is.
    Otherwise the "Brick - ..." services of the volume are read from
    livestatus, the OK and CRITICAL bricks are counted, and the volume
    type (from the volume detail NRPE command) decides how partially
    failed bricks are graded.

    Returns a ``(status, output)`` tuple.
    """
    status, output = _executeRandomHost(hostgroup,
                                        _getVolStatusNRPECommand(volume))
    if status != utils.PluginStatusCode.OK:
        return status, output

    brick_services = json.loads(livestatus.readLiveStatusAsJSON(
        "GET services\n"
        "Columns: description state host_address host_name\n"
        "Filter: host_groups >= %s\n"
        "Filter: custom_variable_values >= %s\n"
        "Filter: description ~ Brick - \n"
        % (hostgroup, volume)))
    # The answer looks like:
    # [[u'Brick - /root/b3', 0, u'10.70.42.246', u'nishanth-rhs-2']]
    # Count the ok/critical bricks and collect the critical ones.
    ok_count = 0
    critical_count = 0
    critical_bricks = []
    for service in brick_services:
        state = service[1]
        if state == utils.PluginStatusCode.OK:
            ok_count += 1
        elif state == utils.PluginStatusCode.CRITICAL:
            critical_count += 1
            # Fetch the uuid of the host that owns the critical brick.
            host_vars = json.loads(livestatus.readLiveStatusAsJSON(
                "GET hosts\n"
                "Columns: custom_variables\n"
                "Filter: groups >= %s\n"
                "Filter: name = %s\n"
                % (hostgroup, service[3])))
            description = service[0]
            brick_path = description[description.find("/"):]
            critical_bricks.append({
                'brick': service[2] + ":" + brick_path,
                'uuid': host_vars[0][0]['HOST_UUID'],
            })

    # Get volume details
    detail_status, detail_output = _executeRandomHost(
        hostgroup, _getVolDetailNRPECommand(volume))
    if detail_status != utils.PluginStatusCode.OK:
        return utils.PluginStatusCode.UNKNOWN, detail_output
    volInfo = json.loads(detail_output)
    # Get the volume type
    vol_type = volInfo[volume]['type']

    if critical_count == 0:
        # Nothing is down; only rewrite the output when at least one brick
        # is known to be OK, otherwise keep the NRPE result.
        if ok_count > 0:
            status = utils.PluginStatusCode.OK
            output = "OK: Volume : %s type - All bricks " \
                "are Up " % vol_type
        return status, output

    if ok_count == 0:
        status = utils.PluginStatusCode.CRITICAL
        output = "CRITICAL: Volume : %s type - All bricks " \
            "are down " % vol_type
        return status, output

    # Some bricks are up and some are down: grade by volume type.
    down_bricks = ', '.join(entry['brick'] for entry in critical_bricks)
    if vol_type == "DISTRIBUTE":
        status = utils.PluginStatusCode.CRITICAL
        output = "CRITICAL: Volume : %s type \n Brick(s) - <%s> " \
            "is|are down " % (vol_type, down_bricks)
    elif vol_type in ("DISTRIBUTED_REPLICATE", "REPLICATE",
                      "DISTRIBUTED_DISPERSE", "DISPERSE"):
        status, output = _getDisperseOrReplicaStatus(
            vol_type, critical_bricks, volInfo, volume)
    else:
        output = "WARNING: Volume : %s type \n Brick(s) - <%s> " \
            "is|are down" % (vol_type, down_bricks)
        status = utils.PluginStatusCode.WARNING
    return status, output
def _getVolumeQuotaStatusOutput(hostgroup, volume):
    """Return the quota status of ``volume`` as ``(status, output)``.

    The current state of the "Volume Quota - <volume>" service is read
    from livestatus.  When it is OK and its long plugin output contains
    "QUOTA: OK" that result is returned without polling; otherwise the
    quota check is executed through NRPE on a host of ``hostgroup``.
    """
    # get current volume quota status
    table = livestatus.readLiveStatus("GET services\n"
                                      "Columns: state long_plugin_output\n"
                                      "Filter: description = "
                                      "Volume Quota - %s" % volume)
    if table:
        servicetab = table[0]
        # NOTE(review): the state appears to arrive as text (hence int());
        # the converted value is returned so that the caller always gets a
        # numeric status usable as a process exit code.
        servicestatus = int(servicetab[0])
        statusoutput = servicetab[1]
        if (servicestatus == utils.PluginStatusCode.OK and
                statusoutput.find("QUOTA: OK") > -1):
            # if ok, don't poll
            return servicestatus, statusoutput
    return _executeRandomHost(hostgroup, _getVolQuotaStatusNRPECommand(volume))
def _getQuorumStatusOutput(hostgroup):
    """Return the cluster quorum status as ``(status, output)``.

    The "Cluster - Quorum Status" service of the host named ``hostgroup``
    is read from livestatus.  A CRITICAL state is returned as stored; for
    any other state (or when no row is found) the quorum check is executed
    through NRPE on a host of ``hostgroup``.
    """
    # get current volume quorum status
    query = ("GET services\n"
             "Columns: state plugin_output\n"
             "Filter: description = "
             "Cluster - Quorum Status\n"
             "Filter: host_name = %s\n" % hostgroup)
    rows = json.loads(livestatus.readLiveStatusAsJSON(query))

    servicestatus = utils.PluginStatusCode.UNKNOWN
    pluginoutput = ''
    if rows:
        # When several rows are returned the last one wins.
        last_row = rows[-1]
        servicestatus = last_row[0]
        pluginoutput = last_row[1]

    if int(servicestatus) == utils.PluginStatusCode.CRITICAL:
        return servicestatus, pluginoutput
    return _executeRandomHost(hostgroup, _getQuorumStatusNRPECommand())
def execNRPECommand(command):
    """Run ``command`` and return its ``(status, output)``.

    The command string is split on whitespace and handed to
    ``utils.execCmd`` with ``raw=True``; the third element of its result
    is discarded.
    """
    argv = command.split()
    status, output, _unused_err = utils.execCmd(argv, raw=True)
    return status, output
def _executeRandomHost(hostgroup, command):
    """Run the NRPE ``command`` on a host of ``hostgroup``.

    A random usable host (see ``_getListHosts``) is tried first.  When it
    answers UNKNOWN the remaining hosts are tried in turn until one gives
    a status other than UNKNOWN.  If the output contains
    "UNKNOWN: temporary error" (volume locked) the first host stays in the
    retry list and each retry is preceded by a 2 second pause.

    The NRPE timeout is read from the module level ``args`` object.

    Returns a ``(status, output)`` tuple; the last result obtained when no
    host succeeds, or UNKNOWN when the host group has no usable host.
    """
    def _runOn(target):
        # Resolve the host address and run the command through NRPE.
        address = _getHostAddress(target)
        return execNRPECommand(server_utils.getNRPEBaseCommand(
            address,
            timeout=args.timeout) + command)

    candidates = _getListHosts(hostgroup)
    if not candidates:
        return (utils.PluginStatusCode.UNKNOWN,
                " UNKNOWN: No hosts(with state UP) found in the cluster")

    first_choice = random.choice(candidates)
    status, output = _runOn(first_choice)
    if status != utils.PluginStatusCode.UNKNOWN:
        return status, output

    # The random host could not execute the command: walk through the
    # hosts of the group until one of them succeeds.  The host already
    # tried is skipped unless the volume was merely locked.
    if "UNKNOWN: temporary error" not in output:
        candidates.remove(first_choice)
    for candidate in candidates:
        if "UNKNOWN: temporary error" in output:
            # volume locked, so wait before trying again
            time.sleep(2)
        status, output = _runOn(candidate)
        if status != utils.PluginStatusCode.UNKNOWN:
            break
    return status, output
def showVolumeOutput(args):
    """Dispatch the check selected by ``args.option``.

    'status', 'quota' and 'quorum' have dedicated handlers; the other
    options are turned into an NRPE command that is executed on a host of
    ``args.hostgroup``.

    Returns a ``(status, output)`` tuple.  When ``args.option`` is missing
    or not recognised, UNKNOWN is returned with an explanatory message
    instead of failing on an unbound ``command`` variable.
    """
    option = args.option
    if option == 'status':
        return _getVolumeStatusOutput(args.hostgroup, args.volume)
    if option == 'quota':
        return _getVolumeQuotaStatusOutput(args.hostgroup, args.volume)
    if option == 'quorum':
        return _getQuorumStatusOutput(args.hostgroup)

    if option == 'utilization':
        command = _getVolUtilizationNRPECommand(
            args.volume, args.warning, args.critical)
    elif option == 'self-heal':
        command = _getVolSelfHealStatusNRPECommand(args.volume)
    elif option == 'geo-rep':
        command = _getVolGeoRepStatusNRPECommand(args.volume)
    elif option == 'heal-info':
        command = _getVolSelfHealInfoNRPECommand(args.volume)
    else:
        # -o/--option has no default, so it may be None here.
        return (utils.PluginStatusCode.UNKNOWN,
                "UNKNOWN: No valid volume option specified, "
                "use -o|--option")
    return _executeRandomHost(args.hostgroup, command)
def parse_input():
    """Parse the command line and return the argparse namespace.

    Positional arguments are the host group and the volume name.  Optional
    arguments are the warning/critical thresholds (integers, default 70
    and 90), the volume option to check and the NRPE timeout.

    Exits with ``utils.PluginStatusCode.UNKNOWN`` when the critical
    threshold is not greater than the warning threshold.
    """
    parser = argparse.ArgumentParser(
        usage='%(prog)s [-h] <hostgroup> <volume> -w <Warning>'
        ' -c <Critical> [-o|--option]')
    parser.add_argument(
        "hostgroup",
        help="Name of the hostgroup to which the volume belongs")
    parser.add_argument(
        "volume",
        help="Name of the volume being queried")
    parser.add_argument(
        "-w",
        "--warning",
        action="store",
        type=int,
        default=70,
        help="Warning Threshold in percentage")
    parser.add_argument(
        "-c",
        "--critical",
        action="store",
        type=int,
        default=90,
        help="Critical Threshold in percentage")
    parser.add_argument('-o', '--option',
                        action='store',
                        help='the volume option to check',
                        choices=['utilization',
                                 'status',
                                 'quota',
                                 'self-heal',
                                 'heal-info',
                                 'geo-rep',
                                 'quorum'])
    parser.add_argument('-t', '--timeout',
                        action='store',
                        help='NRPE timeout')
    args = parser.parse_args()
    if args.critical <= args.warning:
        # Parenthesised single argument: prints the same text whether
        # print is a statement or a function.
        print("UNKNOWN:Critical must be greater than Warning.")
        sys.exit(utils.PluginStatusCode.UNKNOWN)
    return args
if __name__ == '__main__':
    # ``args`` must stay a module level name: _executeRandomHost reads
    # ``args.timeout`` from the global scope.
    args = parse_input()
    status, output = showVolumeOutput(args)
    print(output)
    # sys.exit is always available, unlike the ``exit`` helper that the
    # site module installs for interactive use.
    sys.exit(status)
|