diff options
author | Timothy Asir <tjeyasin@redhat.com> | 2014-05-30 17:05:14 +0530 |
---|---|---|
committer | Sahina Bose <sabose@redhat.com> | 2014-06-10 02:52:20 -0700 |
commit | 8e977e1fd0a0bed52049344765ab4581d7f3c761 (patch) | |
tree | 4580b6e64385800bc511cf2fd7a76537c2dd09da | |
parent | ab5dd8ea647fc1aa80f7ba6b43520979eb1827cc (diff) |
Add status information to show disk usage
The plugin output will now report the status in the following form:
for disk status "CRITICAL":
CRITICAL: 4% used (4GB out of 100GB):mounts:
(CRITICAL: <critical list>, followed by the WARNING mounts,
if any, followed by the OK mounts, if any)
for disk status "WARNING":
WARNING: 4% used (4GB out of 100GB):mounts:
(the WARNING mounts, if any, followed by the OK mounts, if any)
for disk status "OK":
OK: 4% used (4GB out of 100GB):mounts:(<mounts>)
Bug-Url: https://bugzilla.redhat.com/show_bug.cgi?id=1081495
Change-Id: I9dbda7a5d6ea992ba73acce2174e6d66f2e16066
Signed-off-by: Timothy Asir <tjeyasin@redhat.com>
Reviewed-on: http://review.gluster.org/7936
Tested-by: Timothy Asir <tim.gluster@gmail.com>
Reviewed-by: Sahina Bose <sabose@redhat.com>
-rw-r--r-- | config/nrpe.in | 1 | ||||
-rw-r--r-- | gluster-nagios-addons.spec.in | 4 | ||||
-rwxr-xr-x | plugins/check_disk_and_inode.py | 250 | ||||
-rw-r--r-- | tests/test_disk.py | 92 |
4 files changed, 222 insertions, 125 deletions
diff --git a/config/nrpe.in b/config/nrpe.in index b0fa08c..8b4f6e9 100644 --- a/config/nrpe.in +++ b/config/nrpe.in @@ -1,4 +1,5 @@ Cmnd_Alias NRPE_PATHS = @sbindir@/send_nsca, \ + @libdir@/nagios/plugins/gluster/check_disk_and_inode.py, \ @libdir@/nagios/plugins/gluster/check_vol_utilization.py, \ @libdir@/nagios/plugins/gluster/check_volume_status.py, \ @libdir@/nagios/plugins/gluster/check_gluster_proc_status.py, \ diff --git a/gluster-nagios-addons.spec.in b/gluster-nagios-addons.spec.in index 497f085..0ae3bed 100644 --- a/gluster-nagios-addons.spec.in +++ b/gluster-nagios-addons.spec.in @@ -141,12 +141,12 @@ fi cat >> %{_sysconfdir}/nagios/nrpe.cfg <<EOF %{_start_conf_section} -command[check_disk_and_inode]=%{_libdir}/nagios/plugins/gluster/check_disk_and_inode.py -w 80 -c 90 -l -i /boot -i /var -i /root -n --inode +command[check_disk_and_inode]=sudo %{_libdir}/nagios/plugins/gluster/check_disk_and_inode.py -w 80 -c 90 -l -i /boot -i /var -i /root -n --inode command[check_memory]=%{_libdir}/nagios/plugins/gluster/memory.py -w 80 -c 90 -t 2 command[check_swap_usage]=%{_libdir}/nagios/plugins/gluster/swap.py -w 80 -c 90 -t 2 command[check_cpu_multicore]=%{_libdir}/nagios/plugins/gluster/cpu.py -w 80 -c 90 -t 2 command[check_interfaces]=%{_libdir}/nagios/plugins/gluster/network.py -e lo -e ';vdsmdummy;' -t 2 -command[check_brick_usage]=%{_libdir}/nagios/plugins/gluster/check_disk_and_inode.py -w 80 -c 90 -n -i \$ARG1\$ +command[check_brick_usage]=sudo %{_libdir}/nagios/plugins/gluster/check_disk_and_inode.py -w 80 -c 90 -n -i \$ARG1\$ command[check_vol_utilization]=sudo %{_libdir}/nagios/plugins/gluster/check_vol_utilization.py \$ARG1\$ -w \$ARG2\$ -c \$ARG3\$ command[check_vol_status]=sudo %{_libdir}/nagios/plugins/gluster/check_volume_status.py -v \$ARG1\$ -t \$ARG2\$ command[check_proc_status]=sudo %{_libdir}/nagios/plugins/gluster/check_gluster_proc_status.py -t \$ARG1\$ diff --git a/plugins/check_disk_and_inode.py b/plugins/check_disk_and_inode.py index 
d7ee148..30e8035 100755 --- a/plugins/check_disk_and_inode.py +++ b/plugins/check_disk_and_inode.py @@ -1,5 +1,4 @@ #!/usr/bin/python -# sadf.py -- nagios plugin uses sadf output for perf data # Copyright (C) 2014 Red Hat Inc # # This program is free software; you can redistribute it and/or @@ -18,6 +17,7 @@ # +import os import re import sys import commands @@ -26,6 +26,7 @@ from glusternagios import utils WARNING_LEVEL = 80 CRITICAL_LEVEL = 90 +INVALID_STATUS_CODE = -1 def getVal(val): @@ -36,18 +37,40 @@ def getVal(val): return 0 -def getUsageAndFree(command, lvm): - disk = {'path': None, 'usePercent': None, 'avail': None, - 'used': None, 'size': None, 'fs': None, 'status': None, - 'retCode': 0} +def getUsageAndFree(command, path, crit, warn, lvm): + disk = {'path': None, 'usePcent': 0, 'avail': 0, + 'used': 0, 'size': 0, 'fs': None, + 'status': None, 'msg': None, 'availPcent': 0, + 'statusCode': utils.PluginStatusCode.UNKNOWN} + + # Check if device exists and permissions are ok + if not os.access(path, os.F_OK): + disk['status'] = "Device not found!" + disk['msg'] = 'no device' + disk['fs'] = path + disk['statusCode'] = utils.PluginStatusCode.CRITICAL + return disk + + if not os.access(path, os.R_OK): + disk['status'] = "Unable to access the device" + disk['msg'] = 'no access' + disk['fs'] = path + disk['statusCode'] = utils.PluginStatusCode.CRITICAL + return disk + status = commands.getstatusoutput(command) + # Sample output + # (0, 'Filesystem 1G-blocks Used Available Use% Mounted on\n/dev/sda1 + # 290G 196G 79G 72% /') if status[0] != 0: - disk['retCode'] = status[0] + disk['msg'] = 'error:%s' % status[0] if status[0] == 256: - disk['status'] = "Brick path not found!" + disk['status'] = "Brick/Device path not found!" 
else: disk['status'] = status[1] + disk['statusCode'] = utils.PluginStatusCode.CRITICAL return disk + status = status[1].split() disk['path'] = status[-1] disk['avail'] = getVal(status[-3]) @@ -55,19 +78,29 @@ def getUsageAndFree(command, lvm): disk['size'] = getVal(status[-5]) disk['fs'] = status[-6] disk['usePcent'] = getVal(status[-2]) + if disk['usePcent'] >= crit: + disk['statusCode'] = utils.PluginStatusCode.CRITICAL + elif disk['usePcent'] >= warn: + disk['statusCode'] = utils.PluginStatusCode.WARNING + elif disk['usePcent'] < warn: + disk['statusCode'] = utils.PluginStatusCode.OK disk['availPcent'] = 100 - disk['usePcent'] + return disk -def getDisk(path, usage=None, lvm=False): +def getDisk(path, crit, warn, usage=None, lvm=False): if usage: - return getUsageAndFree("df -B%s %s" % (usage, path), lvm) + return getUsageAndFree("df -B%s %s" % (usage, path), + path, crit, warn, lvm) else: - return getUsageAndFree("df -BG %s" % path, lvm) + return getUsageAndFree("df -BG %s" % path, + path, crit, warn, lvm) -def getInode(path, lvm=False): - return getUsageAndFree("df -i %s" % path, lvm) +def getInode(path, crit, warn, lvm=False): + return getUsageAndFree("df -i %s" % path, + path, crit, warn, lvm) def getMounts(searchQuery, excludeList=[]): @@ -117,40 +150,60 @@ def parse_input(): return parser.parse_args() +def _getMsg(okList, warnList, critList): + msg = ", ".join(critList) + if critList and (warnList or okList): + msg = "CRITICAL: " + msg + if warnList: + if msg: + msg += "; WARNING: " + msg += ", ".join(warnList) + if okList: + if msg: + msg += "; OK: " + msg += ", ".join(okList) + return msg + + +def _getUnitAndType(val): + unit = utils.convertSize(val, "GB", "TB") + if unit >= 1: + return unit, "TB" + else: + return val, "GB" + + def showDiskUsage(warn, crit, mountPaths, toListInode, usage=False, isLvm=False, ignoreError=False): diskPerf = [] warnList = [] critList = [] - diskList = [] + okList = [] mounts = [] - level = -1 - msg = "" + statusCode = 
INVALID_STATUS_CODE + totalUsed = 0 + totalSize = 0 + noOfMounts = len(mountPaths) + maxPercentUsed = 0 for path in mountPaths: - disk = getDisk(path, - usage, - isLvm) - - inode = getInode(path, - isLvm) - - if disk['retCode'] != 0 or inode['retCode'] != 0: - return utils.PluginStatusCode.CRITICAL, disk['status'], "" + disk = getDisk(path, crit, warn, usage, isLvm) + inode = getInode(path, crit, warn, isLvm) if disk['path'] in mounts: continue if not disk['used'] or not inode['used']: - if ignoreError: - continue - else: + if not ignoreError: sys.exit(utils.PluginStatusCode.UNKNOWN) - mounts.append(disk['path']) - if usage: - data = "%s=%.1f;%.1f;%.1f;0;%.1f" % ( + if disk['path']: + mounts.append(disk['path']) + data = "" + if usage and disk['path']: + data = "%s=%.1f%s;%.1f;%.1f;0;%.1f" % ( disk['path'], disk['used'], + usage, warn * disk['size'] / 100, crit * disk['size'] / 100, disk['size']) @@ -160,9 +213,9 @@ def showDiskUsage(warn, crit, mountPaths, toListInode, usage=False, inode['used'], warn * inode['used'] / 100, crit * inode['used'] / 100, - inode['used']) - else: - data = "%s=%.2f%%;%s;%s;0;%s" % ( + inode['size']) + elif disk['path']: + data = "%s=%.2f%%;%s;%s;0;%sGB" % ( disk['path'], disk['usePcent'], warn, @@ -178,41 +231,81 @@ def showDiskUsage(warn, crit, mountPaths, toListInode, usage=False, inode['size']) diskPerf.append(data) - if disk['usePcent'] >= crit or inode['usePcent'] >= crit: - if disk['usePcent'] >= crit: - critList.append( - "disk:%s;%s;%s%%" % (disk['fs'], - disk['path'], - disk['usePcent'])) + totalUsed += disk['used'] + totalSize += disk['size'] + if disk['usePcent'] > maxPercentUsed: + maxPercentUsed = disk['usePcent'] + + # adding into status message if there is any + # specfic status found (short msg for list of disks) + msg = "" + if disk['status'] and disk['msg']: + if noOfMounts == 1: + msg = "%s=%s(%s)" % (disk['fs'], disk['path'], + disk['status']) else: - critList.append("inode:%s;%s;%s%%" % (inode['fs'], - 
inode['path'], - inode['usePcent'])) - if not level > utils.PluginStatusCode.WARNING: - level = utils.PluginStatusCode.CRITICAL - elif (disk['usePcent'] >= warn and disk['usePcent'] < crit) or ( - inode['usePcent'] >= warn and inode['usePcent'] < crit): - if disk['usePcent'] >= warn: - warnList.append("disk:%s;%s;%s%%" % (disk['fs'], - disk['path'], - disk['usePcent'])) + msg = "%s(%s)" % (disk['fs'], disk['msg']) + else: + if noOfMounts == 1: + msg = "%s=%s" % (disk['fs'], disk['path']) else: - warnList.append("inode:%s;%s;%s%%" % (inode['fs'], - inode['path'], - inode['usePcent'])) - if not level > utils.PluginStatusCode.OK: - level = utils.PluginStatusCode.WARNING + msg = "%s" % (disk['path']) + + if disk['statusCode'] == utils.PluginStatusCode.CRITICAL or \ + inode['statusCode'] == utils.PluginStatusCode.CRITICAL: + statusCode = utils.PluginStatusCode.CRITICAL + critList.append(msg) + elif (disk['statusCode'] == utils.PluginStatusCode.WARNING or + inode['statusCode'] == utils.PluginStatusCode.WARNING): + # if any previous disk statusCode is not critical + # we should not change the statusCode into warning + if statusCode != utils.PluginStatusCode.CRITICAL: + statusCode = utils.PluginStatusCode.WARNING + # just adding warning values into the list + warnList.append(msg) + elif disk['statusCode'] == utils.PluginStatusCode.OK: + if statusCode == INVALID_STATUS_CODE or \ + statusCode == utils.PluginStatusCode.OK: + statusCode = utils.PluginStatusCode.OK + okList.append(msg) else: - diskList.append("%s=%s" % (disk['fs'], disk['path'])) - - if len(critList) > 0: - msg += "CRITICAL: " + ",".join(critList) + " " - if len(warnList) > 0: - msg += "WARNING: " + ",".join(warnList) + " " - if len(diskList) > 0: - msg += "OK: disks:mounts:(" + ",".join(diskList) + ")" + # added \ to fix E125 pep8 error + if statusCode != utils.PluginStatusCode.CRITICAL or \ + statusCode != utils.PluginStatusCode.WARNING: + statusCode = utils.PluginStatusCode.UNKNOWN + okList.append(msg) + + 
msg = _getMsg(okList, warnList, critList) + + if totalUsed == 0 and totalSize == 0: + # avoid zero div error + return statusCode, "mount: %s" % msg, diskPerf + if totalUsed == 0: + # avoid zero div error + totUsagePercent = 0 + elif len(mounts) > 1: + totUsagePercent = totalUsed / totalSize * 100 + else: + totUsagePercent = maxPercentUsed + usageMsg = "" + if not usage: + totUsedSz, totUsedSzUnit = _getUnitAndType(totalUsed) + totSpaceSz, totSpaceSzUnit = _getUnitAndType(totalSize) + usageMsg = "%.1f%% used (%s%s out of %s%s)\n" % (totUsagePercent, + totUsedSz, + totUsedSzUnit, + totSpaceSz, + totSpaceSzUnit) + else: + usageMsg = "%.1f%% used (%s%s out of %s%s)\n" % (totUsagePercent, + totalUsed, + usage, + totalSize, + usage) - return level, msg, diskPerf + if usageMsg: + msg = "%s:mount(s): (%s)" % (usageMsg, msg) + return statusCode, msg, diskPerf if __name__ == '__main__': @@ -226,25 +319,28 @@ if __name__ == '__main__': if not options.mountPath or options.lvm or options.all: options.mountPath += getMounts(searchQuery, options.exclude) - level, msg, diskPerf = showDiskUsage(options.warn, - options.crit, - options.mountPath, - options.inode, - options.usage, - options.lvm, - options.ignore) - - if utils.PluginStatusCode.CRITICAL == level: - sys.stdout.write("%s | %s\n" % ( + statusCode, msg, diskPerf = showDiskUsage(options.warn, + options.crit, + options.mountPath, + options.inode, + options.usage, + options.lvm, + options.ignore) + + if utils.PluginStatusCode.CRITICAL == statusCode: + sys.stdout.write("%s : %s | %s\n" % ( + utils.PluginStatus.CRITICAL, msg, " ".join(diskPerf))) sys.exit(utils.PluginStatusCode.CRITICAL) - elif utils.PluginStatusCode.WARNING == level: - sys.stdout.write("%s | %s\n" % ( + elif utils.PluginStatusCode.WARNING == statusCode: + sys.stdout.write("%s : %s | %s\n" % ( + utils.PluginStatus.WARNING, msg, " ".join(diskPerf))) sys.exit(utils.PluginStatusCode.WARNING) else: - sys.stdout.write("%s | %s\n" % ( + sys.stdout.write("%s : %s | 
%s\n" % ( + utils.PluginStatus.OK, msg, " ".join(diskPerf))) diff --git a/tests/test_disk.py b/tests/test_disk.py index a9096c2..b41d3bf 100644 --- a/tests/test_disk.py +++ b/tests/test_disk.py @@ -18,12 +18,16 @@ # Refer to the README and COPYING files for full details of the license # +import os import commands from testrunner import PluginsTestCase as TestCaseBase from plugins import check_disk_and_inode as checkDisk class TestDisk(TestCaseBase): + def mock_osaccess(self, path=None, osflag=None): + return True + def mock_getstatusoutput(self, i): out = [ "Filesystem Size Used Avail Use% Mounted on", @@ -78,28 +82,29 @@ class TestDisk(TestCaseBase): def test_getUsageAndFree(self): commands.getstatusoutput = self.mock_getstatusoutput - disk = checkDisk.getUsageAndFree(1, True) + os.access = self.mock_osaccess + disk = checkDisk.getUsageAndFree(1, "", 80, 90, "") self.assertEqual(disk['usePcent'], 64) self.assertEqual(disk['availPcent'], 36) self.assertEqual(disk['used'], 174) self.assertEqual(disk['avail'], 102) self.assertEqual(disk['path'], '/') - disk = checkDisk.getUsageAndFree(2, True) + disk = checkDisk.getUsageAndFree(2, "", 80, 90, "") self.assertEqual(disk['usePcent'], 0) self.assertEqual(disk['availPcent'], 100) self.assertEqual(disk['used'], 0) self.assertEqual(disk['avail'], 3.0) self.assertEqual(disk['path'], '/var') - disk = checkDisk.getUsageAndFree(3, True) + disk = checkDisk.getUsageAndFree(3, "", 80, 90, "") self.assertEqual(disk['usePcent'], 40) self.assertEqual(disk['availPcent'], 60) self.assertEqual(disk['used'], 200) self.assertEqual(disk['avail'], 100) self.assertEqual(disk['path'], '/mnt1') - disk = checkDisk.getUsageAndFree(4, True) + disk = checkDisk.getUsageAndFree(4, "", 80, 90, "") self.assertEqual(disk['usePcent'], 85) self.assertEqual(disk['availPcent'], 15) self.assertEqual(disk['used'], 1774) @@ -123,65 +128,60 @@ class TestDisk(TestCaseBase): def test_diskUsage(self): commands.getstatusoutput = self.mock_getstatusoutput 
checkDisk.open = self.mock_open + os.access = self.mock_osaccess mounts = checkDisk.getMounts("/", []) self.assertEqual(checkDisk.showDiskUsage(80, 90, [mounts[1]], True, - usage='BGB', - ignoreError=True), - (-1, ' disks:mounts:(/dev/sda1=/)', - ['/=174.0;232.0;261.0;0;290.0 ' - '/=174.0;139.2;156.6;0;174.0'])) + usage='BGB'), + (0, '64.0% used (174.0BGB out of 290.0BGB)\n' + ':mount(s): (/dev/sda1=/)', + ['/=174.0BGB;232.0;261.0;0;290.0 ' + '/=174.0;139.2;156.6;0;290.0'])) self.assertEqual(checkDisk.showDiskUsage(80, 90, - [mounts[1]], True, - ignoreError=True), - (-1, ' disks:mounts:(/dev/sda1=/)', - ['/=64.00;80;90;0;100 /=64.00;80;90;0;100'])) + [mounts[1]], + True), + (0, '64.0% used (174.0GB out of 290.0GB)\n' + ':mount(s): (/dev/sda1=/)', + ['/=64.00%;80;90;0;290.0GB ' + '/=64.00%;80;90;0;290.0'])) self.assertEqual(checkDisk.showDiskUsage(80, 90, - ["/mnt/vol2"], True, - ignoreError=True), - (-1, ' disks:mounts:(10.70.43.190:vol2=/mnt/vol2)', - ['/mnt/vol2=47.00;80;90;0;100 ' - '/mnt/vol2=47.00;80;90;0;100'])) + [mounts[1]], True), + (0, '64.0% used (174.0GB out of 290.0GB)\n' + ':mount(s): (/dev/sda1=/)', + ['/=64.00%;80;90;0;290.0GB ' + '/=64.00%;80;90;0;290.0'])) self.assertEqual(checkDisk.showDiskUsage(80, 90, - ["/mnt/vol2"], True, - usage="MB", - ignoreError=True), - (-1, ' disks:mounts:(10.70.43.190:vol2=/mnt/vol2)', - ['/mnt/vol2=23228.0;42276.8;47561.4;0;52846.0 ' - '/mnt/vol2=23228.0;18582.4;20905.2;0;23228.0'])) + ["/mnt/vol2"], True), + (0, '47.0% used (22.68359375TB out of ' + '51.607421875TB)\n' + ':mount(s): (10.70.43.190:vol2=/mnt/vol2)', + ['/mnt/vol2=47.00%;80;90;0;52846.0GB ' + '/mnt/vol2=47.00%;80;90;0;52846.0'])) self.assertEqual(checkDisk.showDiskUsage(10, 20, - ["/mnt/vol2"], True, - usage="MB", - ignoreError=True), - (2, 'crit:disk:10.70.43.190:vol2;/mnt/vol2;47.0', - ['/mnt/vol2=23228.0;5284.6;10569.2;0;52846.0 ' - '/mnt/vol2=23228.0;2322.8;4645.6;0;23228.0'])) + ["/mnt/vol2"], True), + (2, '47.0% used (22.68359375TB out of ' + 
'51.607421875TB)\n' + ':mount(s): (10.70.43.190:vol2=/mnt/vol2)', + ['/mnt/vol2=47.00%;10;20;0;52846.0GB ' + '/mnt/vol2=47.00%;10;20;0;52846.0'])) # negative test - self.assertEqual(checkDisk.showDiskUsage(-1, - 200, - ["/mnt/vol2"], True, - usage="MB", - ignoreError=True), - (1, 'warn:disk:10.70.43.190:vol2;/mnt/vol2;47.0', - ['/mnt/vol2=23228.0;-528.5;105692.0;0;52846.0 ' - '/mnt/vol2=23228.0;-232.3;46456.0;0;23228.0'])) - - # testing warning level - self.assertEqual(checkDisk.showDiskUsage(40, 50, ["/mnt/vol2"], True, - usage="MB", - ignoreError=True), - (1, 'warn:disk:10.70.43.190:vol2;/mnt/vol2;47.0', - ['/mnt/vol2=23228.0;21138.4;26423.0;0;52846.0 ' - '/mnt/vol2=23228.0;9291.2;11614.0;0;23228.0'])) + self.assertEqual(checkDisk.showDiskUsage(1, + 100, + ["/mnt/vol2"], True), + (1, '47.0% used (22.68359375TB out of ' + '51.607421875TB)\n' + ':mount(s): (10.70.43.190:vol2=/mnt/vol2)', + ['/mnt/vol2=47.00%;1;100;0;52846.0GB ' + '/mnt/vol2=47.00%;1;100;0;52846.0'])) |