diff options
author | Ramesh Nachimuthu <rnachimu@redhat.com> | 2014-05-07 15:23:55 +0530 |
---|---|---|
committer | Sahina Bose <sabose@redhat.com> | 2014-05-07 06:40:00 -0700 |
commit | 4c63ceaf48a3641adb4e087125d6698fa857ad37 (patch) | |
tree | 78a9a13f971d7e02811115e36756a3c2759200dd | |
parent | a4f97c15206c4930e6336c7a1fba67a231615486 (diff) |
nagios-addons: Add NRPE command for gluster processes
Adding NRPE command for all process related services. We already
have passive checks through NSCA for all these services and bricks.
But passive checks are submitted only when there is a state change and
Nagios may miss those check results if those services are not
configured when we submit passive checks. So, we should have active
checks running at least for the first time to determine the initial
status of the service
Command 'check_proc_status' helps to check the status of all
gluster-related processes
Command 'check_brick_status' helps to check the status of bricks
in the host.
Change-Id: I1f442c0c1e54d606576bc0814044f2f149bca220
Signed-off-by: Ramesh Nachimuthu <rnachimu@redhat.com>
Reviewed-on: http://review.gluster.org/7694
Reviewed-by: Kanagaraj M <kmayilsa@redhat.com>
Reviewed-by: Sahina Bose <sabose@redhat.com>
-rw-r--r-- | gluster-nagios-addons.spec.in | 2 | ||||
-rw-r--r-- | plugins/Makefile.am | 2 | ||||
-rwxr-xr-x | plugins/check_gluster_proc_status.py | 90 | ||||
-rwxr-xr-x | plugins/check_proc_status.py | 182 | ||||
-rwxr-xr-x | plugins/check_proc_util.py | 196 |
5 files changed, 300 insertions, 172 deletions
diff --git a/gluster-nagios-addons.spec.in b/gluster-nagios-addons.spec.in index 6e305bf..363ead4 100644 --- a/gluster-nagios-addons.spec.in +++ b/gluster-nagios-addons.spec.in @@ -144,6 +144,8 @@ command[check_interfaces]=%{_libdir}/nagios/plugins/gluster/network.py -e lo -e command[check_brick_usage]=%{_libdir}/nagios/plugins/gluster/check_disk_and_inode.py -w 80 -c 90 -u MB -n -i \$ARG1\$ command[check_vol_utilization]=sudo %{_libdir}/nagios/plugins/gluster/check_vol_utilization.py \$ARG1\$ -w \$ARG2\$ -c \$ARG3\$ command[check_vol_status]=sudo %{_libdir}/nagios/plugins/gluster/check_volume_status.py -v \$ARG1\$ -t \$ARG2\$ +command[check_proc_status]=sudo %{_libdir}/nagios/plugins/gluster/check_gluster_proc_status.py -t \$ARG1\$ +command[check_brick_status]=sudo %{_libdir}/nagios/plugins/gluster/check_gluster_proc_status.py -t BRICK -v \$ARG1\$ -b \$ARG2\$ ###Auto Discovery related command[discoverpeers]=sudo %{_libdir}/nagios/plugins/gluster/discoverpeers.py command[discover_volume_list]=sudo %{_libdir}/nagios/plugins/gluster/discover_volumes.py -l diff --git a/plugins/Makefile.am b/plugins/Makefile.am index c809b99..5f993aa 100644 --- a/plugins/Makefile.am +++ b/plugins/Makefile.am @@ -24,6 +24,8 @@ dist_glusternagiosplugins_PYTHON = \ discover_volumes.py \ discoverhostparams.py \ configure_gluster_node.py \ + check_gluster_proc_status.py \ + check_proc_util.py \ __init__.py \ memory.py \ network.py \ diff --git a/plugins/check_gluster_proc_status.py b/plugins/check_gluster_proc_status.py new file mode 100755 index 0000000..bc15672 --- /dev/null +++ b/plugins/check_gluster_proc_status.py @@ -0,0 +1,90 @@ +#!/usr/bin/python +# Copyright (C) 2014 Red Hat Inc +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. 
+# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +# + +import argparse + +import check_proc_util +from glusternagios import utils +from glusternagios import glustercli + + +_NFS = "NFS" +_SMB = "CIFS" +_CTDB = "CTDB" +_SHD = "SHD" +_QUOTA = "QUOTA" +_BRICK = "BRICK" +_GLUSTERD = "GLUSTERD" + + +def parse_input(): + parser = argparse.ArgumentParser() + parser.add_argument("-t", "--type", action="store", dest="type", + required=True, + help="Type of status to be shown. Possible values:", + choices=[_NFS, _SMB, _CTDB, _SHD, _QUOTA, _BRICK, + _GLUSTERD]) + parser.add_argument("-v", "--volume", action="store", required=False, + help="Name of the volume for status") + parser.add_argument("-b", "--brickPath", action="store", required=False, + help="Brick Path") + args = parser.parse_args() + return args + + +def _findBrickName(volInfo, brickPath): + hostUuid = glustercli.hostUUIDGet() + for volumeName, volumeInfo in volInfo.iteritems(): + for brick in volumeInfo['bricksInfo']: + if brick.get('hostUuid') == hostUuid \ + and brick['name'].split(':')[1] == brickPath: + return brick['name'] + + +if __name__ == '__main__': + args = parse_input() + status, msg = check_proc_util.getGlusterdStatus() + if status == utils.PluginStatusCode.OK: + if args.type == _NFS: + status, msg = check_proc_util.getNfsStatus(glustercli.volumeInfo()) + elif args.type == _SMB: + status, msg = check_proc_util.getSmbStatus(glustercli.volumeInfo()) + elif args.type == _SHD: + status, msg = check_proc_util.getShdStatus(glustercli.volumeInfo()) + elif args.type == _QUOTA: + status, msg = 
check_proc_util.getQuotadStatus( + glustercli.volumeInfo()) + elif args.type == _CTDB: + volInfo = glustercli.volumeInfo() + nfsStatus, nfsMsg = check_proc_util.getNfsStatus(volInfo) + smbStatus, smbMsg = check_proc_util.getSmbStatus(volInfo) + status, msg = check_proc_util.getCtdbStatus(smbStatus, nfsStatus) + elif args.type == _BRICK: + volInfo = glustercli.volumeInfo(args.volume) + brickName = _findBrickName(volInfo, args.brickPath) + if brickName: + status, msg = check_proc_util.getBrickStatus(args.volume, + brickName) + else: + status = utils.PluginStatusCode.CRITICAL + msg = "Brick - %s not found" % args.brickPath + elif args.type != _GLUSTERD: + msg = "UNKNOWN: Could not determine %s status " % args.type + status = utils.PluginStatusCode.UNKNOWN + print msg + exit(status) diff --git a/plugins/check_proc_status.py b/plugins/check_proc_status.py index 83bde1f..8895b0f 100755 --- a/plugins/check_proc_status.py +++ b/plugins/check_proc_status.py @@ -17,34 +17,19 @@ # import sys -import errno import lockfile import logging -import psutil import time from daemon import runner from logging import handlers import nscautils +import check_proc_util import glusternagios from glusternagios import utils from glusternagios import glustercli -from glusternagios import storage -_checkProc = utils.CommandPath('check_proc', - '/usr/lib64/nagios/plugins/check_procs') - -_glusterVolPath = "/var/lib/glusterd/vols" -_checkNfsCmd = [_checkProc.cmd, "-c", "1:", "-C", "glusterfs", "-a", "nfs"] -_checkShdCmd = [_checkProc.cmd, "-c", "1:", "-C", "glusterfs", "-a", - "glustershd"] -_checkSmbCmd = [_checkProc.cmd, "-c", "1:", "-C", "smbd"] -_checkQuotaCmd = [_checkProc.cmd, "-c", "1:", "-C", "glusterfs", "-a", - "quotad"] -_checkBrickCmd = [_checkProc.cmd, "-C", "glusterfsd"] -_checkGlusterdCmd = [_checkProc.cmd, "-c", "1:", "-w", "1:1", "-C", "glusterd"] -_checkCtdbCmd = [_checkProc.cmd, "-c", "1:", "-C", "ctdbd"] _nfsService = "NFS" _shdService = "Self-Heal" _smbService = "CIFS" @@ 
-59,166 +44,20 @@ checkIdeSmartCmdPath = utils.CommandPath( def getBrickStatus(volInfo): bricks = {} hostUuid = glustercli.hostUUIDGet() - status = None for volumeName, volumeInfo in volInfo.iteritems(): if volumeInfo['volumeStatus'] == glustercli.VolumeStatus.OFFLINE: continue for brick in volumeInfo['bricksInfo']: if brick.get('hostUuid') != hostUuid: continue + status, msg = check_proc_util.getBrickStatus(volumeName, + brick['name']) brickPath = brick['name'].split(':')[1] brickService = _brickService % brickPath - pidFile = brick['name'].replace( - ":/", "-").replace("/", "-") + ".pid" - try: - with open("%s/%s/run/%s" % ( - _glusterVolPath, volumeName, pidFile)) as f: - if psutil.pid_exists(int(f.read().strip())): - status = utils.PluginStatusCode.OK - #Now check the status of the underlying physical disk - brickDevice = storage.getBrickDeviceName( - brick['name'].split(":")[1]) - disk = storage.getDisksForBrick( - brickDevice) - cmd = [checkIdeSmartCmdPath.cmd, "-d", disk, "-n"] - rc, out, err = utils.execCmd(cmd) - if rc == utils.PluginStatusCode.CRITICAL and \ - "tests failed" in out[0]: - status = utils.PluginStatusCode.WARNING - msg = "WARNING: Brick %s: %s" % ( - brick['name'], out[0]) - else: - status = utils.PluginStatusCode.CRITICAL - except IOError, e: - if e.errno == errno.ENOENT: - status = utils.PluginStatusCode.CRITICAL - else: - status = utils.PluginStatusCode.UNKNOWN - msg = "UNKNOWN: Brick %s: %s" % (brickPath, str(e)) - finally: - if status == utils.PluginStatusCode.OK: - msg = "OK: Brick %s" % brickPath - elif status != utils.PluginStatusCode.UNKNOWN: - msg = "CRITICAL: Brick %s is down" % brickPath - bricks[brickService] = [status, msg] + bricks[brickService] = [status, msg] return bricks -def getNfsStatus(volInfo): - # if nfs is already running we need not to check further - status, msg, error = utils.execCmd(_checkNfsCmd) - if status == utils.PluginStatusCode.OK: - return status, msg[0] if len(msg) > 0 else "" - - # if nfs is not running 
and any of the volume uses nfs - # then its required to alert the user - for volume, volumeInfo in volInfo.iteritems(): - if volumeInfo['volumeStatus'] == glustercli.VolumeStatus.OFFLINE: - continue - nfsStatus = volumeInfo.get('options', {}).get('nfs.disable', 'off') - if nfsStatus == 'off': - msg = "CRITICAL: Process glusterfs-nfs is not running" - status = utils.PluginStatusCode.CRITICAL - break - else: - msg = "OK: No gluster volume uses nfs" - status = utils.PluginStatusCode.OK - return status, msg - - -def getCtdbStatus(smbStatus, nfsStatus): - if smbStatus != utils.PluginStatusCode.OK and \ - nfsStatus != utils.PluginStatusCode.OK: - return (utils.PluginStatusCode.OK, - "CTDB ignored as SMB and NFS are not running") - - status, msg, error = utils.execCmd(_checkCtdbCmd) - if status != utils.PluginStatusCode.OK: - return utils.PluginStatusCode.UNKNOWN, "CTDB not configured" - - # CTDB, SMB/NFS are running - status, msg, error = utils.execCmd(['ctdb', 'nodestatus']) - if status == utils.PluginStatusCode.OK: - if len(msg) > -1: - message = msg[0].split() - if len(message) > 1: - msg = "Node status: %s" % message[2] - if message[2] == 'UNHEALTHY': - status = utils.PluginStatusCode.WARNING - elif message[2] in ['DISCONNECTED', 'BANNED', 'INACTIVE']: - status = utils.PluginStatusCode.CRITICAL - else: - status = utils.PluginStatusCode.UNKNOWN - return status, msg - - -def getSmbStatus(volInfo): - status, msg, error = utils.execCmd(_checkSmbCmd) - if status == utils.PluginStatusCode.OK: - return status, msg[0] if len(msg) > 0 else "" - - # if smb is not running and any of the volume uses smb - # then its required to alert the user - for k, v in volInfo.iteritems(): - cifsStatus = v.get('options', {}).get('user.cifs', 'enable') - smbStatus = v.get('options', {}).get('user.smb', 'enable') - if cifsStatus == 'enable' and smbStatus == 'enable': - msg = "CRITICAL: Process smb is not running" - status = utils.PluginStatusCode.CRITICAL - break - else: - msg = "OK: No 
gluster volume uses smb" - status = utils.PluginStatusCode.OK - return status, msg - - -def getQuotadStatus(volInfo): - # if quota is already running we need not to check further - status, msg, error = utils.execCmd(_checkQuotaCmd) - if status == utils.PluginStatusCode.OK: - return status, msg[0] if len(msg) > 0 else "" - - # if quota is not running and any of the volume uses quota - # then the quotad process should be running in the host - for k, v in volInfo.iteritems(): - quotadStatus = v.get('options', {}).get('features.quota', '') - if quotadStatus == 'on': - msg = "CRITICAL: Process quotad is not running" - utils.PluginStatusCode.CRITICAL - break - else: - msg = "OK: Quota not enabled" - status = utils.PluginStatusCode.OK - return status, msg - - -def getShdStatus(volInfo): - status, msg, error = utils.execCmd(_checkShdCmd) - if status == utils.PluginStatusCode.OK: - return status, msg[0] if len(msg) > 0 else "" - - hostUuid = glustercli.hostUUIDGet() - for volumeName, volumeInfo in volInfo.iteritems(): - if volumeInfo['volumeStatus'] == glustercli.VolumeStatus.OFFLINE: - continue - if hasBricks(hostUuid, volumeInfo['bricksInfo']) and \ - int(volumeInfo['replicaCount']) > 1: - status = utils.PluginStatusCode.CRITICAL - msg = "CRITICAL: Gluster Self Heal Daemon not running" - break - else: - msg = "OK: Process Gluster Self Heal Daemon" - status = utils.PluginStatusCode.OK - return status, msg - - -def hasBricks(hostUuid, bricks): - for brick in bricks: - if brick['hostUuid'] == hostUuid: - return True - return False - - class App(): def __init__(self): self.stdin_path = '/dev/null' @@ -244,11 +83,10 @@ class App(): logger.warn("Hostname is not configured") time.sleep(sleepTime) continue - status, msg, error = utils.execCmd(_checkGlusterdCmd) + status, msg = check_proc_util.getGlusterdStatus() if status != glusterdStatus or \ status == utils.PluginStatusCode.CRITICAL: glusterdStatus = status - msg = msg[0] if len(msg) > 0 else "" 
nscautils.send_to_nsca(hostName, _glusterdService, status, msg) # Get the volume status only if glusterfs is running to avoid @@ -265,31 +103,31 @@ class App(): time.sleep(sleepTime) continue - status, msg = getNfsStatus(volInfo) + status, msg = check_proc_util.getNfsStatus(volInfo) if status != nfsStatus or \ status == utils.PluginStatusCode.CRITICAL: nfsStatus = status nscautils.send_to_nsca(hostName, _nfsService, status, msg) - status, msg = getSmbStatus(volInfo) + status, msg = check_proc_util.getSmbStatus(volInfo) if status != smbStatus or \ status == utils.PluginStatusCode.CRITICAL: smbStatus = status nscautils.send_to_nsca(hostName, _smbService, status, msg) - status, msg = getCtdbStatus(smbStatus, nfsStatus) + status, msg = check_proc_util.getCtdbStatus(smbStatus, nfsStatus) if status != ctdbStatus or \ status == utils.PluginStatusCode.CRITICAL: ctdbStatus = status nscautils.send_to_nsca(hostName, _ctdbdService, status, msg) - status, msg = getShdStatus(volInfo) + status, msg = check_proc_util.getShdStatus(volInfo) if status != shdStatus or \ status == utils.PluginStatusCode.CRITICAL: shdStatus = status nscautils.send_to_nsca(hostName, _shdService, status, msg) - status, msg = getQuotadStatus(volInfo) + status, msg = check_proc_util.getQuotadStatus(volInfo) if status != quotaStatus or \ status == utils.PluginStatusCode.CRITICAL: quotaStatus = status diff --git a/plugins/check_proc_util.py b/plugins/check_proc_util.py new file mode 100755 index 0000000..20f57eb --- /dev/null +++ b/plugins/check_proc_util.py @@ -0,0 +1,196 @@ +#!/usr/bin/python +# Copyright (C) 2014 Red Hat Inc +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. 
+# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +# + +import errno +import psutil + + +from glusternagios import utils +from glusternagios import glustercli +from glusternagios import storage + + +_checkProc = utils.CommandPath('check_proc', + '/usr/lib64/nagios/plugins/check_procs') + +_glusterVolPath = "/var/lib/glusterd/vols" +_checkNfsCmd = [_checkProc.cmd, "-c", "1:", "-C", "glusterfs", "-a", "nfs"] +_checkShdCmd = [_checkProc.cmd, "-c", "1:", "-C", "glusterfs", "-a", + "glustershd"] +_checkSmbCmd = [_checkProc.cmd, "-c", "1:", "-C", "smbd"] +_checkQuotaCmd = [_checkProc.cmd, "-c", "1:", "-C", "glusterfs", "-a", + "quotad"] +_checkBrickCmd = [_checkProc.cmd, "-C", "glusterfsd"] +_checkGlusterdCmd = [_checkProc.cmd, "-c", "1:", "-w", "1:1", "-C", "glusterd"] +_checkCtdbCmd = [_checkProc.cmd, "-c", "1:", "-C", "ctdbd"] +checkIdeSmartCmdPath = utils.CommandPath( + 'check_ide_smart', '/usr/lib64/nagios/plugins/check_ide_smart') + + +def getBrickStatus(volumeName, brickName): + status = None + brickPath = brickName.split(':')[1] + pidFile = brickName.replace(":/", "-").replace("/", "-") + ".pid" + try: + with open("%s/%s/run/%s" % ( + _glusterVolPath, volumeName, pidFile)) as f: + if psutil.pid_exists(int(f.read().strip())): + status = utils.PluginStatusCode.OK + brickDevice = storage.getBrickDeviceName(brickPath) + disk = storage.getDisksForBrick(brickDevice) + cmd = [checkIdeSmartCmdPath.cmd, "-d", disk, "-n"] + rc, out, err = utils.execCmd(cmd) + if rc == utils.PluginStatusCode.CRITICAL and \ + "tests failed" in out[0]: + status = 
utils.PluginStatusCode.WARNING + msg = "WARNING: Brick %s: %s" % (brickPath, out[0]) + else: + status = utils.PluginStatusCode.CRITICAL + except IOError as e: + if e.errno == errno.ENOENT: + status = utils.PluginStatusCode.CRITICAL + else: + status = utils.PluginStatusCode.UNKNOWN + msg = "UNKNOWN: Brick %s: %s" % (brickPath, str(e)) + finally: + if status == utils.PluginStatusCode.OK: + msg = "OK: Brick %s" % brickPath + elif status == utils.PluginStatusCode.CRITICAL: + msg = "CRITICAL: Brick %s is down" % brickPath + return status, msg + + +def getNfsStatus(volInfo): + # if nfs is already running we need not to check further + status, msg, error = utils.execCmd(_checkNfsCmd) + if status == utils.PluginStatusCode.OK: + return status, msg[0] if len(msg) > 0 else "" + + # if nfs is not running and any of the volume uses nfs + # then its required to alert the user + for volume, volumeInfo in volInfo.iteritems(): + if volumeInfo['volumeStatus'] == glustercli.VolumeStatus.OFFLINE: + continue + nfsStatus = volumeInfo.get('options', {}).get('nfs.disable', 'off') + if nfsStatus == 'off': + msg = "CRITICAL: Process glusterfs-nfs is not running" + status = utils.PluginStatusCode.CRITICAL + break + else: + msg = "OK: No gluster volume uses nfs" + status = utils.PluginStatusCode.OK + return status, msg + + +def getCtdbStatus(smbStatus, nfsStatus): + if smbStatus != utils.PluginStatusCode.OK and \ + nfsStatus != utils.PluginStatusCode.OK: + return (utils.PluginStatusCode.OK, + "CTDB ignored as SMB and NFS are not running") + + status, msg, error = utils.execCmd(_checkCtdbCmd) + if status != utils.PluginStatusCode.OK: + return utils.PluginStatusCode.UNKNOWN, "CTDB not configured" + + # CTDB, SMB/NFS are running + status, msg, error = utils.execCmd(['ctdb', 'nodestatus']) + if status == utils.PluginStatusCode.OK: + if len(msg) > -1: + message = msg[0].split() + if len(message) > 1: + msg = "Node status: %s" % message[2] + if message[2] == 'UNHEALTHY': + status = 
utils.PluginStatusCode.WARNING + elif message[2] in ['DISCONNECTED', 'BANNED', 'INACTIVE']: + status = utils.PluginStatusCode.CRITICAL + else: + status = utils.PluginStatusCode.UNKNOWN + return status, msg + + +def getSmbStatus(volInfo): + status, msg, error = utils.execCmd(_checkSmbCmd) + if status == utils.PluginStatusCode.OK: + return status, msg[0] if len(msg) > 0 else "" + + # if smb is not running and any of the volume uses smb + # then its required to alert the user + for k, v in volInfo.iteritems(): + cifsStatus = v.get('options', {}).get('user.cifs', 'enable') + smbStatus = v.get('options', {}).get('user.smb', 'enable') + if cifsStatus == 'enable' and smbStatus == 'enable': + msg = "CRITICAL: Process smb is not running" + status = utils.PluginStatusCode.CRITICAL + break + else: + msg = "OK: No gluster volume uses smb" + status = utils.PluginStatusCode.OK + return status, msg + + +def getQuotadStatus(volInfo): + # if quota is already running we need not to check further + status, msg, error = utils.execCmd(_checkQuotaCmd) + if status == utils.PluginStatusCode.OK: + return status, msg[0] if len(msg) > 0 else "" + + # if quota is not running and any of the volume uses quota + # then the quotad process should be running in the host + for k, v in volInfo.iteritems(): + quotadStatus = v.get('options', {}).get('features.quota', '') + if quotadStatus == 'on': + msg = "CRITICAL: Process quotad is not running" + utils.PluginStatusCode.CRITICAL + break + else: + msg = "OK: Quota not enabled" + status = utils.PluginStatusCode.OK + return status, msg + + +def getShdStatus(volInfo): + status, msg, error = utils.execCmd(_checkShdCmd) + if status == utils.PluginStatusCode.OK: + return status, msg[0] if len(msg) > 0 else "" + + hostUuid = glustercli.hostUUIDGet() + for volumeName, volumeInfo in volInfo.iteritems(): + if volumeInfo['volumeStatus'] == glustercli.VolumeStatus.OFFLINE: + continue + if hasBricks(hostUuid, volumeInfo['bricksInfo']) and \ + 
int(volumeInfo['replicaCount']) > 1: + status = utils.PluginStatusCode.CRITICAL + msg = "CRITICAL: Gluster Self Heal Daemon not running" + break + else: + msg = "OK: Process Gluster Self Heal Daemon" + status = utils.PluginStatusCode.OK + return status, msg + + +def getGlusterdStatus(): + status, msg, error = utils.execCmd(_checkGlusterdCmd) + msg = msg[0] if len(msg) > 0 else "" + return status, msg + + +def hasBricks(hostUuid, bricks): + for brick in bricks: + if brick['hostUuid'] == hostUuid: + return True + return False |