diff options
author | Timothy Asir <tjeyasin@redhat.com> | 2014-04-29 18:28:14 +0530 |
---|---|---|
committer | Sahina Bose <sabose@redhat.com> | 2014-05-02 06:16:03 -0700 |
commit | d646c986a3ba54570c9a0d367d106deeb0a80e38 (patch) | |
tree | b721ee3b3d1b9f53e10ac417b77f5040a6e2a9bb | |
parent | ccec0742af257e13effafa30a1184541c3cf5b65 (diff) |
Run the gluster process status check as a daemon
Enhanced to send a gluster-related process status only when a
change is detected in a service's status, or for any critical
status, to avoid flooding the Nagios server with log messages.
Change-Id: I26e389ae2d1ccba1b5ccadc45d202d3b5219c74a
Signed-off-by: Timothy Asir <tjeyasin@redhat.com>
Reviewed-on: http://review.gluster.org/7602
Reviewed-by: Sahina Bose <sabose@redhat.com>
Tested-by: Timothy Asir <tim.gluster@gmail.com>
-rw-r--r-- | config/nagios_server.conf | 5 | ||||
-rw-r--r-- | gluster-nagios-addons.spec.in | 10 | ||||
-rw-r--r-- | plugins/Makefile.am | 7 | ||||
-rwxr-xr-x | plugins/check_proc_status.py | 158 | ||||
-rwxr-xr-x | plugins/glusterpmd | 63 | ||||
-rw-r--r-- | plugins/nscautils.py.in | 6 |
6 files changed, 203 insertions, 46 deletions
diff --git a/config/nagios_server.conf b/config/nagios_server.conf index 381e588..4f6b9e3 100644 --- a/config/nagios_server.conf +++ b/config/nagios_server.conf @@ -17,3 +17,8 @@ cluster_name= [HOST-NAME] hostname_in_nagios= + +# LOCAL HOST CONFIGURATION +# Process monitoring sleeping intevel +[HOST-CONF] +proc-mon-sleep-time=60 diff --git a/gluster-nagios-addons.spec.in b/gluster-nagios-addons.spec.in index fb31946..6e305bf 100644 --- a/gluster-nagios-addons.spec.in +++ b/gluster-nagios-addons.spec.in @@ -82,6 +82,8 @@ Requires: python-netaddr Requires: python-pthreading Requires: python-cpopen >= 1.3 Requires: python-psutil +Requires: python-lockfile +Requires: python-daemon Requires: sysstat %description @@ -150,11 +152,14 @@ command[discoverhostparams]=sudo %{_libdir}/nagios/plugins/gluster/discoverhostp command[configure_gluster_node]=sudo %{_libdir}/nagios/plugins/gluster/configure_gluster_node.py -c \$ARG1\$ -n \$ARG2\$ -H \$ARG3\$ EOF %_init_enable nrpe +%_init_enable glusterpmd %_init_restart crond %_init_restart rsyslog - +%_init_restart glusterpmd %preun +%_init_disable glusterpmd +%_init_stop glusterpmd sed -i '/gluster nrpe plugins/d' %{_sysconfdir}/nagios/nrpe.cfg sed -i '/check_disk_and_inode/d' %{_sysconfdir}/nagios/nrpe.cfg sed -i '/check_memory/d' %{_sysconfdir}/nagios/nrpe.cfg @@ -167,10 +172,11 @@ sed -i '/check_vol_quota_status/d' %{_sysconfdir}/nagios/nrpe.cfg %files %defattr(-,root,root,-) %attr(0755, -, -) %{_libdir}/nagios/plugins/gluster/* +%attr(0755, -, -) %{_sysconfdir}/init.d/glusterpmd %{_sysconfdir}/cron.d/gluster-sysstat.crontab %{_sysconfdir}/rsyslog.d/glusternagios.conf %{_sysconfdir}/nagios/nagios_server.conf -%{_sysconfdir}/cron.d/gluster-proc.crontab +%{_sysconfdir}/init.d/glusterpmd %files tests %defattr(-,root,root,-) diff --git a/plugins/Makefile.am b/plugins/Makefile.am index de6bf41..c809b99 100644 --- a/plugins/Makefile.am +++ b/plugins/Makefile.am @@ -2,10 +2,14 @@ SUBDIRS = \ volcap \ $(NULL) +initdir = 
$(sysconfdir)/init.d +init_DATA = \ + glusterpmd \ + $(NULL) + cronddir = $(sysconfdir)/cron.d crond_DATA = \ gluster-sysstat.crontab \ - gluster-proc.crontab \ $(NULL) dist_glusternagiosplugins_PYTHON = \ @@ -29,5 +33,6 @@ dist_glusternagiosplugins_PYTHON = \ $(NULL) EXTRA_DIST = \ + $(init_DATA) \ $(crond_DATA) \ $(NULL) diff --git a/plugins/check_proc_status.py b/plugins/check_proc_status.py index 2ac1bc3..95a9b96 100755 --- a/plugins/check_proc_status.py +++ b/plugins/check_proc_status.py @@ -19,7 +19,11 @@ import sys import errno import socket +import lockfile +import logging import psutil +import time +from daemon import runner import nscautils import glusternagios @@ -47,7 +51,8 @@ _glusterdService = "Gluster Management Daemon" _quotadService = "Gluster Quota Daemon" -def sendBrickStatus(hostName, volInfo): +def getBrickStatus(hostName, volInfo): + bricks = {} hostUuid = glustercli.hostUUIDGet() status = None for volumeName, volumeInfo in volInfo.iteritems(): @@ -78,15 +83,15 @@ def sendBrickStatus(hostName, volInfo): msg = "OK: Brick %s" % brickPath elif status != utils.PluginStatusCode.UNKNOWN: msg = "CRITICAL: Brick %s is down" % brickPath - nscautils.send_to_nsca(hostName, brickService, status, msg) + bricks[brickService] = [status, msg] + return bricks -def sendNfsStatus(hostName, volInfo): +def getNfsStatus(hostName, volInfo): # if nfs is already running we need not to check further status, msg, error = utils.execCmd(_checkNfsCmd) if status == utils.PluginStatusCode.OK: - nscautils.send_to_nsca(hostName, _nfsService, status, msg) - return + return status, msg[0] if len(msg) > 0 else "" # if nfs is not running and any of the volume uses nfs # then its required to alert the user @@ -101,36 +106,34 @@ def sendNfsStatus(hostName, volInfo): else: msg = "OK: No gluster volume uses nfs" status = utils.PluginStatusCode.OK - nscautils.send_to_nsca(hostName, _nfsService, status, msg) + return status, msg -def sendSmbStatus(hostName, volInfo): +def 
getSmbStatus(hostName, volInfo): status, msg, error = utils.execCmd(_checkSmbCmd) if status == utils.PluginStatusCode.OK: - nscautils.send_to_nsca(hostName, _smbService, status, msg) - return + return status, msg[0] if len(msg) > 0 else "" # if smb is not running and any of the volume uses smb # then its required to alert the use for k, v in volInfo.iteritems(): - cifsStatus = v.get('options', {}).get('user.cifs', '') - smbStatus = v.get('options', {}).get('user.smb', '') - if cifsStatus == 'disable' or smbStatus == 'disable': + cifsStatus = v.get('options', {}).get('user.cifs', 'enable') + smbStatus = v.get('options', {}).get('user.smb', 'enable') + if cifsStatus == 'enable' and smbStatus == 'enable': msg = "CRITICAL: Process smb is not running" status = utils.PluginStatusCode.CRITICAL break else: msg = "OK: No gluster volume uses smb" status = utils.PluginStatusCode.OK - nscautils.send_to_nsca(hostName, _smbService, status, msg) + return status, msg -def sendQuotadStatus(hostName, volInfo): +def getQuotadStatus(hostName, volInfo): # if quota is already running we need not to check further status, msg, error = utils.execCmd(_checkQuotaCmd) if status == utils.PluginStatusCode.OK: - nscautils.send_to_nsca(hostName, _quotadService, status, msg) - return + return status, msg[0] if len(msg) > 0 else "" # if quota is not running and any of the volume uses quota # then the quotad process should be running in the host @@ -143,14 +146,13 @@ def sendQuotadStatus(hostName, volInfo): else: msg = "OK: Quota not enabled" status = utils.PluginStatusCode.OK - nscautils.send_to_nsca(hostName, _quotadService, status, msg) + return status, msg -def sendShdStatus(hostName, volInfo): +def getShdStatus(hostName, volInfo): status, msg, error = utils.execCmd(_checkShdCmd) if status == utils.PluginStatusCode.OK: - nscautils.send_to_nsca(hostName, _shdService, status, msg) - return + return status, msg[0] if len(msg) > 0 else "" hostUuid = glustercli.hostUUIDGet() for volumeName, 
volumeInfo in volInfo.iteritems(): @@ -164,7 +166,7 @@ def sendShdStatus(hostName, volInfo): else: msg = "OK: Process Gluster Self Heal Daemon" status = utils.PluginStatusCode.OK - nscautils.send_to_nsca(hostName, _shdService, status, msg) + return status, msg def hasBricks(hostUuid, bricks): @@ -174,31 +176,101 @@ def hasBricks(hostUuid, bricks): return False -if __name__ == '__main__': - hostName = nscautils.getCurrentHostNameInNagiosServer() - if not hostName: - hostName = socket.getfqdn() - if hostName == "localhost.localdomain" or hostName == "localhost": - sys.stderr.write("failed to find localhost fqdn") +class App(): + def __init__(self): + self.stdin_path = '/dev/null' + self.stdout_path = '/dev/tty' + self.stderr_path = '/dev/null' + self.pidfile_path = '/var/run/glusterpmd.pid' + self.pidfile_timeout = 5 - ### service check ### - status, msg, error = utils.execCmd(_checkGlusterdCmd) - nscautils.send_to_nsca(hostName, _glusterdService, status, msg) + def run(self): + hostName = nscautils.getCurrentHostNameInNagiosServer() + sleepTime = int(nscautils.getProcessMonitorSleepTime()) + glusterdStatus = None + nfsStatus = None + smbStatus = None + shdStatus = None + quotaStatus = None + brickStatus = {} + while True: + if not hostName: + hostName = nscautils.getCurrentHostNameInNagiosServer() + if not hostName: + logger.warn("Hostname is not configured") + time.sleep(sleepTime) + continue + status, msg, error = utils.execCmd(_checkGlusterdCmd) + if status != glusterdStatus or \ + status == utils.PluginStatusCode.CRITICAL: + glusterdStatus = status + msg = msg[0] if len(msg) > 0 else "" + nscautils.send_to_nsca(hostName, _glusterdService, status, msg) - # Get the volume status only if glusterfs is running to avoid - # unusual delay - if status != utils.PluginStatusCode.OK: - sys.exit(status) + # Get the volume status only if glusterfs is running to avoid + # unusual delay + if status != utils.PluginStatusCode.OK: + logger.warn("Glusterd is not running") + 
time.sleep(sleepTime) + continue - try: - volInfo = glustercli.volumeInfo() - except glusternagios.glustercli.GlusterCmdFailedException as e: - sys.exit(utils.PluginStatusCode.UNKNOWN) + try: + volInfo = glustercli.volumeInfo() + except glusternagios.glustercli.GlusterCmdFailedException: + logger.error("failed to find volume info") + time.sleep(sleepTime) + continue + + status, msg = getNfsStatus(hostName, volInfo) + if status != nfsStatus or \ + status == utils.PluginStatusCode.CRITICAL: + nfsStatus = status + nscautils.send_to_nsca(hostName, _nfsService, status, msg) - sendNfsStatus(hostName, volInfo) - sendSmbStatus(hostName, volInfo) - sendShdStatus(hostName, volInfo) - sendQuotadStatus(hostName, volInfo) - sendBrickStatus(hostName, volInfo) + status, msg = getSmbStatus(hostName, volInfo) + if status != smbStatus or \ + status == utils.PluginStatusCode.CRITICAL: + smbStatus = status + nscautils.send_to_nsca(hostName, _smbService, status, msg) + status, msg = getShdStatus(hostName, volInfo) + if status != shdStatus or \ + status == utils.PluginStatusCode.CRITICAL: + shdStatus = status + nscautils.send_to_nsca(hostName, _shdService, status, msg) + + status, msg = getQuotadStatus(hostName, volInfo) + if status != quotaStatus or \ + status == utils.PluginStatusCode.CRITICAL: + quotaStatus = status + nscautils.send_to_nsca(hostName, _quotadService, status, msg) + + brick = getBrickStatus(hostName, volInfo) + # brickInfo contains status, and message + for brickService, brickInfo in brick.iteritems(): + if brickInfo[0] != brickStatus.get(brickService, [None])[0] \ + or brickInfo[0] == utils.PluginStatusCode.CRITICAL: + brickStatus[brickService] = brickInfo + nscautils.send_to_nsca(hostName, brickService, + brickInfo[0], brickInfo[1]) + time.sleep(sleepTime) + +if __name__ == '__main__': + app = App() + logger = logging.getLogger("GlusterProcLog") + logger.setLevel(logging.INFO) + formatter = logging.Formatter( + "%(asctime)s - %(name)s - %(levelname)s - %(message)s") 
+ handler = logging.FileHandler("/var/log/glusterpmd.log") + handler.setFormatter(formatter) + logger.addHandler(handler) + + daemonRunner = runner.DaemonRunner(app) + daemonRunner.daemon_context.files_preserve = [handler.stream] + try: + daemonRunner.do_action() + except lockfile.LockTimeout: + logger.error("failed to aquire lock") + except runner.DaemonRunnerStopFailureError: + logger.error("failed to get the lock file") sys.exit(utils.PluginStatusCode.OK) diff --git a/plugins/glusterpmd b/plugins/glusterpmd new file mode 100755 index 0000000..383057e --- /dev/null +++ b/plugins/glusterpmd @@ -0,0 +1,63 @@ +#! /bin/sh +# glusterpmd Start/Stop the gluster process monitoring daemon. +# +# chkconfig: 2345 90 60 +# description: Monitor gluster related processes and send +# details to nagios server whenever any changes +# observed in those services. +# +# Copyright (C) 2014 Red Hat Inc +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +# + +BASE=glusterpmd + +# Fedora File System Layout dictates /run +[ -e /run ] && RUNDIR="/run" +PIDFILE="${RUNDIR:-/var/run}/${BASE}.pid" + +PID=`test -f $PIDFILE && cat $PIDFILE` + +case "$1" in + start) + if [ -f $PIDFILE ]; then + echo "glusterpmd service is already running with pid $PID" + else + echo "Starting gluster process monitoring service" + python /usr/lib64/nagios/plugins/gluster/check_proc_status.py start + fi + ;; + stop) + if [ -f $PIDFILE ]; then + echo "Stopping gluster process monitoring service" + python /usr/lib64/nagios/plugins/gluster/check_proc_status.py stop + fi + ;; + restart) + if [ -f $PIDFILE ]; then + echo "Restarting gluster process monitoring service" + python /usr/lib64/nagios/plugins/gluster/check_proc_status.py restart + else + python /usr/lib64/nagios/plugins/gluster/check_proc_status.py start + fi + ;; + *) + echo "Usage: /etc/init.d/glusterpmd {start|stop|restart}" + exit 1 + ;; +esac + +exit 0 diff --git a/plugins/nscautils.py.in b/plugins/nscautils.py.in index 56db577..51df927 100644 --- a/plugins/nscautils.py.in +++ b/plugins/nscautils.py.in @@ -43,6 +43,12 @@ def getNagiosClusterName(): return config.get('NAGIOS-DEFINTIONS', 'cluster_name') +def getProcessMonitorSleepTime(): + config = ConfigParser.ConfigParser() + config.read(__NAGIOSSERVER_CONF) + return config.get('HOST-CONF', 'proc-mon-sleep-time') + + def send_to_nsca(hostName, serviceName, exitStatus, resultString): cmddata = '%s\t%s\t%s\t%s\n' % (hostName, serviceName, |