-rw-r--r--  config/nagios_server.conf      |   5
-rw-r--r--  gluster-nagios-addons.spec.in  |  10
-rw-r--r--  plugins/Makefile.am            |   7
-rwxr-xr-x  plugins/check_proc_status.py   | 158
-rwxr-xr-x  plugins/glusterpmd             |  63
-rw-r--r--  plugins/nscautils.py.in        |   6
6 files changed, 203 insertions(+), 46 deletions(-)
diff --git a/config/nagios_server.conf b/config/nagios_server.conf
index 381e588..4f6b9e3 100644
--- a/config/nagios_server.conf
+++ b/config/nagios_server.conf
@@ -17,3 +17,8 @@ cluster_name=
[HOST-NAME]
hostname_in_nagios=
+
+# LOCAL HOST CONFIGURATION
+# Process monitoring sleep interval, in seconds
+[HOST-CONF]
+proc-mon-sleep-time=60
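
The new [HOST-CONF] section is consumed further down in this patch by getProcessMonitorSleepTime() in plugins/nscautils.py.in. A minimal sketch of reading the key directly, assuming the packaged path /etc/nagios/nagios_server.conf (Python 2, matching the codebase):

    import ConfigParser

    config = ConfigParser.ConfigParser()
    config.read('/etc/nagios/nagios_server.conf')
    # getint() parses the value; the default shipped above is 60 seconds.
    sleep_time = config.getint('HOST-CONF', 'proc-mon-sleep-time')
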
diff --git a/gluster-nagios-addons.spec.in b/gluster-nagios-addons.spec.in
index fb31946..6e305bf 100644
--- a/gluster-nagios-addons.spec.in
+++ b/gluster-nagios-addons.spec.in
@@ -82,6 +82,8 @@ Requires: python-netaddr
Requires: python-pthreading
Requires: python-cpopen >= 1.3
Requires: python-psutil
+Requires: python-lockfile
+Requires: python-daemon
Requires: sysstat
%description
@@ -150,11 +152,14 @@ command[discoverhostparams]=sudo %{_libdir}/nagios/plugins/gluster/discoverhostp
command[configure_gluster_node]=sudo %{_libdir}/nagios/plugins/gluster/configure_gluster_node.py -c \$ARG1\$ -n \$ARG2\$ -H \$ARG3\$
EOF
%_init_enable nrpe
+%_init_enable glusterpmd
%_init_restart crond
%_init_restart rsyslog
-
+%_init_restart glusterpmd
%preun
+%_init_disable glusterpmd
+%_init_stop glusterpmd
sed -i '/gluster nrpe plugins/d' %{_sysconfdir}/nagios/nrpe.cfg
sed -i '/check_disk_and_inode/d' %{_sysconfdir}/nagios/nrpe.cfg
sed -i '/check_memory/d' %{_sysconfdir}/nagios/nrpe.cfg
@@ -167,10 +172,11 @@ sed -i '/check_vol_quota_status/d' %{_sysconfdir}/nagios/nrpe.cfg
%files
%defattr(-,root,root,-)
%attr(0755, -, -) %{_libdir}/nagios/plugins/gluster/*
+%attr(0755, -, -) %{_sysconfdir}/init.d/glusterpmd
%{_sysconfdir}/cron.d/gluster-sysstat.crontab
%{_sysconfdir}/rsyslog.d/glusternagios.conf
%{_sysconfdir}/nagios/nagios_server.conf
-%{_sysconfdir}/cron.d/gluster-proc.crontab
+%{_sysconfdir}/init.d/glusterpmd
%files tests
%defattr(-,root,root,-)
diff --git a/plugins/Makefile.am b/plugins/Makefile.am
index de6bf41..c809b99 100644
--- a/plugins/Makefile.am
+++ b/plugins/Makefile.am
@@ -2,10 +2,14 @@ SUBDIRS = \
volcap \
$(NULL)
+initdir = $(sysconfdir)/init.d
+init_DATA = \
+ glusterpmd \
+ $(NULL)
+
cronddir = $(sysconfdir)/cron.d
crond_DATA = \
gluster-sysstat.crontab \
- gluster-proc.crontab \
$(NULL)
dist_glusternagiosplugins_PYTHON = \
@@ -29,5 +33,6 @@ dist_glusternagiosplugins_PYTHON = \
$(NULL)
EXTRA_DIST = \
+ $(init_DATA) \
$(crond_DATA) \
$(NULL)
diff --git a/plugins/check_proc_status.py b/plugins/check_proc_status.py
index 2ac1bc3..95a9b96 100755
--- a/plugins/check_proc_status.py
+++ b/plugins/check_proc_status.py
@@ -19,7 +19,11 @@
import sys
import errno
import socket
+import lockfile
+import logging
import psutil
+import time
+from daemon import runner
import nscautils
import glusternagios
@@ -47,7 +51,8 @@ _glusterdService = "Gluster Management Daemon"
_quotadService = "Gluster Quota Daemon"
-def sendBrickStatus(hostName, volInfo):
+def getBrickStatus(hostName, volInfo):
+ bricks = {}
hostUuid = glustercli.hostUUIDGet()
status = None
for volumeName, volumeInfo in volInfo.iteritems():
@@ -78,15 +83,15 @@ def sendBrickStatus(hostName, volInfo):
msg = "OK: Brick %s" % brickPath
elif status != utils.PluginStatusCode.UNKNOWN:
msg = "CRITICAL: Brick %s is down" % brickPath
- nscautils.send_to_nsca(hostName, brickService, status, msg)
+ bricks[brickService] = [status, msg]
+ return bricks
-def sendNfsStatus(hostName, volInfo):
+def getNfsStatus(hostName, volInfo):
# if nfs is already running we need not to check further
status, msg, error = utils.execCmd(_checkNfsCmd)
if status == utils.PluginStatusCode.OK:
- nscautils.send_to_nsca(hostName, _nfsService, status, msg)
- return
+ return status, msg[0] if len(msg) > 0 else ""
# if nfs is not running and any of the volume uses nfs
# then its required to alert the user
@@ -101,36 +106,34 @@ def sendNfsStatus(hostName, volInfo):
else:
msg = "OK: No gluster volume uses nfs"
status = utils.PluginStatusCode.OK
- nscautils.send_to_nsca(hostName, _nfsService, status, msg)
+ return status, msg
-def sendSmbStatus(hostName, volInfo):
+def getSmbStatus(hostName, volInfo):
status, msg, error = utils.execCmd(_checkSmbCmd)
if status == utils.PluginStatusCode.OK:
- nscautils.send_to_nsca(hostName, _smbService, status, msg)
- return
+ return status, msg[0] if len(msg) > 0 else ""
# if smb is not running and any of the volume uses smb
# then its required to alert the user
for k, v in volInfo.iteritems():
- cifsStatus = v.get('options', {}).get('user.cifs', '')
- smbStatus = v.get('options', {}).get('user.smb', '')
- if cifsStatus == 'disable' or smbStatus == 'disable':
+ cifsStatus = v.get('options', {}).get('user.cifs', 'enable')
+ smbStatus = v.get('options', {}).get('user.smb', 'enable')
+ if cifsStatus == 'enable' and smbStatus == 'enable':
msg = "CRITICAL: Process smb is not running"
status = utils.PluginStatusCode.CRITICAL
break
else:
msg = "OK: No gluster volume uses smb"
status = utils.PluginStatusCode.OK
- nscautils.send_to_nsca(hostName, _smbService, status, msg)
+ return status, msg
-def sendQuotadStatus(hostName, volInfo):
+def getQuotadStatus(hostName, volInfo):
# if quota is already running we need not to check further
status, msg, error = utils.execCmd(_checkQuotaCmd)
if status == utils.PluginStatusCode.OK:
- nscautils.send_to_nsca(hostName, _quotadService, status, msg)
- return
+ return status, msg[0] if len(msg) > 0 else ""
# if quota is not running and any of the volume uses quota
# then the quotad process should be running in the host
@@ -143,14 +146,13 @@ def sendQuotadStatus(hostName, volInfo):
else:
msg = "OK: Quota not enabled"
status = utils.PluginStatusCode.OK
- nscautils.send_to_nsca(hostName, _quotadService, status, msg)
+ return status, msg
-def sendShdStatus(hostName, volInfo):
+def getShdStatus(hostName, volInfo):
status, msg, error = utils.execCmd(_checkShdCmd)
if status == utils.PluginStatusCode.OK:
- nscautils.send_to_nsca(hostName, _shdService, status, msg)
- return
+ return status, msg[0] if len(msg) > 0 else ""
hostUuid = glustercli.hostUUIDGet()
for volumeName, volumeInfo in volInfo.iteritems():
@@ -164,7 +166,7 @@ def sendShdStatus(hostName, volInfo):
else:
msg = "OK: Process Gluster Self Heal Daemon"
status = utils.PluginStatusCode.OK
- nscautils.send_to_nsca(hostName, _shdService, status, msg)
+ return status, msg
def hasBricks(hostUuid, bricks):
@@ -174,31 +176,101 @@ def hasBricks(hostUuid, bricks):
return False
-if __name__ == '__main__':
- hostName = nscautils.getCurrentHostNameInNagiosServer()
- if not hostName:
- hostName = socket.getfqdn()
- if hostName == "localhost.localdomain" or hostName == "localhost":
- sys.stderr.write("failed to find localhost fqdn")
+class App():
+ def __init__(self):
+ self.stdin_path = '/dev/null'
+ self.stdout_path = '/dev/tty'
+ self.stderr_path = '/dev/null'
+ self.pidfile_path = '/var/run/glusterpmd.pid'
+ self.pidfile_timeout = 5
- ### service check ###
- status, msg, error = utils.execCmd(_checkGlusterdCmd)
- nscautils.send_to_nsca(hostName, _glusterdService, status, msg)
+ def run(self):
+ hostName = nscautils.getCurrentHostNameInNagiosServer()
+ sleepTime = int(nscautils.getProcessMonitorSleepTime())
+ glusterdStatus = None
+ nfsStatus = None
+ smbStatus = None
+ shdStatus = None
+ quotaStatus = None
+ brickStatus = {}
+ while True:
+ if not hostName:
+ hostName = nscautils.getCurrentHostNameInNagiosServer()
+ if not hostName:
+ logger.warn("Hostname is not configured")
+ time.sleep(sleepTime)
+ continue
+ status, msg, error = utils.execCmd(_checkGlusterdCmd)
+ if status != glusterdStatus or \
+ status == utils.PluginStatusCode.CRITICAL:
+ glusterdStatus = status
+ msg = msg[0] if len(msg) > 0 else ""
+ nscautils.send_to_nsca(hostName, _glusterdService, status, msg)
- # Get the volume status only if glusterfs is running to avoid
- # unusual delay
- if status != utils.PluginStatusCode.OK:
- sys.exit(status)
+ # Get the volume status only if glusterfs is running to avoid
+ # unusual delay
+ if status != utils.PluginStatusCode.OK:
+ logger.warn("Glusterd is not running")
+ time.sleep(sleepTime)
+ continue
- try:
- volInfo = glustercli.volumeInfo()
- except glusternagios.glustercli.GlusterCmdFailedException as e:
- sys.exit(utils.PluginStatusCode.UNKNOWN)
+ try:
+ volInfo = glustercli.volumeInfo()
+ except glusternagios.glustercli.GlusterCmdFailedException:
+ logger.error("failed to find volume info")
+ time.sleep(sleepTime)
+ continue
+
+ status, msg = getNfsStatus(hostName, volInfo)
+ if status != nfsStatus or \
+ status == utils.PluginStatusCode.CRITICAL:
+ nfsStatus = status
+ nscautils.send_to_nsca(hostName, _nfsService, status, msg)
- sendNfsStatus(hostName, volInfo)
- sendSmbStatus(hostName, volInfo)
- sendShdStatus(hostName, volInfo)
- sendQuotadStatus(hostName, volInfo)
- sendBrickStatus(hostName, volInfo)
+ status, msg = getSmbStatus(hostName, volInfo)
+ if status != smbStatus or \
+ status == utils.PluginStatusCode.CRITICAL:
+ smbStatus = status
+ nscautils.send_to_nsca(hostName, _smbService, status, msg)
+ status, msg = getShdStatus(hostName, volInfo)
+ if status != shdStatus or \
+ status == utils.PluginStatusCode.CRITICAL:
+ shdStatus = status
+ nscautils.send_to_nsca(hostName, _shdService, status, msg)
+
+ status, msg = getQuotadStatus(hostName, volInfo)
+ if status != quotaStatus or \
+ status == utils.PluginStatusCode.CRITICAL:
+ quotaStatus = status
+ nscautils.send_to_nsca(hostName, _quotadService, status, msg)
+
+ brick = getBrickStatus(hostName, volInfo)
+ # brickInfo contains status and message
+ for brickService, brickInfo in brick.iteritems():
+ if brickInfo[0] != brickStatus.get(brickService, [None])[0] \
+ or brickInfo[0] == utils.PluginStatusCode.CRITICAL:
+ brickStatus[brickService] = brickInfo
+ nscautils.send_to_nsca(hostName, brickService,
+ brickInfo[0], brickInfo[1])
+ time.sleep(sleepTime)
+
+if __name__ == '__main__':
+ app = App()
+ logger = logging.getLogger("GlusterProcLog")
+ logger.setLevel(logging.INFO)
+ formatter = logging.Formatter(
+ "%(asctime)s - %(name)s - %(levelname)s - %(message)s")
+ handler = logging.FileHandler("/var/log/glusterpmd.log")
+ handler.setFormatter(formatter)
+ logger.addHandler(handler)
+
+ daemonRunner = runner.DaemonRunner(app)
+ daemonRunner.daemon_context.files_preserve = [handler.stream]
+ try:
+ daemonRunner.do_action()
+ except lockfile.LockTimeout:
+ logger.error("failed to aquire lock")
+ except runner.DaemonRunnerStopFailureError:
+ logger.error("failed to get the lock file")
sys.exit(utils.PluginStatusCode.OK)
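
The run() loop above reports a service to NSCA only when its status changes, but keeps re-sending while a service is CRITICAL so the alert stays fresh on the Nagios side. A standalone sketch of that rule, using plain Nagios exit codes in place of utils.PluginStatusCode (names here are illustrative, not part of the patch):

    OK, WARNING, CRITICAL, UNKNOWN = 0, 1, 2, 3

    def should_notify(new_status, last_status):
        # First pass (last_status is None) always notifies; afterwards,
        # notify on any transition, and on every pass while CRITICAL.
        return new_status != last_status or new_status == CRITICAL
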
diff --git a/plugins/glusterpmd b/plugins/glusterpmd
new file mode 100755
index 0000000..383057e
--- /dev/null
+++ b/plugins/glusterpmd
@@ -0,0 +1,63 @@
+#! /bin/sh
+# glusterpmd Start/Stop the gluster process monitoring daemon.
+#
+# chkconfig: 2345 90 60
+# description: Monitors gluster-related processes and sends
+# details to the Nagios server whenever a change is
+# observed in those services.
+#
+# Copyright (C) 2014 Red Hat Inc
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
+#
+
+BASE=glusterpmd
+
+# Fedora File System Layout dictates /run
+[ -e /run ] && RUNDIR="/run"
+PIDFILE="${RUNDIR:-/var/run}/${BASE}.pid"
+
+PID=`test -f $PIDFILE && cat $PIDFILE`
+
+case "$1" in
+ start)
+ if [ -f $PIDFILE ]; then
+ echo "glusterpmd service is already running with pid $PID"
+ else
+ echo "Starting gluster process monitoring service"
+ python /usr/lib64/nagios/plugins/gluster/check_proc_status.py start
+ fi
+ ;;
+ stop)
+ if [ -f $PIDFILE ]; then
+ echo "Stopping gluster process monitoring service"
+ python /usr/lib64/nagios/plugins/gluster/check_proc_status.py stop
+ fi
+ ;;
+ restart)
+ if [ -f $PIDFILE ]; then
+ echo "Restarting gluster process monitoring service"
+ python /usr/lib64/nagios/plugins/gluster/check_proc_status.py restart
+ else
+ python /usr/lib64/nagios/plugins/gluster/check_proc_status.py start
+ fi
+ ;;
+ *)
+ echo "Usage: /etc/init.d/glusterpmd {start|stop|restart}"
+ exit 1
+ ;;
+esac
+
+exit 0
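
The init script does no daemonizing of its own; it only forwards start/stop/restart to python-daemon's runner inside check_proc_status.py. A minimal skeleton of that mechanism, with an illustrative Example class standing in for the plugin's real App:

    import time
    from daemon import runner

    class Example(object):
        def __init__(self):
            self.stdin_path = '/dev/null'
            self.stdout_path = '/dev/tty'
            self.stderr_path = '/dev/null'
            self.pidfile_path = '/var/run/example.pid'
            self.pidfile_timeout = 5

        def run(self):
            while True:
                time.sleep(60)

    if __name__ == '__main__':
        # DaemonRunner reads sys.argv[1] ('start' | 'stop' | 'restart'),
        # exactly the word the init script passes through.
        runner.DaemonRunner(Example()).do_action()
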
diff --git a/plugins/nscautils.py.in b/plugins/nscautils.py.in
index 56db577..51df927 100644
--- a/plugins/nscautils.py.in
+++ b/plugins/nscautils.py.in
@@ -43,6 +43,12 @@ def getNagiosClusterName():
return config.get('NAGIOS-DEFINTIONS', 'cluster_name')
+def getProcessMonitorSleepTime():
+ config = ConfigParser.ConfigParser()
+ config.read(__NAGIOSSERVER_CONF)
+ return config.get('HOST-CONF', 'proc-mon-sleep-time')
+
+
def send_to_nsca(hostName, serviceName, exitStatus, resultString):
cmddata = '%s\t%s\t%s\t%s\n' % (hostName,
serviceName,
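
getProcessMonitorSleepTime() returns the raw ConfigParser string, and the daemon converts it with int(...) in its run() loop. A defensive variant (a hypothetical helper, not part of this patch) would fall back to the packaged default when the key is missing or malformed:

    def getProcessMonitorSleepTimeOrDefault(default=60):
        # ConfigParser.Error covers NoSectionError and NoOptionError.
        try:
            return int(getProcessMonitorSleepTime())
        except (ConfigParser.Error, ValueError):
            return default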