-rw-r--r--  config/nagios_server.conf      |   5
-rw-r--r--  gluster-nagios-addons.spec.in  |  10
-rw-r--r--  plugins/Makefile.am            |   7
-rwxr-xr-x  plugins/check_proc_status.py   | 158
-rwxr-xr-x  plugins/glusterpmd             |  63
-rw-r--r--  plugins/nscautils.py.in        |   6
6 files changed, 203 insertions(+), 46 deletions(-)
diff --git a/config/nagios_server.conf b/config/nagios_server.conf
index 381e588..4f6b9e3 100644
--- a/config/nagios_server.conf
+++ b/config/nagios_server.conf
@@ -17,3 +17,8 @@ cluster_name=
[HOST-NAME]
hostname_in_nagios=
+
+# LOCAL HOST CONFIGURATION
+# Process monitoring sleep interval, in seconds
+[HOST-CONF]
+proc-mon-sleep-time=60
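
The new [HOST-CONF] section is consumed further down in this patch by getProcessMonitorSleepTime() in plugins/nscautils.py.in. A minimal sketch of reading the key directly, assuming the packaged path /etc/nagios/nagios_server.conf (Python 2, matching the codebase):

    import ConfigParser

    config = ConfigParser.ConfigParser()
    config.read('/etc/nagios/nagios_server.conf')
    # getint() parses the value; the default shipped above is 60 seconds.
    sleep_time = config.getint('HOST-CONF', 'proc-mon-sleep-time')
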
diff --git a/gluster-nagios-addons.spec.in b/gluster-nagios-addons.spec.in
index fb31946..6e305bf 100644
--- a/gluster-nagios-addons.spec.in
+++ b/gluster-nagios-addons.spec.in
@@ -82,6 +82,8 @@ Requires: python-netaddr
Requires: python-pthreading
Requires: python-cpopen >= 1.3
Requires: python-psutil
+Requires: python-lockfile
+Requires: python-daemon
Requires: sysstat
%description
@@ -150,11 +152,14 @@ command[discoverhostparams]=sudo %{_libdir}/nagios/plugins/gluster/discoverhostp
command[configure_gluster_node]=sudo %{_libdir}/nagios/plugins/gluster/configure_gluster_node.py -c \$ARG1\$ -n \$ARG2\$ -H \$ARG3\$
EOF
%_init_enable nrpe
+%_init_enable glusterpmd
%_init_restart crond
%_init_restart rsyslog
-
+%_init_restart glusterpmd
%preun
+%_init_disable glusterpmd
+%_init_stop glusterpmd
sed -i '/gluster nrpe plugins/d' %{_sysconfdir}/nagios/nrpe.cfg
sed -i '/check_disk_and_inode/d' %{_sysconfdir}/nagios/nrpe.cfg
sed -i '/check_memory/d' %{_sysconfdir}/nagios/nrpe.cfg
@@ -167,10 +172,11 @@ sed -i '/check_vol_quota_status/d' %{_sysconfdir}/nagios/nrpe.cfg
%files
%defattr(-,root,root,-)
%attr(0755, -, -) %{_libdir}/nagios/plugins/gluster/*
+%attr(0755, -, -) %{_sysconfdir}/init.d/glusterpmd
%{_sysconfdir}/cron.d/gluster-sysstat.crontab
%{_sysconfdir}/rsyslog.d/glusternagios.conf
%{_sysconfdir}/nagios/nagios_server.conf
-%{_sysconfdir}/cron.d/gluster-proc.crontab
+%{_sysconfdir}/init.d/glusterpmd
%files tests
%defattr(-,root,root,-)
diff --git a/plugins/Makefile.am b/plugins/Makefile.am
index de6bf41..c809b99 100644
--- a/plugins/Makefile.am
+++ b/plugins/Makefile.am
@@ -2,10 +2,14 @@ SUBDIRS = \
volcap \
$(NULL)
+initdir = $(sysconfdir)/init.d
+init_DATA = \
+ glusterpmd \
+ $(NULL)
+
cronddir = $(sysconfdir)/cron.d
crond_DATA = \
gluster-sysstat.crontab \
- gluster-proc.crontab \
$(NULL)
dist_glusternagiosplugins_PYTHON = \
@@ -29,5 +33,6 @@ dist_glusternagiosplugins_PYTHON = \
$(NULL)
EXTRA_DIST = \
+ $(init_DATA) \
$(crond_DATA) \
$(NULL)
diff --git a/plugins/check_proc_status.py b/plugins/check_proc_status.py
index 2ac1bc3..95a9b96 100755
--- a/plugins/check_proc_status.py
+++ b/plugins/check_proc_status.py
@@ -19,7 +19,11 @@
import sys
import errno
import socket
+import lockfile
+import logging
import psutil
+import time
+from daemon import runner
import nscautils
import glusternagios
@@ -47,7 +51,8 @@ _glusterdService = "Gluster Management Daemon"
_quotadService = "Gluster Quota Daemon"
-def sendBrickStatus(hostName, volInfo):
+def getBrickStatus(hostName, volInfo):
+ bricks = {}
hostUuid = glustercli.hostUUIDGet()
status = None
for volumeName, volumeInfo in volInfo.iteritems():
@@ -78,15 +83,15 @@ def sendBrickStatus(hostName, volInfo):
msg = "OK: Brick %s" % brickPath
elif status != utils.PluginStatusCode.UNKNOWN:
msg = "CRITICAL: Brick %s is down" % brickPath
- nscautils.send_to_nsca(hostName, brickService, status, msg)
+ bricks[brickService] = [status, msg]
+ return bricks
-def sendNfsStatus(hostName, volInfo):
+def getNfsStatus(hostName, volInfo):
# if nfs is already running we need not to check further
status, msg, error = utils.execCmd(_checkNfsCmd)
if status == utils.PluginStatusCode.OK:
- nscautils.send_to_nsca(hostName, _nfsService, status, msg)
- return
+ return status, msg[0] if len(msg) > 0 else ""
# if nfs is not running and any of the volume uses nfs
# then its required to alert the user
@@ -101,36 +106,34 @@ def sendNfsStatus(hostName, volInfo):
else:
msg = "OK: No gluster volume uses nfs"
status = utils.PluginStatusCode.OK
- nscautils.send_to_nsca(hostName, _nfsService, status, msg)
+ return status, msg
-def sendSmbStatus(hostName, volInfo):
+def getSmbStatus(hostName, volInfo):
status, msg, error = utils.execCmd(_checkSmbCmd)
if status == utils.PluginStatusCode.OK:
- nscautils.send_to_nsca(hostName, _smbService, status, msg)
- return
+ return status, msg[0] if len(msg) > 0 else ""
# if smb is not running and any of the volume uses smb
# then its required to alert the user
for k, v in volInfo.iteritems():
- cifsStatus = v.get('options', {}).get('user.cifs', '')
- smbStatus = v.get('options', {}).get('user.smb', '')
- if cifsStatus == 'disable' or smbStatus == 'disable':
+ cifsStatus = v.get('options', {}).get('user.cifs', 'enable')
+ smbStatus = v.get('options', {}).get('user.smb', 'enable')
+ if cifsStatus == 'enable' and smbStatus == 'enable':
msg = "CRITICAL: Process smb is not running"
status = utils.PluginStatusCode.CRITICAL
break
else:
msg = "OK: No gluster volume uses smb"
status = utils.PluginStatusCode.OK
- nscautils.send_to_nsca(hostName, _smbService, status, msg)
+ return status, msg
-def sendQuotadStatus(hostName, volInfo):
+def getQuotadStatus(hostName, volInfo):
# if quota is already running we need not to check further
status, msg, error = utils.execCmd(_checkQuotaCmd)
if status == utils.PluginStatusCode.OK:
- nscautils.send_to_nsca(hostName, _quotadService, status, msg)
- return
+ return status, msg[0] if len(msg) > 0 else ""
# if quota is not running and any of the volume uses quota
# then the quotad process should be running in the host
@@ -143,14 +146,13 @@ def sendQuotadStatus(hostName, volInfo):
else:
msg = "OK: Quota not enabled"
status = utils.PluginStatusCode.OK
- nscautils.send_to_nsca(hostName, _quotadService, status, msg)
+ return status, msg
-def sendShdStatus(hostName, volInfo):
+def getShdStatus(hostName, volInfo):
status, msg, error = utils.execCmd(_checkShdCmd)
if status == utils.PluginStatusCode.OK:
- nscautils.send_to_nsca(hostName, _shdService, status, msg)
- return
+ return status, msg[0] if len(msg) > 0 else ""
hostUuid = glustercli.hostUUIDGet()
for volumeName, volumeInfo in volInfo.iteritems():
@@ -164,7 +166,7 @@ def sendShdStatus(hostName, volInfo):
else:
msg = "OK: Process Gluster Self Heal Daemon"
status = utils.PluginStatusCode.OK
- nscautils.send_to_nsca(hostName, _shdService, status, msg)
+ return status, msg
def hasBricks(hostUuid, bricks):
@@ -174,31 +176,101 @@ def hasBricks(hostUuid, bricks):
return False
-if __name__ == '__main__':
- hostName = nscautils.getCurrentHostNameInNagiosServer()
- if not hostName:
- hostName = socket.getfqdn()
- if hostName == "localhost.localdomain" or hostName == "localhost":
- sys.stderr.write("failed to find localhost fqdn")
+class App():
+ def __init__(self):
+ self.stdin_path = '/dev/null'
+ self.stdout_path = '/dev/tty'
+ self.stderr_path = '/dev/null'
+ self.pidfile_path = '/var/run/glusterpmd.pid'
+ self.pidfile_timeout = 5
- ### service check ###
- status, msg, error = utils.execCmd(_checkGlusterdCmd)
- nscautils.send_to_nsca(hostName, _glusterdService, status, msg)
+ def run(self):
+ hostName = nscautils.getCurrentHostNameInNagiosServer()
+ sleepTime = int(nscautils.getProcessMonitorSleepTime())
+ glusterdStatus = None
+ nfsStatus = None
+ smbStatus = None
+ shdStatus = None
+ quotaStatus = None
+ brickStatus = {}
+ while True:
+ if not hostName:
+ hostName = nscautils.getCurrentHostNameInNagiosServer()
+ if not hostName:
+ logger.warn("Hostname is not configured")
+ time.sleep(sleepTime)
+ continue
+ status, msg, error = utils.execCmd(_checkGlusterdCmd)
+ if status != glusterdStatus or \
+ status == utils.PluginStatusCode.CRITICAL:
+ glusterdStatus = status
+ msg = msg[0] if len(msg) > 0 else ""
+ nscautils.send_to_nsca(hostName, _glusterdService, status, msg)
- # Get the volume status only if glusterfs is running to avoid
- # unusual delay
- if status != utils.PluginStatusCode.OK:
- sys.exit(status)
+ # Get the volume status only if glusterfs is running to avoid
+ # unusual delay
+ if status != utils.PluginStatusCode.OK:
+ logger.warn("Glusterd is not running")
+ time.sleep(sleepTime)
+ continue
- try:
- volInfo = glustercli.volumeInfo()
- except glusternagios.glustercli.GlusterCmdFailedException as e:
- sys.exit(utils.PluginStatusCode.UNKNOWN)
+ try:
+ volInfo = glustercli.volumeInfo()
+ except glusternagios.glustercli.GlusterCmdFailedException:
+ logger.error("failed to find volume info")
+ time.sleep(sleepTime)
+ continue
+
+ status, msg = getNfsStatus(hostName, volInfo)
+ if status != nfsStatus or \
+ status == utils.PluginStatusCode.CRITICAL:
+ nfsStatus = status
+ nscautils.send_to_nsca(hostName, _nfsService, status, msg)
- sendNfsStatus(hostName, volInfo)
- sendSmbStatus(hostName, volInfo)
- sendShdStatus(hostName, volInfo)
- sendQuotadStatus(hostName, volInfo)
- sendBrickStatus(hostName, volInfo)
+ status, msg = getSmbStatus(hostName, volInfo)
+ if status != smbStatus or \
+ status == utils.PluginStatusCode.CRITICAL:
+ smbStatus = status
+ nscautils.send_to_nsca(hostName, _smbService, status, msg)
+ status, msg = getShdStatus(hostName, volInfo)
+ if status != shdStatus or \
+ status == utils.PluginStatusCode.CRITICAL:
+ shdStatus = status
+ nscautils.send_to_nsca(hostName, _shdService, status, msg)
+
+ status, msg = getQuotadStatus(hostName, volInfo)
+ if status != quotaStatus or \
+ status == utils.PluginStatusCode.CRITICAL:
+ quotaStatus = status
+ nscautils.send_to_nsca(hostName, _quotadService, status, msg)
+
+ brick = getBrickStatus(hostName, volInfo)
+ # brickInfo contains status and message
+ for brickService, brickInfo in brick.iteritems():
+ if brickInfo[0] != brickStatus.get(brickService, [None])[0] \
+ or brickInfo[0] == utils.PluginStatusCode.CRITICAL:
+ brickStatus[brickService] = brickInfo
+ nscautils.send_to_nsca(hostName, brickService,
+ brickInfo[0], brickInfo[1])
+ time.sleep(sleepTime)
+
+if __name__ == '__main__':
+ app = App()
+ logger = logging.getLogger("GlusterProcLog")
+ logger.setLevel(logging.INFO)
+ formatter = logging.Formatter(
+ "%(asctime)s - %(name)s - %(levelname)s - %(message)s")
+ handler = logging.FileHandler("/var/log/glusterpmd.log")
+ handler.setFormatter(formatter)
+ logger.addHandler(handler)
+
+ daemonRunner = runner.DaemonRunner(app)
+ daemonRunner.daemon_context.files_preserve = [handler.stream]
+ try:
+ daemonRunner.do_action()
+ except lockfile.LockTimeout:
+ logger.error("failed to aquire lock")
+ except runner.DaemonRunnerStopFailureError:
+ logger.error("failed to get the lock file")
sys.exit(utils.PluginStatusCode.OK)
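
The run() loop above reports a service to NSCA only when its status changes, but keeps re-sending while a service is CRITICAL so the alert stays fresh on the Nagios side. A standalone sketch of that rule, using plain Nagios exit codes in place of utils.PluginStatusCode (names here are illustrative, not part of the patch):

    OK, WARNING, CRITICAL, UNKNOWN = 0, 1, 2, 3

    def should_notify(new_status, last_status):
        # First pass (last_status is None) always notifies; afterwards,
        # notify on any transition, and on every pass while CRITICAL.
        return new_status != last_status or new_status == CRITICAL
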
diff --git a/plugins/glusterpmd b/plugins/glusterpmd
new file mode 100755
index 0000000..383057e
--- /dev/null
+++ b/plugins/glusterpmd
@@ -0,0 +1,63 @@
+#! /bin/sh
+# glusterpmd Start/Stop the gluster process monitoring daemon.
+#
+# chkconfig: 2345 90 60
+# description: Monitors gluster-related processes and sends
+# details to the Nagios server whenever a change is
+# observed in those services.
+#
+# Copyright (C) 2014 Red Hat Inc
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
+#
+
+BASE=glusterpmd
+
+# Fedora File System Layout dictates /run
+[ -e /run ] && RUNDIR="/run"
+PIDFILE="${RUNDIR:-/var/run}/${BASE}.pid"
+
+PID=`test -f $PIDFILE && cat $PIDFILE`
+
+case "$1" in
+ start)
+ if [ -f $PIDFILE ]; then
+ echo "glusterpmd service is already running with pid $PID"
+ else
+ echo "Starting gluster process monitoring service"
+ python /usr/lib64/nagios/plugins/gluster/check_proc_status.py start
+ fi
+ ;;
+ stop)
+ if [ -f $PIDFILE ]; then
+ echo "Stopping gluster process monitoring service"
+ python /usr/lib64/nagios/plugins/gluster/check_proc_status.py stop
+ fi
+ ;;
+ restart)
+ if [ -f $PIDFILE ]; then
+ echo "Restarting gluster process monitoring service"
+ python /usr/lib64/nagios/plugins/gluster/check_proc_status.py restart
+ else
+ python /usr/lib64/nagios/plugins/gluster/check_proc_status.py start
+ fi
+ ;;
+ *)
+ echo "Usage: /etc/init.d/glusterpmd {start|stop|restart}"
+ exit 1
+ ;;
+esac
+
+exit 0
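
The init script does no daemonizing of its own; it only forwards start/stop/restart to python-daemon's runner inside check_proc_status.py. A minimal skeleton of that mechanism, with an illustrative Example class standing in for the plugin's real App:

    import time
    from daemon import runner

    class Example(object):
        def __init__(self):
            self.stdin_path = '/dev/null'
            self.stdout_path = '/dev/tty'
            self.stderr_path = '/dev/null'
            self.pidfile_path = '/var/run/example.pid'
            self.pidfile_timeout = 5

        def run(self):
            while True:
                time.sleep(60)

    if __name__ == '__main__':
        # DaemonRunner reads sys.argv[1] ('start' | 'stop' | 'restart'),
        # exactly the word the init script passes through.
        runner.DaemonRunner(Example()).do_action()
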
diff --git a/plugins/nscautils.py.in b/plugins/nscautils.py.in
index 56db577..51df927 100644
--- a/plugins/nscautils.py.in
+++ b/plugins/nscautils.py.in
@@ -43,6 +43,12 @@ def getNagiosClusterName():
return config.get('NAGIOS-DEFINTIONS', 'cluster_name')
+def getProcessMonitorSleepTime():
+ config = ConfigParser.ConfigParser()
+ config.read(__NAGIOSSERVER_CONF)
+ return config.get('HOST-CONF', 'proc-mon-sleep-time')
+
+
def send_to_nsca(hostName, serviceName, exitStatus, resultString):
cmddata = '%s\t%s\t%s\t%s\n' % (hostName,
serviceName,
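
getProcessMonitorSleepTime() returns the raw ConfigParser string, and the daemon converts it with int(...) in its run() loop. A defensive variant (a hypothetical helper, not part of this patch) would fall back to the packaged default when the key is missing or malformed:

    def getProcessMonitorSleepTimeOrDefault(default=60):
        # ConfigParser.Error covers NoSectionError and NoOptionError.
        try:
            return int(getProcessMonitorSleepTime())
        except (ConfigParser.Error, ValueError):
            return default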