summary refs log tree commit diff stats
path: root/plugins
diff options
context:
space:
mode:
Diffstat (limited to 'plugins')
-rw-r--r-- plugins/Makefile.am 6
-rw-r--r-- plugins/__init__.py 0
-rwxr-xr-x plugins/check_remote_host.py 199
-rwxr-xr-x plugins/gluster_host_service_handler.py 154
4 files changed, 359 insertions, 0 deletions
diff --git a/plugins/Makefile.am b/plugins/Makefile.am
index c12520c..329af89 100644
--- a/plugins/Makefile.am
+++ b/plugins/Makefile.am
@@ -1,2 +1,8 @@
dist_glusternagiosplugins_PYTHON = \
+ check_remote_host.py \
+ gluster_host_service_handler.py \
+ $(NULL)
+
+EXTRA_DIST = \
+ __init__.py \
$(NULL)
diff --git a/plugins/__init__.py b/plugins/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/plugins/__init__.py
diff --git a/plugins/check_remote_host.py b/plugins/check_remote_host.py
new file mode 100755
index 0000000..7350e27
--- /dev/null
+++ b/plugins/check_remote_host.py
@@ -0,0 +1,199 @@
+#!/usr/bin/python
+#
+# check_remote_host.py -- Nagios plugin that uses MK Livestatus to get the
+# overall status of a host.
+# The entities considered for the status of the host are -
+# 1. Host is reachable
+# 2. LV/Inode Service status
+# 3. CPU Utilization
+# 4. Memory Utilization
+# 5. Network Utilization
+# 6. Swap Utilization
+#
+# Copyright (C) 2014 Red Hat Inc
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,USA
+#
+
+import os
+import sys
+import shlex
+import subprocess
+import socket
+import getopt
+
+STATUS_OK = 0
+STATUS_WARNING = 1
+STATUS_CRITICAL = 2
+STATUS_UNKNOWN = 3
+_checkPingCommand = "/usr/lib64/nagios/plugins/check_ping"
+_commandStatusStrs = {STATUS_OK: 'OK', STATUS_WARNING: 'WARNING',
+ STATUS_CRITICAL: 'CRITICAL', STATUS_UNKNOWN: 'UNKNOWN'}
+_socketPath = '/var/spool/nagios/cmd/live'
+
+
+# Class for exception definition
+class checkPingCmdExecFailedException(Exception):
+ message = "check_ping command failed"
+
+ def __init__(self, rc=0, out=(), err=()):
+ self.rc = rc
+ self.out = out
+ self.err = err
+
+ def __str__(self):
+ o = '\n'.join(self.out)
+ e = '\n'.join(self.err)
+ if o and e:
+ m = o + '\n' + e
+ else:
+ m = o or e
+
+ s = self.message
+ if m:
+ s += '\nerror: ' + m
+ if self.rc:
+ s += '\nreturn code: %s' % self.rc
+ return s
+
+
+# Method to execute a command
+def execCmd(command):
+ proc = subprocess.Popen(command,
+ close_fds=True,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE)
+ (out, err) = proc.communicate()
+ return (proc.returncode, out, err)
+
+
+# Method to check the ping status of the host
+def getPingStatus(hostAddr):
+ cmd = "%s -H %s" % (_checkPingCommand, hostAddr)
+ cmd += " -w 3000.0,80% -c 5000.0,100%"
+
+ try:
+ (rc, out, err) = execCmd(shlex.split(cmd))
+ except (OSError, ValueError) as e:
+ raise checkPingCmdExecFailedException(err=[str(e)])
+
+ if rc != 0:
+ raise checkPingCmdExecFailedException(rc, [out], [err])
+
+ return rc
+
+
+# Method to execute livestatus
+def checkLiveStatus(hostAddr, srvc):
+ s = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
+ s.connect(_socketPath)
+
+ # Write command to socket
+ cmd = "GET services\nColumns: state\nFilter: "
+ "description = %s\nFilter: host_address = %s\n" % (srvc, hostAddr)
+ s.send(cmd)
+
+ # Close socket
+ s.shutdown(socket.SHUT_WR)
+
+ # Read the answer
+ answer = s.recv(1000000)
+
+ # Parse the answer into a table
+ table = [line.split(';') for line in answer.split('\n')[:-1]]
+
+ if len(table) > 0 and len(table[0]) > 0:
+ return int(table[0][0])
+ else:
+ return STATUS_UNKNOWN
+
+
+# Method to show the usage
+def showUsage():
+ usage = "Usage: %s -H <Host Address>\n" % os.path.basename(sys.argv[0])
+ sys.stderr.write(usage)
+
+
+# Main method
+if __name__ == "__main__":
+ try:
+ opts, args = getopt.getopt(sys.argv[1:], "hH:", ["help", "host="])
+ except getopt.GetoptError as e:
+ print (str(e))
+ showUsage()
+ sys.exit(STATUS_CRITICAL)
+
+ hostAddr = ''
+ if len(opts) == 0:
+ showUsage()
+ sys.exit(STATUS_CRITICAL)
+ else:
+ for opt, arg in opts:
+ if opt in ("-h", "--help"):
+ showUsage()
+ sys.exit()
+ elif opt in ("-H", "--host"):
+ hostAddr = arg
+ else:
+ showUsage()
+ sys.exit(STATUS_CRITICAL)
+
+ # Check ping status of the node, if its not reachable exit
+ try:
+ pingStatus = getPingStatus(hostAddr)
+ except (checkPingCmdExecFailedException) as e:
+ print "Host Status %s - Host not reachable" % \
+ (_commandStatusStrs[STATUS_UNKNOWN])
+ sys.exit(_commandStatusStrs[STATUS_UNKNOWN])
+
+ if pingStatus != STATUS_OK:
+ print "Host Status %s - Host not reachable" % \
+ (_commandStatusStrs[STATUS_UNKNOWN])
+ sys.exit(pingStatus)
+
+ # Check the various performance statuses for the host
+ diskPerfStatus = checkLiveStatus(hostAddr, 'Disk Utilization')
+ cpuPerfStatus = checkLiveStatus(hostAddr, 'Cpu Utilization')
+ memPerfStatus = checkLiveStatus(hostAddr, 'Memory Utilization')
+ swapPerfStatus = checkLiveStatus(hostAddr, 'Swap Utilization')
+ nwPerfStatus = checkLiveStatus(hostAddr, 'Network Utilization')
+
+ # Calculate the consolidated status for the host based on above status
+ # details
+ finalStatus = pingStatus | diskPerfStatus | cpuPerfStatus | \
+ memPerfStatus | swapPerfStatus | nwPerfStatus
+
+ # Get the list of ciritical services
+ criticalSrvcs = []
+ if diskPerfStatus == STATUS_CRITICAL:
+ criticalSrvcs.append('Disk Utilization')
+ if cpuPerfStatus == STATUS_CRITICAL:
+ criticalSrvcs.append('Cpu Utilization')
+ if memPerfStatus == STATUS_CRITICAL:
+ criticalSrvcs.append('Memory Utilization')
+ if swapPerfStatus == STATUS_CRITICAL:
+ criticalSrvcs.append('Swap Utilization')
+ if nwPerfStatus == STATUS_CRITICAL:
+ criticalSrvcs.append('Network Utilization')
+
+ # Return the status
+ if finalStatus == STATUS_CRITICAL:
+ print "Host Status %s - Service(s) %s in CRITICAL state" % \
+ (_commandStatusStrs[STATUS_WARNING], criticalSrvcs)
+ sys.exit(STATUS_WARNING)
+
+ print "Host Status %s - Services in good health" % \
+ _commandStatusStrs[STATUS_OK]
+ sys.exit(STATUS_OK)
diff --git a/plugins/gluster_host_service_handler.py b/plugins/gluster_host_service_handler.py
new file mode 100755
index 0000000..283ac69
--- /dev/null
+++ b/plugins/gluster_host_service_handler.py
@@ -0,0 +1,154 @@
+#!/usr/bin/python
+#
+# gluster_host_service_handler.py -- Event handler which checks the
+# status of defined services and accordingly changes the host status
+#
+# Copyright (C) 2014 Red Hat Inc
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,USA
+#
+
+import os
+import sys
+import datetime
+import socket
+import getopt
+
+STATUS_OK = "OK"
+STATUS_WARNING = "WARNING"
+STATUS_CRITICAL = "CRITICAL"
+STATUS_UNKNOWN = "UNKNOWN"
+SRVC_STATE_TYPE_SOFT = "SOFT"
+SRVC_STATE_TYPE_HARD = "HARD"
+statusCodes = {STATUS_OK: 0, STATUS_WARNING: 1, STATUS_CRITICAL: 2,
+ STATUS_UNKNOWN: 3}
+NAGIOS_COMMAND_FILE = "/var/spool/nagios/cmd/nagios.cmd"
+SRVC_LIST = ['Disk Utilization', 'Cpu Utilization', 'Memory Utilization',
+ 'Swap Utilization', 'Network Utilization']
+_socketPath = '/var/spool/nagios/cmd/live'
+
+
+# Shows the usage of the script
+def showUsage():
+ usage = "Usage: %s -s <Service State (OK/WARNING/CRITICAL/UNKNOWN)> "
+ "-t <Service State Type (SOFT/HARD)> -a <No of Service attempts> "
+ "-l <Host Address> -n <Service Name>\n" % os.path.basename(sys.argv[0])
+ sys.stderr.write(usage)
+
+
+# Method to change the host status
+def update_host_state(hostAddr, srvcName, statusCode):
+ now = datetime.datetime.now()
+ if statusCode == statusCodes[STATUS_WARNING]:
+ cmdStr = "[%s] PROCESS_HOST_CHECK_RESULT;%s;%s;Host Status WARNING - "
+ "Service(s) ['%s'] in CRITICAL state\n" % (now, hostAddr, statusCode,
+ srvcName)
+ else:
+ cmdStr = "[%s] PROCESS_HOST_CHECK_RESULT;%s;%s;Host Status OK - "
+ "Services in good health\n" % (now, hostAddr, statusCode)
+
+ f = open(NAGIOS_COMMAND_FILE, "w")
+ f.write(cmdStr)
+ f.close()
+
+
+# Method to execute livestatus
+def checkLiveStatus(hostAddr, srvc):
+ s = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
+ s.connect(_socketPath)
+
+ # Write command to socket
+ cmd = "GET services\nColumns: state\nFilter: "
+ "description = %s\nFilter: host_address = %s\n" % (srvc, hostAddr)
+ s.send(cmd)
+
+ # Close socket
+ s.shutdown(socket.SHUT_WR)
+
+ # Read the answer
+ answer = s.recv(1000)
+
+ # Parse the answer into a table
+ table = [line.split(';') for line in answer.split('\n')[:-1]]
+
+ if len(table) > 0 and len(table[0]) > 0:
+ return int(table[0][0])
+ else:
+ return statusCodes[STATUS_UNKNOWN]
+
+
+# Method to change the host state to UP based on other service type status
+def check_and_update_host_state_to_up(hostAddr, srvcName):
+ finalState = 0
+ for item in SRVC_LIST:
+ if item != srvcName:
+ finalState = finalState | checkLiveStatus(hostAddr, item)
+
+ if finalState == statusCodes[STATUS_OK]:
+ update_host_state(hostAddr, srvcName, statusCodes[STATUS_OK])
+
+
+# Main method
+if __name__ == "__main__":
+ try:
+ opts, args = getopt.getopt(sys.argv[1:], "hs:t:a:l:n:",
+ ["help", "state=", "type=",
+ "attempts=", "location=", "name="])
+ except getopt.GetoptError as e:
+ print (str(e))
+ showUsage()
+ sys.exit(STATUS_CRITICAL)
+
+ srvcState = ''
+ srvcStateType = ''
+ attempts = ''
+ hostAddr = ''
+ srvcName = ''
+ if len(opts) == 0:
+ showUsage()
+ else:
+ for opt, arg in opts:
+ if opt in ('-h', '--help'):
+ showUsage()
+ sys.exit()
+ elif opt in ('-s', '--state'):
+ srvcState = arg
+ elif opt in ('-t', '--type'):
+ srvcStateType = arg
+ elif opt in ('-a', '--attempts'):
+ attempts = arg
+ elif opt in ('-l', '--location'):
+ hostAddr = arg
+ elif opt in ('-n', '--name'):
+ srvcName = arg
+ else:
+ showUsage()
+ sys.exit()
+
+ # Swicth over the service state values and do the needful
+ if srvcState == STATUS_CRITICAL:
+ if srvcStateType == SRVC_STATE_TYPE_SOFT:
+ if int(attempts) == 3:
+ print "Updating the host status to warning "
+ "(3rd SOFT critical state)..."
+ update_host_state(hostAddr, srvcName,
+ statusCodes[STATUS_WARNING])
+ elif srvcStateType == SRVC_STATE_TYPE_HARD:
+ print "Updating the host status to warning..."
+ update_host_state(hostAddr, srvcName, statusCodes[STATUS_WARNING])
+ elif srvcState == STATUS_OK:
+ check_and_update_host_state_to_up(hostAddr, srvcName)
+
+ sys.exit(0)