geo-rep: Handle Worker kill gracefully if worker already died

If the Agent dies for any reason, the monitor tries to kill the Worker as
well. But if the Worker has also already died, the kill command raises the
error ESRCH: No such process.

[2016-05-23 16:49:33.903965] I [monitor(monitor):326:monitor] Monitor: Changelog Agent died, Aborting Worker(/bricks/brick0/master_brick0)
[2016-05-23 16:49:33.904535] E [syncdutils(monitor):276:log_raise_exception] <top>: FAIL:
Traceback (most recent call last):
  File "/usr/libexec/glusterfs/python/syncdaemon/syncdutils.py", line 306 in twrap
    tf(*aa)
  File "/usr/libexec/glusterfs/python/syncdaemon/monitor.py", line 393, in wmon
    slave_host, master)
  File "/usr/libexec/glusterfs/python/syncdaemon/monitor.py", line 327, in monitor
    os.kill(cpid, signal.SIGKILL)
OSError: [Errno 3] No such process

With this patch, the monitor gracefully handles the case where the Worker
has already died.

Change-Id: I3ae5f816a3a197343b64540cf46f5453167fb660
Signed-off-by: Aravinda VK <avishwan@redhat.com>
BUG: 1341068
Reviewed-on: http://review.gluster.org/14512
Smoke: Gluster Build System <jenkins@build.gluster.com>
NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
Reviewed-by: Kotresh HR <khiremat@redhat.com>
CentOS-regression: Gluster Build System <jenkins@build.gluster.com>
(cherry picked from commit 4f4a94a35a24d781f3f0e584a8cb59c019e50d6f)
Reviewed-on: http://review.gluster.org/14562
Reviewed-by: Saravanakumar Arumugam <sarumuga@redhat.com>
author: Aravinda VK <avishwan@redhat.com> 2016-05-24 14:13:29 +0530
committer: Aravinda VK <avishwan@redhat.com> 2016-06-01 09:30:36 -0700
commit: 0e3b6f39924d8d1ef3b5fa57b5d38886b5d3626e (patch)
tree: 990960caff00ea063f75f37517fa166f03910b09 /geo-replication
parent: 1d28634b9aab65b08c1c2e9a6f48619c9fa494dc (diff)
1 files changed, 9 insertions, 9 deletions
diff --git a/geo-replication/syncdaemon/monitor.py b/geo-replication/syncdaemon/monitor.py
index 2b570a9f4fc..050218b6d1b 100644
--- a/geo-replication/syncdaemon/monitor.py
+++ b/geo-replication/syncdaemon/monitor.py
@@ -18,11 +18,11 @@ import xml.etree.ElementTree as XET
 from subprocess import PIPE
 from resource import Popen, FILE, GLUSTER, SSH
 from threading import Lock
-from errno import ECHILD
+from errno import ECHILD, ESRCH
 import re
 import random
 from gconf import gconf
-from syncdutils import select, waitpid
+from syncdutils import select, waitpid, errno_wrap
 from syncdutils import set_term_handler, is_host_local, GsyncdError
 from syncdutils import escape, Thread, finalize, memoize
 
@@ -187,7 +187,7 @@ class Monitor(object):
         # standard handler
         set_term_handler(lambda *a: set_term_handler())
         # give a chance to graceful exit
-        os.kill(-os.getpid(), signal.SIGTERM)
+        errno_wrap(os.kill, [-os.getpid(), signal.SIGTERM], [ESRCH])
 
     def monitor(self, w, argv, cpids, agents, slave_vol, slave_host, master):
         """the monitor loop
@@ -324,7 +324,7 @@ class Monitor(object):
                     # Agent is died Kill Worker
                     logging.info("Changelog Agent died, "
                                  "Aborting Worker(%s)" % w[0])
-                    os.kill(cpid, signal.SIGKILL)
+                    errno_wrap(os.kill, [cpid, signal.SIGKILL], [ESRCH])
                     nwait(cpid)
                     nwait(apid)
 
@@ -348,7 +348,7 @@ class Monitor(object):
                             # Agent is died Kill Worker
                             logging.info("Changelog Agent died, Aborting "
                                          "Worker(%s)" % w[0])
-                            os.kill(cpid, signal.SIGKILL)
+                            errno_wrap(os.kill, [cpid, signal.SIGKILL], [ESRCH])
                             nwait(cpid)
                             nwait(apid)
                             break
@@ -357,7 +357,7 @@ class Monitor(object):
             else:
                 logging.info("worker(%s) not confirmed in %d sec, "
                              "aborting it" % (w[0], conn_timeout))
-                os.kill(cpid, signal.SIGKILL)
+                errno_wrap(os.kill, [cpid, signal.SIGKILL], [ESRCH])
                 nwait(apid)  # wait for agent
                 ret = nwait(cpid)
             if ret is None:
@@ -394,9 +394,9 @@ class Monitor(object):
                 time.sleep(1)
                 self.lock.acquire()
                 for cpid in cpids:
-                    os.kill(cpid, signal.SIGKILL)
+                    errno_wrap(os.kill, [cpid, signal.SIGKILL], [ESRCH])
                 for apid in agents:
-                    os.kill(apid, signal.SIGKILL)
+                    errno_wrap(os.kill, [apid, signal.SIGKILL], [ESRCH])
                 self.lock.release()
                 finalize(exval=1)
             t = Thread(target=wmon, args=[wx])
@@ -464,7 +464,7 @@ def monitor(*resources):
     # yes, send SIGSTOP to negative of monitor pid
     # to go back to pause state.
     if gconf.pause_on_start:
-        os.kill(-os.getpid(), signal.SIGSTOP)
+        errno_wrap(os.kill, [-os.getpid(), signal.SIGSTOP], [ESRCH])
 
     """oh yeah, actually Monitor is used as singleton, too"""
     return Monitor().multiplex(*distribute(*resources))
author	Aravinda VK <avishwan@redhat.com>	2016-05-24 14:13:29 +0530
committer	Aravinda VK <avishwan@redhat.com>	2016-06-01 09:30:36 -0700
commit	0e3b6f39924d8d1ef3b5fa57b5d38886b5d3626e (patch)
tree	990960caff00ea063f75f37517fa166f03910b09 /geo-replication
parent	1d28634b9aab65b08c1c2e9a6f48619c9fa494dc (diff)