From c40b73fc453caf123e806adebac6c69b003bc636 Mon Sep 17 00:00:00 2001 From: Csaba Henk Date: Sat, 19 May 2012 02:43:05 +0530 Subject: geo-rep / gsyncd: further cleanup refinements - Regarding issue of leftover ssh control dirs: If master side worker is stuck in connection establishment phase, have the monitor kill it softly (i.e., first by SIGTERM, to let it clean up). This is trickier than it sounds at first hearing, because if the worker is stuck waiting for a RePCe answer (in threading.Condition().wait()), then SIGTERM is ignored (more precisely, Python holds it back for the wait and resends it to itself when the wait is over). So instead of signalling the worker only, we send TERM to the whole process group -- that brings down the ssh connection, which wakes up the waiting worker, which can then clean up. The only problem is that the monitor is also in the process group and it should not commit suicide. That is taken care of by setting up a one-time SIGTERM handler in the monitor. - Regarding slave gsyncd stuck in chdir: Slave gsyncd is usually well behaved: if the master does not send keepalives, it takes care to exit. However, if a hang occurs in an early phase, when the slave is to change to the gluster mountpoint, no timeout is set up for that (and unlike on the master side, neither is there an external actor like the monitor to do that). So, to manage this scenario, we do the chdir in a (supposedly) short-lived thread, and in the main thread we wait for the termination of this thread. If that does not happen within the time limit, the main thread calls for cleanup and exit. (This logic explicitly takes the appropriate action in the cases when chdir succeeds or when it hangs; but what about the remaining case, when chdir fails? Well, in that case the chdir thread's exception handler will put the process on the cleanup-and-exit route.) 
Change-Id: I6ad6faa9c7b1c37084d171d1e1a756abaff9eba8 BUG: 786291 Signed-off-by: Csaba Henk Reviewed-on: http://review.gluster.com/3376 Tested-by: Gluster Build System Reviewed-by: Anand Avati --- xlators/features/marker/utils/syncdaemon/gsyncd.py | 5 +++-- xlators/features/marker/utils/syncdaemon/monitor.py | 8 +++++++- xlators/features/marker/utils/syncdaemon/resource.py | 10 +++++++++- xlators/features/marker/utils/syncdaemon/syncdutils.py | 5 ++++- 4 files changed, 23 insertions(+), 5 deletions(-) diff --git a/xlators/features/marker/utils/syncdaemon/gsyncd.py b/xlators/features/marker/utils/syncdaemon/gsyncd.py index 196ed7ab6d4..9ac32ce4267 100644 --- a/xlators/features/marker/utils/syncdaemon/gsyncd.py +++ b/xlators/features/marker/utils/syncdaemon/gsyncd.py @@ -17,7 +17,7 @@ from ipaddr import IPAddress, IPNetwork from gconf import gconf from syncdutils import FreeObject, norm, grabpidfile, finalize, log_raise_exception -from syncdutils import GsyncdError, select +from syncdutils import GsyncdError, select, set_term_handler from configinterface import GConffile import resource from monitor import monitor @@ -107,7 +107,8 @@ def startup(**kw): def main(): """main routine, signal/exception handling boilerplates""" - signal.signal(signal.SIGTERM, lambda *a: finalize(*a, **{'exval': 1})) + gconf.starttime = time.time() + set_term_handler() GLogger.setup() excont = FreeObject(exval = 0) try: diff --git a/xlators/features/marker/utils/syncdaemon/monitor.py b/xlators/features/marker/utils/syncdaemon/monitor.py index 981643ec0a5..b8956dcc2b9 100644 --- a/xlators/features/marker/utils/syncdaemon/monitor.py +++ b/xlators/features/marker/utils/syncdaemon/monitor.py @@ -4,7 +4,7 @@ import time import signal import logging from gconf import gconf -from syncdutils import update_file, select, waitpid +from syncdutils import update_file, select, waitpid, set_term_handler class Monitor(object): """class which spawns and manages gsyncd workers""" @@ -103,6 +103,12 @@ class 
Monitor(object): else: logging.debug("worker not confirmed in %d sec, aborting it" % \ conn_timeout) + # relax one SIGTERM by setting a handler that sets back + # standard handler + set_term_handler(lambda *a: set_term_handler()) + # give a chance to graceful exit + os.kill(-os.getpid(), signal.SIGTERM) + time.sleep(1) os.kill(cpid, signal.SIGKILL) ret = nwait(cpid) if ret == None: diff --git a/xlators/features/marker/utils/syncdaemon/resource.py b/xlators/features/marker/utils/syncdaemon/resource.py index de271bd3939..c4cd19c9fb7 100644 --- a/xlators/features/marker/utils/syncdaemon/resource.py +++ b/xlators/features/marker/utils/syncdaemon/resource.py @@ -692,7 +692,15 @@ class GLUSTER(AbstractUrl, SlaveLocal, SlaveRemote): d = self.mntpt os.write(mpo, d + '\0') os.write(mpo, 'M') - os.chdir(d) + t = syncdutils.Thread(target=lambda: os.chdir(d)) + t.start() + tlim = gconf.starttime + int(gconf.connection_timeout) + while True: + if not t.isAlive(): + break + if time.time() >= tlim: + syncdutils.finalize(exval = 1) + time.sleep(1) os.close(mpo) _, rv = syncdutils.waitpid(mh, 0) if rv: diff --git a/xlators/features/marker/utils/syncdaemon/syncdutils.py b/xlators/features/marker/utils/syncdaemon/syncdutils.py index b29936d596d..f786bc34326 100644 --- a/xlators/features/marker/utils/syncdaemon/syncdutils.py +++ b/xlators/features/marker/utils/syncdaemon/syncdutils.py @@ -7,7 +7,7 @@ import shutil import logging from threading import Lock, Thread as baseThread from errno import EACCES, EAGAIN, EPIPE, ENOTCONN, ECONNABORTED, EINTR, errorcode -from signal import SIGTERM, SIGKILL +from signal import signal, SIGTERM, SIGKILL from time import sleep import select as oselect from os import waitpid as owaitpid @@ -271,3 +271,6 @@ def select(*a): def waitpid (*a): return eintr_wrap(owaitpid, OSError, *a) + +def set_term_handler(hook=lambda *a: finalize(*a, **{'exval': 1})): + signal(SIGTERM, hook) -- cgit