summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Csaba Henk <csaba@redhat.com> 2012-05-19 02:43:05 +0530
committer: Vijay Bellur <vbellur@redhat.com> 2012-10-31 09:45:54 -0400
commit: e9cd619c8d77d8f914661dad61aae4c40fdc1fcf (patch)
tree: 5ac1437e167e1bb5d931d7ae00308125def46dcc
parent: 890eb31ca8c37c6e083a418824d0b2dec81b15f0 (diff)
geo-rep / gsyncd: further cleanup refinements
- Regarding the issue of leftover ssh control dirs: If the master side worker is stuck in the connection establishment phase, have the monitor kill it softly (i.e. first by SIGTERM, to let it clean up). This is trickier than it sounds at first, because if the worker is stuck waiting for a RePCe answer (in threading.Condition().wait()), then SIGTERM is ignored (more precisely, Python holds it back for the duration of the wait and resends it to itself when the wait is over). So instead of signalling only the worker, we send TERM to the whole process group -- that brings down the ssh connection, which wakes up the waiting worker, which can then clean up. The only problem is that the monitor is also in the process group and it should not commit suicide. That is taken care of by setting up a one-time SIGTERM handler in the monitor.
- Regarding slave gsyncd stuck in chdir: Slave gsyncd is usually well behaved: if the master does not send keepalives, it takes care to exit. However, if a hang occurs in the early phase, when the slave is to change to the gluster mountpoint, no timeout is set up for that (and unlike on the master side, there is no external actor like the monitor to do that either). So, to manage this scenario, we do the chdir in a (supposedly) short-lived thread, and in the main thread we wait for the termination of this thread. If that does not happen within the time limit, the main thread calls for cleanup and exit. (This logic explicitly takes the appropriate action in the cases when chdir succeeds or when it hangs; but what about the remaining case, when chdir fails? Well, in that case the chdir thread's exception handler will put the process on the cleanup-and-exit route.)
Change-Id: I6ad6faa9c7b1c37084d171d1e1a756abaff9eba8
BUG: 870503
Signed-off-by: Csaba Henk <csaba@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/165
Reviewed-by: Vijay Bellur <vbellur@redhat.com>
Tested-by: Vijay Bellur <vbellur@redhat.com>
-rw-r--r-- xlators/features/marker/utils/syncdaemon/gsyncd.py | 5
-rw-r--r-- xlators/features/marker/utils/syncdaemon/monitor.py | 8
-rw-r--r-- xlators/features/marker/utils/syncdaemon/resource.py | 10
-rw-r--r-- xlators/features/marker/utils/syncdaemon/syncdutils.py | 5
4 files changed, 23 insertions, 5 deletions
diff --git a/xlators/features/marker/utils/syncdaemon/gsyncd.py b/xlators/features/marker/utils/syncdaemon/gsyncd.py
index 78754cd7595..9e946946932 100644
--- a/xlators/features/marker/utils/syncdaemon/gsyncd.py
+++ b/xlators/features/marker/utils/syncdaemon/gsyncd.py
@@ -17,7 +17,7 @@ from ipaddr import IPAddress, IPNetwork
from gconf import gconf
from syncdutils import FreeObject, norm, grabpidfile, finalize, log_raise_exception
-from syncdutils import GsyncdError, select
+from syncdutils import GsyncdError, select, set_term_handler
from configinterface import GConffile
import resource
from monitor import monitor
@@ -111,7 +111,8 @@ def startup(**kw):
def main():
"""main routine, signal/exception handling boilerplates"""
- signal.signal(signal.SIGTERM, lambda *a: finalize(*a, **{'exval': 1}))
+ gconf.starttime = time.time()
+ set_term_handler()
GLogger.setup()
excont = FreeObject(exval = 0)
try:
diff --git a/xlators/features/marker/utils/syncdaemon/monitor.py b/xlators/features/marker/utils/syncdaemon/monitor.py
index 981643ec0a5..b8956dcc2b9 100644
--- a/xlators/features/marker/utils/syncdaemon/monitor.py
+++ b/xlators/features/marker/utils/syncdaemon/monitor.py
@@ -4,7 +4,7 @@ import time
import signal
import logging
from gconf import gconf
-from syncdutils import update_file, select, waitpid
+from syncdutils import update_file, select, waitpid, set_term_handler
class Monitor(object):
"""class which spawns and manages gsyncd workers"""
@@ -103,6 +103,12 @@ class Monitor(object):
else:
logging.debug("worker not confirmed in %d sec, aborting it" % \
conn_timeout)
+ # relax one SIGTERM by setting a handler that sets back
+ # standard handler
+ set_term_handler(lambda *a: set_term_handler())
+ # give a chance to graceful exit
+ os.kill(-os.getpid(), signal.SIGTERM)
+ time.sleep(1)
os.kill(cpid, signal.SIGKILL)
ret = nwait(cpid)
if ret == None:
diff --git a/xlators/features/marker/utils/syncdaemon/resource.py b/xlators/features/marker/utils/syncdaemon/resource.py
index d5df03c0f74..90db1707be9 100644
--- a/xlators/features/marker/utils/syncdaemon/resource.py
+++ b/xlators/features/marker/utils/syncdaemon/resource.py
@@ -731,7 +731,15 @@ class GLUSTER(AbstractUrl, SlaveLocal, SlaveRemote):
d = self.mntpt
os.write(mpo, d + '\0')
os.write(mpo, 'M')
- os.chdir(d)
+ t = syncdutils.Thread(target=lambda: os.chdir(d))
+ t.start()
+ tlim = gconf.starttime + int(gconf.connection_timeout)
+ while True:
+ if not t.isAlive():
+ break
+ if time.time() >= tlim:
+ syncdutils.finalize(exval = 1)
+ time.sleep(1)
os.close(mpo)
_, rv = syncdutils.waitpid(mh, 0)
if rv:
diff --git a/xlators/features/marker/utils/syncdaemon/syncdutils.py b/xlators/features/marker/utils/syncdaemon/syncdutils.py
index 61e14322382..1d4eb20032c 100644
--- a/xlators/features/marker/utils/syncdaemon/syncdutils.py
+++ b/xlators/features/marker/utils/syncdaemon/syncdutils.py
@@ -7,7 +7,7 @@ import shutil
import logging
from threading import Lock, Thread as baseThread
from errno import EACCES, EAGAIN, EPIPE, ENOTCONN, ECONNABORTED, EINTR, errorcode
-from signal import SIGTERM, SIGKILL
+from signal import signal, SIGTERM, SIGKILL
from time import sleep
import select as oselect
from os import waitpid as owaitpid
@@ -277,3 +277,6 @@ def select(*a):
def waitpid (*a):
return eintr_wrap(owaitpid, OSError, *a)
+
+def set_term_handler(hook=lambda *a: finalize(*a, **{'exval': 1})):
+ signal(SIGTERM, hook)