diff options
author | Csaba Henk <csaba@gluster.com> | 2011-04-21 16:43:49 +0000 |
---|---|---|
committer | Anand Avati <avati@gluster.com> | 2011-04-22 04:05:20 -0700 |
commit | 775323c1b2fa9a557d3ea74d57e843575f7b1278 (patch) | |
tree | 58ceec9deac4c3b947a58a025255720abba45223 /xlators/features/marker/utils/syncdaemon/monitor.py | |
parent | de809504282731332c9bc0fc7f7da5be34f206ce (diff) |
syncdaemon: have the monitor kill the worker if it does not connect in 60 sec
Signed-off-by: Csaba Henk <csaba@gluster.com>
Signed-off-by: Anand Avati <avati@gluster.com>
BUG: 2736 (gsyncd hangs if crash occurs in the non-main thread)
URL: http://bugs.gluster.com/cgi-bin/bugzilla3/show_bug.cgi?id=2736
Diffstat (limited to 'xlators/features/marker/utils/syncdaemon/monitor.py')
-rw-r--r-- | xlators/features/marker/utils/syncdaemon/monitor.py | 28 |
1 file changed, 24 insertions, 4 deletions
```diff
diff --git a/xlators/features/marker/utils/syncdaemon/monitor.py b/xlators/features/marker/utils/syncdaemon/monitor.py
index 5e5d22f4f..a86acdc75 100644
--- a/xlators/features/marker/utils/syncdaemon/monitor.py
+++ b/xlators/features/marker/utils/syncdaemon/monitor.py
@@ -2,6 +2,8 @@ import os
 import sys
 import time
 import logging
+import select
+from signal import SIGKILL
 
 from gconf import gconf
 from syncdutils import update_file
@@ -35,12 +37,30 @@ class Monitor(object):
             if os.WIFEXITED(r):
                 return os.WEXITSTATUS(r)
             return 1
+        conn_timeout = 60
         while ret in (0, 1):
-            logging.info('-' * 60)
+            logging.info('-' * conn_timeout)
             logging.info('starting gsyncd worker')
-            cpid = os.spawnv(os.P_NOWAIT, sys.executable, argv)
-            time.sleep(60)
-            ret = nwait(cpid, os.WNOHANG)
+            pr, pw = os.pipe()
+            cpid = os.fork()
+            if cpid == 0:
+                os.close(pr)
+                os.execv(sys.executable, argv + ['--feedback-fd', str(pw)])
+            os.close(pw)
+            t0 = time.time()
+            select.select((pr,), (), (), conn_timeout)
+            os.close(pr)
+            et = time.time() - t0
+            if et < conn_timeout:
+                et2 = conn_timeout - et
+                logging.debug("worker got connected in %d sec, "
+                              "waiting %d more to make sure it's fine" % (et, et2))
+                time.sleep(et2)
+                ret = nwait(cpid, os.WNOHANG)
+            else:
+                logging.debug("worker not confirmed in %d sec, aborting it" % et)
+                os.kill(cpid, SIGKILL)
+                ret = nwait(cpid)
             if ret == None:
                 self.set_state('OK')
                 ret = nwait(cpid)
```