summary | refs | log | tree | commit | diff | stats
path: root/xlators/features/marker/utils/syncdaemon/monitor.py
diff options
context:
space:
mode:
author: Csaba Henk <csaba@gluster.com> 2011-04-21 16:43:49 +0000
committer: Anand Avati <avati@gluster.com> 2011-04-22 04:05:20 -0700
commit: 775323c1b2fa9a557d3ea74d57e843575f7b1278 (patch)
tree: 58ceec9deac4c3b947a58a025255720abba45223 /xlators/features/marker/utils/syncdaemon/monitor.py
parent: de809504282731332c9bc0fc7f7da5be34f206ce (diff)
syncdaemon: have the monitor kill the worker if it does not connect in 60 sec
Signed-off-by: Csaba Henk <csaba@gluster.com>
Signed-off-by: Anand Avati <avati@gluster.com>
BUG: 2736 (gsyncd hangs if crash occurs in the non-main thread)
URL: http://bugs.gluster.com/cgi-bin/bugzilla3/show_bug.cgi?id=2736
Diffstat (limited to 'xlators/features/marker/utils/syncdaemon/monitor.py')
-rw-r--r-- xlators/features/marker/utils/syncdaemon/monitor.py | 28
1 files changed, 24 insertions, 4 deletions
diff --git a/xlators/features/marker/utils/syncdaemon/monitor.py b/xlators/features/marker/utils/syncdaemon/monitor.py
index 5e5d22f4f..a86acdc75 100644
--- a/xlators/features/marker/utils/syncdaemon/monitor.py
+++ b/xlators/features/marker/utils/syncdaemon/monitor.py
@@ -2,6 +2,8 @@ import os
import sys
import time
import logging
+import select
+from signal import SIGKILL
from gconf import gconf
from syncdutils import update_file
@@ -35,12 +37,30 @@ class Monitor(object):
if os.WIFEXITED(r):
return os.WEXITSTATUS(r)
return 1
+ conn_timeout = 60
while ret in (0, 1):
- logging.info('-' * 60)
+ logging.info('-' * conn_timeout)
logging.info('starting gsyncd worker')
- cpid = os.spawnv(os.P_NOWAIT, sys.executable, argv)
- time.sleep(60)
- ret = nwait(cpid, os.WNOHANG)
+ pr, pw = os.pipe()
+ cpid = os.fork()
+ if cpid == 0:
+ os.close(pr)
+ os.execv(sys.executable, argv + ['--feedback-fd', str(pw)])
+ os.close(pw)
+ t0 = time.time()
+ select.select((pr,), (), (), conn_timeout)
+ os.close(pr)
+ et = time.time() - t0
+ if et < conn_timeout:
+ et2 = conn_timeout - et
+ logging.debug("worker got connected in %d sec, "
+ "waiting %d more to make sure it's fine" % (et, et2))
+ time.sleep(et2)
+ ret = nwait(cpid, os.WNOHANG)
+ else:
+ logging.debug("worker not confirmed in %d sec, aborting it" % et)
+ os.kill(cpid, SIGKILL)
+ ret = nwait(cpid)
if ret == None:
self.set_state('OK')
ret = nwait(cpid)