From a2b30a1848ee69854c1de87cd1e3d1d74e96a964 Mon Sep 17 00:00:00 2001 From: Csaba Henk Date: Tue, 31 May 2011 11:06:54 +0000 Subject: syncdaemon: some refactor on monitor - detect faulty state early - keep the feedback fd in gsyncd module Signed-off-by: Csaba Henk Signed-off-by: Anand Avati BUG: 2537 (gsync autorestart) URL: http://bugs.gluster.com/cgi-bin/bugzilla3/show_bug.cgi?id=2537 BUG: 2537 (gsync autorestart) URL: http://bugs.gluster.com/cgi-bin/bugzilla3/show_bug.cgi?id=2537 --- xlators/features/marker/utils/syncdaemon/gsyncd.py | 5 +++-- xlators/features/marker/utils/syncdaemon/master.py | 3 --- .../features/marker/utils/syncdaemon/monitor.py | 22 ++++++++++++++-------- 3 files changed, 17 insertions(+), 13 deletions(-) (limited to 'xlators/features') diff --git a/xlators/features/marker/utils/syncdaemon/gsyncd.py b/xlators/features/marker/utils/syncdaemon/gsyncd.py index 60980f54659..193af9d5f37 100644 --- a/xlators/features/marker/utils/syncdaemon/gsyncd.py +++ b/xlators/features/marker/utils/syncdaemon/gsyncd.py @@ -249,8 +249,7 @@ def main_i(): ffd = rconf.get('feedback_fd') if ffd: - gconf.feedback_fd = ffd - fcntl.fcntl(int(ffd), fcntl.F_SETFD, fcntl.FD_CLOEXEC) + fcntl.fcntl(ffd, fcntl.F_SETFD, fcntl.FD_CLOEXEC) #normalize loglevel lvl0 = gconf.log_level @@ -292,6 +291,8 @@ def main_i(): # complete remote connection in child remote.connect_remote(go_daemon='done') local.connect() + if ffd: + os.close(ffd) local.service_loop(*[r for r in [remote] if r]) logging.info("exiting.") diff --git a/xlators/features/marker/utils/syncdaemon/master.py b/xlators/features/marker/utils/syncdaemon/master.py index 76f924ed37a..35dc4ee06aa 100644 --- a/xlators/features/marker/utils/syncdaemon/master.py +++ b/xlators/features/marker/utils/syncdaemon/master.py @@ -87,9 +87,6 @@ class GMaster(object): self.terminate = False def crawl_loop(self): - ffd = getattr(gconf, 'feedback_fd', None) - if ffd: - os.close(int(ffd)) timo = int(gconf.timeout or 0) if timo > 0: def keep_alive(): diff --git a/xlators/features/marker/utils/syncdaemon/monitor.py b/xlators/features/marker/utils/syncdaemon/monitor.py index a86acdc7566..365e91435fd 100644 --- a/xlators/features/marker/utils/syncdaemon/monitor.py +++ b/xlators/features/marker/utils/syncdaemon/monitor.py @@ -48,17 +48,23 @@ class Monitor(object): os.execv(sys.executable, argv + ['--feedback-fd', str(pw)]) os.close(pw) t0 = time.time() - select.select((pr,), (), (), conn_timeout) + so = select.select((pr,), (), (), conn_timeout)[0] os.close(pr) - et = time.time() - t0 - if et < conn_timeout: - et2 = conn_timeout - et - logging.debug("worker got connected in %d sec, " - "waiting %d more to make sure it's fine" % (et, et2)) - time.sleep(et2) + if so: ret = nwait(cpid, os.WNOHANG) + if ret != None: + logging.debug("worker died before establishing connection") + else: + logging.debug("worker seems to be connected (?? racy check)") + while time.time() < t0 + conn_timeout: + ret = nwait(cpid, os.WNOHANG) + if ret != None: + logging.debug("worker died in startup phase") + break + time.sleep(1) else: - logging.debug("worker not confirmed in %d sec, aborting it" % et) + logging.debug("worker not confirmed in %d sec, aborting it" % \ + conn_timeout) os.kill(cpid, SIGKILL) ret = nwait(cpid) if ret == None: -- cgit