From f727e847133e479a37e86a7feb5826496a7313f7 Mon Sep 17 00:00:00 2001 From: Csaba Henk Date: Fri, 15 Apr 2011 09:10:03 +0000 Subject: syncdaemon: yet another try to exit properly The final cleanup sequence + call to _exit, which was just done in the main thread, now is called for in each thread when the thread crashes. Seems we aren't left there hanging this way. Signed-off-by: Csaba Henk Signed-off-by: Anand Avati BUG: 2736 (gsyncd hangs if crash occurs in the non-main thread) URL: http://bugs.gluster.com/cgi-bin/bugzilla3/show_bug.cgi?id=2736 --- xlators/features/marker/utils/syncdaemon/gconf.py | 1 + xlators/features/marker/utils/syncdaemon/gsyncd.py | 86 ++------------------- .../features/marker/utils/syncdaemon/syncdutils.py | 88 +++++++++++++++++++++- 3 files changed, 92 insertions(+), 83 deletions(-) (limited to 'xlators/features/marker') diff --git a/xlators/features/marker/utils/syncdaemon/gconf.py b/xlators/features/marker/utils/syncdaemon/gconf.py index cec5be0789b..24165b6191a 100644 --- a/xlators/features/marker/utils/syncdaemon/gconf.py +++ b/xlators/features/marker/utils/syncdaemon/gconf.py @@ -4,6 +4,7 @@ class GConf(object): ssh_ctl_dir = None ssh_ctl_args = None cpid = None + pid_file_owned = False permanent_handles = [] @classmethod diff --git a/xlators/features/marker/utils/syncdaemon/gsyncd.py b/xlators/features/marker/utils/syncdaemon/gsyncd.py index 0ed120e0397..ba4d7a6dda4 100644 --- a/xlators/features/marker/utils/syncdaemon/gsyncd.py +++ b/xlators/features/marker/utils/syncdaemon/gsyncd.py @@ -7,15 +7,14 @@ import time import logging import signal import select -import shutil import optparse import fcntl from optparse import OptionParser, SUPPRESS_HELP from logging import Logger -from errno import EEXIST, ENOENT, EACCES, EAGAIN +from errno import EEXIST, ENOENT from gconf import gconf -from syncdutils import FreeObject, norm +from syncdutils import FreeObject, norm, grabpidfile, finalize, log_raise_exception from configinterface import GConffile import resource from monitor import monitor @@ -53,47 +52,12 @@ class GLogger(Logger): logging.basicConfig(**lprm) -def grabfile(fname, content=None): - # damn those messy open() mode codes - fd = os.open(fname, os.O_CREAT|os.O_RDWR) - f = os.fdopen(fd, 'r+b', 0) - try: - fcntl.lockf(f, fcntl.LOCK_EX|fcntl.LOCK_NB) - except: - ex = sys.exc_info()[1] - f.close() - if isinstance(ex, IOError) and ex.errno in (EACCES, EAGAIN): - # cannot grab, it's taken - return - raise - if content: - try: - f.truncate() - f.write(content) - except: - f.close() - raise - gconf.permanent_handles.append(f) - return f - -pid_file_owned = False - -def grabpidfile(fname=None, setpid=True): - if not fname: - fname = gconf.pid_file - content = None - if setpid: - content = str(os.getpid()) + '\n' - return grabfile(fname, content=content) - def startup(**kw): - global pid_file_owned - if getattr(gconf, 'pid_file', None) and kw.get('go_daemon') != 'postconn': if not grabpidfile(): sys.stderr.write("pidfile is taken, exiting.\n") sys.exit(2) - pid_file_owned = True + gconf.pid_file_owned = True if kw.get('go_daemon') == 'should': x, y = os.pipe() @@ -129,57 +93,19 @@ def startup(**kw): lkw['filename'] = kw['log_file'] GLogger.setup(label=kw.get('label'), **lkw) -def finalize(*a): - if getattr(gconf, 'pid_file', None): - rm_pidf = pid_file_owned - if gconf.cpid: - # exit path from parent branch of daemonization - rm_pidf = False - while True: - f = grabpidfile(setpid=False) - if not f: - # child has already taken over pidfile - break - if os.waitpid(gconf.cpid, os.WNOHANG)[0] == gconf.cpid: - # child has terminated - rm_pidf = True - break; - time.sleep(0.1) - if rm_pidf: - try: - os.unlink(gconf.pid_file) - except: - ex = sys.exc_info()[1] - if ex.errno == ENOENT: - pass - else: - raise - if gconf.ssh_ctl_dir and not gconf.cpid: - shutil.rmtree(gconf.ssh_ctl_dir) - sys.stdout.flush() - sys.stderr.flush() - def main(): signal.signal(signal.SIGTERM, lambda *a: (finalize(*a), os._exit(1))) GLogger.setup() - exval = 0 + excont = FreeObject(exval = 0) try: try: main_i() except: - exc = sys.exc_info()[1] - if isinstance(exc, SystemExit): - exval = exc.code or 0 - raise - else: - logging.exception("FAIL: ") - sys.stderr.write("failed with %s.\n" % type(exc).__name__) - exval = 1 - sys.exit(exval) + log_raise_exception(excont) finally: finalize() # force exit in non-main thread too - os._exit(exval) + os._exit(excont.exval) def main_i(): rconf = {'go_daemon': 'should'} diff --git a/xlators/features/marker/utils/syncdaemon/syncdutils.py b/xlators/features/marker/utils/syncdaemon/syncdutils.py index 48694d238fd..c8f751d33f4 100644 --- a/xlators/features/marker/utils/syncdaemon/syncdutils.py +++ b/xlators/features/marker/utils/syncdaemon/syncdutils.py @@ -1,7 +1,15 @@ import os +import sys +import time import fcntl +import shutil +import logging from threading import Thread as baseThread -from signal import SIGTERM +from errno import EACCES, EAGAIN +from signal import SIGTERM, SIGKILL +from time import sleep + +from gconf import gconf try: # py 3 @@ -49,6 +57,78 @@ def update_file(path, updater, merger = lambda f: True): if fx: fx.close() +def grabfile(fname, content=None): + # damn those messy open() mode codes + fd = os.open(fname, os.O_CREAT|os.O_RDWR) + f = os.fdopen(fd, 'r+b', 0) + try: + fcntl.lockf(f, fcntl.LOCK_EX|fcntl.LOCK_NB) + except: + ex = sys.exc_info()[1] + f.close() + if isinstance(ex, IOError) and ex.errno in (EACCES, EAGAIN): + # cannot grab, it's taken + return + raise + if content: + try: + f.truncate() + f.write(content) + except: + f.close() + raise + gconf.permanent_handles.append(f) + return f + +def grabpidfile(fname=None, setpid=True): + if not fname: + fname = gconf.pid_file + content = None + if setpid: + content = str(os.getpid()) + '\n' + return grabfile(fname, content=content) + +def finalize(*a): + if getattr(gconf, 'pid_file', None): + rm_pidf = gconf.pid_file_owned + if gconf.cpid: + # exit path from parent branch of daemonization + rm_pidf = False + while True: + f = grabpidfile(setpid=False) + if not f: + # child has already taken over pidfile + break + if os.waitpid(gconf.cpid, os.WNOHANG)[0] == gconf.cpid: + # child has terminated + rm_pidf = True + break; + time.sleep(0.1) + if rm_pidf: + try: + os.unlink(gconf.pid_file) + except: + ex = sys.exc_info()[1] + if ex.errno == ENOENT: + pass + else: + raise + if gconf.ssh_ctl_dir and not gconf.cpid: + shutil.rmtree(gconf.ssh_ctl_dir) + sys.stdout.flush() + sys.stderr.flush() + +def log_raise_exception(excont): + exc = sys.exc_info()[1] + if isinstance(exc, SystemExit): + excont.exval = exc.code or 0 + raise + else: + logging.exception("FAIL: ") + sys.stderr.write("failed with %s.\n" % type(exc).__name__) + excont.exval = 1 + sys.exit(excont.exval) + class FreeObject(object): """wildcard class for which any attribute can be set""" @@ -63,13 +143,15 @@ class Thread(baseThread): tf = kw.get('target') if tf: def twrap(*aa): + excont = FreeObject(exval = 0) try: tf(*aa) except: try: - raise + log_raise_exception(excont) finally: - os.kill(os.getpid(), SIGTERM) + finalize() + os._exit(excont.exval) kw['target'] = twrap baseThread.__init__(self, *a, **kw) self.setDaemon(True) -- cgit