From 01b3dff29adee2041b0ef1b374eda8c88fb07678 Mon Sep 17 00:00:00 2001 From: Csaba Henk Date: Sat, 2 Apr 2011 19:40:49 +0000 Subject: syncdaemon: add monitor mode to support autorestart Signed-off-by: Csaba Henk Signed-off-by: Vijay Bellur BUG: 2537 (gsync autorestart) URL: http://bugs.gluster.com/cgi-bin/bugzilla3/show_bug.cgi?id=2537 --- .../features/marker/utils/syncdaemon/Makefile.am | 2 +- xlators/features/marker/utils/syncdaemon/gsyncd.py | 36 +++++++++++---- xlators/features/marker/utils/syncdaemon/master.py | 21 ++++++--- .../features/marker/utils/syncdaemon/monitor.py | 54 ++++++++++++++++++++++ .../features/marker/utils/syncdaemon/syncdutils.py | 8 ++++ 5 files changed, 105 insertions(+), 16 deletions(-) create mode 100644 xlators/features/marker/utils/syncdaemon/monitor.py (limited to 'xlators/features/marker/utils/syncdaemon') diff --git a/xlators/features/marker/utils/syncdaemon/Makefile.am b/xlators/features/marker/utils/syncdaemon/Makefile.am index 03ac9762541..c900fa93260 100644 --- a/xlators/features/marker/utils/syncdaemon/Makefile.am +++ b/xlators/features/marker/utils/syncdaemon/Makefile.am @@ -1,5 +1,5 @@ syncdaemondir = $(libexecdir)/glusterfs/python/syncdaemon -syncdaemon_PYTHON = gconf.py gsyncd.py __init__.py master.py README.md repce.py resource.py configinterface.py syncdutils.py +syncdaemon_PYTHON = gconf.py gsyncd.py __init__.py master.py README.md repce.py resource.py configinterface.py syncdutils.py monitor.py CLEANFILES = diff --git a/xlators/features/marker/utils/syncdaemon/gsyncd.py b/xlators/features/marker/utils/syncdaemon/gsyncd.py index a992005ecd3..fb2fe522bce 100644 --- a/xlators/features/marker/utils/syncdaemon/gsyncd.py +++ b/xlators/features/marker/utils/syncdaemon/gsyncd.py @@ -17,6 +17,7 @@ from errno import EEXIST, ENOENT, EACCES, EAGAIN from gconf import gconf from configinterface import GConffile import resource +from monitor import monitor class GLogger(Logger): @@ -37,12 +38,11 @@ class GLogger(Logger): @classmethod def setup(cls, **kw): - if kw.get('slave'): - sls = "(slave)" - else: - sls = "" + lbl = kw.get('label', "") + if lbl: + lbl = '(' + lbl + ')' lprm = {'datefmt': "%Y-%m-%d %H:%M:%S", - 'format': "[%(asctime)s.%(nsecs)d] %(lvlnam)s [%(module)s" + sls + ":%(lineno)s:%(funcName)s] %(ctx)s: %(message)s"} + 'format': "[%(asctime)s.%(nsecs)d] %(lvlnam)s [%(module)s" + lbl + ":%(lineno)s:%(funcName)s] %(ctx)s: %(message)s"} lprm.update(kw) lvl = kw.get('level', logging.INFO) lprm['level'] = lvl @@ -121,7 +121,7 @@ def startup(**kw): lkw['stream'] = sys.stdout else: lkw['filename'] = kw['log_file'] - GLogger.setup(slave=kw.get('slave'), **lkw) + GLogger.setup(label=kw.get('label'), **lkw) def finalize(*a): if getattr(gconf, 'pid_file', None): @@ -178,7 +178,9 @@ def main_i(): rconf = {'go_daemon': 'should'} def store_abs(opt, optstr, val, parser): - setattr(parser.values, opt.dest, os.path.abspath(val)) + if val: + val = os.path.abspath(val) + setattr(parser.values, opt.dest, val) def store_local(opt, optstr, val, parser): rconf[opt.dest] = val def store_local_curry(val): @@ -190,8 +192,10 @@ def main_i(): op.add_option('--gluster-log-level', metavar='LVL') op.add_option('-p', '--pid-file', metavar='PIDF', type=str, action='callback', callback=store_abs) op.add_option('-l', '--log-file', metavar='LOGF', type=str, action='callback', callback=store_abs) + op.add_option('--state-file', metavar='STATF', type=str, action='callback', callback=store_abs) op.add_option('-L', '--log-level', metavar='LVL') op.add_option('-r', '--remote-gsyncd', metavar='CMD', default=os.path.abspath(sys.argv[0])) + op.add_option('--volume-id', metavar='UUID') op.add_option('-s', '--ssh-command', metavar='CMD', default='ssh') op.add_option('--rsync-command', metavar='CMD', default='rsync') op.add_option('--rsync-extra', metavar='ARGS', default='-sS', help=SUPPRESS_HELP) @@ -201,6 +205,7 @@ def main_i(): op.add_option('-c', '--config-file', metavar='CONF', type=str, action='callback', callback=store_local) # duh. need to specify dest or value will be mapped to None :S + op.add_option('--monitor', dest='monitor', action='callback', callback=store_local_curry(True)) op.add_option('--listen', dest='listen', help=SUPPRESS_HELP, action='callback', callback=store_local_curry(True)) op.add_option('-N', '--no-daemon', dest="go_daemon", action='callback', callback=store_local_curry('dont')) op.add_option('--debug', dest="go_daemon", action='callback', callback=lambda *a: (store_local_curry('dont')(*a), @@ -277,6 +282,7 @@ def main_i(): gconf.__dict__.update(defaults.__dict__) gcnf.update_to(gconf.__dict__) gconf.__dict__.update(opts.__dict__) + gconf.configinterface = gcnf #normalize loglevel lvl0 = gconf.log_level @@ -290,13 +296,25 @@ def main_i(): gconf.log_level = lvl2 go_daemon = rconf['go_daemon'] + be_monitor = rconf.get('monitor') - if isinstance(remote, resource.SSH) and go_daemon == 'should': + if not be_monitor and isinstance(remote, resource.SSH) and \ + go_daemon == 'should': go_daemon = 'postconn' log_file = None else: log_file = gconf.log_file - startup(go_daemon=go_daemon, log_file=log_file, slave=(not remote)) + if be_monitor: + label = 'monitor' + elif remote: + #master + label = '' + else: + label = 'slave' + startup(go_daemon=go_daemon, log_file=log_file, label=label) + + if be_monitor: + return monitor() logging.info("syncing: %s" % " -> ".join(peers)) if remote: diff --git a/xlators/features/marker/utils/syncdaemon/master.py b/xlators/features/marker/utils/syncdaemon/master.py index 2df1470d5f7..87610f3879a 100644 --- a/xlators/features/marker/utils/syncdaemon/master.py +++ b/xlators/features/marker/utils/syncdaemon/master.py @@ -10,6 +10,7 @@ from errno import ENOENT, ENODATA from threading import Thread, currentThread, Condition, Lock from gconf import gconf +from syncdutils import FreeObject URXTIME = (-1, 0) @@ -80,7 +81,8 @@ class GMaster(object): # the authorative (foreign, native) volinfo pair # which lets us deduce what to do when we refetch # the volinfos from system - self.volinfo_state = (None, None) + uuid_preset = getattr(gconf, 'volume_id', None) + self.volinfo_state = (uuid_preset and {'uuid': uuid_preset}, None) # the actual volinfo we make use of self.volinfo = None @@ -140,14 +142,16 @@ class GMaster(object): # store the value below "boxed" to emulate proper closures # (variables of the enclosing scope are available inner functions # provided they are no reassigned; mutation is OK). - relax_mismatch = [False] + param = FreeObject(relax_mismatch = False, state_change = False) def select_vi(vi0, vi): if vi and (not vi0 or vi0['uuid'] == vi['uuid']): + if not vi0 and not param.relax_mismatch: + param.state_change = True # valid new value found; for the rest, we are graceful about # uuid mismatch - relax_mismatch[0] = True + param.relax_mismatch = True return vi - if vi0 and vi and vi0['uuid'] != vi['uuid'] and not relax_mismatch[0]: + if vi0 and vi and vi0['uuid'] != vi['uuid'] and not param.relax_mismatch: # uuid mismatch for master candidate, bail out raise RuntimeError("aborting on uuid change from %s to %s" % \ (vi0['uuid'], vi['uuid'])) @@ -157,7 +161,7 @@ class GMaster(object): srep = lambda vi: vi and vi['uuid'][0:8] logging.debug('(%s, %s) << (%s, %s) -> (%s, %s)' % \ tuple(srep(vi) for vi in volinfo_state + volinfo_sys + newstate)) - return newstate + return newstate, param.state_change def crawl(self, path='.', xtl=None): if path == '.': @@ -166,11 +170,16 @@ class GMaster(object): time.sleep(1) self.start = time.time() volinfo_sys = self.get_sys_volinfo() - self.volinfo_state = self.volinfo_state_machine(self.volinfo_state, volinfo_sys) + self.volinfo_state, state_change = self.volinfo_state_machine(self.volinfo_state, + volinfo_sys) if self.inter_master: self.volinfo = volinfo_sys[self.KFGN] else: self.volinfo = volinfo_sys[self.KNAT] + if state_change: + logging.info('new master is %s', self.uuid) + if self.inter_master: + gconf.configinterface.set('volume_id', self.uuid) if self.volinfo: if self.volinfo['retval']: raise RuntimeError ("master is corrupt") diff --git a/xlators/features/marker/utils/syncdaemon/monitor.py b/xlators/features/marker/utils/syncdaemon/monitor.py new file mode 100644 index 00000000000..3f327b6d04d --- /dev/null +++ b/xlators/features/marker/utils/syncdaemon/monitor.py @@ -0,0 +1,54 @@ +import os +import sys +import time +import logging +from gconf import gconf +from syncdutils import update_file + +class Monitor(object): + + def __init__(self): + self.state = None + + def set_state(self, state): + if state == self.state: + return + self.state = state + logging.info('new state: %s' % state) + if getattr(gconf, 'state_file', None): + update_file(gconf.state_file, lambda f: f.write(state + '\n')) + + def monitor(self): + argv = sys.argv[:] + for o in ('-N', '--no-daemon', '--monitor'): + while o in argv: + argv.remove(o) + argv.extend(('-N', '-p', '')) + argv.insert(0, os.path.basename(sys.executable)) + + self.set_state('starting...') + ret = 0 + def nwait(p, o=0): + p2, r = os.waitpid(p, o) + if not p2: + return + if os.WIFEXITED(r): + return os.WEXITSTATUS(r) + return 1 + while ret in (0, 1): + logging.info('-' * 60) + logging.info('starting gsyncd worker') + cpid = os.spawnv(os.P_NOWAIT, sys.executable, argv) + time.sleep(60) + ret = nwait(cpid, os.WNOHANG) + if not ret: + self.set_state('OK') + ret = nwait(cpid) + elif ret in (0, 1): + self.set_state('faulty') + time.sleep(10) + self.set_state('inconsistent') + return ret + +def monitor(): + return Monitor().monitor() diff --git a/xlators/features/marker/utils/syncdaemon/syncdutils.py b/xlators/features/marker/utils/syncdaemon/syncdutils.py index 723ab8fb5fc..5c17d0579b0 100644 --- a/xlators/features/marker/utils/syncdaemon/syncdutils.py +++ b/xlators/features/marker/utils/syncdaemon/syncdutils.py @@ -40,3 +40,11 @@ def update_file(path, updater, merger = lambda f: True): for fx in (fr, fw): if fx: fx.close() + + +class FreeObject(object): + """wildcard class for which any attribute can be set""" + + def __init__(self, **kw): + for k,v in kw.iteritems(): + setattr(self, k, v) -- cgit