summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Csaba Henk <csaba@lowlife.hu> 2011-04-02 19:40:49 +0000
committer: Vijay Bellur <vijay@dev.gluster.com> 2011-04-04 08:02:27 -0700
commit: 01b3dff29adee2041b0ef1b374eda8c88fb07678 (patch)
tree: c8f6c7eabb962c97f3e88add716eda429e2c3567
parent: e77c35248e8ce796bc5b108c10013089a0c65bde (diff)
syncdaemon: add monitor mode to support autorestart
Signed-off-by: Csaba Henk <csaba@gluster.com>
Signed-off-by: Vijay Bellur <vijay@dev.gluster.com>

BUG: 2537 (gsync autorestart)
URL: http://bugs.gluster.com/cgi-bin/bugzilla3/show_bug.cgi?id=2537
-rw-r--r-- cli/src/cli-rpc-ops.c 76
-rw-r--r-- xlators/features/marker/utils/syncdaemon/Makefile.am 2
-rw-r--r-- xlators/features/marker/utils/syncdaemon/gsyncd.py 36
-rw-r--r-- xlators/features/marker/utils/syncdaemon/master.py 21
-rw-r--r-- xlators/features/marker/utils/syncdaemon/monitor.py 54
-rw-r--r-- xlators/features/marker/utils/syncdaemon/syncdutils.py 8
-rw-r--r-- xlators/mgmt/glusterd/src/glusterd-op-sm.c 18
7 files changed, 174 insertions, 41 deletions
diff --git a/cli/src/cli-rpc-ops.c b/cli/src/cli-rpc-ops.c
index 6d47374..110962c 100644
--- a/cli/src/cli-rpc-ops.c
+++ b/cli/src/cli-rpc-ops.c
@@ -2603,15 +2603,30 @@ out:
}
int
-gf_cli3_1_gsync_get_pid_file (char *pidfolder, char *pidfile, char *master, char *slave, char *gl_workdir)
+gf_cli3_1_gsync_get_param_file (char *prmfile, const char *ext, char *master, char *slave, char *gl_workdir)
{
FILE *in = NULL;
char buff[PATH_MAX] = {0, };
char cmd[PATH_MAX] = {0, };
char *ptr = NULL;
- char buffer[PATH_MAX] = {0, };
+ char pidfolder[PATH_MAX] = {0, };
+ char *dotp = NULL;
int ret = 0;
+ if (!(master && slave && gl_workdir)) {
+ GF_ASSERT (!master && !slave && !gl_workdir);
+ /* extension adjustment mode */
+
+ dotp = strrchr (prmfile, '.');
+ if (!dotp++ ||
+ /* overflow */
+ dotp - prmfile + strlen (ext) + 1 > PATH_MAX)
+ return -1;
+
+ strcpy (dotp, ext);
+ return 0;
+ }
+
snprintf (cmd, PATH_MAX, GSYNCD_PREFIX"/gsyncd --canonicalize-escape-url"
" %s %s", master, slave);
if (!(in = popen(cmd, "r"))) {
@@ -2622,21 +2637,18 @@ gf_cli3_1_gsync_get_pid_file (char *pidfolder, char *pidfile, char *master, char
ptr = fgets(buff, sizeof(buff), in);
if (ptr) {
buff[strlen(buff)-1]='\0'; //strip off \n
- snprintf (buffer, PATH_MAX, "%s/gsync/%s", gl_workdir, buff);
- strncpy (pidfolder, buffer, PATH_MAX);
+ snprintf (pidfolder, PATH_MAX, "%s/gsync/%s", gl_workdir, buff);
} else {
ret = -1;
goto out;
}
memset (buff, 0, PATH_MAX);
- memset (buffer, 0, PATH_MAX);
ptr = fgets(buff, sizeof(buff), in);
if (ptr) {
buff[strlen(buff)-1]='\0'; //strip off \n
- snprintf (buffer, PATH_MAX, "%s/%s.pid", pidfolder, buff);
- strncpy (pidfile, buffer, PATH_MAX);
+ snprintf (prmfile, PATH_MAX, "%s/%s.pid", pidfolder, buff);
}
out:
@@ -2684,19 +2696,19 @@ gf_cli3_1_start_gsync (char *master, char *slave, char *gl_workdir)
int32_t ret = -1;
int32_t status = 0;
char cmd[PATH_MAX] = {0,};
- char pidfile[PATH_MAX] = {0,};
- char pidfolder[PATH_MAX] = {0,};
+ char prmfile[PATH_MAX] = {0,};
+ char *tslash = NULL;
- ret = gf_cli3_1_gsync_get_pid_file (pidfolder, pidfile, master,
- slave, gl_workdir);
+ ret = gf_cli3_1_gsync_get_param_file (prmfile, "pid", master,
+ slave, gl_workdir);
if (ret == -1) {
ret = -1;
gf_log ("", GF_LOG_WARNING, "failed to construct the "
- "pidfile string");
+ "prmfile string");
goto out;
}
- ret = gf_cli3_1_gsync_status (master, slave, pidfile, &status);
+ ret = gf_cli3_1_gsync_status (master, slave, prmfile, &status);
if ((ret == 0 && status == 0)) {
gf_log ("", GF_LOG_WARNING, "gsync %s:%s"
"already started", master, slave);
@@ -2707,19 +2719,24 @@ gf_cli3_1_start_gsync (char *master, char *slave, char *gl_workdir)
goto out;
}
- unlink (pidfile);
+ unlink (prmfile);
- ret = mkdir (pidfolder, 0777);
- if (ret && (errno != EEXIST)) {
- gf_log ("", GF_LOG_DEBUG, "mkdir failed, errno: %d",
- errno);
- goto out;
+ tslash = strrchr(prmfile, '/');
+ if (tslash) {
+ *tslash = '\0';
+ ret = mkdir (prmfile, 0777);
+ if (ret && (errno != EEXIST)) {
+ gf_log ("", GF_LOG_DEBUG, "mkdir failed, errno: %d",
+ errno);
+ goto out;
+ }
+ *tslash = '/';
}
memset (cmd, 0, sizeof (cmd));
ret = snprintf (cmd, PATH_MAX, GSYNCD_PREFIX "/gsyncd -c %s/%s %s %s"
" --config-set pid-file %s", gl_workdir,
- GSYNC_CONF, master, slave, pidfile);
+ GSYNC_CONF, master, slave, prmfile);
if (ret <= 0) {
ret = -1;
gf_log ("", GF_LOG_WARNING, "failed to construct the "
@@ -2728,14 +2745,29 @@ gf_cli3_1_start_gsync (char *master, char *slave, char *gl_workdir)
}
ret = system (cmd);
- if (ret == -1) {
+ if (ret) {
gf_log ("", GF_LOG_WARNING, "failed to set the pid "
"option for %s %s", master, slave);
goto out;
}
+ ret = gf_cli3_1_gsync_get_param_file (prmfile, "status", NULL, NULL, NULL);
+ if (ret != -1)
+ ret = snprintf (cmd, PATH_MAX, GSYNCD_PREFIX "/gsyncd -c %s/%s %s %s"
+ " --config-set state-file %s", gl_workdir,
+ GSYNC_CONF, master, slave, prmfile);
+ if (ret >= PATH_MAX)
+ ret = -1;
+ if (ret != -1)
+ ret = system (cmd) ? -1 : 0;
+ if (ret == -1) {
+ gf_log ("", GF_LOG_WARNING, "failed to set status file "
+ "for %s %s", master, slave);
+ goto out;
+ }
+
memset (cmd, 0, sizeof (cmd));
- ret = snprintf (cmd, PATH_MAX, GSYNCD_PREFIX "/gsyncd -c %s/%s %s %s"
+ ret = snprintf (cmd, PATH_MAX, GSYNCD_PREFIX "/gsyncd --monitor -c %s/%s %s %s"
, gl_workdir, GSYNC_CONF, master, slave);
if (ret <= 0) {
ret = -1;
diff --git a/xlators/features/marker/utils/syncdaemon/Makefile.am b/xlators/features/marker/utils/syncdaemon/Makefile.am
index 03ac976..c900fa9 100644
--- a/xlators/features/marker/utils/syncdaemon/Makefile.am
+++ b/xlators/features/marker/utils/syncdaemon/Makefile.am
@@ -1,5 +1,5 @@
syncdaemondir = $(libexecdir)/glusterfs/python/syncdaemon
-syncdaemon_PYTHON = gconf.py gsyncd.py __init__.py master.py README.md repce.py resource.py configinterface.py syncdutils.py
+syncdaemon_PYTHON = gconf.py gsyncd.py __init__.py master.py README.md repce.py resource.py configinterface.py syncdutils.py monitor.py
CLEANFILES =
diff --git a/xlators/features/marker/utils/syncdaemon/gsyncd.py b/xlators/features/marker/utils/syncdaemon/gsyncd.py
index a992005..fb2fe52 100644
--- a/xlators/features/marker/utils/syncdaemon/gsyncd.py
+++ b/xlators/features/marker/utils/syncdaemon/gsyncd.py
@@ -17,6 +17,7 @@ from errno import EEXIST, ENOENT, EACCES, EAGAIN
from gconf import gconf
from configinterface import GConffile
import resource
+from monitor import monitor
class GLogger(Logger):
@@ -37,12 +38,11 @@ class GLogger(Logger):
@classmethod
def setup(cls, **kw):
- if kw.get('slave'):
- sls = "(slave)"
- else:
- sls = ""
+ lbl = kw.get('label', "")
+ if lbl:
+ lbl = '(' + lbl + ')'
lprm = {'datefmt': "%Y-%m-%d %H:%M:%S",
- 'format': "[%(asctime)s.%(nsecs)d] %(lvlnam)s [%(module)s" + sls + ":%(lineno)s:%(funcName)s] %(ctx)s: %(message)s"}
+ 'format': "[%(asctime)s.%(nsecs)d] %(lvlnam)s [%(module)s" + lbl + ":%(lineno)s:%(funcName)s] %(ctx)s: %(message)s"}
lprm.update(kw)
lvl = kw.get('level', logging.INFO)
lprm['level'] = lvl
@@ -121,7 +121,7 @@ def startup(**kw):
lkw['stream'] = sys.stdout
else:
lkw['filename'] = kw['log_file']
- GLogger.setup(slave=kw.get('slave'), **lkw)
+ GLogger.setup(label=kw.get('label'), **lkw)
def finalize(*a):
if getattr(gconf, 'pid_file', None):
@@ -178,7 +178,9 @@ def main_i():
rconf = {'go_daemon': 'should'}
def store_abs(opt, optstr, val, parser):
- setattr(parser.values, opt.dest, os.path.abspath(val))
+ if val:
+ val = os.path.abspath(val)
+ setattr(parser.values, opt.dest, val)
def store_local(opt, optstr, val, parser):
rconf[opt.dest] = val
def store_local_curry(val):
@@ -190,8 +192,10 @@ def main_i():
op.add_option('--gluster-log-level', metavar='LVL')
op.add_option('-p', '--pid-file', metavar='PIDF', type=str, action='callback', callback=store_abs)
op.add_option('-l', '--log-file', metavar='LOGF', type=str, action='callback', callback=store_abs)
+ op.add_option('--state-file', metavar='STATF', type=str, action='callback', callback=store_abs)
op.add_option('-L', '--log-level', metavar='LVL')
op.add_option('-r', '--remote-gsyncd', metavar='CMD', default=os.path.abspath(sys.argv[0]))
+ op.add_option('--volume-id', metavar='UUID')
op.add_option('-s', '--ssh-command', metavar='CMD', default='ssh')
op.add_option('--rsync-command', metavar='CMD', default='rsync')
op.add_option('--rsync-extra', metavar='ARGS', default='-sS', help=SUPPRESS_HELP)
@@ -201,6 +205,7 @@ def main_i():
op.add_option('-c', '--config-file', metavar='CONF', type=str, action='callback', callback=store_local)
# duh. need to specify dest or value will be mapped to None :S
+ op.add_option('--monitor', dest='monitor', action='callback', callback=store_local_curry(True))
op.add_option('--listen', dest='listen', help=SUPPRESS_HELP, action='callback', callback=store_local_curry(True))
op.add_option('-N', '--no-daemon', dest="go_daemon", action='callback', callback=store_local_curry('dont'))
op.add_option('--debug', dest="go_daemon", action='callback', callback=lambda *a: (store_local_curry('dont')(*a),
@@ -277,6 +282,7 @@ def main_i():
gconf.__dict__.update(defaults.__dict__)
gcnf.update_to(gconf.__dict__)
gconf.__dict__.update(opts.__dict__)
+ gconf.configinterface = gcnf
#normalize loglevel
lvl0 = gconf.log_level
@@ -290,13 +296,25 @@ def main_i():
gconf.log_level = lvl2
go_daemon = rconf['go_daemon']
+ be_monitor = rconf.get('monitor')
- if isinstance(remote, resource.SSH) and go_daemon == 'should':
+ if not be_monitor and isinstance(remote, resource.SSH) and \
+ go_daemon == 'should':
go_daemon = 'postconn'
log_file = None
else:
log_file = gconf.log_file
- startup(go_daemon=go_daemon, log_file=log_file, slave=(not remote))
+ if be_monitor:
+ label = 'monitor'
+ elif remote:
+ #master
+ label = ''
+ else:
+ label = 'slave'
+ startup(go_daemon=go_daemon, log_file=log_file, label=label)
+
+ if be_monitor:
+ return monitor()
logging.info("syncing: %s" % " -> ".join(peers))
if remote:
diff --git a/xlators/features/marker/utils/syncdaemon/master.py b/xlators/features/marker/utils/syncdaemon/master.py
index 2df1470d..87610f3 100644
--- a/xlators/features/marker/utils/syncdaemon/master.py
+++ b/xlators/features/marker/utils/syncdaemon/master.py
@@ -10,6 +10,7 @@ from errno import ENOENT, ENODATA
from threading import Thread, currentThread, Condition, Lock
from gconf import gconf
+from syncdutils import FreeObject
URXTIME = (-1, 0)
@@ -80,7 +81,8 @@ class GMaster(object):
# the authorative (foreign, native) volinfo pair
# which lets us deduce what to do when we refetch
# the volinfos from system
- self.volinfo_state = (None, None)
+ uuid_preset = getattr(gconf, 'volume_id', None)
+ self.volinfo_state = (uuid_preset and {'uuid': uuid_preset}, None)
# the actual volinfo we make use of
self.volinfo = None
@@ -140,14 +142,16 @@ class GMaster(object):
# store the value below "boxed" to emulate proper closures
# (variables of the enclosing scope are available inner functions
# provided they are no reassigned; mutation is OK).
- relax_mismatch = [False]
+ param = FreeObject(relax_mismatch = False, state_change = False)
def select_vi(vi0, vi):
if vi and (not vi0 or vi0['uuid'] == vi['uuid']):
+ if not vi0 and not param.relax_mismatch:
+ param.state_change = True
# valid new value found; for the rest, we are graceful about
# uuid mismatch
- relax_mismatch[0] = True
+ param.relax_mismatch = True
return vi
- if vi0 and vi and vi0['uuid'] != vi['uuid'] and not relax_mismatch[0]:
+ if vi0 and vi and vi0['uuid'] != vi['uuid'] and not param.relax_mismatch:
# uuid mismatch for master candidate, bail out
raise RuntimeError("aborting on uuid change from %s to %s" % \
(vi0['uuid'], vi['uuid']))
@@ -157,7 +161,7 @@ class GMaster(object):
srep = lambda vi: vi and vi['uuid'][0:8]
logging.debug('(%s, %s) << (%s, %s) -> (%s, %s)' % \
tuple(srep(vi) for vi in volinfo_state + volinfo_sys + newstate))
- return newstate
+ return newstate, param.state_change
def crawl(self, path='.', xtl=None):
if path == '.':
@@ -166,11 +170,16 @@ class GMaster(object):
time.sleep(1)
self.start = time.time()
volinfo_sys = self.get_sys_volinfo()
- self.volinfo_state = self.volinfo_state_machine(self.volinfo_state, volinfo_sys)
+ self.volinfo_state, state_change = self.volinfo_state_machine(self.volinfo_state,
+ volinfo_sys)
if self.inter_master:
self.volinfo = volinfo_sys[self.KFGN]
else:
self.volinfo = volinfo_sys[self.KNAT]
+ if state_change:
+ logging.info('new master is %s', self.uuid)
+ if self.inter_master:
+ gconf.configinterface.set('volume_id', self.uuid)
if self.volinfo:
if self.volinfo['retval']:
raise RuntimeError ("master is corrupt")
diff --git a/xlators/features/marker/utils/syncdaemon/monitor.py b/xlators/features/marker/utils/syncdaemon/monitor.py
new file mode 100644
index 0000000..3f327b6
--- /dev/null
+++ b/xlators/features/marker/utils/syncdaemon/monitor.py
@@ -0,0 +1,54 @@
+import os
+import sys
+import time
+import logging
+from gconf import gconf
+from syncdutils import update_file
+
+class Monitor(object):
+    """Supervise a gsyncd worker process and respawn it when it exits.
+
+    The supervision state is logged on every change and, when gconf
+    carries a non-empty state_file attribute, also handed to
+    update_file() for that path.
+    """
+
+    def __init__(self):
+        # last value given to set_state(); None until the first call
+        self.state = None
+
+    def set_state(self, state):
+        """Record a state transition.
+
+        A call with the state that is already current is a no-op.
+        Otherwise the new state is remembered and logged, and if
+        gconf.state_file is set, update_file() is invoked with a writer
+        that emits the state followed by a newline.
+        """
+        if state == self.state:
+            return
+        self.state = state
+        logging.info('new state: %s' % state)
+        if getattr(gconf, 'state_file', None):
+            update_file(gconf.state_file, lambda f: f.write(state + '\n'))
+
+    def monitor(self):
+        """Run the respawn loop.
+
+        The worker command line is the monitor's own sys.argv with
+        '-N', '--no-daemon' and '--monitor' stripped, '-N -p ""'
+        appended and the interpreter's basename prepended as argv[0].
+
+        The loop keeps spawning workers as long as the last one ended
+        with status 0 or 1 (a worker that did not exit normally counts
+        as 1). Any other status ends the loop: the state becomes
+        'inconsistent' and that status is returned.
+        """
+        argv = sys.argv[:]
+        for o in ('-N', '--no-daemon', '--monitor'):
+            while o in argv:
+                argv.remove(o)
+        # NOTE(review): the empty '-p' value presumably keeps the worker
+        # from claiming the monitor's pid file -- confirm against gsyncd.py
+        argv.extend(('-N', '-p', ''))
+        argv.insert(0, os.path.basename(sys.executable))
+
+        self.set_state('starting...')
+        ret = 0
+
+        def nwait(p, o=0):
+            """Wait for pid p with waitpid options o.
+
+            Returns None if waitpid reports no state change (pid 0,
+            possible with WNOHANG), the exit status for a normal exit,
+            and 1 for any other kind of termination.
+            """
+            p2, r = os.waitpid(p, o)
+            if not p2:
+                return
+            if os.WIFEXITED(r):
+                return os.WEXITSTATUS(r)
+            return 1
+
+        while ret in (0, 1):
+            logging.info('-' * 60)
+            logging.info('starting gsyncd worker')
+            cpid = os.spawnv(os.P_NOWAIT, sys.executable, argv)
+            # grace period: a worker still alive after it is deemed healthy
+            time.sleep(60)
+            ret = nwait(cpid, os.WNOHANG)
+            # Compare against None explicitly: a worker that already
+            # exited with status 0 has been reaped by the call above, and
+            # a second waitpid() on it would raise OSError (ECHILD).
+            if ret is None:
+                self.set_state('OK')
+                ret = nwait(cpid)
+            elif ret in (0, 1):
+                self.set_state('faulty')
+                # NOTE(review): nesting reconstructed from a
+                # whitespace-collapsed listing -- confirm this back-off
+                # belongs to the 'faulty' branch only
+                time.sleep(10)
+        self.set_state('inconsistent')
+        return ret
+
+def monitor():
+    """Create a Monitor and return the result of its monitor() loop."""
+    supervisor = Monitor()
+    return supervisor.monitor()
diff --git a/xlators/features/marker/utils/syncdaemon/syncdutils.py b/xlators/features/marker/utils/syncdaemon/syncdutils.py
index 723ab8f..5c17d05 100644
--- a/xlators/features/marker/utils/syncdaemon/syncdutils.py
+++ b/xlators/features/marker/utils/syncdaemon/syncdutils.py
@@ -40,3 +40,11 @@ def update_file(path, updater, merger = lambda f: True):
for fx in (fr, fw):
if fx:
fx.close()
+
+
+class FreeObject(object):
+    """wildcard class for which any attribute can be set
+
+    Every keyword argument passed to the constructor becomes an
+    instance attribute of the same name.
+    """
+
+    def __init__(self, **kw):
+        # items() instead of iteritems(): the latter exists only on
+        # Python 2 dicts, items() iterates the same pairs on 2 and 3
+        for k, v in kw.items():
+            setattr(self, k, v)
diff --git a/xlators/mgmt/glusterd/src/glusterd-op-sm.c b/xlators/mgmt/glusterd/src/glusterd-op-sm.c
index f8e043e..0ae4f93 100644
--- a/xlators/mgmt/glusterd/src/glusterd-op-sm.c
+++ b/xlators/mgmt/glusterd/src/glusterd-op-sm.c
@@ -3599,6 +3599,7 @@ stop_gsync (char *master, char *slave, char **op_errstr)
FILE *file = NULL;
char pidfile[PATH_MAX] = {0,};
char buf [1024] = {0,};
+ int i = 0;
ret = gsync_status (master, slave, &status);
if (ret == 0 && status == -1) {
@@ -3632,14 +3633,25 @@ stop_gsync (char *master, char *slave, char **op_errstr)
ret = read (fileno(file), buf, 1024);
if (ret > 0) {
pid = strtol (buf, NULL, 10);
- ret = kill (pid, SIGTERM);
+ ret = kill (-pid, SIGTERM);
if (ret) {
gf_log ("", GF_LOG_WARNING,
"failed to stop gsyncd");
goto out;
}
- sleep (0.1);
- kill (pid, SIGTERM);
+ for (i = 0; i < 20; i++) {
+ if (gsync_status (master, slave, &status) == -1 ||
+ status == -1) {
+ /* monitor gsyncd is dead but worker may
+ * still be alive, give some more time
+ * before SIGKILL (hack)
+ */
+ sleep (0.05);
+ break;
+ }
+ sleep (0.05);
+ }
+ kill (-pid, SIGKILL);
unlink (pidfile);
}
ret = 0;