summaryrefslogtreecommitdiffstats
path: root/xlators
diff options
context:
space:
mode:
authorVenky Shankar <venky@gluster.com>2011-10-24 15:29:00 +0530
committerVijay Bellur <vijay@gluster.com>2011-11-04 02:06:44 -0700
commit8e171a03d81452a2998b01997da7647c0b3bf52b (patch)
tree54117831ec7c55fe0b1bb67f9ec227c5fe2059c6 /xlators
parentcc0a908bc5d5046bcb2909fc98765038f15e918d (diff)
cli: add geo-replication log-rotate command
Rotating geo-replication master/monitor log files from cli. On invocation, the log file for a given master-slave session is backed up with the current timestamp suffixed to the file name and signal is sent to gsyncd to start logging to a new log file. Sample commands: * Rotate log file for this <master>:<slave> session: gluster volume geo-replication <master> <slave> log-rotate * Rotate log files for all session for master volume <master> gluster volume geo-replication <master> log-rotate * Rotate log files for all sessions: gluster volume geo-replication log-rotate Change-Id: If801743e9f37bd282e68d262203141626ce77e55 BUG: 3519 Reviewed-on: http://review.gluster.com/653 Tested-by: Gluster Build System <jenkins@build.gluster.com> Reviewed-by: Csaba Henk <csaba@gluster.com>
Diffstat (limited to 'xlators')
-rw-r--r--xlators/features/marker/utils/syncdaemon/gconf.py1
-rw-r--r--xlators/features/marker/utils/syncdaemon/gsyncd.py7
-rw-r--r--xlators/features/marker/utils/syncdaemon/monitor.py41
-rw-r--r--xlators/features/marker/utils/syncdaemon/repce.py5
-rw-r--r--xlators/features/marker/utils/syncdaemon/resource.py6
-rw-r--r--xlators/features/marker/utils/syncdaemon/syncdutils.py25
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-handler.c3
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-op-sm.c359
8 files changed, 428 insertions, 19 deletions
diff --git a/xlators/features/marker/utils/syncdaemon/gconf.py b/xlators/features/marker/utils/syncdaemon/gconf.py
index 24165b6191a..803ca2acdde 100644
--- a/xlators/features/marker/utils/syncdaemon/gconf.py
+++ b/xlators/features/marker/utils/syncdaemon/gconf.py
@@ -6,6 +6,7 @@ class GConf(object):
cpid = None
pid_file_owned = False
permanent_handles = []
+ log_metadata = {}
@classmethod
def setup_ssh_ctl(cls, ctld):
diff --git a/xlators/features/marker/utils/syncdaemon/gsyncd.py b/xlators/features/marker/utils/syncdaemon/gsyncd.py
index 193af9d5f37..99e2204b2e4 100644
--- a/xlators/features/marker/utils/syncdaemon/gsyncd.py
+++ b/xlators/features/marker/utils/syncdaemon/gsyncd.py
@@ -6,7 +6,6 @@ import sys
import time
import logging
import signal
-import select
import optparse
import fcntl
from optparse import OptionParser, SUPPRESS_HELP
@@ -15,6 +14,7 @@ from errno import EEXIST, ENOENT
from gconf import gconf
from syncdutils import FreeObject, norm, grabpidfile, finalize, log_raise_exception
+from syncdutils import select
from configinterface import GConffile
import resource
from monitor import monitor
@@ -78,7 +78,7 @@ def startup(**kw):
# so we can start up with
# no messing from the dirty
# ol' bustard
- select.select((x,), (), ())
+ select((x,), (), ())
os.close(x)
lkw = {}
@@ -93,6 +93,9 @@ def startup(**kw):
lkw['filename'] = kw['log_file']
GLogger.setup(label=kw.get('label'), **lkw)
+ lkw.update({'saved_label': kw.get('label')})
+ gconf.log_metadata = lkw
+
def main():
signal.signal(signal.SIGTERM, lambda *a: finalize(*a, **{'exval': 1}))
GLogger.setup()
diff --git a/xlators/features/marker/utils/syncdaemon/monitor.py b/xlators/features/marker/utils/syncdaemon/monitor.py
index 365e91435fd..f58aa592c9f 100644
--- a/xlators/features/marker/utils/syncdaemon/monitor.py
+++ b/xlators/features/marker/utils/syncdaemon/monitor.py
@@ -1,11 +1,10 @@
import os
import sys
import time
+import signal
import logging
-import select
-from signal import SIGKILL
from gconf import gconf
-from syncdutils import update_file
+from syncdutils import update_file, select, waitpid
class Monitor(object):
@@ -21,6 +20,19 @@ class Monitor(object):
update_file(gconf.state_file, lambda f: f.write(state + '\n'))
def monitor(self):
+ def sigcont_handler(*a):
+ """
+ Re-init logging and send group kill signal
+ """
+ md = gconf.log_metadata
+ logging.shutdown()
+ lcls = logging.getLoggerClass()
+ lcls.setup(label=md.get('saved_label'), **md)
+ pid = os.getpid()
+ os.kill(-pid, signal.SIGUSR1)
+ signal.signal(signal.SIGUSR1, lambda *a: ())
+ signal.signal(signal.SIGCONT, sigcont_handler)
+
argv = sys.argv[:]
for o in ('-N', '--no-daemon', '--monitor'):
while o in argv:
@@ -31,11 +43,16 @@ class Monitor(object):
self.set_state('starting...')
ret = 0
def nwait(p, o=0):
- p2, r = os.waitpid(p, o)
+ p2, r = waitpid(p, o)
if not p2:
return
- if os.WIFEXITED(r):
- return os.WEXITSTATUS(r)
+ return r
+ def exit_signalled(s):
+ """ child teminated due to receipt of SIGUSR1 """
+ return (os.WIFSIGNALED(s) and (os.WTERMSIG(s) == signal.SIGUSR1))
+ def exit_status(s):
+ if os.WIFEXITED(s):
+ return os.WEXITSTATUS(s)
return 1
conn_timeout = 60
while ret in (0, 1):
@@ -48,7 +65,7 @@ class Monitor(object):
os.execv(sys.executable, argv + ['--feedback-fd', str(pw)])
os.close(pw)
t0 = time.time()
- so = select.select((pr,), (), (), conn_timeout)[0]
+ so = select((pr,), (), (), conn_timeout)[0]
os.close(pr)
if so:
ret = nwait(cpid, os.WNOHANG)
@@ -65,13 +82,17 @@ class Monitor(object):
else:
logging.debug("worker not confirmed in %d sec, aborting it" % \
conn_timeout)
- os.kill(cpid, SIGKILL)
+ os.kill(cpid, signal.SIGKILL)
ret = nwait(cpid)
if ret == None:
self.set_state('OK')
ret = nwait(cpid)
- elif ret in (0, 1):
- self.set_state('faulty')
+ if exit_signalled(ret):
+ ret = 0
+ else:
+ ret = exit_status(ret)
+ if ret in (0,1):
+ self.set_state('faulty')
time.sleep(10)
self.set_state('inconsistent')
return ret
diff --git a/xlators/features/marker/utils/syncdaemon/repce.py b/xlators/features/marker/utils/syncdaemon/repce.py
index 47691301e29..02833b0382b 100644
--- a/xlators/features/marker/utils/syncdaemon/repce.py
+++ b/xlators/features/marker/utils/syncdaemon/repce.py
@@ -1,6 +1,5 @@
import os
import sys
-import select
import time
import logging
from threading import Condition
@@ -20,7 +19,7 @@ except ImportError:
# py 3
import pickle
-from syncdutils import Thread
+from syncdutils import Thread, select
pickle_proto = -1
repce_version = 1.0
@@ -114,7 +113,7 @@ class RepceClient(object):
def listen(self):
while True:
- select.select((self.inf,), (), ())
+ select((self.inf,), (), ())
rid, exc, res = recv(self.inf)
rjob = self.jtab.pop(rid)
if rjob.cbk:
diff --git a/xlators/features/marker/utils/syncdaemon/resource.py b/xlators/features/marker/utils/syncdaemon/resource.py
index 800d297bacd..5f46edd44cf 100644
--- a/xlators/features/marker/utils/syncdaemon/resource.py
+++ b/xlators/features/marker/utils/syncdaemon/resource.py
@@ -6,7 +6,6 @@ import stat
import time
import errno
import struct
-import select
import socket
import logging
import tempfile
@@ -17,6 +16,7 @@ import repce
from repce import RepceServer, RepceClient
from master import GMaster
import syncdutils
+from syncdutils import select
UrlRX = re.compile('\A(\w+)://(.*)')
HostRX = re.compile('[a-z\d](?:[a-z\d.-]*[a-z\d])?', re.I)
@@ -206,7 +206,7 @@ class SlaveLocal(object):
logging.info("connection inactive for %d seconds, stopping" % int(gconf.timeout))
break
else:
- select.select((), (), ())
+ select((), (), ())
class SlaveRemote(object):
@@ -455,7 +455,7 @@ class SSH(AbstractUrl, SlaveRemote):
i, o = ret
inf = os.fdopen(i)
repce.send(o, None, '__repce_version__')
- select.select((inf,), (), ())
+ select((inf,), (), ())
repce.recv(inf)
# hack hack hack: store a global reference to the file
# to save it from getting GC'd which implies closing it
diff --git a/xlators/features/marker/utils/syncdaemon/syncdutils.py b/xlators/features/marker/utils/syncdaemon/syncdutils.py
index 4bf51da746e..81c4e2e4be5 100644
--- a/xlators/features/marker/utils/syncdaemon/syncdutils.py
+++ b/xlators/features/marker/utils/syncdaemon/syncdutils.py
@@ -5,9 +5,11 @@ import fcntl
import shutil
import logging
from threading import Lock, Thread as baseThread
-from errno import EACCES, EAGAIN
+from errno import EACCES, EAGAIN, EINTR
from signal import SIGTERM, SIGKILL
from time import sleep
+import select as oselect
+from os import waitpid as owaitpid
from gconf import gconf
@@ -158,3 +160,24 @@ class Thread(baseThread):
kw['target'] = twrap
baseThread.__init__(self, *a, **kw)
self.setDaemon(True)
+
+class GsyncdError(Exception):
+ pass
+
+def eintr_wrap(func, exc, *a):
+ """
+ wrapper around syscalls resilient to interrupt caused
+ by signals
+ """
+ while True:
+ try:
+ return func(*a)
+ except exc, ex:
+ if not ex[0] == EINTR:
+ raise GsyncdError(ex[1])
+
+def select(*a):
+ return eintr_wrap(oselect.select, oselect.error, *a)
+
+def waitpid(*a):
+ return eintr_wrap(owaitpid, OSError, *a)
diff --git a/xlators/mgmt/glusterd/src/glusterd-handler.c b/xlators/mgmt/glusterd/src/glusterd-handler.c
index f072a70047e..e9d3b2400f1 100644
--- a/xlators/mgmt/glusterd/src/glusterd-handler.c
+++ b/xlators/mgmt/glusterd/src/glusterd-handler.c
@@ -1761,6 +1761,9 @@ glusterd_handle_gsync_set (rpcsvc_request_t *req)
case GF_GSYNC_OPTION_TYPE_STATUS:
strncpy (operation, "status", sizeof (operation));
break;
+ case GF_GSYNC_OPTION_TYPE_ROTATE:
+ strncpy (operation, "rotate", sizeof(operation));
+ break;
}
gf_cmd_log ("volume "GEOREP, " %s command on %s,%s", operation, master,
diff --git a/xlators/mgmt/glusterd/src/glusterd-op-sm.c b/xlators/mgmt/glusterd/src/glusterd-op-sm.c
index 9db4c2c1949..82eb6b05d6d 100644
--- a/xlators/mgmt/glusterd/src/glusterd-op-sm.c
+++ b/xlators/mgmt/glusterd/src/glusterd-op-sm.c
@@ -2364,6 +2364,11 @@ glusterd_op_stage_gsync_set (dict_t *dict, char **op_errstr)
ret = gsync_verify_config_options (dict, op_errstr);
goto out;
+
+ case GF_GSYNC_OPTION_TYPE_ROTATE:
+ /* checks same as status mode */
+ ret = glusterd_verify_gsync_status_opts(dict, op_errstr);
+ goto out;
}
ret = glusterd_op_gsync_args_get (dict, op_errstr, &volname, &slave);
@@ -4372,8 +4377,358 @@ glusterd_get_gsync_status (dict_t *dict, char **op_errstr, dict_t *rsp_dict)
out:
gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
return ret;
+}
+
+int
+glusterd_send_sigstop (pid_t pid)
+{
+ int ret = 0;
+ ret = kill (pid, SIGSTOP);
+ if (ret)
+ gf_log ("", GF_LOG_ERROR, GEOREP"failed to send SIGSTOP signal");
+ return ret;
+}
+
+int
+glusterd_send_sigcont (pid_t pid)
+{
+ int ret = 0;
+ ret = kill (pid, SIGCONT);
+ if (ret)
+ gf_log ("", GF_LOG_ERROR, GEOREP"failed to send SIGCONT signal");
+ return ret;
+}
+
+/*
+ * Log rotations flow is something like this:
+ * - Send SIGSTOP to process group (this will stop monitor/worker process
+ * and also the slave if it's local)
+ * - Rotate log file for monitor/worker
+ * - Rotate log file for slave if it's local
+ * - Send SIGCONT to the process group. Monitor wakes up, kills the worker
+ * (this is done in the SIGCONT handler), which results in the termination
+ * of the slave (local/remote). After returning from signal handler,
+ * monitor detects absence of worker and starts it again, which in-turn
+ * starts the slave.
+ */
+int
+glusterd_send_log_rotate_signal (pid_t pid, char *logfile1, char *logfile2)
+{
+ int ret = 0;
+ struct stat stbuf = {0,};
+ char rlogfile[PATH_MAX] = {0,};
+ time_t rottime = 0;
+
+ ret = glusterd_send_sigstop (-pid);
+ rottime = time (NULL);
+
+ snprintf (rlogfile, sizeof (rlogfile), "%s.%"PRIu64, logfile1,
+ (uint64_t) rottime);
+ ret = rename (logfile1, rlogfile);
+ if (ret)
+ gf_log ("", GF_LOG_ERROR, "rename failed for geo-rep log file");
+
+ snprintf (rlogfile, sizeof (rlogfile), "%s.%"PRIu64, logfile2,
+ (uint64_t) rottime);
+ ret = stat (logfile2, &stbuf);
+ if (ret) {
+ if (errno != ENOENT)
+ gf_log("", GF_LOG_ERROR, "stat failed for slave log"
+ " file: %s", logfile2);
+ else {
+ gf_log ("", GF_LOG_DEBUG, "Slave is not local, skipping rotation");
+ ret = 0;
+ }
+ goto out;
+ }
+
+ ret = rename (logfile2, rlogfile);
+ if (ret)
+ gf_log ("", GF_LOG_ERROR, "rename failed for geo-rep slave log file");
+
+ out:
+ ret = glusterd_send_sigcont (-pid);
+ return ret;
+}
+
+int
+glusterd_gsync_get_session_owner (char *master, char *slave, char *session_owner,
+ char *gl_workdir)
+{
+ char cmd[PATH_MAX] = {0,};
+
+ snprintf (cmd, PATH_MAX,
+ GSYNCD_PREFIX"/gsyncd -c %s/"GSYNC_CONF" :%s %s --config-get session-owner",
+ gl_workdir, master, slave);
+
+ return glusterd_query_extutil (session_owner, cmd);
+}
+
+int
+glusterd_gsync_get_slave_log_file (char *master, char *slave, char *log_file)
+{
+ int ret = -1;
+ char uuid_str[64] = {0,};
+ char cmd[PATH_MAX] = {0,};
+ glusterd_conf_t *priv = NULL;
+ char *gl_workdir = NULL;
+
+ GF_ASSERT(THIS);
+ GF_ASSERT(THIS->private);
+
+ priv = THIS->private;
+
+ GF_VALIDATE_OR_GOTO("gsyncd", master, out);
+ GF_VALIDATE_OR_GOTO("gsyncd", slave, out);
+
+ gl_workdir = priv->workdir;
+
+ /* get the session owner for the master-slave session */
+ ret = glusterd_gsync_get_session_owner (master, slave, uuid_str,
+ gl_workdir);
+
+ if (ret)
+ goto out;
+
+ /* get the log file for the slave */
+ snprintf (cmd, PATH_MAX,
+ GSYNCD_PREFIX"/gsyncd -c %s/"GSYNC_CONF" --session-owner=%s %s --config-get log-file",
+ gl_workdir, uuid_str, slave);
+ ret = glusterd_query_extutil (log_file, cmd);
+
+ out:
+ return ret;
+}
+static int
+glusterd_gsyncd_getlogfile (char *master, char *slave, char *log_file)
+{
+ int ret = -1;
+ glusterd_conf_t *priv = NULL;
+
+ GF_ASSERT (THIS);
+ GF_ASSERT (THIS->private);
+ priv = THIS->private;
+
+ GF_VALIDATE_OR_GOTO ("gsync", master, out);
+ GF_VALIDATE_OR_GOTO ("gsync", slave, out);
+
+ ret = glusterd_gsync_get_param_file (log_file, "log", master,
+ slave, priv->workdir);
+
+ if (ret == -1) {
+ ret = -2;
+ gf_log ("", GF_LOG_WARNING, "failed to gsyncd logfile");
+ }
+
+ out:
+ return ret;
+}
+
+int
+glusterd_get_pid_from_file (char *master, char *slave, pid_t *pid)
+{
+ int ret = -1;
+ int pfd = 0;
+ char pidfile[PATH_MAX] = {0,};
+ char buff[1024] = {0,};
+
+ pfd = gsyncd_getpidfile (master, slave, pidfile);
+
+ if (pfd == -2) {
+ gf_log ("", GF_LOG_ERROR, GEOREP" log-rotate validation "
+ " failed for %s & %s", master, slave);
+ goto out;
+ }
+
+ if (gsync_status_byfd (pfd) == -1) {
+ gf_log ("", GF_LOG_ERROR, "gsyncd b/w %s & %s is not"
+ " running", master, slave);
+ goto out;
+ }
+
+ ret = read (pfd, buff, 1024);
+ if (ret < 0) {
+ gf_log ("", GF_LOG_ERROR, GEOREP" cannot read pid from pid-file");
+ goto out;
+ }
+
+ close(pfd);
+
+ *pid = strtol (buff, NULL, 10);
+ ret = 0;
+
+ out:
+ return ret;
+}
+
+int
+glusterd_do_gsync_log_rotate (char *master, char *slave, uuid_t *uuid, char **op_errstr)
+{
+ int ret = 0;
+ glusterd_conf_t *priv = NULL;
+ pid_t pid = 0;
+ char log_file1[PATH_MAX] = {0,};
+ char log_file2[PATH_MAX] = {0,};
+
+ GF_ASSERT (THIS);
+ GF_ASSERT (THIS->private);
+
+ priv = THIS->private;
+
+ ret = glusterd_get_pid_from_file (master, slave, &pid);
+ if (ret)
+ goto out;
+
+ /* log file */
+ ret = glusterd_gsyncd_getlogfile (master, slave, log_file1);
+ if (ret)
+ goto out;
+
+ /* slave log file */
+ ret = glusterd_gsync_get_slave_log_file (master, slave, log_file2);
+ if (ret)
+ goto out;
+
+ ret = glusterd_send_log_rotate_signal (pid, log_file1, log_file2);
+
+ out:
+ if (ret && op_errstr)
+ *op_errstr = gf_strdup("Error rotating log file");
+ return ret;
+}
+
+int
+glusterd_do_gsync_log_rotation_mst_slv (glusterd_volinfo_t *volinfo, char *slave,
+ char **op_errstr)
+{
+ uuid_t uuid = {0, };
+ glusterd_conf_t *priv = NULL;
+ int ret = 0;
+ char errmsg[1024] = {0,};
+
+ GF_ASSERT (volinfo);
+ GF_ASSERT (slave);
+ GF_ASSERT (THIS);
+ GF_ASSERT (THIS->private);
+
+ priv = THIS->private;
+
+ ret = glusterd_gsync_get_uuid (slave, volinfo, uuid);
+ if ((ret == 0) && (uuid_compare (priv->uuid, uuid) != 0))
+ goto out;
+
+ if (ret) {
+ snprintf(errmsg, sizeof(errmsg), "geo-replication session b/w %s %s not active",
+ volinfo->volname, slave);
+ gf_log ("", GF_LOG_WARNING, errmsg);
+ if (op_errstr)
+ *op_errstr = gf_strdup(errmsg);
+ goto out;
+ }
+
+ ret = glusterd_do_gsync_log_rotate (volinfo->volname, slave, &uuid, op_errstr);
+
+ out:
+ gf_log ("", GF_LOG_DEBUG, "Returning with %d", ret);
+ return ret;
+}
+
+static void
+_iterate_log_rotate_mst_slv (dict_t *this, char *key, data_t *value, void *data)
+{
+ glusterd_gsync_status_temp_t *param = NULL;
+ char *slave = NULL;
+
+ param = (glusterd_gsync_status_temp_t *) data;
+
+ GF_ASSERT (param);
+ GF_ASSERT (param->volinfo);
+
+ slave = strchr (value->data, ':');
+ if (slave)
+ slave++;
+ else {
+ gf_log ("", GF_LOG_ERROR, "geo-replication log-rotate: slave (%s) "
+ "not comfirming to format", slave);
+ return;
+ }
+
+ (void) glusterd_do_gsync_log_rotation_mst_slv (param->volinfo, slave, NULL);
+}
+
+int
+glusterd_do_gsync_log_rotation_mst (glusterd_volinfo_t *volinfo)
+{
+ glusterd_gsync_status_temp_t param = {0, };
+
+ GF_ASSERT (volinfo);
+
+ param.volinfo = volinfo;
+ dict_foreach (volinfo->gsync_slaves, _iterate_log_rotate_mst_slv, &param);
+ return 0;
+}
+
+static int
+glusterd_rotate_gsync_all ()
+{
+ int32_t ret = 0;
+ glusterd_conf_t *priv = NULL;
+ glusterd_volinfo_t *volinfo = NULL;
+
+ GF_ASSERT (THIS);
+ priv = THIS->private;
+ GF_ASSERT (priv);
+
+ list_for_each_entry (volinfo, &priv->volumes, vol_list) {
+ ret = glusterd_do_gsync_log_rotation_mst (volinfo);
+ if (ret)
+ goto out;
+ }
+
+ out:
+ gf_log ("", GF_LOG_DEBUG, "Returning with %d", ret);
+ return ret;
+}
+
+static int
+glusterd_rotate_gsync_logs (dict_t *dict, char **op_errstr, dict_t *rsp_dict)
+{
+ char *slave = NULL;
+ char *volname = NULL;
+ char errmsg[1024] = {0,};
+ gf_boolean_t exists = _gf_false;
+ glusterd_volinfo_t *volinfo = NULL;
+ int ret = 0;
+
+ ret = dict_get_str (dict, "master", &volname);
+ if (ret < 0) {
+ ret = glusterd_rotate_gsync_all ();
+ goto out;
+ }
+
+ exists = glusterd_check_volume_exists (volname);
+ ret = glusterd_volinfo_find (volname, &volinfo);
+ if ((ret) || (!exists)) {
+ snprintf (errmsg, sizeof(errmsg), "Volume %s does not"
+ " exist", volname);
+ gf_log ("", GF_LOG_WARNING, errmsg);
+ *op_errstr = gf_strdup (errmsg);
+ ret = -1;
+ goto out;
+ }
+
+ ret = dict_get_str (dict, "slave", &slave);
+ if (ret < 0) {
+ ret = glusterd_do_gsync_log_rotation_mst (volinfo);
+ goto out;
+ }
+
+ ret = glusterd_do_gsync_log_rotation_mst_slv (volinfo, slave, op_errstr);
+
+ out:
+ return ret;
}
@@ -4412,7 +4767,11 @@ glusterd_op_gsync_set (dict_t *dict, char **op_errstr, dict_t *rsp_dict)
ret = glusterd_get_gsync_status (dict, op_errstr, resp_dict);
goto out;
+ }
+ if (type == GF_GSYNC_OPTION_TYPE_ROTATE) {
+ ret = glusterd_rotate_gsync_logs (dict, op_errstr, resp_dict);
+ goto out;
}
ret = dict_get_str (dict, "slave", &slave);