summaryrefslogtreecommitdiffstats
path: root/geo-replication
diff options
context:
space:
mode:
author: Kotresh HR <khiremat@redhat.com> 2017-04-04 15:39:46 -0400
committer: Aravinda VK <avishwan@redhat.com> 2017-04-07 02:09:34 -0400
commit: e01025973c73e2bd0eda8cfed22b75617305d740 (patch)
tree: 9afdb57aaf3474a54bc222c657ed3de00f40cf4c /geo-replication
parent: cbcb1d33de8c4bd7250a5c038e8f95456772add1 (diff)
geo-rep: Improve worker log messages
The monitor process expects the worker to establish an SSH tunnel to the slave node and mount the master volume locally within 60 secs, and then to acknowledge the monitor process by closing the feedback fd. If something goes wrong and the worker does not close the feedback fd within 60 secs, the monitor kills the worker. But there was no clue in the log messages about the actual issue. This patch adds log messages that indicate whether the worker is hung during SSH or during the master mount. Change-Id: Id08a12fa6f3bba1d4fe8036728dbc290e6c14c8c BUG: 1261689 Signed-off-by: Kotresh HR <khiremat@redhat.com> Reviewed-on: https://review.gluster.org/16997 Smoke: Gluster Build System <jenkins@build.gluster.org> NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org> CentOS-regression: Gluster Build System <jenkins@build.gluster.org> Reviewed-by: Aravinda VK <avishwan@redhat.com>
Diffstat (limited to 'geo-replication')
-rw-r--r-- geo-replication/syncdaemon/gsyncd.py 1
-rw-r--r-- geo-replication/syncdaemon/monitor.py 9
-rw-r--r-- geo-replication/syncdaemon/resource.py 8
3 files changed, 16 insertions, 2 deletions
diff --git a/geo-replication/syncdaemon/gsyncd.py b/geo-replication/syncdaemon/gsyncd.py
index adc48f146a6..ac39a79128b 100644
--- a/geo-replication/syncdaemon/gsyncd.py
+++ b/geo-replication/syncdaemon/gsyncd.py
@@ -777,6 +777,7 @@ def main_i():
remote.connect_remote(go_daemon='done')
local.connect()
if ffd:
+ logging.info ("Closing feedback fd, waking up the monitor")
os.close(ffd)
local.service_loop(*[r for r in [remote] if r])
diff --git a/geo-replication/syncdaemon/monitor.py b/geo-replication/syncdaemon/monitor.py
index d23d4542fd6..c54c07d600c 100644
--- a/geo-replication/syncdaemon/monitor.py
+++ b/geo-replication/syncdaemon/monitor.py
@@ -369,8 +369,13 @@ class Monitor(object):
time.sleep(1)
else:
- logging.info("worker(%s) not confirmed in %d sec, "
- "aborting it" % (w[0]['dir'], conn_timeout))
+ logging.info("worker(%s) not confirmed in %d sec, aborting it. "
+ "Gsyncd invocation on remote slave via SSH or "
+ "gluster master mount might have hung. Please "
+ "check the above logs for exact issue and check "
+ "master or slave volume for errors. Restarting "
+ "master/slave volume accordingly might help."
+ % (w[0]['dir'], conn_timeout))
errno_wrap(os.kill, [cpid, signal.SIGKILL], [ESRCH])
nwait(apid) # wait for agent
ret = nwait(cpid)
diff --git a/geo-replication/syncdaemon/resource.py b/geo-replication/syncdaemon/resource.py
index 0e718b28344..eb295ad8601 100644
--- a/geo-replication/syncdaemon/resource.py
+++ b/geo-replication/syncdaemon/resource.py
@@ -1452,6 +1452,8 @@ class GLUSTER(AbstractUrl, SlaveLocal, SlaveRemote):
with given backend
"""
+ logging.info ("Mounting gluster volume locally...")
+ t0 = time.time()
label = getattr(gconf, 'mountbroker', None)
if not label and not privileged():
label = syncdutils.getusername()
@@ -1462,6 +1464,8 @@ class GLUSTER(AbstractUrl, SlaveLocal, SlaveRemote):
['log-file=' + gconf.gluster_log_file, 'volfile-server=' +
self.host, 'volfile-id=' + self.volume, 'client-pid=-1']
mounter(params).inhibit(*[l for l in [label] if l])
+ logging.info ("Mounted gluster volume. Time taken: {0:.4f} "
+ "secs".format((time.time() - t0)))
def connect_remote(self, *a, **kw):
sup(self, *a, **kw)
@@ -1723,10 +1727,14 @@ class SSH(AbstractUrl, SlaveRemote):
self.inner_rsc.url)
deferred = go_daemon == 'postconn'
+ logging.info ("Initializing SSH connection between master and slave...")
+ t0 = time.time()
ret = sup(self, gconf.ssh_command.split() +
["-p", str(gconf.ssh_port)] +
gconf.ssh_ctl_args + [self.remote_addr],
slave=self.inner_rsc.url, deferred=deferred)
+ logging.info ("SSH connection between master and slave established. "
+ "Time taken: {0:.4f} secs".format((time.time() - t0)))
if deferred:
# send a message to peer so that we can wait for