summaryrefslogtreecommitdiffstats
path: root/geo-replication/syncdaemon/monitor.py
diff options
context:
space:
mode:
authorAravinda VK <avishwan@redhat.com>2017-06-15 18:09:36 +0530
committerAravinda VK <avishwan@redhat.com>2017-06-20 06:00:47 +0000
commit0a8dac38ac4415ea770fb36b34e3c494e8713e6e (patch)
treeb66cd2f3583466bfc2eeb16bdf724e4494cdbc3e /geo-replication/syncdaemon/monitor.py
parent52d0886cfbcdfd69fa0cac0a6d51cd8811d8c6d7 (diff)
geo-rep: Structured log support
Changed all log messages to structured log format Change-Id: Idae25f8b4ad0bbae38f4362cbda7bbf51ce7607b Updates: #240 Signed-off-by: Aravinda VK <avishwan@redhat.com> Reviewed-on: https://review.gluster.org/17551 Smoke: Gluster Build System <jenkins@build.gluster.org> NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org> CentOS-regression: Gluster Build System <jenkins@build.gluster.org> Reviewed-by: Kotresh HR <khiremat@redhat.com>
Diffstat (limited to 'geo-replication/syncdaemon/monitor.py')
-rw-r--r--geo-replication/syncdaemon/monitor.py61
1 files changed, 35 insertions, 26 deletions
diff --git a/geo-replication/syncdaemon/monitor.py b/geo-replication/syncdaemon/monitor.py
index c54c07d600c..b65f1948050 100644
--- a/geo-replication/syncdaemon/monitor.py
+++ b/geo-replication/syncdaemon/monitor.py
@@ -22,7 +22,7 @@ from errno import ECHILD, ESRCH
import re
import random
from gconf import gconf
-from syncdutils import select, waitpid, errno_wrap
+from syncdutils import select, waitpid, errno_wrap, lf
from syncdutils import set_term_handler, is_host_local, GsyncdError
from syncdutils import escape, Thread, finalize, memoize
from syncdutils import gf_event, EVENT_GEOREP_FAULTY
@@ -63,15 +63,17 @@ def get_slave_bricks_status(host, vol):
po.wait()
po.terminate_geterr(fail_on_err=False)
if po.returncode != 0:
- logging.info("Volume status command failed, unable to get "
- "list of up nodes of %s, returning empty list: %s" %
- (vol, po.returncode))
+ logging.info(lf("Volume status command failed, unable to get "
+ "list of up nodes, returning empty list",
+ volume=vol,
+ error=po.returncode))
return []
vi = XET.fromstring(vix)
if vi.find('opRet').text != '0':
- logging.info("Unable to get list of up nodes of %s, "
- "returning empty list: %s" %
- (vol, vi.find('opErrstr').text))
+ logging.info(lf("Unable to get list of up nodes, "
+ "returning empty list",
+ volume=vol,
+ error=vi.find('opErrstr').text))
return []
up_hosts = set()
@@ -81,8 +83,10 @@ def get_slave_bricks_status(host, vol):
if el.find('status').text == '1':
up_hosts.add(el.find('hostname').text)
except (ParseError, AttributeError, ValueError) as e:
- logging.info("Parsing failed to get list of up nodes of %s, "
- "returning empty list: %s" % (vol, e))
+ logging.info(lf("Parsing failed to get list of up nodes, "
+ "returning empty list",
+ volume=vol,
+ error=e))
return list(up_hosts)
@@ -271,8 +275,9 @@ class Monitor(object):
# Spawn the worker and agent in lock to avoid fd leak
self.lock.acquire()
- logging.info('starting gsyncd worker(%s). Slave node: %s' %
- (w[0]['dir'], remote_host))
+ logging.info(lf('starting gsyncd worker',
+ brick=w[0]['dir'],
+ slave_node=remote_host))
# Couple of pipe pairs for RPC communication b/w
# worker and changelog agent.
@@ -336,15 +341,16 @@ class Monitor(object):
if ret_agent is not None:
# Agent is died Kill Worker
- logging.info("Changelog Agent died, "
- "Aborting Worker(%s)" % w[0]['dir'])
+ logging.info(lf("Changelog Agent died, Aborting Worker",
+ brick=w[0]['dir']))
errno_wrap(os.kill, [cpid, signal.SIGKILL], [ESRCH])
nwait(cpid)
nwait(apid)
if ret is not None:
- logging.info("worker(%s) died before establishing "
- "connection" % w[0]['dir'])
+ logging.info(lf("worker died before establishing "
+ "connection",
+ brick=w[0]['dir']))
nwait(apid) # wait for agent
else:
logging.debug("worker(%s) connected" % w[0]['dir'])
@@ -353,15 +359,16 @@ class Monitor(object):
ret_agent = nwait(apid, os.WNOHANG)
if ret is not None:
- logging.info("worker(%s) died in startup "
- "phase" % w[0]['dir'])
+ logging.info(lf("worker died in startup phase",
+ brick=w[0]['dir']))
nwait(apid) # wait for agent
break
if ret_agent is not None:
# Agent is died Kill Worker
- logging.info("Changelog Agent died, Aborting "
- "Worker(%s)" % w[0]['dir'])
+ logging.info(lf("Changelog Agent died, Aborting "
+ "Worker",
+ brick=w[0]['dir']))
errno_wrap(os.kill, [cpid, signal.SIGKILL], [ESRCH])
nwait(cpid)
nwait(apid)
@@ -369,13 +376,15 @@ class Monitor(object):
time.sleep(1)
else:
- logging.info("worker(%s) not confirmed in %d sec, aborting it. "
- "Gsyncd invocation on remote slave via SSH or "
- "gluster master mount might have hung. Please "
- "check the above logs for exact issue and check "
- "master or slave volume for errors. Restarting "
- "master/slave volume accordingly might help."
- % (w[0]['dir'], conn_timeout))
+ logging.info(
+ lf("Worker not confirmed after wait, aborting it. "
+ "Gsyncd invocation on remote slave via SSH or "
+ "gluster master mount might have hung. Please "
+ "check the above logs for exact issue and check "
+ "master or slave volume for errors. Restarting "
+ "master/slave volume accordingly might help.",
+ brick=w[0]['dir'],
+ timeout=conn_timeout))
errno_wrap(os.kill, [cpid, signal.SIGKILL], [ESRCH])
nwait(apid) # wait for agent
ret = nwait(cpid)