summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Aravinda VK <avishwan@redhat.com> 2014-03-19 12:00:44 +0530
committer: Vijay Bellur <vbellur@redhat.com> 2014-03-20 23:25:43 -0700
commit: 3aeb95cfd072e06ed9dfbbb9249837da58c2e56a (patch)
tree: abd3fb7e993b64ae86e4325308cac9a89b6acf86
parent: d4343f0d192862cb378eac13365bcfde31fa1d37 (diff)
geo-rep: Fix ValueError - signal only works in main thread
When a worker process was not confirmed within 60 seconds of starting, the monitor thread was terminated instead of the worker being stopped and restarted. Before terminating, the monitor thread tried to install a SIGTERM signal handler to clean up; signal handlers can only be installed from the main thread, so a ValueError was raised. With this patch the monitor thread is no longer terminated; instead, only the worker is killed and restarted. Change-Id: I14df26c0cc3097af29293c81536c13b86075e28f BUG: 1078068 Signed-off-by: Aravinda VK <avishwan@redhat.com> Reviewed-on: http://review.gluster.org/7294 Reviewed-by: Venky Shankar <vshankar@redhat.com> Reviewed-by: Vijay Bellur <vbellur@redhat.com> Tested-by: Vijay Bellur <vbellur@redhat.com>
-rw-r--r-- geo-replication/syncdaemon/monitor.py 15
1 files changed, 7 insertions, 8 deletions
diff --git a/geo-replication/syncdaemon/monitor.py b/geo-replication/syncdaemon/monitor.py
index 0c3a42f..b0262ee 100644
--- a/geo-replication/syncdaemon/monitor.py
+++ b/geo-replication/syncdaemon/monitor.py
@@ -146,20 +146,20 @@ class Monitor(object):
if so:
ret = nwait(cpid, os.WNOHANG)
if ret != None:
- logging.debug("worker died before establishing connection")
+ logging.info("worker(%s) died before establishing " \
+ "connection" % w[0])
else:
- logging.debug("worker seems to be connected (?? racy check)")
+ logging.debug("worker(%s) connected" % w[0])
while time.time() < t0 + conn_timeout:
ret = nwait(cpid, os.WNOHANG)
if ret != None:
- logging.debug("worker died in startup phase")
+ logging.info("worker(%s) died in startup " \
+ "phase" % w[0])
break
time.sleep(1)
else:
- logging.debug("worker not confirmed in %d sec, aborting it" % \
- conn_timeout)
- self.terminate()
- time.sleep(1)
+ logging.info("worker(%s) not confirmed in %d sec, " \
+ "aborting it" % (w[0], conn_timeout))
os.kill(cpid, signal.SIGKILL)
ret = nwait(cpid)
if ret == None:
@@ -188,7 +188,6 @@ class Monitor(object):
for wx in wspx:
def wmon(w):
cpid, _ = self.monitor(w, argv, cpids)
- terminate()
time.sleep(1)
self.lock.acquire()
for cpid in cpids: