From 853a90f9d7399e4afdb685946a809e9dd30a1b98 Mon Sep 17 00:00:00 2001 From: Aravinda VK Date: Wed, 19 Mar 2014 12:00:44 +0530 Subject: geo-rep: Fix ValueError - signal only works in main thread When a worker process was not confirmed within 60 seconds of its start, the monitor thread was terminated instead of stopping and restarting the worker. Before terminating, the monitor thread tried to add a signal handler for SIGTERM in order to clean up. Signal handlers cannot be installed from inside a non-main thread, so a ValueError was raised. This patch no longer terminates the monitor thread; instead it only kills and restarts the worker. BUG: 1078068 Change-Id: Icf0df7ef492da636d0d20e42750747e404d897df Signed-off-by: Aravinda VK Reviewed-on: http://review.gluster.org/7294 Reviewed-on: http://review.gluster.org/7313 Tested-by: Gluster Build System Reviewed-by: Vijay Bellur --- geo-replication/syncdaemon/monitor.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'geo-replication') diff --git a/geo-replication/syncdaemon/monitor.py b/geo-replication/syncdaemon/monitor.py index 0c3a42fa6..b0262ee30 100644 --- a/geo-replication/syncdaemon/monitor.py +++ b/geo-replication/syncdaemon/monitor.py @@ -146,20 +146,20 @@ class Monitor(object): if so: ret = nwait(cpid, os.WNOHANG) if ret != None: - logging.debug("worker died before establishing connection") + logging.info("worker(%s) died before establishing " \ + "connection" % w[0]) else: - logging.debug("worker seems to be connected (?? 
racy check)") + logging.debug("worker(%s) connected" % w[0]) while time.time() < t0 + conn_timeout: ret = nwait(cpid, os.WNOHANG) if ret != None: - logging.debug("worker died in startup phase") + logging.info("worker(%s) died in startup " \ + "phase" % w[0]) break time.sleep(1) else: - logging.debug("worker not confirmed in %d sec, aborting it" % \ - conn_timeout) - self.terminate() - time.sleep(1) + logging.info("worker(%s) not confirmed in %d sec, " \ + "aborting it" % (w[0], conn_timeout)) os.kill(cpid, signal.SIGKILL) ret = nwait(cpid) if ret == None: @@ -188,7 +188,6 @@ class Monitor(object): for wx in wspx: def wmon(w): cpid, _ = self.monitor(w, argv, cpids) - terminate() time.sleep(1) self.lock.acquire() for cpid in cpids: -- cgit