From a694e86cd5340fff1143e2ac55ec908d3ef890b3 Mon Sep 17 00:00:00 2001 From: Aravinda VK Date: Fri, 30 Oct 2015 17:06:58 +0530 Subject: geo-rep: Kill Geo-rep Worker when Agent process dies When Changelog agent process dies, Geo-replication fails to detect and worker will run without respective Changelog agent. Status shows Active/Passive without any progress. With this patch, Worker process gets killed whenever Changelog agent dies. Change-Id: I30b4cc77f924f7e8174b8bfe415ac17f0b3851b4 Signed-off-by: Aravinda VK BUG: 1279362 Reviewed-on: http://review.gluster.org/12485 Tested-by: NetBSD Build System Tested-by: Gluster Build System Reviewed-by: Venky Shankar Reviewed-by: Kotresh HR (cherry picked from commit 5d1ff7efd6ab3bd29a29922a9ea1e1aaf02544ad) Reviewed-on: http://review.gluster.org/12550 --- geo-replication/syncdaemon/changelogagent.py | 2 -- geo-replication/syncdaemon/monitor.py | 54 ++++++++++++++++++++++------ geo-replication/syncdaemon/resource.py | 2 -- 3 files changed, 44 insertions(+), 14 deletions(-) (limited to 'geo-replication') diff --git a/geo-replication/syncdaemon/changelogagent.py b/geo-replication/syncdaemon/changelogagent.py index ad5f69cfb23..731dbd06f57 100644 --- a/geo-replication/syncdaemon/changelogagent.py +++ b/geo-replication/syncdaemon/changelogagent.py @@ -66,8 +66,6 @@ class Changelog(object): class ChangelogAgent(object): def __init__(self, obj, fd_tup): (inf, ouf, rw, ww) = fd_tup.split(',') - os.close(int(rw)) - os.close(int(ww)) repce = RepceServer(obj, int(inf), int(ouf), 1) t = syncdutils.Thread(target=lambda: (repce.service_loop(), syncdutils.finalize())) diff --git a/geo-replication/syncdaemon/monitor.py b/geo-replication/syncdaemon/monitor.py index ba5c8e32514..ecf48c51f7b 100644 --- a/geo-replication/syncdaemon/monitor.py +++ b/geo-replication/syncdaemon/monitor.py @@ -18,7 +18,7 @@ import xml.etree.ElementTree as XET from subprocess import PIPE from resource import Popen, FILE, GLUSTER, SSH from threading import Lock -from errno import EEXIST +from errno import ECHILD import re import random from gconf import gconf @@ -180,10 +180,18 @@ class Monitor(object): ret = 0 def nwait(p, o=0): - p2, r = waitpid(p, o) - if not p2: - return - return r + try: + p2, r = waitpid(p, o) + if not p2: + return + return r + except OSError as e: + # no child process, this happens if the child process + # already died and has been cleaned up + if e.errno == ECHILD: + return -1 + else: + raise def exit_signalled(s): """ child teminated due to receipt of SIGUSR1 """ @@ -232,6 +240,8 @@ class Monitor(object): # spawn the agent process apid = os.fork() if apid == 0: + os.close(rw) + os.close(ww) os.execv(sys.executable, argv + ['--local-path', w[0], '--agent', '--rpc-fd', @@ -241,6 +251,8 @@ class Monitor(object): cpid = os.fork() if cpid == 0: os.close(pr) + os.close(ra) + os.close(wa) os.execv(sys.executable, argv + ['--feedback-fd', str(pw), '--local-path', w[0], '--local-id', @@ -269,30 +281,52 @@ class Monitor(object): if so: ret = nwait(cpid, os.WNOHANG) + ret_agent = nwait(apid, os.WNOHANG) + + if ret_agent is not None: + # Agent is died Kill Worker + logging.info("Changelog Agent died, " + "Aborting Worker(%s)" % w[0]) + os.kill(cpid, signal.SIGKILL) + nwait(cpid) + nwait(apid) + if ret is not None: logging.info("worker(%s) died before establishing " "connection" % w[0]) - nwait(apid) #wait for agent + nwait(apid) # wait for agent else: logging.debug("worker(%s) connected" % w[0]) while time.time() < t0 + conn_timeout: ret = nwait(cpid, os.WNOHANG) + ret_agent = nwait(apid, os.WNOHANG) + if ret is not None: logging.info("worker(%s) died in startup " "phase" % w[0]) - nwait(apid) #wait for agent + nwait(apid) # wait for agent + break + + if ret_agent is not None: + # Agent is died Kill Worker + logging.info("Changelog Agent died, Aborting " + "Worker(%s)" % w[0]) + os.kill(cpid, signal.SIGKILL) + nwait(cpid) + nwait(apid) break + time.sleep(1) else: logging.info("worker(%s) not confirmed in %d sec, " "aborting it" % (w[0], conn_timeout)) os.kill(cpid, signal.SIGKILL) - nwait(apid) #wait for agent + nwait(apid) # wait for agent ret = nwait(cpid) if ret is None: self.status[w[0]].set_worker_status(self.ST_STABLE) - #If worker dies, agent terminates on EOF. - #So lets wait for agent first. + # If worker dies, agent terminates on EOF. + # So lets wait for agent first. nwait(apid) ret = nwait(cpid) if exit_signalled(ret): diff --git a/geo-replication/syncdaemon/resource.py b/geo-replication/syncdaemon/resource.py index c73347aaf17..8869a109cf9 100644 --- a/geo-replication/syncdaemon/resource.py +++ b/geo-replication/syncdaemon/resource.py @@ -1394,8 +1394,6 @@ class GLUSTER(AbstractUrl, SlaveLocal, SlaveRemote): # g3 ==> changelog History changelog_register_failed = False (inf, ouf, ra, wa) = gconf.rpc_fd.split(',') - os.close(int(ra)) - os.close(int(wa)) changelog_agent = RepceClient(int(inf), int(ouf)) status = GeorepStatus(gconf.state_file, gconf.local_path) status.reset_on_worker_start() -- cgit