diff options
Diffstat (limited to 'geo-replication')
| -rw-r--r-- | geo-replication/syncdaemon/monitor.py | 16 | 
1 files changed, 12 insertions, 4 deletions
diff --git a/geo-replication/syncdaemon/monitor.py b/geo-replication/syncdaemon/monitor.py index f3700c1a390..0bde216d761 100644 --- a/geo-replication/syncdaemon/monitor.py +++ b/geo-replication/syncdaemon/monitor.py @@ -189,21 +189,22 @@ class Monitor(object):              self.lock.release()              os.close(pw) -            t0 = time.time() -            so = select((pr,), (), (), conn_timeout)[0] -            os.close(pr) -              # close all RPC pipes in monitor              os.close(ra)              os.close(wa)              os.close(rw)              os.close(ww) +            t0 = time.time() +            so = select((pr,), (), (), conn_timeout)[0] +            os.close(pr) +              if so:                  ret = nwait(cpid, os.WNOHANG)                  if ret is not None:                      logging.info("worker(%s) died before establishing "                                   "connection" % w[0]) +                    nwait(apid) #wait for agent                  else:                      logging.debug("worker(%s) connected" % w[0])                      while time.time() < t0 + conn_timeout: @@ -211,15 +212,20 @@ class Monitor(object):                          if ret is not None:                              logging.info("worker(%s) died in startup "                                           "phase" % w[0]) +                            nwait(apid) #wait for agent                              break                          time.sleep(1)              else:                  logging.info("worker(%s) not confirmed in %d sec, "                               "aborting it" % (w[0], conn_timeout))                  os.kill(cpid, signal.SIGKILL) +                nwait(apid) #wait for agent                  ret = nwait(cpid)              if ret is None:                  self.set_state(self.ST_STABLE, w) +                #If worker dies, agent terminates on EOF. +                #So lets wait for agent first. +                nwait(apid)                  ret = nwait(cpid)              if exit_signalled(ret):                  ret = 0 @@ -249,6 +255,8 @@ class Monitor(object):                  self.lock.acquire()                  for cpid in cpids:                      os.kill(cpid, signal.SIGKILL) +                for apid in agents: +                    os.kill(apid, signal.SIGKILL)                  self.lock.release()                  finalize(exval=1)              t = Thread(target=wmon, args=[wx])  | 
