From 8883c12216cc0c0770a4207e1e2a62fa16dc1528 Mon Sep 17 00:00:00 2001 From: Aravinda VK Date: Wed, 2 Dec 2015 19:37:55 +0530 Subject: geo-rep: Handling Rsync/Tar errors efficiently Geo-rep processes Changelogs in Batch, if one file in batch fails with rsync error that Changelog file is reprocessed multiple times. After MAX_RETRY, it logs all the GFIDs from that batch as Skipped. This patch addresses following issues, 1. When Rsync/Tar fails do not parse Changelog again for retry 2. When Rsync/Tar fails do not replay Entry operations, only retry rsync/tar for those GFIDs 3. Log Error in Rsync/Tar only in the last Retry 4. Do not log Skipped GFIDs since Rsync/Tar errors are logged for only failed files. 5. Changed Entry failures as Error instead of Warning BUG: 1313309 Change-Id: Ie134ce2572693056ab9b9008cd8aa5b5d87f7975 Signed-off-by: Aravinda VK Reviewed-on: http://review.gluster.org/12856 Reviewed-by: Kotresh HR Reviewed-by: Saravanakumar Arumugam CentOS-regression: Gluster Build System Smoke: Gluster Build System NetBSD-regression: NetBSD Build System Reviewed-by: Venky Shankar (cherry picked from commit d136a789258e8f600e536717da156a242d8ed9a5) Reviewed-on: http://review.gluster.org/13558 --- geo-replication/syncdaemon/resource.py | 49 ++++++++++++++++++---------------- 1 file changed, 26 insertions(+), 23 deletions(-) (limited to 'geo-replication/syncdaemon/resource.py') diff --git a/geo-replication/syncdaemon/resource.py b/geo-replication/syncdaemon/resource.py index 19363401e65..ac697eb39ed 100644 --- a/geo-replication/syncdaemon/resource.py +++ b/geo-replication/syncdaemon/resource.py @@ -906,7 +906,7 @@ class SlaveRemote(object): "RePCe major version mismatch: local %s, remote %s" % (exrv, rv)) - def rsync(self, files, *args): + def rsync(self, files, *args, **kw): """invoke rsync""" if not files: raise GsyncdError("no files to sync") @@ -932,12 +932,15 @@ class SlaveRemote(object): po.stdin.write(f) po.stdin.write('\0') - po.stdin.close() + stdout, stderr = po.communicate() + + if kw.get("log_err", False): + for errline in stderr.strip().split("\n")[:-1]: + logging.error("SYNC Error(Rsync): %s" % errline) if gconf.log_rsync_performance: - out = po.stdout.read() rsync_msg = [] - for line in out.split("\n"): + for line in stdout.split("\n"): if line.startswith("Number of files:") or \ line.startswith("Number of regular files transferred:") or \ line.startswith("Total file size:") or \ @@ -949,12 +952,10 @@ class SlaveRemote(object): line.startswith("sent "): rsync_msg.append(line) logging.info("rsync performance: %s" % ", ".join(rsync_msg)) - po.wait() - po.terminate_geterr(fail_on_err=False) return po - def tarssh(self, files, slaveurl): + def tarssh(self, files, slaveurl, log_err=False): """invoke tar+ssh -z (compress) can be use if needed, but omitting it now as it results in weird error (tar+ssh errors out (errcode: 2) @@ -975,15 +976,16 @@ class SlaveRemote(object): for f in files: p0.stdin.write(f) p0.stdin.write('\n') - p0.stdin.close() - # wait() for tar to terminate, collecting any errors, further + p0.stdin.close() + p0.stdout.close() # Allow p0 to receive a SIGPIPE if p1 exits. + # wait for tar to terminate, collecting any errors, further # waiting for transfer to complete - p0.wait() - p0.terminate_geterr(fail_on_err=False) + _, stderr1 = p1.communicate() - p1.wait() - p1.terminate_geterr(fail_on_err=False) + if log_err: + for errline in stderr1.strip().split("\n")[:-1]: + logging.error("SYNC Error(Untar): %s" % errline) return p1 @@ -1045,8 +1047,8 @@ class FILE(AbstractUrl, SlaveLocal, SlaveRemote): """inhibit the resource beyond""" os.chdir(self.path) - def rsync(self, files): - return sup(self, files, self.path) + def rsync(self, files, log_err=False): + return sup(self, files, self.path, log_err=log_err) class GLUSTER(AbstractUrl, SlaveLocal, SlaveRemote): @@ -1460,11 +1462,11 @@ class GLUSTER(AbstractUrl, SlaveLocal, SlaveRemote): else: sup(self, *args) - def rsync(self, files): - return sup(self, files, self.slavedir) + def rsync(self, files, log_err=False): + return sup(self, files, self.slavedir, log_err=log_err) - def tarssh(self, files): - return sup(self, files, self.slavedir) + def tarssh(self, files, log_err=False): + return sup(self, files, self.slavedir, log_err=log_err) class SSH(AbstractUrl, SlaveRemote): @@ -1571,12 +1573,13 @@ class SSH(AbstractUrl, SlaveRemote): self.fd_pair = (i, o) return 'should' - def rsync(self, files): + def rsync(self, files, log_err=False): return sup(self, files, '-e', " ".join(gconf.ssh_command.split() + ["-p", str(gconf.ssh_port)] + gconf.ssh_ctl_args), - *(gconf.rsync_ssh_options.split() + [self.slaveurl])) + *(gconf.rsync_ssh_options.split() + [self.slaveurl]), + log_err=log_err) - def tarssh(self, files): - return sup(self, files, self.slaveurl) + def tarssh(self, files, log_err=False): + return sup(self, files, self.slaveurl, log_err=log_err) -- cgit