summaryrefslogtreecommitdiffstats
path: root/geo-replication
diff options
context:
space:
mode:
authorMilind Changire <mchangir@redhat.com>2015-03-30 16:53:05 +0530
committerVijay Bellur <vbellur@redhat.com>2015-04-27 22:05:18 -0700
commit70a729e9751e45e266f7462443dcf2b6be3cecbe (patch)
treeb845410906c3f6f508316c28f79a1f958bff1907 /geo-replication
parent1adf231fc425ac35eb91a5b771ea0c5c97b1f6c3 (diff)
geo-rep: log ENTRY failures from slave on master
ENTRY operations failures on slave left no trace for debugging purposes. This patch captures such failures on slave cluster and forwards them to the master and logs them. Failures of specific interest are the ones which return code EEXIST on the failing operations. Change-Id: Iecab876f16593c746d53f4b7ec2e0783367856bb BUG: 1207115 Signed-off-by: Milind Changire <mchangir@redhat.com> Reviewed-on: http://review.gluster.org/10048 Reviewed-by: Aravinda VK <avishwan@redhat.com> Tested-by: NetBSD Build System Tested-by: Gluster Build System <jenkins@build.gluster.com>
Diffstat (limited to 'geo-replication')
-rw-r--r--geo-replication/syncdaemon/master.py12
-rw-r--r--geo-replication/syncdaemon/resource.py44
-rw-r--r--geo-replication/syncdaemon/syncdutils.py2
3 files changed, 48 insertions, 10 deletions
diff --git a/geo-replication/syncdaemon/master.py b/geo-replication/syncdaemon/master.py
index babf94cbc11..dabf5536c64 100644
--- a/geo-replication/syncdaemon/master.py
+++ b/geo-replication/syncdaemon/master.py
@@ -927,6 +927,12 @@ class GMasterChangelogMixin(GMasterCommon):
def purge_update():
files_pending['purge'] += 1
+ def log_failures(failures, entry_key, gfid_prefix, log_prefix):
+ for failure in failures:
+ st = lstat(os.path.join(gfid_prefix, failure[0][entry_key]))
+ if not isinstance(st, int):
+ logging.warn('%s FAILED: %s' % (log_prefix, repr(failure)))
+
for e in clist:
e = e.strip()
et = e[self.IDX_START:self.IDX_END] # entry type
@@ -1029,7 +1035,8 @@ class GMasterChangelogMixin(GMasterCommon):
self.update_worker_cumilitive_status(files_pending)
# sync namespace
if entries:
- self.slave.server.entry_ops(entries)
+ failures = self.slave.server.entry_ops(entries)
+ log_failures(failures, 'gfid', gauxpfx(), 'ENTRY')
# sync metadata
if meta_gfid:
meta_entries = []
@@ -1043,7 +1050,8 @@ class GMasterChangelogMixin(GMasterCommon):
continue
meta_entries.append(edct('META', go=go[0], stat=st))
if meta_entries:
- self.slave.server.meta_ops(meta_entries)
+ failures = self.slave.server.meta_ops(meta_entries)
+ log_failures(failures, 'go', '', 'META')
# sync data
if datas:
self.a_syncdata(datas)
diff --git a/geo-replication/syncdaemon/resource.py b/geo-replication/syncdaemon/resource.py
index fe86044d546..d3d1ee36e01 100644
--- a/geo-replication/syncdaemon/resource.py
+++ b/geo-replication/syncdaemon/resource.py
@@ -607,6 +607,19 @@ class Server(object):
er = errno_wrap(os.rmdir, [entry], [ENOENT, ENOTEMPTY])
if er == ENOTEMPTY:
return er
+
+ def collect_failure(e, cmd_ret):
+ # We do this for failing fops on Slave
+ # Master should be logging this
+ if cmd_ret == EEXIST:
+ disk_gfid = cls.gfid_mnt(e['entry'])
+ if isinstance(disk_gfid, basestring):
+ if e['gfid'] != disk_gfid:
+ failures.append((e, cmd_ret, disk_gfid))
+ else:
+ failures.append((e, cmd_ret))
+
+ failures = []
for e in entries:
blob = None
op = e['op']
@@ -644,7 +657,10 @@ class Server(object):
(pg, bname) = entry2pb(entry)
blob = entry_pack_reg_stat(gfid, bname, e['stat'])
else:
- errno_wrap(os.link, [slink, entry], [ENOENT, EEXIST])
+ cmd_ret = errno_wrap(os.link,
+ [slink, entry],
+ [ENOENT, EEXIST])
+ collect_failure(e, cmd_ret)
elif op == 'SYMLINK':
blob = entry_pack_symlink(gfid, bname, e['link'], e['stat'])
elif op == 'RENAME':
@@ -655,16 +671,22 @@ class Server(object):
(pg, bname) = entry2pb(en)
blob = entry_pack_reg_stat(gfid, bname, e['stat'])
else:
- errno_wrap(os.rename, [entry, en], [ENOENT, EEXIST])
+ cmd_ret = errno_wrap(os.rename,
+ [entry, en],
+ [ENOENT, EEXIST])
+ collect_failure(e, cmd_ret)
if blob:
- errno_wrap(Xattr.lsetxattr,
- [pg, 'glusterfs.gfid.newfile', blob],
- [EEXIST],
- [ENOENT, ESTALE, EINVAL])
+ cmd_ret = errno_wrap(Xattr.lsetxattr,
+ [pg, 'glusterfs.gfid.newfile', blob],
+ [EEXIST],
+ [ENOENT, ESTALE, EINVAL])
+ collect_failure(e, cmd_ret)
+ return failures
@classmethod
def meta_ops(cls, meta_entries):
logging.debug('Meta-entries: %s' % repr(meta_entries))
+ failures = []
for e in meta_entries:
mode = e['stat']['mode']
uid = e['stat']['uid']
@@ -672,10 +694,18 @@ class Server(object):
atime = e['stat']['atime']
mtime = e['stat']['mtime']
go = e['go']
- errno_wrap(os.chmod, [go, mode], [ENOENT], [ESTALE, EINVAL])
+ cmd_ret = errno_wrap(os.chmod, [go, mode],
+ [ENOENT], [ESTALE, EINVAL])
+ # This is a fail fast mechanism
+ # We do this for failing fops on Slave
+ # Master should be logging this
+ if isinstance(cmd_ret, int):
+ failures.append((e, cmd_ret))
+ continue
errno_wrap(os.chown, [go, uid, gid], [ENOENT], [ESTALE, EINVAL])
errno_wrap(os.utime, [go, (atime, mtime)],
[ENOENT], [ESTALE, EINVAL])
+ return failures
@classmethod
@_pathguard
diff --git a/geo-replication/syncdaemon/syncdutils.py b/geo-replication/syncdaemon/syncdutils.py
index dbbc31deb2a..2614c828104 100644
--- a/geo-replication/syncdaemon/syncdutils.py
+++ b/geo-replication/syncdaemon/syncdutils.py
@@ -484,7 +484,7 @@ def errno_wrap(call, arg=[], errnos=[], retry_errnos=[ESTALE]):
if nr_tries == GF_OP_RETRIES:
# probably a screwed state, cannot do much...
logging.warn('reached maximum retries (%s)...' % repr(arg))
- return
+ return ex.errno
time.sleep(0.250) # retry the call