summaryrefslogtreecommitdiffstats
path: root/geo-replication/syncdaemon/master.py
diff options
context:
space:
mode:
Diffstat (limited to 'geo-replication/syncdaemon/master.py')
-rw-r--r--geo-replication/syncdaemon/master.py141
1 files changed, 100 insertions, 41 deletions
diff --git a/geo-replication/syncdaemon/master.py b/geo-replication/syncdaemon/master.py
index d9f63a440fb..665a51f64dd 100644
--- a/geo-replication/syncdaemon/master.py
+++ b/geo-replication/syncdaemon/master.py
@@ -711,7 +711,8 @@ class GMasterChangelogMixin(GMasterCommon):
TYPE_GFID = "D "
TYPE_ENTRY = "E "
- MAX_EF_RETRIES = 15
+ MAX_EF_RETRIES = 10
+ MAX_OE_RETRIES = 5
# flat directory hierarchy for gfid based access
FLAT_DIR_HIERARCHY = '.'
@@ -803,21 +804,28 @@ class GMasterChangelogMixin(GMasterCommon):
self.status.inc_value("failures", num_failures)
- def fix_possible_entry_failures(self, failures, retry_count):
+ def fix_possible_entry_failures(self, failures, retry_count, entries):
pfx = gauxpfx()
fix_entry_ops = []
failures1 = []
for failure in failures:
- if failure[2]['dst']:
+ if failure[2]['name_mismatch']:
+ pbname = failure[2]['slave_entry']
+ elif failure[2]['dst']:
pbname = failure[0]['entry1']
else:
pbname = failure[0]['entry']
- if failure[2]['gfid_mismatch']:
+
+ op = failure[0]['op']
+ # name exists but gfid is different
+ if failure[2]['gfid_mismatch'] or failure[2]['name_mismatch']:
slave_gfid = failure[2]['slave_gfid']
st = lstat(os.path.join(pfx, slave_gfid))
+ # Takes care of scenarios with no hardlinks
if isinstance(st, int) and st == ENOENT:
- logging.info(lf('Fixing gfid mismatch in slave. Deleting'
- ' the entry', retry_count=retry_count,
+ logging.info(lf('Entry not present on master. Fixing gfid '
+ 'mismatch in slave. Deleting the entry',
+ retry_count=retry_count,
entry=repr(failure)))
# Add deletion to fix_entry_ops list
if failure[2]['slave_isdir']:
@@ -830,79 +838,119 @@ class GMasterChangelogMixin(GMasterCommon):
edct('UNLINK',
gfid=failure[2]['slave_gfid'],
entry=pbname))
+ # Takes care of scenarios of hardlinks/renames on master
elif not isinstance(st, int):
- # The file exists on master but with different name.
- # Probabaly renamed and got missed during xsync crawl.
- if failure[2]['slave_isdir']:
- logging.info(lf('Fixing gfid mismatch in slave',
+ if matching_disk_gfid(slave_gfid, pbname):
+ # Safe to ignore the failure as master contains same
+ # file with same gfid. Remove entry from entries list
+ logging.info(lf('Fixing gfid mismatch in slave. '
+ ' Safe to ignore, take out entry',
retry_count=retry_count,
entry=repr(failure)))
- realpath = os.readlink(os.path.join(
- rconf.args.local_path,
- ".glusterfs",
- slave_gfid[0:2],
- slave_gfid[2:4],
- slave_gfid))
+ entries.remove(failure[0])
+ # The file exists on master but with different name.
+ # Probably renamed and got missed during xsync crawl.
+ elif failure[2]['slave_isdir']:
+ realpath = os.readlink(os.path.join(gconf.local_path,
+ ".glusterfs",
+ slave_gfid[0:2],
+ slave_gfid[2:4],
+ slave_gfid))
dst_entry = os.path.join(pfx, realpath.split('/')[-2],
realpath.split('/')[-1])
- rename_dict = edct('RENAME', gfid=slave_gfid,
- entry=failure[0]['entry'],
- entry1=dst_entry, stat=st,
- link=None)
- logging.info(lf('Fixing gfid mismatch in slave. '
- 'Renaming', retry_count=retry_count,
- entry=repr(rename_dict)))
- fix_entry_ops.append(rename_dict)
+ src_entry = pbname
+ logging.info(lf('Fixing dir name/gfid mismatch in '
+ 'slave', retry_count=retry_count,
+ entry=repr(failure)))
+ if src_entry == dst_entry:
+ # Safe to ignore the failure as master contains
+ # same directory as in slave with same gfid.
+ # Remove the failure entry from entries list
+ logging.info(lf('Fixing dir name/gfid mismatch'
+ ' in slave. Safe to ignore, '
+ 'take out entry',
+ retry_count=retry_count,
+ entry=repr(failure)))
+ entries.remove(failure[0])
+ else:
+ rename_dict = edct('RENAME', gfid=slave_gfid,
+ entry=src_entry,
+ entry1=dst_entry, stat=st,
+ link=None)
+ logging.info(lf('Fixing dir name/gfid mismatch'
+ ' in slave. Renaming',
+ retry_count=retry_count,
+ entry=repr(rename_dict)))
+ fix_entry_ops.append(rename_dict)
else:
- logging.info(lf('Fixing gfid mismatch in slave. '
- ' Deleting the entry',
+ # A hardlink file exists with different name or
+ # renamed file exists and we are sure from
+ # matching_disk_gfid check that the entry doesn't
+ # exist with same gfid so we can safely delete on slave
+ logging.info(lf('Fixing file gfid mismatch in slave. '
+ 'Hardlink/Rename Case. Deleting entry',
retry_count=retry_count,
entry=repr(failure)))
fix_entry_ops.append(
edct('UNLINK',
gfid=failure[2]['slave_gfid'],
entry=pbname))
- logging.error(lf('Entry cannot be fixed in slave due '
- 'to GFID mismatch, find respective '
- 'path for the GFID and trigger sync',
- gfid=slave_gfid))
+ elif failure[1] == ENOENT:
+ # Ignore ENOENT error for fix_entry_ops aka retry_count > 1
+ if retry_count > 1:
+ logging.info(lf('ENOENT error while fixing entry ops. '
+ 'Safe to ignore, take out entry',
+ retry_count=retry_count,
+ entry=repr(failure)))
+ entries.remove(failure[0])
+ elif op in ('MKNOD', 'CREATE', 'MKDIR'):
+ pargfid = pbname.split('/')[1]
+ st = lstat(os.path.join(pfx, pargfid))
+ # Safe to ignore the failure as master doesn't contain
+ # parent directory.
+ if isinstance(st, int):
+ logging.info(lf('Fixing ENOENT error in slave. Parent '
+ 'does not exist on master. Safe to '
+ 'ignore, take out entry',
+ retry_count=retry_count,
+ entry=repr(failure)))
+ entries.remove(failure[0])
if fix_entry_ops:
# Process deletions of entries whose gfids are mismatched
failures1 = self.slave.server.entry_ops(fix_entry_ops)
- if not failures1:
- logging.info("Sucessfully fixed entry ops with gfid mismatch")
- return failures1
+ return (failures1, fix_entry_ops)
def handle_entry_failures(self, failures, entries):
retries = 0
pending_failures = False
failures1 = []
failures2 = []
+ entry_ops1 = []
+ entry_ops2 = []
if failures:
pending_failures = True
failures1 = failures
+ entry_ops1 = entries
while pending_failures and retries < self.MAX_EF_RETRIES:
retries += 1
- failures2 = self.fix_possible_entry_failures(failures1,
- retries)
+ (failures2, entry_ops2) = self.fix_possible_entry_failures(
+ failures1, retries, entry_ops1)
if not failures2:
pending_failures = False
+ logging.info(lf('Sucessfully fixed entry ops with gfid '
+ 'mismatch', retry_count=retries))
else:
pending_failures = True
failures1 = failures2
+ entry_ops1 = entry_ops2
if pending_failures:
for failure in failures1:
logging.error("Failed to fix entry ops %s", repr(failure))
- else:
- # Retry original entry list 5 times
- failures = self.slave.server.entry_ops(entries)
-
- self.log_failures(failures, 'gfid', gauxpfx(), 'ENTRY')
def process_change(self, change, done, retry):
pfx = gauxpfx()
@@ -1129,7 +1177,18 @@ class GMasterChangelogMixin(GMasterCommon):
self.status.inc_value("entry", len(entries))
failures = self.slave.server.entry_ops(entries)
- self.handle_entry_failures(failures, entries)
+ count = 0
+ while failures and count < self.MAX_OE_RETRIES:
+ count += 1
+ self.handle_entry_failures(failures, entries)
+ logging.info("Retry original entries. count = %s" % count)
+ failures = self.slave.server.entry_ops(entries)
+ if not failures:
+ logging.info("Sucessfully fixed all entry ops with gfid "
+ "mismatch")
+ break
+
+ self.log_failures(failures, 'gfid', gauxpfx(), 'ENTRY')
self.status.dec_value("entry", len(entries))
# Update Entry stime in Brick Root only in case of Changelog mode