geo-rep: Do not use xsync_upper_limit for change detection

Use the register time (xsync_upper_limit) only for the stime update; do not use it for change detection. Problem 1: If a file is created before geo-rep, its xtime xattr does not exist. Geo-rep sets the xtime of the file to the current time if it does not exist. Then xtime > upper_limit, so geo-rep (XSync) will not pick up those files. The changelog will have only SETXATTR recorded for such a file, so geo-rep fails to sync it. Problem 2: If a file is created before geo-rep is created and is updated after geo-rep starts, the xtime of the file is greater than the upper limit (geo-rep start time/changelog register time). Geo-rep (XSync) will not pick up this file for syncing. The changelog will have only DATA recorded for that file. Geo-rep tries DATA without any ENTRY ops and fails with an rsync error. BUG: 1200733 Change-Id: Ie4e8f284db689d2c755ef8e7ecbb658db1c0785f Signed-off-by: Aravinda VK <avishwan@redhat.com> Reviewed-on: http://review.gluster.org/9855 Reviewed-by: Kotresh HR <khiremat@redhat.com> Reviewed-by: Saravanakumar Arumugam <sarumuga@redhat.com> Tested-by: Saravanakumar Arumugam <sarumuga@redhat.com> Tested-by: Gluster Build System <jenkins@build.gluster.com> Reviewed-by: Vijay Bellur <vbellur@redhat.com>
author: Aravinda VK <avishwan@redhat.com> 2015-03-11 13:31:09 +0530
committer: Vijay Bellur <vbellur@redhat.com> 2015-03-15 22:55:31 -0700
commit: 7d8be3613f7384f5118f26e194fe7c64ea69d11c (patch)
tree: b6dcdd9c04ebb8f855e40390165d790ba7fcfa75 /geo-replication/syncdaemon
parent: f0224ce93ae9ad420e23612fe6e6707a821f9cab (diff)
2 files changed, 26 insertions, 34 deletions
diff --git a/geo-replication/syncdaemon/master.py b/geo-replication/syncdaemon/master.py
index dfe65fe6709..e60624391a1 100644
--- a/geo-replication/syncdaemon/master.py
+++ b/geo-replication/syncdaemon/master.py
@@ -170,10 +170,7 @@ class NormalMixin(object):
             raise GsyncdError("timestamp corruption for " + path)
 
     def need_sync(self, e, xte, xtrd):
-        if self.xsync_upper_limit:
-            return xte > xtrd and xte <= self.xsync_upper_limit
-        else:
-            return xte > xtrd
+        return xte > xtrd
 
     def set_slave_xtime(self, path, mark):
         self.slave.server.set_stime(path, self.uuid, mark)
@@ -491,8 +488,7 @@ class GMasterCommon(object):
     def register(self):
         self.register()
 
-    def crawlwrap(self, oneshot=False, no_stime_update=False,
-                  register_time=None):
+    def crawlwrap(self, oneshot=False, register_time=None):
         if oneshot:
             # it's important to do this during the oneshot crawl as
             # for a passive gsyncd (ie. in a replicate scenario)
@@ -503,11 +499,11 @@ class GMasterCommon(object):
         # then it sets register_time which is the time when geo-rep
         # worker registerd to changelog consumption. Since nsec is
         # not considered in register time, their are chances of skipping
-        # changes detection in xsync crawl. Add 1 sec to upper_limit.
-        # This limit will be reset when crawlwrap is called again.
-        self.xsync_upper_limit = None
+        # changes detection in xsync crawl. This limit will be reset when
+        # crawlwrap is called again.
+        self.live_changelog_start_time = None
         if register_time:
-            self.xsync_upper_limit = (register_time + 1, 0)
+            self.live_changelog_start_time = (register_time, 0)
 
         # no need to maintain volinfo state machine.
         # in a cascading setup, each geo-replication session is
@@ -583,7 +579,7 @@ class GMasterCommon(object):
                 time.sleep(5)
                 continue
             self.update_worker_health("Active")
-            self.crawl(no_stime_update=no_stime_update)
+            self.crawl()
             if oneshot:
                 return
             time.sleep(self.sleep_interval)
@@ -1278,7 +1274,7 @@ class GMasterChangelogMixin(GMasterCommon):
             except:
                 raise
 
-    def crawl(self, no_stime_update=False):
+    def crawl(self):
         self.update_worker_crawl_status("Changelog Crawl")
         changes = []
         # get stime (from the brick) and purge changelogs
@@ -1323,7 +1319,7 @@ class GMasterChangeloghistoryMixin(GMasterChangelogMixin):
         self.processed_changelogs_dir = os.path.join(self.setup_working_dir(),
                                                      ".history/.processed")
 
-    def crawl(self, no_stime_update=False):
+    def crawl(self):
         self.history_turns += 1
         self.update_worker_crawl_status("History Crawl")
         purge_time = self.get_purge_time()
@@ -1425,7 +1421,7 @@ class GMasterXsyncMixin(GMasterChangelogMixin):
             else:
                 raise
 
-    def crawl(self, no_stime_update=False):
+    def crawl(self):
         """
         event dispatcher thread
 
@@ -1451,18 +1447,8 @@ class GMasterXsyncMixin(GMasterChangelogMixin):
                     self.process([item[1]], 0)
                     self.archive_and_purge_changelogs([item[1]])
                 elif item[0] == 'stime':
-                    if not no_stime_update:
-                        # xsync is started after running history but if
-                        # history actual end time is less than register time
-                        # then if we update stime, live changelog processing
-                        # will skip the changelogs for which TS is less than
-                        # stime. During this deletes and renames are not
-                        # propogated. By not setting stime live changelog will
-                        # start processing from the register time. Since we
-                        # have xsync_upper_limit their will not be much
-                        # overlap/redo of changelogs.
-                        logging.debug('setting slave time: %s' % repr(item[1]))
-                        self.upd_stime(item[1][1], item[1][0])
+                    logging.debug('setting slave time: %s' % repr(item[1]))
+                    self.upd_stime(item[1][1], item[1][0])
                 else:
                     logging.warn('unknown tuple in comlist (%s)' % repr(item))
             except IndexError:
@@ -1603,8 +1589,15 @@ class GMasterXsyncMixin(GMasterChangelogMixin):
                                               str(st.st_mtime)])
                 self.Xcrawl(e, xtr_root)
                 stime_to_update = xte
-                if self.xsync_upper_limit:
-                    stime_to_update = min(self.xsync_upper_limit, xte)
+                # Live Changelog Start time indicates that from that time
+                # onwards Live changelogs are available. If we update stime
+                # greater than live_changelog_start time then Geo-rep will
+                # skip those changelogs as already processed. But Xsync
+                # actually failed to sync the deletes and Renames. Update
+                # stime as min(Live_changelogs_time, Actual_stime) When it
+                # switches to Changelog mode, it syncs Deletes and Renames.
+                if self.live_changelog_start_time:
+                    stime_to_update = min(self.live_changelog_start_time, xte)
                 self.stimes.append((e, stime_to_update))
             elif stat.S_ISLNK(mo):
                 self.write_entry_change(
@@ -1630,8 +1623,8 @@ class GMasterXsyncMixin(GMasterChangelogMixin):
                 self.write_entry_change("D", [gfid])
         if path == '.':
             stime_to_update = xtl
-            if self.xsync_upper_limit:
-                stime_to_update = min(self.xsync_upper_limit, xtl)
+            if self.live_changelog_start_time:
+                stime_to_update = min(self.live_changelog_start_time, xtl)
             self.stimes.append((path, stime_to_update))
             self.sync_done(self.stimes, True)
 
diff --git a/geo-replication/syncdaemon/resource.py b/geo-replication/syncdaemon/resource.py
index e9796fc48f6..ae94f04aa37 100644
--- a/geo-replication/syncdaemon/resource.py
+++ b/geo-replication/syncdaemon/resource.py
@@ -1333,8 +1333,8 @@ class GLUSTER(AbstractUrl, SlaveLocal, SlaveRemote):
                     g3.crawlwrap(oneshot=True)
                 else:
                     g1.crawlwrap(oneshot=True)
-            except (ChangelogException, NoPurgeTimeAvailable,
-                    PartialHistoryAvailable) as e:
+            except (ChangelogException, PartialHistoryAvailable,
+                    NoPurgeTimeAvailable) as e:
                 if isinstance(e, ChangelogException):
                     logging.info('Changelog history crawl failed, fallback '
                                  'to xsync: %s - %s' % (e.errno, e.strerror))
@@ -1342,8 +1342,7 @@ class GLUSTER(AbstractUrl, SlaveLocal, SlaveRemote):
                     logging.info('Partial history available, using xsync crawl'
                                  ' after consuming history '
                                  'till %s' % str(e))
-                g1.crawlwrap(oneshot=True, no_stime_update=True,
-                             register_time=register_time)
+                g1.crawlwrap(oneshot=True, register_time=register_time)
 
             # crawl loop: Try changelog crawl, if failed
             # switch to FS crawl
author	Aravinda VK <avishwan@redhat.com>	2015-03-11 13:31:09 +0530
committer	Vijay Bellur <vbellur@redhat.com>	2015-03-15 22:55:31 -0700
commit	7d8be3613f7384f5118f26e194fe7c64ea69d11c (patch)
tree	b6dcdd9c04ebb8f855e40390165d790ba7fcfa75 /geo-replication/syncdaemon
parent	f0224ce93ae9ad420e23612fe6e6707a821f9cab (diff)