Diffstat (limited to 'tools/glusterfind/src/changelog.py')
| -rw-r--r-- | tools/glusterfind/src/changelog.py | 144 |
1 file changed, 108 insertions, 36 deletions
diff --git a/tools/glusterfind/src/changelog.py b/tools/glusterfind/src/changelog.py
index 4d0a190286e..a5e9ea4288f 100644
--- a/tools/glusterfind/src/changelog.py
+++ b/tools/glusterfind/src/changelog.py
@@ -1,4 +1,5 @@
-#!/usr/bin/env python
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-

 # Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com/>
 # This file is part of GlusterFS.
@@ -13,9 +14,14 @@ import sys
 import time
 import xattr
 import logging
+from gfind_py2py3 import bytearray_to_str
 from argparse import ArgumentParser, RawDescriptionHelpFormatter
 import hashlib
-import urllib
+try:
+    import urllib.parse as urllib
+except ImportError:
+    import urllib
+import codecs

 import libgfchangelog
 from utils import mkdirp, symlink_gfid_to_path
@@ -38,8 +44,6 @@ history_turn_time = 0

 logger = logging.getLogger()

-
-
 def pgfid_to_path(brick, changelog_data):
     """
     For all the pgfids in table, converts into path using recursive
@@ -47,13 +51,13 @@ def pgfid_to_path(brick, changelog_data):
     """
     # pgfid1 to path1 in case of CREATE/MKNOD/MKDIR/LINK/SYMLINK
     for row in changelog_data.gfidpath_get_distinct("pgfid1", {"path1": ""}):
-        # In case of Data/Metadata only, pgfid1 will not be their
+        # In case of Data/Metadata only, pgfid1 will not be there
         if row[0] == "":
             continue

         try:
             path = symlink_gfid_to_path(brick, row[0])
-            path = output_path_prepare(path, args.output_prefix)
+            path = output_path_prepare(path, args)
             changelog_data.gfidpath_set_path1(path, row[0])
         except (IOError, OSError) as e:
             logger.warn("Error converting to path: %s" % e)
@@ -69,7 +73,7 @@ def pgfid_to_path(brick, changelog_data):

         try:
             path = symlink_gfid_to_path(brick, row[0])
-            path = output_path_prepare(path, args.output_prefix)
+            path = output_path_prepare(path, args)
             changelog_data.gfidpath_set_path2(path, row[0])
         except (IOError, OSError) as e:
             logger.warn("Error converting to path: %s" % e)
@@ -90,9 +94,9 @@ def populate_pgfid_and_inodegfid(brick, changelog_data):
             # It is a Directory if GFID backend path is symlink
             try:
                 path = symlink_gfid_to_path(brick, gfid)
-                path = output_path_prepare(path, args.output_prefix)
+                path = output_path_prepare(path, args)
                 changelog_data.gfidpath_update({"path1": path},
-                                                {"gfid": gfid})
+                                               {"gfid": gfid})
             except (IOError, OSError) as e:
                 logger.warn("Error converting to path: %s" % e)
                 continue
@@ -102,15 +106,55 @@ def populate_pgfid_and_inodegfid(brick, changelog_data):
                 changelog_data.inodegfid_add(os.stat(p).st_ino, gfid)
                 file_xattrs = xattr.list(p)
                 for x in file_xattrs:
-                    if x.startswith("trusted.pgfid."):
+                    x_str = bytearray_to_str(x)
+                    if x_str.startswith("trusted.pgfid."):
                         # PGFID in pgfid table
-                        changelog_data.pgfid_add(x.split(".")[-1])
+                        changelog_data.pgfid_add(x_str.split(".")[-1])
             except (IOError, OSError):
                 # All OS Errors ignored, since failures will be logged
                 # in End. All GFIDs present in gfidpath table
                 continue


+def enum_hard_links_using_gfid2path(brick, gfid, args):
+    hardlinks = []
+    p = os.path.join(brick, ".glusterfs", gfid[0:2], gfid[2:4], gfid)
+    if not os.path.isdir(p):
+        # we have a symlink or a normal file
+        try:
+            file_xattrs = xattr.list(p)
+            for x in file_xattrs:
+                x_str = bytearray_to_str(x)
+                if x_str.startswith("trusted.gfid2path."):
+                    # get the value for the xattr i.e. <PGFID>/<BN>
+                    v = xattr.getxattr(p, x_str)
+                    v_str = bytearray_to_str(v)
+                    pgfid, bn = v_str.split(os.sep)
+                    try:
+                        path = symlink_gfid_to_path(brick, pgfid)
+                        fullpath = os.path.join(path, bn)
+                        fullpath = output_path_prepare(fullpath, args)
+                        hardlinks.append(fullpath)
+                    except (IOError, OSError) as e:
+                        logger.warn("Error converting to path: %s" % e)
+                        continue
+        except (IOError, OSError):
+            pass
+    return hardlinks
+
+
+def gfid_to_all_paths_using_gfid2path(brick, changelog_data, args):
+    path = ""
+    for row in changelog_data.gfidpath_get({"path1": "", "type": "MODIFY"}):
+        gfid = row[3].strip()
+        logger.debug("Processing gfid %s" % gfid)
+        hardlinks = enum_hard_links_using_gfid2path(brick, gfid, args)
+
+        path = ",".join(hardlinks)
+
+        changelog_data.gfidpath_update({"path1": path}, {"gfid": gfid})
+
+
 def gfid_to_path_using_pgfid(brick, changelog_data, args):
     """
     For all the pgfids collected, Converts to Path and
@@ -145,7 +189,7 @@ def gfid_to_path_using_pgfid(brick, changelog_data, args):

         path = path.strip()
         path = path[brick_path_len+1:]
-        path = output_path_prepare(path, args.output_prefix)
+        path = output_path_prepare(path, args)
         changelog_data.append_path1(path, inode)
         changelog_data.inodegfid_update({"converted": 1}, {"inode": inode})

@@ -158,10 +202,10 @@
         try:
             path = symlink_gfid_to_path(brick, row[0])
             find(os.path.join(brick, path),
-                    callback_func=output_callback,
-                    filter_func=inode_filter,
-                    ignore_dirs=ignore_dirs,
-                    subdirs_crawl=False)
+                 callback_func=output_callback,
+                 filter_func=inode_filter,
+                 ignore_dirs=ignore_dirs,
+                 subdirs_crawl=False)
         except (IOError, OSError) as e:
             logger.warn("Error converting to path: %s" % e)
             continue
@@ -193,7 +237,7 @@ def gfid_to_path_using_batchfind(brick, changelog_data):
         # Also updates converted flag in inodegfid table as 1
         path = path.strip()
         path = path[brick_path_len+1:]
-        path = output_path_prepare(path, args.output_prefix)
+        path = output_path_prepare(path, args)

         changelog_data.append_path1(path, inode)

@@ -211,7 +255,7 @@
     """
     Parses a Changelog file and populates data in gfidpath table
     """
-    with open(filename) as f:
+    with codecs.open(filename, encoding="utf-8") as f:
         changelogfile = os.path.basename(filename)
         for line in f:
             data = line.strip().split(" ")
@@ -230,7 +274,7 @@
                 changelog_data.when_rename(changelogfile, data)
             elif data[0] == "E" and data[2] in ["UNLINK", "RMDIR"]:
                 # UNLINK/RMDIR
-                changelog_data.when_unlink_rmdir(changelogfile, data, args)
+                changelog_data.when_unlink_rmdir(changelogfile, data)


 def get_changes(brick, hash_dir, log_file, start, end, args):
@@ -243,7 +287,7 @@
     session_dir = os.path.join(conf.get_opt("session_dir"),
                                args.session)
     status_file = os.path.join(session_dir, args.volume,
-                        "%s.status" % urllib.quote_plus(args.brick))
+                               "%s.status" % urllib.quote_plus(args.brick))

     # Get previous session
     try:
@@ -260,7 +304,7 @@
         fail("%s Changelog register failed: %s" % (brick, e), logger=logger)

     # Output files to record GFIDs and GFID to Path failure GFIDs
-    changelog_data = ChangelogData(args.outfile)
+    changelog_data = ChangelogData(args.outfile, args)

     # Changelogs path(Hard coded to BRICK/.glusterfs/changelogs
     cl_path = os.path.join(brick, ".glusterfs/changelogs")
@@ -270,9 +314,10 @@ def get_changes(brick, hash_dir, log_file, start, end, args):
         actual_end = libgfchangelog.cl_history_changelog(
             cl_path, start, end, CHANGELOGAPI_NUM_WORKERS)
     except libgfchangelog.ChangelogException as e:
-        fail("%s Historical Changelogs not available: %s" % (brick, e),
-             logger=logger)
+        fail("%s: %s Historical Changelogs not available: %s" %
+             (args.node, brick, e), logger=logger)

+    logger.info("[1/4] Starting changelog parsing ...")
     try:
         # scan followed by getchanges till scan returns zero.
         # history_scan() is blocking call, till it gets the number
@@ -282,7 +327,7 @@
         # history_getchanges()
         changes = []
         while libgfchangelog.cl_history_scan() > 0:
-            changes += libgfchangelog.cl_history_getchanges()
+            changes = libgfchangelog.cl_history_getchanges()

             for change in changes:
                 # Ignore if last processed changelog comes
@@ -294,25 +339,34 @@
                     libgfchangelog.cl_history_done(change)
                 except IOError as e:
                     logger.warn("Error parsing changelog file %s: %s" %
-                                 (change, e))
+                                (change, e))

             changelog_data.commit()
     except libgfchangelog.ChangelogException as e:
         fail("%s Error during Changelog Crawl: %s" % (brick, e),
              logger=logger)
+    logger.info("[1/4] Finished changelog parsing.")

     # Convert all pgfid available from Changelogs
+    logger.info("[2/4] Starting 'pgfid to path' conversions ...")
     pgfid_to_path(brick, changelog_data)
     changelog_data.commit()
+    logger.info("[2/4] Finished 'pgfid to path' conversions.")

-    # Convert all GFIDs for which no other additional details available
-    gfid_to_path_using_pgfid(brick, changelog_data, args)
+    # Convert all gfids recorded for data and metadata to all hardlink paths
+    logger.info("[3/4] Starting 'gfid2path' conversions ...")
+    gfid_to_all_paths_using_gfid2path(brick, changelog_data, args)
     changelog_data.commit()
+    logger.info("[3/4] Finished 'gfid2path' conversions.")

     # If some GFIDs fail to get converted from previous step,
     # convert using find
+    logger.info("[4/4] Starting 'gfid to path using batchfind' "
+                "conversions ...")
     gfid_to_path_using_batchfind(brick, changelog_data)
     changelog_data.commit()
+    logger.info("[4/4] Finished 'gfid to path using batchfind' conversions.")

     return actual_end

@@ -326,7 +380,7 @@ def changelog_crawl(brick, start, end, args):

     # WORKING_DIR/BRICKHASH/OUTFILE
     working_dir = os.path.dirname(args.outfile)
-    brickhash = hashlib.sha1(brick)
+    brickhash = hashlib.sha1(brick.encode())
     brickhash = str(brickhash.hexdigest())
     working_dir = os.path.join(working_dir, brickhash)

@@ -348,12 +402,20 @@ def _get_args():
     parser.add_argument("session", help="Session Name")
     parser.add_argument("volume", help="Volume Name")
+    parser.add_argument("node", help="Node Name")
     parser.add_argument("brick", help="Brick Name")
     parser.add_argument("outfile", help="Output File")
     parser.add_argument("start", help="Start Time", type=int)
+    parser.add_argument("end", help="End Time", type=int)
+    parser.add_argument("--only-query", help="Query mode only (no session)",
+                        action="store_true")
     parser.add_argument("--debug", help="Debug", action="store_true")
+    parser.add_argument("--no-encode",
+                        help="Do not encode path in outfile",
+                        action="store_true")
     parser.add_argument("--output-prefix", help="File prefix in output",
                         default=".")
+    parser.add_argument("--type",default="both")
     parser.add_argument("-N", "--only-namespace-changes",
                         help="List only namespace changes",
                         action="store_true")

@@ -373,24 +435,34 @@ if __name__ == "__main__":
     session_dir = os.path.join(conf.get_opt("session_dir"),
                                args.session)
     status_file = os.path.join(session_dir, args.volume,
-                        "%s.status" % urllib.quote_plus(args.brick))
+                               "%s.status" % urllib.quote_plus(args.brick))
     status_file_pre = status_file + ".pre"
     mkdirp(os.path.join(session_dir, args.volume), exit_on_err=True,
            logger=logger)

-    try:
-        with open(status_file) as f:
-            start = int(f.read().strip())
-    except (ValueError, OSError, IOError):
+    end = -1
+    if args.only_query:
         start = args.start
+        end = args.end
+    else:
+        try:
+            with open(status_file) as f:
+                start = int(f.read().strip())
+        except (ValueError, OSError, IOError):
+            start = args.start
+
+    # end time is optional; so a -1 may be sent to use the default method of
+    # identifying the end time
+    if end == -1:
+        end = int(time.time()) - get_changelog_rollover_time(args.volume)

-    end = int(time.time()) - get_changelog_rollover_time(args.volume)
     logger.info("%s Started Changelog Crawl - Start: %s End: %s" %
                 (args.brick, start, end))

     actual_end = changelog_crawl(args.brick, start, end, args)
-    with open(status_file_pre, "w", buffering=0) as f:
-        f.write(str(actual_end))
+    if not args.only_query:
+        with open(status_file_pre, "w") as f:
+            f.write(str(actual_end))

     logger.info("%s Finished Changelog Crawl - End: %s" %
                 (args.brick, actual_end))
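The centerpiece of the patch is enum_hard_links_using_gfid2path(): each hard link of a file is recorded on the brick backend as a trusted.gfid2path.* extended attribute whose value is "<parent-gfid>/<basename>", so every link path of a changed GFID can be recovered with a few xattr reads instead of a parent-gfid crawl, and gfid_to_all_paths_using_gfid2path() then stores the resolved paths comma-joined in the path1 column. A minimal standalone sketch of that lookup follows; it assumes the same pyxattr xattr module the script imports, and the function name list_gfid2path_entries plus the inline bytes decoding are illustrative stand-ins for the helpers in utils.py and gfind_py2py3.py.

import os
import xattr  # pyxattr, the same module glusterfind itself uses


def list_gfid2path_entries(brick, gfid):
    """Return (parent_gfid, basename) pairs recorded for one GFID."""
    # Backend file for the GFID: <brick>/.glusterfs/<g1g2>/<g3g4>/<gfid>
    backend = os.path.join(brick, ".glusterfs", gfid[0:2], gfid[2:4], gfid)
    entries = []
    for key in xattr.list(backend):
        # pyxattr returns bytes under Python 3, so normalise to str first
        key = key.decode() if isinstance(key, (bytes, bytearray)) else key
        if not key.startswith("trusted.gfid2path."):
            continue
        value = xattr.getxattr(backend, key)
        value = value.decode() if isinstance(value, (bytes, bytearray)) else value
        # Each value is "<parent-gfid>/<basename>"
        pgfid, basename = value.split(os.sep)
        entries.append((pgfid, basename))
    return entries


if __name__ == "__main__":
    # Illustrative call only; the brick path and GFID are placeholders
    try:
        for pgfid, name in list_gfid2path_entries(
                "/bricks/brick1", "8e1d62c9-07a0-4b4a-9a0a-1c2f0e4f7e2a"):
            print(pgfid, name)
    except (IOError, OSError) as e:
        print("backend file not readable: %s" % e)

In the patch itself the parent GFID is still resolved to a directory path via symlink_gfid_to_path(), run through output_path_prepare(), and the hard links are joined with "," before being written back to the gfidpath table.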
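The rest of the diff is mostly mechanical Python 2/3 porting: hashlib.sha1() only accepts bytes on Python 3 (hence brick.encode()), changelog files are read through codecs.open(..., encoding="utf-8") so iteration yields text on both interpreters, and raw xattr results pass through bytearray_to_str(). A short sketch of the first two points; the brick path and sample file below are made up for the example.

import codecs
import hashlib

# hashlib in Python 3 rejects str input, which is why the patch adds .encode()
brick = "/bricks/brick1"                 # placeholder brick path
print(hashlib.sha1(brick.encode()).hexdigest())

# codecs.open() pins the encoding, so the parser sees text lines on
# Python 3 and unicode lines on Python 2 alike
sample = "/tmp/CHANGELOG.sample"         # placeholder changelog file
with codecs.open(sample, "w", encoding="utf-8") as f:
    f.write("E <gfid> CREATE ...\n")     # made-up record, not the real format
with codecs.open(sample, encoding="utf-8") as f:
    for line in f:
        print(line.strip().split(" "))

The status-file write also drops buffering=0, which Python 3 no longer allows for text-mode files.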
