From 7a9a66cc5fb7f06118fab1fc2ae1c43cfbb1178f Mon Sep 17 00:00:00 2001 From: Kotresh HR Date: Thu, 29 Jan 2015 15:53:19 +0530 Subject: tools: Finds missing files in gluster volume given backend brickpath The tool finds the missing files in a geo-replication slave volume. The tool crawls backend .glusterfs of the brickpath, which is passed as a parameter and stats each entry on slave volume mount to check the presence of file. The mount used is aux-gfid-mount, hence no path conversion is required and is fast. The tool needs to be run on every node in cluster for each brickpath of geo-rep master volume to find missing files on slave volume. The tool is generic enough and can be used in non geo-replication context as well. Most of the crawler code is leveraged from Avati's xfind and is modified to crawl only .glusterfs (https://github.com/avati/xsync) Thanks Aravinda for scripts to convert gfid to path. Change-Id: I84deaaaf638f7c571ff1319b67a3440fe27da810 BUG: 1187140 Signed-off-by: Aravinda VK Signed-off-by: Kotresh HR Reviewed-on: http://review.gluster.org/9503 Tested-by: Gluster Build System Reviewed-by: Vijay Bellur --- tools/gfind_missing_files/gfid_to_path.py | 162 ++++++++++++++++++++++++++++++ 1 file changed, 162 insertions(+) create mode 100644 tools/gfind_missing_files/gfid_to_path.py (limited to 'tools/gfind_missing_files/gfid_to_path.py') diff --git a/tools/gfind_missing_files/gfid_to_path.py b/tools/gfind_missing_files/gfid_to_path.py new file mode 100644 index 00000000000..8362f68b955 --- /dev/null +++ b/tools/gfind_missing_files/gfid_to_path.py @@ -0,0 +1,162 @@ +#!/usr/bin/env python + +# Copyright (c) 2015 Red Hat, Inc. +# This file is part of GlusterFS. +# +# This file is licensed to you under your choice of the GNU Lesser +# General Public License, version 3 or any later version (LGPLv3 or +# later), or the GNU General Public License, version 2 (GPLv2), in all +# cases as published by the Free Software Foundation. 
import sys
import os
import uuid
import re
import errno

try:
    import xattr
except ImportError:
    # Keep the module importable without the third-party xattr binding.
    # main() refuses to start without it and the GFID verification step
    # raises, so a missing dependency is still reported, just explicitly.
    xattr = None

# Upper bound on how many consecutive CHANGELOG.<timestamp> names are probed
# for one GFID.
CHANGELOG_SEARCH_MAX_TRY = 31
# Seconds subtracted from the GFID file's ctime before probing starts.
DEC_CTIME_START = 5
ROOT_GFID = "00000000-0000-0000-0000-000000000001"
# Upper bound on how many changelogs are actually opened and parsed for one
# GFID before giving up.
MAX_NUM_CHANGELOGS_TRY = 2

# Remainder of an entry record after "E<GFID>": NUL, "3" or "23", NUL, three
# NUL-separated decimal fields, NUL, then "<PARGFID>/<BASENAME>".
_CHANGELOG_ENTRY_TAIL = (
    br"\x00(3|23)\x00\d+\x00\d+\x00\d+\x00([^\x00]+)/([^\x00]+)"
)


def _to_bytes(value):
    """Return *value* as bytes, encoding text losslessly if needed."""
    if isinstance(value, bytes):
        return value
    return value.encode("utf-8", "surrogateescape")


def _to_text(value):
    """Return *value* as the native ``str`` type, decoding bytes if needed."""
    if isinstance(value, str):
        return value
    return value.decode("utf-8", "surrogateescape")


def _gfid_backend_path(gfid):
    """Return the ``.glusterfs/xx/yy/<gfid>`` path, relative to the cwd."""
    return os.path.join(".glusterfs", gfid[0:2], gfid[2:4], gfid)


def output_not_found(gfid):
    """Report an unresolved GFID by writing it, newline-terminated, to stderr."""
    sys.stderr.write("%s\n" % gfid)


def output_success(path):
    """Report a resolved path by writing it, newline-terminated, to stdout."""
    sys.stdout.write("%s\n" % path)


def full_dir_path(gfid):
    """
    Resolve a directory GFID to its path by following backend symlinks.

    Each ``.glusterfs/xx/yy/<gfid>`` entry is read with ``os.readlink``; the
    link target's basename is the directory name and its dirname names the
    parent's GFID entry.  The walk stops once the parent is the root GFID.

    Must be called with the brick root as the current working directory.

    Returns the path prefixed with "./" and ending in "/".
    Raises OSError if an entry on the chain is missing or is not a symlink,
    or (ELOOP) if the parent chain revisits a GFID.
    """
    root_link_dir = "../../00/00/%s" % ROOT_GFID
    visited = set()
    out_path = ""
    while True:
        if gfid in visited:
            # A corrupted backend could make the chain cyclic; fail the same
            # way a broken link does instead of looping forever.
            raise OSError(errno.ELOOP,
                          "cyclic parent chain in .glusterfs", gfid)
        visited.add(gfid)

        path_readlink = os.readlink(_gfid_backend_path(gfid))
        pgfid = os.path.dirname(path_readlink)
        out_path = os.path.join(os.path.basename(path_readlink), out_path)
        if pgfid == root_link_dir:
            return os.path.join("./", out_path)
        gfid = os.path.basename(pgfid)


def find_path_from_changelog(fd, gfid):
    """
    Search one changelog for the entry record that created *gfid*.

    The record searched for has the layout (NUL = a zero byte)

        E<GFID> NUL <3|23> NUL <num> NUL <num> NUL <num> NUL <PARGFID>/<BASENAME>

    NOTE(review): the 3/23 codes and the three numeric fields are presumably
    the entry operation and mode/uid/gid - confirm against the changelog
    format documentation.

    The search is done on bytes because callers open changelogs in binary
    mode; a text stream is accepted too.  PARGFID is converted to a
    directory path with full_dir_path() and BASENAME is appended.

    fd:   open changelog file object; it is read to the end.
    gfid: GFID string of the file being looked up.

    Returns the "./"-prefixed path, or None if no record matches.
    Raises OSError if the parent GFID cannot be resolved.
    """
    content = _to_bytes(fd.read())

    pattern = b"E" + re.escape(_to_bytes(gfid)) + _CHANGELOG_ENTRY_TAIL
    match = re.search(pattern, content)
    if match is None:
        return None

    pgfid = _to_text(match.group(2))
    basename = _to_text(match.group(3))
    if pgfid == ROOT_GFID:
        return os.path.join("./", basename)

    full_path_parent = full_dir_path(pgfid)
    if full_path_parent:
        return os.path.join(full_path_parent, basename)
    return None


def _search_changelogs(gfid, ctime):
    """
    Probe CHANGELOG.<ctime>, CHANGELOG.<ctime + 1>, ... for *gfid*.

    At most CHANGELOG_SEARCH_MAX_TRY names are probed and at most
    MAX_NUM_CHANGELOGS_TRY existing changelogs are parsed.

    Returns the path found, or None.
    """
    parsed = 0
    for _ in range(CHANGELOG_SEARCH_MAX_TRY):
        cl = os.path.join(".glusterfs/changelogs", "CHANGELOG.%s" % ctime)
        try:
            with open(cl, "rb") as f:
                parsed += 1
                path = find_path_from_changelog(f, gfid)
        except (IOError, OSError) as e:
            if e.errno != errno.ENOENT:
                return None
            # No changelog with this timestamp.  ENOENT raised while
            # resolving the parent directory also lands here and likewise
            # moves on to the next timestamp.
            ctime += 1
            continue

        if path:
            return path
        if parsed >= MAX_NUM_CHANGELOGS_TRY:
            return None
        ctime += 1
    return None


def _path_has_gfid(path, gfid):
    """
    Return True if the ``trusted.gfid`` xattr of *path* equals *gfid*.

    A path that cannot be read (for example removed or renamed since the
    changelog was written) or that carries a malformed xattr counts as a
    mismatch rather than aborting the whole run.
    """
    if xattr is None:
        raise ImportError("the 'xattr' Python module is required")
    try:
        raw_gfid = xattr.get(path, "trusted.gfid")
        return str(uuid.UUID(bytes=raw_gfid)) == gfid
    except (IOError, OSError, ValueError):
        return False


def gfid_to_path(gfid):
    """
    Convert one GFID to a path and print the outcome.

    Try readlink first; it succeeds if the GFID is a directory.  Otherwise
    get the ctime of the GFID file, decrement it by DEC_CTIME_START seconds
    and search for a changelog file name from there on (a changelog is
    generated every 15 sec, so this finds the changelog immediately after
    the file's creation).  Get the path by searching in that changelog,
    then read the resulting file's GFID and compare it with the input; if
    they differ, something has changed since (maybe a rename).

    Must be called with the brick root as the current working directory.

    gfid: GFID string; surrounding whitespace is stripped and a blank
          value is ignored.

    Writes the path to stdout on success, or the GFID to stderr if it
    could not be converted.  Returns None.
    """
    gfid = gfid.strip()
    if not gfid:
        # Blank line in the input list: nothing to convert.
        return

    try:
        dir_path = full_dir_path(gfid)
    except OSError:
        # Not a symlink, so not a directory; fall back to the changelogs.
        pass
    else:
        output_success(dir_path)
        return

    try:
        ctime = int(os.stat(_gfid_backend_path(gfid)).st_ctime)
    except (OSError, IOError):
        output_not_found(gfid)
        return
    ctime -= DEC_CTIME_START

    path = _search_changelogs(gfid, ctime)
    if not path or not _path_has_gfid(path, gfid):
        output_not_found(gfid)
        return

    output_success(path)


def _exit_with_error(message):
    """Write *message* to stderr and exit with status 1."""
    sys.stderr.write("%s\n" % message)
    sys.exit(1)


def main():
    """
    Command-line entry point.

    argv[1] is the brick path.  The GFID list is read from the file named
    by argv[2] when stdin is a terminal, otherwise from stdin (and then no
    second argument is accepted).
    """
    from_stdin = not sys.stdin.isatty()
    num_arguments = 2 if from_stdin else 3

    if len(sys.argv) != num_arguments:
        prog = sys.argv[0]
        sys.stderr.write("Invalid arguments\nUsage: "
                         "%s <BRICK_PATH> <GFID_FILE>\n"
                         "       %s <BRICK_PATH> < GFID_FILE\n"
                         % (prog, prog))
        sys.exit(1)

    if xattr is None:
        _exit_with_error("%s: the 'xattr' Python module is required"
                         % sys.argv[0])

    path = sys.argv[1]
    # Resolve the list file before chdir so a relative name keeps pointing
    # at the file the user meant.
    gfid_list = None if from_stdin else os.path.abspath(sys.argv[2])

    try:
        os.chdir(path)
    except OSError as e:
        _exit_with_error("Unable to change directory to %s: %s" % (path, e))

    if from_stdin:
        for gfid in sys.stdin:
            gfid_to_path(gfid)
        return

    try:
        f = open(gfid_list)
    except (IOError, OSError) as e:
        _exit_with_error("Unable to open %s: %s" % (gfid_list, e))
    with f:
        for gfid in f:
            gfid_to_path(gfid)


if __name__ == "__main__":
    main()