From 3af9443c770837abe4f54db399623380ab9767a7 Mon Sep 17 00:00:00 2001
From: Susant Palai
Date: Mon, 27 Apr 2020 16:59:16 +0530
Subject: dht: optimize rebalance crawl path

For distribute-only volumes we can use the information from the local
subvolumes to avoid syncop calls, which go through the whole stack, to
fetch stat and entries. A separate function
gf_defrag_fix_layout_puredist is introduced.

TODO: A glusterd flag needs to be introduced in case we want to fall
back to the old way.

Perf numbers (DirSize = 1 million):

                        Old    New    %diff
  Depth - 100 (Run 1)   353     74    +377%
  Depth - 100 (Run 2)   348     72    +377~%
  Depth - 50            246    122    +100%
  Depth - 3             174    114    +52%

Change-Id: I67cc136cebd34092fd775e69f74c2d5b33d3156d
Fixes: #1242
Signed-off-by: Susant Palai
---
 xlators/cluster/dht/src/dht-common.c    | 114 +++++++++
 xlators/cluster/dht/src/dht-common.h    |  11 +
 xlators/cluster/dht/src/dht-rebalance.c | 400 +++++++++++++++++++++++++++++++-
 xlators/cluster/dht/src/dht-shared.c    |   4 +
 4 files changed, 524 insertions(+), 5 deletions(-)

diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c
index 65c9c0b0a31..be92236e3bd 100644
--- a/xlators/cluster/dht/src/dht-common.c
+++ b/xlators/cluster/dht/src/dht-common.c
@@ -11465,3 +11465,117 @@ dht_dir_layout_error_check(xlator_t *this, inode_t *inode)
     /* Returning the first xlator error as all xlators have errors */
     return layout->list[0].err;
 }
+
+/* Get brick paths from all the local subvols and store them for use.
+ *
+ * TODO: Make sure a newly added brick is not picked for migration.
+ * Otherwise there will be no rebalance, as directory entries won't be
+ * present on a newly added brick. */
+int
+dht_get_brick_paths(xlator_t *this, dht_conf_t *conf, loc_t *loc)
+{
+    dict_t *dict = NULL;
+    gf_defrag_info_t *defrag = conf->defrag;
+    char *key = NULL;
+    char *tmp = NULL;
+    char *str = NULL;
+    char *token;
+    char *saveptr = NULL;
+    int i = 1;
+    int j = 0;
+    int ret = 0;
+
+    key = gf_strdup("glusterfs.pathinfo");
+    if (!key) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, 0,
+               "failed to allocate "
+               "memory");
+        ret = -1;
+        goto out;
+    }
+
+    defrag->local_brick_paths = GF_CALLOC(conf->local_subvols_cnt,
+                                          sizeof(*defrag->local_brick_paths),
+                                          gf_common_mt_pointer);
+
+    for (j = 0; j < conf->local_subvols_cnt; j++) {
+        ret = syncop_getxattr(conf->local_subvols[j], loc, &dict, key, NULL,
+                              NULL);
+        if (ret < 0) {
+            gf_msg(this->name, GF_LOG_WARNING, -ret, 0,
+                   "failed to get path,"
+                   " err %d",
+                   -ret);
+            /* TODO: We need not break out from here and can resume operation.
+             * We need a placeholder in gf_defrag_info_t to mark which
+             * local_brick_paths we are working on. Right now, we blindly
+             * take defrag->local_brick_path[0]. This can be dynamic based on
+             * need. */
+            goto out;
+        }
+
+        str = NULL;
+        ret = dict_get_str(dict, key, &str);
+        if (ret != 0) {
+            gf_msg(this->name, GF_LOG_ERROR, -ret, 0, "dict get failed for: %s",
+                   key);
+            goto out;
+        }
+        if (str == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, 0, "key:%s not found", key);
+            ret = -1;
+            goto out;
+        }
+
+        if (!defrag->is_pure_distribute) {
+            tmp = strstr(str, "REPLICATE");
+            if (tmp) {
+                defrag->is_pure_distribute = _gf_false;
+                break;
+            }
+
+            /* TODO: fetching glusterfs.pathinfo on an erasure volume is failing.
+             * Fall back to the old way till we get it resolved. */
+            tmp = strstr(str, "ERASURE");
+            if (tmp) {
+                defrag->is_pure_distribute = _gf_false;
+                break;
+            }
+
+            defrag->is_pure_distribute = _gf_true;
+        }
+
+        saveptr = NULL;
+
+        for (token = strtok_r(str, ":", &saveptr), i = 1; token;) {
+            token = strtok_r(NULL, ":", &saveptr);
+            i++;
+            if (i == 3) {
+                token = strtok_r(token, ">", &saveptr);
+                break;
+            } else {
+                continue;
+            }
+        }
+
+        defrag->local_brick_paths[j] = gf_strdup(token);
+    }
+
+out:
+    if (ret < 0) {
+        gf_msg(this->name, GF_LOG_INFO, 0, 0,
+               "failed to get brick path. "
+               "Will operate old way");
+        for (j = 0; defrag->local_brick_paths && j < conf->local_subvols_cnt; j++) {
+            GF_FREE(defrag->local_brick_paths[j]);
+        }
+        defrag->is_pure_distribute = _gf_false;
+    }
+
+    if (defrag->is_pure_distribute) {
+        gf_msg(this->name, GF_LOG_INFO, 0, 0, "volume type: pure distribute");
+    }
+
+    GF_FREE(key);
+    return ret;
+}
diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h
index 028c6ac6b9f..84891406c71 100644
--- a/xlators/cluster/dht/src/dht-common.h
+++ b/xlators/cluster/dht/src/dht-common.h
@@ -598,6 +598,15 @@ struct gf_defrag_info_ {
     gf_boolean_t stats;
     /* lock migration flag */
     gf_boolean_t lock_migration_enabled;
+
+    /* local system crawl */
+    char **local_brick_paths;
+
+    /* whether the volume is pure distribute */
+    gf_boolean_t is_pure_distribute;
+
+    /* TODO: Introduce a glusterd option to tune this behaviour */
+    gf_boolean_t operate_dist;
 };
 
 typedef struct gf_defrag_info_ gf_defrag_info_t;
@@ -1482,4 +1491,6 @@
 dht_set_parent_layout_in_dict(loc_t *loc, xlator_t *this, dht_local_t *local);
 int
 dht_dir_layout_error_check(xlator_t *this, inode_t *inode);
+int
+dht_get_brick_paths(xlator_t *this, dht_conf_t *conf, loc_t *loc);
 #endif /* _DHT_H */
diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c
index c141ffce90d..d850eef62ab 100644
--- a/xlators/cluster/dht/src/dht-rebalance.c
+++ b/xlators/cluster/dht/src/dht-rebalance.c
@@ -14,6 +14,7 @@
 #include
 #include
 #include "glusterfs/compat-errno.h" // for ENODATA on BSD
+#include
 
 #define GF_DISK_SECTOR_SIZE 512
 #define DHT_REBALANCE_PID 4242 /* Change it if required */
@@ -4051,6 +4052,368 @@ out:
     return ret;
 }
 
+int
+gf_defrag_fix_layout_puredist(xlator_t *this, gf_defrag_info_t *defrag,
+                              loc_t *loc, dict_t *fix_layout,
+                              dict_t *migrate_data)
+{
+    int ret = -1;
+    loc_t entry_loc = {
+        0,
+    };
+    fd_t *fd = NULL;
+    inode_t *linked_inode = NULL, *inode = NULL;
+    dht_conf_t *conf = NULL;
+    int should_commit_hash = 1;
+    int perrno = 0;
+    /* absolute brick path length */
+    int brick_len = 0;
+    /* dir path length (relative to gluster mount) */
+    int dir_len = 0;
+    /* absolute dir path length */
+    int total_len = 0;
+    struct dirent *entry = NULL;
+    struct dirent scratch[2] = {{
+        0,
+    }};
+    DIR *dirp = NULL;
+    int full_entry_length = 0;
+    int entry_len = 0;
+    char full_entry_path[4096] = {
+        0,
+    };
+    char full_dir_path[4096] = {
+        0,
+    };
+    ssize_t size = 0;
+    uuid_t tmp_gfid;
+    struct stat tmpbuf = {
+        0,
+    };
+    struct iatt iatt = {
+        0,
+    };
+
+    struct stat lstatbuf = {
+        0,
+    };
+    struct iatt stbuf = {
+        0,
+    };
+
+    conf = this->private;
+    if (!conf) {
+        ret = -1;
+        goto out;
+    }
+
+    /*
+     * Since the primary target for the following lookup is to figure out if the
+     * entry still exists, going to do a direct stat call rather than going
+     * through the whole gluster stack. There are some benefits of doing a
+     * gluster lookup, but this is redundant since we have already done one
+     * gluster lookup in the parent function.
+     *
+     * Randomly selecting the first local subvol to read, since it is expected
+     * that the directory structure is present in all the subvols identically.
+     */
+
+    brick_len = strlen(defrag->local_brick_paths[0]);
+    /* discarding the first "/" */
+    dir_len = strlen(loc->path) - 1;
+    /* Extra two: one for "/" at the end and one more for '\0' */
+    total_len = brick_len + dir_len + 2;
+
+    snprintf(full_dir_path, total_len, "%s%s/", defrag->local_brick_paths[0],
+             loc->path + 1);
+
+    ret = sys_lstat(full_dir_path, &tmpbuf);
+    if (ret == -1) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "[absolutepath %s] directory "
+               "not found, path %s error %d",
+               full_dir_path, loc->path, errno);
+        goto out;
+    }
+
+    dirp = sys_opendir(full_dir_path);
+    if (!dirp) {
+        ret = -1;
+        gf_msg(this->name, GF_LOG_ERROR, errno, 0, "failed to open dir : %s",
+               loc->path);
+        if (conf->decommission_subvols_cnt) {
+            defrag->total_failures++;
+        }
+        goto out;
+    }
+
+    while ((entry = sys_readdir(dirp, scratch)) != NULL) {
+        if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED) {
+            ret = 1;
+            goto out;
+        }
+        if (!strcmp(entry->d_name, ".") || !strcmp(entry->d_name, "..") ||
+            !strcmp(entry->d_name, ".glusterfs"))
+            continue;
+
+        /* TODO: Need to add a check for _DIRENT_HAVE_D_TYPE flag to fall back
+           to stat in case d_type is not defined */
+        if (entry->d_type != DT_DIR) {
+            continue;
+        }
+
+        entry_len = strlen(entry->d_name);
+        full_entry_length = total_len + entry_len + 1; /* one more for "/" */
+
+        snprintf(full_entry_path, full_entry_length, "%s%s/", full_dir_path,
+                 entry->d_name);
+
+        size = sys_lgetxattr(full_entry_path, GFID_XATTR_KEY, tmp_gfid, 16);
+        if (size != 16) {
+            gf_log(this->name, GF_LOG_ERROR, "gfid not found, path %s",
+                   full_entry_path);
+            continue;
+        }
+
+        loc_wipe(&entry_loc);
+
+        ret = dht_build_child_loc(this, &entry_loc, loc, entry->d_name);
+        if (ret) {
+            gf_log(this->name, GF_LOG_ERROR,
+                   "Child loc"
+                   " build failed for entry: %s",
+                   entry->d_name);
+
+            if (conf->decommission_in_progress) {
+                defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
+
+                goto out;
+            } else {
+                should_commit_hash = 0;
+
+                continue;
+            }
+        }
+
+        if (gf_uuid_is_null(tmp_gfid)) {
+            gf_log(this->name, GF_LOG_ERROR,
+                   "%s/%s"
+                   " gfid not present",
+                   loc->path, entry->d_name);
+            continue;
+        }
+
+        gf_uuid_copy(entry_loc.gfid, tmp_gfid);
+
+        /* In case the gfid stored in the inode by inode_link
+         * and the gfid obtained in the lookup differ,
+         * client3_3_lookup_cbk will return ESTALE and a proper
+         * error will be captured.
+         */
+        memset(&lstatbuf, 0, sizeof(struct stat));
+        ret = sys_lstat(full_entry_path, &lstatbuf);
+        if (ret == -1) {
+            gf_msg(this->name, GF_LOG_ERROR, errno, 0, "lstat failed for %s",
+                   entry->d_name);
+        }
+
+        memset(&stbuf, 0, sizeof(struct iatt));
+        iatt_from_stat(&stbuf, &lstatbuf);
+        gf_uuid_copy(stbuf.ia_gfid, entry_loc.gfid);
+        linked_inode = inode_link(entry_loc.inode, loc->inode, entry->d_name,
+                                  &stbuf);
+
+        inode = entry_loc.inode;
+        entry_loc.inode = linked_inode;
+        inode_unref(inode);
+
+        if (gf_uuid_is_null(loc->gfid)) {
+            gf_log(this->name, GF_LOG_ERROR,
+                   "%s/%s"
+                   " gfid not present",
+                   loc->path, entry->d_name);
+            continue;
+        }
+
+        gf_uuid_copy(entry_loc.pargfid, loc->gfid);
+
+        ret = syncop_lookup(this, &entry_loc, &iatt, NULL, NULL, NULL);
+        if (ret) {
+            if (-ret == ENOENT || -ret == ESTALE) {
+                gf_msg(this->name, GF_LOG_INFO, -ret, DHT_MSG_DIR_LOOKUP_FAILED,
+                       "Dir:%s renamed or removed. "
+                       "Skipping",
+                       loc->path);
+                ret = 0;
+                if (conf->decommission_subvols_cnt) {
+                    defrag->total_failures++;
+                }
+                continue;
+            } else {
+                gf_msg(this->name, GF_LOG_ERROR, -ret,
+                       DHT_MSG_DIR_LOOKUP_FAILED, "lookup failed for:%s",
+                       entry_loc.path);
+
+                defrag->total_failures++;
+
+                if (conf->decommission_in_progress) {
+                    defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
+                    ret = -1;
+                    goto out;
+                } else {
+                    should_commit_hash = 0;
+                    continue;
+                }
+            }
+        }
+
+        /* A return value of 2 means either process_dir or
+         * lookup of a dir failed. Hence, don't commit hash
+         * for the current directory. */
+
+        ret = gf_defrag_fix_layout_puredist(this, defrag, &entry_loc,
+                                            fix_layout, migrate_data);
+
+        if (defrag->defrag_status == GF_DEFRAG_STATUS_STOPPED) {
+            goto out;
+        }
+
+        if (ret && ret != 2) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LAYOUT_FIX_FAILED,
+                   "Fix layout failed for %s", entry_loc.path);
+
+            defrag->total_failures++;
+
+            if (conf->decommission_in_progress) {
+                defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
+
+                goto out;
+            } else {
+                /* Let's not commit-hash if
+                 * gf_defrag_fix_layout failed */
+                continue;
+            }
+        }
+    }
+
+    ret = sys_closedir(dirp);
+    if (ret) {
+        gf_msg_debug(this->name, 0,
+                     "Failed to close dir %s. Reason :"
+                     " %s",
+                     full_dir_path, strerror(errno));
+        ret = 0;
+    }
+
+    dirp = NULL;
+
+    /* A directory layout is fixed only after its subdirs are healed to
+     * any newly added bricks. If the layout is fixed before subdirs are
+     * healed, the newly added brick will get a non-null layout.
+     * Any subdirs which hash to that layout will no longer show up
+     * in a directory listing until they are healed.
+     */
+
+    ret = syncop_setxattr(this, loc, fix_layout, 0, NULL, NULL);
+
+    /* In case of a race where the directory is deleted just before
+     * layout setxattr, the errors are updated in the layout structure.
+     * We can use this information to make a decision whether the directory
+     * is deleted entirely.
+     */
+    if (ret == 0) {
+        ret = dht_dir_layout_error_check(this, loc->inode);
+        ret = -ret;
+    }
+
+    if (ret) {
+        if (-ret == ENOENT || -ret == ESTALE) {
+            gf_msg(this->name, GF_LOG_INFO, -ret, DHT_MSG_LAYOUT_FIX_FAILED,
+                   "Setxattr failed. Dir %s "
+                   "renamed or removed",
+                   loc->path);
+            if (conf->decommission_subvols_cnt) {
+                defrag->total_failures++;
+            }
+            ret = 0;
+            goto out;
+        } else {
+            gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_LAYOUT_FIX_FAILED,
+                   "Setxattr failed for %s", loc->path);
+
+            defrag->total_failures++;
+
+            if (conf->decommission_in_progress) {
+                defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
+                ret = -1;
+                goto out;
+            }
+        }
+    }
+
+    if ((defrag->cmd != GF_DEFRAG_CMD_START_TIER) &&
+        (defrag->cmd != GF_DEFRAG_CMD_START_LAYOUT_FIX)) {
+        ret = gf_defrag_process_dir(this, defrag, loc, migrate_data, &perrno);
+
+        if (ret && (ret != 2)) {
+            if (perrno == ENOENT || perrno == ESTALE) {
+                ret = 0;
+                goto out;
+            } else {
+                defrag->total_failures++;
+
+                gf_msg(this->name, GF_LOG_ERROR, 0,
+                       DHT_MSG_DEFRAG_PROCESS_DIR_FAILED,
+                       "gf_defrag_process_dir failed for "
+                       "directory: %s",
+                       loc->path);
+
+                if (conf->decommission_in_progress) {
+                    goto out;
+                }
+
+                should_commit_hash = 0;
+            }
+        } else if (ret == 2) {
+            should_commit_hash = 0;
+        }
+    }
+
+    gf_msg_trace(this->name, 0, "fix layout called on %s", loc->path);
+
+    if (should_commit_hash &&
+        gf_defrag_settle_hash(this, defrag, loc, fix_layout) != 0) {
+        defrag->total_failures++;
+
+        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SETTLE_HASH_FAILED,
+               "Settle hash failed for %s", loc->path);
+
+        ret = -1;
+
+        if (conf->decommission_in_progress) {
+            defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
+            goto out;
+        }
+    }
+
+    ret = 0;
+out:
+    loc_wipe(&entry_loc);
+
+    if (fd)
+        fd_unref(fd);
+
+    if (ret == 0 && should_commit_hash == 0) {
+        ret = 2;
+    }
+
+    if (dirp) {
+        sys_closedir(dirp);
+    }
+
+    return ret;
+}
+
 int
 dht_init_local_subvols_and_nodeuuids(xlator_t *this, dht_conf_t *conf,
                                      loc_t *loc)
@@ -4405,6 +4768,7 @@ gf_defrag_start_crawl(void *data)
     pthread_t *tid = NULL;
     pthread_t filecnt_thread;
     gf_boolean_t fc_thread_started = _gf_false;
+    int i = 0;
 
     this = data;
     if (!this)
@@ -4539,6 +4903,12 @@ gf_defrag_start_crawl(void *data)
         goto out;
     }
 
+    ret = dht_get_brick_paths(this, conf, &loc);
+    if (ret) {
+        gf_log(this->name, GF_LOG_WARNING, "could not get brick path");
+        ret = 0;
+    }
+
     /* Initialise the structures required for parallel migration */
     ret = gf_defrag_parallel_migration_init(this, defrag, &tid,
                                             &thread_index);
@@ -4556,11 +4926,23 @@ gf_defrag_start_crawl(void *data)
         }
     }
 
-    ret = gf_defrag_fix_layout(this, defrag, &loc, fix_layout, migrate_data);
-    if (ret && ret != 2) {
-        defrag->total_failures++;
-        ret = -1;
-        goto out;
+    /* TODO: Need to introduce a flag to safely operate in the old way */
+    if (defrag->operate_dist && defrag->is_pure_distribute) {
+        ret = gf_defrag_fix_layout_puredist(this, defrag, &loc, fix_layout,
+                                            migrate_data);
+        if (ret && ret != 2) {
+            defrag->total_failures++;
+            ret = -1;
+            goto out;
+        }
+    } else {
+        ret = gf_defrag_fix_layout(this, defrag, &loc, fix_layout,
+                                   migrate_data);
+        if (ret && ret != 2) {
+            defrag->total_failures++;
+            ret = -1;
+            goto out;
+        }
     }
 
     if (ret != 2 &&
@@ -4606,6 +4988,14 @@ out:
     }
     UNLOCK(&defrag->lock);
 
+    for (i = 0; i < conf->local_subvols_cnt; i++) {
+        if (defrag->local_brick_paths && defrag->local_brick_paths[i]) {
+            GF_FREE(defrag->local_brick_paths[i]);
+        }
+    }
+
+    GF_FREE(defrag->local_brick_paths);
+
     GF_FREE(defrag);
     conf->defrag = NULL;
diff --git a/xlators/cluster/dht/src/dht-shared.c b/xlators/cluster/dht/src/dht-shared.c
index d85b4d1ce13..811bb55925f 100644
--- a/xlators/cluster/dht/src/dht-shared.c
+++ b/xlators/cluster/dht/src/dht-shared.c
@@ -700,6 +700,10 @@ dht_init(xlator_t *this)
         pthread_cond_init(&defrag->fc_wakeup_cond, 0);
 
         defrag->global_error = 0;
+
+        defrag->is_pure_distribute = _gf_false;
+
+        defrag->operate_dist = _gf_true;
     }
 
     conf->use_fallocate = 1;
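
Illustration (not part of the commit): the core of the optimization above is
that, on a pure distribute volume, the rebalance crawler can walk a brick's
backend filesystem directly with plain syscalls instead of routing every
readdir/stat through the glusterfs client stack. The standalone sketch below
shows that direct-crawl pattern on Linux: it walks a directory tree, skips the
brick-internal .glusterfs directory, uses d_type to avoid stat calls, and
reads each subdirectory's 16-byte gfid from the trusted.gfid xattr, much as
gf_defrag_fix_layout_puredist does via gluster's sys_* wrappers. The brick
path /bricks/brick1 is a made-up example, and reading trusted.* xattrs
normally requires root.

#include <dirent.h>
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/xattr.h>

static void
crawl(const char *dirpath)
{
    char entry_path[4096];
    unsigned char gfid[16];
    struct dirent *entry;
    DIR *dirp;
    ssize_t size;
    int i;

    dirp = opendir(dirpath);
    if (!dirp) {
        fprintf(stderr, "opendir(%s): %s\n", dirpath, strerror(errno));
        return;
    }

    while ((entry = readdir(dirp)) != NULL) {
        /* Skip ".", ".." and the brick's internal ".glusterfs" tree,
         * exactly as the patch does. */
        if (!strcmp(entry->d_name, ".") || !strcmp(entry->d_name, "..") ||
            !strcmp(entry->d_name, ".glusterfs"))
            continue;

        /* Only directories need a layout fix; d_type spares us a stat call.
         * (The patch carries a TODO for filesystems without d_type.) */
        if (entry->d_type != DT_DIR)
            continue;

        snprintf(entry_path, sizeof(entry_path), "%s/%s", dirpath,
                 entry->d_name);

        /* The posix xlator stores each inode's gfid in a 16-byte xattr. */
        size = lgetxattr(entry_path, "trusted.gfid", gfid, sizeof(gfid));
        if (size != (ssize_t)sizeof(gfid)) {
            fprintf(stderr, "no gfid on %s\n", entry_path);
            continue;
        }

        printf("%s gfid=", entry_path);
        for (i = 0; i < 16; i++)
            printf("%02x", gfid[i]);
        printf("\n");

        crawl(entry_path); /* depth-first, like the rebalance crawl */
    }

    closedir(dirp);
}

int
main(void)
{
    /* Hypothetical brick root; substitute a real brick path. */
    crawl("/bricks/brick1");
    return 0;
}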