summaryrefslogtreecommitdiffstats
path: root/xlators/cluster/dht/src/dht-rebalance.c
diff options
context:
space:
mode:
authorSusant Palai <spalai@redhat.com>2020-04-27 16:59:16 +0530
committerSusant Palai <spalai@redhat.com>2020-07-31 16:22:23 +0000
commit3af9443c770837abe4f54db399623380ab9767a7 (patch)
tree943b631cc5210a721b90287cea201b03bf78a1fe /xlators/cluster/dht/src/dht-rebalance.c
parentdbff4ecfc18d4d4ad357e2f53806a6caf69d2b65 (diff)
dht: optimize rebalance crawl path
For distribute only volumes we can use the information for local subvolumes to avoid syncop calls which goes through the whole stack to fetch stat and entries. A separate function gf_defrag_fix_layout_puredist is introduced. TODO: A glusterd flag needs to be introduced in case we want to fall back to run the old way. Perf numbers: DirSize - 1Million Old New %diff Depth - 100 (Run 1) 353 74 +377% Depth - 100 (Run 2) 348 72 +377~% Depth - 50 246 122 +100% Depth - 3 174 114 +52% Change-Id: I67cc136cebd34092fd775e69f74c2d5b33d3156d Fixes: #1242 Signed-off-by: Susant Palai <spalai@redhat.com>
Diffstat (limited to 'xlators/cluster/dht/src/dht-rebalance.c')
-rw-r--r--xlators/cluster/dht/src/dht-rebalance.c400
1 files changed, 395 insertions, 5 deletions
diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c
index c141ffce90d..d850eef62ab 100644
--- a/xlators/cluster/dht/src/dht-rebalance.c
+++ b/xlators/cluster/dht/src/dht-rebalance.c
@@ -14,6 +14,7 @@
#include <signal.h>
#include <glusterfs/events.h>
#include "glusterfs/compat-errno.h" // for ENODATA on BSD
+#include <string.h>
#define GF_DISK_SECTOR_SIZE 512
#define DHT_REBALANCE_PID 4242 /* Change it if required */
@@ -4052,6 +4053,368 @@ out:
}
int
+gf_defrag_fix_layout_puredist(xlator_t *this, gf_defrag_info_t *defrag,
+ loc_t *loc, dict_t *fix_layout,
+ dict_t *migrate_data)
+{
+ int ret = -1;
+ loc_t entry_loc = {
+ 0,
+ };
+ fd_t *fd = NULL;
+ inode_t *linked_inode = NULL, *inode = NULL;
+ dht_conf_t *conf = NULL;
+ int should_commit_hash = 1;
+ int perrno = 0;
+ /* absolute brick path length */
+ int brick_len = 0;
+ /* dir path length (relative to gluster mount) */
+ int dir_len = 0;
+ /* absolute dir path length */
+ int total_len = 0;
+ struct dirent *entry = NULL;
+ struct dirent scratch[2] = {{
+ 0,
+ }};
+ DIR *dirp = NULL;
+ int full_entry_length = 0;
+ int entry_len = 0;
+ char full_entry_path[4096] = {
+ 0,
+ };
+ char full_dir_path[4096] = {
+ 0,
+ };
+ ssize_t size = 0;
+ uuid_t tmp_gfid;
+ struct stat tmpbuf = {
+ 0,
+ };
+ struct iatt iatt = {
+ 0,
+ };
+
+ struct stat lstatbuf = {
+ 0,
+ };
+ struct iatt stbuf = {
+ 0,
+ };
+
+ conf = this->private;
+ if (!conf) {
+ ret = -1;
+ goto out;
+ }
+
+ /*
+ * Since the primary target for the following lookup is to figure out if the
+ * entry still exists, going to do a direct stat call rather than going
+ * through the whole gluster stack. There are some benefits of doing gluster
+ * lookup, but this is redundant since we have done already one gluster
+ * lookup in the parent function.
+ *
+ * Randomly selecting the first local subvol to read, since it is expected
+ * that the directory structure is present in all the subvols identically
+ */
+
+ brick_len = strlen(defrag->local_brick_paths[0]);
+ /* discarding the first "/" */
+ dir_len = strlen(loc->path) - 1;
+ /* Extra two: one for "/" at the end and one more for '\0'*/
+ total_len = brick_len + dir_len + 2;
+
+ snprintf(full_dir_path, total_len, "%s%s/", defrag->local_brick_paths[0],
+ loc->path + 1);
+
+ ret = sys_lstat(full_dir_path, &tmpbuf);
+ if (ret == -1) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "[absolutepath %s] directory "
+ "not found, path %s error %d",
+ full_dir_path, loc->path, errno);
+ goto out;
+ }
+
+ dirp = sys_opendir(full_dir_path);
+ if (!dirp) {
+ ret = -1;
+ gf_msg(this->name, GF_LOG_ERROR, errno, 0, "failed to open dir : %s",
+ loc->path);
+ if (conf->decommission_subvols_cnt) {
+ defrag->total_failures++;
+ }
+ goto out;
+ }
+
+ while ((entry = sys_readdir(dirp, scratch)) != NULL) {
+ if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED) {
+ ret = 1;
+ goto out;
+ }
+ if (!strcmp(entry->d_name, ".") || !strcmp(entry->d_name, "..") ||
+ !strcmp(entry->d_name, ".glusterfs"))
+ continue;
+
+ /* TODO: Need to add a check for _DIRENT_HAVE_D_TYPE flag to fall back
+ to stat in case d_type is not defined */
+ if (entry->d_type != DT_DIR) {
+ continue;
+ }
+
+ entry_len = strlen(entry->d_name);
+ full_entry_length = total_len + entry_len + 1; /* one more for "/"*/
+
+ snprintf(full_entry_path, full_entry_length, "%s%s/", full_dir_path,
+ entry->d_name);
+
+ size = sys_lgetxattr(full_entry_path, GFID_XATTR_KEY, tmp_gfid, 16);
+ if (size != 16) {
+ gf_log(this->name, GF_LOG_ERROR, "gfid not found, path %s",
+ full_entry_path);
+ continue;
+ }
+
+ loc_wipe(&entry_loc);
+
+ ret = dht_build_child_loc(this, &entry_loc, loc, entry->d_name);
+ if (ret) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "Child loc"
+ " build failed for entry: %s",
+ entry->d_name);
+
+ if (conf->decommission_in_progress) {
+ defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
+
+ goto out;
+ } else {
+ should_commit_hash = 0;
+
+ continue;
+ }
+ }
+
+ if (gf_uuid_is_null(tmp_gfid)) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "%s/%s"
+ " gfid not present",
+ loc->path, entry->d_name);
+ continue;
+ }
+
+ gf_uuid_copy(entry_loc.gfid, tmp_gfid);
+
+ /*In case the gfid stored in the inode by inode_link
+ *and the gfid obtained in the lookup differs, then
+ *client3_3_lookup_cbk will return ESTALE and proper
+ *error will be captured.
+ */
+ memset(&lstatbuf, 0, sizeof(struct stat));
+ ret = sys_lstat(full_entry_path, &lstatbuf);
+ if (ret == -1) {
+ gf_msg(this->name, GF_LOG_ERROR, errno, 0, "lstat failed for %s",
+ entry->d_name);
+ }
+
+ memset(&stbuf, 0, sizeof(struct iatt));
+ iatt_from_stat(&stbuf, &lstatbuf);
+ gf_uuid_copy(stbuf.ia_gfid, entry_loc.gfid);
+ linked_inode = inode_link(entry_loc.inode, loc->inode, entry->d_name,
+ &stbuf);
+
+ inode = entry_loc.inode;
+ entry_loc.inode = linked_inode;
+ inode_unref(inode);
+
+ if (gf_uuid_is_null(loc->gfid)) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "%s/%s"
+ " gfid not present",
+ loc->path, entry->d_name);
+ continue;
+ }
+
+ gf_uuid_copy(entry_loc.pargfid, loc->gfid);
+
+ ret = syncop_lookup(this, &entry_loc, &iatt, NULL, NULL, NULL);
+ if (ret) {
+ if (-ret == ENOENT || -ret == ESTALE) {
+ gf_msg(this->name, GF_LOG_INFO, -ret, DHT_MSG_DIR_LOOKUP_FAILED,
+ "Dir:%s renamed or removed. "
+ "Skipping",
+ loc->path);
+ ret = 0;
+ if (conf->decommission_subvols_cnt) {
+ defrag->total_failures++;
+ }
+ continue;
+ } else {
+ gf_msg(this->name, GF_LOG_ERROR, -ret,
+ DHT_MSG_DIR_LOOKUP_FAILED, "lookup failed for:%s",
+ entry_loc.path);
+
+ defrag->total_failures++;
+
+ if (conf->decommission_in_progress) {
+ defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
+ ret = -1;
+ goto out;
+ } else {
+ should_commit_hash = 0;
+ continue;
+ }
+ }
+ }
+
+ /* A return value of 2 means, either process_dir or
+ * lookup of a dir failed. Hence, don't commit hash
+ * for the current directory*/
+
+ ret = gf_defrag_fix_layout_puredist(this, defrag, &entry_loc,
+ fix_layout, migrate_data);
+
+ if (defrag->defrag_status == GF_DEFRAG_STATUS_STOPPED) {
+ goto out;
+ }
+
+ if (ret && ret != 2) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LAYOUT_FIX_FAILED,
+ "Fix layout failed for %s", entry_loc.path);
+
+ defrag->total_failures++;
+
+ if (conf->decommission_in_progress) {
+ defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
+
+ goto out;
+ } else {
+ /* Let's not commit-hash if
+ * gf_defrag_fix_layout failed*/
+ continue;
+ }
+ }
+ }
+
+ ret = sys_closedir(dirp);
+ if (ret) {
+ gf_msg_debug(this->name, 0,
+ "Failed to close dir %s. Reason :"
+ " %s",
+ full_dir_path, strerror(errno));
+ ret = 0;
+ }
+
+ dirp = NULL;
+
+ /* A directory layout is fixed only after its subdirs are healed to
+ * any newly added bricks. If the layout is fixed before subdirs are
+ * healed, the newly added brick will get a non-null layout.
+ * Any subdirs which hash to that layout will no longer show up
+ * in a directory listing until they are healed.
+ */
+
+ ret = syncop_setxattr(this, loc, fix_layout, 0, NULL, NULL);
+
+ /* In case of a race where the directory is deleted just before
+ * layout setxattr, the errors are updated in the layout structure.
+ * We can use this information to make a decision whether the directory
+ * is deleted entirely.
+ */
+ if (ret == 0) {
+ ret = dht_dir_layout_error_check(this, loc->inode);
+ ret = -ret;
+ }
+
+ if (ret) {
+ if (-ret == ENOENT || -ret == ESTALE) {
+ gf_msg(this->name, GF_LOG_INFO, -ret, DHT_MSG_LAYOUT_FIX_FAILED,
+ "Setxattr failed. Dir %s "
+ "renamed or removed",
+ loc->path);
+ if (conf->decommission_subvols_cnt) {
+ defrag->total_failures++;
+ }
+ ret = 0;
+ goto out;
+ } else {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_LAYOUT_FIX_FAILED,
+ "Setxattr failed for %s", loc->path);
+
+ defrag->total_failures++;
+
+ if (conf->decommission_in_progress) {
+ defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
+ ret = -1;
+ goto out;
+ }
+ }
+ }
+
+ if ((defrag->cmd != GF_DEFRAG_CMD_START_TIER) &&
+ (defrag->cmd != GF_DEFRAG_CMD_START_LAYOUT_FIX)) {
+ ret = gf_defrag_process_dir(this, defrag, loc, migrate_data, &perrno);
+
+ if (ret && (ret != 2)) {
+ if (perrno == ENOENT || perrno == ESTALE) {
+ ret = 0;
+ goto out;
+ } else {
+ defrag->total_failures++;
+
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ DHT_MSG_DEFRAG_PROCESS_DIR_FAILED,
+ "gf_defrag_process_dir failed for "
+ "directory: %s",
+ loc->path);
+
+ if (conf->decommission_in_progress) {
+ goto out;
+ }
+
+ should_commit_hash = 0;
+ }
+ } else if (ret == 2) {
+ should_commit_hash = 0;
+ }
+ }
+
+ gf_msg_trace(this->name, 0, "fix layout called on %s", loc->path);
+
+ if (should_commit_hash &&
+ gf_defrag_settle_hash(this, defrag, loc, fix_layout) != 0) {
+ defrag->total_failures++;
+
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SETTLE_HASH_FAILED,
+ "Settle hash failed for %s", loc->path);
+
+ ret = -1;
+
+ if (conf->decommission_in_progress) {
+ defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
+ goto out;
+ }
+ }
+
+ ret = 0;
+out:
+ loc_wipe(&entry_loc);
+
+ if (fd)
+ fd_unref(fd);
+
+ if (ret == 0 && should_commit_hash == 0) {
+ ret = 2;
+ }
+
+ if (dirp) {
+ sys_closedir(dirp);
+ }
+
+ return ret;
+}
+
+int
dht_init_local_subvols_and_nodeuuids(xlator_t *this, dht_conf_t *conf,
loc_t *loc)
{
@@ -4405,6 +4768,7 @@ gf_defrag_start_crawl(void *data)
pthread_t *tid = NULL;
pthread_t filecnt_thread;
gf_boolean_t fc_thread_started = _gf_false;
+ int i = 0;
this = data;
if (!this)
@@ -4539,6 +4903,12 @@ gf_defrag_start_crawl(void *data)
goto out;
}
+ ret = dht_get_brick_paths(this, conf, &loc);
+ if (ret) {
+ gf_log(this->name, GF_LOG_WARNING, "could not get brick path");
+ ret = 0;
+ }
+
/* Initialise the structures required for parallel migration */
ret = gf_defrag_parallel_migration_init(this, defrag, &tid,
&thread_index);
@@ -4556,11 +4926,23 @@ gf_defrag_start_crawl(void *data)
}
}
- ret = gf_defrag_fix_layout(this, defrag, &loc, fix_layout, migrate_data);
- if (ret && ret != 2) {
- defrag->total_failures++;
- ret = -1;
- goto out;
+ /* TODO: Need to introduce a flag to safely operate in the old way */
+ if (defrag->operate_dist && defrag->is_pure_distribute) {
+ ret = gf_defrag_fix_layout_puredist(this, defrag, &loc, fix_layout,
+ migrate_data);
+ if (ret && ret != 2) {
+ defrag->total_failures++;
+ ret = -1;
+ goto out;
+ }
+ } else {
+ ret = gf_defrag_fix_layout(this, defrag, &loc, fix_layout,
+ migrate_data);
+ if (ret && ret != 2) {
+ defrag->total_failures++;
+ ret = -1;
+ goto out;
+ }
}
if (ret != 2 &&
@@ -4606,6 +4988,14 @@ out:
}
UNLOCK(&defrag->lock);
+ for (i = 0; i < conf->local_subvols_cnt; i++) {
+ if (defrag->local_brick_paths[i]) {
+ GF_FREE(defrag->local_brick_paths[i]);
+ }
+ }
+
+ GF_FREE(defrag->local_brick_paths);
+
GF_FREE(defrag);
conf->defrag = NULL;