summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--xlators/cluster/dht/src/dht-common.c114
-rw-r--r--xlators/cluster/dht/src/dht-common.h11
-rw-r--r--xlators/cluster/dht/src/dht-rebalance.c400
-rw-r--r--xlators/cluster/dht/src/dht-shared.c4
4 files changed, 524 insertions, 5 deletions
diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c
index 65c9c0b0a31..be92236e3bd 100644
--- a/xlators/cluster/dht/src/dht-common.c
+++ b/xlators/cluster/dht/src/dht-common.c
@@ -11465,3 +11465,117 @@ dht_dir_layout_error_check(xlator_t *this, inode_t *inode)
/* Returning the first xlator error as all xlators have errors */
return layout->list[0].err;
}
+
+/* Get brick paths from all the local subvols and store for use.
+ *
+ * TODO: Make sure newly added brick is not picked for migration.
+ * Otherwise there will be no rebalance as directory entries won't be present
+ * on a newly added brick */
+int
+dht_get_brick_paths(xlator_t *this, dht_conf_t *conf, loc_t *loc)
+{
+ dict_t *dict = NULL;
+ gf_defrag_info_t *defrag = conf->defrag;
+ char *key = NULL;
+ char *tmp = NULL;
+ char *str = NULL;
+ char *token;
+ char *saveptr = NULL;
+ int i = 1;
+ int j = 0;
+ int ret = 0;
+
+ key = gf_strdup("glusterfs.pathinfo");
+ if (!key) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, 0,
+ "failed to allocate "
+ "memory");
+ ret = -1;
+ goto out;
+ }
+
+ defrag->local_brick_paths = GF_CALLOC(conf->local_subvols_cnt,
+ sizeof(*defrag->local_brick_paths),
+ gf_common_mt_pointer);
+
+ for (j = 0; j < conf->local_subvols_cnt; j++) {
+ ret = syncop_getxattr(conf->local_subvols[j], loc, &dict, key, NULL,
+ NULL);
+ if (ret == -1) {
+ gf_msg(this->name, GF_LOG_WARNING, 0, 0,
+ "failed to get path,"
+ " errno %d",
+ ret);
+ /* TODO: We need not break out from here and can resume operation.
+ * We need a placeholder in gf_defrag_info_t to mark which
+ * local_brick_paths we are working on. Right now, we blindly
+ * take defrag->local_brick_path[0]. This can be dynamic based on
+ * need */
+ goto out;
+ }
+
+ str = NULL;
+ ret = dict_get_str(dict, key, &str);
+ if (ret != 0) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, 0, "dict get failed for :%s",
+ key);
+ goto out;
+ }
+ if (str == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, 0, "key:%s not found", key);
+ ret = -1;
+ goto out;
+ }
+
+ if (!defrag->is_pure_distribute) {
+ tmp = strstr(str, "REPLICATE");
+ if (tmp) {
+ defrag->is_pure_distribute = _gf_false;
+ break;
+ }
+
+ /*TODO: fetching glusterfs.pathinfo on erasure volume is failing.
+ *Fall back to the old way till we get it resolved */
+ tmp = strstr(str, "ERASURE");
+ if (tmp) {
+ defrag->is_pure_distribute = _gf_false;
+ break;
+ }
+
+ defrag->is_pure_distribute = _gf_true;
+ }
+
+ saveptr = NULL;
+
+ for (token = strtok_r(str, ":", &saveptr), i = 1; token;) {
+ token = strtok_r(NULL, ":", &saveptr);
+ i++;
+ if (i == 3) {
+ token = strtok_r(token, ">", &saveptr);
+ break;
+ } else {
+ continue;
+ }
+ }
+
+ defrag->local_brick_paths[j] = gf_strdup(token);
+ }
+
+out:
+ if (ret == -1) {
+ gf_msg(this->name, GF_LOG_INFO, 0, 0,
+ "failed to get brick path. "
+ "Will operate old way");
+ for (j = 0; j < conf->local_subvols_cnt; j++) {
+ GF_FREE(defrag->local_brick_paths[j]);
+ }
+ defrag->is_pure_distribute = _gf_false;
+ }
+
+ if (defrag->is_pure_distribute) {
+ gf_msg(this->name, GF_LOG_INFO, 0, 0, "volume type : pure distribute");
+ }
+
+ GF_FREE(key);
+ return ret;
+}
diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h
index 028c6ac6b9f..84891406c71 100644
--- a/xlators/cluster/dht/src/dht-common.h
+++ b/xlators/cluster/dht/src/dht-common.h
@@ -598,6 +598,15 @@ struct gf_defrag_info_ {
gf_boolean_t stats;
/* lock migration flag */
gf_boolean_t lock_migration_enabled;
+
+ /* local system crawl */
+ char **local_brick_paths;
+
+ /* whether the volume is pure distribute */
+ gf_boolean_t is_pure_distribute;
+
+ /*TODO: Introduce a glusterd option to tune this behaviour*/
+ gf_boolean_t operate_dist;
};
typedef struct gf_defrag_info_ gf_defrag_info_t;
@@ -1482,4 +1491,6 @@ dht_set_parent_layout_in_dict(loc_t *loc, xlator_t *this, dht_local_t *local);
int
dht_dir_layout_error_check(xlator_t *this, inode_t *inode);
+int
+dht_get_brick_paths(xlator_t *this, dht_conf_t *conf, loc_t *loc);
#endif /* _DHT_H */
diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c
index c141ffce90d..d850eef62ab 100644
--- a/xlators/cluster/dht/src/dht-rebalance.c
+++ b/xlators/cluster/dht/src/dht-rebalance.c
@@ -14,6 +14,7 @@
#include <signal.h>
#include <glusterfs/events.h>
#include "glusterfs/compat-errno.h" // for ENODATA on BSD
+#include <string.h>
#define GF_DISK_SECTOR_SIZE 512
#define DHT_REBALANCE_PID 4242 /* Change it if required */
@@ -4052,6 +4053,368 @@ out:
}
int
+gf_defrag_fix_layout_puredist(xlator_t *this, gf_defrag_info_t *defrag,
+ loc_t *loc, dict_t *fix_layout,
+ dict_t *migrate_data)
+{
+ int ret = -1;
+ loc_t entry_loc = {
+ 0,
+ };
+ fd_t *fd = NULL;
+ inode_t *linked_inode = NULL, *inode = NULL;
+ dht_conf_t *conf = NULL;
+ int should_commit_hash = 1;
+ int perrno = 0;
+ /* absolute brick path length */
+ int brick_len = 0;
+ /* dir path length (relative to gluster mount) */
+ int dir_len = 0;
+ /* absolute dir path length */
+ int total_len = 0;
+ struct dirent *entry = NULL;
+ struct dirent scratch[2] = {{
+ 0,
+ }};
+ DIR *dirp = NULL;
+ int full_entry_length = 0;
+ int entry_len = 0;
+ char full_entry_path[4096] = {
+ 0,
+ };
+ char full_dir_path[4096] = {
+ 0,
+ };
+ ssize_t size = 0;
+ uuid_t tmp_gfid;
+ struct stat tmpbuf = {
+ 0,
+ };
+ struct iatt iatt = {
+ 0,
+ };
+
+ struct stat lstatbuf = {
+ 0,
+ };
+ struct iatt stbuf = {
+ 0,
+ };
+
+ conf = this->private;
+ if (!conf) {
+ ret = -1;
+ goto out;
+ }
+
+ /*
+ * Since the primary target for the following lookup is to figure out if the
+ * entry still exists, going to do a direct stat call rather than going
+ * through the whole gluster stack. There are some benefits of doing gluster
+ * lookup, but this is redundant since we have done already one gluster
+ * lookup in the parent function.
+ *
+ * Arbitrarily selecting the first local subvol to read, since it is expected
+ * that the directory structure is present in all the subvols identically
+ */
+
+ brick_len = strlen(defrag->local_brick_paths[0]);
+ /* discarding the first "/" */
+ dir_len = strlen(loc->path) - 1;
+ /* Extra two: one for "/" at the end and one more for '\0'*/
+ total_len = brick_len + dir_len + 2;
+
+ snprintf(full_dir_path, total_len, "%s%s/", defrag->local_brick_paths[0],
+ loc->path + 1);
+
+ ret = sys_lstat(full_dir_path, &tmpbuf);
+ if (ret == -1) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "[absolutepath %s] directory "
+ "not found, path %s error %d",
+ full_dir_path, loc->path, errno);
+ goto out;
+ }
+
+ dirp = sys_opendir(full_dir_path);
+ if (!dirp) {
+ ret = -1;
+ gf_msg(this->name, GF_LOG_ERROR, errno, 0, "failed to open dir : %s",
+ loc->path);
+ if (conf->decommission_subvols_cnt) {
+ defrag->total_failures++;
+ }
+ goto out;
+ }
+
+ while ((entry = sys_readdir(dirp, scratch)) != NULL) {
+ if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED) {
+ ret = 1;
+ goto out;
+ }
+ if (!strcmp(entry->d_name, ".") || !strcmp(entry->d_name, "..") ||
+ !strcmp(entry->d_name, ".glusterfs"))
+ continue;
+
+ /* TODO: Need to add a check for _DIRENT_HAVE_D_TYPE flag to fall back
+ to stat in case d_type is not defined */
+ if (entry->d_type != DT_DIR) {
+ continue;
+ }
+
+ entry_len = strlen(entry->d_name);
+ full_entry_length = total_len + entry_len + 1; /* one more for "/"*/
+
+ snprintf(full_entry_path, full_entry_length, "%s%s/", full_dir_path,
+ entry->d_name);
+
+ size = sys_lgetxattr(full_entry_path, GFID_XATTR_KEY, tmp_gfid, 16);
+ if (size != 16) {
+ gf_log(this->name, GF_LOG_ERROR, "gfid not found, path %s",
+ full_entry_path);
+ continue;
+ }
+
+ loc_wipe(&entry_loc);
+
+ ret = dht_build_child_loc(this, &entry_loc, loc, entry->d_name);
+ if (ret) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "Child loc"
+ " build failed for entry: %s",
+ entry->d_name);
+
+ if (conf->decommission_in_progress) {
+ defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
+
+ goto out;
+ } else {
+ should_commit_hash = 0;
+
+ continue;
+ }
+ }
+
+ if (gf_uuid_is_null(tmp_gfid)) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "%s/%s"
+ " gfid not present",
+ loc->path, entry->d_name);
+ continue;
+ }
+
+ gf_uuid_copy(entry_loc.gfid, tmp_gfid);
+
+ /*In case the gfid stored in the inode by inode_link
+ *and the gfid obtained in the lookup differs, then
+ *client3_3_lookup_cbk will return ESTALE and proper
+ *error will be captured.
+ */
+ memset(&lstatbuf, 0, sizeof(struct stat));
+ ret = sys_lstat(full_entry_path, &lstatbuf);
+ if (ret == -1) {
+ gf_msg(this->name, GF_LOG_ERROR, errno, 0, "lstat failed for %s",
+ entry->d_name);
+ }
+
+ memset(&stbuf, 0, sizeof(struct iatt));
+ iatt_from_stat(&stbuf, &lstatbuf);
+ gf_uuid_copy(stbuf.ia_gfid, entry_loc.gfid);
+ linked_inode = inode_link(entry_loc.inode, loc->inode, entry->d_name,
+ &stbuf);
+
+ inode = entry_loc.inode;
+ entry_loc.inode = linked_inode;
+ inode_unref(inode);
+
+ if (gf_uuid_is_null(loc->gfid)) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "%s/%s"
+ " gfid not present",
+ loc->path, entry->d_name);
+ continue;
+ }
+
+ gf_uuid_copy(entry_loc.pargfid, loc->gfid);
+
+ ret = syncop_lookup(this, &entry_loc, &iatt, NULL, NULL, NULL);
+ if (ret) {
+ if (-ret == ENOENT || -ret == ESTALE) {
+ gf_msg(this->name, GF_LOG_INFO, -ret, DHT_MSG_DIR_LOOKUP_FAILED,
+ "Dir:%s renamed or removed. "
+ "Skipping",
+ loc->path);
+ ret = 0;
+ if (conf->decommission_subvols_cnt) {
+ defrag->total_failures++;
+ }
+ continue;
+ } else {
+ gf_msg(this->name, GF_LOG_ERROR, -ret,
+ DHT_MSG_DIR_LOOKUP_FAILED, "lookup failed for:%s",
+ entry_loc.path);
+
+ defrag->total_failures++;
+
+ if (conf->decommission_in_progress) {
+ defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
+ ret = -1;
+ goto out;
+ } else {
+ should_commit_hash = 0;
+ continue;
+ }
+ }
+ }
+
+ /* A return value of 2 means, either process_dir or
+ * lookup of a dir failed. Hence, don't commit hash
+ * for the current directory*/
+
+ ret = gf_defrag_fix_layout_puredist(this, defrag, &entry_loc,
+ fix_layout, migrate_data);
+
+ if (defrag->defrag_status == GF_DEFRAG_STATUS_STOPPED) {
+ goto out;
+ }
+
+ if (ret && ret != 2) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LAYOUT_FIX_FAILED,
+ "Fix layout failed for %s", entry_loc.path);
+
+ defrag->total_failures++;
+
+ if (conf->decommission_in_progress) {
+ defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
+
+ goto out;
+ } else {
+ /* Let's not commit-hash if
+ * gf_defrag_fix_layout failed*/
+ continue;
+ }
+ }
+ }
+
+ ret = sys_closedir(dirp);
+ if (ret) {
+ gf_msg_debug(this->name, 0,
+ "Failed to close dir %s. Reason :"
+ " %s",
+ full_dir_path, strerror(errno));
+ ret = 0;
+ }
+
+ dirp = NULL;
+
+ /* A directory layout is fixed only after its subdirs are healed to
+ * any newly added bricks. If the layout is fixed before subdirs are
+ * healed, the newly added brick will get a non-null layout.
+ * Any subdirs which hash to that layout will no longer show up
+ * in a directory listing until they are healed.
+ */
+
+ ret = syncop_setxattr(this, loc, fix_layout, 0, NULL, NULL);
+
+ /* In case of a race where the directory is deleted just before
+ * layout setxattr, the errors are updated in the layout structure.
+ * We can use this information to make a decision whether the directory
+ * is deleted entirely.
+ */
+ if (ret == 0) {
+ ret = dht_dir_layout_error_check(this, loc->inode);
+ ret = -ret;
+ }
+
+ if (ret) {
+ if (-ret == ENOENT || -ret == ESTALE) {
+ gf_msg(this->name, GF_LOG_INFO, -ret, DHT_MSG_LAYOUT_FIX_FAILED,
+ "Setxattr failed. Dir %s "
+ "renamed or removed",
+ loc->path);
+ if (conf->decommission_subvols_cnt) {
+ defrag->total_failures++;
+ }
+ ret = 0;
+ goto out;
+ } else {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_LAYOUT_FIX_FAILED,
+ "Setxattr failed for %s", loc->path);
+
+ defrag->total_failures++;
+
+ if (conf->decommission_in_progress) {
+ defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
+ ret = -1;
+ goto out;
+ }
+ }
+ }
+
+ if ((defrag->cmd != GF_DEFRAG_CMD_START_TIER) &&
+ (defrag->cmd != GF_DEFRAG_CMD_START_LAYOUT_FIX)) {
+ ret = gf_defrag_process_dir(this, defrag, loc, migrate_data, &perrno);
+
+ if (ret && (ret != 2)) {
+ if (perrno == ENOENT || perrno == ESTALE) {
+ ret = 0;
+ goto out;
+ } else {
+ defrag->total_failures++;
+
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ DHT_MSG_DEFRAG_PROCESS_DIR_FAILED,
+ "gf_defrag_process_dir failed for "
+ "directory: %s",
+ loc->path);
+
+ if (conf->decommission_in_progress) {
+ goto out;
+ }
+
+ should_commit_hash = 0;
+ }
+ } else if (ret == 2) {
+ should_commit_hash = 0;
+ }
+ }
+
+ gf_msg_trace(this->name, 0, "fix layout called on %s", loc->path);
+
+ if (should_commit_hash &&
+ gf_defrag_settle_hash(this, defrag, loc, fix_layout) != 0) {
+ defrag->total_failures++;
+
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SETTLE_HASH_FAILED,
+ "Settle hash failed for %s", loc->path);
+
+ ret = -1;
+
+ if (conf->decommission_in_progress) {
+ defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
+ goto out;
+ }
+ }
+
+ ret = 0;
+out:
+ loc_wipe(&entry_loc);
+
+ if (fd)
+ fd_unref(fd);
+
+ if (ret == 0 && should_commit_hash == 0) {
+ ret = 2;
+ }
+
+ if (dirp) {
+ sys_closedir(dirp);
+ }
+
+ return ret;
+}
+
+int
dht_init_local_subvols_and_nodeuuids(xlator_t *this, dht_conf_t *conf,
loc_t *loc)
{
@@ -4405,6 +4768,7 @@ gf_defrag_start_crawl(void *data)
pthread_t *tid = NULL;
pthread_t filecnt_thread;
gf_boolean_t fc_thread_started = _gf_false;
+ int i = 0;
this = data;
if (!this)
@@ -4539,6 +4903,12 @@ gf_defrag_start_crawl(void *data)
goto out;
}
+ ret = dht_get_brick_paths(this, conf, &loc);
+ if (ret) {
+ gf_log(this->name, GF_LOG_WARNING, "could not get brick path");
+ ret = 0;
+ }
+
/* Initialise the structures required for parallel migration */
ret = gf_defrag_parallel_migration_init(this, defrag, &tid,
&thread_index);
@@ -4556,11 +4926,23 @@ gf_defrag_start_crawl(void *data)
}
}
- ret = gf_defrag_fix_layout(this, defrag, &loc, fix_layout, migrate_data);
- if (ret && ret != 2) {
- defrag->total_failures++;
- ret = -1;
- goto out;
+ /* TODO: Need to introduce a flag to safely operate in the old way */
+ if (defrag->operate_dist && defrag->is_pure_distribute) {
+ ret = gf_defrag_fix_layout_puredist(this, defrag, &loc, fix_layout,
+ migrate_data);
+ if (ret && ret != 2) {
+ defrag->total_failures++;
+ ret = -1;
+ goto out;
+ }
+ } else {
+ ret = gf_defrag_fix_layout(this, defrag, &loc, fix_layout,
+ migrate_data);
+ if (ret && ret != 2) {
+ defrag->total_failures++;
+ ret = -1;
+ goto out;
+ }
}
if (ret != 2 &&
@@ -4606,6 +4988,14 @@ out:
}
UNLOCK(&defrag->lock);
+ for (i = 0; i < conf->local_subvols_cnt; i++) {
+ if (defrag->local_brick_paths[i]) {
+ GF_FREE(defrag->local_brick_paths[i]);
+ }
+ }
+
+ GF_FREE(defrag->local_brick_paths);
+
GF_FREE(defrag);
conf->defrag = NULL;
diff --git a/xlators/cluster/dht/src/dht-shared.c b/xlators/cluster/dht/src/dht-shared.c
index d85b4d1ce13..811bb55925f 100644
--- a/xlators/cluster/dht/src/dht-shared.c
+++ b/xlators/cluster/dht/src/dht-shared.c
@@ -700,6 +700,10 @@ dht_init(xlator_t *this)
pthread_cond_init(&defrag->fc_wakeup_cond, 0);
defrag->global_error = 0;
+
+ defrag->is_pure_distribute = _gf_false;
+
+ defrag->operate_dist = _gf_true;
}
conf->use_fallocate = 1;