summaryrefslogtreecommitdiffstats
path: root/xlators/cluster/dht/src/dht-rebalance.c
diff options
context:
space:
mode:
Diffstat (limited to 'xlators/cluster/dht/src/dht-rebalance.c')
-rw-r--r--xlators/cluster/dht/src/dht-rebalance.c400
1 files changed, 395 insertions, 5 deletions
diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c
index c141ffce90d..d850eef62ab 100644
--- a/xlators/cluster/dht/src/dht-rebalance.c
+++ b/xlators/cluster/dht/src/dht-rebalance.c
@@ -14,6 +14,7 @@
#include <signal.h>
#include <glusterfs/events.h>
#include "glusterfs/compat-errno.h" // for ENODATA on BSD
+#include <string.h>
#define GF_DISK_SECTOR_SIZE 512
#define DHT_REBALANCE_PID 4242 /* Change it if required */
@@ -4052,6 +4053,368 @@ out:
}
int
+gf_defrag_fix_layout_puredist(xlator_t *this, gf_defrag_info_t *defrag,
+ loc_t *loc, dict_t *fix_layout,
+ dict_t *migrate_data)
+{
+ int ret = -1;
+ loc_t entry_loc = {
+ 0,
+ };
+ fd_t *fd = NULL;
+ inode_t *linked_inode = NULL, *inode = NULL;
+ dht_conf_t *conf = NULL;
+ int should_commit_hash = 1;
+ int perrno = 0;
+ /* absolute brick path length */
+ int brick_len = 0;
+ /* dir path length (relative to gluster mount) */
+ int dir_len = 0;
+ /* absolute dir path length */
+ int total_len = 0;
+ struct dirent *entry = NULL;
+ struct dirent scratch[2] = {{
+ 0,
+ }};
+ DIR *dirp = NULL;
+ int full_entry_length = 0;
+ int entry_len = 0;
+ char full_entry_path[4096] = {
+ 0,
+ };
+ char full_dir_path[4096] = {
+ 0,
+ };
+ ssize_t size = 0;
+ uuid_t tmp_gfid;
+ struct stat tmpbuf = {
+ 0,
+ };
+ struct iatt iatt = {
+ 0,
+ };
+
+ struct stat lstatbuf = {
+ 0,
+ };
+ struct iatt stbuf = {
+ 0,
+ };
+
+ conf = this->private;
+ if (!conf) {
+ ret = -1;
+ goto out;
+ }
+
+ /*
+ * Since the primary target for the following lookup is to figure out if the
+ * entry still exists, going to do a direct stat call rather than going
+ * through the whole gluster stack. There are some benefits of doing gluster
+ * lookup, but this is redundant since we have done already one gluster
+ * lookup in the parent function.
+ *
+ * Randomly selecting the first local subvol to read, since it is expected
+ * that the directory structure is present in all the subvols identically
+ */
+
+ brick_len = strlen(defrag->local_brick_paths[0]);
+ /* discarding the first "/" */
+ dir_len = strlen(loc->path) - 1;
+ /* Extra two: one for "/" at the end and one more for '\0'*/
+ total_len = brick_len + dir_len + 2;
+
+ snprintf(full_dir_path, total_len, "%s%s/", defrag->local_brick_paths[0],
+ loc->path + 1);
+
+ ret = sys_lstat(full_dir_path, &tmpbuf);
+ if (ret == -1) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "[absolutepath %s] directory "
+ "not found, path %s error %d",
+ full_dir_path, loc->path, errno);
+ goto out;
+ }
+
+ dirp = sys_opendir(full_dir_path);
+ if (!dirp) {
+ ret = -1;
+ gf_msg(this->name, GF_LOG_ERROR, errno, 0, "failed to open dir : %s",
+ loc->path);
+ if (conf->decommission_subvols_cnt) {
+ defrag->total_failures++;
+ }
+ goto out;
+ }
+
+ while ((entry = sys_readdir(dirp, scratch)) != NULL) {
+ if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED) {
+ ret = 1;
+ goto out;
+ }
+ if (!strcmp(entry->d_name, ".") || !strcmp(entry->d_name, "..") ||
+ !strcmp(entry->d_name, ".glusterfs"))
+ continue;
+
+ /* TODO: Need to add a check for _DIRENT_HAVE_D_TYPE flag to fall back
+ to stat in case d_type is not defined */
+ if (entry->d_type != DT_DIR) {
+ continue;
+ }
+
+ entry_len = strlen(entry->d_name);
+ full_entry_length = total_len + entry_len + 1; /* one more for "/"*/
+
+ snprintf(full_entry_path, full_entry_length, "%s%s/", full_dir_path,
+ entry->d_name);
+
+ size = sys_lgetxattr(full_entry_path, GFID_XATTR_KEY, tmp_gfid, 16);
+ if (size != 16) {
+ gf_log(this->name, GF_LOG_ERROR, "gfid not found, path %s",
+ full_entry_path);
+ continue;
+ }
+
+ loc_wipe(&entry_loc);
+
+ ret = dht_build_child_loc(this, &entry_loc, loc, entry->d_name);
+ if (ret) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "Child loc"
+ " build failed for entry: %s",
+ entry->d_name);
+
+ if (conf->decommission_in_progress) {
+ defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
+
+ goto out;
+ } else {
+ should_commit_hash = 0;
+
+ continue;
+ }
+ }
+
+ if (gf_uuid_is_null(tmp_gfid)) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "%s/%s"
+ " gfid not present",
+ loc->path, entry->d_name);
+ continue;
+ }
+
+ gf_uuid_copy(entry_loc.gfid, tmp_gfid);
+
+ /*In case the gfid stored in the inode by inode_link
+ *and the gfid obtained in the lookup differs, then
+ *client3_3_lookup_cbk will return ESTALE and proper
+ *error will be captured.
+ */
+ memset(&lstatbuf, 0, sizeof(struct stat));
+ ret = sys_lstat(full_entry_path, &lstatbuf);
+ if (ret == -1) {
+ gf_msg(this->name, GF_LOG_ERROR, errno, 0, "lstat failed for %s",
+ entry->d_name);
+ }
+
+ memset(&stbuf, 0, sizeof(struct iatt));
+ iatt_from_stat(&stbuf, &lstatbuf);
+ gf_uuid_copy(stbuf.ia_gfid, entry_loc.gfid);
+ linked_inode = inode_link(entry_loc.inode, loc->inode, entry->d_name,
+ &stbuf);
+
+ inode = entry_loc.inode;
+ entry_loc.inode = linked_inode;
+ inode_unref(inode);
+
+ if (gf_uuid_is_null(loc->gfid)) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "%s/%s"
+ " gfid not present",
+ loc->path, entry->d_name);
+ continue;
+ }
+
+ gf_uuid_copy(entry_loc.pargfid, loc->gfid);
+
+ ret = syncop_lookup(this, &entry_loc, &iatt, NULL, NULL, NULL);
+ if (ret) {
+ if (-ret == ENOENT || -ret == ESTALE) {
+ gf_msg(this->name, GF_LOG_INFO, -ret, DHT_MSG_DIR_LOOKUP_FAILED,
+ "Dir:%s renamed or removed. "
+ "Skipping",
+ loc->path);
+ ret = 0;
+ if (conf->decommission_subvols_cnt) {
+ defrag->total_failures++;
+ }
+ continue;
+ } else {
+ gf_msg(this->name, GF_LOG_ERROR, -ret,
+ DHT_MSG_DIR_LOOKUP_FAILED, "lookup failed for:%s",
+ entry_loc.path);
+
+ defrag->total_failures++;
+
+ if (conf->decommission_in_progress) {
+ defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
+ ret = -1;
+ goto out;
+ } else {
+ should_commit_hash = 0;
+ continue;
+ }
+ }
+ }
+
+ /* A return value of 2 means, either process_dir or
+ * lookup of a dir failed. Hence, don't commit hash
+ * for the current directory*/
+
+ ret = gf_defrag_fix_layout_puredist(this, defrag, &entry_loc,
+ fix_layout, migrate_data);
+
+ if (defrag->defrag_status == GF_DEFRAG_STATUS_STOPPED) {
+ goto out;
+ }
+
+ if (ret && ret != 2) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LAYOUT_FIX_FAILED,
+ "Fix layout failed for %s", entry_loc.path);
+
+ defrag->total_failures++;
+
+ if (conf->decommission_in_progress) {
+ defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
+
+ goto out;
+ } else {
+ /* Let's not commit-hash if
+ * gf_defrag_fix_layout failed*/
+ continue;
+ }
+ }
+ }
+
+ ret = sys_closedir(dirp);
+ if (ret) {
+ gf_msg_debug(this->name, 0,
+ "Failed to close dir %s. Reason :"
+ " %s",
+ full_dir_path, strerror(errno));
+ ret = 0;
+ }
+
+ dirp = NULL;
+
+ /* A directory layout is fixed only after its subdirs are healed to
+ * any newly added bricks. If the layout is fixed before subdirs are
+ * healed, the newly added brick will get a non-null layout.
+ * Any subdirs which hash to that layout will no longer show up
+ * in a directory listing until they are healed.
+ */
+
+ ret = syncop_setxattr(this, loc, fix_layout, 0, NULL, NULL);
+
+ /* In case of a race where the directory is deleted just before
+ * layout setxattr, the errors are updated in the layout structure.
+ * We can use this information to make a decision whether the directory
+ * is deleted entirely.
+ */
+ if (ret == 0) {
+ ret = dht_dir_layout_error_check(this, loc->inode);
+ ret = -ret;
+ }
+
+ if (ret) {
+ if (-ret == ENOENT || -ret == ESTALE) {
+ gf_msg(this->name, GF_LOG_INFO, -ret, DHT_MSG_LAYOUT_FIX_FAILED,
+ "Setxattr failed. Dir %s "
+ "renamed or removed",
+ loc->path);
+ if (conf->decommission_subvols_cnt) {
+ defrag->total_failures++;
+ }
+ ret = 0;
+ goto out;
+ } else {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_LAYOUT_FIX_FAILED,
+ "Setxattr failed for %s", loc->path);
+
+ defrag->total_failures++;
+
+ if (conf->decommission_in_progress) {
+ defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
+ ret = -1;
+ goto out;
+ }
+ }
+ }
+
+ if ((defrag->cmd != GF_DEFRAG_CMD_START_TIER) &&
+ (defrag->cmd != GF_DEFRAG_CMD_START_LAYOUT_FIX)) {
+ ret = gf_defrag_process_dir(this, defrag, loc, migrate_data, &perrno);
+
+ if (ret && (ret != 2)) {
+ if (perrno == ENOENT || perrno == ESTALE) {
+ ret = 0;
+ goto out;
+ } else {
+ defrag->total_failures++;
+
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ DHT_MSG_DEFRAG_PROCESS_DIR_FAILED,
+ "gf_defrag_process_dir failed for "
+ "directory: %s",
+ loc->path);
+
+ if (conf->decommission_in_progress) {
+ goto out;
+ }
+
+ should_commit_hash = 0;
+ }
+ } else if (ret == 2) {
+ should_commit_hash = 0;
+ }
+ }
+
+ gf_msg_trace(this->name, 0, "fix layout called on %s", loc->path);
+
+ if (should_commit_hash &&
+ gf_defrag_settle_hash(this, defrag, loc, fix_layout) != 0) {
+ defrag->total_failures++;
+
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SETTLE_HASH_FAILED,
+ "Settle hash failed for %s", loc->path);
+
+ ret = -1;
+
+ if (conf->decommission_in_progress) {
+ defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
+ goto out;
+ }
+ }
+
+ ret = 0;
+out:
+ loc_wipe(&entry_loc);
+
+ if (fd)
+ fd_unref(fd);
+
+ if (ret == 0 && should_commit_hash == 0) {
+ ret = 2;
+ }
+
+ if (dirp) {
+ sys_closedir(dirp);
+ }
+
+ return ret;
+}
+
+int
dht_init_local_subvols_and_nodeuuids(xlator_t *this, dht_conf_t *conf,
loc_t *loc)
{
@@ -4405,6 +4768,7 @@ gf_defrag_start_crawl(void *data)
pthread_t *tid = NULL;
pthread_t filecnt_thread;
gf_boolean_t fc_thread_started = _gf_false;
+ int i = 0;
this = data;
if (!this)
@@ -4539,6 +4903,12 @@ gf_defrag_start_crawl(void *data)
goto out;
}
+ ret = dht_get_brick_paths(this, conf, &loc);
+ if (ret) {
+ gf_log(this->name, GF_LOG_WARNING, "could not get brick path");
+ ret = 0;
+ }
+
/* Initialise the structures required for parallel migration */
ret = gf_defrag_parallel_migration_init(this, defrag, &tid,
&thread_index);
@@ -4556,11 +4926,23 @@ gf_defrag_start_crawl(void *data)
}
}
- ret = gf_defrag_fix_layout(this, defrag, &loc, fix_layout, migrate_data);
- if (ret && ret != 2) {
- defrag->total_failures++;
- ret = -1;
- goto out;
+ /* TODO: Need to introduce a flag to safely operate in the old way */
+ if (defrag->operate_dist && defrag->is_pure_distribute) {
+ ret = gf_defrag_fix_layout_puredist(this, defrag, &loc, fix_layout,
+ migrate_data);
+ if (ret && ret != 2) {
+ defrag->total_failures++;
+ ret = -1;
+ goto out;
+ }
+ } else {
+ ret = gf_defrag_fix_layout(this, defrag, &loc, fix_layout,
+ migrate_data);
+ if (ret && ret != 2) {
+ defrag->total_failures++;
+ ret = -1;
+ goto out;
+ }
}
if (ret != 2 &&
@@ -4606,6 +4988,14 @@ out:
}
UNLOCK(&defrag->lock);
+ for (i = 0; i < conf->local_subvols_cnt; i++) {
+ if (defrag->local_brick_paths[i]) {
+ GF_FREE(defrag->local_brick_paths[i]);
+ }
+ }
+
+ GF_FREE(defrag->local_brick_paths);
+
GF_FREE(defrag);
conf->defrag = NULL;