-rw-r--r--  tests/basic/distribute/rebal-all-nodes-migrate.t | 143
-rw-r--r--  tests/dht.rc                                     |  24
-rw-r--r--  xlators/cluster/dht/src/dht-common.c             |  64
-rw-r--r--  xlators/cluster/dht/src/dht-common.h             |   9
-rw-r--r--  xlators/cluster/dht/src/dht-helper.c             |   8
-rw-r--r--  xlators/cluster/dht/src/dht-mem-types.h          |   1
-rw-r--r--  xlators/cluster/dht/src/dht-rebalance.c          |  88
-rw-r--r--  xlators/cluster/dht/src/tier.c                   |  57
8 files changed, 376 insertions(+), 18 deletions(-)
diff --git a/tests/basic/distribute/rebal-all-nodes-migrate.t b/tests/basic/distribute/rebal-all-nodes-migrate.t
new file mode 100644
index 00000000000..14f0a53b1f8
--- /dev/null
+++ b/tests/basic/distribute/rebal-all-nodes-migrate.t
@@ -0,0 +1,143 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../cluster.rc
+. $(dirname $0)/../../dht.rc
+
+
+# Check if every single rebalance process migrated some files
+
+function cluster_rebal_all_nodes_migrated_files {
+ val=0
+ a=$($CLI_1 volume rebalance $V0 status | grep "completed" | awk '{print $2}');
+# echo $a
+ b=($a)
+ for i in "${b[@]}"
+ do
+# echo "$i";
+ if [ "$i" -eq "0" ]; then
+ echo "false";
+ val=1;
+ fi
+ done
+ echo $val
+}
+
+cleanup
+
+TEST launch_cluster 3;
+TEST $CLI_1 peer probe $H2;
+TEST $CLI_1 peer probe $H3;
+EXPECT_WITHIN $PROBE_TIMEOUT 2 peer_count
+
+
+#Start with a pure distribute volume (multiple bricks on the same node)
+TEST $CLI_1 volume create $V0 $H1:$B1/dist1 $H1:$B1/dist2 $H2:$B2/dist3 $H2:$B2/dist4
+
+TEST $CLI_1 volume start $V0
+$CLI_1 volume info $V0
+
+#TEST $CLI_1 volume set $V0 client-log-level DEBUG
+
+## Mount FUSE
+TEST glusterfs -s $H1 --volfile-id $V0 $M0;
+
+TEST mkdir $M0/dir1 2>/dev/null;
+TEST touch $M0/dir1/file-{1..500}
+
+## Add-brick and run rebalance to force file migration
+TEST $CLI_1 volume add-brick $V0 $H1:$B1/dist5 $H2:$B2/dist6
+
+#Start a rebalance
+TEST $CLI_1 volume rebalance $V0 start force
+
+#volume rebalance status should work
+#TEST $CLI_1 volume rebalance $V0 status
+#$CLI_1 volume rebalance $V0 status
+
+EXPECT_WITHIN $REBALANCE_TIMEOUT "0" cluster_rebalance_completed
+EXPECT "0" cluster_rebal_all_nodes_migrated_files
+$CLI_1 volume rebalance $V0 status
+
+
+TEST umount -f $M0
+TEST $CLI_1 volume stop $V0
+TEST $CLI_1 volume delete $V0
+
+
+##############################################################
+
+# Next, a dist-rep volume
+TEST $CLI_1 volume create $V0 replica 2 $H1:$B1/drep1 $H2:$B2/drep1 $H1:$B1/drep2 $H2:$B2/drep2
+
+TEST $CLI_1 volume start $V0
+$CLI_1 volume info $V0
+
+#TEST $CLI_1 volume set $V0 client-log-level DEBUG
+
+## Mount FUSE
+TEST glusterfs -s $H1 --volfile-id $V0 $M0;
+
+TEST mkdir $M0/dir1 2>/dev/null;
+TEST touch $M0/dir1/file-{1..500}
+
+## Add-brick and run rebalance to force file migration
+TEST $CLI_1 volume add-brick $V0 replica 2 $H1:$B1/drep3 $H2:$B2/drep3
+
+#Start a rebalance
+TEST $CLI_1 volume rebalance $V0 start force
+
+#volume rebalance status should work
+#TEST $CLI_1 volume rebalance $V0 status
+#$CLI_1 volume rebalance $V0 status
+
+EXPECT_WITHIN $REBALANCE_TIMEOUT "0" cluster_rebalance_completed
+#EXPECT "0" cluster_rebal_all_nodes_migrated_files
+$CLI_1 volume rebalance $V0 status
+
+
+TEST umount -f $M0
+TEST $CLI_1 volume stop $V0
+TEST $CLI_1 volume delete $V0
+
+##############################################################
+
+# Next, a disperse volume
+TEST $CLI_1 volume create $V0 disperse 3 $H1:$B1/ec1 $H2:$B1/ec2 $H3:$B1/ec3 force
+
+TEST $CLI_1 volume start $V0
+$CLI_1 volume info $V0
+
+#TEST $CLI_1 volume set $V0 client-log-level DEBUG
+
+## Mount FUSE
+TEST glusterfs -s $H1 --volfile-id $V0 $M0;
+
+TEST mkdir $M0/dir1 2>/dev/null;
+TEST touch $M0/dir1/file-{1..500}
+
+## Add-brick and run rebalance to force file migration
+TEST $CLI_1 volume add-brick $V0 $H1:$B2/ec4 $H2:$B2/ec5 $H3:$B2/ec6
+
+#Start a rebalance
+TEST $CLI_1 volume rebalance $V0 start force
+
+#volume rebalance status should work
+#TEST $CLI_1 volume rebalance $V0 status
+#$CLI_1 volume rebalance $V0 status
+
+EXPECT_WITHIN $REBALANCE_TIMEOUT "0" cluster_rebalance_completed
+
+# This will not work unless EC is changed to return all node-uuids.
+# Uncomment this once that patch is ready.
+#EXPECT "0" cluster_rebal_all_nodes_migrated_files
+$CLI_1 volume rebalance $V0 status
+
+
+TEST umount -f $M0
+TEST $CLI_1 volume stop $V0
+TEST $CLI_1 volume delete $V0
+
+##############################################################
+
+cleanup
diff --git a/tests/dht.rc b/tests/dht.rc
index bf5e08b645e..53b00645e66 100644
--- a/tests/dht.rc
+++ b/tests/dht.rc
@@ -66,13 +66,33 @@ function get_hashed_brick()
}
+function cluster_rebalance_completed()
+{
+ val=1
+
+ # Rebalance status will be either "failed" or "completed"
+
+ test=$($CLI_1 volume rebalance $V0 status | grep "in progress" 2>&1)
+ if [ $? -ne 0 ]
+ then
+ val=0
+ fi
+
+ echo $val
+ # Do not *return* the value here. If it's non-zero, that will cause
+ # EXPECT_WITHIN (e.g. in bug-884455.t) to return prematurely, leading to
+ # a spurious test failure. Nothing else checks the return value anyway
+ # (they all check the output) so there's no need for it to be non-zero
+ # just because grep didn't find what we want.
+}
+
function rebalance_completed()
{
val=1
- test=$(gluster volume rebalance $V0 status | grep localhost | grep "completed" 2>&1)
+ test=$($CLI volume rebalance $V0 status | grep localhost | grep "completed" 2>&1)
if [ $? -eq 0 ]
then
- val=0
+ val=0
fi
echo $val
diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c
index af6345ecc2a..8b4fd5cf37b 100644
--- a/xlators/cluster/dht/src/dht-common.c
+++ b/xlators/cluster/dht/src/dht-common.c
@@ -3001,6 +3001,8 @@ dht_vgetxattr_fill_and_set (dht_local_t *local, dict_t **dict, xlator_t *this,
out:
return ret;
}
+
+
int
dht_find_local_subvol_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno, dict_t *xattr,
@@ -3016,6 +3018,11 @@ dht_find_local_subvol_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
char *next_uuid_str = NULL;
char *saveptr = NULL;
uuid_t node_uuid = {0,};
+ char *uuid_list_copy = NULL;
+ int count = 0;
+ int i = 0;
+ int index = 0;
+ int found = 0;
VALIDATE_OR_GOTO (frame, out);
@@ -3025,6 +3032,10 @@ dht_find_local_subvol_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
prev = cookie;
conf = this->private;
+ VALIDATE_OR_GOTO (conf->defrag, out);
+
+ gf_msg_debug (this->name, 0, "subvol %s returned", prev->name);
+
LOCK (&frame->lock);
{
this_call_cnt = --local->call_cnt;
@@ -3048,6 +3059,15 @@ dht_find_local_subvol_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
goto unlock;
}
+        /* DHT does not know the details of its child xlators, so
+         * parse the list twice: once to get the count, and again
+         * after allocating memory.
+         */
+ count = 0;
+ index = conf->local_subvols_cnt;
+
+ uuid_list_copy = gf_strdup (uuid_list);
+
for (uuid_str = strtok_r (uuid_list, " ", &saveptr);
uuid_str;
uuid_str = next_uuid_str) {
@@ -3057,24 +3077,57 @@ dht_find_local_subvol_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
gf_msg (this->name, GF_LOG_ERROR, 0,
DHT_MSG_UUID_PARSE_ERROR,
"Failed to parse uuid"
- " failed for %s", prev->name);
+ " for %s", prev->name);
local->op_ret = -1;
local->op_errno = EINVAL;
goto unlock;
}
+ count++;
if (gf_uuid_compare (node_uuid, conf->defrag->node_uuid)) {
gf_msg_debug (this->name, 0, "subvol %s does not"
"belong to this node",
prev->name);
} else {
+
+ /* handle multiple bricks of the same replica
+ * on the same node */
+ if (found)
+ continue;
conf->local_subvols[(conf->local_subvols_cnt)++]
- = prev;
+ = prev;
+ found = 1;
gf_msg_debug (this->name, 0, "subvol %s belongs to"
" this node", prev->name);
- break;
}
}
+
+ if (!found) {
+ local->op_ret = 0;
+ goto unlock;
+ }
+
+ conf->local_nodeuuids[index].count = count;
+ conf->local_nodeuuids[index].uuids
+ = GF_CALLOC (count, sizeof (uuid_t), 1);
+
+ /* The node-uuids are guaranteed to be returned in the same
+         * order as the bricks.
+ * A null node-uuid is returned for a brick that is down.
+ */
+
+ saveptr = NULL;
+ i = 0;
+
+ for (uuid_str = strtok_r (uuid_list_copy, " ", &saveptr);
+ uuid_str;
+ uuid_str = next_uuid_str) {
+
+ next_uuid_str = strtok_r (NULL, " ", &saveptr);
+ gf_uuid_parse (uuid_str,
+ conf->local_nodeuuids[index].uuids[i]);
+ i++;
+ }
}
local->op_ret = 0;
@@ -3092,8 +3145,13 @@ dht_find_local_subvol_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
goto out;
unwind:
+
+ GF_FREE (conf->local_nodeuuids[index].uuids);
+ conf->local_nodeuuids[index].uuids = NULL;
+
DHT_STACK_UNWIND (getxattr, frame, -1, local->op_errno, NULL, xdata);
out:
+ GF_FREE (uuid_list_copy);
return 0;
}
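The callback above parses the space-separated node-uuid list twice: the first pass over uuid_list counts the bricks so that conf->local_nodeuuids[index].uuids can be sized, and the second pass over uuid_list_copy fills the array in brick order. A minimal standalone sketch of the same two-pass pattern, assuming plain libuuid and libc in place of gluster's gf_uuid_*/GF_CALLOC wrappers (parse_node_uuids and its error handling are illustrative, not code from the patch):

#include <stdlib.h>
#include <string.h>
#include <uuid/uuid.h>          /* libuuid: uuid_t, uuid_parse() */

static int
parse_node_uuids (const char *uuid_list, uuid_t **uuids_out, int *count_out)
{
        char   *copy1 = strdup (uuid_list);
        char   *copy2 = strdup (uuid_list);
        char   *tok = NULL, *saveptr = NULL;
        uuid_t *uuids = NULL;
        int     count = 0, i = 0;

        if (!copy1 || !copy2)
                goto err;

        /* Pass 1: count the tokens so the array can be sized. */
        for (tok = strtok_r (copy1, " ", &saveptr); tok;
             tok = strtok_r (NULL, " ", &saveptr))
                count++;

        uuids = calloc (count, sizeof (uuid_t));
        if (!uuids || count == 0)
                goto err;

        /* Pass 2: parse each token in place, preserving brick order. */
        saveptr = NULL;
        for (tok = strtok_r (copy2, " ", &saveptr); tok;
             tok = strtok_r (NULL, " ", &saveptr))
                if (uuid_parse (tok, uuids[i++]) != 0)
                        goto err;       /* the patch maps this to EINVAL */

        free (copy1);
        free (copy2);
        *uuids_out = uuids;
        *count_out = count;
        return 0;
err:
        free (copy1);
        free (copy2);
        free (uuids);
        return -1;
}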
diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h
index c8cec133960..f982bf6ac1a 100644
--- a/xlators/cluster/dht/src/dht-common.h
+++ b/xlators/cluster/dht/src/dht-common.h
@@ -419,6 +419,7 @@ struct dht_container {
xlator_t *this;
loc_t *parent_loc;
dict_t *migrate_data;
+ int local_subvol_index;
};
typedef enum tier_mode_ {
@@ -490,6 +491,12 @@ typedef struct gf_tier_conf {
char volname[GD_VOLUME_NAME_MAX + 1];
} gf_tier_conf_t;
+typedef struct subvol_nodeuuids {
+ uuid_t *uuids;
+ int count;
+} subvol_nodeuuid_t;
+
+
struct gf_defrag_info_ {
uint64_t total_files;
uint64_t total_data;
@@ -540,6 +547,7 @@ struct gf_defrag_info_ {
/* lock migration flag */
gf_boolean_t lock_migration_enabled;
+
};
typedef struct gf_defrag_info_ gf_defrag_info_t;
@@ -623,6 +631,7 @@ struct dht_conf {
/*local subvol storage for rebalance*/
xlator_t **local_subvols;
+ subvol_nodeuuid_t *local_nodeuuids;
int32_t local_subvols_cnt;
/*
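The new subvol_nodeuuid_t array runs parallel to local_subvols: entry i holds the node-uuids of every brick of the i-th local subvolume, in brick order, with a null uuid standing in for a brick that is down. Restated standalone with libuuid types (the struct name is copied from the patch; the comment spells out the invariant):

#include <uuid/uuid.h>

/* One entry per local subvolume; filled by dht_find_local_subvol_cbk().
 * Invariant: conf->local_subvols[i] and conf->local_nodeuuids[i]
 * describe the same subvolume, so uuids[j] is the node-uuid of that
 * subvolume's j-th brick (null if the brick is down). */
typedef struct subvol_nodeuuids {
        uuid_t *uuids;
        int     count;
} subvol_nodeuuid_t;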
diff --git a/xlators/cluster/dht/src/dht-helper.c b/xlators/cluster/dht/src/dht-helper.c
index 6f08f557730..38965298325 100644
--- a/xlators/cluster/dht/src/dht-helper.c
+++ b/xlators/cluster/dht/src/dht-helper.c
@@ -505,7 +505,6 @@ dht_local_init (call_frame_t *frame, loc_t *loc, fd_t *fd, glusterfs_fop_t fop)
goto out;
inode = loc->inode;
- local->hashed_subvol = dht_subvol_get_hashed (frame->this, loc);
}
if (fd) {
@@ -844,7 +843,12 @@ dht_init_local_subvolumes (xlator_t *this, dht_conf_t *conf)
conf->local_subvols = GF_CALLOC (cnt, sizeof (xlator_t *),
gf_dht_mt_xlator_t);
- if (!conf->local_subvols) {
+
+        /* FIXME: allocate this dynamically */
+ conf->local_nodeuuids = GF_CALLOC (cnt, sizeof (subvol_nodeuuid_t),
+ gf_dht_nodeuuids_t);
+
+ if (!conf->local_subvols || !conf->local_nodeuuids) {
return -1;
}
diff --git a/xlators/cluster/dht/src/dht-mem-types.h b/xlators/cluster/dht/src/dht-mem-types.h
index 3554f3f9c2d..19cccef537b 100644
--- a/xlators/cluster/dht/src/dht-mem-types.h
+++ b/xlators/cluster/dht/src/dht-mem-types.h
@@ -39,6 +39,7 @@ enum gf_dht_mem_types_ {
gf_dht_mt_fd_ctx_t,
gf_tier_mt_qfile_array_t,
gf_dht_ret_cache_t,
+ gf_dht_nodeuuids_t,
gf_dht_mt_end
};
#endif
diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c
index a5d00e37c0e..a1266502d63 100644
--- a/xlators/cluster/dht/src/dht-rebalance.c
+++ b/xlators/cluster/dht/src/dht-rebalance.c
@@ -2439,6 +2439,43 @@ gf_defrag_ctx_subvols_init (dht_dfoffset_ctx_t *offset_var, xlator_t *this) {
return 0;
}
+
+/* Return value
+ * 0 : this node does not migrate the file
+ * 1 : this node migrates the file
+ */
+int
+gf_defrag_should_i_migrate (xlator_t *this, int local_subvol_index, uuid_t gfid)
+{
+ int ret = 0;
+ int i = local_subvol_index;
+ char *str = NULL;
+ uint32_t hashval = 0;
+ int32_t index = 0;
+ dht_conf_t *conf = NULL;
+ char buf[UUID_CANONICAL_FORM_LEN + 1] = {0, };
+
+ conf = this->private;
+
+ /* Pure distribute */
+
+ if (conf->local_nodeuuids[i].count == 1) {
+ return 1;
+ }
+
+ str = uuid_utoa_r (gfid, buf);
+
+ ret = dht_hash_compute (this, 0, str, &hashval);
+ if (ret == 0) {
+ index = (hashval % conf->local_nodeuuids[i].count);
+ if (!gf_uuid_compare (conf->defrag->node_uuid,
+ conf->local_nodeuuids[i].uuids[index]))
+ ret = 1;
+ }
+ return ret;
+}
+
+
int
gf_defrag_migrate_single_file (void *opaque)
{
@@ -2517,6 +2554,13 @@ gf_defrag_migrate_single_file (void *opaque)
goto out;
}
+ if (!gf_defrag_should_i_migrate (this, rebal_entry->local_subvol_index,
+ entry->d_stat.ia_gfid)) {
+ gf_msg_debug (this->name, 0, "Don't migrate %s ",
+ entry_loc.path);
+ goto out;
+ }
+
gf_uuid_copy (entry_loc.gfid, entry->d_stat.ia_gfid);
gf_uuid_copy (entry_loc.pargfid, loc->gfid);
@@ -2531,6 +2575,7 @@ gf_defrag_migrate_single_file (void *opaque)
goto out;
}
+
hashed_subvol = dht_subvol_get_hashed (this, &entry_loc);
if (!hashed_subvol) {
gf_msg (this->name, GF_LOG_ERROR, 0,
@@ -2953,6 +2998,8 @@ gf_defrag_get_entry (xlator_t *this, int i, struct dht_container **container,
goto out;
}
+ tmp_container->local_subvol_index = i;
+
tmp_container->df_entry->d_stat = df_entry->d_stat;
tmp_container->df_entry->d_ino = df_entry->d_ino;
@@ -4032,6 +4079,33 @@ int gf_defrag_total_file_cnt (xlator_t *this, loc_t *root_loc)
}
+
+int
+dht_get_local_subvols_and_nodeuuids (xlator_t *this, dht_conf_t *conf,
+ loc_t *loc)
+{
+
+ dict_t *dict = NULL;
+ int ret = -1;
+
+ /* Find local subvolumes */
+ ret = syncop_getxattr (this, loc, &dict,
+ GF_REBAL_FIND_LOCAL_SUBVOL,
+ NULL, NULL);
+ if (ret) {
+ gf_msg (this->name, GF_LOG_ERROR, 0, 0, "local "
+ "subvolume determination failed with error: %d",
+ -ret);
+ ret = -1;
+ goto out;
+ }
+
+ ret = 0;
+out:
+ return ret;
+}
+
+
int
gf_defrag_start_crawl (void *data)
{
@@ -4056,6 +4130,7 @@ gf_defrag_start_crawl (void *data)
gf_boolean_t is_tier_detach = _gf_false;
call_frame_t *statfs_frame = NULL;
xlator_t *old_THIS = NULL;
+ int j = 0;
this = data;
if (!this)
@@ -4184,14 +4259,8 @@ gf_defrag_start_crawl (void *data)
goto out;
}
- /* Find local subvolumes */
- ret = syncop_getxattr (this, &loc, &dict,
- GF_REBAL_FIND_LOCAL_SUBVOL,
- NULL, NULL);
+ ret = dht_get_local_subvols_and_nodeuuids (this, conf, &loc);
if (ret) {
- gf_msg (this->name, GF_LOG_ERROR, 0, 0, "local "
- "subvolume determination failed with error: %d",
- -ret);
ret = -1;
goto out;
}
@@ -4199,6 +4268,11 @@ gf_defrag_start_crawl (void *data)
for (i = 0 ; i < conf->local_subvols_cnt; i++) {
gf_msg (this->name, GF_LOG_INFO, 0, 0, "local subvols "
"are %s", conf->local_subvols[i]->name);
+ for (j = 0; j < conf->local_nodeuuids[i].count; j++) {
+ gf_msg (this->name, GF_LOG_INFO, 0, 0,
+ "node uuids are %s",
+ uuid_utoa(conf->local_nodeuuids[i].uuids[j]));
+ }
}
ret = gf_defrag_total_file_cnt (this, &loc);
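gf_defrag_should_i_migrate() is the heart of the change: instead of one node per replica/disperse set doing all the migration, the gfid's canonical string is hashed and the hash modulo the brick count elects one brick's node-uuid; only the node holding that uuid migrates the file. Since every node computes the same hash over the same brick-ordered uuid list, exactly one node claims each file with no coordination. A standalone sketch of the election, assuming libuuid (toy_hash is a stand-in; the real code uses dht_hash_compute):

#include <stdint.h>
#include <uuid/uuid.h>  /* libuuid: uuid_t, uuid_unparse(), uuid_compare() */

/* Toy stand-in for dht_hash_compute(); any hash that every node
 * computes identically works for the illustration. */
static uint32_t
toy_hash (const char *str)
{
        uint32_t h = 5381;
        while (*str)
                h = h * 33 + (unsigned char)*str++;
        return h;
}

/* Return 1 if my_uuid is the node elected to migrate gfid, given the
 * node-uuids of the subvolume's bricks in brick order. Mirrors the
 * logic of gf_defrag_should_i_migrate() above. */
static int
should_i_migrate (uuid_t my_uuid, uuid_t gfid,
                  uuid_t *brick_uuids, int count)
{
        char buf[37] = {0,};    /* 36-char canonical uuid + NUL */
        int  index = 0;

        if (count == 1)         /* pure distribute: single candidate */
                return 1;

        uuid_unparse (gfid, buf);
        index = toy_hash (buf) % count;

        return uuid_compare (my_uuid, brick_uuids[index]) == 0;
}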
diff --git a/xlators/cluster/dht/src/tier.c b/xlators/cluster/dht/src/tier.c
index a8aebe00f69..e4b910eb0e6 100644
--- a/xlators/cluster/dht/src/tier.c
+++ b/xlators/cluster/dht/src/tier.c
@@ -198,10 +198,17 @@ out:
static int
tier_check_same_node (xlator_t *this, loc_t *loc, gf_defrag_info_t *defrag)
{
- int ret = -1;
- dict_t *dict = NULL;
- char *uuid_str = NULL;
- uuid_t node_uuid = {0,};
+ int ret = -1;
+ dict_t *dict = NULL;
+ char *uuid_str = NULL;
+ uuid_t node_uuid = {0,};
+ char *dup_str = NULL;
+ char *str = NULL;
+ char *save_ptr = NULL;
+ int count = 0;
+ uint32_t hashval = 0;
+ int32_t index = 0;
+ char buf[GF_UUID_BUF_SIZE] = {0,};
GF_VALIDATE_OR_GOTO ("tier", this, out);
GF_VALIDATE_OR_GOTO (this->name, loc, out);
@@ -215,15 +222,56 @@ tier_check_same_node (xlator_t *this, loc_t *loc, gf_defrag_info_t *defrag)
goto out;
}
+
+ /* This returns multiple node-uuids now - one for each brick
+ * of the subvol.
+ */
+
if (dict_get_str (dict, GF_XATTR_NODE_UUID_KEY, &uuid_str) < 0) {
gf_msg (this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
"Failed to get node-uuid for %s", loc->path);
goto out;
}
+ dup_str = gf_strdup (uuid_str);
+ str = dup_str;
+
+        /* How many uuids were returned? There is no need to check
+         * each of them against the current node; only the
+         * hash-selected uuid is compared, below.
+         */
+
+ count = 1;
+ while ((str = strchr (str, ' '))) {
+ count++;
+ str++;
+ }
+
+ /* Only one node-uuid - pure distribute? */
+ if (count == 1)
+ goto check_node;
+
+ uuid_utoa_r (loc->gfid, buf);
+ ret = dht_hash_compute (this, 0, buf, &hashval);
+ if (ret == 0) {
+ index = (hashval % count);
+ }
+
+ count = 0;
+ str = dup_str;
+ while ((uuid_str = strtok_r (str, " ", &save_ptr))) {
+ if (count == index)
+ break;
+ count++;
+ str = NULL;
+ }
+
+
+check_node:
+
if (gf_uuid_parse (uuid_str, node_uuid)) {
gf_msg (this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
"uuid_parse failed for %s", loc->path);
+ ret = -1;
goto out;
}
@@ -239,6 +287,7 @@ out:
if (dict)
dict_unref(dict);
+ GF_FREE (dup_str);
return ret;
}