From b23bd3dbc2c153171d0bb1205e6804afe022a55f Mon Sep 17 00:00:00 2001 From: N Balachandran Date: Wed, 10 May 2017 21:26:28 +0530 Subject: cluster/dht: Rebalance on all nodes should migrate files Problem: Rebalance compares the node-uuid of a file against its own and migrates a file only if they match. However, the current behaviour in both AFR and EC is to return the node-uuid of the first brick in a replica set for all files. This means a single node ends up migrating all the files if the first brick of every replica set is on the same node. Fix: AFR and EC will return all node-uuids for the replica set. The rebalance process will divide the files to be migrated among all the nodes by hashing the gfid of the file and using that value to select a node to perform the migration. This patch makes the required DHT and tiering changes. Some tests in rebal-all-nodes-migrate.t will need to be uncommented once the AFR and EC changes are merged. Change-Id: I5ce41600f5ba0e244ddfd986e2ba8fa23329ff0c BUG: 1366817 Signed-off-by: N Balachandran Reviewed-on: https://review.gluster.org/17239 Smoke: Gluster Build System NetBSD-regression: NetBSD Build System CentOS-regression: Gluster Build System Reviewed-by: Amar Tumballi Reviewed-by: Jeff Darcy Reviewed-by: Shyamsundar Ranganathan --- tests/basic/distribute/rebal-all-nodes-migrate.t | 143 +++++++++++++++++++++++ tests/dht.rc | 24 +++- xlators/cluster/dht/src/dht-common.c | 64 +++++++++- xlators/cluster/dht/src/dht-common.h | 9 ++ xlators/cluster/dht/src/dht-helper.c | 8 +- xlators/cluster/dht/src/dht-mem-types.h | 1 + xlators/cluster/dht/src/dht-rebalance.c | 88 ++++++++++++-- xlators/cluster/dht/src/tier.c | 57 ++++++++- 8 files changed, 376 insertions(+), 18 deletions(-) create mode 100644 tests/basic/distribute/rebal-all-nodes-migrate.t diff --git a/tests/basic/distribute/rebal-all-nodes-migrate.t b/tests/basic/distribute/rebal-all-nodes-migrate.t new file mode 100644 index 00000000000..14f0a53b1f8 --- /dev/null +++ 
b/tests/basic/distribute/rebal-all-nodes-migrate.t @@ -0,0 +1,143 @@ +#!/bin/bash + +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../cluster.rc +. $(dirname $0)/../../dht.rc + + +# Check if every single rebalance process migrated some files + +function cluster_rebal_all_nodes_migrated_files { + val=0 + a=$($CLI_1 volume rebalance $V0 status | grep "completed" | awk '{print $2}'); +# echo $a + b=($a) + for i in "${b[@]}" + do +# echo "$i"; + if [ "$i" -eq "0" ]; then + echo "false"; + val=1; + fi + done + echo $val +} + +cleanup + +TEST launch_cluster 3; +TEST $CLI_1 peer probe $H2; +TEST $CLI_1 peer probe $H3; +EXPECT_WITHIN $PROBE_TIMEOUT 2 peer_count + + +#Start with a pure distribute volume (multiple bricks on the same node) +TEST $CLI_1 volume create $V0 $H1:$B1/dist1 $H1:$B1/dist2 $H2:$B2/dist3 $H2:$B2/dist4 + +TEST $CLI_1 volume start $V0 +$CLI_1 volume info $V0 + +#TEST $CLI_1 volume set $V0 client-log-level DEBUG + +## Mount FUSE +TEST glusterfs -s $H1 --volfile-id $V0 $M0; + +TEST mkdir $M0/dir1 2>/dev/null; +TEST touch $M0/dir1/file-{1..500} + +## Add-brick and run rebalance to force file migration +TEST $CLI_1 volume add-brick $V0 $H1:$B1/dist5 $H2:$B2/dist6 + +#Start a rebalance +TEST $CLI_1 volume rebalance $V0 start force + +#volume rebalance status should work +#TEST $CLI_1 volume rebalance $V0 status +#$CLI_1 volume rebalance $V0 status + +EXPECT_WITHIN $REBALANCE_TIMEOUT "0" cluster_rebalance_completed +EXPECT "0" cluster_rebal_all_nodes_migrated_files +$CLI_1 volume rebalance $V0 status + + +TEST umount -f $M0 +TEST $CLI_1 volume stop $V0 +TEST $CLI_1 volume delete $V0 + + +############################################################## + +# Next, a dist-rep volume +TEST $CLI_1 volume create $V0 replica 2 $H1:$B1/drep1 $H2:$B2/drep1 $H1:$B1/drep2 $H2:$B2/drep2 + +TEST $CLI_1 volume start $V0 +$CLI_1 volume info $V0 + +#TEST $CLI_1 volume set $V0 client-log-level DEBUG + +## Mount FUSE +TEST glusterfs -s $H1 --volfile-id $V0 $M0; + +TEST 
mkdir $M0/dir1 2>/dev/null; +TEST touch $M0/dir1/file-{1..500} + +## Add-brick and run rebalance to force file migration +TEST $CLI_1 volume add-brick $V0 replica 2 $H1:$B1/drep3 $H2:$B2/drep3 + +#Start a rebalance +TEST $CLI_1 volume rebalance $V0 start force + +#volume rebalance status should work +#TEST $CLI_1 volume rebalance $V0 status +#$CLI_1 volume rebalance $V0 status + +EXPECT_WITHIN $REBALANCE_TIMEOUT "0" cluster_rebalance_completed +#EXPECT "0" cluster_rebal_all_nodes_migrated_files +$CLI_1 volume rebalance $V0 status + + +TEST umount -f $M0 +TEST $CLI_1 volume stop $V0 +TEST $CLI_1 volume delete $V0 + +############################################################## + +# Next, a disperse volume +TEST $CLI_1 volume create $V0 disperse 3 $H1:$B1/ec1 $H2:$B1/ec2 $H3:$B1/ec3 force + +TEST $CLI_1 volume start $V0 +$CLI_1 volume info $V0 + +#TEST $CLI_1 volume set $V0 client-log-level DEBUG + +## Mount FUSE +TEST glusterfs -s $H1 --volfile-id $V0 $M0; + +TEST mkdir $M0/dir1 2>/dev/null; +TEST touch $M0/dir1/file-{1..500} + +## Add-brick and run rebalance to force file migration +TEST $CLI_1 volume add-brick $V0 $H1:$B2/ec4 $H2:$B2/ec5 $H3:$B2/ec6 + +#Start a rebalance +TEST $CLI_1 volume rebalance $V0 start force + +#volume rebalance status should work +#TEST $CLI_1 volume rebalance $V0 status +#$CLI_1 volume rebalance $V0 status + +EXPECT_WITHIN $REBALANCE_TIMEOUT "0" cluster_rebalance_completed + +# this will not work unless EC is changed to return all node-uuids +# uncomment this once that patch is ready +#EXPECT "0" cluster_rebal_all_nodes_migrated_files +$CLI_1 volume rebalance $V0 status + + +TEST umount -f $M0 +TEST $CLI_1 volume stop $V0 +TEST $CLI_1 volume delete $V0 + +############################################################## + +cleanup diff --git a/tests/dht.rc b/tests/dht.rc index bf5e08b645e..53b00645e66 100644 --- a/tests/dht.rc +++ b/tests/dht.rc @@ -66,13 +66,33 @@ function get_hashed_brick() } +function cluster_rebalance_completed() +{ 
+ val=1 + + # Rebalance status will be either "failed" or "completed" + + test=$($CLI_1 volume rebalance $V0 status | grep "in progress" 2>&1) + if [ $? -ne 0 ] + then + val=0 + fi + + echo $val + # Do not *return* the value here. If it's non-zero, that will cause + # EXPECT_WITHIN (e.g. in bug-884455.t) to return prematurely, leading to + # a spurious test failure. Nothing else checks the return value anyway + # (they all check the output) so there's no need for it to be non-zero + # just because grep didn't find what we want. +} + function rebalance_completed() { val=1 - test=$(gluster volume rebalance $V0 status | grep localhost | grep "completed" 2>&1) + test=$($CLI volume rebalance $V0 status | grep localhost | grep "completed" 2>&1) if [ $? -eq 0 ] then - val=0 + val=0 fi echo $val diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c index af6345ecc2a..8b4fd5cf37b 100644 --- a/xlators/cluster/dht/src/dht-common.c +++ b/xlators/cluster/dht/src/dht-common.c @@ -3001,6 +3001,8 @@ dht_vgetxattr_fill_and_set (dht_local_t *local, dict_t **dict, xlator_t *this, out: return ret; } + + int dht_find_local_subvol_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, dict_t *xattr, @@ -3016,6 +3018,11 @@ dht_find_local_subvol_cbk (call_frame_t *frame, void *cookie, xlator_t *this, char *next_uuid_str = NULL; char *saveptr = NULL; uuid_t node_uuid = {0,}; + char *uuid_list_copy = NULL; + int count = 0; + int i = 0; + int index = 0; + int found = 0; VALIDATE_OR_GOTO (frame, out); @@ -3025,6 +3032,10 @@ dht_find_local_subvol_cbk (call_frame_t *frame, void *cookie, xlator_t *this, prev = cookie; conf = this->private; + VALIDATE_OR_GOTO (conf->defrag, out); + + gf_msg_debug (this->name, 0, "subvol %s returned", prev->name); + LOCK (&frame->lock); { this_call_cnt = --local->call_cnt; @@ -3048,6 +3059,15 @@ dht_find_local_subvol_cbk (call_frame_t *frame, void *cookie, xlator_t *this, goto unlock; } + /* As DHT will 
not know details of its child xlators + * we need to parse this twice to get the count first + * and allocate memory later. + */ + count = 0; + index = conf->local_subvols_cnt; + + uuid_list_copy = gf_strdup (uuid_list); + for (uuid_str = strtok_r (uuid_list, " ", &saveptr); uuid_str; uuid_str = next_uuid_str) { @@ -3057,24 +3077,57 @@ dht_find_local_subvol_cbk (call_frame_t *frame, void *cookie, xlator_t *this, gf_msg (this->name, GF_LOG_ERROR, 0, DHT_MSG_UUID_PARSE_ERROR, "Failed to parse uuid" - " failed for %s", prev->name); + " for %s", prev->name); local->op_ret = -1; local->op_errno = EINVAL; goto unlock; } + count++; if (gf_uuid_compare (node_uuid, conf->defrag->node_uuid)) { gf_msg_debug (this->name, 0, "subvol %s does not" "belong to this node", prev->name); } else { + + /* handle multiple bricks of the same replica + * on the same node */ + if (found) + continue; conf->local_subvols[(conf->local_subvols_cnt)++] - = prev; + = prev; + found = 1; gf_msg_debug (this->name, 0, "subvol %s belongs to" " this node", prev->name); - break; } } + + if (!found) { + local->op_ret = 0; + goto unlock; + } + + conf->local_nodeuuids[index].count = count; + conf->local_nodeuuids[index].uuids + = GF_CALLOC (count, sizeof (uuid_t), 1); + + /* The node-uuids are guaranteed to be returned in the same + * order as the bricks + * A null node-uuid is returned for a brick that is down. 
+ */ + + saveptr = NULL; + i = 0; + + for (uuid_str = strtok_r (uuid_list_copy, " ", &saveptr); + uuid_str; + uuid_str = next_uuid_str) { + + next_uuid_str = strtok_r (NULL, " ", &saveptr); + gf_uuid_parse (uuid_str, + conf->local_nodeuuids[index].uuids[i]); + i++; + } } local->op_ret = 0; @@ -3092,8 +3145,13 @@ dht_find_local_subvol_cbk (call_frame_t *frame, void *cookie, xlator_t *this, goto out; unwind: + + GF_FREE (conf->local_nodeuuids[index].uuids); + conf->local_nodeuuids[index].uuids = NULL; + DHT_STACK_UNWIND (getxattr, frame, -1, local->op_errno, NULL, xdata); out: + GF_FREE (uuid_list_copy); return 0; } diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h index c8cec133960..f982bf6ac1a 100644 --- a/xlators/cluster/dht/src/dht-common.h +++ b/xlators/cluster/dht/src/dht-common.h @@ -419,6 +419,7 @@ struct dht_container { xlator_t *this; loc_t *parent_loc; dict_t *migrate_data; + int local_subvol_index; }; typedef enum tier_mode_ { @@ -490,6 +491,12 @@ typedef struct gf_tier_conf { char volname[GD_VOLUME_NAME_MAX + 1]; } gf_tier_conf_t; +typedef struct subvol_nodeuuids { + uuid_t *uuids; + int count; +} subvol_nodeuuid_t; + + struct gf_defrag_info_ { uint64_t total_files; uint64_t total_data; @@ -540,6 +547,7 @@ struct gf_defrag_info_ { /* lock migration flag */ gf_boolean_t lock_migration_enabled; + }; typedef struct gf_defrag_info_ gf_defrag_info_t; @@ -623,6 +631,7 @@ struct dht_conf { /*local subvol storage for rebalance*/ xlator_t **local_subvols; + subvol_nodeuuid_t *local_nodeuuids; int32_t local_subvols_cnt; /* diff --git a/xlators/cluster/dht/src/dht-helper.c b/xlators/cluster/dht/src/dht-helper.c index 6f08f557730..38965298325 100644 --- a/xlators/cluster/dht/src/dht-helper.c +++ b/xlators/cluster/dht/src/dht-helper.c @@ -505,7 +505,6 @@ dht_local_init (call_frame_t *frame, loc_t *loc, fd_t *fd, glusterfs_fop_t fop) goto out; inode = loc->inode; - local->hashed_subvol = dht_subvol_get_hashed (frame->this, loc); 
} if (fd) { @@ -844,7 +843,12 @@ dht_init_local_subvolumes (xlator_t *this, dht_conf_t *conf) conf->local_subvols = GF_CALLOC (cnt, sizeof (xlator_t *), gf_dht_mt_xlator_t); - if (!conf->local_subvols) { + + /* FIX FIX : do this dynamically*/ + conf->local_nodeuuids = GF_CALLOC (cnt, sizeof (subvol_nodeuuid_t), + gf_dht_nodeuuids_t); + + if (!conf->local_subvols || !conf->local_nodeuuids) { return -1; } diff --git a/xlators/cluster/dht/src/dht-mem-types.h b/xlators/cluster/dht/src/dht-mem-types.h index 3554f3f9c2d..19cccef537b 100644 --- a/xlators/cluster/dht/src/dht-mem-types.h +++ b/xlators/cluster/dht/src/dht-mem-types.h @@ -39,6 +39,7 @@ enum gf_dht_mem_types_ { gf_dht_mt_fd_ctx_t, gf_tier_mt_qfile_array_t, gf_dht_ret_cache_t, + gf_dht_nodeuuids_t, gf_dht_mt_end }; #endif diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c index a5d00e37c0e..a1266502d63 100644 --- a/xlators/cluster/dht/src/dht-rebalance.c +++ b/xlators/cluster/dht/src/dht-rebalance.c @@ -2439,6 +2439,43 @@ gf_defrag_ctx_subvols_init (dht_dfoffset_ctx_t *offset_var, xlator_t *this) { return 0; } + +/* Return value + * 0 : this node does not migrate the file + * 1 : this node migrates the file + */ +int +gf_defrag_should_i_migrate (xlator_t *this, int local_subvol_index, uuid_t gfid) +{ + int ret = 0; + int i = local_subvol_index; + char *str = NULL; + uint32_t hashval = 0; + int32_t index = 0; + dht_conf_t *conf = NULL; + char buf[UUID_CANONICAL_FORM_LEN + 1] = {0, }; + + conf = this->private; + + /* Pure distribute */ + + if (conf->local_nodeuuids[i].count == 1) { + return 1; + } + + str = uuid_utoa_r (gfid, buf); + + ret = dht_hash_compute (this, 0, str, &hashval); + if (ret == 0) { + index = (hashval % conf->local_nodeuuids[i].count); + if (!gf_uuid_compare (conf->defrag->node_uuid, + conf->local_nodeuuids[i].uuids[index])) + ret = 1; + } + return ret; +} + + int gf_defrag_migrate_single_file (void *opaque) { @@ -2517,6 +2554,13 @@ 
gf_defrag_migrate_single_file (void *opaque) goto out; } + if (!gf_defrag_should_i_migrate (this, rebal_entry->local_subvol_index, + entry->d_stat.ia_gfid)) { + gf_msg_debug (this->name, 0, "Don't migrate %s ", + entry_loc.path); + goto out; + } + gf_uuid_copy (entry_loc.gfid, entry->d_stat.ia_gfid); gf_uuid_copy (entry_loc.pargfid, loc->gfid); @@ -2531,6 +2575,7 @@ gf_defrag_migrate_single_file (void *opaque) goto out; } + hashed_subvol = dht_subvol_get_hashed (this, &entry_loc); if (!hashed_subvol) { gf_msg (this->name, GF_LOG_ERROR, 0, @@ -2953,6 +2998,8 @@ gf_defrag_get_entry (xlator_t *this, int i, struct dht_container **container, goto out; } + tmp_container->local_subvol_index = i; + tmp_container->df_entry->d_stat = df_entry->d_stat; tmp_container->df_entry->d_ino = df_entry->d_ino; @@ -4032,6 +4079,33 @@ int gf_defrag_total_file_cnt (xlator_t *this, loc_t *root_loc) } + +int +dht_get_local_subvols_and_nodeuuids (xlator_t *this, dht_conf_t *conf, + loc_t *loc) +{ + + dict_t *dict = NULL; + int ret = -1; + + /* Find local subvolumes */ + ret = syncop_getxattr (this, loc, &dict, + GF_REBAL_FIND_LOCAL_SUBVOL, + NULL, NULL); + if (ret) { + gf_msg (this->name, GF_LOG_ERROR, 0, 0, "local " + "subvolume determination failed with error: %d", + -ret); + ret = -1; + goto out; + } + + ret = 0; +out: + return ret; +} + + int gf_defrag_start_crawl (void *data) { @@ -4056,6 +4130,7 @@ gf_defrag_start_crawl (void *data) gf_boolean_t is_tier_detach = _gf_false; call_frame_t *statfs_frame = NULL; xlator_t *old_THIS = NULL; + int j = 0; this = data; if (!this) @@ -4184,14 +4259,8 @@ gf_defrag_start_crawl (void *data) goto out; } - /* Find local subvolumes */ - ret = syncop_getxattr (this, &loc, &dict, - GF_REBAL_FIND_LOCAL_SUBVOL, - NULL, NULL); + ret = dht_get_local_subvols_and_nodeuuids (this, conf, &loc); if (ret) { - gf_msg (this->name, GF_LOG_ERROR, 0, 0, "local " - "subvolume determination failed with error: %d", - -ret); ret = -1; goto out; } @@ -4199,6 +4268,11 @@ 
gf_defrag_start_crawl (void *data) for (i = 0 ; i < conf->local_subvols_cnt; i++) { gf_msg (this->name, GF_LOG_INFO, 0, 0, "local subvols " "are %s", conf->local_subvols[i]->name); + for (j = 0; j < conf->local_nodeuuids[i].count; j++) { + gf_msg (this->name, GF_LOG_INFO, 0, 0, + "node uuids are %s", + uuid_utoa(conf->local_nodeuuids[i].uuids[j])); + } } ret = gf_defrag_total_file_cnt (this, &loc); diff --git a/xlators/cluster/dht/src/tier.c b/xlators/cluster/dht/src/tier.c index a8aebe00f69..e4b910eb0e6 100644 --- a/xlators/cluster/dht/src/tier.c +++ b/xlators/cluster/dht/src/tier.c @@ -198,10 +198,17 @@ out: static int tier_check_same_node (xlator_t *this, loc_t *loc, gf_defrag_info_t *defrag) { - int ret = -1; - dict_t *dict = NULL; - char *uuid_str = NULL; - uuid_t node_uuid = {0,}; + int ret = -1; + dict_t *dict = NULL; + char *uuid_str = NULL; + uuid_t node_uuid = {0,}; + char *dup_str = NULL; + char *str = NULL; + char *save_ptr = NULL; + int count = 0; + uint32_t hashval = 0; + int32_t index = 0; + char buf[GF_UUID_BUF_SIZE] = {0,}; GF_VALIDATE_OR_GOTO ("tier", this, out); GF_VALIDATE_OR_GOTO (this->name, loc, out); @@ -215,15 +222,56 @@ tier_check_same_node (xlator_t *this, loc_t *loc, gf_defrag_info_t *defrag) goto out; } + + /* This returns multiple node-uuids now - one for each brick + * of the subvol. + */ + if (dict_get_str (dict, GF_XATTR_NODE_UUID_KEY, &uuid_str) < 0) { gf_msg (this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR, "Failed to get node-uuid for %s", loc->path); goto out; } + dup_str = gf_strdup (uuid_str); + str = dup_str; + + /* How many uuids returned? + * No need to check if one of these is that of the current node. + */ + + count = 1; + while ((str = strchr (str, ' '))) { + count++; + str++; + } + + /* Only one node-uuid - pure distribute? 
*/ + if (count == 1) + goto check_node; + + uuid_utoa_r (loc->gfid, buf); + ret = dht_hash_compute (this, 0, buf, &hashval); + if (ret == 0) { + index = (hashval % count); + } + + count = 0; + str = dup_str; + while ((uuid_str = strtok_r (str, " ", &save_ptr))) { + if (count == index) + break; + count++; + str = NULL; + } + + +check_node: + if (gf_uuid_parse (uuid_str, node_uuid)) { gf_msg (this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR, "uuid_parse failed for %s", loc->path); + ret = -1; goto out; } @@ -239,6 +287,7 @@ out: if (dict) dict_unref(dict); + GF_FREE (dup_str); return ret; } -- cgit