-rw-r--r--   tests/basic/distribute/rebal-all-nodes-migrate.t | 143
-rw-r--r--   tests/dht.rc                                     |  24
-rw-r--r--   xlators/cluster/dht/src/dht-common.c             |  64
-rw-r--r--   xlators/cluster/dht/src/dht-common.h             |   9
-rw-r--r--   xlators/cluster/dht/src/dht-helper.c             |   7
-rw-r--r--   xlators/cluster/dht/src/dht-mem-types.h          |   1
-rw-r--r--   xlators/cluster/dht/src/dht-rebalance.c          |  97
-rw-r--r--   xlators/cluster/dht/src/tier.c                   |  57
8 files changed, 380 insertions, 22 deletions
diff --git a/tests/basic/distribute/rebal-all-nodes-migrate.t b/tests/basic/distribute/rebal-all-nodes-migrate.t
new file mode 100644
index 00000000000..14f0a53b1f8
--- /dev/null
+++ b/tests/basic/distribute/rebal-all-nodes-migrate.t
@@ -0,0 +1,143 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../cluster.rc
+. $(dirname $0)/../../dht.rc
+
+
+# Check if every single rebalance process migrated some files
+
+function cluster_rebal_all_nodes_migrated_files {
+        val=0
+        a=$($CLI_1 volume rebalance $V0 status | grep "completed" | awk '{print $2}');
+#        echo $a
+        b=($a)
+        for i in "${b[@]}"
+        do
+#                echo "$i";
+                if [ "$i" -eq "0" ]; then
+                        echo "false";
+                        val=1;
+                fi
+        done
+        echo $val
+}
+
+cleanup
+
+TEST launch_cluster 3;
+TEST $CLI_1 peer probe $H2;
+TEST $CLI_1 peer probe $H3;
+EXPECT_WITHIN $PROBE_TIMEOUT 2 peer_count
+
+
+#Start with a pure distribute volume (multiple bricks on the same node)
+TEST $CLI_1 volume create $V0 $H1:$B1/dist1 $H1:$B1/dist2 $H2:$B2/dist3 $H2:$B2/dist4
+
+TEST $CLI_1 volume start $V0
+$CLI_1 volume info $V0
+
+#TEST $CLI_1 volume set $V0 client-log-level DEBUG
+
+## Mount FUSE
+TEST glusterfs -s $H1 --volfile-id $V0 $M0;
+
+TEST mkdir $M0/dir1 2>/dev/null;
+TEST touch $M0/dir1/file-{1..500}
+
+## Add-brick and run rebalance to force file migration
+TEST $CLI_1 volume add-brick $V0 $H1:$B1/dist5 $H2:$B2/dist6
+
+#Start a rebalance
+TEST $CLI_1 volume rebalance $V0 start force
+
+#volume rebalance status should work
+#TEST $CLI_1 volume rebalance $V0 status
+#$CLI_1 volume rebalance $V0 status
+
+EXPECT_WITHIN $REBALANCE_TIMEOUT "0" cluster_rebalance_completed
+EXPECT "0" cluster_rebal_all_nodes_migrated_files
+$CLI_1 volume rebalance $V0 status
+
+
+TEST umount -f $M0
+TEST $CLI_1 volume stop $V0
+TEST $CLI_1 volume delete $V0
+
+
+##############################################################
+
+# Next, a dist-rep volume
+TEST $CLI_1 volume create $V0 replica 2 $H1:$B1/drep1 $H2:$B2/drep1 $H1:$B1/drep2 $H2:$B2/drep2
+
+TEST $CLI_1 volume start $V0
+$CLI_1 volume info $V0
+
+#TEST $CLI_1 volume set $V0 client-log-level DEBUG
+
+## Mount FUSE
+TEST glusterfs -s $H1 --volfile-id $V0 $M0;
+
+TEST mkdir $M0/dir1 2>/dev/null;
+TEST touch $M0/dir1/file-{1..500}
+
+## Add-brick and run rebalance to force file migration
+TEST $CLI_1 volume add-brick $V0 replica 2 $H1:$B1/drep3 $H2:$B2/drep3
+
+#Start a rebalance
+TEST $CLI_1 volume rebalance $V0 start force
+
+#volume rebalance status should work
+#TEST $CLI_1 volume rebalance $V0 status
+#$CLI_1 volume rebalance $V0 status
+
+EXPECT_WITHIN $REBALANCE_TIMEOUT "0" cluster_rebalance_completed
+#EXPECT "0" cluster_rebal_all_nodes_migrated_files
+$CLI_1 volume rebalance $V0 status
+
+
+TEST umount -f $M0
+TEST $CLI_1 volume stop $V0
+TEST $CLI_1 volume delete $V0
+
+##############################################################
+
+# Next, a disperse volume
+TEST $CLI_1 volume create $V0 disperse 3 $H1:$B1/ec1 $H2:$B1/ec2 $H3:$B1/ec3 force
+
+TEST $CLI_1 volume start $V0
+$CLI_1 volume info $V0
+
+#TEST $CLI_1 volume set $V0 client-log-level DEBUG
+
+## Mount FUSE
+TEST glusterfs -s $H1 --volfile-id $V0 $M0;
+
+TEST mkdir $M0/dir1 2>/dev/null;
+TEST touch $M0/dir1/file-{1..500}
+
+## Add-brick and run rebalance to force file migration
+TEST $CLI_1 volume add-brick $V0 $H1:$B2/ec4 $H2:$B2/ec5 $H3:$B2/ec6
+
+#Start a rebalance
+TEST $CLI_1 volume rebalance $V0 start force
+
+#volume rebalance status should work
+#TEST $CLI_1 volume rebalance $V0 status
+#$CLI_1 volume rebalance $V0 status
+
+EXPECT_WITHIN $REBALANCE_TIMEOUT "0" cluster_rebalance_completed
+
+# this will not work unless EC is changed to return all node-uuids
+# comment this out once that patch is ready
+#EXPECT "0" cluster_rebal_all_nodes_migrated_files
+$CLI_1 volume rebalance $V0 status
+
+
+TEST umount -f $M0
+TEST $CLI_1 volume stop $V0
+TEST $CLI_1 volume delete $V0
+
+##############################################################
+
+cleanup
diff --git a/tests/dht.rc b/tests/dht.rc
index bf5e08b645e..53b00645e66 100644
--- a/tests/dht.rc
+++ b/tests/dht.rc
@@ -66,13 +66,33 @@ function get_hashed_brick()
 }
 
+function cluster_rebalance_completed()
+{
+       val=1
+
+       # Rebalance status will be either "failed" or "completed"
+
+       test=$($CLI_1 volume rebalance $V0 status | grep "in progress" 2>&1)
+       if [ $? -ne 0 ]
+       then
+               val=0
+       fi
+
+       echo $val
+       # Do not *return* the value here.  If it's non-zero, that will cause
+       # EXPECT_WITHIN (e.g. in bug-884455.t) to return prematurely, leading to
+       # a spurious test failure.  Nothing else checks the return value anyway
+       # (they all check the output) so there's no need for it to be non-zero
+       # just because grep didn't find what we want.
+}
+
 function rebalance_completed()
 {
        val=1
-       test=$(gluster volume rebalance $V0 status | grep localhost | grep "completed" 2>&1)
+       test=$($CLI volume rebalance $V0 status | grep localhost | grep "completed" 2>&1)
        if [ $? -eq 0 ]
        then
-                val=0
+               val=0
       fi
 
        echo $val
diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c
index fe0dc0d3455..22061ef5682 100644
--- a/xlators/cluster/dht/src/dht-common.c
+++ b/xlators/cluster/dht/src/dht-common.c
@@ -2998,6 +2998,8 @@ dht_vgetxattr_fill_and_set (dht_local_t *local, dict_t **dict, xlator_t *this,
 out:
         return ret;
 }
+
+
 int
 dht_find_local_subvol_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
                            int op_ret, int op_errno, dict_t *xattr,
@@ -3013,6 +3015,11 @@ dht_find_local_subvol_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
         char         *next_uuid_str = NULL;
         char         *saveptr       = NULL;
         uuid_t        node_uuid     = {0,};
+        char         *uuid_list_copy = NULL;
+        int           count          = 0;
+        int           i              = 0;
+        int           index          = 0;
+        int           found          = 0;
 
         VALIDATE_OR_GOTO (frame, out);
@@ -3022,6 +3029,10 @@ dht_find_local_subvol_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
         prev = cookie;
         conf = this->private;
+        VALIDATE_OR_GOTO (conf->defrag, out);
+
+        gf_msg_debug (this->name, 0, "subvol %s returned", prev->name);
+
         LOCK (&frame->lock);
         {
                 this_call_cnt = --local->call_cnt;
@@ -3045,6 +3056,15 @@ dht_find_local_subvol_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
                         goto unlock;
                 }
 
+                /* As DHT will not know details of its child xlators
+                 * we need to parse this twice to get the count first
+                 * and allocate memory later.
+                 */
+                count = 0;
+                index = conf->local_subvols_cnt;
+
+                uuid_list_copy = gf_strdup (uuid_list);
+
                 for (uuid_str = strtok_r (uuid_list, " ", &saveptr);
                      uuid_str;
                      uuid_str = next_uuid_str) {
@@ -3054,24 +3074,57 @@ dht_find_local_subvol_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
                                 gf_msg (this->name, GF_LOG_ERROR, 0,
                                         DHT_MSG_UUID_PARSE_ERROR,
                                         "Failed to parse uuid"
-                                        " failed for %s", prev->name);
+                                        " for %s", prev->name);
                                 local->op_ret = -1;
                                 local->op_errno = EINVAL;
                                 goto unlock;
                         }
+                        count++;
                         if (gf_uuid_compare (node_uuid, conf->defrag->node_uuid)) {
                                 gf_msg_debug (this->name, 0, "subvol %s does not"
                                               "belong to this node",
                                               prev->name);
                         } else {
+
+                                /* handle multiple bricks of the same replica
+                                 * on the same node */
+                                if (found)
+                                        continue;
                                 conf->local_subvols[(conf->local_subvols_cnt)++]
-                                        = prev;
+                                                = prev;
+                                found = 1;
                                 gf_msg_debug (this->name, 0, "subvol %s belongs to"
                                               " this node", prev->name);
-                                break;
                         }
                 }
+
+                if (!found) {
+                        local->op_ret = 0;
+                        goto unlock;
+                }
+
+                conf->local_nodeuuids[index].count = count;
+                conf->local_nodeuuids[index].uuids
+                                 = GF_CALLOC (count, sizeof (uuid_t), 1);
+
+                /* The node-uuids are guaranteed to be returned in the same
+                 * order as the bricks.
+                 * A null node-uuid is returned for a brick that is down.
+                 */
+
+                saveptr = NULL;
+                i = 0;
+
+                for (uuid_str = strtok_r (uuid_list_copy, " ", &saveptr);
+                     uuid_str;
+                     uuid_str = next_uuid_str) {
+
+                        next_uuid_str = strtok_r (NULL, " ", &saveptr);
+                        gf_uuid_parse (uuid_str,
+                                       conf->local_nodeuuids[index].uuids[i]);
+                        i++;
+                }
         }
 
         local->op_ret = 0;
@@ -3089,8 +3142,13 @@ dht_find_local_subvol_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
         goto out;
 
 unwind:
+
+        GF_FREE (conf->local_nodeuuids[index].uuids);
+        conf->local_nodeuuids[index].uuids = NULL;
+
         DHT_STACK_UNWIND (getxattr, frame, -1, local->op_errno, NULL, xdata);
 out:
+        GF_FREE (uuid_list_copy);
         return 0;
 }
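For illustration (not part of the patch): the two passes in dht_find_local_subvol_cbk() exist because strtok_r() consumes its input and the uuid count is unknown until the list has been walked once, so the callback must count, allocate, and then re-parse a pristine copy. A minimal standalone sketch of that pattern, with plain strings standing in for uuid_t and gf_uuid_parse():

/* Two-pass parse of a space-separated uuid list: pass 1 counts tokens,
 * pass 2 fills the array.  strtok_r() writes NULs into its input, which
 * is why each pass needs its own copy. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int
main (void)
{
        const char *uuid_list = "1111-aaaa 2222-bbbb 3333-cccc";
        char *pass1 = strdup (uuid_list);
        char *pass2 = strdup (uuid_list);
        char *saveptr = NULL;
        int count = 0;

        /* pass 1: count */
        for (char *tok = strtok_r (pass1, " ", &saveptr); tok;
             tok = strtok_r (NULL, " ", &saveptr))
                count++;

        /* pass 2: allocate, then fill */
        char **uuids = calloc (count, sizeof (char *));
        int i = 0;
        saveptr = NULL;
        for (char *tok = strtok_r (pass2, " ", &saveptr); tok;
             tok = strtok_r (NULL, " ", &saveptr))
                uuids[i++] = tok;

        for (i = 0; i < count; i++)
                printf ("uuid[%d] = %s\n", i, uuids[i]);

        free (uuids); free (pass1); free (pass2);
        return 0;
}

The copy matters: the patch strdup()s uuid_list before the first loop for exactly this reason, and frees the duplicate on the out: path.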
diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h
index ffe53d99ce1..c74beb26d96 100644
--- a/xlators/cluster/dht/src/dht-common.h
+++ b/xlators/cluster/dht/src/dht-common.h
@@ -373,6 +373,7 @@ struct dht_container {
         xlator_t        *this;
         loc_t           *parent_loc;
         dict_t          *migrate_data;
+        int             local_subvol_index;
 };
 
 typedef enum tier_mode_ {
@@ -444,6 +445,12 @@ typedef struct gf_tier_conf {
         char                         volname[GD_VOLUME_NAME_MAX + 1];
 } gf_tier_conf_t;
 
+typedef struct subvol_nodeuuids {
+        uuid_t *uuids;
+        int count;
+} subvol_nodeuuid_t;
+
+
 struct gf_defrag_info_ {
         uint64_t                     total_files;
         uint64_t                     total_data;
@@ -494,6 +501,7 @@ struct gf_defrag_info_ {
 
         /* lock migration flag */
         gf_boolean_t                 lock_migration_enabled;
+
 };
 
 typedef struct gf_defrag_info_ gf_defrag_info_t;
@@ -577,6 +585,7 @@ struct dht_conf {
 
         /*local subvol storage for rebalance*/
         xlator_t       **local_subvols;
+        subvol_nodeuuid_t       *local_nodeuuids;
         int32_t          local_subvols_cnt;
 
         /*
diff --git a/xlators/cluster/dht/src/dht-helper.c b/xlators/cluster/dht/src/dht-helper.c
index 81d1dffa0af..8e7989822cd 100644
--- a/xlators/cluster/dht/src/dht-helper.c
+++ b/xlators/cluster/dht/src/dht-helper.c
@@ -1000,7 +1000,12 @@ dht_init_local_subvolumes (xlator_t *this, dht_conf_t *conf)
 
         conf->local_subvols = GF_CALLOC (cnt, sizeof (xlator_t *),
                                         gf_dht_mt_xlator_t);
-        if (!conf->local_subvols) {
+
+        /* FIXME: allocate this dynamically */
+        conf->local_nodeuuids = GF_CALLOC (cnt, sizeof (subvol_nodeuuid_t),
+                                           gf_dht_nodeuuids_t);
+
+        if (!conf->local_subvols || !conf->local_nodeuuids) {
                 return -1;
         }
diff --git a/xlators/cluster/dht/src/dht-mem-types.h b/xlators/cluster/dht/src/dht-mem-types.h
index 3554f3f9c2d..19cccef537b 100644
--- a/xlators/cluster/dht/src/dht-mem-types.h
+++ b/xlators/cluster/dht/src/dht-mem-types.h
@@ -39,6 +39,7 @@ enum gf_dht_mem_types_ {
         gf_dht_mt_fd_ctx_t,
         gf_tier_mt_qfile_array_t,
         gf_dht_ret_cache_t,
+        gf_dht_nodeuuids_t,
         gf_dht_mt_end
 };
 #endif
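For illustration (not part of the patch): the three hunks above add the bookkeeping that the callback fills in. conf->local_subvols[i] and conf->local_nodeuuids[i] are parallel arrays, one uuid list per local subvolume. A minimal sketch of that layout and its cleanup, using a stand-in typedef rather than libuuid's uuid_t:

/* Sketch of the parallel-array bookkeeping added to dht_conf:
 * local_nodeuuids[i] carries one uuid per brick of the subvolume at
 * local_subvols[i], in brick order. */
#include <stdio.h>
#include <stdlib.h>

typedef unsigned char node_uuid_t[16];     /* stand-in for uuid_t */

typedef struct subvol_nodeuuids {
        node_uuid_t *uuids;                /* one per brick, brick order */
        int          count;
} subvol_nodeuuid_t;

int
main (void)
{
        int subvol_cnt = 2;                /* as if two local subvolumes */

        /* dht_init_local_subvolumes(): one table entry per subvolume */
        subvol_nodeuuid_t *tbl = calloc (subvol_cnt, sizeof (*tbl));

        /* dht_find_local_subvol_cbk(): size each entry by brick count */
        tbl[0].count = 3;
        tbl[0].uuids = calloc (tbl[0].count, sizeof (node_uuid_t));
        tbl[1].count = 2;
        tbl[1].uuids = calloc (tbl[1].count, sizeof (node_uuid_t));

        printf ("subvol 0 has %d node uuids\n", tbl[0].count);

        for (int i = 0; i < subvol_cnt; i++)
                free (tbl[i].uuids);
        free (tbl);
        return 0;
}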
diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c
index 0c83c15a4dc..90e45b1c293 100644
--- a/xlators/cluster/dht/src/dht-rebalance.c
+++ b/xlators/cluster/dht/src/dht-rebalance.c
@@ -2094,6 +2094,43 @@ gf_defrag_ctx_subvols_init (dht_dfoffset_ctx_t *offset_var, xlator_t *this) {
 
         return 0;
 }
+
+/* Return value
+ * 0 : this node does not migrate the file
+ * 1 : this node migrates the file
+ */
+int
+gf_defrag_should_i_migrate (xlator_t *this, int local_subvol_index, uuid_t gfid)
+{
+        int         ret               = 0;
+        int         i                 = local_subvol_index;
+        char       *str               = NULL;
+        uint32_t    hashval           = 0;
+        int32_t     index             = 0;
+        dht_conf_t *conf              = NULL;
+        char        buf[UUID_CANONICAL_FORM_LEN + 1] = {0, };
+
+        conf = this->private;
+
+        /* Pure distribute */
+
+        if (conf->local_nodeuuids[i].count == 1) {
+                return 1;
+        }
+
+        str = uuid_utoa_r (gfid, buf);
+
+        ret = dht_hash_compute (this, 0, str, &hashval);
+        if (ret == 0) {
+                index = (hashval % conf->local_nodeuuids[i].count);
+                if (!gf_uuid_compare (conf->defrag->node_uuid,
+                                      conf->local_nodeuuids[i].uuids[index]))
+                        ret = 1;
+        }
+        return ret;
+}
+
+
 int
 gf_defrag_migrate_single_file (void *opaque)
 {
@@ -2165,6 +2202,13 @@ gf_defrag_migrate_single_file (void *opaque)
                 goto out;
         }
 
+        if (!gf_defrag_should_i_migrate (this, rebal_entry->local_subvol_index,
+                                         entry->d_stat.ia_gfid)) {
+                gf_msg_debug (this->name, 0, "Don't migrate %s ",
+                              entry_loc.path);
+                goto out;
+        }
+
         gf_uuid_copy (entry_loc.gfid, entry->d_stat.ia_gfid);
 
         gf_uuid_copy (entry_loc.pargfid, loc->gfid);
@@ -2179,6 +2223,7 @@ gf_defrag_migrate_single_file (void *opaque)
                 goto out;
         }
 
+
         inode = inode_link (entry_loc.inode, entry_loc.parent, entry->d_name, &iatt);
         inode_unref (entry_loc.inode);
         /* use the inode returned by inode_link */
@@ -2659,6 +2704,8 @@ gf_defrag_get_entry (xlator_t *this, int i, struct dht_container **container,
                         goto out;
                 }
 
+                tmp_container->local_subvol_index = i;
+
                 tmp_container->df_entry->d_stat = df_entry->d_stat;
 
                 tmp_container->df_entry->d_ino  = df_entry->d_ino;
@@ -3740,6 +3787,33 @@ int gf_defrag_total_file_cnt (xlator_t *this, loc_t *root_loc)
 }
 
+
+int
+dht_get_local_subvols_and_nodeuuids (xlator_t *this, dht_conf_t *conf,
+                                     loc_t *loc)
+{
+
+        dict_t                  *dict                   = NULL;
+        int                      ret                    = -1;
+
+        /* Find local subvolumes */
+        ret = syncop_getxattr (this, loc, &dict,
+                               GF_REBAL_FIND_LOCAL_SUBVOL,
+                               NULL, NULL);
+        if (ret) {
+                gf_msg (this->name, GF_LOG_ERROR, 0, 0, "local "
+                        "subvolume determination failed with error: %d",
+                        -ret);
+                ret = -1;
+                goto out;
+        }
+
+        ret = 0;
+out:
+        return ret;
+}
+
+
 int
 gf_defrag_start_crawl (void *data)
 {
@@ -3757,12 +3831,12 @@ gf_defrag_start_crawl (void *data)
         glusterfs_ctx_t         *ctx                    = NULL;
         dht_methods_t           *methods                = NULL;
         int                      i                      = 0;
-        int                     thread_index            = 0;
-        int                     err                     = 0;
-        int                     thread_spawn_count      = 0;
+        int                      thread_index           = 0;
+        int                      err                    = 0;
+        int                      thread_spawn_count     = 0;
         pthread_t               *tid                    = NULL;
-        gf_boolean_t            is_tier_detach          = _gf_false;
-
+        gf_boolean_t             is_tier_detach         = _gf_false;
+        int                      j                      = 0;
 
         this = data;
         if (!this)
@@ -3876,14 +3950,8 @@ gf_defrag_start_crawl (void *data)
                         goto out;
                 }
 
-                /* Find local subvolumes */
-                ret = syncop_getxattr (this, &loc, &dict,
-                                       GF_REBAL_FIND_LOCAL_SUBVOL,
-                                       NULL, NULL);
+                ret = dht_get_local_subvols_and_nodeuuids (this, conf, &loc);
                 if (ret) {
-                        gf_msg (this->name, GF_LOG_ERROR, 0, 0, "local "
-                                "subvolume determination failed with error: %d",
-                                -ret);
                         ret = -1;
                         goto out;
                 }
@@ -3891,6 +3959,11 @@ gf_defrag_start_crawl (void *data)
                 for (i = 0 ; i < conf->local_subvols_cnt; i++) {
                         gf_msg (this->name, GF_LOG_INFO, 0, 0, "local subvols "
                                 "are %s", conf->local_subvols[i]->name);
+                        for (j = 0; j < conf->local_nodeuuids[i].count; j++) {
+                                gf_msg (this->name, GF_LOG_INFO, 0, 0,
+                                        "node uuids are %s",
+                                  uuid_utoa(conf->local_nodeuuids[i].uuids[j]));
+                        }
                 }
 
                 ret = gf_defrag_total_file_cnt (this, &loc);
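For illustration (not part of the patch): gf_defrag_should_i_migrate() above is the heart of the change. With bricks of one subvolume spread across nodes, each file is claimed by exactly one node: hash the file's gfid string, take it modulo the brick count, and migrate only if the uuid at that index is this node's. A standalone sketch of that selection rule, with djb2 standing in for dht_hash_compute() and plain strings for uuid_t:

/* Standalone sketch of the selection rule: hash(gfid) % count picks a
 * slot in the ordered uuid list; a node migrates only files whose slot
 * holds its own uuid. */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

static uint32_t
hash_stand_in (const char *s)           /* djb2, illustration only */
{
        uint32_t h = 5381;
        while (*s)
                h = h * 33 + (unsigned char)*s++;
        return h;
}

/* return 1 if my_uuid owns this gfid, 0 otherwise */
static int
should_i_migrate (const char *gfid_str, const char *node_uuids[], int count,
                  const char *my_uuid)
{
        if (count == 1)                 /* pure distribute: always ours */
                return 1;
        uint32_t idx = hash_stand_in (gfid_str) % count;
        return strcmp (node_uuids[idx], my_uuid) == 0;
}

int
main (void)
{
        const char *uuids[] = { "uuid-node-1", "uuid-node-2" };
        const char *gfid    = "example-gfid-string";
        printf ("node 1 migrates: %d\n",
                should_i_migrate (gfid, uuids, 2, "uuid-node-1"));
        printf ("node 2 migrates: %d\n",
                should_i_migrate (gfid, uuids, 2, "uuid-node-2"));
        return 0;
}

Because every rebalance process computes the same hash over the same ordered uuid list, exactly one node claims each file with no coordination between nodes.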
diff --git a/xlators/cluster/dht/src/tier.c b/xlators/cluster/dht/src/tier.c
index 41032743c04..4cbcc81f311 100644
--- a/xlators/cluster/dht/src/tier.c
+++ b/xlators/cluster/dht/src/tier.c
@@ -198,10 +198,17 @@ out:
 static int
 tier_check_same_node (xlator_t *this, loc_t *loc, gf_defrag_info_t *defrag)
 {
-        int     ret            = -1;
-        dict_t *dict           = NULL;
-        char   *uuid_str       = NULL;
-        uuid_t  node_uuid      = {0,};
+        int         ret                     = -1;
+        dict_t     *dict                    = NULL;
+        char       *uuid_str                = NULL;
+        uuid_t      node_uuid               = {0,};
+        char       *dup_str                 = NULL;
+        char       *str                     = NULL;
+        char       *save_ptr                = NULL;
+        int         count                   = 0;
+        uint32_t    hashval                 = 0;
+        int32_t     index                   = 0;
+        char        buf[GF_UUID_BUF_SIZE]   = {0,};
 
         GF_VALIDATE_OR_GOTO ("tier", this, out);
         GF_VALIDATE_OR_GOTO (this->name, loc, out);
@@ -215,15 +222,56 @@ tier_check_same_node (xlator_t *this, loc_t *loc, gf_defrag_info_t *defrag)
                 goto out;
         }
 
+
+        /* This returns multiple node-uuids now - one for each brick
+         * of the subvol.
+         */
+
         if (dict_get_str (dict, GF_XATTR_NODE_UUID_KEY, &uuid_str) < 0) {
                 gf_msg (this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
                         "Failed to get node-uuid for %s", loc->path);
                 goto out;
         }
 
+        dup_str = gf_strdup (uuid_str);
+        str = dup_str;
+
+        /* How many uuids were returned?
+         * No need to check if one of these is that of the current node.
+         */
+
+        count = 1;
+        while ((str = strchr (str, ' '))) {
+                count++;
+                str++;
+        }
+
+        /* Only one node-uuid - pure distribute? */
+        if (count == 1)
+                goto check_node;
+
+        uuid_utoa_r (loc->gfid, buf);
+        ret = dht_hash_compute (this, 0, buf, &hashval);
+        if (ret == 0) {
+                index = (hashval % count);
+        }
+
+        count = 0;
+        str = dup_str;
+        while ((uuid_str = strtok_r (str, " ", &save_ptr))) {
+                if (count == index)
+                        break;
+                count++;
+                str = NULL;
+        }
+
+
+check_node:
+
         if (gf_uuid_parse (uuid_str, node_uuid)) {
                 gf_msg (this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
                         "uuid_parse failed for %s", loc->path);
+                ret = -1;
                 goto out;
         }
@@ -239,6 +287,7 @@ out:
         if (dict)
                 dict_unref(dict);
 
+        GF_FREE (dup_str);
         return ret;
 }
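For illustration (not part of the patch): tier_check_same_node() above applies the same selection without the rebalance helper. It counts uuids by counting the space separators, hashes the gfid to pick an index, then strtok_r()s through the list until it reaches that token. A standalone sketch of the count-and-select step, with a fixed index standing in for hashval % count:

/* Sketch of the count-and-select in tier_check_same_node(): count
 * tokens via strchr(), then walk with strtok_r() to the chosen index. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int
main (void)
{
        const char *uuid_list = "1111-aaaa 2222-bbbb 3333-cccc";
        int index = 1;                  /* pretend hash % count chose 1 */

        /* count tokens: one more than the number of separators */
        int count = 1;
        for (const char *p = uuid_list; (p = strchr (p, ' ')); p++)
                count++;
        printf ("count = %d\n", count);

        /* walk to the index-th token */
        char *dup = strdup (uuid_list);
        char *saveptr = NULL;
        char *tok = strtok_r (dup, " ", &saveptr);
        for (int i = 0; i < index && tok; i++)
                tok = strtok_r (NULL, " ", &saveptr);
        printf ("selected uuid = %s\n", tok);

        free (dup);
        return 0;
}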
