From 25daa42911d2ff697880ee29c591cac5f2abebed Mon Sep 17 00:00:00 2001 From: Amar Tumballi Date: Fri, 9 Sep 2011 09:42:51 +0530 Subject: support for de-commissioning a node using 'remove-brick' to achieve this, we now create volume-file with 'decommissioned-nodes' option in distribute volume, then just perform the rebalance set of operations (with 'force' flag set). now onwards, the 'remove-brick' (with 'start' option) operation tries to migrate data from removed bricks to existing bricks. 'remove-brick' also supports similar options as of replace-brick. * (no options) -> works as 'force', will have the current behavior of remove-brick, ie., no data-migration, volume changes. * start (starts remove-brick with data-migration/draining process, which takes care of migrating data and once complete, will commit the changes to volume file) * pause (stop data migration, but keep the volume file intact with extra options whatever is set) * abort (stop data-migration, and fall back to old configuration) * commit (if volume is stopped, commits the changes to volumefile) * force (stops the data-migration and commits the changes to volume file) Change-Id: I3952bcfbe604a0952e68b6accace7014d5e401d3 BUG: 1952 Reviewed-on: http://review.gluster.com/118 Tested-by: Gluster Build System Reviewed-by: Vijay Bellur --- xlators/cluster/dht/src/dht-common.c | 62 ++++++++++++++++++++++++++++++++++ xlators/cluster/dht/src/dht-common.h | 3 ++ xlators/cluster/dht/src/dht-helper.c | 6 ++++ xlators/cluster/dht/src/dht-selfheal.c | 14 ++++++++ xlators/cluster/dht/src/dht.c | 62 ++++++++++++++++++++++++++++++---- 5 files changed, 141 insertions(+), 6 deletions(-) (limited to 'xlators/cluster/dht') diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c index 6f8594e30..e221e10ab 100644 --- a/xlators/cluster/dht/src/dht-common.c +++ b/xlators/cluster/dht/src/dht-common.c @@ -1689,6 +1689,46 @@ dht_common_setxattr_cbk (call_frame_t *frame, void *cookie, return 0; } +int +dht_checking_pathinfo_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xattr) +{ + int i = -1; + int ret = -1; + char *value = NULL; + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + call_frame_t *prev = NULL; + int this_call_cnt = 0; + + local = frame->local; + prev = cookie; + conf = this->private; + + if (op_ret == -1) + goto out; + + + ret = dict_get_str (xattr, GF_XATTR_PATHINFO_KEY, &value); + if (ret) + goto out; + + if (!strcmp (value, local->key)) { + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->subvolumes[i] == prev->this) + conf->decommissioned_bricks[i] = prev->this; + } + } + +out: + this_call_cnt = dht_frame_return (frame); + if (is_last_call (this_call_cnt)) { + DHT_STACK_UNWIND (setxattr, frame, local->op_ret, ENOTSUP); + } + return 0; + +} + int dht_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr, int flags) @@ -1771,6 +1811,28 @@ dht_setxattr (call_frame_t *frame, xlator_t *this, } + tmp = dict_get (xattr, "decommission-brick"); + if (tmp) { + /* This operation should happen only on '/' */ + if (__is_root_gfid (loc->inode->gfid) != 0) { + op_errno = ENOTSUP; + goto err; + } + + memcpy (value, tmp->data, ((tmp->len < 4095) ? tmp->len : 4095)); + local->key = gf_strdup (value); + local->call_cnt = conf->subvolume_cnt; + + for (i = 0 ; i < conf->subvolume_cnt; i++) { + /* Get the pathinfo, and then compare */ + STACK_WIND (frame, dht_checking_pathinfo_cbk, + conf->subvolumes[i], + conf->subvolumes[i]->fops->getxattr, + loc, GF_XATTR_PATHINFO_KEY); + } + return 0; + } + tmp = dict_get (xattr, GF_XATTR_FIX_LAYOUT_KEY); if (tmp) { gf_log (this->name, GF_LOG_INFO, diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h index ab1b82af2..3545c0f99 100644 --- a/xlators/cluster/dht/src/dht-common.h +++ b/xlators/cluster/dht/src/dht-common.h @@ -202,6 +202,9 @@ struct dht_conf { uint32_t dir_spread_cnt; struct syncenv *env; /* The env pointer to the rebalance synctask */ + + /* to keep track of nodes which are decomissioned */ + xlator_t **decommissioned_bricks; }; typedef struct dht_conf dht_conf_t; diff --git a/xlators/cluster/dht/src/dht-helper.c b/xlators/cluster/dht/src/dht-helper.c index 99abe023b..d8138067e 100644 --- a/xlators/cluster/dht/src/dht-helper.c +++ b/xlators/cluster/dht/src/dht-helper.c @@ -579,6 +579,12 @@ dht_init_subvolumes (xlator_t *this, dht_conf_t *conf) return -1; } + conf->decommissioned_bricks = GF_CALLOC (cnt, sizeof (xlator_t *), + gf_dht_mt_xlator_t); + if (!conf->decommissioned_bricks) { + return -1; + } + return 0; } diff --git a/xlators/cluster/dht/src/dht-selfheal.c b/xlators/cluster/dht/src/dht-selfheal.c index 882e0209e..1c881be39 100644 --- a/xlators/cluster/dht/src/dht-selfheal.c +++ b/xlators/cluster/dht/src/dht-selfheal.c @@ -460,8 +460,22 @@ static inline int dht_get_layout_count (xlator_t *this, dht_layout_t *layout, int new_layout) { int i = 0; + int j = 0; int err = 0; int count = 0; + dht_conf_t *conf = NULL; + + /* Gets in use only for replace-brick, remove-brick */ + conf = this->private; + for (i = 0; i < layout->cnt; i++) { + for (j = 0; j < conf->subvolume_cnt; j++) { + if (conf->decommissioned_bricks[j] && + conf->decommissioned_bricks[j] == layout->list[i].xlator) { + layout->list[i].err = -EINVAL; + break; + } + } + } for (i = 0; i < layout->cnt; i++) { err = layout->list[i].err; diff --git a/xlators/cluster/dht/src/dht.c b/xlators/cluster/dht/src/dht.c index 87a575654..d9499a407 100644 --- a/xlators/cluster/dht/src/dht.c +++ b/xlators/cluster/dht/src/dht.c @@ -254,6 +254,47 @@ out: } +int +dht_parse_decommissioned_bricks (xlator_t *this, dht_conf_t *conf, + const char *bricks) +{ + int i = 0; + int ret = -1; + char *tmpstr = NULL; + char *dup_brick = NULL; + char *node = NULL; + + if (!conf || !bricks) + goto out; + + dup_brick = gf_strdup (bricks); + node = strtok_r (dup_brick, ",", &tmpstr); + while (node) { + for (i = 0; i < conf->subvolume_cnt; i++) { + if (!strcmp (conf->subvolumes[i]->name, node)) { + conf->decommissioned_bricks[i] = + conf->subvolumes[i]; + gf_log (this->name, GF_LOG_INFO, + "decommissioning subvolume %s", + conf->subvolumes[i]->name); + break; + } + } + if (i == conf->subvolume_cnt) { + /* Wrong node given. */ + goto out; + } + node = strtok_r (NULL, ",", &tmpstr); + } + + ret = 0; +out: + if (dup_brick) + GF_FREE (dup_brick); + + return ret; +} + int reconfigure (xlator_t *this, dict_t *options) { @@ -299,6 +340,12 @@ reconfigure (xlator_t *this, dict_t *options) GF_OPTION_RECONF ("directory-layout-spread", conf->dir_spread_cnt, options, uint32, out); + if (dict_get_str (options, "decommissioned-bricks", &temp_str) == 0) { + ret = dht_parse_decommissioned_bricks (this, conf, temp_str); + if (ret == -1) + goto out; + } + ret = 0; out: return ret; @@ -360,14 +407,14 @@ init (xlator_t *this) goto err; } - ret = dht_layouts_init (this, conf); - if (ret == -1) { - goto err; + if (dict_get_str (this->options, "decommissioned-bricks", &temp_str) == 0) { + ret = dht_parse_decommissioned_bricks (this, conf, temp_str); + if (ret == -1) + goto err; } - conf->du_stats = GF_CALLOC (conf->subvolume_cnt, sizeof (dht_du_t), - gf_dht_mt_dht_du_t); - if (!conf->du_stats) { + ret = dht_layouts_init (this, conf); + if (ret == -1) { goto err; } @@ -501,5 +548,8 @@ struct volume_options options[] = { { .key = {"directory-layout-spread"}, .type = GF_OPTION_TYPE_INT, }, + { .key = {"decommissioned-bricks"}, + .type = GF_OPTION_TYPE_ANY, + }, { .key = {NULL} }, }; -- cgit