diff options
Diffstat (limited to 'xlators/cluster/dht/src/dht-shared.c')
| -rw-r--r-- | xlators/cluster/dht/src/dht-shared.c | 1104 |
1 files changed, 1104 insertions, 0 deletions
diff --git a/xlators/cluster/dht/src/dht-shared.c b/xlators/cluster/dht/src/dht-shared.c new file mode 100644 index 00000000000..bb72b0ffbb5 --- /dev/null +++ b/xlators/cluster/dht/src/dht-shared.c @@ -0,0 +1,1104 @@ +/* + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +/* TODO: add NS locking */ +#include <glusterfs/statedump.h> +#include "dht-common.h" +#include "dht-messages.h" + +#ifndef MAX +#define MAX(a, b) (((a) > (b)) ? (a) : (b)) +#endif + +/* TODO: + - use volumename in xattr instead of "dht" + - use NS locks + - handle all cases in self heal layout reconstruction + - complete linkfile selfheal +*/ + +static void +dht_layout_dump(dht_layout_t *layout, const char *prefix) +{ + char key[GF_DUMP_MAX_BUF_LEN]; + int i = 0; + + if (!layout) + goto out; + + gf_proc_dump_build_key(key, prefix, "cnt"); + gf_proc_dump_write(key, "%d", layout->cnt); + gf_proc_dump_build_key(key, prefix, "preset"); + gf_proc_dump_write(key, "%d", layout->preset); + gf_proc_dump_build_key(key, prefix, "gen"); + gf_proc_dump_write(key, "%d", layout->gen); + if (layout->type != IA_INVAL) { + gf_proc_dump_build_key(key, prefix, "inode type"); + gf_proc_dump_write(key, "%d", layout->type); + } + + if (!IA_ISDIR(layout->type)) + goto out; + + for (i = 0; i < layout->cnt; i++) { + gf_proc_dump_build_key(key, prefix, "list[%d].err", i); + gf_proc_dump_write(key, "%d", layout->list[i].err); + gf_proc_dump_build_key(key, prefix, "list[%d].start", i); + gf_proc_dump_write(key, "0x%x", layout->list[i].start); + gf_proc_dump_build_key(key, prefix, "list[%d].stop", i); + gf_proc_dump_write(key, "0x%x", layout->list[i].stop); + if (layout->list[i].xlator) { + gf_proc_dump_build_key(key, prefix, "list[%d].xlator.type", i); + gf_proc_dump_write(key, "%s", layout->list[i].xlator->type); + gf_proc_dump_build_key(key, prefix, "list[%d].xlator.name", i); + gf_proc_dump_write(key, "%s", layout->list[i].xlator->name); + } + } + +out: + return; +} + +int32_t +dht_priv_dump(xlator_t *this) +{ + char key_prefix[GF_DUMP_MAX_BUF_LEN]; + char key[GF_DUMP_MAX_BUF_LEN]; + int i = 0; + dht_conf_t *conf = NULL; + int ret = -1; + + if (!this) + goto out; + + conf = this->private; + if (!conf) + goto out; + + ret = TRY_LOCK(&conf->subvolume_lock); + if (ret != 0) { + return ret; + } + + gf_proc_dump_add_section("xlator.cluster.dht.%s.priv", this->name); + gf_proc_dump_build_key(key_prefix, "xlator.cluster.dht", "%s.priv", + this->name); + gf_proc_dump_write("subvol_cnt", "%d", conf->subvolume_cnt); + for (i = 0; i < conf->subvolume_cnt; i++) { + snprintf(key, sizeof(key), "subvolumes[%d]", i); + gf_proc_dump_write(key, "%s.%s", conf->subvolumes[i]->type, + conf->subvolumes[i]->name); + if (conf->file_layouts && conf->file_layouts[i]) { + snprintf(key, sizeof(key), "file_layouts[%d]", i); + dht_layout_dump(conf->file_layouts[i], key); + } + if (conf->dir_layouts && conf->dir_layouts[i]) { + snprintf(key, sizeof(key), "dir_layouts[%d]", i); + dht_layout_dump(conf->dir_layouts[i], key); + } + if (conf->subvolume_status) { + snprintf(key, sizeof(key), "subvolume_status[%d]", i); + gf_proc_dump_write(key, "%d", (int)conf->subvolume_status[i]); + } + } + + gf_proc_dump_write("search_unhashed", "%d", conf->search_unhashed); + gf_proc_dump_write("gen", "%d", conf->gen); + gf_proc_dump_write("min_free_disk", "%lf", conf->min_free_disk); + gf_proc_dump_write("min_free_inodes", "%lf", conf->min_free_inodes); + gf_proc_dump_write("disk_unit", "%c", conf->disk_unit); + gf_proc_dump_write("refresh_interval", "%d", conf->refresh_interval); + gf_proc_dump_write("unhashed_sticky_bit", "%d", conf->unhashed_sticky_bit); + gf_proc_dump_write("use-readdirp", "%d", conf->use_readdirp); + + if (conf->du_stats && conf->subvolume_status) { + for (i = 0; i < conf->subvolume_cnt; i++) { + if (!conf->subvolume_status[i]) + continue; + + snprintf(key, sizeof(key), "subvolumes[%d]", i); + gf_proc_dump_write(key, "%s", conf->subvolumes[i]->name); + + snprintf(key, sizeof(key), "du_stats[%d].avail_percent", i); + gf_proc_dump_write(key, "%lf", conf->du_stats[i].avail_percent); + + snprintf(key, sizeof(key), "du_stats[%d].avail_space", i); + gf_proc_dump_write(key, "%" PRIu64, conf->du_stats[i].avail_space); + + snprintf(key, sizeof(key), "du_stats[%d].avail_inodes", i); + gf_proc_dump_write(key, "%lf", conf->du_stats[i].avail_inodes); + + snprintf(key, sizeof(key), "du_stats[%d].log", i); + gf_proc_dump_write(key, "%" PRIu32, conf->du_stats[i].log); + } + } + + if (conf->last_stat_fetch) + gf_proc_dump_write("last_stat_fetch", "%s", + ctime(&conf->last_stat_fetch)); + + UNLOCK(&conf->subvolume_lock); + +out: + return ret; +} + +int32_t +dht_inodectx_dump(xlator_t *this, inode_t *inode) +{ + int ret = -1; + dht_layout_t *layout = NULL; + + if (!this) + goto out; + if (!inode) + goto out; + + ret = dht_inode_ctx_layout_get(inode, this, &layout); + + if ((ret != 0) || !layout) + return ret; + + gf_proc_dump_add_section("xlator.cluster.dht.%s.inode", this->name); + dht_layout_dump(layout, "layout"); + +out: + return ret; +} + +void +dht_fini(xlator_t *this) +{ + int i = 0; + dht_conf_t *conf = NULL; + + GF_VALIDATE_OR_GOTO("dht", this, out); + + conf = this->private; + this->private = NULL; + if (conf) { + if (conf->file_layouts) { + for (i = 0; i < conf->subvolume_cnt; i++) { + GF_FREE(conf->file_layouts[i]); + } + GF_FREE(conf->file_layouts); + } + + dict_unref(conf->leaf_to_subvol); + + /* allocated in dht_init_subvolumes() */ + GF_FREE(conf->subvolumes); + GF_FREE(conf->subvolume_status); + GF_FREE(conf->last_event); + GF_FREE(conf->subvol_up_time); + GF_FREE(conf->du_stats); + GF_FREE(conf->decommissioned_bricks); + + /* allocated in dht_init() */ + GF_FREE(conf->mds_xattr_key); + GF_FREE(conf->link_xattr_name); + GF_FREE(conf->commithash_xattr_name); + GF_FREE(conf->wild_xattr_name); + + /* allocated in dht_init_regex() */ + if (conf->rsync_regex_valid) + regfree(&conf->rsync_regex); + if (conf->extra_regex_valid) + regfree(&conf->extra_regex); + + synclock_destroy(&conf->link_lock); + + if (conf->lock_pool) + mem_pool_destroy(conf->lock_pool); + + GF_FREE(conf); + } +out: + return; +} + +int32_t +mem_acct_init(xlator_t *this) +{ + int ret = -1; + + GF_VALIDATE_OR_GOTO("dht", this, out); + + ret = xlator_mem_acct_init(this, gf_dht_mt_end + 1); + + if (ret != 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_NO_MEMORY, + "Memory accounting init failed"); + return ret; + } +out: + return ret; +} + +static int +dht_parse_decommissioned_bricks(xlator_t *this, dht_conf_t *conf, + const char *bricks) +{ + int i = 0; + int ret = -1; + char *tmpstr = NULL; + char *dup_brick = NULL; + char *node = NULL; + + if (!conf || !bricks) + goto out; + + dup_brick = gf_strdup(bricks); + if (dup_brick == NULL) { + goto out; + } + + node = strtok_r(dup_brick, ",", &tmpstr); + while (node) { + for (i = 0; i < conf->subvolume_cnt; i++) { + if (!strcmp(conf->subvolumes[i]->name, node)) { + conf->decommissioned_bricks[i] = conf->subvolumes[i]; + conf->decommission_subvols_cnt++; + gf_msg(this->name, GF_LOG_INFO, 0, + DHT_MSG_SUBVOL_DECOMMISSION_INFO, + "decommissioning subvolume %s", + conf->subvolumes[i]->name); + break; + } + } + if (i == conf->subvolume_cnt) { + /* Wrong node given. */ + goto out; + } + node = strtok_r(NULL, ",", &tmpstr); + } + + ret = 0; + conf->decommission_in_progress = 1; +out: + GF_FREE(dup_brick); + + return ret; +} + +static void +dht_decommissioned_remove(xlator_t *this, dht_conf_t *conf) +{ + int i = 0; + + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->decommissioned_bricks[i]) { + conf->decommissioned_bricks[i] = NULL; + conf->decommission_subvols_cnt--; + } + } +} + +static void +dht_init_regex(xlator_t *this, dict_t *odict, char *name, regex_t *re, + gf_boolean_t *re_valid, dht_conf_t *conf) +{ + char *temp_str = NULL; + + if (dict_get_str(odict, name, &temp_str) != 0) { + if (strcmp(name, "rsync-hash-regex")) { + return; + } + temp_str = "^\\.(.+)\\.[^.]+$"; + } + + LOCK(&conf->lock); + { + if (*re_valid) { + regfree(re); + *re_valid = _gf_false; + } + + if (!strcmp(temp_str, "none")) { + goto unlock; + } + + if (regcomp(re, temp_str, REG_EXTENDED) == 0) { + gf_msg_debug(this->name, 0, "using regex %s = %s", name, temp_str); + *re_valid = _gf_true; + } else { + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_REGEX_INFO, + "compiling regex %s failed", temp_str); + } + } +unlock: + UNLOCK(&conf->lock); +} + +int +dht_set_subvol_range(xlator_t *this) +{ + int ret = -1; + dht_conf_t *conf = NULL; + + conf = this->private; + + if (!conf) + goto out; + + conf->leaf_to_subvol = dict_new(); + if (!conf->leaf_to_subvol) + goto out; + + ret = glusterfs_reachable_leaves(this, conf->leaf_to_subvol); + +out: + return ret; +} + +static int +dht_configure_throttle(xlator_t *this, dht_conf_t *conf, char *temp_str) +{ + int rebal_thread_count = 0; + int ret = 0; + + pthread_mutex_lock(&conf->defrag->dfq_mutex); + { + if (!strcasecmp(temp_str, "lazy")) { + conf->defrag->recon_thread_count = 1; + } else if (!strcasecmp(temp_str, "normal")) { + conf->defrag->recon_thread_count = 2; + } else if (!strcasecmp(temp_str, "aggressive")) { + conf->defrag->recon_thread_count = MAX(MAX_REBAL_THREADS - 4, 4); + } else if ((gf_string2int(temp_str, &rebal_thread_count) == 0)) { + if ((rebal_thread_count > 0) && + (rebal_thread_count <= MAX_REBAL_THREADS)) { + conf->defrag->recon_thread_count = rebal_thread_count; + pthread_mutex_unlock(&conf->defrag->dfq_mutex); + gf_msg(this->name, GF_LOG_INFO, 0, 0, + "rebal thread count configured to %d", + rebal_thread_count); + goto out; + } else { + pthread_mutex_unlock(&conf->defrag->dfq_mutex); + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INVALID_OPTION, + "Invalid option: Reconfigure: " + "rebal-throttle should be " + "within range of 0 and maximum number of" + " cores available"); + ret = -1; + goto out; + } + } else { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INVALID_OPTION, + "Invalid option: Reconfigure: " + "rebal-throttle should be {lazy|normal|aggressive}" + " or a number up to the number of cores available," + " not (%s), defaulting to (%d)", + temp_str, conf->dthrottle); + ret = -1; + } + } + pthread_mutex_unlock(&conf->defrag->dfq_mutex); + +out: + return ret; +} + +int +dht_reconfigure(xlator_t *this, dict_t *options) +{ + dht_conf_t *conf = NULL; + char *temp_str = NULL; + gf_boolean_t search_unhashed; + int ret = -1; + + GF_VALIDATE_OR_GOTO("dht", this, out); + GF_VALIDATE_OR_GOTO("dht", options, out); + + conf = this->private; + if (!conf) + return 0; + + if (dict_get_str(options, "lookup-unhashed", &temp_str) == 0) { + /* If option is not "auto", other options _should_ be boolean*/ + if (strcasecmp(temp_str, "auto")) { + if (!gf_string2boolean(temp_str, &search_unhashed)) { + gf_msg_debug(this->name, 0, + "Reconfigure: " + "lookup-unhashed reconfigured(%s)", + temp_str); + conf->search_unhashed = search_unhashed; + } else { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INVALID_OPTION, + "Invalid option: Reconfigure: " + "lookup-unhashed should be boolean," + " not (%s), defaulting to (%d)", + temp_str, conf->search_unhashed); + ret = -1; + goto out; + } + } else { + gf_msg_debug(this->name, 0, + "Reconfigure:" + " lookup-unhashed reconfigured auto "); + conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_AUTO; + } + } + + GF_OPTION_RECONF("lookup-optimize", conf->lookup_optimize, options, bool, + out); + + GF_OPTION_RECONF("min-free-disk", conf->min_free_disk, options, + percent_or_size, out); + /* option can be any one of percent or bytes */ + conf->disk_unit = 0; + if (conf->min_free_disk < 100.0) + conf->disk_unit = 'p'; + + GF_OPTION_RECONF("min-free-inodes", conf->min_free_inodes, options, percent, + out); + + GF_OPTION_RECONF("directory-layout-spread", conf->dir_spread_cnt, options, + uint32, out); + + GF_OPTION_RECONF("readdir-optimize", conf->readdir_optimize, options, bool, + out); + GF_OPTION_RECONF("randomize-hash-range-by-gfid", conf->randomize_by_gfid, + options, bool, out); + + GF_OPTION_RECONF("lock-migration", conf->lock_migration_enabled, options, + bool, out); + + GF_OPTION_RECONF("force-migration", conf->force_migration, options, bool, + out); + + if (conf->defrag) { + if (dict_get_str(options, "rebal-throttle", &temp_str) == 0) { + ret = dht_configure_throttle(this, conf, temp_str); + if (ret == -1) + goto out; + } + } + + if (conf->defrag) { + conf->defrag->lock_migration_enabled = conf->lock_migration_enabled; + } + + if (conf->defrag) { + GF_OPTION_RECONF("rebalance-stats", conf->defrag->stats, options, bool, + out); + } + + if (dict_get_str(options, "decommissioned-bricks", &temp_str) == 0) { + ret = dht_parse_decommissioned_bricks(this, conf, temp_str); + if (ret == -1) + goto out; + } else { + dht_decommissioned_remove(this, conf); + } + + dht_init_regex(this, options, "rsync-hash-regex", &conf->rsync_regex, + &conf->rsync_regex_valid, conf); + dht_init_regex(this, options, "extra-hash-regex", &conf->extra_regex, + &conf->extra_regex_valid, conf); + + GF_OPTION_RECONF("weighted-rebalance", conf->do_weighting, options, bool, + out); + + GF_OPTION_RECONF("use-readdirp", conf->use_readdirp, options, bool, out); + ret = 0; +out: + return ret; +} + +static int +gf_defrag_pattern_list_fill(xlator_t *this, gf_defrag_info_t *defrag, + char *data) +{ + int ret = -1; + char *tmp_str = NULL; + char *tmp_str1 = NULL; + char *dup_str = NULL; + char *num = NULL; + char *pattern_str = NULL; + char *pattern = NULL; + gf_defrag_pattern_list_t *temp_list = NULL; + gf_defrag_pattern_list_t *pattern_list = NULL; + + if (!this || !defrag || !data) + goto out; + + /* Get the pattern for pattern list. "pattern:<optional-size>" + * eg: *avi, *pdf:10MB, *:1TB + */ + pattern_str = strtok_r(data, ",", &tmp_str); + while (pattern_str) { + dup_str = gf_strdup(pattern_str); + if (!dup_str) + goto out; + pattern_list = GF_CALLOC(1, sizeof(gf_defrag_pattern_list_t), 1); + if (!pattern_list) { + goto out; + } + pattern = strtok_r(dup_str, ":", &tmp_str1); + num = strtok_r(NULL, ":", &tmp_str1); + if (!pattern) + goto out; + if (!num) { + if (gf_string2bytesize_uint64(pattern, &pattern_list->size) == 0) { + pattern = "*"; + } + } else if (gf_string2bytesize_uint64(num, &pattern_list->size) != 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INVALID_OPTION, + "Invalid option. Defrag pattern:" + " Invalid number format \"%s\"", + num); + goto out; + } + memcpy(pattern_list->path_pattern, pattern, strlen(dup_str)); + + if (!defrag->defrag_pattern) + temp_list = NULL; + else + temp_list = defrag->defrag_pattern; + + pattern_list->next = temp_list; + + defrag->defrag_pattern = pattern_list; + pattern_list = NULL; + + GF_FREE(dup_str); + dup_str = NULL; + + pattern_str = strtok_r(NULL, ",", &tmp_str); + } + + ret = 0; +out: + if (ret) + GF_FREE(pattern_list); + GF_FREE(dup_str); + + return ret; +} + +static int +dht_init_methods(xlator_t *this) +{ + int ret = -1; + dht_conf_t *conf = NULL; + dht_methods_t *methods = NULL; + + GF_VALIDATE_OR_GOTO("dht", this, err); + + conf = this->private; + methods = &(conf->methods); + + methods->migration_get_dst_subvol = dht_migration_get_dst_subvol; + methods->migration_other = NULL; + methods->layout_search = dht_layout_search; + + ret = 0; +err: + return ret; +} + +int +dht_init(xlator_t *this) +{ + dht_conf_t *conf = NULL; + char *temp_str = NULL; + int ret = -1; + int i = 0; + gf_defrag_info_t *defrag = NULL; + int cmd = 0; + char *node_uuid = NULL; + uint32_t commit_hash = 0; + + GF_VALIDATE_OR_GOTO("dht", this, err); + + if (!this->children) { + gf_msg(this->name, GF_LOG_CRITICAL, 0, DHT_MSG_INVALID_CONFIGURATION, + "Distribute needs more than one subvolume"); + return -1; + } + + if (!this->parents) { + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_INVALID_CONFIGURATION, + "dangling volume. check volfile"); + } + + conf = GF_CALLOC(1, sizeof(*conf), gf_dht_mt_dht_conf_t); + if (!conf) { + goto err; + } + + LOCK_INIT(&conf->subvolume_lock); + LOCK_INIT(&conf->layout_lock); + LOCK_INIT(&conf->lock); + synclock_init(&conf->link_lock, SYNC_LOCK_DEFAULT); + + /* We get the commit-hash to set only for rebalance process */ + if (dict_get_uint32(this->options, "commit-hash", &commit_hash) == 0) { + gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_COMMIT_HASH_INFO, + "%s using commit hash %u", __func__, commit_hash); + conf->vol_commit_hash = commit_hash; + conf->vch_forced = _gf_true; + } + + ret = dict_get_int32(this->options, "rebalance-cmd", &cmd); + + if (cmd) { + defrag = GF_CALLOC(1, sizeof(gf_defrag_info_t), gf_defrag_info_mt); + + GF_VALIDATE_OR_GOTO(this->name, defrag, err); + + LOCK_INIT(&defrag->lock); + + defrag->is_exiting = 0; + + conf->defrag = defrag; + defrag->this = this; + + ret = dict_get_str(this->options, "node-uuid", &node_uuid); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INVALID_CONFIGURATION, + "Invalid volume configuration: " + "node-uuid not specified"); + goto err; + } + + if (gf_uuid_parse(node_uuid, defrag->node_uuid)) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INVALID_OPTION, + "Invalid option:" + " Cannot parse glusterd node uuid"); + goto err; + } + + defrag->cmd = cmd; + + defrag->stats = _gf_false; + + defrag->queue = NULL; + + defrag->crawl_done = 0; + + defrag->global_error = 0; + + defrag->q_entry_count = 0; + + defrag->wakeup_crawler = 0; + + pthread_mutex_init(&defrag->dfq_mutex, 0); + pthread_cond_init(&defrag->parallel_migration_cond, 0); + pthread_cond_init(&defrag->rebalance_crawler_alarm, 0); + pthread_cond_init(&defrag->df_wakeup_thread, 0); + + pthread_mutex_init(&defrag->fc_mutex, 0); + pthread_cond_init(&defrag->fc_wakeup_cond, 0); + + defrag->global_error = 0; + } + + conf->use_fallocate = 1; + + conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_ON; + if (dict_get_str(this->options, "lookup-unhashed", &temp_str) == 0) { + /* If option is not "auto", other options _should_ be boolean */ + if (strcasecmp(temp_str, "auto")) { + gf_boolean_t search_unhashed_bool; + ret = gf_string2boolean(temp_str, &search_unhashed_bool); + if (ret == -1) { + goto err; + } + conf->search_unhashed = search_unhashed_bool + ? GF_DHT_LOOKUP_UNHASHED_ON + : GF_DHT_LOOKUP_UNHASHED_OFF; + } else { + conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_AUTO; + } + } + + GF_OPTION_INIT("lookup-optimize", conf->lookup_optimize, bool, err); + + GF_OPTION_INIT("unhashed-sticky-bit", conf->unhashed_sticky_bit, bool, err); + + GF_OPTION_INIT("use-readdirp", conf->use_readdirp, bool, err); + + GF_OPTION_INIT("min-free-disk", conf->min_free_disk, percent_or_size, err); + + GF_OPTION_INIT("min-free-inodes", conf->min_free_inodes, percent, err); + + conf->dir_spread_cnt = conf->subvolume_cnt; + GF_OPTION_INIT("directory-layout-spread", conf->dir_spread_cnt, uint32, + err); + + GF_OPTION_INIT("assert-no-child-down", conf->assert_no_child_down, bool, + err); + + GF_OPTION_INIT("readdir-optimize", conf->readdir_optimize, bool, err); + + GF_OPTION_INIT("lock-migration", conf->lock_migration_enabled, bool, err); + + GF_OPTION_INIT("force-migration", conf->force_migration, bool, err); + + if (defrag) { + defrag->lock_migration_enabled = conf->lock_migration_enabled; + + GF_OPTION_INIT("rebalance-stats", defrag->stats, bool, err); + if (dict_get_str(this->options, "rebalance-filter", &temp_str) == 0) { + if (gf_defrag_pattern_list_fill(this, defrag, temp_str) == -1) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INVALID_OPTION, + "Invalid option:" + " Cannot parse rebalance-filter (%s)", + temp_str); + + goto err; + } + } + } + + /* option can be any one of percent or bytes */ + conf->disk_unit = 0; + if (conf->min_free_disk < 100) + conf->disk_unit = 'p'; + + ret = dht_init_subvolumes(this, conf); + if (ret == -1) { + goto err; + } + + if (cmd) { + ret = dht_init_local_subvolumes(this, conf); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, + DHT_MSG_INIT_LOCAL_SUBVOL_FAILED, + "dht_init_local_subvolumes failed"); + goto err; + } + } + + if (dict_get_str(this->options, "decommissioned-bricks", &temp_str) == 0) { + ret = dht_parse_decommissioned_bricks(this, conf, temp_str); + if (ret == -1) + goto err; + } + + dht_init_regex(this, this->options, "rsync-hash-regex", &conf->rsync_regex, + &conf->rsync_regex_valid, conf); + dht_init_regex(this, this->options, "extra-hash-regex", &conf->extra_regex, + &conf->extra_regex_valid, conf); + + ret = dht_layouts_init(this, conf); + if (ret == -1) { + goto err; + } + + conf->gen = 1; + + this->local_pool = mem_pool_new(dht_local_t, 512); + if (!this->local_pool) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY, + " DHT initialisation failed. " + "failed to create local_t's memory pool"); + goto err; + } + + GF_OPTION_INIT("randomize-hash-range-by-gfid", conf->randomize_by_gfid, + bool, err); + + if (defrag) { + GF_OPTION_INIT("rebal-throttle", temp_str, str, err); + if (temp_str) { + ret = dht_configure_throttle(this, conf, temp_str); + if (ret == -1) + goto err; + } + } + + GF_OPTION_INIT("xattr-name", conf->xattr_name, str, err); + gf_asprintf(&conf->mds_xattr_key, "%s." DHT_MDS_STR, conf->xattr_name); + gf_asprintf(&conf->link_xattr_name, "%s." DHT_LINKFILE_STR, + conf->xattr_name); + gf_asprintf(&conf->commithash_xattr_name, "%s." DHT_COMMITHASH_STR, + conf->xattr_name); + gf_asprintf(&conf->wild_xattr_name, "%s*", conf->xattr_name); + if (!conf->link_xattr_name || !conf->wild_xattr_name) { + goto err; + } + + GF_OPTION_INIT("weighted-rebalance", conf->do_weighting, bool, err); + + conf->lock_pool = mem_pool_new(dht_lock_t, 512); + if (!conf->lock_pool) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INIT_FAILED, + "failed to create lock mem_pool, failing " + "initialization"); + goto err; + } + + this->private = conf; + + if (dht_set_subvol_range(this)) + goto err; + + if (dht_init_methods(this)) + goto err; + + return 0; + +err: + if (conf) { + if (conf->file_layouts) { + for (i = 0; i < conf->subvolume_cnt; i++) { + GF_FREE(conf->file_layouts[i]); + } + GF_FREE(conf->file_layouts); + } + + GF_FREE(conf->subvolumes); + + GF_FREE(conf->subvolume_status); + + GF_FREE(conf->du_stats); + + GF_FREE(conf->defrag); + + GF_FREE(conf->xattr_name); + GF_FREE(conf->link_xattr_name); + GF_FREE(conf->wild_xattr_name); + GF_FREE(conf->mds_xattr_key); + + if (conf->lock_pool) + mem_pool_destroy(conf->lock_pool); + + GF_FREE(conf); + } + + return -1; +} + +struct volume_options dht_options[] = { + { + .key = {"lookup-unhashed"}, + .value = {"auto", "yes", "no", "enable", "disable", "1", "0", "on", + "off"}, + .type = GF_OPTION_TYPE_STR, + .default_value = "on", + .description = + "This option if set to ON, does a lookup through " + "all the sub-volumes, in case a lookup didn't return any result " + "from the hash subvolume. If set to OFF, it does not do a lookup " + "on the remaining subvolumes.", + .op_version = {1}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE, + .level = OPT_STATUS_BASIC, + }, + {.key = {"lookup-optimize"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", + .description = + "This option if set to ON enables the optimization " + "of -ve lookups, by not doing a lookup on non-hashed subvolumes for " + "files, in case the hashed subvolume does not return any result. " + "This option disregards the lookup-unhashed setting, when enabled.", + .op_version = {GD_OP_VERSION_3_7_2}, + .level = OPT_STATUS_ADVANCED, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC}, + {.key = {"min-free-disk"}, + .type = GF_OPTION_TYPE_PERCENT_OR_SIZET, + .default_value = "10%", + .description = + "Percentage/Size of disk space, after which the " + "process starts balancing out the cluster, and logs will appear " + "in log files", + .op_version = {1}, + .level = OPT_STATUS_BASIC, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC}, + {.key = {"min-free-inodes"}, + .type = GF_OPTION_TYPE_PERCENT, + .default_value = "5%", + .description = "after system has only N% of inodes, warnings " + "starts to appear in log files", + .op_version = {1}, + .level = OPT_STATUS_BASIC, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC}, + { + .key = {"unhashed-sticky-bit"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + }, + {.key = {"use-readdirp"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", + .description = "This option if set to ON, forces the use of " + "readdirp, and hence also displays the stats of the files.", + .level = OPT_STATUS_ADVANCED, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC}, + {.key = {"assert-no-child-down"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "This option if set to ON, in the event of " + "CHILD_DOWN, will call exit."}, + { + .key = {"directory-layout-spread"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .validate = GF_OPT_VALIDATE_MIN, + .description = "Specifies the directory layout spread. Takes number " + "of subvolumes as default value.", + + .op_version = {2}, + }, + { + .key = {"decommissioned-bricks"}, + .type = GF_OPTION_TYPE_ANY, + .description = + "This option if set to ON, decommissions " + "the brick, so that no new data is allowed to be created " + "on that brick.", + .level = OPT_STATUS_ADVANCED, + }, + { + .key = {"rebalance-cmd"}, + .type = GF_OPTION_TYPE_INT, + }, + { + .key = {"commit-hash"}, + .type = GF_OPTION_TYPE_INT, + }, + { + .key = {"node-uuid"}, + .type = GF_OPTION_TYPE_STR, + }, + { + .key = {"rebalance-stats"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = + "This option if set to ON displays and logs the " + " time taken for migration of each file, during the rebalance " + "process. If set to OFF, the rebalance logs will only display the " + "time spent in each directory.", + .op_version = {2}, + .level = OPT_STATUS_BASIC, + }, + {.key = {"readdir-optimize"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = + "This option if set to ON enables the optimization " + "that allows DHT to requests non-first subvolumes to filter out " + "directory entries.", + .op_version = {1}, + .level = OPT_STATUS_ADVANCED, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC}, + {.key = {"rsync-hash-regex"}, + .type = GF_OPTION_TYPE_STR, + /* Setting a default here doesn't work. See dht_init_regex. */ + .description = + "Regular expression for stripping temporary-file " + "suffix and prefix used by rsync, to prevent relocation when the " + "file is renamed.", + .op_version = {3}, + .level = OPT_STATUS_BASIC, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC}, + {.key = {"extra-hash-regex"}, + .type = GF_OPTION_TYPE_STR, + /* Setting a default here doesn't work. See dht_init_regex. */ + .description = + "Regular expression for stripping temporary-file " + "suffix and prefix used by an application, to prevent relocation when " + "the file is renamed.", + .op_version = {3}, + .level = OPT_STATUS_BASIC, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC}, + { + .key = {"rebalance-filter"}, + .type = GF_OPTION_TYPE_STR, + }, + + { + .key = {"xattr-name"}, + .type = GF_OPTION_TYPE_STR, + .default_value = "trusted.glusterfs.dht", + .description = + "Base for extended attributes used by this " + "translator instance, to avoid conflicts with others above or " + "below it.", + .op_version = {3}, + }, + + {.key = {"weighted-rebalance"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", + .description = + "When enabled, files will be allocated to bricks " + "with a probability proportional to their size. Otherwise, all " + "bricks will have the same probability (legacy behavior).", + .op_version = {GD_OP_VERSION_3_6_0}, + .level = OPT_STATUS_BASIC, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC}, + + /* NUFA option */ + {.key = {"local-volume-name"}, .type = GF_OPTION_TYPE_XLATOR}, + + /* switch option */ + {.key = {"pattern.switch.case"}, .type = GF_OPTION_TYPE_ANY}, + + { + .key = {"randomize-hash-range-by-gfid"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = + "Use gfid of directory to determine the subvolume " + "from which hash ranges are allocated starting with 0. " + "Note that we still use a directory/file's name to determine the " + "subvolume to which it hashes", + .op_version = {GD_OP_VERSION_3_6_0}, + }, + + {.key = {"rebal-throttle"}, + .type = GF_OPTION_TYPE_STR, + .default_value = "normal", + .description = " Sets the maximum number of parallel file migrations " + "allowed on a node during the rebalance operation. The" + " default value is normal and allows a max of " + "[($(processing units) - 4) / 2), 2] files to be " + "migrated at a time. Lazy will allow only one file to " + "be migrated at a time and aggressive will allow " + "max of [($(processing units) - 4) / 2), 4]", + .op_version = {GD_OP_VERSION_3_7_0}, + .level = OPT_STATUS_BASIC, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC + + }, + + {.key = {"lock-migration"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = " If enabled this feature will migrate the posix locks" + " associated with a file during rebalance", + .op_version = {GD_OP_VERSION_3_8_0}, + .level = OPT_STATUS_ADVANCED, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC}, + + {.key = {"force-migration"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "If disabled, rebalance will not migrate files that " + "are being written to by an application", + .op_version = {GD_OP_VERSION_4_0_0}, + .level = OPT_STATUS_ADVANCED, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC}, + + {.key = {NULL}}, +}; + +#define NUM_DHT_OPTIONS (sizeof(dht_options) / sizeof(dht_options[0])) + +extern struct volume_options options[NUM_DHT_OPTIONS] + __attribute__((alias("dht_options"))); |
