/* Copyright (c) 2008-2012 Red Hat, Inc.
   This file is part of GlusterFS.

   This file is licensed to you under your choice of the GNU Lesser
   General Public License, version 3 or any later version (LGPLv3 or
   later), or the GNU General Public License, version 2 (GPLv2), in all
   cases as published by the Free Software Foundation.
*/

#include <libgen.h>
#include <unistd.h>
#include <fnmatch.h>
#include <sys/time.h>
#include <stdlib.h>
#include <signal.h>

#include "afr-common.c"
#include "afr-messages.h"

struct volume_options options[];

static char *afr_favorite_child_policies[AFR_FAV_CHILD_POLICY_MAX + 1] = {
    [AFR_FAV_CHILD_NONE] = "none",
    [AFR_FAV_CHILD_BY_SIZE] = "size",
    [AFR_FAV_CHILD_BY_CTIME] = "ctime",
    [AFR_FAV_CHILD_BY_MTIME] = "mtime",
    [AFR_FAV_CHILD_BY_MAJORITY] = "majority",
    [AFR_FAV_CHILD_POLICY_MAX] = NULL,
};

int32_t
notify(xlator_t *this, int32_t event, void *data, ...)
{
    int ret = -1;
    va_list ap;
    void *data2 = NULL;

    va_start(ap, data);
    data2 = va_arg(ap, dict_t *);
    va_end(ap);
    ret = afr_notify(this, event, data, data2);

    return ret;
}

int32_t
mem_acct_init(xlator_t *this)
{
    int ret = -1;

    if (!this)
        return ret;

    ret = xlator_mem_acct_init(this, gf_afr_mt_end + 1);

    if (ret != 0) {
        return ret;
    }

    return ret;
}

int
xlator_subvolume_index(xlator_t *this, xlator_t *subvol)
{
    int index = -1;
    int i = 0;
    xlator_list_t *list = NULL;

    list = this->children;
    while (list) {
        if (subvol == list->xlator ||
            strcmp(subvol->name, list->xlator->name) == 0) {
            index = i;
            break;
        }
        list = list->next;
        i++;
    }

    return index;
}

static void
fix_quorum_options(xlator_t *this, afr_private_t *priv, char *qtype,
                   dict_t *options)
{
    if (dict_get_sizen(options, "quorum-type") == NULL) {
        /* If user doesn't configure anything enable auto-quorum if the
         * replica has more than two subvolumes */
        if (priv->child_count > 2)
            qtype = "auto";
    }

    if (priv->quorum_count && strcmp(qtype, "fixed")) {
        gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_QUORUM_OVERRIDE,
               "quorum-type %s overriding quorum-count %u", qtype,
               priv->quorum_count);
    }

    if (!strcmp(qtype, "none")) {
        priv->quorum_count = 0;
    } else if (!strcmp(qtype, "auto")) {
        priv->quorum_count = AFR_QUORUM_AUTO;
    }
}

int
afr_set_favorite_child_policy(afr_private_t *priv, char *policy)
{
    int index = -1;

    index = gf_get_index_by_elem(afr_favorite_child_policies, policy);
    if (index < 0 || index >= AFR_FAV_CHILD_POLICY_MAX)
        return -1;

    priv->fav_child_policy = index;

    return 0;
}

static void
set_data_self_heal_algorithm(afr_private_t *priv, char *algo)
{
    if (!algo) {
        priv->data_self_heal_algorithm = AFR_SELFHEAL_DATA_DYNAMIC;
    } else if (strcmp(algo, "full") == 0) {
        priv->data_self_heal_algorithm = AFR_SELFHEAL_DATA_FULL;
    } else if (strcmp(algo, "diff") == 0) {
        priv->data_self_heal_algorithm = AFR_SELFHEAL_DATA_DIFF;
    } else {
        priv->data_self_heal_algorithm = AFR_SELFHEAL_DATA_DYNAMIC;
    }
}

int
reconfigure(xlator_t *this, dict_t *options)
{
    afr_private_t *priv = NULL;
    xlator_t *read_subvol = NULL;
    int read_subvol_index = -1;
    int timeout_old = 0;
    int ret = -1;
    int index = -1;
    char *qtype = NULL;
    char *fav_child_policy = NULL;
    char *data_self_heal = NULL;
    char *data_self_heal_algorithm = NULL;
    char *locking_scheme = NULL;
    gf_boolean_t consistent_io = _gf_false;
    gf_boolean_t choose_local_old = _gf_false;
    gf_boolean_t enabled_old = _gf_false;

    priv = this->private;

    GF_OPTION_RECONF("metadata-splitbrain-forced-heal",
                     priv->metadata_splitbrain_forced_heal, options, bool, out);

    GF_OPTION_RECONF("background-self-heal-count",
                     priv->background_self_heal_count, options, uint32, out);

    GF_OPTION_RECONF("heal-wait-queue-length", priv->heal_wait_qlen, options,
                     uint32, out);
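    /* Note: "data-self-heal" below is fetched as a string rather than a bool
     * because the option also accepts "open" in addition to the usual
     * boolean spellings (see its entry in the options table), so it is
     * converted with gf_string2boolean() after the fetch. */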
    GF_OPTION_RECONF("metadata-self-heal", priv->metadata_self_heal, options,
                     bool, out);

    GF_OPTION_RECONF("data-self-heal", data_self_heal, options, str, out);
    gf_string2boolean(data_self_heal, &priv->data_self_heal);

    GF_OPTION_RECONF("entry-self-heal", priv->entry_self_heal, options, bool,
                     out);

    GF_OPTION_RECONF("data-self-heal-window-size",
                     priv->data_self_heal_window_size, options, uint32, out);

    GF_OPTION_RECONF("data-self-heal-algorithm", data_self_heal_algorithm,
                     options, str, out);
    set_data_self_heal_algorithm(priv, data_self_heal_algorithm);

    GF_OPTION_RECONF("halo-enabled", priv->halo_enabled, options, bool, out);

    GF_OPTION_RECONF("halo-shd-max-latency", priv->shd.halo_max_latency_msec,
                     options, uint32, out);

    GF_OPTION_RECONF("halo-nfsd-max-latency", priv->nfsd.halo_max_latency_msec,
                     options, uint32, out);

    GF_OPTION_RECONF("halo-max-latency", priv->halo_max_latency_msec, options,
                     uint32, out);

    GF_OPTION_RECONF("halo-max-replicas", priv->halo_max_replicas, options,
                     uint32, out);

    GF_OPTION_RECONF("halo-min-replicas", priv->halo_min_replicas, options,
                     uint32, out);

    GF_OPTION_RECONF("read-subvolume", read_subvol, options, xlator, out);

    choose_local_old = priv->choose_local;
    GF_OPTION_RECONF("choose-local", priv->choose_local, options, bool, out);

    if (choose_local_old != priv->choose_local) {
        priv->read_child = -1;
        if (choose_local_old == _gf_false)
            priv->did_discovery = _gf_false;
    }

    GF_OPTION_RECONF("read-hash-mode", priv->hash_mode, options, uint32, out);

    if (read_subvol) {
        index = xlator_subvolume_index(this, read_subvol);
        if (index == -1) {
            gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_SUBVOL,
                   "%s not a subvolume", read_subvol->name);
            goto out;
        }
        priv->read_child = index;
    }

    GF_OPTION_RECONF("read-subvolume-index", read_subvol_index, options, int32,
                     out);

    if (read_subvol_index > -1) {
        index = read_subvol_index;
        if (index >= priv->child_count) {
            gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_SUBVOL,
                   "%d not a subvolume-index", index);
            goto out;
        }
        priv->read_child = index;
    }

    GF_OPTION_RECONF("pre-op-compat", priv->pre_op_compat, options, bool, out);
    GF_OPTION_RECONF("locking-scheme", locking_scheme, options, str, out);
    priv->granular_locks = (strcmp(locking_scheme, "granular") == 0);
    GF_OPTION_RECONF("full-lock", priv->full_lock, options, bool, out);
    GF_OPTION_RECONF("granular-entry-heal", priv->esh_granular, options, bool,
                     out);

    GF_OPTION_RECONF("eager-lock", priv->eager_lock, options, bool, out);
    GF_OPTION_RECONF("optimistic-change-log", priv->optimistic_change_log,
                     options, bool, out);
    GF_OPTION_RECONF("quorum-type", qtype, options, str, out);
    GF_OPTION_RECONF("quorum-count", priv->quorum_count, options, uint32, out);
    fix_quorum_options(this, priv, qtype, options);
    if (priv->quorum_count && !afr_has_quorum(priv->child_up, this, NULL))
        gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_QUORUM_FAIL,
               "Client-quorum is not met");

    GF_OPTION_RECONF("post-op-delay-secs", priv->post_op_delay_secs, options,
                     uint32, out);

    GF_OPTION_RECONF(AFR_SH_READDIR_SIZE_KEY, priv->sh_readdir_size, options,
                     size_uint64, out);
    /* Reset this so we re-discover in case the topology changed. */
    GF_OPTION_RECONF("ensure-durability", priv->ensure_durability, options,
                     bool, out);

    enabled_old = priv->shd.enabled;
    GF_OPTION_RECONF("self-heal-daemon", priv->shd.enabled, options, bool, out);

    GF_OPTION_RECONF("iam-self-heal-daemon", priv->shd.iamshd, options, bool,
                     out);

    timeout_old = priv->shd.timeout;
    GF_OPTION_RECONF("heal-timeout", priv->shd.timeout, options, int32, out);

    GF_OPTION_RECONF("consistent-metadata", priv->consistent_metadata, options,
                     bool, out);

    GF_OPTION_RECONF("shd-max-threads", priv->shd.max_threads, options, uint32,
                     out);

    GF_OPTION_RECONF("shd-wait-qlength", priv->shd.wait_qlength, options,
                     uint32, out);

    GF_OPTION_RECONF("favorite-child-policy", fav_child_policy, options, str,
                     out);
    if (afr_set_favorite_child_policy(priv, fav_child_policy) == -1)
        goto out;

    priv->did_discovery = _gf_false;

    GF_OPTION_RECONF("consistent-io", consistent_io, options, bool, out);
    if (priv->quorum_count != 0)
        consistent_io = _gf_false;
    priv->consistent_io = consistent_io;

    if (priv->shd.enabled) {
        if ((priv->shd.enabled != enabled_old) ||
            (timeout_old != priv->shd.timeout))
            afr_selfheal_childup(this, priv);
    }

    ret = 0;
out:
    return ret;
}

static int
afr_pending_xattrs_init(afr_private_t *priv, xlator_t *this)
{
    int ret = -1;
    int i = 0;
    char *ptr = NULL;
    char *ptr1 = NULL;
    char *xattrs_list = NULL;
    xlator_list_t *trav = NULL;
    int child_count = -1;

    trav = this->children;
    child_count = priv->child_count;
    if (priv->thin_arbiter_count) {
        /* priv->pending_key[THIN_ARBITER_BRICK_INDEX] is used as the
         * name of the thin arbiter file for persistence across add/
         * removal of DHT subvols.*/
        child_count++;
    }

    GF_OPTION_INIT("afr-pending-xattr", xattrs_list, str, out);
    priv->pending_key = GF_CALLOC(sizeof(*priv->pending_key), child_count,
                                  gf_afr_mt_char);
    if (!priv->pending_key) {
        ret = -ENOMEM;
        goto out;
    }
    if (!xattrs_list) {
        gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_NO_CHANGELOG,
               "Unable to fetch afr-pending-xattr option from volfile."
               " Falling back to using client translator names. ");

        while (i < child_count) {
            ret = gf_asprintf(&priv->pending_key[i], "%s.%s", AFR_XATTR_PREFIX,
                              trav->xlator->name);
            if (ret == -1) {
                ret = -ENOMEM;
                goto out;
            }
            trav = trav->next;
            i++;
        }
        ret = 0;
        goto out;
    }

    ptr = ptr1 = gf_strdup(xattrs_list);
    if (!ptr) {
        ret = -ENOMEM;
        goto out;
    }
    for (i = 0, ptr = strtok(ptr, ","); ptr; ptr = strtok(NULL, ",")) {
        ret = gf_asprintf(&priv->pending_key[i], "%s.%s", AFR_XATTR_PREFIX,
                          ptr);
        if (ret == -1) {
            ret = -ENOMEM;
            goto out;
        }
        i++;
    }
    ret = 0;

out:
    GF_FREE(ptr1);
    return ret;
}

void
afr_ta_init(afr_private_t *priv)
{
    priv->thin_arbiter_count = 1;
    priv->child_count--;
    priv->ta_child_up = 0;
    priv->ta_bad_child_index = AFR_CHILD_UNKNOWN;
    priv->ta_notify_dom_lock_offset = 0;
    priv->ta_in_mem_txn_count = 0;
    priv->ta_on_wire_txn_count = 0;
    priv->release_ta_notify_dom_lock = _gf_false;
    INIT_LIST_HEAD(&priv->ta_waitq);
    INIT_LIST_HEAD(&priv->ta_onwireq);
    gf_uuid_clear(priv->ta_gfid);
}

int32_t
init(xlator_t *this)
{
    afr_private_t *priv = NULL;
    int child_count = 0;
    xlator_list_t *trav = NULL;
    int i = 0;
    int ret = -1;
    GF_UNUSED int op_errno = 0;
    xlator_t *read_subvol = NULL;
    int read_subvol_index = -1;
    char *qtype = NULL;
    char *fav_child_policy = NULL;
    char *thin_arbiter = NULL;
    char *data_self_heal = NULL;
    char *locking_scheme = NULL;
    char *data_self_heal_algorithm = NULL;

    if (!this->children) {
        gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_CHILD_MISCONFIGURED,
               "replicate translator needs more than one "
               "subvolume defined.");
        return -1;
    }

    if (!this->parents) {
        gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_VOL_MISCONFIGURED,
               "Volume is dangling.");
    }

    this->private = GF_CALLOC(1, sizeof(afr_private_t),
                              gf_afr_mt_afr_private_t);
    if (!this->private)
        goto out;

    priv = this->private;
    INIT_LIST_HEAD(&priv->saved_locks);
    INIT_LIST_HEAD(&priv->lk_healq);
    LOCK_INIT(&priv->lock);

    child_count = xlator_subvolume_count(this);
    priv->child_count = child_count;

    priv->read_child = -1;

    GF_OPTION_INIT("arbiter-count", priv->arbiter_count, uint32, out);
    GF_OPTION_INIT("thin-arbiter", thin_arbiter, str, out);
    if (thin_arbiter && strlen(thin_arbiter) > 0) {
        afr_ta_init(priv);
    }
    INIT_LIST_HEAD(&priv->healing);
    INIT_LIST_HEAD(&priv->heal_waiting);
    priv->spb_choice_timeout = AFR_DEFAULT_SPB_CHOICE_TIMEOUT;

    GF_OPTION_INIT("afr-dirty-xattr", priv->afr_dirty, str, out);

    GF_OPTION_INIT("metadata-splitbrain-forced-heal",
                   priv->metadata_splitbrain_forced_heal, bool, out);

    GF_OPTION_INIT("read-subvolume", read_subvol, xlator, out);
    if (read_subvol) {
        priv->read_child = xlator_subvolume_index(this, read_subvol);
        if (priv->read_child == -1) {
            gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_SUBVOL,
                   "%s not a subvolume", read_subvol->name);
            goto out;
        }
    }

    GF_OPTION_INIT("read-subvolume-index", read_subvol_index, int32, out);
    if (read_subvol_index > -1) {
        if (read_subvol_index >= priv->child_count) {
            gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_SUBVOL,
                   "%d not a subvolume-index", read_subvol_index);
            goto out;
        }
        priv->read_child = read_subvol_index;
    }

    GF_OPTION_INIT("choose-local", priv->choose_local, bool, out);

    priv->pending_reads = GF_CALLOC(sizeof(*priv->pending_reads),
                                    priv->child_count, gf_afr_mt_atomic_t);

    GF_OPTION_INIT("read-hash-mode", priv->hash_mode, uint32, out);

    priv->favorite_child = -1;

    GF_OPTION_INIT("favorite-child-policy", fav_child_policy, str, out);
    if (afr_set_favorite_child_policy(priv, fav_child_policy) == -1)
        goto out;

    GF_OPTION_INIT("shd-max-threads", priv->shd.max_threads, uint32, out);

    GF_OPTION_INIT("shd-wait-qlength", priv->shd.wait_qlength, uint32, out);
    GF_OPTION_INIT("background-self-heal-count",
                   priv->background_self_heal_count, uint32, out);

    GF_OPTION_INIT("heal-wait-queue-length", priv->heal_wait_qlen, uint32,
                   out);

    GF_OPTION_INIT("data-self-heal", data_self_heal, str, out);
    gf_string2boolean(data_self_heal, &priv->data_self_heal);

    GF_OPTION_INIT("data-self-heal-algorithm", data_self_heal_algorithm, str,
                   out);
    set_data_self_heal_algorithm(priv, data_self_heal_algorithm);

    GF_OPTION_INIT("data-self-heal-window-size",
                   priv->data_self_heal_window_size, uint32, out);

    GF_OPTION_INIT("metadata-self-heal", priv->metadata_self_heal, bool, out);

    GF_OPTION_INIT("entry-self-heal", priv->entry_self_heal, bool, out);

    GF_OPTION_INIT("halo-shd-max-latency", priv->shd.halo_max_latency_msec,
                   uint32, out);

    GF_OPTION_INIT("halo-max-latency", priv->halo_max_latency_msec, uint32,
                   out);

    GF_OPTION_INIT("halo-max-replicas", priv->halo_max_replicas, uint32, out);

    GF_OPTION_INIT("halo-min-replicas", priv->halo_min_replicas, uint32, out);

    GF_OPTION_INIT("halo-enabled", priv->halo_enabled, bool, out);

    GF_OPTION_INIT("halo-nfsd-max-latency", priv->nfsd.halo_max_latency_msec,
                   uint32, out);

    GF_OPTION_INIT("iam-nfs-daemon", priv->nfsd.iamnfsd, bool, out);

    GF_OPTION_INIT("optimistic-change-log", priv->optimistic_change_log, bool,
                   out);

    GF_OPTION_INIT("pre-op-compat", priv->pre_op_compat, bool, out);
    GF_OPTION_INIT("locking-scheme", locking_scheme, str, out);
    priv->granular_locks = (strcmp(locking_scheme, "granular") == 0);
    GF_OPTION_INIT("full-lock", priv->full_lock, bool, out);
    GF_OPTION_INIT("granular-entry-heal", priv->esh_granular, bool, out);

    GF_OPTION_INIT("eager-lock", priv->eager_lock, bool, out);
    GF_OPTION_INIT("quorum-type", qtype, str, out);
    GF_OPTION_INIT("quorum-count", priv->quorum_count, uint32, out);
    GF_OPTION_INIT(AFR_SH_READDIR_SIZE_KEY, priv->sh_readdir_size, size_uint64,
                   out);
    fix_quorum_options(this, priv, qtype, this->options);

    GF_OPTION_INIT("post-op-delay-secs", priv->post_op_delay_secs, uint32, out);

    GF_OPTION_INIT("ensure-durability", priv->ensure_durability, bool, out);

    GF_OPTION_INIT("self-heal-daemon", priv->shd.enabled, bool, out);

    GF_OPTION_INIT("iam-self-heal-daemon", priv->shd.iamshd, bool, out);

    GF_OPTION_INIT("heal-timeout", priv->shd.timeout, int32, out);

    GF_OPTION_INIT("consistent-metadata", priv->consistent_metadata, bool, out);

    GF_OPTION_INIT("consistent-io", priv->consistent_io, bool, out);

    if (priv->quorum_count != 0)
        priv->consistent_io = _gf_false;

    priv->wait_count = 1;

    priv->local = GF_CALLOC(sizeof(unsigned char), child_count,
                            gf_afr_mt_char);
    if (!priv->local) {
        ret = -ENOMEM;
        goto out;
    }

    priv->child_up = GF_CALLOC(sizeof(unsigned char), child_count,
                               gf_afr_mt_char);

    priv->child_latency = GF_MALLOC(sizeof(*priv->child_latency) * child_count,
                                    gf_afr_mt_child_latency_t);

    if (!priv->child_up || !priv->child_latency) {
        ret = -ENOMEM;
        goto out;
    }

    /*Initialize to -ve ping timeout so that they are not considered
     * in child-up events until ping-event comes*/
    for (i = 0; i < child_count; i++)
        priv->child_latency[i] = -1;

    priv->children = GF_CALLOC(sizeof(xlator_t *), child_count,
                               gf_afr_mt_xlator_t);
    if (!priv->children) {
        ret = -ENOMEM;
        goto out;
    }

    ret = afr_pending_xattrs_init(priv, this);
    if (ret)
        goto out;

    trav = this->children;
    i = 0;
    while (i < child_count) {
        priv->children[i] = trav->xlator;
        trav = trav->next;
        i++;
    }

    ret = gf_asprintf(&priv->sh_domain, AFR_SH_DATA_DOMAIN_FMT, this->name);
    if (-1 == ret) {
        ret = -ENOMEM;
        goto out;
    }
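    /* priv->sh_domain above is the lock-domain name this replica uses for
     * data self-heal locking; it is built from this xlator instance's name
     * via AFR_SH_DATA_DOMAIN_FMT, so each replicate subvolume gets its own
     * domain. */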
    priv->last_event = GF_CALLOC(child_count, sizeof(*priv->last_event),
                                 gf_afr_mt_int32_t);
    if (!priv->last_event) {
        ret = -ENOMEM;
        goto out;
    }

    this->itable = inode_table_new(SHD_INODE_LRU_LIMIT, this);
    if (!this->itable) {
        ret = -ENOMEM;
        goto out;
    }

    if (priv->shd.iamshd) {
        ret = afr_selfheal_daemon_init(this);
        if (ret) {
            ret = -ENOMEM;
            goto out;
        }
    }

    /* keep more local here as we may need them for self-heal etc */
    this->local_pool = mem_pool_new(afr_local_t, 512);
    if (!this->local_pool) {
        ret = -1;
        goto out;
    }

    priv->root_inode = NULL;

    ret = 0;
out:
    return ret;
}

void
afr_destroy_healer_object(xlator_t *this, struct subvol_healer *healer)
{
    int ret = -1;

    if (!healer)
        return;

    if (healer->running) {
        /*
         * If there are any resources to cleanup, We need
         * to do that gracefully using pthread_cleanup_push
         */
        ret = gf_thread_cleanup_xint(healer->thread);
        if (ret)
            gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_SELF_HEAL_FAILED,
                   "Failed to clean up healer threads.");
        healer->thread = 0;
    }
    pthread_cond_destroy(&healer->cond);
    pthread_mutex_destroy(&healer->mutex);
}

void
afr_selfheal_daemon_fini(xlator_t *this)
{
    struct subvol_healer *healer = NULL;
    afr_self_heald_t *shd = NULL;
    afr_private_t *priv = NULL;
    int i = 0;

    priv = this->private;
    if (!priv)
        return;

    shd = &priv->shd;
    if (!shd->iamshd)
        return;

    for (i = 0; i < priv->child_count; i++) {
        healer = &shd->index_healers[i];
        afr_destroy_healer_object(this, healer);

        healer = &shd->full_healers[i];
        afr_destroy_healer_object(this, healer);

        if (shd->statistics[i])
            eh_destroy(shd->statistics[i]);
    }

    GF_FREE(shd->index_healers);
    GF_FREE(shd->full_healers);
    GF_FREE(shd->statistics);
    if (shd->split_brain)
        eh_destroy(shd->split_brain);
}

void
fini(xlator_t *this)
{
    afr_private_t *priv = NULL;

    priv = this->private;

    afr_selfheal_daemon_fini(this);

    GF_ASSERT(list_empty(&priv->saved_locks));

    LOCK(&priv->lock);
    if (priv->timer != NULL) {
        gf_timer_call_cancel(this->ctx, priv->timer);
        priv->timer = NULL;
    }
    UNLOCK(&priv->lock);

    if (this->local_pool != NULL) {
        mem_pool_destroy(this->local_pool);
        this->local_pool = NULL;
    }

    this->private = NULL;
    afr_priv_destroy(priv);
    if (this->itable) {
        inode_table_destroy(this->itable);
        this->itable = NULL;
    }

    return;
}

struct xlator_fops fops = {
    .lookup = afr_lookup,
    .lk = afr_lk,
    .flush = afr_flush,
    .statfs = afr_statfs,
    .fsyncdir = afr_fsyncdir,
    .inodelk = afr_inodelk,
    .finodelk = afr_finodelk,
    .entrylk = afr_entrylk,
    .fentrylk = afr_fentrylk,
    .ipc = afr_ipc,
    .lease = afr_lease,

    /* inode read */
    .access = afr_access,
    .stat = afr_stat,
    .fstat = afr_fstat,
    .readlink = afr_readlink,
    .getxattr = afr_getxattr,
    .fgetxattr = afr_fgetxattr,
    .readv = afr_readv,
    .seek = afr_seek,

    /* inode write */
    .writev = afr_writev,
    .truncate = afr_truncate,
    .ftruncate = afr_ftruncate,
    .setxattr = afr_setxattr,
    .fsetxattr = afr_fsetxattr,
    .setattr = afr_setattr,
    .fsetattr = afr_fsetattr,
    .removexattr = afr_removexattr,
    .fremovexattr = afr_fremovexattr,
    .fallocate = afr_fallocate,
    .discard = afr_discard,
    .zerofill = afr_zerofill,
    .xattrop = afr_xattrop,
    .fxattrop = afr_fxattrop,
    .fsync = afr_fsync,

    /* inode open */
    .opendir = afr_opendir,
    .open = afr_open,

    /* dir read */
    .readdir = afr_readdir,
    .readdirp = afr_readdirp,

    /* dir write */
    .create = afr_create,
    .mknod = afr_mknod,
    .mkdir = afr_mkdir,
    .unlink = afr_unlink,
    .rmdir = afr_rmdir,
    .link = afr_link,
    .symlink = afr_symlink,
    .rename = afr_rename,
};

struct xlator_dumpops dumpops = {
    .priv = afr_priv_dump,
};

struct xlator_cbks cbks = {
    .release = afr_release,
    .releasedir = afr_releasedir,
    .forget = afr_forget,
};
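/* Volume options understood by this translator. Most of these are exposed
 * through the GlusterFS CLI (typically under the cluster.* namespace, e.g.
 * "gluster volume set <volname> cluster.quorum-type auto"); the exact CLI
 * key mapping lives in glusterd's volume-set table, not in this file. */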
struct volume_options options[] = {
    {.key = {"read-subvolume"},
     .type = GF_OPTION_TYPE_XLATOR,
     .op_version = {1},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
     .tags = {"replicate"},
     .description = "inode-read fops happen only on one of the bricks in "
                    "replicate. Afr will prefer the one specified using "
                    "this option if it is not stale. Option value must be "
                    "one of the xlator names of the children. "
                    "Ex: <volname>-client-0 till "
                    "<volname>-client-<number-of-bricks - 1>"},
    {.key = {"read-subvolume-index"},
     .type = GF_OPTION_TYPE_INT,
     .default_value = "-1",
     .op_version = {2},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
     .tags = {"replicate"},
     .description = "inode-read fops happen only on one of the bricks in "
                    "replicate. AFR will prefer the one specified using "
                    "this option if it is not stale. Allowed values "
                    "include -1 till replica-count - 1"},
    {.key = {"read-hash-mode"},
     .type = GF_OPTION_TYPE_INT,
     .min = 0,
     .max = 5,
     .default_value = "1",
     .op_version = {2},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
     .tags = {"replicate"},
     .description =
         "inode-read fops happen only on one of the bricks in "
         "replicate. AFR will prefer the one computed using "
         "the method specified using this option.\n"
         "0 = first readable child of AFR, starting from 1st child.\n"
         "1 = hash by GFID of file (all clients use "
         "same subvolume).\n"
         "2 = hash by GFID of file and client PID.\n"
         "3 = brick having the least outstanding read requests.\n"
         "4 = brick having the least network ping latency.\n"
         "5 = Hybrid mode between 3 and 4, ie least value among "
         "network-latency multiplied by outstanding-read-requests."},
    {
        .key = {"choose-local"},
        .type = GF_OPTION_TYPE_BOOL,
        .default_value = "true",
        .op_version = {2},
        .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
        .tags = {"replicate"},
        .description = "Choose a local subvolume (i.e. Brick) to read from"
                       " if read-subvolume is not explicitly set.",
    },
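    /* See reconfigure() above: toggling choose-local resets priv->read_child,
     * and turning it on also clears priv->did_discovery so the local child is
     * discovered again. */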
    {.key = {"background-self-heal-count"},
     .type = GF_OPTION_TYPE_INT,
     .min = 0,
     .max = 256,
     .default_value = "8",
     .validate = GF_OPT_VALIDATE_MIN,
     .op_version = {1},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
     .tags = {"replicate"},
     .description = "This specifies the number of per client self-heal "
                    "jobs that can perform parallel heals in the "
                    "background."},
    {.key = {"halo-shd-max-latency"},
     .type = GF_OPTION_TYPE_INT,
     .min = 1,
     .max = 99999,
     .default_value = "99999",
     .op_version = {GD_OP_VERSION_3_11_0},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
     .tags = {"replicate", "halo"},
     .description = "Maximum latency for shd halo replication in msec."},
    {.key = {"halo-enabled"},
     .type = GF_OPTION_TYPE_BOOL,
     .default_value = "False",
     .op_version = {GD_OP_VERSION_3_11_0},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
     .tags = {"replicate", "halo"},
     .description = "Enable Halo (geo) replication mode."},
    {.key = {"halo-nfsd-max-latency"},
     .type = GF_OPTION_TYPE_INT,
     .min = 1,
     .max = 99999,
     .default_value = "5",
     .op_version = {GD_OP_VERSION_3_11_0},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
     .tags = {"replicate", "halo"},
     .description = "Maximum latency for nfsd halo replication in msec."},
    {.key = {"halo-max-latency"},
     .type = GF_OPTION_TYPE_INT,
     .min = 1,
     .max = AFR_HALO_MAX_LATENCY,
     .default_value = "5",
     .op_version = {GD_OP_VERSION_3_11_0},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
     .tags = {"replicate", "halo"},
     .description = "Maximum latency for halo replication in msec."},
    {.key = {"halo-max-replicas"},
     .type = GF_OPTION_TYPE_INT,
     .min = 1,
     .max = 99999,
     .default_value = "99999",
     .op_version = {GD_OP_VERSION_3_11_0},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
     .tags = {"replicate", "halo"},
     .description = "The maximum number of halo replicas; replicas "
                    "beyond this value will be written asynchronously "
                    "via the SHD."},
    {.key = {"halo-min-replicas"},
     .type = GF_OPTION_TYPE_INT,
     .min = 1,
     .max = 99999,
     .default_value = "2",
     .op_version = {GD_OP_VERSION_3_11_0},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
     .tags = {"replicate", "halo"},
     .description = "The minimum number of halo replicas, before adding "
                    "out of region replicas."},
    {.key = {"heal-wait-queue-length"},
     .type = GF_OPTION_TYPE_INT,
     .min = 0,
     .max = 10000, /* Around 100MB with sizeof(afr_local_t) = 10496 bytes */
     .default_value = "128",
     .validate = GF_OPT_VALIDATE_MIN,
     .op_version = {GD_OP_VERSION_3_7_10},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
     .tags = {"replicate"},
     .description = "This specifies the number of heals that can be queued "
                    "for the parallel background self heal jobs."},
    {.key = {"data-self-heal"},
     .type = GF_OPTION_TYPE_STR,
     .value = {"1", "on", "yes", "true", "enable", "0", "off", "no", "false",
               "disable", "open"},
     .default_value = "off",
     .op_version = {1},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
     .tags = {"replicate"},
     .description = "Using this option we can enable/disable data "
                    "self-heal on the file. \"open\" means data "
                    "self-heal action will only be triggered by file "
                    "open operations."},
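    /* The string parsed for "data-self-heal-algorithm" below feeds
     * set_data_self_heal_algorithm() above: an unset (or unrecognized) value
     * selects AFR_SELFHEAL_DATA_DYNAMIC, which chooses between "full" and
     * "diff" per file as described in the option text. */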
    {.key = {"data-self-heal-algorithm"},
     .type = GF_OPTION_TYPE_STR,
     .op_version = {1},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
     .tags = {"replicate"},
     .description = "Select between \"full\", \"diff\". The "
                    "\"full\" algorithm copies the entire file from "
                    "source to sink. The \"diff\" algorithm copies to "
                    "sink only those blocks whose checksums don't match "
                    "with those of source. If no option is configured "
                    "the option is chosen dynamically as follows: "
                    "If the file does not exist on one of the sinks "
                    "or empty file exists or if the source file size is "
                    "about the same as page size the entire file will "
                    "be read and written i.e \"full\" algo, "
                    "otherwise \"diff\" algo is chosen.",
     .value = {"diff", "full"}},
    {.key = {"data-self-heal-window-size"},
     .type = GF_OPTION_TYPE_INT,
     .min = 1,
     .max = 1024,
     .default_value = "1",
     .op_version = {1},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
     .tags = {"replicate"},
     .description = "Maximum number of blocks per file for which self-heal "
                    "process would be applied simultaneously."},
    {.key = {"metadata-self-heal"},
     .type = GF_OPTION_TYPE_BOOL,
     .default_value = "off",
     .op_version = {1},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
     .tags = {"replicate"},
     /*.validate_fn = validate_replica*/
     .description = "Using this option we can enable/disable metadata "
                    "i.e. Permissions, ownerships, xattrs self-heal on "
                    "the file/directory."},
    {.key = {"entry-self-heal"},
     .type = GF_OPTION_TYPE_BOOL,
     .default_value = "off",
     .op_version = {1},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
     .tags = {"replicate"},
     /*.validate_fn = validate_replica*/
     .description = "Using this option we can enable/disable entry "
                    "self-heal on the directory."},
    {.key = {"data-change-log"},
     .type = GF_OPTION_TYPE_BOOL,
     .default_value = "on",
     .op_version = {1},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
     .tags = {"replicate"},
     .description = "This option exists only for backward compatibility "
                    "and configuring it doesn't have any effect"},
    {.key = {"metadata-change-log"},
     .type = GF_OPTION_TYPE_BOOL,
     .default_value = "on",
     .op_version = {1},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
     .tags = {"replicate"},
     .description = "This option exists only for backward compatibility "
                    "and configuring it doesn't have any effect"},
    {.key = {"entry-change-log"},
     .type = GF_OPTION_TYPE_BOOL,
     .default_value = "on",
     .op_version = {1},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
     .tags = {"replicate"},
     .description = "This option exists only for backward compatibility "
                    "and configuring it doesn't have any effect"},
    {.key = {"optimistic-change-log"},
     .type = GF_OPTION_TYPE_BOOL,
     .default_value = "on",
     .description = "Entry/Metadata fops will not perform "
                    "pre fop changelog operations in afr transaction "
                    "if this option is enabled."},
    {.key = {"inodelk-trace"},
     .type = GF_OPTION_TYPE_BOOL,
     .default_value = "off",
     .description = "Enabling this option logs inode lock/unlocks"},
    {.key = {"entrylk-trace"},
     .type = GF_OPTION_TYPE_BOOL,
     .default_value = "off",
     .description = "Enabling this option logs entry lock/unlocks"},
    {.key = {"pre-op-compat"},
     .type = GF_OPTION_TYPE_BOOL,
     .default_value = "on",
     .description = "Use separate pre-op xattrop() FOP rather than "
                    "overloading xdata of the OP"},
    {.key = {"eager-lock"},
     .type = GF_OPTION_TYPE_BOOL,
     .default_value = "on",
     .op_version = {1},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
     .tags = {"replicate"},
     .description = "Enable/Disable eager lock for replica volume. "
                    "Lock phase of a transaction has two sub-phases. "
                    "First is an attempt to acquire locks in parallel by "
                    "broadcasting non-blocking lock requests. If lock "
                    "acquisition fails on any server, then the held locks "
                    "are unlocked and we revert to a blocking locks mode "
                    "sequentially on one server after another. If this "
                    "option is enabled the initial broadcasting lock "
                    "request attempts to acquire a full lock on the entire "
                    "file. "
                    "If this fails, we revert back to the sequential "
                    "\"regional\" blocking locks as before. In the case "
                    "where such an \"eager\" lock is granted in the "
                    "non-blocking phase, it gives rise to an opportunity "
                    "for optimization. i.e, if the next write transaction "
                    "on the same FD arrives before the unlock phase of "
                    "the first transaction, it \"takes over\" the full "
                    "file lock. Similarly if yet another data transaction "
                    "arrives before the unlock phase of the \"optimized\" "
                    "transaction, that in turn \"takes over\" the lock as "
                    "well. The actual unlock now happens at the end of "
                    "the last \"optimized\" transaction."},
    {.key = {"self-heal-daemon"},
     .type = GF_OPTION_TYPE_BOOL,
     .default_value = "on",
     .op_version = {1},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE,
     .tags = {"replicate"},
     /*.validate_fn = validate_replica_heal_enable_disable*/
     .description = "This option applies to only self-heal-daemon. "
                    "Index directory crawl and automatic healing of files "
                    "will not be performed if this option is turned off."},
    {.key = {"iam-self-heal-daemon"},
     .type = GF_OPTION_TYPE_BOOL,
     .default_value = "off",
     .description = "This option differentiates if the replicate "
                    "translator is running as part of self-heal-daemon "
                    "or not."},
    {.key = {"iam-nfs-daemon"},
     .type = GF_OPTION_TYPE_BOOL,
     .default_value = "off",
     .description = "This option differentiates if the replicate "
                    "translator is running as part of an NFS daemon "
                    "or not."},
    {
        .key = {"quorum-type"},
        .type = GF_OPTION_TYPE_STR,
        .value = {"none", "auto", "fixed"},
        .default_value = "none",
        .op_version = {1},
        .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
        .tags = {"replicate"},
        /*.option = quorum-type*/
        .description = "If value is \"fixed\" only allow writes if "
                       "quorum-count bricks are present. If value is "
                       "\"auto\" only allow writes if more than half of "
                       "bricks, or exactly half including the first, are "
                       "present.",
    },
    {
        .key = {"quorum-count"},
        .type = GF_OPTION_TYPE_INT,
        .min = 1,
        .max = INT_MAX,
        .default_value = 0,
        .op_version = {1},
        .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
        .tags = {"replicate"},
        /*.option = quorum-count*/
        /*.validate_fn = validate_quorum_count*/
        .description = "If quorum-type is \"fixed\" only allow writes if "
                       "this many bricks are present. Other quorum types "
                       "will OVERWRITE this value.",
    },
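    /* Illustrative examples of the client-quorum settings above (not from
     * this file): with quorum-type=auto on a replica 3 volume, writes need
     * any 2 of the 3 bricks to be up; on a replica 2 volume they need both
     * bricks, or just the first brick, per the "exactly half including the
     * first" rule. With quorum-type=fixed and quorum-count=2, exactly 2
     * bricks must be up regardless of replica size. */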
    {
        .key = {"quorum-reads"},
        .type = GF_OPTION_TYPE_BOOL,
        .default_value = "no",
        .op_version = {GD_OP_VERSION_3_7_0},
        .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
        .tags = {"replicate"},
        .description = "This option has been removed. Reads are not allowed "
                       "if quorum is not met.",
    },
    {
        .key = {"node-uuid"},
        .type = GF_OPTION_TYPE_STR,
        .description = "Local glusterd uuid string, used in starting "
                       "self-heal-daemon so that it can crawl only on "
                       "local index directories.",
    },
    {
        .key = {"post-op-delay-secs"},
        .type = GF_OPTION_TYPE_INT,
        .min = 0,
        .max = INT_MAX,
        .default_value = "1",
        .op_version = {2},
        .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
        .tags = {"replicate"},
        .description = "Time interval induced artificially before "
                       "post-operation phase of the transaction to "
                       "enhance overlap of adjacent write operations.",
    },
    {
        .key = {AFR_SH_READDIR_SIZE_KEY},
        .type = GF_OPTION_TYPE_SIZET,
        .description = "readdirp size for performing entry self-heal",
        .min = 1024,
        .max = 131072,
        .op_version = {2},
        .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE,
        .tags = {"replicate"},
        .default_value = "1KB",
    },
    {
        .key = {"ensure-durability"},
        .type = GF_OPTION_TYPE_BOOL,
        .op_version = {3},
        .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
        .tags = {"replicate"},
        .description = "Afr performs fsyncs for transactions if this "
                       "option is on to make sure the changelogs/data is "
                       "written to the disk",
        .default_value = "on",
    },
    {
        .key = {"afr-dirty-xattr"},
        .type = GF_OPTION_TYPE_STR,
        .default_value = AFR_DIRTY_DEFAULT,
    },
    {.key = {"afr-pending-xattr"},
     .type = GF_OPTION_TYPE_STR,
     .description = "Comma separated list of xattrs that are used to "
                    "capture information on pending heals."},
    {
        .key = {"metadata-splitbrain-forced-heal"},
        .type = GF_OPTION_TYPE_BOOL,
        .default_value = "off",
    },
    {.key = {"heal-timeout"},
     .type = GF_OPTION_TYPE_INT,
     .min = 5,
     .max = INT_MAX,
     .default_value = "600",
     .op_version = {2},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
     .tags = {"replicate"},
     .description = "time interval for checking the need to self-heal "
                    "in self-heal-daemon"},
    {
        .key = {"consistent-metadata"},
        .type = GF_OPTION_TYPE_BOOL,
        .default_value = "no",
        .op_version = {GD_OP_VERSION_3_7_0},
        .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
        .tags = {"replicate"},
        .description = "If this option is enabled, readdirp will force "
                       "lookups on those entries read whose read child is "
                       "not the same as that of the parent. This will "
                       "guarantee that all read operations on a file serve "
                       "attributes from the same subvol as long as it holds "
                       "a good copy of the file/dir.",
    },
    {.key = {"arbiter-count"},
     .type = GF_OPTION_TYPE_INT,
     .description = "subset of child_count. Has to be 0 or 1."},
    {
        .key = {"thin-arbiter"},
        .type = GF_OPTION_TYPE_STR,
        .op_version = {GD_OP_VERSION_4_1_0},
        .flags = OPT_FLAG_SETTABLE,
        .tags = {"replicate"},
        .description = "contains host:path of thin arbiter brick",
    },
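    /* Hypothetical example of a thin-arbiter value: "ta-host:/bricks/ta".
     * When this option is non-empty, init() calls afr_ta_init(), which
     * excludes the thin-arbiter brick from priv->child_count and tracks it
     * separately (see afr_ta_init() above). */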
    {.key = {"shd-max-threads"},
     .type = GF_OPTION_TYPE_INT,
     .min = 1,
     .max = 64,
     .default_value = "1",
     .op_version = {GD_OP_VERSION_3_7_12},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
     .tags = {"replicate"},
     .description = "Maximum number of parallel heals SHD can do per "
                    "local brick. This can substantially lower heal times, "
                    "but can also crush your bricks if you don't have "
                    "the storage hardware to support this."},
    {
        .key = {"shd-wait-qlength"},
        .type = GF_OPTION_TYPE_INT,
        .min = 1,
        .max = 655536,
        .default_value = "1024",
        .op_version = {GD_OP_VERSION_3_7_12},
        .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
        .tags = {"replicate"},
        .description = "This option can be used to control the number of "
                       "heals that can wait in SHD per subvolume",
    },
    {
        .key = {"locking-scheme"},
        .type = GF_OPTION_TYPE_STR,
        .value = {"full", "granular"},
        .default_value = "full",
        .op_version = {GD_OP_VERSION_3_7_12},
        .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
        .tags = {"replicate"},
        .description = "If this option is set to granular, self-heal will "
                       "stop being compatible with afr-v1, which helps afr "
                       "be more granular while self-healing",
    },
    {.key = {"full-lock"},
     .type = GF_OPTION_TYPE_BOOL,
     .default_value = "yes",
     .op_version = {GD_OP_VERSION_3_13_2},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE,
     .tags = {"replicate"},
     .description = "If this option is disabled, then the IOs will take "
                    "range locks same as versions till 3.13.1."},
    {
        .key = {"granular-entry-heal"},
        .type = GF_OPTION_TYPE_BOOL,
        .default_value = "no",
        .op_version = {GD_OP_VERSION_3_8_0},
        .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
        .tags = {"replicate"},
        .description = "If this option is enabled, self-heal will resort to "
                       "granular way of recording changelogs and doing entry "
                       "self-heal.",
    },
    {
        .key = {"favorite-child-policy"},
        .type = GF_OPTION_TYPE_STR,
        .value = {"none", "size", "ctime", "mtime", "majority"},
        .default_value = "none",
        .op_version = {GD_OP_VERSION_3_7_12},
        .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
        .tags = {"replicate"},
        .description = "This option can be used to automatically resolve "
                       "split-brains using various policies without user "
                       "intervention. \"size\" picks the file with the "
                       "biggest size as the source. \"ctime\" and \"mtime\" "
                       "pick the file with the latest ctime and mtime "
                       "respectively as the source. \"majority\" picks a file"
                       " with identical mtime and size in more than half the "
                       "number of bricks in the replica.",
    },
    {
        .key = {"consistent-io"},
        .type = GF_OPTION_TYPE_BOOL,
        .default_value = "no",
        .description = "If this option is enabled, i/o will fail even if "
                       "one of the bricks is down in the replicas",
    },
    {.key = {"use-compound-fops"},
     .type = GF_OPTION_TYPE_BOOL,
     .default_value = "no",
     .op_version = {GD_OP_VERSION_3_8_4},
     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
     .tags = {"replicate"},
     .description = "This option exists only for backward compatibility "
                    "and configuring it doesn't have any effect"},
    {.key = {NULL}},
};

xlator_api_t xlator_api = {
    .init = init,
    .fini = fini,
    .notify = notify,
    .reconfigure = reconfigure,
    .mem_acct_init = mem_acct_init,
    .op_version = {1}, /* Present from the initial version */
    .dumpops = &dumpops,
    .fops = &fops,
    .cbks = &cbks,
    .options = options,
    .identifier = "replicate",
    .category = GF_MAINTAINED,
};