diff options
Diffstat (limited to 'xlators/cluster/dht/src')
23 files changed, 29906 insertions, 13624 deletions
diff --git a/xlators/cluster/dht/src/Makefile.am b/xlators/cluster/dht/src/Makefile.am index 3fc29bf8154..56f1f2ad7c8 100644 --- a/xlators/cluster/dht/src/Makefile.am +++ b/xlators/cluster/dht/src/Makefile.am @@ -1,32 +1,37 @@ xlator_LTLIBRARIES = dht.la nufa.la switch.la + +AM_CFLAGS = -Wall $(GF_CFLAGS) + xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster dht_common_source = dht-layout.c dht-helper.c dht-linkfile.c dht-rebalance.c \ dht-selfheal.c dht-rename.c dht-hashfn.c dht-diskusage.c \ dht-common.c dht-inode-write.c dht-inode-read.c dht-shared.c \ - $(top_builddir)/xlators/lib/src/libxlator.c + dht-lock.c $(top_builddir)/xlators/lib/src/libxlator.c dht_la_SOURCES = $(dht_common_source) dht.c nufa_la_SOURCES = $(dht_common_source) nufa.c switch_la_SOURCES = $(dht_common_source) switch.c -dht_la_LDFLAGS = -module -avoid-version +dht_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS) dht_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la -nufa_la_LDFLAGS = -module -avoid-version +nufa_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS) nufa_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la -switch_la_LDFLAGS = -module -avoid-version +switch_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS) switch_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la -noinst_HEADERS = dht-common.h dht-mem-types.h \ - $(top_builddir)/xlators/lib/src/libxlator.h +noinst_HEADERS = dht-common.h dht-mem-types.h dht-messages.h \ + dht-lock.h $(top_builddir)/xlators/lib/src/libxlator.h AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ - -I$(top_srcdir)/xlators/lib/src - -AM_CFLAGS = -Wall $(GF_CFLAGS) + -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src \ + -I$(top_srcdir)/rpc/rpc-lib/src \ + -I$(top_srcdir)/xlators/lib/src \ + -DDATADIR=\"$(localstatedir)\" \ + -DLIBDIR=\"$(libdir)\" CLEANFILES = @@ -35,3 +40,9 @@ uninstall-local: install-data-hook: ln -sf dht.so $(DESTDIR)$(xlatordir)/distribute.so + +if UNITTEST +CLEANFILES += *.gcda *.gcno *_xunit.xml +noinst_PROGRAMS = +TESTS = +endif diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c index 3868fc38fd5..8ba0cc4c732 100644 --- a/xlators/cluster/dht/src/dht-common.c +++ b/xlators/cluster/dht/src/dht-common.c @@ -8,96 +8,439 @@ cases as published by the Free Software Foundation. */ - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - /* TODO: add NS locking */ -#include "glusterfs.h" -#include "xlator.h" #include "libxlator.h" #include "dht-common.h" -#include "defaults.h" -#include "byte-order.h" -#include "glusterfs-acl.h" +#include "dht-lock.h" +#include <glusterfs/byte-order.h> +#include <glusterfs/quota-common-utils.h> +#include <glusterfs/upcall-utils.h> +#include "glusterfs/compat-errno.h" // for ENODATA on BSD +#include <glusterfs/common-utils.h> #include <sys/time.h> #include <libgen.h> +#include <signal.h> -int -dht_aggregate (dict_t *this, char *key, data_t *value, void *data) +static int +dht_rmdir_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, gf_dirent_t *entries, + dict_t *xdata); + +static int +dht_link2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret); + +static int +dht_set_dir_xattr_req(xlator_t *this, loc_t *loc, dict_t *xattr_req); + +static int +dht_lookup_everywhere_done(call_frame_t *frame, xlator_t *this); + +static int +dht_common_mark_mdsxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata); + +static int +dht_rmdir_unlock(call_frame_t *frame, xlator_t *this); + +static const char *dht_dbg_vxattrs[] = {DHT_DBG_HASHED_SUBVOL_PATTERN, NULL}; + +/* Check the xdata to make sure EBADF has been set by client xlator */ +int32_t +dht_check_remote_fd_failed_error(dht_local_t *local, int op_ret, int op_errno) { - dict_t *dst = NULL; - int64_t *ptr = 0, *size = NULL; - int32_t ret = -1; - data_t *dict_data = NULL; + if (op_ret == -1 && (op_errno == EBADF || op_errno == EBADFD) && + !(local->fd_checked)) { + return 1; + } + return 0; +} - dst = data; +/* Sets the blocks and size values to fixed values. This is to be called + * only for dirs. The caller is responsible for checking the type + */ +int32_t +dht_set_fixed_dir_stat(struct iatt *stat) +{ + if (stat) { + stat->ia_blocks = DHT_DIR_STAT_BLOCKS; + stat->ia_size = DHT_DIR_STAT_SIZE; + return 0; + } + return -1; +} - if (strcmp (key, GF_XATTR_QUOTA_SIZE_KEY) == 0) { - ret = dict_get_bin (dst, key, (void **)&size); - if (ret < 0) { - size = GF_CALLOC (1, sizeof (int64_t), - gf_common_mt_char); - if (size == NULL) { - gf_log ("dht", GF_LOG_WARNING, - "memory allocation failed"); - return -1; - } - ret = dict_set_bin (dst, key, size, sizeof (int64_t)); - if (ret < 0) { - gf_log ("dht", GF_LOG_WARNING, - "dht aggregate dict set failed"); - GF_FREE (size); - return -1; - } - } +/* Return true if key exists in array + */ +static gf_boolean_t +dht_match_xattr(const char *key) +{ + char **xattrs_to_heal = get_xattrs_to_heal(); - ptr = data_to_bin (value); - if (ptr == NULL) { - gf_log ("dht", GF_LOG_WARNING, "data to bin failed"); - return -1; - } + return gf_get_index_by_elem(xattrs_to_heal, (char *)key) >= 0; +} + +static int +dht_aggregate_quota_xattr(dict_t *dst, char *key, data_t *value) +{ + int ret = -1; + quota_meta_t *meta_dst = NULL; + quota_meta_t *meta_src = NULL; + int64_t *size = NULL; + int64_t dst_dir_count = 0; + int64_t src_dir_count = 0; + + if (value == NULL) { + gf_msg("dht", GF_LOG_WARNING, 0, DHT_MSG_DATA_NULL, + "data value is NULL"); + ret = -1; + goto out; + } + + ret = dict_get_bin(dst, key, (void **)&meta_dst); + if (ret < 0) { + meta_dst = GF_CALLOC(1, sizeof(quota_meta_t), gf_common_quota_meta_t); + if (meta_dst == NULL) { + gf_msg("dht", GF_LOG_WARNING, ENOMEM, DHT_MSG_NO_MEMORY, + "Memory allocation failed"); + ret = -1; + goto out; + } + ret = dict_set_bin(dst, key, meta_dst, sizeof(quota_meta_t)); + if (ret < 0) { + gf_msg("dht", GF_LOG_WARNING, EINVAL, DHT_MSG_DICT_SET_FAILED, + "dht aggregate dict set failed"); + GF_FREE(meta_dst); + ret = -1; + goto out; + } + } + + if (value->len > sizeof(int64_t)) { + meta_src = data_to_bin(value); + + meta_dst->size = hton64(ntoh64(meta_dst->size) + + ntoh64(meta_src->size)); + meta_dst->file_count = hton64(ntoh64(meta_dst->file_count) + + ntoh64(meta_src->file_count)); - *size = hton64 (ntoh64 (*size) + ntoh64 (*ptr)); + if (value->len > (2 * sizeof(int64_t))) { + dst_dir_count = ntoh64(meta_dst->dir_count); + src_dir_count = ntoh64(meta_src->dir_count); - } else if (fnmatch (GF_XATTR_STIME_PATTERN, key, FNM_NOESCAPE) == 0) { - ret = gf_get_min_stime (THIS, dst, key, value); - if (ret < 0) - return ret; + if (src_dir_count > dst_dir_count) + meta_dst->dir_count = meta_src->dir_count; } else { - /* compare user xattrs only */ - if (!strncmp (key, "user.", strlen ("user."))) { - ret = dict_lookup (dst, key, &dict_data); - if (!ret && dict_data && value) { - ret = is_data_equal (dict_data, value); - if (!ret) - gf_log ("dht", GF_LOG_DEBUG, - "xattr mismatch for %s", key); - } - } - ret = dict_set (dst, key, value); - if (ret) - gf_log ("dht", GF_LOG_WARNING, "xattr dict set failed"); + meta_dst->dir_count = 0; } + } else { + size = data_to_bin(value); + meta_dst->size = hton64(ntoh64(meta_dst->size) + ntoh64(*size)); + } - return 0; + ret = 0; +out: + return ret; } +static int +add_opt(char **optsp, const char *opt) +{ + char *newopts = NULL; + unsigned oldsize = 0; + unsigned newsize = 0; + + if (*optsp == NULL) + newopts = gf_strdup(opt); + else { + oldsize = strlen(*optsp); + newsize = oldsize + 1 + strlen(opt) + 1; + newopts = GF_REALLOC(*optsp, newsize); + if (newopts) + sprintf(newopts + oldsize, ",%s", opt); + } + if (newopts == NULL) { + gf_msg("dht", GF_LOG_WARNING, 0, DHT_MSG_NO_MEMORY, + "Error to add choices in buffer in add_opt"); + return -1; + } + *optsp = newopts; + return 0; +} -void -dht_aggregate_xattr (dict_t *dst, dict_t *src) +/* Return Choice list from Split brain status */ +static char * +getChoices(const char *value) { - if ((dst == NULL) || (src == NULL)) { + int i = 0; + char *ptr = NULL; + char *tok = NULL; + char *result = NULL; + char *newval = NULL; + + ptr = strstr(value, "Choices:"); + if (!ptr) { + result = ptr; + goto out; + } + + newval = gf_strdup(ptr); + if (!newval) { + result = newval; + goto out; + } + + tok = strtok(newval, ":"); + if (!tok) { + result = tok; + goto out; + } + + while (tok) { + i++; + if (i == 2) + break; + tok = strtok(NULL, ":"); + } + + result = gf_strdup(tok); + +out: + if (newval) + GF_FREE(newval); + + return result; +} + +/* This function prepare a list of choices for key + (replica.split-brain-status) in case of metadata split brain + only on the basis of key-value passed to this function. + After prepare the list of choices it update the same key in dict + with this value to reflect the same in + replica.split-brain-status attr for file. + +*/ + +static int +dht_aggregate_split_brain_xattr(dict_t *dst, char *key, data_t *value) +{ + int ret = 0; + char *oldvalue = NULL; + char *old_choice = NULL; + char *new_choice = NULL; + char *full_choice = NULL; + char *status = NULL; + + if (value == NULL) { + gf_msg("dht", GF_LOG_WARNING, 0, DHT_MSG_DATA_NULL, + "GF_AFR_SBRAIN_STATUS value is NULL"); + ret = -1; + goto out; + } + + ret = dict_get_str(dst, key, &oldvalue); + if (ret) + goto out; + + /* skip code that is irrelevant if !oldvalue */ + if (!oldvalue) + goto out; + + if (strstr(oldvalue, "not")) { + gf_msg_debug("dht", 0, "Need to update split-brain status in dict"); + ret = -1; + goto out; + } + if (strstr(oldvalue, "metadata-split-brain:yes") && + (strstr(oldvalue, "data-split-brain:no"))) { + if (strstr(value->data, "not")) { + gf_msg_debug("dht", 0, "No need to update split-brain status"); + ret = 0; + goto out; + } + if (strstr(value->data, "yes") && + (strncmp(oldvalue, value->data, strlen(oldvalue)))) { + old_choice = getChoices(oldvalue); + if (!old_choice) { + gf_msg("dht", GF_LOG_WARNING, 0, DHT_MSG_NO_MEMORY, + "Error to get choices"); + ret = -1; + goto out; + } + + ret = add_opt(&full_choice, old_choice); + if (ret) { + gf_msg("dht", GF_LOG_WARNING, 0, DHT_MSG_NO_MEMORY, + "Error to add choices"); + ret = -1; + goto out; + } + + new_choice = getChoices(value->data); + if (!new_choice) { + gf_msg("dht", GF_LOG_WARNING, 0, DHT_MSG_NO_MEMORY, + "Error to get choices"); + ret = -1; goto out; + } + + ret = add_opt(&full_choice, new_choice); + if (ret) { + gf_msg("dht", GF_LOG_WARNING, 0, DHT_MSG_NO_MEMORY, + "Error to add choices "); + ret = -1; + goto out; + } + ret = gf_asprintf(&status, + "data-split-brain:%s " + "metadata-split-brain:%s Choices:%s", + "no", "yes", full_choice); + + if (-1 == ret) { + gf_msg("dht", GF_LOG_WARNING, 0, DHT_MSG_NO_MEMORY, + "Error to prepare status "); + goto out; + } + ret = dict_set_dynstr(dst, key, status); + if (ret) { + gf_msg("dht", GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED, + "Failed to set full choice"); + } } + } - dict_foreach (src, dht_aggregate, dst); out: - return; + if (old_choice) + GF_FREE(old_choice); + if (new_choice) + GF_FREE(new_choice); + if (full_choice) + GF_FREE(full_choice); + + return ret; +} + +static int +dht_aggregate(dict_t *this, char *key, data_t *value, void *data) +{ + dict_t *dst = NULL; + int32_t ret = -1; + data_t *dict_data = NULL; + + dst = data; + + /* compare split brain xattr only */ + if (strcmp(key, GF_AFR_SBRAIN_STATUS) == 0) { + ret = dht_aggregate_split_brain_xattr(dst, key, value); + if (!ret) + goto out; + } else if (strcmp(key, QUOTA_SIZE_KEY) == 0) { + ret = dht_aggregate_quota_xattr(dst, key, value); + if (ret) { + gf_msg("dht", GF_LOG_WARNING, 0, + DHT_MSG_AGGREGATE_QUOTA_XATTR_FAILED, + "Failed to aggregate quota xattr"); + } + goto out; + } else if (fnmatch(GF_XATTR_STIME_PATTERN, key, FNM_NOESCAPE) == 0) { + ret = gf_get_min_stime(THIS, dst, key, value); + goto out; + } else { + /* compare user xattrs only */ + if (!strncmp(key, "user.", SLEN("user."))) { + ret = dict_lookup(dst, key, &dict_data); + if (!ret && dict_data && value) { + ret = is_data_equal(dict_data, value); + if (!ret) + gf_msg_debug("dht", 0, "xattr mismatch for %s", key); + } + } + } + + ret = dict_set(dst, key, value); + if (ret) { + gf_msg("dht", GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED, + "Failed to set dictionary value: key = %s", key); + } + +out: + return ret; +} + +static void +dht_aggregate_xattr(dict_t *dst, dict_t *src) +{ + if ((dst == NULL) || (src == NULL)) { + goto out; + } + + dict_foreach(src, dht_aggregate, dst); +out: + return; +} + +/* Code to save hashed subvol on inode ctx as a mds subvol + */ +int +dht_inode_ctx_mdsvol_set(inode_t *inode, xlator_t *this, xlator_t *mds_subvol) +{ + dht_inode_ctx_t *ctx = NULL; + int ret = -1; + uint64_t ctx_int = 0; + gf_boolean_t ctx_free = _gf_false; + + LOCK(&inode->lock); + { + ret = __inode_ctx_get(inode, this, &ctx_int); + if (ctx_int) { + ctx = (dht_inode_ctx_t *)(uintptr_t)ctx_int; + ctx->mds_subvol = mds_subvol; + } else { + ctx = GF_CALLOC(1, sizeof(*ctx), gf_dht_mt_inode_ctx_t); + if (!ctx) + goto unlock; + ctx->mds_subvol = mds_subvol; + ctx_free = _gf_true; + ctx_int = (long)ctx; + ret = __inode_ctx_set(inode, this, &ctx_int); + } + } +unlock: + UNLOCK(&inode->lock); + if (ret && ctx_free) + GF_FREE(ctx); + return ret; +} + +/*Code to get mds subvol from inode ctx */ + +int +dht_inode_ctx_mdsvol_get(inode_t *inode, xlator_t *this, xlator_t **mdsvol) +{ + dht_inode_ctx_t *ctx = NULL; + int ret = -1; + + if (!mdsvol) + return ret; + + if (__is_root_gfid(inode->gfid)) { + (*mdsvol) = FIRST_CHILD(this); + return 0; + } + + ret = dht_inode_ctx_get(inode, this, &ctx); + + if (!ret && ctx) { + if (ctx->mds_subvol) { + *mdsvol = ctx->mds_subvol; + ret = 0; + } else { + ret = -1; + } + } + + return ret; } /* TODO: @@ -107,5234 +450,10942 @@ out: - complete linkfile selfheal */ - -int -dht_lookup_selfheal_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, - int op_ret, int op_errno, dict_t *xdata) +static int +dht_lookup_selfheal_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) { - dht_local_t *local = NULL; - dht_layout_t *layout = NULL; - int ret = -1; + dht_local_t *local = NULL; + dht_layout_t *layout = NULL; + dht_conf_t *conf = NULL; + int ret = -1; - GF_VALIDATE_OR_GOTO ("dht", frame, out); - GF_VALIDATE_OR_GOTO ("dht", this, out); - GF_VALIDATE_OR_GOTO ("dht", frame->local, out); + GF_VALIDATE_OR_GOTO("dht", frame, out); + GF_VALIDATE_OR_GOTO("dht", this, out); + GF_VALIDATE_OR_GOTO("dht", frame->local, out); - local = frame->local; - ret = op_ret; + local = frame->local; + conf = this->private; + ret = op_ret; - FRAME_SU_UNDO (frame, dht_local_t); + FRAME_SU_UNDO(frame, dht_local_t); - if (ret == 0) { - layout = local->selfheal.layout; - ret = dht_layout_set (this, local->inode, layout); - } + if (ret == 0) { + layout = local->selfheal.layout; + ret = dht_layout_set(this, local->inode, layout); + } - if (local->loc.parent) { - dht_inode_ctx_time_update (local->loc.parent, this, - &local->postparent, 1); - } + dht_inode_ctx_time_update(local->inode, this, &local->stbuf, 1); + if (local->loc.parent) { + dht_inode_ctx_time_update(local->loc.parent, this, &local->postparent, + 1); + } - DHT_STRIP_PHASE1_FLAGS (&local->stbuf); + DHT_STRIP_PHASE1_FLAGS(&local->stbuf); + dht_set_fixed_dir_stat(&local->postparent); + /* Delete mds xattr at the time of STACK UNWIND */ + GF_REMOVE_INTERNAL_XATTR(conf->mds_xattr_key, local->xattr); - DHT_STACK_UNWIND (lookup, frame, ret, local->op_errno, local->inode, - &local->stbuf, local->xattr, &local->postparent); + DHT_STACK_UNWIND(lookup, frame, ret, local->op_errno, local->inode, + &local->stbuf, local->xattr, &local->postparent); out: - return ret; + return ret; } - -int -dht_discover_complete (xlator_t *this, call_frame_t *discover_frame) +static int +dht_discover_complete(xlator_t *this, call_frame_t *discover_frame) { - dht_local_t *local = NULL; - call_frame_t *main_frame = NULL; - int op_errno = 0; - int ret = -1; - dht_layout_t *layout = NULL; - dht_conf_t *conf = NULL; + dht_local_t *local = NULL; + dht_local_t *heal_local = NULL; + call_frame_t *main_frame = NULL; + call_frame_t *heal_frame = NULL; + int op_errno = 0; + int ret = -1; + dht_layout_t *layout = NULL; + dht_conf_t *conf = NULL; + uint32_t vol_commit_hash = 0; + xlator_t *source = NULL; + int heal_path = 0; + int error_while_marking_mds = 0; + int i = 0; + loc_t loc = {0}; + int8_t is_read_only = 0, layout_anomalies = 0; + char gfid_local[GF_UUID_BUF_SIZE] = {0}; + + local = discover_frame->local; + layout = local->layout; + conf = this->private; + gf_uuid_unparse(local->gfid, gfid_local); + + LOCK(&discover_frame->lock); + { + main_frame = local->main_frame; + local->main_frame = NULL; + } + UNLOCK(&discover_frame->lock); - local = discover_frame->local; - layout = local->layout; - conf = this->private; + if (!main_frame) + return 0; - LOCK(&discover_frame->lock); - { - main_frame = local->main_frame; - local->main_frame = NULL; + /* Code to update all extended attributed from + subvol to local->xattr on that internal xattr has found + */ + if (conf->subvolume_cnt == 1) + local->need_xattr_heal = 0; + if (local->need_xattr_heal && (local->mds_xattr)) { + dht_dir_set_heal_xattr(this, local, local->xattr, local->mds_xattr, + NULL, NULL); + dict_unref(local->mds_xattr); + local->mds_xattr = NULL; + } + + ret = dict_get_int8(local->xattr_req, QUOTA_READ_ONLY_KEY, &is_read_only); + if (ret < 0) + gf_msg_debug(this->name, 0, "key = %s not present in dict", + QUOTA_READ_ONLY_KEY); + + if (local->file_count && local->dir_count) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_FILE_TYPE_MISMATCH, + "path %s exists as a file on one subvolume " + "and directory on another. " + "Please fix it manually", + local->loc.path); + op_errno = EIO; + goto out; + } + + if (local->cached_subvol) { + ret = dht_layout_preset(this, local->cached_subvol, local->inode); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_LAYOUT_SET_FAILED, + "failed to set layout for subvolume %s", + local->cached_subvol ? local->cached_subvol->name : "<nil>"); + op_errno = EINVAL; + goto out; + } + } else { + ret = dht_layout_normalize(this, &local->loc, layout); + if ((ret < 0) || ((ret > 0) && (local->op_ret != 0))) { + /* either the layout is incorrect or the directory is + * not found even in one subvolume. + */ + gf_msg_debug(this->name, 0, + "normalizing failed on %s " + "(overlaps/holes present: %s, " + "ENOENT errors: %d)", + local->loc.path, (ret < 0) ? "yes" : "no", + (ret > 0) ? ret : 0); + layout_anomalies = 1; + } else if (local->inode) { + dht_layout_set(this, local->inode, layout); + } + } + + if (!conf->vch_forced) { + ret = dict_get_uint32(local->xattr, conf->commithash_xattr_name, + &vol_commit_hash); + if (ret == 0) { + conf->vol_commit_hash = vol_commit_hash; } - UNLOCK(&discover_frame->lock); + } - if (!main_frame) + if (IA_ISDIR(local->stbuf.ia_type) && !is_read_only) { + for (i = 0; i < layout->cnt; i++) { + if (!source && !layout->list[i].err) + source = layout->list[i].xlator; + if (layout->list[i].err == ENOENT || + layout->list[i].err == ESTALE) { + heal_path = 1; + } + + if (source && heal_path) + break; + } + } + + if (IA_ISDIR(local->stbuf.ia_type)) { + /* Call function to save hashed subvol on inode ctx if + internal mds xattr is not present and all subvols are up + */ + if (!local->op_ret && !__is_root_gfid(local->stbuf.ia_gfid)) + (void)dht_common_mark_mdsxattr(discover_frame, + &error_while_marking_mds, 1); + + if (local->need_xattr_heal && !heal_path) { + local->need_xattr_heal = 0; + ret = dht_dir_xattr_heal(this, local, &op_errno); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, + DHT_MSG_DIR_XATTR_HEAL_FAILED, + "xattr heal failed for " + "directory gfid is %s ", + gfid_local); + } + } + } + + if (source && (heal_path || layout_anomalies || error_while_marking_mds)) { + gf_uuid_copy(loc.gfid, local->gfid); + if (gf_uuid_is_null(loc.gfid)) { + goto done; + } + + if (local->inode) + loc.inode = inode_ref(local->inode); + else + goto done; + + heal_frame = create_frame(this, this->ctx->pool); + if (heal_frame) { + heal_local = dht_local_init(heal_frame, &loc, NULL, 0); + if (!heal_local) + goto cleanup; + + gf_uuid_copy(heal_local->gfid, local->gfid); + heal_frame->cookie = source; + heal_local->xattr = dict_ref(local->xattr); + heal_local->stbuf = local->stbuf; + heal_local->postparent = local->postparent; + heal_local->inode = inode_ref(loc.inode); + heal_local->main_frame = main_frame; + FRAME_SU_DO(heal_frame, dht_local_t); + ret = synctask_new(this->ctx->env, dht_heal_full_path, + dht_heal_full_path_done, heal_frame, heal_frame); + if (!ret) { + loc_wipe(&loc); return 0; + } + /* + * Failed to spawn the synctask. Returning + * with out doing heal. + */ + cleanup: + loc_wipe(&loc); + DHT_STACK_DESTROY(heal_frame); + } + } +done: + dht_set_fixed_dir_stat(&local->postparent); + /* Delete mds xattr at the time of STACK UNWIND */ + if (local->xattr) + GF_REMOVE_INTERNAL_XATTR(conf->mds_xattr_key, local->xattr); - if (local->file_count && local->dir_count) { - gf_log (this->name, GF_LOG_ERROR, - "path %s exists as a file on one subvolume " - "and directory on another. " - "Please fix it manually", - local->loc.path); - op_errno = EIO; - goto out; - } + DHT_STACK_UNWIND(lookup, main_frame, local->op_ret, local->op_errno, + local->inode, &local->stbuf, local->xattr, + &local->postparent); + return 0; - if (local->cached_subvol) { - ret = dht_layout_preset (this, local->cached_subvol, - local->inode); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "failed to set layout for subvolume %s", - local->cached_subvol ? local->cached_subvol->name : "<nil>"); - op_errno = EINVAL; - goto out; - } - } else { - ret = dht_layout_normalize (this, &local->loc, layout); - if ((ret < 0) || ((ret > 0) && (local->op_ret != 0))) { - /* either the layout is incorrect or the directory is - * not found even in one subvolume. - */ - gf_log (this->name, GF_LOG_DEBUG, - "normalizing failed on %s " - "(overlaps/holes present: %s, " - "ENOENT errors: %d)", local->loc.path, - (ret < 0) ? "yes" : "no", (ret > 0) ? ret : 0); - if ((ret > 0) && (ret == conf->subvolume_cnt)) { - op_errno = ESTALE; - goto out; - } - } +out: + DHT_STACK_UNWIND(lookup, main_frame, -1, op_errno, NULL, NULL, NULL, NULL); - if (local->inode) - dht_layout_set (this, local->inode, layout); - } + return ret; +} - DHT_STACK_UNWIND (lookup, main_frame, local->op_ret, local->op_errno, - local->inode, &local->stbuf, local->xattr, - &local->postparent); - return 0; +static int +dht_common_mark_mdsxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) +{ + dht_local_t *local = NULL; + xlator_t *prev = cookie; + int ret = -1; + dht_conf_t *conf = 0; + dht_layout_t *layout = NULL; + int32_t mds_heal_fresh_lookup = 0; + + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, frame->local, out); + + local = frame->local; + conf = this->private; + layout = local->selfheal.layout; + mds_heal_fresh_lookup = local->mds_heal_fresh_lookup; + + if (op_ret) { + gf_msg_debug(this->name, op_ret, + "Failed to set %s on the MDS %s for path %s. ", + conf->mds_xattr_key, prev->name, local->loc.path); + } else { + /* Save mds subvol on inode ctx */ + ret = dht_inode_ctx_mdsvol_set(local->inode, this, prev); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SET_INODE_CTX_FAILED, + "Failed to set mds subvol on inode ctx" + " %s for %s ", + prev->name, local->loc.path); + } + } + if (!local->mds_heal_fresh_lookup && layout) { + dht_selfheal_dir_setattr(frame, &local->loc, &local->stbuf, 0xffffffff, + layout); + } out: - DHT_STACK_UNWIND (lookup, main_frame, -1, op_errno, NULL, NULL, NULL, - NULL); - - return ret; + if (mds_heal_fresh_lookup) + DHT_STACK_DESTROY(frame); + return 0; } +static xlator_t * +dht_inode_get_hashed_subvol(inode_t *inode, xlator_t *this, loc_t *loc) +{ + char *path = NULL; + loc_t populate_loc = { + 0, + }; + char *name = NULL; + xlator_t *hash_subvol = NULL; + + if (!inode) + return hash_subvol; + + if (loc && loc->parent && loc->path) { + if (!loc->name) { + name = strrchr(loc->path, '/'); + if (name) { + loc->name = name + 1; + } else { + goto out; + } + } + hash_subvol = dht_subvol_get_hashed(this, loc); + goto out; + } -int -dht_discover_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, - inode_t *inode, struct iatt *stbuf, dict_t *xattr, - struct iatt *postparent) -{ - dht_local_t *local = NULL; - int this_call_cnt = 0; - call_frame_t *prev = NULL; - dht_layout_t *layout = NULL; - int ret = -1; - int is_dir = 0; - int is_linkfile = 0; - int attempt_unwind = 0; - dht_conf_t *conf = 0; - - GF_VALIDATE_OR_GOTO ("dht", frame, out); - GF_VALIDATE_OR_GOTO ("dht", this, out); - GF_VALIDATE_OR_GOTO ("dht", frame->local, out); - GF_VALIDATE_OR_GOTO ("dht", this->private, out); - GF_VALIDATE_OR_GOTO ("dht", cookie, out); + if (!gf_uuid_is_null(inode->gfid)) { + populate_loc.inode = inode_ref(inode); + populate_loc.parent = inode_parent(populate_loc.inode, NULL, NULL); + inode_path(populate_loc.inode, NULL, &path); - local = frame->local; - prev = cookie; - conf = this->private; + if (!path) + goto out; - layout = local->layout; + populate_loc.path = path; + if (!populate_loc.name && populate_loc.path) { + name = strrchr(populate_loc.path, '/'); + if (name) { + populate_loc.name = name + 1; - /* Check if the gfid is different for file from other node */ - if (!op_ret && uuid_compare (local->gfid, stbuf->ia_gfid)) { - gf_log (this->name, GF_LOG_WARNING, - "%s: gfid different on %s", - local->loc.path, prev->this->name); + } else { + goto out; + } } + hash_subvol = dht_subvol_get_hashed(this, &populate_loc); + } +out: + if (populate_loc.inode) + loc_wipe(&populate_loc); + return hash_subvol; +} +/* Common function call by revalidate/selfheal code path to populate + internal xattr if it is not present, mark_during_fresh_lookup value + determines either function is call by revalidate_cbk(discover_complete) + or call by selfheal code path while fresh lookup. + Here we do wind a call serially in case of fresh lookup and + for other lookup code path we do wind a call parallel.The reason + to wind a call serially is at the time of fresh lookup directory is not + discovered and at the time of revalidate_lookup directory is + already discovered. So, revalidate codepath can race with setxattr + codepath and can get into spurious heals because of an ongoing setxattr. + This can slow down revalidates, if healing happens in foreground. + However, if healing happens in background, there is no direct performance + penalty. +*/ +int +dht_common_mark_mdsxattr(call_frame_t *frame, int *errst, + int mark_during_fresh_lookup) +{ + dht_local_t *local = NULL; + xlator_t *this = NULL; + xlator_t *hashed_subvol = NULL; + int ret = 0; + int i = 0; + dict_t *xattrs = NULL; + char gfid_local[GF_UUID_BUF_SIZE] = { + 0, + }; + int32_t zero[1] = {0}; + dht_conf_t *conf = 0; + dht_layout_t *layout = NULL; + dht_local_t *copy_local = NULL; + call_frame_t *xattr_frame = NULL; + gf_boolean_t vol_down = _gf_false; + + GF_VALIDATE_OR_GOTO("dht", frame, out); + this = frame->this; + GF_VALIDATE_OR_GOTO("dht", this, out); + GF_VALIDATE_OR_GOTO(this->name, frame->local, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + local = frame->local; + conf = this->private; + layout = local->selfheal.layout; + local->mds_heal_fresh_lookup = mark_during_fresh_lookup; + + gf_uuid_unparse(local->gfid, gfid_local); + + /* Code to update hashed subvol consider as a mds subvol + and wind a setxattr call on hashed subvol to update + internal xattr + */ + if (!local->xattr || !dict_get(local->xattr, conf->mds_xattr_key)) { + /* It means no internal MDS xattr has been set yet + */ + /* Check the status of all subvol are up while call + this function call by lookup code path + */ + if (mark_during_fresh_lookup) { + for (i = 0; i < conf->subvolume_cnt; i++) { + if (!conf->subvolume_status[i]) { + vol_down = _gf_true; + break; + } + } + if (vol_down) { + gf_msg_debug(this->name, 0, + "subvol %s is down. Unable to " + " save mds subvol on inode for " + " path %s gfid is %s ", + conf->subvolumes[i]->name, local->loc.path, + gfid_local); + goto out; + } + } - LOCK (&frame->lock); - { - /* TODO: assert equal mode on stbuf->st_mode and - local->stbuf->st_mode + /* Calculate hashed subvol based on inode and parent node + */ + hashed_subvol = dht_inode_get_hashed_subvol(local->inode, this, + &local->loc); + if (!hashed_subvol) { + gf_msg(this->name, GF_LOG_DEBUG, 0, + DHT_MSG_HASHED_SUBVOL_GET_FAILED, + "Failed to get hashed subvol for path %s" + "gfid is %s ", + local->loc.path, gfid_local); + if (errst) + (*errst) = 1; + ret = -1; + goto out; + } + xattrs = dict_new(); + if (!xattrs) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY, + "dict_new failed"); + ret = -1; + goto out; + } + /* Add internal MDS xattr on disk for hashed subvol + */ + ret = dht_dict_set_array(xattrs, conf->mds_xattr_key, zero, 1); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, DHT_MSG_DICT_SET_FAILED, + "Failed to set dictionary" + " value:key = %s for " + "path %s", + conf->mds_xattr_key, local->loc.path); + ret = -1; + goto out; + } + /* Create a new frame to wind a call only while + this function call by revalidate_cbk code path + To wind a call parallel need to create a new frame + */ + if (mark_during_fresh_lookup) { + xattr_frame = create_frame(this, this->ctx->pool); + if (!xattr_frame) { + ret = -1; + goto out; + } + copy_local = dht_local_init(xattr_frame, &(local->loc), NULL, 0); + if (!copy_local) { + ret = -1; + DHT_STACK_DESTROY(xattr_frame); + goto out; + } + copy_local->stbuf = local->stbuf; + copy_local->mds_heal_fresh_lookup = mark_during_fresh_lookup; + if (!copy_local->inode) + copy_local->inode = inode_ref(local->inode); + gf_uuid_copy(copy_local->loc.gfid, local->gfid); + FRAME_SU_DO(xattr_frame, dht_local_t); + STACK_WIND_COOKIE(xattr_frame, dht_common_mark_mdsxattr_cbk, + hashed_subvol, hashed_subvol, + hashed_subvol->fops->setxattr, &local->loc, + xattrs, 0, NULL); + } else { + STACK_WIND_COOKIE(frame, dht_common_mark_mdsxattr_cbk, + (void *)hashed_subvol, hashed_subvol, + hashed_subvol->fops->setxattr, &local->loc, + xattrs, 0, NULL); + } + } else { + gf_msg_debug(this->name, 0, + "internal xattr %s is present on subvol" + "on path %s gfid is %s ", + conf->mds_xattr_key, local->loc.path, gfid_local); + if (!mark_during_fresh_lookup) + dht_selfheal_dir_setattr(frame, &local->loc, &local->stbuf, + 0xffffffff, layout); + } - else mkdir/chmod/chown and fix - */ - ret = dht_layout_merge (this, layout, prev->this, - op_ret, op_errno, xattr); - if (ret) - gf_log (this->name, GF_LOG_WARNING, - "%s: failed to merge layouts", local->loc.path); +out: + if (xattrs) + dict_unref(xattrs); + return ret; +} - if (op_ret == -1) { - local->op_errno = op_errno; - gf_log (this->name, GF_LOG_DEBUG, - "lookup of %s on %s returned error (%s)", - local->loc.path, prev->this->name, - strerror (op_errno)); +/* Get the value of key from dict in the bytewise and save in array after + convert from network byte order to host byte order +*/ +static int32_t +dht_dict_get_array(dict_t *dict, char *key, int32_t value[], int32_t size, + int *errst) +{ + void *ptr = NULL; + int32_t len = -1; + int32_t vindex = -1; + int32_t err = -1; + int ret = 0; + + if (dict == NULL) { + (*errst) = -1; + return -EINVAL; + } + err = dict_get_ptr_and_len(dict, key, &ptr, &len); + if (err != 0) { + (*errst) = -1; + return err; + } + + if (len != (size * sizeof(int32_t))) { + (*errst) = -1; + return -EINVAL; + } + + for (vindex = 0; vindex < size; vindex++) { + value[vindex] = ntoh32(*((int32_t *)ptr + vindex)); + if (value[vindex] < 0) + ret = -1; + } + + return ret; +} - goto unlock; - } +static int +dht_discover_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, inode_t *inode, struct iatt *stbuf, + dict_t *xattr, struct iatt *postparent) +{ + dht_local_t *local = NULL; + int this_call_cnt = 0; + xlator_t *prev = NULL; + dht_layout_t *layout = NULL; + int ret = -1; + int is_dir = 0; + int32_t check_mds = 0; + int is_linkfile = 0; + int attempt_unwind = 0; + dht_conf_t *conf = 0; + char gfid_local[GF_UUID_BUF_SIZE] = {0}; + char gfid_node[GF_UUID_BUF_SIZE] = {0}; + int32_t mds_xattr_val[1] = {0}; + int errst = 0; + + GF_VALIDATE_OR_GOTO("dht", frame, out); + GF_VALIDATE_OR_GOTO("dht", this, out); + GF_VALIDATE_OR_GOTO("dht", frame->local, out); + GF_VALIDATE_OR_GOTO("dht", this->private, out); + GF_VALIDATE_OR_GOTO("dht", cookie, out); + + local = frame->local; + prev = cookie; + conf = this->private; + + layout = local->layout; + + /* Check if the gfid is different for file from other node */ + if (!op_ret && gf_uuid_compare(local->gfid, stbuf->ia_gfid)) { + gf_uuid_unparse(stbuf->ia_gfid, gfid_node); + gf_uuid_unparse(local->gfid, gfid_local); + + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_GFID_MISMATCH, + "%s: gfid different on %s, gfid local = %s" + "gfid other = %s", + local->loc.path, prev->name, gfid_local, gfid_node); + } + + LOCK(&frame->lock); + { + /* TODO: assert equal mode on stbuf->st_mode and + local->stbuf->st_mode + + else mkdir/chmod/chown and fix + */ + + ret = dht_layout_merge(this, layout, prev, op_ret, op_errno, xattr); + if (ret) + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_LAYOUT_MERGE_FAILED, + "%s: failed to merge layouts for subvol %s", local->loc.path, + prev->name); - is_linkfile = check_is_linkfile (inode, stbuf, xattr, - conf->link_xattr_name); - is_dir = check_is_dir (inode, stbuf, xattr); + if (op_ret == -1) { + local->op_errno = op_errno; + gf_msg_debug(this->name, op_errno, + "lookup of %s on %s returned error", local->loc.path, + prev->name); - if (is_dir) { - local->dir_count ++; - } else { - local->file_count ++; - - if (!is_linkfile) { - /* real file */ - local->cached_subvol = prev->this; - attempt_unwind = 1; - } else { - goto unlock; - } - } + goto unlock; + } - local->op_ret = 0; + is_linkfile = check_is_linkfile(inode, stbuf, xattr, + conf->link_xattr_name); + is_dir = check_is_dir(inode, stbuf, xattr); - if (local->xattr == NULL) { - local->xattr = dict_ref (xattr); - } else { - dht_aggregate_xattr (local->xattr, xattr); - } + if (is_dir) { + local->dir_count++; + } else { + local->file_count++; + + if (!is_linkfile && !local->cached_subvol) { + /* real file */ + /* Ok, we somehow managed to find a file on + * more than one subvol. ignore this or we + * will end up overwriting information while a + * a thread is potentially unwinding from + * dht_discover_complete + */ + local->cached_subvol = prev; + attempt_unwind = 1; + } else { + goto unlock; + } + } - if (local->inode == NULL) - local->inode = inode_ref (inode); + local->op_ret = 0; - dht_iatt_merge (this, &local->stbuf, stbuf, prev->this); - dht_iatt_merge (this, &local->postparent, postparent, - prev->this); + if (local->xattr == NULL) { + local->xattr = dict_ref(xattr); + } else { + /* Don't aggregate for files. See BZ#1484709 */ + if (is_dir) + dht_aggregate_xattr(local->xattr, xattr); } + + if (local->inode == NULL) + local->inode = inode_ref(inode); + + dht_iatt_merge(this, &local->stbuf, stbuf); + dht_iatt_merge(this, &local->postparent, postparent); + + if (!dict_get(xattr, conf->mds_xattr_key)) { + goto unlock; + } else { + gf_msg_debug(this->name, 0, + "internal xattr %s is present on subvol" + "on path %s gfid is %s ", + conf->mds_xattr_key, local->loc.path, gfid_local); + } + check_mds = dht_dict_get_array(xattr, conf->mds_xattr_key, + mds_xattr_val, 1, &errst); + /* save mds subvol on inode ctx */ + ret = dht_inode_ctx_mdsvol_set(local->inode, this, prev); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SET_INODE_CTX_FAILED, + "Failed to set hashed subvol for %s vol is %s", + local->loc.path, prev->name); + } + + if ((check_mds < 0) && !errst) { + local->mds_xattr = dict_ref(xattr); + gf_msg_debug(this->name, 0, + "Value of %s is not zero on mds subvol" + "so xattr needs to be healed on non mds" + " path is %s and vol name is %s " + " gfid is %s", + conf->mds_xattr_key, local->loc.path, prev->name, + gfid_local); + local->need_xattr_heal = 1; + local->mds_subvol = prev; + } + } unlock: - UNLOCK (&frame->lock); + UNLOCK(&frame->lock); out: - this_call_cnt = dht_frame_return (frame); + /* Make sure, the thread executing dht_discover_complete is the one + * which calls STACK_DESTROY (frame). In the case of "attempt_unwind", + * this makes sure that the thread don't call dht_frame_return, till + * call to dht_discover_complete is done. + */ + if (attempt_unwind) { + dht_discover_complete(this, frame); + } - if (is_last_call (this_call_cnt) || attempt_unwind) { - dht_discover_complete (this, frame); - } + this_call_cnt = dht_frame_return(frame); - if (is_last_call (this_call_cnt)) - DHT_STACK_DESTROY (frame); + if (is_last_call(this_call_cnt) && !attempt_unwind) { + dht_discover_complete(this, frame); + } - return 0; + if (is_last_call(this_call_cnt)) + DHT_STACK_DESTROY(frame); + + return 0; } +static int +dht_set_file_xattr_req(xlator_t *this, loc_t *loc, dict_t *xattr_req) +{ + int ret = -EINVAL; + dht_conf_t *conf = NULL; + + conf = this->private; + if (!conf) { + goto err; + } + + if (!xattr_req) { + goto err; + } + + /* Used to check whether this is a linkto file. + */ + ret = dict_set_uint32(xattr_req, conf->link_xattr_name, 256); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, DHT_MSG_DICT_SET_FAILED, + "Failed to set dictionary value:key = %s for " + "path %s", + conf->link_xattr_name, loc->path); + goto err; + } + + /* This is used to make sure we don't unlink linkto files + * which are the target of an ongoing file migration. + */ + ret = dict_set_uint32(xattr_req, GLUSTERFS_OPEN_FD_COUNT, 4); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, DHT_MSG_DICT_SET_FAILED, + "Failed to set dictionary value:key = %s for " + "path %s", + GLUSTERFS_OPEN_FD_COUNT, loc->path); + goto err; + } + + ret = 0; +err: + return ret; +} -int -dht_discover (call_frame_t *frame, xlator_t *this, loc_t *loc) +/* This is a gfid based nameless lookup. Without a name, the hashed subvol + * cannot be calculated so a lookup is sent to all subvols. + */ +static int +dht_do_discover(call_frame_t *frame, xlator_t *this, loc_t *loc) { - int ret; - dht_local_t *local = NULL; - dht_conf_t *conf = NULL; - int call_cnt = 0; - int op_errno = EINVAL; - int i = 0; - call_frame_t *discover_frame = NULL; + int ret; + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + int call_cnt = 0; + int op_errno = EINVAL; + int i = 0; + call_frame_t *discover_frame = NULL; + + conf = this->private; + local = frame->local; + + /* As we do not know if this is a file or directory, request + * both file and directory xattrs + */ + ret = dht_set_file_xattr_req(this, loc, local->xattr_req); + if (ret) { + goto err; + } + + ret = dht_set_dir_xattr_req(this, loc, local->xattr_req); + if (ret) { + goto err; + } + + if (loc_is_root(loc)) { + /* Request the DHT commit hash xattr (trusted.glusterfs.dht.commithash) + * set on the brick root. + */ + ret = dict_set_uint32(local->xattr_req, conf->commithash_xattr_name, + sizeof(uint32_t)); + } - conf = this->private; - local = frame->local; + call_cnt = conf->subvolume_cnt; + local->call_cnt = call_cnt; - ret = dict_set_uint32 (local->xattr_req, conf->xattr_name, 4 * 4); - if (ret) - gf_log (this->name, GF_LOG_WARNING, - "%s: failed to set '%s' key", - loc->path, conf->xattr_name); + local->layout = dht_layout_new(this, conf->subvolume_cnt); - ret = dict_set_uint32 (local->xattr_req, conf->link_xattr_name, 256); - if (ret) - gf_log (this->name, GF_LOG_WARNING, - "%s: failed to set '%s' key", - loc->path, conf->link_xattr_name); + if (!local->layout) { + op_errno = ENOMEM; + goto err; + } - call_cnt = conf->subvolume_cnt; - local->call_cnt = call_cnt; + gf_uuid_copy(local->gfid, loc->gfid); - local->layout = dht_layout_new (this, conf->subvolume_cnt); + discover_frame = copy_frame(frame); + if (!discover_frame) { + op_errno = ENOMEM; + goto err; + } - if (!local->layout) { - op_errno = ENOMEM; - goto err; - } + discover_frame->local = local; + frame->local = NULL; + local->main_frame = frame; - uuid_copy (local->gfid, loc->gfid); + for (i = 0; i < call_cnt; i++) { + STACK_WIND_COOKIE(discover_frame, dht_discover_cbk, conf->subvolumes[i], + conf->subvolumes[i], + conf->subvolumes[i]->fops->lookup, &local->loc, + local->xattr_req); + } - discover_frame = copy_frame (frame); - if (!discover_frame) { - op_errno = ENOMEM; - goto err; - } + return 0; - discover_frame->local = local; - frame->local = NULL; - local->main_frame = frame; +err: + DHT_STACK_UNWIND(lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL); - for (i = 0; i < call_cnt; i++) { - STACK_WIND (discover_frame, dht_discover_cbk, - conf->subvolumes[i], - conf->subvolumes[i]->fops->lookup, - &local->loc, local->xattr_req); - } + return 0; +} - return 0; +/* Code to call syntask to heal custom xattr from hashed subvol + to non hashed subvol +*/ +int +dht_dir_xattr_heal(xlator_t *this, dht_local_t *local, int *op_errno) +{ + dht_local_t *copy_local = NULL; + call_frame_t *copy = NULL; + int ret = -1; + char gfid_local[GF_UUID_BUF_SIZE] = {0}; + + if (gf_uuid_is_null(local->gfid)) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DIR_XATTR_HEAL_FAILED, + "No gfid exists for path %s " + "so healing xattr is not possible", + local->loc.path); + *op_errno = EIO; + goto out; + } + + gf_uuid_unparse(local->gfid, gfid_local); + copy = create_frame(this, this->ctx->pool); + if (copy) { + copy_local = dht_local_init(copy, &(local->loc), NULL, 0); + if (!copy_local) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, + DHT_MSG_DIR_XATTR_HEAL_FAILED, + "Memory allocation failed " + "for path %s gfid %s ", + local->loc.path, gfid_local); + *op_errno = ENOMEM; + DHT_STACK_DESTROY(copy); + } else { + copy_local->stbuf = local->stbuf; + gf_uuid_copy(copy_local->loc.gfid, local->gfid); + copy_local->mds_subvol = local->mds_subvol; + FRAME_SU_DO(copy, dht_local_t); + ret = synctask_new(this->ctx->env, dht_dir_heal_xattrs, + dht_dir_heal_xattrs_done, copy, copy); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, + DHT_MSG_DIR_XATTR_HEAL_FAILED, + "Synctask creation failed to heal xattr " + "for path %s gfid %s ", + local->loc.path, gfid_local); + *op_errno = ENOMEM; + DHT_STACK_DESTROY(copy); + } + } + } +out: + return ret; +} -err: - DHT_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, - NULL); +static int +dht_needs_selfheal(call_frame_t *frame, xlator_t *this) +{ + dht_local_t *local = NULL; + dht_layout_t *layout = NULL; + int needs_selfheal = 0; + int ret = 0; + + local = frame->local; + layout = local->layout; + + if (local->need_attrheal || local->need_xattr_heal || + local->need_selfheal) { + needs_selfheal = 1; + } + + ret = dht_layout_normalize(this, &local->loc, layout); + + if (ret != 0) { + gf_msg_debug(this->name, 0, "fixing assignment on %s", local->loc.path); + needs_selfheal = 1; + } + return needs_selfheal; +} +static int +is_permission_different(ia_prot_t *prot1, ia_prot_t *prot2) +{ + if ((prot1->owner.read != prot2->owner.read) || + (prot1->owner.write != prot2->owner.write) || + (prot1->owner.exec != prot2->owner.exec) || + (prot1->group.read != prot2->group.read) || + (prot1->group.write != prot2->group.write) || + (prot1->group.exec != prot2->group.exec) || + (prot1->other.read != prot2->other.read) || + (prot1->other.write != prot2->other.write) || + (prot1->other.exec != prot2->other.exec) || + (prot1->suid != prot2->suid) || (prot1->sgid != prot2->sgid) || + (prot1->sticky != prot2->sticky)) { + return 1; + } else { return 0; + } } - int -dht_lookup_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, - inode_t *inode, struct iatt *stbuf, dict_t *xattr, - struct iatt *postparent) -{ - dht_local_t *local = NULL; - int this_call_cnt = 0; - call_frame_t *prev = NULL; - dht_layout_t *layout = NULL; - int ret = -1; - int is_dir = 0; - - GF_VALIDATE_OR_GOTO ("dht", frame, out); - GF_VALIDATE_OR_GOTO ("dht", this, out); - GF_VALIDATE_OR_GOTO ("dht", frame->local, out); - GF_VALIDATE_OR_GOTO ("dht", this->private, out); - GF_VALIDATE_OR_GOTO ("dht", cookie, out); +dht_lookup_dir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf, + dict_t *xattr, struct iatt *postparent) +{ + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + int this_call_cnt = 0; + xlator_t *prev = NULL; + dht_layout_t *layout = NULL; + int ret = -1; + int is_dir = 0; + int32_t check_mds = 0; + int errst = 0; + char gfid_local[GF_UUID_BUF_SIZE] = {0}; + char gfid_node[GF_UUID_BUF_SIZE] = {0}; + int32_t mds_xattr_val[1] = {0}; + + GF_VALIDATE_OR_GOTO("dht", frame, out); + GF_VALIDATE_OR_GOTO("dht", this, out); + GF_VALIDATE_OR_GOTO("dht", frame->local, out); + GF_VALIDATE_OR_GOTO("dht", this->private, out); + GF_VALIDATE_OR_GOTO("dht", cookie, out); + + local = frame->local; + prev = cookie; + conf = this->private; + + layout = local->layout; + gf_msg_debug(this->name, op_errno, + "%s: lookup on %s returned with op_ret = %d, op_errno = %d", + local->loc.path, prev->name, op_ret, op_errno); + + /* The first successful lookup*/ + if (!op_ret && gf_uuid_is_null(local->gfid)) { + memcpy(local->gfid, stbuf->ia_gfid, 16); + } + if (!gf_uuid_is_null(local->gfid)) { + gf_uuid_unparse(local->gfid, gfid_local); + } + + /* Check if the gfid is different for file from other node */ + if (!op_ret && gf_uuid_compare(local->gfid, stbuf->ia_gfid)) { + gf_uuid_unparse(stbuf->ia_gfid, gfid_node); + + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_GFID_MISMATCH, + "%s: gfid different on %s." + " gfid local = %s, gfid subvol = %s", + local->loc.path, prev->name, gfid_local, gfid_node); + } + + LOCK(&frame->lock); + { + /* TODO: assert equal mode on stbuf->st_mode and + local->stbuf->st_mode + else mkdir/chmod/chown and fix + */ + ret = dht_layout_merge(this, layout, prev, op_ret, op_errno, xattr); - local = frame->local; - prev = cookie; + if (op_ret == -1) { + local->op_errno = op_errno; - layout = local->layout; + /* The GFID is missing on this subvol. Force a heal. */ + if (op_errno == ENODATA) { + local->need_lookup_everywhere = 1; + } + goto unlock; + } - if (!op_ret && uuid_is_null (local->gfid)) - memcpy (local->gfid, stbuf->ia_gfid, 16); + is_dir = check_is_dir(inode, stbuf, xattr); + if (!is_dir) { + gf_msg_debug(this->name, 0, + "%s: lookup on %s returned non dir 0%o" + "calling lookup_everywhere", + local->loc.path, prev->name, stbuf->ia_type); - /* Check if the gfid is different for file from other node */ - if (!op_ret && uuid_compare (local->gfid, stbuf->ia_gfid)) { - gf_log (this->name, GF_LOG_WARNING, - "%s: gfid different on %s", - local->loc.path, prev->this->name); + local->need_lookup_everywhere = 1; + goto unlock; } - LOCK (&frame->lock); - { - /* TODO: assert equal mode on stbuf->st_mode and - local->stbuf->st_mode + local->op_ret = 0; + if (local->xattr == NULL) { + local->xattr = dict_ref(xattr); + } else { + dht_aggregate_xattr(local->xattr, xattr); + } + + if (__is_root_gfid(stbuf->ia_gfid)) { + ret = dht_dir_has_layout(xattr, conf->xattr_name); + if (ret >= 0) { + if (is_greater_time(local->prebuf.ia_ctime, + local->prebuf.ia_ctime_nsec, + stbuf->ia_ctime, stbuf->ia_ctime_nsec)) { + /* Choose source */ + local->prebuf.ia_gid = stbuf->ia_gid; + local->prebuf.ia_uid = stbuf->ia_uid; + + local->prebuf.ia_ctime = stbuf->ia_ctime; + local->prebuf.ia_ctime_nsec = stbuf->ia_ctime_nsec; + local->prebuf.ia_prot = stbuf->ia_prot; + } + } + } - else mkdir/chmod/chown and fix - */ - ret = dht_layout_merge (this, layout, prev->this, - op_ret, op_errno, xattr); + if (local->stbuf.ia_type != IA_INVAL) { + /* This is not the first subvol to respond + * Compare values to see if attrs need to be healed + */ + if ((local->stbuf.ia_gid != stbuf->ia_gid) || + (local->stbuf.ia_uid != stbuf->ia_uid) || + (is_permission_different(&local->stbuf.ia_prot, + &stbuf->ia_prot))) { + local->need_attrheal = 1; + } + } - if (op_ret == -1) { - local->op_errno = op_errno; - gf_log (this->name, GF_LOG_DEBUG, - "lookup of %s on %s returned error (%s)", - local->loc.path, prev->this->name, - strerror (op_errno)); + if (local->inode == NULL) + local->inode = inode_ref(inode); - goto unlock; - } + dht_iatt_merge(this, &local->stbuf, stbuf); + dht_iatt_merge(this, &local->postparent, postparent); - is_dir = check_is_dir (inode, stbuf, xattr); - if (!is_dir) { - gf_log (this->name, GF_LOG_DEBUG, - "lookup of %s on %s returned non dir 0%o", - local->loc.path, prev->this->name, - stbuf->ia_type); - local->need_selfheal = 1; - goto unlock; - } + if (!dict_get(xattr, conf->mds_xattr_key)) { + gf_msg_debug(this->name, 0, + "%s: mds xattr %s is not present " + "on %s(gfid = %s)", + local->loc.path, conf->mds_xattr_key, prev->name, + gfid_local); + goto unlock; + } - local->op_ret = 0; - if (local->xattr == NULL) { - local->xattr = dict_ref (xattr); - } else { - dht_aggregate_xattr (local->xattr, xattr); - } + /* Save the mds subvol info and stbuf. This is the value that will + * be used for healing + */ + local->mds_subvol = prev; + local->mds_stbuf = *stbuf; - if (local->inode == NULL) - local->inode = inode_ref (inode); + /* Save mds subvol on inode ctx */ + + ret = dht_inode_ctx_mdsvol_set(local->inode, this, prev); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SET_INODE_CTX_FAILED, + "%s: Failed to set mds (%s)", local->loc.path, prev->name); + } + check_mds = dht_dict_get_array(xattr, conf->mds_xattr_key, + mds_xattr_val, 1, &errst); + if ((check_mds < 0) && !errst) { + /* Check if xattrs need to be healed on the directories */ + local->mds_xattr = dict_ref(xattr); + gf_msg_debug(this->name, 0, + "%s: %s is not zero on %s. Xattrs need to be healed." + "(gfid = %s)", + local->loc.path, conf->mds_xattr_key, prev->name, + gfid_local); + local->need_xattr_heal = 1; + } + } - dht_iatt_merge (this, &local->stbuf, stbuf, prev->this); - dht_iatt_merge (this, &local->postparent, postparent, - prev->this); - } unlock: - UNLOCK (&frame->lock); + UNLOCK(&frame->lock); + this_call_cnt = dht_frame_return(frame); - this_call_cnt = dht_frame_return (frame); + if (is_last_call(this_call_cnt)) { + /* If the mds subvol is not set correctly*/ + if (!__is_root_gfid(local->gfid) && + (!dict_get(local->xattr, conf->mds_xattr_key))) { + local->need_selfheal = 1; + } - if (is_last_call (this_call_cnt)) { - if (local->need_selfheal) { - local->need_selfheal = 0; - dht_lookup_everywhere (frame, this, &local->loc); - return 0; - } + /* No need to call xattr heal code if volume count is 1 + */ + if (conf->subvolume_cnt == 1) { + local->need_xattr_heal = 0; + } - if (local->op_ret == 0) { - ret = dht_layout_normalize (this, &local->loc, layout); + if (local->need_selfheal || local->need_lookup_everywhere) { + /* Set the gfid-req so posix will set the GFID*/ + if (!gf_uuid_is_null(local->gfid)) { + /* Ok, this should _never_ happen */ + ret = dict_set_static_bin(local->xattr_req, "gfid-req", + local->gfid, 16); + } else { + if (!gf_uuid_is_null(local->gfid_req)) + ret = dict_set_static_bin(local->xattr_req, "gfid-req", + local->gfid_req, 16); + } + } - if (ret != 0) { - gf_log (this->name, GF_LOG_DEBUG, - "fixing assignment on %s", - local->loc.path); - goto selfheal; - } + if (local->need_lookup_everywhere) { + local->need_lookup_everywhere = 0; + dht_lookup_everywhere(frame, this, &local->loc); + return 0; + } - dht_layout_set (this, local->inode, layout); - } + if (local->op_ret == 0) { + if (dht_needs_selfheal(frame, this)) { + goto selfheal; + } - if (local->loc.parent) { - dht_inode_ctx_time_update (local->loc.parent, this, - &local->postparent, 1); - } + dht_layout_set(this, local->inode, layout); + if (local->inode) { + dht_inode_ctx_time_update(local->inode, this, &local->stbuf, 1); + } - DHT_STRIP_PHASE1_FLAGS (&local->stbuf); - DHT_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, - local->inode, &local->stbuf, local->xattr, - &local->postparent); + if (local->loc.parent) { + dht_inode_ctx_time_update(local->loc.parent, this, + &local->postparent, 1); + } } - return 0; + DHT_STRIP_PHASE1_FLAGS(&local->stbuf); + dht_set_fixed_dir_stat(&local->postparent); + /* Delete mds xattr at the time of STACK UNWIND */ + if (local->xattr) + GF_REMOVE_INTERNAL_XATTR(conf->mds_xattr_key, local->xattr); + + DHT_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno, + local->inode, &local->stbuf, local->xattr, + &local->postparent); + } + + return 0; selfheal: - FRAME_SU_DO (frame, dht_local_t); - uuid_copy (local->loc.gfid, local->gfid); - ret = dht_selfheal_directory (frame, dht_lookup_selfheal_cbk, - &local->loc, layout); + FRAME_SU_DO(frame, dht_local_t); + ret = dht_selfheal_directory(frame, dht_lookup_selfheal_cbk, &local->loc, + layout); out: - return ret; + return ret; +} + +static int +dht_lookup_directory(call_frame_t *frame, xlator_t *this, loc_t *loc) +{ + int call_cnt = 0; + int i = 0; + dht_conf_t *conf = NULL; + dht_local_t *local = NULL; + int ret = 0; + + GF_VALIDATE_OR_GOTO("dht", frame, out); + GF_VALIDATE_OR_GOTO("dht", this, unwind); + GF_VALIDATE_OR_GOTO("dht", frame->local, unwind); + GF_VALIDATE_OR_GOTO("dht", this->private, unwind); + GF_VALIDATE_OR_GOTO("dht", loc, unwind); + + conf = this->private; + local = frame->local; + + call_cnt = conf->subvolume_cnt; + local->call_cnt = call_cnt; + + local->layout = dht_layout_new(this, conf->subvolume_cnt); + if (!local->layout) { + goto unwind; + } + + if (local->xattr != NULL) { + dict_unref(local->xattr); + local->xattr = NULL; + } + + if (!gf_uuid_is_null(local->gfid)) { + /* use this gfid in order to heal any missing ones */ + ret = dict_set_gfuuid(local->xattr_req, "gfid-req", local->gfid, true); + if (ret) + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED, + "%s: Failed to set dictionary value:" + " key = gfid-req", + local->loc.path); + } + + for (i = 0; i < call_cnt; i++) { + STACK_WIND_COOKIE( + frame, dht_lookup_dir_cbk, conf->subvolumes[i], conf->subvolumes[i], + conf->subvolumes[i]->fops->lookup, &local->loc, local->xattr_req); + } + return 0; +unwind: + DHT_STACK_UNWIND(lookup, frame, -1, ENOMEM, NULL, NULL, NULL, NULL); +out: + return 0; } int -dht_revalidate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, - inode_t *inode, struct iatt *stbuf, dict_t *xattr, - struct iatt *postparent) -{ - dht_local_t *local = NULL; - int this_call_cnt = 0; - call_frame_t *prev = NULL; - dht_layout_t *layout = NULL; - dht_conf_t *conf = NULL; - int ret = -1; - int is_dir = 0; - int is_linkfile = 0; - call_frame_t *copy = NULL; - dht_local_t *copy_local = NULL; - - GF_VALIDATE_OR_GOTO ("dht", frame, err); - GF_VALIDATE_OR_GOTO ("dht", this, err); - GF_VALIDATE_OR_GOTO ("dht", frame->local, err); - GF_VALIDATE_OR_GOTO ("dht", cookie, err); +dht_revalidate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf, + dict_t *xattr, struct iatt *postparent) +{ + dht_local_t *local = NULL; + int this_call_cnt = 0; + xlator_t *prev = NULL; + dht_layout_t *layout = NULL; + dht_conf_t *conf = NULL; + int ret = -1; + int is_dir = 0; + int is_linkfile = 0; + int follow_link = 0; + char gfid[GF_UUID_BUF_SIZE] = {0}; + uint32_t vol_commit_hash = 0; + xlator_t *subvol = NULL; + int32_t check_mds = 0; + int errst = 0, i = 0; + int32_t mds_xattr_val[1] = {0}; + + GF_VALIDATE_OR_GOTO("dht", frame, err); + GF_VALIDATE_OR_GOTO("dht", this, err); + GF_VALIDATE_OR_GOTO("dht", frame->local, err); + GF_VALIDATE_OR_GOTO("dht", cookie, err); + GF_VALIDATE_OR_GOTO("dht", this->private, err); + + local = frame->local; + prev = cookie; + conf = this->private; + + if (!conf->vch_forced) { + /* Update the commithash value if available + */ + ret = dict_get_uint32(xattr, conf->commithash_xattr_name, + &vol_commit_hash); + if (ret == 0) { + conf->vol_commit_hash = vol_commit_hash; + } + } - local = frame->local; - prev = cookie; - conf = this->private; - if (!conf) - goto out; + gf_uuid_unparse(local->loc.gfid, gfid); - LOCK (&frame->lock); - { - if (op_ret == -1) { - local->op_errno = op_errno; - - if ((op_errno != ENOTCONN) - && (op_errno != ENOENT) - && (op_errno != ESTALE)) { - gf_log (this->name, GF_LOG_INFO, - "subvolume %s for %s returned -1 (%s)", - prev->this->name, local->loc.path, - strerror (op_errno)); - } - if (op_errno == ESTALE) { - /* propagate the ESTALE to parent. - * setting local->return_estale would send - * ESTALE to parent. */ - local->return_estale = 1; - } + gf_msg_debug(this->name, op_errno, + "%s: revalidate lookup on %s returned op_ret %d", + local->loc.path, prev->name, op_ret); - /* if it is ENOENT, we may have to do a - * 'lookup_everywhere()' to make sure - * the file is not migrated */ - if (op_errno == ENOENT) { - if (IA_ISREG (local->loc.inode->ia_type)) { - local->need_lookup_everywhere = 1; - } + LOCK(&frame->lock); + { + if (gf_uuid_is_null(local->gfid)) { + memcpy(local->gfid, local->loc.gfid, 16); + } + + if (op_ret == -1) { + local->op_errno = op_errno; + + if ((op_errno != ENOTCONN) && (op_errno != ENOENT) && + (op_errno != ESTALE)) { + gf_msg(this->name, GF_LOG_INFO, op_errno, + DHT_MSG_REVALIDATE_CBK_INFO, + "Revalidate: subvolume %s for %s " + "(gfid = %s) returned -1", + prev->name, local->loc.path, gfid); + } + if (op_errno == ESTALE) { + /* propagate the ESTALE to parent. + * setting local->return_estale would send + * ESTALE to parent. */ + local->return_estale = 1; + } + + /* if it is ENOENT, we may have to do a + * 'lookup_everywhere()' to make sure + * the file is not migrated */ + if (op_errno == ENOENT) { + if (IA_ISREG(local->loc.inode->ia_type)) { + gf_msg_debug(this->name, 0, + "found ENOENT for %s. " + "Setting " + "need_lookup_everywhere" + " flag to 1", + local->loc.path); + + local->need_lookup_everywhere = 1; + } else if (IA_ISDIR(local->loc.inode->ia_type)) { + layout = local->layout; + for (i = 0; i < layout->cnt; i++) { + if (layout->list[i].xlator == prev) { + layout->list[i].err = op_errno; + break; } - goto unlock; + } + + local->need_selfheal = 1; } + } - if (stbuf->ia_type != local->inode->ia_type) { - gf_log (this->name, GF_LOG_INFO, - "mismatching filetypes 0%o v/s 0%o for %s", - (stbuf->ia_type), (local->inode->ia_type), - local->loc.path); + /* The GFID is missing on this subvol. Lookup everywhere to force a + * gfid heal + */ + if ((op_errno == ENODATA) && + (IA_ISDIR(local->loc.inode->ia_type))) { + local->need_lookup_everywhere = 1; + } - local->op_ret = -1; - local->op_errno = EINVAL; + goto unlock; + } - goto unlock; - } + if ((!IA_ISINVAL(local->inode->ia_type)) && + stbuf->ia_type != local->inode->ia_type) { + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_FILE_TYPE_MISMATCH, + "mismatching filetypes 0%o v/s 0%o for %s," + " gfid = %s", + (stbuf->ia_type), (local->inode->ia_type), local->loc.path, + gfid); - layout = local->layout; + local->op_ret = -1; + local->op_errno = EINVAL; - is_dir = check_is_dir (inode, stbuf, xattr); - is_linkfile = check_is_linkfile (inode, stbuf, xattr, - conf->link_xattr_name); + goto unlock; + } - if (is_linkfile) { - gf_log (this->name, GF_LOG_INFO, - "linkfile found in revalidate for %s", - local->loc.path); - local->return_estale = 1; + layout = local->layout; - goto unlock; + is_dir = check_is_dir(inode, stbuf, xattr); + is_linkfile = check_is_linkfile(inode, stbuf, xattr, + conf->link_xattr_name); + if (is_linkfile) { + follow_link = 1; + goto unlock; + } + if (is_dir) { + ret = dht_dir_has_layout(xattr, conf->xattr_name); + if (ret >= 0) { + if (is_greater_time(local->prebuf.ia_ctime, + local->prebuf.ia_ctime_nsec, + stbuf->ia_ctime, stbuf->ia_ctime_nsec)) { + /* Choose source */ + local->prebuf.ia_gid = stbuf->ia_gid; + local->prebuf.ia_uid = stbuf->ia_uid; + + local->prebuf.ia_ctime = stbuf->ia_ctime; + local->prebuf.ia_ctime_nsec = stbuf->ia_ctime_nsec; + + if (__is_root_gfid(stbuf->ia_gfid)) + local->prebuf.ia_prot = stbuf->ia_prot; } + } + + if (local->stbuf.ia_type != IA_INVAL) { + if ((local->stbuf.ia_gid != stbuf->ia_gid) || + (local->stbuf.ia_uid != stbuf->ia_uid) || + is_permission_different(&local->stbuf.ia_prot, + &stbuf->ia_prot)) { + local->need_attrheal = 1; + } + } + + if (!dict_get(xattr, conf->mds_xattr_key)) { + gf_msg_debug(this->name, 0, + "%s: internal xattr %s is not present" + " on subvol %s(gfid is %s)", + local->loc.path, conf->mds_xattr_key, prev->name, + gfid); + } else { + check_mds = dht_dict_get_array(xattr, conf->mds_xattr_key, + mds_xattr_val, 1, &errst); + local->mds_subvol = prev; + local->mds_stbuf.ia_gid = stbuf->ia_gid; + local->mds_stbuf.ia_uid = stbuf->ia_uid; + local->mds_stbuf.ia_prot = stbuf->ia_prot; + + /* save mds subvol on inode ctx */ + ret = dht_inode_ctx_mdsvol_set(local->inode, this, prev); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, + DHT_MSG_SET_INODE_CTX_FAILED, + "Failed to set MDS subvol for %s vol is %s", + local->loc.path, prev->name); + } + if ((check_mds < 0) && !errst) { + /* Check if xattrs need to be healed on the directory + */ + local->mds_xattr = dict_ref(xattr); + gf_msg_debug(this->name, 0, + "Value of %s is not zero on " + "hashed subvol so xattr needs to" + " be healed on non hashed" + " path is %s and vol name is %s " + " gfid is %s", + conf->mds_xattr_key, local->loc.path, + prev->name, gfid); + local->need_xattr_heal = 1; + } + } + ret = dht_layout_dir_mismatch(this, layout, prev, &local->loc, + xattr); + if (ret != 0) { + /* In memory layout does not match on-disk layout. + */ + gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LAYOUT_MISMATCH, + "Mismatching layouts for %s, gfid = %s", local->loc.path, + gfid); - if (is_dir) { - ret = dht_dir_has_layout (xattr, conf->xattr_name); - if (ret >= 0) { - if (is_greater_time(local->stbuf.ia_ctime, - local->stbuf.ia_ctime_nsec, - stbuf->ia_ctime, - stbuf->ia_ctime_nsec)) { - local->prebuf.ia_gid = stbuf->ia_gid; - local->prebuf.ia_uid = stbuf->ia_uid; - } - } - if (local->stbuf.ia_type != IA_INVAL) - { - if ((local->stbuf.ia_gid != stbuf->ia_gid) || - (local->stbuf.ia_uid != stbuf->ia_uid)) { - local->need_selfheal = 1; - } - } - ret = dht_layout_dir_mismatch (this, layout, - prev->this, &local->loc, - xattr); - if (ret != 0) { - gf_log (this->name, GF_LOG_INFO, - "mismatching layouts for %s", - local->loc.path); - - local->layout_mismatch = 1; + local->layout_mismatch = 1; - goto unlock; - } - } + goto unlock; + } + } - dht_iatt_merge (this, &local->stbuf, stbuf, prev->this); - dht_iatt_merge (this, &local->postparent, postparent, - prev->this); + gf_uuid_copy(local->stbuf.ia_gfid, stbuf->ia_gfid); + dht_iatt_merge(this, &local->stbuf, stbuf); + dht_iatt_merge(this, &local->postparent, postparent); - local->op_ret = 0; + local->op_ret = 0; - if (!local->xattr) { - local->xattr = dict_ref (xattr); - } else if (is_dir) { - dht_aggregate_xattr (local->xattr, xattr); - } + if (!local->xattr) { + local->xattr = dict_ref(xattr); + } else if (is_dir) { + dht_aggregate_xattr(local->xattr, xattr); } + } unlock: - UNLOCK (&frame->lock); -out: - this_call_cnt = dht_frame_return (frame); - - if (is_last_call (this_call_cnt)) { - if (!IA_ISDIR (local->stbuf.ia_type) - && (local->hashed_subvol != local->cached_subvol) - && (local->stbuf.ia_nlink == 1) - && (conf && conf->unhashed_sticky_bit)) { - local->stbuf.ia_prot.sticky = 1; - } - if (local->need_selfheal) { - local->need_selfheal = 0; - uuid_copy (local->gfid, local->stbuf.ia_gfid); - local->stbuf.ia_gid = local->prebuf.ia_gid; - local->stbuf.ia_uid = local->prebuf.ia_uid; - copy = create_frame (this, this->ctx->pool); - if (copy) { - copy_local = dht_local_init (copy, &local->loc, - NULL, 0); - if (!copy_local) - goto cont; - copy_local->stbuf = local->stbuf; - copy->local = copy_local; - FRAME_SU_DO (copy, dht_local_t); - ret = synctask_new (this->ctx->env, - dht_dir_attr_heal, - dht_dir_attr_heal_done, - copy, copy); - } - } -cont: - if (local->layout_mismatch) { - /* Found layout mismatch in the directory, need to - fix this in the inode context */ - dht_layout_unref (this, local->layout); - local->layout = NULL; - dht_lookup_directory (frame, this, &local->loc); - return 0; - } + UNLOCK(&frame->lock); - if (local->need_lookup_everywhere) { - /* As the current layout gave ENOENT error, we would - need a new layout */ - dht_layout_unref (this, local->layout); - local->layout = NULL; - - /* We know that current cached subvol is no more - valid, get the new one */ - local->cached_subvol = NULL; - dht_lookup_everywhere (frame, this, &local->loc); - return 0; - } - if (local->return_estale) { - local->op_ret = -1; - local->op_errno = ESTALE; - } - - if (local->loc.parent) { - dht_inode_ctx_time_update (local->loc.parent, this, - &local->postparent, 1); - } + if (follow_link) { + /* Found a linkto file. Follow it to see if the target file exists + */ + gf_uuid_copy(local->gfid, stbuf->ia_gfid); - DHT_STRIP_PHASE1_FLAGS (&local->stbuf); - DHT_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, - local->inode, &local->stbuf, local->xattr, - &local->postparent); + subvol = dht_linkfile_subvol(this, inode, stbuf, xattr); + if (!subvol) { + op_errno = ESTALE; + local->op_ret = -1; + } else { + STACK_WIND_COOKIE(frame, dht_lookup_linkfile_cbk, subvol, subvol, + subvol->fops->lookup, &local->loc, + local->xattr_req); + return 0; } + } -err: - return ret; -} - + this_call_cnt = dht_frame_return(frame); -int -dht_lookup_linkfile_create_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *stbuf, - struct iatt *preparent, struct iatt *postparent, - dict_t *xdata) -{ - dht_local_t *local = NULL; - xlator_t *cached_subvol = NULL; - dht_conf_t *conf = NULL; - int ret = -1; - - GF_VALIDATE_OR_GOTO ("dht", frame, out); - GF_VALIDATE_OR_GOTO ("dht", this, out); - GF_VALIDATE_OR_GOTO ("dht", frame->local, out); - GF_VALIDATE_OR_GOTO ("dht", this->private, out); - GF_VALIDATE_OR_GOTO ("dht", cookie, out); + if (is_last_call(this_call_cnt)) { + if (!IA_ISDIR(local->stbuf.ia_type) && + (local->hashed_subvol != local->cached_subvol) && + (local->stbuf.ia_nlink == 1) && + (conf && conf->unhashed_sticky_bit)) { + local->stbuf.ia_prot.sticky = 1; + } + /* No need to call heal code if volume count is 1 + */ + if (conf->subvolume_cnt == 1) + local->need_xattr_heal = 0; + + if (IA_ISDIR(local->stbuf.ia_type)) { + /* No mds xattr found. Trigger a heal to set it */ + if (!__is_root_gfid(local->loc.inode->gfid) && + (!dict_get(local->xattr, conf->mds_xattr_key))) + local->need_selfheal = 1; + + if (dht_needs_selfheal(frame, this)) { + if (!__is_root_gfid(local->loc.inode->gfid)) { + if (local->mds_subvol) { + local->stbuf.ia_gid = local->mds_stbuf.ia_gid; + local->stbuf.ia_uid = local->mds_stbuf.ia_uid; + local->stbuf.ia_prot = local->mds_stbuf.ia_prot; + } + } else { + local->stbuf.ia_gid = local->prebuf.ia_gid; + local->stbuf.ia_uid = local->prebuf.ia_uid; + local->stbuf.ia_prot = local->prebuf.ia_prot; + } - local = frame->local; - cached_subvol = local->cached_subvol; - conf = this->private; + layout = local->layout; + dht_selfheal_directory(frame, dht_lookup_selfheal_cbk, + &local->loc, layout); + return 0; + } + } - ret = dht_layout_preset (this, local->cached_subvol, local->loc.inode); - if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "failed to set layout for subvolume %s", - cached_subvol ? cached_subvol->name : "<nil>"); - local->op_ret = -1; - local->op_errno = EINVAL; - goto unwind; + if (local->layout_mismatch) { + /* Found layout mismatch in the directory, need to + fix this in the inode context */ + dht_layout_unref(this, local->layout); + local->layout = NULL; + dht_lookup_directory(frame, this, &local->loc); + return 0; } - local->op_ret = 0; - if ((local->stbuf.ia_nlink == 1) - && (conf && conf->unhashed_sticky_bit)) { - local->stbuf.ia_prot.sticky = 1; + if (local->need_lookup_everywhere) { + /* As the current layout gave ENOENT error, we would + need a new layout */ + dht_layout_unref(this, local->layout); + local->layout = NULL; + + /* We know that current cached subvol is no longer + valid, get the new one */ + local->cached_subvol = NULL; + if (local->xattr_req) { + if (!gf_uuid_is_null(local->gfid)) { + ret = dict_set_static_bin(local->xattr_req, "gfid-req", + local->gfid, 16); + } + } + + dht_lookup_everywhere(frame, this, &local->loc); + return 0; + } + if (local->return_estale) { + local->op_ret = -1; + local->op_errno = ESTALE; } if (local->loc.parent) { - dht_inode_ctx_time_update (local->loc.parent, this, - postparent, 1); + dht_inode_ctx_time_update(local->loc.parent, this, + &local->postparent, 1); } + DHT_STRIP_PHASE1_FLAGS(&local->stbuf); + dht_set_fixed_dir_stat(&local->postparent); + + /* local->stbuf is updated only from subvols which have a layout + * The reason is to avoid choosing attr heal source from newly + * added bricks. In case e.g we have only one subvol and for + * some reason layout is not present on it, then local->stbuf + * will be EINVAL. This is an indication that the subvols + * active in the cluster do not have layouts on disk. + * Unwind with ESTALE to trigger a fresh lookup */ + if (is_dir && local->stbuf.ia_type == IA_INVAL) { + local->op_ret = -1; + local->op_errno = ESTALE; + } + /* Delete mds xattr at the time of STACK UNWIND */ + if (local->xattr) + GF_REMOVE_INTERNAL_XATTR(conf->mds_xattr_key, local->xattr); + + DHT_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno, + local->inode, &local->stbuf, local->xattr, + &local->postparent); + } + +err: + return ret; +} + +static int +dht_lookup_linkfile_create_cbk(call_frame_t *frame, void *cooie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *stbuf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + dht_local_t *local = NULL; + xlator_t *cached_subvol = NULL; + dht_conf_t *conf = NULL; + int ret = -1; + char gfid[GF_UUID_BUF_SIZE] = {0}; + + GF_VALIDATE_OR_GOTO("dht", frame, out); + GF_VALIDATE_OR_GOTO("dht", this, out); + GF_VALIDATE_OR_GOTO("dht", frame->local, out); + GF_VALIDATE_OR_GOTO("dht", this->private, out); + + local = frame->local; + cached_subvol = local->cached_subvol; + conf = this->private; + + gf_uuid_unparse(local->loc.gfid, gfid); + + if (local->locked) + dht_unlock_namespace(frame, &local->lock[0]); + + ret = dht_layout_preset(this, local->cached_subvol, local->loc.inode); + if (ret < 0) { + gf_msg_debug(this->name, EINVAL, + "Failed to set layout for subvolume %s, " + "(gfid = %s)", + cached_subvol ? cached_subvol->name : "<nil>", gfid); + local->op_ret = -1; + local->op_errno = EINVAL; + goto unwind; + } + + local->op_ret = 0; + if ((local->stbuf.ia_nlink == 1) && (conf && conf->unhashed_sticky_bit)) { + local->stbuf.ia_prot.sticky = 1; + } + + if (local->loc.parent) { + dht_inode_ctx_time_update(local->loc.parent, this, postparent, 1); + } + unwind: - if (local->linked == _gf_true) - dht_linkfile_attr_heal (frame, this); + gf_msg_debug(this->name, 0, + "creation of linkto on hashed subvol:%s, " + "returned with op_ret %d and op_errno %d: %s", + local->hashed_subvol->name, op_ret, op_errno, + uuid_utoa(local->loc.gfid)); + + if (local->linked == _gf_true) + dht_linkfile_attr_heal(frame, this); + + dht_set_fixed_dir_stat(&local->postparent); - DHT_STRIP_PHASE1_FLAGS (&local->stbuf); - DHT_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, - local->inode, &local->stbuf, local->xattr, - &local->postparent); + DHT_STRIP_PHASE1_FLAGS(&local->stbuf); + DHT_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno, + local->inode, &local->stbuf, local->xattr, + &local->postparent); out: - return ret; + return ret; } - -int -dht_lookup_everywhere_done (call_frame_t *frame, xlator_t *this) +static int +dht_lookup_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - int ret = 0; - dht_local_t *local = NULL; - xlator_t *hashed_subvol = NULL; - xlator_t *cached_subvol = NULL; - dht_layout_t *layout = NULL; + int this_call_cnt = 0; + dht_local_t *local = NULL; + const char *path = NULL; - local = frame->local; - hashed_subvol = local->hashed_subvol; - cached_subvol = local->cached_subvol; + local = (dht_local_t *)frame->local; + path = local->loc.path; + FRAME_SU_UNDO(frame, dht_local_t); - if (local->file_count && local->dir_count) { - gf_log (this->name, GF_LOG_ERROR, - "path %s exists as a file on one subvolume " - "and directory on another. " - "Please fix it manually", - local->loc.path); - DHT_STACK_UNWIND (lookup, frame, -1, EIO, NULL, NULL, NULL, - NULL); - return 0; - } + gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_UNLINK_LOOKUP_INFO, + "lookup_unlink returned with " + "op_ret -> %d and op-errno -> %d for %s", + op_ret, op_errno, ((path == NULL) ? "null" : path)); - if (local->dir_count) { - dht_lookup_directory (frame, this, &local->loc); - return 0; - } + this_call_cnt = dht_frame_return(frame); + if (is_last_call(this_call_cnt)) { + dht_lookup_everywhere_done(frame, this); + } - if (!cached_subvol) { - DHT_STACK_UNWIND (lookup, frame, -1, ENOENT, NULL, NULL, NULL, - NULL); - return 0; - } + return 0; +} - if (local->need_lookup_everywhere) { - if (uuid_compare (local->gfid, local->inode->gfid)) { - /* GFID different, return error */ - DHT_STACK_UNWIND (lookup, frame, -1, ENOENT, NULL, - NULL, NULL, NULL); - return 0; - } - local->op_ret = 0; - local->op_errno = 0; - layout = dht_layout_for_subvol (this, cached_subvol); - if (!layout) { - gf_log (this->name, GF_LOG_INFO, - "%s: no pre-set layout for subvolume %s", - local->loc.path, (cached_subvol ? - cached_subvol->name : - "<nil>")); - } +static int +dht_lookup_unlink_of_false_linkto_cbk(call_frame_t *frame, void *cookie, + xlator_t *this, int op_ret, int op_errno, + struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + int this_call_cnt = 0; + dht_local_t *local = NULL; + const char *path = NULL; - ret = dht_layout_set (this, local->inode, layout); - if (ret < 0) { - gf_log (this->name, GF_LOG_INFO, - "%s: failed to set layout for subvol %s", - local->loc.path, (cached_subvol ? - cached_subvol->name : - "<nil>")); - } + local = (dht_local_t *)frame->local; + path = local->loc.path; - if (local->loc.parent) { - dht_inode_ctx_time_update (local->loc.parent, this, - &local->postparent, 1); - } + FRAME_SU_UNDO(frame, dht_local_t); - DHT_STRIP_PHASE1_FLAGS (&local->stbuf); - DHT_STACK_UNWIND (lookup, frame, local->op_ret, - local->op_errno, local->inode, - &local->stbuf, local->xattr, - &local->postparent); - return 0; - } + gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_UNLINK_LOOKUP_INFO, + "lookup_unlink returned with " + "op_ret -> %d and op-errno -> %d for %s", + op_ret, op_errno, ((path == NULL) ? "null" : path)); - if (!hashed_subvol) { - gf_log (this->name, GF_LOG_INFO, - "cannot create linkfile file for %s on %s: " - "hashed subvolume cannot be found.", - local->loc.path, cached_subvol->name); + this_call_cnt = dht_frame_return(frame); + if (is_last_call(this_call_cnt)) { + if ((op_ret == 0) || ((op_errno != EBUSY) && (op_errno != ENOTCONN))) { + dht_lookup_everywhere_done(frame, this); + } else { + /*When dht_lookup_everywhere is performed, one cached + *and one hashed file was found and hashed file does + *not point to the above mentioned cached node. So it + *was considered as stale and an unlink was performed. + *But unlink fails. So may be rebalance is in progress. + *now ideally we have two data-files. One obtained during + *lookup_everywhere and one where unlink-failed. So + *at this point in time we cannot decide which one to + *choose because there are chances of first cached + *file is truncated after rebalance and if it is chosen + *as cached node, application will fail. So return EIO.*/ + + if (op_errno == EBUSY) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, + DHT_MSG_UNLINK_FAILED, + "Could not unlink the linkto file as " + "either fd is open and/or linkto xattr " + "is set for %s", + ((path == NULL) ? "null" : path)); + } + DHT_STACK_UNWIND(lookup, frame, -1, EIO, NULL, NULL, NULL, NULL); + } + } + + return 0; +} - local->op_ret = 0; - local->op_errno = 0; +static int +dht_lookup_unlink_stale_linkto_cbk(call_frame_t *frame, void *cookie, + xlator_t *this, int op_ret, int op_errno, + struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + dht_local_t *local = NULL; + const char *path = NULL; - ret = dht_layout_preset (frame->this, cached_subvol, - local->inode); - if (ret < 0) { - gf_log (this->name, GF_LOG_INFO, - "failed to set layout for subvol %s", - cached_subvol ? cached_subvol->name : - "<nil>"); - local->op_ret = -1; - local->op_errno = EINVAL; - } + /* NOTE: + * If stale file unlink fails either there is an open-fd or is not an + * dht-linkto-file then posix_unlink returns EBUSY, which is overwritten + * to ENOENT + */ - if (local->loc.parent) { - dht_inode_ctx_time_update (local->loc.parent, this, - &local->postparent, 1); - } + local = frame->local; - DHT_STRIP_PHASE1_FLAGS (&local->stbuf); - DHT_STACK_UNWIND (lookup, frame, local->op_ret, - local->op_errno, local->inode, - &local->stbuf, local->xattr, - &local->postparent); - return 0; - } + if (local) { + FRAME_SU_UNDO(frame, dht_local_t); + if (local->loc.path) + path = local->loc.path; + } - gf_log (this->name, GF_LOG_DEBUG, - "linking file %s existing on %s to %s (hash)", - local->loc.path, cached_subvol->name, - hashed_subvol->name); + gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_UNLINK_LOOKUP_INFO, + "Returned with op_ret %d and " + "op_errno %d for %s", + op_ret, op_errno, ((path == NULL) ? "null" : path)); - ret = dht_linkfile_create (frame, - dht_lookup_linkfile_create_cbk, this, - cached_subvol, hashed_subvol, &local->loc); + DHT_STACK_UNWIND(lookup, frame, -1, ENOENT, NULL, NULL, NULL, NULL); - return ret; + return 0; } - -int -dht_lookup_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, - struct iatt *preparent, struct iatt *postparent, - dict_t *xdata) +static int +dht_fill_dict_to_avoid_unlink_of_migrating_file(dict_t *dict) { - int this_call_cnt = 0; + int ret = 0; - this_call_cnt = dht_frame_return (frame); - if (is_last_call (this_call_cnt)) { - dht_lookup_everywhere_done (frame, this); - } + ret = dict_set_int32_sizen(dict, DHT_SKIP_NON_LINKTO_UNLINK, 1); - return 0; -} + if (ret) + return -1; + ret = dict_set_int32_sizen(dict, DHT_SKIP_OPEN_FD_UNLINK, 1); -int -dht_lookup_everywhere_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *buf, dict_t *xattr, - struct iatt *postparent) -{ - dht_local_t *local = NULL; - int this_call_cnt = 0; - call_frame_t *prev = NULL; - int is_linkfile = 0; - int is_dir = 0; - xlator_t *subvol = NULL; - loc_t *loc = NULL; - xlator_t *link_subvol = NULL; - int ret = -1; - int32_t fd_count = 0; - dht_conf_t *conf = NULL; - - GF_VALIDATE_OR_GOTO ("dht", frame, out); - GF_VALIDATE_OR_GOTO ("dht", this, out); - GF_VALIDATE_OR_GOTO ("dht", frame->local, out); - GF_VALIDATE_OR_GOTO ("dht", cookie, out); - GF_VALIDATE_OR_GOTO ("dht", this->private, out); - - local = frame->local; - loc = &local->loc; - conf = this->private; - - prev = cookie; - subvol = prev->this; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - if (op_errno != ENOENT) - local->op_errno = op_errno; - goto unlock; - } + if (ret) + return -1; - if (uuid_is_null (local->gfid)) - uuid_copy (local->gfid, buf->ia_gfid); + return 0; +} - if (uuid_compare (local->gfid, buf->ia_gfid)) { - gf_log (this->name, GF_LOG_WARNING, - "%s: gfid differs on subvolume %s", - loc->path, prev->this->name); - } +static int32_t +dht_linkfile_create_lookup_cbk(call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + inode_t *inode, struct iatt *buf, dict_t *xdata, + struct iatt *postparent) +{ + dht_local_t *local = NULL; + int call_cnt = 0, ret = 0; + xlator_t *subvol = NULL; + uuid_t gfid = { + 0, + }; + char gfid_str[GF_UUID_BUF_SIZE] = {0}; + + subvol = cookie; + local = frame->local; + + if (subvol == local->hashed_subvol) { + if ((op_ret == 0) || (op_errno != ENOENT)) + local->dont_create_linkto = _gf_true; + } else { + if (gf_uuid_is_null(local->gfid)) + gf_uuid_copy(gfid, local->loc.gfid); + else + gf_uuid_copy(gfid, local->gfid); + + if ((op_ret == 0) && gf_uuid_compare(gfid, buf->ia_gfid)) { + gf_uuid_unparse(gfid, gfid_str); + gf_msg_debug(this->name, 0, + "gfid (%s) different on cached subvol " + "(%s) and looked up inode (%s), not " + "creating linkto", + uuid_utoa(buf->ia_gfid), subvol->name, gfid_str); + local->dont_create_linkto = _gf_true; + } else if (op_ret == -1) { + local->dont_create_linkto = _gf_true; + } + } + + call_cnt = dht_frame_return(frame); + if (is_last_call(call_cnt)) { + if (local->dont_create_linkto) + goto no_linkto; + else { + gf_msg_debug(this->name, 0, + "Creating linkto file on %s(hash) to " + "%s on %s (gfid = %s)", + local->hashed_subvol->name, local->loc.path, + local->cached_subvol->name, gfid_str); + + ret = dht_linkfile_create(frame, dht_lookup_linkfile_create_cbk, + this, local->cached_subvol, + local->hashed_subvol, &local->loc); + + if (ret < 0) + goto no_linkto; + } + } + + return 0; + +no_linkto: + gf_msg_debug(this->name, 0, + "skipped linkto creation (path:%s) (gfid:%s) " + "(hashed-subvol:%s) (cached-subvol:%s)", + local->loc.path, gfid_str, local->hashed_subvol->name, + local->cached_subvol->name); + + dht_lookup_linkfile_create_cbk(frame, NULL, this, 0, 0, local->loc.inode, + &local->stbuf, &local->preparent, + &local->postparent, local->xattr); + return 0; +} - is_linkfile = check_is_linkfile (inode, buf, xattr, - conf->link_xattr_name); - is_dir = check_is_dir (inode, buf, xattr); - - if (is_linkfile) { - link_subvol = dht_linkfile_subvol (this, inode, buf, - xattr); - gf_log (this->name, GF_LOG_DEBUG, - "found on %s linkfile %s (-> %s)", - subvol->name, loc->path, - link_subvol ? link_subvol->name : "''"); - goto unlock; - } +static int32_t +dht_call_lookup_linkfile_create(call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xdata) +{ + dht_local_t *local = NULL; + char gfid[GF_UUID_BUF_SIZE] = {0}; + int i = 0; + xlator_t *subvol = NULL; - /* non linkfile GFID takes precedence */ - uuid_copy (local->gfid, buf->ia_gfid); + local = frame->local; + if (gf_uuid_is_null(local->gfid)) + gf_uuid_unparse(local->loc.gfid, gfid); + else + gf_uuid_unparse(local->gfid, gfid); - if (is_dir) { - local->dir_count++; + if (op_ret < 0) { + gf_log(this->name, GF_LOG_WARNING, + "protecting namespace failed, skipping linkto " + "creation (path:%s)(gfid:%s)(hashed-subvol:%s)" + "(cached-subvol:%s)", + local->loc.path, gfid, local->hashed_subvol->name, + local->cached_subvol->name); + goto err; + } - gf_log (this->name, GF_LOG_DEBUG, - "found on %s directory %s", - subvol->name, loc->path); - } else { - local->file_count++; - - if (!local->cached_subvol) { - /* found one file */ - dht_iatt_merge (this, &local->stbuf, buf, - subvol); - local->xattr = dict_ref (xattr); - local->cached_subvol = subvol; - gf_log (this->name, GF_LOG_DEBUG, - "found on %s file %s", - subvol->name, loc->path); - - dht_iatt_merge (this, &local->postparent, - postparent, subvol); - } else { - /* This is where we need 'rename' both entries logic */ - gf_log (this->name, GF_LOG_WARNING, - "multiple subvolumes (%s and %s) have " - "file %s (preferably rename the file " - "in the backend, and do a fresh lookup)", - local->cached_subvol->name, - subvol->name, local->loc.path); - } - } - } -unlock: - UNLOCK (&frame->lock); + local->locked = _gf_true; - if (is_linkfile) { - ret = dict_get_int32 (xattr, GLUSTERFS_OPEN_FD_COUNT, &fd_count); - /* Delete the linkfile only if there are no open fds on it. - if there is a open-fd, it may be in migration */ - if (!ret && (fd_count == 0)) { - gf_log (this->name, GF_LOG_INFO, - "deleting stale linkfile %s on %s", - loc->path, subvol->name); - STACK_WIND (frame, dht_lookup_unlink_cbk, - subvol, subvol->fops->unlink, loc, 0, NULL); - return 0; - } - } + local->call_cnt = 2; - this_call_cnt = dht_frame_return (frame); - if (is_last_call (this_call_cnt)) { - dht_lookup_everywhere_done (frame, this); - } + for (i = 0; i < 2; i++) { + subvol = (subvol == NULL) ? local->hashed_subvol : local->cached_subvol; -out: - return ret; + STACK_WIND_COOKIE(frame, dht_linkfile_create_lookup_cbk, subvol, subvol, + subvol->fops->lookup, &local->loc, NULL); + } + + return 0; + +err: + dht_lookup_linkfile_create_cbk(frame, NULL, this, 0, 0, local->loc.inode, + &local->stbuf, &local->preparent, + &local->postparent, local->xattr); + return 0; } +/* Rebalance is performed from cached_node to hashed_node. Initial cached_node + * contains a non-linkto file. After migration it is converted to linkto and + * then unlinked. And at hashed_subvolume, first a linkto file is present, + * then after migration it is converted to a non-linkto file. + * + * Lets assume a file is present on cached subvolume and a new brick is added + * and new brick is the new_hashed subvolume. So fresh lookup on newly added + * hashed subvolume will fail and dht_lookup_everywhere gets called. If just + * before sending the dht_lookup_everywhere request rebalance is in progress, + * + * from cached subvolume it may see: Nonlinkto or linkto or No file + * from hashed subvolume it may see: No file or linkto file or non-linkto file + * + * So this boils down to 9 cases: + * at cached_subvol at hashed_subvol + * ---------------- ----------------- + * + *a) No file No file + * [request reached after [Request reached before + * migration] Migration] + * + *b) No file Linkto File + * + *c) No file Non-Linkto File + * + *d) Linkto No-File + * + *e) Linkto Linkto + * + *f) Linkto Non-Linkto + * + *g) NonLinkto No-File + * + *h) NonLinkto Linkto + * + *i) NonLinkto NonLinkto + * + * dht_lookup_everywhere_done takes decision based on any of the above case + */ -int -dht_lookup_everywhere (call_frame_t *frame, xlator_t *this, loc_t *loc) +static int +dht_lookup_everywhere_done(call_frame_t *frame, xlator_t *this) { - dht_conf_t *conf = NULL; - dht_local_t *local = NULL; - int i = 0; - int call_cnt = 0; + int ret = 0; + dht_local_t *local = NULL; + xlator_t *hashed_subvol = NULL; + xlator_t *cached_subvol = NULL; + dht_layout_t *layout = NULL; + char gfid[GF_UUID_BUF_SIZE] = {0}; + gf_boolean_t found_non_linkto_on_hashed = _gf_false; + + local = frame->local; + hashed_subvol = local->hashed_subvol; + cached_subvol = local->cached_subvol; + + gf_uuid_unparse(local->loc.gfid, gfid); + + if (local->file_count && local->dir_count) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_FILE_TYPE_MISMATCH, + "path %s (gfid = %s)exists as a file on one " + "subvolume and directory on another. " + "Please fix it manually", + local->loc.path, gfid); + DHT_STACK_UNWIND(lookup, frame, -1, EIO, NULL, NULL, NULL, NULL); + return 0; + } + if (local->op_ret && local->gfid_missing) { + if (gf_uuid_is_null(local->gfid_req)) { + DHT_STACK_UNWIND(lookup, frame, -1, ENODATA, NULL, NULL, NULL, + NULL); + return 0; + } + /* A hack */ + dht_lookup_directory(frame, this, &local->loc); + return 0; + } - GF_VALIDATE_OR_GOTO ("dht", frame, err); - GF_VALIDATE_OR_GOTO ("dht", this, out); - GF_VALIDATE_OR_GOTO ("dht", frame->local, out); - GF_VALIDATE_OR_GOTO ("dht", this->private, out); - GF_VALIDATE_OR_GOTO ("dht", loc, out); + if (local->dir_count) { + dht_lookup_directory(frame, this, &local->loc); + return 0; + } + + gf_msg_debug(this->name, 0, + "STATUS: hashed_subvol %s " + "cached_subvol %s", + (hashed_subvol == NULL) ? "null" : hashed_subvol->name, + (cached_subvol == NULL) ? "null" : cached_subvol->name); + + if (!cached_subvol) { + if (local->skip_unlink.handle_valid_link && hashed_subvol) { + /*Purpose of "DHT_SKIP_NON_LINKTO_UNLINK": + * If this lookup is performed by rebalance and this + * rebalance process detected hashed file and by + * the time it sends the lookup request to cached node, + * file got migrated and now at initial hashed_node, + * final migrated file is present. With current logic, + * because this process fails to find the cached_node, + * it will unlink the file at initial hashed_node. + * + * So we avoid this by setting key, and checking at the + * posix_unlink that unlink the file only if file is a + * linkto file and not a migrated_file. + */ + + ret = dht_fill_dict_to_avoid_unlink_of_migrating_file( + local->xattr_req); + + if (ret) { + /* If for some reason, setting key in the dict + * fails, return with ENOENT, as with respect to + * this process, it detected only a stale link + * file. + * + * Next lookup will delete it. + * + * Performing deletion of stale link file when + * setting key in dict fails, may cause the data + * loss because of the above mentioned race. + */ - conf = this->private; - local = frame->local; + DHT_STACK_UNWIND(lookup, frame, -1, ENOENT, NULL, NULL, NULL, + NULL); + } else { + local->skip_unlink.handle_valid_link = _gf_false; + + gf_msg_debug(this->name, 0, + "No Cached was found and " + "unlink on hashed was skipped" + " so performing now: %s", + local->loc.path); + FRAME_SU_DO(frame, dht_local_t); + STACK_WIND(frame, dht_lookup_unlink_stale_linkto_cbk, + hashed_subvol, hashed_subvol->fops->unlink, + &local->loc, 0, local->xattr_req); + } - call_cnt = conf->subvolume_cnt; - local->call_cnt = call_cnt; + } else { + gf_msg_debug(this->name, 0, + "There was no cached file and " + "unlink on hashed is not skipped %s", + local->loc.path); - if (!local->inode) - local->inode = inode_ref (loc->inode); + DHT_STACK_UNWIND(lookup, frame, -1, ENOENT, NULL, NULL, NULL, NULL); + } + return 0; + } - for (i = 0; i < call_cnt; i++) { - STACK_WIND (frame, dht_lookup_everywhere_cbk, - conf->subvolumes[i], - conf->subvolumes[i]->fops->lookup, - loc, local->xattr_req); + /* At the time of dht_lookup, no file was found on hashed and that is + * why dht_lookup_everywhere is called, but by the time + * dht_lookup_everywhere + * reached to server, file might have already migrated. In that case we + * will find a migrated file at the hashed_node. In this case store the + * layout in context and return successfully. + */ + + if (hashed_subvol || local->need_lookup_everywhere) { + if (local->need_lookup_everywhere) { + found_non_linkto_on_hashed = _gf_true; + + } else if ((local->file_count == 1) && + (hashed_subvol == cached_subvol)) { + gf_msg_debug(this->name, 0, + "found cached file on hashed subvolume " + "so store in context and return for %s", + local->loc.path); + + found_non_linkto_on_hashed = _gf_true; } - return 0; -out: - DHT_STACK_UNWIND (lookup, frame, -1, EINVAL, NULL, NULL, NULL, NULL); -err: - return -1; -} + if (found_non_linkto_on_hashed) + goto preset_layout; + } + if (hashed_subvol) { + if (local->skip_unlink.handle_valid_link == _gf_true) { + if (cached_subvol == local->skip_unlink.hash_links_to) { + if (gf_uuid_compare(local->skip_unlink.cached_gfid, + local->skip_unlink.hashed_gfid)) { + /*GFID different, return error*/ + DHT_STACK_UNWIND(lookup, frame, -1, ESTALE, NULL, NULL, + NULL, NULL); -int -dht_lookup_linkfile_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int op_ret, int op_errno, - inode_t *inode, struct iatt *stbuf, dict_t *xattr, - struct iatt *postparent) -{ - call_frame_t *prev = NULL; - dht_local_t *local = NULL; - xlator_t *subvol = NULL; - loc_t *loc = NULL; - dht_conf_t *conf = NULL; - int ret = 0; - - GF_VALIDATE_OR_GOTO ("dht", frame, out); - GF_VALIDATE_OR_GOTO ("dht", this, unwind); - GF_VALIDATE_OR_GOTO ("dht", frame->local, unwind); - GF_VALIDATE_OR_GOTO ("dht", this->private, unwind); - GF_VALIDATE_OR_GOTO ("dht", cookie, unwind); - - prev = cookie; - subvol = prev->this; - conf = this->private; - local = frame->local; - loc = &local->loc; + return 0; + } - if (op_ret == -1) { - gf_log (this->name, GF_LOG_INFO, - "lookup of %s on %s (following linkfile) failed (%s)", - local->loc.path, subvol->name, strerror (op_errno)); + ret = dht_layout_preset(this, cached_subvol, local->loc.inode); + if (ret) { + gf_msg(this->name, GF_LOG_INFO, 0, + DHT_MSG_LAYOUT_PRESET_FAILED, + "Could not set pre-set layout " + "for subvolume %s", + cached_subvol->name); + } - /* If cached subvol returned ENOTCONN, do not do - lookup_everywhere. We need to make sure linkfile does not get - removed, which can take away the namespace, and subvol is - anyways down. */ + local->op_ret = (ret == 0) ? ret : -1; + local->op_errno = (ret == 0) ? ret : EINVAL; - if (op_errno != ENOTCONN) - goto err; - else - goto unwind; - } + /* Presence of local->cached_subvol validates + * that lookup from cached node is successful + */ - if (check_is_dir (inode, stbuf, xattr)) { - gf_log (this->name, GF_LOG_INFO, - "lookup of %s on %s (following linkfile) reached dir", - local->loc.path, subvol->name); - goto err; - } + if (!local->op_ret && local->loc.parent) { + dht_inode_ctx_time_update(local->loc.parent, this, + &local->postparent, 1); + } - if (check_is_linkfile (inode, stbuf, xattr, conf->link_xattr_name)) { - gf_log (this->name, GF_LOG_INFO, - "lookup of %s on %s (following linkfile) reached link", - local->loc.path, subvol->name); - goto err; + gf_msg_debug(this->name, 0, + "Skipped unlinking linkto file " + "on the hashed subvolume. " + "Returning success as it is a " + "valid linkto file. Path:%s", + local->loc.path); + + goto unwind_hashed_and_cached; + } else { + local->skip_unlink.handle_valid_link = _gf_false; + + gf_msg_debug(this->name, 0, + "Linkto file found on hashed " + "subvol " + "and data file found on cached " + "subvolume. But linkto points to " + "different cached subvolume (%s) " + "path %s", + (local->skip_unlink.hash_links_to + ? local->skip_unlink.hash_links_to->name + : " <nil>"), + local->loc.path); + + if (local->skip_unlink.opend_fd_count == 0) { + ret = dht_fill_dict_to_avoid_unlink_of_migrating_file( + local->xattr_req); + + if (ret) { + DHT_STACK_UNWIND(lookup, frame, -1, EIO, NULL, NULL, + NULL, NULL); + } else { + local->call_cnt = 1; + FRAME_SU_DO(frame, dht_local_t); + STACK_WIND(frame, dht_lookup_unlink_of_false_linkto_cbk, + hashed_subvol, hashed_subvol->fops->unlink, + &local->loc, 0, local->xattr_req); + } + + return 0; + } + } } + } - if (uuid_compare (local->gfid, stbuf->ia_gfid)) { - gf_log (this->name, GF_LOG_WARNING, - "%s: gfid different on data file on %s", - local->loc.path, subvol->name); - goto err; +preset_layout: + + if (found_non_linkto_on_hashed) { + if (local->need_lookup_everywhere) { + if (gf_uuid_compare(local->gfid, local->inode->gfid)) { + /* GFID different, return error */ + DHT_STACK_UNWIND(lookup, frame, -1, ENOENT, NULL, NULL, NULL, + NULL); + return 0; + } } - if ((stbuf->ia_nlink == 1) - && (conf && conf->unhashed_sticky_bit)) { - stbuf->ia_prot.sticky = 1; + local->op_ret = 0; + local->op_errno = 0; + layout = dht_layout_for_subvol(this, cached_subvol); + if (!layout) { + gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_SUBVOL_INFO, + "%s: no pre-set layout for subvolume %s," + " gfid = %s", + local->loc.path, + (cached_subvol ? cached_subvol->name : "<nil>"), gfid); } - ret = dht_layout_preset (this, prev->this, inode); + ret = dht_layout_set(this, local->inode, layout); if (ret < 0) { - gf_log (this->name, GF_LOG_INFO, - "failed to set layout for subvolume %s", - prev->this->name); - op_ret = -1; - op_errno = EINVAL; + gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_SUBVOL_INFO, + "%s: failed to set layout for subvol %s, " + "gfid = %s", + local->loc.path, + (cached_subvol ? cached_subvol->name : "<nil>"), gfid); } if (local->loc.parent) { - dht_inode_ctx_time_update (local->loc.parent, this, - postparent, 1); + dht_inode_ctx_time_update(local->loc.parent, this, + &local->postparent, 1); } -unwind: - DHT_STRIP_PHASE1_FLAGS (stbuf); - DHT_STACK_UNWIND (lookup, frame, op_ret, op_errno, inode, stbuf, xattr, - postparent); - + DHT_STRIP_PHASE1_FLAGS(&local->stbuf); + dht_set_fixed_dir_stat(&local->postparent); + DHT_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno, + local->inode, &local->stbuf, local->xattr, + &local->postparent); return 0; + } -err: - dht_lookup_everywhere (frame, this, loc); -out: + if (!hashed_subvol) { + gf_msg_debug(this->name, 0, + "Cannot create linkfile for %s on %s: " + "hashed subvolume cannot be found, gfid = %s.", + local->loc.path, cached_subvol->name, gfid); + + local->op_ret = 0; + local->op_errno = 0; + + ret = dht_layout_preset(frame->this, cached_subvol, local->inode); + if (ret < 0) { + gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LAYOUT_PRESET_FAILED, + "Failed to set layout for subvol %s" + ", gfid = %s", + cached_subvol ? cached_subvol->name : "<nil>", gfid); + local->op_ret = -1; + local->op_errno = EINVAL; + } + + if (local->loc.parent) { + dht_inode_ctx_time_update(local->loc.parent, this, + &local->postparent, 1); + } + + DHT_STRIP_PHASE1_FLAGS(&local->stbuf); + dht_set_fixed_dir_stat(&local->postparent); + DHT_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno, + local->inode, &local->stbuf, local->xattr, + &local->postparent); return 0; + } + + if (frame->root->op != GF_FOP_RENAME) { + local->current = &local->lock[0]; + ret = dht_protect_namespace(frame, &local->loc, hashed_subvol, + &local->current->ns, + dht_call_lookup_linkfile_create); + } else { + gf_msg_debug(this->name, 0, + "Creating linkto file on %s(hash) to %s on %s " + "(gfid = %s)", + hashed_subvol->name, local->loc.path, cached_subvol->name, + gfid); + + ret = dht_linkfile_create(frame, dht_lookup_linkfile_create_cbk, this, + cached_subvol, hashed_subvol, &local->loc); + } + + return ret; + +unwind_hashed_and_cached: + DHT_STRIP_PHASE1_FLAGS(&local->stbuf); + dht_set_fixed_dir_stat(&local->postparent); + DHT_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno, + local->inode, &local->stbuf, local->xattr, + &local->postparent); + return 0; } - -int -dht_lookup_directory (call_frame_t *frame, xlator_t *this, loc_t *loc) +static int +dht_lookup_everywhere_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, dict_t *xattr, + struct iatt *postparent) { - int call_cnt = 0; - int i = 0; - dht_conf_t *conf = NULL; - dht_local_t *local = NULL; - int ret = 0; - - GF_VALIDATE_OR_GOTO ("dht", frame, out); - GF_VALIDATE_OR_GOTO ("dht", this, unwind); - GF_VALIDATE_OR_GOTO ("dht", frame->local, unwind); - GF_VALIDATE_OR_GOTO ("dht", this->private, unwind); - GF_VALIDATE_OR_GOTO ("dht", loc, unwind); + dht_local_t *local = NULL; + int this_call_cnt = 0; + xlator_t *prev = NULL; + int is_linkfile = 0; + int is_dir = 0; + loc_t *loc = NULL; + xlator_t *link_subvol = NULL; + int ret = -1; + int32_t fd_count = 0; + dht_conf_t *conf = NULL; + char gfid[GF_UUID_BUF_SIZE] = {0}; + dict_t *dict_req = {0}; + + GF_VALIDATE_OR_GOTO("dht", frame, out); + GF_VALIDATE_OR_GOTO("dht", this, out); + GF_VALIDATE_OR_GOTO("dht", frame->local, out); + GF_VALIDATE_OR_GOTO("dht", cookie, out); + GF_VALIDATE_OR_GOTO("dht", this->private, out); + + local = frame->local; + loc = &local->loc; + conf = this->private; + + prev = cookie; + + gf_msg_debug(this->name, 0, + "returned with op_ret %d and op_errno %d (%s) " + "from subvol %s", + op_ret, op_errno, loc->path, prev->name); + + LOCK(&frame->lock); + { + if (op_ret == -1) { + if (op_errno != ENOENT) + local->op_errno = op_errno; + if (op_errno == ENODATA) + local->gfid_missing = _gf_true; + goto unlock; + } - conf = this->private; - local = frame->local; + if (gf_uuid_is_null(local->gfid)) + gf_uuid_copy(local->gfid, buf->ia_gfid); - call_cnt = conf->subvolume_cnt; - local->call_cnt = call_cnt; + gf_uuid_unparse(local->gfid, gfid); - local->layout = dht_layout_new (this, conf->subvolume_cnt); - if (!local->layout) { - goto unwind; + if (gf_uuid_compare(local->gfid, buf->ia_gfid)) { + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_GFID_MISMATCH, + "%s: gfid differs on subvolume %s," + " gfid local = %s, gfid node = %s", + loc->path, prev->name, gfid, uuid_utoa(buf->ia_gfid)); } - if (local->xattr != NULL) { - dict_unref (local->xattr); - local->xattr = NULL; - } + is_linkfile = check_is_linkfile(inode, buf, xattr, + conf->link_xattr_name); - if (!uuid_is_null (local->gfid)) { - ret = dict_set_static_bin (local->xattr_req, "gfid-req", - local->gfid, 16); - if (ret) - gf_log (this->name, GF_LOG_WARNING, - "%s: failed to set gfid", local->loc.path); + if (is_linkfile) { + link_subvol = dht_linkfile_subvol(this, inode, buf, xattr); + gf_msg_debug(this->name, 0, "found on %s linkfile %s (-> %s)", + prev->name, loc->path, + link_subvol ? link_subvol->name : "''"); + goto unlock; } - for (i = 0; i < call_cnt; i++) { - STACK_WIND (frame, dht_lookup_dir_cbk, - conf->subvolumes[i], - conf->subvolumes[i]->fops->lookup, - &local->loc, local->xattr_req); - } - return 0; -unwind: - DHT_STACK_UNWIND (lookup, frame, -1, ENOMEM, NULL, NULL, NULL, NULL); -out: - return 0; + is_dir = check_is_dir(inode, buf, xattr); -} + /* non linkfile GFID takes precedence but don't overwrite + gfid if we have already found a cached file*/ + if (!local->cached_subvol) + gf_uuid_copy(local->gfid, buf->ia_gfid); + if (is_dir) { + local->dir_count++; -int -dht_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, - inode_t *inode, struct iatt *stbuf, dict_t *xattr, - struct iatt *postparent) -{ - char is_linkfile = 0; - char is_dir = 0; - xlator_t *subvol = NULL; - dht_conf_t *conf = NULL; - dht_local_t *local = NULL; - loc_t *loc = NULL; - call_frame_t *prev = NULL; - int ret = 0; - dht_layout_t *parent_layout = NULL; - - GF_VALIDATE_OR_GOTO ("dht", frame, err); - GF_VALIDATE_OR_GOTO ("dht", this, out); - GF_VALIDATE_OR_GOTO ("dht", frame->local, out); - GF_VALIDATE_OR_GOTO ("dht", cookie, out); - GF_VALIDATE_OR_GOTO ("dht", this->private, out); - - conf = this->private; - - prev = cookie; - local = frame->local; - loc = &local->loc; + gf_msg_debug(this->name, 0, "found on %s directory %s", prev->name, + loc->path); + } else { + local->file_count++; - /* This is required for handling stale linkfile deletion, - * or any more call which happens from this 'loc'. - */ - if (!op_ret && uuid_is_null (local->gfid)) - memcpy (local->gfid, stbuf->ia_gfid, 16); - - if (ENTRY_MISSING (op_ret, op_errno)) { - gf_log (this->name, GF_LOG_TRACE, "Entry %s missing on subvol" - " %s", loc->path, prev->this->name); - if (conf->search_unhashed == GF_DHT_LOOKUP_UNHASHED_ON) { - local->op_errno = ENOENT; - dht_lookup_everywhere (frame, this, loc); - return 0; - } - if ((conf->search_unhashed == GF_DHT_LOOKUP_UNHASHED_AUTO) && - (loc->parent)) { - ret = dht_inode_ctx_layout_get (loc->parent, this, - &parent_layout); - if (ret || !parent_layout) - goto out; - if (parent_layout->search_unhashed) { - local->op_errno = ENOENT; - dht_lookup_everywhere (frame, this, loc); - return 0; - } - } - } + gf_msg_debug(this->name, 0, "found cached file on %s for %s", + prev->name, loc->path); - if (op_ret == 0) { - is_dir = check_is_dir (inode, stbuf, xattr); - if (is_dir) { - local->inode = inode_ref (inode); - local->xattr = dict_ref (xattr); - } - } + if (!local->cached_subvol) { + /* found one file */ + dht_iatt_merge(this, &local->stbuf, buf); - if (is_dir || (op_ret == -1 && op_errno == ENOTCONN)) { - dht_lookup_directory (frame, this, &local->loc); - return 0; - } + local->xattr = dict_ref(xattr); + local->cached_subvol = prev; - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, "Lookup of %s for subvolume" - " %s failed with error %s", loc->path, prev->this->name, - strerror (op_errno)); - goto out; + gf_msg_debug(this->name, 0, + "storing cached on %s file" + " %s", + prev->name, loc->path); + + dht_iatt_merge(this, &local->postparent, postparent); + + gf_uuid_copy(local->skip_unlink.cached_gfid, buf->ia_gfid); + } else { + /* This is where we need 'rename' both entries logic */ + gf_msg(this->name, GF_LOG_WARNING, 0, + DHT_MSG_FILE_ON_MULT_SUBVOL, + "multiple subvolumes (%s and %s) have " + "file %s (preferably rename the file " + "in the backend,and do a fresh lookup)", + local->cached_subvol->name, prev->name, local->loc.path); + } } + } +unlock: + UNLOCK(&frame->lock); + + if (is_linkfile) { + ret = dict_get_int32(xattr, GLUSTERFS_OPEN_FD_COUNT, &fd_count); + + /* Any linkto file found on the non-hashed subvolume should + * be unlinked (performed in the "else if" block below) + * + * But if a linkto file is found on hashed subvolume, it may be + * pointing to valid cached node. So unlinking of linkto + * file on hashed subvolume is skipped and inside + * dht_lookup_everywhere_done, checks are performed. If this + * linkto file is found as stale linkto file, it is deleted + * otherwise unlink is skipped. + */ - is_linkfile = check_is_linkfile (inode, stbuf, xattr, - conf->link_xattr_name); + if (local->hashed_subvol && local->hashed_subvol == prev) { + local->skip_unlink.handle_valid_link = _gf_true; + local->skip_unlink.opend_fd_count = fd_count; + local->skip_unlink.hash_links_to = link_subvol; + gf_uuid_copy(local->skip_unlink.hashed_gfid, buf->ia_gfid); + + gf_msg_debug(this->name, 0, + "Found" + " one linkto file on hashed subvol %s " + "for %s: Skipping unlinking till " + "everywhere_done", + prev->name, loc->path); + + } else if (!ret && (fd_count == 0)) { + dict_req = dict_new(); + + ret = dht_fill_dict_to_avoid_unlink_of_migrating_file(dict_req); + + if (ret) { + /* Skip unlinking for dict_failure + *File is found as a linkto file on non-hashed, + *subvolume. In the current implementation, + *finding a linkto-file on non-hashed does not + *always implies that it is stale. So deletion + *of file should be done only when both fd is + *closed and linkto-xattr is set. In case of + *dict_set failure, avoid skipping of file. + *NOTE: dht_frame_return should get called for + * this block. + */ - if (!is_linkfile) { - /* non-directory and not a linkfile */ + dict_unref(dict_req); + + } else { + gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_SUBVOL_INFO, + "attempting deletion of stale linkfile " + "%s on %s (hashed subvol is %s)", + loc->path, prev->name, + (local->hashed_subvol ? local->hashed_subvol->name + : "<null>")); + /* * + * These stale files may be created using root + * user. Hence deletion will work only with + * root. + */ + FRAME_SU_DO(frame, dht_local_t); + STACK_WIND(frame, dht_lookup_unlink_cbk, prev, + prev->fops->unlink, loc, 0, dict_req); - ret = dht_layout_preset (this, prev->this, inode); - if (ret < 0) { - gf_log (this->name, GF_LOG_INFO, - "could not set pre-set layout for subvolume %s", - prev->this->name); - op_ret = -1; - op_errno = EINVAL; - goto out; - } - goto out; - } + dict_unref(dict_req); - subvol = dht_linkfile_subvol (this, inode, stbuf, xattr); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "linkfile not having link subvolume. path=%s", - loc->path); - dht_lookup_everywhere (frame, this, loc); return 0; + } } + } - STACK_WIND (frame, dht_lookup_linkfile_cbk, - subvol, subvol->fops->lookup, - &local->loc, local->xattr_req); - - return 0; + this_call_cnt = dht_frame_return(frame); + if (is_last_call(this_call_cnt)) { + dht_lookup_everywhere_done(frame, this); + } out: - /* - * FIXME: postparent->ia_size and postparent->st_blocks do not have - * correct values. since, postparent corresponds to a directory these - * two members should have values equal to sum of corresponding values - * from each of the subvolume. See dht_iatt_merge for reference. - */ + return ret; +} - if (!op_ret && local->loc.parent) { - dht_inode_ctx_time_update (local->loc.parent, this, - postparent, 1); - } +int +dht_lookup_everywhere(call_frame_t *frame, xlator_t *this, loc_t *loc) +{ + dht_conf_t *conf = NULL; + dht_local_t *local = NULL; + int i = 0; + int call_cnt = 0; + + GF_VALIDATE_OR_GOTO("dht", frame, err); + GF_VALIDATE_OR_GOTO("dht", this, out); + GF_VALIDATE_OR_GOTO("dht", frame->local, out); + GF_VALIDATE_OR_GOTO("dht", this->private, out); + GF_VALIDATE_OR_GOTO("dht", loc, out); + + conf = this->private; + local = frame->local; - DHT_STRIP_PHASE1_FLAGS (stbuf); - DHT_STACK_UNWIND (lookup, frame, op_ret, op_errno, inode, stbuf, xattr, - postparent); + call_cnt = conf->subvolume_cnt; + local->call_cnt = call_cnt; + + if (!local->inode) + local->inode = inode_ref(loc->inode); + + gf_msg_debug(this->name, 0, "winding lookup call to %d subvols", call_cnt); + + for (i = 0; i < call_cnt; i++) { + STACK_WIND_COOKIE(frame, dht_lookup_everywhere_cbk, conf->subvolumes[i], + conf->subvolumes[i], + conf->subvolumes[i]->fops->lookup, loc, + local->xattr_req); + } + + return 0; +out: + DHT_STACK_UNWIND(lookup, frame, -1, EINVAL, NULL, NULL, NULL, NULL); err: - return 0; + return -1; } -/* For directories, check if acl xattrs have been requested (by the acl xlator), - * if not, request for them. These xattrs are needed for dht dir self-heal to - * perform proper self-healing of dirs - */ -void -dht_check_and_set_acl_xattr_req (inode_t *inode, dict_t *xattr_req) +int +dht_lookup_linkfile_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, + struct iatt *stbuf, dict_t *xattr, + struct iatt *postparent) { - int ret = 0; + xlator_t *prev = NULL; + dht_local_t *local = NULL; + xlator_t *subvol = NULL; + loc_t *loc = NULL; + dht_conf_t *conf = NULL; + int ret = 0; + char gfid[GF_UUID_BUF_SIZE] = {0}; + + GF_VALIDATE_OR_GOTO("dht", frame, out); + GF_VALIDATE_OR_GOTO("dht", this, unwind); + GF_VALIDATE_OR_GOTO("dht", frame->local, unwind); + GF_VALIDATE_OR_GOTO("dht", this->private, unwind); + GF_VALIDATE_OR_GOTO("dht", cookie, unwind); + + prev = cookie; + subvol = prev; + conf = this->private; + local = frame->local; + loc = &local->loc; + + gf_uuid_unparse(loc->gfid, gfid); + + if (op_ret == -1) { + gf_msg(this->name, GF_LOG_INFO, op_errno, DHT_MSG_LINK_FILE_LOOKUP_INFO, + "Lookup of %s on %s (following linkfile) failed " + ",gfid = %s", + local->loc.path, subvol->name, gfid); + + /* If cached subvol returned ENOTCONN, do not do + lookup_everywhere. We need to make sure linkfile does not get + removed, which can take away the namespace, and subvol is + anyways down. */ + + local->cached_subvol = NULL; + if (op_errno != ENOTCONN) + goto err; + else + goto unwind; + } + + if (check_is_dir(inode, stbuf, xattr)) { + gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LINK_FILE_LOOKUP_INFO, + "Lookup of %s on %s (following linkfile) reached dir," + " gfid = %s", + local->loc.path, subvol->name, gfid); + goto err; + } + + if (check_is_linkfile(inode, stbuf, xattr, conf->link_xattr_name)) { + gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LINK_FILE_LOOKUP_INFO, + "lookup of %s on %s (following linkfile) reached link," + "gfid = %s", + local->loc.path, subvol->name, gfid); + goto err; + } + + if (gf_uuid_compare(local->gfid, stbuf->ia_gfid)) { + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_GFID_MISMATCH, + "%s: gfid different on data file on %s," + " gfid local = %s, gfid node = %s ", + local->loc.path, subvol->name, gfid, uuid_utoa(stbuf->ia_gfid)); + goto err; + } + + if ((stbuf->ia_nlink == 1) && (conf && conf->unhashed_sticky_bit)) { + stbuf->ia_prot.sticky = 1; + } + + ret = dht_layout_preset(this, prev, inode); + if (ret < 0) { + gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LAYOUT_PRESET_FAILED, + "Failed to set layout for subvolume %s," + "gfid = %s", + prev->name, gfid); + op_ret = -1; + op_errno = EINVAL; + } + + if (local->loc.parent) { + dht_inode_ctx_time_update(local->loc.parent, this, postparent, 1); + } - GF_ASSERT (inode); - GF_ASSERT (xattr_req); +unwind: + DHT_STRIP_PHASE1_FLAGS(stbuf); + dht_set_fixed_dir_stat(postparent); + DHT_STACK_UNWIND(lookup, frame, op_ret, op_errno, inode, stbuf, xattr, + postparent); - if (inode->ia_type != IA_IFDIR) - return; + return 0; - if (!dict_get (xattr_req, POSIX_ACL_ACCESS_XATTR)) { - ret = dict_set_int8 (xattr_req, POSIX_ACL_ACCESS_XATTR, 0); - if (ret) - gf_log (THIS->name, GF_LOG_WARNING, - "failed to set key %s", - POSIX_ACL_ACCESS_XATTR); - } +err: + dht_lookup_everywhere(frame, this, loc); +out: + return 0; +} - if (!dict_get (xattr_req, POSIX_ACL_DEFAULT_XATTR)) { - ret = dict_set_int8 (xattr_req, POSIX_ACL_DEFAULT_XATTR, 0); - if (ret) - gf_log (THIS->name, GF_LOG_WARNING, - "failed to set key %s", - POSIX_ACL_DEFAULT_XATTR); - } +/* Code to get hashed subvol based on inode and loc + First it check if loc->parent and loc->path exist then it get + hashed subvol based on loc. +*/ - return; +static gf_boolean_t +dht_should_lookup_everywhere(xlator_t *this, dht_conf_t *conf, loc_t *loc) +{ + dht_layout_t *parent_layout = NULL; + int ret = 0; + gf_boolean_t lookup_everywhere = _gf_true; + + /* lookup-optimize supersedes lookup-unhashed settings. + * If it is set, do not process search_unhashed + * If lookup-optimize if enabled, lookup everywhere if: + * - this is the rebalance daemon. + * - loc->parent is unavailable. + * - parent_layout is unavailable + * - parent_layout->commit_hash != conf->vol_commit_hash + */ + + if (conf->lookup_optimize) { + if (!conf->defrag && loc->parent) { + ret = dht_inode_ctx_layout_get(loc->parent, this, &parent_layout); + if (!ret && parent_layout && + (parent_layout->commit_hash == conf->vol_commit_hash)) { + lookup_everywhere = _gf_false; + } + } + goto out; + } else { + if (conf->search_unhashed == GF_DHT_LOOKUP_UNHASHED_AUTO) { + if (loc->parent) { + ret = dht_inode_ctx_layout_get(loc->parent, this, + &parent_layout); + if (ret || !parent_layout || + (!parent_layout->search_unhashed)) { + lookup_everywhere = _gf_false; + } + } else { + lookup_everywhere = _gf_false; + } + + goto out; + } + } +out: + return lookup_everywhere; } int -dht_lookup (call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *xattr_req) -{ - xlator_t *subvol = NULL; - xlator_t *hashed_subvol = NULL; - dht_local_t *local = NULL; - dht_conf_t *conf = NULL; - int ret = -1; - int op_errno = -1; - dht_layout_t *layout = NULL; - int i = 0; - int call_cnt = 0; - loc_t new_loc = {0,}; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->inode, err); - - conf = this->private; - if (!conf) - goto err; +dht_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, inode_t *inode, struct iatt *stbuf, dict_t *xattr, + struct iatt *postparent) +{ + char is_linkfile = 0; + char is_dir = 0; + xlator_t *subvol = NULL; + dht_conf_t *conf = NULL; + dht_local_t *local = NULL; + loc_t *loc = NULL; + xlator_t *prev = NULL; + int ret = 0; + uint32_t vol_commit_hash = 0; + + GF_VALIDATE_OR_GOTO("dht", frame, err); + GF_VALIDATE_OR_GOTO("dht", this, out); + GF_VALIDATE_OR_GOTO("dht", frame->local, out); + GF_VALIDATE_OR_GOTO("dht", cookie, out); + GF_VALIDATE_OR_GOTO("dht", this->private, out); + + conf = this->private; + + prev = cookie; + local = frame->local; + loc = &local->loc; + + gf_msg_debug(this->name, op_errno, + "%s: fresh_lookup on %s returned with op_ret %d", loc->path, + prev->name, op_ret); + + if (op_ret == -1) { + if (ENTRY_MISSING(op_ret, op_errno)) { + if (1 == conf->subvolume_cnt) { + /* No need to lookup again */ + goto out; + } - local = dht_local_init (frame, loc, NULL, GF_FOP_LOOKUP); - if (!local) { - op_errno = ENOMEM; - goto err; - } + gf_msg_debug(this->name, 0, "Entry %s missing on subvol %s", + loc->path, prev->name); - ret = dht_filter_loc_subvol_key (this, loc, &new_loc, - &hashed_subvol); - if (ret) { - loc_wipe (&local->loc); - ret = loc_dup (&new_loc, &local->loc); - - /* we no more need 'new_loc' entries */ - loc_wipe (&new_loc); - - /* check if loc_dup() is successful */ - if (ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_DEBUG, - "copying location failed for path=%s", - loc->path); - goto err; - } - } + if (dht_should_lookup_everywhere(this, conf, loc)) { + local->op_errno = ENOENT; + dht_lookup_everywhere(frame, this, loc); + return 0; + } - if (xattr_req) { - local->xattr_req = dict_ref (xattr_req); } else { - local->xattr_req = dict_new (); - } - - if (uuid_is_null (loc->pargfid) && !uuid_is_null (loc->gfid) && - !__is_root_gfid (loc->inode->gfid)) { - local->cached_subvol = NULL; - dht_discover (frame, this, loc); + /* posix returns ENODATA if the gfid is not set but the client and + * server protocol layers do not send the stbuf. We need to + * heal this so check if this is a directory on the other subvols. + */ + if ((op_errno == ENOTCONN) || (op_errno == ENODATA)) { + dht_lookup_directory(frame, this, &local->loc); return 0; + } + } + gf_msg_debug(this->name, op_errno, "%s: Lookup on subvolume %s failed", + loc->path, prev->name); + goto out; + } + + /* Lookup succeeded - op_ret = 0 */ + + /* This is required for handling stale linkfile deletion, + * or any more call which happens from this 'loc'. + */ + if (gf_uuid_is_null(local->gfid)) { + /*This is set from the first successful response*/ + memcpy(local->gfid, stbuf->ia_gfid, 16); + } + + if (!conf->vch_forced) { + /* Update the commit hash in conf if it is found */ + ret = dict_get_uint32(xattr, conf->commithash_xattr_name, + &vol_commit_hash); + if (ret == 0) { + conf->vol_commit_hash = vol_commit_hash; } + } - if (!hashed_subvol) - hashed_subvol = dht_subvol_get_hashed (this, loc); - local->hashed_subvol = hashed_subvol; + is_dir = check_is_dir(inode, stbuf, xattr); + if (is_dir) { + /* A directory is present on all subvols, send the lookup to + * all subvols now */ + local->inode = inode_ref(inode); + local->xattr = dict_ref(xattr); + dht_lookup_directory(frame, this, &local->loc); + return 0; + } - if (is_revalidate (loc)) { - layout = local->layout; - if (!layout) { - gf_log (this->name, GF_LOG_DEBUG, - "revalidate without cache. path=%s", - loc->path); - op_errno = EINVAL; - goto err; - } + is_linkfile = check_is_linkfile(inode, stbuf, xattr, conf->link_xattr_name); - if (layout->gen && (layout->gen < conf->gen)) { - gf_log (this->name, GF_LOG_TRACE, - "incomplete layout failure for path=%s", - loc->path); + if (!is_linkfile) { + /* non-directory and not a linkto file. This is a data file + * Update the layout to point to the cached subvol + */ - dht_layout_unref (this, local->layout); - local->layout = NULL; - local->cached_subvol = NULL; - goto do_fresh_lookup; - } + ret = dht_layout_preset(this, prev, inode); + if (ret < 0) { + gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LAYOUT_PRESET_FAILED, + "%s: could not set pre-set layout for subvolume %s", + loc->path, prev->name); + op_ret = -1; + op_errno = EINVAL; + goto out; + } + goto out; + } + + /* This is a linkto file. Get the value of the target subvol from the + * linkto xattr and lookup there to see if the file exists + */ + subvol = dht_linkfile_subvol(this, inode, stbuf, xattr); + if (!subvol) { + gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_SUBVOL_INFO, + "%s: No link subvol for linkto", loc->path); + dht_lookup_everywhere(frame, this, loc); + return 0; + } - local->inode = inode_ref (loc->inode); + gf_msg_debug(this->name, 0, "%s: Calling lookup on linkto target %s", + loc->path, subvol->name); - /* NOTE: we don't require 'trusted.glusterfs.dht.linkto' attribute, - * revalidates directly go to the cached-subvolume. - */ - ret = dict_set_uint32 (local->xattr_req, - conf->xattr_name, 4 * 4); - - if (IA_ISDIR (local->inode->ia_type)) { - local->call_cnt = call_cnt = conf->subvolume_cnt; - for (i = 0; i < call_cnt; i++) { - STACK_WIND (frame, dht_revalidate_cbk, - conf->subvolumes[i], - conf->subvolumes[i]->fops->lookup, - loc, local->xattr_req); - } - return 0; - } + STACK_WIND_COOKIE(frame, dht_lookup_linkfile_cbk, subvol, subvol, + subvol->fops->lookup, &local->loc, local->xattr_req); + + return 0; + +out: + /* + * FIXME: postparent->ia_size and postparent->st_blocks do not have + * correct values. since, postparent corresponds to a directory these + * two members should have values equal to sum of corresponding values + * from each of the subvolume. See dht_iatt_merge for reference. + */ + + if (!op_ret && local && local->loc.parent) { + dht_inode_ctx_time_update(local->loc.parent, this, postparent, 1); + } + + DHT_STRIP_PHASE1_FLAGS(stbuf); + dht_set_fixed_dir_stat(postparent); + DHT_STACK_UNWIND(lookup, frame, op_ret, op_errno, inode, stbuf, xattr, + postparent); +err: + return 0; +} - call_cnt = local->call_cnt = layout->cnt; +/* For directories, check if acl xattrs have been requested (by the acl + * xlator), if not, request for them. These xattrs are needed for dht dir + * self-heal to perform proper self-healing of dirs + */ +static void +dht_check_and_set_acl_xattr_req(xlator_t *this, dict_t *xattr_req) +{ + int ret = 0; - /* need it for self-healing linkfiles which is - 'in-migration' state */ - ret = dict_set_uint32 (local->xattr_req, - GLUSTERFS_OPEN_FD_COUNT, 4); + GF_ASSERT(xattr_req); - /* need it for dir self-heal */ - dht_check_and_set_acl_xattr_req (loc->inode, local->xattr_req); + if (!dict_get(xattr_req, POSIX_ACL_ACCESS_XATTR)) { + ret = dict_set_int8(xattr_req, POSIX_ACL_ACCESS_XATTR, 0); + if (ret) + gf_msg(this->name, GF_LOG_WARNING, -ret, DHT_MSG_DICT_SET_FAILED, + "Failed to set dictionary value:key = %s", + POSIX_ACL_ACCESS_XATTR); + } - for (i = 0; i < call_cnt; i++) { - subvol = layout->list[i].xlator; + if (!dict_get(xattr_req, POSIX_ACL_DEFAULT_XATTR)) { + ret = dict_set_int8(xattr_req, POSIX_ACL_DEFAULT_XATTR, 0); + if (ret) + gf_msg(this->name, GF_LOG_WARNING, -ret, DHT_MSG_DICT_SET_FAILED, + "Failed to set dictionary value:key = %s", + POSIX_ACL_DEFAULT_XATTR); + } - STACK_WIND (frame, dht_revalidate_cbk, - subvol, subvol->fops->lookup, - &local->loc, local->xattr_req); + return; +} - } - } else { - do_fresh_lookup: - /* TODO: remove the hard-coding */ - ret = dict_set_uint32 (local->xattr_req, - conf->xattr_name, 4 * 4); - - ret = dict_set_uint32 (local->xattr_req, - conf->link_xattr_name, 256); - - /* need it for self-healing linkfiles which is - 'in-migration' state */ - ret = dict_set_uint32 (local->xattr_req, - GLUSTERFS_OPEN_FD_COUNT, 4); - - /* need it for dir self-heal */ - dht_check_and_set_acl_xattr_req (loc->inode, local->xattr_req); - - if (!hashed_subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no subvolume in layout for path=%s, " - "checking on all the subvols to see if " - "it is a directory", loc->path); - call_cnt = conf->subvolume_cnt; - local->call_cnt = call_cnt; - - local->layout = dht_layout_new (this, - conf->subvolume_cnt); - if (!local->layout) { - op_errno = ENOMEM; - goto err; - } +/* for directories, we need the following info: + * the layout : trusted.glusterfs.dht + * the mds information : trusted.glusterfs.dht.mds + * the acl info: See above + */ +static int +dht_set_dir_xattr_req(xlator_t *this, loc_t *loc, dict_t *xattr_req) +{ + int ret = -EINVAL; + dht_conf_t *conf = NULL; + + conf = this->private; + if (!conf) { + goto err; + } + + if (!xattr_req) { + goto err; + } + + /* Xattr to get the layout for a directory + */ + ret = dict_set_uint32(xattr_req, conf->xattr_name, 4 * 4); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, DHT_MSG_DICT_SET_FAILED, + "Failed to set dictionary value:key = %s for " + "path %s", + conf->xattr_name, loc->path); + goto err; + } + + /*Non-fatal failure */ + ret = dict_set_uint32(xattr_req, conf->mds_xattr_key, 4); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, DHT_MSG_DICT_SET_FAILED, + "Failed to set dictionary value:key = %s for " + "path %s", + conf->mds_xattr_key, loc->path); + } + + dht_check_and_set_acl_xattr_req(this, xattr_req); + ret = 0; +err: + return ret; +} - for (i = 0; i < call_cnt; i++) { - STACK_WIND (frame, dht_lookup_dir_cbk, - conf->subvolumes[i], - conf->subvolumes[i]->fops->lookup, - &local->loc, local->xattr_req); - } - return 0; - } +/* If the hashed subvol is present, send the lookup to only that subvol first. + * If no hashed subvol, send a lookup to all subvols and proceed based on the + * responses. + */ +static int +dht_do_fresh_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc) +{ + int ret = -1; + dht_conf_t *conf = NULL; + xlator_t *hashed_subvol = NULL; + dht_local_t *local = NULL; + int op_errno = -1; + int call_cnt = 0; + int i = 0; + + conf = this->private; + if (!conf) { + op_errno = EINVAL; + goto err; + } + + local = frame->local; + if (!local) { + op_errno = EINVAL; + goto err; + } + + /* Since we don't know whether this is a file or a directory, + * request all xattrs*/ + ret = dht_set_file_xattr_req(this, loc, local->xattr_req); + if (ret) { + op_errno = -ret; + goto err; + } + + ret = dht_set_dir_xattr_req(this, loc, local->xattr_req); + if (ret) { + op_errno = -ret; + goto err; + } + + /* Fuse sets a random value in gfid-req. If the gfid is missing + * on one or more subvols, posix will set the gfid to this value, + * causing GFID mismatches for directories. Remove the value fuse + * has sent before sending the lookup. + */ + ret = dict_get_gfuuid(local->xattr_req, "gfid-req", &local->gfid_req); + if (ret) { + gf_msg_debug(this->name, 0, "%s: No gfid-req available", loc->path); + } else { + dict_del(local->xattr_req, "gfid-req"); + } + /* This should have been set in dht_lookup */ + hashed_subvol = local->hashed_subvol; + + if (!hashed_subvol) { + gf_msg_debug(this->name, 0, + "%s: no subvolume in layout for path, " + "checking on all the subvols to see if " + "it is a directory", + loc->path); - STACK_WIND (frame, dht_lookup_cbk, - hashed_subvol, hashed_subvol->fops->lookup, - loc, local->xattr_req); + call_cnt = conf->subvolume_cnt; + local->call_cnt = call_cnt; + + /* Allocate a layout. This will be populated and saved in + * the dht inode_ctx on successful lookup + */ + local->layout = dht_layout_new(this, conf->subvolume_cnt); + if (!local->layout) { + op_errno = ENOMEM; + goto err; } + gf_msg_debug(this->name, 0, + "%s: Found null hashed subvol. Calling lookup" + " on all nodes.", + loc->path); + + for (i = 0; i < call_cnt; i++) { + STACK_WIND_COOKIE(frame, dht_lookup_dir_cbk, conf->subvolumes[i], + conf->subvolumes[i], + conf->subvolumes[i]->fops->lookup, &local->loc, + local->xattr_req); + } return 0; + } + /* if the hashed_subvol is non-null, send the lookup there first so + * as to see whether we have a file or a directory */ + gf_msg_debug(this->name, 0, "%s: Calling fresh lookup on %s", loc->path, + hashed_subvol->name); + + STACK_WIND_COOKIE(frame, dht_lookup_cbk, hashed_subvol, hashed_subvol, + hashed_subvol->fops->lookup, loc, local->xattr_req); + return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, - NULL); - return 0; + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND(lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL); + return 0; } - -int -dht_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) +static int +dht_do_revalidate(call_frame_t *frame, xlator_t *this, loc_t *loc) { - dht_local_t *local = NULL; - call_frame_t *prev = NULL; - - local = frame->local; - prev = cookie; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - local->op_ret = -1; - local->op_errno = op_errno; - gf_log (this->name, GF_LOG_DEBUG, - "subvolume %s returned -1 (%s)", - prev->this->name, strerror (op_errno)); - goto unlock; - } + xlator_t *subvol = NULL; + xlator_t *mds_subvol = NULL; + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + int ret = -1; + int op_errno = -1; + dht_layout_t *layout = NULL; + int i = 0; + int call_cnt = 0; + int gen = 0; + + conf = this->private; + if (!conf) { + op_errno = EINVAL; + goto err; + } + + local = frame->local; + if (!local) { + op_errno = EINVAL; + goto err; + } + + layout = local->layout; + if (!layout) { + gf_msg_debug(this->name, 0, + "path = %s. No layout found in the inode ctx.", loc->path); + op_errno = EINVAL; + goto err; + } + + /* Generation number has changed. This layout may be stale. */ + if (layout->gen && (layout->gen < conf->gen)) { + gen = layout->gen; + dht_layout_unref(this, local->layout); + local->layout = NULL; + local->cached_subvol = NULL; + + gf_msg_debug(this->name, 0, + "path = %s. In memory layout may be stale." + "(layout->gen (%d) is less than " + "conf->gen (%d)). Calling fresh lookup.", + loc->path, gen, conf->gen); + + dht_do_fresh_lookup(frame, this, loc); + return 0; + } + + local->inode = inode_ref(loc->inode); + + /* Since we don't know whether this has changed, + * request all xattrs*/ + ret = dht_set_file_xattr_req(this, loc, local->xattr_req); + if (ret) { + op_errno = -ret; + goto err; + } + + ret = dht_set_dir_xattr_req(this, loc, local->xattr_req); + if (ret) { + op_errno = -ret; + goto err; + } + + if (IA_ISDIR(local->inode->ia_type)) { + ret = dht_inode_ctx_mdsvol_get(local->inode, this, &mds_subvol); + if (ret || !mds_subvol) { + gf_msg_debug(this->name, 0, "path = %s. No mds subvol in inode ctx", + local->loc.path); + } + local->mds_subvol = mds_subvol; + local->call_cnt = conf->subvolume_cnt; - local->op_ret = 0; + /* local->call_cnt will change as responses are processed. Always use a + * local copy to loop through the STACK_WIND calls + */ - local->postparent = *postparent; - local->preparent = *preparent; + call_cnt = local->call_cnt; - if (local->loc.parent) { - dht_inode_ctx_time_update (local->loc.parent, this, - &local->preparent, 0); - dht_inode_ctx_time_update (local->loc.parent, this, - &local->postparent, 1); - } + for (i = 0; i < call_cnt; i++) { + STACK_WIND_COOKIE(frame, dht_revalidate_cbk, conf->subvolumes[i], + conf->subvolumes[i], + conf->subvolumes[i]->fops->lookup, loc, + local->xattr_req); } -unlock: - UNLOCK (&frame->lock); + return 0; + } - DHT_STACK_UNWIND (unlink, frame, local->op_ret, local->op_errno, - &local->preparent, &local->postparent, NULL); + /* If not a dir, this should be 1 */ + local->call_cnt = layout->cnt; + call_cnt = local->call_cnt; - return 0; + for (i = 0; i < call_cnt; i++) { + subvol = layout->list[i].xlator; + + gf_msg_debug(this->name, 0, + "path = %s. Calling " + "revalidate lookup on %s", + loc->path, subvol->name); + + STACK_WIND_COOKIE(frame, dht_revalidate_cbk, subvol, subvol, + subvol->fops->lookup, &local->loc, local->xattr_req); + } + return 0; +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND(lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL); + return 0; } +/* Depending on the input, decide if this is a: + * fresh-lookup: loc->name is provided but no dht inode ctx + * revalidation: loc->name is provided, dht inode ctx is present + * discover: gfid based nameless lookup. + */ int -dht_unlink_linkfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) +dht_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req) { - dht_local_t *local = NULL; - call_frame_t *prev = NULL; + xlator_t *hashed_subvol = NULL; + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + int ret = -1; + int op_errno = -1; + loc_t new_loc = { + 0, + }; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(loc, err); + VALIDATE_OR_GOTO(loc->inode, err); + + conf = this->private; + if (!conf) + goto err; + + local = dht_local_init(frame, loc, NULL, GF_FOP_LOOKUP); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + ret = dht_filter_loc_subvol_key(this, loc, &new_loc, &hashed_subvol); + if (ret) { + loc_wipe(&local->loc); + ret = loc_dup(&new_loc, &local->loc); + + /* we no longer need 'new_loc' entries */ + loc_wipe(&new_loc); + + /* check if loc_dup() is successful */ + if (ret == -1) { + op_errno = errno; + gf_msg_debug(this->name, errno, + "copying location failed for path=%s", loc->path); + goto err; + } + } + + if (xattr_req) { + local->xattr_req = dict_ref(xattr_req); + } else { + local->xattr_req = dict_new(); + } + + /* Nameless lookup */ + + /* This is usually sent by NFS. Lookups are done based on the gfid and + * no name information is available. Without the name, dht cannot calculate + * the hash and has to send a lookup to all subvols. + */ + if (gf_uuid_is_null(loc->pargfid) && !gf_uuid_is_null(loc->gfid) && + !__is_root_gfid(loc->inode->gfid)) { + local->cached_subvol = NULL; + dht_do_discover(frame, this, loc); + return 0; + } - xlator_t *cached_subvol = NULL; + if (loc_is_root(loc)) { + /* Request the DHT commit hash xattr (trusted.glusterfs.dht.commithash) + * set on the brick root. + */ + ret = dict_set_uint32(local->xattr_req, conf->commithash_xattr_name, + sizeof(uint32_t)); + } - local = frame->local; - prev = cookie; - - LOCK (&frame->lock); - { - if ((op_ret == -1) && !((op_errno == ENOENT) || - (op_errno == ENOTCONN))) { - local->op_errno = op_errno; - gf_log (this->name, GF_LOG_DEBUG, - "subvolume %s returned -1 (%s)", - prev->this->name, strerror (op_errno)); - goto unlock; - } + if (!hashed_subvol) + hashed_subvol = dht_subvol_get_hashed(this, loc); + local->hashed_subvol = hashed_subvol; - local->op_ret = 0; - } -unlock: - UNLOCK (&frame->lock); + if (is_revalidate(loc)) { + /* The entry has been looked up before and has a dht inode_ctx + */ + dht_do_revalidate(frame, this, loc); + return 0; + } else { + /* Entry has not been looked up before + */ + dht_do_fresh_lookup(frame, this, loc); + return 0; + } - if (local->op_ret == -1) - goto err; + return 0; +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND(lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL); + return 0; +} - cached_subvol = dht_subvol_get_cached (this, local->loc.inode); - if (!cached_subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for path=%s", - local->loc.path); - local->op_errno = EINVAL; - goto err; - } +static int +dht_unlink_linkfile_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + dht_local_t *local = NULL; + xlator_t *prev = NULL; - STACK_WIND (frame, dht_unlink_cbk, - cached_subvol, cached_subvol->fops->unlink, - &local->loc, local->flags, NULL); + local = frame->local; + prev = cookie; - return 0; + LOCK(&frame->lock); + { + if ((op_ret == -1) && + !((op_errno == ENOENT) || (op_errno == ENOTCONN))) { + local->op_errno = op_errno; + UNLOCK(&frame->lock); + gf_msg_debug(this->name, op_errno, + "Unlink link: subvolume %s returned -1", prev->name); + goto post_unlock; + } -err: - DHT_STACK_UNWIND (unlink, frame, -1, local->op_errno, - NULL, NULL, NULL); - return 0; + local->op_ret = 0; + } + UNLOCK(&frame->lock); +post_unlock: + dht_set_fixed_dir_stat(&local->preparent); + dht_set_fixed_dir_stat(&local->postparent); + DHT_STACK_UNWIND(unlink, frame, local->op_ret, local->op_errno, + &local->preparent, &local->postparent, xdata); + + return 0; } -int -dht_err_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, dict_t *xdata) +static int +dht_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) { - dht_local_t *local = NULL; - int this_call_cnt = 0; - call_frame_t *prev = NULL; + dht_local_t *local = NULL; + xlator_t *prev = NULL; + xlator_t *hashed_subvol = NULL; - local = frame->local; - prev = cookie; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - local->op_errno = op_errno; - gf_log (this->name, GF_LOG_DEBUG, - "subvolume %s returned -1 (%s)", - prev->this->name, strerror (op_errno)); - goto unlock; - } + local = frame->local; + prev = cookie; + LOCK(&frame->lock); + { + if (op_ret == -1) { + if (op_errno != ENOENT) { + local->op_ret = -1; + local->op_errno = op_errno; + } else { local->op_ret = 0; + } + UNLOCK(&frame->lock); + gf_msg_debug(this->name, op_errno, + "Unlink: subvolume %s returned -1", prev->name); + goto post_unlock; } -unlock: - UNLOCK (&frame->lock); - this_call_cnt = dht_frame_return (frame); - if (is_last_call (this_call_cnt)) { - DHT_STACK_UNWIND (setxattr, frame, local->op_ret, - local->op_errno, NULL); - } + local->op_ret = 0; - return 0; + local->postparent = *postparent; + local->preparent = *preparent; + + if (local->loc.parent) { + dht_inode_ctx_time_update(local->loc.parent, this, + &local->preparent, 0); + dht_inode_ctx_time_update(local->loc.parent, this, + &local->postparent, 1); + } + } + UNLOCK(&frame->lock); +post_unlock: + if (!local->op_ret) { + hashed_subvol = dht_subvol_get_hashed(this, &local->loc); + if (hashed_subvol && hashed_subvol != local->cached_subvol) { + /* + * If hashed and cached are different, then we need + * to unlink linkfile from hashed subvol if data + * file is deleted successfully + */ + STACK_WIND_COOKIE(frame, dht_unlink_linkfile_cbk, hashed_subvol, + hashed_subvol, hashed_subvol->fops->unlink, + &local->loc, local->flags, xdata); + return 0; + } + } + + dht_set_fixed_dir_stat(&local->preparent); + dht_set_fixed_dir_stat(&local->postparent); + DHT_STACK_UNWIND(unlink, frame, local->op_ret, local->op_errno, + &local->preparent, &local->postparent, xdata); + + return 0; } -static void -fill_layout_info (dht_layout_t *layout, char *buf) +static int +dht_common_setxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - int i = 0; - char tmp_buf[128] = {0,}; + DHT_STACK_UNWIND(setxattr, frame, op_ret, op_errno, xdata); + return 0; +} - for (i = 0; i < layout->cnt; i++) { - snprintf (tmp_buf, 128, "(%s %u %u)", - layout->list[i].xlator->name, - layout->list[i].start, - layout->list[i].stop); - if (i) - strcat (buf, " "); - strcat (buf, tmp_buf); +static int +dht_fix_layout_setxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + dht_local_t *local = NULL; + dht_layout_t *layout = NULL; + + if (op_ret == 0) { + /* update the layout in the inode ctx */ + local = frame->local; + layout = local->selfheal.layout; + + dht_layout_set(this, local->loc.inode, layout); + } + + DHT_STACK_UNWIND(setxattr, frame, op_ret, op_errno, xdata); + return 0; +} + +static int +dht_err_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, dict_t *xdata) +{ + dht_local_t *local = NULL; + int this_call_cnt = 0; + xlator_t *prev = NULL; + + local = frame->local; + prev = cookie; + + LOCK(&frame->lock); + { + if (op_ret == -1) { + local->op_errno = op_errno; + UNLOCK(&frame->lock); + gf_msg_debug(this->name, op_errno, "subvolume %s returned -1", + prev->name); + goto post_unlock; } + + local->op_ret = 0; + } + UNLOCK(&frame->lock); +post_unlock: + this_call_cnt = dht_frame_return(frame); + if (is_last_call(this_call_cnt)) { + if ((local->fop == GF_FOP_SETXATTR) || + (local->fop == GF_FOP_FSETXATTR)) { + DHT_STACK_UNWIND(setxattr, frame, local->op_ret, local->op_errno, + NULL); + /* 'local' itself may not be valid after this */ + goto out; + } + if ((local->fop == GF_FOP_REMOVEXATTR) || + (local->fop == GF_FOP_FREMOVEXATTR)) { + DHT_STACK_UNWIND(removexattr, frame, local->op_ret, local->op_errno, + NULL); + } + } + +out: + return 0; } -void -dht_fill_pathinfo_xattr (xlator_t *this, dht_local_t *local, - char *xattr_buf, int32_t alloc_len, - int flag, char *layout_buf) -{ - if (flag && local->xattr_val) - snprintf (xattr_buf, alloc_len, - "((<"DHT_PATHINFO_HEADER"%s> %s) (%s-layout %s))", - this->name, local->xattr_val, this->name, - layout_buf); - else if (local->xattr_val) - snprintf (xattr_buf, alloc_len, - "(<"DHT_PATHINFO_HEADER"%s> %s)", - this->name, local->xattr_val); - else if (flag) - snprintf (xattr_buf, alloc_len, "(%s-layout %s)", - this->name, layout_buf); +/* Set the value[] of key into dict after convert from + host byte order to network byte order +*/ +int32_t +dht_dict_set_array(dict_t *dict, char *key, int32_t value[], int32_t size) +{ + int ret = -1; + int32_t *ptr = NULL; + int32_t vindex; + + if (value == NULL) { + return -EINVAL; + } + + ptr = GF_MALLOC(sizeof(int32_t) * size, gf_common_mt_char); + if (ptr == NULL) { + return -ENOMEM; + } + for (vindex = 0; vindex < size; vindex++) { + ptr[vindex] = hton32(value[vindex]); + } + ret = dict_set_bin(dict, key, ptr, sizeof(int32_t) * size); + if (ret) + GF_FREE(ptr); + return ret; } -int -dht_vgetxattr_alloc_and_fill (dht_local_t *local, dict_t *xattr, xlator_t *this, - int op_errno) +static int +dht_common_mds_xattrop_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata) { - int ret = -1; - char *value = NULL; - int32_t plen = 0; + dht_local_t *local = NULL; + call_frame_t *prev = cookie; - ret = dict_get_str (xattr, local->xsel, &value); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, - "Subvolume %s returned -1 (%s)", this->name, - strerror (op_errno)); - local->op_ret = -1; - local->op_errno = op_errno; + local = frame->local; + + if (op_ret) + gf_msg_debug(this->name, op_errno, "subvolume %s returned -1", + prev->this->name); + + if (local->fop == GF_FOP_SETXATTR) { + DHT_STACK_UNWIND(setxattr, frame, 0, op_errno, local->xdata); + /* 'local' itself may not be valid after this */ + goto out; + } + + if (local->fop == GF_FOP_FSETXATTR) { + DHT_STACK_UNWIND(fsetxattr, frame, 0, op_errno, local->xdata); + /* 'local' itself may not be valid after this */ + goto out; + } + + if (local->fop == GF_FOP_REMOVEXATTR) { + DHT_STACK_UNWIND(removexattr, frame, 0, op_errno, NULL); + /* 'local' itself may not be valid after this */ + goto out; + } + + if (local->fop == GF_FOP_FREMOVEXATTR) { + DHT_STACK_UNWIND(fremovexattr, frame, 0, op_errno, NULL); + } + +out: + return 0; +} + +/* Code to wind a xattrop call to add 1 on current mds internal xattr + value +*/ +static int +dht_setxattr_non_mds_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) +{ + dht_local_t *local = NULL; + int this_call_cnt = 0; + int ret = 0; + dict_t *xattrop = NULL; + int32_t addone[1] = {1}; + call_frame_t *prev = NULL; + dht_conf_t *conf = NULL; + + local = frame->local; + prev = cookie; + conf = this->private; + + LOCK(&frame->lock); + { + if (op_ret && !local->op_ret) { + local->op_ret = op_ret; + local->op_errno = op_errno; + UNLOCK(&frame->lock); + gf_msg_debug(this->name, op_errno, "subvolume %s returned -1", + prev->this->name); + goto post_unlock; + } + } + UNLOCK(&frame->lock); +post_unlock: + this_call_cnt = dht_frame_return(frame); + + if (is_last_call(this_call_cnt)) { + if (!local->op_ret) { + xattrop = dict_new(); + if (!xattrop) { + gf_msg(this->name, GF_LOG_ERROR, DHT_MSG_NO_MEMORY, 0, + "dictionary creation failed"); + ret = -1; + goto out; + } + ret = dht_dict_set_array(xattrop, conf->mds_xattr_key, addone, 1); + if (ret != 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, + "dictionary set array failed "); + ret = -1; goto out; + } + if ((local->fop == GF_FOP_SETXATTR) || + (local->fop == GF_FOP_REMOVEXATTR)) { + STACK_WIND(frame, dht_common_mds_xattrop_cbk, local->mds_subvol, + local->mds_subvol->fops->xattrop, &local->loc, + GF_XATTROP_ADD_ARRAY, xattrop, NULL); + } else { + STACK_WIND(frame, dht_common_mds_xattrop_cbk, local->mds_subvol, + local->mds_subvol->fops->fxattrop, local->fd, + GF_XATTROP_ADD_ARRAY, xattrop, NULL); + } + } else { + if (local->fop == GF_FOP_SETXATTR) { + DHT_STACK_UNWIND(setxattr, frame, 0, 0, local->xdata); + /* 'local' itself may not be valid after this */ + goto just_return; + } + + if (local->fop == GF_FOP_FSETXATTR) { + DHT_STACK_UNWIND(fsetxattr, frame, 0, 0, local->xdata); + /* 'local' itself may not be valid after this */ + goto just_return; + } + + if (local->fop == GF_FOP_REMOVEXATTR) { + DHT_STACK_UNWIND(removexattr, frame, 0, 0, NULL); + /* 'local' itself may not be valid after this */ + goto just_return; + } + + if (local->fop == GF_FOP_FREMOVEXATTR) { + DHT_STACK_UNWIND(fremovexattr, frame, 0, 0, NULL); + /* 'local' itself may not be valid after this */ + goto just_return; + } + } + } +out: + if (ret) { + if (local->fop == GF_FOP_SETXATTR) { + DHT_STACK_UNWIND(setxattr, frame, 0, 0, local->xdata); + /* 'local' itself may not be valid after this */ + goto just_return; } - local->alloc_len += strlen(value); - - if (!local->xattr_val) { - local->alloc_len += (strlen (DHT_PATHINFO_HEADER) + 10); - local->xattr_val = GF_CALLOC (local->alloc_len, sizeof (char), - gf_common_mt_char); - if (!local->xattr_val) { - ret = -1; - goto out; - } + if (local->fop == GF_FOP_FSETXATTR) { + DHT_STACK_UNWIND(fsetxattr, frame, 0, 0, local->xdata); + /* 'local' itself may not be valid after this */ + goto just_return; } - if (local->xattr_val) { - plen = strlen (local->xattr_val); - if (plen) { - /* extra byte(s) for \0 to be safe */ - local->alloc_len += (plen + 2); - local->xattr_val = GF_REALLOC (local->xattr_val, - local->alloc_len); - if (!local->xattr_val) { - ret = -1; - goto out; - } - } + if (local->fop == GF_FOP_REMOVEXATTR) { + DHT_STACK_UNWIND(removexattr, frame, 0, 0, NULL); + /* 'local' itself may not be valid after this */ + goto just_return; + } - (void) strcat (local->xattr_val, value); - (void) strcat (local->xattr_val, " "); - local->op_ret = 0; + if (local->fop == GF_FOP_FREMOVEXATTR) { + DHT_STACK_UNWIND(fremovexattr, frame, 0, 0, NULL); } + } +just_return: + if (xattrop) + dict_unref(xattrop); + return 0; +} - ret = 0; +static int +dht_setxattr_mds_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) +{ + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + call_frame_t *prev = NULL; + xlator_t *mds_subvol = NULL; + int i = 0; + + local = frame->local; + prev = cookie; + conf = this->private; + mds_subvol = local->mds_subvol; + + if (op_ret == -1) { + local->op_ret = op_ret; + local->op_errno = op_errno; + gf_msg_debug(this->name, op_errno, "subvolume %s returned -1", + prev->this->name); + goto out; + } + + local->op_ret = 0; + local->call_cnt = conf->subvolume_cnt - 1; + local->xdata = dict_ref(xdata); + + for (i = 0; i < conf->subvolume_cnt; i++) { + if (mds_subvol && (mds_subvol == conf->subvolumes[i])) + continue; + if (local->fop == GF_FOP_SETXATTR) { + STACK_WIND(frame, dht_setxattr_non_mds_cbk, conf->subvolumes[i], + conf->subvolumes[i]->fops->setxattr, &local->loc, + local->xattr, local->flags, local->xattr_req); + } + + if (local->fop == GF_FOP_FSETXATTR) { + STACK_WIND(frame, dht_setxattr_non_mds_cbk, conf->subvolumes[i], + conf->subvolumes[i]->fops->fsetxattr, local->fd, + local->xattr, local->flags, local->xattr_req); + } + + if (local->fop == GF_FOP_REMOVEXATTR) { + STACK_WIND(frame, dht_setxattr_non_mds_cbk, conf->subvolumes[i], + conf->subvolumes[i]->fops->removexattr, &local->loc, + local->key, local->xattr_req); + } + + if (local->fop == GF_FOP_FREMOVEXATTR) { + STACK_WIND(frame, dht_setxattr_non_mds_cbk, conf->subvolumes[i], + conf->subvolumes[i]->fops->fremovexattr, local->fd, + local->key, local->xattr_req); + } + } + + return 0; +out: + if (local->fop == GF_FOP_SETXATTR) { + DHT_STACK_UNWIND(setxattr, frame, local->op_ret, local->op_errno, + xdata); + /* 'local' itself may not be valid after this */ + goto just_return; + } + + if (local->fop == GF_FOP_FSETXATTR) { + DHT_STACK_UNWIND(fsetxattr, frame, local->op_ret, local->op_errno, + xdata); + /* 'local' itself may not be valid after this */ + goto just_return; + } + + if (local->fop == GF_FOP_REMOVEXATTR) { + DHT_STACK_UNWIND(removexattr, frame, local->op_ret, local->op_errno, + NULL); + /* 'local' itself may not be valid after this */ + goto just_return; + } + + if (local->fop == GF_FOP_FREMOVEXATTR) { + DHT_STACK_UNWIND(fremovexattr, frame, local->op_ret, local->op_errno, + NULL); + } + +just_return: + return 0; +} - out: - return ret; +static int +dht_xattrop_mds_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *dict, dict_t *xdata) +{ + dht_local_t *local = NULL; + call_frame_t *prev = NULL; + + local = frame->local; + prev = cookie; + + if (op_ret == -1) { + local->op_errno = op_errno; + local->op_ret = op_ret; + gf_msg_debug(this->name, op_errno, "subvolume %s returned -1", + prev->this->name); + goto out; + } + + if (local->fop == GF_FOP_SETXATTR) { + STACK_WIND(frame, dht_setxattr_mds_cbk, local->mds_subvol, + local->mds_subvol->fops->setxattr, &local->loc, local->xattr, + local->flags, local->xattr_req); + } + + if (local->fop == GF_FOP_FSETXATTR) { + STACK_WIND(frame, dht_setxattr_mds_cbk, local->mds_subvol, + local->mds_subvol->fops->fsetxattr, local->fd, local->xattr, + local->flags, local->xattr_req); + } + + if (local->fop == GF_FOP_REMOVEXATTR) { + STACK_WIND(frame, dht_setxattr_mds_cbk, local->mds_subvol, + local->mds_subvol->fops->removexattr, &local->loc, + local->key, local->xattr_req); + } + + if (local->fop == GF_FOP_FREMOVEXATTR) { + STACK_WIND(frame, dht_setxattr_mds_cbk, local->mds_subvol, + local->mds_subvol->fops->fremovexattr, local->fd, local->key, + local->xattr_req); + } + + return 0; +out: + if (local->fop == GF_FOP_SETXATTR) { + DHT_STACK_UNWIND(setxattr, frame, local->op_ret, local->op_errno, + xdata); + /* 'local' itself may not be valid after this */ + goto just_return; + } + + if (local->fop == GF_FOP_FSETXATTR) { + DHT_STACK_UNWIND(fsetxattr, frame, local->op_ret, local->op_errno, + xdata); + /* 'local' itself may not be valid after this */ + goto just_return; + } + + if (local->fop == GF_FOP_REMOVEXATTR) { + DHT_STACK_UNWIND(removexattr, frame, local->op_ret, local->op_errno, + NULL); + /* 'local' itself may not be valid after this */ + goto just_return; + } + + if (local->fop == GF_FOP_FREMOVEXATTR) { + DHT_STACK_UNWIND(fremovexattr, frame, local->op_ret, local->op_errno, + NULL); + } + +just_return: + return 0; } -int -dht_vgetxattr_fill_and_set (dht_local_t *local, dict_t **dict, xlator_t *this, - gf_boolean_t flag) +static void +fill_layout_info(dht_layout_t *layout, char *buf) { - int ret = -1; - char *xattr_buf = NULL; - char layout_buf[8192] = {0,}; + int i = 0; + char tmp_buf[128] = { + 0, + }; + + for (i = 0; i < layout->cnt; i++) { + snprintf(tmp_buf, sizeof(tmp_buf), "(%s %u %u)", + layout->list[i].xlator->name, layout->list[i].start, + layout->list[i].stop); + if (i) + strcat(buf, " "); + strcat(buf, tmp_buf); + } +} - if (flag) - fill_layout_info (local->layout, layout_buf); +static void +dht_fill_pathinfo_xattr(xlator_t *this, dht_local_t *local, char *xattr_buf, + int32_t alloc_len, int flag, char *layout_buf) +{ + if (flag) { + if (local->xattr_val) { + snprintf(xattr_buf, alloc_len, + "((<" DHT_PATHINFO_HEADER "%s> %s) (%s-layout %s))", + this->name, local->xattr_val, this->name, layout_buf); + } else { + snprintf(xattr_buf, alloc_len, "(%s-layout %s)", this->name, + layout_buf); + } + } else if (local->xattr_val) { + snprintf(xattr_buf, alloc_len, "(<" DHT_PATHINFO_HEADER "%s> %s)", + this->name, local->xattr_val); + } else { + xattr_buf[0] = '\0'; + } +} - *dict = dict_new (); - if (!*dict) - goto out; +static int +dht_vgetxattr_alloc_and_fill(dht_local_t *local, dict_t *xattr, xlator_t *this, + int op_errno) +{ + int ret = -1; + char *value = NULL; + + ret = dict_get_str(xattr, local->xsel, &value); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, DHT_MSG_GET_XATTR_FAILED, + "Subvolume %s returned -1", this->name); + local->op_ret = -1; + local->op_errno = op_errno; + goto out; + } + + local->alloc_len += strlen(value); + + if (!local->xattr_val) { + local->alloc_len += (SLEN(DHT_PATHINFO_HEADER) + 10); + local->xattr_val = GF_MALLOC(local->alloc_len, gf_common_mt_char); + if (!local->xattr_val) { + ret = -1; + goto out; + } + local->xattr_val[0] = '\0'; + } - local->xattr_val[strlen (local->xattr_val) - 1] = '\0'; + int plen = strlen(local->xattr_val); + if (plen) { + /* extra byte(s) for \0 to be safe */ + local->alloc_len += (plen + 2); + local->xattr_val = GF_REALLOC(local->xattr_val, local->alloc_len); + if (!local->xattr_val) { + ret = -1; + goto out; + } + } - /* we would need max this many bytes to create xattr string - * extra 40 bytes is just an estimated amount of additional - * space required as we include translator name and some - * spaces, brackets etc. when forming the pathinfo string. - * - * For node-uuid we just don't have all the pretty formatting, - * but since this is a generic routine for pathinfo & node-uuid - * we dont have conditional space allocation and try to be - * generic - */ - local->alloc_len += (2 * strlen (this->name)) - + strlen (layout_buf) - + 40; - xattr_buf = GF_CALLOC (local->alloc_len, sizeof (char), - gf_common_mt_char); - if (!xattr_buf) - goto out; + (void)strcat(local->xattr_val, value); + (void)strcat(local->xattr_val, " "); + local->op_ret = 0; - if (XATTR_IS_PATHINFO (local->xsel)) { - (void) dht_fill_pathinfo_xattr (this, local, xattr_buf, - local->alloc_len, flag, - layout_buf); - } else if (XATTR_IS_NODE_UUID (local->xsel)) { - (void) snprintf (xattr_buf, local->alloc_len, "%s", - local->xattr_val); - } else { - gf_log (this->name, GF_LOG_WARNING, - "Unknown local->xsel (%s)", local->xsel); - GF_FREE (xattr_buf); - goto out; - } + ret = 0; - ret = dict_set_dynstr (*dict, local->xsel, xattr_buf); - if (ret) - GF_FREE (xattr_buf); - GF_FREE (local->xattr_val); +out: + return ret; +} - out: - return ret; +static int +dht_vgetxattr_fill_and_set(dht_local_t *local, dict_t **dict, xlator_t *this, + gf_boolean_t flag) +{ + int ret = -1; + char *xattr_buf = NULL; + char layout_buf[8192] = { + 0, + }; + + if (flag) + fill_layout_info(local->layout, layout_buf); + + *dict = dict_new(); + if (!*dict) + goto out; + + local->xattr_val[strlen(local->xattr_val) - 1] = '\0'; + + /* we would need max this many bytes to create xattr string + * extra 40 bytes is just an estimated amount of additional + * space required as we include translator name and some + * spaces, brackets etc. when forming the pathinfo string. + * + * For node-uuid we just don't have all the pretty formatting, + * but since this is a generic routine for pathinfo & node-uuid + * we don't have conditional space allocation and try to be + * generic + */ + local->alloc_len += (2 * strlen(this->name)) + strlen(layout_buf) + 40; + xattr_buf = GF_MALLOC(local->alloc_len, gf_common_mt_char); + if (!xattr_buf) + goto out; + + if (XATTR_IS_PATHINFO(local->xsel)) { + (void)dht_fill_pathinfo_xattr(this, local, xattr_buf, local->alloc_len, + flag, layout_buf); + } else if ((XATTR_IS_NODE_UUID(local->xsel)) || + (XATTR_IS_NODE_UUID_LIST(local->xsel))) { + (void)snprintf(xattr_buf, local->alloc_len, "%s", local->xattr_val); + } else { + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_GET_XATTR_FAILED, + "Unknown local->xsel (%s)", local->xsel); + GF_FREE(xattr_buf); + goto out; + } + + ret = dict_set_dynstr(*dict, local->xsel, xattr_buf); + if (ret) + GF_FREE(xattr_buf); + GF_FREE(local->xattr_val); + +out: + return ret; } -int -dht_vgetxattr_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, dict_t *xattr, dict_t *xdata) +static int +dht_find_local_subvol_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xattr, + dict_t *xdata) { - int ret = 0; - dht_local_t *local = NULL; - int this_call_cnt = 0; - dict_t *dict = NULL; + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + xlator_t *prev = NULL; + int this_call_cnt = 0; + int ret = 0; + char *uuid_str = NULL; + char *uuid_list = NULL; + char *next_uuid_str = NULL; + char *saveptr = NULL; + uuid_t node_uuid = { + 0, + }; + char *uuid_list_copy = NULL; + int count = 0; + int i = 0; + int index = 0; + int found = 0; + nodeuuid_info_t *tmp_ptr = NULL; + + VALIDATE_OR_GOTO(frame, out); + VALIDATE_OR_GOTO(frame->local, out); + + local = frame->local; + prev = cookie; + conf = this->private; + + VALIDATE_OR_GOTO(conf->defrag, out); + + gf_msg_debug(this->name, 0, "subvol %s returned", prev->name); + + LOCK(&frame->lock); + { + this_call_cnt = --local->call_cnt; + if (op_ret < 0) { + local->op_ret = -1; + local->op_errno = op_errno; + UNLOCK(&frame->lock); + if (op_errno == ENODATA) + gf_msg_debug(this->name, 0, "failed to get node-uuid"); + else + gf_msg(this->name, GF_LOG_ERROR, op_errno, + DHT_MSG_GET_XATTR_FAILED, "failed to get node-uuid"); + goto post_unlock; + } - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (frame->local, out); + ret = dict_get_str(xattr, local->xsel, &uuid_list); - local = frame->local; + if (ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_GET_FAILED, + "Failed to get %s", local->xsel); + local->op_ret = -1; + local->op_errno = EINVAL; + goto unlock; + } - LOCK (&frame->lock); - { - this_call_cnt = --local->call_cnt; - if (op_ret < 0) { - if (op_errno != ENOTCONN) { - gf_log (this->name, GF_LOG_ERROR, - "getxattr err (%s) for dir", - strerror (op_errno)); - local->op_ret = -1; - local->op_errno = op_errno; - } + /* As DHT will not know details of its child xlators + * we need to parse this twice to get the count first + * and allocate memory later. + */ + count = 0; + index = conf->local_subvols_cnt; - goto unlock; - } + uuid_list_copy = gf_strdup(uuid_list); + if (!uuid_list_copy) + goto unlock; - ret = dht_vgetxattr_alloc_and_fill (local, xattr, this, - op_errno); - if (ret) - gf_log (this->name, GF_LOG_ERROR, - "alloc or fill failure"); - } - unlock: - UNLOCK (&frame->lock); + for (uuid_str = strtok_r(uuid_list, " ", &saveptr); uuid_str; + uuid_str = next_uuid_str) { + next_uuid_str = strtok_r(NULL, " ", &saveptr); + if (gf_uuid_parse(uuid_str, node_uuid)) { + local->op_ret = -1; + local->op_errno = EINVAL; + UNLOCK(&frame->lock); + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_UUID_PARSE_ERROR, + "Failed to parse uuid for %s", prev->name); + goto post_unlock; + } + + count++; + if (gf_uuid_compare(node_uuid, conf->defrag->node_uuid)) { + gf_msg_debug(this->name, 0, + "subvol %s does not" + "belong to this node", + prev->name); + } else { + /* handle multiple bricks of the same replica + * on the same node */ + if (found) + continue; + conf->local_subvols[(conf->local_subvols_cnt)++] = prev; + found = 1; + gf_msg_debug(this->name, 0, + "subvol %s belongs to" + " this node", + prev->name); + } + } + + if (!found) { + local->op_ret = 0; + goto unlock; + } + + conf->local_nodeuuids[index].count = count; + conf->local_nodeuuids[index].elements = GF_CALLOC( + count, sizeof(nodeuuid_info_t), 1); + + /* The node-uuids are guaranteed to be returned in the same + * order as the bricks + * A null node-uuid is returned for a brick that is down. + */ - if (!is_last_call (this_call_cnt)) - goto out; + saveptr = NULL; + i = 0; - /* -- last call: do patch ups -- */ + for (uuid_str = strtok_r(uuid_list_copy, " ", &saveptr); uuid_str; + uuid_str = next_uuid_str) { + next_uuid_str = strtok_r(NULL, " ", &saveptr); + tmp_ptr = &(conf->local_nodeuuids[index].elements[i]); + gf_uuid_parse(uuid_str, tmp_ptr->uuid); - if (local->op_ret == -1) { - goto unwind; + if (!gf_uuid_compare(tmp_ptr->uuid, conf->defrag->node_uuid)) { + tmp_ptr->info = REBAL_NODEUUID_MINE; + } + i++; + tmp_ptr = NULL; } + } - ret = dht_vgetxattr_fill_and_set (local, &dict, this, _gf_true); - if (ret) - goto unwind; + local->op_ret = 0; +unlock: + UNLOCK(&frame->lock); +post_unlock: + if (!is_last_call(this_call_cnt)) + goto out; - DHT_STACK_UNWIND (getxattr, frame, 0, 0, dict, xdata); - goto cleanup; + if (local->op_ret == -1) { + goto unwind; + } - unwind: - DHT_STACK_UNWIND (getxattr, frame, -1, local->op_errno, NULL, NULL); - cleanup: - if (dict) - dict_unref (dict); - out: - return 0; + DHT_STACK_UNWIND(getxattr, frame, 0, 0, xattr, xdata); + goto out; + +unwind: + + GF_FREE(conf->local_nodeuuids[index].elements); + conf->local_nodeuuids[index].elements = NULL; + + DHT_STACK_UNWIND(getxattr, frame, -1, local->op_errno, NULL, xdata); +out: + GF_FREE(uuid_list_copy); + return 0; } -int -dht_vgetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, dict_t *xattr, dict_t *xdata) +static int +dht_vgetxattr_dir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xattr, dict_t *xdata) { - dht_local_t *local = NULL; - int ret = 0; - dict_t *dict = NULL; - call_frame_t *prev = NULL; - gf_boolean_t flag = _gf_true; + int ret = 0; + dht_local_t *local = NULL; + int this_call_cnt = 0; + dict_t *dict = NULL; - local = frame->local; - prev = cookie; + VALIDATE_OR_GOTO(frame, out); + VALIDATE_OR_GOTO(frame->local, out); + + local = frame->local; + LOCK(&frame->lock); + { + this_call_cnt = --local->call_cnt; if (op_ret < 0) { + if (op_errno != ENOTCONN) { local->op_ret = -1; local->op_errno = op_errno; - gf_log (this->name, GF_LOG_ERROR, "Subvolume %s returned -1 " - "(%s)", prev->this->name, strerror (op_errno)); - goto unwind; + UNLOCK(&frame->lock); + gf_msg(this->name, GF_LOG_ERROR, op_errno, + DHT_MSG_GET_XATTR_FAILED, "getxattr err for dir"); + goto post_unlock; + } + + goto unlock; } - ret = dht_vgetxattr_alloc_and_fill (local, xattr, this, - op_errno); + ret = dht_vgetxattr_alloc_and_fill(local, xattr, this, op_errno); if (ret) { - gf_log (this->name, GF_LOG_ERROR, - "alloc or fill failure"); - goto unwind; + UNLOCK(&frame->lock); + gf_msg(this->name, GF_LOG_ERROR, op_errno, DHT_MSG_DICT_SET_FAILED, + "alloc or fill failure"); + goto post_unlock; } + } +unlock: + UNLOCK(&frame->lock); +post_unlock: + if (!is_last_call(this_call_cnt)) + goto out; - flag = (local->layout->cnt > 1) ? _gf_true : _gf_false; + /* -- last call: do patch ups -- */ - ret = dht_vgetxattr_fill_and_set (local, &dict, this, flag); - if (ret) - goto unwind; + if (local->op_ret == -1) { + goto unwind; + } - DHT_STACK_UNWIND (getxattr, frame, 0, 0, dict, xdata); - goto cleanup; + ret = dht_vgetxattr_fill_and_set(local, &dict, this, _gf_true); + if (ret) + goto unwind; - unwind: - DHT_STACK_UNWIND (getxattr, frame, -1, local->op_errno, - NULL, NULL); - cleanup: - if (dict) - dict_unref (dict); + DHT_STACK_UNWIND(getxattr, frame, 0, 0, dict, xdata); + goto cleanup; - return 0; +unwind: + DHT_STACK_UNWIND(getxattr, frame, -1, local->op_errno, NULL, NULL); +cleanup: + if (dict) + dict_unref(dict); +out: + return 0; } -int -dht_linkinfo_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, dict_t *xattr, - dict_t *xdata) +static int +dht_vgetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, dict_t *xattr, dict_t *xdata) { - int ret = 0; - char *value = NULL; + dht_local_t *local = NULL; + int ret = 0; + dict_t *dict = NULL; + xlator_t *prev = NULL; + gf_boolean_t flag = _gf_true; + + local = frame->local; + prev = cookie; + + if (op_ret < 0) { + local->op_ret = -1; + local->op_errno = op_errno; + gf_msg(this->name, GF_LOG_ERROR, op_errno, DHT_MSG_GET_XATTR_FAILED, + "vgetxattr: Subvolume %s returned -1", prev->name); + goto unwind; + } + + ret = dht_vgetxattr_alloc_and_fill(local, xattr, this, op_errno); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_NO_MEMORY, + "Allocation or fill failure"); + goto unwind; + } + + flag = (local->layout->cnt > 1) ? _gf_true : _gf_false; + + ret = dht_vgetxattr_fill_and_set(local, &dict, this, flag); + if (ret) + goto unwind; + + DHT_STACK_UNWIND(getxattr, frame, 0, 0, dict, xdata); + goto cleanup; - if (op_ret != -1) { - ret = dict_get_str (xattr, GF_XATTR_PATHINFO_KEY, &value); - if (!ret) { - ret = dict_set_str (xattr, GF_XATTR_LINKINFO_KEY, value); - if (!ret) - gf_log (this->name, GF_LOG_TRACE, - "failed to set linkinfo"); - } +unwind: + DHT_STACK_UNWIND(getxattr, frame, -1, local->op_errno, NULL, NULL); +cleanup: + if (dict) + dict_unref(dict); + + return 0; +} + +static int +dht_linkinfo_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xattr, + dict_t *xdata) +{ + int ret = 0; + char *value = NULL; + + if (op_ret != -1) { + ret = dict_get_str(xattr, GF_XATTR_PATHINFO_KEY, &value); + if (!ret) { + ret = dict_set_str(xattr, GF_XATTR_LINKINFO_KEY, value); + if (!ret) + gf_msg_trace(this->name, 0, "failed to set linkinfo"); } + } - DHT_STACK_UNWIND (getxattr, frame, op_ret, op_errno, xattr, xdata); + DHT_STACK_UNWIND(getxattr, frame, op_ret, op_errno, xattr, xdata); - return 0; + return 0; } -int -dht_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, dict_t *xattr, dict_t *xdata) +static int +dht_mds_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xattr, dict_t *xdata) { - int this_call_cnt = 0; - dht_local_t *local = NULL; - dht_conf_t *conf = NULL; + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (frame->local, out); - VALIDATE_OR_GOTO (this->private, out); + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(frame->local, err); + VALIDATE_OR_GOTO(this->private, err); - conf = this->private; - local = frame->local; + conf = this->private; + local = frame->local; - this_call_cnt = dht_frame_return (frame); + if (!xattr || (op_ret == -1)) { + local->op_ret = op_ret; + goto out; + } + dict_del(xattr, conf->xattr_name); + local->op_ret = 0; - if (!xattr || (op_ret == -1)) - goto out; + if (!local->xattr) { + local->xattr = dict_copy_with_ref(xattr, NULL); + } + +out: + DHT_STACK_UNWIND(getxattr, frame, local->op_ret, op_errno, local->xattr, + xdata); + return 0; +err: + DHT_STACK_UNWIND(getxattr, frame, -1, EINVAL, NULL, NULL); + return 0; +} - if (dict_get (xattr, conf->xattr_name)) { - dict_del (xattr, conf->xattr_name); +int +dht_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, dict_t *xattr, dict_t *xdata) +{ + int this_call_cnt = 0; + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + int ret = 0; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(frame->local, err); + VALIDATE_OR_GOTO(this->private, err); + + conf = this->private; + local = frame->local; + + if (dht_check_remote_fd_failed_error(local, op_ret, op_errno)) { + ret = dht_check_and_open_fd_on_subvol(this, frame); + if (ret) + goto err; + return 0; + } + + LOCK(&frame->lock); + { + if (!xattr || (op_ret == -1)) { + local->op_ret = op_ret; + goto unlock; } - if (frame->root->pid >= 0 ) { - GF_REMOVE_INTERNAL_XATTR("trusted.glusterfs.quota*", xattr); - GF_REMOVE_INTERNAL_XATTR("trusted.pgfid*", xattr); + dict_del(xattr, conf->xattr_name); + dict_del(xattr, conf->mds_xattr_key); + + dict_del(xattr, conf->commithash_xattr_name); + + if (frame->root->pid >= 0) { + GF_REMOVE_INTERNAL_XATTR("trusted.glusterfs.quota*", xattr); + GF_REMOVE_INTERNAL_XATTR("trusted.pgfid*", xattr); } local->op_ret = 0; if (!local->xattr) { - local->xattr = dict_copy_with_ref (xattr, NULL); + local->xattr = dict_copy_with_ref(xattr, NULL); } else { - dht_aggregate_xattr (local->xattr, xattr); + dht_aggregate_xattr(local->xattr, xattr); } -out: - if (is_last_call (this_call_cnt)) { - DHT_STACK_UNWIND (getxattr, frame, local->op_ret, op_errno, - local->xattr, NULL); + + if (!local->xdata) { + local->xdata = dict_ref(xdata); + } else if ((local->inode && IA_ISDIR(local->inode->ia_type)) || + (local->fd && IA_ISDIR(local->fd->inode->ia_type))) { + dht_aggregate_xattr(local->xdata, xdata); } - return 0; + } +unlock: + UNLOCK(&frame->lock); + + this_call_cnt = dht_frame_return(frame); + if (is_last_call(this_call_cnt)) { + /* If we have a valid xattr received from any one of the + * subvolume, let's return it */ + if (local->xattr) { + local->op_ret = 0; + } + + DHT_STACK_UNWIND(getxattr, frame, local->op_ret, op_errno, local->xattr, + local->xdata); + } + return 0; +err: + DHT_STACK_UNWIND(getxattr, frame, -1, EINVAL, NULL, NULL); + return 0; } -int32_t -dht_getxattr_unwind (call_frame_t *frame, - int op_ret, int op_errno, dict_t *dict, dict_t *xdata) +static int32_t +dht_getxattr_unwind(call_frame_t *frame, int op_ret, int op_errno, dict_t *dict, + dict_t *xdata) { - DHT_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict, xdata); - return 0; + DHT_STACK_UNWIND(getxattr, frame, op_ret, op_errno, dict, xdata); + return 0; } - -int -dht_getxattr_get_real_filename_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int op_ret, int op_errno, - dict_t *xattr, dict_t *xdata) +static int +dht_getxattr_get_real_filename_cbk(call_frame_t *frame, void *cookie, + xlator_t *this, int op_ret, int op_errno, + dict_t *xattr, dict_t *xdata) { - int this_call_cnt = 0; - dht_local_t *local = NULL; + int this_call_cnt = 0; + dht_local_t *local = NULL; + local = frame->local; - local = frame->local; + LOCK(&frame->lock); + { + if (local->op_errno == EOPNOTSUPP) { + /* Nothing to do here, we have already found + * a subvol which does not have the get_real_filename + * optimization. If condition is for simple logic. + */ + goto unlock; + } - if (op_ret != -1) { - if (local->xattr) - dict_unref (local->xattr); - local->xattr = dict_ref (xattr); + if (op_ret == -1) { + if (op_errno == EOPNOTSUPP) { + /* This subvol does not have the optimization. + * Better let the user know we don't support it. + * Remove previous results if any. + */ - if (local->xattr_req) - dict_unref (local->xattr_req); - local->xattr_req = dict_ref (xdata); - } + if (local->xattr) { + dict_unref(local->xattr); + local->xattr = NULL; + } - this_call_cnt = dht_frame_return (frame); - if (is_last_call (this_call_cnt)) { - DHT_STACK_UNWIND (getxattr, frame, local->op_ret, op_errno, - local->xattr, local->xattr_req); - } + if (local->xattr_req) { + dict_unref(local->xattr_req); + local->xattr_req = NULL; + } - return 0; + local->op_ret = op_ret; + local->op_errno = op_errno; + UNLOCK(&frame->lock); + gf_msg(this->name, GF_LOG_WARNING, op_errno, + DHT_MSG_UPGRADE_BRICKS, + "At least " + "one of the bricks does not support " + "this operation. Please upgrade all " + "bricks."); + goto post_unlock; + } + + if (op_errno == ENOATTR) { + /* Do nothing, our defaults are set to this. + */ + goto unlock; + } + + /* This is a place holder for every other error + * case. I am not sure of how to interpret + * ENOTCONN etc. As of now, choosing to ignore + * down subvol and return a good result(if any) + * from other subvol. + */ + UNLOCK(&frame->lock); + gf_msg(this->name, GF_LOG_WARNING, op_errno, + DHT_MSG_GET_XATTR_FAILED, "Failed to get real filename."); + goto post_unlock; + } + + /* This subvol has the required file. + * There could be other subvols which have returned + * success already, choosing to return the latest good + * result. + */ + if (local->xattr) + dict_unref(local->xattr); + local->xattr = dict_ref(xattr); + + if (local->xattr_req) { + dict_unref(local->xattr_req); + local->xattr_req = NULL; + } + if (xdata) + local->xattr_req = dict_ref(xdata); + + local->op_ret = op_ret; + local->op_errno = 0; + UNLOCK(&frame->lock); + gf_msg_debug(this->name, 0, "Found a matching file."); + goto post_unlock; + } +unlock: + UNLOCK(&frame->lock); +post_unlock: + this_call_cnt = dht_frame_return(frame); + if (is_last_call(this_call_cnt)) { + DHT_STACK_UNWIND(getxattr, frame, local->op_ret, local->op_errno, + local->xattr, local->xattr_req); + } + + return 0; } +static int +dht_getxattr_get_real_filename(call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *key, dict_t *xdata) +{ + dht_local_t *local = NULL; + int i = 0; + dht_layout_t *layout = NULL; + int cnt = 0; + xlator_t *subvol = NULL; + + local = frame->local; + layout = local->layout; -int -dht_getxattr_get_real_filename (call_frame_t *frame, xlator_t *this, - loc_t *loc, const char *key, dict_t *xdata) + cnt = local->call_cnt = layout->cnt; + + local->op_ret = -1; + local->op_errno = ENOATTR; + + for (i = 0; i < cnt; i++) { + subvol = layout->list[i].xlator; + STACK_WIND(frame, dht_getxattr_get_real_filename_cbk, subvol, + subvol->fops->getxattr, loc, key, xdata); + } + + return 0; +} + +static int +dht_marker_populate_args(call_frame_t *frame, int type, int *gauge, + xlator_t **subvols) { - dht_local_t *local = NULL; - int i = 0; - dht_layout_t *layout = NULL; - int cnt = 0; - xlator_t *subvol = NULL; + dht_local_t *local = NULL; + int i = 0; + dht_layout_t *layout = NULL; + local = frame->local; + layout = local->layout; - local = frame->local; - layout = local->layout; + for (i = 0; i < layout->cnt; i++) + subvols[i] = layout->list[i].xlator; - cnt = local->call_cnt = layout->cnt; + return layout->cnt; +} - local->op_ret = -1; - local->op_errno = ENODATA; +static int +dht_is_debug_xattr_key(const char **array, char *key) +{ + int i = 0; - for (i = 0; i < cnt; i++) { - subvol = layout->list[i].xlator; - STACK_WIND (frame, dht_getxattr_get_real_filename_cbk, - subvol, subvol->fops->getxattr, - loc, key, xdata); - } + for (i = 0; array[i]; i++) { + if (fnmatch(array[i], key, FNM_NOESCAPE) == 0) + return i; + } - return 0; + return -1; } +/* Note we already have frame->local initialised here*/ -int -dht_getxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, const char *key, dict_t *xdata) -#define DHT_IS_DIR(layout) (layout->cnt > 1) -{ - - xlator_t *subvol = NULL; - xlator_t *hashed_subvol = NULL; - xlator_t *cached_subvol = NULL; - dht_conf_t *conf = NULL; - dht_local_t *local = NULL; - dht_layout_t *layout = NULL; - xlator_t **sub_volumes = NULL; - int op_errno = -1; - int i = 0; - int cnt = 0; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (this->private, err); - - conf = this->private; - - local = dht_local_init (frame, loc, NULL, GF_FOP_GETXATTR); - if (!local) { - op_errno = ENOMEM; +static int +dht_handle_debug_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *key) +{ + dht_local_t *local = NULL; + int ret = -1; + int op_errno = ENODATA; + char *value = NULL; + loc_t file_loc = {0}; + const char *name = NULL; + + local = frame->local; + + if (dht_is_debug_xattr_key(dht_dbg_vxattrs, (char *)key) == -1) { + goto out; + } + + local->xattr = dict_new(); + if (!local->xattr) { + op_errno = ENOMEM; + goto out; + } + + if (strncmp(key, DHT_DBG_HASHED_SUBVOL_KEY, + SLEN(DHT_DBG_HASHED_SUBVOL_KEY)) == 0) { + name = key + strlen(DHT_DBG_HASHED_SUBVOL_KEY); + if (strlen(name) == 0) { + op_errno = EINVAL; + goto out; + } + + ret = dht_build_child_loc(this, &file_loc, loc, (char *)name); + if (ret) { + op_errno = ENOMEM; + goto out; + } - goto err; + local->hashed_subvol = dht_subvol_get_hashed(this, &file_loc); + if (local->hashed_subvol == NULL) { + op_errno = ENODATA; + goto out; } - layout = local->layout; - if (!layout) { - gf_log (this->name, GF_LOG_ERROR, - "layout is NULL"); - op_errno = ENOENT; - goto err; + value = gf_strdup(local->hashed_subvol->name); + if (!value) { + op_errno = ENOMEM; + goto out; } - if (key) { - local->key = gf_strdup (key); - if (!local->key) { - op_errno = ENOMEM; - goto err; - } + ret = dict_set_dynstr(local->xattr, (char *)key, value); + if (ret < 0) { + op_errno = -ret; + ret = -1; + goto out; } + ret = 0; + goto out; + } - if (key && - (strncmp (key, GF_XATTR_GET_REAL_FILENAME_KEY, - strlen (GF_XATTR_GET_REAL_FILENAME_KEY)) == 0) - && DHT_IS_DIR(layout)) { - dht_getxattr_get_real_filename (frame, this, loc, key, xdata); - return 0; - } +out: + loc_wipe(&file_loc); + DHT_STACK_UNWIND(getxattr, frame, ret, op_errno, local->xattr, NULL); + return 0; +} - /* for file use cached subvolume (obviously!): see if {} - * below - * for directory: - * wind to all subvolumes and exclude subvolumes which - * return ENOTCONN (in callback) - * - * NOTE: Don't trust inode here, as that may not be valid - * (until inode_link() happens) - */ - if (key && DHT_IS_DIR(layout) && - (XATTR_IS_PATHINFO (key) - || (strcmp (key, GF_XATTR_NODE_UUID_KEY) == 0))) { - (void) strncpy (local->xsel, key, 256); - cnt = local->call_cnt = layout->cnt; - for (i = 0; i < cnt; i++) { - subvol = layout->list[i].xlator; - STACK_WIND (frame, dht_vgetxattr_dir_cbk, - subvol, subvol->fops->getxattr, - loc, key, NULL); - } - return 0; - } +/* Virtual Xattr which returns 1 if all subvols are up, + else returns 0. Geo-rep then uses this virtual xattr + after a fresh mount and starts the I/O. +*/ - /* node-uuid or pathinfo for files */ - if (key && ((strcmp (key, GF_XATTR_NODE_UUID_KEY) == 0) - || XATTR_IS_PATHINFO (key))) { - cached_subvol = local->cached_subvol; - (void) strncpy (local->xsel, key, 256); +enum dht_vxattr_subvol { + DHT_VXATTR_SUBVOLS_UP = 1, + DHT_VXATTR_SUBVOLS_DOWN = 0, +}; - local->call_cnt = 1; - STACK_WIND (frame, dht_vgetxattr_cbk, cached_subvol, - cached_subvol->fops->getxattr, loc, key, NULL); +int +dht_vgetxattr_subvol_status(call_frame_t *frame, xlator_t *this, + const char *key) +{ + dht_local_t *local = NULL; + int ret = -1; + int op_errno = ENODATA; + int value = DHT_VXATTR_SUBVOLS_UP; + int i = 0; + dht_conf_t *conf = NULL; + + conf = this->private; + local = frame->local; + + if (!key) { + op_errno = EINVAL; + goto out; + } + local->xattr = dict_new(); + if (!local->xattr) { + op_errno = ENOMEM; + goto out; + } + for (i = 0; i < conf->subvolume_cnt; i++) { + if (!conf->subvolume_status[i]) { + value = DHT_VXATTR_SUBVOLS_DOWN; + gf_msg_debug(this->name, 0, "subvol %s is down ", + conf->subvolumes[i]->name); + break; + } + } + ret = dict_set_int8(local->xattr, (char *)key, value); + if (ret < 0) { + op_errno = -ret; + ret = -1; + goto out; + } + ret = 0; - return 0; - } +out: + DHT_STACK_UNWIND(getxattr, frame, ret, op_errno, local->xattr, NULL); + return 0; +} - if (key && (strcmp (key, GF_XATTR_LINKINFO_KEY) == 0)) { - hashed_subvol = dht_subvol_get_hashed (this, loc); - if (!hashed_subvol) { - gf_log (this->name, GF_LOG_ERROR, "Failed to get" - "hashed subvol for %s", loc->path); - op_errno = EINVAL; - goto err; - } +int +dht_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, const char *key, + dict_t *xdata) +#define DHT_IS_DIR(layout) (layout->cnt > 1) +{ + xlator_t *subvol = NULL; + xlator_t *hashed_subvol = NULL; + xlator_t *mds_subvol = NULL; + xlator_t *cached_subvol = NULL; + dht_conf_t *conf = NULL; + dht_local_t *local = NULL; + dht_layout_t *layout = NULL; + int op_errno = -1; + int i = 0; + int cnt = 0; + char *node_uuid_key = NULL; + int ret = -1; + + GF_CHECK_XATTR_KEY_AND_GOTO(key, IO_THREADS_QUEUE_SIZE_KEY, op_errno, err); + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(loc, err); + VALIDATE_OR_GOTO(loc->inode, err); + VALIDATE_OR_GOTO(this->private, err); + + conf = this->private; + + local = dht_local_init(frame, loc, NULL, GF_FOP_GETXATTR); + if (!local) { + op_errno = ENOMEM; + + goto err; + } + + layout = local->layout; + if (!layout) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LAYOUT_NULL, + "Layout is NULL"); + op_errno = ENOENT; + goto err; + } + + /* skip over code which is irrelevant without a valid key */ + if (!key) + goto no_key; + + local->key = gf_strdup(key); + if (!local->key) { + op_errno = ENOMEM; + goto err; + } + + if (strncmp(key, conf->mds_xattr_key, strlen(key)) == 0) { + op_errno = ENOTSUP; + goto err; + } + + if (strncmp(key, DHT_SUBVOL_STATUS_KEY, SLEN(DHT_SUBVOL_STATUS_KEY)) == 0) { + dht_vgetxattr_subvol_status(frame, this, key); + return 0; + } - cached_subvol = dht_subvol_get_cached (this, loc->inode); - if (!cached_subvol) { - gf_log (this->name, GF_LOG_ERROR, "Failed to get" - "cached subvol for %s", loc->path); - op_errno = EINVAL; - goto err; - } + /* skip over code which is irrelevant if !DHT_IS_DIR(layout) */ + if (!DHT_IS_DIR(layout)) + goto no_dht_is_dir; - if (hashed_subvol == cached_subvol) { - op_errno = ENODATA; - goto err; - } - if (hashed_subvol) { - STACK_WIND (frame, dht_linkinfo_getxattr_cbk, hashed_subvol, - hashed_subvol->fops->getxattr, loc, - GF_XATTR_PATHINFO_KEY, NULL); - return 0; - } - op_errno = ENODATA; - goto err; + if ((strncmp(key, GF_XATTR_GET_REAL_FILENAME_KEY, + SLEN(GF_XATTR_GET_REAL_FILENAME_KEY)) == 0) && + DHT_IS_DIR(layout)) { + dht_getxattr_get_real_filename(frame, this, loc, key, xdata); + return 0; + } + + if (!strcmp(key, GF_REBAL_FIND_LOCAL_SUBVOL)) { + ret = gf_asprintf(&node_uuid_key, "%s", GF_XATTR_LIST_NODE_UUIDS_KEY); + if (ret == -1 || !node_uuid_key) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_NO_MEMORY, + "Failed to copy node uuid key"); + op_errno = ENOMEM; + goto err; + } + (void)snprintf(local->xsel, sizeof(local->xsel), "%s", node_uuid_key); + cnt = local->call_cnt = conf->subvolume_cnt; + for (i = 0; i < cnt; i++) { + STACK_WIND_COOKIE(frame, dht_find_local_subvol_cbk, + conf->subvolumes[i], conf->subvolumes[i], + conf->subvolumes[i]->fops->getxattr, loc, + node_uuid_key, xdata); } + if (node_uuid_key) + GF_FREE(node_uuid_key); + return 0; + } + + if (!strcmp(key, GF_REBAL_OLD_FIND_LOCAL_SUBVOL)) { + ret = gf_asprintf(&node_uuid_key, "%s", GF_XATTR_NODE_UUID_KEY); + if (ret == -1 || !node_uuid_key) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_NO_MEMORY, + "Failed to copy node uuid key"); + op_errno = ENOMEM; + goto err; + } + (void)snprintf(local->xsel, sizeof(local->xsel), "%s", node_uuid_key); + cnt = local->call_cnt = conf->subvolume_cnt; + for (i = 0; i < cnt; i++) { + STACK_WIND_COOKIE(frame, dht_find_local_subvol_cbk, + conf->subvolumes[i], conf->subvolumes[i], + conf->subvolumes[i]->fops->getxattr, loc, + node_uuid_key, xdata); + } + if (node_uuid_key) + GF_FREE(node_uuid_key); + return 0; + } + + /* for file use cached subvolume (obviously!): see if {} + * below + * for directory: + * wind to all subvolumes and exclude subvolumes which + * return ENOTCONN (in callback) + * + * NOTE: Don't trust inode here, as that may not be valid + * (until inode_link() happens) + */ + + if (XATTR_IS_PATHINFO(key) || (strcmp(key, GF_XATTR_NODE_UUID_KEY) == 0) || + (strcmp(key, GF_XATTR_LIST_NODE_UUIDS_KEY) == 0)) { + (void)snprintf(local->xsel, sizeof(local->xsel), "%s", key); + cnt = local->call_cnt = layout->cnt; + for (i = 0; i < cnt; i++) { + subvol = layout->list[i].xlator; + STACK_WIND(frame, dht_vgetxattr_dir_cbk, subvol, + subvol->fops->getxattr, loc, key, xdata); + } + return 0; + } - if (key && (!strcmp (GF_XATTR_MARKER_KEY, key)) - && (GF_CLIENT_PID_GSYNCD == frame->root->pid)) { - if (DHT_IS_DIR(layout)) { - cnt = layout->cnt; - } else { - cnt = 1; - } +no_dht_is_dir: + /* node-uuid or pathinfo for files */ + if (XATTR_IS_PATHINFO(key) || (strcmp(key, GF_XATTR_NODE_UUID_KEY) == 0)) { + cached_subvol = local->cached_subvol; + (void)snprintf(local->xsel, sizeof(local->xsel), "%s", key); + local->call_cnt = 1; + STACK_WIND_COOKIE(frame, dht_vgetxattr_cbk, cached_subvol, + cached_subvol, cached_subvol->fops->getxattr, loc, + key, xdata); - sub_volumes = alloca ( cnt * sizeof (xlator_t *)); - for (i = 0; i < cnt; i++) - *(sub_volumes + i) = layout->list[i].xlator; - - if (cluster_getmarkerattr (frame, this, loc, key, - local, dht_getxattr_unwind, - sub_volumes, cnt, - MARKER_UUID_TYPE, marker_uuid_default_gauge, - conf->vol_uuid)) { - op_errno = EINVAL; - goto err; - } + return 0; + } - return 0; + if (strcmp(key, GF_XATTR_LINKINFO_KEY) == 0) { + hashed_subvol = dht_subvol_get_hashed(this, loc); + if (!hashed_subvol) { + gf_msg(this->name, GF_LOG_ERROR, 0, + DHT_MSG_HASHED_SUBVOL_GET_FAILED, + "Failed to get hashed subvol for %s", loc->path); + op_errno = EINVAL; + goto err; } - if (key && !strcmp (GF_XATTR_QUOTA_LIMIT_LIST, key)) { - /* quota hardlimit and aggregated size of a directory is stored - * in inode contexts of each brick. Hence its good enough that - * we send getxattr for this key to any brick. - */ - local->call_cnt = 1; - subvol = dht_first_up_subvol (this); - STACK_WIND (frame, dht_getxattr_cbk, subvol, - subvol->fops->getxattr, loc, key, xdata); - return 0; + cached_subvol = dht_subvol_get_cached(this, loc->inode); + if (!cached_subvol) { + gf_msg(this->name, GF_LOG_ERROR, 0, + DHT_MSG_CACHED_SUBVOL_GET_FAILED, + "Failed to get cached subvol for %s", loc->path); + op_errno = EINVAL; + goto err; } - if (key && *conf->vol_uuid) { - if ((match_uuid_local (key, conf->vol_uuid) == 0) && - (GF_CLIENT_PID_GSYNCD == frame->root->pid)) { - if (DHT_IS_DIR(layout)) { - cnt = layout->cnt; - } else { - cnt = 1; - } - sub_volumes = alloca ( cnt * sizeof (xlator_t *)); - for (i = 0; i < cnt; i++) - sub_volumes[i] = layout->list[i].xlator; - - if (cluster_getmarkerattr (frame, this, loc, key, - local, dht_getxattr_unwind, - sub_volumes, cnt, - MARKER_XTIME_TYPE, - marker_xtime_default_gauge, - conf->vol_uuid)) { - op_errno = EINVAL; - goto err; - } - - return 0; - } + if (hashed_subvol == cached_subvol) { + op_errno = ENODATA; + goto err; } - if (DHT_IS_DIR(layout)) { - cnt = local->call_cnt = layout->cnt; + STACK_WIND(frame, dht_linkinfo_getxattr_cbk, hashed_subvol, + hashed_subvol->fops->getxattr, loc, GF_XATTR_PATHINFO_KEY, + xdata); + return 0; + } + + if (dht_is_debug_xattr_key(dht_dbg_vxattrs, (char *)key) >= 0) { + dht_handle_debug_getxattr(frame, this, loc, key); + return 0; + } + +no_key: + if (cluster_handle_marker_getxattr(frame, loc, key, conf->vol_uuid, + dht_getxattr_unwind, + dht_marker_populate_args) == 0) + return 0; + + if (DHT_IS_DIR(layout)) { + local->call_cnt = conf->subvolume_cnt; + cnt = conf->subvolume_cnt; + ret = dht_inode_ctx_mdsvol_get(loc->inode, this, &mds_subvol); + if (!mds_subvol) { + gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_HASHED_SUBVOL_GET_FAILED, + "Cannot determine MDS, fetching xattr %s randomly" + " from a subvol for path %s ", + key, loc->path); } else { - cnt = local->call_cnt = 1; + /* TODO need to handle it, As of now we are + choosing availability instead of chossing + consistencty, in case of mds_subvol is + down winding a getxattr call on other subvol + and return xattr + */ + local->mds_subvol = mds_subvol; + for (i = 0; i < cnt; i++) { + if (conf->subvolumes[i] == mds_subvol) { + if (!conf->subvolume_status[i]) { + gf_msg(this->name, GF_LOG_INFO, 0, + DHT_MSG_HASHED_SUBVOL_DOWN, + "MDS %s is down for path" + " path %s so fetching xattr " + "%s randomly from a subvol ", + local->mds_subvol->name, loc->path, key); + ret = 1; + } + } + } } - for (i = 0; i < cnt; i++) { - subvol = layout->list[i].xlator; - STACK_WIND (frame, dht_getxattr_cbk, - subvol, subvol->fops->getxattr, - loc, key, NULL); + if (!ret && key && local->mds_subvol && dht_match_xattr(key)) { + STACK_WIND(frame, dht_mds_getxattr_cbk, local->mds_subvol, + local->mds_subvol->fops->getxattr, loc, key, xdata); + + return 0; } - return 0; + } else { + cnt = local->call_cnt = 1; + } + + for (i = 0; i < cnt; i++) { + subvol = layout->list[i].xlator; + STACK_WIND(frame, dht_getxattr_cbk, subvol, subvol->fops->getxattr, loc, + key, xdata); + } + return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (getxattr, frame, -1, op_errno, NULL, NULL); + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND(getxattr, frame, -1, op_errno, NULL, NULL); - return 0; + return 0; } #undef DHT_IS_DIR int -dht_fgetxattr (call_frame_t *frame, xlator_t *this, - fd_t *fd, const char *key, dict_t *xdata) -{ - xlator_t *subvol = NULL; - dht_local_t *local = NULL; - dht_layout_t *layout = NULL; - int op_errno = -1; - int i = 0; - int cnt = 0; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); - VALIDATE_OR_GOTO (fd->inode, err); - VALIDATE_OR_GOTO (this->private, err); - - local = dht_local_init (frame, NULL, fd, GF_FOP_FGETXATTR); - if (!local) { - op_errno = ENOMEM; - - goto err; +dht_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *key, + dict_t *xdata) +{ + xlator_t *subvol = NULL; + dht_local_t *local = NULL; + dht_layout_t *layout = NULL; + int op_errno = -1; + int i = 0; + int cnt = 0; + xlator_t *mds_subvol = NULL; + int ret = -1; + dht_conf_t *conf = NULL; + char gfid[GF_UUID_BUF_SIZE] = {0}; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(fd, err); + VALIDATE_OR_GOTO(fd->inode, err); + VALIDATE_OR_GOTO(this->private, err); + + conf = this->private; + + local = dht_local_init(frame, NULL, fd, GF_FOP_FGETXATTR); + if (!local) { + op_errno = ENOMEM; + + goto err; + } + + layout = local->layout; + if (!layout) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LAYOUT_NULL, + "Layout is NULL"); + op_errno = ENOENT; + goto err; + } + + if (key) { + local->key = gf_strdup(key); + if (!local->key) { + op_errno = ENOMEM; + goto err; + } + } + + gf_uuid_unparse(fd->inode->gfid, gfid); + + if ((fd->inode->ia_type == IA_IFDIR) && key && + (strncmp(key, GF_XATTR_LOCKINFO_KEY, SLEN(GF_XATTR_LOCKINFO_KEY)) != + 0)) { + local->call_cnt = conf->subvolume_cnt; + cnt = conf->subvolume_cnt; + ret = dht_inode_ctx_mdsvol_get(fd->inode, this, &mds_subvol); + + if (!mds_subvol) { + gf_msg(this->name, GF_LOG_ERROR, 0, + DHT_MSG_HASHED_SUBVOL_GET_FAILED, + "cannot determine MDS, fetching xattr %s " + " randomly from a subvol for gfid %s ", + key, gfid); + } else { + /* TODO need to handle it, As of now we are + choosing availability instead of chossing + consistencty, in case of hashed_subvol is + down winding a getxattr call on other subvol + and return xattr + */ + local->mds_subvol = mds_subvol; + for (i = 0; i < cnt; i++) { + if (conf->subvolumes[i] == mds_subvol) { + if (!conf->subvolume_status[i]) { + gf_msg(this->name, GF_LOG_WARNING, 0, + DHT_MSG_HASHED_SUBVOL_DOWN, + "MDS subvolume %s is down" + " for gfid %s so fetching xattr " + " %s randomly from a subvol ", + local->mds_subvol->name, gfid, key); + ret = 1; + } + } + } } - layout = local->layout; - if (!layout) { - gf_log (this->name, GF_LOG_ERROR, - "layout is NULL"); - op_errno = ENOENT; - goto err; - } + if (!ret && key && local->mds_subvol && dht_match_xattr(key)) { + STACK_WIND(frame, dht_mds_getxattr_cbk, local->mds_subvol, + local->mds_subvol->fops->fgetxattr, fd, key, NULL); - if (key) { - local->key = gf_strdup (key); - if (!local->key) { - op_errno = ENOMEM; - goto err; - } + return 0; } - if ((fd->inode->ia_type == IA_IFDIR) - && key - && (strncmp (key, GF_XATTR_LOCKINFO_KEY, - strlen (GF_XATTR_LOCKINFO_KEY) != 0))) { - cnt = local->call_cnt = layout->cnt; - } else { - cnt = local->call_cnt = 1; - } + } else { + cnt = local->call_cnt = 1; + } - for (i = 0; i < cnt; i++) { - subvol = layout->list[i].xlator; - STACK_WIND (frame, dht_getxattr_cbk, - subvol, subvol->fops->fgetxattr, - fd, key, NULL); - } - return 0; + for (i = 0; i < cnt; i++) { + subvol = layout->list[i].xlator; + STACK_WIND(frame, dht_getxattr_cbk, subvol, subvol->fops->fgetxattr, fd, + key, NULL); + } + return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (fgetxattr, frame, -1, op_errno, NULL, NULL); + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND(fgetxattr, frame, -1, op_errno, NULL, NULL); - return 0; + return 0; } -int -dht_fsetxattr (call_frame_t *frame, xlator_t *this, - fd_t *fd, dict_t *xattr, int flags, dict_t *xdata) +static int +dht_setxattr2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret) { - xlator_t *subvol = NULL; - dht_local_t *local = NULL; - int op_errno = EINVAL; - dht_conf_t *conf = NULL; + dht_local_t *local = NULL; + int op_errno = EINVAL; - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); - VALIDATE_OR_GOTO (fd->inode, err); - VALIDATE_OR_GOTO (this->private, err); + if (!frame || !frame->local) + goto err; - conf = this->private; + local = frame->local; + op_errno = local->op_errno; - GF_IF_INTERNAL_XATTR_GOTO (conf->wild_xattr_name, xattr, - op_errno, err); - - local = dht_local_init (frame, NULL, fd, GF_FOP_FSETXATTR); - if (!local) { - op_errno = ENOMEM; - goto err; - } + if (we_are_not_migrating(ret)) { + /* This dht xlator is not migrating the file. Unwind and + * pass on the original mode bits so the higher DHT layer + * can handle this. + */ + DHT_STACK_UNWIND(setxattr, frame, local->op_ret, local->op_errno, + local->rebalance.xdata); + return 0; + } - subvol = local->cached_subvol; - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for fd=%p", fd); - op_errno = EINVAL; - goto err; - } + if (subvol == NULL) + goto err; - local->call_cnt = 1; + local->call_cnt = 2; /* This is the second attempt */ - STACK_WIND (frame, dht_err_cbk, subvol, subvol->fops->fsetxattr, - fd, xattr, flags, NULL); + if (local->fop == GF_FOP_SETXATTR) { + STACK_WIND_COOKIE(frame, dht_file_setxattr_cbk, subvol, subvol, + subvol->fops->setxattr, &local->loc, + local->rebalance.xattr, local->rebalance.flags, + local->xattr_req); + } else { + STACK_WIND_COOKIE(frame, dht_file_setxattr_cbk, subvol, subvol, + subvol->fops->fsetxattr, local->fd, + local->rebalance.xattr, local->rebalance.flags, + local->xattr_req); + } - return 0; + return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (fsetxattr, frame, -1, op_errno, NULL); - - return 0; + DHT_STACK_UNWIND(setxattr, frame, (local ? local->op_ret : -1), op_errno, + NULL); + return 0; } - -static int -dht_common_setxattr_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *xdata) +int +dht_file_setxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) { - DHT_STACK_UNWIND (setxattr, frame, op_ret, op_errno, xdata); + int ret = -1; + dht_local_t *local = NULL; + xlator_t *prev = NULL; + struct iatt *stbuf = NULL; + inode_t *inode = NULL; + xlator_t *subvol1 = NULL, *subvol2 = NULL; + + local = frame->local; + prev = cookie; + local->op_errno = op_errno; + + if ((local->fop == GF_FOP_FSETXATTR) && + dht_check_remote_fd_failed_error(local, op_ret, op_errno)) { + ret = dht_check_and_open_fd_on_subvol(this, frame); + if (ret) + goto out; return 0; -} + } -int -dht_checking_pathinfo_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, dict_t *xattr, - dict_t *xdata) -{ - int i = -1; - int ret = -1; - char *value = NULL; - dht_local_t *local = NULL; - dht_conf_t *conf = NULL; - call_frame_t *prev = NULL; - int this_call_cnt = 0; + if ((op_ret == -1) && !dht_inode_missing(op_errno)) { + gf_msg_debug(this->name, op_errno, "subvolume %s returned -1.", + prev->name); + goto out; + } - local = frame->local; - prev = cookie; - conf = this->private; + if (local->call_cnt != 1) + goto out; - if (op_ret == -1) - goto out; + ret = dict_get_bin(xdata, DHT_IATT_IN_XDATA_KEY, (void **)&stbuf); + if ((!op_ret) && !stbuf) { + goto out; + } - ret = dict_get_str (xattr, GF_XATTR_PATHINFO_KEY, &value); - if (ret) - goto out; + local->op_ret = op_ret; + local->rebalance.target_op_fn = dht_setxattr2; + if (xdata) + local->rebalance.xdata = dict_ref(xdata); - if (!strcmp (value, local->key)) { - for (i = 0; i < conf->subvolume_cnt; i++) { - if (conf->subvolumes[i] == prev->this) - conf->decommissioned_bricks[i] = prev->this; - } + /* Phase 2 of migration */ + if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2(stbuf)) { + ret = dht_rebalance_complete_check(this, frame); + if (!ret) + return 0; + } + + /* Phase 1 of migration */ + if (IS_DHT_MIGRATION_PHASE1(stbuf)) { + inode = (local->fd) ? local->fd->inode : local->loc.inode; + + ret = dht_inode_ctx_get_mig_info(this, inode, &subvol1, &subvol2); + if (!dht_mig_info_is_invalid(local->cached_subvol, subvol1, subvol2)) { + dht_setxattr2(this, subvol2, frame, 0); + return 0; } + ret = dht_rebalance_in_progress_check(this, frame); + if (!ret) + return 0; + } + out: - this_call_cnt = dht_frame_return (frame); - if (is_last_call (this_call_cnt)) { - DHT_STACK_UNWIND (setxattr, frame, local->op_ret, ENOTSUP, NULL); - } - return 0; + if (local->fop == GF_FOP_SETXATTR) { + DHT_STACK_UNWIND(setxattr, frame, op_ret, op_errno, xdata); + } else { + DHT_STACK_UNWIND(fsetxattr, frame, op_ret, op_errno, xdata); + } + + return 0; } -int -dht_setxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *xattr, int flags, dict_t *xdata) -{ - xlator_t *subvol = NULL; - dht_local_t *local = NULL; - dht_conf_t *conf = NULL; - dht_layout_t *layout = NULL; - int i = 0; - int op_errno = EINVAL; - int ret = -1; - data_t *tmp = NULL; - uint32_t dir_spread = 0; - char value[4096] = {0,}; - gf_dht_migrate_data_type_t forced_rebalance = GF_DHT_MIGRATE_DATA; - int call_cnt = 0; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->inode, err); - - conf = this->private; - - GF_IF_INTERNAL_XATTR_GOTO (conf->wild_xattr_name, xattr, - op_errno, err); - - local = dht_local_init (frame, loc, NULL, GF_FOP_SETXATTR); - if (!local) { - op_errno = ENOMEM; - goto err; +/* Function is call by dict_foreach_fnmatch if key is match with + user.* and set boolean flag to true +*/ +static int +dht_is_user_xattr(dict_t *this, char *key, data_t *value, void *data) +{ + gf_boolean_t *user_xattr_found = data; + *user_xattr_found = _gf_true; + return 0; +} + +/* Common code to wind a (f)(set|remove)xattr call to set xattr on directory + */ +static int +dht_dir_common_set_remove_xattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + fd_t *fd, dict_t *xattr, int flags, + dict_t *xdata, int *op_errno) + +{ + dict_t *xattrop = NULL; + int32_t subone[1] = {-1}; + gf_boolean_t uxattr_key_found = _gf_false; + xlator_t *mds_subvol = NULL; + xlator_t *travvol = NULL; + dht_conf_t *conf = NULL; + int ret = -1; + int i = 0; + int call_cnt = 0; + dht_local_t *local = NULL; + char gfid_local[GF_UUID_BUF_SIZE] = {0}; + char **xattrs_to_heal; + + conf = this->private; + local = frame->local; + call_cnt = conf->subvolume_cnt; + local->flags = flags; + xattrs_to_heal = get_xattrs_to_heal(); + + if (!gf_uuid_is_null(local->gfid)) { + gf_uuid_unparse(local->gfid, gfid_local); + } + + if ((local->fop == GF_FOP_SETXATTR) || (local->fop == GF_FOP_FSETXATTR)) { + /* Check if any user xattr present in xattr + */ + dict_foreach_fnmatch(xattr, "user*", dht_is_user_xattr, + &uxattr_key_found); + + /* Check if any custom key xattr present in dict xattr + and start index from 1 because user xattr already + checked in previous line + */ + for (i = 1; xattrs_to_heal[i]; i++) + if (dict_get(xattr, xattrs_to_heal[i])) + uxattr_key_found = _gf_true; + } + + if ((local->fop == GF_FOP_REMOVEXATTR) || + (local->fop == GF_FOP_FREMOVEXATTR)) { + /* Check if any custom key xattr present in local->key + */ + for (i = 0; xattrs_to_heal[i]; i++) + if (strstr(local->key, xattrs_to_heal[i])) + uxattr_key_found = _gf_true; + } + + /* If there is no custom key xattr present or gfid is root + or call_cnt is 1 then wind a (f)setxattr call on all subvols + */ + if (!uxattr_key_found || __is_root_gfid(local->gfid) || call_cnt == 1) { + for (i = 0; i < conf->subvolume_cnt; i++) { + travvol = conf->subvolumes[i]; + if ((local->fop == GF_FOP_SETXATTR) || + (local->fop == GF_FOP_FSETXATTR)) { + if (fd) { + STACK_WIND_COOKIE(frame, dht_err_cbk, travvol, travvol, + travvol->fops->fsetxattr, fd, xattr, + flags, xdata); + } else { + STACK_WIND_COOKIE(frame, dht_err_cbk, travvol, travvol, + travvol->fops->setxattr, loc, xattr, + flags, xdata); + } + } + + if ((local->fop == GF_FOP_REMOVEXATTR) || + (local->fop == GF_FOP_FREMOVEXATTR)) { + if (fd) { + STACK_WIND_COOKIE(frame, dht_err_cbk, travvol, travvol, + travvol->fops->fremovexattr, fd, + local->key, local->xattr_req); + } else { + STACK_WIND_COOKIE(frame, dht_err_cbk, travvol, travvol, + travvol->fops->removexattr, loc, + local->key, local->xattr_req); + } + } } - subvol = local->cached_subvol; - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for path=%s", loc->path); - op_errno = EINVAL; + return 0; + } + + /* Calculate hash subvol based on inode and parent inode + */ + if (fd) { + ret = dht_inode_ctx_mdsvol_get(fd->inode, this, &mds_subvol); + } else { + ret = dht_inode_ctx_mdsvol_get(loc->inode, this, &mds_subvol); + } + if (ret || !mds_subvol) { + if (fd) { + gf_msg(this->name, GF_LOG_ERROR, 0, + DHT_MSG_HASHED_SUBVOL_GET_FAILED, + "Failed to get mds subvol for fd %p" + "gfid is %s ", + fd, gfid_local); + } else { + gf_msg(this->name, GF_LOG_ERROR, 0, + DHT_MSG_HASHED_SUBVOL_GET_FAILED, + "%s: Failed to get mds subvol. (gfid is %s)", loc->path, + gfid_local); + } + (*op_errno) = ENOENT; + goto err; + } + + local->mds_subvol = mds_subvol; + + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->subvolumes[i] == mds_subvol) { + if (!conf->subvolume_status[i]) { + gf_msg(this->name, GF_LOG_WARNING, 0, + DHT_MSG_HASHED_SUBVOL_DOWN, + "MDS subvol is down for path " + " %s gfid is %s Unable to set xattr ", + local->loc.path, gfid_local); + (*op_errno) = ENOTCONN; goto err; + } + } + } + + if (uxattr_key_found) { + xattrop = dict_new(); + if (!xattrop) { + gf_msg(this->name, GF_LOG_ERROR, DHT_MSG_NO_MEMORY, 0, + "dictionary creation failed for path %s " + "for gfid is %s ", + local->loc.path, gfid_local); + (*op_errno) = ENOMEM; + goto err; + } + local->xattr = dict_ref(xattr); + /* Subtract current MDS xattr value to -1 , value of MDS + xattr represents no. of times xattr modification failed + on non MDS subvols. + */ + ret = dht_dict_set_array(xattrop, conf->mds_xattr_key, subone, 1); + if (ret != 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, + "dictionary set array failed for path %s " + "for gfid is %s ", + local->loc.path, gfid_local); + if (xattrop) + dict_unref(xattrop); + (*op_errno) = ret; + goto err; + } + /* Wind a xattrop call to use ref counting approach + update mds xattr to -1 before update xattr on + hashed subvol and update mds xattr to +1 after update + xattr on all non hashed subvol + */ + if (fd) { + STACK_WIND(frame, dht_xattrop_mds_cbk, local->mds_subvol, + local->mds_subvol->fops->fxattrop, fd, + GF_XATTROP_ADD_ARRAY, xattrop, NULL); + } else { + STACK_WIND(frame, dht_xattrop_mds_cbk, local->mds_subvol, + local->mds_subvol->fops->xattrop, loc, + GF_XATTROP_ADD_ARRAY, xattrop, NULL); } + if (xattrop) + dict_unref(xattrop); + } - layout = local->layout; - if (!layout) { - gf_log (this->name, GF_LOG_DEBUG, - "no layout for path=%s", loc->path); - op_errno = EINVAL; - goto err; + return 0; +err: + return -1; +} + +int +dht_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xattr, + int flags, dict_t *xdata) +{ + xlator_t *subvol = NULL; + dht_local_t *local = NULL; + int op_errno = EINVAL; + dht_conf_t *conf = NULL; + dht_layout_t *layout = NULL; + int ret = -1; + int call_cnt = 0; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(fd, err); + VALIDATE_OR_GOTO(fd->inode, err); + VALIDATE_OR_GOTO(this->private, err); + + conf = this->private; + + if (!conf->defrag) + GF_IF_INTERNAL_XATTR_GOTO(conf->wild_xattr_name, xattr, op_errno, err); + + local = dht_local_init(frame, NULL, fd, GF_FOP_FSETXATTR); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + subvol = local->cached_subvol; + if (!subvol) { + gf_msg_debug(this->name, 0, "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + layout = local->layout; + if (!layout) { + gf_msg_debug(this->name, 0, "no layout for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + local->xattr_req = xdata ? dict_ref(xdata) : dict_new(); + local->call_cnt = call_cnt = layout->cnt; + + if (IA_ISDIR(fd->inode->ia_type)) { + local->hashed_subvol = NULL; + ret = dht_dir_common_set_remove_xattr(frame, this, NULL, fd, xattr, + flags, xdata, &op_errno); + if (ret) + goto err; + } else { + local->call_cnt = 1; + local->rebalance.xattr = dict_ref(xattr); + local->rebalance.flags = flags; + + ret = dict_set_int8(local->xattr_req, DHT_IATT_IN_XDATA_KEY, 1); + if (ret) { + gf_msg_debug(this->name, 0, + "Failed to set dictionary key %s for fd=%p", + DHT_IATT_IN_XDATA_KEY, fd); } - local->call_cnt = call_cnt = layout->cnt; + STACK_WIND_COOKIE(frame, dht_file_setxattr_cbk, subvol, subvol, + subvol->fops->fsetxattr, fd, xattr, flags, + local->xattr_req); + } + return 0; - tmp = dict_get (xattr, "distribute.migrate-data"); - if (tmp) { - if (IA_ISDIR (loc->inode->ia_type)) { - op_errno = ENOTSUP; - goto err; - } +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND(fsetxattr, frame, -1, op_errno, NULL); - /* TODO: need to interpret the 'value' for more meaning - (ie, 'target' subvolume given there, etc) */ - memcpy (value, tmp->data, tmp->len); - if (strcmp (value, "force") == 0) - forced_rebalance = - GF_DHT_MIGRATE_DATA_EVEN_IF_LINK_EXISTS; - - if (conf->decommission_in_progress) - forced_rebalance = GF_DHT_MIGRATE_HARDLINK; - - local->rebalance.target_node = dht_subvol_get_hashed (this, loc); - if (!local->rebalance.target_node) { - gf_log (this->name, GF_LOG_ERROR, "Failed to get " - "hashed subvol for %s", loc->path); - op_errno = EINVAL; - goto err; - } + return 0; +} - local->rebalance.from_subvol = local->cached_subvol; +static int +dht_checking_pathinfo_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xattr, + dict_t *xdata) +{ + int i = -1; + int ret = -1; + char *value = NULL; + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + xlator_t *prev = NULL; + int this_call_cnt = 0; + + local = frame->local; + prev = cookie; + conf = this->private; + + if (op_ret == -1) + goto out; + + ret = dict_get_str(xattr, GF_XATTR_PATHINFO_KEY, &value); + if (ret) + goto out; + + if (!strcmp(value, local->key)) { + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->subvolumes[i] == prev) + conf->decommissioned_bricks[i] = prev; + } + } - if (local->rebalance.target_node == local->rebalance.from_subvol) { - op_errno = EEXIST; - goto err; - } - if (local->rebalance.target_node) { - local->flags = forced_rebalance; +out: + this_call_cnt = dht_frame_return(frame); + if (is_last_call(this_call_cnt)) { + DHT_STACK_UNWIND(setxattr, frame, local->op_ret, ENOTSUP, NULL); + } + return 0; +} - ret = dht_start_rebalance_task (this, frame); - if (!ret) - return 0; +static int +dht_nuke_dir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + STACK_UNWIND_STRICT(setxattr, frame, op_ret, op_errno, NULL); + return 0; +} - gf_log (this->name, GF_LOG_ERROR, - "%s: failed to create a new synctask", - loc->path); - } - op_errno = EINVAL; - goto err; +static int +dht_nuke_dir(call_frame_t *frame, xlator_t *this, loc_t *loc, data_t *tmp) +{ + if (!IA_ISDIR(loc->inode->ia_type)) { + DHT_STACK_UNWIND(setxattr, frame, -1, ENOTSUP, NULL); + return 0; + } - } + /* Setxattr didn't need the parent, but rmdir does. */ + loc->parent = inode_parent(loc->inode, NULL, NULL); + if (!loc->parent) { + DHT_STACK_UNWIND(setxattr, frame, -1, ENOENT, NULL); + return 0; + } + gf_uuid_copy(loc->pargfid, loc->parent->gfid); + + if (!loc->name && loc->path) { + loc->name = strrchr(loc->path, '/'); + if (loc->name) { + ++(loc->name); + } + } + + /* + * We do this instead of calling dht_rmdir_do directly for two reasons. + * The first is that we want to reuse all of the initialization that + * dht_rmdir does, so if it ever changes we'll just follow along. The + * second (i.e. why we don't use STACK_WIND_TAIL) is so that we don't + * obscure the fact that we came in via this path instead of a genuine + * rmdir. That makes debugging just a tiny bit easier. + */ + STACK_WIND(frame, dht_nuke_dir_cbk, this, this->fops->rmdir, loc, 1, NULL); + + return 0; +} - tmp = dict_get (xattr, "decommission-brick"); - if (tmp) { - /* This operation should happen only on '/' */ - if (!__is_root_gfid (loc->inode->gfid)) { - op_errno = ENOTSUP; - goto err; - } +int +dht_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr, + int flags, dict_t *xdata) +{ + xlator_t *subvol = NULL; + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + dht_methods_t *methods = NULL; + dht_layout_t *layout = NULL; + int i = 0; + int op_errno = EINVAL; + int ret = -1; + data_t *tmp = NULL; + uint32_t dir_spread = 0; + char value[4096] = { + 0, + }; + gf_dht_migrate_data_type_t forced_rebalance = GF_DHT_MIGRATE_DATA; + int call_cnt = 0; + uint32_t new_hash = 0; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(loc, err); + VALIDATE_OR_GOTO(loc->inode, err); + + conf = this->private; + GF_VALIDATE_OR_GOTO(this->name, conf, err); + + methods = &(conf->methods); + + /* Rebalance daemon is allowed to set internal keys */ + if (!conf->defrag) + GF_IF_INTERNAL_XATTR_GOTO(conf->wild_xattr_name, xattr, op_errno, err); + + local = dht_local_init(frame, loc, NULL, GF_FOP_SETXATTR); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + subvol = local->cached_subvol; + if (!subvol) { + gf_msg_debug(this->name, 0, "no cached subvolume for path=%s", + loc->path); + op_errno = EINVAL; + goto err; + } + + layout = local->layout; + if (!layout) { + gf_msg_debug(this->name, 0, "no layout for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + + local->call_cnt = call_cnt = layout->cnt; + tmp = dict_get(xattr, conf->mds_xattr_key); + if (tmp) { + op_errno = ENOTSUP; + goto err; + } + + tmp = dict_get(xattr, GF_XATTR_FILE_MIGRATE_KEY); + if (tmp) { + if (IA_ISDIR(loc->inode->ia_type)) { + op_errno = ENOTSUP; + goto err; + } + + /* TODO: need to interpret the 'value' for more meaning + (ie, 'target' subvolume given there, etc) */ + memcpy(value, tmp->data, tmp->len); + if (strcmp(value, "force") == 0) + forced_rebalance = GF_DHT_MIGRATE_DATA_EVEN_IF_LINK_EXISTS; + + if (conf->decommission_in_progress) + forced_rebalance = GF_DHT_MIGRATE_HARDLINK; + + if (!loc->path) { + op_errno = EINVAL; + goto err; + } + + if (!local->loc.name) + local->loc.name = strrchr(local->loc.path, '/') + 1; + + if (!local->loc.parent) + local->loc.parent = inode_parent(local->loc.inode, NULL, NULL); + + if ((!local->loc.name) || (!local->loc.parent)) { + op_errno = EINVAL; + goto err; + } + + if (gf_uuid_is_null(local->loc.pargfid)) + gf_uuid_copy(local->loc.pargfid, local->loc.parent->gfid); + + methods->migration_get_dst_subvol(this, local); + + if (!local->rebalance.target_node) { + gf_msg(this->name, GF_LOG_ERROR, 0, + DHT_MSG_HASHED_SUBVOL_GET_FAILED, + "Failed to get hashed subvol for %s", loc->path); + op_errno = EINVAL; + goto err; + } + + local->rebalance.from_subvol = local->cached_subvol; + + if (local->rebalance.target_node == local->rebalance.from_subvol) { + op_errno = EEXIST; + goto err; + } + if (local->rebalance.target_node) { + local->flags = forced_rebalance; - memcpy (value, tmp->data, ((tmp->len < 4095) ? tmp->len : 4095)); - local->key = gf_strdup (value); - local->call_cnt = conf->subvolume_cnt; + frame->root->pid = GF_CLIENT_PID_DEFRAG; - for (i = 0 ; i < conf->subvolume_cnt; i++) { - /* Get the pathinfo, and then compare */ - STACK_WIND (frame, dht_checking_pathinfo_cbk, - conf->subvolumes[i], - conf->subvolumes[i]->fops->getxattr, - loc, GF_XATTR_PATHINFO_KEY, NULL); - } + ret = dht_start_rebalance_task(this, frame); + if (!ret) return 0; + + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_REBALANCE_START_FAILED, + "%s: failed to create a new rebalance synctask", loc->path); } + op_errno = EINVAL; + goto err; + } - tmp = dict_get (xattr, GF_XATTR_FIX_LAYOUT_KEY); - if (tmp) { - gf_log (this->name, GF_LOG_INFO, - "fixing the layout of %s", loc->path); + tmp = dict_get(xattr, "decommission-brick"); + if (tmp) { + /* This operation should happen only on '/' */ + if (!__is_root_gfid(loc->inode->gfid)) { + op_errno = ENOTSUP; + goto err; + } - ret = dht_fix_directory_layout (frame, dht_common_setxattr_cbk, - layout); - if (ret) { - op_errno = ENOTCONN; - goto err; - } - return ret; + memcpy(value, tmp->data, min(tmp->len, 4095)); + local->key = gf_strdup(value); + local->call_cnt = conf->subvolume_cnt; + + for (i = 0; i < conf->subvolume_cnt; i++) { + /* Get the pathinfo, and then compare */ + STACK_WIND_COOKIE(frame, dht_checking_pathinfo_cbk, + conf->subvolumes[i], conf->subvolumes[i], + conf->subvolumes[i]->fops->getxattr, loc, + GF_XATTR_PATHINFO_KEY, NULL); } + return 0; + } - tmp = dict_get (xattr, "distribute.directory-spread-count"); - if (tmp) { - /* Setxattr value is packed as 'binary', not string */ - memcpy (value, tmp->data, ((tmp->len < 4095)?tmp->len:4095)); - ret = gf_string2uint32 (value, &dir_spread); - if (!ret && ((dir_spread <= conf->subvolume_cnt) && - (dir_spread > 0))) { - layout->spread_cnt = dir_spread; - - ret = dht_fix_directory_layout (frame, - dht_common_setxattr_cbk, - layout); - if (ret) { - op_errno = ENOTCONN; - goto err; - } - return ret; - } - gf_log (this->name, GF_LOG_ERROR, - "wrong 'directory-spread-count' value (%s)", value); - op_errno = ENOTSUP; + tmp = dict_get(xattr, GF_XATTR_FIX_LAYOUT_KEY); + if (tmp) { + ret = dict_get_uint32(xattr, "new-commit-hash", &new_hash); + if (ret == 0) { + gf_msg_debug(this->name, 0, + "updating commit hash for %s from %u to %u", + uuid_utoa(loc->gfid), layout->commit_hash, new_hash); + layout->commit_hash = new_hash; + + ret = dht_update_commit_hash_for_layout(frame); + if (ret) { + op_errno = ENOTCONN; goto err; + } + return ret; } - for (i = 0; i < call_cnt; i++) { - STACK_WIND (frame, dht_err_cbk, - layout->list[i].xlator, - layout->list[i].xlator->fops->setxattr, - loc, xattr, flags, xdata); + gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_FIX_LAYOUT_INFO, + "fixing the layout of %s", loc->path); + + ret = dht_fix_directory_layout(frame, dht_fix_layout_setxattr_cbk, + layout); + if (ret) { + op_errno = ENOTCONN; + goto err; } + return ret; + } + + tmp = dict_get(xattr, "distribute.directory-spread-count"); + if (tmp) { + /* Setxattr value is packed as 'binary', not string */ + memcpy(value, tmp->data, min(tmp->len, 4095)); + ret = gf_string2uint32(value, &dir_spread); + if (!ret && ((dir_spread <= conf->subvolume_cnt) && (dir_spread > 0))) { + layout->spread_cnt = dir_spread; + + ret = dht_fix_directory_layout(frame, dht_common_setxattr_cbk, + layout); + if (ret) { + op_errno = ENOTCONN; + goto err; + } + return ret; + } + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_OPERATION_NOT_SUP, + "wrong 'directory-spread-count' value (%s)", value); + op_errno = ENOTSUP; + goto err; + } + + tmp = dict_get(xattr, "glusterfs.dht.nuke"); + if (tmp) { + return dht_nuke_dir(frame, this, loc, tmp); + } + local->xattr_req = xdata ? dict_ref(xdata) : dict_new(); + + if (IA_ISDIR(loc->inode->ia_type)) { + local->hashed_subvol = NULL; + ret = dht_dir_common_set_remove_xattr(frame, this, loc, NULL, xattr, + flags, xdata, &op_errno); + if (ret) + goto err; + } else { + local->rebalance.xattr = dict_ref(xattr); + local->rebalance.flags = flags; + local->call_cnt = 1; - return 0; + ret = dict_set_int8(local->xattr_req, DHT_IATT_IN_XDATA_KEY, 1); + + STACK_WIND_COOKIE(frame, dht_file_setxattr_cbk, subvol, subvol, + subvol->fops->setxattr, loc, xattr, flags, + local->xattr_req); + } + + return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL); + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND(setxattr, frame, -1, op_errno, NULL); - return 0; + return 0; } - -int -dht_removexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, dict_t *xdata) +static int +dht_removexattr2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret) { - dht_local_t *local = NULL; - int this_call_cnt = 0; - call_frame_t *prev = NULL; + dht_local_t *local = NULL; + int op_errno = EINVAL; - local = frame->local; - prev = cookie; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - local->op_errno = op_errno; - gf_log (this->name, GF_LOG_DEBUG, - "subvolume %s returned -1 (%s)", - prev->this->name, strerror (op_errno)); - goto unlock; - } + if (!frame || !frame->local) + goto err; - local->op_ret = 0; - } -unlock: - UNLOCK (&frame->lock); + local = frame->local; + op_errno = local->op_errno; - this_call_cnt = dht_frame_return (frame); - if (is_last_call (this_call_cnt)) { - DHT_STACK_UNWIND (removexattr, frame, local->op_ret, - local->op_errno, NULL); - } + local->call_cnt = 2; /* This is the second attempt */ + if (we_are_not_migrating(ret)) { + /* This dht xlator is not migrating the file. Unwind and + * pass on the original mode bits so the higher DHT layer + * can handle this. + */ + DHT_STACK_UNWIND(removexattr, frame, local->op_ret, local->op_errno, + local->rebalance.xdata); return 0; -} + } + + if (subvol == NULL) + goto err; + if (local->fop == GF_FOP_REMOVEXATTR) { + STACK_WIND_COOKIE(frame, dht_file_removexattr_cbk, subvol, subvol, + subvol->fops->removexattr, &local->loc, local->key, + local->xattr_req); + } else { + STACK_WIND_COOKIE(frame, dht_file_removexattr_cbk, subvol, subvol, + subvol->fops->fremovexattr, local->fd, local->key, + local->xattr_req); + } + + return 0; + +err: + DHT_STACK_UNWIND(removexattr, frame, -1, op_errno, NULL); + return 0; +} int -dht_removexattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, const char *key, dict_t *xdata) +dht_file_removexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) { - xlator_t *subvol = NULL; - int op_errno = -1; - dht_local_t *local = NULL; - dht_layout_t *layout = NULL; - int call_cnt = 0; - dht_conf_t *conf = NULL; + int ret = -1; + dht_local_t *local = NULL; + xlator_t *prev = NULL; + struct iatt *stbuf = NULL; + inode_t *inode = NULL; + xlator_t *subvol1 = NULL, *subvol2 = NULL; - int i; + local = frame->local; + prev = cookie; - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (this->private, err); + local->op_errno = op_errno; - conf = this->private; + if ((local->fop == GF_FOP_FREMOVEXATTR) && + dht_check_remote_fd_failed_error(local, op_ret, op_errno)) { + ret = dht_check_and_open_fd_on_subvol(this, frame); + if (ret) + goto out; + return 0; + } - GF_IF_NATIVE_XATTR_GOTO (conf->wild_xattr_name, key, op_errno, err); + if ((op_ret == -1) && !dht_inode_missing(op_errno)) { + gf_msg_debug(this->name, op_errno, "subvolume %s returned -1", + prev->name); + goto out; + } - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->inode, err); + if (local->call_cnt != 1) + goto out; - local = dht_local_init (frame, loc, NULL, GF_FOP_REMOVEXATTR); - if (!local) { - op_errno = ENOMEM; - goto err; - } + ret = dict_get_bin(xdata, DHT_IATT_IN_XDATA_KEY, (void **)&stbuf); - subvol = local->cached_subvol; - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for path=%s", loc->path); - op_errno = EINVAL; - goto err; - } + if ((!op_ret) && !stbuf) { + goto out; + } - layout = local->layout; - if (!local->layout) { - gf_log (this->name, GF_LOG_DEBUG, - "no layout for path=%s", loc->path); - op_errno = EINVAL; - goto err; - } + local->op_ret = 0; - local->call_cnt = call_cnt = layout->cnt; - local->key = gf_strdup (key); + local->rebalance.target_op_fn = dht_removexattr2; + if (xdata) + local->rebalance.xdata = dict_ref(xdata); - for (i = 0; i < call_cnt; i++) { - STACK_WIND (frame, dht_removexattr_cbk, - layout->list[i].xlator, - layout->list[i].xlator->fops->removexattr, - loc, key, NULL); - } + /* Phase 2 of migration */ + if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2(stbuf)) { + ret = dht_rebalance_complete_check(this, frame); + if (!ret) + return 0; + } - return 0; + /* Phase 1 of migration */ + if (IS_DHT_MIGRATION_PHASE1(stbuf)) { + inode = (local->fd) ? local->fd->inode : local->loc.inode; -err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (removexattr, frame, -1, op_errno, NULL); + ret = dht_inode_ctx_get_mig_info(this, inode, &subvol1, &subvol2); + if (!dht_mig_info_is_invalid(local->cached_subvol, subvol1, subvol2)) { + dht_removexattr2(this, subvol2, frame, 0); + return 0; + } - return 0; + ret = dht_rebalance_in_progress_check(this, frame); + if (!ret) + return 0; + } + +out: + if (local->fop == GF_FOP_REMOVEXATTR) { + DHT_STACK_UNWIND(removexattr, frame, op_ret, op_errno, xdata); + } else { + DHT_STACK_UNWIND(fremovexattr, frame, op_ret, op_errno, xdata); + } + return 0; } int -dht_fremovexattr (call_frame_t *frame, xlator_t *this, - fd_t *fd, const char *key, dict_t *xdata) +dht_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *key, dict_t *xdata) { - xlator_t *subvol = NULL; - int op_errno = -1; - dht_local_t *local = NULL; - dht_layout_t *layout = NULL; - int call_cnt = 0; - dht_conf_t *conf = 0; - - int i; + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + dht_layout_t *layout = NULL; + int call_cnt = 0; + dht_conf_t *conf = NULL; + int ret = 0; + + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(this->private, err); + + conf = this->private; + + GF_IF_NATIVE_XATTR_GOTO(conf->wild_xattr_name, key, op_errno, err); + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(loc, err); + VALIDATE_OR_GOTO(loc->inode, err); + + local = dht_local_init(frame, loc, NULL, GF_FOP_REMOVEXATTR); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + subvol = local->cached_subvol; + if (!subvol) { + gf_msg_debug(this->name, 0, "no cached subvolume for path=%s", + loc->path); + op_errno = EINVAL; + goto err; + } + + layout = local->layout; + if (!local->layout) { + gf_msg_debug(this->name, 0, "no layout for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + local->xattr_req = (xdata) ? dict_ref(xdata) : dict_new(); + + local->call_cnt = call_cnt = layout->cnt; + local->key = gf_strdup(key); + + if (key && (strncmp(key, conf->mds_xattr_key, strlen(key)) == 0)) { + op_errno = ENOTSUP; + goto err; + } + + if (IA_ISDIR(loc->inode->ia_type)) { + local->hashed_subvol = NULL; + ret = dht_dir_common_set_remove_xattr(frame, this, loc, NULL, NULL, 0, + local->xattr_req, &op_errno); + if (ret) + goto err; - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (this->private, err); + } else { + local->call_cnt = 1; + ret = dict_set_int8(local->xattr_req, DHT_IATT_IN_XDATA_KEY, 1); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_DICT_SET_FAILED, + "Failed to " + "set dictionary key %s for %s", + DHT_IATT_IN_XDATA_KEY, loc->path); + } - conf = this->private; + STACK_WIND_COOKIE(frame, dht_file_removexattr_cbk, subvol, subvol, + subvol->fops->removexattr, loc, key, + local->xattr_req); + } - GF_IF_NATIVE_XATTR_GOTO (conf->wild_xattr_name, key, op_errno, err); + return 0; - VALIDATE_OR_GOTO (frame, err); +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND(removexattr, frame, -1, op_errno, NULL); - local = dht_local_init (frame, NULL, fd, GF_FOP_FREMOVEXATTR); - if (!local) { - op_errno = ENOMEM; - goto err; - } + return 0; +} - subvol = local->cached_subvol; - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for inode=%s", - uuid_utoa (fd->inode->gfid)); - op_errno = EINVAL; - goto err; - } +int +dht_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *key, + dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + dht_layout_t *layout = NULL; + int call_cnt = 0; + dht_conf_t *conf = 0; + int ret = 0; + + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(this->private, err); + + conf = this->private; + + GF_IF_NATIVE_XATTR_GOTO(conf->wild_xattr_name, key, op_errno, err); + + VALIDATE_OR_GOTO(frame, err); + + local = dht_local_init(frame, NULL, fd, GF_FOP_FREMOVEXATTR); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + subvol = local->cached_subvol; + if (!subvol) { + gf_msg_debug(this->name, 0, "no cached subvolume for inode=%s", + uuid_utoa(fd->inode->gfid)); + op_errno = EINVAL; + goto err; + } + + layout = local->layout; + if (!local->layout) { + gf_msg_debug(this->name, 0, "no layout for inode=%s", + uuid_utoa(fd->inode->gfid)); + op_errno = EINVAL; + goto err; + } + local->xattr_req = xdata ? dict_ref(xdata) : dict_new(); + + local->call_cnt = call_cnt = layout->cnt; + local->key = gf_strdup(key); + + if (IA_ISDIR(fd->inode->ia_type)) { + local->hashed_subvol = NULL; + ret = dht_dir_common_set_remove_xattr(frame, this, NULL, fd, NULL, 0, + local->xattr_req, &op_errno); + if (ret) + goto err; - layout = local->layout; - if (!local->layout) { - gf_log (this->name, GF_LOG_DEBUG, - "no layout for inode=%s", uuid_utoa (fd->inode->gfid)); - op_errno = EINVAL; - goto err; + } else { + local->call_cnt = 1; + ret = dict_set_int8(local->xattr_req, DHT_IATT_IN_XDATA_KEY, 1); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_DICT_SET_FAILED, + "Failed to " + "set dictionary key %s for fd=%p", + DHT_IATT_IN_XDATA_KEY, fd); } - local->call_cnt = call_cnt = layout->cnt; - local->key = gf_strdup (key); + STACK_WIND_COOKIE(frame, dht_file_removexattr_cbk, subvol, subvol, + subvol->fops->fremovexattr, fd, key, + local->xattr_req); + } - for (i = 0; i < call_cnt; i++) { - STACK_WIND (frame, dht_removexattr_cbk, - layout->list[i].xlator, - layout->list[i].xlator->fops->fremovexattr, - fd, key, NULL); - } - - return 0; + return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (fremovexattr, frame, -1, op_errno, NULL); + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND(fremovexattr, frame, -1, op_errno, NULL); - return 0; + return 0; } - int -dht_fd_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, fd_t *fd, dict_t *xdata) +dht_fd_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, fd_t *fd, dict_t *xdata) { - dht_local_t *local = NULL; - int this_call_cnt = 0; - call_frame_t *prev = NULL; + dht_local_t *local = NULL; + int this_call_cnt = 0; + xlator_t *prev = NULL; - local = frame->local; - prev = cookie; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - local->op_errno = op_errno; - gf_log (this->name, GF_LOG_DEBUG, - "subvolume %s returned -1 (%s)", - prev->this->name, strerror (op_errno)); - goto unlock; - } + local = frame->local; + prev = cookie; - local->op_ret = 0; + LOCK(&frame->lock); + { + if (op_ret == -1) { + local->op_errno = op_errno; + UNLOCK(&frame->lock); + gf_msg_debug(this->name, op_errno, "subvolume %s returned -1", + prev->name); + goto post_unlock; } -unlock: - UNLOCK (&frame->lock); - - this_call_cnt = dht_frame_return (frame); - if (is_last_call (this_call_cnt)) - DHT_STACK_UNWIND (open, frame, local->op_ret, local->op_errno, - local->fd, NULL); - return 0; + local->op_ret = 0; + } + UNLOCK(&frame->lock); +post_unlock: + this_call_cnt = dht_frame_return(frame); + if (is_last_call(this_call_cnt)) + DHT_STACK_UNWIND(open, frame, local->op_ret, local->op_errno, local->fd, + NULL); + + return 0; } /* * dht_normalize_stats - */ static void -dht_normalize_stats (struct statvfs *buf, unsigned long bsize, - unsigned long frsize) +dht_normalize_stats(struct statvfs *buf, unsigned long bsize, + unsigned long frsize) { - double factor = 0; + double factor = 0; + + if (buf->f_bsize != bsize) { + buf->f_bsize = bsize; + } + + if (buf->f_frsize != frsize) { + factor = ((double)buf->f_frsize) / frsize; + buf->f_frsize = frsize; + buf->f_blocks = (fsblkcnt_t)(factor * buf->f_blocks); + buf->f_bfree = (fsblkcnt_t)(factor * buf->f_bfree); + buf->f_bavail = (fsblkcnt_t)(factor * buf->f_bavail); + } +} - if (buf->f_bsize != bsize) { - buf->f_bsize = bsize; +static int +dht_statfs_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct statvfs *statvfs, dict_t *xdata) +{ + gf_boolean_t event = _gf_false; + qdstatfs_action_t action = qdstatfs_action_OFF; + dht_local_t *local = NULL; + int this_call_cnt = 0; + int bsize = 0; + int frsize = 0; + GF_UNUSED int ret = 0; + unsigned long new_usage = 0; + unsigned long cur_usage = 0; + + local = frame->local; + GF_ASSERT(local); + + if (xdata) + ret = dict_get_int8(xdata, "quota-deem-statfs", (int8_t *)&event); + + LOCK(&frame->lock); + { + if (op_ret == -1) { + local->op_errno = op_errno; + goto unlock; + } + if (!statvfs) { + op_errno = EINVAL; + local->op_ret = -1; + goto unlock; } + local->op_ret = 0; + + if (local->quota_deem_statfs) { + if (event == _gf_true) { + action = qdstatfs_action_COMPARE; + } else { + action = qdstatfs_action_NEGLECT; + } + } else { + if (event == _gf_true) { + action = qdstatfs_action_REPLACE; + local->quota_deem_statfs = _gf_true; + } + } + + if (local->quota_deem_statfs) { + switch (action) { + case qdstatfs_action_NEGLECT: + goto unlock; + + case qdstatfs_action_REPLACE: + local->statvfs = *statvfs; + goto unlock; - if (buf->f_frsize != frsize) { - factor = ((double) buf->f_frsize) / frsize; - buf->f_frsize = frsize; - buf->f_blocks = (fsblkcnt_t) (factor * buf->f_blocks); - buf->f_bfree = (fsblkcnt_t) (factor * buf->f_bfree); - buf->f_bavail = (fsblkcnt_t) (factor * buf->f_bavail); + case qdstatfs_action_COMPARE: + new_usage = statvfs->f_blocks - statvfs->f_bfree; + cur_usage = local->statvfs.f_blocks - + local->statvfs.f_bfree; + /* Take the max of the usage from subvols */ + if (new_usage >= cur_usage) + local->statvfs = *statvfs; + goto unlock; + + default: + break; + } } + + if (local->statvfs.f_bsize != 0) { + bsize = max(local->statvfs.f_bsize, statvfs->f_bsize); + frsize = max(local->statvfs.f_frsize, statvfs->f_frsize); + dht_normalize_stats(&local->statvfs, bsize, frsize); + dht_normalize_stats(statvfs, bsize, frsize); + } else { + local->statvfs.f_bsize = statvfs->f_bsize; + local->statvfs.f_frsize = statvfs->f_frsize; + } + + local->statvfs.f_blocks += statvfs->f_blocks; + local->statvfs.f_bfree += statvfs->f_bfree; + local->statvfs.f_bavail += statvfs->f_bavail; + local->statvfs.f_files += statvfs->f_files; + local->statvfs.f_ffree += statvfs->f_ffree; + local->statvfs.f_favail += statvfs->f_favail; + local->statvfs.f_fsid = statvfs->f_fsid; + local->statvfs.f_flag = statvfs->f_flag; + local->statvfs.f_namemax = statvfs->f_namemax; + } +unlock: + UNLOCK(&frame->lock); + + this_call_cnt = dht_frame_return(frame); + if (is_last_call(this_call_cnt)) + DHT_STACK_UNWIND(statfs, frame, local->op_ret, local->op_errno, + &local->statvfs, xdata); + + return 0; } int -dht_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, struct statvfs *statvfs, dict_t *xdata) +dht_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { - dht_local_t *local = NULL; - int this_call_cnt = 0; - int bsize = 0; - int frsize = 0; - int8_t quota_deem_statfs = 0; - GF_UNUSED int ret = 0; - unsigned long new_usage = 0; - unsigned long cur_usage = 0; + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + int op_errno = -1; + int i = -1; + inode_t *inode = NULL; + inode_table_t *itable = NULL; + static uuid_t root_gfid = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1}; + loc_t newloc = { + 0, + }; - ret = dict_get_int8 (xdata, "quota-deem-statfs", "a_deem_statfs); + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(loc, err); + VALIDATE_OR_GOTO(this->private, err); - local = frame->local; + conf = this->private; - LOCK (&frame->lock); - { - if (op_ret == -1) { - local->op_errno = op_errno; - goto unlock; - } - if (!statvfs) { - op_errno = EINVAL; - local->op_ret = -1; - goto unlock; - } - local->op_ret = 0; + local = dht_local_init(frame, NULL, NULL, GF_FOP_STATFS); + if (!local) { + op_errno = ENOMEM; + goto err; + } - if (quota_deem_statfs) { - new_usage = statvfs->f_blocks - statvfs->f_bfree; - cur_usage = local->statvfs.f_blocks - local->statvfs.f_bfree; - /* We take the maximux of the usage from the subvols */ - if (new_usage >= cur_usage) - local->statvfs = *statvfs; - goto unlock; - } + if (loc->inode && !IA_ISDIR(loc->inode->ia_type)) { + itable = loc->inode->table; + if (!itable) { + op_errno = EINVAL; + goto err; + } - if (local->statvfs.f_bsize != 0) { - bsize = max(local->statvfs.f_bsize, statvfs->f_bsize); - frsize = max(local->statvfs.f_frsize, statvfs->f_frsize); - dht_normalize_stats(&local->statvfs, bsize, frsize); - dht_normalize_stats(statvfs, bsize, frsize); - } else { - local->statvfs.f_bsize = statvfs->f_bsize; - local->statvfs.f_frsize = statvfs->f_frsize; - } + loc = &local->loc2; - local->statvfs.f_blocks += statvfs->f_blocks; - local->statvfs.f_bfree += statvfs->f_bfree; - local->statvfs.f_bavail += statvfs->f_bavail; - local->statvfs.f_files += statvfs->f_files; - local->statvfs.f_ffree += statvfs->f_ffree; - local->statvfs.f_favail += statvfs->f_favail; - local->statvfs.f_fsid = statvfs->f_fsid; - local->statvfs.f_flag = statvfs->f_flag; - local->statvfs.f_namemax = statvfs->f_namemax; + inode = inode_find(itable, root_gfid); + if (!inode) { + op_errno = EINVAL; + goto err; + } + dht_build_root_loc(inode, &newloc); + loc = &newloc; + } - } -unlock: - UNLOCK (&frame->lock); + local->call_cnt = conf->subvolume_cnt; + for (i = 0; i < conf->subvolume_cnt; i++) { + STACK_WIND(frame, dht_statfs_cbk, conf->subvolumes[i], + conf->subvolumes[i]->fops->statfs, loc, xdata); + } + return 0; - this_call_cnt = dht_frame_return (frame); - if (is_last_call (this_call_cnt)) - DHT_STACK_UNWIND (statfs, frame, local->op_ret, local->op_errno, - &local->statvfs, xdata); +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND(statfs, frame, -1, op_errno, NULL, NULL); - return 0; + return 0; } - int -dht_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +dht_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, + dict_t *xdata) { - xlator_t *subvol = NULL; - dht_local_t *local = NULL; - dht_conf_t *conf = NULL; - int op_errno = -1; - int i = -1; + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + int op_errno = -1; + int i = -1; + int ret = 0; + gf_boolean_t new_xdata = _gf_false; + xlator_t **subvolumes = NULL; + int call_count = 0; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(fd, err); + VALIDATE_OR_GOTO(this->private, err); + + conf = this->private; + + local = dht_local_init(frame, loc, fd, GF_FOP_OPENDIR); + if (!local) { + op_errno = ENOMEM; + goto err; + } + local->first_up_subvol = dht_first_up_subvol(this); + + if (!xdata) { + xdata = dict_new(); + if (!xdata) { + op_errno = ENOMEM; + goto err; + } + new_xdata = _gf_true; + } + + ret = dict_set_uint32(xdata, conf->link_xattr_name, 256); + if (ret) + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED, + "Failed to set dictionary value : key = %s", + conf->link_xattr_name); + + /* dht_readdirp will wind to all subvols so open has to be sent to + * all subvols whether or not conf->local_subvols is set */ + + call_count = local->call_cnt = conf->subvolume_cnt; + subvolumes = conf->subvolumes; + + /* In case of parallel-readdir, the readdir-ahead will be loaded + * below dht, in this case, if we want to enable or disable SKIP_DIRs + * it has to be done in opendir, so that prefetching logic in + * readdir-ahead, honors it */ + for (i = 0; i < call_count; i++) { + if (conf->readdir_optimize == _gf_true) { + if (subvolumes[i] != local->first_up_subvol) { + ret = dict_set_int32(xdata, GF_READDIR_SKIP_DIRS, 1); + if (ret) + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, + "Failed to set dictionary" + " value :key = %s, ret:%d", + GF_READDIR_SKIP_DIRS, ret); + } + } - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (this->private, err); + STACK_WIND_COOKIE(frame, dht_fd_cbk, subvolumes[i], subvolumes[i], + subvolumes[i]->fops->opendir, loc, fd, xdata); + dict_del(xdata, GF_READDIR_SKIP_DIRS); + } - conf = this->private; + if (new_xdata) + dict_unref(xdata); - local = dht_local_init (frame, NULL, NULL, GF_FOP_STATFS); - if (!local) { - op_errno = ENOMEM; - goto err; - } + return 0; - if (IA_ISDIR (loc->inode->ia_type)) { - local->call_cnt = conf->subvolume_cnt; +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND(opendir, frame, -1, op_errno, NULL, NULL); - for (i = 0; i < conf->subvolume_cnt; i++) { - STACK_WIND (frame, dht_statfs_cbk, - conf->subvolumes[i], - conf->subvolumes[i]->fops->statfs, loc, - xdata); - } - return 0; - } + return 0; +} - subvol = dht_subvol_get_cached (this, loc->inode); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for path=%s", loc->path); - op_errno = EINVAL; - goto err; - } +/* dht_readdirp_cbk creates a new dentry and dentry->inode is not assigned. + This functions assigns an inode if all of the following conditions are + true: - local->call_cnt = 1; + * DHT has only one child. In this case the entire layout is present on + this single child and hence we can set complete layout in inode. + * backend has complete layout and there are no anomalies in it and from + this information layout can be constructed and set in inode. +*/ - STACK_WIND (frame, dht_statfs_cbk, - subvol, subvol->fops->statfs, loc, xdata); +static void +dht_populate_inode_for_dentry(xlator_t *this, xlator_t *subvol, + gf_dirent_t *entry, gf_dirent_t *orig_entry) +{ + dht_layout_t *layout = NULL; + int ret = 0; + loc_t loc = { + 0, + }; + + if (gf_uuid_is_null(orig_entry->d_stat.ia_gfid)) { + /* this skips the '..' entry for the root of the volume */ + return; + } - return 0; + gf_uuid_copy(loc.gfid, orig_entry->d_stat.ia_gfid); + loc.inode = inode_ref(orig_entry->inode); -err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (statfs, frame, -1, op_errno, NULL, NULL); + if (is_revalidate(&loc)) { + goto out; + } - return 0; -} + layout = dht_layout_new(this, 1); + if (!layout) + goto out; + ret = dht_layout_merge(this, layout, subvol, 0, 0, orig_entry->dict); + if (!ret) { + ret = dht_layout_normalize(this, &loc, layout); + if (ret == 0) { + dht_layout_set(this, orig_entry->inode, layout); + entry->inode = inode_ref(orig_entry->inode); + layout = NULL; + } + } -int -dht_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, - dict_t *xdata) + if (layout) + dht_layout_unref(this, layout); + +out: + loc_wipe(&loc); + return; +} + +/* Posix returns op_errno = ENOENT to indicate that there are no more + * entries + */ +static int +dht_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, gf_dirent_t *orig_entries, dict_t *xdata) { - dht_local_t *local = NULL; - dht_conf_t *conf = NULL; - int op_errno = -1; - int i = -1; + dht_local_t *local = NULL; + gf_dirent_t entries; + gf_dirent_t *orig_entry = NULL; + gf_dirent_t *entry = NULL; + xlator_t *prev = NULL; + xlator_t *next_subvol = NULL; + off_t next_offset = 0; + int count = 0; + dht_layout_t *layout = NULL; + dht_conf_t *conf = NULL; + dht_methods_t *methods = NULL; + xlator_t *subvol = 0; + xlator_t *hashed_subvol = 0; + int ret = 0; + int readdir_optimize = 0; + inode_table_t *itable = NULL; + inode_t *inode = NULL; + gf_boolean_t skip_hashed_check = _gf_false; + + INIT_LIST_HEAD(&entries.list); + + prev = cookie; + local = frame->local; + GF_VALIDATE_OR_GOTO(this->name, local->fd, unwind); + + itable = local->fd->inode->table; + + conf = this->private; + GF_VALIDATE_OR_GOTO(this->name, conf, unwind); + + methods = &(conf->methods); + + if (op_ret <= 0) { + goto done; + } + + /* Why aren't we skipping DHT entirely in case of a single subvol? + * Because if this was a larger volume earlier and all but one subvol + * was removed, there might be stale linkto files on the subvol. + */ + if (conf->subvolume_cnt == 1) { + /* return all directory and file entries except + * linkto files for a single child DHT + */ + skip_hashed_check = _gf_true; + } - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); - VALIDATE_OR_GOTO (this->private, err); + if (!local->layout) + local->layout = dht_layout_get(this, local->fd->inode); - conf = this->private; + layout = local->layout; - local = dht_local_init (frame, loc, fd, GF_FOP_OPENDIR); - if (!local) { - op_errno = ENOMEM; + /* This will skip the entries on the subvol without a layout, + * hence preventing the crash but rmdir might fail with + * "directory not empty" errors*/ - goto err; + if (layout == NULL) + goto done; + + if (conf->readdir_optimize == _gf_true) + readdir_optimize = 1; + + gf_msg_debug(this->name, 0, "Processing entries from %s", prev->name); + + list_for_each_entry(orig_entry, (&orig_entries->list), list) + { + next_offset = orig_entry->d_off; + + gf_msg_debug(this->name, 0, "%s: entry = %s, type = %d", prev->name, + orig_entry->d_name, orig_entry->d_type); + + if (IA_ISINVAL(orig_entry->d_stat.ia_type)) { + /*stat failed somewhere- display this entry but the data may + * be inaccurate. + */ + gf_msg_debug(this->name, EINVAL, "Invalid stat for %s (gfid %s)", + orig_entry->d_name, + uuid_utoa(orig_entry->d_stat.ia_gfid)); } - local->call_cnt = conf->subvolume_cnt; + if (check_is_linkfile(NULL, (&orig_entry->d_stat), orig_entry->dict, + conf->link_xattr_name)) { + gf_msg_debug(this->name, 0, "%s: %s is a linkto file", prev->name, + orig_entry->d_name); + continue; + } - for (i = 0; i < conf->subvolume_cnt; i++) { - STACK_WIND (frame, dht_fd_cbk, - conf->subvolumes[i], - conf->subvolumes[i]->fops->opendir, - loc, fd, xdata); + if (skip_hashed_check) { + goto list; } - return 0; + if (check_is_dir(NULL, (&orig_entry->d_stat), NULL)) { + /*Directory entries filtering : + * a) If rebalance is running, pick from first_up_subvol + * b) (rebalance not running)hashed subvolume is NULL or + * down then filter in first_up_subvolume. Other wise the + * corresponding hashed subvolume will take care of the + * directory entry. + */ + if (readdir_optimize) { + if (prev == local->first_up_subvol) + goto list; + else + continue; + } + + hashed_subvol = methods->layout_search(this, layout, + orig_entry->d_name); + + if (prev == hashed_subvol) + goto list; + if ((hashed_subvol && dht_subvol_status(conf, hashed_subvol)) || + (prev != local->first_up_subvol)) + continue; + + goto list; + } + + list: + entry = gf_dirent_for_name(orig_entry->d_name); + if (!entry) { + goto unwind; + } + + /* Do this if conf->search_unhashed is set to "auto" */ + if (conf->search_unhashed == GF_DHT_LOOKUP_UNHASHED_AUTO) { + subvol = methods->layout_search(this, layout, orig_entry->d_name); + if (!subvol || (subvol != prev)) { + /* TODO: Count the number of entries which need + linkfile to prove its existence in fs */ + layout->search_unhashed++; + } + } + + entry->d_off = orig_entry->d_off; + entry->d_stat = orig_entry->d_stat; + entry->d_ino = orig_entry->d_ino; + entry->d_type = orig_entry->d_type; + entry->d_len = orig_entry->d_len; + + if (orig_entry->dict) + entry->dict = dict_ref(orig_entry->dict); + + /* making sure we set the inode ctx right with layout, + currently possible only for non-directories, so for + directories don't set entry inodes */ + if (IA_ISDIR(entry->d_stat.ia_type)) { + entry->d_stat.ia_blocks = DHT_DIR_STAT_BLOCKS; + entry->d_stat.ia_size = DHT_DIR_STAT_SIZE; + if (orig_entry->inode) { + dht_inode_ctx_time_update(orig_entry->inode, this, + &entry->d_stat, 1); + + if (conf->subvolume_cnt == 1) { + dht_populate_inode_for_dentry(this, prev, entry, + orig_entry); + } + } + } else { + if (orig_entry->dict && + dict_get(orig_entry->dict, conf->link_xattr_name)) { + /* Strip out the S and T flags set by rebalance*/ + DHT_STRIP_PHASE1_FLAGS(&entry->d_stat); + } + + if (orig_entry->inode) { + ret = dht_layout_preset(this, prev, orig_entry->inode); + if (ret) + gf_msg(this->name, GF_LOG_WARNING, 0, + DHT_MSG_LAYOUT_SET_FAILED, + "failed to link the layout " + "in inode for %s", + orig_entry->d_name); + + entry->inode = inode_ref(orig_entry->inode); + } else if (itable) { + /* + * orig_entry->inode might be null if any upper + * layer xlators below client set to null, to + * force a lookup on the inode even if the inode + * is present in the inode table. In that case + * we just update the ctx to make sure we didn't + * missed anything. + */ + inode = inode_find(itable, orig_entry->d_stat.ia_gfid); + if (inode) { + ret = dht_layout_preset(this, prev, inode); + if (ret) + gf_msg(this->name, GF_LOG_WARNING, 0, + DHT_MSG_LAYOUT_SET_FAILED, + "failed to link the layout" + " in inode for %s", + orig_entry->d_name); + inode_unref(inode); + inode = NULL; + } + } + } -err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (opendir, frame, -1, op_errno, NULL, NULL); + gf_msg_debug(this->name, 0, "%s: Adding entry = %s", prev->name, + entry->d_name); + + list_add_tail(&entry->list, &entries.list); + count++; + } + +done: + + /* We need to ensure that only the last subvolume's end-of-directory + * notification is respected so that directory reading does not stop + * before all subvolumes have been read. That could happen because the + * posix for each subvolume sends a ENOENT on end-of-directory but in + * distribute we're not concerned only with a posix's view of the + * directory but the aggregated namespace' view of the directory. + * Possible values: + * op_ret == 0 and op_errno != 0 + * if op_errno != ENOENT : Error.Unwind. + * if op_errno == ENOENT : There are no more entries on this subvol. + * Move to the next one. + * op_ret > 0 and count == 0 : + * The subvol returned entries to dht but all were stripped out. + * For example, if they were linkto files or dirs where + * hashed_subvol != prev. Try to get some entries by winding + * to the next subvol. This can be dangerous if parallel readdir + * is enabled as it grows the stack. + * + * op_ret > 0 and count > 0: + * We found some entries. Unwind even if the buffer is not full. + * + */ + + op_ret = count; + if (count == 0) { + /* non-zero next_offset means that + * EOF is not yet hit on the current subvol + */ + if ((next_offset == 0) || (op_errno == ENOENT)) { + next_offset = 0; + next_subvol = dht_subvol_next(this, prev); + } else { + next_subvol = prev; + } + + if (!next_subvol) { + goto unwind; + } + if (conf->readdir_optimize == _gf_true) { + if (next_subvol != local->first_up_subvol) { + ret = dict_set_int32(local->xattr, GF_READDIR_SKIP_DIRS, 1); + if (ret) + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, + "Failed to set dictionary value" + ":key = %s", + GF_READDIR_SKIP_DIRS); + } else { + dict_del(local->xattr, GF_READDIR_SKIP_DIRS); + } + } + + STACK_WIND_COOKIE(frame, dht_readdirp_cbk, next_subvol, next_subvol, + next_subvol->fops->readdirp, local->fd, local->size, + next_offset, local->xattr); return 0; + } + +unwind: + /* We need to ensure that only the last subvolume's end-of-directory + * notification is respected so that directory reading does not stop + * before all subvolumes have been read. That could happen because the + * posix for each subvolume sends a ENOENT on end-of-directory but in + * distribute we're not concerned only with a posix's view of the + * directory but the aggregated namespace' view of the directory. + */ + if (op_ret < 0) + op_ret = 0; + + if (prev != dht_last_up_subvol(this)) + op_errno = 0; + + DHT_STACK_UNWIND(readdirp, frame, op_ret, op_errno, &entries, NULL); + + gf_dirent_free(&entries); + return 0; } +static int +dht_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, gf_dirent_t *orig_entries, dict_t *xdata) +{ + dht_local_t *local = NULL; + gf_dirent_t entries; + gf_dirent_t *orig_entry = NULL; + gf_dirent_t *entry = NULL; + xlator_t *prev = NULL; + xlator_t *next_subvol = NULL; + off_t next_offset = 0; + int count = 0; + dht_layout_t *layout = 0; + xlator_t *subvol = 0; + dht_conf_t *conf = NULL; + dht_methods_t *methods = NULL; + gf_boolean_t skip_hashed_check = _gf_false; -int -dht_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, - int op_errno, gf_dirent_t *orig_entries, dict_t *xdata) -{ - dht_local_t *local = NULL; - gf_dirent_t entries; - gf_dirent_t *orig_entry = NULL; - gf_dirent_t *entry = NULL; - call_frame_t *prev = NULL; - xlator_t *next_subvol = NULL; - off_t next_offset = 0; - int count = 0; - dht_layout_t *layout = 0; - dht_conf_t *conf = NULL; - xlator_t *subvol = 0; - int ret = 0; - - INIT_LIST_HEAD (&entries.list); - prev = cookie; - local = frame->local; - conf = this->private; + INIT_LIST_HEAD(&entries.list); - if (op_ret < 0) - goto done; + prev = cookie; + local = frame->local; - if (!local->layout) - local->layout = dht_layout_get (this, local->fd->inode); + conf = this->private; + GF_VALIDATE_OR_GOTO(this->name, conf, done); - layout = local->layout; + methods = &(conf->methods); - list_for_each_entry (orig_entry, (&orig_entries->list), list) { - next_offset = orig_entry->d_off; - if (check_is_dir (NULL, (&orig_entry->d_stat), NULL) && - (prev->this != local->first_up_subvol)) { - continue; - } - if (check_is_linkfile (NULL, (&orig_entry->d_stat), - orig_entry->dict, - conf->link_xattr_name)) { - continue; - } + if (op_ret <= 0) + goto done; - entry = gf_dirent_for_name (orig_entry->d_name); - if (!entry) { + if (!local->layout) + local->layout = dht_layout_get(this, local->fd->inode); - goto unwind; - } + layout = local->layout; - /* Do this if conf->search_unhashed is set to "auto" */ - if (conf->search_unhashed == GF_DHT_LOOKUP_UNHASHED_AUTO) { - subvol = dht_layout_search (this, layout, - orig_entry->d_name); - if (!subvol || (subvol != prev->this)) { - /* TODO: Count the number of entries which need - linkfile to prove its existence in fs */ - layout->search_unhashed++; - } - } + gf_msg_debug(this->name, 0, "Processing entries from %s", prev->name); - dht_itransform (this, prev->this, orig_entry->d_off, - &entry->d_off); - - entry->d_stat = orig_entry->d_stat; - entry->d_ino = orig_entry->d_ino; - entry->d_type = orig_entry->d_type; - entry->d_len = orig_entry->d_len; - - if (orig_entry->dict) - entry->dict = dict_ref (orig_entry->dict); - - /* making sure we set the inode ctx right with layout, - currently possible only for non-directories, so for - directories don't set entry inodes */ - if (!IA_ISDIR(entry->d_stat.ia_type) && orig_entry->inode) { - ret = dht_layout_preset (this, prev->this, - orig_entry->inode); - if (ret) - gf_log (this->name, GF_LOG_WARNING, - "failed to link the layout in inode"); - entry->inode = inode_ref (orig_entry->inode); - } else if (orig_entry->inode) { - dht_inode_ctx_time_update (orig_entry->inode, this, - &entry->d_stat, 1); - } + if (conf->subvolume_cnt == 1) { + /*return everything*/ + skip_hashed_check = _gf_true; + count = op_ret; + goto done; + } - list_add_tail (&entry->list, &entries.list); - count++; - } - op_ret = count; - /* We need to ensure that only the last subvolume's end-of-directory - * notification is respected so that directory reading does not stop - * before all subvolumes have been read. That could happen because the - * posix for each subvolume sends a ENOENT on end-of-directory but in - * distribute we're not concerned only with a posix's view of the - * directory but the aggregated namespace' view of the directory. - */ - if (prev->this != dht_last_up_subvol (this)) - op_errno = 0; + list_for_each_entry(orig_entry, (&orig_entries->list), list) + { + next_offset = orig_entry->d_off; + + gf_msg_debug(this->name, 0, "%s: entry = %s, type = %d", prev->name, + orig_entry->d_name, orig_entry->d_type); + subvol = methods->layout_search(this, layout, orig_entry->d_name); + + if (!subvol || (subvol == prev)) { + entry = gf_dirent_for_name(orig_entry->d_name); + if (!entry) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY, + "Memory allocation failed "); + goto unwind; + } + + entry->d_off = orig_entry->d_off; + entry->d_ino = orig_entry->d_ino; + entry->d_type = orig_entry->d_type; + entry->d_len = orig_entry->d_len; + + gf_msg_debug(this->name, 0, "%s: Adding = entry %s", prev->name, + entry->d_name); + + list_add_tail(&entry->list, &entries.list); + count++; + } + } done: - if (count == 0) { - /* non-zero next_offset means that - EOF is not yet hit on the current subvol - */ - if (next_offset == 0) { - next_subvol = dht_subvol_next (this, prev->this); + op_ret = count; + /* We need to ensure that only the last subvolume's end-of-directory + * notification is respected so that directory reading does not stop + * before all subvolumes have been read. That could happen because the + * posix for each subvolume sends a ENOENT on end-of-directory but in + * distribute we're not concerned only with a posix's view of the + * directory but the aggregated namespace' view of the directory. + */ + if (count == 0) { + if ((next_offset == 0) || (op_errno == ENOENT)) { + next_offset = 0; + next_subvol = dht_subvol_next(this, prev); + } else { + next_subvol = prev; + } + + if (!next_subvol) { + goto unwind; + } + + STACK_WIND_COOKIE(frame, dht_readdir_cbk, next_subvol, next_subvol, + next_subvol->fops->readdir, local->fd, local->size, + next_offset, NULL); + return 0; + } + +unwind: + /* We need to ensure that only the last subvolume's end-of-directory + * notification is respected so that directory reading does not stop + * before all subvolumes have been read. That could happen because the + * posix for each subvolume sends a ENOENT on end-of-directory but in + * distribute we're not concerned only with a posix's view of the + * directory but the aggregated namespace' view of the directory. + */ + + if (prev != dht_last_up_subvol(this)) + op_errno = 0; + + if (!skip_hashed_check) { + DHT_STACK_UNWIND(readdir, frame, op_ret, op_errno, &entries, NULL); + gf_dirent_free(&entries); + + } else { + DHT_STACK_UNWIND(readdir, frame, op_ret, op_errno, orig_entries, NULL); + } + return 0; +} + +static int +dht_do_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t yoff, int whichop, dict_t *dict) +{ + dht_local_t *local = NULL; + int op_errno = -1; + xlator_t *xvol = NULL; + int ret = 0; + dht_conf_t *conf = NULL; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(fd, err); + VALIDATE_OR_GOTO(this->private, err); + + conf = this->private; + + local = dht_local_init(frame, NULL, NULL, whichop); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + local->fd = fd_ref(fd); + local->size = size; + local->xattr_req = (dict) ? dict_ref(dict) : NULL; + local->first_up_subvol = dht_first_up_subvol(this); + local->op_ret = -1; + + dht_deitransform(this, yoff, &xvol); + + /* TODO: do proper readdir */ + if (whichop == GF_FOP_READDIRP) { + if (dict) + local->xattr = dict_ref(dict); + else + local->xattr = dict_new(); + + if (local->xattr) { + ret = dict_set_uint32(local->xattr, conf->link_xattr_name, 256); + if (ret) + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED, + "Failed to set dictionary value" + " : key = %s", + conf->link_xattr_name); + + if (conf->readdir_optimize == _gf_true) { + if (xvol != local->first_up_subvol) { + ret = dict_set_int32(local->xattr, GF_READDIR_SKIP_DIRS, 1); + if (ret) + gf_msg(this->name, GF_LOG_ERROR, 0, + DHT_MSG_DICT_SET_FAILED, + "Failed to set " + "dictionary value: " + "key = %s", + GF_READDIR_SKIP_DIRS); } else { - next_subvol = prev->this; + dict_del(local->xattr, GF_READDIR_SKIP_DIRS); } + } - if (!next_subvol) { - goto unwind; + if (conf->subvolume_cnt == 1) { + ret = dict_set_uint32(local->xattr, conf->xattr_name, 4 * 4); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, + DHT_MSG_DICT_SET_FAILED, + "Failed to set dictionary " + "value:key = %s ", + conf->xattr_name); } + } + } - if (conf->readdir_optimize == _gf_true) { - if (next_subvol != local->first_up_subvol) { - ret = dict_set_int32 (local->xattr, - GF_READDIR_SKIP_DIRS, 1); - if (ret) - gf_log (this->name, GF_LOG_ERROR, - "dict set failed"); - } else { - dict_del (local->xattr, - GF_READDIR_SKIP_DIRS); - } - } + STACK_WIND_COOKIE(frame, dht_readdirp_cbk, xvol, xvol, + xvol->fops->readdirp, fd, size, yoff, local->xattr); + } else { + STACK_WIND_COOKIE(frame, dht_readdir_cbk, xvol, xvol, + xvol->fops->readdir, fd, size, yoff, local->xattr); + } - STACK_WIND (frame, dht_readdirp_cbk, - next_subvol, next_subvol->fops->readdirp, - local->fd, local->size, next_offset, - local->xattr); - return 0; - } + return 0; -unwind: - if (op_ret < 0) - op_ret = 0; +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND(readdir, frame, -1, op_errno, NULL, NULL); - DHT_STACK_UNWIND (readdirp, frame, op_ret, op_errno, &entries, NULL); + return 0; +} - gf_dirent_free (&entries); +int +dht_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t yoff, dict_t *xdata) +{ + int op = GF_FOP_READDIR; + dht_conf_t *conf = NULL; + int i = 0; - return 0; + conf = this->private; + if (!conf) + goto out; + + for (i = 0; i < conf->subvolume_cnt; i++) { + if (!conf->subvolume_status[i]) { + op = GF_FOP_READDIRP; + break; + } + } + + if (conf->use_readdirp) + op = GF_FOP_READDIRP; + +out: + dht_do_readdir(frame, this, fd, size, yoff, op, 0); + return 0; } +int +dht_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t yoff, dict_t *dict) +{ + dht_do_readdir(frame, this, fd, size, yoff, GF_FOP_READDIRP, dict); + return 0; +} + +static int +dht_fsyncdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, dict_t *xdata) +{ + dht_local_t *local = NULL; + int this_call_cnt = 0; + local = frame->local; + + LOCK(&frame->lock); + { + if (op_ret == -1) + local->op_errno = op_errno; + else if (op_ret == 0) + local->op_ret = 0; + } + UNLOCK(&frame->lock); + this_call_cnt = dht_frame_return(frame); + if (is_last_call(this_call_cnt)) + DHT_STACK_UNWIND(fsyncdir, frame, local->op_ret, local->op_errno, + xdata); + + return 0; +} int -dht_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, gf_dirent_t *orig_entries, - dict_t *xdata) +dht_fsyncdir(call_frame_t *frame, xlator_t *this, fd_t *fd, int datasync, + dict_t *xdata) { - dht_local_t *local = NULL; - gf_dirent_t entries; - gf_dirent_t *orig_entry = NULL; - gf_dirent_t *entry = NULL; - call_frame_t *prev = NULL; - xlator_t *next_subvol = NULL; - off_t next_offset = 0; - int count = 0; - dht_layout_t *layout = 0; - xlator_t *subvol = 0; - - INIT_LIST_HEAD (&entries.list); - prev = cookie; - local = frame->local; + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + int op_errno = -1; + int i = -1; - if (op_ret < 0) - goto done; + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(fd, err); + VALIDATE_OR_GOTO(this->private, err); - if (!local->layout) - local->layout = dht_layout_get (this, local->fd->inode); + conf = this->private; - layout = local->layout; + local = dht_local_init(frame, NULL, NULL, GF_FOP_FSYNCDIR); + if (!local) { + op_errno = ENOMEM; + goto err; + } - list_for_each_entry (orig_entry, (&orig_entries->list), list) { - next_offset = orig_entry->d_off; + local->fd = fd_ref(fd); + local->call_cnt = conf->subvolume_cnt; - subvol = dht_layout_search (this, layout, orig_entry->d_name); + for (i = 0; i < conf->subvolume_cnt; i++) { + STACK_WIND(frame, dht_fsyncdir_cbk, conf->subvolumes[i], + conf->subvolumes[i]->fops->fsyncdir, fd, datasync, xdata); + } - if (!subvol || (subvol == prev->this)) { - entry = gf_dirent_for_name (orig_entry->d_name); - if (!entry) { - gf_log (this->name, GF_LOG_ERROR, - "memory allocation failed :("); - goto unwind; - } + return 0; - dht_itransform (this, prev->this, orig_entry->d_off, - &entry->d_off); +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND(fsyncdir, frame, -1, op_errno, NULL); - entry->d_ino = orig_entry->d_ino; - entry->d_type = orig_entry->d_type; - entry->d_len = orig_entry->d_len; + return 0; +} - list_add_tail (&entry->list, &entries.list); - count++; - } +int +dht_newfile_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, inode_t *inode, struct iatt *stbuf, + struct iatt *preparent, struct iatt *postparent, dict_t *xdata) +{ + xlator_t *prev = NULL; + int ret = -1; + dht_local_t *local = NULL; + + if (op_ret == -1) + goto out; + + local = frame->local; + if (!local) { + op_ret = -1; + op_errno = EINVAL; + goto out; + } + + prev = cookie; + + if (local->loc.parent) { + dht_inode_ctx_time_update(local->loc.parent, this, preparent, 0); + dht_inode_ctx_time_update(local->loc.parent, this, postparent, 1); + } + + ret = dht_layout_preset(this, prev, inode); + if (ret < 0) { + gf_msg_debug(this->name, EINVAL, + "could not set pre-set layout for subvolume %s", + prev ? prev->name : NULL); + op_ret = -1; + op_errno = EINVAL; + goto out; + } + if (local->linked == _gf_true) + dht_linkfile_attr_heal(frame, this); +out: + /* + * FIXME: ia_size and st_blocks of preparent and postparent do not have + * correct values. since, preparent and postparent buffers correspond + * to a directory these two members should have values equal to sum of + * corresponding values from each of the subvolume. + * See dht_iatt_merge for reference. + */ + DHT_STRIP_PHASE1_FLAGS(stbuf); + dht_set_fixed_dir_stat(postparent); + dht_set_fixed_dir_stat(preparent); + + if (local && local->lock[0].layout.parent_layout.locks) { + /* store op_errno for failure case*/ + local->op_errno = op_errno; + local->refresh_layout_unlock(frame, this, op_ret, 1); + + if (op_ret == 0) { + DHT_STACK_UNWIND(mknod, frame, op_ret, op_errno, inode, stbuf, + preparent, postparent, xdata); } - op_ret = count; - /* We need to ensure that only the last subvolume's end-of-directory - * notification is respected so that directory reading does not stop - * before all subvolumes have been read. That could happen because the - * posix for each subvolume sends a ENOENT on end-of-directory but in - * distribute we're not concerned only with a posix's view of the - * directory but the aggregated namespace' view of the directory. - */ - if (prev->this != dht_last_up_subvol (this)) - op_errno = 0; + } else { + DHT_STACK_UNWIND(mknod, frame, op_ret, op_errno, inode, stbuf, + preparent, postparent, xdata); + } -done: - if (count == 0) { - /* non-zero next_offset means that - EOF is not yet hit on the current subvol - */ - if (next_offset == 0) { - next_subvol = dht_subvol_next (this, prev->this); - } else { - next_subvol = prev->this; - } + return 0; +} - if (!next_subvol) { - goto unwind; - } +static int +dht_mknod_linkfile_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *stbuf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + dht_local_t *local = NULL; + xlator_t *cached_subvol = NULL; + dht_conf_t *conf = NULL; - STACK_WIND (frame, dht_readdir_cbk, - next_subvol, next_subvol->fops->readdir, - local->fd, local->size, next_offset, NULL); - return 0; - } + local = frame->local; -unwind: - if (op_ret < 0) - op_ret = 0; + if (!local || !local->cached_subvol) { + op_errno = EINVAL; + goto err; + } - DHT_STACK_UNWIND (readdir, frame, op_ret, op_errno, &entries, NULL); + if (op_ret == -1) { + local->op_errno = op_errno; + goto err; + } - gf_dirent_free (&entries); + conf = this->private; + if (!conf) { + local->op_errno = EINVAL; + op_errno = EINVAL; + goto err; + } - return 0; + cached_subvol = local->cached_subvol; + + if (local->params) { + dict_del(local->params, conf->link_xattr_name); + dict_del(local->params, GLUSTERFS_INTERNAL_FOP_KEY); + } + + STACK_WIND_COOKIE(frame, dht_newfile_cbk, (void *)cached_subvol, + cached_subvol, cached_subvol->fops->mknod, &local->loc, + local->mode, local->rdev, local->umask, local->params); + + return 0; +err: + if (local && local->lock[0].layout.parent_layout.locks) { + local->refresh_layout_unlock(frame, this, -1, 1); + } else { + DHT_STACK_UNWIND(mknod, frame, -1, op_errno, NULL, NULL, NULL, NULL, + NULL); + } + return 0; } +static int +dht_mknod_wind_to_avail_subvol(call_frame_t *frame, xlator_t *this, + xlator_t *subvol, loc_t *loc, dev_t rdev, + mode_t mode, mode_t umask, dict_t *params) +{ + dht_local_t *local = NULL; + xlator_t *avail_subvol = NULL; -int -dht_do_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t yoff, int whichop, dict_t *dict) -{ - dht_local_t *local = NULL; - int op_errno = -1; - xlator_t *xvol = NULL; - off_t xoff = 0; - int ret = 0; - dht_conf_t *conf = NULL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); - VALIDATE_OR_GOTO (this->private, err); - - conf = this->private; - - local = dht_local_init (frame, NULL, NULL, whichop); - if (!local) { - op_errno = ENOMEM; - goto err; - } + local = frame->local; - local->fd = fd_ref (fd); - local->size = size; - local->xattr_req = (dict)? dict_ref (dict) : NULL; - local->first_up_subvol = dht_first_up_subvol (this); + if (!dht_is_subvol_filled(this, subvol)) { + gf_msg_debug(this->name, 0, "creating %s on %s", loc->path, + subvol->name); - dht_deitransform (this, yoff, &xvol, (uint64_t *)&xoff); + STACK_WIND_COOKIE(frame, dht_newfile_cbk, (void *)subvol, subvol, + subvol->fops->mknod, loc, mode, rdev, umask, params); + } else { + avail_subvol = dht_free_disk_available_subvol(this, subvol, local); - /* TODO: do proper readdir */ - if (whichop == GF_FOP_READDIRP) { - if (dict) - local->xattr = dict_ref (dict); - else - local->xattr = dict_new (); + if (avail_subvol != subvol) { + local->params = dict_ref(params); + local->rdev = rdev; + local->mode = mode; + local->umask = umask; + local->cached_subvol = avail_subvol; + local->hashed_subvol = subvol; - if (local->xattr) { - ret = dict_set_uint32 (local->xattr, - conf->link_xattr_name, 256); - if (ret) - gf_log (this->name, GF_LOG_WARNING, - "failed to set '%s' key", - conf->link_xattr_name); - if (conf->readdir_optimize == _gf_true) { - if (xvol != local->first_up_subvol) { - ret = dict_set_int32 (local->xattr, - GF_READDIR_SKIP_DIRS, 1); - if (ret) - gf_log (this->name, - GF_LOG_ERROR, - "Dict set failed"); - } else { - dict_del (local->xattr, - GF_READDIR_SKIP_DIRS); - } - } - } + gf_msg_debug(this->name, 0, "creating %s on %s (link at %s)", + loc->path, avail_subvol->name, subvol->name); - STACK_WIND (frame, dht_readdirp_cbk, xvol, xvol->fops->readdirp, - fd, size, xoff, local->xattr); - } else { - STACK_WIND (frame, dht_readdir_cbk, xvol, xvol->fops->readdir, - fd, size, xoff, local->xattr); - } + dht_linkfile_create(frame, dht_mknod_linkfile_create_cbk, this, + avail_subvol, subvol, loc); - return 0; + goto out; + } -err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (readdir, frame, -1, op_errno, NULL, NULL); + gf_msg_debug(this->name, 0, "creating %s on %s", loc->path, + subvol->name); - return 0; + STACK_WIND_COOKIE(frame, dht_newfile_cbk, (void *)subvol, subvol, + subvol->fops->mknod, loc, mode, rdev, umask, params); + } +out: + return 0; } - -int -dht_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t yoff, dict_t *xdata) +static int32_t +dht_mknod_do(call_frame_t *frame) { - int op = GF_FOP_READDIR; - dht_conf_t *conf = NULL; - int i = 0; + dht_local_t *local = NULL; + dht_layout_t *refreshed = NULL; + xlator_t *subvol = NULL; + xlator_t *this = NULL; + dht_conf_t *conf = NULL; + dht_methods_t *methods = NULL; - conf = this->private; - if (!conf) - goto out; + local = frame->local; - for (i = 0; i < conf->subvolume_cnt; i++) { - if (!conf->subvolume_status[i]) { - op = GF_FOP_READDIRP; - break; - } - } + this = THIS; - if (conf->use_readdirp) - op = GF_FOP_READDIRP; + conf = this->private; -out: - dht_do_readdir (frame, this, fd, size, yoff, op, 0); - return 0; + GF_VALIDATE_OR_GOTO(this->name, conf, err); + + methods = &(conf->methods); + + /* We don't need parent_loc anymore */ + loc_wipe(&local->loc); + + loc_copy(&local->loc, &local->loc2); + + loc_wipe(&local->loc2); + + refreshed = local->selfheal.refreshed_layout; + + subvol = methods->layout_search(this, refreshed, local->loc.name); + + if (!subvol) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_HASHED_SUBVOL_GET_FAILED, + "no subvolume in " + "layout for path=%s", + local->loc.path); + local->op_errno = ENOENT; + goto err; + } + + dht_mknod_wind_to_avail_subvol(frame, this, subvol, &local->loc, + local->rdev, local->mode, local->umask, + local->params); + return 0; +err: + local->refresh_layout_unlock(frame, this, -1, 1); + + return 0; } -int -dht_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t yoff, dict_t *dict) +static int32_t +dht_mknod_unlock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - dht_do_readdir (frame, this, fd, size, yoff, GF_FOP_READDIRP, dict); - return 0; + DHT_STACK_DESTROY(frame); + return 0; } +static int32_t +dht_mknod_finish(call_frame_t *frame, xlator_t *this, int op_ret, + int invoke_cbk) +{ + dht_local_t *local = NULL, *lock_local = NULL; + call_frame_t *lock_frame = NULL; + int lock_count = 0; + + local = frame->local; + lock_count = dht_lock_count(local->lock[0].layout.parent_layout.locks, + local->lock[0].layout.parent_layout.lk_count); + if (lock_count == 0) + goto done; + + lock_frame = copy_frame(frame); + if (lock_frame == NULL) { + goto done; + } + + lock_local = dht_local_init(lock_frame, &local->loc, NULL, + lock_frame->root->op); + if (lock_local == NULL) { + goto done; + } + + lock_local->lock[0] + .layout.parent_layout.locks = local->lock[0].layout.parent_layout.locks; + lock_local->lock[0].layout.parent_layout.lk_count = + local->lock[0].layout.parent_layout.lk_count; + + local->lock[0].layout.parent_layout.locks = NULL; + local->lock[0].layout.parent_layout.lk_count = 0; + + dht_unlock_inodelk(lock_frame, + lock_local->lock[0].layout.parent_layout.locks, + lock_local->lock[0].layout.parent_layout.lk_count, + dht_mknod_unlock_cbk); + lock_frame = NULL; +done: + if (lock_frame != NULL) { + DHT_STACK_DESTROY(lock_frame); + } -int -dht_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, dict_t *xdata) + if (op_ret == 0) + return 0; + + DHT_STACK_UNWIND(mknod, frame, op_ret, local->op_errno, NULL, NULL, NULL, + NULL, NULL); + return 0; +} + +static int32_t +dht_mknod_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - dht_local_t *local = NULL; - int this_call_cnt = 0; + dht_local_t *local = NULL; + local = frame->local; - local = frame->local; + if (!local) { + goto err; + } - LOCK (&frame->lock); - { - if (op_ret == -1) - local->op_errno = op_errno; + if (op_ret < 0) { + gf_msg("DHT", GF_LOG_ERROR, 0, DHT_MSG_INODE_LK_ERROR, + "mknod lock failed for file: %s", local->loc2.name); - if (op_ret == 0) - local->op_ret = 0; - } - UNLOCK (&frame->lock); + local->op_errno = op_errno; - this_call_cnt = dht_frame_return (frame); - if (is_last_call (this_call_cnt)) - DHT_STACK_UNWIND (fsyncdir, frame, local->op_ret, - local->op_errno, xdata); + goto err; + } - return 0; -} + local->refresh_layout_unlock = dht_mknod_finish; + local->refresh_layout_done = dht_mknod_do; -int -dht_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, - int datasync, dict_t *xdata) + dht_refresh_layout(frame); + + return 0; +err: + if (local) + dht_mknod_finish(frame, this, -1, 0); + else + DHT_STACK_UNWIND(mknod, frame, -1, EINVAL, NULL, NULL, NULL, NULL, + NULL); + return 0; +} + +static int32_t +dht_mknod_lock(call_frame_t *frame, xlator_t *subvol) { - dht_local_t *local = NULL; - dht_conf_t *conf = NULL; - int op_errno = -1; - int i = -1; + dht_local_t *local = NULL; + int count = 1, ret = -1; + dht_lock_t **lk_array = NULL; - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); - VALIDATE_OR_GOTO (this->private, err); + GF_VALIDATE_OR_GOTO("dht", frame, err); + GF_VALIDATE_OR_GOTO(frame->this->name, frame->local, err); - conf = this->private; + local = frame->local; - local = dht_local_init (frame, NULL, NULL, GF_FOP_FSYNCDIR); - if (!local) { - op_errno = ENOMEM; - goto err; - } + lk_array = GF_CALLOC(count, sizeof(*lk_array), gf_common_mt_pointer); - local->fd = fd_ref (fd); - local->call_cnt = conf->subvolume_cnt; + if (lk_array == NULL) + goto err; - for (i = 0; i < conf->subvolume_cnt; i++) { - STACK_WIND (frame, dht_fsyncdir_cbk, - conf->subvolumes[i], - conf->subvolumes[i]->fops->fsyncdir, - fd, datasync, xdata); - } + lk_array[0] = dht_lock_new(frame->this, subvol, &local->loc, F_RDLCK, + DHT_LAYOUT_HEAL_DOMAIN, NULL, + IGNORE_ENOENT_ESTALE); - return 0; + if (lk_array[0] == NULL) + goto err; + + local->lock[0].layout.parent_layout.locks = lk_array; + local->lock[0].layout.parent_layout.lk_count = count; + ret = dht_blocking_inodelk(frame, lk_array, count, dht_mknod_lock_cbk); + + if (ret < 0) { + local->lock[0].layout.parent_layout.locks = NULL; + local->lock[0].layout.parent_layout.lk_count = 0; + goto err; + } + + return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (fsyncdir, frame, -1, op_errno, NULL); + if (lk_array != NULL) { + dht_lock_array_free(lk_array, count); + GF_FREE(lk_array); + } - return 0; + return -1; } +static int +dht_refresh_parent_layout_resume(call_frame_t *frame, xlator_t *this, int ret, + int invoke_cbk) +{ + dht_local_t *local = NULL, *parent_local = NULL; + call_stub_t *stub = NULL; + call_frame_t *parent_frame = NULL; -int -dht_newfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, - inode_t *inode, struct iatt *stbuf, struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) + local = frame->local; + + stub = local->stub; + local->stub = NULL; + + parent_frame = stub->frame; + parent_local = parent_frame->local; + + if (ret < 0) { + parent_local->op_ret = -1; + parent_local->op_errno = local->op_errno ? local->op_errno : EIO; + } else { + parent_local->op_ret = 0; + } + + call_resume(stub); + + DHT_STACK_DESTROY(frame); + + return 0; +} + +static int +dht_refresh_parent_layout_done(call_frame_t *frame) { - xlator_t *prev = NULL; - int ret = -1; - dht_local_t *local = NULL; + dht_local_t *local = NULL; + int ret = 0; + local = frame->local; - if (op_ret == -1) - goto out; + if (local->op_ret < 0) { + ret = -1; + goto resume; + } - local = frame->local; - if (!local) { - op_ret = -1; - op_errno = EINVAL; - goto out; - } + dht_layout_set(frame->this, local->loc.inode, + local->selfheal.refreshed_layout); - prev = cookie; +resume: + dht_refresh_parent_layout_resume(frame, frame->this, ret, 1); + return 0; +} - if (local->loc.parent) { +static int +dht_handle_parent_layout_change(xlator_t *this, call_stub_t *stub) +{ + call_frame_t *refresh_frame = NULL, *frame = NULL; + dht_local_t *refresh_local = NULL, *local = NULL; - dht_inode_ctx_time_update (local->loc.parent, this, - preparent, 0); - dht_inode_ctx_time_update (local->loc.parent, this, - postparent, 1); - } + frame = stub->frame; + local = frame->local; - ret = dht_layout_preset (this, prev, inode); - if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "could not set pre-set layout for subvolume %s", - prev? prev->name: NULL); - op_ret = -1; - op_errno = EINVAL; - goto out; - } - if (local->linked == _gf_true) - dht_linkfile_attr_heal (frame, this); -out: - /* - * FIXME: ia_size and st_blocks of preparent and postparent do not have - * correct values. since, preparent and postparent buffers correspond - * to a directory these two members should have values equal to sum of - * corresponding values from each of the subvolume. - * See dht_iatt_merge for reference. - */ - DHT_STRIP_PHASE1_FLAGS (stbuf); - DHT_STACK_UNWIND (mknod, frame, op_ret, op_errno, inode, stbuf, - preparent, postparent, xdata); - return 0; + refresh_frame = copy_frame(frame); + if (!refresh_frame) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY, + "mem allocation failed for refresh_frame"); + return -1; + } + + refresh_local = dht_local_init(refresh_frame, NULL, NULL, stub->fop); + if (!refresh_local) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY, + "mem allocation failed for refresh_local"); + return -1; + } + + refresh_local->loc.inode = inode_ref(local->loc.parent); + gf_uuid_copy(refresh_local->loc.gfid, local->loc.parent->gfid); + + refresh_local->stub = stub; + + refresh_local->refresh_layout_unlock = dht_refresh_parent_layout_resume; + refresh_local->refresh_layout_done = dht_refresh_parent_layout_done; + + dht_refresh_layout(refresh_frame); + return 0; } -int -dht_mknod_linkfile_create_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *stbuf, - struct iatt *preparent, struct iatt *postparent, - dict_t *xdata) +static int32_t +dht_call_mkdir_stub(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - dht_local_t *local = NULL; - xlator_t *cached_subvol = NULL; + dht_local_t *local = NULL; + call_stub_t *stub = NULL; - if (op_ret == -1) - goto err; + local = frame->local; + stub = local->stub; + local->stub = NULL; - local = frame->local; - if (!local || !local->cached_subvol) { - op_errno = EINVAL; - goto err; - } + if (op_ret < 0) { + local->op_ret = -1; + local->op_errno = op_errno; + } else { + local->op_ret = 0; + } - cached_subvol = local->cached_subvol; + call_resume(stub); - STACK_WIND_COOKIE (frame, dht_newfile_cbk, (void *)cached_subvol, - cached_subvol, cached_subvol->fops->mknod, - &local->loc, local->mode, local->rdev, local->umask, - local->params); + return 0; +} - return 0; +static int32_t +dht_guard_parent_layout_and_namespace(xlator_t *subvol, call_stub_t *stub) +{ + dht_local_t *local = NULL; + int ret = -1; + loc_t *loc = NULL; + xlator_t *hashed_subvol = NULL, *this = NULL; + ; + call_frame_t *frame = NULL; + char pgfid[GF_UUID_BUF_SIZE] = {0}; + int32_t *parent_disk_layout = NULL; + dht_layout_t *parent_layout = NULL; + dht_conf_t *conf = NULL; + + GF_VALIDATE_OR_GOTO("dht", stub, err); + + frame = stub->frame; + this = frame->this; + + conf = this->private; + + local = frame->local; + + local->stub = stub; + + /* TODO: recheck whether we should lock on src or dst if we do similar + * stale layout checks for rename. + */ + loc = &stub->args.loc; + + gf_uuid_unparse(loc->parent->gfid, pgfid); + + if (local->params == NULL) { + local->params = dict_new(); + if (local->params == NULL) { + local->op_errno = ENOMEM; + gf_msg(this->name, GF_LOG_WARNING, local->op_errno, + DHT_MSG_PARENT_LAYOUT_CHANGED, + "%s (%s/%s) (path: %s): " + "dict allocation failed", + gf_fop_list[stub->fop], pgfid, loc->name, loc->path); + goto err; + } + } + + hashed_subvol = dht_subvol_get_hashed(this, loc); + if (hashed_subvol == NULL) { + local->op_errno = EINVAL; + + gf_msg(this->name, GF_LOG_WARNING, local->op_errno, + DHT_MSG_PARENT_LAYOUT_CHANGED, + "%s (%s/%s) (path: %s): " + "hashed subvolume not found", + gf_fop_list[stub->fop], pgfid, loc->name, loc->path); + goto err; + } + + parent_layout = dht_layout_get(this, loc->parent); + + ret = dht_disk_layout_extract_for_subvol(this, parent_layout, hashed_subvol, + &parent_disk_layout); + if (ret == -1) { + local->op_errno = EINVAL; + gf_msg(this->name, GF_LOG_WARNING, local->op_errno, + DHT_MSG_PARENT_LAYOUT_CHANGED, + "%s (%s/%s) (path: %s): " + "extracting in-memory layout of parent failed. ", + gf_fop_list[stub->fop], pgfid, loc->name, loc->path); + goto err; + } + + memcpy((void *)local->parent_disk_layout, (void *)parent_disk_layout, + sizeof(local->parent_disk_layout)); + + dht_layout_unref(this, parent_layout); + parent_layout = NULL; + + ret = dict_set_str(local->params, GF_PREOP_PARENT_KEY, conf->xattr_name); + if (ret < 0) { + local->op_errno = -ret; + gf_msg(this->name, GF_LOG_WARNING, local->op_errno, + DHT_MSG_PARENT_LAYOUT_CHANGED, + "%s (%s/%s) (path: %s): " + "setting %s key in params dictionary failed. ", + gf_fop_list[stub->fop], pgfid, loc->name, loc->path, + GF_PREOP_PARENT_KEY); + goto err; + } + + ret = dict_set_bin(local->params, conf->xattr_name, parent_disk_layout, + 4 * 4); + if (ret < 0) { + local->op_errno = -ret; + gf_msg(this->name, GF_LOG_WARNING, local->op_errno, + DHT_MSG_PARENT_LAYOUT_CHANGED, + "%s (%s/%s) (path: %s): " + "setting parent-layout in params dictionary failed. ", + gf_fop_list[stub->fop], pgfid, loc->name, loc->path); + goto err; + } + + parent_disk_layout = NULL; + local->hashed_subvol = hashed_subvol; + + local->current = &local->lock[0]; + ret = dht_protect_namespace(frame, loc, hashed_subvol, &local->current->ns, + dht_call_mkdir_stub); + if (ret < 0) + goto err; + + return 0; err: - DHT_STACK_UNWIND (mknod, frame, -1, op_errno, NULL, NULL, NULL, NULL, - NULL); - return 0; + + if (parent_disk_layout != NULL) + GF_FREE(parent_disk_layout); + + if (parent_layout != NULL) + dht_layout_unref(this, parent_layout); + + return -1; } int -dht_mknod (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, dev_t rdev, mode_t umask, dict_t *params) +dht_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + dev_t rdev, mode_t umask, dict_t *params) { - xlator_t *subvol = NULL; - int op_errno = -1; - xlator_t *avail_subvol = NULL; - dht_local_t *local = NULL; + xlator_t *subvol = NULL; + int op_errno = -1; + int i = 0; + int ret = 0; + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(loc, err); + + conf = this->private; + + dht_get_du_info(frame, this, loc); + + local = dht_local_init(frame, loc, NULL, GF_FOP_MKNOD); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + subvol = dht_subvol_get_hashed(this, loc); + if (!subvol) { + gf_msg_debug(this->name, 0, "no subvolume in layout for path=%s", + loc->path); + op_errno = EIO; + goto err; + } + + /* Post remove-brick, the client layout may not be in sync with + * disk layout because of lack of lookup. Hence,a mknod call + * may fall on the decommissioned brick. Hence, if the + * hashed_subvol is part of decommissioned bricks list, do a + * lookup on parent dir. If a fix-layout is already done by the + * remove-brick process, the parent directory layout will be in + * sync with that of the disk. If fix-layout is still ending + * on the parent directory, we can let the file get created on + * the decommissioned brick which will be eventually migrated to + * non-decommissioned brick based on the new layout. + */ + + if (conf->decommission_subvols_cnt) { + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->decommissioned_bricks[i] && + conf->decommissioned_bricks[i] == subvol) { + gf_msg_debug(this->name, 0, + "hashed subvol:%s is " + "part of decommission brick list for " + "file: %s", + subvol->name, loc->path); + + /* dht_refresh_layout needs directory info in + * local->loc. Hence, storing the parent_loc in + * local->loc and storing the create context in + * local->loc2. We will restore this information + * in dht_creation do */ + + ret = loc_copy(&local->loc2, &local->loc); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY, + "loc_copy failed %s", loc->path); - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); + goto err; + } - dht_get_du_info (frame, this, loc); + local->params = dict_ref(params); + local->rdev = rdev; + local->mode = mode; + local->umask = umask; - local = dht_local_init (frame, loc, NULL, GF_FOP_MKNOD); - if (!local) { - op_errno = ENOMEM; - goto err; - } + loc_wipe(&local->loc); - subvol = dht_subvol_get_hashed (this, loc); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no subvolume in layout for path=%s", - loc->path); - op_errno = ENOENT; - goto err; - } - - if (!dht_is_subvol_filled (this, subvol)) { - gf_log (this->name, GF_LOG_TRACE, - "creating %s on %s", loc->path, subvol->name); + ret = dht_build_parent_loc(this, &local->loc, loc, &op_errno); - STACK_WIND_COOKIE (frame, dht_newfile_cbk, (void *)subvol, - subvol, subvol->fops->mknod, loc, mode, - rdev, umask, params); - } else { + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_LOC_FAILED, + "parent loc build failed"); + goto err; + } - avail_subvol = dht_free_disk_available_subvol (this, subvol, - local); - if (avail_subvol != subvol) { - /* Choose the minimum filled volume, and create the - files there */ - - local->params = dict_ref (params); - local->cached_subvol = avail_subvol; - local->mode = mode; - local->rdev = rdev; - local->umask = umask; - dht_linkfile_create (frame, - dht_mknod_linkfile_create_cbk, - this, avail_subvol, subvol, loc); - } else { - gf_log (this->name, GF_LOG_TRACE, - "creating %s on %s", loc->path, subvol->name); + ret = dht_mknod_lock(frame, subvol); - STACK_WIND_COOKIE (frame, dht_newfile_cbk, - (void *)subvol, subvol, - subvol->fops->mknod, loc, mode, - rdev, umask, params); + if (ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INODE_LK_ERROR, + "locking parent failed"); + goto err; } + + goto done; + } } + } - return 0; + dht_mknod_wind_to_avail_subvol(frame, this, subvol, loc, rdev, mode, umask, + params); + +done: + return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (mknod, frame, -1, op_errno, - NULL, NULL, NULL, NULL, NULL); + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND(mknod, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL); - return 0; + return 0; } +int +dht_symlink(call_frame_t *frame, xlator_t *this, const char *linkname, + loc_t *loc, mode_t umask, dict_t *params) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(loc, err); + + local = dht_local_init(frame, loc, NULL, GF_FOP_SYMLINK); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + subvol = dht_subvol_get_hashed(this, loc); + if (!subvol) { + gf_msg_debug(this->name, 0, "no subvolume in layout for path=%s", + loc->path); + op_errno = EIO; + goto err; + } + + gf_msg_trace(this->name, 0, "creating %s on %s", loc->path, subvol->name); + + STACK_WIND_COOKIE(frame, dht_newfile_cbk, (void *)subvol, subvol, + subvol->fops->symlink, linkname, loc, umask, params); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND(link, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL); + + return 0; +} int -dht_symlink (call_frame_t *frame, xlator_t *this, - const char *linkname, loc_t *loc, mode_t umask, dict_t *params) +dht_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, + dict_t *xdata) { - xlator_t *subvol = NULL; - int op_errno = -1; - dht_local_t *local = NULL; + xlator_t *cached_subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(loc, err); + + local = dht_local_init(frame, loc, NULL, GF_FOP_UNLINK); + if (!local) { + op_errno = ENOMEM; + + goto err; + } + + cached_subvol = local->cached_subvol; + if (!cached_subvol) { + gf_msg_debug(this->name, 0, "no cached subvolume for path=%s", + loc->path); + op_errno = EINVAL; + goto err; + } + + local->flags = xflag; + STACK_WIND_COOKIE(frame, dht_unlink_cbk, cached_subvol, cached_subvol, + cached_subvol->fops->unlink, loc, xflag, xdata); + + return 0; +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND(unlink, frame, -1, op_errno, NULL, NULL, NULL); - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); + return 0; +} - local = dht_local_init (frame, loc, NULL, GF_FOP_SYMLINK); - if (!local) { - op_errno = ENOMEM; - goto err; - } +static int +dht_remove_stale_linkto_cbk(int ret, call_frame_t *sync_frame, void *data) +{ + DHT_STACK_DESTROY(sync_frame); + return 0; +} - subvol = dht_subvol_get_hashed (this, loc); +static int +dht_remove_stale_linkto(void *data) +{ + call_frame_t *frame = NULL; + dht_local_t *local = NULL; + xlator_t *this = NULL; + dict_t *xdata_in = NULL; + int ret = 0; + + GF_VALIDATE_OR_GOTO("dht", data, out); + + frame = data; + local = frame->local; + this = frame->this; + GF_VALIDATE_OR_GOTO("dht", this, out); + GF_VALIDATE_OR_GOTO("dht", local, out); + GF_VALIDATE_OR_GOTO("dht", local->link_subvol, out); + + xdata_in = dict_new(); + if (!xdata_in) + goto out; + + ret = dht_fill_dict_to_avoid_unlink_of_migrating_file(xdata_in); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, -ret, 0, + "Failed to set keys for stale linkto" + "deletion on path %s", + local->loc.path); + goto out; + } + + ret = syncop_unlink(local->link_subvol, &local->loc, xdata_in, NULL); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, -ret, 0, + "Removal of linkto failed" + " on path %s at subvol %s", + local->loc.path, local->link_subvol->name); + } +out: + if (xdata_in) + dict_unref(xdata_in); + return ret; +} + +static int +dht_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, inode_t *inode, struct iatt *stbuf, + struct iatt *preparent, struct iatt *postparent, dict_t *xdata) +{ + dht_local_t *local = NULL; + int ret = -1; + gf_boolean_t stbuf_merged = _gf_false; + xlator_t *subvol = NULL; + call_frame_t *cleanup_frame = NULL; + dht_local_t *cleanup_local = NULL; + + local = frame->local; + + if (op_ret == -1) { + /* Remove the linkto if exists */ + if (local->linked) { + cleanup_frame = create_frame(this, this->ctx->pool); + if (cleanup_frame) { + cleanup_local = dht_local_init(cleanup_frame, &local->loc2, + NULL, 0); + if (!cleanup_local || !local->link_subvol) { + DHT_STACK_DESTROY(cleanup_frame); + goto out; + } + cleanup_local->link_subvol = local->link_subvol; + FRAME_SU_DO(cleanup_frame, dht_local_t); + ret = synctask_new(this->ctx->env, dht_remove_stale_linkto, + dht_remove_stale_linkto_cbk, cleanup_frame, + cleanup_frame); + } + } + /* No continuation on DHT inode missing errors, as we should + * then have a good stbuf that states P2 happened. We would + * get inode missing if, the file completed migrated between + * the lookup and the link call */ + goto out; + } + + /* Update parent on success, even if P1/2 checks are positive. + * The second call on success will further update the parent */ + if (local->loc.parent) { + dht_inode_ctx_time_update(local->loc.parent, this, preparent, 0); + dht_inode_ctx_time_update(local->loc.parent, this, postparent, 1); + } + + /* Update linkto attrs, if this is the first call and non-P2, + * if we detect P2 then we need to trust the attrs from the + * second call, not the first */ + if (local->linked == _gf_true && + ((local->call_cnt == 1 && !IS_DHT_MIGRATION_PHASE2(stbuf)) || + (local->call_cnt != 1 && IS_DHT_MIGRATION_PHASE2(&local->stbuf)))) { + dht_iatt_merge(this, &local->stbuf, stbuf); + stbuf_merged = _gf_true; + dht_linkfile_attr_heal(frame, this); + } + + /* No further P1/2 checks if we are in the second iteration of + * the call */ + if (local->call_cnt != 1) { + goto out; + } else { + /* Preserve the return values, in case the migration decides + * to recreate the link on the same subvol that the current + * hased for the link was created on. */ + dht_iatt_merge(this, &local->preparent, preparent); + dht_iatt_merge(this, &local->postparent, postparent); + if (!stbuf_merged) { + dht_iatt_merge(this, &local->stbuf, stbuf); + stbuf_merged = _gf_true; + } + + local->inode = inode_ref(inode); + } + + local->op_ret = op_ret; + local->op_errno = op_errno; + local->rebalance.target_op_fn = dht_link2; + dht_set_local_rebalance(this, local, stbuf, preparent, postparent, xdata); + + /* Check if the rebalance phase2 is true */ + if (IS_DHT_MIGRATION_PHASE2(stbuf)) { + ret = dht_inode_ctx_get_mig_info(this, local->loc.inode, NULL, &subvol); if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no subvolume in layout for path=%s", - loc->path); - op_errno = ENOENT; - goto err; - } + /* Phase 2 of migration */ + ret = dht_rebalance_complete_check(this, frame); + if (!ret) + return 0; + } else { + dht_link2(this, subvol, frame, 0); + return 0; + } + } + + /* Check if the rebalance phase1 is true */ + if (IS_DHT_MIGRATION_PHASE1(stbuf)) { + ret = dht_inode_ctx_get_mig_info(this, local->loc.inode, NULL, &subvol); + if (subvol) { + dht_link2(this, subvol, frame, 0); + return 0; + } + ret = dht_rebalance_in_progress_check(this, frame); + if (!ret) + return 0; + } +out: + DHT_STRIP_PHASE1_FLAGS(stbuf); + + dht_set_fixed_dir_stat(preparent); + dht_set_fixed_dir_stat(postparent); + DHT_STACK_UNWIND(link, frame, op_ret, op_errno, inode, stbuf, preparent, + postparent, NULL); + + return 0; +} + +static int +dht_link2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret) +{ + dht_local_t *local = NULL; + int op_errno = EINVAL; + + local = frame->local; + if (!local) + goto err; - gf_log (this->name, GF_LOG_TRACE, - "creating %s on %s", loc->path, subvol->name); + op_errno = local->op_errno; - STACK_WIND_COOKIE (frame, dht_newfile_cbk, (void *)subvol, subvol, - subvol->fops->symlink, linkname, loc, umask, - params); + if (we_are_not_migrating(ret)) { + /* This dht xlator is not migrating the file. Unwind and + * pass on the original mode bits so the higher DHT layer + * can handle this. + */ + dht_set_fixed_dir_stat(&local->preparent); + dht_set_fixed_dir_stat(&local->postparent); + + DHT_STACK_UNWIND(link, frame, local->op_ret, op_errno, local->inode, + &local->stbuf, &local->preparent, &local->postparent, + NULL); + return 0; + } + + if (subvol == NULL) { + op_errno = EINVAL; + goto err; + } + + /* Second call to create link file could result in EEXIST as the + * first call created the linkto in the currently + * migrating subvol, which could be the new hashed subvol */ + if (local->link_subvol == subvol) { + DHT_STRIP_PHASE1_FLAGS(&local->stbuf); + dht_set_fixed_dir_stat(&local->preparent); + dht_set_fixed_dir_stat(&local->postparent); + DHT_STACK_UNWIND(link, frame, 0, 0, local->inode, &local->stbuf, + &local->preparent, &local->postparent, NULL); return 0; + } + local->call_cnt = 2; + + STACK_WIND(frame, dht_link_cbk, subvol, subvol->fops->link, &local->loc, + &local->loc2, local->xattr_req); + + return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (link, frame, -1, op_errno, - NULL, NULL, NULL, NULL, NULL); + DHT_STACK_UNWIND(link, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL); - return 0; + return 0; } - -int -dht_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, - dict_t *xdata) +static int +dht_link_linkfile_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, + struct iatt *stbuf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - xlator_t *cached_subvol = NULL; - xlator_t *hashed_subvol = NULL; - int op_errno = -1; - dht_local_t *local = NULL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - - if (dht_filter_loc_subvol_key (this, loc, &local->loc, - &cached_subvol)) { - gf_log (this->name, GF_LOG_INFO, - "unlinking %s on %s (given path %s)", - local->loc.path, cached_subvol->name, loc->path); - STACK_WIND (frame, dht_unlink_cbk, - cached_subvol, cached_subvol->fops->unlink, - &local->loc, xflag, xdata); - goto done; - } + dht_local_t *local = NULL; + xlator_t *srcvol = NULL; - local = dht_local_init (frame, loc, NULL, GF_FOP_UNLINK); - if (!local) { - op_errno = ENOMEM; + if (op_ret == -1) + goto err; - goto err; - } + local = frame->local; + srcvol = local->linkfile.srcvol; - hashed_subvol = dht_subvol_get_hashed (this, loc); - /* Dont fail unlink if hashed_subvol is NULL which can be the result - * of layout anomaly */ - if (!hashed_subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no subvolume in layout for path=%s", - loc->path); - } + STACK_WIND(frame, dht_link_cbk, srcvol, srcvol->fops->link, &local->loc, + &local->loc2, local->xattr_req); - cached_subvol = local->cached_subvol; - if (!cached_subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for path=%s", loc->path); - op_errno = EINVAL; - goto err; - } + return 0; - local->flags = xflag; - if (hashed_subvol && hashed_subvol != cached_subvol) { - STACK_WIND (frame, dht_unlink_linkfile_cbk, - hashed_subvol, hashed_subvol->fops->unlink, loc, - xflag, xdata); - } else { - STACK_WIND (frame, dht_unlink_cbk, - cached_subvol, cached_subvol->fops->unlink, loc, - xflag, xdata); - } -done: - return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (unlink, frame, -1, op_errno, NULL, NULL, NULL); + DHT_STRIP_PHASE1_FLAGS(stbuf); + dht_set_fixed_dir_stat(preparent); + dht_set_fixed_dir_stat(postparent); + DHT_STACK_UNWIND(link, frame, op_ret, op_errno, inode, stbuf, preparent, + postparent, xdata); - return 0; + return 0; } +int +dht_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) +{ + xlator_t *cached_subvol = NULL; + xlator_t *hashed_subvol = NULL; + int op_errno = -1; + int ret = -1; + dht_local_t *local = NULL; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(oldloc, err); + VALIDATE_OR_GOTO(newloc, err); + + local = dht_local_init(frame, oldloc, NULL, GF_FOP_LINK); + if (!local) { + op_errno = ENOMEM; + + goto err; + } + local->call_cnt = 1; + + cached_subvol = local->cached_subvol; + if (!cached_subvol) { + gf_msg_debug(this->name, 0, "no cached subvolume for path=%s", + oldloc->path); + op_errno = ENOENT; + goto err; + } + + hashed_subvol = dht_subvol_get_hashed(this, newloc); + if (!hashed_subvol) { + gf_msg_debug(this->name, 0, "no subvolume in layout for path=%s", + newloc->path); + op_errno = EIO; + goto err; + } + + ret = loc_copy(&local->loc2, newloc); + if (ret == -1) { + op_errno = ENOMEM; + goto err; + } + if (xdata) + local->xattr_req = dict_ref(xdata); + + if (hashed_subvol != cached_subvol) { + gf_uuid_copy(local->gfid, oldloc->inode->gfid); + dht_linkfile_create(frame, dht_link_linkfile_cbk, this, cached_subvol, + hashed_subvol, newloc); + } else { + STACK_WIND(frame, dht_link_cbk, cached_subvol, + cached_subvol->fops->link, oldloc, newloc, xdata); + } + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND(link, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL); + + return 0; +} int -dht_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, - inode_t *inode, struct iatt *stbuf, struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) +dht_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, fd_t *fd, inode_t *inode, struct iatt *stbuf, + struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { - call_frame_t *prev = NULL; - dht_layout_t *layout = NULL; - dht_local_t *local = NULL; + xlator_t *prev = NULL; + int ret = -1; + dht_local_t *local = NULL; + gf_boolean_t parent_layout_changed = _gf_false; + char pgfid[GF_UUID_BUF_SIZE] = {0}; + xlator_t *subvol = NULL; + + local = frame->local; + + local = frame->local; + if (!local) { + op_ret = -1; + op_errno = EINVAL; + goto out; + } + + if (op_ret == -1) { + local->op_errno = op_errno; + parent_layout_changed = (xdata && + dict_get(xdata, GF_PREOP_CHECK_FAILED)) + ? _gf_true + : _gf_false; + + if (parent_layout_changed) { + if (local && local->lock[0].layout.parent_layout.locks) { + /* Returning failure as the layout could not be fixed even under + * the lock */ + goto out; + } - prev = cookie; + gf_uuid_unparse(local->loc.parent->gfid, pgfid); + gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_PARENT_LAYOUT_CHANGED, + "create (%s/%s) (path: %s): parent layout " + "changed. Attempting a layout refresh and then a " + "retry", + pgfid, local->loc.name, local->loc.path); - local = frame->local; + /* + dht_refresh_layout needs directory info in local->loc.Hence, + storing the parent_loc in local->loc and storing the create + context in local->loc2. We will restore this information in + dht_creation_do. + */ + + loc_wipe(&local->loc2); + + ret = loc_copy(&local->loc2, &local->loc); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY, + "loc_copy failed %s", local->loc.path); - if (op_ret == -1) goto out; + } - layout = dht_layout_for_subvol (this, prev->this); - if (!layout) { - gf_log (this->name, GF_LOG_DEBUG, - "no pre-set layout for subvolume %s", - prev->this->name); - op_ret = -1; - op_errno = EINVAL; + loc_wipe(&local->loc); + + ret = dht_build_parent_loc(this, &local->loc, &local->loc2, + &op_errno); + + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_LOC_FAILED, + "parent loc build failed"); goto out; - } + } - if (local->loc.parent) { - dht_inode_ctx_time_update (local->loc.parent, this, - preparent, 0); - dht_inode_ctx_time_update (local->loc.parent, this, - postparent, 1); - } - if (local->linked == _gf_true) { - local->stbuf = *stbuf; - dht_linkfile_attr_heal (frame, this); + subvol = dht_subvol_get_hashed(this, &local->loc2); + + ret = dht_create_lock(frame, subvol); + if (ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INODE_LK_ERROR, + "locking parent failed"); + goto out; + } + + return 0; } -out: - DHT_STRIP_PHASE1_FLAGS (stbuf); - DHT_STACK_UNWIND (link, frame, op_ret, op_errno, inode, stbuf, preparent, - postparent, NULL); - return 0; -} + goto out; + } + prev = cookie; -int -dht_link_linkfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, - inode_t *inode, struct iatt *stbuf, - struct iatt *preparent, struct iatt *postparent, - dict_t *xdata) -{ - dht_local_t *local = NULL; - xlator_t *srcvol = NULL; + if (local->loc.parent) { + dht_inode_ctx_time_update(local->loc.parent, this, preparent, 0); - if (op_ret == -1) - goto err; + dht_inode_ctx_time_update(local->loc.parent, this, postparent, 1); + } - local = frame->local; - srcvol = local->linkfile.srcvol; + ret = dht_fd_ctx_set(this, fd, prev); + if (ret != 0) { + gf_msg_debug(this->name, 0, + "Possible fd leak. " + "Could not set fd ctx for subvol %s", + prev->name); + } - STACK_WIND (frame, dht_link_cbk, srcvol, srcvol->fops->link, - &local->loc, &local->loc2, xdata); + ret = dht_layout_preset(this, prev, inode); + if (ret != 0) { + gf_msg_debug(this->name, 0, "could not set preset layout for subvol %s", + prev->name); + op_ret = -1; + op_errno = EINVAL; + goto out; + } - return 0; + local->op_errno = op_errno; -err: - DHT_STRIP_PHASE1_FLAGS (stbuf); - DHT_STACK_UNWIND (link, frame, op_ret, op_errno, inode, stbuf, preparent, - postparent, NULL); + if (local->linked == _gf_true) { + local->stbuf = *stbuf; + dht_linkfile_attr_heal(frame, this); + } +out: - return 0; + DHT_STRIP_PHASE1_FLAGS(stbuf); + dht_set_fixed_dir_stat(preparent); + dht_set_fixed_dir_stat(postparent); + + if (local && local->lock[0].layout.parent_layout.locks) { + /* store op_errno for failure case*/ + local->op_errno = op_errno; + local->refresh_layout_unlock(frame, this, op_ret, 1); + + if (op_ret == 0) { + DHT_STACK_UNWIND(create, frame, op_ret, op_errno, fd, inode, stbuf, + preparent, postparent, xdata); + } + } else { + DHT_STACK_UNWIND(create, frame, op_ret, op_errno, fd, inode, stbuf, + preparent, postparent, xdata); + } + return 0; } +static int +dht_create_linkfile_create_cbk(call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + inode_t *inode, struct iatt *stbuf, + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) +{ + dht_local_t *local = NULL; + xlator_t *cached_subvol = NULL; + dht_conf_t *conf = NULL; + + local = frame->local; + if (!local) { + op_errno = EINVAL; + goto err; + } + + if (op_ret == -1) { + local->op_errno = op_errno; + goto err; + } + + conf = this->private; + if (!conf) { + local->op_errno = EINVAL; + op_errno = EINVAL; + goto err; + } + + cached_subvol = local->cached_subvol; + + if (local->params) { + dict_del(local->params, conf->link_xattr_name); + dict_del(local->params, GLUSTERFS_INTERNAL_FOP_KEY); + } + + STACK_WIND_COOKIE(frame, dht_create_cbk, cached_subvol, cached_subvol, + cached_subvol->fops->create, &local->loc, local->flags, + local->mode, local->umask, local->fd, local->params); + + return 0; +err: + if (local && local->lock[0].layout.parent_layout.locks) { + local->refresh_layout_unlock(frame, this, -1, 1); + } else { + DHT_STACK_UNWIND(create, frame, -1, op_errno, NULL, NULL, NULL, NULL, + NULL, NULL); + } + return 0; +} -int -dht_link (call_frame_t *frame, xlator_t *this, - loc_t *oldloc, loc_t *newloc, dict_t *xdata) +static int +dht_create_wind_to_avail_subvol(call_frame_t *frame, xlator_t *this, + xlator_t *subvol, loc_t *loc, int32_t flags, + mode_t mode, mode_t umask, fd_t *fd, + dict_t *params) { - xlator_t *cached_subvol = NULL; - xlator_t *hashed_subvol = NULL; - int op_errno = -1; - int ret = -1; - dht_local_t *local = NULL; + dht_local_t *local = NULL; + xlator_t *avail_subvol = NULL; - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (oldloc, err); - VALIDATE_OR_GOTO (newloc, err); + local = frame->local; - local = dht_local_init (frame, oldloc, NULL, GF_FOP_LINK); - if (!local) { - op_errno = ENOMEM; + if (!dht_is_subvol_filled(this, subvol)) { + gf_msg_debug(this->name, 0, "creating %s on %s", loc->path, + subvol->name); - goto err; - } + dht_set_parent_layout_in_dict(loc, this, local); - cached_subvol = local->cached_subvol; - if (!cached_subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for path=%s", oldloc->path); - op_errno = EINVAL; - goto err; - } + STACK_WIND_COOKIE(frame, dht_create_cbk, subvol, subvol, + subvol->fops->create, loc, flags, mode, umask, fd, + params); - hashed_subvol = dht_subvol_get_hashed (this, newloc); - if (!hashed_subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no subvolume in layout for path=%s", - newloc->path); - op_errno = EINVAL; - goto err; - } + } else { + avail_subvol = dht_free_disk_available_subvol(this, subvol, local); - ret = loc_copy (&local->loc2, newloc); - if (ret == -1) { - op_errno = ENOMEM; - goto err; - } + if (avail_subvol != subvol) { + local->cached_subvol = avail_subvol; + local->hashed_subvol = subvol; - if (hashed_subvol != cached_subvol) { - uuid_copy (local->gfid, oldloc->inode->gfid); - dht_linkfile_create (frame, dht_link_linkfile_cbk, this, - cached_subvol, hashed_subvol, newloc); - } else { - STACK_WIND (frame, dht_link_cbk, - cached_subvol, cached_subvol->fops->link, - oldloc, newloc, xdata); + gf_msg_debug(this->name, 0, "creating %s on %s (link at %s)", + loc->path, avail_subvol->name, subvol->name); + + dht_linkfile_create(frame, dht_create_linkfile_create_cbk, this, + avail_subvol, subvol, loc); + + goto out; } - return 0; + gf_msg_debug(this->name, 0, "creating %s on %s", loc->path, + subvol->name); -err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (link, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL); + dht_set_parent_layout_in_dict(loc, this, local); - return 0; + STACK_WIND_COOKIE(frame, dht_create_cbk, subvol, subvol, + subvol->fops->create, loc, flags, mode, umask, fd, + params); + } +out: + return 0; } - int -dht_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, - fd_t *fd, inode_t *inode, struct iatt *stbuf, - struct iatt *preparent, struct iatt *postparent, dict_t *xdata) +dht_build_parent_loc(xlator_t *this, loc_t *parent, loc_t *child, + int32_t *op_errno) { - call_frame_t *prev = NULL; - int ret = -1; - dht_local_t *local = NULL; + inode_table_t *table = NULL; + int ret = -1; - if (op_ret == -1) - goto out; + if (!parent || !child) { + if (op_errno) + *op_errno = EINVAL; + goto out; + } - local = frame->local; - if (!local) { - op_ret = -1; - op_errno = EINVAL; - goto out; + if (child->parent) { + parent->inode = inode_ref(child->parent); + if (!parent->inode) { + if (op_errno) + *op_errno = EINVAL; + goto out; } - prev = cookie; + gf_uuid_copy(parent->gfid, child->pargfid); - if (local->loc.parent) { - dht_inode_ctx_time_update (local->loc.parent, this, - preparent, 0); + ret = 0; - dht_inode_ctx_time_update (local->loc.parent, this, - postparent, 1); + goto out; + } else { + if (gf_uuid_is_null(child->pargfid)) { + if (op_errno) + *op_errno = EINVAL; + goto out; } - ret = dht_layout_preset (this, prev->this, inode); - if (ret != 0) { - gf_log (this->name, GF_LOG_DEBUG, - "could not set preset layout for subvol %s", - prev->this->name); - op_ret = -1; - op_errno = EINVAL; + table = this->itable; + + if (!table) { + if (op_errno) { + *op_errno = EINVAL; goto out; + } } - if (local->linked == _gf_true) { - local->stbuf = *stbuf; - dht_linkfile_attr_heal (frame, this); + + parent->inode = inode_find(table, child->pargfid); + + if (!parent->inode) { + if (op_errno) { + *op_errno = EINVAL; + goto out; + } } + + gf_uuid_copy(parent->gfid, child->pargfid); + + ret = 0; + } + out: - DHT_STRIP_PHASE1_FLAGS (stbuf); - DHT_STACK_UNWIND (create, frame, op_ret, op_errno, fd, inode, stbuf, preparent, - postparent, NULL); - return 0; + return ret; } - -int -dht_create_linkfile_create_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *stbuf, - struct iatt *preparent, struct iatt *postparent, - dict_t *xdata) +static int32_t +dht_create_do(call_frame_t *frame) { - dht_local_t *local = NULL; - xlator_t *cached_subvol = NULL; + dht_local_t *local = NULL; + dht_layout_t *refreshed = NULL; + xlator_t *subvol = NULL; + xlator_t *this = NULL; + dht_conf_t *conf = NULL; + dht_methods_t *methods = NULL; - if (op_ret == -1) - goto err; + local = frame->local; - local = frame->local; - cached_subvol = local->cached_subvol; + this = THIS; - STACK_WIND (frame, dht_create_cbk, - cached_subvol, cached_subvol->fops->create, - &local->loc, local->flags, local->mode, - local->umask, local->fd, local->params); + conf = this->private; - return 0; + GF_VALIDATE_OR_GOTO(this->name, conf, err); + + methods = &(conf->methods); + + /* We don't need parent_loc anymore */ + loc_wipe(&local->loc); + + loc_copy(&local->loc, &local->loc2); + + loc_wipe(&local->loc2); + + refreshed = local->selfheal.refreshed_layout; + + subvol = methods->layout_search(this, refreshed, local->loc.name); + + if (!subvol) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_HASHED_SUBVOL_GET_FAILED, + "no subvolume in " + "layout for path=%s", + local->loc.path); + local->op_errno = ENOENT; + goto err; + } + + dht_create_wind_to_avail_subvol(frame, this, subvol, &local->loc, + local->flags, local->mode, local->umask, + local->fd, local->params); + return 0; err: - DHT_STACK_UNWIND (create, frame, -1, op_errno, NULL, NULL, NULL, - NULL, NULL, NULL); + local->refresh_layout_unlock(frame, this, -1, 1); + + return 0; +} + +static int32_t +dht_create_unlock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + DHT_STACK_DESTROY(frame); + return 0; +} + +static int32_t +dht_create_finish(call_frame_t *frame, xlator_t *this, int op_ret, + int invoke_cbk) +{ + dht_local_t *local = NULL, *lock_local = NULL; + call_frame_t *lock_frame = NULL; + int lock_count = 0; + + local = frame->local; + lock_count = dht_lock_count(local->lock[0].layout.parent_layout.locks, + local->lock[0].layout.parent_layout.lk_count); + if (lock_count == 0) + goto done; + + lock_frame = copy_frame(frame); + if (lock_frame == NULL) { + goto done; + } + + lock_local = dht_local_init(lock_frame, &local->loc, NULL, + lock_frame->root->op); + if (lock_local == NULL) { + goto done; + } + + lock_local->lock[0] + .layout.parent_layout.locks = local->lock[0].layout.parent_layout.locks; + lock_local->lock[0].layout.parent_layout.lk_count = + local->lock[0].layout.parent_layout.lk_count; + + local->lock[0].layout.parent_layout.locks = NULL; + local->lock[0].layout.parent_layout.lk_count = 0; + + dht_unlock_inodelk(lock_frame, + lock_local->lock[0].layout.parent_layout.locks, + lock_local->lock[0].layout.parent_layout.lk_count, + dht_create_unlock_cbk); + lock_frame = NULL; + +done: + if (lock_frame != NULL) { + DHT_STACK_DESTROY(lock_frame); + } + + if (op_ret == 0) return 0; + + DHT_STACK_UNWIND(create, frame, op_ret, local->op_errno, NULL, NULL, NULL, + NULL, NULL, NULL); + return 0; } -int -dht_create (call_frame_t *frame, xlator_t *this, - loc_t *loc, int32_t flags, mode_t mode, - mode_t umask, fd_t *fd, dict_t *params) +static int32_t +dht_create_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - int op_errno = -1; - xlator_t *subvol = NULL; - dht_local_t *local = NULL; - xlator_t *avail_subvol = NULL; + dht_local_t *local = NULL; - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); + local = frame->local; - dht_get_du_info (frame, this, loc); + if (!local) { + goto err; + } - local = dht_local_init (frame, loc, fd, GF_FOP_CREATE); - if (!local) { - op_errno = ENOMEM; - goto err; - } + if (op_ret < 0) { + gf_msg("DHT", GF_LOG_ERROR, 0, DHT_MSG_INODE_LK_ERROR, + "Create lock failed for file: %s", local->loc2.name); - if (dht_filter_loc_subvol_key (this, loc, &local->loc, - &subvol)) { - gf_log (this->name, GF_LOG_INFO, - "creating %s on %s (got create on %s)", - local->loc.path, subvol->name, loc->path); - STACK_WIND (frame, dht_create_cbk, - subvol, subvol->fops->create, - &local->loc, flags, mode, umask, fd, params); - goto done; - } + local->op_errno = op_errno; - subvol = dht_subvol_get_hashed (this, loc); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no subvolume in layout for path=%s", - loc->path); - op_errno = ENOENT; - goto err; - } + goto err; + } - if (!dht_is_subvol_filled (this, subvol)) { - gf_log (this->name, GF_LOG_TRACE, - "creating %s on %s", loc->path, subvol->name); - STACK_WIND (frame, dht_create_cbk, - subvol, subvol->fops->create, - loc, flags, mode, umask, fd, params); - goto done; - } - /* Choose the minimum filled volume, and create the - files there */ - avail_subvol = dht_free_disk_available_subvol (this, subvol, local); - if (avail_subvol != subvol) { - local->params = dict_ref (params); - local->flags = flags; - local->mode = mode; - local->umask = umask; - local->cached_subvol = avail_subvol; - local->hashed_subvol = subvol; - gf_log (this->name, GF_LOG_TRACE, - "creating %s on %s (link at %s)", loc->path, - avail_subvol->name, subvol->name); - dht_linkfile_create (frame, dht_create_linkfile_create_cbk, - this, avail_subvol, subvol, loc); - goto done; - } - gf_log (this->name, GF_LOG_TRACE, - "creating %s on %s", loc->path, subvol->name); - STACK_WIND (frame, dht_create_cbk, - subvol, subvol->fops->create, - loc, flags, mode, umask, fd, params); -done: - return 0; + local->refresh_layout_unlock = dht_create_finish; -err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (create, frame, -1, op_errno, NULL, NULL, NULL, - NULL, NULL, NULL); + local->refresh_layout_done = dht_create_do; - return 0; -} + dht_refresh_layout(frame); + return 0; +err: + if (local) + dht_create_finish(frame, this, -1, 0); + else + DHT_STACK_UNWIND(create, frame, -1, EINVAL, NULL, NULL, NULL, NULL, + NULL, NULL); + return 0; +} -int -dht_mkdir_selfheal_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) +int32_t +dht_create_lock(call_frame_t *frame, xlator_t *subvol) { - dht_local_t *local = NULL; - dht_layout_t *layout = NULL; + dht_local_t *local = NULL; + int count = 1, ret = -1; + dht_lock_t **lk_array = NULL; - local = frame->local; - layout = local->selfheal.layout; + GF_VALIDATE_OR_GOTO("dht", frame, err); + GF_VALIDATE_OR_GOTO(frame->this->name, frame->local, err); - if (op_ret == 0) { - dht_layout_set (this, local->inode, layout); - if (local->loc.parent) { - dht_inode_ctx_time_update (local->loc.parent, this, - &local->preparent, 0); + local = frame->local; - dht_inode_ctx_time_update (local->loc.parent, this, - &local->postparent, 1); - } - } + lk_array = GF_CALLOC(count, sizeof(*lk_array), gf_common_mt_pointer); - DHT_STACK_UNWIND (mkdir, frame, op_ret, op_errno, - local->inode, &local->stbuf, &local->preparent, - &local->postparent, NULL); + if (lk_array == NULL) + goto err; - return 0; + lk_array[0] = dht_lock_new(frame->this, subvol, &local->loc, F_RDLCK, + DHT_LAYOUT_HEAL_DOMAIN, NULL, + IGNORE_ENOENT_ESTALE); + + if (lk_array[0] == NULL) + goto err; + + local->lock[0].layout.parent_layout.locks = lk_array; + local->lock[0].layout.parent_layout.lk_count = count; + + ret = dht_blocking_inodelk(frame, lk_array, count, dht_create_lock_cbk); + + if (ret < 0) { + local->lock[0].layout.parent_layout.locks = NULL; + local->lock[0].layout.parent_layout.lk_count = 0; + goto err; + } + + return 0; +err: + if (lk_array != NULL) { + dht_lock_array_free(lk_array, count); + GF_FREE(lk_array); + } + + return -1; } int -dht_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf, - struct iatt *preparent, struct iatt *postparent, dict_t *xdata) +dht_set_parent_layout_in_dict(loc_t *loc, xlator_t *this, dht_local_t *local) { - dht_local_t *local = NULL; - int this_call_cnt = 0; - int ret = -1; - gf_boolean_t subvol_filled = _gf_false; - call_frame_t *prev = NULL; - dht_layout_t *layout = NULL; + dht_conf_t *conf = this->private; + dht_layout_t *parent_layout = NULL; + int *parent_disk_layout = NULL; + xlator_t *hashed_subvol = NULL; + char pgfid[GF_UUID_BUF_SIZE] = {0}; + int ret = 0; + + gf_uuid_unparse(loc->parent->gfid, pgfid); + + parent_layout = dht_layout_get(this, loc->parent); + hashed_subvol = dht_subvol_get_hashed(this, loc); + + ret = dht_disk_layout_extract_for_subvol(this, parent_layout, hashed_subvol, + &parent_disk_layout); + if (ret == -1) { + gf_msg(this->name, GF_LOG_WARNING, local->op_errno, + DHT_MSG_PARENT_LAYOUT_CHANGED, + "%s (%s/%s) (path: %s): " + "extracting in-memory layout of parent failed. ", + gf_fop_list[local->fop], pgfid, loc->name, loc->path); + goto err; + } + + ret = dict_set_str_sizen(local->params, GF_PREOP_PARENT_KEY, + conf->xattr_name); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, local->op_errno, + DHT_MSG_PARENT_LAYOUT_CHANGED, + "%s (%s/%s) (path: %s): " + "setting %s key in params dictionary failed. ", + gf_fop_list[local->fop], pgfid, loc->name, loc->path, + GF_PREOP_PARENT_KEY); + goto err; + } + + ret = dict_set_bin(local->params, conf->xattr_name, parent_disk_layout, + 4 * 4); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, local->op_errno, + DHT_MSG_PARENT_LAYOUT_CHANGED, + "%s (%s/%s) (path: %s): " + "setting parent-layout in params dictionary failed. ", + gf_fop_list[local->fop], pgfid, loc->name, loc->path); + goto err; + } - local = frame->local; - prev = cookie; - layout = local->layout; +err: + dht_layout_unref(this, parent_layout); + return ret; +} + +int +dht_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + mode_t mode, mode_t umask, fd_t *fd, dict_t *params) +{ + int op_errno = -1; + xlator_t *subvol = NULL; + xlator_t *hashed_subvol = NULL; + dht_local_t *local = NULL; + int i = 0; + dht_conf_t *conf = NULL; + int ret = 0; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(loc, err); + + conf = this->private; + + dht_get_du_info(frame, this, loc); + + local = dht_local_init(frame, loc, fd, GF_FOP_CREATE); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + local->params = dict_ref(params); + local->flags = flags; + local->mode = mode; + local->umask = umask; + + if (dht_filter_loc_subvol_key(this, loc, &local->loc, &subvol)) { + gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_SUBVOL_INFO, + "creating %s on %s (got create on %s)", local->loc.path, + subvol->name, loc->path); + + /* Since lookup-optimize is enabled by default, we need + * to create the linkto file if required. + * Note this does not check for decommisioned bricks + * and min-free-disk limits as this is a debugging tool + * and not expected to be used in production. + */ + hashed_subvol = dht_subvol_get_hashed(this, &local->loc); - subvol_filled = dht_is_subvol_filled (this, prev->this); + if (hashed_subvol && (hashed_subvol != subvol)) { + /* Create the linkto file and then the data file */ + local->cached_subvol = subvol; + local->hashed_subvol = hashed_subvol; - LOCK (&frame->lock); - { - if (subvol_filled && (op_ret != -1)) { - ret = dht_layout_merge (this, layout, prev->this, - -1, ENOSPC, NULL); - } else { - if (op_ret == -1 && op_errno == EEXIST) - /* Very likely just a race between mkdir and - self-heal (from lookup of a concurrent mkdir - attempt). - Ignore error for now. layout setting will - anyways fail if this was a different (old) - pre-existing different directory. - */ - op_ret = 0; - ret = dht_layout_merge (this, layout, prev->this, - op_ret, op_errno, NULL); + dht_linkfile_create(frame, dht_create_linkfile_create_cbk, this, + subvol, hashed_subvol, &local->loc); + goto done; + } + /* We either don't have a hashed subvol or the hashed subvol is + * the same as the one specified. No need to create the linkto + * file as we expect a lookup everywhere if there are problems + * with the parent layout + */ + + dht_set_parent_layout_in_dict(loc, this, local); + + STACK_WIND_COOKIE(frame, dht_create_cbk, subvol, subvol, + subvol->fops->create, &local->loc, flags, mode, umask, + fd, params); + goto done; + } + + subvol = dht_subvol_get_hashed(this, loc); + if (!subvol) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_HASHED_SUBVOL_GET_FAILED, + "no subvolume in layout for path=%s", loc->path); + + op_errno = EIO; + goto err; + } + + /* Post remove-brick, the client layout may not be in sync with + * disk layout because of lack of lookup. Hence,a create call + * may fall on the decommissioned brick. Hence, if the + * hashed_subvol is part of decommissioned bricks list, do a + * lookup on parent dir. If a fix-layout is already done by the + * remove-brick process, the parent directory layout will be in + * sync with that of the disk. If fix-layout is still ending + * on the parent directory, we can let the file get created on + * the decommissioned brick which will be eventually migrated to + * non-decommissioned brick based on the new layout. + */ + + if (conf->decommission_subvols_cnt) { + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->decommissioned_bricks[i] && + conf->decommissioned_bricks[i] == subvol) { + gf_msg_debug(this->name, 0, + "hashed subvol:%s is " + "part of decommission brick list for " + "file: %s", + subvol->name, loc->path); + + /* dht_refresh_layout needs directory info in + * local->loc. Hence, storing the parent_loc in + * local->loc and storing the create context in + * local->loc2. We will restore this information + * in dht_creation do */ + + ret = loc_copy(&local->loc2, &local->loc); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY, + "loc_copy failed %s", loc->path); + + goto err; } - if (ret) - gf_log (this->name, GF_LOG_WARNING, - "%s: failed to merge layouts", local->loc.path); - if (op_ret == -1) { - local->op_errno = op_errno; - goto unlock; + loc_wipe(&local->loc); + + ret = dht_build_parent_loc(this, &local->loc, loc, &op_errno); + + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_LOC_FAILED, + "parent loc build failed"); + goto err; + } + + ret = dht_create_lock(frame, subvol); + + if (ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INODE_LK_ERROR, + "locking parent failed"); + goto err; } - dht_iatt_merge (this, &local->stbuf, stbuf, prev->this); - dht_iatt_merge (this, &local->preparent, preparent, prev->this); - dht_iatt_merge (this, &local->postparent, postparent, - prev->this); - } -unlock: - UNLOCK (&frame->lock); - this_call_cnt = dht_frame_return (frame); - if (is_last_call (this_call_cnt)) { - dht_selfheal_new_directory (frame, dht_mkdir_selfheal_cbk, - layout); + goto done; + } } + } - return 0; + dht_create_wind_to_avail_subvol(frame, this, subvol, loc, flags, mode, + umask, fd, params); +done: + return 0; + +err: + + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND(create, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL, + NULL); + + return 0; } -int -dht_mkdir_hashed_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int op_ret, int op_errno, - inode_t *inode, struct iatt *stbuf, - struct iatt *preparent, struct iatt *postparent, - dict_t *xdata) -{ - dht_local_t *local = NULL; - int ret = -1; - call_frame_t *prev = NULL; - dht_layout_t *layout = NULL; - dht_conf_t *conf = NULL; - int i = 0; - xlator_t *hashed_subvol = NULL; - - VALIDATE_OR_GOTO (this->private, err); +static int +dht_mkdir_selfheal_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + dht_local_t *local = NULL; + dht_layout_t *layout = NULL; - local = frame->local; - prev = cookie; - layout = local->layout; - conf = this->private; - hashed_subvol = local->hashed_subvol; + local = frame->local; + layout = local->selfheal.layout; - if (uuid_is_null (local->loc.gfid) && !op_ret) - uuid_copy (local->loc.gfid, stbuf->ia_gfid); + FRAME_SU_UNDO(frame, dht_local_t); + dht_set_fixed_dir_stat(&local->preparent); + dht_set_fixed_dir_stat(&local->postparent); - if (dht_is_subvol_filled (this, hashed_subvol)) - ret = dht_layout_merge (this, layout, prev->this, - -1, ENOSPC, NULL); - else - ret = dht_layout_merge (this, layout, prev->this, - op_ret, op_errno, NULL); + if (op_ret == 0) { + dht_layout_set(this, local->inode, layout); - /* TODO: we may have to return from the function - if layout merge fails. For now, lets just log an error */ - if (ret) - gf_log (this->name, GF_LOG_WARNING, - "%s: failed to merge layouts", local->loc.path); + dht_inode_ctx_time_update(local->inode, this, &local->stbuf, 1); + if (local->loc.parent) { + dht_inode_ctx_time_update(local->loc.parent, this, + &local->preparent, 0); - if (op_ret == -1) { - local->op_errno = op_errno; - goto err; + dht_inode_ctx_time_update(local->loc.parent, this, + &local->postparent, 1); } - local->op_ret = 0; + } - dht_iatt_merge (this, &local->stbuf, stbuf, prev->this); - dht_iatt_merge (this, &local->preparent, preparent, prev->this); - dht_iatt_merge (this, &local->postparent, postparent, prev->this); + DHT_STACK_UNWIND(mkdir, frame, op_ret, op_errno, local->inode, + &local->stbuf, &local->preparent, &local->postparent, + NULL); - local->call_cnt = conf->subvolume_cnt - 1; + return 0; +} - if (uuid_is_null (local->loc.gfid)) - uuid_copy (local->loc.gfid, stbuf->ia_gfid); - if (local->call_cnt == 0) { - dht_selfheal_directory (frame, dht_mkdir_selfheal_cbk, - &local->loc, layout); +static int +dht_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, inode_t *inode, struct iatt *stbuf, + struct iatt *preparent, struct iatt *postparent, dict_t *xdata) +{ + dht_local_t *local = NULL; + int this_call_cnt = 0; + int ret = -1; + gf_boolean_t subvol_filled = _gf_false; + gf_boolean_t dir_exists = _gf_false; + xlator_t *prev = NULL; + dht_layout_t *layout = NULL; + + local = frame->local; + prev = cookie; + layout = local->layout; + + subvol_filled = dht_is_subvol_filled(this, prev); + + LOCK(&frame->lock); + { + if (subvol_filled && (op_ret != -1)) { + ret = dht_layout_merge(this, layout, prev, -1, ENOSPC, NULL); + } else { + if (op_ret == -1 && op_errno == EEXIST) { + /* Very likely just a race between mkdir and + self-heal (from lookup of a concurrent mkdir + attempt). + Ignore error for now. layout setting will + anyways fail if this was a different (old) + pre-existing different directory. + */ + op_ret = 0; + dir_exists = _gf_true; + } + ret = dht_layout_merge(this, layout, prev, op_ret, op_errno, NULL); } - for (i = 0; i < conf->subvolume_cnt; i++) { - if (conf->subvolumes[i] == hashed_subvol) - continue; - STACK_WIND (frame, dht_mkdir_cbk, - conf->subvolumes[i], - conf->subvolumes[i]->fops->mkdir, &local->loc, - local->mode, local->umask, local->params); + if (ret) + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_LAYOUT_MERGE_FAILED, + "%s: failed to merge layouts for subvol %s", local->loc.path, + prev->name); + + if (op_ret == -1) { + local->op_errno = op_errno; + goto unlock; } - return 0; -err: - DHT_STACK_UNWIND (mkdir, frame, -1, op_errno, NULL, NULL, NULL, - NULL, NULL); - return 0; + + if (dir_exists) + goto unlock; + + dht_iatt_merge(this, &local->stbuf, stbuf); + dht_iatt_merge(this, &local->preparent, preparent); + dht_iatt_merge(this, &local->postparent, postparent); + } +unlock: + UNLOCK(&frame->lock); + + this_call_cnt = dht_frame_return(frame); + if (is_last_call(this_call_cnt)) { + /*Unlock entrylk and inodelk once mkdir is done on all subvols*/ + dht_unlock_namespace(frame, &local->lock[0]); + FRAME_SU_DO(frame, dht_local_t); + dht_selfheal_new_directory(frame, dht_mkdir_selfheal_cbk, layout); + } + + return 0; } +static int +dht_mkdir_hashed_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, + struct iatt *stbuf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata); - int -dht_mkdir (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, mode_t umask, dict_t *params) +static int +dht_mkdir_helper(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + mode_t umask, dict_t *params) { - dht_local_t *local = NULL; - dht_conf_t *conf = NULL; - int op_errno = -1; - xlator_t *hashed_subvol = NULL; + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + int op_errno = -1, ret = -1; + xlator_t *hashed_subvol = NULL; + int32_t *parent_disk_layout = NULL; + dht_layout_t *parent_layout = NULL; + char pgfid[GF_UUID_BUF_SIZE] = {0}; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(loc, err); + VALIDATE_OR_GOTO(loc->inode, err); + VALIDATE_OR_GOTO(loc->path, err); + VALIDATE_OR_GOTO(this->private, err); + + gf_uuid_unparse(loc->parent->gfid, pgfid); + + conf = this->private; + local = frame->local; + + if (local->op_ret == -1) { + gf_msg(this->name, GF_LOG_WARNING, local->op_errno, + DHT_MSG_PARENT_LAYOUT_CHANGED, + "mkdir (%s/%s) (path: %s): refreshing parent layout " + "failed.", + pgfid, loc->name, loc->path); + + op_errno = local->op_errno; + goto err; + } + + local->op_ret = -1; + + hashed_subvol = dht_subvol_get_hashed(this, loc); + if (hashed_subvol == NULL) { + gf_msg_debug(this->name, 0, + "mkdir (%s/%s) (path: %s): hashed subvol not " + "found", + pgfid, loc->name, loc->path); + op_errno = ENOENT; + goto err; + } + + local->hashed_subvol = hashed_subvol; + + parent_layout = dht_layout_get(this, loc->parent); + + ret = dht_disk_layout_extract_for_subvol(this, parent_layout, hashed_subvol, + &parent_disk_layout); + if (ret == -1) { + gf_msg(this->name, GF_LOG_WARNING, EIO, DHT_MSG_PARENT_LAYOUT_CHANGED, + "mkdir (%s/%s) (path: %s): " + "extracting in-memory layout of parent failed. ", + pgfid, loc->name, loc->path); + goto err; + } + + if (memcmp(local->parent_disk_layout, parent_disk_layout, + sizeof(local->parent_disk_layout)) == 0) { + gf_msg(this->name, GF_LOG_WARNING, EIO, DHT_MSG_PARENT_LAYOUT_CHANGED, + "mkdir (%s/%s) (path: %s): loop detected. " + "parent layout didn't change even though " + "previous attempt of mkdir failed because of " + "in-memory layout not matching with that on disk.", + pgfid, loc->name, loc->path); + op_errno = EIO; + goto err; + } + + memcpy((void *)local->parent_disk_layout, (void *)parent_disk_layout, + sizeof(local->parent_disk_layout)); + + dht_layout_unref(this, parent_layout); + parent_layout = NULL; + + ret = dict_set_str(params, GF_PREOP_PARENT_KEY, conf->xattr_name); + if (ret < 0) { + local->op_errno = -ret; + gf_msg(this->name, GF_LOG_WARNING, local->op_errno, + DHT_MSG_PARENT_LAYOUT_CHANGED, + "mkdir (%s/%s) (path: %s): " + "setting %s key in params dictionary failed. ", + pgfid, loc->name, loc->path, GF_PREOP_PARENT_KEY); + goto err; + } + + ret = dict_set_bin(params, conf->xattr_name, parent_disk_layout, 4 * 4); + if (ret < 0) { + local->op_errno = -ret; + gf_msg(this->name, GF_LOG_WARNING, local->op_errno, + DHT_MSG_PARENT_LAYOUT_CHANGED, + "setting parent-layout in params dictionary failed. " + "mkdir (%s/%s) (path: %s)", + pgfid, loc->name, loc->path); + goto err; + } + + parent_disk_layout = NULL; + + STACK_WIND_COOKIE(frame, dht_mkdir_hashed_cbk, hashed_subvol, hashed_subvol, + hashed_subvol->fops->mkdir, loc, mode, umask, params); + + return 0; +err: + dht_unlock_namespace(frame, &local->lock[0]); - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); - VALIDATE_OR_GOTO (this->private, err); + op_errno = local ? local->op_errno : op_errno; + DHT_STACK_UNWIND(mkdir, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL); - conf = this->private; + if (parent_disk_layout != NULL) + GF_FREE(parent_disk_layout); - dht_get_du_info (frame, this, loc); + if (parent_layout != NULL) + dht_layout_unref(this, parent_layout); - local = dht_local_init (frame, loc, NULL, GF_FOP_MKDIR); - if (!local) { - op_errno = ENOMEM; - goto err; - } + return 0; +} - hashed_subvol = dht_subvol_get_hashed (this, loc); - if (hashed_subvol == NULL) { - gf_log (this->name, GF_LOG_DEBUG, - "hashed subvol not found for %s", - loc->path); - op_errno = EINVAL; +static int +dht_mkdir_hashed_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, + struct iatt *stbuf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + dht_local_t *local = NULL; + int ret = -1; + xlator_t *prev = NULL; + dht_layout_t *layout = NULL; + dht_conf_t *conf = NULL; + int i = 0; + xlator_t *hashed_subvol = NULL; + char pgfid[GF_UUID_BUF_SIZE] = {0}; + gf_boolean_t parent_layout_changed = _gf_false; + call_stub_t *stub = NULL; + + local = frame->local; + prev = cookie; + layout = local->layout; + conf = this->private; + hashed_subvol = local->hashed_subvol; + + gf_uuid_unparse(local->loc.parent->gfid, pgfid); + + if (gf_uuid_is_null(local->loc.gfid) && !op_ret) + gf_uuid_copy(local->loc.gfid, stbuf->ia_gfid); + + if (op_ret == -1) { + local->op_errno = op_errno; + + parent_layout_changed = (xdata && + dict_get(xdata, GF_PREOP_CHECK_FAILED)) + ? 1 + : 0; + if (parent_layout_changed) { + gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_PARENT_LAYOUT_CHANGED, + "mkdir (%s/%s) (path: %s): parent layout " + "changed. Attempting a refresh and then a " + "retry", + pgfid, local->loc.name, local->loc.path); + + stub = fop_mkdir_stub(frame, dht_mkdir_helper, &local->loc, + local->mode, local->umask, local->params); + if (stub == NULL) { goto err; - } - - local->hashed_subvol = hashed_subvol; - local->mode = mode; - local->umask = umask; - local->params = dict_ref (params); - local->inode = inode_ref (loc->inode); + } - local->layout = dht_layout_new (this, conf->subvolume_cnt); - if (!local->layout) { - op_errno = ENOMEM; + ret = dht_handle_parent_layout_change(this, stub); + if (ret) { goto err; - } + } + + stub = NULL; + + return 0; + } + + goto err; + } + + dict_del(local->params, GF_PREOP_PARENT_KEY); + dict_del(local->params, conf->xattr_name); + + if (dht_is_subvol_filled(this, hashed_subvol)) + ret = dht_layout_merge(this, layout, prev, -1, ENOSPC, NULL); + else + ret = dht_layout_merge(this, layout, prev, op_ret, op_errno, NULL); + + /* TODO: we may have to return from the function + if layout merge fails. For now, lets just log an error */ + if (ret) + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_LAYOUT_MERGE_FAILED, + "%s: failed to merge layouts for subvol %s", local->loc.path, + prev->name); + + local->op_ret = 0; + + dht_iatt_merge(this, &local->stbuf, stbuf); + dht_iatt_merge(this, &local->preparent, preparent); + dht_iatt_merge(this, &local->postparent, postparent); + + local->call_cnt = conf->subvolume_cnt - 1; + /* Delete internal mds xattr from params dict to avoid store + internal mds xattr on other subvols + */ + dict_del(local->params, conf->mds_xattr_key); + + if (gf_uuid_is_null(local->loc.gfid)) + gf_uuid_copy(local->loc.gfid, stbuf->ia_gfid); + + /* Set hashed subvol as a mds subvol on inode ctx */ + /*if (!local->inode) + local->inode = inode_ref (inode); + */ + ret = dht_inode_ctx_mdsvol_set(local->inode, this, hashed_subvol); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SET_INODE_CTX_FAILED, + "Failed to set hashed subvol for %s on inode vol is %s", + local->loc.path, hashed_subvol->name); + } + + if (local->call_cnt == 0) { + /*Unlock namespace lock once mkdir is done on all subvols*/ + dht_unlock_namespace(frame, &local->lock[0]); + FRAME_SU_DO(frame, dht_local_t); + dht_selfheal_directory(frame, dht_mkdir_selfheal_cbk, &local->loc, + layout); + return 0; + } + + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->subvolumes[i] == hashed_subvol) + continue; + STACK_WIND_COOKIE(frame, dht_mkdir_cbk, conf->subvolumes[i], + conf->subvolumes[i], conf->subvolumes[i]->fops->mkdir, + &local->loc, local->mode, local->umask, + local->params); + } + + return 0; +err: + if (local->op_ret != 0) { + dht_unlock_namespace(frame, &local->lock[0]); + } - STACK_WIND (frame, dht_mkdir_hashed_cbk, - hashed_subvol, - hashed_subvol->fops->mkdir, - loc, mode, umask, params); + DHT_STACK_UNWIND(mkdir, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL); - return 0; + return 0; +} +static int +dht_mkdir_guard_parent_layout_cbk(call_frame_t *frame, xlator_t *this, + loc_t *loc, mode_t mode, mode_t umask, + dict_t *params) +{ + dht_local_t *local = NULL; + dht_conf_t *conf = 0; + char pgfid[GF_UUID_BUF_SIZE] = {0}; + int ret = -1; + int32_t zero[1] = {0}; + + local = frame->local; + conf = this->private; + + gf_uuid_unparse(loc->parent->gfid, pgfid); + + if (local->op_ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, local->op_errno, + DHT_MSG_PARENT_LAYOUT_CHANGED, + "mkdir (%s/%s) (path: %s): " + "Acquiring lock on parent to guard against " + "layout-change failed.", + pgfid, loc->name, loc->path); + goto err; + } + + local->op_ret = -1; + /* Add internal MDS xattr on disk for hashed subvol + */ + ret = dht_dict_set_array(params, conf->mds_xattr_key, zero, 1); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, DHT_MSG_DICT_SET_FAILED, + "Failed to set dictionary value:key = %s for " + "path %s", + conf->mds_xattr_key, loc->path); + } + + STACK_WIND_COOKIE(frame, dht_mkdir_hashed_cbk, local->hashed_subvol, + local->hashed_subvol, local->hashed_subvol->fops->mkdir, + loc, mode, umask, params); + + return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (mkdir, frame, -1, op_errno, NULL, NULL, NULL, - NULL, NULL); + DHT_STACK_UNWIND(mkdir, frame, -1, local->op_errno, NULL, NULL, NULL, NULL, + NULL); - return 0; + return 0; } - int -dht_rmdir_selfheal_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, dict_t *xdata) +dht_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + mode_t umask, dict_t *params) { - dht_local_t *local = NULL; - - local = frame->local; + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + int op_errno = EINVAL, ret = -1; + xlator_t *hashed_subvol = NULL; + char pgfid[GF_UUID_BUF_SIZE] = {0}; + call_stub_t *stub = NULL; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(loc, err); + VALIDATE_OR_GOTO(loc->inode, err); + VALIDATE_OR_GOTO(loc->path, err); + VALIDATE_OR_GOTO(this->private, err); + + gf_uuid_unparse(loc->parent->gfid, pgfid); + + conf = this->private; + + if (!params || !dict_get(params, "gfid-req")) { + op_errno = EPERM; + gf_msg_callingfn(this->name, GF_LOG_WARNING, op_errno, + DHT_MSG_GFID_NULL, + "mkdir: %s is received " + "without gfid-req %p", + loc->path, params); + goto err; + } + + dht_get_du_info(frame, this, loc); + + local = dht_local_init(frame, loc, NULL, GF_FOP_MKDIR); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + hashed_subvol = dht_subvol_get_hashed(this, loc); + if (hashed_subvol == NULL) { + gf_msg_debug(this->name, 0, "hashed subvol not found for %s", + loc->path); + local->op_errno = EIO; + goto err; + } + + local->hashed_subvol = hashed_subvol; + local->mode = mode; + local->umask = umask; + if (params) + local->params = dict_ref(params); + + local->inode = inode_ref(loc->inode); + + local->layout = dht_layout_new(this, conf->subvolume_cnt); + if (!local->layout) { + op_errno = ENOMEM; + goto err; + } + + /* set the newly created directory hash to the commit hash + * if the configuration option is set. If configuration option + * is not set, the older clients may still be connecting to the + * volume and hence we need to preserve the 1 in disk[0] part of the + * layout xattr */ + if (conf->lookup_optimize) + local->layout->commit_hash = conf->vol_commit_hash; + else + local->layout->commit_hash = DHT_LAYOUT_HASH_INVALID; + + stub = fop_mkdir_stub(frame, dht_mkdir_guard_parent_layout_cbk, loc, mode, + umask, params); + if (stub == NULL) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, + DHT_MSG_PARENT_LAYOUT_CHANGED, + "mkdir (%s/%s) (path: %s): " + "creating stub failed.", + pgfid, loc->name, loc->path); + local->op_errno = ENOMEM; + goto err; + } + + ret = dht_guard_parent_layout_and_namespace(this, stub); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_PARENT_LAYOUT_CHANGED, + "mkdir (%s/%s) (path: %s) cannot wind lock request to " + "guard parent layout", + pgfid, loc->name, loc->path); + goto err; + } + + return 0; - DHT_STACK_UNWIND (rmdir, frame, local->op_ret, local->op_errno, - &local->preparent, &local->postparent, NULL); +err: + op_errno = local ? local->op_errno : op_errno; + DHT_STACK_UNWIND(mkdir, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL); - return 0; + return 0; } - -int -dht_rmdir_hashed_subvol_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) +static int +dht_rmdir_selfheal_cbk(call_frame_t *heal_frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) { - dht_local_t *local = NULL; - int this_call_cnt = 0; - call_frame_t *prev = NULL; + dht_local_t *local = NULL; + dht_local_t *heal_local = NULL; + call_frame_t *main_frame = NULL; - local = frame->local; - prev = cookie; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - local->op_errno = op_errno; - local->op_ret = -1; - if (op_errno != ENOENT && op_errno != EACCES) { - local->need_selfheal = 1; - } + heal_local = heal_frame->local; + main_frame = heal_local->main_frame; + local = main_frame->local; + DHT_STACK_DESTROY(heal_frame); + dht_set_fixed_dir_stat(&local->preparent); + dht_set_fixed_dir_stat(&local->postparent); - gf_log (this->name, GF_LOG_DEBUG, - "rmdir on %s for %s failed (%s)", - prev->this->name, local->loc.path, - strerror (op_errno)); - goto unlock; - } + DHT_STACK_UNWIND(rmdir, main_frame, local->op_ret, local->op_errno, + &local->preparent, &local->postparent, NULL); - dht_iatt_merge (this, &local->preparent, preparent, prev->this); - dht_iatt_merge (this, &local->postparent, postparent, - prev->this); + return 0; +} +static int +dht_rmdir_hashed_subvol_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + dht_local_t *local = NULL; + dht_local_t *heal_local = NULL; + call_frame_t *heal_frame = NULL; + dht_conf_t *conf = NULL; + int this_call_cnt = 0; + xlator_t *prev = NULL; + char gfid[GF_UUID_BUF_SIZE] = {0}; + + local = frame->local; + prev = cookie; + conf = this->private; + + gf_uuid_unparse(local->loc.gfid, gfid); + + LOCK(&frame->lock); + { + if (op_ret == -1) { + local->op_errno = op_errno; + local->op_ret = -1; + if (conf->subvolume_cnt != 1) { + if (op_errno != ENOENT && op_errno != EACCES && + op_errno != ESTALE) { + local->need_selfheal = 1; + } + } + + gf_msg_debug(this->name, op_errno, + "rmdir on %s for %s failed " + "(gfid = %s)", + prev->name, local->loc.path, gfid); + goto unlock; } + + dht_iatt_merge(this, &local->preparent, preparent); + dht_iatt_merge(this, &local->postparent, postparent); + } unlock: - UNLOCK (&frame->lock); - - this_call_cnt = dht_frame_return (frame); - if (is_last_call (this_call_cnt)) { - if (local->need_selfheal) { - local->layout = - dht_layout_get (this, local->loc.inode); - - /* TODO: neater interface needed below */ - local->stbuf.ia_type = local->loc.inode->ia_type; - - uuid_copy (local->gfid, local->loc.inode->gfid); - dht_selfheal_restore (frame, dht_rmdir_selfheal_cbk, - &local->loc, local->layout); - } else { - - if (local->loc.parent) { - dht_inode_ctx_time_update (local->loc.parent, - this, - &local->preparent, - 0); - - dht_inode_ctx_time_update (local->loc.parent, - this, - &local->postparent, - 1); - } + UNLOCK(&frame->lock); - DHT_STACK_UNWIND (rmdir, frame, local->op_ret, - local->op_errno, &local->preparent, - &local->postparent, NULL); - } - } + this_call_cnt = dht_frame_return(frame); + if (is_last_call(this_call_cnt)) { + if (local->need_selfheal) { + dht_rmdir_unlock(frame, this); + local->layout = dht_layout_get(this, local->loc.inode); - return 0; -} + /* TODO: neater interface needed below */ + local->stbuf.ia_type = local->loc.inode->ia_type; + gf_uuid_copy(local->gfid, local->loc.inode->gfid); -int -dht_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) -{ - dht_local_t *local = NULL; - int this_call_cnt = 0; - call_frame_t *prev = NULL; - int done = 0; + /* Use a different frame or else the rmdir op_ret is + * overwritten by that of the selfheal */ - local = frame->local; - prev = cookie; + heal_frame = copy_frame(frame); - LOCK (&frame->lock); - { - if (op_ret == -1) { - local->op_errno = op_errno; - local->op_ret = -1; + if (heal_frame == NULL) { + goto err; + } - if (op_errno != ENOENT && op_errno != EACCES) { - local->need_selfheal = 1; - } + heal_local = dht_local_init(heal_frame, &local->loc, NULL, 0); + if (!heal_local) { + DHT_STACK_DESTROY(heal_frame); + goto err; + } - gf_log (this->name, GF_LOG_DEBUG, - "rmdir on %s for %s failed (%s)", - prev->this->name, local->loc.path, - strerror (op_errno)); - goto unlock; - } + heal_local->inode = inode_ref(local->loc.inode); + heal_local->main_frame = frame; + gf_uuid_copy(heal_local->gfid, local->loc.inode->gfid); - /* Track if rmdir succeeded on atleast one subvol*/ - local->fop_succeeded = 1; - dht_iatt_merge (this, &local->preparent, preparent, prev->this); - dht_iatt_merge (this, &local->postparent, postparent, - prev->this); - } -unlock: - UNLOCK (&frame->lock); + dht_selfheal_restore(heal_frame, dht_rmdir_selfheal_cbk, + &heal_local->loc, heal_local->layout); + return 0; + } else { + if (local->loc.parent) { + dht_inode_ctx_time_update(local->loc.parent, this, + &local->preparent, 0); + dht_inode_ctx_time_update(local->loc.parent, this, + &local->postparent, 1); + } - this_call_cnt = dht_frame_return (frame); + dht_set_fixed_dir_stat(&local->preparent); + dht_set_fixed_dir_stat(&local->postparent); - /* if local->hashed_subvol, we are yet to wind to hashed_subvol. */ - if (local->hashed_subvol && (this_call_cnt == 1)) { - done = 1; - } else if (!local->hashed_subvol && !this_call_cnt) { - done = 1; + dht_rmdir_unlock(frame, this); + DHT_STACK_UNWIND(rmdir, frame, local->op_ret, local->op_errno, + &local->preparent, &local->postparent, NULL); } + } + return 0; - if (done) { - if (local->need_selfheal && local->fop_succeeded) { - local->layout = - dht_layout_get (this, local->loc.inode); +err: + DHT_STACK_UNWIND(rmdir, frame, local->op_ret, local->op_errno, NULL, NULL, + NULL); + return 0; +} - /* TODO: neater interface needed below */ - local->stbuf.ia_type = local->loc.inode->ia_type; +static int +dht_rmdir_unlock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + DHT_STACK_DESTROY(frame); + return 0; +} - uuid_copy (local->gfid, local->loc.inode->gfid); - dht_selfheal_restore (frame, dht_rmdir_selfheal_cbk, - &local->loc, local->layout); - } else if (this_call_cnt) { - /* If non-hashed subvol's have responded, proceed */ +static int +dht_rmdir_unlock(call_frame_t *frame, xlator_t *this) +{ + dht_local_t *local = NULL, *lock_local = NULL; + call_frame_t *lock_frame = NULL; + int lock_count = 0; - local->need_selfheal = 0; - STACK_WIND (frame, dht_rmdir_hashed_subvol_cbk, - local->hashed_subvol, - local->hashed_subvol->fops->rmdir, - &local->loc, local->flags, NULL); - } else if (!this_call_cnt) { - /* All subvol's have responded, proceed */ + local = frame->local; - if (local->loc.parent) { + /* Unlock entrylk */ + dht_unlock_entrylk_wrapper(frame, &local->lock[0].ns.directory_ns); - dht_inode_ctx_time_update (local->loc.parent, - this, - &local->preparent, - 0); + /* Unlock inodelk */ + lock_count = dht_lock_count(local->lock[0].ns.parent_layout.locks, + local->lock[0].ns.parent_layout.lk_count); - dht_inode_ctx_time_update (local->loc.parent, - this, - &local->postparent, - 1); + if (lock_count == 0) + goto done; - } + lock_frame = copy_frame(frame); + if (lock_frame == NULL) + goto done; - DHT_STACK_UNWIND (rmdir, frame, local->op_ret, - local->op_errno, &local->preparent, - &local->postparent, NULL); - } - } + lock_local = dht_local_init(lock_frame, &local->loc, NULL, + lock_frame->root->op); + if (lock_local == NULL) + goto done; - return 0; -} + lock_local->lock[0].ns.parent_layout.locks = local->lock[0] + .ns.parent_layout.locks; + lock_local->lock[0] + .ns.parent_layout.lk_count = local->lock[0].ns.parent_layout.lk_count; + local->lock[0].ns.parent_layout.locks = NULL; + local->lock[0].ns.parent_layout.lk_count = 0; + dht_unlock_inodelk(lock_frame, lock_local->lock[0].ns.parent_layout.locks, + lock_local->lock[0].ns.parent_layout.lk_count, + dht_rmdir_unlock_cbk); + lock_frame = NULL; -int -dht_rmdir_do (call_frame_t *frame, xlator_t *this) +done: + if (lock_frame != NULL) { + DHT_STACK_DESTROY(lock_frame); + } + + return 0; +} + +static int +dht_rmdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) { - dht_local_t *local = NULL; - dht_conf_t *conf = NULL; - int i = 0; - xlator_t *hashed_subvol = NULL; + dht_local_t *local = NULL; + int this_call_cnt = 0; + xlator_t *prev = NULL; + int done = 0; + char gfid[GF_UUID_BUF_SIZE] = {0}; + dht_local_t *heal_local = NULL; + call_frame_t *heal_frame = NULL; + int ret = -1; + + local = frame->local; + prev = cookie; + + LOCK(&frame->lock); + { + if (op_ret == -1) { + if ((op_errno != ENOENT) && (op_errno != ESTALE)) { + local->op_errno = op_errno; + local->op_ret = -1; - VALIDATE_OR_GOTO (this->private, err); + if (op_errno != EACCES) + local->need_selfheal = 1; + } - conf = this->private; - local = frame->local; + gf_uuid_unparse(local->loc.gfid, gfid); - if (local->op_ret == -1) - goto err; + gf_msg_debug(this->name, op_errno, + "rmdir on %s for %s failed." + "(gfid = %s)", + prev->name, local->loc.path, gfid); + goto unlock; + } - local->call_cnt = conf->subvolume_cnt; + /* Track if rmdir succeeded on at least one subvol*/ + local->fop_succeeded = 1; + dht_iatt_merge(this, &local->preparent, preparent); + dht_iatt_merge(this, &local->postparent, postparent); + } +unlock: + UNLOCK(&frame->lock); - /* first remove from non-hashed_subvol */ - hashed_subvol = dht_subvol_get_hashed (this, &local->loc); + this_call_cnt = dht_frame_return(frame); - if (!hashed_subvol) { - gf_log (this->name, GF_LOG_WARNING, "failed to get hashed " - "subvol for %s",local->loc.path); - } else { - local->hashed_subvol = hashed_subvol; - } + /* if local->hashed_subvol, we are yet to wind to hashed_subvol. */ + if (local->hashed_subvol && (this_call_cnt == 1)) { + done = 1; + } else if (!local->hashed_subvol && !this_call_cnt) { + done = 1; + } + + if (done) { + if (local->need_selfheal && local->fop_succeeded) { + dht_rmdir_unlock(frame, this); + local->layout = dht_layout_get(this, local->loc.inode); + + /* TODO: neater interface needed below */ + local->stbuf.ia_type = local->loc.inode->ia_type; + + gf_uuid_copy(local->gfid, local->loc.inode->gfid); + heal_frame = copy_frame(frame); + if (heal_frame == NULL) { + goto err; + } + + heal_local = dht_local_init(heal_frame, &local->loc, NULL, 0); + if (!heal_local) { + DHT_STACK_DESTROY(heal_frame); + goto err; + } + + heal_local->inode = inode_ref(local->loc.inode); + heal_local->main_frame = frame; + gf_uuid_copy(heal_local->gfid, local->loc.inode->gfid); + ret = dht_selfheal_restore(heal_frame, dht_rmdir_selfheal_cbk, + &heal_local->loc, heal_local->layout); + if (ret) { + DHT_STACK_DESTROY(heal_frame); + goto err; + } + + } else if (this_call_cnt) { + /* If non-hashed subvol's have responded, proceed */ + if (local->op_ret == 0) { + /* Delete the dir from the hashed subvol if: + * The fop succeeded on at least one subvol + * and did not fail on any + * or + * The fop failed with ENOENT/ESTALE on + * all subvols */ + + STACK_WIND_COOKIE(frame, dht_rmdir_hashed_subvol_cbk, + local->hashed_subvol, local->hashed_subvol, + local->hashed_subvol->fops->rmdir, + &local->loc, local->flags, NULL); + } else { + /* hashed-subvol was non-NULL and rmdir failed on + * all non hashed-subvols. Unwind rmdir with + * local->op_ret and local->op_errno. */ + dht_rmdir_unlock(frame, this); + DHT_STACK_UNWIND(rmdir, frame, local->op_ret, local->op_errno, + &local->preparent, &local->postparent, NULL); - /* When DHT has only 1 child */ - if (conf->subvolume_cnt == 1) { - STACK_WIND (frame, dht_rmdir_hashed_subvol_cbk, - conf->subvolumes[0], - conf->subvolumes[0]->fops->rmdir, - &local->loc, local->flags, NULL); return 0; - } + } + } else if (!this_call_cnt) { + /* All subvol's have responded, proceed */ - for (i = 0; i < conf->subvolume_cnt; i++) { - if (hashed_subvol && - (hashed_subvol == conf->subvolumes[i])) - continue; + if (local->loc.parent) { + dht_inode_ctx_time_update(local->loc.parent, this, + &local->preparent, 0); + + dht_inode_ctx_time_update(local->loc.parent, this, + &local->postparent, 1); + } + + dht_set_fixed_dir_stat(&local->preparent); + dht_set_fixed_dir_stat(&local->postparent); - STACK_WIND (frame, dht_rmdir_cbk, - conf->subvolumes[i], - conf->subvolumes[i]->fops->rmdir, - &local->loc, local->flags, NULL); + dht_rmdir_unlock(frame, this); + DHT_STACK_UNWIND(rmdir, frame, local->op_ret, local->op_errno, + &local->preparent, &local->postparent, NULL); } + } - return 0; + return 0; err: - DHT_STACK_UNWIND (rmdir, frame, local->op_ret, local->op_errno, - &local->preparent, &local->postparent, NULL); - return 0; + DHT_STACK_UNWIND(rmdir, frame, -1, local->op_errno, NULL, NULL, NULL); + return 0; } - -int -dht_rmdir_linkfile_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) +static int +dht_rmdir_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - dht_local_t *local = NULL; - call_frame_t *prev = NULL; - xlator_t *src = NULL; - call_frame_t *main_frame = NULL; - dht_local_t *main_local = NULL; - int this_call_cnt = 0; + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + int i = 0; + xlator_t *hashed_subvol; - local = frame->local; - prev = cookie; - src = prev->this; + conf = this->private; + local = frame->local; - main_frame = local->main_frame; - main_local = main_frame->local; + if (op_ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, op_errno, DHT_MSG_INODE_LK_ERROR, + "acquiring entrylk after inodelk failed rmdir for %s)", + local->loc.path); - if (op_ret == 0) { - gf_log (this->name, GF_LOG_TRACE, - "unlinked linkfile %s on %s", - local->loc.path, src->name); - } else { - main_local->op_ret = -1; - main_local->op_errno = op_errno; - gf_log (this->name, GF_LOG_DEBUG, - "unlink of %s on %s failed (%s)", - local->loc.path, src->name, strerror (op_errno)); - } + local->op_ret = -1; + local->op_errno = op_errno; + goto err; + } - this_call_cnt = dht_frame_return (main_frame); - if (is_last_call (this_call_cnt)) - dht_rmdir_do (main_frame, this); + hashed_subvol = local->hashed_subvol; + for (i = 0; i < conf->subvolume_cnt; i++) { + if (hashed_subvol && (hashed_subvol == conf->subvolumes[i])) + continue; - DHT_STACK_DESTROY (frame); - return 0; -} + STACK_WIND_COOKIE(frame, dht_rmdir_cbk, conf->subvolumes[i], + conf->subvolumes[i], conf->subvolumes[i]->fops->rmdir, + &local->loc, local->flags, NULL); + } + return 0; -int -dht_rmdir_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, inode_t *inode, - struct iatt *stbuf, dict_t *xattr, struct iatt *parent) +err: + DHT_STACK_UNWIND(rmdir, frame, local->op_ret, local->op_errno, + &local->preparent, &local->postparent, NULL); + + return 0; +} + +static int +dht_rmdir_do(call_frame_t *frame, xlator_t *this) { - dht_local_t *local = NULL; - call_frame_t *prev = NULL; - xlator_t *src = NULL; - call_frame_t *main_frame = NULL; - dht_local_t *main_local = NULL; - int this_call_cnt = 0; - dht_conf_t *conf = this->private; + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + int ret = -1; + xlator_t *hashed_subvol = NULL; + char gfid[GF_UUID_BUF_SIZE] = {0}; - local = frame->local; - prev = cookie; - src = prev->this; + VALIDATE_OR_GOTO(frame->local, err); + local = frame->local; + VALIDATE_OR_GOTO(this->private, out); + conf = this->private; - main_frame = local->main_frame; - main_local = main_frame->local; + if (local->op_ret == -1) + goto out; - if (op_ret != 0) - goto err; + local->call_cnt = conf->subvolume_cnt; - if (!check_is_linkfile (inode, stbuf, xattr, conf->link_xattr_name)) { - main_local->op_ret = -1; - main_local->op_errno = ENOTEMPTY; + /* first remove from non-hashed_subvol */ + hashed_subvol = dht_subvol_get_hashed(this, &local->loc); - gf_log (this->name, GF_LOG_WARNING, - "%s on %s found to be not a linkfile (type=0%o)", - local->loc.path, src->name, stbuf->ia_type); - goto err; - } + if (!hashed_subvol) { + gf_uuid_unparse(local->loc.gfid, gfid); - STACK_WIND (frame, dht_rmdir_linkfile_unlink_cbk, - src, src->fops->unlink, &local->loc, 0, NULL); + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_HASHED_SUBVOL_GET_FAILED, + "Failed to get hashed subvol for %s (gfid = %s)", + local->loc.path, gfid); + } else { + local->hashed_subvol = hashed_subvol; + } + + /* When DHT has only 1 child */ + if (conf->subvolume_cnt == 1) { + STACK_WIND_COOKIE(frame, dht_rmdir_hashed_subvol_cbk, + conf->subvolumes[0], conf->subvolumes[0], + conf->subvolumes[0]->fops->rmdir, &local->loc, + local->flags, NULL); return 0; + } + + local->current = &local->lock[0]; + ret = dht_protect_namespace(frame, &local->loc, local->hashed_subvol, + &local->current->ns, dht_rmdir_lock_cbk); + if (ret < 0) { + local->op_ret = -1; + local->op_errno = errno ? errno : EINVAL; + goto out; + } + + return 0; + +out: + dht_set_fixed_dir_stat(&local->preparent); + dht_set_fixed_dir_stat(&local->postparent); + + DHT_STACK_UNWIND(rmdir, frame, local->op_ret, local->op_errno, + &local->preparent, &local->postparent, NULL); + return 0; err: + DHT_STACK_UNWIND(rmdir, frame, -1, EINVAL, NULL, NULL, NULL); + return 0; +} - this_call_cnt = dht_frame_return (main_frame); - if (is_last_call (this_call_cnt)) - dht_rmdir_do (main_frame, this); +static void +dht_rmdir_readdirp_done(call_frame_t *readdirp_frame, xlator_t *this) +{ + call_frame_t *main_frame = NULL; + dht_local_t *main_local = NULL; + dht_local_t *local = NULL; + int this_call_cnt = 0; + + local = readdirp_frame->local; + main_frame = local->main_frame; + main_local = main_frame->local; + + /* At least one readdirp failed. + * This is a bit hit or miss - if readdirp failed on more than + * one subvol, we don't know which error is returned. + */ + if (local->op_ret == -1) { + main_local->op_ret = local->op_ret; + main_local->op_errno = local->op_errno; + } + + this_call_cnt = dht_frame_return(main_frame); + + if (is_last_call(this_call_cnt)) + dht_rmdir_do(main_frame, this); + + DHT_STACK_DESTROY(readdirp_frame); +} - DHT_STACK_DESTROY (frame); +/* Keep sending readdirp on the subvol until it returns no more entries + * It is possible that not all entries will fit in a single readdirp in + * which case the rmdir will keep failing with ENOTEMPTY + */ + +static int +dht_rmdir_readdirp_do(call_frame_t *readdirp_frame, xlator_t *this) +{ + dht_local_t *local = NULL; + + local = readdirp_frame->local; + + if (local->op_ret == -1) { + /* there is no point doing another readdirp on this + * subvol . */ + dht_rmdir_readdirp_done(readdirp_frame, this); return 0; + } + + STACK_WIND_COOKIE(readdirp_frame, dht_rmdir_readdirp_cbk, + local->hashed_subvol, local->hashed_subvol, + local->hashed_subvol->fops->readdirp, local->fd, 4096, 0, + local->xattr); + + return 0; } +static int +dht_rmdir_linkfile_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + dht_local_t *local = NULL; + xlator_t *prev = NULL; + xlator_t *src = NULL; + call_frame_t *readdirp_frame = NULL; + dht_local_t *readdirp_local = NULL; + int this_call_cnt = 0; + char gfid[GF_UUID_BUF_SIZE] = {0}; + + local = frame->local; + prev = cookie; + src = prev; + + readdirp_frame = local->main_frame; + readdirp_local = readdirp_frame->local; + + gf_uuid_unparse(local->loc.gfid, gfid); + + if (op_ret == 0) { + gf_msg_trace(this->name, 0, "Unlinked linkfile %s on %s, gfid = %s", + local->loc.path, src->name, gfid); + } else { + if (op_errno != ENOENT) { + readdirp_local->op_ret = -1; + readdirp_local->op_errno = op_errno; + } + gf_msg_debug(this->name, op_errno, + "Unlink of %s on %s failed. (gfid = %s)", local->loc.path, + src->name, gfid); + } + + this_call_cnt = dht_frame_return(readdirp_frame); + + if (is_last_call(this_call_cnt)) + dht_rmdir_readdirp_do(readdirp_frame, this); + + DHT_STACK_DESTROY(frame); + return 0; +} -int -dht_rmdir_cached_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, inode_t *inode, - struct iatt *stbuf, dict_t *xattr, - struct iatt *parent) -{ - dht_local_t *local = NULL; - xlator_t *src = NULL; - call_frame_t *main_frame = NULL; - dht_local_t *main_local = NULL; - int this_call_cnt = 0; - dht_conf_t *conf = this->private; - dict_t *xattrs = NULL; - int ret = 0; +static int +dht_rmdir_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, + struct iatt *stbuf, dict_t *xattr, struct iatt *parent) +{ + dht_local_t *local = NULL; + xlator_t *prev = NULL; + xlator_t *src = NULL; + call_frame_t *readdirp_frame = NULL; + dht_local_t *readdirp_local = NULL; + int this_call_cnt = 0; + dht_conf_t *conf = this->private; + char gfid[GF_UUID_BUF_SIZE] = {0}; + + local = frame->local; + prev = cookie; + src = prev; + + gf_msg_debug(this->name, 0, "dht_rmdir_lookup_cbk %s", local->loc.path); + + readdirp_frame = local->main_frame; + readdirp_local = readdirp_frame->local; + + if (op_ret != 0) { + gf_msg(this->name, GF_LOG_WARNING, op_errno, DHT_MSG_FILE_LOOKUP_FAILED, + "lookup failed for %s on %s", local->loc.path, src->name); + goto err; + } + + if (!check_is_linkfile(inode, stbuf, xattr, conf->link_xattr_name)) { + readdirp_local->op_ret = -1; + readdirp_local->op_errno = ENOTEMPTY; + + gf_uuid_unparse(local->loc.gfid, gfid); + + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_NOT_LINK_FILE_ERROR, + "%s on %s is not a linkfile (type=0%o, gfid = %s)", + local->loc.path, src->name, stbuf->ia_type, gfid); + goto err; + } + + STACK_WIND_COOKIE(frame, dht_rmdir_linkfile_unlink_cbk, src, src, + src->fops->unlink, &local->loc, 0, NULL); + return 0; +err: - local = frame->local; - src = local->hashed_subvol; + this_call_cnt = dht_frame_return(readdirp_frame); + if (is_last_call(this_call_cnt)) { + dht_rmdir_readdirp_do(readdirp_frame, this); + } - main_frame = local->main_frame; - main_local = main_frame->local; + DHT_STACK_DESTROY(frame); + return 0; +} - if (op_ret == 0) { - main_local->op_ret = -1; - main_local->op_errno = ENOTEMPTY; +static int +dht_rmdir_cached_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, + struct iatt *stbuf, dict_t *xattr, + struct iatt *parent) +{ + dht_local_t *local = NULL; + xlator_t *src = NULL; + call_frame_t *readdirp_frame = NULL; + dht_local_t *readdirp_local = NULL; + int this_call_cnt = 0; + dht_conf_t *conf = this->private; + dict_t *xattrs = NULL; + int ret = 0; + + local = frame->local; + src = local->hashed_subvol; + + /* main_frame here is the readdirp_frame */ + + readdirp_frame = local->main_frame; + readdirp_local = readdirp_frame->local; + + gf_msg_debug(this->name, 0, "returning for %s ", local->loc.path); + + if (op_ret == 0) { + readdirp_local->op_ret = -1; + readdirp_local->op_errno = ENOTEMPTY; + + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_SUBVOL_ERROR, + "%s found on cached subvol %s", local->loc.path, src->name); + goto err; + } else if (op_errno != ENOENT) { + readdirp_local->op_ret = -1; + readdirp_local->op_errno = op_errno; + + gf_msg(this->name, GF_LOG_WARNING, op_errno, DHT_MSG_SUBVOL_ERROR, + "%s not found on cached subvol %s", local->loc.path, src->name); + goto err; + } + + xattrs = dict_new(); + if (!xattrs) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY, + "dict_new failed"); + goto err; + } + + ret = dict_set_uint32(xattrs, conf->link_xattr_name, 256); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, + "Failed to set dictionary value: key = %s", + conf->link_xattr_name); + if (xattrs) + dict_unref(xattrs); + goto err; + } + STACK_WIND_COOKIE(frame, dht_rmdir_lookup_cbk, src, src, src->fops->lookup, + &local->loc, xattrs); + if (xattrs) + dict_unref(xattrs); + + return 0; +err: - gf_log (this->name, GF_LOG_WARNING, - "%s found on cached subvol %s", - local->loc.path, src->name); - goto err; - } else if (op_errno != ENOENT) { - main_local->op_ret = -1; - main_local->op_errno = op_errno; - goto err; - } + this_call_cnt = dht_frame_return(readdirp_frame); - xattrs = dict_new (); - if (!xattrs) { - gf_log (this->name, GF_LOG_ERROR, "dict_new failed"); - goto err; - } + /* Once all the lookups/unlinks etc have returned, proceed to wind + * readdirp on the subvol again until no entries are returned. + * This is required if there are more entries than can be returned + * in a single readdirp call. + */ - ret = dict_set_uint32 (xattrs, conf->link_xattr_name, 256); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "failed to set linkto key" - " in dict"); - if (xattrs) - dict_unref (xattrs); - goto err; - } + if (is_last_call(this_call_cnt)) + dht_rmdir_readdirp_do(readdirp_frame, this); - STACK_WIND (frame, dht_rmdir_lookup_cbk, - src, src->fops->lookup, &local->loc, xattrs); - if (xattrs) - dict_unref (xattrs); + DHT_STACK_DESTROY(frame); + return 0; +} +static int +dht_rmdir_is_subvol_empty(call_frame_t *frame, xlator_t *this, + gf_dirent_t *entries, xlator_t *src) +{ + int ret = 0; + int build_ret = 0; + gf_dirent_t *trav = NULL; + call_frame_t *lookup_frame = NULL; + dht_local_t *lookup_local = NULL; + dht_local_t *local = NULL; + dict_t *xattrs = NULL; + dht_conf_t *conf = this->private; + xlator_t *subvol = NULL; + char gfid[GF_UUID_BUF_SIZE] = {0}; + int count = 0; + gf_boolean_t unwind = _gf_false; + + local = frame->local; + + list_for_each_entry(trav, &entries->list, list) + { + if (strcmp(trav->d_name, ".") == 0) + continue; + if (strcmp(trav->d_name, "..") == 0) + continue; + if (check_is_linkfile(NULL, (&trav->d_stat), trav->dict, + conf->link_xattr_name)) { + count++; + continue; + } + + /* this entry is either a directory which is neither "." nor "..", + or a non directory which is not a linkfile. the directory is to + be treated as non-empty + */ return 0; -err: + } - this_call_cnt = dht_frame_return (main_frame); - if (is_last_call (this_call_cnt)) - dht_rmdir_do (main_frame, this); + xattrs = dict_new(); + if (!xattrs) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY, + "dict_new failed"); + return -1; + } - DHT_STACK_DESTROY (frame); - return 0; -} + ret = dict_set_uint32(xattrs, conf->link_xattr_name, 256); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, + "Failed to set dictionary value: key = %s", + conf->link_xattr_name); + if (xattrs) + dict_unref(xattrs); + return -1; + } -int -dht_rmdir_is_subvol_empty (call_frame_t *frame, xlator_t *this, - gf_dirent_t *entries, xlator_t *src) -{ - int ret = 0; - int build_ret = 0; - gf_dirent_t *trav = NULL; - call_frame_t *lookup_frame = NULL; - dht_local_t *lookup_local = NULL; - dht_local_t *local = NULL; - dict_t *xattrs = NULL; - dht_conf_t *conf = this->private; - xlator_t *subvol = NULL; + local->call_cnt = count; + ret = 0; - local = frame->local; + list_for_each_entry(trav, &entries->list, list) + { + if (strcmp(trav->d_name, ".") == 0) + continue; + if (strcmp(trav->d_name, "..") == 0) + continue; - list_for_each_entry (trav, &entries->list, list) { - if (strcmp (trav->d_name, ".") == 0) - continue; - if (strcmp (trav->d_name, "..") == 0) - continue; - if (check_is_linkfile (NULL, (&trav->d_stat), trav->dict, - conf->link_xattr_name)) { - ret++; - continue; - } + lookup_frame = copy_frame(frame); - /* this entry is either a directory which is neither "." nor "..", - or a non directory which is not a linkfile. the directory is to - be treated as non-empty - */ - return 0; + if (!lookup_frame) { + /* out of memory, let the rmdir fail + (as non-empty, unfortunately) */ + goto err; } - xattrs = dict_new (); - if (!xattrs) { - gf_log (this->name, GF_LOG_ERROR, "dict_new failed"); - return -1; + lookup_local = dht_local_init(lookup_frame, NULL, NULL, GF_FOP_LOOKUP); + if (!lookup_local) { + goto err; } - ret = dict_set_uint32 (xattrs, conf->link_xattr_name, 256); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "failed to set linkto key" - " in dict"); - if (xattrs) - dict_unref (xattrs); - return -1; - } - - list_for_each_entry (trav, &entries->list, list) { - if (strcmp (trav->d_name, ".") == 0) - continue; - if (strcmp (trav->d_name, "..") == 0) - continue; - - lookup_frame = NULL; - lookup_local = NULL; - - lookup_frame = copy_frame (frame); - if (!lookup_frame) { - /* out of memory, let the rmdir fail - (as non-empty, unfortunately) */ - goto err; - } + lookup_frame->local = lookup_local; + lookup_local->main_frame = frame; + lookup_local->hashed_subvol = src; - lookup_local = mem_get0 (this->local_pool); - if (!lookup_local) { - goto err; - } + build_ret = dht_build_child_loc(this, &lookup_local->loc, &local->loc, + trav->d_name); + if (build_ret != 0) + goto err; - lookup_frame->local = lookup_local; - lookup_local->main_frame = frame; - lookup_local->hashed_subvol = src; + gf_uuid_copy(lookup_local->loc.gfid, trav->d_stat.ia_gfid); - build_ret = dht_build_child_loc (this, &lookup_local->loc, - &local->loc, trav->d_name); - if (build_ret != 0) - goto err; + gf_uuid_unparse(lookup_local->loc.gfid, gfid); - uuid_copy (lookup_local->loc.gfid, trav->d_stat.ia_gfid); + gf_msg_trace(this->name, 0, "looking up %s on subvolume %s, gfid = %s", + lookup_local->loc.path, src->name, gfid); - gf_log (this->name, GF_LOG_TRACE, - "looking up %s on %s", - lookup_local->loc.path, src->name); + subvol = dht_linkfile_subvol(this, NULL, &trav->d_stat, trav->dict); + if (!subvol || (subvol == src)) { + /* we need to delete the linkto file if it does not have a + * valid subvol or it points to itself. + */ + gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_INVALID_LINKFILE, + "Linkfile does not have link subvolume. " + "path = %s, gfid = %s", + lookup_local->loc.path, gfid); - LOCK (&frame->lock); - { - local->call_cnt++; - } - UNLOCK (&frame->lock); - - subvol = dht_linkfile_subvol (this, NULL, &trav->d_stat, - trav->dict); - if (!subvol) { - gf_log (this->name, GF_LOG_INFO, - "linkfile not having link subvolume. path=%s", - lookup_local->loc.path); - STACK_WIND (lookup_frame, dht_rmdir_lookup_cbk, - src, src->fops->lookup, - &lookup_local->loc, xattrs); - } else { - STACK_WIND (lookup_frame, dht_rmdir_cached_lookup_cbk, - subvol, subvol->fops->lookup, - &lookup_local->loc, xattrs); - } - ret++; + gf_msg_debug(this->name, 0, "looking up %s on subvol %s, gfid = %s", + lookup_local->loc.path, src->name, gfid); + + STACK_WIND_COOKIE(lookup_frame, dht_rmdir_lookup_cbk, src, src, + src->fops->lookup, &lookup_local->loc, xattrs); + } else { + gf_msg_debug(this->name, 0, + "Looking up linkfile target %s on " + " subvol %s, gfid = %s", + lookup_local->loc.path, subvol->name, gfid); + + STACK_WIND(lookup_frame, dht_rmdir_cached_lookup_cbk, subvol, + subvol->fops->lookup, &lookup_local->loc, xattrs); } + ret++; - if (xattrs) - dict_unref (xattrs); + lookup_frame = NULL; + lookup_local = NULL; + } - return ret; + if (xattrs) + dict_unref(xattrs); + + return ret; err: - if (xattrs) - dict_unref (xattrs); + if (xattrs) + dict_unref(xattrs); - DHT_STACK_DESTROY (lookup_frame); - return 0; + if (lookup_frame) + DHT_STACK_DESTROY(lookup_frame); + + /* Handle the case where the wound calls have unwound before the + * loop processing is done + */ + + LOCK(&frame->lock); + { + local->op_ret = -1; + local->op_errno = ENOTEMPTY; + + local->call_cnt -= (count - ret); + if (!local->call_cnt) + unwind = _gf_true; + } + UNLOCK(&frame->lock); + + if (!unwind) { + return ret; + } + return 0; } +/* + * No more entries on this subvol. Proceed to the actual rmdir operation. + */ -int -dht_rmdir_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, gf_dirent_t *entries, - dict_t *xdata) +static int +dht_rmdir_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, gf_dirent_t *entries, + dict_t *xdata) { - dht_local_t *local = NULL; - int this_call_cnt = -1; - call_frame_t *prev = NULL; - xlator_t *src = NULL; - int ret = 0; + dht_local_t *local = NULL; + xlator_t *prev = NULL; + xlator_t *src = NULL; + int ret = 0; + char *path = NULL; + + local = frame->local; + prev = cookie; + src = prev; + + if (op_ret > 2) { + /* dht_rmdir_is_subvol_empty() may free the frame, + * copy path for logging. + */ + path = gf_strdup(local->loc.path); - local = frame->local; - prev = cookie; - src = prev->this; - - if (op_ret > 2) { - ret = dht_rmdir_is_subvol_empty (frame, this, entries, src); - - switch (ret) { - case 0: /* non linkfiles exist */ - gf_log (this->name, GF_LOG_TRACE, - "readdir on %s for %s returned %d entries", - prev->this->name, local->loc.path, op_ret); - local->op_ret = -1; - local->op_errno = ENOTEMPTY; - break; - default: - /* @ret number of linkfiles are getting unlinked */ - gf_log (this->name, GF_LOG_TRACE, - "readdir on %s for %s found %d linkfiles", - prev->this->name, local->loc.path, ret); - break; - } + ret = dht_rmdir_is_subvol_empty(frame, this, entries, src); + + switch (ret) { + case 0: /* non linkfiles exist */ + gf_msg_trace(this->name, 0, + "readdir on %s for %s returned %d " + "entries", + prev->name, local->loc.path, op_ret); + local->op_ret = -1; + local->op_errno = ENOTEMPTY; + break; + default: + /* @ret number of linkfiles are getting unlinked */ + gf_msg_trace(this->name, 0, + "readdir on %s for %s found %d " + "linkfiles", + prev->name, path, ret); + break; } + } - this_call_cnt = dht_frame_return (frame); + /* readdirp failed or no linkto files were found on this subvol */ + if (!ret) + dht_rmdir_readdirp_done(frame, this); - if (is_last_call (this_call_cnt)) { - dht_rmdir_do (frame, this); - } + GF_FREE(path); + return 0; +} +static int +dht_rmdir_opendir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, fd_t *fd, dict_t *xdata) +{ + dht_local_t *local = NULL; + int this_call_cnt = -1; + xlator_t *prev = NULL; + int ret = 0; + dht_conf_t *conf = this->private; + dict_t *dict = NULL; + int i = 0; + char gfid[GF_UUID_BUF_SIZE] = {0}; + dht_local_t *readdirp_local = NULL; + call_frame_t *readdirp_frame = NULL; + int cnt = 0; + + local = frame->local; + prev = cookie; + + this_call_cnt = dht_frame_return(frame); + if (op_ret == -1) { + gf_uuid_unparse(local->loc.gfid, gfid); + + gf_msg_debug(this->name, op_errno, + "opendir on %s for %s failed, " + "gfid = %s,", + prev->name, local->loc.path, gfid); + if ((op_errno != ENOENT) && (op_errno != ESTALE)) { + local->op_ret = -1; + local->op_errno = op_errno; + } + goto err; + } + + if (!is_last_call(this_call_cnt)) return 0; -} + if (local->op_ret == -1) + goto err; -int -dht_rmdir_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, fd_t *fd, dict_t *xdata) -{ - dht_local_t *local = NULL; - int this_call_cnt = -1; - call_frame_t *prev = NULL; - dict_t *dict = NULL; - int ret = 0; - dht_conf_t *conf = this->private; - int i = 0; + fd_bind(fd); - local = frame->local; - prev = cookie; + dict = dict_new(); + if (!dict) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto err; + } - this_call_cnt = dht_frame_return (frame); - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "opendir on %s for %s failed (%s)", - prev->this->name, local->loc.path, - strerror (op_errno)); - if (op_errno != ENOENT) { - local->op_ret = -1; - local->op_errno = op_errno; - } - goto err; - } + ret = dict_set_uint32(dict, conf->link_xattr_name, 256); + if (ret) + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED, + "%s: Failed to set dictionary value:key = %s", local->loc.path, + conf->link_xattr_name); - if (!is_last_call (this_call_cnt)) - return 0; + cnt = local->call_cnt = conf->subvolume_cnt; - if (local->op_ret == -1) - goto err; + /* Create a separate frame per subvol as we might need + * to resend readdirp multiple times to get all the + * entries. + */ - dict = dict_new (); - if (!dict) { - local->op_ret = -1; - local->op_errno = ENOMEM; - goto err; + for (i = 0; i < conf->subvolume_cnt; i++) { + readdirp_frame = copy_frame(frame); + + if (!readdirp_frame) { + cnt--; + /* Reduce the local->call_cnt as well */ + (void)dht_frame_return(frame); + continue; } - ret = dict_set_uint32 (dict, conf->link_xattr_name, 256); - if (ret) - gf_log (this->name, GF_LOG_WARNING, - "%s: failed to set '%s' key", - local->loc.path, conf->link_xattr_name); + readdirp_local = dht_local_init(readdirp_frame, &local->loc, local->fd, + 0); - local->call_cnt = conf->subvolume_cnt; - for (i = 0; i < conf->subvolume_cnt; i++) { - STACK_WIND (frame, dht_rmdir_readdirp_cbk, - conf->subvolumes[i], - conf->subvolumes[i]->fops->readdirp, - local->fd, 4096, 0, dict); + if (!readdirp_local) { + DHT_STACK_DESTROY(readdirp_frame); + cnt--; + /* Reduce the local->call_cnt as well */ + dht_frame_return(frame); + continue; } + readdirp_local->main_frame = frame; + readdirp_local->op_ret = 0; + readdirp_local->xattr = dict_ref(dict); + /* overload this field to save the subvol info */ + readdirp_local->hashed_subvol = conf->subvolumes[i]; - if (dict) - dict_unref (dict); + STACK_WIND_COOKIE(readdirp_frame, dht_rmdir_readdirp_cbk, + conf->subvolumes[i], conf->subvolumes[i], + conf->subvolumes[i]->fops->readdirp, + readdirp_local->fd, 4096, 0, readdirp_local->xattr); + } - return 0; + if (dict) + dict_unref(dict); + + /* Could not wind readdirp to any subvol */ + + if (!cnt) + goto err; + + return 0; err: - if (is_last_call (this_call_cnt)) { - dht_rmdir_do (frame, this); - } + if (is_last_call(this_call_cnt)) { + dht_rmdir_do(frame, this); + } - return 0; + return 0; } - int -dht_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, - dict_t *xdata) +dht_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, + dict_t *xdata) { - dht_local_t *local = NULL; - dht_conf_t *conf = NULL; - int op_errno = -1; - int i = -1; + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + int op_errno = -1; + int i = -1; + int ret = -1; + dict_t *xattr_req = NULL; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(loc, err); + VALIDATE_OR_GOTO(loc->inode, err); + VALIDATE_OR_GOTO(loc->path, err); + VALIDATE_OR_GOTO(this->private, err); + + conf = this->private; + + local = dht_local_init(frame, loc, NULL, GF_FOP_RMDIR); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + local->call_cnt = conf->subvolume_cnt; + local->op_ret = 0; + local->fop_succeeded = 0; + + local->flags = flags; + + local->fd = fd_create(local->loc.inode, frame->root->pid); + if (!local->fd) { + op_errno = ENOMEM; + goto err; + } + + if (flags) { + return dht_rmdir_do(frame, this); + } + if (xdata) { + xattr_req = dict_ref(xdata); + } else { + xattr_req = dict_new(); + } + if (xattr_req) { + ret = dict_set_uint32(xattr_req, conf->link_xattr_name, 256); + /* If parallel-readdir is enabled, this is required + * to handle stale linkto files in the directory + * being deleted. If this fails, log an error but + * do not prevent the operation. + */ + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, "%s: failed to set key %s", + loc->path, conf->link_xattr_name); + } + } else { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, "%s: failed to set key %s", + loc->path, conf->link_xattr_name); + } + + for (i = 0; i < conf->subvolume_cnt; i++) { + STACK_WIND_COOKIE(frame, dht_rmdir_opendir_cbk, conf->subvolumes[i], + conf->subvolumes[i], + conf->subvolumes[i]->fops->opendir, loc, local->fd, + xattr_req); + } + + if (xattr_req) { + dict_unref(xattr_req); + } + return 0; - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); - VALIDATE_OR_GOTO (this->private, err); +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND(rmdir, frame, -1, op_errno, NULL, NULL, NULL); - conf = this->private; + return 0; +} - local = dht_local_init (frame, loc, NULL, GF_FOP_RMDIR); - if (!local) { - op_errno = ENOMEM; - goto err; - } +static int +dht_entrylk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) - local->call_cnt = conf->subvolume_cnt; - local->op_ret = 0; - local->fop_succeeded = 0; +{ + DHT_STACK_UNWIND(entrylk, frame, op_ret, op_errno, xdata); + return 0; +} + +/* TODO + * Sending entrylk to cached subvol can result in stale lock + * as described in the bug 1311002. + */ +int +dht_entrylk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc, + const char *basename, entrylk_cmd cmd, entrylk_type type, + dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + char gfid[GF_UUID_BUF_SIZE] = {0}; - local->flags = flags; + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(loc, err); + VALIDATE_OR_GOTO(loc->inode, err); - local->fd = fd_create (local->loc.inode, frame->root->pid); - if (!local->fd) { + local = dht_local_init(frame, loc, NULL, GF_FOP_ENTRYLK); + if (!local) { + op_errno = ENOMEM; + goto err; + } - op_errno = ENOMEM; - goto err; - } + subvol = local->cached_subvol; + if (!subvol) { + gf_uuid_unparse(loc->gfid, gfid); - for (i = 0; i < conf->subvolume_cnt; i++) { - STACK_WIND (frame, dht_rmdir_opendir_cbk, - conf->subvolumes[i], - conf->subvolumes[i]->fops->opendir, - loc, local->fd, NULL); - } + gf_msg_debug(this->name, 0, + "no cached subvolume for path=%s, " + "gfid = %s", + loc->path, gfid); + op_errno = EINVAL; + goto err; + } - return 0; + local->call_cnt = 1; + + STACK_WIND(frame, dht_entrylk_cbk, subvol, subvol->fops->entrylk, volume, + loc, basename, cmd, type, xdata); + + return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (rmdir, frame, -1, op_errno, - NULL, NULL, NULL); + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND(entrylk, frame, -1, op_errno, NULL); - return 0; + return 0; } -int -dht_entrylk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) +static int +dht_fentrylk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - DHT_STACK_UNWIND (entrylk, frame, op_ret, op_errno, xdata); - return 0; + DHT_STACK_UNWIND(fentrylk, frame, op_ret, op_errno, NULL); + return 0; } - int -dht_entrylk (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, const char *basename, - entrylk_cmd cmd, entrylk_type type, dict_t *xdata) -{ - xlator_t *subvol = NULL; - int op_errno = -1; - dht_local_t *local = NULL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->inode, err); - - local = dht_local_init (frame, loc, NULL, GF_FOP_ENTRYLK); - if (!local) { - op_errno = ENOMEM; - goto err; - } +dht_fentrylk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, + const char *basename, entrylk_cmd cmd, entrylk_type type, + dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + char gfid[GF_UUID_BUF_SIZE] = {0}; - subvol = local->cached_subvol; - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for path=%s", loc->path); - op_errno = EINVAL; - goto err; - } + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(fd, err); + VALIDATE_OR_GOTO(fd->inode, err); - local->call_cnt = 1; + gf_uuid_unparse(fd->inode->gfid, gfid); - STACK_WIND (frame, dht_entrylk_cbk, - subvol, subvol->fops->entrylk, - volume, loc, basename, cmd, type, xdata); + subvol = dht_subvol_get_cached(this, fd->inode); + if (!subvol) { + gf_msg_debug(this->name, 0, + "No cached subvolume for fd=%p," + " gfid = %s", + fd, gfid); + op_errno = EINVAL; + goto err; + } - return 0; + STACK_WIND(frame, dht_fentrylk_cbk, subvol, subvol->fops->fentrylk, volume, + fd, basename, cmd, type, xdata); + + return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (entrylk, frame, -1, op_errno, NULL); + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND(fentrylk, frame, -1, op_errno, NULL); - return 0; + return 0; } +static int32_t +dht_ipc_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xdata) +{ + dht_local_t *local = NULL; + int this_call_cnt = 0; -int -dht_fentrylk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) + GF_VALIDATE_OR_GOTO("dht", frame, out); + GF_VALIDATE_OR_GOTO("dht", this, out); + GF_VALIDATE_OR_GOTO("dht", frame->local, out); -{ - DHT_STACK_UNWIND (fentrylk, frame, op_ret, op_errno, NULL); - return 0; -} + local = frame->local; + + LOCK(&frame->lock); + { + if (op_ret < 0 && op_errno != ENOTCONN) { + local->op_errno = op_errno; + goto unlock; + } + local->op_ret = 0; + } +unlock: + UNLOCK(&frame->lock); + this_call_cnt = dht_frame_return(frame); + if (is_last_call(this_call_cnt)) { + DHT_STACK_UNWIND(ipc, frame, local->op_ret, local->op_errno, NULL); + } -int -dht_fentrylk (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, const char *basename, - entrylk_cmd cmd, entrylk_type type, dict_t *xdata) +out: + return 0; +} + +int32_t +dht_ipc(call_frame_t *frame, xlator_t *this, int32_t op, dict_t *xdata) { - xlator_t *subvol = NULL; - int op_errno = -1; + dht_local_t *local = NULL; + int op_errno = EINVAL; + dht_conf_t *conf = NULL; + int call_cnt = 0; + int i = 0; - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); - subvol = dht_subvol_get_cached (this, fd->inode); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for fd=%p", fd); - op_errno = EINVAL; - goto err; - } + if (op != GF_IPC_TARGET_UPCALL) + goto wind_default; - STACK_WIND (frame, dht_fentrylk_cbk, - subvol, subvol->fops->fentrylk, - volume, fd, basename, cmd, type, xdata); + VALIDATE_OR_GOTO(this->private, err); + conf = this->private; - return 0; + local = dht_local_init(frame, NULL, NULL, GF_FOP_IPC); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + call_cnt = conf->subvolume_cnt; + local->call_cnt = call_cnt; + + if (xdata) { + if (dict_set_int8(xdata, conf->xattr_name, 0) < 0) + goto err; + } + + for (i = 0; i < call_cnt; i++) { + STACK_WIND(frame, dht_ipc_cbk, conf->subvolumes[i], + conf->subvolumes[i]->fops->ipc, op, xdata); + } + + return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (fentrylk, frame, -1, op_errno, NULL); + DHT_STACK_UNWIND(ipc, frame, -1, op_errno, NULL); - return 0; -} + return 0; +wind_default: + STACK_WIND(frame, default_ipc_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->ipc, op, xdata); + return 0; +} int -dht_forget (xlator_t *this, inode_t *inode) +dht_forget(xlator_t *this, inode_t *inode) { - uint64_t ctx_int = 0; - dht_inode_ctx_t *ctx = NULL; - dht_layout_t *layout = NULL; + uint64_t ctx_int = 0; + dht_inode_ctx_t *ctx = NULL; + dht_layout_t *layout = NULL; - inode_ctx_del (inode, this, &ctx_int); + inode_ctx_del(inode, this, &ctx_int); - if (!ctx_int) - return 0; + if (!ctx_int) + return 0; - ctx = (dht_inode_ctx_t *) (long) ctx_int; + ctx = (dht_inode_ctx_t *)(long)ctx_int; - layout = ctx->layout; - ctx->layout = NULL; - dht_layout_unref (this, layout); - GF_FREE (ctx); + layout = ctx->layout; + ctx->layout = NULL; + dht_layout_unref(this, layout); + GF_FREE(ctx); - return 0; + return 0; } - int -dht_notify (xlator_t *this, int event, void *data, ...) -{ - xlator_t *subvol = NULL; - int cnt = -1; - int i = -1; - dht_conf_t *conf = NULL; - int ret = -1; - int propagate = 0; - - int had_heard_from_all = 0; - int have_heard_from_all = 0; - struct timeval time = {0,}; - gf_defrag_info_t *defrag = NULL; - dict_t *dict = NULL; - gf_defrag_type cmd = 0; - dict_t *output = NULL; - va_list ap; - - - conf = this->private; - if (!conf) - return ret; - - /* had all subvolumes reported status once till now? */ - had_heard_from_all = 1; - for (i = 0; i < conf->subvolume_cnt; i++) { - if (!conf->last_event[i]) { - had_heard_from_all = 0; - } - } - - switch (event) { +dht_notify(xlator_t *this, int event, void *data, ...) +{ + xlator_t *subvol = NULL; + int cnt = -1; + int i = -1; + dht_conf_t *conf = NULL; + int ret = -1; + int propagate = 0; + + int had_heard_from_all = 0; + int have_heard_from_all = 0; + gf_defrag_info_t *defrag = NULL; + dict_t *dict = NULL; + gf_defrag_type cmd = 0; + dict_t *output = NULL; + va_list ap; + struct gf_upcall *up_data = NULL; + struct gf_upcall_cache_invalidation *up_ci = NULL; + + conf = this->private; + GF_VALIDATE_OR_GOTO(this->name, conf, out); + + /* had all subvolumes reported status once till now? */ + had_heard_from_all = 1; + for (i = 0; i < conf->subvolume_cnt; i++) { + if (!conf->last_event[i]) { + had_heard_from_all = 0; + } + } + + switch (event) { case GF_EVENT_CHILD_UP: - subvol = data; + subvol = data; - conf->gen++; + conf->gen++; - for (i = 0; i < conf->subvolume_cnt; i++) { - if (subvol == conf->subvolumes[i]) { - cnt = i; - break; - } + for (i = 0; i < conf->subvolume_cnt; i++) { + if (subvol == conf->subvolumes[i]) { + cnt = i; + break; } + } - if (cnt == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "got GF_EVENT_CHILD_UP bad subvolume %s", - subvol->name); - break; - } + if (cnt == -1) { + gf_msg_debug(this->name, 0, + "got GF_EVENT_CHILD_UP bad " + "subvolume %s", + subvol->name); + break; + } - gettimeofday (&time, NULL); - LOCK (&conf->subvolume_lock); - { - conf->subvolume_status[cnt] = 1; - conf->last_event[cnt] = event; - conf->subvol_up_time[cnt] = time.tv_sec; - } - UNLOCK (&conf->subvolume_lock); + LOCK(&conf->subvolume_lock); + { + conf->subvolume_status[cnt] = 1; + conf->last_event[cnt] = event; + conf->subvol_up_time[cnt] = gf_time(); + } + UNLOCK(&conf->subvolume_lock); - /* one of the node came back up, do a stat update */ - dht_get_du_info_for_subvol (this, cnt); + /* one of the node came back up, do a stat update */ + dht_get_du_info_for_subvol(this, cnt); - break; + break; - case GF_EVENT_CHILD_MODIFIED: - subvol = data; + case GF_EVENT_SOME_DESCENDENT_UP: + subvol = data; + conf->gen++; + propagate = 1; - conf->gen++; - propagate = 1; + break; - break; + case GF_EVENT_SOME_DESCENDENT_DOWN: + subvol = data; + propagate = 1; - case GF_EVENT_CHILD_DOWN: - subvol = data; - - if (conf->assert_no_child_down) { - gf_log (this->name, GF_LOG_WARNING, - "Received CHILD_DOWN. Exiting"); - if (conf->defrag) { - gf_defrag_stop (conf->defrag, - GF_DEFRAG_STATUS_FAILED, NULL); - } else { - kill (getpid(), SIGTERM); - } - } + break; - for (i = 0; i < conf->subvolume_cnt; i++) { - if (subvol == conf->subvolumes[i]) { - cnt = i; - break; - } - } + case GF_EVENT_CHILD_DOWN: + subvol = data; - if (cnt == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "got GF_EVENT_CHILD_DOWN bad subvolume %s", - subvol->name); - break; + if (conf->assert_no_child_down) { + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_CHILD_DOWN, + "Received CHILD_DOWN. Exiting"); + if (conf->defrag) { + gf_defrag_stop(conf, GF_DEFRAG_STATUS_FAILED, NULL); + } else { + kill(getpid(), SIGTERM); } + } - LOCK (&conf->subvolume_lock); - { - conf->subvolume_status[cnt] = 0; - conf->last_event[cnt] = event; - conf->subvol_up_time[cnt] = 0; + for (i = 0; i < conf->subvolume_cnt; i++) { + if (subvol == conf->subvolumes[i]) { + cnt = i; + break; } - UNLOCK (&conf->subvolume_lock); + } + if (cnt == -1) { + gf_msg_debug(this->name, 0, + "got GF_EVENT_CHILD_DOWN bad " + "subvolume %s", + subvol->name); break; + } - case GF_EVENT_CHILD_CONNECTING: - subvol = data; + LOCK(&conf->subvolume_lock); + { + conf->subvolume_status[cnt] = 0; + conf->last_event[cnt] = event; + conf->subvol_up_time[cnt] = 0; + } + UNLOCK(&conf->subvolume_lock); - for (i = 0; i < conf->subvolume_cnt; i++) { - if (subvol == conf->subvolumes[i]) { - cnt = i; - break; - } - } + for (i = 0; i < conf->subvolume_cnt; i++) + if (conf->last_event[i] != event) + event = GF_EVENT_SOME_DESCENDENT_DOWN; + break; - if (cnt == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "got GF_EVENT_CHILD_CONNECTING bad subvolume %s", - subvol->name); - break; - } + case GF_EVENT_CHILD_CONNECTING: + subvol = data; - LOCK (&conf->subvolume_lock); - { - conf->last_event[cnt] = event; + for (i = 0; i < conf->subvolume_cnt; i++) { + if (subvol == conf->subvolumes[i]) { + cnt = i; + break; } - UNLOCK (&conf->subvolume_lock); + } + if (cnt == -1) { + gf_msg_debug(this->name, 0, + "got GF_EVENT_CHILD_CONNECTING" + " bad subvolume %s", + subvol->name); break; - case GF_EVENT_VOLUME_DEFRAG: - { - if (!conf->defrag) { - return ret; - } - defrag = conf->defrag; + } - dict = data; - va_start (ap, data); - output = va_arg (ap, dict_t*); + LOCK(&conf->subvolume_lock); + { + conf->last_event[cnt] = event; + } + UNLOCK(&conf->subvolume_lock); - ret = dict_get_int32 (dict, "rebalance-command", - (int32_t*)&cmd); - if (ret) - return ret; - LOCK (&defrag->lock); - { - if (defrag->is_exiting) - goto unlock; - if (cmd == GF_DEFRAG_CMD_STATUS) - gf_defrag_status_get (defrag, output); - else if (cmd == GF_DEFRAG_CMD_STOP) - gf_defrag_stop (defrag, - GF_DEFRAG_STATUS_STOPPED, output); - } -unlock: - UNLOCK (&defrag->lock); - return 0; - break; - } + break; + case GF_EVENT_VOLUME_DEFRAG: { + if (!conf->defrag) { + return ret; + } + defrag = conf->defrag; + dict = data; + va_start(ap, data); + output = va_arg(ap, dict_t *); + + ret = dict_get_int32(dict, "rebalance-command", (int32_t *)&cmd); + if (ret) { + va_end(ap); + return ret; + } + LOCK(&defrag->lock); + { + if (defrag->is_exiting) + goto unlock; + if ((cmd == GF_DEFRAG_CMD_STATUS) || + (cmd == GF_DEFRAG_CMD_DETACH_STATUS)) + gf_defrag_status_get(conf, output); + else if (cmd == GF_DEFRAG_CMD_DETACH_START) + defrag->cmd = GF_DEFRAG_CMD_DETACH_START; + else if (cmd == GF_DEFRAG_CMD_STOP || + cmd == GF_DEFRAG_CMD_DETACH_STOP) + gf_defrag_stop(conf, GF_DEFRAG_STATUS_STOPPED, output); + } + unlock: + UNLOCK(&defrag->lock); + va_end(ap); + return ret; + break; + } + case GF_EVENT_UPCALL: + up_data = (struct gf_upcall *)data; + if (up_data->event_type != GF_UPCALL_CACHE_INVALIDATION) + break; + up_ci = (struct gf_upcall_cache_invalidation *)up_data->data; + + /* Since md-cache will be aggressively filtering lookups, + * the stale layout issue will be more pronounced. Hence + * when a layout xattr is changed by the rebalance process + * notify all the md-cache clients to invalidate the existing + * stat cache and send the lookup next time*/ + if (up_ci->dict && dict_get(up_ci->dict, conf->xattr_name)) + up_ci->flags |= UP_EXPLICIT_LOOKUP; + + /* TODO: Instead of invalidating iatt, update the new + * hashed/cached subvolume in dht inode_ctx */ + if (IS_DHT_LINKFILE_MODE(&up_ci->stat)) + up_ci->flags |= UP_EXPLICIT_LOOKUP; + + propagate = 1; + break; default: - propagate = 1; + propagate = 1; + break; + } + + /* have all subvolumes reported status once by now? */ + have_heard_from_all = 1; + for (i = 0; i < conf->subvolume_cnt; i++) { + if (!conf->last_event[i]) + have_heard_from_all = 0; + } + + /* if all subvols have reported status, no need to hide anything + or wait for anything else. Just propagate blindly */ + if (have_heard_from_all) { + propagate = 1; + } + + if (!had_heard_from_all && have_heard_from_all) { + static int run_defrag = 0; + /* This is the first event which completes aggregation + of events from all subvolumes. If at least one subvol + had come up, propagate CHILD_UP, but only this time + */ + event = GF_EVENT_CHILD_DOWN; + + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->last_event[i] == GF_EVENT_CHILD_UP) { + event = GF_EVENT_CHILD_UP; break; + } + + if (conf->last_event[i] == GF_EVENT_CHILD_CONNECTING) { + event = GF_EVENT_CHILD_CONNECTING; + /* continue to check other events for CHILD_UP */ + } } + /* Rebalance is started with assert_no_child_down. So we do + * not need to handle CHILD_DOWN event here. + * + * If there is a graph switch, we should not restart the + * rebalance daemon. Use 'run_defrag' to indicate if the + * thread has already started. + */ + if (conf->defrag && !run_defrag) { + run_defrag = 1; + ret = gf_thread_create(&conf->defrag->th, NULL, gf_defrag_start, + this, "dhtdg"); + if (ret) { + GF_FREE(conf->defrag); + conf->defrag = NULL; + kill(getpid(), SIGTERM); + } + } + } + + ret = 0; + if (propagate) + ret = default_notify(this, event, data); +out: + return ret; +} - /* have all subvolumes reported status once by now? */ - have_heard_from_all = 1; - for (i = 0; i < conf->subvolume_cnt; i++) { - if (!conf->last_event[i]) - have_heard_from_all = 0; - } +int +dht_inode_ctx_layout_get(inode_t *inode, xlator_t *this, dht_layout_t **layout) +{ + dht_inode_ctx_t *ctx = NULL; + int ret = -1; - /* if all subvols have reported status, no need to hide anything - or wait for anything else. Just propagate blindly */ - if (have_heard_from_all) { - propagate = 1; + ret = dht_inode_ctx_get(inode, this, &ctx); + if (!ret && ctx) { + if (ctx->layout) { + if (layout) + *layout = ctx->layout; + ret = 0; + } else { + ret = -1; } + } + return ret; +} - if (!had_heard_from_all && have_heard_from_all) { - /* This is the first event which completes aggregation - of events from all subvolumes. If at least one subvol - had come up, propagate CHILD_UP, but only this time - */ - event = GF_EVENT_CHILD_DOWN; +void +dht_log_new_layout_for_dir_selfheal(xlator_t *this, loc_t *loc, + dht_layout_t *layout) +{ + char string[2048] = {0}; + char *output_string = NULL; + int len = 0; + int off = 0; + int i = 0; + gf_loglevel_t log_level = gf_log_get_loglevel(); + int ret = 0; + + if (log_level < GF_LOG_INFO) + return; - for (i = 0; i < conf->subvolume_cnt; i++) { - if (conf->last_event[i] == GF_EVENT_CHILD_UP) { - event = GF_EVENT_CHILD_UP; - break; - } + if (!layout) + return; - if (conf->last_event[i] == GF_EVENT_CHILD_CONNECTING) { - event = GF_EVENT_CHILD_CONNECTING; - /* continue to check other events for CHILD_UP */ - } - } + if (!layout->cnt) + return; - /* rebalance is started with assert_no_child_down. So we do - * not need to handle CHILD_DOWN event here. - */ - if (conf->defrag) { - ret = gf_thread_create (&conf->defrag->th, NULL, - gf_defrag_start, this); - if (ret) { - conf->defrag = NULL; - GF_FREE (conf->defrag); - kill (getpid(), SIGTERM); - } - } - } + if (!loc) + return; + + if (!loc->path) + return; + + ret = snprintf(string, sizeof(string), "Setting layout of %s with ", + loc->path); + + if (ret < 0) + return; + + len += ret; + + /* Calculation of total length of the string required to calloc + * output_string. Log includes subvolume-name, start-range, end-range + * and err value. + * + * This log will help to debug cases where: + * a) Different processes set different layout of a directory. + * b) Error captured in lookup, which will be filled in layout->err + * (like ENOENT, ESTALE etc) + */ + + for (i = 0; i < layout->cnt; i++) { + ret = snprintf(string, sizeof(string), + "[Subvol_name: %s, Err: %d , Start: " + "0x%x, Stop: 0x%x, Hash: 0x%x], ", + layout->list[i].xlator->name, layout->list[i].err, + layout->list[i].start, layout->list[i].stop, + layout->list[i].commit_hash); + + if (ret < 0) + return; + + len += ret; + } + + len++; + + output_string = GF_MALLOC(len + 1, gf_common_mt_char); + + if (!output_string) + return; + + ret = snprintf(output_string, len + 1, "Setting layout of %s with ", + loc->path); + + if (ret < 0) + goto err; + + off += ret; + + for (i = 0; i < layout->cnt; i++) { + ret = snprintf(output_string + off, len - off, + "[Subvol_name: %s, Err: %d , Start: " + "0x%x, Stop: 0x%x, Hash: 0x%x], ", + layout->list[i].xlator->name, layout->list[i].err, + layout->list[i].start, layout->list[i].stop, + layout->list[i].commit_hash); + + if (ret < 0) + goto err; + + off += ret; + } + + gf_msg(this->name, GF_LOG_DEBUG, 0, DHT_MSG_LOG_FIXED_LAYOUT, "%s", + output_string); +err: + GF_FREE(output_string); +} + +int32_t +dht_migration_get_dst_subvol(xlator_t *this, dht_local_t *local) +{ + int ret = -1; + + if (!local) + goto out; + + local->rebalance.target_node = dht_subvol_get_hashed(this, &local->loc); + + if (local->rebalance.target_node) ret = 0; - if (propagate) - ret = default_notify (this, event, data); - return ret; +out: + return ret; +} + +/* +This function should not be called more then once during a FOP +handling path. It is valid only for for ops on files +*/ +int32_t +dht_set_local_rebalance(xlator_t *this, dht_local_t *local, struct iatt *stbuf, + struct iatt *prebuf, struct iatt *postbuf, + dict_t *xdata) +{ + if (!local) + return -1; + + if (local->rebalance.set) { + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_REBAL_STRUCT_SET, + "local->rebalance already set"); + } + + if (stbuf) + memcpy(&local->rebalance.stbuf, stbuf, sizeof(struct iatt)); + + if (prebuf) + memcpy(&local->rebalance.prebuf, prebuf, sizeof(struct iatt)); + + if (postbuf) + memcpy(&local->rebalance.postbuf, postbuf, sizeof(struct iatt)); + + if (xdata) + local->rebalance.xdata = dict_ref(xdata); + + local->rebalance.set = 1; + + return 0; +} + +int32_t +dht_release(xlator_t *this, fd_t *fd) +{ + return dht_fd_ctx_destroy(this, fd); +} + +static int +dht_pt_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, inode_t *inode, struct iatt *stbuf, + struct iatt *preparent, struct iatt *postparent, dict_t *xdata) +{ + dht_local_t *local = NULL; + + local = frame->local; + + if (!op_ret) { + dht_layout_set(this, inode, local->layout); + } + + DHT_STACK_UNWIND(mkdir, frame, op_ret, op_errno, inode, stbuf, preparent, + postparent, NULL); + + return 0; +} + +int32_t +dht_pt_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + mode_t umask, dict_t *xdata) +{ + dht_layout_t *layout = NULL; + dht_conf_t *conf = NULL; + dht_local_t *local = NULL; + bool free_xdata = false; + int ret = 0; + int op_errno = 0; + int32_t *disk_layout_p = NULL; + + conf = this->private; + + local = dht_local_init(frame, loc, NULL, GF_FOP_MKDIR); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + layout = dht_layout_new(this, conf->subvolume_cnt); + if (!layout) + goto wind; + + local->layout = layout; + + if (!xdata) { + xdata = dict_new(); + if (!xdata) + goto wind; + free_xdata = true; + } + + /*Set the xlator or the following will crash*/ + layout->list[0].xlator = conf->subvolumes[0]; + + dht_selfheal_layout_new_directory(frame, loc, layout); + + dht_disk_layout_extract(this, layout, 0, &disk_layout_p); + + ret = dict_set_bin(xdata, conf->xattr_name, disk_layout_p, 4 * 4); + if (ret) { + gf_msg("dht", GF_LOG_DEBUG, EINVAL, DHT_MSG_DICT_SET_FAILED, + "dht layout dict set failed"); + } +wind: + STACK_WIND(frame, dht_pt_mkdir_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->mkdir, loc, mode, umask, xdata); + if (free_xdata) + dict_unref(xdata); + return 0; + +err: + op_errno = local ? local->op_errno : op_errno; + DHT_STACK_UNWIND(mkdir, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL); + + return 0; +} + +static int +dht_pt_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xattr, dict_t *xdata) +{ + dht_conf_t *conf = NULL; + + conf = this->private; + dict_del(xattr, conf->xattr_name); + dict_del(xattr, conf->mds_xattr_key); + dict_del(xattr, conf->commithash_xattr_name); + + if (frame->root->pid >= 0) { + GF_REMOVE_INTERNAL_XATTR("trusted.glusterfs.quota*", xattr); + GF_REMOVE_INTERNAL_XATTR("trusted.pgfid*", xattr); + } + + DHT_STACK_UNWIND(getxattr, frame, op_ret, op_errno, xattr, xdata); + return 0; +} + +int +dht_pt_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *key, dict_t *xdata) +{ + STACK_WIND(frame, dht_pt_getxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->getxattr, loc, key, xdata); + return 0; +} + +static int +dht_pt_fgetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xattr, dict_t *xdata) +{ + dht_conf_t *conf = NULL; + + conf = this->private; + dict_del(xattr, conf->xattr_name); + + if (frame->root->pid >= 0) { + GF_REMOVE_INTERNAL_XATTR("trusted.glusterfs.quota*", xattr); + GF_REMOVE_INTERNAL_XATTR("trusted.pgfid*", xattr); + } + + DHT_STACK_UNWIND(fgetxattr, frame, op_ret, op_errno, xattr, xdata); + return 0; } int -dht_inode_ctx_layout_get (inode_t *inode, xlator_t *this, dht_layout_t **layout) +dht_pt_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *key, + dict_t *xdata) { - dht_inode_ctx_t *ctx = NULL; - int ret = -1; + STACK_WIND(frame, dht_pt_fgetxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fgetxattr, fd, key, xdata); + return 0; +} - ret = dht_inode_ctx_get (inode, this, &ctx); +/* The job of this function is to check if all the xlators have updated + * error in the layout. */ +int +dht_dir_layout_error_check(xlator_t *this, inode_t *inode) +{ + dht_layout_t *layout = NULL; + int i = 0; - if (!ret && ctx) { - if (ctx->layout) { - if (layout) - *layout = ctx->layout; - ret = 0; - } else { - ret = -1; - } + layout = dht_layout_get(this, inode); + for (i = 0; i < layout->cnt; i++) { + if (layout->list[i].err == 0) { + return 0; } + } - return ret; + /* Returning the first xlator error as all xlators have errors */ + return layout->list[0].err; } diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h index 2ece28a616b..fe0dc3db34a 100644 --- a/xlators/cluster/dht/src/dht-common.h +++ b/xlators/cluster/dht/src/dht-common.h @@ -8,780 +8,1377 @@ cases as published by the Free Software Foundation. */ -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - #include <regex.h> -#include <signal.h> #include "dht-mem-types.h" +#include "dht-messages.h" +#include <glusterfs/call-stub.h> #include "libxlator.h" -#include "syncop.h" +#include <glusterfs/syncop.h> +#include <glusterfs/refcount.h> +#include <glusterfs/timer.h> +#include "protocol-common.h" +#include <glusterfs/glusterfs-acl.h> #ifndef _DHT_H #define _DHT_H -#define GF_XATTR_FIX_LAYOUT_KEY "distribute.fix.layout" -#define GF_DHT_LOOKUP_UNHASHED_ON 1 +#define GF_XATTR_FIX_LAYOUT_KEY "distribute.fix.layout" +#define GF_XATTR_FILE_MIGRATE_KEY "trusted.distribute.migrate-data" +#define DHT_MDS_STR "mds" +#define GF_DHT_LOOKUP_UNHASHED_OFF 0 +#define GF_DHT_LOOKUP_UNHASHED_ON 1 #define GF_DHT_LOOKUP_UNHASHED_AUTO 2 -#define DHT_PATHINFO_HEADER "DISTRIBUTE:" +#define DHT_PATHINFO_HEADER "DISTRIBUTE:" +#define DHT_FILE_MIGRATE_DOMAIN "dht.file.migrate" +/* Layout synchronization */ +#define DHT_LAYOUT_HEAL_DOMAIN "dht.layout.heal" +/* Namespace synchronization */ +#define DHT_ENTRY_SYNC_DOMAIN "dht.entry.sync" +#define DHT_LAYOUT_HASH_INVALID 1 +#define MAX_REBAL_THREADS sysconf(_SC_NPROCESSORS_ONLN) + +#define DHT_DIR_STAT_BLOCKS 8 +#define DHT_DIR_STAT_SIZE 4096 + +/* Virtual xattr for subvols status */ + +#define DHT_SUBVOL_STATUS_KEY "dht.subvol.status" + +/* Virtual xattrs for debugging */ + +#define DHT_DBG_HASHED_SUBVOL_PATTERN "dht.file.hashed-subvol.*" +#define DHT_DBG_HASHED_SUBVOL_KEY "dht.file.hashed-subvol." -#include <fnmatch.h> +/* Rebalance nodeuuid flags */ +#define REBAL_NODEUUID_MINE 0x01 -typedef int (*dht_selfheal_dir_cbk_t) (call_frame_t *frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - dict_t *xdata); -typedef int (*dht_defrag_cbk_fn_t) (xlator_t *this, call_frame_t *frame, - int ret); +typedef int (*dht_selfheal_dir_cbk_t)(call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xdata); +typedef int (*dht_defrag_cbk_fn_t)(xlator_t *this, xlator_t *dst_node, + call_frame_t *frame, int ret); +typedef int (*dht_refresh_layout_unlock)(call_frame_t *frame, xlator_t *this, + int op_ret, int invoke_cbk); + +typedef int (*dht_refresh_layout_done_handle)(call_frame_t *frame); struct dht_layout { - int spread_cnt; /* layout spread count per directory, - is controlled by 'setxattr()' with - special key */ - int cnt; - int preset; - int gen; - int type; - int ref; /* use with dht_conf_t->layout_lock */ - gf_boolean_t search_unhashed; - struct { - int err; /* 0 = normal - -1 = dir exists and no xattr - >0 = dir lookup failed with errno - */ - uint32_t start; - uint32_t stop; - xlator_t *xlator; - } list[]; + int spread_cnt; /* layout spread count per directory, + is controlled by 'setxattr()' with + special key */ + int cnt; + int preset; + /* + * The last *configuration* state for which this directory was known + * to be in balance. The corresponding vol_commit_hash changes + * whenever bricks are added or removed. This value changes when a + * (full) rebalance is complete. If they match, it's safe to assume + * that every file is where it should be and there's no need to do + * lookups for files elsewhere. If they don't, then we have to do a + * global lookup to be sure. + */ + uint32_t commit_hash; + /* + * The *runtime* state of the volume, changes when connections to + * bricks are made or lost. + */ + int gen; + int type; + gf_atomic_t ref; /* use with dht_conf_t->layout_lock */ + uint32_t search_unhashed; + struct { + int err; /* 0 = normal + -1 = dir exists and no xattr + >0 = dir lookup failed with errno + */ + uint32_t start; + uint32_t stop; + uint32_t commit_hash; + xlator_t *xlator; + } list[]; }; -typedef struct dht_layout dht_layout_t; +typedef struct dht_layout dht_layout_t; struct dht_stat_time { - uint32_t atime; - uint32_t atime_nsec; - uint32_t ctime; - uint32_t ctime_nsec; - uint32_t mtime; - uint32_t mtime_nsec; + uint32_t atime; + uint32_t atime_nsec; + uint32_t ctime; + uint32_t ctime_nsec; + uint32_t mtime; + uint32_t mtime_nsec; }; typedef struct dht_stat_time dht_stat_time_t; struct dht_inode_ctx { - dht_layout_t *layout; - dht_stat_time_t time; + dht_layout_t *layout; + dht_stat_time_t time; + xlator_t *lock_subvol; + xlator_t *mds_subvol; /* This is only used for directories */ }; typedef struct dht_inode_ctx dht_inode_ctx_t; - typedef enum { - DHT_HASH_TYPE_DM, - DHT_HASH_TYPE_DM_USER, + DHT_HASH_TYPE_DM, + DHT_HASH_TYPE_DM_USER, } dht_hashfn_type_t; +typedef enum { + DHT_INODELK, + DHT_ENTRYLK, +} dht_lock_type_t; + /* rebalance related */ struct dht_rebalance_ { - xlator_t *from_subvol; - xlator_t *target_node; - off_t offset; - size_t size; - int32_t flags; - int count; - struct iobref *iobref; - struct iovec *vector; - struct iatt stbuf; - dht_defrag_cbk_fn_t target_op_fn; - dict_t *xdata; + xlator_t *from_subvol; + xlator_t *target_node; + off_t offset; + size_t size; + int32_t flags; + int count; + struct iobref *iobref; + struct iovec *vector; + struct iatt stbuf; + struct iatt prebuf; + struct iatt postbuf; + dht_defrag_cbk_fn_t target_op_fn; + dict_t *xdata; + dict_t *xattr; + dict_t *dict; + struct gf_flock flock; + int32_t set; + int lock_cmd; }; -struct dht_local { - int call_cnt; - loc_t loc; - loc_t loc2; - int op_ret; - int op_errno; - int layout_mismatch; - /* Use stbuf as the postbuf, when we require both - * pre and post attrs */ - struct iatt stbuf; - struct iatt prebuf; - struct iatt preoldparent; - struct iatt postoldparent; - struct iatt preparent; - struct iatt postparent; - struct statvfs statvfs; - fd_t *fd; - inode_t *inode; - dict_t *params; - dict_t *xattr; - dict_t *xattr_req; - dht_layout_t *layout; - size_t size; - ino_t ia_ino; - xlator_t *src_hashed, *src_cached; - xlator_t *dst_hashed, *dst_cached; - xlator_t *cached_subvol; - xlator_t *hashed_subvol; - char need_selfheal; - int file_count; - int dir_count; - call_frame_t *main_frame; - int fop_succeeded; - struct { - fop_mknod_cbk_t linkfile_cbk; - struct iatt stbuf; - loc_t loc; - inode_t *inode; - dict_t *xattr; - xlator_t *srcvol; - } linkfile; - struct { - uint32_t hole_cnt; - uint32_t overlaps_cnt; - uint32_t down; - uint32_t misc; - dht_selfheal_dir_cbk_t dir_cbk; - dht_layout_t *layout; - } selfheal; - uint32_t uid; - uint32_t gid; - - /* needed by nufa */ - int32_t flags; - mode_t mode; - dev_t rdev; - mode_t umask; - - /* need for file-info */ - char *xattr_val; - char *key; - - /* which xattr request? */ - char xsel[256]; - int32_t alloc_len; - - char *newpath; - - /* gfid related */ - uuid_t gfid; - - /*Marker Related*/ - struct marker_str marker; - - /* flag used to make sure we need to return estale in - {lookup,revalidate}_cbk */ - char return_estale; - char need_lookup_everywhere; - - glusterfs_fop_t fop; +/** + * Enum to store decided action based on the qdstatfs (quota-deem-statfs) + * events + **/ +typedef enum { + qdstatfs_action_OFF = 0, + qdstatfs_action_REPLACE, + qdstatfs_action_NEGLECT, + qdstatfs_action_COMPARE, +} qdstatfs_action_t; - gf_boolean_t linked; - xlator_t *link_subvol; +typedef enum { + REACTION_INVALID, + FAIL_ON_ANY_ERROR, + IGNORE_ENOENT_ESTALE, + IGNORE_ENOENT_ESTALE_EIO, +} dht_reaction_type_t; + +struct dht_skip_linkto_unlink { + xlator_t *hash_links_to; + uuid_t cached_gfid; + uuid_t hashed_gfid; + int opend_fd_count; + gf_boolean_t handle_valid_link; +}; - struct dht_rebalance_ rebalance; - xlator_t *first_up_subvol; +typedef struct { + xlator_t *xl; + loc_t loc; /* contains/points to inode to lock on. */ + char *domain; /* Only locks within a single domain + * contend with each other + */ + char *basename; /* Required for entrylk */ + gf_boolean_t locked; + dht_reaction_type_t do_on_failure; + short type; /* read/write lock. */ + gf_lkowner_t lk_owner; +} dht_lock_t; + +/* The lock structure represents inodelk. */ +typedef struct { + fop_inodelk_cbk_t inodelk_cbk; + dht_lock_t **locks; + int lk_count; + dht_reaction_type_t reaction; + + /* whether locking failed on _any_ of the "locks" above */ + int op_ret; + int op_errno; +} dht_ilock_wrap_t; + +/* The lock structure represents entrylk. */ +typedef struct { + fop_entrylk_cbk_t entrylk_cbk; + dht_lock_t **locks; + int lk_count; + dht_reaction_type_t reaction; + + /* whether locking failed on _any_ of the "locks" above */ + int op_ret; + int op_errno; +} dht_elock_wrap_t; + +/* The first member of dht_dir_transaction_t should be of type dht_ilock_wrap_t. + * Otherwise it can result in subtle memory corruption issues as in most of the + * places we use lock[0].layout.my_layout or lock[0].layout.parent_layout and + * lock[0].ns.parent_layout (like in dht_local_wipe). + */ +typedef union { + union { + dht_ilock_wrap_t my_layout; + dht_ilock_wrap_t parent_layout; + } layout; + struct dht_namespace { + dht_ilock_wrap_t parent_layout; + dht_elock_wrap_t directory_ns; + fop_entrylk_cbk_t ns_cbk; + } ns; +} dht_dir_transaction_t; + +typedef int (*dht_selfheal_layout_t)(call_frame_t *frame, loc_t *loc, + dht_layout_t *layout); + +typedef gf_boolean_t (*dht_need_heal_t)(call_frame_t *frame, + dht_layout_t **inmem, + dht_layout_t **ondisk); +struct dht_local { + loc_t loc; + loc_t loc2; + int call_cnt; + int op_ret; + int op_errno; + int layout_mismatch; + /* Use stbuf as the postbuf, when we require both + * pre and post attrs */ + struct iatt stbuf; + struct iatt mds_stbuf; + struct iatt prebuf; + struct iatt preoldparent; + struct iatt postoldparent; + struct iatt preparent; + struct iatt postparent; + struct statvfs statvfs; + fd_t *fd; + inode_t *inode; + dict_t *params; + dict_t *xattr; + dict_t *mds_xattr; + dict_t *xdata; /* dict used to save xdata response by xattr fop */ + dict_t *xattr_req; + dht_layout_t *layout; + size_t size; + ino_t ia_ino; + xlator_t *src_hashed, *src_cached; + xlator_t *dst_hashed, *dst_cached; + xlator_t *cached_subvol; + xlator_t *hashed_subvol; + xlator_t *mds_subvol; /* This is use for dir only */ + int file_count; + int dir_count; + call_frame_t *main_frame; + int fop_succeeded; + struct { + fop_mknod_cbk_t linkfile_cbk; + struct iatt stbuf; + loc_t loc; + inode_t *inode; + dict_t *xattr; + xlator_t *srcvol; + } linkfile; + struct { + uint32_t hole_cnt; + uint32_t overlaps_cnt; + uint32_t down; + uint32_t misc; + dht_selfheal_dir_cbk_t dir_cbk; + dht_selfheal_layout_t healer; + dht_need_heal_t should_heal; + dht_layout_t *layout, *refreshed_layout; + uint32_t missing_cnt; + gf_boolean_t force_mkdir; + } selfheal; + + dht_refresh_layout_unlock refresh_layout_unlock; + dht_refresh_layout_done_handle refresh_layout_done; + + uint32_t uid; + uint32_t gid; + pid_t pid; + + glusterfs_fop_t fop; + + /* need for file-info */ + char *xattr_val; + char *key; + + /* needed by nufa */ + int32_t flags; + mode_t mode; + dev_t rdev; + mode_t umask; + + /* which xattr request? */ + char xsel[256]; + int32_t alloc_len; + + /* gfid related */ + uuid_t gfid; + uuid_t gfid_req; + + xlator_t *link_subvol; + + struct dht_rebalance_ rebalance; + xlator_t *first_up_subvol; + + struct dht_skip_linkto_unlink skip_unlink; + + dht_dir_transaction_t lock[2], *current; + + /* inodelks during filerename for backward compatibility */ + dht_lock_t **rename_inodelk_backward_compatible; + + call_stub_t *stub; + int32_t parent_disk_layout[4]; + + /* rename rollback */ + int *ret_cache; + + loc_t loc2_copy; + + int rename_inodelk_bc_count; + /* This is use only for directory operation */ + int32_t valid; + int32_t mds_heal_fresh_lookup; + short lock_type; + char need_selfheal; + char need_xattr_heal; + char need_attrheal; + /* flag used to make sure we need to return estale in + {lookup,revalidate}_cbk */ + char return_estale; + char need_lookup_everywhere; + /* fd open check */ + gf_boolean_t fd_checked; + gf_boolean_t linked; + gf_boolean_t added_link; + gf_boolean_t is_linkfile; + gf_boolean_t quota_deem_statfs; + gf_boolean_t heal_layout; + gf_boolean_t locked; + gf_boolean_t dont_create_linkto; + gf_boolean_t gfid_missing; }; typedef struct dht_local dht_local_t; /* du - disk-usage */ struct dht_du { - double avail_percent; - double avail_inodes; - uint64_t avail_space; - uint32_t log; + double avail_percent; + double avail_inodes; + uint64_t avail_space; + uint32_t log; + uint32_t chunks; + uint32_t total_blocks; + uint32_t avail_blocks; + uint32_t frsize; /*fragment size*/ }; typedef struct dht_du dht_du_t; enum gf_defrag_type { - GF_DEFRAG_CMD_START = 1, - GF_DEFRAG_CMD_STOP = 1 + 1, - GF_DEFRAG_CMD_STATUS = 1 + 2, - GF_DEFRAG_CMD_START_LAYOUT_FIX = 1 + 3, - GF_DEFRAG_CMD_START_FORCE = 1 + 4, + GF_DEFRAG_CMD_NONE = 0, + GF_DEFRAG_CMD_START = 1, + GF_DEFRAG_CMD_STOP = 1 + 1, + GF_DEFRAG_CMD_STATUS = 1 + 2, + GF_DEFRAG_CMD_START_LAYOUT_FIX = 1 + 3, + GF_DEFRAG_CMD_START_FORCE = 1 + 4, + GF_DEFRAG_CMD_DETACH_STATUS = 1 + 11, + GF_DEFRAG_CMD_DETACH_START = 1 + 13, + GF_DEFRAG_CMD_DETACH_COMMIT = 1 + 14, + GF_DEFRAG_CMD_DETACH_COMMIT_FORCE = 1 + 15, + GF_DEFRAG_CMD_DETACH_STOP = 1 + 16, + /* new labels are used so it will help + * while removing old labels by easily differentiating. + * A few labels are added so that the count remains same + * between this enum and the ones on the xdr file. + * different values for the same enum cause errors and + * confusion. + */ }; typedef enum gf_defrag_type gf_defrag_type; enum gf_defrag_status_t { - GF_DEFRAG_STATUS_NOT_STARTED, - GF_DEFRAG_STATUS_STARTED, - GF_DEFRAG_STATUS_STOPPED, - GF_DEFRAG_STATUS_COMPLETE, - GF_DEFRAG_STATUS_FAILED, - GF_DEFRAG_STATUS_LAYOUT_FIX_STARTED, - GF_DEFRAG_STATUS_LAYOUT_FIX_STOPPED, - GF_DEFRAG_STATUS_LAYOUT_FIX_COMPLETE, - GF_DEFRAG_STATUS_LAYOUT_FIX_FAILED, + GF_DEFRAG_STATUS_NOT_STARTED, + GF_DEFRAG_STATUS_STARTED, + GF_DEFRAG_STATUS_STOPPED, + GF_DEFRAG_STATUS_COMPLETE, + GF_DEFRAG_STATUS_FAILED, + GF_DEFRAG_STATUS_LAYOUT_FIX_STARTED, + GF_DEFRAG_STATUS_LAYOUT_FIX_STOPPED, + GF_DEFRAG_STATUS_LAYOUT_FIX_COMPLETE, + GF_DEFRAG_STATUS_LAYOUT_FIX_FAILED, }; typedef enum gf_defrag_status_t gf_defrag_status_t; typedef struct gf_defrag_pattern_list gf_defrag_pattern_list_t; struct gf_defrag_pattern_list { - char path_pattern[256]; - uint64_t size; - gf_defrag_pattern_list_t *next; + char path_pattern[256]; + uint64_t size; + gf_defrag_pattern_list_t *next; }; +struct dht_container { + union { + struct list_head list; + struct { + struct _gf_dirent_t *next; + struct _gf_dirent_t *prev; + }; + }; + gf_dirent_t *df_entry; + xlator_t *this; + loc_t *parent_loc; + dict_t *migrate_data; + int local_subvol_index; +}; + +typedef struct nodeuuid_info { + char info; /* Set to 1 is this is my node's uuid*/ + uuid_t uuid; /* Store the nodeuuid as well for debugging*/ +} nodeuuid_info_t; + +typedef struct subvol_nodeuuids_info { + nodeuuid_info_t *elements; + int count; +} subvol_nodeuuids_info_t; + struct gf_defrag_info_ { - uint64_t total_files; - uint64_t total_data; - uint64_t num_files_lookedup; - uint64_t total_failures; - uint64_t skipped; - gf_lock_t lock; - int cmd; - pthread_t th; - gf_defrag_status_t defrag_status; - struct rpc_clnt *rpc; - uint32_t connected; - uint32_t is_exiting; - pid_t pid; - inode_t *root_inode; - uuid_t node_uuid; - struct timeval start_time; - gf_boolean_t stats; - gf_defrag_pattern_list_t *defrag_pattern; + uint64_t total_files; + uint64_t total_data; + uint64_t num_files_lookedup; + uint64_t total_failures; + uint64_t skipped; + uint64_t num_dirs_processed; + uint64_t size_processed; + gf_lock_t lock; + pthread_t th; + struct rpc_clnt *rpc; + uint32_t connected; + uint32_t is_exiting; + pid_t pid; + int cmd; + inode_t *root_inode; + uuid_t node_uuid; + time_t start_time; + uint32_t new_commit_hash; + gf_defrag_status_t defrag_status; + gf_defrag_pattern_list_t *defrag_pattern; + + pthread_cond_t parallel_migration_cond; + pthread_mutex_t dfq_mutex; + pthread_cond_t rebalance_crawler_alarm; + int32_t q_entry_count; + int32_t global_error; + struct dht_container *queue; + int32_t crawl_done; + int32_t abort; + int32_t wakeup_crawler; + + /*Throttle params*/ + /*stands for reconfigured thread count*/ + int32_t recon_thread_count; + pthread_cond_t df_wakeup_thread; + + /* backpointer to make it easier to write functions for rebalance */ + xlator_t *this; + + pthread_cond_t fc_wakeup_cond; + pthread_mutex_t fc_mutex; + + /*stands for current running thread count*/ + int32_t current_thread_count; + + gf_boolean_t stats; + /* lock migration flag */ + gf_boolean_t lock_migration_enabled; }; typedef struct gf_defrag_info_ gf_defrag_info_t; +struct dht_methods_s { + int32_t (*migration_get_dst_subvol)(xlator_t *this, dht_local_t *local); + int32_t (*migration_other)(xlator_t *this, gf_defrag_info_t *defrag); + xlator_t *(*layout_search)(xlator_t *this, dht_layout_t *layout, + const char *name); +}; + +typedef struct dht_methods_s dht_methods_t; + struct dht_conf { - gf_lock_t subvolume_lock; - int subvolume_cnt; - xlator_t **subvolumes; - char *subvolume_status; - int *last_event; - dht_layout_t **file_layouts; - dht_layout_t **dir_layouts; - gf_boolean_t search_unhashed; - int gen; - dht_du_t *du_stats; - double min_free_disk; - double min_free_inodes; - char disk_unit; - int32_t refresh_interval; - gf_boolean_t unhashed_sticky_bit; - struct timeval last_stat_fetch; - gf_lock_t layout_lock; - void *private; /* Can be used by wrapper xlators over - dht */ - gf_boolean_t use_readdirp; - char vol_uuid[UUID_SIZE + 1]; - gf_boolean_t assert_no_child_down; - time_t *subvol_up_time; - - /* This is the count used as the distribute layout for a directory */ - /* Will be a global flag to control the layout spread count */ - uint32_t dir_spread_cnt; - - /* to keep track of nodes which are decomissioned */ - xlator_t **decommissioned_bricks; - int decommission_in_progress; - int decommission_subvols_cnt; - - /* defrag related */ - gf_defrag_info_t *defrag; - - /* Request to filter directory entries in readdir request */ - - gf_boolean_t readdir_optimize; - - /* Support regex-based name reinterpretation. */ - regex_t rsync_regex; - gf_boolean_t rsync_regex_valid; - regex_t extra_regex; - gf_boolean_t extra_regex_valid; - - /* Support variable xattr names. */ - char *xattr_name; - char *link_xattr_name; - char *wild_xattr_name; + xlator_t **subvolumes; + char *subvolume_status; + int *last_event; + dht_layout_t **file_layouts; + dht_layout_t **dir_layouts; + unsigned int search_unhashed; + int gen; + dht_du_t *du_stats; + double min_free_disk; + double min_free_inodes; + int subvolume_cnt; + int32_t refresh_interval; + gf_lock_t subvolume_lock; + time_t last_stat_fetch; + gf_lock_t layout_lock; + dict_t *leaf_to_subvol; + void *private; /* Can be used by wrapper xlators over + dht */ + time_t *subvol_up_time; + + /* to keep track of nodes which are decommissioned */ + xlator_t **decommissioned_bricks; + int decommission_in_progress; + int decommission_subvols_cnt; + + /* defrag related */ + gf_defrag_info_t *defrag; + + /* Support regex-based name reinterpretation. */ + regex_t rsync_regex; + regex_t extra_regex; + + /* Support variable xattr names. */ + char *xattr_name; + char *mds_xattr_key; + char *link_xattr_name; + char *commithash_xattr_name; + char *wild_xattr_name; + + dht_methods_t methods; + + struct mem_pool *lock_pool; + + /*local subvol storage for rebalance*/ + xlator_t **local_subvols; + subvol_nodeuuids_info_t *local_nodeuuids; + int32_t local_subvols_cnt; + + int dthrottle; + + /* Hard link handle requirement for migration triggered from client*/ + synclock_t link_lock; + + /* lock migration */ + gf_lock_t lock; + + /* This is the count used as the distribute layout for a directory */ + /* Will be a global flag to control the layout spread count */ + uint32_t dir_spread_cnt; + + /* + * "Commit hash" for this volume topology. Changed whenever bricks + * are added or removed. + */ + uint32_t vol_commit_hash; + + char vol_uuid[UUID_SIZE + 1]; + + char disk_unit; + + gf_boolean_t lock_migration_enabled; + + gf_boolean_t vch_forced; + + gf_boolean_t use_fallocate; + + gf_boolean_t force_migration; + + gf_boolean_t lookup_optimize; + + gf_boolean_t unhashed_sticky_bit; + + gf_boolean_t assert_no_child_down; + + gf_boolean_t use_readdirp; + + /* Request to filter directory entries in readdir request */ + gf_boolean_t readdir_optimize; + + gf_boolean_t rsync_regex_valid; + + gf_boolean_t extra_regex_valid; + + /* Support size-weighted rebalancing (heterogeneous bricks). */ + gf_boolean_t do_weighting; + + gf_boolean_t randomize_by_gfid; }; typedef struct dht_conf dht_conf_t; +struct dht_dfoffset_ctx { + xlator_t *this; + off_t offset; + int32_t readdir_done; +}; +typedef struct dht_dfoffset_ctx dht_dfoffset_ctx_t; struct dht_disk_layout { - uint32_t cnt; - uint32_t type; - struct { - uint32_t start; - uint32_t stop; - } list[1]; + uint32_t cnt; + uint32_t type; + struct { + uint32_t start; + uint32_t stop; + } list[1]; }; typedef struct dht_disk_layout dht_disk_layout_t; typedef enum { - GF_DHT_MIGRATE_DATA, - GF_DHT_MIGRATE_DATA_EVEN_IF_LINK_EXISTS, - GF_DHT_MIGRATE_HARDLINK, - GF_DHT_MIGRATE_HARDLINK_IN_PROGRESS + GF_DHT_MIGRATE_DATA, + GF_DHT_MIGRATE_DATA_EVEN_IF_LINK_EXISTS, + GF_DHT_MIGRATE_HARDLINK, + GF_DHT_MIGRATE_HARDLINK_IN_PROGRESS } gf_dht_migrate_data_type_t; +typedef enum { + GF_DHT_EQUAL_DISTRIBUTION, + GF_DHT_WEIGHTED_DISTRIBUTION +} dht_distribution_type_t; + +struct dir_dfmeta { + gf_dirent_t *equeue; + dht_dfoffset_ctx_t *offset_var; + struct list_head **head; + struct list_head **iterator; + int *fetch_entries; + /* fds corresponding to local subvols only */ + fd_t **lfd; +}; + +typedef struct dht_migrate_info { + xlator_t *src_subvol; + xlator_t *dst_subvol; + GF_REF_DECL; +} dht_migrate_info_t; + +typedef struct dht_fd_ctx { + uint64_t opened_on_dst; + GF_REF_DECL; +} dht_fd_ctx_t; + #define ENTRY_MISSING(op_ret, op_errno) (op_ret == -1 && op_errno == ENOENT) -#define is_revalidate(loc) (dht_inode_ctx_layout_get (loc->inode, this, NULL) == 0) +#define is_revalidate(loc) \ + (dht_inode_ctx_layout_get((loc)->inode, this, NULL) == 0) #define is_last_call(cnt) (cnt == 0) #define DHT_MIGRATION_IN_PROGRESS 1 -#define DHT_MIGRATION_COMPLETED 2 +#define DHT_MIGRATION_COMPLETED 2 -#define check_is_linkfile(i,s,x,n) (IS_DHT_LINKFILE_MODE (s) && dict_get (x, n)) +#define check_is_linkfile(i, s, x, n) \ + (IS_DHT_LINKFILE_MODE(s) && dict_get(x, n)) -#define IS_DHT_MIGRATION_PHASE2(buf) ( \ - IA_ISREG ((buf)->ia_type) && \ - ((st_mode_from_ia ((buf)->ia_prot, (buf)->ia_type) & \ - ~S_IFMT) == DHT_LINKFILE_MODE)) +#define IS_DHT_MIGRATION_PHASE2(buf) \ + (IA_ISREG((buf)->ia_type) && \ + ((st_mode_from_ia((buf)->ia_prot, (buf)->ia_type) & ~S_IFMT) == \ + DHT_LINKFILE_MODE)) -#define IS_DHT_MIGRATION_PHASE1(buf) ( \ - IA_ISREG ((buf)->ia_type) && \ - ((buf)->ia_prot.sticky == 1) && \ - ((buf)->ia_prot.sgid == 1)) +#define IS_DHT_MIGRATION_PHASE1(buf) \ + (IA_ISREG((buf)->ia_type) && ((buf)->ia_prot.sticky == 1) && \ + ((buf)->ia_prot.sgid == 1)) -#define DHT_STRIP_PHASE1_FLAGS(buf) do { \ - if ((buf) && IS_DHT_MIGRATION_PHASE1(buf)) { \ - (buf)->ia_prot.sticky = 0; \ - (buf)->ia_prot.sgid = 0; \ - } \ - } while (0) +#define DHT_STRIP_PHASE1_FLAGS(buf) \ + do { \ + if ((buf) && IS_DHT_MIGRATION_PHASE1(buf)) { \ + (buf)->ia_prot.sticky = 0; \ + (buf)->ia_prot.sgid = 0; \ + } \ + } while (0) #define dht_inode_missing(op_errno) (op_errno == ENOENT || op_errno == ESTALE) -#define check_is_dir(i,s,x) (IA_ISDIR(s->ia_type)) +#define check_is_dir(i, s, x) (IA_ISDIR(s->ia_type)) #define layout_is_sane(layout) ((layout) && (layout->cnt > 0)) -#define DHT_STACK_UNWIND(fop, frame, params ...) do { \ - dht_local_t *__local = NULL; \ - xlator_t *__xl = NULL; \ - if (frame) { \ - __xl = frame->this; \ - __local = frame->local; \ - frame->local = NULL; \ - } \ - STACK_UNWIND_STRICT (fop, frame, params); \ - dht_local_wipe (__xl, __local); \ - } while (0) - -#define DHT_STACK_DESTROY(frame) do { \ - dht_local_t *__local = NULL; \ - xlator_t *__xl = NULL; \ - __xl = frame->this; \ - __local = frame->local; \ - frame->local = NULL; \ - STACK_DESTROY (frame->root); \ - dht_local_wipe (__xl, __local); \ - } while (0) - -#define DHT_UPDATE_TIME(ctx_sec, ctx_nsec, new_sec, new_nsec, inode, post) do {\ - int32_t sec = 0; \ - sec = new_sec; \ - LOCK (&inode->lock); \ - { \ - new_sec = max(new_sec, ctx_sec); \ - if (sec < new_sec) \ - new_nsec = ctx_nsec; \ - if (sec == new_sec) \ - new_nsec = max (new_nsec, ctx_nsec); \ - if (post) { \ - ctx_sec = new_sec; \ - ctx_nsec = new_nsec; \ - } \ - } \ - UNLOCK (&inode->lock); \ - } while (0) - -#define is_greater_time(a, an, b, bn) (((a) < (b)) || (((a) == (b)) && ((an) < (bn)))) -dht_layout_t *dht_layout_new (xlator_t *this, int cnt); -dht_layout_t *dht_layout_get (xlator_t *this, inode_t *inode); -dht_layout_t *dht_layout_for_subvol (xlator_t *this, xlator_t *subvol); -xlator_t *dht_layout_search (xlator_t *this, dht_layout_t *layout, - const char *name); -int dht_layout_normalize (xlator_t *this, loc_t *loc, dht_layout_t *layout); -int dht_layout_anomalies (xlator_t *this, loc_t *loc, dht_layout_t *layout, - uint32_t *holes_p, uint32_t *overlaps_p, - uint32_t *missing_p, uint32_t *down_p, - uint32_t *misc_p, uint32_t *no_space_p); -int dht_layout_dir_mismatch (xlator_t *this, dht_layout_t *layout, - xlator_t *subvol, loc_t *loc, dict_t *xattr); - -xlator_t *dht_linkfile_subvol (xlator_t *this, inode_t *inode, - struct iatt *buf, dict_t *xattr); -int dht_linkfile_unlink (call_frame_t *frame, xlator_t *this, - xlator_t *subvol, loc_t *loc); - -int dht_layouts_init (xlator_t *this, dht_conf_t *conf); -int dht_layout_merge (xlator_t *this, dht_layout_t *layout, xlator_t *subvol, - int op_ret, int op_errno, dict_t *xattr); - -int dht_disk_layout_extract (xlator_t *this, dht_layout_t *layout, - int pos, int32_t **disk_layout_p); -int dht_disk_layout_merge (xlator_t *this, dht_layout_t *layout, - int pos, void *disk_layout_raw, int disk_layout_len); - - -int dht_frame_return (call_frame_t *frame); - -int dht_itransform (xlator_t *this, xlator_t *subvol, uint64_t x, uint64_t *y); -int dht_deitransform (xlator_t *this, uint64_t y, xlator_t **subvol, - uint64_t *x); - -void dht_local_wipe (xlator_t *this, dht_local_t *local); -dht_local_t *dht_local_init (call_frame_t *frame, loc_t *loc, fd_t *fd, - glusterfs_fop_t fop); -int dht_iatt_merge (xlator_t *this, struct iatt *to, struct iatt *from, - xlator_t *subvol); - -xlator_t *dht_subvol_get_hashed (xlator_t *this, loc_t *loc); -xlator_t *dht_subvol_get_cached (xlator_t *this, inode_t *inode); -xlator_t *dht_subvol_next (xlator_t *this, xlator_t *prev); -xlator_t *dht_subvol_next_available (xlator_t *this, xlator_t *prev); -int dht_subvol_cnt (xlator_t *this, xlator_t *subvol); - -int dht_hash_compute (xlator_t *this, int type, const char *name, uint32_t *hash_p); - -int dht_linkfile_create (call_frame_t *frame, fop_mknod_cbk_t linkfile_cbk, - xlator_t *this, xlator_t *tovol, - xlator_t *fromvol, loc_t *loc); -int dht_lookup_directory (call_frame_t *frame, xlator_t *this, loc_t *loc); -int dht_lookup_everywhere (call_frame_t *frame, xlator_t *this, loc_t *loc); -int -dht_selfheal_directory (call_frame_t *frame, dht_selfheal_dir_cbk_t cbk, - loc_t *loc, dht_layout_t *layout); -int -dht_selfheal_new_directory (call_frame_t *frame, dht_selfheal_dir_cbk_t cbk, - dht_layout_t *layout); -int -dht_selfheal_restore (call_frame_t *frame, dht_selfheal_dir_cbk_t cbk, - loc_t *loc, dht_layout_t *layout); -int -dht_layout_sort_volname (dht_layout_t *layout); - -int dht_get_du_info (call_frame_t *frame, xlator_t *this, loc_t *loc); - -gf_boolean_t dht_is_subvol_filled (xlator_t *this, xlator_t *subvol); -xlator_t *dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol, - dht_local_t *layout); -int dht_get_du_info_for_subvol (xlator_t *this, int subvol_idx); - -int dht_layout_preset (xlator_t *this, xlator_t *subvol, inode_t *inode); -int dht_layout_set (xlator_t *this, inode_t *inode, dht_layout_t *layout);; -void dht_layout_unref (xlator_t *this, dht_layout_t *layout); -dht_layout_t *dht_layout_ref (xlator_t *this, dht_layout_t *layout); -xlator_t *dht_first_up_subvol (xlator_t *this); -xlator_t *dht_last_up_subvol (xlator_t *this); - -int dht_build_child_loc (xlator_t *this, loc_t *child, loc_t *parent, char *name); - -int dht_filter_loc_subvol_key (xlator_t *this, loc_t *loc, loc_t *new_loc, - xlator_t **subvol); - -int dht_rename_cleanup (call_frame_t *frame); -int dht_rename_links_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *stbuf, - struct iatt *preparent, struct iatt *postparent, - dict_t *xdata); - -int dht_fix_directory_layout (call_frame_t *frame, - dht_selfheal_dir_cbk_t dir_cbk, - dht_layout_t *layout); - -int dht_init_subvolumes (xlator_t *this, dht_conf_t *conf); +#define we_are_not_migrating(x) ((x) == 1) + +#define DHT_STACK_UNWIND(fop, frame, params...) \ + do { \ + dht_local_t *__local = NULL; \ + xlator_t *__xl = NULL; \ + if (frame) { \ + __xl = frame->this; \ + __local = frame->local; \ + frame->local = NULL; \ + } \ + STACK_UNWIND_STRICT(fop, frame, params); \ + dht_local_wipe(__xl, __local); \ + } while (0) + +#define DHT_STACK_DESTROY(frame) \ + do { \ + dht_local_t *__local = NULL; \ + xlator_t *__xl = NULL; \ + __xl = frame->this; \ + __local = frame->local; \ + frame->local = NULL; \ + STACK_DESTROY(frame->root); \ + dht_local_wipe(__xl, __local); \ + } while (0) + +#define DHT_UPDATE_TIME(ctx_sec, ctx_nsec, new_sec, new_nsec, post) \ + do { \ + if (ctx_sec == new_sec) \ + new_nsec = max(new_nsec, ctx_nsec); \ + else if (ctx_sec > new_sec) { \ + new_sec = ctx_sec; \ + new_nsec = ctx_nsec; \ + } \ + if (post) { \ + ctx_sec = new_sec; \ + ctx_nsec = new_nsec; \ + } \ + } while (0) + +#define is_greater_time(a, an, b, bn) \ + (((a) < (b)) || (((a) == (b)) && ((an) < (bn)))) + +#define DHT_MARK_FOP_INTERNAL(xattr) \ + do { \ + int tmp = -1; \ + if (!xattr) { \ + xattr = dict_new(); \ + if (!xattr) \ + break; \ + } \ + tmp = dict_set_str(xattr, GLUSTERFS_INTERNAL_FOP_KEY, "yes"); \ + if (tmp) { \ + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, \ + "Failed to set dictionary value: key = %s," \ + " path = %s", \ + GLUSTERFS_INTERNAL_FOP_KEY, local->loc.path); \ + } \ + } while (0) + +dht_layout_t * +dht_layout_new(xlator_t *this, int cnt); +dht_layout_t * +dht_layout_get(xlator_t *this, inode_t *inode); +dht_layout_t * +dht_layout_for_subvol(xlator_t *this, xlator_t *subvol); +xlator_t * +dht_layout_search(xlator_t *this, dht_layout_t *layout, const char *name); +int32_t +dht_migration_get_dst_subvol(xlator_t *this, dht_local_t *local); +int32_t +dht_migration_needed(xlator_t *this); +int +dht_layout_normalize(xlator_t *this, loc_t *loc, dht_layout_t *layout); +void +dht_layout_anomalies(xlator_t *this, loc_t *loc, dht_layout_t *layout, + uint32_t *holes_p, uint32_t *overlaps_p, + uint32_t *missing_p, uint32_t *down_p, uint32_t *misc_p, + uint32_t *no_space_p); +int +dht_layout_dir_mismatch(xlator_t *this, dht_layout_t *layout, xlator_t *subvol, + loc_t *loc, dict_t *xattr); +xlator_t * +dht_linkfile_subvol(xlator_t *this, inode_t *inode, struct iatt *buf, + dict_t *xattr); +int +dht_linkfile_unlink(call_frame_t *frame, xlator_t *this, xlator_t *subvol, + loc_t *loc); + +int +dht_layouts_init(xlator_t *this, dht_conf_t *conf); +int +dht_layout_merge(xlator_t *this, dht_layout_t *layout, xlator_t *subvol, + int op_ret, int op_errno, dict_t *xattr); + +int +dht_disk_layout_extract(xlator_t *this, dht_layout_t *layout, int pos, + int32_t **disk_layout_p); +int +dht_disk_layout_extract_for_subvol(xlator_t *this, dht_layout_t *layout, + xlator_t *subvol, int32_t **disk_layout_p); + +int +dht_frame_return(call_frame_t *frame); + +int +dht_deitransform(xlator_t *this, uint64_t y, xlator_t **subvol); + +void +dht_local_wipe(xlator_t *this, dht_local_t *local); +dht_local_t * +dht_local_init(call_frame_t *frame, loc_t *loc, fd_t *fd, glusterfs_fop_t fop); +int +dht_iatt_merge(xlator_t *this, struct iatt *to, struct iatt *from); + +xlator_t * +dht_subvol_get_hashed(xlator_t *this, loc_t *loc); +xlator_t * +dht_subvol_get_cached(xlator_t *this, inode_t *inode); +xlator_t * +dht_subvol_next(xlator_t *this, xlator_t *prev); +xlator_t * +dht_subvol_next_available(xlator_t *this, xlator_t *prev); +int +dht_subvol_cnt(xlator_t *this, xlator_t *subvol); + +int +dht_hash_compute(xlator_t *this, int type, const char *name, uint32_t *hash_p); + +int +dht_linkfile_create(call_frame_t *frame, fop_mknod_cbk_t linkfile_cbk, + xlator_t *this, xlator_t *tovol, xlator_t *fromvol, + loc_t *loc); +int +dht_lookup_everywhere(call_frame_t *frame, xlator_t *this, loc_t *loc); +int +dht_selfheal_directory(call_frame_t *frame, dht_selfheal_dir_cbk_t cbk, + loc_t *loc, dht_layout_t *layout); +int +dht_selfheal_new_directory(call_frame_t *frame, dht_selfheal_dir_cbk_t cbk, + dht_layout_t *layout); +int +dht_selfheal_restore(call_frame_t *frame, dht_selfheal_dir_cbk_t cbk, + loc_t *loc, dht_layout_t *layout); +void +dht_layout_sort_volname(dht_layout_t *layout); + +int +dht_get_du_info(call_frame_t *frame, xlator_t *this, loc_t *loc); + +gf_boolean_t +dht_is_subvol_filled(xlator_t *this, xlator_t *subvol); +xlator_t * +dht_free_disk_available_subvol(xlator_t *this, xlator_t *subvol, + dht_local_t *layout); +int +dht_get_du_info_for_subvol(xlator_t *this, int subvol_idx); + +int +dht_layout_preset(xlator_t *this, xlator_t *subvol, inode_t *inode); +int +dht_layout_set(xlator_t *this, inode_t *inode, dht_layout_t *layout); +; +void +dht_layout_unref(xlator_t *this, dht_layout_t *layout); +dht_layout_t * +dht_layout_ref(xlator_t *this, dht_layout_t *layout); +int +dht_layout_index_for_subvol(dht_layout_t *layout, xlator_t *subvol); +xlator_t * +dht_first_up_subvol(xlator_t *this); +xlator_t * +dht_last_up_subvol(xlator_t *this); + +int +dht_build_child_loc(xlator_t *this, loc_t *child, loc_t *parent, char *name); + +int +dht_filter_loc_subvol_key(xlator_t *this, loc_t *loc, loc_t *new_loc, + xlator_t **subvol); + +int +dht_rename_cleanup(call_frame_t *frame); +int +dht_rename_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *stbuf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata); + +int +dht_update_commit_hash_for_layout(call_frame_t *frame); +int +dht_fix_directory_layout(call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk, + dht_layout_t *layout); + +int +dht_init_subvolumes(xlator_t *this, dht_conf_t *conf); /* migration/rebalance */ -int dht_start_rebalance_task (xlator_t *this, call_frame_t *frame); +int +dht_start_rebalance_task(xlator_t *this, call_frame_t *frame); -int dht_rebalance_in_progress_check (xlator_t *this, call_frame_t *frame); -int dht_rebalance_complete_check (xlator_t *this, call_frame_t *frame); +int +dht_rebalance_in_progress_check(xlator_t *this, call_frame_t *frame); +int +dht_rebalance_complete_check(xlator_t *this, call_frame_t *frame); +int +dht_init_local_subvolumes(xlator_t *this, dht_conf_t *conf); /* FOPS */ -int32_t dht_lookup (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - dict_t *xattr_req); - -int32_t dht_stat (call_frame_t *frame, - xlator_t *this, - loc_t *loc, dict_t *xdata); - -int32_t dht_fstat (call_frame_t *frame, - xlator_t *this, - fd_t *fd, dict_t *xdata); - -int32_t dht_truncate (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - off_t offset, dict_t *xdata); - -int32_t dht_ftruncate (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - off_t offset, dict_t *xdata); - -int32_t dht_access (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - int32_t mask, dict_t *xdata); - -int32_t dht_readlink (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - size_t size, dict_t *xdata); - -int32_t dht_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, - mode_t mode, dev_t rdev, mode_t umask, dict_t *xdata); - -int32_t dht_mkdir (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, mode_t umask, dict_t *xdata); - -int32_t dht_unlink (call_frame_t *frame, - xlator_t *this, - loc_t *loc, int xflag, dict_t *xdata); - -int32_t dht_rmdir (call_frame_t *frame, xlator_t *this, - loc_t *loc, int flags, dict_t *xdata); - -int32_t dht_symlink (call_frame_t *frame, xlator_t *this, - const char *linkpath, loc_t *loc, mode_t umask, - dict_t *xdata); - -int32_t dht_rename (call_frame_t *frame, - xlator_t *this, - loc_t *oldloc, - loc_t *newloc, dict_t *xdata); - -int32_t dht_link (call_frame_t *frame, - xlator_t *this, - loc_t *oldloc, - loc_t *newloc, dict_t *xdata); - -int32_t dht_create (call_frame_t *frame, xlator_t *this, - loc_t *loc, int32_t flags, mode_t mode, - mode_t umask, fd_t *fd, dict_t *params); - -int32_t dht_open (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - int32_t flags, fd_t *fd, dict_t *xdata); - -int32_t dht_readv (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - size_t size, - off_t offset, uint32_t flags, dict_t *xdata); - -int32_t dht_writev (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - struct iovec *vector, - int32_t count, - off_t offset, - uint32_t flags, - struct iobref *iobref, dict_t *xdata); - -int32_t dht_flush (call_frame_t *frame, - xlator_t *this, - fd_t *fd, dict_t *xdata); - -int32_t dht_fsync (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - int32_t datasync, dict_t *xdata); - -int32_t dht_opendir (call_frame_t *frame, - xlator_t *this, - loc_t *loc, fd_t *fd, dict_t *xdata); - -int32_t dht_fsyncdir (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - int32_t datasync, dict_t *xdata); - -int32_t dht_statfs (call_frame_t *frame, - xlator_t *this, - loc_t *loc, dict_t *xdata); - -int32_t dht_setxattr (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - dict_t *dict, - int32_t flags, dict_t *xdata); - -int32_t dht_getxattr (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - const char *name, dict_t *xdata); - -int32_t dht_fsetxattr (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - dict_t *dict, - int32_t flags, dict_t *xdata); - -int32_t dht_fgetxattr (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - const char *name, dict_t *xdata); - -int32_t dht_removexattr (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - const char *name, dict_t *xdata); -int32_t dht_fremovexattr (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - const char *name, dict_t *xdata); - -int32_t dht_lk (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - int32_t cmd, - struct gf_flock *flock, dict_t *xdata); - -int32_t dht_inodelk (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, int32_t cmd, - struct gf_flock *flock, dict_t *xdata); - -int32_t dht_finodelk (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, int32_t cmd, - struct gf_flock *flock, dict_t *xdata); - -int32_t dht_entrylk (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, const char *basename, - entrylk_cmd cmd, entrylk_type type, dict_t *xdata); - -int32_t dht_fentrylk (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, const char *basename, - entrylk_cmd cmd, entrylk_type type, dict_t *xdata); - -int32_t dht_readdir (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - size_t size, off_t off, dict_t *xdata); - -int32_t dht_readdirp (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - size_t size, off_t off, dict_t *dict); - -int32_t dht_xattrop (call_frame_t *frame, - xlator_t *this, - loc_t *loc, - gf_xattrop_flags_t flags, - dict_t *dict, dict_t *xdata); - -int32_t dht_fxattrop (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - gf_xattrop_flags_t flags, - dict_t *dict, dict_t *xdata); - -int32_t dht_forget (xlator_t *this, inode_t *inode); -int32_t dht_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, - struct iatt *stbuf, int32_t valid, dict_t *xdata); -int32_t dht_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, - struct iatt *stbuf, int32_t valid, dict_t *xdata); -int32_t dht_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, - int32_t mode, off_t offset, size_t len, dict_t *xdata); -int32_t dht_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, - off_t offset, size_t len, dict_t *xdata); -int32_t dht_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, - off_t offset, off_t len, dict_t *xdata); - -int32_t dht_init (xlator_t *this); -void dht_fini (xlator_t *this); -int dht_reconfigure (xlator_t *this, dict_t *options); -int32_t dht_notify (xlator_t *this, int32_t event, void *data, ...); +int32_t +dht_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req); + +int32_t +dht_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata); + +int32_t +dht_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata); + +int32_t +dht_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + dict_t *xdata); + +int32_t +dht_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata); + +int32_t +dht_access(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask, + dict_t *xdata); + +int32_t +dht_readlink(call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size, + dict_t *xdata); + +int32_t +dht_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + dev_t rdev, mode_t umask, dict_t *xdata); + +int32_t +dht_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + mode_t umask, dict_t *xdata); + +int32_t +dht_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, + dict_t *xdata); + +int32_t +dht_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, + dict_t *xdata); + +int32_t +dht_symlink(call_frame_t *frame, xlator_t *this, const char *linkpath, + loc_t *loc, mode_t umask, dict_t *xdata); + +int32_t +dht_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata); + +int32_t +dht_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata); + +int32_t +dht_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + mode_t mode, mode_t umask, fd_t *fd, dict_t *params); + +int32_t +dht_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + fd_t *fd, dict_t *xdata); + +int32_t +dht_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata); + +int32_t +dht_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, + int32_t count, off_t offset, uint32_t flags, struct iobref *iobref, + dict_t *xdata); + +int32_t +dht_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata); + +int32_t +dht_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, + dict_t *xdata); + +int32_t +dht_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, + dict_t *xdata); + +int32_t +dht_fsyncdir(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, + dict_t *xdata); + +int32_t +dht_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata); + +int32_t +dht_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, + int32_t flags, dict_t *xdata); + +int32_t +dht_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, const char *name, + dict_t *xdata); + +int32_t +dht_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict, + int32_t flags, dict_t *xdata); + +int32_t +dht_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name, + dict_t *xdata); + +int32_t +dht_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata); +int32_t +dht_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd, + const char *name, dict_t *xdata); + +int32_t +dht_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd, + struct gf_flock *flock, dict_t *xdata); + +int32_t +dht_lease(call_frame_t *frame, xlator_t *this, loc_t *loc, + struct gf_lease *lease, dict_t *xdata); + +int32_t +dht_inodelk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc, + int32_t cmd, struct gf_flock *flock, dict_t *xdata); + +int32_t +dht_finodelk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, + int32_t cmd, struct gf_flock *flock, dict_t *xdata); + +int32_t +dht_entrylk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc, + const char *basename, entrylk_cmd cmd, entrylk_type type, + dict_t *xdata); + +int32_t +dht_fentrylk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, + const char *basename, entrylk_cmd cmd, entrylk_type type, + dict_t *xdata); + +int32_t +dht_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t off, dict_t *xdata); + +int32_t +dht_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t off, dict_t *dict); + +int32_t +dht_xattrop(call_frame_t *frame, xlator_t *this, loc_t *loc, + gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata); + +int32_t +dht_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd, + gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata); + +int32_t +dht_forget(xlator_t *this, inode_t *inode); +int32_t +dht_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf, + int32_t valid, dict_t *xdata); +int32_t +dht_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *stbuf, + int32_t valid, dict_t *xdata); +int32_t +dht_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, + off_t offset, size_t len, dict_t *xdata); +int32_t +dht_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata); +int32_t +dht_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + off_t len, dict_t *xdata); +int32_t +dht_ipc(call_frame_t *frame, xlator_t *this, int32_t op, dict_t *xdata); + +int +dht_set_subvol_range(xlator_t *this); +int32_t +dht_init(xlator_t *this); +void +dht_fini(xlator_t *this); +int +dht_reconfigure(xlator_t *this, dict_t *options); +int32_t +dht_notify(xlator_t *this, int32_t event, void *data, ...); /* definitions for nufa/switch */ -int dht_revalidate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, inode_t *inode, - struct iatt *stbuf, dict_t *xattr, - struct iatt *postparent); -int dht_lookup_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +int +dht_revalidate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf, + dict_t *xattr, struct iatt *postparent); +int +dht_lookup_dir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf, + dict_t *xattr, struct iatt *postparent); +int +dht_lookup_linkfile_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf, dict_t *xattr, struct iatt *postparent); -int dht_lookup_linkfile_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int op_ret, int op_errno, - inode_t *inode, struct iatt *stbuf, dict_t *xattr, - struct iatt *postparent); -int dht_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, - inode_t *inode, struct iatt *stbuf, dict_t *xattr, - struct iatt *postparent); -int dht_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, - fd_t *fd, inode_t *inode, struct iatt *stbuf, - struct iatt *preparent, struct iatt *postparent, - dict_t *xdata); -int dht_newfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, - inode_t *inode, struct iatt *stbuf, struct iatt *preparent, - struct iatt *postparent, dict_t *xdata); +int +dht_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, inode_t *inode, struct iatt *stbuf, dict_t *xattr, + struct iatt *postparent); +int +dht_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, fd_t *fd, inode_t *inode, struct iatt *stbuf, + struct iatt *preparent, struct iatt *postparent, dict_t *xdata); +int +dht_newfile_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, inode_t *inode, struct iatt *stbuf, + struct iatt *preparent, struct iatt *postparent, dict_t *xdata); int -gf_defrag_status_get (gf_defrag_info_t *defrag, dict_t *dict); +dht_finodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata); + +int +dht_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, dict_t *xattr, dict_t *xdata); + +int +dht_common_xattrop_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata); +int +gf_defrag_status_get(dht_conf_t *conf, dict_t *dict); int -gf_defrag_stop (gf_defrag_info_t *defrag, gf_defrag_status_t status, - dict_t *output); +gf_defrag_stop(dht_conf_t *conf, gf_defrag_status_t status, dict_t *output); -void* -gf_defrag_start (void *this); +void * +gf_defrag_start(void *this); int32_t -gf_defrag_handle_hardlink (xlator_t *this, loc_t *loc, dict_t *xattrs, - struct iatt *stbuf); +gf_defrag_handle_hardlink(xlator_t *this, loc_t *loc, int *fop_errno); int -dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, - int flag); +dht_migrate_file(xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, + int flag, int *fop_errno); int -dht_inode_ctx_layout_get (inode_t *inode, xlator_t *this, - dht_layout_t **layout_int); +dht_inode_ctx_layout_get(inode_t *inode, xlator_t *this, + dht_layout_t **layout_int); int -dht_inode_ctx_layout_set (inode_t *inode, xlator_t *this, - dht_layout_t* layout_int); +dht_inode_ctx_layout_set(inode_t *inode, xlator_t *this, + dht_layout_t *layout_int); int -dht_inode_ctx_time_update (inode_t *inode, xlator_t *this, struct iatt *stat, - int32_t update_ctx); -void dht_inode_ctx_time_set (inode_t *inode, xlator_t *this, struct iatt *stat); +dht_inode_ctx_time_update(inode_t *inode, xlator_t *this, struct iatt *stat, + int32_t update_ctx); +void +dht_inode_ctx_time_set(inode_t *inode, xlator_t *this, struct iatt *stat); -int dht_inode_ctx_get (inode_t *inode, xlator_t *this, dht_inode_ctx_t **ctx); -int dht_inode_ctx_set (inode_t *inode, xlator_t *this, dht_inode_ctx_t *ctx); int -dht_dir_attr_heal (void *data); +dht_inode_ctx_get(inode_t *inode, xlator_t *this, dht_inode_ctx_t **ctx); int -dht_dir_attr_heal_done (int ret, call_frame_t *sync_frame, void *data); +dht_inode_ctx_set(inode_t *inode, xlator_t *this, dht_inode_ctx_t *ctx); int -dht_dir_has_layout (dict_t *xattr, char *name); -gf_boolean_t -dht_is_subvol_in_layout (dht_layout_t *layout, xlator_t *xlator); +dht_dir_attr_heal(void *data); +int +dht_dir_attr_heal_done(int ret, call_frame_t *sync_frame, void *data); xlator_t * -dht_subvol_with_free_space_inodes (xlator_t *this, xlator_t *subvol, - dht_layout_t *layout); +dht_subvol_with_free_space_inodes(xlator_t *this, xlator_t *subvol, + xlator_t *ignore, dht_layout_t *layout, + uint64_t filesize); xlator_t * -dht_subvol_maxspace_nonzeroinode (xlator_t *this, xlator_t *subvol, - dht_layout_t *layout); +dht_subvol_maxspace_nonzeroinode(xlator_t *this, xlator_t *subvol, + dht_layout_t *layout); int -dht_linkfile_attr_heal (call_frame_t *frame, xlator_t *this); +dht_dir_has_layout(dict_t *xattr, char *name); +int +dht_linkfile_attr_heal(call_frame_t *frame, xlator_t *this); + +int32_t +dht_priv_dump(xlator_t *this); +int32_t +dht_inodectx_dump(xlator_t *this, inode_t *inode); + +gf_boolean_t +dht_is_subvol_in_layout(dht_layout_t *layout, xlator_t *xlator); + +int +dht_inode_ctx_get_mig_info(xlator_t *this, inode_t *inode, + xlator_t **src_subvol, xlator_t **dst_subvol); +gf_boolean_t +dht_mig_info_is_invalid(xlator_t *current, xlator_t *src_subvol, + xlator_t *dst_subvol); + +int +dht_subvol_status(dht_conf_t *conf, xlator_t *subvol); void -dht_layout_dump (dht_layout_t *layout, const char *prefix); +dht_log_new_layout_for_dir_selfheal(xlator_t *this, loc_t *loc, + dht_layout_t *layout); + +int +dht_layout_sort(dht_layout_t *layout); + +int +dht_heal_full_path(void *data); + +int +dht_heal_full_path_done(int op_ret, call_frame_t *frame, void *data); + +int +dht_layout_missing_dirs(dht_layout_t *layout); + +int +dht_refresh_layout(call_frame_t *frame); + +int +dht_build_parent_loc(xlator_t *this, loc_t *parent, loc_t *child, + int32_t *op_errno); + +int32_t +dht_set_local_rebalance(xlator_t *this, dht_local_t *local, struct iatt *stbuf, + struct iatt *prebuf, struct iatt *postbuf, + dict_t *xdata); +void +dht_build_root_loc(inode_t *inode, loc_t *loc); + +gf_boolean_t +dht_fd_open_on_dst(xlator_t *this, fd_t *fd, xlator_t *dst); + int32_t -dht_priv_dump (xlator_t *this); +dht_fd_ctx_destroy(xlator_t *this, fd_t *fd); + +int32_t +dht_release(xlator_t *this, fd_t *fd); + int32_t -dht_inodectx_dump (xlator_t *this, inode_t *inode); +dht_set_fixed_dir_stat(struct iatt *stat); + +xlator_t * +dht_get_lock_subvolume(xlator_t *this, struct gf_flock *lock, + dht_local_t *local); + +int +dht_lk_inode_unref(call_frame_t *frame, int32_t op_ret); + +int +dht_fd_ctx_set(xlator_t *this, fd_t *fd, xlator_t *subvol); + +int +dht_check_and_open_fd_on_subvol(xlator_t *this, call_frame_t *frame); + +/* FD fop callbacks */ + +int +dht_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct iatt *prebuf, struct iatt *postbuf, + dict_t *xdata); + +int +dht_flush_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, dict_t *xdata); + +int +dht_file_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata); + +int +dht_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct iatt *prebuf, struct iatt *postbuf, + dict_t *xdata); + +int +dht_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct iatt *prebuf, struct iatt *postbuf, + dict_t *xdata); + +int +dht_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct iatt *prebuf, struct iatt *postbuf, + dict_t *xdata); + +int +dht_truncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct iatt *prebuf, struct iatt *postbuf, + dict_t *xdata); + +int +dht_fsync_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct iatt *prebuf, struct iatt *postbuf, + dict_t *xdata); + +int +dht_readv_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct iovec *vector, int count, struct iatt *stbuf, + struct iobref *iobref, dict_t *xdata); + +int +dht_file_attr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct iatt *stbuf, dict_t *xdata); int -dht_inode_ctx_get1 (xlator_t *this, inode_t *inode, xlator_t **subvol); +dht_file_removexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata); -#endif/* _DHT_H */ +int +dht_file_setxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata); + +/* All custom xattr heal functions */ +int +dht_dir_heal_xattrs(void *data); + +int +dht_dir_heal_xattrs_done(int ret, call_frame_t *sync_frame, void *data); + +int32_t +dht_dict_set_array(dict_t *dict, char *key, int32_t value[], int32_t size); + +int +dht_set_user_xattr(dict_t *dict, char *k, data_t *v, void *data); + +void +dht_dir_set_heal_xattr(xlator_t *this, dht_local_t *local, dict_t *dst, + dict_t *src, int *uret, int *uflag); + +int +dht_dir_xattr_heal(xlator_t *this, dht_local_t *local, int *op_errno); + +int +dht_common_mark_mdsxattr(call_frame_t *frame, int *errst, int flag); + +int +dht_inode_ctx_mdsvol_get(inode_t *inode, xlator_t *this, xlator_t **mdsvol); + +int +dht_selfheal_dir_setattr(call_frame_t *frame, loc_t *loc, struct iatt *stbuf, + int32_t valid, dht_layout_t *layout); + +/* Abstract out the DHT-IATT-IN-DICT */ + +void +dht_selfheal_layout_new_directory(call_frame_t *frame, loc_t *loc, + dht_layout_t *new_layout); + +int +dht_pt_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *key, + dict_t *xdata); + +int +dht_pt_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *key, dict_t *xdata); + +int32_t +dht_pt_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + mode_t umask, dict_t *xdata); + +int +dht_pt_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata); + +int32_t +dht_check_remote_fd_failed_error(dht_local_t *local, int op_ret, int op_errno); + +int +dht_common_xattrop_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata); + +int32_t +dht_create_lock(call_frame_t *frame, xlator_t *subvol); + +int +dht_set_parent_layout_in_dict(loc_t *loc, xlator_t *this, dht_local_t *local); + +int +dht_dir_layout_error_check(xlator_t *this, inode_t *inode); + +int +dht_inode_ctx_mdsvol_set(inode_t *inode, xlator_t *this, xlator_t *mds_subvol); +#endif /* _DHT_H */ diff --git a/xlators/cluster/dht/src/dht-diskusage.c b/xlators/cluster/dht/src/dht-diskusage.c index fe3955ecbb7..c0588828fdb 100644 --- a/xlators/cluster/dht/src/dht-diskusage.c +++ b/xlators/cluster/dht/src/dht-diskusage.c @@ -8,407 +8,480 @@ cases as published by the Free Software Foundation. */ - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - /* TODO: add NS locking */ -#include "glusterfs.h" -#include "xlator.h" #include "dht-common.h" -#include "defaults.h" #include <sys/time.h> - +#include <glusterfs/events.h> int -dht_du_info_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, struct statvfs *statvfs, - dict_t *xdata) +dht_du_info_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct statvfs *statvfs, dict_t *xdata) { - dht_conf_t *conf = NULL; - call_frame_t *prev = NULL; - int this_call_cnt = 0; - int i = 0; - double percent = 0; - double percent_inodes = 0; - uint64_t bytes = 0; - - conf = this->private; - prev = cookie; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "failed to get disk info from %s", prev->this->name); - goto out; - } - - if (statvfs && statvfs->f_blocks) { - percent = (statvfs->f_bavail * 100) / statvfs->f_blocks; - bytes = (statvfs->f_bavail * statvfs->f_frsize); - } - - if (statvfs && statvfs->f_files) { - percent_inodes = (statvfs->f_ffree * 100) / statvfs->f_files; - } else { - /* set percent inodes to 100 for dynamically allocated inode filesystems - this logic holds good so that, distribute has nothing to worry about - total inodes rather let the 'create()' to be scheduled on the hashed - subvol regardless of the total inodes. since we have no awareness on - loosing inodes this logic fits well - */ - percent_inodes = 100; - } - - LOCK (&conf->subvolume_lock); - { - for (i = 0; i < conf->subvolume_cnt; i++) - if (prev->this == conf->subvolumes[i]) { - conf->du_stats[i].avail_percent = percent; - conf->du_stats[i].avail_space = bytes; - conf->du_stats[i].avail_inodes = percent_inodes; - gf_log (this->name, GF_LOG_DEBUG, - "on subvolume '%s': avail_percent is: " - "%.2f and avail_space is: %"PRIu64" " - "and avail_inodes is: %.2f", - prev->this->name, - conf->du_stats[i].avail_percent, - conf->du_stats[i].avail_space, - conf->du_stats[i].avail_inodes); - } - } - UNLOCK (&conf->subvolume_lock); + dht_conf_t *conf = NULL; + xlator_t *prev = NULL; + int this_call_cnt = 0; + int i = 0; + double percent = 0; + double percent_inodes = 0; + uint64_t bytes = 0; + uint32_t bpc; /* blocks per chunk */ + uint32_t chunks = 0; + + conf = this->private; + prev = cookie; + + if (op_ret == -1 || !statvfs) { + gf_msg(this->name, GF_LOG_WARNING, op_errno, + DHT_MSG_GET_DISK_INFO_ERROR, "failed to get disk info from %s", + prev->name); + goto out; + } + + if (statvfs->f_blocks) { + percent = (statvfs->f_bavail * 100) / statvfs->f_blocks; + bytes = (statvfs->f_bavail * statvfs->f_frsize); + /* + * A 32-bit count of 1MB chunks allows a maximum brick size of + * ~4PB. It's possible that we could see a single local FS + * bigger than that some day, but this code is likely to be + * irrelevant by then. Meanwhile, it's more important to keep + * the chunk size small so the layout-calculation code that + * uses this value can be tested on normal machines. + */ + bpc = (1 << 20) / statvfs->f_bsize; + chunks = (statvfs->f_blocks + bpc - 1) / bpc; + } + + if (statvfs->f_files) { + percent_inodes = (statvfs->f_ffree * 100) / statvfs->f_files; + } else { + /* + * Set percent inodes to 100 for dynamically allocated inode + * filesystems. The rationale is that distribute need not + * worry about total inodes; rather, let the 'create()' be + * scheduled on the hashed subvol regardless of the total + * inodes. + */ + percent_inodes = 100; + } + + LOCK(&conf->subvolume_lock); + { + for (i = 0; i < conf->subvolume_cnt; i++) + if (prev == conf->subvolumes[i]) { + conf->du_stats[i].avail_percent = percent; + conf->du_stats[i].avail_space = bytes; + conf->du_stats[i].avail_inodes = percent_inodes; + conf->du_stats[i].chunks = chunks; + conf->du_stats[i].total_blocks = statvfs->f_blocks; + conf->du_stats[i].avail_blocks = statvfs->f_bavail; + conf->du_stats[i].frsize = statvfs->f_frsize; + + gf_msg_debug(this->name, 0, + "subvolume '%s': avail_percent " + "is: %.2f and avail_space " + "is: %" PRIu64 + " and avail_inodes" + " is: %.2f", + prev->name, conf->du_stats[i].avail_percent, + conf->du_stats[i].avail_space, + conf->du_stats[i].avail_inodes); + break; /* no point in looping further */ + } + } + UNLOCK(&conf->subvolume_lock); out: - this_call_cnt = dht_frame_return (frame); - if (is_last_call (this_call_cnt)) - DHT_STACK_DESTROY (frame); + this_call_cnt = dht_frame_return(frame); + if (is_last_call(this_call_cnt)) + DHT_STACK_DESTROY(frame); - return 0; + return 0; } int -dht_get_du_info_for_subvol (xlator_t *this, int subvol_idx) +dht_get_du_info_for_subvol(xlator_t *this, int subvol_idx) { - dht_conf_t *conf = NULL; - call_frame_t *statfs_frame = NULL; - dht_local_t *statfs_local = NULL; - call_pool_t *pool = NULL; - loc_t tmp_loc = {0,}; - - conf = this->private; - pool = this->ctx->pool; - - statfs_frame = create_frame (this, pool); - if (!statfs_frame) { - goto err; - } - - /* local->fop value is not used in this case */ - statfs_local = dht_local_init (statfs_frame, NULL, NULL, - GF_FOP_MAXVALUE); - if (!statfs_local) { - goto err; - } - - /* make it root gfid, should be enough to get the proper info back */ - tmp_loc.gfid[15] = 1; - - statfs_local->call_cnt = 1; - STACK_WIND (statfs_frame, dht_du_info_cbk, - conf->subvolumes[subvol_idx], - conf->subvolumes[subvol_idx]->fops->statfs, - &tmp_loc, NULL); - - return 0; + dht_conf_t *conf = NULL; + call_frame_t *statfs_frame = NULL; + dht_local_t *statfs_local = NULL; + call_pool_t *pool = NULL; + loc_t tmp_loc = { + 0, + }; + + conf = this->private; + pool = this->ctx->pool; + + statfs_frame = create_frame(this, pool); + if (!statfs_frame) { + goto err; + } + + /* local->fop value is not used in this case */ + statfs_local = dht_local_init(statfs_frame, NULL, NULL, GF_FOP_MAXVALUE); + if (!statfs_local) { + goto err; + } + + /* make it root gfid, should be enough to get the proper info back */ + tmp_loc.gfid[15] = 1; + + statfs_local->call_cnt = 1; + STACK_WIND_COOKIE( + statfs_frame, dht_du_info_cbk, conf->subvolumes[subvol_idx], + conf->subvolumes[subvol_idx], + conf->subvolumes[subvol_idx]->fops->statfs, &tmp_loc, NULL); + + return 0; err: - if (statfs_frame) - DHT_STACK_DESTROY (statfs_frame); + if (statfs_frame) + DHT_STACK_DESTROY(statfs_frame); - return -1; + return -1; } int -dht_get_du_info (call_frame_t *frame, xlator_t *this, loc_t *loc) +dht_get_du_info(call_frame_t *frame, xlator_t *this, loc_t *loc) { - int i = 0; - dht_conf_t *conf = NULL; - call_frame_t *statfs_frame = NULL; - dht_local_t *statfs_local = NULL; - struct timeval tv = {0,}; - loc_t tmp_loc = {0,}; - - conf = this->private; - - gettimeofday (&tv, NULL); - - /* make it root gfid, should be enough to get the proper - info back */ - tmp_loc.gfid[15] = 1; - - if (tv.tv_sec > (conf->refresh_interval - + conf->last_stat_fetch.tv_sec)) { - - statfs_frame = copy_frame (frame); - if (!statfs_frame) { - goto err; - } - - /* In this case, 'local->fop' is not used */ - statfs_local = dht_local_init (statfs_frame, loc, NULL, - GF_FOP_MAXVALUE); - if (!statfs_local) { - goto err; - } - - statfs_local->call_cnt = conf->subvolume_cnt; - for (i = 0; i < conf->subvolume_cnt; i++) { - STACK_WIND (statfs_frame, dht_du_info_cbk, - conf->subvolumes[i], - conf->subvolumes[i]->fops->statfs, - &tmp_loc, NULL); - } - - conf->last_stat_fetch.tv_sec = tv.tv_sec; - } - return 0; + int i = 0; + int ret = -1; + dht_conf_t *conf = NULL; + call_frame_t *statfs_frame = NULL; + dht_local_t *statfs_local = NULL; + loc_t tmp_loc = { + 0, + }; + time_t now; + + conf = this->private; + now = gf_time(); + /* make it root gfid, should be enough to get the proper + info back */ + tmp_loc.gfid[15] = 1; + + if (now > (conf->refresh_interval + conf->last_stat_fetch)) { + statfs_frame = copy_frame(frame); + if (!statfs_frame) { + goto err; + } + + /* In this case, 'local->fop' is not used */ + statfs_local = dht_local_init(statfs_frame, loc, NULL, GF_FOP_MAXVALUE); + if (!statfs_local) { + goto err; + } + + statfs_local->params = dict_new(); + if (!statfs_local->params) + goto err; + + ret = dict_set_int8(statfs_local->params, + GF_INTERNAL_IGNORE_DEEM_STATFS, 1); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, + "Failed to set " GF_INTERNAL_IGNORE_DEEM_STATFS " in dict"); + goto err; + } + + statfs_local->call_cnt = conf->subvolume_cnt; + for (i = 0; i < conf->subvolume_cnt; i++) { + STACK_WIND_COOKIE(statfs_frame, dht_du_info_cbk, + conf->subvolumes[i], conf->subvolumes[i], + conf->subvolumes[i]->fops->statfs, &tmp_loc, + statfs_local->params); + } + + conf->last_stat_fetch = now; + } + return 0; err: - if (statfs_frame) - DHT_STACK_DESTROY (statfs_frame); + if (statfs_frame) + DHT_STACK_DESTROY(statfs_frame); - return -1; + return -1; } - gf_boolean_t -dht_is_subvol_filled (xlator_t *this, xlator_t *subvol) +dht_is_subvol_filled(xlator_t *this, xlator_t *subvol) { - int i = 0; - dht_conf_t *conf = NULL; - gf_boolean_t subvol_filled_inodes = _gf_false; - gf_boolean_t subvol_filled_space = _gf_false; - gf_boolean_t is_subvol_filled = _gf_false; - - conf = this->private; - - /* Check for values above specified percent or free disk */ - LOCK (&conf->subvolume_lock); - { - for (i = 0; i < conf->subvolume_cnt; i++) { - if (subvol == conf->subvolumes[i]) { - if (conf->disk_unit == 'p') { - if (conf->du_stats[i].avail_percent < - conf->min_free_disk) { - subvol_filled_space = _gf_true; - break; - } - - } else { - if (conf->du_stats[i].avail_space < - conf->min_free_disk) { - subvol_filled_space = _gf_true; - break; - } - } - if (conf->du_stats[i].avail_inodes < - conf->min_free_inodes) { - subvol_filled_inodes = _gf_true; - break; - } - } - } - } - UNLOCK (&conf->subvolume_lock); - - if (subvol_filled_space && conf->subvolume_status[i]) { - if (!(conf->du_stats[i].log++ % (GF_UNIVERSAL_ANSWER * 10))) { - gf_log (this->name, GF_LOG_WARNING, - "disk space on subvolume '%s' is getting " - "full (%.2f %%), consider adding more nodes", - subvol->name, - (100 - conf->du_stats[i].avail_percent)); - } - } - - if (subvol_filled_inodes && conf->subvolume_status[i]) { - if (!(conf->du_stats[i].log++ % (GF_UNIVERSAL_ANSWER * 10))) { - gf_log (this->name, GF_LOG_CRITICAL, - "inodes on subvolume '%s' are at " - "(%.2f %%), consider adding more nodes", - subvol->name, - (100 - conf->du_stats[i].avail_inodes)); - } - } - - is_subvol_filled = (subvol_filled_space || subvol_filled_inodes); - - return is_subvol_filled; -} + int i = 0; + char vol_name[256]; + dht_conf_t *conf = NULL; + gf_boolean_t subvol_filled_inodes = _gf_false; + gf_boolean_t subvol_filled_space = _gf_false; + gf_boolean_t is_subvol_filled = _gf_false; + double usage = 0; + + conf = this->private; + + /* Check for values above specified percent or free disk */ + LOCK(&conf->subvolume_lock); + { + for (i = 0; i < conf->subvolume_cnt; i++) { + if (subvol == conf->subvolumes[i]) { + if (conf->disk_unit == 'p') { + if (conf->du_stats[i].avail_percent < conf->min_free_disk) { + subvol_filled_space = _gf_true; + break; + } + + } else { + if (conf->du_stats[i].avail_space < conf->min_free_disk) { + subvol_filled_space = _gf_true; + break; + } + } + if (conf->du_stats[i].avail_inodes < conf->min_free_inodes) { + subvol_filled_inodes = _gf_true; + break; + } + } + } + } + UNLOCK(&conf->subvolume_lock); + if (subvol_filled_space && conf->subvolume_status[i]) { + if (!(conf->du_stats[i].log++ % (GF_UNIVERSAL_ANSWER * 10))) { + usage = 100 - conf->du_stats[i].avail_percent; -/*Get the best subvolume to create the file in*/ -xlator_t * -dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol, - dht_local_t *local) -{ - xlator_t *avail_subvol = NULL; - dht_conf_t *conf = NULL; - dht_layout_t *layout = NULL; - loc_t *loc = NULL; + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_SUBVOL_INSUFF_SPACE, + "disk space on subvolume '%s' is getting " + "full (%.2f %%), consider adding more bricks", + subvol->name, usage); - conf = this->private; - if (!local) - goto out; - loc = &local->loc; - if (!local->layout) { - layout = dht_layout_get (this, loc->parent); - - if (!layout) { - gf_log (this->name, GF_LOG_DEBUG, - "layout missing path=%s parent=%s", - loc->path, uuid_utoa (loc->parent->gfid)); - goto out; - } - } else { - layout = dht_layout_ref (this, local->layout); + (void)snprintf(vol_name, sizeof(vol_name), "%s", this->name); + vol_name[(strlen(this->name) - 4)] = '\0'; + + gf_event(EVENT_DHT_DISK_USAGE, "volume=%s;subvol=%s;usage=%.2f %%", + vol_name, subvol->name, usage); } + } + + if (subvol_filled_inodes && conf->subvolume_status[i]) { + if (!(conf->du_stats[i].log++ % (GF_UNIVERSAL_ANSWER * 10))) { + usage = 100 - conf->du_stats[i].avail_inodes; + gf_msg(this->name, GF_LOG_CRITICAL, 0, DHT_MSG_SUBVOL_INSUFF_INODES, + "inodes on subvolume '%s' are at " + "(%.2f %%), consider adding more bricks", + subvol->name, usage); + + (void)snprintf(vol_name, sizeof(vol_name), "%s", this->name); + vol_name[(strlen(this->name) - 4)] = '\0'; + + gf_event(EVENT_DHT_INODES_USAGE, + "volume=%s;subvol=%s;usage=%.2f %%", vol_name, + subvol->name, usage); + } + } - LOCK (&conf->subvolume_lock); - { - avail_subvol = dht_subvol_with_free_space_inodes(this, subvol, - layout); - if(!avail_subvol) - { - avail_subvol = dht_subvol_maxspace_nonzeroinode(this, - subvol, - layout); - } + is_subvol_filled = (subvol_filled_space || subvol_filled_inodes); + + return is_subvol_filled; +} - } - UNLOCK (&conf->subvolume_lock); +/*Get the best subvolume to create the file in*/ +xlator_t * +dht_free_disk_available_subvol(xlator_t *this, xlator_t *subvol, + dht_local_t *local) +{ + xlator_t *avail_subvol = NULL; + dht_conf_t *conf = NULL; + dht_layout_t *layout = NULL; + loc_t *loc = NULL; + + conf = this->private; + if (!local) + goto out; + loc = &local->loc; + if (!local->layout) { + layout = dht_layout_get(this, loc->parent); + + if (!layout) { + gf_msg_debug(this->name, 0, + "Missing layout. path=%s," + " parent gfid = %s", + loc->path, uuid_utoa(loc->parent->gfid)); + goto out; + } + } else { + layout = dht_layout_ref(this, local->layout); + } + + LOCK(&conf->subvolume_lock); + { + avail_subvol = dht_subvol_with_free_space_inodes(this, subvol, NULL, + layout, 0); + if (!avail_subvol) { + avail_subvol = dht_subvol_maxspace_nonzeroinode(this, subvol, + layout); + } + } + UNLOCK(&conf->subvolume_lock); out: - if (!avail_subvol) { - gf_log (this->name, - GF_LOG_DEBUG, - "no subvolume has enough free space and/or inodes\ - to create"); - avail_subvol = subvol; - } - - if (layout) - dht_layout_unref (this, layout); - return avail_subvol; + if (!avail_subvol) { + gf_msg_debug(this->name, 0, + "No subvolume has enough free space \ + and/or inodes to create"); + avail_subvol = subvol; + } + + if (layout) + dht_layout_unref(this, layout); + return avail_subvol; } -static inline -int32_t dht_subvol_has_err (xlator_t *this, dht_layout_t *layout) +static inline int32_t +dht_subvol_has_err(dht_conf_t *conf, xlator_t *this, xlator_t *ignore, + dht_layout_t *layout) { - int ret = -1; - int i = 0; + int ret = -1; + int i = 0; + + if (!this || !layout) + goto out; + + /* this check is meant for rebalance process. The source of the file + * should be ignored for space check */ + if (this == ignore) { + goto out; + } + + /* check if subvol has layout errors, before selecting it */ + for (i = 0; i < layout->cnt; i++) { + if (!strcmp(layout->list[i].xlator->name, this->name) && + (layout->list[i].err != 0)) { + ret = -1; + goto out; + } + } - if (!this || !layout) + /* discard decommissioned subvol */ + if (conf->decommission_subvols_cnt) { + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->decommissioned_bricks[i] && + conf->decommissioned_bricks[i] == this) { + ret = -1; goto out; - - /* check if subvol has layout errors, before selecting it */ - for (i = 0; i < layout->cnt; i++) { - if (!strcmp (layout->list[i].xlator->name, this->name) && - (layout->list[i].err != 0)) { - ret = -1; - goto out; - } + } } - ret = 0; + } + + ret = 0; out: - return ret; + return ret; } /*Get subvolume which has both space and inodes more than the min criteria*/ xlator_t * dht_subvol_with_free_space_inodes(xlator_t *this, xlator_t *subvol, - dht_layout_t *layout) + xlator_t *ignore, dht_layout_t *layout, + uint64_t filesize) { - int i = 0; - double max = 0; - double max_inodes = 0; - int ignore_subvol = 0; - - xlator_t *avail_subvol = NULL; - dht_conf_t *conf = NULL; - - conf = this->private; - - for(i=0; i < conf->subvolume_cnt; i++) { - /* check if subvol has layout errors, before selecting it */ - ignore_subvol = dht_subvol_has_err (conf->subvolumes[i], - layout); - if (ignore_subvol) - continue; - - if ((conf->disk_unit == 'p') && - (conf->du_stats[i].avail_percent > conf->min_free_disk) && - (conf->du_stats[i].avail_inodes > conf->min_free_inodes)) { - if ((conf->du_stats[i].avail_inodes > max_inodes) || - (conf->du_stats[i].avail_percent > max)) { - max = conf->du_stats[i].avail_percent; - max_inodes = conf->du_stats[i].avail_inodes; - avail_subvol = conf->subvolumes[i]; - } - } + int i = 0; + double max = 0; + double max_inodes = 0; + int ignore_subvol = 0; + uint64_t total_blocks = 0; + uint64_t avail_blocks = 0; + uint64_t frsize = 0; + double post_availspace = 0; + double post_percent = 0; + + xlator_t *avail_subvol = NULL; + dht_conf_t *conf = NULL; + + conf = this->private; + + for (i = 0; i < conf->subvolume_cnt; i++) { + /* check if subvol has layout errors and also it is not a + * decommissioned brick, before selecting it */ + ignore_subvol = dht_subvol_has_err(conf, conf->subvolumes[i], ignore, + layout); + if (ignore_subvol) + continue; + + if ((conf->disk_unit == 'p') && + (conf->du_stats[i].avail_percent > conf->min_free_disk) && + (conf->du_stats[i].avail_inodes > conf->min_free_inodes)) { + if ((conf->du_stats[i].avail_inodes > max_inodes) || + (conf->du_stats[i].avail_percent > max)) { + max = conf->du_stats[i].avail_percent; + max_inodes = conf->du_stats[i].avail_inodes; + avail_subvol = conf->subvolumes[i]; + total_blocks = conf->du_stats[i].total_blocks; + avail_blocks = conf->du_stats[i].avail_blocks; + frsize = conf->du_stats[i].frsize; + } + } - if ((conf->disk_unit != 'p') && - (conf->du_stats[i].avail_space > conf->min_free_disk) && - (conf->du_stats[i].avail_inodes > conf->min_free_inodes)) { - if ((conf->du_stats[i].avail_inodes > max_inodes) || - (conf->du_stats[i].avail_space > max)) { - max = conf->du_stats[i].avail_space; - max_inodes = conf->du_stats[i].avail_inodes; - avail_subvol = conf->subvolumes[i]; - } - } + if ((conf->disk_unit != 'p') && + (conf->du_stats[i].avail_space > conf->min_free_disk) && + (conf->du_stats[i].avail_inodes > conf->min_free_inodes)) { + if ((conf->du_stats[i].avail_inodes > max_inodes) || + (conf->du_stats[i].avail_space > max)) { + max = conf->du_stats[i].avail_space; + max_inodes = conf->du_stats[i].avail_inodes; + avail_subvol = conf->subvolumes[i]; + } + } + } + + if (avail_subvol) { + if (conf->disk_unit == 'p') { + post_availspace = (avail_blocks * frsize) - filesize; + post_percent = (post_availspace * 100) / (total_blocks * frsize); + if (post_percent < conf->min_free_disk) + avail_subvol = NULL; } + if (conf->disk_unit != 'p') { + if ((max - filesize) < conf->min_free_disk) + avail_subvol = NULL; + } + } - return avail_subvol; + return avail_subvol; } - -/* Get subvol which has atleast one inode and maximum space */ +/* Get subvol which has at least one inode and maximum space */ xlator_t * -dht_subvol_maxspace_nonzeroinode (xlator_t *this, xlator_t *subvol, - dht_layout_t *layout) +dht_subvol_maxspace_nonzeroinode(xlator_t *this, xlator_t *subvol, + dht_layout_t *layout) { - int i = 0; - double max = 0; - int ignore_subvol = 0; - - xlator_t *avail_subvol = NULL; - dht_conf_t *conf = NULL; - - conf = this->private; - - for (i = 0; i < conf->subvolume_cnt; i++) { - /* check if subvol has layout errors, before selecting it */ - ignore_subvol = dht_subvol_has_err (conf->subvolumes[i], - layout); - if (ignore_subvol) - continue; - - if (conf->disk_unit == 'p') { - if ((conf->du_stats[i].avail_percent > max) - && (conf->du_stats[i].avail_inodes > 0 )) { - max = conf->du_stats[i].avail_percent; - avail_subvol = conf->subvolumes[i]; - } - } else { - if ((conf->du_stats[i].avail_space > max) - && (conf->du_stats[i].avail_inodes > 0)) { - max = conf->du_stats[i].avail_space; - avail_subvol = conf->subvolumes[i]; - } - } + int i = 0; + double max = 0; + int ignore_subvol = 0; + + xlator_t *avail_subvol = NULL; + dht_conf_t *conf = NULL; + + conf = this->private; + + for (i = 0; i < conf->subvolume_cnt; i++) { + /* check if subvol has layout errors and also it is not a + * decommissioned brick, before selecting it*/ + + ignore_subvol = dht_subvol_has_err(conf, conf->subvolumes[i], NULL, + layout); + if (ignore_subvol) + continue; + + if (conf->disk_unit == 'p') { + if ((conf->du_stats[i].avail_percent > max) && + (conf->du_stats[i].avail_inodes > 0)) { + max = conf->du_stats[i].avail_percent; + avail_subvol = conf->subvolumes[i]; + } + } else { + if ((conf->du_stats[i].avail_space > max) && + (conf->du_stats[i].avail_inodes > 0)) { + max = conf->du_stats[i].avail_space; + avail_subvol = conf->subvolumes[i]; + } } + } - return avail_subvol; + return avail_subvol; } diff --git a/xlators/cluster/dht/src/dht-hashfn.c b/xlators/cluster/dht/src/dht-hashfn.c index 656cf23a058..acda67c312a 100644 --- a/xlators/cluster/dht/src/dht-hashfn.c +++ b/xlators/cluster/dht/src/dht-hashfn.c @@ -8,104 +8,103 @@ cases as published by the Free Software Foundation. */ -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - - -#include "glusterfs.h" -#include "xlator.h" #include "dht-common.h" -#include "hashfn.h" +#include <glusterfs/hashfn.h> - -int -dht_hash_compute_internal (int type, const char *name, uint32_t *hash_p) +static int +dht_hash_compute_internal(int type, const char *name, const int len, + uint32_t *hash_p) { - int ret = 0; - uint32_t hash = 0; + int ret = 0; + uint32_t hash = 0; - switch (type) { + switch (type) { case DHT_HASH_TYPE_DM: case DHT_HASH_TYPE_DM_USER: - hash = gf_dm_hashfn (name, strlen (name)); - break; + hash = gf_dm_hashfn(name, len); + break; default: - ret = -1; - break; - } + ret = -1; + break; + } - if (ret == 0) { - *hash_p = hash; - } + if (ret == 0) { + *hash_p = hash; + } - return ret; + return ret; } - -static inline -gf_boolean_t -dht_munge_name (const char *original, char *modified, size_t len, regex_t *re) +/* The function returns: + * 0 : in case no munge took place + * >0 : the length (inc. terminating NULL!) of the newly modified string, + * if it was munged. + */ +static int +dht_munge_name(const char *original, char *modified, size_t len, regex_t *re) { - regmatch_t matches[2]; - size_t new_len; - - if (regexec(re,original,2,matches,0) != REG_NOMATCH) { - if (matches[1].rm_so != -1) { - new_len = matches[1].rm_eo - matches[1].rm_so; - /* Equal would fail due to the NUL at the end. */ - if (new_len < len) { - memcpy (modified,original+matches[1].rm_so, - new_len); - modified[new_len] = '\0'; - return _gf_true; - } - } + regmatch_t matches[2] = { + {0}, + }; + size_t new_len = 0; + int ret = 0; + + ret = regexec(re, original, 2, matches, 0); + + if (ret != REG_NOMATCH) { + if (matches[1].rm_so != -1) { + new_len = matches[1].rm_eo - matches[1].rm_so; + /* Equal would fail due to the NUL at the end. */ + if (new_len < len) { + memcpy(modified, original + matches[1].rm_so, new_len); + modified[new_len] = '\0'; + return new_len + 1; /* +1 for the terminating NULL */ + } } + } - /* This is guaranteed safe because of how the dest was allocated. */ - strcpy(modified,original); - return _gf_false; + /* This is guaranteed safe because of how the dest was allocated. */ + strcpy(modified, original); + return 0; } int -dht_hash_compute (xlator_t *this, int type, const char *name, uint32_t *hash_p) +dht_hash_compute(xlator_t *this, int type, const char *name, uint32_t *hash_p) { - char *rsync_friendly_name = NULL; - dht_conf_t *priv = this->private; - size_t len = 0; - gf_boolean_t munged = _gf_false; - - /* - * It wouldn't be safe to use alloca in an inline function that doesn't - * actually get inlined, and it wouldn't be efficient to do a real - * allocation, so we use alloca here (if needed) and pass that to the - * inline. - */ + char *rsync_friendly_name = NULL; + dht_conf_t *priv = NULL; + size_t len = 0; + int munged = 0; + priv = this->private; + + if (name == NULL) + return -1; + + len = strlen(name) + 1; + rsync_friendly_name = alloca(len); + + LOCK(&priv->lock); + { if (priv->extra_regex_valid) { - len = strlen(name) + 1; - rsync_friendly_name = alloca(len); - munged = dht_munge_name (name, rsync_friendly_name, len, - &priv->extra_regex); + munged = dht_munge_name(name, rsync_friendly_name, len, + &priv->extra_regex); } if (!munged && priv->rsync_regex_valid) { - len = strlen(name) + 1; - rsync_friendly_name = alloca(len); - gf_log (this->name, GF_LOG_TRACE, "trying regex for %s", name); - munged = dht_munge_name (name, rsync_friendly_name, len, - &priv->rsync_regex); - if (munged) { - gf_log (this->name, GF_LOG_DEBUG, - "munged down to %s", rsync_friendly_name); - } + gf_msg_trace(this->name, 0, "trying regex for %s", name); + munged = dht_munge_name(name, rsync_friendly_name, len, + &priv->rsync_regex); } - - if (!munged) { - rsync_friendly_name = (char *)name; - } - - return dht_hash_compute_internal (type, rsync_friendly_name, hash_p); + } + UNLOCK(&priv->lock); + if (munged) { + gf_msg_debug(this->name, 0, "munged down to %s", rsync_friendly_name); + len = munged; + } else { + rsync_friendly_name = (char *)name; + } + + return dht_hash_compute_internal(type, rsync_friendly_name, len - 1, + hash_p); } diff --git a/xlators/cluster/dht/src/dht-helper.c b/xlators/cluster/dht/src/dht-helper.c index f1dc5072f40..3f2fe43d5f3 100644 --- a/xlators/cluster/dht/src/dht-helper.c +++ b/xlators/cluster/dht/src/dht-helper.c @@ -8,1199 +8,2297 @@ cases as published by the Free Software Foundation. */ -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - - -#include "glusterfs.h" -#include "xlator.h" #include "dht-common.h" +#include "dht-lock.h" +#include "glusterfs/compat-errno.h" // for ENODATA on BSD -static inline int -dht_inode_ctx_set1 (xlator_t *this, inode_t *inode, xlator_t *subvol) +static void +dht_free_fd_ctx(dht_fd_ctx_t *fd_ctx) { - uint64_t tmp_subvol = 0; + GF_FREE(fd_ctx); +} - tmp_subvol = (long)subvol; - return inode_ctx_set1 (inode, this, &tmp_subvol); +int32_t +dht_fd_ctx_destroy(xlator_t *this, fd_t *fd) +{ + dht_fd_ctx_t *fd_ctx = NULL; + uint64_t value = 0; + int32_t ret = -1; + + GF_VALIDATE_OR_GOTO("dht", this, out); + GF_VALIDATE_OR_GOTO(this->name, fd, out); + + ret = fd_ctx_del(fd, this, &value); + if (ret) { + goto out; + } + + fd_ctx = (dht_fd_ctx_t *)(uintptr_t)value; + if (fd_ctx) { + GF_REF_PUT(fd_ctx); + } +out: + return ret; } -int -dht_inode_ctx_get1 (xlator_t *this, inode_t *inode, xlator_t **subvol) +static int +__dht_fd_ctx_set(xlator_t *this, fd_t *fd, xlator_t *dst) { - int ret = -1; - uint64_t tmp_subvol = 0; + dht_fd_ctx_t *fd_ctx = NULL; + uint64_t value = 0; + int ret = -1; - ret = inode_ctx_get1 (inode, this, &tmp_subvol); - if (tmp_subvol && subvol) - *subvol = (xlator_t *)tmp_subvol; + GF_VALIDATE_OR_GOTO("dht", this, out); + GF_VALIDATE_OR_GOTO(this->name, fd, out); - return ret; -} + fd_ctx = GF_CALLOC(1, sizeof(*fd_ctx), gf_dht_mt_fd_ctx_t); + if (!fd_ctx) { + goto out; + } + + fd_ctx->opened_on_dst = (uint64_t)(uintptr_t)dst; + GF_REF_INIT(fd_ctx, dht_free_fd_ctx); + + value = (uint64_t)(uintptr_t)fd_ctx; + + ret = __fd_ctx_set(fd, this, value); + if (ret < 0) { + gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_FD_CTX_SET_FAILED, + "fd=0x%p", fd, NULL); + GF_REF_PUT(fd_ctx); + } +out: + return ret; +} int -dht_frame_return (call_frame_t *frame) +dht_fd_ctx_set(xlator_t *this, fd_t *fd, xlator_t *dst) { - dht_local_t *local = NULL; - int this_call_cnt = -1; + dht_fd_ctx_t *fd_ctx = NULL; + uint64_t value = 0; + int ret = -1; + + GF_VALIDATE_OR_GOTO("dht", this, out); + GF_VALIDATE_OR_GOTO(this->name, fd, out); + + LOCK(&fd->lock); + { + ret = __fd_ctx_get(fd, this, &value); + if (ret && value) { + fd_ctx = (dht_fd_ctx_t *)(uintptr_t)value; + if (fd_ctx->opened_on_dst == (uint64_t)(uintptr_t)dst) { + /* This could happen due to racing + * check_progress tasks*/ + goto unlock; + } else { + /* This would be a big problem*/ + /* Overwrite and hope for the best*/ + fd_ctx->opened_on_dst = (uint64_t)(uintptr_t)dst; + UNLOCK(&fd->lock); + gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_INVALID_VALUE, + NULL); - if (!frame) - return -1; + goto out; + } + } + ret = __dht_fd_ctx_set(this, fd, dst); + } +unlock: + UNLOCK(&fd->lock); +out: + return ret; +} - local = frame->local; +static dht_fd_ctx_t * +dht_fd_ctx_get(xlator_t *this, fd_t *fd) +{ + dht_fd_ctx_t *fd_ctx = NULL; + int ret = -1; + uint64_t tmp_val = 0; + + GF_VALIDATE_OR_GOTO("dht", this, out); + GF_VALIDATE_OR_GOTO(this->name, fd, out); + + LOCK(&fd->lock); + { + ret = __fd_ctx_get(fd, this, &tmp_val); + if ((ret < 0) || (tmp_val == 0)) { + goto unlock; + } + + fd_ctx = (dht_fd_ctx_t *)(uintptr_t)tmp_val; + GF_REF_GET(fd_ctx); + } +unlock: + UNLOCK(&fd->lock); - LOCK (&frame->lock); - { - this_call_cnt = --local->call_cnt; +out: + return fd_ctx; +} + +gf_boolean_t +dht_fd_open_on_dst(xlator_t *this, fd_t *fd, xlator_t *dst) +{ + dht_fd_ctx_t *fd_ctx = NULL; + gf_boolean_t opened = _gf_false; + + fd_ctx = dht_fd_ctx_get(this, fd); + + if (fd_ctx) { + if (fd_ctx->opened_on_dst == (uint64_t)(uintptr_t)dst) { + opened = _gf_true; } - UNLOCK (&frame->lock); + GF_REF_PUT(fd_ctx); + } - return this_call_cnt; + return opened; } +void +dht_free_mig_info(void *data) +{ + dht_migrate_info_t *miginfo = NULL; -static uint64_t -dht_bits_for (uint64_t num) + miginfo = data; + GF_FREE(miginfo); + + return; +} + +static int +dht_inode_ctx_set_mig_info(xlator_t *this, inode_t *inode, xlator_t *src_subvol, + xlator_t *dst_subvol) { - uint64_t bits = 0, ctrl = 1; + dht_migrate_info_t *miginfo = NULL; + uint64_t value = 0; + int ret = -1; + + miginfo = GF_CALLOC(1, sizeof(*miginfo), gf_dht_mt_miginfo_t); + if (miginfo == NULL) + goto out; - while (ctrl < num) { - ctrl *= 2; - bits ++; - } + miginfo->src_subvol = src_subvol; + miginfo->dst_subvol = dst_subvol; + GF_REF_INIT(miginfo, dht_free_mig_info); - return bits; + value = (uint64_t)(uintptr_t)miginfo; + + ret = inode_ctx_set1(inode, this, &value); + if (ret < 0) { + GF_REF_PUT(miginfo); + } + +out: + return ret; } -/* - * A slightly "updated" version of the algorithm described in the commit log - * is used here. - * - * The only enhancement is that: +int +dht_inode_ctx_get_mig_info(xlator_t *this, inode_t *inode, + xlator_t **src_subvol, xlator_t **dst_subvol) +{ + int ret = -1; + uint64_t tmp_miginfo = 0; + dht_migrate_info_t *miginfo = NULL; + + LOCK(&inode->lock); + { + ret = __inode_ctx_get1(inode, this, &tmp_miginfo); + if ((ret < 0) || (tmp_miginfo == 0)) { + UNLOCK(&inode->lock); + goto out; + } + + miginfo = (dht_migrate_info_t *)(uintptr_t)tmp_miginfo; + GF_REF_GET(miginfo); + } + UNLOCK(&inode->lock); + + if (src_subvol) + *src_subvol = miginfo->src_subvol; + + if (dst_subvol) + *dst_subvol = miginfo->dst_subvol; + + GF_REF_PUT(miginfo); + +out: + return ret; +} + +gf_boolean_t +dht_mig_info_is_invalid(xlator_t *current, xlator_t *src_subvol, + xlator_t *dst_subvol) +{ + /* Not set + */ + if (!src_subvol || !dst_subvol) + return _gf_true; + + /* Invalid scenarios: + * The src_subvol does not match the subvol on which the current op was sent + * so the cached subvol has changed between the last mig_info_set and now. + * src_subvol == dst_subvol. The file was migrated without any FOP detecting + * a P2 so the old dst is now the current subvol. + * + * There is still one scenario where the info could be outdated - if + * file has undergone multiple migrations and ends up on the same src_subvol + * on which the mig_info was first set. + */ + if ((current == dst_subvol) || (current != src_subvol)) + return _gf_true; + + return _gf_false; +} + +/* Used to check if fd fops have the fd opened on the cached subvol + * This is required when: + * 1. an fd is opened on FILE1 on subvol1 + * 2. the file is migrated to subvol2 + * 3. a lookup updates the cached subvol in the inode_ctx to subvol2 + * 4. a write comes on the fd + * The write is sent to subvol2 on an fd which has been opened only on fd1 + * Since the migration phase checks don't kick in, the fop fails with EBADF * - * - The number of bits used by the backend filesystem for HUGE d_off which - * is described as 63, and - * - The number of bits used by the d_off presented by the transformation - * upwards which is described as 64, are both made "configurable." */ +int +dht_check_and_open_fd_on_subvol_complete(int ret, call_frame_t *frame, + void *data) +{ + glusterfs_fop_t fop = 0; + dht_local_t *local = NULL; + xlator_t *subvol = NULL; + xlator_t *this = NULL; + fd_t *fd = NULL; + int op_errno = -1; + + local = frame->local; + this = frame->this; + fop = local->fop; + subvol = local->cached_subvol; + fd = local->fd; + + if (ret) { + op_errno = local->op_errno; + goto handle_err; + } + + switch (fop) { + case GF_FOP_WRITE: + STACK_WIND_COOKIE(frame, dht_writev_cbk, subvol, subvol, + subvol->fops->writev, fd, local->rebalance.vector, + local->rebalance.count, local->rebalance.offset, + local->rebalance.flags, local->rebalance.iobref, + local->xattr_req); + break; + + case GF_FOP_FLUSH: + STACK_WIND(frame, dht_flush_cbk, subvol, subvol->fops->flush, fd, + local->xattr_req); + break; + + case GF_FOP_FSETATTR: + STACK_WIND_COOKIE(frame, dht_file_setattr_cbk, subvol, subvol, + subvol->fops->fsetattr, fd, + &local->rebalance.stbuf, local->rebalance.flags, + local->xattr_req); + break; + + case GF_FOP_ZEROFILL: + STACK_WIND_COOKIE(frame, dht_zerofill_cbk, subvol, subvol, + subvol->fops->zerofill, fd, + local->rebalance.offset, local->rebalance.size, + local->xattr_req); + + break; + + case GF_FOP_DISCARD: + STACK_WIND_COOKIE(frame, dht_discard_cbk, subvol, subvol, + subvol->fops->discard, local->fd, + local->rebalance.offset, local->rebalance.size, + local->xattr_req); + break; + + case GF_FOP_FALLOCATE: + STACK_WIND_COOKIE(frame, dht_fallocate_cbk, subvol, subvol, + subvol->fops->fallocate, fd, + local->rebalance.flags, local->rebalance.offset, + local->rebalance.size, local->xattr_req); + break; + + case GF_FOP_FTRUNCATE: + STACK_WIND_COOKIE(frame, dht_truncate_cbk, subvol, subvol, + subvol->fops->ftruncate, fd, + local->rebalance.offset, local->xattr_req); + break; + + case GF_FOP_FSYNC: + STACK_WIND_COOKIE(frame, dht_fsync_cbk, subvol, subvol, + subvol->fops->fsync, local->fd, + local->rebalance.flags, local->xattr_req); + break; + + case GF_FOP_READ: + STACK_WIND(frame, dht_readv_cbk, subvol, subvol->fops->readv, + local->fd, local->rebalance.size, + local->rebalance.offset, local->rebalance.flags, + local->xattr_req); + break; + + case GF_FOP_FSTAT: + STACK_WIND_COOKIE(frame, dht_file_attr_cbk, subvol, subvol, + subvol->fops->fstat, fd, local->xattr_req); + break; + + case GF_FOP_FSETXATTR: + STACK_WIND_COOKIE(frame, dht_file_setxattr_cbk, subvol, subvol, + subvol->fops->fsetxattr, local->fd, + local->rebalance.xattr, local->rebalance.flags, + local->xattr_req); + break; + + case GF_FOP_FREMOVEXATTR: + STACK_WIND_COOKIE(frame, dht_file_removexattr_cbk, subvol, subvol, + subvol->fops->fremovexattr, local->fd, local->key, + local->xattr_req); + + break; + + case GF_FOP_FXATTROP: + STACK_WIND(frame, dht_common_xattrop_cbk, subvol, + subvol->fops->fxattrop, local->fd, + local->rebalance.flags, local->rebalance.xattr, + local->xattr_req); + break; + + case GF_FOP_FGETXATTR: + STACK_WIND(frame, dht_getxattr_cbk, subvol, subvol->fops->fgetxattr, + local->fd, local->key, NULL); + break; + + case GF_FOP_FINODELK: + STACK_WIND(frame, dht_finodelk_cbk, subvol, subvol->fops->finodelk, + local->key, local->fd, local->rebalance.lock_cmd, + &local->rebalance.flock, local->xattr_req); + break; + default: + gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_UNKNOWN_FOP, "fd=%p", + fd, "gfid=%s", uuid_utoa(fd->inode->gfid), "name=%s", + subvol->name, NULL); + break; + } + + goto out; + + /* Could not open the fd on the dst. Unwind */ + +handle_err: + + switch (fop) { + case GF_FOP_WRITE: + DHT_STACK_UNWIND(writev, frame, -1, op_errno, NULL, NULL, NULL); + break; + + case GF_FOP_FLUSH: + DHT_STACK_UNWIND(flush, frame, -1, op_errno, NULL); + break; + + case GF_FOP_FSETATTR: + DHT_STACK_UNWIND(fsetattr, frame, -1, op_errno, NULL, NULL, NULL); + break; + + case GF_FOP_ZEROFILL: + DHT_STACK_UNWIND(zerofill, frame, -1, op_errno, NULL, NULL, NULL); + break; + + case GF_FOP_DISCARD: + DHT_STACK_UNWIND(discard, frame, -1, op_errno, NULL, NULL, NULL); + break; + + case GF_FOP_FALLOCATE: + DHT_STACK_UNWIND(fallocate, frame, -1, op_errno, NULL, NULL, NULL); + break; + + case GF_FOP_FTRUNCATE: + DHT_STACK_UNWIND(ftruncate, frame, -1, op_errno, NULL, NULL, NULL); + break; + + case GF_FOP_FSYNC: + DHT_STACK_UNWIND(fsync, frame, -1, op_errno, NULL, NULL, NULL); + break; + + case GF_FOP_READ: + DHT_STACK_UNWIND(readv, frame, -1, op_errno, NULL, 0, NULL, NULL, + NULL); + break; + + case GF_FOP_FSTAT: + DHT_STACK_UNWIND(fstat, frame, -1, op_errno, NULL, NULL); + break; + + case GF_FOP_FSETXATTR: + DHT_STACK_UNWIND(fsetxattr, frame, -1, op_errno, NULL); + break; + + case GF_FOP_FREMOVEXATTR: + DHT_STACK_UNWIND(fremovexattr, frame, -1, op_errno, NULL); + break; + + case GF_FOP_FXATTROP: + DHT_STACK_UNWIND(fxattrop, frame, -1, op_errno, NULL, NULL); + break; + + case GF_FOP_FGETXATTR: + DHT_STACK_UNWIND(fgetxattr, frame, -1, op_errno, NULL, NULL); + break; + + case GF_FOP_FINODELK: + DHT_STACK_UNWIND(finodelk, frame, -1, op_errno, NULL); + break; + + default: + gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_UNKNOWN_FOP, "fd=%p", + fd, "gfid=%s", uuid_utoa(fd->inode->gfid), "name=%s", + subvol->name, NULL); + break; + } -#define BACKEND_D_OFF_BITS 63 -#define PRESENT_D_OFF_BITS 63 +out: -#define ONE 1ULL -#define MASK (~0ULL) -#define PRESENT_MASK (MASK >> (64 - PRESENT_D_OFF_BITS)) -#define BACKEND_MASK (MASK >> (64 - BACKEND_D_OFF_BITS)) + return 0; +} -#define TOP_BIT (ONE << (PRESENT_D_OFF_BITS - 1)) -#define SHIFT_BITS (max (0, (BACKEND_D_OFF_BITS - PRESENT_D_OFF_BITS + 1))) +/* Check once again if the fd has been opened on the cached subvol. + * If not, open and update the fd_ctx. + */ int -dht_itransform (xlator_t *this, xlator_t *subvol, uint64_t x, uint64_t *y_p) +dht_check_and_open_fd_on_subvol_task(void *data) { - dht_conf_t *conf = NULL; - int cnt = 0; - int max = 0; - uint64_t y = 0; - uint64_t hi_mask = 0; - uint64_t off_mask = 0; - int max_bits = 0; - - if (x == ((uint64_t) -1)) { - y = (uint64_t) -1; - goto out; - } + loc_t loc = { + 0, + }; + int ret = -1; + call_frame_t *frame = NULL; + dht_local_t *local = NULL; + fd_t *fd = NULL; + xlator_t *this = NULL; + xlator_t *subvol = NULL; + + frame = data; + local = frame->local; + this = THIS; + fd = local->fd; + subvol = local->cached_subvol; + + local->fd_checked = _gf_true; + + if (fd_is_anonymous(fd) || dht_fd_open_on_dst(this, fd, subvol)) { + ret = 0; + goto out; + } - conf = this->private; - if (!conf) - goto out; + gf_msg_debug(this->name, 0, "Opening fd (%p, flags=0%o) on file %s @ %s", + fd, fd->flags, uuid_utoa(fd->inode->gfid), subvol->name); - max = conf->subvolume_cnt; - cnt = dht_subvol_cnt (this, subvol); + loc.inode = inode_ref(fd->inode); + gf_uuid_copy(loc.gfid, fd->inode->gfid); - if (max == 1) { - y = x; - goto out; - } + /* Open this on the dst subvol */ - max_bits = dht_bits_for (max); + SYNCTASK_SETID(0, 0); - hi_mask = ~(PRESENT_MASK >> (max_bits + 1)); + ret = syncop_open(subvol, &loc, (fd->flags & ~(O_CREAT | O_EXCL | O_TRUNC)), + fd, NULL, NULL); - if (x & hi_mask) { - /* HUGE d_off */ - off_mask = MASK << max_bits; - y = TOP_BIT | ((x >> SHIFT_BITS) & off_mask) | cnt; + if (ret < 0) { + gf_smsg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_OPEN_FD_ON_DST_FAILED, + "fd=%p", fd, "flags=0%o", fd->flags, "gfid=%s", + uuid_utoa(fd->inode->gfid), "name=%s", subvol->name, NULL); + /* This can happen if the cached subvol was updated in the + * inode_ctx and the fd was opened on the new cached suvol + * after this fop was wound on the old cached subvol. + * As we do not close the fd on the old subvol (a leak) + * don't treat ENOENT as an error and allow the phase1/phase2 + * checks to handle it. + */ + + if ((-ret != ENOENT) && (-ret != ESTALE)) { + local->op_errno = -ret; + ret = -1; } else { - /* small d_off */ - y = ((x * max) + cnt); + ret = 0; } + local->op_errno = -ret; + ret = -1; + + } else { + dht_fd_ctx_set(this, fd, subvol); + } + + SYNCTASK_SETID(frame->root->uid, frame->root->gid); out: - if (y_p) - *y_p = y; + loc_wipe(&loc); - return 0; + return ret; } int -dht_filter_loc_subvol_key (xlator_t *this, loc_t *loc, loc_t *new_loc, - xlator_t **subvol) +dht_check_and_open_fd_on_subvol(xlator_t *this, call_frame_t *frame) { - char *new_name = NULL; - char *new_path = NULL; - xlator_list_t *trav = NULL; - char key[1024] = {0,}; - int ret = 0; /* not found */ - - /* Why do other tasks if first required 'char' itself is not there */ - if (!new_loc || !loc || !loc->name || !strchr (loc->name, '@')) - goto out; + int ret = -1; + dht_local_t *local = NULL; - trav = this->children; - while (trav) { - snprintf (key, 1024, "*@%s:%s", this->name, trav->xlator->name); - if (fnmatch (key, loc->name, FNM_NOESCAPE) == 0) { - new_name = GF_CALLOC(strlen (loc->name), - sizeof (char), - gf_common_mt_char); - if (!new_name) - goto out; - if (fnmatch (key, loc->path, FNM_NOESCAPE) == 0) { - new_path = GF_CALLOC(strlen (loc->path), - sizeof (char), - gf_common_mt_char); - if (!new_path) - goto out; - strncpy (new_path, loc->path, (strlen (loc->path) - - strlen (key) + 1)); - } - strncpy (new_name, loc->name, (strlen (loc->name) - - strlen (key) + 1)); - - if (new_loc) { - new_loc->path = ((new_path) ? new_path: - gf_strdup (loc->path)); - new_loc->name = new_name; - new_loc->inode = inode_ref (loc->inode); - new_loc->parent = inode_ref (loc->parent); - } - *subvol = trav->xlator; - ret = 1; /* success */ - goto out; - } - trav = trav->next; - } -out: - if (!ret) { - /* !success */ - GF_FREE (new_path); - GF_FREE (new_name); - } - return ret; + /* + if (dht_fd_open_on_dst (this, fd, subvol)) + goto out; + */ + local = frame->local; + + ret = synctask_new(this->ctx->env, dht_check_and_open_fd_on_subvol_task, + dht_check_and_open_fd_on_subvol_complete, frame, frame); + + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SYNCTASK_CREATE_FAILED, + "to-check-and-open fd=%p", local->fd, NULL); + } + + return ret; } int -dht_deitransform (xlator_t *this, uint64_t y, xlator_t **subvol_p, - uint64_t *x_p) +dht_frame_return(call_frame_t *frame) { - dht_conf_t *conf = NULL; - int cnt = 0; - int max = 0; - uint64_t x = 0; - xlator_t *subvol = 0; - int max_bits = 0; - uint64_t off_mask = 0; - uint64_t host_mask = 0; - - if (!this->private) - return -1; - - conf = this->private; - max = conf->subvolume_cnt; - - if (max == 1) { - x = y; - cnt = 0; - goto out; - } - - if (y & TOP_BIT) { - /* HUGE d_off */ - max_bits = dht_bits_for (max); - off_mask = (MASK << max_bits); - host_mask = ~(off_mask); - - x = ((y & ~TOP_BIT) & off_mask) << SHIFT_BITS; - - cnt = y & host_mask; - } else { - /* small d_off */ - cnt = y % max; - x = y / max; - } + dht_local_t *local = NULL; + int this_call_cnt = -1; + + if (!frame) + return -1; + local = frame->local; + + LOCK(&frame->lock); + { + this_call_cnt = --local->call_cnt; + } + UNLOCK(&frame->lock); + + return this_call_cnt; +} + +/* + * Use this function to specify which subvol you want the file created + * on - this need not be the hashed subvol. + * Format: <filename>@<this->name>:<subvol-name> + * Eg: file-1@vol1-dht:vol1-client-0 + * where vol1 is a pure distribute volume + * will create file-1 on vol1-client-0 + */ + +int +dht_filter_loc_subvol_key(xlator_t *this, loc_t *loc, loc_t *new_loc, + xlator_t **subvol) +{ + char *new_name = NULL; + char *new_path = NULL; + xlator_list_t *trav = NULL; + char key[1024] = { + 0, + }; + int ret = 0; /* not found */ + int keylen = 0; + int name_len = 0; + int path_len = 0; + + /* Why do other tasks if first required 'char' itself is not there */ + if (!new_loc || !loc || !loc->name || !strchr(loc->name, '@')) { + /* Skip the GF_FREE checks here */ + return ret; + } + + trav = this->children; + while (trav) { + keylen = snprintf(key, sizeof(key), "*@%s:%s", this->name, + trav->xlator->name); + /* Ignore '*' */ + keylen = keylen - 1; + if (fnmatch(key, loc->name, FNM_NOESCAPE) == 0) { + name_len = strlen(loc->name) - keylen; + new_name = GF_MALLOC(name_len + 1, gf_common_mt_char); + if (!new_name) + goto out; + if (fnmatch(key, loc->path, FNM_NOESCAPE) == 0) { + path_len = strlen(loc->path) - keylen; + new_path = GF_MALLOC(path_len + 1, gf_common_mt_char); + if (!new_path) + goto out; + snprintf(new_path, path_len + 1, "%s", loc->path); + } + snprintf(new_name, name_len + 1, "%s", loc->name); + + if (new_loc) { + new_loc->path = ((new_path) ? new_path : gf_strdup(loc->path)); + new_loc->name = new_name; + new_loc->inode = inode_ref(loc->inode); + new_loc->parent = inode_ref(loc->parent); + } + *subvol = trav->xlator; + ret = 1; /* success */ + goto out; + } + trav = trav->next; + } out: - subvol = conf->subvolumes[cnt]; + if (!ret) { + /* !success */ + GF_FREE(new_path); + GF_FREE(new_name); + } + return ret; +} + +static xlator_t * +dht_get_subvol_from_id(xlator_t *this, int client_id) +{ + xlator_t *xl = NULL; + dht_conf_t *conf = NULL; + char *sid = NULL; + int32_t ret = -1; - if (subvol_p) - *subvol_p = subvol; + conf = this->private; - if (x_p) - *x_p = x; + ret = gf_asprintf(&sid, "%d", client_id); + if (ret == -1) { + gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_ASPRINTF_FAILED, NULL); + goto out; + } - return 0; + if (dict_get_ptr(conf->leaf_to_subvol, sid, (void **)&xl)) + xl = NULL; + + GF_FREE(sid); + +out: + return xl; } +int +dht_deitransform(xlator_t *this, uint64_t y, xlator_t **subvol_p) +{ + int client_id = 0; + xlator_t *subvol = 0; + dht_conf_t *conf = NULL; + + if (!this->private) + return -1; + + conf = this->private; + + client_id = gf_deitransform(this, y); + + subvol = dht_get_subvol_from_id(this, client_id); + + if (!subvol) + subvol = conf->subvolumes[0]; + + if (subvol_p) + *subvol_p = subvol; + + return 0; +} void -dht_local_wipe (xlator_t *this, dht_local_t *local) +dht_local_wipe(xlator_t *this, dht_local_t *local) { - if (!local) - return; + int i = 0; - loc_wipe (&local->loc); - loc_wipe (&local->loc2); + if (!local) + return; - if (local->xattr) - dict_unref (local->xattr); + loc_wipe(&local->loc); + loc_wipe(&local->loc2); + loc_wipe(&local->loc2_copy); - if (local->inode) - inode_unref (local->inode); + if (local->xattr) + dict_unref(local->xattr); - if (local->layout) { - dht_layout_unref (this, local->layout); - local->layout = NULL; - } + if (local->inode) + inode_unref(local->inode); - loc_wipe (&local->linkfile.loc); + if (local->layout) { + dht_layout_unref(this, local->layout); + local->layout = NULL; + } - if (local->linkfile.xattr) - dict_unref (local->linkfile.xattr); + loc_wipe(&local->linkfile.loc); - if (local->linkfile.inode) - inode_unref (local->linkfile.inode); + if (local->linkfile.xattr) + dict_unref(local->linkfile.xattr); - if (local->fd) { - fd_unref (local->fd); - local->fd = NULL; - } + if (local->linkfile.inode) + inode_unref(local->linkfile.inode); - if (local->params) { - dict_unref (local->params); - local->params = NULL; - } + if (local->fd) { + fd_unref(local->fd); + local->fd = NULL; + } - if (local->xattr_req) - dict_unref (local->xattr_req); + if (local->params) { + dict_unref(local->params); + local->params = NULL; + } - if (local->selfheal.layout) { - dht_layout_unref (this, local->selfheal.layout); - local->selfheal.layout = NULL; - } + if (local->xattr_req) + dict_unref(local->xattr_req); + if (local->mds_xattr) + dict_unref(local->mds_xattr); + if (local->xdata) + dict_unref(local->xdata); - GF_FREE (local->newpath); + if (local->selfheal.layout) { + dht_layout_unref(this, local->selfheal.layout); + local->selfheal.layout = NULL; + } - GF_FREE (local->key); + if (local->selfheal.refreshed_layout) { + dht_layout_unref(this, local->selfheal.refreshed_layout); + local->selfheal.refreshed_layout = NULL; + } - GF_FREE (local->rebalance.vector); + for (i = 0; i < 2; i++) { + dht_lock_array_free(local->lock[i].ns.parent_layout.locks, + local->lock[i].ns.parent_layout.lk_count); - if (local->rebalance.iobref) - iobref_unref (local->rebalance.iobref); + GF_FREE(local->lock[i].ns.parent_layout.locks); - mem_put (local); -} + dht_lock_array_free(local->lock[i].ns.directory_ns.locks, + local->lock[i].ns.directory_ns.lk_count); + GF_FREE(local->lock[i].ns.directory_ns.locks); + } + + GF_FREE(local->key); + + if (local->rebalance.xdata) + dict_unref(local->rebalance.xdata); + + if (local->rebalance.xattr) + dict_unref(local->rebalance.xattr); + + if (local->rebalance.dict) + dict_unref(local->rebalance.dict); + + GF_FREE(local->rebalance.vector); + if (local->rebalance.iobref) + iobref_unref(local->rebalance.iobref); + + if (local->stub) { + call_stub_destroy(local->stub); + local->stub = NULL; + } + + if (local->ret_cache) + GF_FREE(local->ret_cache); + + mem_put(local); +} dht_local_t * -dht_local_init (call_frame_t *frame, loc_t *loc, fd_t *fd, glusterfs_fop_t fop) +dht_local_init(call_frame_t *frame, loc_t *loc, fd_t *fd, glusterfs_fop_t fop) { - dht_local_t *local = NULL; - inode_t *inode = NULL; - int ret = 0; + dht_local_t *local = NULL; + inode_t *inode = NULL; + int ret = 0; - local = mem_get0 (THIS->local_pool); - if (!local) - goto out; + local = mem_get0(THIS->local_pool); + if (!local) + goto out; - if (loc) { - ret = loc_copy (&local->loc, loc); - if (ret) - goto out; + if (loc) { + ret = loc_copy(&local->loc, loc); + if (ret) + goto out; - inode = loc->inode; - } + inode = loc->inode; + } - if (fd) { - local->fd = fd_ref (fd); - if (!inode) - inode = fd->inode; - } + if (fd) { + local->fd = fd_ref(fd); + if (!inode) + inode = fd->inode; + } - local->op_ret = -1; - local->op_errno = EUCLEAN; - local->fop = fop; + local->op_ret = -1; + local->op_errno = EUCLEAN; + local->fop = fop; - if (inode) { - local->layout = dht_layout_get (frame->this, inode); - local->cached_subvol = dht_subvol_get_cached (frame->this, - inode); - } + if (inode) { + local->layout = dht_layout_get(frame->this, inode); + local->cached_subvol = dht_subvol_get_cached(frame->this, inode); + } - frame->local = local; + frame->local = local; out: - if (ret) { - if (local) - mem_put (local); - local = NULL; - } - return local; + if (ret) { + if (local) + mem_put(local); + local = NULL; + } + return local; } xlator_t * -dht_first_up_subvol (xlator_t *this) +dht_first_up_subvol(xlator_t *this) { - dht_conf_t *conf = NULL; - xlator_t *child = NULL; - int i = 0; - time_t time = 0; + dht_conf_t *conf = NULL; + xlator_t *child = NULL; + int i = 0; + time_t time = 0; - conf = this->private; - if (!conf) - goto out; + conf = this->private; + if (!conf) + goto out; - LOCK (&conf->subvolume_lock); - { - for (i = 0; i < conf->subvolume_cnt; i++) { - if (conf->subvol_up_time[i]) { - if (!time) { - time = conf->subvol_up_time[i]; - child = conf->subvolumes[i]; - } else if (time > conf->subvol_up_time[i]) { - time = conf->subvol_up_time[i]; - child = conf->subvolumes[i]; - } - } + LOCK(&conf->subvolume_lock); + { + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->subvol_up_time[i]) { + if (!time) { + time = conf->subvol_up_time[i]; + child = conf->subvolumes[i]; + } else if (time > conf->subvol_up_time[i]) { + time = conf->subvol_up_time[i]; + child = conf->subvolumes[i]; } + } } - UNLOCK (&conf->subvolume_lock); + } + UNLOCK(&conf->subvolume_lock); out: - return child; + return child; } xlator_t * -dht_last_up_subvol (xlator_t *this) +dht_last_up_subvol(xlator_t *this) { - dht_conf_t *conf = NULL; - xlator_t *child = NULL; - int i = 0; - - conf = this->private; - if (!conf) - goto out; - - LOCK (&conf->subvolume_lock); - { - for (i = conf->subvolume_cnt-1; i >= 0; i--) { - if (conf->subvolume_status[i]) { - child = conf->subvolumes[i]; - break; - } - } + dht_conf_t *conf = NULL; + xlator_t *child = NULL; + int i = 0; + + conf = this->private; + if (!conf) + goto out; + + LOCK(&conf->subvolume_lock); + { + for (i = conf->subvolume_cnt - 1; i >= 0; i--) { + if (conf->subvolume_status[i]) { + child = conf->subvolumes[i]; + break; + } } - UNLOCK (&conf->subvolume_lock); + } + UNLOCK(&conf->subvolume_lock); out: - return child; + return child; } xlator_t * -dht_subvol_get_hashed (xlator_t *this, loc_t *loc) +dht_subvol_get_hashed(xlator_t *this, loc_t *loc) { - dht_layout_t *layout = NULL; - xlator_t *subvol = NULL; + dht_layout_t *layout = NULL; + xlator_t *subvol = NULL; + dht_conf_t *conf = NULL; + dht_methods_t *methods = NULL; - GF_VALIDATE_OR_GOTO ("dht", this, out); - GF_VALIDATE_OR_GOTO (this->name, loc, out); + GF_VALIDATE_OR_GOTO("dht", this, out); + GF_VALIDATE_OR_GOTO(this->name, loc, out); - if (__is_root_gfid (loc->gfid)) { - subvol = dht_first_up_subvol (this); - goto out; - } + conf = this->private; + GF_VALIDATE_OR_GOTO(this->name, conf, out); - GF_VALIDATE_OR_GOTO (this->name, loc->parent, out); - GF_VALIDATE_OR_GOTO (this->name, loc->name, out); + methods = &(conf->methods); - layout = dht_layout_get (this, loc->parent); + if (__is_root_gfid(loc->gfid)) { + subvol = dht_first_up_subvol(this); + goto out; + } - if (!layout) { - gf_log (this->name, GF_LOG_DEBUG, - "layout missing path=%s parent=%s", - loc->path, uuid_utoa (loc->parent->gfid)); - goto out; - } + GF_VALIDATE_OR_GOTO(this->name, loc->parent, out); + GF_VALIDATE_OR_GOTO(this->name, loc->name, out); - subvol = dht_layout_search (this, layout, loc->name); + layout = dht_layout_get(this, loc->parent); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "could not find subvolume for path=%s", - loc->path); - goto out; - } + if (!layout) { + gf_msg_debug(this->name, 0, "Missing layout. path=%s, parent gfid =%s", + loc->path, uuid_utoa(loc->parent->gfid)); + goto out; + } + + subvol = methods->layout_search(this, layout, loc->name); + + if (!subvol) { + gf_msg_debug(this->name, 0, "No hashed subvolume for path=%s", + loc->path); + goto out; + } out: - if (layout) { - dht_layout_unref (this, layout); - } + if (layout) { + dht_layout_unref(this, layout); + } - return subvol; + return subvol; } - xlator_t * -dht_subvol_get_cached (xlator_t *this, inode_t *inode) +dht_subvol_get_cached(xlator_t *this, inode_t *inode) { - dht_layout_t *layout = NULL; - xlator_t *subvol = NULL; + dht_layout_t *layout = NULL; + xlator_t *subvol = NULL; - GF_VALIDATE_OR_GOTO (this->name, this, out); - GF_VALIDATE_OR_GOTO (this->name, inode, out); + GF_VALIDATE_OR_GOTO("dht", this, out); + GF_VALIDATE_OR_GOTO(this->name, inode, out); - layout = dht_layout_get (this, inode); + layout = dht_layout_get(this, inode); - if (!layout) { - goto out; - } + if (!layout) { + goto out; + } - subvol = layout->list[0].xlator; + subvol = layout->list[0].xlator; out: - if (layout) { - dht_layout_unref (this, layout); - } + if (layout) { + dht_layout_unref(this, layout); + } - return subvol; + return subvol; } - xlator_t * -dht_subvol_next (xlator_t *this, xlator_t *prev) +dht_subvol_next(xlator_t *this, xlator_t *prev) { - dht_conf_t *conf = NULL; - int i = 0; - xlator_t *next = NULL; - - conf = this->private; - if (!conf) - goto out; - - for (i = 0; i < conf->subvolume_cnt; i++) { - if (conf->subvolumes[i] == prev) { - if ((i + 1) < conf->subvolume_cnt) - next = conf->subvolumes[i + 1]; - break; - } + dht_conf_t *conf = NULL; + int i = 0; + xlator_t *next = NULL; + + conf = this->private; + if (!conf) + goto out; + + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->subvolumes[i] == prev) { + if ((i + 1) < conf->subvolume_cnt) + next = conf->subvolumes[i + 1]; + break; } + } out: - return next; + return next; } /* This func wraps around, if prev is actually the last subvol. */ xlator_t * -dht_subvol_next_available (xlator_t *this, xlator_t *prev) +dht_subvol_next_available(xlator_t *this, xlator_t *prev) { - dht_conf_t *conf = NULL; - int i = 0; - xlator_t *next = NULL; - - conf = this->private; - if (!conf) - goto out; - - for (i = 0; i < conf->subvolume_cnt; i++) { - if (conf->subvolumes[i] == prev) { - /* if prev is last in conf->subvolumes, then wrap - * around. - */ - if ((i + 1) < conf->subvolume_cnt) { - next = conf->subvolumes[i + 1]; - } else { - next = conf->subvolumes[0]; - } - break; - } + dht_conf_t *conf = NULL; + int i = 0; + xlator_t *next = NULL; + + conf = this->private; + if (!conf) + goto out; + + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->subvolumes[i] == prev) { + /* if prev is last in conf->subvolumes, then wrap + * around. + */ + if ((i + 1) < conf->subvolume_cnt) { + next = conf->subvolumes[i + 1]; + } else { + next = conf->subvolumes[0]; + } + break; } + } out: - return next; + return next; } int -dht_subvol_cnt (xlator_t *this, xlator_t *subvol) +dht_subvol_cnt(xlator_t *this, xlator_t *subvol) { - int i = 0; - int ret = -1; - dht_conf_t *conf = NULL; - - conf = this->private; - if (!conf) - goto out; - - for (i = 0; i < conf->subvolume_cnt; i++) { - if (subvol == conf->subvolumes[i]) { - ret = i; - break; - } + int i = 0; + int ret = -1; + dht_conf_t *conf = NULL; + + conf = this->private; + if (!conf) + goto out; + + for (i = 0; i < conf->subvolume_cnt; i++) { + if (subvol == conf->subvolumes[i]) { + ret = i; + break; } + } out: - return ret; + return ret; } +#define set_if_greater(a, b) \ + do { \ + if ((a) < (b)) \ + (a) = (b); \ + } while (0) -#define set_if_greater(a, b) do { \ - if ((a) < (b)) \ - (a) = (b); \ - } while (0) - +#define set_if_greater_time(a, an, b, bn) \ + do { \ + if (((a) < (b)) || (((a) == (b)) && ((an) < (bn)))) { \ + (a) = (b); \ + (an) = (bn); \ + } \ + } while (0) -#define set_if_greater_time(a, an, b, bn) do { \ - if (((a) < (b)) || (((a) == (b)) && ((an) < (bn)))){ \ - (a) = (b); \ - (an) = (bn); \ - } \ - } while (0) \ +int +dht_iatt_merge(xlator_t *this, struct iatt *to, struct iatt *from) +{ + if (!from || !to) + return 0; + to->ia_dev = from->ia_dev; + + gf_uuid_copy(to->ia_gfid, from->ia_gfid); + + to->ia_ino = from->ia_ino; + to->ia_prot = from->ia_prot; + to->ia_type = from->ia_type; + to->ia_nlink = from->ia_nlink; + to->ia_rdev = from->ia_rdev; + to->ia_size += from->ia_size; + to->ia_blksize = from->ia_blksize; + to->ia_blocks += from->ia_blocks; + + if (IA_ISDIR(from->ia_type)) { + to->ia_blocks = DHT_DIR_STAT_BLOCKS; + to->ia_size = DHT_DIR_STAT_SIZE; + } + set_if_greater(to->ia_uid, from->ia_uid); + set_if_greater(to->ia_gid, from->ia_gid); + + set_if_greater_time(to->ia_atime, to->ia_atime_nsec, from->ia_atime, + from->ia_atime_nsec); + set_if_greater_time(to->ia_mtime, to->ia_mtime_nsec, from->ia_mtime, + from->ia_mtime_nsec); + set_if_greater_time(to->ia_ctime, to->ia_ctime_nsec, from->ia_ctime, + from->ia_ctime_nsec); + + return 0; +} int -dht_iatt_merge (xlator_t *this, struct iatt *to, - struct iatt *from, xlator_t *subvol) +dht_build_child_loc(xlator_t *this, loc_t *child, loc_t *parent, char *name) { - if (!from || !to) - return 0; + if (!child) { + goto err; + } - to->ia_dev = from->ia_dev; + if (strcmp(parent->path, "/") == 0) + gf_asprintf((char **)&child->path, "/%s", name); + else + gf_asprintf((char **)&child->path, "%s/%s", parent->path, name); - uuid_copy (to->ia_gfid, from->ia_gfid); + if (!child->path) { + goto err; + } - to->ia_ino = from->ia_ino; - to->ia_prot = from->ia_prot; - to->ia_type = from->ia_type; - to->ia_nlink = from->ia_nlink; - to->ia_rdev = from->ia_rdev; - to->ia_size += from->ia_size; - to->ia_blksize = from->ia_blksize; - to->ia_blocks += from->ia_blocks; + child->name = strrchr(child->path, '/'); + if (child->name) + child->name++; - set_if_greater (to->ia_uid, from->ia_uid); - set_if_greater (to->ia_gid, from->ia_gid); + child->parent = inode_ref(parent->inode); + child->inode = inode_new(parent->inode->table); - set_if_greater_time(to->ia_atime, to->ia_atime_nsec, - from->ia_atime, from->ia_atime_nsec); - set_if_greater_time (to->ia_mtime, to->ia_mtime_nsec, - from->ia_mtime, from->ia_mtime_nsec); - set_if_greater_time (to->ia_ctime, to->ia_ctime_nsec, - from->ia_ctime, from->ia_ctime_nsec); + if (!child->inode) { + goto err; + } - return 0; + return 0; +err: + if (child) { + loc_wipe(child); + } + return -1; } int -dht_build_child_loc (xlator_t *this, loc_t *child, loc_t *parent, char *name) +dht_init_local_subvolumes(xlator_t *this, dht_conf_t *conf) { - if (!child) { - goto err; - } - - if (strcmp (parent->path, "/") == 0) - gf_asprintf ((char **)&child->path, "/%s", name); - else - gf_asprintf ((char **)&child->path, "%s/%s", parent->path, name); + xlator_list_t *subvols = NULL; + int cnt = 0; - if (!child->path) { - goto err; - } + if (!conf) + return -1; - child->name = strrchr (child->path, '/'); - if (child->name) - child->name++; + for (subvols = this->children; subvols; subvols = subvols->next) + cnt++; - child->parent = inode_ref (parent->inode); - child->inode = inode_new (parent->inode->table); + conf->local_subvols = GF_CALLOC(cnt, sizeof(xlator_t *), + gf_dht_mt_xlator_t); - if (!child->inode) { - goto err; - } + /* FIX FIX : do this dynamically*/ + conf->local_nodeuuids = GF_CALLOC(cnt, sizeof(subvol_nodeuuids_info_t), + gf_dht_nodeuuids_t); - return 0; -err: - loc_wipe (child); + if (!conf->local_subvols || !conf->local_nodeuuids) { return -1; -} + } + conf->local_subvols_cnt = 0; + return 0; +} int -dht_init_subvolumes (xlator_t *this, dht_conf_t *conf) +dht_init_subvolumes(xlator_t *this, dht_conf_t *conf) { - xlator_list_t *subvols = NULL; - int cnt = 0; + xlator_list_t *subvols = NULL; + int cnt = 0; - if (!conf) - return -1; + if (!conf) + return -1; - for (subvols = this->children; subvols; subvols = subvols->next) - cnt++; + for (subvols = this->children; subvols; subvols = subvols->next) + cnt++; - conf->subvolumes = GF_CALLOC (cnt, sizeof (xlator_t *), - gf_dht_mt_xlator_t); - if (!conf->subvolumes) { - return -1; - } - conf->subvolume_cnt = cnt; + conf->subvolumes = GF_CALLOC(cnt, sizeof(xlator_t *), gf_dht_mt_xlator_t); + if (!conf->subvolumes) { + return -1; + } + conf->subvolume_cnt = cnt; + /* Doesn't make sense to do any dht layer tasks + if the subvol count is 1. Set it as pass_through */ + if (cnt == 1) + this->pass_through = _gf_true; - cnt = 0; - for (subvols = this->children; subvols; subvols = subvols->next) - conf->subvolumes[cnt++] = subvols->xlator; + conf->local_subvols_cnt = 0; - conf->subvolume_status = GF_CALLOC (cnt, sizeof (char), - gf_dht_mt_char); - if (!conf->subvolume_status) { - return -1; - } + dht_set_subvol_range(this); - conf->last_event = GF_CALLOC (cnt, sizeof (int), - gf_dht_mt_char); - if (!conf->last_event) { - return -1; - } + cnt = 0; + for (subvols = this->children; subvols; subvols = subvols->next) + conf->subvolumes[cnt++] = subvols->xlator; - conf->subvol_up_time = GF_CALLOC (cnt, sizeof (time_t), - gf_dht_mt_subvol_time); - if (!conf->subvol_up_time) { - return -1; - } + conf->subvolume_status = GF_CALLOC(cnt, sizeof(char), gf_dht_mt_char); + if (!conf->subvolume_status) { + return -1; + } - conf->du_stats = GF_CALLOC (conf->subvolume_cnt, sizeof (dht_du_t), - gf_dht_mt_dht_du_t); - if (!conf->du_stats) { - return -1; - } + conf->last_event = GF_CALLOC(cnt, sizeof(int), gf_dht_mt_char); + if (!conf->last_event) { + return -1; + } - conf->decommissioned_bricks = GF_CALLOC (cnt, sizeof (xlator_t *), - gf_dht_mt_xlator_t); - if (!conf->decommissioned_bricks) { - return -1; - } + conf->subvol_up_time = GF_CALLOC(cnt, sizeof(time_t), + gf_dht_mt_subvol_time); + if (!conf->subvol_up_time) { + return -1; + } - return 0; -} + conf->du_stats = GF_CALLOC(conf->subvolume_cnt, sizeof(dht_du_t), + gf_dht_mt_dht_du_t); + if (!conf->du_stats) { + return -1; + } + conf->decommissioned_bricks = GF_CALLOC(cnt, sizeof(xlator_t *), + gf_dht_mt_xlator_t); + if (!conf->decommissioned_bricks) { + return -1; + } + return 0; +} +/* + op_ret values : + 0 : Success. + -1 : Failure. + 1 : File is being migrated but not by this DHT layer. +*/ static int -dht_migration_complete_check_done (int op_ret, call_frame_t *frame, void *data) +dht_migration_complete_check_done(int op_ret, call_frame_t *frame, void *data) { - dht_local_t *local = NULL; + dht_local_t *local = NULL; + xlator_t *subvol = NULL; - local = frame->local; + local = frame->local; - local->rebalance.target_op_fn (THIS, frame, op_ret); + if (op_ret != 0) + goto out; - return 0; -} + if (local->cached_subvol == NULL) { + local->op_errno = EINVAL; + goto out; + } + subvol = local->cached_subvol; + +out: + local->rebalance.target_op_fn(THIS, subvol, frame, op_ret); + + return 0; +} int -dht_migration_complete_check_task (void *data) +dht_migration_complete_check_task(void *data) { - int ret = -1; - xlator_t *src_node = NULL; - xlator_t *dst_node = NULL; - dht_local_t *local = NULL; - dict_t *dict = NULL; - dht_layout_t *layout = NULL; - struct iatt stbuf = {0,}; - xlator_t *this = NULL; - call_frame_t *frame = NULL; - loc_t tmp_loc = {0,}; - char *path = NULL; - dht_conf_t *conf = NULL; - inode_t *inode = NULL; - fd_t *iter_fd = NULL; - uint64_t tmp_subvol = 0; - int open_failed = 0; - - this = THIS; - frame = data; - local = frame->local; - conf = this->private; - - src_node = local->cached_subvol; - - if (!local->loc.inode && !local->fd) { - local->op_errno = EINVAL; - goto out; + int ret = -1; + xlator_t *src_node = NULL; + xlator_t *dst_node = NULL, *linkto_target = NULL; + dht_local_t *local = NULL; + dict_t *dict = NULL; + struct iatt stbuf = { + 0, + }; + xlator_t *this = NULL; + call_frame_t *frame = NULL; + loc_t tmp_loc = { + 0, + }; + char *path = NULL; + dht_conf_t *conf = NULL; + inode_t *inode = NULL; + fd_t *iter_fd = NULL; + fd_t *tmp = NULL; + uint64_t tmp_miginfo = 0; + dht_migrate_info_t *miginfo = NULL; + gf_boolean_t skip_open = _gf_false; + int open_failed = 0; + + this = THIS; + frame = data; + local = frame->local; + conf = this->private; + + src_node = local->cached_subvol; + + if (!local->loc.inode && !local->fd) { + local->op_errno = EINVAL; + goto out; + } + + inode = (!local->fd) ? local->loc.inode : local->fd->inode; + + /* getxattr on cached_subvol for 'linkto' value. Do path based getxattr + * as root:root. If a fd is already open, access check won't be done*/ + + if (!local->loc.inode) { + ret = syncop_fgetxattr(src_node, local->fd, &dict, + conf->link_xattr_name, NULL, NULL); + } else { + SYNCTASK_SETID(0, 0); + ret = syncop_getxattr(src_node, &local->loc, &dict, + conf->link_xattr_name, NULL, NULL); + SYNCTASK_SETID(frame->root->uid, frame->root->gid); + } + + /* + * Each DHT xlator layer has its own name for the linkto xattr. + * If the file mode bits indicate the the file is being migrated but + * this layer's linkto xattr is not set, it means that another + * DHT layer is migrating the file. In this case, return 1 so + * the mode bits can be passed on to the higher layer for appropriate + * action. + */ + if (-ret == ENODATA) { + /* This DHT translator is not migrating this file */ + + ret = inode_ctx_reset1(inode, this, &tmp_miginfo); + if (tmp_miginfo) { + /* This can be a problem if the file was + * migrated by two different layers. Raise + * a warning here. + */ + gf_smsg( + this->name, GF_LOG_WARNING, 0, DHT_MSG_HAS_MIGINFO, "tmp=%s", + tmp_loc.path ? tmp_loc.path : uuid_utoa(tmp_loc.gfid), NULL); + + miginfo = (void *)(uintptr_t)tmp_miginfo; + GF_REF_PUT(miginfo); } + ret = 1; + goto out; + } + + if (!ret) + linkto_target = dht_linkfile_subvol(this, NULL, NULL, dict); + + if (local->loc.inode) { + loc_copy(&tmp_loc, &local->loc); + } else { + tmp_loc.inode = inode_ref(inode); + gf_uuid_copy(tmp_loc.gfid, inode->gfid); + } + + ret = syncop_lookup(this, &tmp_loc, &stbuf, 0, 0, 0); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_FILE_LOOKUP_FAILED, + "tmp=%s", tmp_loc.path ? tmp_loc.path : uuid_utoa(tmp_loc.gfid), + "name=%s", this->name, NULL); + local->op_errno = -ret; + ret = -1; + goto out; + } + + dst_node = dht_subvol_get_cached(this, tmp_loc.inode); + if (linkto_target && dst_node != linkto_target) { + gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_INVALID_LINKFILE, + "linkto_target_name=%s", linkto_target->name, "dst_name=%s", + dst_node->name, NULL); + } + + if (gf_uuid_compare(stbuf.ia_gfid, tmp_loc.inode->gfid)) { + gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_GFID_MISMATCH, "tmp=%s", + tmp_loc.path ? tmp_loc.path : uuid_utoa(tmp_loc.gfid), + "dst_name=%s", dst_node->name, NULL); + ret = -1; + local->op_errno = EIO; + goto out; + } + + /* update local. A layout is set in inode-ctx in lookup already */ + + dht_layout_unref(this, local->layout); + + local->layout = dht_layout_get(frame->this, inode); + local->cached_subvol = dst_node; + + ret = 0; + + /* once we detect the migration complete, the inode-ctx2 is no more + required.. delete the ctx and also, it means, open() already + done on all the fd of inode */ + ret = inode_ctx_reset1(inode, this, &tmp_miginfo); + if (tmp_miginfo) { + miginfo = (void *)(uintptr_t)tmp_miginfo; + GF_REF_PUT(miginfo); + goto out; + } + + /* perform 'open()' on all the fd's present on the inode */ + if (tmp_loc.path == NULL) { + inode_path(inode, NULL, &path); + if (path) + tmp_loc.path = path; + } + + LOCK(&inode->lock); + + if (list_empty(&inode->fd_list)) + goto unlock; + + /* perform open as root:root. There is window between linkfile + * creation(root:root) and setattr with the correct uid/gid + */ + SYNCTASK_SETID(0, 0); + + /* It's possible that we are the last user of iter_fd after each + * iteration. In this case the fd_unref() of iter_fd at the end of + * the loop will cause the destruction of the fd. So we need to + * iterate the list safely because iter_fd cannot be trusted. + */ + iter_fd = list_entry((&inode->fd_list)->next, typeof(*iter_fd), inode_list); + while (&iter_fd->inode_list != (&inode->fd_list)) { + if (fd_is_anonymous(iter_fd) || + (dht_fd_open_on_dst(this, iter_fd, dst_node))) { + if (!tmp) { + iter_fd = list_entry(iter_fd->inode_list.next, typeof(*iter_fd), + inode_list); + continue; + } + skip_open = _gf_true; + } + /* We need to release the inode->lock before calling + * syncop_open() to avoid possible deadlocks. However this + * can cause the iter_fd to be released by other threads. + * To avoid this, we take a reference before releasing the + * lock. + */ + fd_ref(iter_fd); - inode = (!local->fd) ? local->loc.inode : local->fd->inode; - - /* getxattr on cached_subvol for 'linkto' value. Do path based getxattr - * as root:root. If a fd is already open, access check wont be done*/ + UNLOCK(&inode->lock); - if (!local->loc.inode) { - ret = syncop_fgetxattr (src_node, local->fd, &dict, - conf->link_xattr_name); + if (tmp) { + fd_unref(tmp); + tmp = NULL; + } + if (skip_open) + goto next; + + /* flags for open are stripped down to allow following the + * new location of the file, otherwise we can get EEXIST or + * truncate the file again as rebalance is moving the data */ + ret = syncop_open(dst_node, &tmp_loc, + (iter_fd->flags & ~(O_CREAT | O_EXCL | O_TRUNC)), + iter_fd, NULL, NULL); + if (ret < 0) { + gf_smsg(this->name, GF_LOG_ERROR, -ret, + DHT_MSG_OPEN_FD_ON_DST_FAILED, "id=%p", iter_fd, + "flags=0%o", iter_fd->flags, "path=%s", path, "name=%s", + dst_node->name, NULL); + + open_failed = 1; + local->op_errno = -ret; + ret = -1; } else { - SYNCTASK_SETID (0, 0); - ret = syncop_getxattr (src_node, &local->loc, &dict, - conf->link_xattr_name); - SYNCTASK_SETID (frame->root->uid, frame->root->gid); + dht_fd_ctx_set(this, iter_fd, dst_node); } - if (!ret) - dst_node = dht_linkfile_subvol (this, NULL, NULL, dict); + next: + LOCK(&inode->lock); + skip_open = _gf_false; + tmp = iter_fd; + iter_fd = list_entry(tmp->inode_list.next, typeof(*tmp), inode_list); + } - if (ret) { - if (!dht_inode_missing(-ret) || (!local->loc.inode)) { - local->op_errno = -ret; - gf_log (this->name, GF_LOG_ERROR, - "%s: failed to get the 'linkto' xattr %s", - local->loc.path, strerror (-ret)); - ret = -1; - goto out; - } + SYNCTASK_SETID(frame->root->uid, frame->root->gid); - /* Need to do lookup on hashed subvol, then get the file */ - ret = syncop_lookup (this, &local->loc, NULL, &stbuf, NULL, - NULL); - if (ret) { - local->op_errno = -ret; - ret = -1; - goto out; - } + if (open_failed) { + ret = -1; + goto unlock; + } + ret = 0; - dst_node = dht_subvol_get_cached (this, local->loc.inode); - } +unlock: + UNLOCK(&inode->lock); + if (tmp) { + fd_unref(tmp); + tmp = NULL; + } - if (!dst_node) { - gf_log (this->name, GF_LOG_ERROR, - "%s: failed to get the destination node", - local->loc.path); - ret = -1; - local->op_errno = EINVAL; - goto out; - } +out: + if (dict) { + dict_unref(dict); + } - /* lookup on dst */ - if (local->loc.inode) { - ret = syncop_lookup (dst_node, &local->loc, NULL, &stbuf, NULL, - NULL); - - if (ret) { - gf_log (this->name, GF_LOG_ERROR, - "%s: failed to lookup the file on %s", - local->loc.path, dst_node->name); - local->op_errno = -ret; - ret = -1; - goto out; - } + loc_wipe(&tmp_loc); - if (uuid_compare (stbuf.ia_gfid, local->loc.inode->gfid)) { - gf_log (this->name, GF_LOG_ERROR, - "%s: gfid different on the target file on %s", - local->loc.path, dst_node->name); - ret = -1; - local->op_errno = EIO; - goto out; - } - } + return ret; +} - /* update inode ctx (the layout) */ - dht_layout_unref (this, local->layout); +int +dht_rebalance_complete_check(xlator_t *this, call_frame_t *frame) +{ + int ret = -1; - ret = dht_layout_preset (this, dst_node, inode); - if (ret != 0) { - gf_log (this->name, GF_LOG_DEBUG, - "%s: could not set preset layout for subvol %s", - local->loc.path, dst_node->name); - ret = -1; - local->op_errno = EINVAL; - goto out; - } + ret = synctask_new(this->ctx->env, dht_migration_complete_check_task, + dht_migration_complete_check_done, frame, frame); + return ret; +} - layout = dht_layout_for_subvol (this, dst_node); - if (!layout) { - gf_log (this->name, GF_LOG_INFO, - "%s: no pre-set layout for subvolume %s", - local->loc.path, dst_node ? dst_node->name : "<nil>"); - ret = -1; - local->op_errno = EINVAL; - goto out; - } +/* During 'in-progress' state, both nodes should have the file */ +/* + op_ret values : + 0 : Success + -1 : Failure. + 1 : File is being migrated but not by this DHT layer. +*/ +static int +dht_inprogress_check_done(int op_ret, call_frame_t *frame, void *data) +{ + dht_local_t *local = NULL; + xlator_t *dst_subvol = NULL, *src_subvol = NULL; + inode_t *inode = NULL; - ret = dht_layout_set (this, inode, layout); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, - "%s: failed to set the new layout", - local->loc.path); - local->op_errno = EINVAL; - goto out; - } + local = frame->local; - local->cached_subvol = dst_node; - ret = 0; + if (op_ret != 0) + goto out; - /* once we detect the migration complete, the inode-ctx2 is no more - required.. delete the ctx and also, it means, open() already - done on all the fd of inode */ - ret = inode_ctx_reset1 (inode, this, &tmp_subvol); - if (tmp_subvol) - goto out; + inode = local->loc.inode ? local->loc.inode : local->fd->inode; - if (list_empty (&inode->fd_list)) - goto out; + dht_inode_ctx_get_mig_info(THIS, inode, &src_subvol, &dst_subvol); + if (dht_mig_info_is_invalid(local->cached_subvol, src_subvol, dst_subvol)) { + dst_subvol = dht_subvol_get_cached(THIS, inode); + if (!dst_subvol) { + local->op_errno = EINVAL; + goto out; + } + } - /* perform open as root:root. There is window between linkfile - * creation(root:root) and setattr with the correct uid/gid - */ - SYNCTASK_SETID(0, 0); +out: + local->rebalance.target_op_fn(THIS, dst_subvol, frame, op_ret); - /* perform 'open()' on all the fd's present on the inode */ - tmp_loc.inode = inode; - inode_path (inode, NULL, &path); + return 0; +} + +static int +dht_rebalance_inprogress_task(void *data) +{ + int ret = -1; + xlator_t *src_node = NULL; + xlator_t *dst_node = NULL; + dht_local_t *local = NULL; + dict_t *dict = NULL; + call_frame_t *frame = NULL; + xlator_t *this = NULL; + char *path = NULL; + struct iatt stbuf = { + 0, + }; + loc_t tmp_loc = { + 0, + }; + dht_conf_t *conf = NULL; + inode_t *inode = NULL; + fd_t *iter_fd = NULL; + fd_t *tmp = NULL; + int open_failed = 0; + uint64_t tmp_miginfo = 0; + dht_migrate_info_t *miginfo = NULL; + gf_boolean_t skip_open = _gf_false; + + this = THIS; + frame = data; + local = frame->local; + conf = this->private; + + src_node = local->cached_subvol; + + if (!local->loc.inode && !local->fd) + goto out; + + inode = (!local->fd) ? local->loc.inode : local->fd->inode; + + /* getxattr on cached_subvol for 'linkto' value. Do path based getxattr + * as root:root. If a fd is already open, access check won't be done*/ + if (local->loc.inode) { + SYNCTASK_SETID(0, 0); + ret = syncop_getxattr(src_node, &local->loc, &dict, + conf->link_xattr_name, NULL, NULL); + SYNCTASK_SETID(frame->root->uid, frame->root->gid); + } else { + ret = syncop_fgetxattr(src_node, local->fd, &dict, + conf->link_xattr_name, NULL, NULL); + } + + /* + * Each DHT xlator layer has its own name for the linkto xattr. + * If the file mode bits indicate the the file is being migrated but + * this layer's linkto xattr is not present, it means that another + * DHT layer is migrating the file. In this case, return 1 so + * the mode bits can be passed on to the higher layer for appropriate + * action. + */ + + if (-ret == ENODATA) { + /* This DHT layer is not migrating this file */ + ret = inode_ctx_reset1(inode, this, &tmp_miginfo); + if (tmp_miginfo) { + /* This can be a problem if the file was + * migrated by two different layers. Raise + * a warning here. + */ + gf_smsg( + this->name, GF_LOG_WARNING, 0, DHT_MSG_HAS_MIGINFO, "tmp=%s", + tmp_loc.path ? tmp_loc.path : uuid_utoa(tmp_loc.gfid), NULL); + miginfo = (void *)(uintptr_t)tmp_miginfo; + GF_REF_PUT(miginfo); + } + ret = 1; + goto out; + } + + if (ret < 0) { + gf_smsg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_GET_XATTR_FAILED, + "path=%s", local->loc.path, NULL); + ret = -1; + goto out; + } + + dst_node = dht_linkfile_subvol(this, NULL, NULL, dict); + if (!dst_node) { + gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_GET_XATTR_FAILED, + "path=%s", local->loc.path, NULL); + ret = -1; + goto out; + } + + local->rebalance.target_node = dst_node; + + if (local->loc.inode) { + loc_copy(&tmp_loc, &local->loc); + } else { + tmp_loc.inode = inode_ref(inode); + gf_uuid_copy(tmp_loc.gfid, inode->gfid); + } + + /* lookup on dst */ + ret = syncop_lookup(dst_node, &tmp_loc, &stbuf, NULL, NULL, NULL); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_FILE_LOOKUP_FAILED, + "tmp=%s", tmp_loc.path ? tmp_loc.path : uuid_utoa(tmp_loc.gfid), + "name=%s", dst_node->name, NULL); + ret = -1; + goto out; + } + + if (gf_uuid_compare(stbuf.ia_gfid, tmp_loc.inode->gfid)) { + gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_GFID_MISMATCH, "tmp=%s", + tmp_loc.path ? tmp_loc.path : uuid_utoa(tmp_loc.gfid), + "name=%s", dst_node->name, NULL); + ret = -1; + goto out; + } + ret = 0; + + if (tmp_loc.path == NULL) { + inode_path(inode, NULL, &path); if (path) - tmp_loc.path = path; - list_for_each_entry (iter_fd, &inode->fd_list, inode_list) { - if (fd_is_anonymous (iter_fd)) - continue; - - /* flags for open are stripped down to allow following the - * new location of the file, otherwise we can get EEXIST or - * truncate the file again as rebalance is moving the data */ - ret = syncop_open (dst_node, &tmp_loc, - (iter_fd->flags & - ~(O_CREAT | O_EXCL | O_TRUNC)), iter_fd); - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, "failed to open " - "the fd (%p, flags=0%o) on file %s @ %s", - iter_fd, iter_fd->flags, path, dst_node->name); - open_failed = 1; - local->op_errno = -ret; - ret = -1; - } + tmp_loc.path = path; + } + + LOCK(&inode->lock); + + if (list_empty(&inode->fd_list)) + goto unlock; + + /* perform open as root:root. There is window between linkfile + * creation(root:root) and setattr with the correct uid/gid + */ + SYNCTASK_SETID(0, 0); + + /* It's possible that we are the last user of iter_fd after each + * iteration. In this case the fd_unref() of iter_fd at the end of + * the loop will cause the destruction of the fd. So we need to + * iterate the list safely because iter_fd cannot be trusted. + */ + iter_fd = list_entry((&inode->fd_list)->next, typeof(*iter_fd), inode_list); + while (&iter_fd->inode_list != (&inode->fd_list)) { + /* We need to release the inode->lock before calling + * syncop_open() to avoid possible deadlocks. However this + * can cause the iter_fd to be released by other threads. + * To avoid this, we take a reference before releasing the + * lock. + */ + + if (fd_is_anonymous(iter_fd) || + (dht_fd_open_on_dst(this, iter_fd, dst_node))) { + if (!tmp) { + iter_fd = list_entry(iter_fd->inode_list.next, typeof(*iter_fd), + inode_list); + continue; + } + skip_open = _gf_true; } - GF_FREE (path); - SYNCTASK_SETID (frame->root->uid, frame->root->gid); + /* Yes, this is ugly but there isn't a cleaner way to do this + * the fd_ref is an atomic increment so not too bad. We want to + * reduce the number of inode locks and unlocks. + */ - if (open_failed) { - ret = -1; - goto out; + fd_ref(iter_fd); + UNLOCK(&inode->lock); + + if (tmp) { + fd_unref(tmp); + tmp = NULL; } - ret = 0; + if (skip_open) + goto next; + + /* flags for open are stripped down to allow following the + * new location of the file, otherwise we can get EEXIST or + * truncate the file again as rebalance is moving the data */ + ret = syncop_open(dst_node, &tmp_loc, + (iter_fd->flags & ~(O_CREAT | O_EXCL | O_TRUNC)), + iter_fd, NULL, NULL); + if (ret < 0) { + gf_smsg(this->name, GF_LOG_ERROR, -ret, + DHT_MSG_OPEN_FD_ON_DST_FAILED, "fd=%p", iter_fd, + "flags=0%o", iter_fd->flags, "path=%s", path, "name=%s", + dst_node->name, NULL); + ret = -1; + open_failed = 1; + } else { + /* Potential fd leak if this fails here as it will be + reopened at the next Phase1/2 check */ + dht_fd_ctx_set(this, iter_fd, dst_node); + } + + next: + LOCK(&inode->lock); + skip_open = _gf_false; + tmp = iter_fd; + iter_fd = list_entry(tmp->inode_list.next, typeof(*tmp), inode_list); + } + + SYNCTASK_SETID(frame->root->uid, frame->root->gid); + +unlock: + UNLOCK(&inode->lock); + + if (tmp) { + fd_unref(tmp); + tmp = NULL; + } + if (open_failed) { + ret = -1; + goto out; + } + + ret = dht_inode_ctx_set_mig_info(this, inode, src_node, dst_node); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SET_INODE_CTX_FAILED, + "path=%s", local->loc.path, "name=%s", dst_node->name, NULL); + goto out; + } + + ret = 0; out: + if (dict) { + dict_unref(dict); + } - return ret; + loc_wipe(&tmp_loc); + return ret; } int -dht_rebalance_complete_check (xlator_t *this, call_frame_t *frame) +dht_rebalance_in_progress_check(xlator_t *this, call_frame_t *frame) { - int ret = -1; + int ret = -1; - ret = synctask_new (this->ctx->env, dht_migration_complete_check_task, - dht_migration_complete_check_done, - frame, frame); - return ret; + ret = synctask_new(this->ctx->env, dht_rebalance_inprogress_task, + dht_inprogress_check_done, frame, frame); + return ret; } -/* During 'in-progress' state, both nodes should have the file */ -static int -dht_inprogress_check_done (int op_ret, call_frame_t *sync_frame, void *data) +int +dht_inode_ctx_layout_set(inode_t *inode, xlator_t *this, + dht_layout_t *layout_int) { - dht_local_t *local = NULL; - - local = sync_frame->local; - - local->rebalance.target_op_fn (THIS, sync_frame, op_ret); - - return 0; + dht_inode_ctx_t *ctx = NULL; + int ret = -1; + + ret = dht_inode_ctx_get(inode, this, &ctx); + if (!ret && ctx) { + ctx->layout = layout_int; + } else { + ctx = GF_CALLOC(1, sizeof(*ctx), gf_dht_mt_inode_ctx_t); + if (!ctx) + return ret; + ctx->layout = layout_int; + } + + ret = dht_inode_ctx_set(inode, this, ctx); + + return ret; } -static int -dht_rebalance_inprogress_task (void *data) +void +dht_inode_ctx_time_set(inode_t *inode, xlator_t *this, struct iatt *stat) { - int ret = -1; - xlator_t *src_node = NULL; - xlator_t *dst_node = NULL; - dht_local_t *local = NULL; - dict_t *dict = NULL; - call_frame_t *frame = NULL; - xlator_t *this = NULL; - char *path = NULL; - struct iatt stbuf = {0,}; - loc_t tmp_loc = {0,}; - dht_conf_t *conf = NULL; - inode_t *inode = NULL; - fd_t *iter_fd = NULL; - int open_failed = 0; - - this = THIS; - frame = data; - local = frame->local; - conf = this->private; - - src_node = local->cached_subvol; - - if (!local->loc.inode && !local->fd) - goto out; + dht_inode_ctx_t *ctx = NULL; + dht_stat_time_t *time = 0; + int ret = -1; - inode = (!local->fd) ? local->loc.inode : local->fd->inode; + ret = dht_inode_ctx_get(inode, this, &ctx); - /* getxattr on cached_subvol for 'linkto' value. Do path based getxattr - * as root:root. If a fd is already open, access check wont be done*/ - if (local->loc.inode) { - SYNCTASK_SETID (0, 0); - ret = syncop_getxattr (src_node, &local->loc, &dict, - conf->link_xattr_name); - SYNCTASK_SETID (frame->root->uid, frame->root->gid); - } else { - ret = syncop_fgetxattr (src_node, local->fd, &dict, - conf->link_xattr_name); - } + if (ret) + return; - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, - "%s: failed to get the 'linkto' xattr %s", - local->loc.path, strerror (-ret)); - ret = -1; - goto out; - } + time = &ctx->time; - dst_node = dht_linkfile_subvol (this, NULL, NULL, dict); - if (!dst_node) { - gf_log (this->name, GF_LOG_ERROR, - "%s: failed to get the 'linkto' xattr from dict", - local->loc.path); - ret = -1; - goto out; - } - - local->rebalance.target_node = dst_node; - - if (local->loc.inode) { - /* lookup on dst */ - ret = syncop_lookup (dst_node, &local->loc, NULL, - &stbuf, NULL, NULL); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, - "%s: failed to lookup the file on %s", - local->loc.path, dst_node->name); - ret = -1; - goto out; - } + time->mtime = stat->ia_mtime; + time->mtime_nsec = stat->ia_mtime_nsec; - if (uuid_compare (stbuf.ia_gfid, local->loc.inode->gfid)) { - gf_log (this->name, GF_LOG_ERROR, - "%s: gfid different on the target file on %s", - local->loc.path, dst_node->name); - ret = -1; - goto out; - } - } + time->ctime = stat->ia_ctime; + time->ctime_nsec = stat->ia_ctime_nsec; - ret = 0; + time->atime = stat->ia_atime; + time->atime_nsec = stat->ia_atime_nsec; - if (list_empty (&inode->fd_list)) - goto done; + return; +} - /* perform open as root:root. There is window between linkfile - * creation(root:root) and setattr with the correct uid/gid - */ - SYNCTASK_SETID (0, 0); +int +dht_inode_ctx_time_update(inode_t *inode, xlator_t *this, struct iatt *stat, + int32_t post) +{ + dht_inode_ctx_t *ctx = NULL; + dht_stat_time_t *time = 0; + int ret = -1; + + GF_VALIDATE_OR_GOTO(this->name, stat, out); + GF_VALIDATE_OR_GOTO(this->name, inode, out); + + ret = dht_inode_ctx_get(inode, this, &ctx); + + if (ret) { + ctx = GF_CALLOC(1, sizeof(*ctx), gf_dht_mt_inode_ctx_t); + if (!ctx) + return -1; + } + + time = &ctx->time; + + LOCK(&inode->lock); + { + DHT_UPDATE_TIME(time->mtime, time->mtime_nsec, stat->ia_mtime, + stat->ia_mtime_nsec, post); + DHT_UPDATE_TIME(time->ctime, time->ctime_nsec, stat->ia_ctime, + stat->ia_ctime_nsec, post); + DHT_UPDATE_TIME(time->atime, time->atime_nsec, stat->ia_atime, + stat->ia_atime_nsec, post); + } + UNLOCK(&inode->lock); + + ret = dht_inode_ctx_set(inode, this, ctx); +out: + return 0; +} - tmp_loc.inode = inode; - inode_path (inode, NULL, &path); - if (path) - tmp_loc.path = path; - - list_for_each_entry (iter_fd, &inode->fd_list, inode_list) { - if (fd_is_anonymous (iter_fd)) - continue; - - /* flags for open are stripped down to allow following the - * new location of the file, otherwise we can get EEXIST or - * truncate the file again as rebalance is moving the data */ - ret = syncop_open (dst_node, &tmp_loc, - (iter_fd->flags & - ~(O_CREAT | O_EXCL | O_TRUNC)), iter_fd); - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, "failed to send open " - "the fd (%p, flags=0%o) on file %s @ %s", - iter_fd, iter_fd->flags, path, dst_node->name); - ret = -1; - open_failed = 1; - } - } - GF_FREE (path); +int +dht_inode_ctx_get(inode_t *inode, xlator_t *this, dht_inode_ctx_t **ctx) +{ + int ret = -1; + uint64_t ctx_int = 0; - SYNCTASK_SETID (frame->root->uid, frame->root->gid); + GF_VALIDATE_OR_GOTO("dht", this, out); + GF_VALIDATE_OR_GOTO(this->name, inode, out); - if (open_failed) { - ret = -1; - goto out; - } + ret = inode_ctx_get(inode, this, &ctx_int); -done: - ret = dht_inode_ctx_set1 (this, inode, dst_node); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, - "%s: failed to set inode-ctx target file at %s", - local->loc.path, dst_node->name); - goto out; - } + if (ret) + return ret; - ret = 0; + if (ctx) + *ctx = (dht_inode_ctx_t *)(uintptr_t)ctx_int; out: - return ret; + return ret; } int -dht_rebalance_in_progress_check (xlator_t *this, call_frame_t *frame) +dht_inode_ctx_set(inode_t *inode, xlator_t *this, dht_inode_ctx_t *ctx) { + int ret = -1; + uint64_t ctx_int = 0; - int ret = -1; + GF_VALIDATE_OR_GOTO("dht", this, out); + GF_VALIDATE_OR_GOTO(this->name, inode, out); + GF_VALIDATE_OR_GOTO(this->name, ctx, out); - ret = synctask_new (this->ctx->env, dht_rebalance_inprogress_task, - dht_inprogress_check_done, - frame, frame); - return ret; + ctx_int = (long)ctx; + ret = inode_ctx_set(inode, this, &ctx_int); +out: + return ret; } int -dht_inode_ctx_layout_set (inode_t *inode, xlator_t *this, - dht_layout_t *layout_int) +dht_subvol_status(dht_conf_t *conf, xlator_t *subvol) { - dht_inode_ctx_t *ctx = NULL; - int ret = -1; + int i; - ret = dht_inode_ctx_get (inode, this, &ctx); - if (!ret && ctx) { - ctx->layout = layout_int; - } else { - ctx = GF_CALLOC (1, sizeof (*ctx), gf_dht_mt_inode_ctx_t); - if (!ctx) - return ret; - ctx->layout = layout_int; + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->subvolumes[i] == subvol) { + return conf->subvolume_status[i]; } - - ret = dht_inode_ctx_set (inode, this, ctx); - - return ret; + } + return 0; } - -void -dht_inode_ctx_time_set (inode_t *inode, xlator_t *this, struct iatt *stat) +inode_t * +dht_heal_path(xlator_t *this, char *path, inode_table_t *itable) { - dht_inode_ctx_t *ctx = NULL; - dht_stat_time_t *time = 0; - int ret = -1; - - ret = dht_inode_ctx_get (inode, this, &ctx); + int ret = -1; + struct iatt iatt = { + 0, + }; + inode_t *linked_inode = NULL; + loc_t loc = { + 0, + }; + char *bname = NULL; + char *save_ptr = NULL; + static uuid_t gfid = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1}; + char *tmp_path = NULL; + + tmp_path = gf_strdup(path); + if (!tmp_path) { + goto out; + } + + gf_uuid_copy(loc.pargfid, gfid); + loc.parent = inode_ref(itable->root); + + bname = strtok_r(tmp_path, "/", &save_ptr); + + /* sending a lookup on parent directory, + * Eg: if path is like /a/b/c/d/e/f/g/ + * then we will send a lookup on a first and then b,c,d,etc + */ + + while (bname) { + linked_inode = NULL; + loc.inode = inode_grep(itable, loc.parent, bname); + if (loc.inode == NULL) { + loc.inode = inode_new(itable); + if (loc.inode == NULL) { + ret = -ENOMEM; + goto out; + } + } else { + /* + * Inode is already populated in the inode table. + * Which means we already looked up the inode and + * linked with a dentry. So that we will skip + * lookup on this entry, and proceed to next. + */ + linked_inode = loc.inode; + bname = strtok_r(NULL, "/", &save_ptr); + if (!bname) { + goto out; + } + inode_unref(loc.parent); + loc.parent = loc.inode; + gf_uuid_copy(loc.pargfid, loc.inode->gfid); + loc.inode = NULL; + continue; + } - if (ret) - return; + loc.name = bname; + ret = loc_path(&loc, bname); - time = &ctx->time; + ret = syncop_lookup(this, &loc, &iatt, NULL, NULL, NULL); + if (ret) { + gf_smsg(this->name, GF_LOG_INFO, -ret, DHT_MSG_DIR_SELFHEAL_FAILED, + "path=%s", path, "subvolume=%s", this->name, "bname=%s", + bname, NULL); + goto out; + } - time->mtime = stat->ia_mtime; - time->mtime_nsec = stat->ia_mtime_nsec; + linked_inode = inode_link(loc.inode, loc.parent, bname, &iatt); + if (!linked_inode) + goto out; - time->ctime = stat->ia_ctime; - time->ctime_nsec = stat->ia_ctime_nsec; + loc_wipe(&loc); + gf_uuid_copy(loc.pargfid, linked_inode->gfid); + loc.inode = NULL; - time->atime = stat->ia_atime; - time->atime_nsec = stat->ia_atime_nsec; + bname = strtok_r(NULL, "/", &save_ptr); + if (bname) + loc.parent = linked_inode; + } +out: + inode_ref(linked_inode); + loc_wipe(&loc); + GF_FREE(tmp_path); - return; + return linked_inode; } - int -dht_inode_ctx_time_update (inode_t *inode, xlator_t *this, struct iatt *stat, - int32_t post) +dht_heal_full_path(void *data) { - dht_inode_ctx_t *ctx = NULL; - dht_stat_time_t *time = 0; - int ret = -1; - - GF_VALIDATE_OR_GOTO (this->name, stat, out); - GF_VALIDATE_OR_GOTO (this->name, inode, out); + call_frame_t *heal_frame = data; + dht_local_t *local = NULL; + loc_t loc = { + 0, + }; + dict_t *dict = NULL; + char *path = NULL; + int ret = -1; + xlator_t *source = NULL; + xlator_t *this = NULL; + inode_table_t *itable = NULL; + inode_t *inode = NULL; + inode_t *tmp_inode = NULL; + + GF_VALIDATE_OR_GOTO("DHT", heal_frame, out); + + local = heal_frame->local; + this = heal_frame->this; + source = heal_frame->cookie; + heal_frame->cookie = NULL; + gf_uuid_copy(loc.gfid, local->gfid); + + if (local->loc.inode) + loc.inode = inode_ref(local->loc.inode); + else + goto out; + + itable = loc.inode->table; + ret = syncop_getxattr(source, &loc, &dict, GET_ANCESTRY_PATH_KEY, NULL, + NULL); + if (ret) { + gf_smsg(this->name, GF_LOG_INFO, -ret, DHT_MSG_DIR_HEAL_ABORT, + "subvol=%s", source->name, NULL); + goto out; + } + + ret = dict_get_str(dict, GET_ANCESTRY_PATH_KEY, &path); + if (path) { + inode = dht_heal_path(this, path, itable); + if (inode && inode != local->inode) { + /* + * if inode returned by heal function is different + * from what we passed, which means a racing thread + * already linked a different inode for dentry. + * So we will update our local->inode, so that we can + * retrurn proper inode. + */ + tmp_inode = local->inode; + local->inode = inode; + inode_unref(tmp_inode); + tmp_inode = NULL; + } else { + inode_unref(inode); + } + } - ret = dht_inode_ctx_get (inode, this, &ctx); +out: + loc_wipe(&loc); + if (dict) + dict_unref(dict); + return 0; +} +int +dht_heal_full_path_done(int op_ret, call_frame_t *heal_frame, void *data) +{ + call_frame_t *main_frame = NULL; + dht_local_t *local = NULL; + xlator_t *this = NULL; + int ret = -1; + int op_errno = 0; + + local = heal_frame->local; + main_frame = local->main_frame; + local->main_frame = NULL; + this = heal_frame->this; + + dht_set_fixed_dir_stat(&local->postparent); + if (local->need_xattr_heal) { + local->need_xattr_heal = 0; + ret = dht_dir_xattr_heal(this, local, &op_errno); if (ret) { - ctx = GF_CALLOC (1, sizeof (*ctx), gf_dht_mt_inode_ctx_t); - if (!ctx) - return -1; + gf_smsg(this->name, GF_LOG_ERROR, op_errno, + DHT_MSG_DIR_XATTR_HEAL_FAILED, "path=%s", local->loc.path, + NULL); } + } - time = &ctx->time; + DHT_STACK_UNWIND(lookup, main_frame, 0, 0, local->inode, &local->stbuf, + local->xattr, &local->postparent); - DHT_UPDATE_TIME(time->mtime, time->mtime_nsec, - stat->ia_mtime, stat->ia_mtime_nsec, inode, post); - DHT_UPDATE_TIME(time->ctime, time->ctime_nsec, - stat->ia_ctime, stat->ia_ctime_nsec, inode, post); - DHT_UPDATE_TIME(time->atime, time->atime_nsec, - stat->ia_atime, stat->ia_atime_nsec, inode, post); - - ret = dht_inode_ctx_set (inode, this, ctx); -out: - return 0; + DHT_STACK_DESTROY(heal_frame); + return 0; } +/* This function must be called inside an inode lock */ int -dht_inode_ctx_get (inode_t *inode, xlator_t *this, dht_inode_ctx_t **ctx) +__dht_lock_subvol_set(inode_t *inode, xlator_t *this, xlator_t *lock_subvol) { - int ret = -1; - uint64_t ctx_int = 0; + dht_inode_ctx_t *ctx = NULL; + int ret = -1; + uint64_t value = 0; - GF_VALIDATE_OR_GOTO ("dht", this, out); - GF_VALIDATE_OR_GOTO (this->name, inode, out); + GF_VALIDATE_OR_GOTO(this->name, inode, out); - ret = inode_ctx_get (inode, this, &ctx_int); - - if (ret) - return ret; + ret = __inode_ctx_get0(inode, this, &value); + if (ret || !value) { + return -1; + } - if (ctx) - *ctx = (dht_inode_ctx_t *) ctx_int; + ctx = (dht_inode_ctx_t *)(uintptr_t)value; + ctx->lock_subvol = lock_subvol; out: - return ret; + return ret; } -int dht_inode_ctx_set (inode_t *inode, xlator_t *this, dht_inode_ctx_t *ctx) +xlator_t * +dht_get_lock_subvolume(xlator_t *this, struct gf_flock *lock, + dht_local_t *local) { - int ret = -1; - uint64_t ctx_int = 0; - - GF_VALIDATE_OR_GOTO ("dht", this, out); - GF_VALIDATE_OR_GOTO (this->name, inode, out); - GF_VALIDATE_OR_GOTO (this->name, ctx, out); + xlator_t *subvol = NULL; + inode_t *inode = NULL; + int32_t ret = -1; + uint64_t value = 0; + xlator_t *cached_subvol = NULL; + dht_inode_ctx_t *ctx = NULL; + char gfid[GF_UUID_BUF_SIZE] = {0}; + + GF_VALIDATE_OR_GOTO(this->name, lock, out); + GF_VALIDATE_OR_GOTO(this->name, local, out); + + cached_subvol = local->cached_subvol; + + if (local->loc.inode || local->fd) { + inode = local->loc.inode ? local->loc.inode : local->fd->inode; + } + + if (!inode) + goto out; + + if (!(IA_ISDIR(inode->ia_type) || IA_ISINVAL(inode->ia_type))) { + /* + * We may get non-linked inode for directories as part + * of the selfheal code path. So checking for IA_INVAL + * type also. This will only happen for directory. + */ + subvol = local->cached_subvol; + goto out; + } + + if (lock->l_type != F_UNLCK) { + /* + * inode purging might happen on NFS between a lk + * and unlk. Due to this lk and unlk might be sent + * to different subvols. + * So during a lock request, taking a ref on inode + * to prevent inode purging. inode unref will happen + * in unlock cbk code path. + */ + inode_ref(inode); + } + + LOCK(&inode->lock); + ret = __inode_ctx_get0(inode, this, &value); + if (!ret && value) { + ctx = (dht_inode_ctx_t *)(uintptr_t)value; + subvol = ctx->lock_subvol; + } + if (!subvol && lock->l_type != F_UNLCK && cached_subvol) { + ret = __dht_lock_subvol_set(inode, this, cached_subvol); + if (ret) { + gf_uuid_unparse(inode->gfid, gfid); + UNLOCK(&inode->lock); + gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_SET_INODE_CTX_FAILED, + "lock_subvol gfid=%s", gfid, NULL); + goto post_unlock; + } + subvol = cached_subvol; + } + UNLOCK(&inode->lock); +post_unlock: + if (!subvol && inode && lock->l_type != F_UNLCK) { + inode_unref(inode); + } +out: + return subvol; +} - ctx_int = (long)ctx; - ret = inode_ctx_set (inode, this, &ctx_int); +int +dht_lk_inode_unref(call_frame_t *frame, int32_t op_ret) +{ + int ret = -1; + dht_local_t *local = NULL; + inode_t *inode = NULL; + xlator_t *this = NULL; + char gfid[GF_UUID_BUF_SIZE] = {0}; + + local = frame->local; + this = frame->this; + + if (local->loc.inode || local->fd) { + inode = local->loc.inode ? local->loc.inode : local->fd->inode; + } + if (!inode) { + gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_LOCK_INODE_UNREF_FAILED, + NULL); + goto out; + } + + if (!(IA_ISDIR(inode->ia_type) || IA_ISINVAL(inode->ia_type))) { + ret = 0; + goto out; + } + + switch (local->lock_type) { + case F_RDLCK: + case F_WRLCK: + if (op_ret) { + gf_uuid_unparse(inode->gfid, gfid); + gf_msg_debug(this->name, 0, "lock request failed for gfid %s", + gfid); + inode_unref(inode); + goto out; + } + break; + + case F_UNLCK: + if (!op_ret) { + inode_unref(inode); + } else { + gf_uuid_unparse(inode->gfid, gfid); + gf_smsg(this->name, GF_LOG_WARNING, 0, + DHT_MSG_LOCK_INODE_UNREF_FAILED, "gfid=%s", gfid, NULL); + goto out; + } + default: + break; + } + ret = 0; out: - return ret; + return ret; +} + +/* Code to update custom extended attributes from src dict to dst dict + */ +void +dht_dir_set_heal_xattr(xlator_t *this, dht_local_t *local, dict_t *dst, + dict_t *src, int *uret, int *uflag) +{ + int ret = -1; + data_t *keyval = NULL; + int luret = -1; + int luflag = -1; + int i = 0; + char **xattrs_to_heal; + + if (!src || !dst) { + gf_smsg(this->name, GF_LOG_WARNING, EINVAL, DHT_MSG_DST_NULL_SET_FAILED, + "path=%s", local->loc.path, NULL); + return; + } + /* Check if any user xattr present in src dict and set + it to dst dict + */ + luret = dict_foreach_fnmatch(src, "user.*", dht_set_user_xattr, dst); + /* Check if any other custom xattr present in src dict + and set it to dst dict, here index start from 1 because + user xattr already checked in previous statement + */ + + xattrs_to_heal = get_xattrs_to_heal(); + + for (i = 1; xattrs_to_heal[i]; i++) { + keyval = dict_get(src, xattrs_to_heal[i]); + if (keyval) { + luflag = 1; + ret = dict_set(dst, xattrs_to_heal[i], keyval); + if (ret) + gf_smsg(this->name, GF_LOG_WARNING, ENOMEM, + DHT_MSG_DICT_SET_FAILED, "key=%s", xattrs_to_heal[i], + "path=%s", local->loc.path, NULL); + keyval = NULL; + } + } + if (uret) + (*uret) = luret; + if (uflag) + (*uflag) = luflag; } diff --git a/xlators/cluster/dht/src/dht-inode-read.c b/xlators/cluster/dht/src/dht-inode-read.c index e8a9a7196bf..dbb8070b0da 100644 --- a/xlators/cluster/dht/src/dht-inode-read.c +++ b/xlators/cluster/dht/src/dht-inode-read.c @@ -8,1132 +8,1651 @@ cases as published by the Free Software Foundation. */ -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - #include "dht-common.h" -int dht_access2 (xlator_t *this, call_frame_t *frame, int ret); -int dht_readv2 (xlator_t *this, call_frame_t *frame, int ret); -int dht_attr2 (xlator_t *this, call_frame_t *frame, int ret); -int dht_open2 (xlator_t *this, call_frame_t *frame, int ret); -int dht_flush2 (xlator_t *this, call_frame_t *frame, int ret); -int dht_lk2 (xlator_t *this, call_frame_t *frame, int ret); -int dht_fsync2 (xlator_t *this, call_frame_t *frame, int ret); - -int -dht_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, fd_t *fd, dict_t *xdata) +static int +dht_access2(xlator_t *this, xlator_t *dst_node, call_frame_t *frame, int ret); +static int +dht_readv2(xlator_t *this, xlator_t *dst_node, call_frame_t *frame, int ret); +static int +dht_attr2(xlator_t *this, xlator_t *dst_node, call_frame_t *frame, int ret); +static int +dht_open2(xlator_t *this, xlator_t *dst_node, call_frame_t *frame, int ret); +static int +dht_flush2(xlator_t *this, xlator_t *dst_node, call_frame_t *frame, int ret); +static int +dht_lk2(xlator_t *this, xlator_t *dst_node, call_frame_t *frame, int ret); +static int +dht_fsync2(xlator_t *this, xlator_t *dst_node, call_frame_t *frame, int ret); +static int +dht_common_xattrop2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, + int ret); + +static int +dht_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, fd_t *fd, dict_t *xdata) { - dht_local_t *local = NULL; - call_frame_t *prev = NULL; - int ret = 0; - - local = frame->local; - prev = cookie; - - local->op_errno = op_errno; - if ((op_ret == -1) && !dht_inode_missing(op_errno)) { - gf_log (this->name, GF_LOG_DEBUG, - "subvolume %s returned -1 (%s)", - prev->this->name, strerror (op_errno)); - goto out; - } - - if (!op_ret || (local->call_cnt != 1)) - goto out; - - /* rebalance would have happened */ - local->rebalance.target_op_fn = dht_open2; - ret = dht_rebalance_complete_check (this, frame); - if (!ret) - return 0; + dht_local_t *local = NULL; + xlator_t *prev = NULL; + int ret = 0; + + local = frame->local; + prev = cookie; + + local->op_errno = op_errno; + if ((op_ret == -1) && !dht_inode_missing(op_errno)) { + gf_msg_debug(this->name, op_errno, "subvolume %s returned -1", + prev->name); + goto out; + } + + /* Update ctx if the fd has been opened on the target*/ + if (!op_ret && (local->call_cnt == 1)) { + dht_fd_ctx_set(this, fd, prev); + goto out; + } + + if (!op_ret || (local->call_cnt != 1)) + goto out; + + /* rebalance would have happened */ + local->rebalance.target_op_fn = dht_open2; + ret = dht_rebalance_complete_check(this, frame); + if (!ret) + return 0; out: - DHT_STACK_UNWIND (open, frame, op_ret, op_errno, local->fd, xdata); + DHT_STACK_UNWIND(open, frame, op_ret, op_errno, local->fd, xdata); - return 0; + return 0; } -int -dht_open2 (xlator_t *this, call_frame_t *frame, int op_ret) +static int +dht_open2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret) { - dht_local_t *local = NULL; - xlator_t *subvol = NULL; - int op_errno = EINVAL; - - local = frame->local; - if (!local) - goto out; + dht_local_t *local = NULL; + int op_errno = EINVAL; - op_errno = ENOENT; - if (op_ret) - goto out; + if (!frame || !frame->local) + goto out; - local->call_cnt = 2; - subvol = local->cached_subvol; + local = frame->local; + op_errno = local->op_errno; - STACK_WIND (frame, dht_open_cbk, subvol, subvol->fops->open, - &local->loc, local->rebalance.flags, local->fd, - NULL); + if (we_are_not_migrating(ret)) { + /* This DHT layer is not migrating the file */ + DHT_STACK_UNWIND(open, frame, -1, local->op_errno, NULL, + local->rebalance.xdata); return 0; + } + + if (subvol == NULL) + goto out; + + local->call_cnt = 2; + + STACK_WIND_COOKIE(frame, dht_open_cbk, subvol, subvol, subvol->fops->open, + &local->loc, local->rebalance.flags, local->fd, + local->xattr_req); + return 0; out: - DHT_STACK_UNWIND (stat, frame, -1, op_errno, NULL, NULL); - return 0; + DHT_STACK_UNWIND(open, frame, -1, op_errno, NULL, NULL); + return 0; } int -dht_open (call_frame_t *frame, xlator_t *this, - loc_t *loc, int flags, fd_t *fd, dict_t *xdata) +dht_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, fd_t *fd, + dict_t *xdata) { - xlator_t *subvol = NULL; - int op_errno = -1; - dht_local_t *local = NULL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); - - local = dht_local_init (frame, loc, fd, GF_FOP_OPEN); - if (!local) { - op_errno = ENOMEM; - goto err; - } + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; - subvol = local->cached_subvol; - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for fd=%p", fd); - op_errno = EINVAL; - goto err; - } + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(fd, err); - local->rebalance.flags = flags; - local->call_cnt = 1; + local = dht_local_init(frame, loc, fd, GF_FOP_OPEN); + if (!local) { + op_errno = ENOMEM; + goto err; + } - STACK_WIND (frame, dht_open_cbk, subvol, subvol->fops->open, - loc, flags, fd, xdata); + subvol = local->cached_subvol; + if (!subvol) { + gf_msg_debug(this->name, 0, "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + if (xdata) + local->xattr_req = dict_ref(xdata); - return 0; + local->rebalance.flags = flags; + local->call_cnt = 1; + + STACK_WIND_COOKIE(frame, dht_open_cbk, subvol, subvol, subvol->fops->open, + loc, flags, fd, xdata); + + return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (open, frame, -1, op_errno, NULL, NULL); + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND(open, frame, -1, op_errno, NULL, NULL); - return 0; + return 0; } int -dht_file_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, struct iatt *stbuf, dict_t *xdata) +dht_file_attr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct iatt *stbuf, dict_t *xdata) { - xlator_t *subvol = 0; - dht_local_t *local = NULL; - call_frame_t *prev = NULL; - int ret = -1; - inode_t *inode = NULL; - - GF_VALIDATE_OR_GOTO ("dht", frame, err); - GF_VALIDATE_OR_GOTO ("dht", this, out); - GF_VALIDATE_OR_GOTO ("dht", frame->local, out); - GF_VALIDATE_OR_GOTO ("dht", cookie, out); - - local = frame->local; - prev = cookie; - - if ((op_ret == -1) && !dht_inode_missing(op_errno)) { - local->op_errno = op_errno; - gf_log (this->name, GF_LOG_DEBUG, - "subvolume %s returned -1 (%s)", - prev->this->name, strerror (op_errno)); - goto out; - } - - if (local->call_cnt != 1) - goto out; + xlator_t *subvol1 = 0; + xlator_t *subvol2 = 0; + dht_local_t *local = NULL; + xlator_t *prev = NULL; + int ret = -1; + inode_t *inode = NULL; + + GF_VALIDATE_OR_GOTO("dht", frame, err); + GF_VALIDATE_OR_GOTO("dht", this, out); + GF_VALIDATE_OR_GOTO("dht", frame->local, out); + GF_VALIDATE_OR_GOTO("dht", cookie, out); + + local = frame->local; + prev = cookie; + + if ((local->fop == GF_FOP_FSTAT) && + dht_check_remote_fd_failed_error(local, op_ret, op_errno)) { + ret = dht_check_and_open_fd_on_subvol(this, frame); + if (ret) + goto out; + return 0; + } + if ((op_ret == -1) && !dht_inode_missing(op_errno)) { local->op_errno = op_errno; - /* Check if the rebalance phase2 is true */ - if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (stbuf)) { - inode = (local->fd) ? local->fd->inode : local->loc.inode; - ret = dht_inode_ctx_get1 (this, inode, &subvol); - if (!subvol) { - /* Phase 2 of migration */ - local->rebalance.target_op_fn = dht_attr2; - ret = dht_rebalance_complete_check (this, frame); - if (!ret) - return 0; - } else { - /* value is already set in fd_ctx, that means no need - to check for whether its complete or not. */ - dht_attr2 (this, frame, 0); - return 0; - } + gf_msg_debug(this->name, op_errno, "subvolume %s returned -1", + prev->name); + goto out; + } + + if (local->call_cnt != 1) + goto out; + + local->op_errno = op_errno; + local->op_ret = op_ret; + + /* Check if the rebalance phase2 is true */ + if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2(stbuf)) { + local->rebalance.target_op_fn = dht_attr2; + dht_set_local_rebalance(this, local, NULL, NULL, stbuf, xdata); + inode = (local->fd) ? local->fd->inode : local->loc.inode; + + dht_inode_ctx_get_mig_info(this, inode, &subvol1, &subvol2); + if (dht_mig_info_is_invalid(local->cached_subvol, subvol1, subvol2)) { + /* Phase 2 of migration */ + ret = dht_rebalance_complete_check(this, frame); + if (!ret) + return 0; + } else { + /* it is a non-fd op or it is an fd based Fop and + opened on the dst.*/ + if (local->fd && !dht_fd_open_on_dst(this, local->fd, subvol2)) { + ret = dht_rebalance_complete_check(this, frame); + if (!ret) + return 0; + } else { + dht_attr2(this, subvol2, frame, 0); + return 0; + } } + } out: - DHT_STRIP_PHASE1_FLAGS (stbuf); - DHT_STACK_UNWIND (stat, frame, op_ret, op_errno, stbuf, xdata); + DHT_STRIP_PHASE1_FLAGS(stbuf); + DHT_STACK_UNWIND(stat, frame, op_ret, op_errno, stbuf, xdata); err: - return 0; + return 0; } -int -dht_attr2 (xlator_t *this, call_frame_t *frame, int op_ret) +static int +dht_attr2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret) { - dht_local_t *local = NULL; - xlator_t *subvol = NULL; - int op_errno = EINVAL; + dht_local_t *local = NULL; + int op_errno = EINVAL; + + local = frame->local; + if (!local) + goto out; + + op_errno = local->op_errno; + + if (we_are_not_migrating(ret)) { + /* This dht xlator is not migrating the file. Unwind and + * pass on the original mode bits so the higher DHT layer + * can handle this. + */ + DHT_STACK_UNWIND(stat, frame, local->op_ret, op_errno, + &local->rebalance.postbuf, local->rebalance.xdata); + return 0; + } - local = frame->local; - if (!local) - goto out; + if (subvol == NULL) + goto out; - op_errno = local->op_errno; - if (op_ret == -1) - goto out; + local->call_cnt = 2; - subvol = local->cached_subvol; - local->call_cnt = 2; + if (local->fop == GF_FOP_FSTAT) { + STACK_WIND_COOKIE(frame, dht_file_attr_cbk, subvol, subvol, + subvol->fops->fstat, local->fd, local->xattr_req); + } else { + STACK_WIND_COOKIE(frame, dht_file_attr_cbk, subvol, subvol, + subvol->fops->stat, &local->loc, local->xattr_req); + } - if (local->fop == GF_FOP_FSTAT) { - STACK_WIND (frame, dht_file_attr_cbk, subvol, - subvol->fops->fstat, local->fd, NULL); - } else { - STACK_WIND (frame, dht_file_attr_cbk, subvol, - subvol->fops->stat, &local->loc, NULL); - } - return 0; + return 0; out: - DHT_STACK_UNWIND (stat, frame, -1, op_errno, NULL, NULL); - return 0; + DHT_STACK_UNWIND(stat, frame, -1, op_errno, NULL, NULL); + return 0; +} + +static int +dht_attr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct iatt *stbuf, dict_t *xdata) +{ + dht_local_t *local = NULL; + int this_call_cnt = 0; + xlator_t *prev = NULL; + local = frame->local; + prev = cookie; + + LOCK(&frame->lock); + { + if (op_ret == -1) { + local->op_errno = op_errno; + UNLOCK(&frame->lock); + gf_msg_debug(this->name, op_errno, "subvolume %s returned -1", + prev->name); + + goto post_unlock; + } + + dht_iatt_merge(this, &local->stbuf, stbuf); + + local->op_ret = 0; + } + UNLOCK(&frame->lock); +post_unlock: + this_call_cnt = dht_frame_return(frame); + if (is_last_call(this_call_cnt)) { + DHT_STACK_UNWIND(stat, frame, local->op_ret, local->op_errno, + &local->stbuf, xdata); + } + + return 0; } int -dht_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, struct iatt *stbuf, dict_t *xdata) +dht_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { - dht_local_t *local = NULL; - int this_call_cnt = 0; - call_frame_t *prev = NULL; + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + dht_layout_t *layout = NULL; + int i = 0; + int call_cnt = 0; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(loc, err); + VALIDATE_OR_GOTO(loc->inode, err); + VALIDATE_OR_GOTO(loc->path, err); + + local = dht_local_init(frame, loc, NULL, GF_FOP_STAT); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + layout = local->layout; + if (!layout) { + gf_msg_debug(this->name, 0, "no layout for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + if (xdata) + local->xattr_req = dict_ref(xdata); + + if (IA_ISREG(loc->inode->ia_type)) { + local->call_cnt = 1; - GF_VALIDATE_OR_GOTO ("dht", frame, err); - GF_VALIDATE_OR_GOTO ("dht", this, out); - GF_VALIDATE_OR_GOTO ("dht", frame->local, out); - GF_VALIDATE_OR_GOTO ("dht", cookie, out); + subvol = local->cached_subvol; - local = frame->local; - prev = cookie; + STACK_WIND_COOKIE(frame, dht_file_attr_cbk, subvol, subvol, + subvol->fops->stat, loc, xdata); - LOCK (&frame->lock); - { - if (op_ret == -1) { - local->op_errno = op_errno; - gf_log (this->name, GF_LOG_DEBUG, - "subvolume %s returned -1 (%s)", - prev->this->name, strerror (op_errno)); + return 0; + } - goto unlock; - } + local->call_cnt = call_cnt = layout->cnt; - dht_iatt_merge (this, &local->stbuf, stbuf, prev->this); + for (i = 0; i < call_cnt; i++) { + subvol = layout->list[i].xlator; + + STACK_WIND_COOKIE(frame, dht_attr_cbk, subvol, subvol, + subvol->fops->stat, loc, xdata); + } + + return 0; - local->op_ret = 0; - } -unlock: - UNLOCK (&frame->lock); -out: - this_call_cnt = dht_frame_return (frame); - if (is_last_call (this_call_cnt)) { - DHT_STACK_UNWIND (stat, frame, local->op_ret, local->op_errno, - &local->stbuf, xdata); - } err: - return 0; + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND(stat, frame, -1, op_errno, NULL, NULL); + + return 0; } int -dht_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +dht_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { - xlator_t *subvol = NULL; - int op_errno = -1; - dht_local_t *local = NULL; - dht_layout_t *layout = NULL; - int i = 0; - int call_cnt = 0; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); - - - local = dht_local_init (frame, loc, NULL, GF_FOP_STAT); - if (!local) { - op_errno = ENOMEM; - goto err; - } + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + dht_layout_t *layout = NULL; + int i = 0; + int call_cnt = 0; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(fd, err); + + local = dht_local_init(frame, NULL, fd, GF_FOP_FSTAT); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + layout = local->layout; + if (!layout) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, "no layout for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + if (xdata) + local->xattr_req = dict_ref(xdata); + + if (IA_ISREG(fd->inode->ia_type)) { + local->call_cnt = 1; - layout = local->layout; - if (!layout) { - gf_log (this->name, GF_LOG_DEBUG, - "no layout for path=%s", loc->path); - op_errno = EINVAL; - goto err; - } + subvol = local->cached_subvol; - if (IA_ISREG (loc->inode->ia_type)) { - local->call_cnt = 1; + STACK_WIND_COOKIE(frame, dht_file_attr_cbk, subvol, subvol, + subvol->fops->fstat, fd, xdata); + return 0; + } + + local->call_cnt = call_cnt = layout->cnt; + + for (i = 0; i < call_cnt; i++) { + subvol = layout->list[i].xlator; + STACK_WIND_COOKIE(frame, dht_attr_cbk, subvol, subvol, + subvol->fops->fstat, fd, xdata); + } - subvol = local->cached_subvol; + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND(fstat, frame, -1, op_errno, NULL, NULL); - STACK_WIND (frame, dht_file_attr_cbk, subvol, - subvol->fops->stat, loc, xdata); + return 0; +} +int +dht_readv_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct iovec *vector, int count, struct iatt *stbuf, + struct iobref *iobref, dict_t *xdata) +{ + dht_local_t *local = NULL; + int ret = 0; + xlator_t *src_subvol = 0; + xlator_t *dst_subvol = 0; + + local = frame->local; + if (!local) { + op_ret = -1; + op_errno = EINVAL; + goto out; + } + + /* This is already second try, no need for re-check */ + if (local->call_cnt != 1) + goto out; + + if (dht_check_remote_fd_failed_error(local, op_ret, op_errno)) { + ret = dht_check_and_open_fd_on_subvol(this, frame); + if (ret) + goto out; + return 0; + } + + if ((op_ret == -1) && !dht_inode_missing(op_errno)) + goto out; + + local->op_errno = op_errno; + if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2(stbuf)) { + local->op_ret = op_ret; + local->rebalance.target_op_fn = dht_readv2; + dht_set_local_rebalance(this, local, NULL, NULL, stbuf, xdata); + /* File would be migrated to other node */ + ret = dht_inode_ctx_get_mig_info(this, local->fd->inode, &src_subvol, + &dst_subvol); + + if (dht_mig_info_is_invalid(local->cached_subvol, src_subvol, + dst_subvol) || + !dht_fd_open_on_dst(this, local->fd, dst_subvol)) { + ret = dht_rebalance_complete_check(this, frame); + if (!ret) return 0; + } else { + /* value is already set in fd_ctx, that means no need + to check for whether its complete or not. */ + dht_readv2(this, dst_subvol, frame, 0); + return 0; } + } - local->call_cnt = call_cnt = layout->cnt; +out: + DHT_STRIP_PHASE1_FLAGS(stbuf); - for (i = 0; i < call_cnt; i++) { - subvol = layout->list[i].xlator; + DHT_STACK_UNWIND(readv, frame, op_ret, op_errno, vector, count, stbuf, + iobref, xdata); - STACK_WIND (frame, dht_attr_cbk, - subvol, subvol->fops->stat, - loc, xdata); - } + return 0; +} +static int +dht_readv2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret) +{ + dht_local_t *local = NULL; + int op_errno = EINVAL; + + local = frame->local; + if (!local) + goto out; + + op_errno = local->op_errno; + + if (we_are_not_migrating(ret)) { + /* This dht xlator is not migrating the file. Unwind and + * pass on the original mode bits so the higher DHT layer + * can handle this. + */ + DHT_STACK_UNWIND(readv, frame, local->op_ret, op_errno, NULL, 0, + &local->rebalance.postbuf, NULL, + local->rebalance.xdata); return 0; + } -err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (stat, frame, -1, op_errno, NULL, NULL); + if (subvol == NULL) + goto out; - return 0; -} + local->call_cnt = 2; + + STACK_WIND(frame, dht_readv_cbk, subvol, subvol->fops->readv, local->fd, + local->rebalance.size, local->rebalance.offset, + local->rebalance.flags, local->xattr_req); + return 0; + +out: + DHT_STACK_UNWIND(readv, frame, -1, op_errno, NULL, 0, NULL, NULL, NULL); + return 0; +} int -dht_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +dht_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, off_t off, + uint32_t flags, dict_t *xdata) { - xlator_t *subvol = NULL; - int op_errno = -1; - dht_local_t *local = NULL; - dht_layout_t *layout = NULL; - int i = 0; - int call_cnt = 0; - - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); - - local = dht_local_init (frame, NULL, fd, GF_FOP_FSTAT); - if (!local) { - op_errno = ENOMEM; - goto err; - } + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; - layout = local->layout; - if (!layout) { - gf_log (this->name, GF_LOG_DEBUG, - "no layout for fd=%p", fd); - op_errno = EINVAL; - goto err; - } + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(fd, err); - if (IA_ISREG (fd->inode->ia_type)) { - local->call_cnt = 1; + local = dht_local_init(frame, NULL, fd, GF_FOP_READ); + if (!local) { + op_errno = ENOMEM; + goto err; + } - subvol = local->cached_subvol; + subvol = local->cached_subvol; + if (!subvol) { + gf_msg_debug(this->name, 0, "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } - STACK_WIND (frame, dht_file_attr_cbk, subvol, - subvol->fops->fstat, fd, xdata); - - return 0; - } + if (xdata) + local->xattr_req = dict_ref(xdata); - local->call_cnt = call_cnt = layout->cnt; + local->rebalance.offset = off; + local->rebalance.size = size; + local->rebalance.flags = flags; + local->call_cnt = 1; - for (i = 0; i < call_cnt; i++) { - subvol = layout->list[i].xlator; - STACK_WIND (frame, dht_attr_cbk, - subvol, subvol->fops->fstat, - fd, xdata); - } + STACK_WIND(frame, dht_readv_cbk, subvol, subvol->fops->readv, local->fd, + local->rebalance.size, local->rebalance.offset, + local->rebalance.flags, local->xattr_req); - return 0; + return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (fstat, frame, -1, op_errno, NULL, NULL); + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND(readv, frame, -1, op_errno, NULL, 0, NULL, NULL, NULL); - return 0; + return 0; } -int -dht_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, - struct iovec *vector, int count, struct iatt *stbuf, - struct iobref *iobref, dict_t *xdata) +static int +dht_access_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, dict_t *xdata) { - dht_local_t *local = NULL; - int ret = 0; - inode_t *inode = NULL; - xlator_t *subvol = 0; - - local = frame->local; - if (!local) { - op_ret = -1; - op_errno = EINVAL; - goto out; - } - - /* This is already second try, no need for re-check */ - if (local->call_cnt != 1) - goto out; + int ret = -1; + dht_local_t *local = NULL; + xlator_t *subvol = NULL; + xlator_t *prev = NULL; + + local = frame->local; + prev = cookie; + + if (!prev) + goto out; + if (local->call_cnt != 1) + goto out; + if ((op_ret == -1) && + ((op_errno == ENOTCONN) || dht_inode_missing(op_errno)) && + IA_ISDIR(local->loc.inode->ia_type)) { + subvol = dht_subvol_next_available(this, prev); + if (!subvol) + goto out; - if ((op_ret == -1) && !dht_inode_missing(op_errno)) - goto out; + /* check if we are done with visiting every node */ + if (subvol == local->cached_subvol) { + goto out; + } + STACK_WIND_COOKIE(frame, dht_access_cbk, subvol, subvol, + subvol->fops->access, &local->loc, + local->rebalance.flags, NULL); + return 0; + } + if ((op_ret == -1) && dht_inode_missing(op_errno) && + !(IA_ISDIR(local->loc.inode->ia_type))) { + /* File would be migrated to other node */ local->op_errno = op_errno; - if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (stbuf)) { - /* File would be migrated to other node */ - ret = dht_inode_ctx_get1 (this, inode, &subvol); - if (!subvol) { - local->rebalance.target_op_fn = dht_readv2; - ret = dht_rebalance_complete_check (this, frame); - if (!ret) - return 0; - } else { - /* value is already set in fd_ctx, that means no need - to check for whether its complete or not. */ - dht_readv2 (this, frame, 0); - return 0; - } - } + local->rebalance.target_op_fn = dht_access2; + ret = dht_rebalance_complete_check(frame->this, frame); + if (!ret) + return 0; + } out: - DHT_STRIP_PHASE1_FLAGS (stbuf); - DHT_STACK_UNWIND (readv, frame, op_ret, op_errno, vector, count, stbuf, - iobref, xdata); - - return 0; + DHT_STACK_UNWIND(access, frame, op_ret, op_errno, xdata); + return 0; } -int -dht_readv2 (xlator_t *this, call_frame_t *frame, int op_ret) +static int +dht_access2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret) { - dht_local_t *local = NULL; - xlator_t *subvol = NULL; - int op_errno = EINVAL; + dht_local_t *local = NULL; + int op_errno = EINVAL; - local = frame->local; - if (!local) - goto out; + local = frame->local; + if (!local) + goto out; - op_errno = local->op_errno; - if (op_ret == -1) - goto out; + op_errno = local->op_errno; - local->call_cnt = 2; - subvol = local->cached_subvol; - - STACK_WIND (frame, dht_readv_cbk, subvol, subvol->fops->readv, - local->fd, local->rebalance.size, local->rebalance.offset, - local->rebalance.flags, NULL); + if (we_are_not_migrating(ret)) { + /* This dht xlator is not migrating the file. Unwind and + * pass on the original mode bits so the higher DHT layer + * can handle this. + */ + DHT_STACK_UNWIND(access, frame, -1, op_errno, NULL); return 0; + } + + if (subvol == NULL) + goto out; + + local->call_cnt = 2; + + STACK_WIND_COOKIE(frame, dht_access_cbk, subvol, subvol, + subvol->fops->access, &local->loc, local->rebalance.flags, + local->xattr_req); + + return 0; out: - DHT_STACK_UNWIND (readv, frame, -1, op_errno, NULL, 0, NULL, NULL, NULL); - return 0; + DHT_STACK_UNWIND(access, frame, -1, op_errno, NULL); + return 0; } int -dht_readv (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t off, uint32_t flags, dict_t *xdata) +dht_access(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask, + dict_t *xdata) { - xlator_t *subvol = NULL; - int op_errno = -1; - dht_local_t *local = NULL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); - - local = dht_local_init (frame, NULL, fd, GF_FOP_READ); - if (!local) { - op_errno = ENOMEM; - goto err; - } + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(loc, err); + VALIDATE_OR_GOTO(loc->inode, err); + VALIDATE_OR_GOTO(loc->path, err); + + local = dht_local_init(frame, loc, NULL, GF_FOP_ACCESS); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + local->rebalance.flags = mask; + local->call_cnt = 1; + subvol = local->cached_subvol; + if (!subvol) { + gf_msg_debug(this->name, 0, "no cached subvolume for path=%s", + loc->path); + op_errno = EINVAL; + goto err; + } + if (xdata) + local->xattr_req = dict_ref(xdata); + + STACK_WIND_COOKIE(frame, dht_access_cbk, subvol, subvol, + subvol->fops->access, loc, mask, xdata); + + return 0; - subvol = local->cached_subvol; - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for fd=%p", fd); - op_errno = EINVAL; - goto err; - } +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND(access, frame, -1, op_errno, NULL); - local->rebalance.offset = off; - local->rebalance.size = size; - local->rebalance.flags = flags; - local->call_cnt = 1; + return 0; +} + +int +dht_flush_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, dict_t *xdata) +{ + dht_local_t *local = NULL; + xlator_t *subvol = 0; + int ret = 0; + + local = frame->local; - STACK_WIND (frame, dht_readv_cbk, - subvol, subvol->fops->readv, - fd, size, off, flags, xdata); + local->op_errno = op_errno; + if (local->call_cnt != 1) + goto out; + + if (dht_check_remote_fd_failed_error(local, op_ret, op_errno)) { + ret = dht_check_and_open_fd_on_subvol(this, frame); + if (ret) + goto out; return 0; + } -err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (readv, frame, -1, op_errno, NULL, 0, NULL, NULL, NULL); + local->rebalance.target_op_fn = dht_flush2; + local->op_ret = op_ret; + local->op_errno = op_errno; + + /* If context is set, then send flush() it to the destination */ + dht_inode_ctx_get_mig_info(this, local->fd->inode, NULL, &subvol); + if (subvol && dht_fd_open_on_dst(this, local->fd, subvol)) { + dht_flush2(this, subvol, frame, 0); return 0; -} + } -int -dht_access_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, dict_t *xdata) -{ - int ret = -1; - dht_local_t *local = NULL; - xlator_t *subvol = NULL; - call_frame_t *prev = NULL; - - local = frame->local; - prev = cookie; - - if (!prev || !prev->this) - goto out; - if (local->call_cnt != 1) - goto out; - if ((op_ret == -1) && (op_errno == ENOTCONN) && - IA_ISDIR(local->loc.inode->ia_type)) { - - subvol = dht_subvol_next_available (this, prev->this); - if (!subvol) - goto out; - - /* check if we are done with visiting every node */ - if (subvol == local->cached_subvol) { - goto out; - } - - STACK_WIND (frame, dht_access_cbk, subvol, subvol->fops->access, - &local->loc, local->rebalance.flags, NULL); - return 0; - } - if ((op_ret == -1) && dht_inode_missing(op_errno)) { - /* File would be migrated to other node */ - local->op_errno = op_errno; - local->rebalance.target_op_fn = dht_access2; - ret = dht_rebalance_complete_check (frame->this, frame); - if (!ret) - return 0; + if (op_errno == EREMOTE) { + ret = dht_rebalance_complete_check(this, frame); + if (!ret) { + return 0; } + } out: - DHT_STACK_UNWIND (access, frame, op_ret, op_errno, xdata); - return 0; + DHT_STACK_UNWIND(flush, frame, op_ret, op_errno, xdata); + + return 0; } -int -dht_access2 (xlator_t *this, call_frame_t *frame, int op_ret) +static int +dht_flush2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret) { - dht_local_t *local = NULL; - xlator_t *subvol = NULL; - int op_errno = EINVAL; + dht_local_t *local = NULL; + int32_t op_errno = EINVAL; - local = frame->local; - if (!local) - goto out; + if ((frame == NULL) || (frame->local == NULL)) + goto out; - op_errno = local->op_errno; - if (op_ret == -1) - goto out; + local = frame->local; - local->call_cnt = 2; - subvol = local->cached_subvol; + op_errno = local->op_errno; - STACK_WIND (frame, dht_access_cbk, subvol, subvol->fops->access, - &local->loc, local->rebalance.flags, NULL); + if (subvol == NULL) + goto out; - return 0; + local->call_cnt = 2; /* This is the second attempt */ + + STACK_WIND(frame, dht_flush_cbk, subvol, subvol->fops->flush, local->fd, + local->xattr_req); + + return 0; out: - DHT_STACK_UNWIND (access, frame, -1, op_errno, NULL); - return 0; + DHT_STACK_UNWIND(flush, frame, -1, op_errno, NULL); + return 0; } - int -dht_access (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask, - dict_t *xdata) +dht_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { - xlator_t *subvol = NULL; - int op_errno = -1; - dht_local_t *local = NULL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); - - local = dht_local_init (frame, loc, NULL, GF_FOP_ACCESS); - if (!local) { - op_errno = ENOMEM; - goto err; - } + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; - local->rebalance.flags = mask; - local->call_cnt = 1; - subvol = local->cached_subvol; - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for path=%s", loc->path); - op_errno = EINVAL; - goto err; - } + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(fd, err); - STACK_WIND (frame, dht_access_cbk, subvol, subvol->fops->access, - loc, mask, xdata); + local = dht_local_init(frame, NULL, fd, GF_FOP_FLUSH); + if (!local) { + op_errno = ENOMEM; + goto err; + } - return 0; + subvol = local->cached_subvol; + if (!subvol) { + gf_msg_debug(this->name, 0, "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + if (xdata) + local->xattr_req = dict_ref(xdata); + + local->call_cnt = 1; + + STACK_WIND(frame, dht_flush_cbk, subvol, subvol->fops->flush, fd, + local->xattr_req); + return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (access, frame, -1, op_errno, NULL); + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND(flush, frame, -1, op_errno, NULL); - return 0; + return 0; } - int -dht_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, dict_t *xdata) +dht_fsync_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct iatt *prebuf, struct iatt *postbuf, + dict_t *xdata) { - dht_local_t *local = NULL; - inode_t *inode = NULL; - xlator_t *subvol = 0; + dht_local_t *local = NULL; + xlator_t *prev = NULL; + int ret = -1; + inode_t *inode = NULL; + xlator_t *src_subvol = 0; + xlator_t *dst_subvol = 0; + + local = frame->local; + prev = cookie; + + local->op_errno = op_errno; + + if (dht_check_remote_fd_failed_error(local, op_ret, op_errno)) { + ret = dht_check_and_open_fd_on_subvol(this, frame); + if (ret) + goto out; + return 0; + } + + if (op_ret == -1 && !dht_inode_missing(op_errno)) { + gf_msg_debug(this->name, op_errno, "subvolume %s returned -1", + prev->name); + goto out; + } + + if (local->call_cnt != 1) { + if (local->stbuf.ia_blocks) { + dht_iatt_merge(this, postbuf, &local->stbuf); + dht_iatt_merge(this, prebuf, &local->prebuf); + } + goto out; + } - local = frame->local; + local->op_ret = op_ret; + inode = local->fd->inode; - local->op_errno = op_errno; + local->rebalance.target_op_fn = dht_fsync2; + dht_set_local_rebalance(this, local, NULL, prebuf, postbuf, xdata); - if (local->call_cnt != 1) - goto out; + if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2(postbuf)) { + ret = dht_rebalance_complete_check(this, frame); + if (!ret) + return 0; + } + + /* Check if the rebalance phase1 is true */ + if (IS_DHT_MIGRATION_PHASE1(postbuf)) { + dht_iatt_merge(this, &local->stbuf, postbuf); + dht_iatt_merge(this, &local->prebuf, prebuf); + + dht_inode_ctx_get_mig_info(this, inode, &src_subvol, &dst_subvol); - /* If context is set, then send flush() it to the destination */ - dht_inode_ctx_get1 (this, inode, &subvol); - if (subvol) { - dht_flush2 (this, frame, 0); + if (dht_mig_info_is_invalid(local->cached_subvol, src_subvol, + dst_subvol) || + !dht_fd_open_on_dst(this, local->fd, dst_subvol)) { + ret = dht_rebalance_in_progress_check(this, frame); + if (!ret) return 0; + } else { + dht_fsync2(this, dst_subvol, frame, 0); + return 0; } + } out: - DHT_STACK_UNWIND (flush, frame, op_ret, op_errno, xdata); + DHT_STRIP_PHASE1_FLAGS(postbuf); + DHT_STRIP_PHASE1_FLAGS(prebuf); - return 0; + DHT_STACK_UNWIND(fsync, frame, op_ret, op_errno, prebuf, postbuf, xdata); + + return 0; } -int -dht_flush2 (xlator_t *this, call_frame_t *frame, int op_ret) +static int +dht_fsync2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret) { - dht_local_t *local = NULL; - xlator_t *subvol = NULL; - - local = frame->local; + dht_local_t *local = NULL; + int32_t op_errno = EINVAL; + + if ((frame == NULL) || (frame->local == NULL)) + goto out; + + local = frame->local; + op_errno = local->op_errno; + + if (we_are_not_migrating(ret)) { + /* This dht xlator is not migrating the file. Unwind and + * pass on the original mode bits so the higher DHT layer + * can handle this. + */ + DHT_STACK_UNWIND(fsync, frame, local->op_ret, op_errno, + &local->rebalance.prebuf, &local->rebalance.postbuf, + local->rebalance.xdata); + return 0; + } - dht_inode_ctx_get1 (this, local->fd->inode, &subvol); + if (subvol == NULL) + goto out; - if (!subvol) - subvol = local->cached_subvol; + local->call_cnt = 2; /* This is the second attempt */ - local->call_cnt = 2; /* This is the second attempt */ + STACK_WIND_COOKIE(frame, dht_fsync_cbk, subvol, subvol, subvol->fops->fsync, + local->fd, local->rebalance.flags, local->xattr_req); - STACK_WIND (frame, dht_flush_cbk, - subvol, subvol->fops->flush, local->fd, NULL); + return 0; - return 0; +out: + DHT_STACK_UNWIND(fsync, frame, -1, op_errno, NULL, NULL, NULL); + return 0; } - int -dht_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +dht_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int datasync, + dict_t *xdata) { - xlator_t *subvol = NULL; - int op_errno = -1; - dht_local_t *local = NULL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); - - local = dht_local_init (frame, NULL, fd, GF_FOP_FLUSH); - if (!local) { - op_errno = ENOMEM; - goto err; - } + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; - subvol = local->cached_subvol; - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for fd=%p", fd); - op_errno = EINVAL; - goto err; - } + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(fd, err); - local->call_cnt = 1; + local = dht_local_init(frame, NULL, fd, GF_FOP_FSYNC); + if (!local) { + op_errno = ENOMEM; - STACK_WIND (frame, dht_flush_cbk, - subvol, subvol->fops->flush, fd, xdata); + goto err; + } + if (xdata) + local->xattr_req = dict_ref(xdata); - return 0; + local->call_cnt = 1; + local->rebalance.flags = datasync; + + subvol = local->cached_subvol; + + STACK_WIND_COOKIE(frame, dht_fsync_cbk, subvol, subvol, subvol->fops->fsync, + local->fd, local->rebalance.flags, local->xattr_req); + return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (flush, frame, -1, op_errno, NULL); + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND(fsync, frame, -1, op_errno, NULL, NULL, NULL); - return 0; + return 0; } - -int -dht_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, - int op_errno, struct iatt *prebuf, struct iatt *postbuf, - dict_t *xdata) +/* TODO: for 'lk()' call, we need some other special error, may be ESTALE to + indicate that lock migration happened on the fd, so we can consider it as + phase 2 of migration */ +static int +dht_lk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct gf_flock *flock, dict_t *xdata) { - dht_local_t *local = NULL; - call_frame_t *prev = NULL; - int ret = -1; - inode_t *inode = NULL; - xlator_t *subvol = 0; + dht_local_t *local = NULL; + int ret = -1; + xlator_t *subvol = NULL; - local = frame->local; - prev = cookie; + local = frame->local; - local->op_errno = op_errno; - if (op_ret == -1 && !dht_inode_missing(op_errno)) { - gf_log (this->name, GF_LOG_DEBUG, - "subvolume %s returned -1 (%s)", - prev->this->name, strerror (op_errno)); - goto out; - } + if (!local) { + op_ret = -1; + op_errno = EINVAL; + goto out; + } - if (local->call_cnt != 1) { - if (local->stbuf.ia_blocks) { - dht_iatt_merge (this, postbuf, &local->stbuf, NULL); - dht_iatt_merge (this, prebuf, &local->prebuf, NULL); - } - goto out; - } + if (local->call_cnt != 1) + goto out; - local->op_errno = op_errno; - dht_inode_ctx_get1 (this, inode, &subvol); - if (!subvol) { - local->rebalance.target_op_fn = dht_fsync2; - - /* Check if the rebalance phase1 is true */ - if (IS_DHT_MIGRATION_PHASE1 (postbuf)) { - dht_iatt_merge (this, &local->stbuf, postbuf, NULL); - dht_iatt_merge (this, &local->prebuf, prebuf, NULL); - - ret = dht_rebalance_in_progress_check (this, frame); - } - - /* Check if the rebalance phase2 is true */ - if (IS_DHT_MIGRATION_PHASE2 (postbuf)) { - ret = dht_rebalance_complete_check (this, frame); - } - if (!ret) - return 0; + local->rebalance.target_op_fn = dht_lk2; + + local->op_ret = op_ret; + local->op_errno = op_errno; + + if (xdata) + local->rebalance.xdata = dict_ref(xdata); + + if (op_errno == EREMOTE) { + dht_inode_ctx_get_mig_info(this, local->fd->inode, NULL, &subvol); + if (subvol && dht_fd_open_on_dst(this, local->fd, subvol)) { + dht_lk2(this, subvol, frame, 0); + return 0; } else { - dht_fsync2 (this, frame, 0); + ret = dht_rebalance_complete_check(this, frame); + if (!ret) { return 0; + } } + } out: - DHT_STRIP_PHASE1_FLAGS (postbuf); - DHT_STRIP_PHASE1_FLAGS (prebuf); - DHT_STACK_UNWIND (fsync, frame, op_ret, op_errno, - prebuf, postbuf, xdata); + dht_lk_inode_unref(frame, op_ret); + DHT_STACK_UNWIND(lk, frame, op_ret, op_errno, flock, xdata); - return 0; + return 0; } -int -dht_fsync2 (xlator_t *this, call_frame_t *frame, int op_ret) +static int +dht_lk2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret) { - dht_local_t *local = NULL; - xlator_t *subvol = NULL; + dht_local_t *local = NULL; + int32_t op_errno = EINVAL; - local = frame->local; + if ((frame == NULL) || (frame->local == NULL)) + goto out; - dht_inode_ctx_get1 (this, local->fd->inode, &subvol); - if (!subvol) - subvol = local->cached_subvol; + local = frame->local; - local->call_cnt = 2; /* This is the second attempt */ + op_errno = local->op_errno; - STACK_WIND (frame, dht_fsync_cbk, subvol, subvol->fops->fsync, - local->fd, local->rebalance.flags, NULL); + if (subvol == NULL) + goto out; - return 0; + local->call_cnt = 2; /* This is the second attempt */ + + STACK_WIND(frame, dht_lk_cbk, subvol, subvol->fops->lk, local->fd, + local->rebalance.lock_cmd, &local->rebalance.flock, + local->xattr_req); + + return 0; + +out: + DHT_STACK_UNWIND(lk, frame, -1, op_errno, NULL, NULL); + return 0; } int -dht_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int datasync, - dict_t *xdata) +dht_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int cmd, + struct gf_flock *flock, dict_t *xdata) { - xlator_t *subvol = NULL; - int op_errno = -1; - dht_local_t *local = NULL; + xlator_t *lock_subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(fd, err); + + local = dht_local_init(frame, NULL, fd, GF_FOP_LK); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + local->lock_type = flock->l_type; + lock_subvol = dht_get_lock_subvolume(this, flock, local); + if (!lock_subvol) { + gf_msg_debug(this->name, 0, "no lock subvolume for path=%p", fd); + op_errno = EINVAL; + goto err; + } + + /* + local->cached_subvol = lock_subvol; + ret = dht_check_and_open_fd_on_subvol (this, frame); + if (ret) + goto err; + */ + if (xdata) + local->xattr_req = dict_ref(xdata); + + local->rebalance.flock = *flock; + local->rebalance.lock_cmd = cmd; + + local->call_cnt = 1; + + STACK_WIND(frame, dht_lk_cbk, lock_subvol, lock_subvol->fops->lk, fd, cmd, + flock, xdata); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND(lk, frame, -1, op_errno, NULL, NULL); - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); + return 0; +} - local = dht_local_init (frame, NULL, fd, GF_FOP_FSYNC); - if (!local) { - op_errno = ENOMEM; +static int +dht_lease_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct gf_lease *lease, dict_t *xdata) +{ + DHT_STACK_UNWIND(lease, frame, op_ret, op_errno, lease, xdata); - goto err; - } + return 0; +} - local->call_cnt = 1; - local->rebalance.flags = datasync; +int +dht_lease(call_frame_t *frame, xlator_t *this, loc_t *loc, + struct gf_lease *lease, dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; - subvol = local->cached_subvol; + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(loc, err); - STACK_WIND (frame, dht_fsync_cbk, subvol, subvol->fops->fsync, - fd, datasync, xdata); + subvol = dht_subvol_get_cached(this, loc->inode); + if (!subvol) { + gf_msg_debug(this->name, 0, "no cached subvolume for path=%s", + loc->path); + op_errno = EINVAL; + goto err; + } - return 0; + /* TODO: for rebalance, we need to preserve the fop arguments */ + STACK_WIND(frame, dht_lease_cbk, subvol, subvol->fops->lease, loc, lease, + xdata); + + return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (fsync, frame, -1, op_errno, NULL, NULL, NULL); + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND(lease, frame, -1, op_errno, NULL, NULL); - return 0; + return 0; } - -/* TODO: for 'lk()' call, we need some other special error, may be ESTALE to - indicate that lock migration happened on the fd, so we can consider it as - phase 2 of migration */ -int -dht_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, struct gf_flock *flock, dict_t *xdata) +/* Symlinks are currently not migrated, so no need for any check here */ +static int +dht_readlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, const char *path, struct iatt *stbuf, + dict_t *xdata) { - DHT_STACK_UNWIND (lk, frame, op_ret, op_errno, flock, xdata); + dht_local_t *local = NULL; - return 0; -} + local = frame->local; + if (op_ret == -1) + goto err; + + if (!local) { + op_ret = -1; + op_errno = EINVAL; + } +err: + DHT_STRIP_PHASE1_FLAGS(stbuf); + DHT_STACK_UNWIND(readlink, frame, op_ret, op_errno, path, stbuf, xdata); + + return 0; +} int -dht_lk (call_frame_t *frame, xlator_t *this, - fd_t *fd, int cmd, struct gf_flock *flock, dict_t *xdata) +dht_readlink(call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size, + dict_t *xdata) { - xlator_t *subvol = NULL; - int op_errno = -1; + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(loc, err); + VALIDATE_OR_GOTO(loc->inode, err); + VALIDATE_OR_GOTO(loc->path, err); + + local = dht_local_init(frame, loc, NULL, GF_FOP_READLINK); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + subvol = local->cached_subvol; + if (!subvol) { + gf_msg_debug(this->name, 0, "no cached subvolume for path=%s", + loc->path); + op_errno = EINVAL; + goto err; + } + + STACK_WIND(frame, dht_readlink_cbk, subvol, subvol->fops->readlink, loc, + size, xdata); + + return 0; +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND(readlink, frame, -1, op_errno, NULL, NULL, NULL); - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); - - subvol = dht_subvol_get_cached (this, fd->inode); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for fd=%p", fd); - op_errno = EINVAL; - goto err; - } + return 0; +} - /* TODO: for rebalance, we need to preserve the fop arguments */ - STACK_WIND (frame, dht_lk_cbk, subvol, subvol->fops->lk, fd, - cmd, flock, xdata); +/* Get both DHT_IATT_IN_XDATA_KEY and DHT_MODE_IN_XDATA_KEY + * Use DHT_MODE_IN_XDATA_KEY if available, else fall back to + * DHT_IATT_IN_XDATA_KEY + * This will return a dummy iatt with only the mode and type set + */ +static int +dht_read_iatt_from_xdata(dict_t *xdata, struct iatt *stbuf) +{ + int ret = -1; + int32_t mode = 0; - return 0; + ret = dict_get_int32(xdata, DHT_MODE_IN_XDATA_KEY, &mode); -err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (lk, frame, -1, op_errno, NULL, NULL); + if (ret) { + ret = dict_get_bin(xdata, DHT_IATT_IN_XDATA_KEY, (void **)&stbuf); + } else { + stbuf->ia_prot = ia_prot_from_st_mode(mode); + stbuf->ia_type = ia_type_from_st_mode(mode); + } - return 0; + return ret; } -/* Symlinks are currently not migrated, so no need for any check here */ int -dht_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, const char *path, - struct iatt *stbuf, dict_t *xdata) +dht_common_xattrop_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata) { - dht_local_t *local = NULL; + dht_local_t *local = NULL; + call_frame_t *call_frame = NULL; + xlator_t *prev = NULL; + xlator_t *src_subvol = NULL; + xlator_t *dst_subvol = NULL; + struct iatt stbuf = { + 0, + }; + int ret = -1; + inode_t *inode = NULL; + + local = frame->local; + call_frame = cookie; + prev = call_frame->this; + + local->op_errno = op_errno; + + if ((op_ret == -1) && !dht_inode_missing(op_errno)) { + gf_msg_debug(this->name, op_errno, "subvolume %s returned -1.", + prev->name); + goto out; + } + + if (local->call_cnt != 1) + goto out; + + if (dht_check_remote_fd_failed_error(local, op_ret, op_errno)) { + ret = dht_check_and_open_fd_on_subvol(this, frame); + if (ret) + goto out; + return 0; + } + + ret = dht_read_iatt_from_xdata(xdata, &stbuf); - local = frame->local; - if (op_ret == -1) - goto err; + if ((!op_ret) && (ret)) { + /* This is a potential problem and can cause corruption + * with sharding. + * Oh well. We tried. + */ + goto out; + } - if (!local) { - op_ret = -1; - op_errno = EINVAL; + local->op_ret = op_ret; + local->rebalance.target_op_fn = dht_common_xattrop2; + if (xdata) + local->rebalance.xdata = dict_ref(xdata); + + if (dict) + local->rebalance.dict = dict_ref(dict); + + /* Phase 2 of migration */ + if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2(&stbuf)) { + ret = dht_rebalance_complete_check(this, frame); + if (!ret) + return 0; + } + + /* Check if the rebalance phase1 is true */ + if (IS_DHT_MIGRATION_PHASE1(&stbuf)) { + inode = local->loc.inode ? local->loc.inode : local->fd->inode; + dht_inode_ctx_get_mig_info(this, inode, &src_subvol, &dst_subvol); + + if (dht_mig_info_is_invalid(local->cached_subvol, src_subvol, + dst_subvol) || + !dht_fd_open_on_dst(this, local->fd, dst_subvol)) { + ret = dht_rebalance_in_progress_check(this, frame); + if (!ret) + return 0; + } else { + dht_common_xattrop2(this, dst_subvol, frame, 0); + return 0; } + } -err: - DHT_STRIP_PHASE1_FLAGS (stbuf); - DHT_STACK_UNWIND (readlink, frame, op_ret, op_errno, path, stbuf, xdata); +out: + if (local->fop == GF_FOP_XATTROP) { + DHT_STACK_UNWIND(xattrop, frame, op_ret, op_errno, dict, xdata); + } else { + DHT_STACK_UNWIND(fxattrop, frame, op_ret, op_errno, dict, xdata); + } - return 0; + return 0; } - -int -dht_readlink (call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size, - dict_t *xdata) +static int +dht_common_xattrop2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, + int ret) { - xlator_t *subvol = NULL; - int op_errno = -1; - dht_local_t *local = NULL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); - - local = dht_local_init (frame, loc, NULL, GF_FOP_READLINK); - if (!local) { - op_errno = ENOMEM; - goto err; + dht_local_t *local = NULL; + int32_t op_errno = EINVAL; + + if ((frame == NULL) || (frame->local == NULL)) + goto out; + + local = frame->local; + op_errno = local->op_errno; + + if (we_are_not_migrating(ret)) { + /* This dht xlator is not migrating the file. Unwind and + * pass on the original mode bits so the higher DHT layer + * can handle this. + */ + if (local->fop == GF_FOP_XATTROP) { + DHT_STACK_UNWIND(xattrop, frame, local->op_ret, op_errno, + local->rebalance.dict, local->rebalance.xdata); + } else { + DHT_STACK_UNWIND(fxattrop, frame, local->op_ret, op_errno, + local->rebalance.dict, local->rebalance.xdata); } - subvol = local->cached_subvol; - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for path=%s", loc->path); - op_errno = EINVAL; - goto err; - } + return 0; + } - STACK_WIND (frame, dht_readlink_cbk, - subvol, subvol->fops->readlink, - loc, size, xdata); + if (subvol == NULL) + goto out; - return 0; + local->call_cnt = 2; /* This is the second attempt */ -err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (readlink, frame, -1, op_errno, NULL, NULL, NULL); + if (local->fop == GF_FOP_XATTROP) { + STACK_WIND(frame, dht_common_xattrop_cbk, subvol, subvol->fops->xattrop, + &local->loc, local->rebalance.flags, local->rebalance.xattr, + local->xattr_req); + } else { + STACK_WIND(frame, dht_common_xattrop_cbk, subvol, + subvol->fops->fxattrop, local->fd, local->rebalance.flags, + local->rebalance.xattr, local->xattr_req); + } - return 0; -} + return 0; -/* Currently no translators on top of 'distribute' will be using - * below fops, hence not implementing 'migration' related checks - */ +out: -int -dht_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata) + /* If local is unavailable we could be unwinding the wrong + * function here */ + + if (local && (local->fop == GF_FOP_XATTROP)) { + DHT_STACK_UNWIND(xattrop, frame, -1, op_errno, NULL, NULL); + } else { + DHT_STACK_UNWIND(fxattrop, frame, -1, op_errno, NULL, NULL); + } + return 0; +} + +static int +dht_xattrop_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata) { - DHT_STACK_UNWIND (xattrop, frame, op_ret, op_errno, dict, xdata); - return 0; + DHT_STACK_UNWIND(xattrop, frame, op_ret, op_errno, dict, xdata); + return 0; } +/* Set both DHT_IATT_IN_XDATA_KEY and DHT_MODE_IN_XDATA_KEY + * Use DHT_MODE_IN_XDATA_KEY if available. Else fall back to + * DHT_IATT_IN_XDATA_KEY + */ +static int +dht_request_iatt_in_xdata(dict_t *xattr_req) +{ + int ret = -1; + + ret = dict_set_int8(xattr_req, DHT_MODE_IN_XDATA_KEY, 1); + ret = dict_set_int8(xattr_req, DHT_IATT_IN_XDATA_KEY, 1); + + /* At least one call succeeded */ + return ret; +} int -dht_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, - gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata) +dht_xattrop(call_frame_t *frame, xlator_t *this, loc_t *loc, + gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata) { - xlator_t *subvol = NULL; - int op_errno = -1; - dht_local_t *local = NULL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); - - local = dht_local_init (frame, loc, NULL, GF_FOP_XATTROP); - if (!local) { - op_errno = ENOMEM; - goto err; - } + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + int ret = -1; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(loc, err); + VALIDATE_OR_GOTO(loc->inode, err); + + local = dht_local_init(frame, loc, NULL, GF_FOP_XATTROP); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + subvol = local->cached_subvol; + if (!subvol) { + gf_msg_debug(this->name, 0, "no cached subvolume for gfid=%s", + uuid_utoa(loc->inode->gfid)); + op_errno = EINVAL; + goto err; + } + + /* Todo : Handle dirs as well. At the moment the only xlator above dht + * that uses xattrop is sharding and that is only for files */ + + if (IA_ISDIR(loc->inode->ia_type)) { + STACK_WIND(frame, dht_xattrop_cbk, subvol, subvol->fops->xattrop, loc, + flags, dict, xdata); + + } else { + local->xattr_req = xdata ? dict_ref(xdata) : dict_new(); + local->call_cnt = 1; - subvol = local->cached_subvol; - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for path=%s", loc->path); - op_errno = EINVAL; - goto err; - } + local->rebalance.xattr = dict_ref(dict); + local->rebalance.flags = flags; - local->call_cnt = 1; + ret = dht_request_iatt_in_xdata(local->xattr_req); - STACK_WIND (frame, - dht_xattrop_cbk, - subvol, subvol->fops->xattrop, - loc, flags, dict, xdata); + if (ret) { + gf_msg_debug(this->name, 0, + "Failed to set dictionary key %s file=%s", + DHT_IATT_IN_XDATA_KEY, loc->path); + } - return 0; + STACK_WIND(frame, dht_common_xattrop_cbk, subvol, subvol->fops->xattrop, + loc, local->rebalance.flags, local->rebalance.xattr, + local->xattr_req); + } + + return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (xattrop, frame, -1, op_errno, NULL, NULL); + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND(xattrop, frame, -1, op_errno, NULL, NULL); - return 0; + return 0; } - -int -dht_fxattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata) +static int +dht_fxattrop_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata) { - DHT_STACK_UNWIND (fxattrop, frame, op_ret, op_errno, dict, xdata); - return 0; + DHT_STACK_UNWIND(fxattrop, frame, op_ret, op_errno, dict, xdata); + return 0; } - int -dht_fxattrop (call_frame_t *frame, xlator_t *this, - fd_t *fd, gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata) +dht_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd, + gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata) { - xlator_t *subvol = NULL; - int op_errno = -1; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); - - subvol = dht_subvol_get_cached (this, fd->inode); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for fd=%p", fd); - op_errno = EINVAL; - goto err; + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + int ret = -1; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(fd, err); + + subvol = dht_subvol_get_cached(this, fd->inode); + if (!subvol) { + gf_msg_debug(this->name, 0, "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + local = dht_local_init(frame, NULL, fd, GF_FOP_FXATTROP); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + /* Todo : Handle dirs as well. At the moment the only xlator above dht + * that uses xattrop is sharding and that is only for files */ + + if (IA_ISDIR(fd->inode->ia_type)) { + STACK_WIND(frame, dht_fxattrop_cbk, subvol, subvol->fops->fxattrop, fd, + flags, dict, xdata); + + } else { + local->xattr_req = xdata ? dict_ref(xdata) : dict_new(); + local->call_cnt = 1; + + local->rebalance.xattr = dict_ref(dict); + local->rebalance.flags = flags; + + ret = dht_request_iatt_in_xdata(local->xattr_req); + + if (ret) { + gf_msg_debug(this->name, 0, "Failed to set dictionary key %s fd=%p", + DHT_IATT_IN_XDATA_KEY, fd); } - STACK_WIND (frame, - dht_fxattrop_cbk, - subvol, subvol->fops->fxattrop, - fd, flags, dict, xdata); + STACK_WIND(frame, dht_common_xattrop_cbk, subvol, + subvol->fops->fxattrop, fd, local->rebalance.flags, + local->rebalance.xattr, local->xattr_req); + } - return 0; + return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (fxattrop, frame, -1, op_errno, NULL, NULL); + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND(fxattrop, frame, -1, op_errno, NULL, NULL); - return 0; + return 0; } +/* Currently no translators on top of 'distribute' will be using + * below fops, hence not implementing 'migration' related checks + */ -int -dht_inodelk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) +static int +dht_inodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - DHT_STACK_UNWIND (inodelk, frame, op_ret, op_errno, xdata); - return 0; + dht_lk_inode_unref(frame, op_ret); + DHT_STACK_UNWIND(inodelk, frame, op_ret, op_errno, xdata); + return 0; } - int32_t -dht_inodelk (call_frame_t *frame, xlator_t *this, const char *volume, - loc_t *loc, int32_t cmd, struct gf_flock *lock, dict_t *xdata) +dht_inodelk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc, + int32_t cmd, struct gf_flock *lock, dict_t *xdata) { - xlator_t *subvol = NULL; - int op_errno = -1; - dht_local_t *local = NULL; - + xlator_t *lock_subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(loc, err); + VALIDATE_OR_GOTO(loc->inode, err); - local = dht_local_init (frame, loc, NULL, GF_FOP_INODELK); - if (!local) { - op_errno = ENOMEM; - goto err; - } + local = dht_local_init(frame, loc, NULL, GF_FOP_INODELK); + if (!local) { + op_errno = ENOMEM; + goto err; + } - subvol = local->cached_subvol; - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for path=%s", loc->path); - op_errno = EINVAL; - goto err; - } + local->lock_type = lock->l_type; + lock_subvol = dht_get_lock_subvolume(this, lock, local); + if (!lock_subvol) { + gf_msg_debug(this->name, 0, "no lock subvolume for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } - local->call_cnt = 1; + local->call_cnt = 1; - STACK_WIND (frame, - dht_inodelk_cbk, - subvol, subvol->fops->inodelk, - volume, loc, cmd, lock, xdata); + STACK_WIND(frame, dht_inodelk_cbk, lock_subvol, lock_subvol->fops->inodelk, + volume, loc, cmd, lock, xdata); - return 0; + return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (inodelk, frame, -1, op_errno, NULL); + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND(inodelk, frame, -1, op_errno, NULL); - return 0; + return 0; } - int -dht_finodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) +dht_finodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - DHT_STACK_UNWIND (finodelk, frame, op_ret, op_errno, xdata); - return 0; -} + dht_local_t *local = NULL; + int ret = 0; + GF_VALIDATE_OR_GOTO("dht", frame, out); + GF_VALIDATE_OR_GOTO("dht", this, out); + GF_VALIDATE_OR_GOTO("dht", frame->local, out); -int -dht_finodelk (call_frame_t *frame, xlator_t *this, const char *volume, - fd_t *fd, int32_t cmd, struct gf_flock *lock, dict_t *xdata) -{ - xlator_t *subvol = NULL; - int op_errno = -1; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); - - subvol = dht_subvol_get_cached (this, fd->inode); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for fd=%p", fd); - op_errno = EINVAL; - goto err; - } + local = frame->local; + if (dht_check_remote_fd_failed_error(local, op_ret, op_errno)) { + ret = dht_check_and_open_fd_on_subvol(this, frame); + if (ret) + goto out; + return 0; + } - STACK_WIND (frame, dht_finodelk_cbk, subvol, subvol->fops->finodelk, - volume, fd, cmd, lock, xdata); +out: + dht_lk_inode_unref(frame, op_ret); + DHT_STACK_UNWIND(finodelk, frame, op_ret, op_errno, xdata); - return 0; + return 0; +} + +int +dht_finodelk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, + int32_t cmd, struct gf_flock *lock, dict_t *xdata) +{ + xlator_t *lock_subvol = NULL; + dht_local_t *local = NULL; + int op_errno = -1; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(fd, err); + + local = dht_local_init(frame, NULL, fd, GF_FOP_INODELK); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + local->call_cnt = 1; + local->lock_type = lock->l_type; + + lock_subvol = dht_get_lock_subvolume(this, lock, local); + if (!lock_subvol) { + gf_msg_debug(this->name, 0, "no lock subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + /* + local->cached_subvol = lock_subvol; + ret = dht_check_and_open_fd_on_subvol (this, frame); + if (ret) + goto err; + */ + local->rebalance.flock = *lock; + local->rebalance.lock_cmd = cmd; + local->key = gf_strdup(volume); + + if (xdata) + local->xattr_req = dict_ref(xdata); + + STACK_WIND(frame, dht_finodelk_cbk, lock_subvol, + lock_subvol->fops->finodelk, volume, fd, cmd, lock, xdata); + + return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (finodelk, frame, -1, op_errno, NULL); + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND(finodelk, frame, -1, op_errno, NULL); - return 0; + return 0; } diff --git a/xlators/cluster/dht/src/dht-inode-write.c b/xlators/cluster/dht/src/dht-inode-write.c index 576f007e513..2f23ce90fbd 100644 --- a/xlators/cluster/dht/src/dht-inode-write.c +++ b/xlators/cluster/dht/src/dht-inode-write.c @@ -8,1010 +8,1397 @@ cases as published by the Free Software Foundation. */ - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - #include "dht-common.h" -int dht_writev2 (xlator_t *this, call_frame_t *frame, int ret); -int dht_truncate2 (xlator_t *this, call_frame_t *frame, int ret); -int dht_setattr2 (xlator_t *this, call_frame_t *frame, int ret); -int dht_fallocate2(xlator_t *this, call_frame_t *frame, int op_ret); -int dht_discard2(xlator_t *this, call_frame_t *frame, int op_ret); -int dht_zerofill2(xlator_t *this, call_frame_t *frame, int op_ret); +static int +dht_writev2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret); +static int +dht_truncate2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret); +static int +dht_setattr2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret); +static int +dht_fallocate2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret); +static int +dht_discard2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret); +static int +dht_zerofill2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret); int -dht_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, struct iatt *prebuf, - struct iatt *postbuf, dict_t *xdata) +dht_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct iatt *prebuf, struct iatt *postbuf, + dict_t *xdata) { - dht_local_t *local = NULL; - int ret = -1; - xlator_t *subvol = NULL; + dht_local_t *local = NULL; + xlator_t *prev = NULL; + int ret = -1; + xlator_t *subvol1 = NULL; + xlator_t *subvol2 = NULL; + + local = frame->local; + prev = cookie; + + if (!local) { + op_ret = -1; + op_errno = EINVAL; + goto out; + } + + /* writev fails with EBADF if dht has not yet opened the fd + * on the cached subvol. This could happen if the file was migrated + * and a lookup updated the cached subvol in the inode ctx. + * We only check once as this could be a valid bad fd error. + */ + + if (dht_check_remote_fd_failed_error(local, op_ret, op_errno)) { + ret = dht_check_and_open_fd_on_subvol(this, frame); + if (ret) + goto out; + return 0; + } - if (op_ret == -1 && !dht_inode_missing(op_errno)) { - goto out; + if (op_ret == -1 && !dht_inode_missing(op_errno)) { + local->op_errno = op_errno; + local->op_ret = -1; + gf_msg_debug(this->name, 0, "subvolume %s returned -1 (%s)", prev->name, + strerror(op_errno)); + goto out; + } + + if (local->call_cnt != 1) { + /* preserve the modes of source */ + if (local->stbuf.ia_blocks) { + dht_iatt_merge(this, postbuf, &local->stbuf); + dht_iatt_merge(this, prebuf, &local->prebuf); } + goto out; + } - local = frame->local; - if (!local) { - op_ret = -1; - op_errno = EINVAL; - goto out; - } + local->rebalance.target_op_fn = dht_writev2; - if (local->call_cnt != 1) { - /* preserve the modes of source */ - if (local->stbuf.ia_blocks) { - dht_iatt_merge (this, postbuf, &local->stbuf, NULL); - dht_iatt_merge (this, prebuf, &local->prebuf, NULL); - } + local->op_ret = op_ret; + local->op_errno = op_errno; + + /* We might need to pass the stbuf information to the higher DHT + * layer for appropriate handling. + */ + + dht_set_local_rebalance(this, local, NULL, prebuf, postbuf, xdata); + + /* Phase 2 of migration */ + if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2(postbuf)) { + ret = dht_rebalance_complete_check(this, frame); + if (!ret) + return 0; + } + + /* Check if the rebalance phase1 is true */ + if (IS_DHT_MIGRATION_PHASE1(postbuf)) { + if (!local->xattr_req) { + local->xattr_req = dict_new(); + if (!local->xattr_req) { + gf_msg(this->name, GF_LOG_ERROR, DHT_MSG_NO_MEMORY, ENOMEM, + "insufficient memory"); + local->op_errno = ENOMEM; + local->op_ret = -1; goto out; + } } - local->rebalance.target_op_fn = dht_writev2; - - local->op_errno = op_errno; - /* Phase 2 of migration */ - if (IS_DHT_MIGRATION_PHASE2 (postbuf)) { - ret = dht_rebalance_complete_check (this, frame); - if (!ret) - return 0; + ret = dict_set_uint32(local->xattr_req, GF_PROTECT_FROM_EXTERNAL_WRITES, + 1); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, DHT_MSG_DICT_SET_FAILED, 0, + "Failed to set key %s in dictionary", + GF_PROTECT_FROM_EXTERNAL_WRITES); + local->op_errno = ENOMEM; + local->op_ret = -1; + goto out; } - /* Check if the rebalance phase1 is true */ - if (IS_DHT_MIGRATION_PHASE1 (postbuf)) { - dht_iatt_merge (this, &local->stbuf, postbuf, NULL); - dht_iatt_merge (this, &local->prebuf, prebuf, NULL); + dht_iatt_merge(this, &local->stbuf, postbuf); + dht_iatt_merge(this, &local->prebuf, prebuf); - ret = dht_inode_ctx_get1 (this, local->fd->inode, &subvol); - if (subvol) { - dht_writev2 (this, frame, 0); - return 0; - } - ret = dht_rebalance_in_progress_check (this, frame); - if (!ret) - return 0; + ret = dht_inode_ctx_get_mig_info(this, local->fd->inode, &subvol1, + &subvol2); + if (!dht_mig_info_is_invalid(local->cached_subvol, subvol1, subvol2)) { + if (dht_fd_open_on_dst(this, local->fd, subvol2)) { + dht_writev2(this, subvol2, frame, 0); + return 0; + } } + ret = dht_rebalance_in_progress_check(this, frame); + if (!ret) + return 0; + } out: - DHT_STRIP_PHASE1_FLAGS (postbuf); - DHT_STRIP_PHASE1_FLAGS (prebuf); + DHT_STRIP_PHASE1_FLAGS(postbuf); + DHT_STRIP_PHASE1_FLAGS(prebuf); - DHT_STACK_UNWIND (writev, frame, op_ret, op_errno, prebuf, postbuf, - xdata); + DHT_STACK_UNWIND(writev, frame, op_ret, op_errno, prebuf, postbuf, xdata); - return 0; + return 0; } -int -dht_writev2 (xlator_t *this, call_frame_t *frame, int op_ret) +static int +dht_writev2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret) { - dht_local_t *local = NULL; - xlator_t *subvol = NULL; + dht_local_t *local = NULL; + int32_t op_errno = EINVAL; + + if ((frame == NULL) || (frame->local == NULL)) + goto out; + + local = frame->local; + op_errno = local->op_errno; + + if (we_are_not_migrating(ret)) { + /* This dht xlator is not migrating the file. Unwind and + * pass on the original mode bits so the higher DHT layer + * can handle this. + */ + DHT_STACK_UNWIND(writev, frame, local->op_ret, local->op_errno, + &local->rebalance.prebuf, &local->rebalance.postbuf, + local->rebalance.xdata); + return 0; + } - local = frame->local; + if (subvol == NULL) + goto out; - dht_inode_ctx_get1 (this, local->fd->inode, &subvol); + local->call_cnt = 2; /* This is the second attempt */ - if (!subvol) - subvol = local->cached_subvol; + STACK_WIND_COOKIE(frame, dht_writev_cbk, subvol, subvol, + subvol->fops->writev, local->fd, local->rebalance.vector, + local->rebalance.count, local->rebalance.offset, + local->rebalance.flags, local->rebalance.iobref, + local->xattr_req); - local->call_cnt = 2; /* This is the second attempt */ + return 0; - STACK_WIND (frame, dht_writev_cbk, - subvol, subvol->fops->writev, - local->fd, local->rebalance.vector, local->rebalance.count, - local->rebalance.offset, local->rebalance.flags, - local->rebalance.iobref, NULL); +out: + DHT_STACK_UNWIND(writev, frame, -1, op_errno, NULL, NULL, NULL); - return 0; + return 0; } int -dht_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, - struct iovec *vector, int count, off_t off, uint32_t flags, - struct iobref *iobref, dict_t *xdata) +dht_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, + int count, off_t off, uint32_t flags, struct iobref *iobref, + dict_t *xdata) { - xlator_t *subvol = NULL; - int op_errno = -1; - dht_local_t *local = NULL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); - - local = dht_local_init (frame, NULL, fd, GF_FOP_WRITE); - if (!local) { - - op_errno = ENOMEM; - goto err; - } - - subvol = local->cached_subvol; - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for fd=%p", fd); - op_errno = EINVAL; - goto err; - } - + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(fd, err); + + local = dht_local_init(frame, NULL, fd, GF_FOP_WRITE); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + subvol = local->cached_subvol; + if (!subvol) { + gf_msg_debug(this->name, 0, "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + if (xdata) + local->xattr_req = dict_ref(xdata); + + local->rebalance.vector = iov_dup(vector, count); + local->rebalance.offset = off; + local->rebalance.count = count; + local->rebalance.flags = flags; + local->rebalance.iobref = iobref_ref(iobref); + local->call_cnt = 1; + + STACK_WIND_COOKIE(frame, dht_writev_cbk, subvol, subvol, + subvol->fops->writev, fd, local->rebalance.vector, + local->rebalance.count, local->rebalance.offset, + local->rebalance.flags, local->rebalance.iobref, + local->xattr_req); + + return 0; - local->rebalance.vector = iov_dup (vector, count); - local->rebalance.offset = off; - local->rebalance.count = count; - local->rebalance.flags = flags; - local->rebalance.iobref = iobref_ref (iobref); - local->call_cnt = 1; +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND(writev, frame, -1, op_errno, NULL, NULL, NULL); - STACK_WIND (frame, dht_writev_cbk, - subvol, subvol->fops->writev, - fd, vector, count, off, flags, iobref, xdata); + return 0; +} +int +dht_truncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct iatt *prebuf, struct iatt *postbuf, + dict_t *xdata) +{ + dht_local_t *local = NULL; + xlator_t *prev = NULL; + int ret = -1; + xlator_t *src_subvol = NULL; + xlator_t *dst_subvol = NULL; + inode_t *inode = NULL; + + GF_VALIDATE_OR_GOTO("dht", frame, err); + GF_VALIDATE_OR_GOTO("dht", this, out); + GF_VALIDATE_OR_GOTO("dht", frame->local, out); + GF_VALIDATE_OR_GOTO("dht", cookie, out); + + local = frame->local; + prev = cookie; + + /* Needs to be checked only for ftruncate. + * ftruncate fails with EBADF/EINVAL if dht has not yet opened the fd + * on the cached subvol. This could happen if the file was migrated + * and a lookup updated the cached subvol in the inode ctx. + * We only check once as this could actually be a valid error. + */ + + if ((local->fop == GF_FOP_FTRUNCATE) && + dht_check_remote_fd_failed_error(local, op_ret, op_errno)) { + ret = dht_check_and_open_fd_on_subvol(this, frame); + if (ret) + goto out; return 0; + } -err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (writev, frame, -1, op_errno, NULL, NULL, NULL); + if ((op_ret == -1) && !dht_inode_missing(op_errno)) { + local->op_errno = op_errno; + local->op_ret = -1; + gf_msg_debug(this->name, op_errno, "subvolume %s returned -1", + prev->name); - return 0; -} + goto out; + } + if (local->call_cnt != 1) { + if (local->stbuf.ia_blocks) { + dht_iatt_merge(this, postbuf, &local->stbuf); + dht_iatt_merge(this, prebuf, &local->prebuf); + } + goto out; + } + local->rebalance.target_op_fn = dht_truncate2; -int -dht_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, struct iatt *prebuf, - struct iatt *postbuf, dict_t *xdata) -{ - dht_local_t *local = NULL; - call_frame_t *prev = NULL; - int ret = -1; - xlator_t *subvol = NULL; - inode_t *inode = NULL; - - GF_VALIDATE_OR_GOTO ("dht", frame, err); - GF_VALIDATE_OR_GOTO ("dht", this, out); - GF_VALIDATE_OR_GOTO ("dht", frame->local, out); - GF_VALIDATE_OR_GOTO ("dht", cookie, out); - - local = frame->local; - prev = cookie; - - if ((op_ret == -1) && !dht_inode_missing(op_errno)) { - local->op_errno = op_errno; - local->op_ret = -1; - gf_log (this->name, GF_LOG_DEBUG, - "subvolume %s returned -1 (%s)", - prev->this->name, strerror (op_errno)); + local->op_ret = op_ret; + local->op_errno = op_errno; - goto out; - } + /* We might need to pass the stbuf information to the higher DHT + * layer for appropriate handling. + */ - if (local->call_cnt != 1) { - if (local->stbuf.ia_blocks) { - dht_iatt_merge (this, postbuf, &local->stbuf, NULL); - dht_iatt_merge (this, prebuf, &local->prebuf, NULL); - } - goto out; - } + dht_set_local_rebalance(this, local, NULL, prebuf, postbuf, xdata); - local->rebalance.target_op_fn = dht_truncate2; + /* Phase 2 of migration */ + if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2(postbuf)) { + ret = dht_rebalance_complete_check(this, frame); + if (!ret) + return 0; + } - local->op_errno = op_errno; - /* Phase 2 of migration */ - if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (postbuf)) { - ret = dht_rebalance_complete_check (this, frame); - if (!ret) - return 0; - } + /* Check if the rebalance phase1 is true */ + if (IS_DHT_MIGRATION_PHASE1(postbuf)) { + dht_iatt_merge(this, &local->stbuf, postbuf); + dht_iatt_merge(this, &local->prebuf, prebuf); - /* Check if the rebalance phase1 is true */ - if (IS_DHT_MIGRATION_PHASE1 (postbuf)) { - dht_iatt_merge (this, &local->stbuf, postbuf, NULL); - dht_iatt_merge (this, &local->prebuf, prebuf, NULL); - inode = (local->fd) ? local->fd->inode : local->loc.inode; - dht_inode_ctx_get1 (this, inode, &subvol); - if (subvol) { - dht_truncate2 (this, frame, 0); - return 0; - } - ret = dht_rebalance_in_progress_check (this, frame); - if (!ret) - return 0; + inode = (local->fd) ? local->fd->inode : local->loc.inode; + + dht_inode_ctx_get_mig_info(this, inode, &src_subvol, &dst_subvol); + if (!dht_mig_info_is_invalid(local->cached_subvol, src_subvol, + dst_subvol)) { + if ((!local->fd) || + ((local->fd) && + dht_fd_open_on_dst(this, local->fd, dst_subvol))) { + dht_truncate2(this, dst_subvol, frame, 0); + return 0; + } } + ret = dht_rebalance_in_progress_check(this, frame); + if (!ret) + return 0; + } out: - DHT_STRIP_PHASE1_FLAGS (postbuf); - DHT_STRIP_PHASE1_FLAGS (prebuf); - DHT_STACK_UNWIND (truncate, frame, op_ret, op_errno, - prebuf, postbuf, xdata); + DHT_STRIP_PHASE1_FLAGS(postbuf); + DHT_STRIP_PHASE1_FLAGS(prebuf); + + DHT_STACK_UNWIND(truncate, frame, op_ret, op_errno, prebuf, postbuf, xdata); err: - return 0; + return 0; } - -int -dht_truncate2 (xlator_t *this, call_frame_t *frame, int op_ret) +static int +dht_truncate2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret) { - dht_local_t *local = NULL; - xlator_t *subvol = NULL; - inode_t *inode = NULL; + dht_local_t *local = NULL; + int32_t op_errno = EINVAL; - local = frame->local; + if (!frame || !frame->local) + goto out; - inode = local->fd ? local->fd->inode : local->loc.inode; + local = frame->local; + op_errno = local->op_errno; - dht_inode_ctx_get1 (this, inode, &subvol); - if (!subvol) - subvol = local->cached_subvol; + /* This dht xlator is not migrating the file */ + if (we_are_not_migrating(ret)) { + DHT_STACK_UNWIND(truncate, frame, local->op_ret, local->op_errno, + &local->rebalance.prebuf, &local->rebalance.postbuf, + local->rebalance.xdata); + return 0; + } - local->call_cnt = 2; /* This is the second attempt */ + if (subvol == NULL) + goto out; - if (local->fop == GF_FOP_TRUNCATE) { - STACK_WIND (frame, dht_truncate_cbk, subvol, - subvol->fops->truncate, &local->loc, - local->rebalance.offset, NULL); - } else { - STACK_WIND (frame, dht_truncate_cbk, subvol, - subvol->fops->ftruncate, local->fd, - local->rebalance.offset, NULL); - } + local->call_cnt = 2; /* This is the second attempt */ - return 0; + if (local->fop == GF_FOP_TRUNCATE) { + STACK_WIND_COOKIE(frame, dht_truncate_cbk, subvol, subvol, + subvol->fops->truncate, &local->loc, + local->rebalance.offset, local->xattr_req); + } else { + STACK_WIND_COOKIE(frame, dht_truncate_cbk, subvol, subvol, + subvol->fops->ftruncate, local->fd, + local->rebalance.offset, local->xattr_req); + } + + return 0; + +out: + DHT_STACK_UNWIND(truncate, frame, -1, op_errno, NULL, NULL, NULL); + return 0; } int -dht_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, - dict_t *xdata) +dht_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + dict_t *xdata) { - xlator_t *subvol = NULL; - int op_errno = -1; - dht_local_t *local = NULL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); - - local = dht_local_init (frame, loc, NULL, GF_FOP_TRUNCATE); - if (!local) { - op_errno = ENOMEM; - goto err; - } - - local->rebalance.offset = offset; - local->call_cnt = 1; - subvol = local->cached_subvol; - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for path=%s", loc->path); - op_errno = EINVAL; - goto err; - } - - STACK_WIND (frame, dht_truncate_cbk, - subvol, subvol->fops->truncate, - loc, offset, xdata); - - return 0; + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(loc, err); + VALIDATE_OR_GOTO(loc->inode, err); + + local = dht_local_init(frame, loc, NULL, GF_FOP_TRUNCATE); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + local->rebalance.offset = offset; + local->call_cnt = 1; + subvol = local->cached_subvol; + if (!subvol) { + gf_msg_debug(this->name, 0, "no cached subvolume for gfid=%s", + uuid_utoa(loc->inode->gfid)); + op_errno = EINVAL; + goto err; + } + + if (xdata) + local->xattr_req = dict_ref(xdata); + + STACK_WIND_COOKIE(frame, dht_truncate_cbk, subvol, subvol, + subvol->fops->truncate, loc, offset, xdata); + + return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (truncate, frame, -1, op_errno, NULL, NULL, NULL); + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND(truncate, frame, -1, op_errno, NULL, NULL, NULL); - return 0; + return 0; } int -dht_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, - dict_t *xdata) +dht_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata) { - xlator_t *subvol = NULL; - int op_errno = -1; - dht_local_t *local = NULL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); - - local = dht_local_init (frame, NULL, fd, GF_FOP_FTRUNCATE); - if (!local) { - op_errno = ENOMEM; - goto err; - } - - local->rebalance.offset = offset; - local->call_cnt = 1; - subvol = local->cached_subvol; - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for fd=%p", fd); - op_errno = EINVAL; - goto err; - } - - STACK_WIND (frame, dht_truncate_cbk, - subvol, subvol->fops->ftruncate, - fd, offset, xdata); - - return 0; + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(fd, err); + + local = dht_local_init(frame, NULL, fd, GF_FOP_FTRUNCATE); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + local->rebalance.offset = offset; + local->call_cnt = 1; + subvol = local->cached_subvol; + if (!subvol) { + gf_msg_debug(this->name, 0, "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + if (xdata) + local->xattr_req = dict_ref(xdata); + + STACK_WIND_COOKIE(frame, dht_truncate_cbk, subvol, subvol, + subvol->fops->ftruncate, fd, local->rebalance.offset, + local->xattr_req); + return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (ftruncate, frame, -1, op_errno, NULL, NULL, NULL); + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND(ftruncate, frame, -1, op_errno, NULL, NULL, NULL); - return 0; + return 0; } - int -dht_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, struct iatt *prebuf, - struct iatt *postbuf, dict_t *xdata) +dht_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct iatt *prebuf, struct iatt *postbuf, + dict_t *xdata) { - dht_local_t *local = NULL; - call_frame_t *prev = NULL; - int ret = -1; - xlator_t *subvol = NULL; - - GF_VALIDATE_OR_GOTO ("dht", frame, err); - GF_VALIDATE_OR_GOTO ("dht", this, out); - GF_VALIDATE_OR_GOTO ("dht", frame->local, out); - GF_VALIDATE_OR_GOTO ("dht", cookie, out); + dht_local_t *local = NULL; + xlator_t *prev = NULL; + int ret = -1; + xlator_t *src_subvol = NULL; + xlator_t *dst_subvol = NULL; + + GF_VALIDATE_OR_GOTO("dht", frame, err); + GF_VALIDATE_OR_GOTO("dht", this, out); + GF_VALIDATE_OR_GOTO("dht", frame->local, out); + GF_VALIDATE_OR_GOTO("dht", cookie, out); + + local = frame->local; + prev = cookie; + + /* fallocate fails with EBADF if dht has not yet opened the fd + * on the cached subvol. This could happen if the file was migrated + * and a lookup updated the cached subvol in the inode ctx. + * We only check once as this could actually be a valid error. + */ + + if (dht_check_remote_fd_failed_error(local, op_ret, op_errno)) { + ret = dht_check_and_open_fd_on_subvol(this, frame); + if (ret) + goto out; + return 0; + } - local = frame->local; - prev = cookie; + if ((op_ret == -1) && !dht_inode_missing(op_errno)) { + local->op_errno = op_errno; + local->op_ret = -1; + gf_msg_debug(this->name, op_errno, "subvolume %s returned -1", + prev->name); - if ((op_ret == -1) && !dht_inode_missing(op_errno)) { - local->op_errno = op_errno; - local->op_ret = -1; - gf_log (this->name, GF_LOG_DEBUG, - "subvolume %s returned -1 (%s)", - prev->this->name, strerror (op_errno)); + goto out; + } - goto out; + if (local->call_cnt != 1) { + if (local->stbuf.ia_blocks) { + dht_iatt_merge(this, postbuf, &local->stbuf); + dht_iatt_merge(this, prebuf, &local->prebuf); } + goto out; + } - if (local->call_cnt != 1) { - if (local->stbuf.ia_blocks) { - dht_iatt_merge (this, postbuf, &local->stbuf, NULL); - dht_iatt_merge (this, prebuf, &local->prebuf, NULL); - } - goto out; - } - local->rebalance.target_op_fn = dht_fallocate2; + local->op_ret = op_ret; + local->op_errno = op_errno; + local->rebalance.target_op_fn = dht_fallocate2; - /* Phase 2 of migration */ - if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (postbuf)) { - ret = dht_rebalance_complete_check (this, frame); - if (!ret) - return 0; - } + dht_set_local_rebalance(this, local, NULL, prebuf, postbuf, xdata); - /* Check if the rebalance phase1 is true */ - if (IS_DHT_MIGRATION_PHASE1 (postbuf)) { - dht_iatt_merge (this, &local->stbuf, postbuf, NULL); - dht_iatt_merge (this, &local->prebuf, prebuf, NULL); - dht_inode_ctx_get1 (this, local->fd->inode, &subvol); - if (subvol) { - dht_fallocate2 (this, frame, 0); - return 0; - } - ret = dht_rebalance_in_progress_check (this, frame); - if (!ret) - return 0; + /* Phase 2 of migration */ + if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2(postbuf)) { + ret = dht_rebalance_complete_check(this, frame); + if (!ret) + return 0; + } + + /* Check if the rebalance phase1 is true */ + if (IS_DHT_MIGRATION_PHASE1(postbuf)) { + dht_iatt_merge(this, &local->stbuf, postbuf); + dht_iatt_merge(this, &local->prebuf, prebuf); + + dht_inode_ctx_get_mig_info(this, local->fd->inode, &src_subvol, + &dst_subvol); + if (!dht_mig_info_is_invalid(local->cached_subvol, src_subvol, + dst_subvol)) { + if (dht_fd_open_on_dst(this, local->fd, dst_subvol)) { + dht_fallocate2(this, dst_subvol, frame, 0); + return 0; + } } + ret = dht_rebalance_in_progress_check(this, frame); + if (!ret) + return 0; + } out: - DHT_STRIP_PHASE1_FLAGS (postbuf); - DHT_STRIP_PHASE1_FLAGS (prebuf); - DHT_STACK_UNWIND (fallocate, frame, op_ret, op_errno, - prebuf, postbuf, xdata); + DHT_STRIP_PHASE1_FLAGS(postbuf); + DHT_STRIP_PHASE1_FLAGS(prebuf); + + DHT_STACK_UNWIND(fallocate, frame, op_ret, op_errno, prebuf, postbuf, + xdata); err: - return 0; + return 0; } -int -dht_fallocate2(xlator_t *this, call_frame_t *frame, int op_ret) +static int +dht_fallocate2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret) { - dht_local_t *local = NULL; - xlator_t *subvol = NULL; - - local = frame->local; + dht_local_t *local = NULL; + int32_t op_errno = EINVAL; + + if (!frame || !frame->local) + goto out; + + local = frame->local; + op_errno = local->op_errno; + + if (we_are_not_migrating(ret)) { + /* This dht xlator is not migrating the file. Unwind and + * pass on the original mode bits so the higher DHT layer + * can handle this. + */ + DHT_STACK_UNWIND(fallocate, frame, local->op_ret, local->op_errno, + &local->rebalance.prebuf, &local->rebalance.postbuf, + local->rebalance.xdata); + return 0; + } - dht_inode_ctx_get1 (this, local->fd->inode, &subvol); + if (subvol == NULL) + goto out; - if (!subvol) - subvol = local->cached_subvol; + local->call_cnt = 2; /* This is the second attempt */ - local->call_cnt = 2; /* This is the second attempt */ + STACK_WIND_COOKIE(frame, dht_fallocate_cbk, subvol, subvol, + subvol->fops->fallocate, local->fd, + local->rebalance.flags, local->rebalance.offset, + local->rebalance.size, local->xattr_req); - STACK_WIND(frame, dht_fallocate_cbk, subvol, subvol->fops->fallocate, - local->fd, local->rebalance.flags, local->rebalance.offset, - local->rebalance.size, NULL); + return 0; - return 0; +out: + DHT_STACK_UNWIND(fallocate, frame, -1, op_errno, NULL, NULL, NULL); + return 0; } int dht_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, - off_t offset, size_t len, dict_t *xdata) + off_t offset, size_t len, dict_t *xdata) { - xlator_t *subvol = NULL; - int op_errno = -1; - dht_local_t *local = NULL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); - - local = dht_local_init (frame, NULL, fd, GF_FOP_FALLOCATE); - if (!local) { - op_errno = ENOMEM; - goto err; - } - - local->rebalance.flags = mode; - local->rebalance.offset = offset; - local->rebalance.size = len; - - local->call_cnt = 1; - subvol = local->cached_subvol; - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for fd=%p", fd); - op_errno = EINVAL; - goto err; - } - - STACK_WIND (frame, dht_fallocate_cbk, - subvol, subvol->fops->fallocate, - fd, mode, offset, len, xdata); - - return 0; + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(fd, err); + + local = dht_local_init(frame, NULL, fd, GF_FOP_FALLOCATE); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + local->rebalance.flags = mode; + local->rebalance.offset = offset; + local->rebalance.size = len; + + local->call_cnt = 1; + subvol = local->cached_subvol; + if (!subvol) { + gf_msg_debug(this->name, 0, "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + if (xdata) + local->xattr_req = dict_ref(xdata); + + STACK_WIND_COOKIE(frame, dht_fallocate_cbk, subvol, subvol, + subvol->fops->fallocate, fd, local->rebalance.flags, + local->rebalance.offset, local->rebalance.size, + local->xattr_req); + + return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (fallocate, frame, -1, op_errno, NULL, NULL, NULL); + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND(fallocate, frame, -1, op_errno, NULL, NULL, NULL); - return 0; + return 0; } - int -dht_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, struct iatt *prebuf, - struct iatt *postbuf, dict_t *xdata) +dht_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct iatt *prebuf, struct iatt *postbuf, + dict_t *xdata) { - dht_local_t *local = NULL; - call_frame_t *prev = NULL; - int ret = -1; - xlator_t *subvol = NULL; - - GF_VALIDATE_OR_GOTO ("dht", frame, err); - GF_VALIDATE_OR_GOTO ("dht", this, out); - GF_VALIDATE_OR_GOTO ("dht", frame->local, out); - GF_VALIDATE_OR_GOTO ("dht", cookie, out); + dht_local_t *local = NULL; + xlator_t *prev = NULL; + int ret = -1; + xlator_t *src_subvol = NULL; + xlator_t *dst_subvol = NULL; + + GF_VALIDATE_OR_GOTO("dht", frame, err); + GF_VALIDATE_OR_GOTO("dht", this, out); + GF_VALIDATE_OR_GOTO("dht", frame->local, out); + GF_VALIDATE_OR_GOTO("dht", cookie, out); + + local = frame->local; + prev = cookie; + + /* discard fails with EBADF if dht has not yet opened the fd + * on the cached subvol. This could happen if the file was migrated + * and a lookup updated the cached subvol in the inode ctx. + * We only check once as this could actually be a valid error. + */ + if (dht_check_remote_fd_failed_error(local, op_ret, op_errno)) { + ret = dht_check_and_open_fd_on_subvol(this, frame); + if (ret) + goto out; + return 0; + } - local = frame->local; - prev = cookie; + if ((op_ret == -1) && !dht_inode_missing(op_errno)) { + local->op_errno = op_errno; + local->op_ret = -1; + gf_msg_debug(this->name, op_errno, "subvolume %s returned -1", + prev->name); - if ((op_ret == -1) && !dht_inode_missing(op_errno)) { - local->op_errno = op_errno; - local->op_ret = -1; - gf_log (this->name, GF_LOG_DEBUG, - "subvolume %s returned -1 (%s)", - prev->this->name, strerror (op_errno)); + goto out; + } - goto out; + if (local->call_cnt != 1) { + if (local->stbuf.ia_blocks) { + dht_iatt_merge(this, postbuf, &local->stbuf); + dht_iatt_merge(this, prebuf, &local->prebuf); } + goto out; + } - if (local->call_cnt != 1) { - if (local->stbuf.ia_blocks) { - dht_iatt_merge (this, postbuf, &local->stbuf, NULL); - dht_iatt_merge (this, prebuf, &local->prebuf, NULL); - } - goto out; - } - local->rebalance.target_op_fn = dht_discard2; + local->rebalance.target_op_fn = dht_discard2; + local->op_ret = op_ret; + local->op_errno = op_errno; - /* Phase 2 of migration */ - if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (postbuf)) { - ret = dht_rebalance_complete_check (this, frame); - if (!ret) - return 0; - } + dht_set_local_rebalance(this, local, NULL, prebuf, postbuf, xdata); - /* Check if the rebalance phase1 is true */ - if (IS_DHT_MIGRATION_PHASE1 (postbuf)) { - dht_iatt_merge (this, &local->stbuf, postbuf, NULL); - dht_iatt_merge (this, &local->prebuf, prebuf, NULL); - dht_inode_ctx_get1 (this, local->fd->inode, &subvol); - if (subvol) { - dht_discard2 (this, frame, 0); - return 0; - } - ret = dht_rebalance_in_progress_check (this, frame); - if (!ret) - return 0; + /* Phase 2 of migration */ + if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2(postbuf)) { + ret = dht_rebalance_complete_check(this, frame); + if (!ret) + return 0; + } + + /* Check if the rebalance phase1 is true */ + if (IS_DHT_MIGRATION_PHASE1(postbuf)) { + dht_iatt_merge(this, &local->stbuf, postbuf); + dht_iatt_merge(this, &local->prebuf, prebuf); + + dht_inode_ctx_get_mig_info(this, local->fd->inode, &src_subvol, + &dst_subvol); + if (!dht_mig_info_is_invalid(local->cached_subvol, src_subvol, + dst_subvol)) { + if (dht_fd_open_on_dst(this, local->fd, dst_subvol)) { + dht_discard2(this, dst_subvol, frame, 0); + return 0; + } } + ret = dht_rebalance_in_progress_check(this, frame); + if (!ret) + return 0; + } out: - DHT_STRIP_PHASE1_FLAGS (postbuf); - DHT_STRIP_PHASE1_FLAGS (prebuf); - DHT_STACK_UNWIND (discard, frame, op_ret, op_errno, - prebuf, postbuf, xdata); + DHT_STRIP_PHASE1_FLAGS(postbuf); + DHT_STRIP_PHASE1_FLAGS(prebuf); + + DHT_STACK_UNWIND(discard, frame, op_ret, op_errno, prebuf, postbuf, xdata); err: - return 0; + return 0; } -int -dht_discard2(xlator_t *this, call_frame_t *frame, int op_ret) +static int +dht_discard2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret) { - dht_local_t *local = NULL; - xlator_t *subvol = NULL; - - local = frame->local; + dht_local_t *local = NULL; + int32_t op_errno = EINVAL; + + if (!frame || !frame->local) + goto out; + + local = frame->local; + op_errno = local->op_errno; + + if (we_are_not_migrating(ret)) { + /* This dht xlator is not migrating the file. Unwind and + * pass on the original mode bits so the higher DHT layer + * can handle this. + */ + DHT_STACK_UNWIND(discard, frame, local->op_ret, local->op_errno, + &local->rebalance.prebuf, &local->rebalance.postbuf, + local->rebalance.xdata); + return 0; + } - dht_inode_ctx_get1 (this, local->fd->inode, &subvol); + if (subvol == NULL) + goto out; - if (!subvol) - subvol = local->cached_subvol; + local->call_cnt = 2; /* This is the second attempt */ - local->call_cnt = 2; /* This is the second attempt */ + STACK_WIND_COOKIE(frame, dht_discard_cbk, subvol, subvol, + subvol->fops->discard, local->fd, local->rebalance.offset, + local->rebalance.size, local->xattr_req); - STACK_WIND(frame, dht_discard_cbk, subvol, subvol->fops->discard, - local->fd, local->rebalance.offset, local->rebalance.size, - NULL); + return 0; - return 0; +out: + DHT_STACK_UNWIND(discard, frame, -1, op_errno, NULL, NULL, NULL); + return 0; } int dht_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, - size_t len, dict_t *xdata) + size_t len, dict_t *xdata) { - xlator_t *subvol = NULL; - int op_errno = -1; - dht_local_t *local = NULL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); - - local = dht_local_init (frame, NULL, fd, GF_FOP_DISCARD); - if (!local) { - op_errno = ENOMEM; - goto err; - } + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; - local->rebalance.offset = offset; - local->rebalance.size = len; + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(fd, err); - local->call_cnt = 1; - subvol = local->cached_subvol; - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for fd=%p", fd); - op_errno = EINVAL; - goto err; - } + local = dht_local_init(frame, NULL, fd, GF_FOP_DISCARD); + if (!local) { + op_errno = ENOMEM; + goto err; + } - STACK_WIND (frame, dht_discard_cbk, subvol, subvol->fops->discard, - fd, offset, len, xdata); + local->rebalance.offset = offset; + local->rebalance.size = len; - return 0; + local->call_cnt = 1; + subvol = local->cached_subvol; + if (!subvol) { + gf_msg_debug(this->name, 0, "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + if (xdata) + local->xattr_req = dict_ref(xdata); + + STACK_WIND_COOKIE(frame, dht_discard_cbk, subvol, subvol, + subvol->fops->discard, fd, local->rebalance.offset, + local->rebalance.size, local->xattr_req); + + return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (discard, frame, -1, op_errno, NULL, NULL, NULL); + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND(discard, frame, -1, op_errno, NULL, NULL, NULL); - return 0; + return 0; } int -dht_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, struct iatt *prebuf, - struct iatt *postbuf, dict_t *xdata) +dht_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct iatt *prebuf, struct iatt *postbuf, + dict_t *xdata) { - dht_local_t *local = NULL; - call_frame_t *prev = NULL; - int ret = -1; + dht_local_t *local = NULL; + xlator_t *prev = NULL; + int ret = -1; + xlator_t *subvol1 = NULL, *subvol2 = NULL; + + GF_VALIDATE_OR_GOTO("dht", frame, err); + GF_VALIDATE_OR_GOTO("dht", this, out); + GF_VALIDATE_OR_GOTO("dht", frame->local, out); + GF_VALIDATE_OR_GOTO("dht", cookie, out); + + local = frame->local; + prev = cookie; + + /* zerofill fails with EBADF if dht has not yet opened the fd + * on the cached subvol. This could happen if the file was migrated + * and a lookup updated the cached subvol in the inode ctx. + * We only check once as this could actually be a valid error. + */ + if (dht_check_remote_fd_failed_error(local, op_ret, op_errno)) { + ret = dht_check_and_open_fd_on_subvol(this, frame); + if (ret) + goto out; + return 0; + } - GF_VALIDATE_OR_GOTO ("dht", frame, err); - GF_VALIDATE_OR_GOTO ("dht", this, out); - GF_VALIDATE_OR_GOTO ("dht", frame->local, out); - GF_VALIDATE_OR_GOTO ("dht", cookie, out); + if ((op_ret == -1) && !dht_inode_missing(op_errno)) { + local->op_errno = op_errno; + local->op_ret = -1; + gf_msg_debug(this->name, op_errno, "subvolume %s returned -1", + prev->name); + goto out; + } + + if (local->call_cnt != 1) { + if (local->stbuf.ia_blocks) { + dht_iatt_merge(this, postbuf, &local->stbuf); + dht_iatt_merge(this, prebuf, &local->prebuf); + } + goto out; + } - local = frame->local; - prev = cookie; + local->rebalance.target_op_fn = dht_zerofill2; + local->op_ret = op_ret; + local->op_errno = op_errno; - if ((op_ret == -1) && !dht_inode_missing(op_errno)) { - local->op_errno = op_errno; - local->op_ret = -1; - gf_log (this->name, GF_LOG_DEBUG, - "subvolume %s returned -1 (%s)", - prev->this->name, strerror (op_errno)); - goto out; - } + dht_set_local_rebalance(this, local, NULL, prebuf, postbuf, xdata); - if (local->call_cnt != 1) { - if (local->stbuf.ia_blocks) { - dht_iatt_merge (this, postbuf, &local->stbuf, NULL); - dht_iatt_merge (this, prebuf, &local->prebuf, NULL); - } - goto out; - } - local->rebalance.target_op_fn = dht_zerofill2; - /* Phase 2 of migration */ - if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (postbuf)) { - ret = dht_rebalance_complete_check (this, frame); - if (!ret) - return 0; + /* Phase 2 of migration */ + if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2(postbuf)) { + ret = dht_rebalance_complete_check(this, frame); + if (!ret) + return 0; + } + + /* Check if the rebalance phase1 is true */ + if (IS_DHT_MIGRATION_PHASE1(postbuf)) { + dht_iatt_merge(this, &local->stbuf, postbuf); + dht_iatt_merge(this, &local->prebuf, prebuf); + + ret = dht_inode_ctx_get_mig_info(this, local->fd->inode, &subvol1, + &subvol2); + if (!dht_mig_info_is_invalid(local->cached_subvol, subvol1, subvol2)) { + if (dht_fd_open_on_dst(this, local->fd, subvol2)) { + dht_zerofill2(this, subvol2, frame, 0); + return 0; + } } - /* Check if the rebalance phase1 is true */ - if (IS_DHT_MIGRATION_PHASE1 (postbuf)) { - dht_iatt_merge (this, &local->stbuf, postbuf, NULL); - dht_iatt_merge (this, &local->prebuf, prebuf, NULL); - ret = fd_ctx_get (local->fd, this, NULL); - if (!ret) { - dht_zerofill2 (this, frame, 0); - return 0; - } - ret = dht_rebalance_in_progress_check (this, frame); - if (!ret) - return 0; - } + ret = dht_rebalance_in_progress_check(this, frame); + if (!ret) + return 0; + } out: - DHT_STRIP_PHASE1_FLAGS (postbuf); - DHT_STRIP_PHASE1_FLAGS (prebuf); - DHT_STACK_UNWIND (zerofill, frame, op_ret, op_errno, - prebuf, postbuf, xdata); + DHT_STRIP_PHASE1_FLAGS(postbuf); + DHT_STRIP_PHASE1_FLAGS(prebuf); + + DHT_STACK_UNWIND(zerofill, frame, op_ret, op_errno, prebuf, postbuf, xdata); err: - return 0; + return 0; } -int -dht_zerofill2(xlator_t *this, call_frame_t *frame, int op_ret) +static int +dht_zerofill2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret) { - dht_local_t *local = NULL; - xlator_t *subvol = NULL; - uint64_t tmp_subvol = 0; - int ret = -1; - - local = frame->local; + dht_local_t *local = NULL; + int32_t op_errno = EINVAL; - if (local->fd) - ret = fd_ctx_get (local->fd, this, &tmp_subvol); - if (!ret) - subvol = (xlator_t *)(long)tmp_subvol; + if (!frame || !frame->local) + goto out; - if (!subvol) - subvol = local->cached_subvol; + local = frame->local; - local->call_cnt = 2; /* This is the second attempt */ + op_errno = local->op_errno; - STACK_WIND(frame, dht_zerofill_cbk, subvol, subvol->fops->zerofill, - local->fd, local->rebalance.offset, local->rebalance.size, - NULL); + if (we_are_not_migrating(ret)) { + /* This dht xlator is not migrating the file. Unwind and + * pass on the original mode bits so the higher DHT layer + * can handle this. + */ + DHT_STACK_UNWIND(zerofill, frame, local->op_ret, local->op_errno, + &local->rebalance.prebuf, &local->rebalance.postbuf, + local->rebalance.xdata); return 0; + } + + if (subvol == NULL) + goto out; + + local->call_cnt = 2; /* This is the second attempt */ + + STACK_WIND_COOKIE(frame, dht_zerofill_cbk, subvol, subvol, + subvol->fops->zerofill, local->fd, + local->rebalance.offset, local->rebalance.size, + local->xattr_req); + + return 0; + +out: + + DHT_STACK_UNWIND(zerofill, frame, -1, op_errno, NULL, NULL, NULL); + return 0; } int dht_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, - off_t len, dict_t *xdata) + off_t len, dict_t *xdata) { - xlator_t *subvol = NULL; - int op_errno = -1; - dht_local_t *local = NULL; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); - - local = dht_local_init (frame, NULL, fd, GF_FOP_ZEROFILL); - if (!local) { - op_errno = ENOMEM; - goto err; - } + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; - local->rebalance.offset = offset; - local->rebalance.size = len; + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(fd, err); - local->call_cnt = 1; - subvol = local->cached_subvol; - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no cached subvolume for fd=%p", fd); - op_errno = EINVAL; - goto err; - } + local = dht_local_init(frame, NULL, fd, GF_FOP_ZEROFILL); + if (!local) { + op_errno = ENOMEM; + goto err; + } - STACK_WIND (frame, dht_zerofill_cbk, subvol, subvol->fops->zerofill, - fd, offset, len, xdata); + local->rebalance.offset = offset; + local->rebalance.size = len; - return 0; + local->call_cnt = 1; + subvol = local->cached_subvol; + if (!subvol) { + gf_msg_debug(this->name, 0, "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } -err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (zerofill, frame, -1, op_errno, NULL, NULL, NULL); + if (xdata) + local->xattr_req = dict_ref(xdata); - return 0; -} + STACK_WIND_COOKIE(frame, dht_zerofill_cbk, subvol, subvol, + subvol->fops->zerofill, fd, local->rebalance.offset, + local->rebalance.size, local->xattr_req); + return 0; +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND(zerofill, frame, -1, op_errno, NULL, NULL, NULL); + + return 0; +} /* handle cases of migration here for 'setattr()' calls */ int -dht_file_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, struct iatt *prebuf, - struct iatt *postbuf, dict_t *xdata) +dht_file_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - dht_local_t *local = NULL; - call_frame_t *prev = NULL; - int ret = -1; + dht_local_t *local = NULL; + xlator_t *prev = NULL; + int ret = -1; - local = frame->local; - prev = cookie; + local = frame->local; + prev = cookie; - local->op_errno = op_errno; - if ((op_ret == -1) && !dht_inode_missing(op_errno)) { - gf_log (this->name, GF_LOG_DEBUG, - "subvolume %s returned -1 (%s)", - prev->this->name, strerror (op_errno)); - goto out; - } + local->op_errno = op_errno; - if (local->call_cnt != 1) - goto out; + if ((local->fop == GF_FOP_FSETATTR) && + dht_check_remote_fd_failed_error(local, op_ret, op_errno)) { + ret = dht_check_and_open_fd_on_subvol(this, frame); + if (ret) + goto out; + return 0; + } - local->rebalance.target_op_fn = dht_setattr2; + if ((op_ret == -1) && !dht_inode_missing(op_errno)) { + gf_msg_debug(this->name, op_errno, "subvolume %s returned -1", + prev->name); + goto out; + } - /* Phase 2 of migration */ - if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (postbuf)) { - ret = dht_rebalance_complete_check (this, frame); - if (!ret) - return 0; - } + if (local->call_cnt != 1) + goto out; + + local->op_ret = op_ret; + local->op_errno = op_errno; + + local->rebalance.target_op_fn = dht_setattr2; - /* At the end of the migration process, whatever 'attr' we - have on source file will be migrated to destination file - in one shot, hence we don't need to check for in progress - state here (ie, PHASE1) */ + /* Phase 2 of migration */ + if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2(postbuf)) { + dht_set_local_rebalance(this, local, NULL, prebuf, postbuf, xdata); + + ret = dht_rebalance_complete_check(this, frame); + if (!ret) + return 0; + } + + /* At the end of the migration process, whatever 'attr' we + have on source file will be migrated to destination file + in one shot, hence we don't need to check for in progress + state here (ie, PHASE1) */ out: - DHT_STRIP_PHASE1_FLAGS (postbuf); - DHT_STRIP_PHASE1_FLAGS (prebuf); - DHT_STACK_UNWIND (setattr, frame, op_ret, op_errno, - prebuf, postbuf, xdata); + DHT_STRIP_PHASE1_FLAGS(postbuf); + DHT_STRIP_PHASE1_FLAGS(prebuf); - return 0; + DHT_STACK_UNWIND(setattr, frame, op_ret, op_errno, prebuf, postbuf, xdata); + + return 0; } -int -dht_setattr2 (xlator_t *this, call_frame_t *frame, int op_ret) +static int +dht_setattr2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret) { - dht_local_t *local = NULL; - xlator_t *subvol = NULL; - inode_t *inode = NULL; + dht_local_t *local = NULL; + int32_t op_errno = EINVAL; + + if (!frame || !frame->local) + goto out; + + local = frame->local; + op_errno = local->op_errno; + + if (we_are_not_migrating(ret)) { + /* This dht xlator is not migrating the file. Unwind and + * pass on the original mode bits so the higher DHT layer + * can handle this. + */ + DHT_STACK_UNWIND(setattr, frame, local->op_ret, local->op_errno, + &local->rebalance.prebuf, &local->rebalance.postbuf, + local->rebalance.xdata); + return 0; + } - local = frame->local; + if (subvol == NULL) + goto out; - inode = (local->fd) ? local->fd->inode : local->loc.inode; + local->call_cnt = 2; /* This is the second attempt */ - dht_inode_ctx_get1 (this, inode, &subvol); + if (local->fop == GF_FOP_SETATTR) { + STACK_WIND_COOKIE(frame, dht_file_setattr_cbk, subvol, subvol, + subvol->fops->setattr, &local->loc, + &local->rebalance.stbuf, local->rebalance.flags, + local->xattr_req); + } else { + STACK_WIND_COOKIE(frame, dht_file_setattr_cbk, subvol, subvol, + subvol->fops->fsetattr, local->fd, + &local->rebalance.stbuf, local->rebalance.flags, + local->xattr_req); + } - if (!subvol) - subvol = local->cached_subvol; + return 0; - local->call_cnt = 2; /* This is the second attempt */ +out: + DHT_STACK_UNWIND(setattr, frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} - if (local->fop == GF_FOP_SETATTR) { - STACK_WIND (frame, dht_file_setattr_cbk, subvol, - subvol->fops->setattr, &local->loc, - &local->rebalance.stbuf, local->rebalance.flags, - NULL); - } else { - STACK_WIND (frame, dht_file_setattr_cbk, subvol, - subvol->fops->fsetattr, local->fd, - &local->rebalance.stbuf, local->rebalance.flags, - NULL); +/* Keep the existing code same for all the cases other than regular file */ +int +dht_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct iatt *statpre, struct iatt *statpost, + dict_t *xdata) +{ + dht_local_t *local = NULL; + int this_call_cnt = 0; + xlator_t *prev = NULL; + + local = frame->local; + prev = cookie; + + LOCK(&frame->lock); + { + if (op_ret == -1) { + local->op_errno = op_errno; + UNLOCK(&frame->lock); + gf_msg_debug(this->name, op_errno, "subvolume %s returned -1", + prev->name); + goto post_unlock; } - return 0; + dht_iatt_merge(this, &local->prebuf, statpre); + dht_iatt_merge(this, &local->stbuf, statpost); + + local->op_ret = 0; + local->op_errno = 0; + } + UNLOCK(&frame->lock); +post_unlock: + this_call_cnt = dht_frame_return(frame); + if (is_last_call(this_call_cnt)) { + if (local->op_ret == 0) + dht_inode_ctx_time_set(local->loc.inode, this, &local->stbuf); + DHT_STACK_UNWIND(setattr, frame, local->op_ret, local->op_errno, + &local->prebuf, &local->stbuf, xdata); + } + + return 0; } - /* Keep the existing code same for all the cases other than regular file */ int -dht_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, struct iatt *statpre, - struct iatt *statpost, dict_t *xdata) +dht_non_mds_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *statpre, + struct iatt *statpost, dict_t *xdata) { - dht_local_t *local = NULL; - int this_call_cnt = 0; - call_frame_t *prev = NULL; - - - local = frame->local; - prev = cookie; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - local->op_errno = op_errno; - gf_log (this->name, GF_LOG_DEBUG, - "subvolume %s returned -1 (%s)", - prev->this->name, strerror (op_errno)); - goto unlock; - } + dht_local_t *local = NULL; + int this_call_cnt = 0; + xlator_t *prev = NULL; + + local = frame->local; + prev = cookie; + + if (op_ret == -1) { + gf_msg(this->name, op_errno, 0, 0, "subvolume %s returned -1", + prev->name); + goto post_unlock; + } + + LOCK(&frame->lock); + { + dht_iatt_merge(this, &local->prebuf, statpre); + dht_iatt_merge(this, &local->stbuf, statpost); + + local->op_ret = 0; + local->op_errno = 0; + } + UNLOCK(&frame->lock); +post_unlock: + this_call_cnt = dht_frame_return(frame); + if (is_last_call(this_call_cnt)) { + dht_inode_ctx_time_set(local->loc.inode, this, &local->stbuf); + DHT_STACK_UNWIND(setattr, frame, 0, 0, &local->prebuf, &local->stbuf, + xdata); + } + + return 0; +} - dht_iatt_merge (this, &local->prebuf, statpre, prev->this); - dht_iatt_merge (this, &local->stbuf, statpost, prev->this); +int +dht_mds_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *statpre, + struct iatt *statpost, dict_t *xdata) - local->op_ret = 0; - } -unlock: - UNLOCK (&frame->lock); - - this_call_cnt = dht_frame_return (frame); - if (is_last_call (this_call_cnt)) { - if (local->op_ret == 0) - dht_inode_ctx_time_set (local->loc.inode, this, - &local->stbuf); - DHT_STACK_UNWIND (setattr, frame, local->op_ret, local->op_errno, - &local->prebuf, &local->stbuf, xdata); - } +{ + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + xlator_t *prev = NULL; + xlator_t *mds_subvol = NULL; + struct iatt loc_stbuf = { + 0, + }; + int i = 0; + + local = frame->local; + prev = cookie; + conf = this->private; + mds_subvol = local->mds_subvol; + + if (op_ret == -1) { + local->op_ret = op_ret; + local->op_errno = op_errno; + gf_msg_debug(this->name, op_errno, "subvolume %s returned -1", + prev->name); + goto out; + } + + local->op_ret = 0; + loc_stbuf = local->stbuf; + dht_iatt_merge(this, &local->prebuf, statpre); + dht_iatt_merge(this, &local->stbuf, statpost); + + local->call_cnt = conf->subvolume_cnt - 1; + for (i = 0; i < conf->subvolume_cnt; i++) { + if (mds_subvol == conf->subvolumes[i]) + continue; + STACK_WIND_COOKIE(frame, dht_non_mds_setattr_cbk, conf->subvolumes[i], + conf->subvolumes[i], + conf->subvolumes[i]->fops->setattr, &local->loc, + &loc_stbuf, local->valid, local->xattr_req); + } + + return 0; +out: + DHT_STACK_UNWIND(setattr, frame, local->op_ret, local->op_errno, + &local->prebuf, &local->stbuf, xdata); - return 0; + return 0; } - int -dht_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, - struct iatt *stbuf, int32_t valid, dict_t *xdata) +dht_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf, + int32_t valid, dict_t *xdata) { - xlator_t *subvol = NULL; - dht_layout_t *layout = NULL; - dht_local_t *local = NULL; - int op_errno = -1; - int i = -1; - int call_cnt = 0; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); - - local = dht_local_init (frame, loc, NULL, GF_FOP_SETATTR); - if (!local) { - op_errno = ENOMEM; - goto err; - } + xlator_t *subvol = NULL; + xlator_t *mds_subvol = NULL; + dht_layout_t *layout = NULL; + dht_local_t *local = NULL; + int op_errno = -1; + int i = -1; + int ret = -1; + int call_cnt = 0; + dht_conf_t *conf = NULL; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(loc, err); + VALIDATE_OR_GOTO(loc->inode, err); + VALIDATE_OR_GOTO(loc->path, err); + + conf = this->private; + local = dht_local_init(frame, loc, NULL, GF_FOP_SETATTR); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + layout = local->layout; + if (!layout) { + gf_msg_debug(this->name, 0, "no layout for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + + if (!layout_is_sane(layout)) { + gf_msg_debug(this->name, 0, "layout is not sane for path=%s", + loc->path); + op_errno = EINVAL; + goto err; + } + if (xdata) + local->xattr_req = dict_ref(xdata); + + if (IA_ISREG(loc->inode->ia_type)) { + /* in the regular file _cbk(), we need to check for + migration possibilities */ + local->rebalance.stbuf = *stbuf; + local->rebalance.flags = valid; + local->call_cnt = 1; + subvol = local->cached_subvol; - layout = local->layout; - if (!layout) { - gf_log (this->name, GF_LOG_DEBUG, - "no layout for path=%s", loc->path); - op_errno = EINVAL; - goto err; - } + STACK_WIND_COOKIE(frame, dht_file_setattr_cbk, subvol, subvol, + subvol->fops->setattr, loc, stbuf, valid, xdata); - if (!layout_is_sane (layout)) { - gf_log (this->name, GF_LOG_DEBUG, - "layout is not sane for path=%s", loc->path); - op_errno = EINVAL; - goto err; + return 0; + } + + local->call_cnt = call_cnt = layout->cnt; + + if (IA_ISDIR(loc->inode->ia_type) && !__is_root_gfid(loc->inode->gfid) && + call_cnt != 1) { + ret = dht_inode_ctx_mdsvol_get(loc->inode, this, &mds_subvol); + if (ret || !mds_subvol) { + gf_msg(this->name, GF_LOG_ERROR, 0, + DHT_MSG_HASHED_SUBVOL_GET_FAILED, + "Failed to get mds subvol for path %s", local->loc.path); + op_errno = EINVAL; + goto err; } - if (IA_ISREG (loc->inode->ia_type)) { - /* in the regular file _cbk(), we need to check for - migration possibilities */ - local->rebalance.stbuf = *stbuf; - local->rebalance.flags = valid; - local->call_cnt = 1; - subvol = local->cached_subvol; - - STACK_WIND (frame, dht_file_setattr_cbk, subvol, - subvol->fops->setattr, - loc, stbuf, valid, xdata); - - return 0; + local->mds_subvol = mds_subvol; + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->subvolumes[i] == mds_subvol) { + if (!conf->subvolume_status[i]) { + gf_msg(this->name, GF_LOG_WARNING, layout->list[i].err, + DHT_MSG_HASHED_SUBVOL_DOWN, + "MDS subvol is down for path " + " %s Unable to set attr ", + local->loc.path); + op_errno = ENOTCONN; + goto err; + } + } } + local->valid = valid; + local->stbuf = *stbuf; - local->call_cnt = call_cnt = layout->cnt; - + STACK_WIND_COOKIE(frame, dht_mds_setattr_cbk, local->mds_subvol, + local->mds_subvol, local->mds_subvol->fops->setattr, + loc, stbuf, valid, xdata); + return 0; + } else { for (i = 0; i < call_cnt; i++) { - STACK_WIND (frame, dht_setattr_cbk, - layout->list[i].xlator, - layout->list[i].xlator->fops->setattr, - loc, stbuf, valid, xdata); + STACK_WIND_COOKIE(frame, dht_setattr_cbk, layout->list[i].xlator, + layout->list[i].xlator, + layout->list[i].xlator->fops->setattr, loc, stbuf, + valid, xdata); } + } - return 0; + return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (setattr, frame, -1, op_errno, NULL, NULL, NULL); + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND(setattr, frame, -1, op_errno, NULL, NULL, NULL); - return 0; + return 0; } - int -dht_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *stbuf, - int32_t valid, dict_t *xdata) +dht_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *stbuf, + int32_t valid, dict_t *xdata) { - xlator_t *subvol = NULL; - dht_layout_t *layout = NULL; - dht_local_t *local = NULL; - int op_errno = -1; - int i = -1; - int call_cnt = 0; - - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (fd, err); - - local = dht_local_init (frame, NULL, fd, GF_FOP_FSETATTR); - if (!local) { - op_errno = ENOMEM; - goto err; - } - - layout = local->layout; - if (!layout) { - gf_log (this->name, GF_LOG_DEBUG, - "no layout for fd=%p", fd); - op_errno = EINVAL; - goto err; - } - - if (!layout_is_sane (layout)) { - gf_log (this->name, GF_LOG_DEBUG, - "layout is not sane for fd=%p", fd); - op_errno = EINVAL; - goto err; - } - - if (IA_ISREG (fd->inode->ia_type)) { - /* in the regular file _cbk(), we need to check for - migration possibilities */ - local->rebalance.stbuf = *stbuf; - local->rebalance.flags = valid; - local->call_cnt = 1; - subvol = local->cached_subvol; - - STACK_WIND (frame, dht_file_setattr_cbk, subvol, - subvol->fops->fsetattr, - fd, stbuf, valid, xdata); + xlator_t *subvol = NULL; + dht_layout_t *layout = NULL; + dht_local_t *local = NULL; + int op_errno = -1; + int i = -1; + int call_cnt = 0; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(fd, err); + + local = dht_local_init(frame, NULL, fd, GF_FOP_FSETATTR); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + layout = local->layout; + if (!layout) { + gf_msg_debug(this->name, 0, "no layout for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + if (!layout_is_sane(layout)) { + gf_msg_debug(this->name, 0, "layout is not sane for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + if (xdata) + local->xattr_req = dict_ref(xdata); + + if (IA_ISREG(fd->inode->ia_type)) { + /* in the regular file _cbk(), we need to check for + migration possibilities */ + local->rebalance.stbuf = *stbuf; + local->rebalance.flags = valid; + local->call_cnt = 1; + subvol = local->cached_subvol; - return 0; - } + STACK_WIND_COOKIE(frame, dht_file_setattr_cbk, subvol, subvol, + subvol->fops->fsetattr, fd, &local->rebalance.stbuf, + local->rebalance.flags, local->xattr_req); + return 0; + } - local->call_cnt = call_cnt = layout->cnt; + local->call_cnt = call_cnt = layout->cnt; - for (i = 0; i < call_cnt; i++) { - STACK_WIND (frame, dht_setattr_cbk, - layout->list[i].xlator, - layout->list[i].xlator->fops->fsetattr, - fd, stbuf, valid, xdata); - } + for (i = 0; i < call_cnt; i++) { + STACK_WIND_COOKIE(frame, dht_setattr_cbk, layout->list[i].xlator, + layout->list[i].xlator, + layout->list[i].xlator->fops->fsetattr, fd, stbuf, + valid, xdata); + } - return 0; + return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (fsetattr, frame, -1, op_errno, NULL, NULL, NULL); + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND(fsetattr, frame, -1, op_errno, NULL, NULL, NULL); - return 0; + return 0; } diff --git a/xlators/cluster/dht/src/dht-layout.c b/xlators/cluster/dht/src/dht-layout.c index e1a37b77cda..fda904c92c9 100644 --- a/xlators/cluster/dht/src/dht-layout.c +++ b/xlators/cluster/dht/src/dht-layout.c @@ -8,799 +8,801 @@ cases as published by the Free Software Foundation. */ -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - - -#include "glusterfs.h" -#include "xlator.h" #include "dht-common.h" -#include "byte-order.h" +#include <glusterfs/byte-order.h> +#include "unittest/unittest.h" -#define layout_base_size (sizeof (dht_layout_t)) +#define layout_base_size (sizeof(dht_layout_t)) -#define layout_entry_size (sizeof ((dht_layout_t *)NULL)->list[0]) +#define layout_entry_size (sizeof((dht_layout_t *)NULL)->list[0]) #define layout_size(cnt) (layout_base_size + (cnt * layout_entry_size)) dht_layout_t * -dht_layout_new (xlator_t *this, int cnt) +dht_layout_new(xlator_t *this, int cnt) { - dht_layout_t *layout = NULL; - dht_conf_t *conf = NULL; + dht_layout_t *layout = NULL; + dht_conf_t *conf = NULL; - conf = this->private; + REQUIRE(NULL != this); + REQUIRE(cnt >= 0); - layout = GF_CALLOC (1, layout_size (cnt), - gf_dht_mt_dht_layout_t); - if (!layout) { - goto out; - } + conf = this->private; - layout->type = DHT_HASH_TYPE_DM; - layout->cnt = cnt; + layout = GF_CALLOC(1, layout_size(cnt), gf_dht_mt_dht_layout_t); + if (!layout) { + goto out; + } - if (conf) { - layout->spread_cnt = conf->dir_spread_cnt; - layout->gen = conf->gen; - } + layout->type = DHT_HASH_TYPE_DM; + layout->cnt = cnt; + + if (conf) { + layout->spread_cnt = conf->dir_spread_cnt; + layout->gen = conf->gen; + } - layout->ref = 1; + GF_ATOMIC_INIT(layout->ref, 1); + ENSURE(NULL != layout); + ENSURE(layout->type == DHT_HASH_TYPE_DM); + ENSURE(layout->cnt == cnt); + ENSURE(GF_ATOMIC_GET(layout->ref) == 1); out: - return layout; + return layout; } - dht_layout_t * -dht_layout_get (xlator_t *this, inode_t *inode) +dht_layout_get(xlator_t *this, inode_t *inode) { - dht_conf_t *conf = NULL; - dht_layout_t *layout = NULL; - - conf = this->private; - if (!conf) - goto out; - - LOCK (&conf->layout_lock); - { - dht_inode_ctx_layout_get (inode, this, &layout); - if (layout) { - layout->ref++; - } - } - UNLOCK (&conf->layout_lock); - -out: - return layout; + dht_layout_t *layout = NULL; + int ret = 0; + + ret = dht_inode_ctx_layout_get(inode, this, &layout); + if ((!ret) && layout) { + GF_ATOMIC_INC(layout->ref); + } + return layout; } - int -dht_layout_set (xlator_t *this, inode_t *inode, dht_layout_t *layout) +dht_layout_set(xlator_t *this, inode_t *inode, dht_layout_t *layout) { - dht_conf_t *conf = NULL; - int oldret = -1; - int ret = 0; - dht_layout_t *old_layout; - - conf = this->private; - if (!conf) - goto out; - - LOCK (&conf->layout_lock); - { - oldret = dht_inode_ctx_layout_get (inode, this, &old_layout); - layout->ref++; - dht_inode_ctx_layout_set (inode, this, layout); - } - UNLOCK (&conf->layout_lock); - - if (!oldret) { - dht_layout_unref (this, old_layout); - } + dht_conf_t *conf = NULL; + int oldret = -1; + int ret = -1; + dht_layout_t *old_layout; + + conf = this->private; + if (!conf || !layout) + goto out; + + LOCK(&conf->layout_lock); + { + oldret = dht_inode_ctx_layout_get(inode, this, &old_layout); + if (layout) + GF_ATOMIC_INC(layout->ref); + ret = dht_inode_ctx_layout_set(inode, this, layout); + } + UNLOCK(&conf->layout_lock); + + if (!oldret) { + dht_layout_unref(this, old_layout); + } + if (ret) + GF_ATOMIC_DEC(layout->ref); out: - return ret; + return ret; } - void -dht_layout_unref (xlator_t *this, dht_layout_t *layout) +dht_layout_unref(xlator_t *this, dht_layout_t *layout) { - dht_conf_t *conf = NULL; - int ref = 0; + int ref = 0; - if (!layout || layout->preset || !this->private) - return; + if (!layout || layout->preset || !this->private) + return; - conf = this->private; - - LOCK (&conf->layout_lock); - { - ref = --layout->ref; - } - UNLOCK (&conf->layout_lock); + ref = GF_ATOMIC_DEC(layout->ref); - if (!ref) - GF_FREE (layout); + if (!ref) + GF_FREE(layout); } - dht_layout_t * -dht_layout_ref (xlator_t *this, dht_layout_t *layout) +dht_layout_ref(xlator_t *this, dht_layout_t *layout) { - dht_conf_t *conf = NULL; - - if (layout->preset || !this->private) - return layout; + if (layout->preset || !this->private) + return layout; - conf = this->private; - LOCK (&conf->layout_lock); - { - layout->ref++; - } - UNLOCK (&conf->layout_lock); + GF_ATOMIC_INC(layout->ref); - return layout; + return layout; } - xlator_t * -dht_layout_search (xlator_t *this, dht_layout_t *layout, const char *name) +dht_layout_search(xlator_t *this, dht_layout_t *layout, const char *name) { - uint32_t hash = 0; - xlator_t *subvol = NULL; - int i = 0; - int ret = 0; - - - ret = dht_hash_compute (this, layout->type, name, &hash); - if (ret != 0) { - gf_log (this->name, GF_LOG_WARNING, - "hash computation failed for type=%d name=%s", - layout->type, name); - goto out; - } - - for (i = 0; i < layout->cnt; i++) { - if (layout->list[i].start <= hash - && layout->list[i].stop >= hash) { - subvol = layout->list[i].xlator; - break; - } - } - - if (!subvol) { - gf_log (this->name, GF_LOG_WARNING, - "no subvolume for hash (value) = %u", hash); - } + uint32_t hash = 0; + xlator_t *subvol = NULL; + int i = 0; + int ret = 0; + + ret = dht_hash_compute(this, layout->type, name, &hash); + if (ret != 0) { + gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_COMPUTE_HASH_FAILED, + "type=%d", layout->type, "name=%s", name, NULL); + goto out; + } + + for (i = 0; i < layout->cnt; i++) { + if (layout->list[i].start <= hash && layout->list[i].stop >= hash) { + subvol = layout->list[i].xlator; + break; + } + } + + if (!subvol) { + gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_HASHED_SUBVOL_GET_FAILED, + "hash-value=0x%x", hash, NULL); + } out: - return subvol; + return subvol; } - dht_layout_t * -dht_layout_for_subvol (xlator_t *this, xlator_t *subvol) +dht_layout_for_subvol(xlator_t *this, xlator_t *subvol) { - dht_conf_t *conf = NULL; - dht_layout_t *layout = NULL; - int i = 0; - - conf = this->private; - if (!conf) - goto out; - - for (i = 0; i < conf->subvolume_cnt; i++) { - if (conf->subvolumes[i] == subvol) { - layout = conf->file_layouts[i]; - break; - } + dht_conf_t *conf = NULL; + dht_layout_t *layout = NULL; + int i = 0; + + conf = this->private; + if (!conf) + goto out; + + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->subvolumes[i] == subvol) { + layout = conf->file_layouts[i]; + break; } + } out: - return layout; + return layout; } - int -dht_layouts_init (xlator_t *this, dht_conf_t *conf) +dht_layouts_init(xlator_t *this, dht_conf_t *conf) { - dht_layout_t *layout = NULL; - int i = 0; - int ret = -1; - - if (!conf) - goto out; - - conf->file_layouts = GF_CALLOC (conf->subvolume_cnt, - sizeof (dht_layout_t *), - gf_dht_mt_dht_layout_t); - if (!conf->file_layouts) { - goto out; - } - - for (i = 0; i < conf->subvolume_cnt; i++) { - layout = dht_layout_new (this, 1); + dht_layout_t *layout = NULL; + int i = 0; + int ret = -1; - if (!layout) { - goto out; - } + if (!conf) + goto out; - layout->preset = 1; + conf->file_layouts = GF_CALLOC(conf->subvolume_cnt, sizeof(dht_layout_t *), + gf_dht_mt_dht_layout_t); + if (!conf->file_layouts) { + goto out; + } - layout->list[0].xlator = conf->subvolumes[i]; + for (i = 0; i < conf->subvolume_cnt; i++) { + layout = dht_layout_new(this, 1); - conf->file_layouts[i] = layout; + if (!layout) { + goto out; } - ret = 0; + layout->preset = 1; + + layout->list[0].xlator = conf->subvolumes[i]; + + conf->file_layouts[i] = layout; + } + + ret = 0; out: - return ret; + return ret; } - int -dht_disk_layout_extract (xlator_t *this, dht_layout_t *layout, - int pos, int32_t **disk_layout_p) +dht_disk_layout_extract(xlator_t *this, dht_layout_t *layout, int pos, + int32_t **disk_layout_p) { - int ret = -1; - int32_t *disk_layout = NULL; + int ret = -1; + int32_t *disk_layout = NULL; - disk_layout = GF_CALLOC (5, sizeof (int), - gf_dht_mt_int32_t); - if (!disk_layout) { - goto out; - } + disk_layout = GF_CALLOC(5, sizeof(int), gf_dht_mt_int32_t); + if (!disk_layout) { + goto out; + } - disk_layout[0] = hton32 (1); - disk_layout[1] = hton32 (layout->type); - disk_layout[2] = hton32 (layout->list[pos].start); - disk_layout[3] = hton32 (layout->list[pos].stop); + disk_layout[0] = hton32(layout->list[pos].commit_hash); + disk_layout[1] = hton32(layout->type); + disk_layout[2] = hton32(layout->list[pos].start); + disk_layout[3] = hton32(layout->list[pos].stop); - if (disk_layout_p) - *disk_layout_p = disk_layout; - else - GF_FREE (disk_layout); + if (disk_layout_p) + *disk_layout_p = disk_layout; + else + GF_FREE(disk_layout); - ret = 0; + ret = 0; out: - return ret; + return ret; } - int -dht_disk_layout_merge (xlator_t *this, dht_layout_t *layout, - int pos, void *disk_layout_raw, int disk_layout_len) +dht_disk_layout_extract_for_subvol(xlator_t *this, dht_layout_t *layout, + xlator_t *subvol, int32_t **disk_layout_p) { - int cnt = 0; - int type = 0; - int start_off = 0; - int stop_off = 0; - int disk_layout[4]; - - if (!disk_layout_raw) { - gf_log (this->name, GF_LOG_CRITICAL, - "error no layout on disk for merge"); - return -1; - } - - GF_ASSERT (disk_layout_len == sizeof (disk_layout)); - - memcpy (disk_layout, disk_layout_raw, disk_layout_len); - - cnt = ntoh32 (disk_layout[0]); - if (cnt != 1) { - gf_log (this->name, GF_LOG_ERROR, - "disk layout has invalid count %d", cnt); - return -1; - } + int i = 0; + + for (i = 0; i < layout->cnt; i++) { + if (layout->list[i].xlator == subvol) + break; + } + + if (i == layout->cnt) + return -1; + + return dht_disk_layout_extract(this, layout, i, disk_layout_p); +} + +static int +dht_disk_layout_merge(xlator_t *this, dht_layout_t *layout, int pos, + void *disk_layout_raw, int disk_layout_len) +{ + int type = 0; + int start_off = 0; + int stop_off = 0; + int commit_hash = 0; + int disk_layout[4]; + + if (!disk_layout_raw) { + gf_smsg(this->name, GF_LOG_CRITICAL, 0, DHT_MSG_LAYOUT_MERGE_FAILED, + NULL); + return -1; + } + + GF_ASSERT(disk_layout_len == sizeof(disk_layout)); + + memcpy(disk_layout, disk_layout_raw, disk_layout_len); - type = ntoh32 (disk_layout[1]); - switch (type) { + type = ntoh32(disk_layout[1]); + switch (type) { case DHT_HASH_TYPE_DM_USER: - gf_log (this->name, GF_LOG_DEBUG, "found user-set layout"); - layout->type = type; - /* Fall through. */ - case DHT_HASH_TYPE_DM: - break; + gf_msg_debug(this->name, 0, "found user-set layout"); + layout->type = type; + /* Fall through. */ + case DHT_HASH_TYPE_DM: + break; default: - gf_log (this->name, GF_LOG_CRITICAL, - "Catastrophic error layout with unknown type found %d", - disk_layout[1]); - return -1; - } + gf_smsg(this->name, GF_LOG_CRITICAL, 0, DHT_MSG_INVALID_DISK_LAYOUT, + "layout=%d", disk_layout[1], NULL); + return -1; + } - start_off = ntoh32 (disk_layout[2]); - stop_off = ntoh32 (disk_layout[3]); + commit_hash = ntoh32(disk_layout[0]); + start_off = ntoh32(disk_layout[2]); + stop_off = ntoh32(disk_layout[3]); - layout->list[pos].start = start_off; - layout->list[pos].stop = stop_off; + layout->list[pos].commit_hash = commit_hash; + layout->list[pos].start = start_off; + layout->list[pos].stop = stop_off; - gf_log (this->name, GF_LOG_TRACE, - "merged to layout: %u - %u (type %d) from %s", - start_off, stop_off, type, - layout->list[pos].xlator->name); + gf_msg_trace(this->name, 0, + "merged to layout: 0x%x - 0x%x (hash 0x%x, type %d) from %s", + start_off, stop_off, commit_hash, type, + layout->list[pos].xlator->name); - return 0; + return 0; } - int -dht_layout_merge (xlator_t *this, dht_layout_t *layout, xlator_t *subvol, - int op_ret, int op_errno, dict_t *xattr) +dht_layout_merge(xlator_t *this, dht_layout_t *layout, xlator_t *subvol, + int op_ret, int op_errno, dict_t *xattr) { - int i = 0; - int ret = -1; - int err = -1; - void *disk_layout_raw = NULL; - int disk_layout_len = 0; - dht_conf_t *conf = this->private; - - if (op_ret != 0) { - err = op_errno; - } + int i = 0; + int ret = -1; + int err = -1; + void *disk_layout_raw = NULL; + int disk_layout_len = 0; + dht_conf_t *conf = this->private; - for (i = 0; i < layout->cnt; i++) { - if (layout->list[i].xlator == NULL) { - layout->list[i].err = err; - layout->list[i].xlator = subvol; - break; - } - } + if (op_ret != 0) { + err = op_errno; + } - if (op_ret != 0) { - ret = 0; - goto out; - } + if (!layout) + goto out; - if (xattr) { - /* during lookup and not mkdir */ - ret = dict_get_ptr_and_len (xattr, conf->xattr_name, - &disk_layout_raw, &disk_layout_len); + for (i = 0; i < layout->cnt; i++) { + if (layout->list[i].xlator == NULL) { + layout->list[i].err = err; + layout->list[i].xlator = subvol; + break; } + } - if (ret != 0) { - layout->list[i].err = 0; - gf_log (this->name, GF_LOG_TRACE, - "missing disk layout on %s. err = %d", - subvol->name, err); - ret = 0; - goto out; - } + if (op_ret != 0) { + ret = 0; + goto out; + } - ret = dht_disk_layout_merge (this, layout, i, disk_layout_raw, - disk_layout_len); - if (ret != 0) { - gf_log (this->name, GF_LOG_WARNING, - "layout merge from subvolume %s failed", - subvol->name); - goto out; - } + if (xattr) { + /* during lookup and not mkdir */ + ret = dict_get_ptr_and_len(xattr, conf->xattr_name, &disk_layout_raw, + &disk_layout_len); + } + + if (ret != 0) { layout->list[i].err = 0; + gf_msg_trace(this->name, 0, "Missing disk layout on %s. err = %d", + subvol->name, err); + ret = 0; + goto out; + } + + ret = dht_disk_layout_merge(this, layout, i, disk_layout_raw, + disk_layout_len); + if (ret != 0) { + gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_LAYOUT_MERGE_FAILED, + "subvolume=%s", subvol->name, NULL); + goto out; + } + + if (layout->commit_hash == 0) { + layout->commit_hash = layout->list[i].commit_hash; + } else if (layout->commit_hash != layout->list[i].commit_hash) { + layout->commit_hash = DHT_LAYOUT_HASH_INVALID; + } + + layout->list[i].err = 0; out: - return ret; + return ret; } - void -dht_layout_entry_swap (dht_layout_t *layout, int i, int j) +dht_layout_entry_swap(dht_layout_t *layout, int i, int j) { - uint32_t start_swap = 0; - uint32_t stop_swap = 0; - xlator_t *xlator_swap = 0; - int err_swap = 0; - - start_swap = layout->list[i].start; - stop_swap = layout->list[i].stop; - xlator_swap = layout->list[i].xlator; - err_swap = layout->list[i].err; - - layout->list[i].start = layout->list[j].start; - layout->list[i].stop = layout->list[j].stop; - layout->list[i].xlator = layout->list[j].xlator; - layout->list[i].err = layout->list[j].err; - - layout->list[j].start = start_swap; - layout->list[j].stop = stop_swap; - layout->list[j].xlator = xlator_swap; - layout->list[j].err = err_swap; + uint32_t start_swap = 0; + uint32_t stop_swap = 0; + uint32_t commit_hash_swap = 0; + xlator_t *xlator_swap = 0; + int err_swap = 0; + + start_swap = layout->list[i].start; + stop_swap = layout->list[i].stop; + xlator_swap = layout->list[i].xlator; + err_swap = layout->list[i].err; + commit_hash_swap = layout->list[i].commit_hash; + + layout->list[i].start = layout->list[j].start; + layout->list[i].stop = layout->list[j].stop; + layout->list[i].xlator = layout->list[j].xlator; + layout->list[i].err = layout->list[j].err; + layout->list[i].commit_hash = layout->list[j].commit_hash; + + layout->list[j].start = start_swap; + layout->list[j].stop = stop_swap; + layout->list[j].xlator = xlator_swap; + layout->list[j].err = err_swap; + layout->list[j].commit_hash = commit_hash_swap; } void -dht_layout_range_swap (dht_layout_t *layout, int i, int j) +dht_layout_range_swap(dht_layout_t *layout, int i, int j) { - uint32_t start_swap = 0; - uint32_t stop_swap = 0; + uint32_t start_swap = 0; + uint32_t stop_swap = 0; - start_swap = layout->list[i].start; - stop_swap = layout->list[i].stop; + start_swap = layout->list[i].start; + stop_swap = layout->list[i].stop; - layout->list[i].start = layout->list[j].start; - layout->list[i].stop = layout->list[j].stop; + layout->list[i].start = layout->list[j].start; + layout->list[i].stop = layout->list[j].stop; - layout->list[j].start = start_swap; - layout->list[j].stop = stop_swap; + layout->list[j].start = start_swap; + layout->list[j].stop = stop_swap; } - -int64_t -dht_layout_entry_cmp_volname (dht_layout_t *layout, int i, int j) +static int64_t +dht_layout_entry_cmp_volname(dht_layout_t *layout, int i, int j) { - return (strcmp (layout->list[i].xlator->name, - layout->list[j].xlator->name)); + return (strcmp(layout->list[i].xlator->name, layout->list[j].xlator->name)); } - gf_boolean_t -dht_is_subvol_in_layout (dht_layout_t *layout, xlator_t *xlator) +dht_is_subvol_in_layout(dht_layout_t *layout, xlator_t *xlator) { - int i = 0; - - for (i = 0; i < layout->cnt; i++) { - /* Check if xlator is already part of layout, and layout is - * non-zero. */ - if (!strcmp (layout->list[i].xlator->name, xlator->name)) { - if (layout->list[i].start != layout->list[i].stop) - return _gf_true; - break; - } - } - return _gf_false; + int i = 0; + + for (i = 0; i < layout->cnt; i++) { + /* Check if xlator is already part of layout, and layout is + * non-zero. */ + if (!strcmp(layout->list[i].xlator->name, xlator->name)) { + if (layout->list[i].start != layout->list[i].stop) + return _gf_true; + break; + } + } + return _gf_false; } -int64_t -dht_layout_entry_cmp (dht_layout_t *layout, int i, int j) +static int64_t +dht_layout_entry_cmp(dht_layout_t *layout, int i, int j) { - int64_t diff = 0; + int64_t diff = 0; - /* swap zero'ed out layouts to front, if needed */ - if (!layout->list[j].start && !layout->list[j].stop) { - diff = (int64_t) layout->list[i].stop - - (int64_t) layout->list[j].stop; - goto out; - } - diff = (int64_t) layout->list[i].start - - (int64_t) layout->list[j].start; + /* swap zero'ed out layouts to front, if needed */ + if (!layout->list[j].start && !layout->list[j].stop) { + diff = (int64_t)layout->list[i].stop - (int64_t)layout->list[j].stop; + goto out; + } + diff = (int64_t)layout->list[i].start - (int64_t)layout->list[j].start; out: - return diff; + return diff; } - int -dht_layout_sort (dht_layout_t *layout) +dht_layout_sort(dht_layout_t *layout) { - int i = 0; - int j = 0; - int64_t ret = 0; + int i = 0; + int j = 0; + int64_t ret = 0; - /* TODO: O(n^2) -- bad bad */ + /* TODO: O(n^2) -- bad bad */ - for (i = 0; i < layout->cnt - 1; i++) { - for (j = i + 1; j < layout->cnt; j++) { - ret = dht_layout_entry_cmp (layout, i, j); - if (ret > 0) - dht_layout_entry_swap (layout, i, j); - } + for (i = 0; i < layout->cnt - 1; i++) { + for (j = i + 1; j < layout->cnt; j++) { + ret = dht_layout_entry_cmp(layout, i, j); + if (ret > 0) + dht_layout_entry_swap(layout, i, j); } + } - return 0; + return 0; } -int -dht_layout_sort_volname (dht_layout_t *layout) +void +dht_layout_sort_volname(dht_layout_t *layout) { - int i = 0; - int j = 0; - int64_t ret = 0; + int i = 0; + int j = 0; + int64_t ret = 0; - /* TODO: O(n^2) -- bad bad */ + /* TODO: O(n^2) -- bad bad */ - for (i = 0; i < layout->cnt - 1; i++) { - for (j = i + 1; j < layout->cnt; j++) { - ret = dht_layout_entry_cmp_volname (layout, i, j); - if (ret > 0) - dht_layout_entry_swap (layout, i, j); - } + for (i = 0; i < layout->cnt - 1; i++) { + for (j = i + 1; j < layout->cnt; j++) { + ret = dht_layout_entry_cmp_volname(layout, i, j); + if (ret > 0) + dht_layout_entry_swap(layout, i, j); } - - return 0; + } } - -int -dht_layout_anomalies (xlator_t *this, loc_t *loc, dht_layout_t *layout, - uint32_t *holes_p, uint32_t *overlaps_p, - uint32_t *missing_p, uint32_t *down_p, uint32_t *misc_p, - uint32_t *no_space_p) +void +dht_layout_anomalies(xlator_t *this, loc_t *loc, dht_layout_t *layout, + uint32_t *holes_p, uint32_t *overlaps_p, + uint32_t *missing_p, uint32_t *down_p, uint32_t *misc_p, + uint32_t *no_space_p) { - uint32_t overlaps = 0; - uint32_t missing = 0; - uint32_t down = 0; - uint32_t misc = 0; - uint32_t hole_cnt = 0; - uint32_t overlap_cnt = 0; - int i = 0; - int ret = 0; - uint32_t prev_stop = 0; - uint32_t last_stop = 0; - char is_virgin = 1; - uint32_t no_space = 0; - - /* This funtion scans through the layout spread of a directory to - check if there are any anomalies. Prior to calling this function - the layout entries should be sorted in the ascending order. - - If the layout entry has err != 0 - then increment the corresponding anomaly. - else - if (start of the current layout entry > stop + 1 of previous - non erroneous layout entry) - then it indicates a hole in the layout - if (start of the current layout entry < stop + 1 of previous - non erroneous layout entry) - then it indicates an overlap in the layout - */ - last_stop = layout->list[0].start - 1; - prev_stop = last_stop; - - for (i = 0; i < layout->cnt; i++) { - switch (layout->list[i].err) { - case -1: - case ENOENT: - case ESTALE: - missing++; - continue; - case ENOTCONN: - down++; - continue; - case ENOSPC: - no_space++; - continue; - case 0: - /* if err == 0 and start == stop, then it is a non misc++; - * participating subvolume(spread-cnt). Then, do not - * check for anomalies. If start != stop, then treat it - * as misc err */ - if (layout->list[i].start == layout->list[i].stop) { - continue; - } - break; - default: - misc++; - continue; - } - - is_virgin = 0; - - if ((prev_stop + 1) < layout->list[i].start) { - hole_cnt++; + uint32_t overlaps = 0; + uint32_t missing = 0; + uint32_t down = 0; + uint32_t misc = 0; + uint32_t hole_cnt = 0; + uint32_t overlap_cnt = 0; + int i = 0; + uint32_t prev_stop = 0; + uint32_t last_stop = 0; + char is_virgin = 1; + uint32_t no_space = 0; + + /* This function scans through the layout spread of a directory to + check if there are any anomalies. Prior to calling this function + the layout entries should be sorted in the ascending order. + + If the layout entry has err != 0 + then increment the corresponding anomaly. + else + if (start of the current layout entry > stop + 1 of previous + non erroneous layout entry) + then it indicates a hole in the layout + if (start of the current layout entry < stop + 1 of previous + non erroneous layout entry) + then it indicates an overlap in the layout + */ + last_stop = layout->list[0].start - 1; + prev_stop = last_stop; + + for (i = 0; i < layout->cnt; i++) { + switch (layout->list[i].err) { + case -1: + case ENOENT: + case ESTALE: + missing++; + continue; + case ENOTCONN: + down++; + continue; + case ENOSPC: + no_space++; + continue; + case 0: + /* if err == 0 and start == stop, then it is a non misc++; + * participating subvolume(spread-cnt). Then, do not + * check for anomalies. If start != stop, then treat it + * as misc err */ + if (layout->list[i].start == layout->list[i].stop) { + continue; } + break; + default: + misc++; + continue; + } - if ((prev_stop + 1) > layout->list[i].start) { - overlap_cnt++; - overlaps += ((prev_stop + 1) - layout->list[i].start); - } - prev_stop = layout->list[i].stop; + is_virgin = 0; + + if ((prev_stop + 1) < layout->list[i].start) { + hole_cnt++; } - if ((last_stop - prev_stop) || is_virgin) - hole_cnt++; + if ((prev_stop + 1) > layout->list[i].start) { + overlap_cnt++; + overlaps += ((prev_stop + 1) - layout->list[i].start); + } + prev_stop = layout->list[i].stop; + } - if (holes_p) - *holes_p = hole_cnt; + if ((last_stop - prev_stop) || is_virgin) + hole_cnt++; - if (overlaps_p) - *overlaps_p = overlap_cnt; + if (holes_p) + *holes_p = hole_cnt; - if (missing_p) - *missing_p = missing; + if (overlaps_p) + *overlaps_p = overlap_cnt; - if (down_p) - *down_p = down; + if (missing_p) + *missing_p = missing; - if (misc_p) - *misc_p = misc; + if (down_p) + *down_p = down; - if (no_space_p) - *no_space_p = no_space; + if (misc_p) + *misc_p = misc; - return ret; + if (no_space_p) + *no_space_p = no_space; } - int -dht_layout_normalize (xlator_t *this, loc_t *loc, dht_layout_t *layout) +dht_layout_missing_dirs(dht_layout_t *layout) { - int ret = 0; - int i = 0; - uint32_t holes = 0; - uint32_t overlaps = 0; - uint32_t missing = 0; - uint32_t down = 0; - uint32_t misc = 0; - - ret = dht_layout_sort (layout); - if (ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "sort failed?! how the ...."); - goto out; - } + int i = 0, missing = 0; - ret = dht_layout_anomalies (this, loc, layout, - &holes, &overlaps, - &missing, &down, &misc, NULL); - if (ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "error while finding anomalies in %s -- not good news", - loc->path); - goto out; - } + if (layout == NULL) + goto out; - if (holes || overlaps) { - if (missing == layout->cnt) { - gf_log (this->name, GF_LOG_DEBUG, - "directory %s looked up first time", - loc->path); - } else { - gf_log (this->name, GF_LOG_INFO, - "found anomalies in %s. holes=%d overlaps=%d", - loc->path, holes, overlaps); - } - ret = -1; + for (i = 0; i < layout->cnt; i++) { + if ((layout->list[i].err == ENOENT) || + ((layout->list[i].err == -1) && (layout->list[i].start == 0) && + (layout->list[i].stop == 0))) { + missing++; } + } - for (i = 0; i < layout->cnt; i++) { - /* TODO During DHT selfheal rewrite (almost) find a better place - * to detect this - probably in dht_layout_anomalies() - */ - if (layout->list[i].err > 0) { - gf_log_callingfn (this->name, GF_LOG_DEBUG, - "path=%s err=%s on subvol=%s", - loc->path, - strerror (layout->list[i].err), - (layout->list[i].xlator ? - layout->list[i].xlator->name - : "<>")); - if ((layout->list[i].err == ENOENT) && (ret >= 0)) { - ret++; - } - } - } +out: + return missing; +} +int +dht_layout_normalize(xlator_t *this, loc_t *loc, dht_layout_t *layout) +{ + int ret = 0; + uint32_t holes = 0; + uint32_t overlaps = 0; + uint32_t missing = 0; + uint32_t down = 0; + uint32_t misc = 0, missing_dirs = 0; + char gfid[GF_UUID_BUF_SIZE] = {0}; + + ret = dht_layout_sort(layout); + if (ret == -1) { + gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_LAYOUT_SORT_FAILED, + NULL); + goto out; + } + + gf_uuid_unparse(loc->gfid, gfid); + + dht_layout_anomalies(this, loc, layout, &holes, &overlaps, &missing, &down, + &misc, NULL); + + if (holes || overlaps) { + if (missing == layout->cnt) { + gf_msg_debug(this->name, 0, + "Directory %s looked up first time" + " gfid = %s", + loc->path, gfid); + } else { + gf_smsg(this->name, GF_LOG_INFO, 0, DHT_MSG_ANOMALIES_INFO, + "path=%s", loc->path, "gfid=%s", gfid, "holes=%d", holes, + "overlaps=%d", overlaps, NULL); + } + ret = -1; + } + + if (ret >= 0) { + missing_dirs = dht_layout_missing_dirs(layout); + /* TODO During DHT selfheal rewrite (almost) find a better place + * to detect this - probably in dht_layout_anomalies() + */ + if (missing_dirs > 0) + ret += missing_dirs; + } out: - return ret; + return ret; } int -dht_dir_has_layout (dict_t *xattr, char *name) +dht_dir_has_layout(dict_t *xattr, char *name) { + void *disk_layout_raw = NULL; - void *disk_layout_raw = NULL; - - return dict_get_ptr (xattr, name, &disk_layout_raw); + return dict_get_ptr(xattr, name, &disk_layout_raw); } int -dht_layout_dir_mismatch (xlator_t *this, dht_layout_t *layout, xlator_t *subvol, - loc_t *loc, dict_t *xattr) +dht_layout_dir_mismatch(xlator_t *this, dht_layout_t *layout, xlator_t *subvol, + loc_t *loc, dict_t *xattr) { - int idx = 0; - int pos = -1; - int ret = 0; - int err = 0; - int dict_ret = 0; - int32_t disk_layout[4]; - void *disk_layout_raw = NULL; - int32_t count = -1; - uint32_t start_off = -1; - uint32_t stop_off = -1; - dht_conf_t *conf = this->private; - - - for (idx = 0; idx < layout->cnt; idx++) { - if (layout->list[idx].xlator == subvol) { - pos = idx; - break; - } - } - - if (pos == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "%s - no layout info for subvolume %s", - loc->path, subvol->name); - ret = 1; - goto out; - } + int idx = 0; + int pos = -1; + int ret = 0; + int err = 0; + int dict_ret = 0; + int32_t disk_layout[4]; + void *disk_layout_raw = NULL; + uint32_t start_off = -1; + uint32_t stop_off = -1; + uint32_t commit_hash = -1; + dht_conf_t *conf = this->private; + char gfid[GF_UUID_BUF_SIZE] = {0}; + + if (loc && loc->inode) + gf_uuid_unparse(loc->inode->gfid, gfid); + + for (idx = 0; idx < layout->cnt; idx++) { + if (layout->list[idx].xlator == subvol) { + pos = idx; + break; + } + } + + if (pos == -1) { + if (loc) { + gf_msg_debug(this->name, 0, "%s - no layout info for subvolume %s", + loc ? loc->path : "path not found", subvol->name); + } + ret = 1; + goto out; + } + + err = layout->list[pos].err; + + if (!xattr) { + if (err == 0) { + if (loc) { + gf_smsg(this->name, GF_LOG_INFO, 0, DHT_MSG_XATTR_DICT_NULL, + "path=%s", loc->path, NULL); + } else { + gf_smsg(this->name, GF_LOG_INFO, 0, DHT_MSG_XATTR_DICT_NULL, + "path not found", NULL); + } + ret = -1; + } + goto out; + } + + dict_ret = dict_get_ptr(xattr, conf->xattr_name, &disk_layout_raw); + + if (dict_ret < 0) { + if (err == 0 && layout->list[pos].stop) { + if (loc) { + gf_smsg(this->name, GF_LOG_INFO, 0, DHT_MSG_DISK_LAYOUT_MISSING, + "path=%s", loc->path, "gfid=%s", gfid, NULL); + } else { + gf_smsg(this->name, GF_LOG_INFO, 0, DHT_MSG_DISK_LAYOUT_MISSING, + "path not found" + "gfid=%s", + gfid, NULL); + } + ret = -1; + } + goto out; + } + + memcpy(disk_layout, disk_layout_raw, sizeof(disk_layout)); + + start_off = ntoh32(disk_layout[2]); + stop_off = ntoh32(disk_layout[3]); + commit_hash = ntoh32(disk_layout[0]); + + if ((layout->list[pos].start != start_off) || + (layout->list[pos].stop != stop_off) || + (layout->list[pos].commit_hash != commit_hash)) { + gf_smsg(this->name, GF_LOG_INFO, 0, DHT_MSG_LAYOUT_INFO, "subvol=%s", + layout->list[pos].xlator->name, "inode-layout:start=0x%x", + layout->list[pos].start, "inode-layout:stop=0x%x", + layout->list[pos].stop, "layout-commit-hash=0x%x; ", + layout->list[pos].commit_hash, "disk-layout:start-off=0x%x", + start_off, "disk-layout:top-off=0x%x", stop_off, + "commit-hash=0x%x", commit_hash, NULL); + ret = 1; + } else { + ret = 0; + } +out: + return ret; +} - err = layout->list[pos].err; +int +dht_layout_preset(xlator_t *this, xlator_t *subvol, inode_t *inode) +{ + dht_layout_t *layout = NULL; + int ret = -1; + dht_conf_t *conf = NULL; - if (!xattr) { - if (err == 0) { - gf_log (this->name, GF_LOG_INFO, - "%s - xattr dictionary is NULL", - loc->path); - ret = -1; - } - goto out; - } + conf = this->private; + if (!conf) + goto out; - dict_ret = dict_get_ptr (xattr, conf->xattr_name, - &disk_layout_raw); + layout = dht_layout_for_subvol(this, subvol); + if (!layout) { + gf_smsg(this->name, GF_LOG_INFO, 0, DHT_MSG_SUBVOL_NO_LAYOUT_INFO, + "subvolume=%s", subvol ? subvol->name : "<nil>", NULL); + ret = -1; + goto out; + } - if (dict_ret < 0) { - if (err == 0 && layout->list[pos].stop) { - gf_log (this->name, GF_LOG_INFO, - "%s - disk layout missing", loc->path); - ret = -1; - } - goto out; - } + gf_msg_debug(this->name, 0, "file = %s, subvol = %s", + uuid_utoa(inode->gfid), subvol ? subvol->name : "<nil>"); - memcpy (disk_layout, disk_layout_raw, sizeof (disk_layout)); + LOCK(&conf->layout_lock); + { + dht_inode_ctx_layout_set(inode, this, layout); + } - count = ntoh32 (disk_layout[0]); - if (count != 1) { - gf_log (this->name, GF_LOG_ERROR, - "%s - disk layout has invalid count %d", - loc->path, count); - ret = -1; - goto out; - } + UNLOCK(&conf->layout_lock); - start_off = ntoh32 (disk_layout[2]); - stop_off = ntoh32 (disk_layout[3]); - - if ((layout->list[pos].start != start_off) - || (layout->list[pos].stop != stop_off)) { - gf_log (this->name, GF_LOG_INFO, - "subvol: %s; inode layout - %"PRIu32" - %"PRIu32"; " - "disk layout - %"PRIu32" - %"PRIu32, - layout->list[pos].xlator->name, - layout->list[pos].start, layout->list[pos].stop, - start_off, stop_off); - ret = 1; - } else { - ret = 0; - } + ret = 0; out: - return ret; + return ret; } - int -dht_layout_preset (xlator_t *this, xlator_t *subvol, inode_t *inode) +dht_layout_index_for_subvol(dht_layout_t *layout, xlator_t *subvol) { - dht_layout_t *layout = NULL; - int ret = -1; - dht_conf_t *conf = NULL; - - conf = this->private; - if (!conf) - goto out; + int i = 0, ret = -1; - layout = dht_layout_for_subvol (this, subvol); - if (!layout) { - gf_log (this->name, GF_LOG_INFO, - "no pre-set layout for subvolume %s", - subvol ? subvol->name : "<nil>"); - ret = -1; - goto out; + for (i = 0; i < layout->cnt; i++) { + if (layout->list[i].xlator == subvol) { + ret = i; + break; } + } - LOCK (&conf->layout_lock); - { - dht_inode_ctx_layout_set (inode, this, layout); - } - UNLOCK (&conf->layout_lock); - - ret = 0; -out: - return ret; + return ret; } diff --git a/xlators/cluster/dht/src/dht-linkfile.c b/xlators/cluster/dht/src/dht-linkfile.c index dbc9d0b3c7b..89ec6cca56e 100644 --- a/xlators/cluster/dht/src/dht-linkfile.c +++ b/xlators/cluster/dht/src/dht-linkfile.c @@ -8,321 +8,321 @@ cases as published by the Free Software Foundation. */ -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - - -#include "glusterfs.h" -#include "xlator.h" -#include "compat.h" +#include <glusterfs/compat.h> #include "dht-common.h" -int -dht_linkfile_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, - inode_t *inode, struct iatt *stbuf, dict_t *xattr, - struct iatt *postparent) +static int +dht_linkfile_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, + struct iatt *stbuf, dict_t *xattr, + struct iatt *postparent) { - char is_linkfile = 0; - dht_conf_t *conf = NULL; - dht_local_t *local = NULL; - call_frame_t *prev = NULL; - - local = frame->local; - prev = cookie; - conf = this->private; - - if (op_ret) - goto out; - - is_linkfile = check_is_linkfile (inode, stbuf, xattr, - conf->link_xattr_name); - if (!is_linkfile) - gf_log (this->name, GF_LOG_WARNING, "got non-linkfile %s:%s", - prev->this->name, local->loc.path); + char is_linkfile = 0; + dht_conf_t *conf = NULL; + dht_local_t *local = NULL; + xlator_t *prev = NULL; + char gfid[GF_UUID_BUF_SIZE] = {0}; + + local = frame->local; + prev = cookie; + conf = this->private; + + if (op_ret) + goto out; + + gf_uuid_unparse(local->loc.gfid, gfid); + + is_linkfile = check_is_linkfile(inode, stbuf, xattr, conf->link_xattr_name); + if (!is_linkfile) + gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_NOT_LINK_FILE_ERROR, + "name=%s", prev->name, "path=%s", local->loc.path, "gfid=%s", + gfid, NULL); out: - local->linkfile.linkfile_cbk (frame, cookie, this, op_ret, op_errno, - inode, stbuf, postparent, postparent, - xattr); - return 0; + local->linkfile.linkfile_cbk(frame, cookie, this, op_ret, op_errno, inode, + stbuf, postparent, postparent, xattr); + return 0; } -#define is_equal(a, b) (a == b) -int -dht_linkfile_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, inode_t *inode, - struct iatt *stbuf, struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) +static int +dht_linkfile_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, + struct iatt *stbuf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - dht_local_t *local = NULL; - xlator_t *subvol = NULL; - call_frame_t *prev = NULL; - dict_t *xattrs = NULL; - dht_conf_t *conf = NULL; - int ret = -1; - - local = frame->local; - - if (!op_ret) - local->linked = _gf_true; - - FRAME_SU_UNDO (frame, dht_local_t); - - if (op_ret && (op_errno == EEXIST)) { - conf = this->private; - prev = cookie; - subvol = prev->this; - if (!subvol) - goto out; - xattrs = dict_new (); - if (!xattrs) - goto out; - ret = dict_set_uint32 (xattrs, conf->link_xattr_name, 256); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, - "Failed to set linkto key"); - goto out; - } - - STACK_WIND (frame, dht_linkfile_lookup_cbk, subvol, - subvol->fops->lookup, &local->loc, xattrs); - if (xattrs) - dict_unref (xattrs); - return 0; + dht_local_t *local = NULL; + xlator_t *subvol = NULL; + dict_t *xattrs = NULL; + dht_conf_t *conf = NULL; + int ret = -1; + + local = frame->local; + + if (!op_ret) + local->linked = _gf_true; + + FRAME_SU_UNDO(frame, dht_local_t); + + if (op_ret && (op_errno == EEXIST)) { + conf = this->private; + subvol = cookie; + if (!subvol) + goto out; + xattrs = dict_new(); + if (!xattrs) + goto out; + ret = dict_set_uint32(xattrs, conf->link_xattr_name, 256); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, + "mame=%s", conf->link_xattr_name, NULL); + goto out; } -out: - local->linkfile.linkfile_cbk (frame, cookie, this, op_ret, op_errno, - inode, stbuf, preparent, postparent, - xdata); + + STACK_WIND_COOKIE(frame, dht_linkfile_lookup_cbk, subvol, subvol, + subvol->fops->lookup, &local->linkfile.loc, xattrs); if (xattrs) - dict_unref (xattrs); + dict_unref(xattrs); return 0; + } +out: + local->linkfile.linkfile_cbk(frame, cookie, this, op_ret, op_errno, inode, + stbuf, preparent, postparent, xdata); + if (xattrs) + dict_unref(xattrs); + return 0; } - int -dht_linkfile_create (call_frame_t *frame, fop_mknod_cbk_t linkfile_cbk, - xlator_t *this, - xlator_t *tovol, xlator_t *fromvol, loc_t *loc) +dht_linkfile_create(call_frame_t *frame, fop_mknod_cbk_t linkfile_cbk, + xlator_t *this, xlator_t *tovol, xlator_t *fromvol, + loc_t *loc) { - dht_local_t *local = NULL; - dict_t *dict = NULL; - int need_unref = 0; - int ret = 0; - dht_conf_t *conf = this->private; - - local = frame->local; - local->linkfile.linkfile_cbk = linkfile_cbk; - local->linkfile.srcvol = tovol; - - local->linked = _gf_false; - - dict = local->params; - if (!dict) { - dict = dict_new (); - if (!dict) - goto out; - need_unref = 1; - } - - if (!uuid_is_null (local->gfid)) { - ret = dict_set_static_bin (dict, "gfid-req", local->gfid, 16); - if (ret) - gf_log ("dht-linkfile", GF_LOG_INFO, - "%s: gfid set failed", loc->path); - } - - ret = dict_set_str (dict, GLUSTERFS_INTERNAL_FOP_KEY, "yes"); + dht_local_t *local = NULL; + dict_t *dict = NULL; + int need_unref = 0; + int ret = 0; + dht_conf_t *conf = this->private; + char gfid[GF_UUID_BUF_SIZE] = {0}; + + local = frame->local; + local->linkfile.linkfile_cbk = linkfile_cbk; + local->linkfile.srcvol = tovol; + loc_copy(&local->linkfile.loc, loc); + + local->linked = _gf_false; + + dict = local->params; + if (!dict) { + dict = dict_new(); + if (!dict) + goto out; + need_unref = 1; + } + + if (!gf_uuid_is_null(local->gfid)) { + gf_uuid_unparse(local->gfid, gfid); + + ret = dict_set_gfuuid(dict, "gfid-req", local->gfid, true); if (ret) - gf_log ("dht-linkfile", GF_LOG_INFO, - "%s: internal-fop set failed", loc->path); - - ret = dict_set_str (dict, conf->link_xattr_name, tovol->name); - - if (ret < 0) { - gf_log (frame->this->name, GF_LOG_INFO, - "%s: failed to initialize linkfile data", - loc->path); - goto out; - } - - local->link_subvol = fromvol; - /* Always create as root:root. dht_linkfile_attr_heal fixes the - * ownsership */ - FRAME_SU_DO (frame, dht_local_t); - STACK_WIND (frame, dht_linkfile_create_cbk, - fromvol, fromvol->fops->mknod, loc, - S_IFREG | DHT_LINKFILE_MODE, 0, 0, dict); - - if (need_unref && dict) - dict_unref (dict); - - return 0; + gf_smsg("dht-linkfile", GF_LOG_INFO, 0, DHT_MSG_DICT_SET_FAILED, + "path=%s", loc->path, "gfid=%s", gfid, NULL); + } else { + gf_uuid_unparse(loc->gfid, gfid); + } + + ret = dict_set_str(dict, GLUSTERFS_INTERNAL_FOP_KEY, "yes"); + if (ret) + gf_smsg("dht-linkfile", GF_LOG_INFO, 0, DHT_MSG_DICT_SET_FAILED, + "path=%s", loc->path, "key=%s", GLUSTERFS_INTERNAL_FOP_KEY, + "gfid=%s", gfid, NULL); + + ret = dict_set_str(dict, conf->link_xattr_name, tovol->name); + + if (ret < 0) { + gf_smsg(frame->this->name, GF_LOG_INFO, 0, DHT_MSG_CREATE_LINK_FAILED, + "path=%s", loc->path, "gfid=%s", gfid, NULL); + goto out; + } + + local->link_subvol = fromvol; + /* Always create as root:root. dht_linkfile_attr_heal fixes the + * ownsership */ + FRAME_SU_DO(frame, dht_local_t); + STACK_WIND_COOKIE(frame, dht_linkfile_create_cbk, fromvol, fromvol, + fromvol->fops->mknod, loc, S_IFREG | DHT_LINKFILE_MODE, 0, + 0, dict); + + if (need_unref && dict) + dict_unref(dict); + + return 0; out: - local->linkfile.linkfile_cbk (frame, NULL, frame->this, -1, ENOMEM, - loc->inode, NULL, NULL, NULL, NULL); + local->linkfile.linkfile_cbk(frame, frame->this, frame->this, -1, ENOMEM, + loc->inode, NULL, NULL, NULL, NULL); - if (need_unref && dict) - dict_unref (dict); + if (need_unref && dict) + dict_unref(dict); - return 0; + return 0; } - int -dht_linkfile_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *preparent, struct iatt *postparent, - dict_t *xdata) +dht_linkfile_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) { - dht_local_t *local = NULL; - call_frame_t *prev = NULL; - xlator_t *subvol = NULL; - - local = frame->local; - prev = cookie; - subvol = prev->this; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_INFO, - "unlinking linkfile %s on %s failed (%s)", - local->loc.path, subvol->name, strerror (op_errno)); - } + dht_local_t *local = NULL; + xlator_t *subvol = NULL; + char gfid[GF_UUID_BUF_SIZE] = {0}; - DHT_STACK_DESTROY (frame); + local = frame->local; + subvol = cookie; - return 0; -} + if (op_ret == -1) { + gf_uuid_unparse(local->loc.gfid, gfid); + gf_smsg(this->name, GF_LOG_INFO, op_errno, DHT_MSG_UNLINK_FAILED, + "path=%s", local->loc.path, "gfid=%s", gfid, "subvolume=%s", + subvol->name, NULL); + } + DHT_STACK_DESTROY(frame); + + return 0; +} int -dht_linkfile_unlink (call_frame_t *frame, xlator_t *this, - xlator_t *subvol, loc_t *loc) +dht_linkfile_unlink(call_frame_t *frame, xlator_t *this, xlator_t *subvol, + loc_t *loc) { - call_frame_t *unlink_frame = NULL; - dht_local_t *unlink_local = NULL; + call_frame_t *unlink_frame = NULL; + dht_local_t *unlink_local = NULL; - unlink_frame = copy_frame (frame); - if (!unlink_frame) { - goto err; - } + unlink_frame = copy_frame(frame); + if (!unlink_frame) { + goto err; + } - /* Using non-fop value here, as anyways, 'local->fop' is not used in - this particular case */ - unlink_local = dht_local_init (unlink_frame, loc, NULL, - GF_FOP_MAXVALUE); - if (!unlink_local) { - goto err; - } + /* Using non-fop value here, as anyways, 'local->fop' is not used in + this particular case */ + unlink_local = dht_local_init(unlink_frame, loc, NULL, GF_FOP_MAXVALUE); + if (!unlink_local) { + goto err; + } - STACK_WIND (unlink_frame, dht_linkfile_unlink_cbk, - subvol, subvol->fops->unlink, - &unlink_local->loc, 0, NULL); + STACK_WIND_COOKIE(unlink_frame, dht_linkfile_unlink_cbk, subvol, subvol, + subvol->fops->unlink, &unlink_local->loc, 0, NULL); - return 0; + return 0; err: - if (unlink_frame) - DHT_STACK_DESTROY (unlink_frame); + if (unlink_frame) + DHT_STACK_DESTROY(unlink_frame); - return -1; + return -1; } - xlator_t * -dht_linkfile_subvol (xlator_t *this, inode_t *inode, struct iatt *stbuf, - dict_t *xattr) +dht_linkfile_subvol(xlator_t *this, inode_t *inode, struct iatt *stbuf, + dict_t *xattr) { - dht_conf_t *conf = NULL; - xlator_t *subvol = NULL; - void *volname = NULL; - int i = 0, ret = 0; + dht_conf_t *conf = NULL; + xlator_t *subvol = NULL; + void *volname = NULL; + int i = 0, ret = 0; - conf = this->private; + conf = this->private; - if (!xattr) - goto out; + if (!xattr) + goto out; - ret = dict_get_ptr (xattr, conf->link_xattr_name, &volname); + ret = dict_get_ptr(xattr, conf->link_xattr_name, &volname); - if ((-1 == ret) || !volname) - goto out; + if ((-1 == ret) || !volname) + goto out; - for (i = 0; i < conf->subvolume_cnt; i++) { - if (strcmp (conf->subvolumes[i]->name, (char *)volname) == 0) { - subvol = conf->subvolumes[i]; - break; - } + for (i = 0; i < conf->subvolume_cnt; i++) { + if (strcmp(conf->subvolumes[i]->name, (char *)volname) == 0) { + subvol = conf->subvolumes[i]; + break; } + } out: - return subvol; + return subvol; } -int -dht_linkfile_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, struct iatt *statpre, - struct iatt *statpost, dict_t *xdata) +static int +dht_linkfile_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *statpre, + struct iatt *statpost, dict_t *xdata) { - dht_local_t *local = NULL; - loc_t *loc = NULL; + dht_local_t *local = NULL; + loc_t *loc = NULL; - local = frame->local; - loc = &local->loc; + local = frame->local; + loc = &local->loc; - if (op_ret) - gf_log (this->name, GF_LOG_ERROR, "setattr of uid/gid on %s" - " :<gfid:%s> failed (%s)", - (loc->path? loc->path: "NULL"), - uuid_utoa(local->gfid), strerror(op_errno)); + if (op_ret) + gf_smsg(this->name, GF_LOG_ERROR, op_errno, DHT_MSG_SETATTR_FAILED, + "path=%s", (loc->path ? loc->path : "NULL"), "gfid=%s", + uuid_utoa(local->gfid), NULL); - DHT_STACK_DESTROY (frame); + DHT_STACK_DESTROY(frame); - return 0; + return 0; } int -dht_linkfile_attr_heal (call_frame_t *frame, xlator_t *this) +dht_linkfile_attr_heal(call_frame_t *frame, xlator_t *this) { - int ret = -1; - call_frame_t *copy = NULL; - dht_local_t *local = NULL; - dht_local_t *copy_local = NULL; - xlator_t *subvol = NULL; - struct iatt stbuf = {0,}; + int ret = -1; + call_frame_t *copy = NULL; + dht_local_t *local = NULL; + dht_local_t *copy_local = NULL; + xlator_t *subvol = NULL; + struct iatt stbuf = { + 0, + }; + dict_t *xattr = NULL; + + local = frame->local; + + GF_VALIDATE_OR_GOTO("dht", local, out); + GF_VALIDATE_OR_GOTO("dht", local->link_subvol, out); + + if (local->stbuf.ia_type == IA_INVAL) + return 0; - local = frame->local; + DHT_MARK_FOP_INTERNAL(xattr); - GF_VALIDATE_OR_GOTO ("dht", local, out); - GF_VALIDATE_OR_GOTO ("dht", local->link_subvol, out); + gf_uuid_copy(local->loc.gfid, local->stbuf.ia_gfid); - if (local->stbuf.ia_type == IA_INVAL) - return 0; + copy = copy_frame(frame); - uuid_copy (local->loc.gfid, local->stbuf.ia_gfid); + if (!copy) + goto out; - copy = copy_frame (frame); + copy_local = dht_local_init(copy, &local->loc, NULL, 0); - if (!copy) - goto out; + if (!copy_local) + goto out; - copy_local = dht_local_init (copy, &local->loc, NULL, 0); + stbuf = local->stbuf; + subvol = local->link_subvol; - if (!copy_local) - goto out; + copy->local = copy_local; - stbuf = local->stbuf; - subvol = local->link_subvol; + FRAME_SU_DO(copy, dht_local_t); - copy->local = copy_local; + STACK_WIND(copy, dht_linkfile_setattr_cbk, subvol, subvol->fops->setattr, + ©_local->loc, &stbuf, (GF_SET_ATTR_UID | GF_SET_ATTR_GID), + xattr); + ret = 0; +out: + if ((ret < 0) && (copy)) + DHT_STACK_DESTROY(copy); - FRAME_SU_DO (copy, dht_local_t); + if (xattr) + dict_unref(xattr); - STACK_WIND (copy, dht_linkfile_setattr_cbk, subvol, - subvol->fops->setattr, ©_local->loc, - &stbuf, (GF_SET_ATTR_UID | GF_SET_ATTR_GID), NULL); - ret = 0; -out: - return ret; + return ret; } diff --git a/xlators/cluster/dht/src/dht-lock.c b/xlators/cluster/dht/src/dht-lock.c new file mode 100644 index 00000000000..638821ccee5 --- /dev/null +++ b/xlators/cluster/dht/src/dht-lock.c @@ -0,0 +1,1392 @@ +/* + Copyright (c) 2016 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include "dht-lock.h" + +static char * +dht_lock_asprintf(dht_lock_t *lock) +{ + char *lk_buf = NULL; + char gfid[GF_UUID_BUF_SIZE] = { + 0, + }; + + if (lock == NULL) + goto out; + + uuid_utoa_r(lock->loc.gfid, gfid); + + gf_asprintf(&lk_buf, "%s:%s", lock->xl->name, gfid); + +out: + return lk_buf; +} + +static void +dht_log_lk_array(char *name, gf_loglevel_t log_level, dht_lock_t **lk_array, + int count) +{ + int i = 0; + char *lk_buf = NULL; + + if ((lk_array == NULL) || (count == 0)) + goto out; + + for (i = 0; i < count; i++) { + lk_buf = dht_lock_asprintf(lk_array[i]); + if (!lk_buf) + goto out; + + gf_smsg(name, log_level, 0, DHT_MSG_LK_ARRAY_INFO, "index=%d", i, + "lk_buf=%s", lk_buf, NULL); + GF_FREE(lk_buf); + } + +out: + return; +} + +static void +dht_lock_stack_destroy(call_frame_t *lock_frame, dht_lock_type_t lk) +{ + dht_local_t *local = NULL; + + local = lock_frame->local; + + if (lk == DHT_INODELK) { + local->lock[0].layout.my_layout.locks = NULL; + local->lock[0].layout.my_layout.lk_count = 0; + } else { + local->lock[0].ns.directory_ns.locks = NULL; + local->lock[0].ns.directory_ns.lk_count = 0; + } + + DHT_STACK_DESTROY(lock_frame); + return; +} + +static void +dht_lock_free(dht_lock_t *lock) +{ + if (lock == NULL) + goto out; + + loc_wipe(&lock->loc); + GF_FREE(lock->domain); + GF_FREE(lock->basename); + mem_put(lock); + +out: + return; +} + +static void +dht_set_lkowner(dht_lock_t **lk_array, int count, gf_lkowner_t *lkowner) +{ + int i = 0; + + if (!lk_array || !lkowner) + goto out; + + for (i = 0; i < count; i++) { + lk_array[i]->lk_owner = *lkowner; + } + +out: + return; +} + +static int +dht_lock_request_cmp(const void *val1, const void *val2) +{ + dht_lock_t *lock1 = NULL; + dht_lock_t *lock2 = NULL; + int ret = -1; + + lock1 = *(dht_lock_t **)val1; + lock2 = *(dht_lock_t **)val2; + + GF_VALIDATE_OR_GOTO("dht-locks", lock1, out); + GF_VALIDATE_OR_GOTO("dht-locks", lock2, out); + + ret = strcmp(lock1->xl->name, lock2->xl->name); + + if (ret == 0) { + ret = gf_uuid_compare(lock1->loc.gfid, lock2->loc.gfid); + } + +out: + return ret; +} + +static int +dht_lock_order_requests(dht_lock_t **locks, int count) +{ + int ret = -1; + + if (!locks || !count) + goto out; + + qsort(locks, count, sizeof(*locks), dht_lock_request_cmp); + ret = 0; + +out: + return ret; +} + +void +dht_lock_array_free(dht_lock_t **lk_array, int count) +{ + int i = 0; + dht_lock_t *lock = NULL; + + if (lk_array == NULL) + goto out; + + for (i = 0; i < count; i++) { + lock = lk_array[i]; + lk_array[i] = NULL; + dht_lock_free(lock); + } + +out: + return; +} + +int32_t +dht_lock_count(dht_lock_t **lk_array, int lk_count) +{ + int i = 0, locked = 0; + + if ((lk_array == NULL) || (lk_count == 0)) + goto out; + + for (i = 0; i < lk_count; i++) { + if (lk_array[i]->locked) + locked++; + } +out: + return locked; +} + +static call_frame_t * +dht_lock_frame(call_frame_t *parent_frame) +{ + call_frame_t *lock_frame = NULL; + + lock_frame = copy_frame(parent_frame); + if (lock_frame == NULL) + goto out; + + set_lk_owner_from_ptr(&lock_frame->root->lk_owner, parent_frame->root); + +out: + return lock_frame; +} + +dht_lock_t * +dht_lock_new(xlator_t *this, xlator_t *xl, loc_t *loc, short type, + const char *domain, const char *basename, + dht_reaction_type_t do_on_failure) +{ + dht_conf_t *conf = NULL; + dht_lock_t *lock = NULL; + + conf = this->private; + + lock = mem_get0(conf->lock_pool); + if (lock == NULL) + goto out; + + lock->xl = xl; + lock->type = type; + lock->do_on_failure = do_on_failure; + + lock->domain = gf_strdup(domain); + if (lock->domain == NULL) { + dht_lock_free(lock); + lock = NULL; + goto out; + } + + if (basename) { + lock->basename = gf_strdup(basename); + if (lock->basename == NULL) { + dht_lock_free(lock); + lock = NULL; + goto out; + } + } + + /* Fill only inode and gfid. + posix and protocol/server give preference to pargfid/basename over + gfid/inode for resolution if all the three parameters of loc_t are + present. I want to avoid the following hypothetical situation: + + 1. rebalance did a lookup on a dentry and got a gfid. + 2. rebalance acquires lock on loc_t which was filled with gfid and + path (pargfid/bname) from step 1. + 3. somebody deleted and recreated the same file + 4. rename on the same path acquires lock on loc_t which now points + to a different inode (and hence gets the lock). + 5. rebalance continues to migrate file (note that not all fops done + by rebalance during migration are inode/gfid based Eg., unlink) + 6. rename continues. + */ + lock->loc.inode = inode_ref(loc->inode); + loc_gfid(loc, lock->loc.gfid); + +out: + return lock; +} + +static int +dht_local_entrylk_init(call_frame_t *frame, dht_lock_t **lk_array, int lk_count, + fop_entrylk_cbk_t entrylk_cbk) +{ + int ret = -1; + dht_local_t *local = NULL; + + local = frame->local; + + if (local == NULL) { + local = dht_local_init(frame, NULL, NULL, 0); + } + + if (local == NULL) { + goto out; + } + + local->lock[0].ns.directory_ns.entrylk_cbk = entrylk_cbk; + local->lock[0].ns.directory_ns.locks = lk_array; + local->lock[0].ns.directory_ns.lk_count = lk_count; + + ret = dht_lock_order_requests(local->lock[0].ns.directory_ns.locks, + local->lock[0].ns.directory_ns.lk_count); + if (ret < 0) + goto out; + + ret = 0; +out: + return ret; +} + +static void +dht_entrylk_done(call_frame_t *lock_frame) +{ + fop_entrylk_cbk_t entrylk_cbk = NULL; + call_frame_t *main_frame = NULL; + dht_local_t *local = NULL; + + local = lock_frame->local; + main_frame = local->main_frame; + + local->lock[0].ns.directory_ns.locks = NULL; + local->lock[0].ns.directory_ns.lk_count = 0; + + entrylk_cbk = local->lock[0].ns.directory_ns.entrylk_cbk; + local->lock[0].ns.directory_ns.entrylk_cbk = NULL; + + entrylk_cbk(main_frame, NULL, main_frame->this, + local->lock[0].ns.directory_ns.op_ret, + local->lock[0].ns.directory_ns.op_errno, NULL); + + dht_lock_stack_destroy(lock_frame, DHT_ENTRYLK); + return; +} + +static int32_t +dht_unlock_entrylk_done(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + dht_local_t *local = NULL; + char gfid[GF_UUID_BUF_SIZE] = {0}; + + local = frame->local; + gf_uuid_unparse(local->lock[0].ns.directory_ns.locks[0]->loc.inode->gfid, + gfid); + + if (op_ret < 0) { + gf_smsg(this->name, GF_LOG_WARNING, op_errno, + DHT_MSG_UNLOCK_GFID_FAILED, "gfid=%s", gfid, + "DHT_LAYOUT_HEAL_DOMAIN", NULL); + } + + DHT_STACK_DESTROY(frame); + return 0; +} + +static int32_t +dht_unlock_entrylk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + dht_local_t *local = NULL; + int lk_index = 0, call_cnt = 0; + char gfid[GF_UUID_BUF_SIZE] = {0}; + + lk_index = (long)cookie; + + local = frame->local; + + uuid_utoa_r(local->lock[0].ns.directory_ns.locks[lk_index]->loc.gfid, gfid); + + if (op_ret < 0) { + gf_smsg(this->name, GF_LOG_WARNING, op_errno, DHT_MSG_UNLOCKING_FAILED, + "name=%s", + local->lock[0].ns.directory_ns.locks[lk_index]->xl->name, + "gfid=%s", gfid, NULL); + } else { + local->lock[0].ns.directory_ns.locks[lk_index]->locked = 0; + } + + call_cnt = dht_frame_return(frame); + if (is_last_call(call_cnt)) { + dht_entrylk_done(frame); + } + + return 0; +} + +static int32_t +dht_unlock_entrylk(call_frame_t *frame, dht_lock_t **lk_array, int lk_count, + fop_entrylk_cbk_t entrylk_cbk) +{ + dht_local_t *local = NULL; + int ret = -1, i = 0; + call_frame_t *lock_frame = NULL; + int call_cnt = 0; + + GF_VALIDATE_OR_GOTO("dht-locks", frame, done); + GF_VALIDATE_OR_GOTO(frame->this->name, lk_array, done); + GF_VALIDATE_OR_GOTO(frame->this->name, entrylk_cbk, done); + + call_cnt = dht_lock_count(lk_array, lk_count); + if (call_cnt == 0) { + ret = 0; + goto done; + } + + lock_frame = dht_lock_frame(frame); + if (lock_frame == NULL) { + gf_smsg(frame->this->name, GF_LOG_WARNING, 0, + DHT_MSG_ALLOC_FRAME_FAILED_NOT_UNLOCKING_FOLLOWING_ENTRYLKS, + NULL); + + dht_log_lk_array(frame->this->name, GF_LOG_WARNING, lk_array, lk_count); + goto done; + } + + ret = dht_local_entrylk_init(lock_frame, lk_array, lk_count, entrylk_cbk); + if (ret < 0) { + gf_smsg(frame->this->name, GF_LOG_WARNING, 0, + DHT_MSG_LOCAL_LOCKS_STORE_FAILED_UNLOCKING_FOLLOWING_ENTRYLK, + NULL); + + dht_log_lk_array(frame->this->name, GF_LOG_WARNING, lk_array, lk_count); + + goto done; + } + + local = lock_frame->local; + local->main_frame = frame; + local->call_cnt = call_cnt; + + for (i = 0; i < local->lock[0].ns.directory_ns.lk_count; i++) { + if (!local->lock[0].ns.directory_ns.locks[i]->locked) + continue; + + lock_frame->root + ->lk_owner = local->lock[0].ns.directory_ns.locks[i]->lk_owner; + STACK_WIND_COOKIE( + lock_frame, dht_unlock_entrylk_cbk, (void *)(long)i, + local->lock[0].ns.directory_ns.locks[i]->xl, + local->lock[0].ns.directory_ns.locks[i]->xl->fops->entrylk, + local->lock[0].ns.directory_ns.locks[i]->domain, + &local->lock[0].ns.directory_ns.locks[i]->loc, + local->lock[0].ns.directory_ns.locks[i]->basename, ENTRYLK_UNLOCK, + ENTRYLK_WRLCK, NULL); + if (!--call_cnt) + break; + } + + return 0; + +done: + if (lock_frame) + dht_lock_stack_destroy(lock_frame, DHT_ENTRYLK); + + /* no locks acquired, invoke entrylk_cbk */ + if (ret == 0) + entrylk_cbk(frame, NULL, frame->this, 0, 0, NULL); + + return ret; +} + +int32_t +dht_unlock_entrylk_wrapper(call_frame_t *frame, dht_elock_wrap_t *entrylk) +{ + dht_local_t *local = NULL, *lock_local = NULL; + call_frame_t *lock_frame = NULL; + char pgfid[GF_UUID_BUF_SIZE] = {0}; + int ret = 0; + + local = frame->local; + + if (!entrylk || !entrylk->locks) + goto out; + + gf_uuid_unparse(local->loc.parent->gfid, pgfid); + + lock_frame = copy_frame(frame); + if (lock_frame == NULL) { + gf_smsg(frame->this->name, GF_LOG_WARNING, ENOMEM, + DHT_MSG_COPY_FRAME_FAILED, "pgfid=%s", pgfid, "name=%s", + local->loc.name, "path=%s", local->loc.path, NULL); + goto done; + } + + lock_local = dht_local_init(lock_frame, NULL, NULL, 0); + if (lock_local == NULL) { + gf_smsg(frame->this->name, GF_LOG_WARNING, ENOMEM, + DHT_MSG_CREATE_FAILED, "local", "pgfid=%s", pgfid, "name=%s", + local->loc.name, "path=%s", local->loc.path, NULL); + goto done; + } + + lock_frame->local = lock_local; + + lock_local->lock[0].ns.directory_ns.locks = entrylk->locks; + lock_local->lock[0].ns.directory_ns.lk_count = entrylk->lk_count; + entrylk->locks = NULL; + entrylk->lk_count = 0; + + ret = dht_unlock_entrylk( + lock_frame, lock_local->lock[0].ns.directory_ns.locks, + lock_local->lock[0].ns.directory_ns.lk_count, dht_unlock_entrylk_done); + if (ret) + goto done; + + lock_frame = NULL; + +done: + if (lock_frame != NULL) { + DHT_STACK_DESTROY(lock_frame); + } + +out: + return 0; +} + +static int +dht_entrylk_cleanup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + dht_entrylk_done(frame); + return 0; +} + +static void +dht_entrylk_cleanup(call_frame_t *lock_frame) +{ + dht_lock_t **lk_array = NULL; + int lk_count = 0, lk_acquired = 0; + dht_local_t *local = NULL; + + local = lock_frame->local; + + lk_array = local->lock[0].ns.directory_ns.locks; + lk_count = local->lock[0].ns.directory_ns.lk_count; + + lk_acquired = dht_lock_count(lk_array, lk_count); + if (lk_acquired != 0) { + dht_unlock_entrylk(lock_frame, lk_array, lk_count, + dht_entrylk_cleanup_cbk); + } else { + dht_entrylk_done(lock_frame); + } + + return; +} + +static int32_t +dht_blocking_entrylk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + int lk_index = 0; + int i = 0; + dht_local_t *local = NULL; + + lk_index = (long)cookie; + + local = frame->local; + if (op_ret == 0) { + local->lock[0].ns.directory_ns.locks[lk_index]->locked = _gf_true; + } else { + switch (op_errno) { + case ESTALE: + case ENOENT: + if (local->lock[0] + .ns.directory_ns.locks[lk_index] + ->do_on_failure != IGNORE_ENOENT_ESTALE) { + local->lock[0].ns.directory_ns.op_ret = -1; + local->lock[0].ns.directory_ns.op_errno = op_errno; + goto cleanup; + } + break; + default: + local->lock[0].ns.directory_ns.op_ret = -1; + local->lock[0].ns.directory_ns.op_errno = op_errno; + goto cleanup; + } + } + + if (lk_index == (local->lock[0].ns.directory_ns.lk_count - 1)) { + for (i = 0; (i < local->lock[0].ns.directory_ns.lk_count) && + (!local->lock[0].ns.directory_ns.locks[i]->locked); + i++) + ; + + if (i == local->lock[0].ns.directory_ns.lk_count) { + local->lock[0].ns.directory_ns.op_ret = -1; + local->lock[0].ns.directory_ns.op_errno = op_errno; + } + + dht_entrylk_done(frame); + } else { + dht_blocking_entrylk_rec(frame, ++lk_index); + } + + return 0; + +cleanup: + dht_entrylk_cleanup(frame); + + return 0; +} + +void +dht_blocking_entrylk_rec(call_frame_t *frame, int i) +{ + dht_local_t *local = NULL; + + local = frame->local; + + STACK_WIND_COOKIE( + frame, dht_blocking_entrylk_cbk, (void *)(long)i, + local->lock[0].ns.directory_ns.locks[i]->xl, + local->lock[0].ns.directory_ns.locks[i]->xl->fops->entrylk, + local->lock[0].ns.directory_ns.locks[i]->domain, + &local->lock[0].ns.directory_ns.locks[i]->loc, + local->lock[0].ns.directory_ns.locks[i]->basename, ENTRYLK_LOCK, + ENTRYLK_WRLCK, NULL); + + return; +} + +int +dht_blocking_entrylk(call_frame_t *frame, dht_lock_t **lk_array, int lk_count, + fop_entrylk_cbk_t entrylk_cbk) +{ + int ret = -1; + call_frame_t *lock_frame = NULL; + dht_local_t *local = NULL; + + GF_VALIDATE_OR_GOTO("dht-locks", frame, out); + GF_VALIDATE_OR_GOTO(frame->this->name, lk_array, out); + GF_VALIDATE_OR_GOTO(frame->this->name, entrylk_cbk, out); + + lock_frame = dht_lock_frame(frame); + if (lock_frame == NULL) + goto out; + + ret = dht_local_entrylk_init(lock_frame, lk_array, lk_count, entrylk_cbk); + if (ret < 0) { + goto out; + } + + dht_set_lkowner(lk_array, lk_count, &lock_frame->root->lk_owner); + + local = lock_frame->local; + local->main_frame = frame; + + dht_blocking_entrylk_rec(lock_frame, 0); + + return 0; +out: + if (lock_frame) + dht_lock_stack_destroy(lock_frame, DHT_ENTRYLK); + + return -1; +} + +static int +dht_local_inodelk_init(call_frame_t *frame, dht_lock_t **lk_array, int lk_count, + fop_inodelk_cbk_t inodelk_cbk) +{ + int ret = -1; + dht_local_t *local = NULL; + + local = frame->local; + + if (local == NULL) { + local = dht_local_init(frame, NULL, NULL, 0); + } + + if (local == NULL) { + goto out; + } + + local->lock[0].layout.my_layout.inodelk_cbk = inodelk_cbk; + local->lock[0].layout.my_layout.locks = lk_array; + local->lock[0].layout.my_layout.lk_count = lk_count; + + ret = dht_lock_order_requests(local->lock[0].layout.my_layout.locks, + local->lock[0].layout.my_layout.lk_count); + if (ret < 0) + goto out; + + ret = 0; +out: + return ret; +} + +static void +dht_inodelk_done(call_frame_t *lock_frame) +{ + fop_inodelk_cbk_t inodelk_cbk = NULL; + call_frame_t *main_frame = NULL; + dht_local_t *local = NULL; + + local = lock_frame->local; + main_frame = local->main_frame; + + local->lock[0].layout.my_layout.locks = NULL; + local->lock[0].layout.my_layout.lk_count = 0; + + inodelk_cbk = local->lock[0].layout.my_layout.inodelk_cbk; + local->lock[0].layout.my_layout.inodelk_cbk = NULL; + + inodelk_cbk(main_frame, NULL, main_frame->this, + local->lock[0].layout.my_layout.op_ret, + local->lock[0].layout.my_layout.op_errno, NULL); + + dht_lock_stack_destroy(lock_frame, DHT_INODELK); + return; +} + +static int32_t +dht_unlock_inodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + dht_local_t *local = NULL; + int lk_index = 0, call_cnt = 0; + char gfid[GF_UUID_BUF_SIZE] = {0}; + + lk_index = (long)cookie; + + local = frame->local; + if (op_ret < 0) { + uuid_utoa_r(local->lock[0].layout.my_layout.locks[lk_index]->loc.gfid, + gfid); + + gf_smsg(this->name, GF_LOG_WARNING, op_errno, DHT_MSG_UNLOCKING_FAILED, + "name=%s", + local->lock[0].layout.my_layout.locks[lk_index]->xl->name, + "gfid=%s", gfid, NULL); + } else { + local->lock[0].layout.my_layout.locks[lk_index]->locked = 0; + } + + call_cnt = dht_frame_return(frame); + if (is_last_call(call_cnt)) { + dht_inodelk_done(frame); + } + + return 0; +} + +static int32_t +dht_unlock_inodelk_done(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + dht_local_t *local = NULL; + char gfid[GF_UUID_BUF_SIZE] = {0}; + + local = frame->local; + gf_uuid_unparse(local->lock[0].layout.my_layout.locks[0]->loc.inode->gfid, + gfid); + + if (op_ret < 0) { + gf_smsg(this->name, GF_LOG_WARNING, op_errno, + DHT_MSG_UNLOCK_GFID_FAILED, "DHT_LAYOUT_HEAL_DOMAIN gfid=%s", + gfid, NULL); + } + + DHT_STACK_DESTROY(frame); + return 0; +} + +int32_t +dht_unlock_inodelk(call_frame_t *frame, dht_lock_t **lk_array, int lk_count, + fop_inodelk_cbk_t inodelk_cbk) +{ + dht_local_t *local = NULL; + struct gf_flock flock = { + 0, + }; + int ret = -1, i = 0; + call_frame_t *lock_frame = NULL; + int call_cnt = 0; + + GF_VALIDATE_OR_GOTO("dht-locks", frame, done); + GF_VALIDATE_OR_GOTO(frame->this->name, lk_array, done); + GF_VALIDATE_OR_GOTO(frame->this->name, inodelk_cbk, done); + + call_cnt = dht_lock_count(lk_array, lk_count); + if (call_cnt == 0) { + ret = 0; + goto done; + } + + lock_frame = dht_lock_frame(frame); + if (lock_frame == NULL) { + gf_smsg(frame->this->name, GF_LOG_WARNING, 0, + DHT_MSG_ALLOC_FRAME_FAILED_NOT_UNLOCKING_FOLLOWING_ENTRYLKS, + NULL); + + dht_log_lk_array(frame->this->name, GF_LOG_WARNING, lk_array, lk_count); + goto done; + } + + ret = dht_local_inodelk_init(lock_frame, lk_array, lk_count, inodelk_cbk); + if (ret < 0) { + gf_smsg(frame->this->name, GF_LOG_WARNING, 0, + DHT_MSG_LOCAL_LOCKS_STORE_FAILED_UNLOCKING_FOLLOWING_ENTRYLK, + NULL); + + dht_log_lk_array(frame->this->name, GF_LOG_WARNING, lk_array, lk_count); + + goto done; + } + + local = lock_frame->local; + local->main_frame = frame; + local->call_cnt = call_cnt; + + flock.l_type = F_UNLCK; + + for (i = 0; i < local->lock[0].layout.my_layout.lk_count; i++) { + if (!local->lock[0].layout.my_layout.locks[i]->locked) + continue; + + lock_frame->root + ->lk_owner = local->lock[0].layout.my_layout.locks[i]->lk_owner; + STACK_WIND_COOKIE( + lock_frame, dht_unlock_inodelk_cbk, (void *)(long)i, + local->lock[0].layout.my_layout.locks[i]->xl, + local->lock[0].layout.my_layout.locks[i]->xl->fops->inodelk, + local->lock[0].layout.my_layout.locks[i]->domain, + &local->lock[0].layout.my_layout.locks[i]->loc, F_SETLK, &flock, + NULL); + if (!--call_cnt) + break; + } + + return 0; + +done: + if (lock_frame) + dht_lock_stack_destroy(lock_frame, DHT_INODELK); + + /* no locks acquired, invoke inodelk_cbk */ + if (ret == 0) + inodelk_cbk(frame, NULL, frame->this, 0, 0, NULL); + + return ret; +} + +int32_t +dht_unlock_inodelk_wrapper(call_frame_t *frame, dht_ilock_wrap_t *inodelk) +{ + dht_local_t *local = NULL, *lock_local = NULL; + call_frame_t *lock_frame = NULL; + char pgfid[GF_UUID_BUF_SIZE] = {0}; + int ret = 0; + + local = frame->local; + + if (!inodelk || !inodelk->locks) + goto out; + + gf_uuid_unparse(local->loc.parent->gfid, pgfid); + + lock_frame = copy_frame(frame); + if (lock_frame == NULL) { + gf_smsg(frame->this->name, GF_LOG_WARNING, ENOMEM, + DHT_MSG_COPY_FRAME_FAILED, "pgfid=%s", pgfid, "name=%s", + local->loc.name, "path=%s", local->loc.path, NULL); + goto done; + } + + lock_local = dht_local_init(lock_frame, NULL, NULL, 0); + if (lock_local == NULL) { + gf_smsg(frame->this->name, GF_LOG_WARNING, ENOMEM, + DHT_MSG_CREATE_FAILED, "local", "gfid=%s", pgfid, "name=%s", + local->loc.name, "path=%s", local->loc.path, NULL); + goto done; + } + + lock_frame->local = lock_local; + + lock_local->lock[0].layout.my_layout.locks = inodelk->locks; + lock_local->lock[0].layout.my_layout.lk_count = inodelk->lk_count; + inodelk->locks = NULL; + inodelk->lk_count = 0; + + ret = dht_unlock_inodelk( + lock_frame, lock_local->lock[0].layout.my_layout.locks, + lock_local->lock[0].layout.my_layout.lk_count, dht_unlock_inodelk_done); + + if (ret) + goto done; + + lock_frame = NULL; + +done: + if (lock_frame != NULL) { + DHT_STACK_DESTROY(lock_frame); + } +out: + return 0; +} + +static int +dht_inodelk_cleanup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + dht_inodelk_done(frame); + return 0; +} + +static void +dht_inodelk_cleanup(call_frame_t *lock_frame) +{ + dht_lock_t **lk_array = NULL; + int lk_count = 0, lk_acquired = 0; + dht_local_t *local = NULL; + + local = lock_frame->local; + + lk_array = local->lock[0].layout.my_layout.locks; + lk_count = local->lock[0].layout.my_layout.lk_count; + + lk_acquired = dht_lock_count(lk_array, lk_count); + if (lk_acquired != 0) { + dht_unlock_inodelk(lock_frame, lk_array, lk_count, + dht_inodelk_cleanup_cbk); + } else { + dht_inodelk_done(lock_frame); + } + + return; +} + +static int32_t +dht_nonblocking_inodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + dht_local_t *local = NULL; + int lk_index = 0, call_cnt = 0; + char gfid[GF_UUID_BUF_SIZE] = {0}; + + local = frame->local; + lk_index = (long)cookie; + + if (op_ret == -1) { + local->lock[0].layout.my_layout.op_ret = -1; + local->lock[0].layout.my_layout.op_errno = op_errno; + + if (local && local->lock[0].layout.my_layout.locks[lk_index]) { + uuid_utoa_r(local->lock[0] + .layout.my_layout.locks[lk_index] + ->loc.inode->gfid, + gfid); + + gf_msg_debug( + this->name, op_errno, + "inodelk failed on gfid: %s " + "subvolume: %s", + gfid, + local->lock[0].layout.my_layout.locks[lk_index]->xl->name); + } + + goto out; + } + + local->lock[0].layout.my_layout.locks[lk_index]->locked = _gf_true; + +out: + call_cnt = dht_frame_return(frame); + if (is_last_call(call_cnt)) { + if (local->lock[0].layout.my_layout.op_ret < 0) { + dht_inodelk_cleanup(frame); + return 0; + } + + dht_inodelk_done(frame); + } + + return 0; +} + +int +dht_nonblocking_inodelk(call_frame_t *frame, dht_lock_t **lk_array, + int lk_count, fop_inodelk_cbk_t inodelk_cbk) +{ + struct gf_flock flock = { + 0, + }; + int i = 0, ret = 0; + dht_local_t *local = NULL; + call_frame_t *lock_frame = NULL; + + GF_VALIDATE_OR_GOTO("dht-locks", frame, out); + GF_VALIDATE_OR_GOTO(frame->this->name, lk_array, out); + GF_VALIDATE_OR_GOTO(frame->this->name, inodelk_cbk, out); + + lock_frame = dht_lock_frame(frame); + if (lock_frame == NULL) + goto out; + + ret = dht_local_inodelk_init(lock_frame, lk_array, lk_count, inodelk_cbk); + if (ret < 0) { + goto out; + } + + dht_set_lkowner(lk_array, lk_count, &lock_frame->root->lk_owner); + + local = lock_frame->local; + local->main_frame = frame; + + local->call_cnt = lk_count; + + for (i = 0; i < lk_count; i++) { + flock.l_type = local->lock[0].layout.my_layout.locks[i]->type; + + STACK_WIND_COOKIE( + lock_frame, dht_nonblocking_inodelk_cbk, (void *)(long)i, + local->lock[0].layout.my_layout.locks[i]->xl, + local->lock[0].layout.my_layout.locks[i]->xl->fops->inodelk, + local->lock[0].layout.my_layout.locks[i]->domain, + &local->lock[0].layout.my_layout.locks[i]->loc, F_SETLK, &flock, + NULL); + } + + return 0; + +out: + if (lock_frame) + dht_lock_stack_destroy(lock_frame, DHT_INODELK); + + return -1; +} + +static int32_t +dht_blocking_inodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + int lk_index = 0; + int i = 0; + dht_local_t *local = NULL; + char gfid[GF_UUID_BUF_SIZE] = { + 0, + }; + dht_reaction_type_t reaction = 0; + + lk_index = (long)cookie; + + local = frame->local; + if (op_ret == 0) { + local->lock[0].layout.my_layout.locks[lk_index]->locked = _gf_true; + } else { + switch (op_errno) { + case ESTALE: + case ENOENT: + reaction = local->lock[0] + .layout.my_layout.locks[lk_index] + ->do_on_failure; + if ((reaction != IGNORE_ENOENT_ESTALE) && + (reaction != IGNORE_ENOENT_ESTALE_EIO)) { + gf_uuid_unparse(local->lock[0] + .layout.my_layout.locks[lk_index] + ->loc.gfid, + gfid); + local->lock[0].layout.my_layout.op_ret = -1; + local->lock[0].layout.my_layout.op_errno = op_errno; + gf_smsg(this->name, GF_LOG_ERROR, op_errno, + DHT_MSG_INODELK_FAILED, "subvol=%s", + local->lock[0] + .layout.my_layout.locks[lk_index] + ->xl->name, + "gfid=%s", gfid, NULL); + goto cleanup; + } + break; + case EIO: + reaction = local->lock[0] + .layout.my_layout.locks[lk_index] + ->do_on_failure; + if (reaction != IGNORE_ENOENT_ESTALE_EIO) { + gf_uuid_unparse(local->lock[0] + .layout.my_layout.locks[lk_index] + ->loc.gfid, + gfid); + local->lock[0].layout.my_layout.op_ret = -1; + local->lock[0].layout.my_layout.op_errno = op_errno; + gf_smsg(this->name, GF_LOG_ERROR, op_errno, + DHT_MSG_INODELK_FAILED, "subvol=%s", + local->lock[0] + .layout.my_layout.locks[lk_index] + ->xl->name, + "gfid=%s", gfid, NULL); + goto cleanup; + } + break; + + default: + gf_uuid_unparse( + local->lock[0].layout.my_layout.locks[lk_index]->loc.gfid, + gfid); + local->lock[0].layout.my_layout.op_ret = -1; + local->lock[0].layout.my_layout.op_errno = op_errno; + gf_smsg( + this->name, GF_LOG_ERROR, op_errno, DHT_MSG_INODELK_FAILED, + "subvol=%s", + local->lock[0].layout.my_layout.locks[lk_index]->xl->name, + "gfid=%s", gfid, NULL); + goto cleanup; + } + } + + if (lk_index == (local->lock[0].layout.my_layout.lk_count - 1)) { + for (i = 0; (i < local->lock[0].layout.my_layout.lk_count) && + (!local->lock[0].layout.my_layout.locks[i]->locked); + i++) + ; + + if (i == local->lock[0].layout.my_layout.lk_count) { + local->lock[0].layout.my_layout.op_ret = -1; + local->lock[0].layout.my_layout.op_errno = op_errno; + } + + dht_inodelk_done(frame); + } else { + dht_blocking_inodelk_rec(frame, ++lk_index); + } + + return 0; + +cleanup: + dht_inodelk_cleanup(frame); + + return 0; +} + +void +dht_blocking_inodelk_rec(call_frame_t *frame, int i) +{ + dht_local_t *local = NULL; + struct gf_flock flock = { + 0, + }; + + local = frame->local; + + flock.l_type = local->lock[0].layout.my_layout.locks[i]->type; + + STACK_WIND_COOKIE( + frame, dht_blocking_inodelk_cbk, (void *)(long)i, + local->lock[0].layout.my_layout.locks[i]->xl, + local->lock[0].layout.my_layout.locks[i]->xl->fops->inodelk, + local->lock[0].layout.my_layout.locks[i]->domain, + &local->lock[0].layout.my_layout.locks[i]->loc, F_SETLKW, &flock, NULL); + + return; +} + +int +dht_blocking_inodelk(call_frame_t *frame, dht_lock_t **lk_array, int lk_count, + fop_inodelk_cbk_t inodelk_cbk) +{ + int ret = -1; + call_frame_t *lock_frame = NULL; + dht_local_t *local = NULL; + dht_local_t *tmp_local = NULL; + char gfid[GF_UUID_BUF_SIZE] = { + 0, + }; + + GF_VALIDATE_OR_GOTO("dht-locks", frame, out); + GF_VALIDATE_OR_GOTO(frame->this->name, lk_array, out); + GF_VALIDATE_OR_GOTO(frame->this->name, inodelk_cbk, out); + + tmp_local = frame->local; + + lock_frame = dht_lock_frame(frame); + if (lock_frame == NULL) { + gf_uuid_unparse(tmp_local->loc.gfid, gfid); + gf_smsg("dht", GF_LOG_ERROR, ENOMEM, DHT_MSG_LOCK_FRAME_FAILED, + "gfid=%s", gfid, "path=%s", tmp_local->loc.path, NULL); + goto out; + } + + ret = dht_local_inodelk_init(lock_frame, lk_array, lk_count, inodelk_cbk); + if (ret < 0) { + gf_uuid_unparse(tmp_local->loc.gfid, gfid); + gf_smsg("dht", GF_LOG_ERROR, ENOMEM, DHT_MSG_LOCAL_LOCK_INIT_FAILED, + "gfid=%s", gfid, "path=%s", tmp_local->loc.path, NULL); + goto out; + } + + dht_set_lkowner(lk_array, lk_count, &lock_frame->root->lk_owner); + + local = lock_frame->local; + local->main_frame = frame; + + dht_blocking_inodelk_rec(lock_frame, 0); + + return 0; +out: + if (lock_frame) + dht_lock_stack_destroy(lock_frame, DHT_INODELK); + + return -1; +} + +void +dht_unlock_namespace(call_frame_t *frame, dht_dir_transaction_t *lock) +{ + GF_VALIDATE_OR_GOTO("dht-locks", frame, out); + GF_VALIDATE_OR_GOTO(frame->this->name, lock, out); + + dht_unlock_entrylk_wrapper(frame, &lock->ns.directory_ns); + dht_unlock_inodelk_wrapper(frame, &lock->ns.parent_layout); + +out: + return; +} + +static int32_t +dht_protect_namespace_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + dht_local_t *local = NULL; + + local = frame->local; + if (op_ret != 0) + dht_unlock_inodelk_wrapper(frame, &local->current->ns.parent_layout); + + local->current->ns.ns_cbk(frame, cookie, this, op_ret, op_errno, xdata); + return 0; +} + +int32_t +dht_blocking_entrylk_after_inodelk(call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xdata) +{ + dht_local_t *local = NULL; + int ret = -1; + loc_t *loc = NULL; + dht_lock_t **lk_array = NULL; + char pgfid[GF_UUID_BUF_SIZE] = {0}; + int count = 0; + dht_elock_wrap_t *entrylk = NULL; + + local = frame->local; + entrylk = &local->current->ns.directory_ns; + + if (op_ret < 0) { + local->op_ret = -1; + local->op_errno = op_errno; + goto err; + } + + loc = &entrylk->locks[0]->loc; + gf_uuid_unparse(loc->gfid, pgfid); + + local->op_ret = 0; + lk_array = entrylk->locks; + count = entrylk->lk_count; + + ret = dht_blocking_entrylk(frame, lk_array, count, + dht_protect_namespace_cbk); + + if (ret < 0) { + local->op_ret = -1; + local->op_errno = EIO; + gf_smsg(this->name, GF_LOG_WARNING, local->op_errno, + DHT_MSG_ENTRYLK_FAILED_AFT_INODELK, "fop=%s", + gf_fop_list[local->fop], "pgfid=%s", pgfid, "basename=%s", + entrylk->locks[0]->basename, NULL); + goto err; + } + + return 0; + +err: + if (lk_array != NULL) { + dht_lock_array_free(lk_array, count); + GF_FREE(lk_array); + entrylk->locks = NULL; + entrylk->lk_count = 0; + } + + /* Unlock inodelk. No harm calling unlock twice */ + dht_unlock_inodelk_wrapper(frame, &local->current->ns.parent_layout); + /* Call ns_cbk. It will take care of unwinding */ + local->current->ns.ns_cbk(frame, NULL, this, local->op_ret, local->op_errno, + NULL); + return 0; +} + +/* Given the loc and the subvol, this routine takes the inodelk on + * the parent inode and entrylk on (parent, loc->name). This routine + * is specific as it supports only one subvol on which it takes inodelk + * and then entrylk serially. + */ +int +dht_protect_namespace(call_frame_t *frame, loc_t *loc, xlator_t *subvol, + struct dht_namespace *ns, fop_entrylk_cbk_t ns_cbk) +{ + dht_ilock_wrap_t *inodelk = NULL; + dht_elock_wrap_t *entrylk = NULL; + dht_lock_t **lk_array = NULL; + dht_local_t *local = NULL; + xlator_t *this = NULL; + loc_t parent = { + 0, + }; + int ret = -1; + char pgfid[GF_UUID_BUF_SIZE] = {0}; + int32_t op_errno = 0; + int count = 1; + + GF_VALIDATE_OR_GOTO("dht-locks", frame, out); + GF_VALIDATE_OR_GOTO(frame->this->name, loc, out); + GF_VALIDATE_OR_GOTO(frame->this->name, loc->parent, out); + GF_VALIDATE_OR_GOTO(frame->this->name, subvol, out); + + local = frame->local; + this = frame->this; + + inodelk = &ns->parent_layout; + entrylk = &ns->directory_ns; + + /* Initialize entrylk_cbk and parent loc */ + ns->ns_cbk = ns_cbk; + + ret = dht_build_parent_loc(this, &parent, loc, &op_errno); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, op_errno, DHT_MSG_LOC_FAILED, + "gfid=%s", loc->gfid, "name=%s", loc->name, "path=%s", + loc->path, NULL); + goto out; + } + gf_uuid_unparse(parent.gfid, pgfid); + + /* Alloc inodelk */ + inodelk->locks = GF_CALLOC(count, sizeof(*lk_array), gf_common_mt_pointer); + if (inodelk->locks == NULL) { + local->op_errno = ENOMEM; + gf_smsg(this->name, GF_LOG_WARNING, local->op_errno, + DHT_MSG_CALLOC_FAILED, "fop=%s", gf_fop_list[local->fop], + "pgfid=%s", pgfid, "name=%s", loc->name, "path=%s", loc->path, + NULL); + goto out; + } + + inodelk->locks[0] = dht_lock_new(this, subvol, &parent, F_RDLCK, + DHT_LAYOUT_HEAL_DOMAIN, NULL, + FAIL_ON_ANY_ERROR); + if (inodelk->locks[0] == NULL) { + local->op_errno = ENOMEM; + gf_smsg(this->name, GF_LOG_WARNING, local->op_errno, + DHT_MSG_LOCK_ALLOC_FAILED, "inodelk-fop=%s", + gf_fop_list[local->fop], "pgfid=%s", pgfid, "name=%s", + loc->name, "path=%s", loc->path, NULL); + goto err; + } + inodelk->lk_count = count; + + /* Allock entrylk */ + entrylk->locks = GF_CALLOC(count, sizeof(*lk_array), gf_common_mt_pointer); + if (entrylk->locks == NULL) { + local->op_errno = ENOMEM; + gf_smsg(this->name, GF_LOG_WARNING, local->op_errno, + DHT_MSG_CALLOC_FAILED, "entrylk-fop=%s", + gf_fop_list[local->fop], "pgfid=%s", pgfid, "name=%s", + loc->name, "path=%s", loc->path, NULL); + + goto err; + } + + entrylk->locks[0] = dht_lock_new(this, subvol, &parent, F_WRLCK, + DHT_ENTRY_SYNC_DOMAIN, loc->name, + FAIL_ON_ANY_ERROR); + if (entrylk->locks[0] == NULL) { + local->op_errno = ENOMEM; + gf_smsg(this->name, GF_LOG_WARNING, local->op_errno, + DHT_MSG_LOCK_ALLOC_FAILED, "entrylk-fop=%s", + gf_fop_list[local->fop], "pgfid=%s", pgfid, "name=%s", + loc->name, "path=%s", loc->path, NULL); + + goto err; + } + entrylk->lk_count = count; + + /* Take read inodelk on parent. If it is successful, take write entrylk + * on name in cbk. + */ + lk_array = inodelk->locks; + ret = dht_blocking_inodelk(frame, lk_array, count, + dht_blocking_entrylk_after_inodelk); + if (ret < 0) { + local->op_errno = EIO; + gf_smsg(this->name, GF_LOG_WARNING, local->op_errno, + DHT_MSG_BLOCK_INODELK_FAILED, "fop=%s", gf_fop_list[local->fop], + "pgfid=%s", pgfid, "name=%s", loc->name, "path=%s", loc->path, + NULL); + + goto err; + } + + loc_wipe(&parent); + + return 0; +err: + if (entrylk->locks != NULL) { + dht_lock_array_free(entrylk->locks, count); + GF_FREE(entrylk->locks); + entrylk->locks = NULL; + entrylk->lk_count = 0; + } + + if (inodelk->locks != NULL) { + dht_lock_array_free(inodelk->locks, count); + GF_FREE(inodelk->locks); + inodelk->locks = NULL; + inodelk->lk_count = 0; + } + + loc_wipe(&parent); +out: + return -1; +} diff --git a/xlators/cluster/dht/src/dht-lock.h b/xlators/cluster/dht/src/dht-lock.h new file mode 100644 index 00000000000..6485c03fb6e --- /dev/null +++ b/xlators/cluster/dht/src/dht-lock.h @@ -0,0 +1,91 @@ +/* + Copyright (c) 2016 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _DHT_LOCK_H +#define _DHT_LOCK_H + +#include "dht-common.h" + +void +dht_lock_array_free(dht_lock_t **lk_array, int count); + +int32_t +dht_lock_count(dht_lock_t **lk_array, int lk_count); + +dht_lock_t * +dht_lock_new(xlator_t *this, xlator_t *xl, loc_t *loc, short type, + const char *domain, const char *basename, + dht_reaction_type_t do_on_failure); + +int32_t +dht_unlock_entrylk_wrapper(call_frame_t *, dht_elock_wrap_t *); + +void +dht_blocking_entrylk_rec(call_frame_t *frame, int i); + +int +dht_blocking_entrylk(call_frame_t *frame, dht_lock_t **lk_array, int lk_count, + fop_inodelk_cbk_t entrylk_cbk); + +int32_t +dht_unlock_inodelk(call_frame_t *frame, dht_lock_t **lk_array, int lk_count, + fop_inodelk_cbk_t inodelk_cbk); + +int32_t +dht_unlock_inodelk_wrapper(call_frame_t *, dht_ilock_wrap_t *); + +/* Acquire non-blocking inodelk on a list of xlators. + * + * @lk_array: array of lock requests lock on. + * + * @lk_count: number of locks in @lk_array + * + * @inodelk_cbk: will be called after inodelk replies are received + * + * @retval: -1 if stack_winding inodelk fails. 0 otherwise. + * inodelk_cbk is called with appropriate error on errors. + * On failure to acquire lock on all members of list, successful + * locks are unlocked before invoking cbk. + */ + +int +dht_nonblocking_inodelk(call_frame_t *frame, dht_lock_t **lk_array, + int lk_count, fop_inodelk_cbk_t inodelk_cbk); + +void +dht_blocking_inodelk_rec(call_frame_t *frame, int i); + +/* same as dht_nonblocking_inodelk, but issues sequential blocking locks on + * @lk_array directly. locks are issued on some order which remains same + * for a list of xlators (irrespective of order of xlators within list). + */ + +int +dht_blocking_inodelk(call_frame_t *frame, dht_lock_t **lk_array, int lk_count, + fop_inodelk_cbk_t inodelk_cbk); + +int32_t +dht_blocking_entrylk_after_inodelk(call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xdata); + +int32_t +dht_blocking_entrylk_after_inodelk_rename(call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xdata); + +void +dht_unlock_namespace(call_frame_t *, dht_dir_transaction_t *); + +int +dht_protect_namespace(call_frame_t *frame, loc_t *loc, xlator_t *subvol, + struct dht_namespace *ns, fop_entrylk_cbk_t ns_cbk); + +#endif /* _DHT_LOCK_H */ diff --git a/xlators/cluster/dht/src/dht-mem-types.h b/xlators/cluster/dht/src/dht-mem-types.h index e893eb48fd8..e3c4471334a 100644 --- a/xlators/cluster/dht/src/dht-mem-types.h +++ b/xlators/cluster/dht/src/dht-mem-types.h @@ -8,28 +8,31 @@ cases as published by the Free Software Foundation. */ - #ifndef __DHT_MEM_TYPES_H__ #define __DHT_MEM_TYPES_H__ -#include "mem-types.h" +#include <glusterfs/mem-types.h> enum gf_dht_mem_types_ { - gf_dht_mt_dht_du_t = gf_common_mt_end + 1, - gf_dht_mt_dht_conf_t, - gf_dht_mt_char, - gf_dht_mt_int32_t, - gf_dht_mt_xlator_t, - gf_dht_mt_dht_layout_t, - gf_switch_mt_dht_conf_t, - gf_switch_mt_dht_du_t, - gf_switch_mt_switch_sched_array, - gf_switch_mt_switch_struct, - gf_dht_mt_subvol_time, - gf_dht_mt_loc_t, - gf_defrag_info_mt, - gf_dht_mt_inode_ctx_t, - gf_dht_mt_ctx_stat_time_t, - gf_dht_mt_end + gf_dht_mt_dht_du_t = gf_common_mt_end + 1, + gf_dht_mt_dht_conf_t, + gf_dht_mt_char, + gf_dht_mt_int32_t, + gf_dht_mt_xlator_t, + gf_dht_mt_dht_layout_t, + gf_switch_mt_switch_sched_array, + gf_switch_mt_switch_struct, + gf_dht_mt_subvol_time, + gf_dht_mt_loc_t, + gf_defrag_info_mt, + gf_dht_mt_inode_ctx_t, + gf_dht_mt_dirent_t, + gf_dht_mt_container_t, + gf_dht_mt_octx_t, + gf_dht_mt_miginfo_t, + gf_dht_mt_fd_ctx_t, + gf_dht_ret_cache_t, + gf_dht_nodeuuids_t, + gf_dht_mt_end }; #endif diff --git a/xlators/cluster/dht/src/dht-messages.h b/xlators/cluster/dht/src/dht-messages.h new file mode 100644 index 00000000000..601f8dad78b --- /dev/null +++ b/xlators/cluster/dht/src/dht-messages.h @@ -0,0 +1,386 @@ +/*Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _DHT_MESSAGES_H_ +#define _DHT_MESSAGES_H_ + +#include <glusterfs/glfs-message-id.h> + +/* To add new message IDs, append new identifiers at the end of the list. + * + * Never remove a message ID. If it's not used anymore, you can rename it or + * leave it as it is, but not delete it. This is to prevent reutilization of + * IDs by other messages. + * + * The component name must match one of the entries defined in + * glfs-message-id.h. + */ + +GLFS_MSGID( + DHT, DHT_MSG_CACHED_SUBVOL_GET_FAILED, DHT_MSG_CREATE_LINK_FAILED, + DHT_MSG_DICT_SET_FAILED, DHT_MSG_DIR_ATTR_HEAL_FAILED, + DHT_MSG_DIR_SELFHEAL_FAILED, DHT_MSG_DIR_SELFHEAL_XATTR_FAILED, + DHT_MSG_FILE_ON_MULT_SUBVOL, DHT_MSG_FILE_TYPE_MISMATCH, + DHT_MSG_GFID_MISMATCH, DHT_MSG_GFID_NULL, DHT_MSG_HASHED_SUBVOL_GET_FAILED, + DHT_MSG_INIT_FAILED, DHT_MSG_INVALID_CONFIGURATION, + DHT_MSG_INVALID_DISK_LAYOUT, DHT_MSG_INVALID_OPTION, + DHT_MSG_LAYOUT_FIX_FAILED, DHT_MSG_LAYOUT_MERGE_FAILED, + DHT_MSG_LAYOUT_MISMATCH, DHT_MSG_LAYOUT_NULL, DHT_MSG_MIGRATE_DATA_COMPLETE, + DHT_MSG_MIGRATE_DATA_FAILED, DHT_MSG_MIGRATE_FILE_COMPLETE, + DHT_MSG_MIGRATE_FILE_FAILED, DHT_MSG_NO_MEMORY, DHT_MSG_OPENDIR_FAILED, + DHT_MSG_REBALANCE_FAILED, DHT_MSG_REBALANCE_START_FAILED, + DHT_MSG_REBALANCE_STATUS, DHT_MSG_REBALANCE_STOPPED, DHT_MSG_RENAME_FAILED, + DHT_MSG_SETATTR_FAILED, DHT_MSG_SUBVOL_INSUFF_INODES, + DHT_MSG_SUBVOL_INSUFF_SPACE, DHT_MSG_UNLINK_FAILED, + DHT_MSG_LAYOUT_SET_FAILED, DHT_MSG_LOG_FIXED_LAYOUT, + DHT_MSG_GET_XATTR_FAILED, DHT_MSG_FILE_LOOKUP_FAILED, + DHT_MSG_OPEN_FD_FAILED, DHT_MSG_SET_INODE_CTX_FAILED, + DHT_MSG_UNLOCKING_FAILED, DHT_MSG_DISK_LAYOUT_NULL, DHT_MSG_SUBVOL_INFO, + DHT_MSG_CHUNK_SIZE_INFO, DHT_MSG_LAYOUT_FORM_FAILED, DHT_MSG_SUBVOL_ERROR, + DHT_MSG_LAYOUT_SORT_FAILED, DHT_MSG_REGEX_INFO, DHT_MSG_FOPEN_FAILED, + DHT_MSG_SET_HOSTNAME_FAILED, DHT_MSG_BRICK_ERROR, DHT_MSG_SYNCOP_FAILED, + DHT_MSG_MIGRATE_INFO, DHT_MSG_SOCKET_ERROR, DHT_MSG_CREATE_FD_FAILED, + DHT_MSG_READDIR_ERROR, DHT_MSG_CHILD_LOC_BUILD_FAILED, + DHT_MSG_SET_SWITCH_PATTERN_ERROR, DHT_MSG_COMPUTE_HASH_FAILED, + DHT_MSG_FIND_LAYOUT_ANOMALIES_ERROR, DHT_MSG_ANOMALIES_INFO, + DHT_MSG_LAYOUT_INFO, DHT_MSG_INODE_LK_ERROR, DHT_MSG_RENAME_INFO, + DHT_MSG_DATA_NULL, DHT_MSG_AGGREGATE_QUOTA_XATTR_FAILED, + DHT_MSG_UNLINK_LOOKUP_INFO, DHT_MSG_LINK_FILE_LOOKUP_INFO, + DHT_MSG_OPERATION_NOT_SUP, DHT_MSG_NOT_LINK_FILE_ERROR, DHT_MSG_CHILD_DOWN, + DHT_MSG_UUID_PARSE_ERROR, DHT_MSG_GET_DISK_INFO_ERROR, + DHT_MSG_INVALID_VALUE, DHT_MSG_SWITCH_PATTERN_INFO, + DHT_MSG_SUBVOL_OP_FAILED, DHT_MSG_LAYOUT_PRESET_FAILED, + DHT_MSG_INVALID_LINKFILE, DHT_MSG_FIX_LAYOUT_INFO, + DHT_MSG_GET_HOSTNAME_FAILED, DHT_MSG_WRITE_FAILED, + DHT_MSG_MIGRATE_HARDLINK_FILE_FAILED, DHT_MSG_FSYNC_FAILED, + DHT_MSG_SUBVOL_DECOMMISSION_INFO, DHT_MSG_BRICK_QUERY_FAILED, + DHT_MSG_SUBVOL_NO_LAYOUT_INFO, DHT_MSG_OPEN_FD_ON_DST_FAILED, + DHT_MSG_SUBVOL_NOT_FOUND, DHT_MSG_FILE_LOOKUP_ON_DST_FAILED, + DHT_MSG_DISK_LAYOUT_MISSING, DHT_MSG_DICT_GET_FAILED, + DHT_MSG_REVALIDATE_CBK_INFO, DHT_MSG_UPGRADE_BRICKS, DHT_MSG_LK_ARRAY_INFO, + DHT_MSG_RENAME_NOT_LOCAL, DHT_MSG_RECONFIGURE_INFO, + DHT_MSG_INIT_LOCAL_SUBVOL_FAILED, DHT_MSG_SYS_CALL_GET_TIME_FAILED, + DHT_MSG_NO_DISK_USAGE_STATUS, DHT_MSG_SUBVOL_DOWN_ERROR, + DHT_MSG_REBAL_THROTTLE_INFO, DHT_MSG_COMMIT_HASH_INFO, + DHT_MSG_REBAL_STRUCT_SET, DHT_MSG_HAS_MIGINFO, DHT_MSG_SETTLE_HASH_FAILED, + DHT_MSG_DEFRAG_PROCESS_DIR_FAILED, DHT_MSG_FD_CTX_SET_FAILED, + DHT_MSG_STALE_LOOKUP, DHT_MSG_PARENT_LAYOUT_CHANGED, + DHT_MSG_LOCK_MIGRATION_FAILED, DHT_MSG_LOCK_INODE_UNREF_FAILED, + DHT_MSG_ASPRINTF_FAILED, DHT_MSG_DIR_LOOKUP_FAILED, DHT_MSG_INODELK_FAILED, + DHT_MSG_LOCK_FRAME_FAILED, DHT_MSG_LOCAL_LOCK_INIT_FAILED, + DHT_MSG_ENTRYLK_ERROR, DHT_MSG_INODELK_ERROR, DHT_MSG_LOC_FAILED, + DHT_MSG_UNKNOWN_FOP, DHT_MSG_MIGRATE_FILE_SKIPPED, + DHT_MSG_DIR_XATTR_HEAL_FAILED, DHT_MSG_HASHED_SUBVOL_DOWN, + DHT_MSG_NON_HASHED_SUBVOL_DOWN, DHT_MSG_SYNCTASK_CREATE_FAILED, + DHT_MSG_DIR_HEAL_ABORT, DHT_MSG_MIGRATE_SKIP, DHT_MSG_FD_CREATE_FAILED, + DHT_MSG_DICT_NEW_FAILED, DHT_MSG_FAILED_TO_OPEN, DHT_MSG_CREATE_FAILED, + DHT_MSG_FILE_NOT_EXIST, DHT_MSG_CHOWN_FAILED, DHT_MSG_FALLOCATE_FAILED, + DHT_MSG_FTRUNCATE_FAILED, DHT_MSG_STATFS_FAILED, DHT_MSG_WRITE_CROSS, + DHT_MSG_NEW_TARGET_FOUND, DHT_MSG_INSUFF_MEMORY, DHT_MSG_SET_XATTR_FAILED, + DHT_MSG_SET_MODE_FAILED, DHT_MSG_FILE_EXISTS_IN_DEST, + DHT_MSG_SYMLINK_FAILED, DHT_MSG_LINKFILE_DEL_FAILED, DHT_MSG_MKNOD_FAILED, + DHT_MSG_MIGRATE_CLEANUP_FAILED, DHT_MSG_LOCK_MIGRATE, + DHT_MSG_PARENT_BUILD_FAILED, DHT_MSG_HASHED_SUBVOL_NOT_FOUND, + DHT_MSG_ACQUIRE_ENTRYLK_FAILED, DHT_MSG_CREATE_DST_FAILED, + DHT_MSG_MIGRATION_EXIT, DHT_MSG_CHANGED_DST, DHT_MSG_TRACE_FAILED, + DHT_MSG_WRITE_LOCK_FAILED, DHT_MSG_GETACTIVELK_FAILED, DHT_MSG_STAT_FAILED, + DHT_MSG_UNLINK_PERFORM_FAILED, DHT_MSG_CLANUP_SOURCE_FILE_FAILED, + DHT_MSG_UNLOCK_FILE_FAILED, DHT_MSG_REMOVE_XATTR_FAILED, + DHT_MSG_DATA_MIGRATE_ABORT, DHT_MSG_DEFRAG_NULL, DHT_MSG_PARENT_NULL, + DHT_MSG_GFID_NOT_PRESENT, DHT_MSG_CHILD_LOC_FAILED, + DHT_MSG_SET_LOOKUP_FAILED, DHT_MSG_DIR_REMOVED, DHT_MSG_FIX_NOT_COMP, + DHT_MSG_SUBVOL_DETER_FAILED, DHT_MSG_LOCAL_SUBVOL, DHT_MSG_NODE_UUID, + DHT_MSG_SIZE_FILE, DHT_MSG_GET_DATA_SIZE_FAILED, + DHT_MSG_PTHREAD_JOIN_FAILED, DHT_MSG_COUNTER_THREAD_CREATE_FAILED, + DHT_MSG_MIGRATION_INIT_QUEUE_FAILED, DHT_MSG_PAUSED_TIMEOUT, DHT_MSG_WOKE, + DHT_MSG_ABORT_REBALANCE, DHT_MSG_CREATE_TASK_REBAL_FAILED, + DHT_MSG_REBAL_ESTIMATE_NOT_AVAIL, DHT_MSG_ADD_CHOICES_ERROR, + DHT_MSG_GET_CHOICES_ERROR, DHT_MSG_PREPARE_STATUS_ERROR, + DHT_MSG_SET_CHOICE_FAILED, DHT_MSG_SET_HASHED_SUBVOL_FAILED, + DHT_MSG_XATTR_HEAL_NOT_POSS, DHT_MSG_LINKTO_FILE_FAILED, + DHT_MSG_STALE_LINKFILE_DELETE, DHT_MSG_NO_SUBVOL_FOR_LINKTO, + DHT_MSG_SUBVOL_RETURNED, DHT_MSG_UNKNOWN_LOCAL_XSEL, DHT_MSG_GET_XATTR_ERR, + DHT_MSG_ALLOC_OR_FILL_FAILED, DHT_MSG_GET_REAL_NAME_FAILED, + DHT_MSG_COPY_UUID_FAILED, DHT_MSG_MDS_DETER_FAILED, + DHT_MSG_CREATE_REBAL_FAILED, DHT_MSG_LINK_LAYOUT_FAILED, + DHT_MSG_NO_SUBVOL_IN_LAYOUT, DHT_MSG_MEM_ALLOC_FAILED, + DHT_MSG_SET_IN_PARAMS_DICT_FAILED, DHT_MSG_LOC_COPY_FAILED, + DHT_MSG_PARENT_LOC_FAILED, DHT_MSG_CREATE_LOCK_FAILED, + DHT_MSG_PREV_ATTEMPT_FAILED, DHT_MSG_REFRESH_ATTEMPT, + DHT_MSG_ACQUIRE_LOCK_FAILED, DHT_MSG_CREATE_STUB_FAILED, + DHT_MSG_WIND_LOCK_REQ_FAILED, DHT_MSG_REFRESH_FAILED, + DHT_MSG_CACHED_SUBVOL_ERROR, DHT_MSG_NO_LINK_SUBVOL, DHT_MSG_SET_KEY_FAILED, + DHT_MSG_REMOVE_LINKTO_FAILED, DHT_MSG_LAYOUT_DICT_SET_FAILED, + DHT_MSG_XATTR_DICT_NULL, DHT_MSG_DUMMY_ALLOC_FAILED, DHT_MSG_DICT_IS_NULL, + DHT_MSG_LINK_INODE_FAILED, DHT_MSG_SELFHEAL_FAILED, DHT_MSG_NO_MDS_SUBVOL, + DHT_MSG_LIST_XATTRS_FAILED, DHT_MSG_RESET_INTER_XATTR_FAILED, + DHT_MSG_MDS_DOWN_UNABLE_TO_SET, DHT_MSG_WIND_UNLOCK_FAILED, + DHT_MSG_COMMIT_HASH_FAILED, DHT_MSG_UNLOCK_GFID_FAILED, + DHT_MSG_UNLOCK_FOLLOW_ENTRYLK, DHT_MSG_COPY_FRAME_FAILED, + DHT_MSG_UNLOCK_FOLLOW_LOCKS, DHT_MSG_ENTRYLK_FAILED_AFT_INODELK, + DHT_MSG_CALLOC_FAILED, DHT_MSG_LOCK_ALLOC_FAILED, + DHT_MSG_BLOCK_INODELK_FAILED, + DHT_MSG_LOCAL_LOCKS_STORE_FAILED_UNLOCKING_FOLLOWING_ENTRYLK, + DHT_MSG_ALLOC_FRAME_FAILED_NOT_UNLOCKING_FOLLOWING_ENTRYLKS, + DHT_MSG_DST_NULL_SET_FAILED); + +#define DHT_MSG_FD_CTX_SET_FAILED_STR "Failed to set fd ctx" +#define DHT_MSG_INVALID_VALUE_STR "Different dst found in the fd ctx" +#define DHT_MSG_UNKNOWN_FOP_STR "Unknown FOP on file" +#define DHT_MSG_OPEN_FD_ON_DST_FAILED_STR "Failed to open the fd on file" +#define DHT_MSG_SYNCTASK_CREATE_FAILED_STR "Failed to create synctask" +#define DHT_MSG_ASPRINTF_FAILED_STR \ + "asprintf failed while fetching subvol from the id" +#define DHT_MSG_HAS_MIGINFO_STR "Found miginfo in the inode ctx" +#define DHT_MSG_FILE_LOOKUP_FAILED_STR "failed to lookup the file" +#define DHT_MSG_INVALID_LINKFILE_STR \ + "linkto target is different from cached-subvol. treating as destination " \ + "subvol" +#define DHT_MSG_GFID_MISMATCH_STR "gfid different on the target file" +#define DHT_MSG_GET_XATTR_FAILED_STR "failed to get 'linkto' xattr" +#define DHT_MSG_SET_INODE_CTX_FAILED_STR "failed to set inode-ctx target file" +#define DHT_MSG_DIR_SELFHEAL_FAILED_STR "Healing of path failed" +#define DHT_MSG_DIR_HEAL_ABORT_STR \ + "Failed to get path from subvol. Aborting directory healing" +#define DHT_MSG_DIR_XATTR_HEAL_FAILED_STR "xattr heal failed for directory" +#define DHT_MSG_LOCK_INODE_UNREF_FAILED_STR \ + "Found a NULL inode. Failed to unref the inode" +#define DHT_MSG_DICT_SET_FAILED_STR "Failed to set dictionary value" +#define DHT_MSG_NOT_LINK_FILE_ERROR_STR "got non-linkfile" +#define DHT_MSG_CREATE_LINK_FAILED_STR "failed to initialize linkfile data" +#define DHT_MSG_UNLINK_FAILED_STR "Unlinking linkfile on subvolume failed" +#define DHT_MSG_MIGRATE_FILE_FAILED_STR "Migrate file failed" +#define DHT_MSG_NO_MEMORY_STR "could not allocate memory for dict" +#define DHT_MSG_SUBVOL_ERROR_STR "Failed to get linkto subvol" +#define DHT_MSG_MIGRATE_HARDLINK_FILE_FAILED_STR "link failed on subvol" +#define DHT_MSG_MIGRATE_FILE_SKIPPED_STR "Migration skipped" +#define DHT_MSG_FD_CREATE_FAILED_STR "fd create failed" +#define DHT_MSG_DICT_NEW_FAILED_STR "dict_new failed" +#define DHT_MSG_FAILED_TO_OPEN_STR "failed to open" +#define DHT_MSG_CREATE_FAILED_STR "failed to create" +#define DHT_MSG_FILE_NOT_EXIST_STR "file does not exist" +#define DHT_MSG_CHOWN_FAILED_STR "chown failed" +#define DHT_MSG_FALLOCATE_FAILED_STR "fallocate failed" +#define DHT_MSG_FTRUNCATE_FAILED_STR "ftruncate failed" +#define DHT_MSG_STATFS_FAILED_STR "failed to get statfs" +#define DHT_MSG_WRITE_CROSS_STR \ + "write will cross min-fre-disk for file on subvol. looking for new subvol" +#define DHT_MSG_SUBVOL_INSUFF_SPACE_STR \ + "Could not find any subvol with space accommodating the file. Cosider " \ + "adding bricks" +#define DHT_MSG_NEW_TARGET_FOUND_STR "New target found for file" +#define DHT_MSG_INSUFF_MEMORY_STR "insufficient memory" +#define DHT_MSG_SET_XATTR_FAILED_STR "failed to set xattr" +#define DHT_MSG_SET_MODE_FAILED_STR "failed to set mode" +#define DHT_MSG_FILE_EXISTS_IN_DEST_STR "file exists in destination" +#define DHT_MSG_LINKFILE_DEL_FAILED_STR "failed to delete the linkfile" +#define DHT_MSG_SYMLINK_FAILED_STR "symlink failed" +#define DHT_MSG_MKNOD_FAILED_STR "mknod failed" +#define DHT_MSG_SETATTR_FAILED_STR "failed to perform setattr" +#define DHT_MSG_MIGRATE_CLEANUP_FAILED_STR \ + "Migrate file cleanup failed: failed to fstat file" +#define DHT_MSG_LOCK_MIGRATE_STR "locks will be migrated for file" +#define DHT_MSG_PARENT_BUILD_FAILED_STR \ + "failed to build parent loc, which is needed to acquire entrylk to " \ + "synchronize with renames on this path. Skipping migration" +#define DHT_MSG_HASHED_SUBVOL_NOT_FOUND_STR \ + "cannot find hashed subvol which is needed to synchronize with renames " \ + "on this path. Skipping migration" +#define DHT_MSG_ACQUIRE_ENTRYLK_FAILED_STR "failed to acquire entrylk on subvol" +#define DHT_MSG_CREATE_DST_FAILED_STR "create dst failed for file" +#define DHT_MSG_MIGRATION_EXIT_STR "Exiting migration" +#define DHT_MSG_CHANGED_DST_STR "destination changed fo file" +#define DHT_MSG_TRACE_FAILED_STR "Trace failed" +#define DHT_MSG_WRITE_LOCK_FAILED_STR "write lock failed" +#define DHT_MSG_GETACTIVELK_FAILED_STR "getactivelk failed for file" +#define DHT_MSG_STAT_FAILED_STR "failed to do a stat" +#define DHT_MSG_UNLINK_PERFORM_FAILED_STR "failed to perform unlink" +#define DHT_MSG_MIGRATE_FILE_COMPLETE_STR "completed migration" +#define DHT_MSG_CLANUP_SOURCE_FILE_FAILED_STR "failed to cleanup source file" +#define DHT_MSG_UNLOCK_FILE_FAILED_STR "failed to unlock file" +#define DHT_MSG_REMOVE_XATTR_FAILED_STR "remove xattr failed" +#define DHT_MSG_SOCKET_ERROR_STR "Failed to unlink listener socket" +#define DHT_MSG_HASHED_SUBVOL_GET_FAILED_STR "Failed to get hashed subvolume" +#define DHT_MSG_CACHED_SUBVOL_GET_FAILED_STR "Failed to get cached subvolume" +#define DHT_MSG_MIGRATE_DATA_FAILED_STR "migrate-data failed" +#define DHT_MSG_DEFRAG_NULL_STR "defrag is NULL" +#define DHT_MSG_DATA_MIGRATE_ABORT_STR \ + "Readdirp failed. Aborting data migration for dict" +#define DHT_MSG_LAYOUT_FIX_FAILED_STR "fix layout failed" +#define DHT_MSG_PARENT_NULL_STR "parent is NULL" +#define DHT_MSG_GFID_NOT_PRESENT_STR "gfid not present" +#define DHT_MSG_CHILD_LOC_FAILED_STR "Child loc build failed" +#define DHT_MSG_SET_LOOKUP_FAILED_STR "Failed to set lookup" +#define DHT_MSG_DIR_LOOKUP_FAILED_STR "lookup failed" +#define DHT_MSG_DIR_REMOVED_STR "Dir renamed or removed. Skipping" +#define DHT_MSG_READDIR_ERROR_STR "readdir failed, Aborting fix-layout" +#define DHT_MSG_SETTLE_HASH_FAILED_STR "Settle hash failed" +#define DHT_MSG_DEFRAG_PROCESS_DIR_FAILED_STR "gf_defrag_process_dir failed" +#define DHT_MSG_FIX_NOT_COMP_STR \ + "Unable to retrieve fixlayout xattr. Assume background fix layout not " \ + "complete" +#define DHT_MSG_SUBVOL_DETER_FAILED_STR \ + "local subvolume determination failed with error" +#define DHT_MSG_LOCAL_SUBVOL_STR "local subvol" +#define DHT_MSG_NODE_UUID_STR "node uuid" +#define DHT_MSG_SIZE_FILE_STR "Total size files" +#define DHT_MSG_GET_DATA_SIZE_FAILED_STR \ + "Failed to get the total data size. Unable to estimate time to complete " \ + "rebalance" +#define DHT_MSG_PTHREAD_JOIN_FAILED_STR \ + "file_counter_thread: pthread_join failed" +#define DHT_MSG_COUNTER_THREAD_CREATE_FAILED_STR \ + "Failed to create the file counter thread" +#define DHT_MSG_MIGRATION_INIT_QUEUE_FAILED_STR \ + "Failed to initialise migration queue" +#define DHT_MSG_REBALANCE_STOPPED_STR "Received stop command on rebalance" +#define DHT_MSG_PAUSED_TIMEOUT_STR "Request pause timer timeout" +#define DHT_MSG_WOKE_STR "woken" +#define DHT_MSG_ABORT_REBALANCE_STR "Aborting rebalance" +#define DHT_MSG_REBALANCE_START_FAILED_STR \ + "Failed to start rebalance: look up on / failed" +#define DHT_MSG_CREATE_TASK_REBAL_FAILED_STR \ + "Could not create task for rebalance" +#define DHT_MSG_REBAL_ESTIMATE_NOT_AVAIL_STR \ + "Rebalance estimates will not be available" +#define DHT_MSG_REBALANCE_STATUS_STR "Rebalance status" +#define DHT_MSG_DATA_NULL_STR "data value is NULL" +#define DHT_MSG_ADD_CHOICES_ERROR_STR "Error to add choices in buffer" +#define DHT_MSG_GET_CHOICES_ERROR_STR "Error to get choices" +#define DHT_MSG_PREPARE_STATUS_ERROR_STR "Error to prepare status" +#define DHT_MSG_SET_CHOICE_FAILED_STR "Failed to set full choice" +#define DHT_MSG_AGGREGATE_QUOTA_XATTR_FAILED_STR \ + "Failed to aggregate quota xattr" +#define DHT_MSG_FILE_TYPE_MISMATCH_STR \ + "path exists as a file on one subvolume and directory on another. Please " \ + "fix it manually" +#define DHT_MSG_LAYOUT_SET_FAILED_STR "failed to set layout for subvolume" +#define DHT_MSG_LAYOUT_MERGE_FAILED_STR "failed to merge layouts for subvolume" +#define DHT_MSG_SET_HASHED_SUBVOL_FAILED_STR "Failed to set hashed subvolume" +#define DHT_MSG_XATTR_HEAL_NOT_POSS_STR \ + "No gfid exists for path. so healing xattr is not possible" +#define DHT_MSG_REVALIDATE_CBK_INFO_STR "Revalidate: subvolume returned -1" +#define DHT_MSG_LAYOUT_MISMATCH_STR "Mismatching layouts" +#define DHT_MSG_UNLINK_LOOKUP_INFO_STR "lookup_unlink retuened" +#define DHT_MSG_LINKTO_FILE_FAILED_STR \ + "Could not unlink the linkto file as either fd is open and/or linkto " \ + "xattr is set" +#define DHT_MSG_LAYOUT_PRESET_FAILED_STR \ + "Could not set pre-set layout for subvolume" +#define DHT_MSG_FILE_ON_MULT_SUBVOL_STR \ + "multiple subvolumes have file (preferably rename the file in the " \ + "backend, and do a fresh lookup" +#define DHT_MSG_STALE_LINKFILE_DELETE_STR \ + "attempting deletion of stale linkfile" +#define DHT_MSG_LINK_FILE_LOOKUP_INFO_STR "Lookup on following linkfile" +#define DHT_MSG_NO_SUBVOL_FOR_LINKTO_STR "No link subvolume for linkto" +#define DHT_MSG_SUBVOL_RETURNED_STR "Subvolume returned -1" +#define DHT_MSG_UNKNOWN_LOCAL_XSEL_STR "Unknown local->xsel" +#define DHT_MSG_DICT_GET_FAILED_STR "Failed to get" +#define DHT_MSG_UUID_PARSE_ERROR_STR "Failed to parse uuid" +#define DHT_MSG_GET_XATTR_ERR_STR "getxattr err for dir" +#define DHT_MSG_ALLOC_OR_FILL_FAILED_STR "alloc or fill failed" +#define DHT_MSG_UPGRADE_BRICKS_STR \ + "At least one of the bricks does not support this operation. Please " \ + "upgrade all bricks" +#define DHT_MSG_GET_REAL_NAME_FAILED_STR "Failed to get real filename" +#define DHT_MSG_LAYOUT_NULL_STR "Layout is NULL" +#define DHT_MSG_COPY_UUID_FAILED_STR "Failed to copy node uuid key" +#define DHT_MSG_MDS_DETER_FAILED_STR \ + "Cannot determine MDS, fetching xattr randomly from a subvol" +#define DHT_MSG_HASHED_SUBVOL_DOWN_STR \ + "MDS is down for path, so fetching xattr randomly from subvol" +#define DHT_MSG_CREATE_REBAL_FAILED_STR \ + "failed to create a new rebalance synctask" +#define DHT_MSG_FIX_LAYOUT_INFO_STR "fixing the layout" +#define DHT_MSG_OPERATION_NOT_SUP_STR "wrong directory-spread-count value" +#define DHT_MSG_LINK_LAYOUT_FAILED_STR "failed to link the layout in inode" +#define DHT_MSG_NO_SUBVOL_IN_LAYOUT_STR "no subvolume in layout for path" +#define DHT_MSG_INODE_LK_ERROR_STR "mknod lock failed for file" +#define DHT_MSG_MEM_ALLOC_FAILED_STR "mem allocation failed" +#define DHT_MSG_PARENT_LAYOUT_CHANGED_STR \ + "extracting in-memory layout of parent failed" +#define DHT_MSG_SET_IN_PARAMS_DICT_FAILED_STR \ + "setting in params dictionary failed" +#define DHT_MSG_LOC_COPY_FAILED_STR "loc_copy failed" +#define DHT_MSG_LOC_FAILED_STR "parent loc build failed" +#define DHT_MSG_PARENT_LOC_FAILED_STR "locking parent failed" +#define DHT_MSG_CREATE_LOCK_FAILED_STR "Create lock failed" +#define DHT_MSG_PREV_ATTEMPT_FAILED_STR \ + "mkdir loop detected. parent layout didn't change even though previous " \ + "attempt of mkdir failed because of in-memory layout not matching with " \ + "that on disk." +#define DHT_MSG_REFRESH_ATTEMPT_STR \ + "mkdir parent layout changed. Attempting a refresh and then a retry" +#define DHT_MSG_ACQUIRE_LOCK_FAILED_STR \ + "Acquiring lock on parent to guard against layout-change failed" +#define DHT_MSG_CREATE_STUB_FAILED_STR "creating stub failed" +#define DHT_MSG_WIND_LOCK_REQ_FAILED_STR \ + "cannot wind lock request to guard parent layout" +#define DHT_MSG_REFRESH_FAILED_STR "refreshing parent layout failed." +#define DHT_MSG_CACHED_SUBVOL_ERROR_STR "On cached subvol" +#define DHT_MSG_NO_LINK_SUBVOL_STR "Linkfile does not have link subvolume" +#define DHT_MSG_SET_KEY_FAILED_STR "failed to set key" +#define DHT_MSG_CHILD_DOWN_STR "Received CHILD_DOWN. Exiting" +#define DHT_MSG_LOG_FIXED_LAYOUT_STR "log layout fixed" +#define DHT_MSG_REBAL_STRUCT_SET_STR "local->rebalance already set" +#define DHT_MSG_REMOVE_LINKTO_FAILED_STR "Removal of linkto failed at subvol" +#define DHT_MSG_LAYOUT_DICT_SET_FAILED_STR "dht layout dict set failed" +#define DHT_MSG_SUBVOL_INFO_STR "creating subvolume" +#define DHT_MSG_COMPUTE_HASH_FAILED_STR "hash computation failed" +#define DHT_MSG_INVALID_DISK_LAYOUT_STR \ + "Invalid disk layout: Catastrophic error layout with unknown type found" +#define DHT_MSG_LAYOUT_SORT_FAILED_STR "layout sort failed" +#define DHT_MSG_ANOMALIES_INFO_STR "Found anomalies" +#define DHT_MSG_XATTR_DICT_NULL_STR "xattr dictionary is NULL" +#define DHT_MSG_DISK_LAYOUT_MISSING_STR "Disk layout missing" +#define DHT_MSG_LAYOUT_INFO_STR "layout info" +#define DHT_MSG_SUBVOL_NO_LAYOUT_INFO_STR "no pre-set layout for subvol" +#define DHT_MSG_SELFHEAL_XATTR_FAILED_STR "layout setxattr failed" +#define DHT_MSG_DIR_SELFHEAL_XATTR_FAILED_STR "Directory self heal xattr failed" +#define DHT_MSG_DUMMY_ALLOC_FAILED_STR "failed to allocate dummy layout" +#define DHT_MSG_DICT_IS_NULL_STR \ + "dict is NULL, need to make sure gfids are same" +#define DHT_MSG_ENTRYLK_ERROR_STR "acquiring entrylk after inodelk failed" +#define DHT_MSG_NO_DISK_USAGE_STATUS_STR "no du stats" +#define DHT_MSG_LINK_INODE_FAILED_STR "linking inode failed" +#define DHT_MSG_SELFHEAL_FAILED_STR "Directory selfheal failed" +#define DHT_MSG_NO_MDS_SUBVOL_STR "No mds subvol" +#define DHT_MSG_LIST_XATTRS_FAILED_STR "failed to list xattrs" +#define DHT_MSG_RESET_INTER_XATTR_FAILED_STR "Failed to reset internal xattr" +#define DHT_MSG_MDS_DOWN_UNABLE_TO_SET_STR \ + "mds subvol is down, unable to set xattr" +#define DHT_MSG_DIR_ATTR_HEAL_FAILED_STR \ + "Directory attr heal failed. Failed to set uid/gid" +#define DHT_MSG_WIND_UNLOCK_FAILED_STR \ + "Winding unlock failed: stale locks left on brick" +#define DHT_MSG_COMMIT_HASH_FAILED_STR "Directory commit hash updaten failed" +#define DHT_MSG_LK_ARRAY_INFO_STR "lk info" +#define DHT_MSG_UNLOCK_GFID_FAILED_STR \ + "unlock failed on gfid: stale lock might be left" +#define DHT_MSG_UNLOCKING_FAILED_STR "unlocking failed" +#define DHT_MSG_UNLOCK_FOLLOW_ENTRYLK_STR "not unlocking following entrylks" +#define DHT_MSG_COPY_FRAME_FAILED_STR "copy frame failed" +#define DHT_MSG_UNLOCK_FOLLOW_LOCKS_STR "not unlocking following locks" +#define DHT_MSG_INODELK_FAILED_STR "inodelk failed on subvol" +#define DHT_MSG_LOCK_FRAME_FAILED_STR "memory allocation failed for lock_frame" +#define DHT_MSG_LOCAL_LOCK_INIT_FAILED_STR "dht_local_lock_init failed" +#define DHT_MSG_ENTRYLK_FAILED_AFT_INODELK_STR \ + "dht_blocking_entrylk failed after taking inodelk" +#define DHT_MSG_BLOCK_INODELK_FAILED_STR "dht_blocking_inodelk failed" +#define DHT_MSG_CALLOC_FAILED_STR "calloc failed" +#define DHT_MSG_LOCK_ALLOC_FAILED_STR "lock allocation failed" +#define DHT_MSG_ALLOC_FRAME_FAILED_NOT_UNLOCKING_FOLLOWING_ENTRYLKS_STR \ + "cannot allocate a frame, not unlocking following entrylks" +#define DHT_MSG_LOCAL_LOCKS_STORE_FAILED_UNLOCKING_FOLLOWING_ENTRYLK_STR \ + "storing locks in local failed, not unlocking following entrylks" +#define DHT_MSG_DST_NULL_SET_FAILED_STR \ + "src or dst is NULL, Failed to set dictionary value" + +#endif /* _DHT_MESSAGES_H_ */ diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c index 4f78f5203cb..8ba8082bd86 100644 --- a/xlators/cluster/dht/src/dht-rebalance.c +++ b/xlators/cluster/dht/src/dht-rebalance.c @@ -8,90 +8,143 @@ cases as published by the Free Software Foundation. */ - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - #include "dht-common.h" -#include "xlator.h" -#include <signal.h> +#include <glusterfs/syscall.h> #include <fnmatch.h> +#include <signal.h> +#include <glusterfs/events.h> +#include "glusterfs/compat-errno.h" // for ENODATA on BSD + +#define GF_DISK_SECTOR_SIZE 512 +#define DHT_REBALANCE_PID 4242 /* Change it if required */ +#define DHT_REBALANCE_BLKSIZE 1048576 /* 1 MB */ +#define MAX_MIGRATE_QUEUE_COUNT 500 +#define MIN_MIGRATE_QUEUE_COUNT 200 +#define MAX_REBAL_TYPE_SIZE 16 +#define FILE_CNT_INTERVAL 600 /* 10 mins */ +#define ESTIMATE_START_INTERVAL 600 /* 10 mins */ +#define HARDLINK_MIG_INPROGRESS -2 +#define SKIP_MIGRATION_FD_POSITIVE -3 +#ifndef MAX +#define MAX(a, b) (((a) > (b)) ? (a) : (b)) +#endif -#define GF_DISK_SECTOR_SIZE 512 -#define DHT_REBALANCE_PID 4242 /* Change it if required */ -#define DHT_REBALANCE_BLKSIZE (128 * 1024) +#define GF_CRAWL_INDEX_MOVE(idx, sv_cnt) \ + { \ + idx++; \ + idx %= sv_cnt; \ + } -static int -dht_write_with_holes (xlator_t *to, fd_t *fd, struct iovec *vec, int count, - int32_t size, off_t offset, struct iobref *iobref) +uint64_t g_totalfiles = 0; +uint64_t g_totalsize = 0; + +void +gf_defrag_free_dir_dfmeta(struct dir_dfmeta *meta, int local_subvols_cnt) { - int i = 0; - int ret = -1; - int start_idx = 0; - int tmp_offset = 0; - int write_needed = 0; - int buf_len = 0; - int size_pending = 0; - char *buf = NULL; - - /* loop through each vector */ - for (i = 0; i < count; i++) { - buf = vec[i].iov_base; - buf_len = vec[i].iov_len; - - for (start_idx = 0; (start_idx + GF_DISK_SECTOR_SIZE) <= buf_len; - start_idx += GF_DISK_SECTOR_SIZE) { - - if (mem_0filled (buf + start_idx, GF_DISK_SECTOR_SIZE) != 0) { - write_needed = 1; - continue; - } + int i = 0; + + if (meta) { + for (i = 0; i < local_subvols_cnt; i++) { + if (meta->equeue) + gf_dirent_free(&meta->equeue[i]); + if (meta->lfd && meta->lfd[i]) + fd_unref(meta->lfd[i]); + } - if (write_needed) { - ret = syncop_write (to, fd, (buf + tmp_offset), - (start_idx - tmp_offset), - (offset + tmp_offset), - iobref, 0); - /* 'path' will be logged in calling function */ - if (ret < 0) { - gf_log (THIS->name, GF_LOG_WARNING, - "failed to write (%s)", - strerror (-ret)); - ret = -1; - goto out; - } - - write_needed = 0; - } - tmp_offset = start_idx + GF_DISK_SECTOR_SIZE; - } + GF_FREE(meta->equeue); + GF_FREE(meta->head); + GF_FREE(meta->iterator); + GF_FREE(meta->offset_var); + GF_FREE(meta->fetch_entries); + GF_FREE(meta->lfd); + GF_FREE(meta); + } +} - if ((start_idx < buf_len) || write_needed) { - /* This means, last chunk is not yet written.. write it */ - ret = syncop_write (to, fd, (buf + tmp_offset), - (buf_len - tmp_offset), - (offset + tmp_offset), iobref, 0); - if (ret < 0) { - /* 'path' will be logged in calling function */ - gf_log (THIS->name, GF_LOG_WARNING, - "failed to write (%s)", - strerror (-ret)); - ret = -1; - goto out; - } - } +void +gf_defrag_free_container(struct dht_container *container) +{ + if (container) { + gf_dirent_entry_free(container->df_entry); - size_pending = (size - buf_len); - if (!size_pending) - break; + if (container->parent_loc) { + loc_wipe(container->parent_loc); } - ret = size; -out: - return ret; + GF_FREE(container->parent_loc); + GF_FREE(container); + } +} + +void +dht_set_global_defrag_error(gf_defrag_info_t *defrag, int ret) +{ + LOCK(&defrag->lock); + { + defrag->global_error = ret; + } + UNLOCK(&defrag->lock); + return; +} + +static int +dht_send_rebalance_event(xlator_t *this, int cmd, gf_defrag_status_t status) +{ + int ret = -1; + char *volname = NULL; + char *tmpstr = NULL; + char *ptr = NULL; + char *suffix = "-dht"; + int len = 0; + + eventtypes_t event = EVENT_LAST; + + switch (status) { + case GF_DEFRAG_STATUS_COMPLETE: + event = EVENT_VOLUME_REBALANCE_COMPLETE; + break; + case GF_DEFRAG_STATUS_FAILED: + event = EVENT_VOLUME_REBALANCE_FAILED; + break; + case GF_DEFRAG_STATUS_STOPPED: + event = EVENT_VOLUME_REBALANCE_STOP; + break; + default: + break; + } + + /* DHT volume */ + len = strlen(this->name) - strlen(suffix); + tmpstr = gf_strdup(this->name); + if (tmpstr) { + ptr = tmpstr + len; + if (!strcmp(ptr, suffix)) { + tmpstr[len] = '\0'; + volname = tmpstr; + } + } + + if (!volname) { + /* Better than nothing */ + volname = this->name; + } + + if (event != EVENT_LAST) { + gf_event(event, "volume=%s", volname); + } + + GF_FREE(tmpstr); + return ret; +} + +static void +dht_strip_out_acls(dict_t *dict) +{ + if (dict) { + dict_del(dict, "trusted.SGI_ACL_FILE"); + dict_del(dict, POSIX_ACL_ACCESS_XATTR); + } } /* @@ -130,115 +183,316 @@ be converted to "0" in dht_migrate_file. */ int32_t -gf_defrag_handle_hardlink (xlator_t *this, loc_t *loc, dict_t *xattrs, - struct iatt *stbuf) +gf_defrag_handle_hardlink(xlator_t *this, loc_t *loc, int *fop_errno) { - int32_t ret = -1; - xlator_t *cached_subvol = NULL; - xlator_t *hashed_subvol = NULL; - xlator_t *linkto_subvol = NULL; - data_t *data = NULL; - struct iatt iatt = {0,}; - int32_t op_errno = 0; - dht_conf_t *conf = NULL; - - GF_VALIDATE_OR_GOTO ("defrag", loc, out); - GF_VALIDATE_OR_GOTO ("defrag", loc->name, out); - GF_VALIDATE_OR_GOTO ("defrag", stbuf, out); - GF_VALIDATE_OR_GOTO ("defrag", this, out); - GF_VALIDATE_OR_GOTO ("defrag", xattrs, out); - GF_VALIDATE_OR_GOTO ("defrag", this->private, out); - - conf = this->private; - - if (uuid_is_null (loc->pargfid)) { - gf_log ("", GF_LOG_ERROR, "loc->pargfid is NULL for " - "%s", loc->path); - goto out; - } + int32_t ret = -1; + xlator_t *cached_subvol = NULL; + xlator_t *hashed_subvol = NULL; + xlator_t *linkto_subvol = NULL; + data_t *data = NULL; + struct iatt iatt = { + 0, + }; + int32_t op_errno = 0; + dht_conf_t *conf = NULL; + gf_loglevel_t loglevel = 0; + dict_t *link_xattr = NULL; + dict_t *dict = NULL; + dict_t *xattr_rsp = NULL; + struct iatt stbuf = { + 0, + }; + + *fop_errno = EINVAL; + + GF_VALIDATE_OR_GOTO("defrag", loc, out); + GF_VALIDATE_OR_GOTO("defrag", loc->name, out); + GF_VALIDATE_OR_GOTO("defrag", this, out); + GF_VALIDATE_OR_GOTO("defrag", this->private, out); + + conf = this->private; + + if (gf_uuid_is_null(loc->pargfid)) { + gf_msg("", GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED, + "Migrate file failed :" + "loc->pargfid is NULL for %s", + loc->path); + *fop_errno = EINVAL; + ret = -1; + goto out; + } + + if (gf_uuid_is_null(loc->gfid)) { + gf_msg("", GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED, + "Migrate file failed :" + "loc->gfid is NULL for %s", + loc->path); + *fop_errno = EINVAL; + ret = -1; + goto out; + } - if (uuid_is_null (loc->gfid)) { - gf_log ("", GF_LOG_ERROR, "loc->gfid is NULL for " - "%s", loc->path); - goto out; + link_xattr = dict_new(); + if (!link_xattr) { + ret = -1; + *fop_errno = ENOMEM; + goto out; + } + + /* + Parallel migration can lead to migration of the hard link multiple + times which can lead to data loss. Hence, adding a fresh lookup to + decide whether migration is required or not. + + Elaborating the scenario for let say 10 hardlinks [link{1..10}]: + Let say the first hard link "link1" does the setxattr of the + new hashed subvolume info on the cached file. As there are multiple + threads working, we might have already all the links created on the + new hashed by the time we reach hardlink let say link5. Now the + number of links on hashed is equal to that of cached. Hence, file + migration will happen for link6. + + Cached Hashed + --------T link6 rwxrwxrwx link6 + + Now post above state all the link file on the cached will be zero + byte linkto files. Hence, if we still do migration for the following + files link{7..10}, we will end up migrating 0 data leading to data + loss. + Hence, a lookup can make sure whether we need to migrate the + file or not. + */ + + dict = dict_new(); + if (!dict) { + ret = -1; + *fop_errno = ENOMEM; + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY, + "could not allocate memory for dict"); + goto out; + } + + ret = dict_set_int32(dict, conf->link_xattr_name, 256); + if (ret) { + *fop_errno = ENOMEM; + ret = -1; + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED, + "Migrate file failed:" + "%s: failed to set 'linkto' key in dict", + loc->path); + goto out; + } + + ret = syncop_lookup(this, loc, &stbuf, NULL, dict, &xattr_rsp); + if (ret) { + /*Ignore ENOENT and ESTALE as file might have been + migrated already*/ + if (-ret == ENOENT || -ret == ESTALE) { + ret = -2; + goto out; } - - cached_subvol = dht_subvol_get_cached (this, loc->inode); - if (!cached_subvol) { - gf_log (this->name, GF_LOG_ERROR, "Failed to get cached subvol" - " for %s on %s", loc->name, this->name); - goto out; + gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED, + "Migrate file failed:%s lookup failed with ret = %d", loc->path, + ret); + *fop_errno = -ret; + ret = -1; + goto out; + } + + cached_subvol = dht_subvol_get_cached(this, loc->inode); + if (!cached_subvol) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED, + "Migrate file failed :" + "Failed to get cached subvol" + " for %s on %s", + loc->name, this->name); + *fop_errno = EINVAL; + ret = -1; + goto out; + } + + hashed_subvol = dht_subvol_get_hashed(this, loc); + if (!hashed_subvol) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED, + "Migrate file failed :" + "Failed to get hashed subvol" + " for %s on %s", + loc->name, this->name); + *fop_errno = EINVAL; + ret = -1; + goto out; + } + + /* Hardlink migration happens only with remove-brick. So this condition will + * be true only when the migration has happened. In case hardlinks are + * migrated for rebalance case, remove this check. Having this check here + * avoid redundant calls below*/ + if (hashed_subvol == cached_subvol) { + ret = -2; + goto out; + } + + gf_log(this->name, GF_LOG_INFO, + "Attempting to migrate hardlink %s " + "with gfid %s from %s -> %s", + loc->name, uuid_utoa(loc->gfid), cached_subvol->name, + hashed_subvol->name); + + data = dict_get(xattr_rsp, conf->link_xattr_name); + /* set linkto on cached -> hashed if not present, else link it */ + if (!data) { + ret = dict_set_str(link_xattr, conf->link_xattr_name, + hashed_subvol->name); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED, + "Migrate file failed :" + "Failed to set dictionary value:" + " key = %s for %s", + conf->link_xattr_name, loc->name); + *fop_errno = ENOMEM; + ret = -1; + goto out; } - hashed_subvol = dht_subvol_get_hashed (this, loc); - if (!hashed_subvol) { - gf_log (this->name, GF_LOG_ERROR, "Failed to get hashed subvol" - " for %s on %s", loc->name, this->name); - goto out; + ret = syncop_setxattr(cached_subvol, loc, link_xattr, 0, NULL, NULL); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED, + "Migrate file failed :" + "Linkto setxattr failed %s -> %s", + cached_subvol->name, loc->name); + *fop_errno = -ret; + ret = -1; + goto out; } - gf_log (this->name, GF_LOG_INFO, "Attempting to migrate hardlink %s " - "with gfid %s from %s -> %s", loc->name, uuid_utoa (loc->gfid), - cached_subvol->name, hashed_subvol->name); - data = dict_get (xattrs, conf->link_xattr_name); - /* set linkto on cached -> hashed if not present, else link it */ - if (!data) { - ret = dict_set_str (xattrs, conf->link_xattr_name, - hashed_subvol->name); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "Failed to set " - "linkto xattr in dict for %s", loc->name); - goto out; - } + gf_msg_debug(this->name, 0, + "hardlink target subvol created on %s " + ",cached %s, file %s", + hashed_subvol->name, cached_subvol->name, loc->path); - ret = syncop_setxattr (cached_subvol, loc, xattrs, 0); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "Linkto setxattr " - "failed %s -> %s (%s)", cached_subvol->name, - loc->name, strerror (-ret)); - ret = -1; - goto out; - } - ret = -2; - goto out; + ret = -2; + goto out; + } else { + linkto_subvol = dht_linkfile_subvol(this, NULL, NULL, xattr_rsp); + if (!linkto_subvol) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SUBVOL_ERROR, + "Failed to get " + "linkto subvol for %s", + loc->name); } else { - linkto_subvol = dht_linkfile_subvol (this, NULL, NULL, xattrs); - if (!linkto_subvol) { - gf_log (this->name, GF_LOG_ERROR, "Failed to get " - "linkto subvol for %s", loc->name); - } else { - hashed_subvol = linkto_subvol; - } - - ret = syncop_link (hashed_subvol, loc, loc); - if (ret) { - op_errno = -ret; - ret = -1; - gf_log (this->name, GF_LOG_ERROR, "link of %s -> %s" - " failed on subvol %s (%s)", loc->name, - uuid_utoa(loc->gfid), - hashed_subvol->name, strerror (op_errno)); - if (op_errno != EEXIST) - goto out; - } + hashed_subvol = linkto_subvol; } - ret = syncop_lookup (hashed_subvol, loc, NULL, &iatt, NULL, NULL); + + ret = syncop_link(hashed_subvol, loc, loc, &iatt, NULL, NULL); if (ret) { - gf_log (this->name, GF_LOG_ERROR, "Failed lookup %s on %s (%s)" - , loc->name, hashed_subvol->name, strerror (-ret)); - ret = -1; + op_errno = -ret; + ret = -1; + + loglevel = (op_errno == EEXIST) ? GF_LOG_DEBUG : GF_LOG_ERROR; + gf_msg(this->name, loglevel, op_errno, + DHT_MSG_MIGRATE_HARDLINK_FILE_FAILED, + "link of %s -> %s" + " failed on subvol %s", + loc->name, uuid_utoa(loc->gfid), hashed_subvol->name); + if (op_errno != EEXIST) { + *fop_errno = op_errno; goto out; + } + } else { + gf_msg_debug(this->name, 0, + "syncop_link successful for" + " hardlink %s on subvol %s, cached %s", + loc->path, hashed_subvol->name, cached_subvol->name); } + } - if (iatt.ia_nlink == stbuf->ia_nlink) { - ret = dht_migrate_file (this, loc, cached_subvol, hashed_subvol, - GF_DHT_MIGRATE_HARDLINK_IN_PROGRESS); - if (ret) - goto out; - } + ret = syncop_lookup(hashed_subvol, loc, &iatt, NULL, NULL, NULL); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED, + "Migrate file failed :Failed lookup %s on %s ", loc->name, + hashed_subvol->name); + + *fop_errno = -ret; + ret = -1; + goto out; + } + + /* There is a race where on the target subvol for the hardlink + * (note: hash subvol for the hardlink might differ from this), some + * other client(non-rebalance) would have created a linkto file for that + * hardlink as part of lookup. So let say there are 10 hardlinks, on the + * 5th hardlink it self the hardlinks might have migrated. Now for + * (6..10th) hardlinks the cached and target would be same as the file + * has already migrated. Hence this check is needed */ + if (cached_subvol == hashed_subvol) { + gf_msg_debug(this->name, 0, + "source %s and destination %s " + "for hardlink %s are same", + cached_subvol->name, hashed_subvol->name, loc->path); ret = -2; + goto out; + } + + if (iatt.ia_nlink == stbuf.ia_nlink) { + ret = dht_migrate_file(this, loc, cached_subvol, hashed_subvol, + GF_DHT_MIGRATE_HARDLINK_IN_PROGRESS, fop_errno); + if (ret) { + goto out; + } + } + ret = -2; out: + if (link_xattr) + dict_unref(link_xattr); + + if (xattr_rsp) + dict_unref(xattr_rsp); + + if (dict) + dict_unref(dict); + + return ret; +} + +static int +__check_file_has_hardlink(xlator_t *this, loc_t *loc, struct iatt *stbuf, + dict_t *xattrs, int flags, gf_defrag_info_t *defrag, + dht_conf_t *conf, int *fop_errno) +{ + int ret = 0; + + if (flags == GF_DHT_MIGRATE_HARDLINK_IN_PROGRESS) { + ret = 0; return ret; + } + if (stbuf->ia_nlink > 1) { + /* support for decomission */ + if (flags == GF_DHT_MIGRATE_HARDLINK) { + synclock_lock(&conf->link_lock); + ret = gf_defrag_handle_hardlink(this, loc, fop_errno); + synclock_unlock(&conf->link_lock); + /* + Returning zero will force the file to be remigrated. + Checkout gf_defrag_handle_hardlink for more information. + */ + if (ret && ret != -2) { + gf_msg(this->name, GF_LOG_WARNING, 0, + DHT_MSG_MIGRATE_FILE_FAILED, + "Migrate file failed:" + "%s: failed to migrate file with link", + loc->path); + } + } else { + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_MIGRATE_FILE_FAILED, + "Migration skipped for:" + "%s: file has hardlinks", + loc->path); + *fop_errno = ENOTSUP; + ret = 1; + } + } + + return ret; } /* @@ -249,491 +503,962 @@ out: gf_defrag_handle_hardlink for description of "returning -2") -1 : failure */ -static inline int -__is_file_migratable (xlator_t *this, loc_t *loc, - struct iatt *stbuf, dict_t *xattrs, int flags) +static int +__is_file_migratable(xlator_t *this, loc_t *loc, struct iatt *stbuf, + dict_t *xattrs, int flags, gf_defrag_info_t *defrag, + dht_conf_t *conf, int *fop_errno) { - int ret = -1; + int ret = -1; + int lock_count = 0; + + if (IA_ISDIR(stbuf->ia_type)) { + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_MIGRATE_FILE_FAILED, + "Migrate file failed:" + "%s: migrate-file called on directory", + loc->path); + *fop_errno = EISDIR; + ret = -1; + goto out; + } - if (IA_ISDIR (stbuf->ia_type)) { - gf_log (this->name, GF_LOG_WARNING, - "%s: migrate-file called on directory", loc->path); - ret = -1; - goto out; + if (!conf->lock_migration_enabled) { + ret = dict_get_int32(xattrs, GLUSTERFS_POSIXLK_COUNT, &lock_count); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_MIGRATE_FILE_FAILED, + "Migrate file failed:" + "%s: Unable to get lock count for file", + loc->path); + *fop_errno = EINVAL; + ret = -1; + goto out; } - if (flags == GF_DHT_MIGRATE_HARDLINK_IN_PROGRESS) { - ret = 0; - goto out; + if (lock_count) { + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_MIGRATE_FILE_FAILED, + "Migrate file failed: %s: File has locks." + " Skipping file migration", + loc->path); + *fop_errno = ENOTSUP; + ret = 1; + goto out; } - if (stbuf->ia_nlink > 1) { - /* support for decomission */ - if (flags == GF_DHT_MIGRATE_HARDLINK) { - ret = gf_defrag_handle_hardlink (this, loc, - xattrs, stbuf); - - /* - Returning zero will force the file to be remigrated. - Checkout gf_defrag_handle_hardlink for more information. - */ - if (ret && ret != -2) { - gf_log (this->name, GF_LOG_WARNING, - "%s: failed to migrate file with link", - loc->path); - } - } else { - gf_log (this->name, GF_LOG_WARNING, - "%s: file has hardlinks", loc->path); - ret = -ENOTSUP; - } - goto out; - } - - ret = 0; + } + /* Check if file has hardlink*/ + ret = __check_file_has_hardlink(this, loc, stbuf, xattrs, flags, defrag, + conf, fop_errno); out: - return ret; + return ret; } -static inline int -__dht_rebalance_create_dst_file (xlator_t *to, xlator_t *from, loc_t *loc, struct iatt *stbuf, - dict_t *dict, fd_t **dst_fd, dict_t *xattr) +static int +__dht_rebalance_create_dst_file(xlator_t *this, xlator_t *to, xlator_t *from, + loc_t *loc, struct iatt *stbuf, fd_t **dst_fd, + int *fop_errno, int file_has_holes) { - xlator_t *this = NULL; - int ret = -1; - fd_t *fd = NULL; - struct iatt new_stbuf = {0,}; - dht_conf_t *conf = NULL; - - this = THIS; - conf = this->private; - - ret = dict_set_static_bin (dict, "gfid-req", stbuf->ia_gfid, 16); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, - "%s: failed to set gfid in dict for create", loc->path); - goto out; - } - - ret = dict_set_str (dict, conf->link_xattr_name, from->name); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, - "%s: failed to set gfid in dict for create", loc->path); - goto out; + int ret = -1; + int ret2 = -1; + fd_t *fd = NULL; + struct iatt new_stbuf = { + 0, + }; + struct iatt check_stbuf = { + 0, + }; + dht_conf_t *conf = NULL; + dict_t *dict = NULL; + dict_t *xdata = NULL; + + conf = this->private; + + dict = dict_new(); + if (!dict) { + *fop_errno = ENOMEM; + ret = -1; + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY, + "dictionary allocation failed for" + "path:%s", + loc->path); + goto out; + } + ret = dict_set_gfuuid(dict, "gfid-req", stbuf->ia_gfid, true); + if (ret) { + *fop_errno = ENOMEM; + ret = -1; + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, + "%s: failed to set dictionary value: key = gfid-req", loc->path); + goto out; + } + + ret = dict_set_str(dict, conf->link_xattr_name, from->name); + if (ret) { + *fop_errno = ENOMEM; + ret = -1; + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, + "%s: failed to set dictionary value: key = %s ", loc->path, + conf->link_xattr_name); + goto out; + } + + fd = fd_create(loc->inode, DHT_REBALANCE_PID); + if (!fd) { + *fop_errno = ENOMEM; + ret = -1; + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_MIGRATE_FILE_FAILED, + "%s: fd create failed (destination)", loc->path); + goto out; + } + + xdata = dict_new(); + if (!xdata) { + *fop_errno = ENOMEM; + ret = -1; + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_MIGRATE_FILE_FAILED, + "%s: dict_new failed)", loc->path); + goto out; + } + + ret = dict_set_int32_sizen(xdata, GF_CLEAN_WRITE_PROTECTION, 1); + if (ret) { + *fop_errno = ENOMEM; + ret = -1; + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, + "%s: failed to set dictionary value: key = %s ", loc->path, + GF_CLEAN_WRITE_PROTECTION); + goto out; + } + + ret = syncop_lookup(to, loc, &new_stbuf, NULL, xdata, NULL); + if (!ret) { + /* File exits in the destination, check if gfid matches */ + if (gf_uuid_compare(stbuf->ia_gfid, new_stbuf.ia_gfid) != 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_GFID_MISMATCH, + "file %s exists in %s with different gfid", loc->path, + to->name); + *fop_errno = EINVAL; + ret = -1; + goto out; } - - fd = fd_create (loc->inode, DHT_REBALANCE_PID); - if (!fd) { - gf_log (this->name, GF_LOG_ERROR, - "%s: fd create failed (destination) (%s)", - loc->path, strerror (errno)); - ret = -1; - goto out; + } + if ((ret < 0) && (-ret != ENOENT)) { + /* File exists in destination, but not accessible */ + gf_msg(THIS->name, GF_LOG_WARNING, -ret, DHT_MSG_MIGRATE_FILE_FAILED, + "%s: failed to lookup file", loc->path); + *fop_errno = -ret; + ret = -1; + goto out; + } + + /* Create the destination with LINKFILE mode, and linkto xattr, + if the linkfile already exists, just open the file */ + if (!ret) { + /* + * File already present, just open the file. + */ + ret = syncop_open(to, loc, O_RDWR, fd, NULL, NULL); + if (ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED, + "failed to open %s on %s", loc->path, to->name); + *fop_errno = -ret; + ret = -1; + goto out; } - - ret = syncop_lookup (to, loc, NULL, &new_stbuf, NULL, NULL); - if (!ret) { - /* File exits in the destination, check if gfid matches */ - if (uuid_compare (stbuf->ia_gfid, new_stbuf.ia_gfid) != 0) { - gf_log (this->name, GF_LOG_ERROR, - "file %s exits in %s with different gfid", - loc->path, to->name); - fd_unref (fd); - goto out; - } + } else { + ret = syncop_create(to, loc, O_RDWR, DHT_LINKFILE_MODE, fd, &new_stbuf, + dict, NULL); + if (ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED, + "failed to create %s on %s", loc->path, to->name); + *fop_errno = -ret; + ret = -1; + goto out; } - if ((ret < 0) && (-ret != ENOENT)) { - /* File exists in destination, but not accessible */ - gf_log (THIS->name, GF_LOG_WARNING, - "%s: failed to lookup file (%s)", - loc->path, strerror (-ret)); - ret = -1; - goto out; + } + + fd_bind(fd); + + /*Reason of doing lookup after create again: + *In the create, there is some time-gap between opening fd at the + *server (posix_layer) and binding it in server (incrementing fd count), + *so if in that time-gap, if other process sends unlink considering it + *as a linkto file, because inode->fd count will be 0, so file will be + *unlinked at the backend. And because further operations are performed + *on fd, so though migration will be done but will end with no file + *at the backend. + */ + + ret = syncop_lookup(to, loc, &check_stbuf, NULL, NULL, NULL); + if (!ret) { + if (gf_uuid_compare(stbuf->ia_gfid, check_stbuf.ia_gfid) != 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_GFID_MISMATCH, + "file %s exists in %s with different gfid," + "found in lookup after create", + loc->path, to->name); + *fop_errno = EINVAL; + ret = -1; + goto out; } - - /* Create the destination with LINKFILE mode, and linkto xattr, - if the linkfile already exists, it will just open the file */ - ret = syncop_create (to, loc, O_RDWR, DHT_LINKFILE_MODE, fd, - dict, &new_stbuf); - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, - "failed to create %s on %s (%s)", - loc->path, to->name, strerror (-ret)); - ret = -1; - goto out; + } + + if (-ret == ENOENT) { + gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED, + "%s: file does not exist" + "on %s", + loc->path, to->name); + *fop_errno = -ret; + ret = -1; + goto out; + } + + ret = syncop_fsetattr(to, fd, stbuf, (GF_SET_ATTR_UID | GF_SET_ATTR_GID), + NULL, NULL, NULL, NULL); + if (ret < 0) { + *fop_errno = -ret; + gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED, + "chown failed for %s on %s", loc->path, to->name); + } + + /* No need to bother about 0 byte size files */ + if (stbuf->ia_size > 0) { + if (conf->use_fallocate && !file_has_holes) { + ret = syncop_fallocate(to, fd, 0, 0, stbuf->ia_size, NULL, NULL); + if (ret < 0) { + if (ret == -EOPNOTSUPP || ret == -EINVAL || ret == -ENOSYS) { + conf->use_fallocate = _gf_false; + } else { + gf_msg(this->name, GF_LOG_ERROR, -ret, + DHT_MSG_MIGRATE_FILE_FAILED, + "fallocate failed for %s on %s", loc->path, + to->name); + + *fop_errno = -ret; + + /* fallocate does not release the space + * in some cases + */ + ret2 = syncop_ftruncate(to, fd, 0, NULL, NULL, NULL, NULL); + if (ret2 < 0) { + gf_msg(this->name, GF_LOG_WARNING, -ret2, + DHT_MSG_MIGRATE_FILE_FAILED, + "ftruncate failed for " + "%s on %s", + loc->path, to->name); + } + goto out; + } + } + } else { + ret = syncop_ftruncate(to, fd, stbuf->ia_size, NULL, NULL, NULL, + NULL); + if (ret < 0) { + *fop_errno = -ret; + gf_msg(this->name, GF_LOG_WARNING, -ret, + DHT_MSG_MIGRATE_FILE_FAILED, + "ftruncate failed for %s on %s", loc->path, to->name); + } } + } - ret = syncop_fsetxattr (to, fd, xattr, 0); - if (ret < 0) - gf_log (this->name, GF_LOG_WARNING, - "%s: failed to set xattr on %s (%s)", - loc->path, to->name, strerror (-ret)); + /* success */ + ret = 0; - ret = syncop_ftruncate (to, fd, stbuf->ia_size); - if (ret < 0) - gf_log (this->name, GF_LOG_ERROR, - "ftruncate failed for %s on %s (%s)", - loc->path, to->name, strerror (-ret)); + if (dst_fd) + *dst_fd = fd; - ret = syncop_fsetattr (to, fd, stbuf, - (GF_SET_ATTR_UID | GF_SET_ATTR_GID), - NULL, NULL); - if (ret < 0) - gf_log (this->name, GF_LOG_ERROR, - "chown failed for %s on %s (%s)", - loc->path, to->name, strerror (-ret)); +out: + if (ret) { + if (fd) { + fd_unref(fd); + } + } + if (dict) + dict_unref(dict); - if (dst_fd) - *dst_fd = fd; + if (xdata) + dict_unref(xdata); - /* success */ - ret = 0; - -out: - return ret; + return ret; } -static inline int -__dht_check_free_space (xlator_t *to, xlator_t *from, loc_t *loc, - struct iatt *stbuf, int flag) +static int +__dht_check_free_space(xlator_t *this, xlator_t *to, xlator_t *from, loc_t *loc, + struct iatt *stbuf, int flag, dht_conf_t *conf, + gf_boolean_t *target_changed, xlator_t **new_subvol, + int *fop_errno) { - struct statvfs src_statfs = {0,}; - struct statvfs dst_statfs = {0,}; - int ret = -1; - xlator_t *this = NULL; - - uint64_t src_statfs_blocks = 1; - uint64_t dst_statfs_blocks = 1; - - this = THIS; - - ret = syncop_statfs (from, loc, &src_statfs); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, - "failed to get statfs of %s on %s (%s)", - loc->path, from->name, strerror (-ret)); - ret = -1; - goto out; + struct statvfs src_statfs = { + 0, + }; + struct statvfs dst_statfs = { + 0, + }; + int ret = -1; + dict_t *xdata = NULL; + dht_layout_t *layout = NULL; + uint64_t src_statfs_blocks = 1; + uint64_t dst_statfs_blocks = 1; + double dst_post_availspacepercent = 0; + double src_post_availspacepercent = 0; + uint64_t file_blocks = 0; + uint64_t src_total_blocks = 0; + uint64_t dst_total_blocks = 0; + + xdata = dict_new(); + if (!xdata) { + *fop_errno = ENOMEM; + ret = -1; + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY, + "failed to allocate dictionary"); + goto out; + } + + ret = dict_set_int8(xdata, GF_INTERNAL_IGNORE_DEEM_STATFS, 1); + if (ret) { + gf_log(this->name, GF_LOG_ERROR, + "Failed to set " GF_INTERNAL_IGNORE_DEEM_STATFS " in dict"); + ret = -1; + *fop_errno = ENOMEM; + goto out; + } + + ret = syncop_statfs(from, loc, &src_statfs, xdata, NULL); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED, + "failed to get statfs of %s on %s", loc->path, from->name); + *fop_errno = -ret; + ret = -1; + goto out; + } + + ret = syncop_statfs(to, loc, &dst_statfs, xdata, NULL); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED, + "failed to get statfs of %s on %s", loc->path, to->name); + *fop_errno = -ret; + ret = -1; + goto out; + } + + gf_msg_debug(this->name, 0, + "min_free_disk - %f , block available - %" PRId64 + ", block size - %lu", + conf->min_free_disk, dst_statfs.f_bavail, dst_statfs.f_bsize); + + dst_statfs_blocks = dst_statfs.f_bavail * + (dst_statfs.f_frsize / GF_DISK_SECTOR_SIZE); + + src_statfs_blocks = src_statfs.f_bavail * + (src_statfs.f_frsize / GF_DISK_SECTOR_SIZE); + + dst_total_blocks = dst_statfs.f_blocks * + (dst_statfs.f_frsize / GF_DISK_SECTOR_SIZE); + + src_total_blocks = src_statfs.f_blocks * + (src_statfs.f_frsize / GF_DISK_SECTOR_SIZE); + + /* if force option is given, do not check for space @ dst. + * Check only if space is avail for the file */ + if (flag != GF_DHT_MIGRATE_DATA) + goto check_avail_space; + + /* Check: + During rebalance `migrate-data` - Destination subvol experiences + a `reduction` in 'blocks' of free space, at the same time source + subvol gains certain 'blocks' of free space. A valid check is + necessary here to avoid erroneous move to destination where + the space could be scantily available. + With heterogeneous brick support, an actual space comparison could + prevent any files being migrated to newly added bricks if they are + smaller then the free space available on the existing bricks. + */ + if (!conf->use_fallocate) { + file_blocks = stbuf->ia_size + GF_DISK_SECTOR_SIZE - 1; + file_blocks /= GF_DISK_SECTOR_SIZE; + + if (file_blocks >= dst_statfs_blocks) { + dst_statfs_blocks = 0; + } else { + dst_statfs_blocks -= file_blocks; } + } + + src_post_availspacepercent = ((src_statfs_blocks + file_blocks) * 100) / + src_total_blocks; + + dst_post_availspacepercent = (dst_statfs_blocks * 100) / dst_total_blocks; + + if (dst_post_availspacepercent < src_post_availspacepercent) { + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_MIGRATE_FILE_FAILED, + "data movement of file " + "{blocks:%" PRIu64 + " name:(%s)} would result in " + "dst node (%s:%" PRIu64 + ") having lower disk " + "space than the source node (%s:%" PRIu64 + ")" + ".Skipping file.", + stbuf->ia_blocks, loc->path, to->name, dst_statfs_blocks, + from->name, src_statfs_blocks); + + /* this is not a 'failure', but we don't want to + consider this as 'success' too :-/ */ + *fop_errno = ENOSPC; + ret = 1; + goto out; + } - ret = syncop_statfs (to, loc, &dst_statfs); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, - "failed to get statfs of %s on %s (%s)", - loc->path, to->name, strerror (-ret)); - ret = -1; - goto out; +check_avail_space: + if (conf->disk_unit == 'p' && dst_statfs.f_blocks) { + dst_post_availspacepercent = (dst_statfs_blocks * 100) / + dst_total_blocks; + + gf_msg_debug(this->name, 0, + "file : %s, post_availspacepercent" + " : %lf f_bavail : %" PRIu64 " min-free-disk: %lf", + loc->path, dst_post_availspacepercent, dst_statfs.f_bavail, + conf->min_free_disk); + + if (dst_post_availspacepercent < conf->min_free_disk) { + gf_msg(this->name, GF_LOG_WARNING, 0, 0, + "Write will cross min-free-disk for " + "file - %s on subvol - %s. Looking " + "for new subvol", + loc->path, to->name); + + goto find_new_subvol; + } else { + ret = 0; + goto out; } + } - /* if force option is given, do not check for space @ dst. - * Check only if space is avail for the file */ - if (flag != GF_DHT_MIGRATE_DATA) - goto check_avail_space; + if (conf->disk_unit != 'p') { + if ((dst_statfs_blocks * GF_DISK_SECTOR_SIZE) < conf->min_free_disk) { + gf_msg_debug(this->name, 0, + "file : %s, destination frsize: %lu " + "f_bavail : %" PRIu64 " min-free-disk: %lf", + loc->path, dst_statfs.f_frsize, dst_statfs.f_bavail, + conf->min_free_disk); - /* Check: - During rebalance `migrate-data` - Destination subvol experiences - a `reduction` in 'blocks' of free space, at the same time source - subvol gains certain 'blocks' of free space. A valid check is - necessary here to avoid errorneous move to destination where - the space could be scantily available. - */ - if (stbuf) { - dst_statfs_blocks = ((dst_statfs.f_bavail * - dst_statfs.f_bsize) / - GF_DISK_SECTOR_SIZE); - src_statfs_blocks = ((src_statfs.f_bavail * - src_statfs.f_bsize) / - GF_DISK_SECTOR_SIZE); - if ((dst_statfs_blocks - stbuf->ia_blocks) < - (src_statfs_blocks + stbuf->ia_blocks)) { - gf_log (this->name, GF_LOG_WARNING, - "data movement attempted from node (%s) with" - " higher disk space to a node (%s) with " - "lesser disk space (%s)", from->name, - to->name, loc->path); - - /* this is not a 'failure', but we don't want to - consider this as 'success' too :-/ */ - ret = 1; - goto out; - } - } -check_avail_space: - if (((dst_statfs.f_bavail * dst_statfs.f_bsize) / - GF_DISK_SECTOR_SIZE) < stbuf->ia_blocks) { - gf_log (this->name, GF_LOG_ERROR, - "data movement attempted from node (%s) with " - "to node (%s) which does not have required free space" - " for %s", from->name, to->name, loc->path); - ret = 1; - goto out; + gf_msg(this->name, GF_LOG_WARNING, 0, 0, + "write will" + " cross min-free-disk for file - %s on subvol -" + " %s. looking for new subvol", + loc->path, to->name); + + goto find_new_subvol; + + } else { + ret = 0; + goto out; } + } +find_new_subvol: + layout = dht_layout_get(this, loc->parent); + if (!layout) { + gf_log(this->name, GF_LOG_ERROR, "Layout is NULL"); + *fop_errno = EINVAL; + ret = -1; + goto out; + } + + *new_subvol = dht_subvol_with_free_space_inodes(this, to, from, layout, + stbuf->ia_size); + if ((!(*new_subvol)) || (*new_subvol == from)) { + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_SUBVOL_INSUFF_SPACE, + "Could not find any subvol" + " with space accommodating the file - %s. Consider " + "adding bricks", + loc->path); + + *target_changed = _gf_false; + *fop_errno = ENOSPC; + ret = -1; + } else { + gf_msg(this->name, GF_LOG_INFO, 0, 0, + "new target found - %s" + " for file - %s", + (*new_subvol)->name, loc->path); + *target_changed = _gf_true; ret = 0; + } + out: - return ret; + if (xdata) + dict_unref(xdata); + return ret; } -static inline int -__dht_rebalance_migrate_data (xlator_t *from, xlator_t *to, fd_t *src, fd_t *dst, - uint64_t ia_size, int hole_exists) +static int +__dht_rebalance_migrate_data(xlator_t *this, gf_defrag_info_t *defrag, + xlator_t *from, xlator_t *to, fd_t *src, fd_t *dst, + uint64_t ia_size, int hole_exists, int *fop_errno) { - int ret = 0; - int count = 0; - off_t offset = 0; - struct iovec *vector = NULL; - struct iobref *iobref = NULL; - uint64_t total = 0; - size_t read_size = 0; - - /* if file size is '0', no need to enter this loop */ - while (total < ia_size) { - read_size = (((ia_size - total) > DHT_REBALANCE_BLKSIZE) ? - DHT_REBALANCE_BLKSIZE : (ia_size - total)); - ret = syncop_readv (from, src, read_size, - offset, 0, &vector, &count, &iobref); - if (!ret || (ret < 0)) { - break; + int ret = 0; + int count = 0; + off_t offset = 0; + off_t data_offset = 0; + off_t hole_offset = 0; + struct iovec *vector = NULL; + struct iobref *iobref = NULL; + uint64_t total = 0; + size_t read_size = 0; + size_t data_block_size = 0; + dict_t *xdata = NULL; + dht_conf_t *conf = NULL; + + conf = this->private; + + /* if file size is '0', no need to enter this loop */ + while (total < ia_size) { + /* This is a regular file - read it sequentially */ + if (!hole_exists) { + read_size = (((ia_size - total) > DHT_REBALANCE_BLKSIZE) + ? DHT_REBALANCE_BLKSIZE + : (ia_size - total)); + } else { + /* This is a sparse file - read only the data segments in the file + */ + + /* If the previous data block is fully copied, find the next data + * segment + * starting at the offset of the last read and written byte, */ + if (data_block_size <= 0) { + ret = syncop_seek(from, src, offset, GF_SEEK_DATA, NULL, + &data_offset); + if (ret) { + if (ret == -ENXIO) + ret = 0; /* No more data segments */ + else + *fop_errno = -ret; /* Error occurred */ + + break; } - if (hole_exists) - ret = dht_write_with_holes (to, dst, vector, count, - ret, offset, iobref); - else - ret = syncop_writev (to, dst, vector, count, - offset, iobref, 0); - if (ret < 0) { + /* If the position of the current data segment is greater than + * the position of the next hole, find the next hole in order to + * calculate the length of the new data segment */ + if (data_offset > hole_offset) { + /* Starting at the offset of the last data segment, find the + * next hole */ + ret = syncop_seek(from, src, data_offset, GF_SEEK_HOLE, + NULL, &hole_offset); + if (ret) { + /* If an error occurred here it's a real error because + * if the seek for a data segment was successful then + * necessarily another hole must exist (EOF is a hole) + */ + *fop_errno = -ret; break; - } - offset += ret; - total += ret; + } - GF_FREE (vector); - if (iobref) - iobref_unref (iobref); - iobref = NULL; - vector = NULL; + /* Calculate the total size of the current data block */ + data_block_size = hole_offset - data_offset; + } + } else { + /* There is still data in the current segment, move the + * data_offset to the position of the last written byte */ + data_offset = offset; + } + + /* Calculate how much data needs to be read and written. If the data + * segment's length is bigger than DHT_REBALANCE_BLKSIZE, read and + * write DHT_REBALANCE_BLKSIZE data length and the rest in the + * next iteration(s) */ + read_size = ((data_block_size > DHT_REBALANCE_BLKSIZE) + ? DHT_REBALANCE_BLKSIZE + : data_block_size); + + /* Calculate the remaining size of the data block - maybe there's no + * need to seek for data in the next iteration */ + data_block_size -= read_size; + + /* Set offset to the offset of the data segment so read and write + * will have the correct position */ + offset = data_offset; } - if (iobref) - iobref_unref (iobref); - GF_FREE (vector); - - if (ret >= 0) - ret = 0; - else - ret = -1; - return ret; -} + ret = syncop_readv(from, src, read_size, offset, 0, &vector, &count, + &iobref, NULL, NULL, NULL); + if (!ret || (ret < 0)) { + if (!ret) { + /* File was probably truncated*/ + ret = -1; + *fop_errno = ENOSPC; + } else { + *fop_errno = -ret; + } + break; + } -static inline int -__dht_rebalance_open_src_file (xlator_t *from, xlator_t *to, loc_t *loc, - struct iatt *stbuf, fd_t **src_fd) -{ - int ret = 0; - fd_t *fd = NULL; - dict_t *dict = NULL; - xlator_t *this = NULL; - struct iatt iatt = {0,}; - dht_conf_t *conf = NULL; - - this = THIS; - conf = this->private; + if (!conf->force_migration) { + if (!xdata) { + xdata = dict_new(); + if (!xdata) { + gf_msg("dht", GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED, + "insufficient memory"); + ret = -1; + *fop_errno = ENOMEM; + break; + } - fd = fd_create (loc->inode, DHT_REBALANCE_PID); - if (!fd) { - gf_log (this->name, GF_LOG_ERROR, - "%s: fd create failed (source)", loc->path); - ret = -1; - goto out; + /* Fail this write and abort rebalance if we + * detect a write from client since migration of + * this file started. This is done to avoid + * potential data corruption due to out of order + * writes from rebalance and client to the same + * region (as compared between src and dst + * files). See + * https://github.com/gluster/glusterfs/issues/308 + * for more details. + */ + ret = dict_set_int32_sizen(xdata, GF_AVOID_OVERWRITE, 1); + if (ret) { + gf_msg("dht", GF_LOG_ERROR, 0, ENOMEM, + "failed to set dict"); + ret = -1; + *fop_errno = ENOMEM; + break; + } + } } - ret = syncop_open (from, loc, O_RDWR, fd); + ret = syncop_writev(to, dst, vector, count, offset, iobref, 0, NULL, + NULL, xdata, NULL); if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, - "failed to open file %s on %s (%s)", - loc->path, from->name, strerror (-ret)); - ret = -1; - goto out; + *fop_errno = -ret; + break; } + offset += ret; + total += ret; + + GF_FREE(vector); + if (iobref) + iobref_unref(iobref); + iobref = NULL; + vector = NULL; + } + if (iobref) + iobref_unref(iobref); + GF_FREE(vector); + + if (ret >= 0) + ret = 0; + else ret = -1; - dict = dict_new (); - if (!dict) - goto out; - ret = dict_set_str (dict, conf->link_xattr_name, to->name); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, - "failed to set xattr in dict for %s (linkto:%s)", - loc->path, to->name); - goto out; - } + if (xdata) { + dict_unref(xdata); + } - /* Once the migration starts, the source should have 'linkto' key set - to show which is the target, so other clients can work around it */ - ret = syncop_setxattr (from, loc, dict, 0); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, - "failed to set xattr on %s in %s (%s)", - loc->path, from->name, strerror (-ret)); - ret = -1; - goto out; - } + return ret; +} - /* mode should be (+S+T) to indicate migration is in progress */ - iatt.ia_prot = stbuf->ia_prot; - iatt.ia_type = stbuf->ia_type; - iatt.ia_prot.sticky = 1; - iatt.ia_prot.sgid = 1; +static int +__dht_rebalance_open_src_file(xlator_t *this, xlator_t *from, xlator_t *to, + loc_t *loc, struct iatt *stbuf, fd_t **src_fd, + gf_boolean_t *clean_src, int *fop_errno) +{ + int ret = 0; + fd_t *fd = NULL; + dict_t *dict = NULL; + struct iatt iatt = { + 0, + }; + dht_conf_t *conf = NULL; + + conf = this->private; + + *clean_src = _gf_false; + + fd = fd_create(loc->inode, DHT_REBALANCE_PID); + if (!fd) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED, + "%s: fd create failed (source)", loc->path); + *fop_errno = ENOMEM; + ret = -1; + goto out; + } + + ret = syncop_open(from, loc, O_RDWR, fd, NULL, NULL); + if (ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED, + "failed to open file %s on %s", loc->path, from->name); + *fop_errno = -ret; + ret = -1; + goto out; + } - ret = syncop_setattr (from, loc, &iatt, GF_SET_ATTR_MODE, NULL, NULL); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, - "failed to set mode on %s in %s (%s)", - loc->path, from->name, strerror (-ret)); - ret = -1; - goto out; - } + fd_bind(fd); - if (src_fd) - *src_fd = fd; + if (src_fd) + *src_fd = fd; - /* success */ - ret = 0; + ret = -1; + dict = dict_new(); + if (!dict) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED, + "%s: Could not allocate memory for dict", loc->path); + *fop_errno = ENOMEM; + ret = -1; + goto out; + } + + ret = dict_set_str(dict, conf->link_xattr_name, to->name); + if (ret) { + gf_log(this->name, GF_LOG_ERROR, + "failed to set xattr in dict for %s (linkto:%s)", loc->path, + to->name); + *fop_errno = ENOMEM; + ret = -1; + goto out; + } + + /* Once the migration starts, the source should have 'linkto' key set + to show which is the target, so other clients can work around it */ + ret = syncop_setxattr(from, loc, dict, 0, NULL, NULL); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED, + "failed to set xattr on %s in %s", loc->path, from->name); + *fop_errno = -ret; + ret = -1; + goto out; + } + + /* Reset source mode/xattr if migration fails*/ + *clean_src = _gf_true; + + /* mode should be (+S+T) to indicate migration is in progress */ + iatt.ia_prot = stbuf->ia_prot; + iatt.ia_type = stbuf->ia_type; + iatt.ia_prot.sticky = 1; + iatt.ia_prot.sgid = 1; + + ret = syncop_setattr(from, loc, &iatt, GF_SET_ATTR_MODE, NULL, NULL, NULL, + NULL); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED, + "failed to set mode on %s in %s", loc->path, from->name); + *fop_errno = -ret; + ret = -1; + goto out; + } + + /* success */ + ret = 0; out: - if (dict) - dict_unref (dict); + if (dict) + dict_unref(dict); - return ret; + return ret; } int -migrate_special_files (xlator_t *this, xlator_t *from, xlator_t *to, loc_t *loc, - struct iatt *buf) +migrate_special_files(xlator_t *this, xlator_t *from, xlator_t *to, loc_t *loc, + struct iatt *buf, int *fop_errno) { - int ret = -1; - dict_t *rsp_dict = NULL; - dict_t *dict = NULL; - char *link = NULL; - struct iatt stbuf = {0,}; - dht_conf_t *conf = this->private; - - dict = dict_new (); - if (!dict) - goto out; + int ret = -1; + dict_t *rsp_dict = NULL; + dict_t *dict = NULL; + char *link = NULL; + struct iatt stbuf = { + 0, + }; + dht_conf_t *conf = this->private; + + dict = dict_new(); + if (!dict) { + *fop_errno = ENOMEM; + ret = -1; + goto out; + } + ret = dict_set_int32(dict, conf->link_xattr_name, 256); + if (ret) { + *fop_errno = ENOMEM; + ret = -1; + gf_log(this->name, GF_LOG_ERROR, + "%s: failed to set 'linkto' key in dict", loc->path); + goto out; + } + + /* check in the destination if the file is link file */ + ret = syncop_lookup(to, loc, &stbuf, NULL, dict, &rsp_dict); + if ((ret < 0) && (-ret != ENOENT)) { + gf_msg(this->name, GF_LOG_WARNING, -ret, DHT_MSG_MIGRATE_FILE_FAILED, + "%s: lookup failed", loc->path); + *fop_errno = -ret; + ret = -1; + goto out; + } + + /* we no more require this key */ + dict_del(dict, conf->link_xattr_name); + + /* file exists in target node, only if it is 'linkfile' its valid, + otherwise, error out */ + if (!ret) { + if (!check_is_linkfile(loc->inode, &stbuf, rsp_dict, + conf->link_xattr_name)) { + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_MIGRATE_FILE_FAILED, + "%s: file exists in destination", loc->path); + *fop_errno = EINVAL; + ret = -1; + goto out; + } - ret = dict_set_int32 (dict, conf->link_xattr_name, 256); + /* as file is linkfile, delete it */ + ret = syncop_unlink(to, loc, NULL, NULL); if (ret) { - gf_log (this->name, GF_LOG_ERROR, - "%s: failed to set 'linkto' key in dict", loc->path); - goto out; + gf_msg(this->name, GF_LOG_WARNING, -ret, + DHT_MSG_MIGRATE_FILE_FAILED, + "%s: failed to delete the linkfile", loc->path); + *fop_errno = -ret; + ret = -1; + goto out; } + } - /* check in the destination if the file is link file */ - ret = syncop_lookup (to, loc, dict, &stbuf, &rsp_dict, NULL); - if ((ret < 0) && (-ret != ENOENT)) { - gf_log (this->name, GF_LOG_WARNING, "%s: lookup failed (%s)", - loc->path, strerror (-ret)); - ret = -1; - goto out; + /* Set the gfid of the source file in dict */ + ret = dict_set_gfuuid(dict, "gfid-req", buf->ia_gfid, true); + if (ret) { + *fop_errno = ENOMEM; + ret = -1; + gf_log(this->name, GF_LOG_ERROR, + "%s: failed to set gfid in dict for create", loc->path); + goto out; + } + + /* Create the file in target */ + if (IA_ISLNK(buf->ia_type)) { + /* Handle symlinks separately */ + ret = syncop_readlink(from, loc, &link, buf->ia_size, NULL, NULL); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, -ret, + DHT_MSG_MIGRATE_FILE_FAILED, + "%s: readlink on symlink failed", loc->path); + *fop_errno = -ret; + ret = -1; + goto out; } - /* we no more require this key */ - dict_del (dict, conf->link_xattr_name); + ret = syncop_symlink(to, loc, link, 0, dict, NULL); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, -ret, + DHT_MSG_MIGRATE_FILE_FAILED, "%s: creating symlink failed", + loc->path); + *fop_errno = -ret; + ret = -1; + goto out; + } - /* file exists in target node, only if it is 'linkfile' its valid, - otherwise, error out */ - if (!ret) { - if (!check_is_linkfile (loc->inode, &stbuf, rsp_dict, - conf->link_xattr_name)) { - gf_log (this->name, GF_LOG_WARNING, - "%s: file exists in destination", loc->path); - ret = -1; - goto out; - } + goto done; + } - /* as file is linkfile, delete it */ - ret = syncop_unlink (to, loc); - if (ret) { - gf_log (this->name, GF_LOG_WARNING, - "%s: failed to delete the linkfile (%s)", - loc->path, strerror (-ret)); - ret = -1; - goto out; - } - } + ret = syncop_mknod(to, loc, st_mode_from_ia(buf->ia_prot, buf->ia_type), + makedev(ia_major(buf->ia_rdev), ia_minor(buf->ia_rdev)), + 0, dict, NULL); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, -ret, DHT_MSG_MIGRATE_FILE_FAILED, + "%s: mknod failed", loc->path); + *fop_errno = -ret; + ret = -1; + goto out; + } - /* Set the gfid of the source file in dict */ - ret = dict_set_static_bin (dict, "gfid-req", buf->ia_gfid, 16); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, - "%s: failed to set gfid in dict for create", loc->path); - goto out; - } +done: + ret = syncop_setattr(to, loc, buf, + (GF_SET_ATTR_MTIME | GF_SET_ATTR_UID | + GF_SET_ATTR_GID | GF_SET_ATTR_MODE), + NULL, NULL, NULL, NULL); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, -ret, DHT_MSG_MIGRATE_FILE_FAILED, + "%s: failed to perform setattr on %s", loc->path, to->name); + *fop_errno = -ret; + } + + ret = syncop_unlink(from, loc, NULL, NULL); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, -ret, DHT_MSG_MIGRATE_FILE_FAILED, + "%s: unlink failed", loc->path); + *fop_errno = -ret; + ret = -1; + } - /* Create the file in target */ - if (IA_ISLNK (buf->ia_type)) { - /* Handle symlinks separately */ - ret = syncop_readlink (from, loc, &link, buf->ia_size); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "%s: readlink on symlink failed (%s)", - loc->path, strerror (-ret)); - ret = -1; - goto out; - } +out: + GF_FREE(link); + if (dict) + dict_unref(dict); - ret = syncop_symlink (to, loc, link, dict, 0); - if (ret) { - gf_log (this->name, GF_LOG_WARNING, - "%s: creating symlink failed (%s)", - loc->path, strerror (-ret)); - ret = -1; - goto out; - } + if (rsp_dict) + dict_unref(rsp_dict); - goto done; - } + return ret; +} - ret = syncop_mknod (to, loc, st_mode_from_ia (buf->ia_prot, - buf->ia_type), - makedev (ia_major (buf->ia_rdev), - ia_minor (buf->ia_rdev)), dict, 0); - if (ret) { - gf_log (this->name, GF_LOG_WARNING, "%s: mknod failed (%s)", - loc->path, strerror (-ret)); - ret = -1; - goto out; - } +static int +__dht_migration_cleanup_src_file(xlator_t *this, loc_t *loc, fd_t *fd, + xlator_t *from, ia_prot_t *src_ia_prot) +{ + int ret = -1; + dht_conf_t *conf = NULL; + struct iatt new_stbuf = { + 0, + }; + + if (!this || !fd || !from || !src_ia_prot) { + goto out; + } + + conf = this->private; + + /*Revert source mode and xattr changes*/ + ret = syncop_fstat(from, fd, &new_stbuf, NULL, NULL); + if (ret < 0) { + /* Failed to get the stat info */ + gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED, + "Migrate file cleanup failed: failed to fstat " + "file %s on %s ", + loc->path, from->name); + ret = -1; + goto out; + } -done: - ret = syncop_setattr (to, loc, buf, - (GF_SET_ATTR_UID | GF_SET_ATTR_GID | - GF_SET_ATTR_MODE), NULL, NULL); - if (ret) { - gf_log (this->name, GF_LOG_WARNING, - "%s: failed to perform setattr on %s (%s)", - loc->path, to->name, strerror (-ret)); - ret = -1; - } + /* Remove the sticky bit and sgid bit set, reset it to 0*/ + if (!src_ia_prot->sticky) + new_stbuf.ia_prot.sticky = 0; - ret = syncop_unlink (from, loc); - if (ret) { - gf_log (this->name, GF_LOG_WARNING, "%s: unlink failed (%s)", - loc->path, strerror (-ret)); - ret = -1; - } + if (!src_ia_prot->sgid) + new_stbuf.ia_prot.sgid = 0; -out: - if (dict) - dict_unref (dict); + ret = syncop_fsetattr(from, fd, &new_stbuf, + (GF_SET_ATTR_GID | GF_SET_ATTR_MODE), NULL, NULL, + NULL, NULL); - if (rsp_dict) - dict_unref (rsp_dict); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, -ret, DHT_MSG_MIGRATE_FILE_FAILED, + "Migrate file cleanup failed:" + "%s: failed to perform fsetattr on %s ", + loc->path, from->name); + ret = -1; + goto out; + } + + ret = syncop_fremovexattr(from, fd, conf->link_xattr_name, 0, NULL); + if (ret) { + gf_log(this->name, GF_LOG_WARNING, + "%s: failed to remove linkto xattr on %s (%s)", loc->path, + from->name, strerror(-ret)); + ret = -1; + goto out; + } - return ret; + ret = 0; + +out: + return ret; } /* @@ -744,1165 +1469,3234 @@ out: 1 : not a failure, but we can't migrate data as of now */ int -dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, - int flag) +dht_migrate_file(xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, + int flag, int *fop_errno) { - int ret = -1; - struct iatt new_stbuf = {0,}; - struct iatt stbuf = {0,}; - struct iatt empty_iatt = {0,}; - ia_prot_t src_ia_prot = {0,}; - fd_t *src_fd = NULL; - fd_t *dst_fd = NULL; - dict_t *dict = NULL; - dict_t *xattr = NULL; - dict_t *xattr_rsp = NULL; - int file_has_holes = 0; - dht_conf_t *conf = this->private; - - gf_log (this->name, GF_LOG_INFO, "%s: attempting to move from %s to %s", - loc->path, from->name, to->name); - - dict = dict_new (); - if (!dict) - goto out; + int ret = -1; + struct iatt new_stbuf = { + 0, + }; + struct iatt stbuf = { + 0, + }; + struct iatt empty_iatt = { + 0, + }; + ia_prot_t src_ia_prot = { + 0, + }; + fd_t *src_fd = NULL; + fd_t *dst_fd = NULL; + dict_t *dict = NULL; + dict_t *xattr = NULL; + dict_t *xattr_rsp = NULL; + int file_has_holes = 0; + dht_conf_t *conf = this->private; + int rcvd_enoent_from_src = 0; + struct gf_flock flock = { + 0, + }; + struct gf_flock plock = { + 0, + }; + loc_t tmp_loc = { + 0, + }; + loc_t parent_loc = { + 0, + }; + gf_boolean_t inodelk_locked = _gf_false; + gf_boolean_t entrylk_locked = _gf_false; + gf_boolean_t p_locked = _gf_false; + int lk_ret = -1; + gf_defrag_info_t *defrag = NULL; + gf_boolean_t clean_src = _gf_false; + gf_boolean_t clean_dst = _gf_false; + int log_level = GF_LOG_INFO; + gf_boolean_t delete_src_linkto = _gf_true; + lock_migration_info_t locklist; + dict_t *meta_dict = NULL; + gf_boolean_t meta_locked = _gf_false; + gf_boolean_t target_changed = _gf_false; + xlator_t *new_target = NULL; + xlator_t *old_target = NULL; + xlator_t *hashed_subvol = NULL; + fd_t *linkto_fd = NULL; + dict_t *xdata = NULL; + + if (from == to) { + gf_msg_debug(this->name, 0, + "destination and source are same. file %s" + " might have migrated already", + loc->path); + ret = 0; + goto out; + } + + gf_log(this->name, log_level, "%s: attempting to move from %s to %s", + loc->path, from->name, to->name); - ret = dict_set_int32 (dict, conf->link_xattr_name, 256); + dict = dict_new(); + if (!dict) { + ret = -1; + *fop_errno = ENOMEM; + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY, + "Could not allocate memory for dict"); + goto out; + } + ret = dict_set_int32(dict, conf->link_xattr_name, 256); + if (ret) { + *fop_errno = ENOMEM; + ret = -1; + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED, + "Migrate file failed:" + "%s: failed to set 'linkto' key in dict", + loc->path); + goto out; + } + + /* Do not migrate file in case lock migration is not enabled on the + * volume*/ + if (!conf->lock_migration_enabled) { + ret = dict_set_int32(dict, GLUSTERFS_POSIXLK_COUNT, sizeof(int32_t)); if (ret) { - gf_log (this->name, GF_LOG_ERROR, - "%s: failed to set 'linkto' key in dict", loc->path); - goto out; + *fop_errno = ENOMEM; + ret = -1; + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED, + "Migrate file failed: %s: failed to " + "set " GLUSTERFS_POSIXLK_COUNT " key in dict", + loc->path); + goto out; + } + } else { + gf_msg(this->name, GF_LOG_INFO, 0, 0, + "locks will be migrated" + " for file: %s", + loc->path); + } + + /* The file is locked to prevent a rename during a migration. Renames + * and migrations on the file at the same time can lead to data loss. + */ + + ret = dht_build_parent_loc(this, &parent_loc, loc, fop_errno); + if (ret < 0) { + ret = -1; + gf_msg(this->name, GF_LOG_WARNING, *fop_errno, + DHT_MSG_MIGRATE_FILE_FAILED, + "%s: failed to build parent loc, which is needed to " + "acquire entrylk to synchronize with renames on this " + "path. Skipping migration", + loc->path); + goto out; + } + + hashed_subvol = dht_subvol_get_hashed(this, loc); + if (hashed_subvol == NULL) { + ret = -1; + gf_msg(this->name, GF_LOG_WARNING, EINVAL, DHT_MSG_MIGRATE_FILE_FAILED, + "%s: cannot find hashed subvol which is needed to " + "synchronize with renames on this path. " + "Skipping migration", + loc->path); + goto out; + } + + flock.l_type = F_WRLCK; + + tmp_loc.inode = inode_ref(loc->inode); + gf_uuid_copy(tmp_loc.gfid, loc->gfid); + tmp_loc.path = gf_strdup(loc->path); + + /* this inodelk happens with flock.owner being zero. But to synchronize + * hardlink migration we need to have different lkowner for each migration + * Filed a bug here: https://bugzilla.redhat.com/show_bug.cgi?id=1468202 to + * track the fix for this. Currently synclock takes care of synchronizing + * hardlink migration. Once this bug is fixed we can avoid taking synclock + */ + ret = syncop_inodelk(from, DHT_FILE_MIGRATE_DOMAIN, &tmp_loc, F_SETLKW, + &flock, NULL, NULL); + if (ret < 0) { + *fop_errno = -ret; + ret = -1; + gf_msg(this->name, GF_LOG_WARNING, *fop_errno, + DHT_MSG_MIGRATE_FILE_FAILED, + "migrate file failed: " + "%s: failed to lock file on %s", + loc->path, from->name); + goto out; + } + + inodelk_locked = _gf_true; + + /* dht_rename has changed to use entrylk on hashed subvol for + * synchronization. So, rebalance too has to acquire an entrylk on + * hashed subvol. + */ + ret = syncop_entrylk(hashed_subvol, DHT_ENTRY_SYNC_DOMAIN, &parent_loc, + loc->name, ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL, NULL); + if (ret < 0) { + *fop_errno = -ret; + ret = -1; + gf_msg(this->name, GF_LOG_WARNING, *fop_errno, + DHT_MSG_MIGRATE_FILE_FAILED, + "%s: failed to acquire entrylk on subvol %s", loc->path, + hashed_subvol->name); + goto out; + } + + entrylk_locked = _gf_true; + + /* Phase 1 - Data migration is in progress from now on */ + ret = syncop_lookup(from, loc, &stbuf, NULL, dict, &xattr_rsp); + if (ret) { + *fop_errno = -ret; + ret = -1; + gf_msg(this->name, GF_LOG_ERROR, *fop_errno, + DHT_MSG_MIGRATE_FILE_FAILED, + "Migrate file failed:" + "%s: lookup failed on %s", + loc->path, from->name); + goto out; + } + + /* preserve source mode, so set the same to the destination */ + src_ia_prot = stbuf.ia_prot; + + /* Check if file can be migrated */ + ret = __is_file_migratable(this, loc, &stbuf, xattr_rsp, flag, defrag, conf, + fop_errno); + if (ret) { + if (ret == HARDLINK_MIG_INPROGRESS) + ret = 0; + goto out; + } + + /* Take care of the special files */ + if (!IA_ISREG(stbuf.ia_type)) { + /* Special files */ + ret = migrate_special_files(this, from, to, loc, &stbuf, fop_errno); + goto out; + } + + /* Try to preserve 'holes' while migrating data */ + if (stbuf.ia_size > (stbuf.ia_blocks * GF_DISK_SECTOR_SIZE)) + file_has_holes = 1; + + /* create the destination, with required modes/xattr */ + ret = __dht_rebalance_create_dst_file(this, to, from, loc, &stbuf, &dst_fd, + fop_errno, file_has_holes); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, + "Create dst failed" + " on - %s for file - %s", + to->name, loc->path); + goto out; + } + + clean_dst = _gf_true; + + ret = __dht_check_free_space(this, to, from, loc, &stbuf, flag, conf, + &target_changed, &new_target, fop_errno); + if (target_changed) { + /* Can't handle for hardlinks. Marking this as failure */ + if (flag == GF_DHT_MIGRATE_HARDLINK_IN_PROGRESS || stbuf.ia_nlink > 1) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SUBVOL_INSUFF_SPACE, + "Exiting migration for" + " file - %s. flag - %d, stbuf.ia_nlink - %d", + loc->path, flag, stbuf.ia_nlink); + ret = -1; + goto out; } - /* Phase 1 - Data migration is in progress from now on */ - ret = syncop_lookup (from, loc, dict, &stbuf, &xattr_rsp, NULL); + ret = syncop_ftruncate(to, dst_fd, 0, NULL, NULL, NULL, NULL); if (ret) { - gf_log (this->name, GF_LOG_ERROR, "%s: lookup failed on %s (%s)", - loc->path, from->name, strerror (-ret)); - ret = -1; - goto out; + gf_log(this->name, GF_LOG_WARNING, + "%s: failed to perform truncate on %s (%s)", loc->path, + to->name, strerror(-ret)); } - /* we no more require this key */ - dict_del (dict, conf->link_xattr_name); + syncop_close(dst_fd); + dst_fd = NULL; + + old_target = to; + to = new_target; - /* preserve source mode, so set the same to the destination */ - src_ia_prot = stbuf.ia_prot; + clean_dst = _gf_false; - /* Check if file can be migrated */ - ret = __is_file_migratable (this, loc, &stbuf, xattr_rsp, flag); + /* if the file migration is successful to this new target, then + * update the xattr on the old destination to point the new + * destination. We need to do update this only post migration + * as in case of failure the linkto needs to point to the source + * subvol */ + ret = __dht_rebalance_create_dst_file( + this, to, from, loc, &stbuf, &dst_fd, fop_errno, file_has_holes); if (ret) { - if (ret == -2) - ret = 0; - goto out; - } - /* Take care of the special files */ - if (!IA_ISREG (stbuf.ia_type)) { - /* Special files */ - ret = migrate_special_files (this, from, to, loc, &stbuf); - goto out; + gf_log(this->name, GF_LOG_ERROR, + "Create dst failed" + " on - %s for file - %s", + to->name, loc->path); + goto out; + } else { + gf_msg(this->name, GF_LOG_INFO, 0, 0, + "destination for file " + "- %s is changed to - %s", + loc->path, to->name); + clean_dst = _gf_true; } + } + + if (ret) { + goto out; + } + + /* Open the source, and also update mode/xattr */ + ret = __dht_rebalance_open_src_file(this, from, to, loc, &stbuf, &src_fd, + &clean_src, fop_errno); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED, + "Migrate file failed: failed to open %s on %s", loc->path, + from->name); + goto out; + } + + /* TODO: move all xattr related operations to fd based operations */ + ret = syncop_listxattr(from, loc, &xattr, NULL, NULL); + if (ret < 0) { + *fop_errno = -ret; + gf_msg(this->name, GF_LOG_WARNING, *fop_errno, + DHT_MSG_MIGRATE_FILE_FAILED, + "Migrate file failed:" + "%s: failed to get xattr from %s", + loc->path, from->name); + ret = -1; + goto out; + } + + /* Copying posix acls to the linkto file messes up the permissions*/ + dht_strip_out_acls(xattr); + + /* Remove the linkto xattr as we don't want to overwrite the value + * set on the dst. + */ + dict_del(xattr, conf->link_xattr_name); + + /* We need to error out if this fails as having the wrong shard xattrs + * set on the dst could cause data corruption + */ + ret = syncop_fsetxattr(to, dst_fd, xattr, 0, NULL, NULL); + if (ret < 0) { + *fop_errno = -ret; + gf_msg(this->name, GF_LOG_WARNING, -ret, DHT_MSG_MIGRATE_FILE_FAILED, + "%s: failed to set xattr on %s", loc->path, to->name); + ret = -1; + goto out; + } - /* TODO: move all xattr related operations to fd based operations */ - ret = syncop_listxattr (from, loc, &xattr); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "%s: failed to get xattr from %s (%s)", - loc->path, from->name, strerror (-ret)); - ret = -1; + if (xattr_rsp) { + /* we no more require this key */ + dict_del(dict, conf->link_xattr_name); + dict_unref(xattr_rsp); + } + + ret = syncop_fstat(from, src_fd, &stbuf, dict, &xattr_rsp); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED, + "Migrate file failed:failed to lookup %s on %s ", loc->path, + from->name); + *fop_errno = -ret; + ret = -1; + goto out; + } + + /* Check again if file has hardlink */ + ret = __check_file_has_hardlink(this, loc, &stbuf, xattr_rsp, flag, defrag, + conf, fop_errno); + if (ret) { + if (ret == HARDLINK_MIG_INPROGRESS) + ret = 0; + goto out; + } + + ret = __dht_rebalance_migrate_data(this, defrag, from, to, src_fd, dst_fd, + stbuf.ia_size, file_has_holes, + fop_errno); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED, + "Migrate file failed: %s: failed to migrate data", loc->path); + + ret = -1; + goto out; + } + + /* TODO: Sync the locks */ + + xdata = dict_new(); + if (!xdata || dict_set_int8(xdata, "last-fsync", 1)) { + gf_log(this->name, GF_LOG_ERROR, + "%s: failed to set last-fsync flag on " + "%s (%s)", + loc->path, to->name, strerror(ENOMEM)); + } + + ret = syncop_fsync(to, dst_fd, 0, NULL, NULL, xdata, NULL); + if (ret) { + gf_log(this->name, GF_LOG_WARNING, "%s: failed to fsync on %s (%s)", + loc->path, to->name, strerror(-ret)); + *fop_errno = -ret; + } + + /* Phase 2 - Data-Migration Complete, Housekeeping updates pending */ + + ret = syncop_fstat(from, src_fd, &new_stbuf, NULL, NULL); + if (ret < 0) { + /* Failed to get the stat info */ + gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED, + "Migrate file failed: failed to fstat file %s on %s ", loc->path, + from->name); + *fop_errno = -ret; + ret = -1; + goto out; + } + + /* Lock the entire source file to prevent clients from taking a + lock on it as dht_lk does not handle file migration. + + This still leaves a small window where conflicting locks can + be granted to different clients. If client1 requests a blocking + lock on the src file, it will be granted after the migrating + process releases its lock. If client2 requests a lock on the dst + data file, it will also be granted, but all FOPs will be redirected + to the dst data file. + */ + + /* Take meta lock */ + + if (conf->lock_migration_enabled) { + meta_dict = dict_new(); + if (!meta_dict) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED, + "dict_new failed"); + + *fop_errno = ENOMEM; + ret = -1; + goto out; } - /* create the destination, with required modes/xattr */ - ret = __dht_rebalance_create_dst_file (to, from, loc, &stbuf, - dict, &dst_fd, xattr); - if (ret) - goto out; + ret = dict_set_str(meta_dict, GLUSTERFS_INTERNAL_FOP_KEY, "yes"); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, + "Failed to set dictionary value: key = %s," + " path = %s", + GLUSTERFS_INTERNAL_FOP_KEY, loc->path); + *fop_errno = ENOMEM; + ret = -1; + goto out; + } - ret = __dht_check_free_space (to, from, loc, &stbuf, flag); + ret = dict_set_int32(meta_dict, GF_META_LOCK_KEY, 1); if (ret) { - goto out; + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED, + "Trace dict_set failed"); + *fop_errno = ENOMEM; + ret = -1; + goto out; } - /* Open the source, and also update mode/xattr */ - ret = __dht_rebalance_open_src_file (from, to, loc, &stbuf, &src_fd); + ret = syncop_setxattr(from, loc, meta_dict, 0, NULL, NULL); if (ret) { - gf_log (this->name, GF_LOG_ERROR, "failed to open %s on %s", - loc->path, from->name); - goto out; + gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED, + "Trace syncop_setxattr metalock failed"); + + *fop_errno = -ret; + ret = -1; + goto out; + } else { + meta_locked = _gf_true; } + } + if (!conf->lock_migration_enabled) { + plock.l_type = F_WRLCK; + plock.l_start = 0; + plock.l_len = 0; + plock.l_whence = SEEK_SET; - ret = syncop_fstat (from, src_fd, &stbuf); + ret = syncop_lk(from, src_fd, F_SETLK, &plock, NULL, NULL); if (ret) { - gf_log (this->name, GF_LOG_ERROR, "failed to lookup %s on %s (%s)", - loc->path, from->name, strerror (-ret)); - ret = -1; - goto out; + gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED, + "Migrate file failed:" + "%s: Failed to lock on %s", + loc->path, from->name); + *fop_errno = -ret; + ret = -1; + goto out; } - /* Try to preserve 'holes' while migrating data */ - if (stbuf.ia_size > (stbuf.ia_blocks * GF_DISK_SECTOR_SIZE)) - file_has_holes = 1; + p_locked = _gf_true; + + } else { + INIT_LIST_HEAD(&locklist.list); + + ret = syncop_getactivelk(from, loc, &locklist, NULL, NULL); + if (ret == 0) { + gf_log(this->name, GF_LOG_INFO, "No active locks on:%s", loc->path); - /* All I/O happens in this function */ - ret = __dht_rebalance_migrate_data (from, to, src_fd, dst_fd, - stbuf.ia_size, file_has_holes); + } else if (ret > 0) { + ret = syncop_setactivelk(to, loc, &locklist, NULL, NULL); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, + DHT_MSG_LOCK_MIGRATION_FAILED, "write lock failed on:%s", + loc->path); + + *fop_errno = -ret; + ret = -1; + goto metaunlock; + } + } else { + gf_msg(this->name, GF_LOG_ERROR, -ret, + DHT_MSG_LOCK_MIGRATION_FAILED, + "getactivelk failed for file: %s", loc->path); + *fop_errno = -ret; + } + } + + /* source would have both sticky bit and sgid bit set, reset it to 0, + and set the source permission on destination, if it was not set + prior to setting rebalance-modes in source */ + if (!src_ia_prot.sticky) + new_stbuf.ia_prot.sticky = 0; + + if (!src_ia_prot.sgid) + new_stbuf.ia_prot.sgid = 0; + + /* TODO: if the source actually had sticky bit, or sgid bit set, + we are not handling it */ + + ret = syncop_fsetattr( + to, dst_fd, &new_stbuf, + (GF_SET_ATTR_UID | GF_SET_ATTR_GID | GF_SET_ATTR_MODE), NULL, NULL, + NULL, NULL); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, -ret, DHT_MSG_MIGRATE_FILE_FAILED, + "Migrate file failed:" + "%s: failed to perform setattr on %s ", + loc->path, to->name); + *fop_errno = -ret; + ret = -1; + goto metaunlock; + } + + /* Because 'futimes' is not portable */ + ret = syncop_setattr(to, loc, &new_stbuf, + (GF_SET_ATTR_MTIME | GF_SET_ATTR_ATIME), NULL, NULL, + NULL, NULL); + if (ret) { + gf_log(this->name, GF_LOG_WARNING, + "%s: failed to perform setattr on %s ", loc->path, to->name); + *fop_errno = -ret; + } + + if (target_changed) { + dict_del(dict, GLUSTERFS_POSIXLK_COUNT); + ret = dict_set_str(dict, conf->link_xattr_name, to->name); if (ret) { - gf_log (this->name, GF_LOG_ERROR, "%s: failed to migrate data", - loc->path); - /* reset the destination back to 0 */ - ret = syncop_ftruncate (to, dst_fd, 0); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, - "%s: failed to reset target size back to 0 (%s)", - loc->path, strerror (-ret)); - } + gf_log(this->name, GF_LOG_ERROR, + "failed to set xattr in dict for %s (linkto:%s)", loc->path, + to->name); + *fop_errno = ENOMEM; + ret = -1; + goto out; + } + ret = syncop_setxattr(old_target, loc, dict, 0, NULL, NULL); + if (ret && -ret != ESTALE && -ret != ENOENT) { + gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED, + "failed to set xattr on %s in %s", loc->path, + old_target->name); + *fop_errno = -ret; + ret = -1; + goto out; + } else if (-ret == ESTALE || -ret == ENOENT) { + /* The failure ESTALE indicates that the linkto + * file on the hashed subvol might have been deleted. + * In this case will create a linkto file with new target + * as linkto xattr value*/ + linkto_fd = fd_create(loc->inode, DHT_REBALANCE_PID); + if (!linkto_fd) { + gf_msg(this->name, GF_LOG_ERROR, errno, + DHT_MSG_MIGRATE_FILE_FAILED, "%s: fd create failed", + loc->path); + *fop_errno = ENOMEM; + ret = -1; + goto out; + } + ret = syncop_create(old_target, loc, O_RDWR, DHT_LINKFILE_MODE, + linkto_fd, NULL, dict, NULL); + if (ret != 0 && -ret != EEXIST && -ret != ESTALE) { + *fop_errno = -ret; ret = -1; + gf_msg(this->name, GF_LOG_ERROR, -ret, + DHT_MSG_MIGRATE_FILE_FAILED, + "failed to create linkto file on %s in %s", loc->path, + old_target->name); goto out; + } else if (ret == 0) { + ret = syncop_fsetattr(old_target, linkto_fd, &stbuf, + (GF_SET_ATTR_UID | GF_SET_ATTR_GID), NULL, + NULL, NULL, NULL); + if (ret < 0) { + *fop_errno = -ret; + gf_msg(this->name, GF_LOG_ERROR, -ret, + DHT_MSG_MIGRATE_FILE_FAILED, + "chown failed for %s on %s", loc->path, + old_target->name); + } + } + } + } + + clean_dst = _gf_false; + + /* Posix acls are not set on DHT linkto files as part of the initial + * initial xattrs set on the dst file, so these need + * to be set on the dst file after the linkto attrs are removed. + * TODO: Optimize this. + */ + if (xattr) { + dict_unref(xattr); + xattr = NULL; + } + + /* Set only the Posix ACLs this time */ + ret = syncop_getxattr(from, loc, &xattr, POSIX_ACL_ACCESS_XATTR, NULL, + NULL); + if (ret < 0) { + if ((-ret != ENODATA) && (-ret != ENOATTR)) { + gf_msg(this->name, GF_LOG_WARNING, -ret, + DHT_MSG_MIGRATE_FILE_FAILED, + "Migrate file failed:" + "%s: failed to get xattr from %s", + loc->path, from->name); + *fop_errno = -ret; + } + } else { + ret = syncop_setxattr(to, loc, xattr, 0, NULL, NULL); + if (ret < 0) { + /* Potential problem here where Posix ACLs will + * not be set on the target file */ + + gf_msg(this->name, GF_LOG_WARNING, -ret, + DHT_MSG_MIGRATE_FILE_FAILED, + "Migrate file failed:" + "%s: failed to set xattr on %s", + loc->path, to->name); + *fop_errno = -ret; + } + } + + /* The src file is being unlinked after this so we don't need + to clean it up */ + clean_src = _gf_false; + + /* Make the source as a linkfile first before deleting it */ + empty_iatt.ia_prot.sticky = 1; + ret = syncop_fsetattr(from, src_fd, &empty_iatt, GF_SET_ATTR_MODE, NULL, + NULL, NULL, NULL); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, -ret, DHT_MSG_MIGRATE_FILE_FAILED, + "Migrate file failed:" + "%s: failed to perform setattr on %s ", + loc->path, from->name); + *fop_errno = -ret; + ret = -1; + goto metaunlock; + } + + /* Free up the data blocks on the source node, as the whole + file is migrated */ + ret = syncop_ftruncate(from, src_fd, 0, NULL, NULL, NULL, NULL); + if (ret) { + gf_log(this->name, GF_LOG_WARNING, + "%s: failed to perform truncate on %s (%s)", loc->path, + from->name, strerror(-ret)); + *fop_errno = -ret; + } + + /* remove the 'linkto' xattr from the destination */ + ret = syncop_fremovexattr(to, dst_fd, conf->link_xattr_name, 0, NULL); + if (ret) { + gf_log(this->name, GF_LOG_WARNING, + "%s: failed to perform removexattr on %s (%s)", loc->path, + to->name, strerror(-ret)); + *fop_errno = -ret; + } + + /* Do a stat and check the gfid before unlink */ + + /* + * Cached file changes its state from non-linkto to linkto file after + * migrating data. If lookup from any other mount-point is performed, + * converted-linkto-cached file will be treated as a stale and will be + * unlinked. But by this time, file is already migrated. So further + * failure because of ENOENT should not be treated as error + */ + + ret = syncop_stat(from, loc, &empty_iatt, NULL, NULL); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, -ret, DHT_MSG_MIGRATE_FILE_FAILED, + "%s: failed to do a stat on %s", loc->path, from->name); + + if (-ret != ENOENT) { + *fop_errno = -ret; + ret = -1; + goto metaunlock; } - /* TODO: Sync the locks */ + rcvd_enoent_from_src = 1; + } - ret = syncop_fsync (to, dst_fd, 0); + if ((gf_uuid_compare(empty_iatt.ia_gfid, loc->gfid) == 0) && + (!rcvd_enoent_from_src) && delete_src_linkto) { + /* take out the source from namespace */ + ret = syncop_unlink(from, loc, NULL, NULL); if (ret) { - gf_log (this->name, GF_LOG_WARNING, - "%s: failed to fsync on %s (%s)", - loc->path, to->name, strerror (-ret)); - ret = -1; + gf_msg(this->name, GF_LOG_WARNING, -ret, + DHT_MSG_MIGRATE_FILE_FAILED, + "%s: failed to perform unlink on %s", loc->path, from->name); + *fop_errno = -ret; + ret = -1; + goto metaunlock; } + } + ret = syncop_lookup(this, loc, NULL, NULL, NULL, NULL); + if (ret) { + gf_msg_debug(this->name, -ret, + "%s: failed to lookup the file on subvolumes", loc->path); + *fop_errno = -ret; + } - /* Phase 2 - Data-Migration Complete, Housekeeping updates pending */ - - ret = syncop_fstat (from, src_fd, &new_stbuf); - if (ret < 0) { - /* Failed to get the stat info */ - gf_log (this->name, GF_LOG_ERROR, - "failed to fstat file %s on %s (%s)", - loc->path, from->name, strerror (-ret)); - ret = -1; - goto out; - } + gf_msg(this->name, log_level, 0, DHT_MSG_MIGRATE_FILE_COMPLETE, + "completed migration of %s from subvolume %s to %s", loc->path, + from->name, to->name); - /* source would have both sticky bit and sgid bit set, reset it to 0, - and set the source permission on destination, if it was not set - prior to setting rebalance-modes in source */ - if (!src_ia_prot.sticky) - new_stbuf.ia_prot.sticky = 0; + ret = 0; - if (!src_ia_prot.sgid) - new_stbuf.ia_prot.sgid = 0; +metaunlock: - /* TODO: if the source actually had sticky bit, or sgid bit set, - we are not handling it */ + if (conf->lock_migration_enabled && meta_locked) { + dict_del(meta_dict, GF_META_LOCK_KEY); - ret = syncop_fsetattr (to, dst_fd, &new_stbuf, - (GF_SET_ATTR_UID | GF_SET_ATTR_GID | - GF_SET_ATTR_MODE), NULL, NULL); + ret = dict_set_int32(meta_dict, GF_META_UNLOCK_KEY, 1); if (ret) { - gf_log (this->name, GF_LOG_WARNING, - "%s: failed to perform setattr on %s (%s)", - loc->path, to->name, strerror (-ret)); - ret = -1; - goto out; - } + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED, + "Trace dict_set failed"); - /* Because 'futimes' is not portable */ - ret = syncop_setattr (to, loc, &new_stbuf, - (GF_SET_ATTR_MTIME | GF_SET_ATTR_ATIME), - NULL, NULL); - if (ret) { - gf_log (this->name, GF_LOG_WARNING, - "%s: failed to perform setattr on %s (%s)", - loc->path, to->name, strerror (-ret)); - ret = -1; + *fop_errno = ENOMEM; + ret = -1; + goto out; } - /* Make the source as a linkfile first before deleting it */ - empty_iatt.ia_prot.sticky = 1; - ret = syncop_fsetattr (from, src_fd, &empty_iatt, - GF_SET_ATTR_MODE, NULL, NULL); - if (ret) { - gf_log (this->name, GF_LOG_WARNING, \ - "%s: failed to perform setattr on %s (%s)", - loc->path, from->name, strerror (-ret)); - ret = -1; - goto out; - } + if (clean_dst == _gf_false) + ret = dict_set_int32(meta_dict, "status", 1); + else + ret = dict_set_int32(meta_dict, "status", 0); - /* Free up the data blocks on the source node, as the whole - file is migrated */ - ret = syncop_ftruncate (from, src_fd, 0); if (ret) { - gf_log (this->name, GF_LOG_WARNING, - "%s: failed to perform truncate on %s (%s)", - loc->path, from->name, strerror (-ret)); - ret = -1; - } + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED, + "Trace dict_set failed"); - /* remove the 'linkto' xattr from the destination */ - ret = syncop_fremovexattr (to, dst_fd, conf->link_xattr_name, 0); - if (ret) { - gf_log (this->name, GF_LOG_WARNING, - "%s: failed to perform removexattr on %s (%s)", - loc->path, to->name, strerror (-ret)); - ret = -1; + *fop_errno = ENOMEM; + ret = -1; + goto out; } - /* Do a stat and check the gfid before unlink */ - ret = syncop_stat (from, loc, &empty_iatt); + ret = syncop_setxattr(from, loc, meta_dict, 0, NULL, NULL); if (ret) { - gf_log (this->name, GF_LOG_WARNING, - "%s: failed to do a stat on %s (%s)", - loc->path, from->name, strerror (-ret)); - ret = -1; - goto out; + gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED, + "Trace syncop_setxattr meta unlock failed"); + + *fop_errno = -ret; + ret = -1; + goto out; } + } - if (uuid_compare (empty_iatt.ia_gfid, loc->gfid) == 0) { - /* take out the source from namespace */ - ret = syncop_unlink (from, loc); - if (ret) { - gf_log (this->name, GF_LOG_WARNING, - "%s: failed to perform unlink on %s (%s)", - loc->path, from->name, strerror (-ret)); - ret = -1; - goto out; - } +out: + if (clean_src) { + /* Revert source mode and xattr changes*/ + lk_ret = __dht_migration_cleanup_src_file(this, loc, src_fd, from, + &src_ia_prot); + if (lk_ret) { + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_MIGRATE_FILE_FAILED, + "%s: failed to cleanup source file on %s", loc->path, + from->name); } + } + + /* reset the destination back to 0 */ + if (clean_dst) { + lk_ret = syncop_ftruncate(to, dst_fd, 0, NULL, NULL, NULL, NULL); + if (lk_ret) { + gf_msg(this->name, GF_LOG_ERROR, -lk_ret, + DHT_MSG_MIGRATE_FILE_FAILED, + "Migrate file failed: " + "%s: failed to reset target size back to 0", + loc->path); + } + } - ret = syncop_lookup (this, loc, NULL, NULL, NULL, NULL); - if (ret) { - gf_log (this->name, GF_LOG_DEBUG, - "%s: failed to lookup the file on subvolumes (%s)", - loc->path, strerror (-ret)); - ret = -1; + if (inodelk_locked) { + flock.l_type = F_UNLCK; + + lk_ret = syncop_inodelk(from, DHT_FILE_MIGRATE_DOMAIN, &tmp_loc, + F_SETLK, &flock, NULL, NULL); + if (lk_ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, -lk_ret, + DHT_MSG_MIGRATE_FILE_FAILED, + "%s: failed to unlock file on %s", loc->path, from->name); } + } + + if (entrylk_locked) { + lk_ret = syncop_entrylk(hashed_subvol, DHT_ENTRY_SYNC_DOMAIN, + &parent_loc, loc->name, ENTRYLK_UNLOCK, + ENTRYLK_UNLOCK, NULL, NULL); + if (lk_ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, -lk_ret, + DHT_MSG_MIGRATE_FILE_FAILED, + "%s: failed to unlock entrylk on %s", loc->path, + hashed_subvol->name); + } + } - gf_log (this->name, GF_LOG_INFO, - "completed migration of %s from subvolume %s to %s", - loc->path, from->name, to->name); + if (p_locked) { + plock.l_type = F_UNLCK; + lk_ret = syncop_lk(from, src_fd, F_SETLK, &plock, NULL, NULL); - ret = 0; -out: - if (dict) - dict_unref (dict); + if (lk_ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, -lk_ret, + DHT_MSG_MIGRATE_FILE_FAILED, + "%s: failed to unlock file on %s", loc->path, from->name); + } + } - if (xattr) - dict_unref (xattr); - if (xattr_rsp) - dict_unref (xattr_rsp); + lk_ret = syncop_removexattr(to, loc, GF_PROTECT_FROM_EXTERNAL_WRITES, NULL, + NULL); + if (lk_ret && (lk_ret != -ENODATA) && (lk_ret != -ENOATTR)) { + gf_msg(this->name, GF_LOG_WARNING, -lk_ret, 0, + "%s: removexattr failed key %s", loc->path, + GF_PROTECT_FROM_EXTERNAL_WRITES); + } - if (dst_fd) - syncop_close (dst_fd); - if (src_fd) - syncop_close (src_fd); + if (dict) + dict_unref(dict); - return ret; -} + if (xattr) + dict_unref(xattr); + if (xattr_rsp) + dict_unref(xattr_rsp); -static int -rebalance_task (void *data) -{ - int ret = -1; - dht_local_t *local = NULL; - call_frame_t *frame = NULL; + if (dst_fd) + syncop_close(dst_fd); - frame = data; + if (src_fd) + syncop_close(src_fd); + if (linkto_fd) + syncop_close(linkto_fd); - local = frame->local; + if (xdata) + dict_unref(xdata); - /* This function is 'synchrounous', hence if it returns, - we are done with the task */ - ret = dht_migrate_file (THIS, &local->loc, local->rebalance.from_subvol, - local->rebalance.target_node, local->flags); + loc_wipe(&tmp_loc); + loc_wipe(&parent_loc); - return ret; + return ret; } static int -rebalance_task_completion (int op_ret, call_frame_t *sync_frame, void *data) +rebalance_task(void *data) { - int ret = -1; - uint64_t layout_int = 0; - dht_layout_t *layout = 0; - xlator_t *this = NULL; - dht_local_t *local = NULL; - int32_t op_errno = EINVAL; - - this = THIS; - local = sync_frame->local; - - if (!op_ret) { - /* Make sure we have valid 'layout' in inode ctx - after the operation */ - ret = inode_ctx_del (local->loc.inode, this, &layout_int); - if (!ret && layout_int) { - layout = (dht_layout_t *)(long)layout_int; - dht_layout_unref (this, layout); - } + int ret = -1; + dht_local_t *local = NULL; + call_frame_t *frame = NULL; + int fop_errno = 0; - ret = dht_layout_preset (this, local->rebalance.target_node, - local->loc.inode); - if (ret) - gf_log (this->name, GF_LOG_WARNING, - "%s: failed to set inode ctx", local->loc.path); - } + frame = data; - if (op_ret == -1) { - /* Failure of migration process, mostly due to write process. - as we can't preserve the exact errno, lets say there was - no space to migrate-data - */ - op_errno = ENOSPC; - } + local = frame->local; - if (op_ret == 1) { - /* migration didn't happen, but is not a failure, let the user - understand that he doesn't have permission to migrate the - file. - */ - op_ret = -1; - op_errno = EPERM; - } + /* This function is 'synchrounous', hence if it returns, + we are done with the task */ + ret = dht_migrate_file(THIS, &local->loc, local->rebalance.from_subvol, + local->rebalance.target_node, local->flags, + &fop_errno); - DHT_STACK_UNWIND (setxattr, sync_frame, op_ret, op_errno, NULL); - return 0; + return ret; +} + +static int +rebalance_task_completion(int op_ret, call_frame_t *sync_frame, void *data) +{ + int32_t op_errno = EINVAL; + + if (op_ret == -1) { + /* Failure of migration process, mostly due to write process. + as we can't preserve the exact errno, lets say there was + no space to migrate-data + */ + op_errno = ENOSPC; + } else if (op_ret == 1) { + /* migration didn't happen, but is not a failure, let the user + understand that he doesn't have permission to migrate the + file. + */ + op_ret = -1; + op_errno = EPERM; + } else if (op_ret != 0) { + op_errno = -op_ret; + op_ret = -1; + } + + DHT_STACK_UNWIND(setxattr, sync_frame, op_ret, op_errno, NULL); + return 0; } int -dht_start_rebalance_task (xlator_t *this, call_frame_t *frame) +dht_start_rebalance_task(xlator_t *this, call_frame_t *frame) { - int ret = -1; + int ret = -1; - ret = synctask_new (this->ctx->env, rebalance_task, - rebalance_task_completion, - frame, frame); - return ret; + ret = synctask_new(this->ctx->env, rebalance_task, + rebalance_task_completion, frame, frame); + return ret; } int -gf_listener_stop (xlator_t *this) +gf_listener_stop(xlator_t *this) { - glusterfs_ctx_t *ctx = NULL; - cmd_args_t *cmd_args = NULL; - int ret = 0; - - ctx = this->ctx; - GF_ASSERT (ctx); - cmd_args = &ctx->cmd_args; - if (cmd_args->sock_file) { - ret = unlink (cmd_args->sock_file); - if (ret && (ENOENT == errno)) { - ret = 0; - } - } - - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "Failed to unlink listener " - "socket %s, error: %s", cmd_args->sock_file, - strerror (errno)); + glusterfs_ctx_t *ctx = NULL; + cmd_args_t *cmd_args = NULL; + int ret = 0; + + ctx = this->ctx; + GF_ASSERT(ctx); + cmd_args = &ctx->cmd_args; + if (cmd_args->sock_file) { + ret = sys_unlink(cmd_args->sock_file); + if (ret && (ENOENT == errno)) { + ret = 0; } - return ret; + } + + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, errno, DHT_MSG_SOCKET_ERROR, + "Failed to unlink listener " + "socket %s", + cmd_args->sock_file); + } + return ret; } void -dht_build_root_inode (xlator_t *this, inode_t **inode) +dht_build_root_inode(xlator_t *this, inode_t **inode) { - inode_table_t *itable = NULL; - uuid_t root_gfid = {0, }; + inode_table_t *itable = NULL; + static uuid_t root_gfid = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1}; - itable = inode_table_new (0, this); - if (!itable) - return; + itable = inode_table_new(0, this); + if (!itable) + return; - root_gfid[15] = 1; - *inode = inode_find (itable, root_gfid); + *inode = inode_find(itable, root_gfid); } void -dht_build_root_loc (inode_t *inode, loc_t *loc) +dht_build_root_loc(inode_t *inode, loc_t *loc) { - loc->path = "/"; - loc->inode = inode; - loc->inode->ia_type = IA_IFDIR; - memset (loc->gfid, 0, 16); - loc->gfid[15] = 1; + loc->path = "/"; + loc->inode = inode; + loc->inode->ia_type = IA_IFDIR; + memset(loc->gfid, 0, 16); + loc->gfid[15] = 1; } - /* return values: 1 -> error, bug ignore and continue 0 -> proceed -1 -> error, handle it */ int32_t -gf_defrag_handle_migrate_error (int32_t op_errno, gf_defrag_info_t *defrag) +gf_defrag_handle_migrate_error(int32_t op_errno, gf_defrag_info_t *defrag) { - /* if errno is not ENOSPC or ENOTCONN, we can still continue - with rebalance process */ - if ((op_errno != ENOSPC) || (op_errno != ENOTCONN)) - return 1; - - if (op_errno == ENOTCONN) { - /* Most probably mount point went missing (mostly due - to a brick down), say rebalance failure to user, - let him restart it if everything is fine */ - defrag->defrag_status = GF_DEFRAG_STATUS_FAILED; - return -1; - } - - if (op_errno == ENOSPC) { - /* rebalance process itself failed, may be - remote brick went down, or write failed due to - disk full etc etc.. */ - defrag->defrag_status = GF_DEFRAG_STATUS_FAILED; - return -1; - } + int ret = 0; + /* if errno is not ENOTCONN, we can still continue + with rebalance process */ + if (op_errno != ENOTCONN) { + ret = 1; + goto out; + } + + if (op_errno == ENOTCONN) { + /* Most probably mount point went missing (mostly due + to a brick down), say rebalance failure to user, + let him restart it if everything is fine */ + defrag->defrag_status = GF_DEFRAG_STATUS_FAILED; + ret = -1; + goto out; + } - return 0; +out: + return ret; } static gf_boolean_t -gf_defrag_pattern_match (gf_defrag_info_t *defrag, char *name, uint64_t size) +gf_defrag_pattern_match(gf_defrag_info_t *defrag, char *name, uint64_t size) { - gf_defrag_pattern_list_t *trav = NULL; - gf_boolean_t match = _gf_false; - gf_boolean_t ret = _gf_false; + gf_defrag_pattern_list_t *trav = NULL; + gf_boolean_t match = _gf_false; + gf_boolean_t ret = _gf_false; - GF_VALIDATE_OR_GOTO ("dht", defrag, out); + GF_VALIDATE_OR_GOTO("dht", defrag, out); - trav = defrag->defrag_pattern; - while (trav) { - if (!fnmatch (trav->path_pattern, name, FNM_NOESCAPE)) { - match = _gf_true; - break; - } - trav = trav->next; + trav = defrag->defrag_pattern; + while (trav) { + if (!fnmatch(trav->path_pattern, name, FNM_NOESCAPE)) { + match = _gf_true; + break; } + trav = trav->next; + } - if ((match == _gf_true) && (size >= trav->size)) - ret = _gf_true; + if ((match == _gf_true) && (size >= trav->size)) + ret = _gf_true; - out: - return ret; +out: + return ret; +} + +int +dht_dfreaddirp_done(dht_dfoffset_ctx_t *offset_var, int cnt) +{ + int i; + int result = 1; + + for (i = 0; i < cnt; i++) { + if (offset_var[i].readdir_done == 0) { + result = 0; + break; + } + } + return result; } -/* We do a depth first traversal of directories. But before we move into - * subdirs, we complete the data migration of those directories whose layouts - * have been fixed +int static gf_defrag_ctx_subvols_init(dht_dfoffset_ctx_t *offset_var, + xlator_t *this) +{ + int i; + dht_conf_t *conf = NULL; + + conf = this->private; + + if (!conf) + return -1; + + for (i = 0; i < conf->local_subvols_cnt; i++) { + offset_var[i].this = conf->local_subvols[i]; + offset_var[i].offset = (off_t)0; + offset_var[i].readdir_done = 0; + } + + return 0; +} + +static int +dht_get_first_non_null_index(subvol_nodeuuids_info_t *entry) +{ + int i = 0; + int index = 0; + + for (i = 0; i < entry->count; i++) { + if (!gf_uuid_is_null(entry->elements[i].uuid)) { + index = i; + goto out; + } + } + + if (i == entry->count) { + index = -1; + } +out: + return index; +} + +/* Return value + * 0 : this node does not migrate the file + * 1 : this node migrates the file + * + * Use the hash value of the gfid to determine which node will migrate files. + * Using the gfid instead of the name also ensures that the same node handles + * all hardlinks. */ -int -gf_defrag_migrate_data (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc, - dict_t *migrate_data) +gf_boolean_t +gf_defrag_should_i_migrate(xlator_t *this, int local_subvol_index, uuid_t gfid) { - int ret = -1; - loc_t entry_loc = {0,}; - fd_t *fd = NULL; - gf_dirent_t entries; - gf_dirent_t *tmp = NULL; - gf_dirent_t *entry = NULL; - gf_boolean_t free_entries = _gf_false; - off_t offset = 0; - dict_t *dict = NULL; - struct iatt iatt = {0,}; - int32_t op_errno = 0; - char *uuid_str = NULL; - uuid_t node_uuid = {0,}; - struct timeval dir_start = {0,}; - struct timeval end = {0,}; - double elapsed = {0,}; - struct timeval start = {0,}; - int32_t err = 0; - int loglevel = GF_LOG_TRACE; - - gf_log (this->name, GF_LOG_INFO, "migrate data called on %s", - loc->path); - gettimeofday (&dir_start, NULL); - - fd = fd_create (loc->inode, defrag->pid); - if (!fd) { - gf_log (this->name, GF_LOG_ERROR, "Failed to create fd"); - goto out; + gf_boolean_t ret = _gf_false; + int i = local_subvol_index; + char *str = NULL; + uint32_t hashval = 0; + int32_t index = 0; + dht_conf_t *conf = NULL; + char buf[UUID_CANONICAL_FORM_LEN + 1] = { + 0, + }; + subvol_nodeuuids_info_t *entry = NULL; + + conf = this->private; + + /* Pure distribute. A subvol in this case + will be handled by only one node */ + + entry = &(conf->local_nodeuuids[i]); + if (entry->count == 1) { + return 1; + } + + str = uuid_utoa_r(gfid, buf); + if (dht_hash_compute(this, 0, str, &hashval) == 0) { + index = (hashval % entry->count); + if (entry->elements[index].info == REBAL_NODEUUID_MINE) { + /* Index matches this node's nodeuuid.*/ + ret = _gf_true; + goto out; } - ret = syncop_opendir (this, loc, fd); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "Failed to open dir %s", - loc->path); - ret = -1; + /* Brick down - some other node has to migrate these files*/ + if (gf_uuid_is_null(entry->elements[index].uuid)) { + /* Fall back to the first non-null index */ + index = dht_get_first_non_null_index(entry); + + if (index == -1) { + /* None of the bricks in the subvol are up. + * CHILD_DOWN will kill the process soon */ + + return _gf_false; + } + + if (entry->elements[index].info == REBAL_NODEUUID_MINE) { + /* Index matches this node's nodeuuid.*/ + ret = _gf_true; goto out; + } } + } +out: + return ret; +} - INIT_LIST_HEAD (&entries.list); +int +gf_defrag_migrate_single_file(void *opaque) +{ + xlator_t *this = NULL; + dht_conf_t *conf = NULL; + gf_defrag_info_t *defrag = NULL; + int ret = 0; + gf_dirent_t *entry = NULL; + struct timeval start = { + 0, + }; + loc_t entry_loc = { + 0, + }; + loc_t *loc = NULL; + struct iatt iatt = { + 0, + }; + dict_t *migrate_data = NULL; + struct timeval end = { + 0, + }; + double elapsed = { + 0, + }; + struct dht_container *rebal_entry = NULL; + inode_t *inode = NULL; + xlator_t *hashed_subvol = NULL; + xlator_t *cached_subvol = NULL; + call_frame_t *statfs_frame = NULL; + xlator_t *old_THIS = NULL; + data_t *tmp = NULL; + int fop_errno = 0; + gf_dht_migrate_data_type_t rebal_type = GF_DHT_MIGRATE_DATA; + char value[MAX_REBAL_TYPE_SIZE] = { + 0, + }; + struct iatt *iatt_ptr = NULL; + gf_boolean_t update_skippedcount = _gf_true; + int i = 0; + gf_boolean_t should_i_migrate = 0; + + rebal_entry = (struct dht_container *)opaque; + if (!rebal_entry) { + gf_log("DHT", GF_LOG_ERROR, "rebal_entry is NULL"); + ret = -1; + goto out; + } - while ((ret = syncop_readdirp (this, fd, 131072, offset, NULL, - &entries)) != 0) { + this = rebal_entry->this; - if (ret < 0) { + conf = this->private; - gf_log (this->name, GF_LOG_ERROR, "Readdir returned %s." - " Aborting migrate-data", - strerror(-ret)); - ret = -1; - goto out; - } + defrag = conf->defrag; - if (list_empty (&entries.list)) - break; + loc = rebal_entry->parent_loc; - free_entries = _gf_true; + migrate_data = rebal_entry->migrate_data; - list_for_each_entry_safe (entry, tmp, &entries.list, list) { - if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED) { - ret = 1; - goto out; - } + entry = rebal_entry->df_entry; + iatt_ptr = &entry->d_stat; - offset = entry->d_off; + if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED) { + ret = -1; + goto out; + } - if (!strcmp (entry->d_name, ".") || - !strcmp (entry->d_name, "..")) - continue; + if (defrag->stats == _gf_true) { + gettimeofday(&start, NULL); + } - if (IA_ISDIR (entry->d_stat.ia_type)) - continue; + if (defrag->defrag_pattern && + (gf_defrag_pattern_match(defrag, entry->d_name, + entry->d_stat.ia_size) == _gf_false)) { + gf_log(this->name, GF_LOG_ERROR, "pattern_match failed"); + goto out; + } - defrag->num_files_lookedup++; - if (defrag->stats == _gf_true) { - gettimeofday (&start, NULL); - } - if (defrag->defrag_pattern && - (gf_defrag_pattern_match (defrag, entry->d_name, - entry->d_stat.ia_size) - == _gf_false)) { - continue; - } - loc_wipe (&entry_loc); - ret =dht_build_child_loc (this, &entry_loc, loc, - entry->d_name); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "Child loc" - " build failed"); - goto out; - } + memset(&entry_loc, 0, sizeof(entry_loc)); - if (uuid_is_null (entry->d_stat.ia_gfid)) { - gf_log (this->name, GF_LOG_ERROR, "%s/%s" - " gfid not present", loc->path, - entry->d_name); - continue; - } + ret = dht_build_child_loc(this, &entry_loc, loc, entry->d_name); + if (ret) { + LOCK(&defrag->lock); + { + defrag->total_failures += 1; + } + UNLOCK(&defrag->lock); - uuid_copy (entry_loc.gfid, entry->d_stat.ia_gfid); + ret = 0; - if (uuid_is_null (loc->gfid)) { - gf_log (this->name, GF_LOG_ERROR, "%s/%s" - " gfid not present", loc->path, - entry->d_name); - continue; - } + gf_log(this->name, GF_LOG_ERROR, "Child loc build failed"); - uuid_copy (entry_loc.pargfid, loc->gfid); + goto out; + } - entry_loc.inode->ia_type = entry->d_stat.ia_type; + should_i_migrate = gf_defrag_should_i_migrate( + this, rebal_entry->local_subvol_index, entry->d_stat.ia_gfid); - ret = syncop_lookup (this, &entry_loc, NULL, &iatt, - NULL, NULL); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "%s" - " lookup failed", entry_loc.path); - ret = -1; - continue; - } + gf_uuid_copy(entry_loc.gfid, entry->d_stat.ia_gfid); - ret = syncop_getxattr (this, &entry_loc, &dict, - GF_XATTR_NODE_UUID_KEY); - if(ret < 0) { - gf_log (this->name, GF_LOG_ERROR, "Failed to " - "get node-uuid for %s", entry_loc.path); - ret = -1; - continue; - } + gf_uuid_copy(entry_loc.pargfid, loc->gfid); - ret = dict_get_str (dict, GF_XATTR_NODE_UUID_KEY, - &uuid_str); - if(ret < 0) { - gf_log (this->name, GF_LOG_ERROR, "Failed to " - "get node-uuid from dict for %s", - entry_loc.path); - ret = -1; - continue; - } + ret = syncop_lookup(this, &entry_loc, &iatt, NULL, NULL, NULL); - if (uuid_parse (uuid_str, node_uuid)) { - gf_log (this->name, GF_LOG_ERROR, "uuid_parse " - "failed for %s", entry_loc.path); - continue; - } + if (!should_i_migrate) { + /* this node isn't supposed to migrate the file. suppressing any + * potential error from lookup as this file is under migration by + * another node */ + if (ret) { + gf_msg_debug(this->name, -ret, + "Ignoring lookup failure: node isn't migrating %s", + entry_loc.path); + ret = 0; + } + gf_msg_debug(this->name, 0, "Don't migrate %s ", entry_loc.path); + goto out; + } - /* if file belongs to different node, skip migration - * the other node will take responsibility of migration - */ - if (uuid_compare (node_uuid, defrag->node_uuid)) { - gf_log (this->name, GF_LOG_TRACE, "%s does not" - "belong to this node", entry_loc.path); - continue; - } + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED, + "Migrate file failed: %s lookup failed", entry_loc.path); - uuid_str = NULL; - - dict_del (dict, GF_XATTR_NODE_UUID_KEY); - - - /* if distribute is present, it will honor this key. - * -1, ENODATA is returned if distribute is not present - * or file doesn't have a link-file. If file has - * link-file, the path of link-file will be the value, - * and also that guarantees that file has to be mostly - * migrated */ - - ret = syncop_getxattr (this, &entry_loc, &dict, - GF_XATTR_LINKINFO_KEY); - if (ret < 0) { - if (-ret != ENODATA) { - loglevel = GF_LOG_ERROR; - defrag->total_failures += 1; - } else { - loglevel = GF_LOG_TRACE; - } - gf_log (this->name, loglevel, "%s: failed to " - "get "GF_XATTR_LINKINFO_KEY" key - %s", - entry_loc.path, strerror (-ret)); - ret = -1; - continue; - } + /* Increase failure count only for remove-brick op, so that + * user is warned to check the removed-brick for any files left + * unmigrated + */ + if (conf->decommission_subvols_cnt) { + LOCK(&defrag->lock); + { + defrag->total_failures += 1; + } + UNLOCK(&defrag->lock); + } - ret = syncop_setxattr (this, &entry_loc, migrate_data, - 0); - if (ret) { - err = op_errno; - /* errno is overloaded. See - * rebalance_task_completion () */ - if (err != ENOSPC) { - gf_log (this->name, GF_LOG_DEBUG, - "migrate-data skipped for %s" - " due to space constraints", - entry_loc.path); - defrag->skipped +=1; - } else{ - gf_log (this->name, GF_LOG_ERROR, - "migrate-data failed for %s", - entry_loc.path); - defrag->total_failures +=1; - } - } + ret = 0; + goto out; + } - if (ret < 0) { - op_errno = -ret; - ret = gf_defrag_handle_migrate_error (op_errno, - defrag); - - if (!ret) - gf_log (this->name, GF_LOG_DEBUG, - "migrate-data on %s failed: %s", - entry_loc.path, - strerror (op_errno)); - else if (ret == 1) - continue; - else if (ret == -1) - goto out; - } + iatt_ptr = &iatt; - LOCK (&defrag->lock); + hashed_subvol = dht_subvol_get_hashed(this, &entry_loc); + if (!hashed_subvol) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_HASHED_SUBVOL_GET_FAILED, + "Failed to get hashed subvol for %s", entry_loc.path); + ret = 0; + goto out; + } + + cached_subvol = dht_subvol_get_cached(this, entry_loc.inode); + if (!cached_subvol) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_CACHED_SUBVOL_GET_FAILED, + "Failed to get cached subvol for %s", entry_loc.path); + + ret = 0; + goto out; + } + + if (hashed_subvol == cached_subvol) { + ret = 0; + goto out; + } + + inode = inode_link(entry_loc.inode, entry_loc.parent, entry->d_name, &iatt); + inode_unref(entry_loc.inode); + /* use the inode returned by inode_link */ + entry_loc.inode = inode; + + old_THIS = THIS; + THIS = this; + statfs_frame = create_frame(this, this->ctx->pool); + if (!statfs_frame) { + gf_msg(this->name, GF_LOG_ERROR, DHT_MSG_NO_MEMORY, ENOMEM, + "Insufficient memory. Frame creation failed"); + ret = -1; + goto out; + } + + /* async statfs information for honoring min-free-disk */ + dht_get_du_info(statfs_frame, this, loc); + THIS = old_THIS; + + tmp = dict_get(migrate_data, GF_XATTR_FILE_MIGRATE_KEY); + if (tmp) { + memcpy(value, tmp->data, tmp->len); + if (strcmp(value, "force") == 0) + rebal_type = GF_DHT_MIGRATE_DATA_EVEN_IF_LINK_EXISTS; + + if (conf->decommission_in_progress) + rebal_type = GF_DHT_MIGRATE_HARDLINK; + } + + ret = dht_migrate_file(this, &entry_loc, cached_subvol, hashed_subvol, + rebal_type, &fop_errno); + if (ret == 1) { + if (fop_errno == ENOSPC) { + gf_msg_debug(this->name, 0, + "migrate-data skipped for" + " %s due to space constraints", + entry_loc.path); + + /* For remove-brick case if the source is not one of the + * removed-brick, do not mark the error as failure */ + if (conf->decommission_subvols_cnt) { + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->decommissioned_bricks[i] == cached_subvol) { + LOCK(&defrag->lock); { - defrag->total_files += 1; - defrag->total_data += iatt.ia_size; - } - UNLOCK (&defrag->lock); - if (defrag->stats == _gf_true) { - gettimeofday (&end, NULL); - elapsed = (end.tv_sec - start.tv_sec) * 1e6 + - (end.tv_usec - start.tv_usec); - gf_log (this->name, GF_LOG_INFO, "Migration of " - "file:%s size:%"PRIu64" bytes took %.2f" - "secs", entry_loc.path, iatt.ia_size, - elapsed/1e6); + defrag->total_failures += 1; + update_skippedcount = _gf_false; } + UNLOCK(&defrag->lock); + + break; + } } + } - gf_dirent_free (&entries); - free_entries = _gf_false; - INIT_LIST_HEAD (&entries.list); + if (update_skippedcount) { + LOCK(&defrag->lock); + { + defrag->skipped += 1; + } + UNLOCK(&defrag->lock); + + gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_MIGRATE_FILE_SKIPPED, + "File migration skipped for %s.", entry_loc.path); + } + + } else if (fop_errno == ENOTSUP) { + gf_msg_debug(this->name, 0, + "migrate-data skipped for" + " hardlink %s ", + entry_loc.path); + LOCK(&defrag->lock); + { + defrag->skipped += 1; + } + UNLOCK(&defrag->lock); + + gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_MIGRATE_FILE_SKIPPED, + "File migration skipped for %s.", entry_loc.path); } - gettimeofday (&end, NULL); - elapsed = (end.tv_sec - dir_start.tv_sec) * 1e6 + - (end.tv_usec - dir_start.tv_usec); - gf_log (this->name, GF_LOG_INFO, "Migration operation on dir %s took " - "%.2f secs", loc->path, elapsed/1e6); ret = 0; + goto out; + } else if (ret < 0) { + if (fop_errno != EEXIST) { + gf_msg(this->name, GF_LOG_ERROR, fop_errno, + DHT_MSG_MIGRATE_FILE_FAILED, "migrate-data failed for %s", + entry_loc.path); + + LOCK(&defrag->lock); + { + defrag->total_failures += 1; + } + UNLOCK(&defrag->lock); + } + + ret = gf_defrag_handle_migrate_error(fop_errno, defrag); + + if (!ret) { + gf_msg(this->name, GF_LOG_ERROR, fop_errno, + DHT_MSG_MIGRATE_FILE_FAILED, + "migrate-data on %s failed:", entry_loc.path); + } else if (ret == 1) { + ret = 0; + } + + goto out; + } + + LOCK(&defrag->lock); + { + defrag->total_files += 1; + defrag->total_data += iatt.ia_size; + } + UNLOCK(&defrag->lock); + + if (defrag->stats == _gf_true) { + gettimeofday(&end, NULL); + elapsed = gf_tvdiff(&start, &end); + gf_log(this->name, GF_LOG_INFO, + "Migration of " + "file:%s size:%" PRIu64 + " bytes took %.2f" + "secs and ret: %d", + entry_loc.name, iatt.ia_size, elapsed / 1e6, ret); + } + out: - if (free_entries) - gf_dirent_free (&entries); + if (statfs_frame) { + STACK_DESTROY(statfs_frame->root); + } - loc_wipe (&entry_loc); + if (iatt_ptr) { + LOCK(&defrag->lock); + { + defrag->size_processed += iatt_ptr->ia_size; + } + UNLOCK(&defrag->lock); + } + loc_wipe(&entry_loc); - if (dict) - dict_unref(dict); + return ret; +} - if (fd) - fd_unref (fd); - return ret; +void * +gf_defrag_task(void *opaque) +{ + struct list_head *q_head = NULL; + struct dht_container *iterator = NULL; + gf_defrag_info_t *defrag = NULL; + int ret = 0; + pid_t pid = GF_CLIENT_PID_DEFRAG; + + defrag = (gf_defrag_info_t *)opaque; + if (!defrag) { + gf_msg("dht", GF_LOG_ERROR, 0, 0, "defrag is NULL"); + goto out; + } + + syncopctx_setfspid(&pid); + + q_head = &(defrag->queue[0].list); + + /* The following while loop will dequeue one entry from the defrag->queue + under lock. We will update the defrag->global_error only when there + is an error which is critical to stop the rebalance process. The stop + message will be intimated to other migrator threads by setting the + defrag->defrag_status to GF_DEFRAG_STATUS_FAILED. + + In defrag->queue, a low watermark (MIN_MIGRATE_QUEUE_COUNT) is + maintained so that crawler does not starve the file migration + workers and a high watermark (MAX_MIGRATE_QUEUE_COUNT) so that + crawler does not go far ahead in filling up the queue. + */ + + while (_gf_true) { + if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED) { + pthread_cond_broadcast(&defrag->rebalance_crawler_alarm); + pthread_cond_broadcast(&defrag->parallel_migration_cond); + goto out; + } + + pthread_mutex_lock(&defrag->dfq_mutex); + { + /*Throttle down: + If the reconfigured count is less than current thread + count, then the current thread will sleep */ + + /*TODO: Need to refactor the following block to work + *under defrag->lock. For now access + * defrag->current_thread_count and rthcount under + * dfq_mutex lock */ + while (!defrag->crawl_done && (defrag->recon_thread_count < + defrag->current_thread_count)) { + defrag->current_thread_count--; + gf_msg_debug("DHT", 0, + "Thread sleeping. " + "current thread count: %d", + defrag->current_thread_count); + + pthread_cond_wait(&defrag->df_wakeup_thread, + &defrag->dfq_mutex); + + defrag->current_thread_count++; + gf_msg_debug("DHT", 0, + "Thread wokeup. " + "current thread count: %d", + defrag->current_thread_count); + } + + if (defrag->q_entry_count) { + iterator = list_entry(q_head->next, typeof(*iterator), list); + + gf_msg_debug("DHT", 0, + "picking entry " + "%s", + iterator->df_entry->d_name); + + list_del_init(&(iterator->list)); + + defrag->q_entry_count--; + + if ((defrag->q_entry_count < MIN_MIGRATE_QUEUE_COUNT) && + defrag->wakeup_crawler) { + pthread_cond_broadcast(&defrag->rebalance_crawler_alarm); + } + pthread_mutex_unlock(&defrag->dfq_mutex); + ret = gf_defrag_migrate_single_file((void *)iterator); + + /*Critical errors: ENOTCONN and ENOSPACE*/ + if (ret) { + dht_set_global_defrag_error(defrag, ret); + + defrag->defrag_status = GF_DEFRAG_STATUS_FAILED; + + pthread_cond_broadcast(&defrag->rebalance_crawler_alarm); + + pthread_cond_broadcast(&defrag->parallel_migration_cond); + goto out; + } + + gf_defrag_free_container(iterator); + + continue; + } else { + /* defrag->crawl_done flag is set means crawling + file system is done and hence a list_empty when + the above flag is set indicates there are no more + entries to be added to the queue and rebalance is + finished */ + + if (!defrag->crawl_done) { + defrag->current_thread_count--; + gf_msg_debug("DHT", 0, + "Thread " + "sleeping while waiting " + "for migration entries. " + "current thread count:%d", + defrag->current_thread_count); + + pthread_cond_wait(&defrag->parallel_migration_cond, + &defrag->dfq_mutex); + } + + if (defrag->crawl_done && !defrag->q_entry_count) { + defrag->current_thread_count++; + gf_msg_debug("DHT", 0, "Exiting thread"); + + pthread_cond_broadcast(&defrag->parallel_migration_cond); + goto unlock; + } else { + defrag->current_thread_count++; + gf_msg_debug("DHT", 0, + "Thread woke up" + " as found migrating entries. " + "current thread count:%d", + defrag->current_thread_count); + + pthread_mutex_unlock(&defrag->dfq_mutex); + continue; + } + } + } + unlock: + pthread_mutex_unlock(&defrag->dfq_mutex); + break; + } +out: + return NULL; +} + +int static gf_defrag_get_entry(xlator_t *this, int i, + struct dht_container **container, loc_t *loc, + dht_conf_t *conf, gf_defrag_info_t *defrag, + fd_t *fd, dict_t *migrate_data, + struct dir_dfmeta *dir_dfmeta, dict_t *xattr_req, + int *perrno) +{ + int ret = 0; + char is_linkfile = 0; + gf_dirent_t *df_entry = NULL; + struct dht_container *tmp_container = NULL; + + if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED) { + ret = -1; + goto out; + } + + if (dir_dfmeta->offset_var[i].readdir_done == 1) { + ret = 0; + goto out; + } + + if (dir_dfmeta->fetch_entries[i] == 1) { + if (!fd) { + dir_dfmeta->fetch_entries[i] = 0; + dir_dfmeta->offset_var[i].readdir_done = 1; + ret = 0; + goto out; + } + + ret = syncop_readdirp(conf->local_subvols[i], fd, 131072, + dir_dfmeta->offset_var[i].offset, + &(dir_dfmeta->equeue[i]), xattr_req, NULL); + if (ret == 0) { + dir_dfmeta->offset_var[i].readdir_done = 1; + ret = 0; + goto out; + } + + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, -ret, + DHT_MSG_MIGRATE_DATA_FAILED, + "Readdirp failed. Aborting data migration for " + "directory: %s", + loc->path); + *perrno = -ret; + ret = -1; + goto out; + } + + if (list_empty(&(dir_dfmeta->equeue[i].list))) { + dir_dfmeta->offset_var[i].readdir_done = 1; + ret = 0; + goto out; + } + + dir_dfmeta->fetch_entries[i] = 0; + } + + while (1) { + if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED) { + ret = -1; + goto out; + } + + df_entry = list_entry(dir_dfmeta->iterator[i]->next, typeof(*df_entry), + list); + + if (&df_entry->list == dir_dfmeta->head[i]) { + gf_dirent_free(&(dir_dfmeta->equeue[i])); + INIT_LIST_HEAD(&(dir_dfmeta->equeue[i].list)); + dir_dfmeta->fetch_entries[i] = 1; + dir_dfmeta->iterator[i] = dir_dfmeta->head[i]; + ret = 0; + goto out; + } + + dir_dfmeta->iterator[i] = dir_dfmeta->iterator[i]->next; + + dir_dfmeta->offset_var[i].offset = df_entry->d_off; + if (!strcmp(df_entry->d_name, ".") || !strcmp(df_entry->d_name, "..")) + continue; + + if (IA_ISDIR(df_entry->d_stat.ia_type)) { + defrag->size_processed += df_entry->d_stat.ia_size; + continue; + } + + defrag->num_files_lookedup++; + + if (defrag->defrag_pattern && + (gf_defrag_pattern_match(defrag, df_entry->d_name, + df_entry->d_stat.ia_size) == _gf_false)) { + defrag->size_processed += df_entry->d_stat.ia_size; + continue; + } + + is_linkfile = check_is_linkfile(NULL, &df_entry->d_stat, df_entry->dict, + conf->link_xattr_name); + + if (is_linkfile) { + /* No need to add linkto file to the queue for + migration. Only the actual data file need to + be checked for migration criteria. + */ + + gf_msg_debug(this->name, 0, + "Skipping linkfile" + " %s on subvol: %s", + df_entry->d_name, conf->local_subvols[i]->name); + continue; + } + + /*Build Container Structure */ + + tmp_container = GF_CALLOC(1, sizeof(struct dht_container), + gf_dht_mt_container_t); + if (!tmp_container) { + gf_log(this->name, GF_LOG_ERROR, + "Failed to allocate " + "memory for container"); + ret = -1; + goto out; + } + tmp_container->df_entry = gf_dirent_for_name(df_entry->d_name); + if (!tmp_container->df_entry) { + gf_log(this->name, GF_LOG_ERROR, + "Failed to allocate " + "memory for df_entry"); + ret = -1; + goto out; + } + + tmp_container->local_subvol_index = i; + + tmp_container->df_entry->d_stat = df_entry->d_stat; + + tmp_container->df_entry->d_ino = df_entry->d_ino; + + tmp_container->df_entry->d_type = df_entry->d_type; + + tmp_container->df_entry->d_len = df_entry->d_len; + + tmp_container->parent_loc = GF_CALLOC(1, sizeof(*loc), gf_dht_mt_loc_t); + if (!tmp_container->parent_loc) { + gf_log(this->name, GF_LOG_ERROR, + "Failed to allocate " + "memory for loc"); + ret = -1; + goto out; + } + + ret = loc_copy(tmp_container->parent_loc, loc); + if (ret) { + gf_log(this->name, GF_LOG_ERROR, "loc_copy failed"); + ret = -1; + goto out; + } + + tmp_container->migrate_data = migrate_data; + + tmp_container->this = this; + + if (df_entry->dict) + tmp_container->df_entry->dict = dict_ref(df_entry->dict); + + /*Build Container Structure >> END*/ + + ret = 0; + goto out; + } + +out: + if (ret == 0) { + *container = tmp_container; + } else { + if (tmp_container) { + gf_defrag_free_container(tmp_container); + } + } + + return ret; } int -gf_defrag_fix_layout (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc, - dict_t *fix_layout, dict_t *migrate_data) +gf_defrag_process_dir(xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc, + dict_t *migrate_data, int *perrno) { - int ret = -1; - loc_t entry_loc = {0,}; - fd_t *fd = NULL; - gf_dirent_t entries; - gf_dirent_t *tmp = NULL; - gf_dirent_t *entry = NULL; - gf_boolean_t free_entries = _gf_false; - dict_t *dict = NULL; - off_t offset = 0; - struct iatt iatt = {0,}; - - ret = syncop_lookup (this, loc, NULL, &iatt, NULL, NULL); + int ret = -1; + dht_conf_t *conf = NULL; + gf_dirent_t entries; + dict_t *xattr_req = NULL; + struct timeval dir_start = { + 0, + }; + struct timeval end = { + 0, + }; + double elapsed = { + 0, + }; + int local_subvols_cnt = 0; + int i = 0; + int j = 0; + struct dht_container *container = NULL; + int ldfq_count = 0; + int dfc_index = 0; + int throttle_up = 0; + struct dir_dfmeta *dir_dfmeta = NULL; + xlator_t *old_THIS = NULL; + + gf_log(this->name, GF_LOG_INFO, "migrate data called on %s", loc->path); + gettimeofday(&dir_start, NULL); + + conf = this->private; + local_subvols_cnt = conf->local_subvols_cnt; + + if (!local_subvols_cnt) { + ret = 0; + goto out; + } + + old_THIS = THIS; + THIS = this; + + dir_dfmeta = GF_CALLOC(1, sizeof(*dir_dfmeta), gf_common_mt_pointer); + if (!dir_dfmeta) { + gf_log(this->name, GF_LOG_ERROR, "dir_dfmeta is NULL"); + ret = -1; + goto out; + } + + dir_dfmeta->lfd = GF_CALLOC(local_subvols_cnt, sizeof(fd_t *), + gf_common_mt_pointer); + if (!dir_dfmeta->lfd) { + gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_INSUFF_MEMORY, + "for dir_dfmeta", NULL); + ret = -1; + *perrno = ENOMEM; + goto out; + } + + for (i = 0; i < local_subvols_cnt; i++) { + dir_dfmeta->lfd[i] = fd_create(loc->inode, defrag->pid); + if (!dir_dfmeta->lfd[i]) { + gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_FD_CREATE_FAILED, + NULL); + *perrno = ENOMEM; + ret = -1; + goto out; + } + + ret = syncop_opendir(conf->local_subvols[i], loc, dir_dfmeta->lfd[i], + NULL, NULL); if (ret) { - gf_log (this->name, GF_LOG_ERROR, "Lookup failed on %s", - loc->path); + fd_unref(dir_dfmeta->lfd[i]); + dir_dfmeta->lfd[i] = NULL; + gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_FAILED_TO_OPEN, + "dir: %s", loc->path, "subvol: %s", + conf->local_subvols[i]->name, NULL); + + if (conf->decommission_in_progress) { + *perrno = -ret; ret = -1; goto out; + } + } else { + fd_bind(dir_dfmeta->lfd[i]); } + } - if (defrag->cmd != GF_DEFRAG_CMD_START_LAYOUT_FIX) { - ret = gf_defrag_migrate_data (this, defrag, loc, migrate_data); - if (ret) - goto out; + dir_dfmeta->head = GF_CALLOC(local_subvols_cnt, sizeof(*(dir_dfmeta->head)), + gf_common_mt_pointer); + if (!dir_dfmeta->head) { + gf_log(this->name, GF_LOG_ERROR, "dir_dfmeta->head is NULL"); + ret = -1; + goto out; + } + + dir_dfmeta->iterator = GF_CALLOC(local_subvols_cnt, + sizeof(*(dir_dfmeta->iterator)), + gf_common_mt_pointer); + if (!dir_dfmeta->iterator) { + gf_log(this->name, GF_LOG_ERROR, "dir_dfmeta->iterator is NULL"); + ret = -1; + goto out; + } + + dir_dfmeta->equeue = GF_CALLOC(local_subvols_cnt, sizeof(entries), + gf_dht_mt_dirent_t); + if (!dir_dfmeta->equeue) { + gf_log(this->name, GF_LOG_ERROR, "dir_dfmeta->equeue is NULL"); + ret = -1; + goto out; + } + + dir_dfmeta->offset_var = GF_CALLOC( + local_subvols_cnt, sizeof(dht_dfoffset_ctx_t), gf_dht_mt_octx_t); + if (!dir_dfmeta->offset_var) { + gf_log(this->name, GF_LOG_ERROR, "dir_dfmeta->offset_var is NULL"); + ret = -1; + goto out; + } + + ret = gf_defrag_ctx_subvols_init(dir_dfmeta->offset_var, this); + if (ret) { + gf_log(this->name, GF_LOG_ERROR, + "dht_dfoffset_ctx_t" + "initialization failed"); + ret = -1; + goto out; + } + + dir_dfmeta->fetch_entries = GF_CALLOC(local_subvols_cnt, sizeof(int), + gf_common_mt_int); + if (!dir_dfmeta->fetch_entries) { + gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_INSUFF_MEMORY, + "for dir_dfmeta->fetch_entries", NULL); + ret = -1; + goto out; + } + + for (i = 0; i < local_subvols_cnt; i++) { + INIT_LIST_HEAD(&(dir_dfmeta->equeue[i].list)); + dir_dfmeta->head[i] = &(dir_dfmeta->equeue[i].list); + dir_dfmeta->iterator[i] = dir_dfmeta->head[i]; + dir_dfmeta->fetch_entries[i] = 1; + } + + xattr_req = dict_new(); + if (!xattr_req) { + gf_log(this->name, GF_LOG_ERROR, "dict_new failed"); + ret = -1; + goto out; + } + + ret = dict_set_uint32(xattr_req, conf->link_xattr_name, 256); + if (ret) { + gf_log(this->name, GF_LOG_ERROR, + "failed to set dict for " + "key: %s", + conf->link_xattr_name); + ret = -1; + goto out; + } + + /* + Job: Read entries from each local subvol and store the entries + in equeue array of linked list. Now pick one entry from the + equeue array in a round robin basis and add them to defrag Queue. + */ + + while (!dht_dfreaddirp_done(dir_dfmeta->offset_var, local_subvols_cnt)) { + pthread_mutex_lock(&defrag->dfq_mutex); + { + /*Throttle up: If reconfigured count is higher than + current thread count, wake up the sleeping threads + TODO: Need to refactor this. Instead of making the + thread sleep and wake, we should terminate and spawn + threads on-demand*/ + + if (defrag->recon_thread_count > defrag->current_thread_count) { + throttle_up = (defrag->recon_thread_count - + defrag->current_thread_count); + for (j = 0; j < throttle_up; j++) { + pthread_cond_signal(&defrag->df_wakeup_thread); + } + } + + while (defrag->q_entry_count > MAX_MIGRATE_QUEUE_COUNT) { + defrag->wakeup_crawler = 1; + pthread_cond_wait(&defrag->rebalance_crawler_alarm, + &defrag->dfq_mutex); + } + + ldfq_count = defrag->q_entry_count; + + if (defrag->wakeup_crawler) { + defrag->wakeup_crawler = 0; + } } + pthread_mutex_unlock(&defrag->dfq_mutex); - gf_log (this->name, GF_LOG_TRACE, "fix layout called on %s", loc->path); + while ( + ldfq_count <= MAX_MIGRATE_QUEUE_COUNT && + !dht_dfreaddirp_done(dir_dfmeta->offset_var, local_subvols_cnt)) { + ret = gf_defrag_get_entry(this, dfc_index, &container, loc, conf, + defrag, dir_dfmeta->lfd[dfc_index], + migrate_data, dir_dfmeta, xattr_req, + perrno); + + if (defrag->defrag_status == GF_DEFRAG_STATUS_STOPPED) { + goto out; + } + + if (ret) { + gf_log(this->name, GF_LOG_WARNING, + "Found " + "error from gf_defrag_get_entry"); - fd = fd_create (loc->inode, defrag->pid); - if (!fd) { - gf_log (this->name, GF_LOG_ERROR, "Failed to create fd"); ret = -1; goto out; + } + + /* Check if we got an entry, else we need to move the + index to the next subvol */ + if (!container) { + GF_CRAWL_INDEX_MOVE(dfc_index, local_subvols_cnt); + continue; + } + + /* Q this entry in the dfq */ + pthread_mutex_lock(&defrag->dfq_mutex); + { + list_add_tail(&container->list, &(defrag->queue[0].list)); + defrag->q_entry_count++; + ldfq_count = defrag->q_entry_count; + + gf_msg_debug(this->name, 0, + "added " + "file:%s parent:%s to the queue ", + container->df_entry->d_name, + container->parent_loc->path); + + pthread_cond_signal(&defrag->parallel_migration_cond); + } + pthread_mutex_unlock(&defrag->dfq_mutex); + + GF_CRAWL_INDEX_MOVE(dfc_index, local_subvols_cnt); + } + } + + gettimeofday(&end, NULL); + elapsed = gf_tvdiff(&dir_start, &end); + gf_log(this->name, GF_LOG_INFO, + "Migration operation on dir %s took " + "%.2f secs", + loc->path, elapsed / 1e6); + ret = 0; +out: + THIS = old_THIS; + gf_defrag_free_dir_dfmeta(dir_dfmeta, local_subvols_cnt); + + if (xattr_req) + dict_unref(xattr_req); + + /* It does not matter if it errored out - this number is + * used to calculate rebalance estimated time to complete. + * No locking required as dirs are processed by a single thread. + */ + defrag->num_dirs_processed++; + return ret; +} + +int +gf_defrag_settle_hash(xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc, + dict_t *fix_layout) +{ + int ret; + dht_conf_t *conf = NULL; + /* + * Now we're ready to update the directory commit hash for the volume + * root, so that hash miscompares and broadcast lookups can stop. + * However, we want to skip that if fix-layout is all we did. In + * that case, we want the miscompares etc. to continue until a real + * rebalance is complete. + */ + if (defrag->cmd == GF_DEFRAG_CMD_START_LAYOUT_FIX || + defrag->cmd == GF_DEFRAG_CMD_DETACH_START) { + return 0; + } + + conf = this->private; + if (!conf) { + /*Uh oh + */ + return -1; + } + + if (conf->local_subvols_cnt == 0 || !conf->lookup_optimize) { + /* Commit hash updates are only done on local subvolumes and + * only when lookup optimization is needed (for older client + * support) + */ + return 0; + } + + ret = dict_set_uint32(fix_layout, "new-commit-hash", + defrag->new_commit_hash); + if (ret) { + gf_log(this->name, GF_LOG_ERROR, "Failed to set new-commit-hash"); + return -1; + } + + ret = syncop_setxattr(this, loc, fix_layout, 0, NULL, NULL); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_LAYOUT_FIX_FAILED, + "fix layout on %s failed", loc->path); + + if (-ret == ENOENT || -ret == ESTALE) { + /* Dir most likely is deleted */ + return 0; } - ret = syncop_opendir (this, loc, fd); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "Failed to open dir %s", - loc->path); - ret = -1; + return -1; + } + + /* TBD: find more efficient solution than adding/deleting every time */ + dict_del(fix_layout, "new-commit-hash"); + + return 0; +} + +int +gf_defrag_fix_layout(xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc, + dict_t *fix_layout, dict_t *migrate_data) +{ + int ret = -1; + loc_t entry_loc = { + 0, + }; + fd_t *fd = NULL; + gf_dirent_t entries; + gf_dirent_t *tmp = NULL; + gf_dirent_t *entry = NULL; + gf_boolean_t free_entries = _gf_false; + off_t offset = 0; + struct iatt iatt = { + 0, + }; + inode_t *linked_inode = NULL, *inode = NULL; + dht_conf_t *conf = NULL; + int perrno = 0; + + conf = this->private; + if (!conf) { + ret = -1; + goto out; + } + + ret = syncop_lookup(this, loc, &iatt, NULL, NULL, NULL); + if (ret) { + if (strcmp(loc->path, "/") == 0) { + gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_DIR_LOOKUP_FAILED, + "lookup failed for:%s", loc->path); + + defrag->total_failures++; + ret = -1; + goto out; + } + + if (-ret == ENOENT || -ret == ESTALE) { + gf_msg(this->name, GF_LOG_INFO, -ret, DHT_MSG_DIR_LOOKUP_FAILED, + "Dir:%s renamed or removed. Skipping", loc->path); + if (conf->decommission_subvols_cnt) { + defrag->total_failures++; + } + ret = 0; + goto out; + } else { + gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_DIR_LOOKUP_FAILED, + "lookup failed for:%s", loc->path); + + defrag->total_failures++; + goto out; + } + } + + fd = fd_create(loc->inode, defrag->pid); + if (!fd) { + gf_log(this->name, GF_LOG_ERROR, "Failed to create fd"); + ret = -1; + goto out; + } + + ret = syncop_opendir(this, loc, fd, NULL, NULL); + if (ret) { + if (-ret == ENOENT || -ret == ESTALE) { + if (conf->decommission_subvols_cnt) { + defrag->total_failures++; + } + ret = 0; + goto out; + } + + gf_log(this->name, GF_LOG_ERROR, + "Failed to open dir %s, " + "err:%d", + loc->path, -ret); + + ret = -1; + goto out; + } + + fd_bind(fd); + INIT_LIST_HEAD(&entries.list); + + while ((ret = syncop_readdirp(this, fd, 131072, offset, &entries, NULL, + NULL)) != 0) { + if (ret < 0) { + if (-ret == ENOENT || -ret == ESTALE) { + if (conf->decommission_subvols_cnt) { + defrag->total_failures++; + } + ret = 0; goto out; + } + + gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_READDIR_ERROR, + "readdirp failed for " + "path %s. Aborting fix-layout", + loc->path); + + ret = -1; + goto out; } - INIT_LIST_HEAD (&entries.list); - while ((ret = syncop_readdirp (this, fd, 131072, offset, NULL, - &entries)) != 0) + if (list_empty(&entries.list)) + break; + + free_entries = _gf_true; + + list_for_each_entry_safe(entry, tmp, &entries.list, list) { + if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED) { + ret = 1; + goto out; + } - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, "Readdir returned %s" - ". Aborting fix-layout",strerror(-ret)); + offset = entry->d_off; + + if (!strcmp(entry->d_name, ".") || !strcmp(entry->d_name, "..")) + continue; + if (!IA_ISDIR(entry->d_stat.ia_type)) { + continue; + } + loc_wipe(&entry_loc); + + ret = dht_build_child_loc(this, &entry_loc, loc, entry->d_name); + if (ret) { + gf_log(this->name, GF_LOG_ERROR, + "Child loc" + " build failed for entry: %s", + entry->d_name); + + if (conf->decommission_in_progress) { + defrag->defrag_status = GF_DEFRAG_STATUS_FAILED; + + goto out; + } else { + continue; + } + } + + if (gf_uuid_is_null(entry->d_stat.ia_gfid)) { + gf_log(this->name, GF_LOG_ERROR, + "%s/%s" + " gfid not present", + loc->path, entry->d_name); + continue; + } + + gf_uuid_copy(entry_loc.gfid, entry->d_stat.ia_gfid); + + /*In case the gfid stored in the inode by inode_link + * and the gfid obtained in the lookup differs, then + * client3_3_lookup_cbk will return ESTALE and proper + * error will be captured + */ + + linked_inode = inode_link(entry_loc.inode, loc->inode, + entry->d_name, &entry->d_stat); + + inode = entry_loc.inode; + entry_loc.inode = linked_inode; + inode_unref(inode); + + if (gf_uuid_is_null(loc->gfid)) { + gf_log(this->name, GF_LOG_ERROR, + "%s/%s" + " gfid not present", + loc->path, entry->d_name); + continue; + } + + gf_uuid_copy(entry_loc.pargfid, loc->gfid); + + ret = syncop_lookup(this, &entry_loc, &iatt, NULL, NULL, NULL); + if (ret) { + if (-ret == ENOENT || -ret == ESTALE) { + gf_msg(this->name, GF_LOG_INFO, -ret, + DHT_MSG_DIR_LOOKUP_FAILED, + "Dir:%s renamed or removed. " + "Skipping", + loc->path); + ret = 0; + if (conf->decommission_subvols_cnt) { + defrag->total_failures++; + } + continue; + } else { + gf_msg(this->name, GF_LOG_ERROR, -ret, + DHT_MSG_DIR_LOOKUP_FAILED, "lookup failed for:%s", + entry_loc.path); + + defrag->total_failures++; + + if (conf->decommission_in_progress) { + defrag->defrag_status = GF_DEFRAG_STATUS_FAILED; ret = -1; goto out; + } else { + continue; + } } + } - if (list_empty (&entries.list)) - break; + /* A return value of 2 means, either process_dir or + * lookup of a dir failed. Hence, don't commit hash + * for the current directory*/ - free_entries = _gf_true; + ret = gf_defrag_fix_layout(this, defrag, &entry_loc, fix_layout, + migrate_data); - list_for_each_entry_safe (entry, tmp, &entries.list, list) { - if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED) { - ret = 1; - goto out; - } + if (defrag->defrag_status == GF_DEFRAG_STATUS_STOPPED || + defrag->defrag_status == GF_DEFRAG_STATUS_FAILED) { + goto out; + } - offset = entry->d_off; + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LAYOUT_FIX_FAILED, + "Fix layout failed for %s", entry_loc.path); - if (!strcmp (entry->d_name, ".") || - !strcmp (entry->d_name, "..")) - continue; + defrag->total_failures++; - if (!IA_ISDIR (entry->d_stat.ia_type)) - continue; + if (conf->decommission_in_progress) { + defrag->defrag_status = GF_DEFRAG_STATUS_FAILED; - loc_wipe (&entry_loc); - ret =dht_build_child_loc (this, &entry_loc, loc, - entry->d_name); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "Child loc" - " build failed"); - goto out; - } + goto out; + } else { + /* Let's not commit-hash if + * gf_defrag_fix_layout failed*/ + continue; + } + } + } - if (uuid_is_null (entry->d_stat.ia_gfid)) { - gf_log (this->name, GF_LOG_ERROR, "%s/%s" - " gfid not present", loc->path, - entry->d_name); - continue; - } + gf_dirent_free(&entries); + free_entries = _gf_false; + INIT_LIST_HEAD(&entries.list); + } + + /* A directory layout is fixed only after its subdirs are healed to + * any newly added bricks. If the layout is fixed before subdirs are + * healed, the newly added brick will get a non-null layout. + * Any subdirs which hash to that layout will no longer show up + * in a directory listing until they are healed. + */ + + ret = syncop_setxattr(this, loc, fix_layout, 0, NULL, NULL); + + /* In case of a race where the directory is deleted just before + * layout setxattr, the errors are updated in the layout structure. + * We can use this information to make a decision whether the directory + * is deleted entirely. + */ + if (ret == 0) { + ret = dht_dir_layout_error_check(this, loc->inode); + ret = -ret; + } + + if (ret) { + if (-ret == ENOENT || -ret == ESTALE) { + gf_msg(this->name, GF_LOG_INFO, -ret, DHT_MSG_LAYOUT_FIX_FAILED, + "Setxattr failed. Dir %s " + "renamed or removed", + loc->path); + if (conf->decommission_subvols_cnt) { + defrag->total_failures++; + } + ret = 0; + goto out; + } else { + gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_LAYOUT_FIX_FAILED, + "Setxattr failed for %s", loc->path); - entry_loc.inode->ia_type = entry->d_stat.ia_type; + defrag->total_failures++; - uuid_copy (entry_loc.gfid, entry->d_stat.ia_gfid); - if (uuid_is_null (loc->gfid)) { - gf_log (this->name, GF_LOG_ERROR, "%s/%s" - " gfid not present", loc->path, - entry->d_name); - continue; - } + if (conf->decommission_in_progress) { + defrag->defrag_status = GF_DEFRAG_STATUS_FAILED; + ret = -1; + goto out; + } + } + } - uuid_copy (entry_loc.pargfid, loc->gfid); + if (defrag->cmd != GF_DEFRAG_CMD_START_LAYOUT_FIX) { + ret = gf_defrag_process_dir(this, defrag, loc, migrate_data, &perrno); - ret = syncop_lookup (this, &entry_loc, NULL, &iatt, - NULL, NULL); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "%s" - " lookup failed", entry_loc.path); - ret = -1; - continue; - } + if (ret) { + if (perrno == ENOENT || perrno == ESTALE) { + ret = 0; + goto out; + } else { + defrag->total_failures++; - ret = syncop_setxattr (this, &entry_loc, fix_layout, - 0); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "Setxattr " - "failed for %s", entry_loc.path); - defrag->defrag_status = - GF_DEFRAG_STATUS_FAILED; - defrag->total_failures ++; - ret = -1; - goto out; - } - ret = gf_defrag_fix_layout (this, defrag, &entry_loc, - fix_layout, migrate_data); - - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "Fix layout " - "failed for %s", entry_loc.path); - defrag->total_failures++; - goto out; - } + gf_msg(this->name, GF_LOG_ERROR, 0, + DHT_MSG_DEFRAG_PROCESS_DIR_FAILED, + "gf_defrag_process_dir failed for " + "directory: %s", + loc->path); + if (conf->decommission_in_progress) { + goto out; } - gf_dirent_free (&entries); - free_entries = _gf_false; - INIT_LIST_HEAD (&entries.list); + } } + } - ret = 0; + gf_msg_trace(this->name, 0, "fix layout called on %s", loc->path); + + if (gf_defrag_settle_hash(this, defrag, loc, fix_layout) != 0) { + defrag->total_failures++; + + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SETTLE_HASH_FAILED, + "Settle hash failed for %s", loc->path); + + ret = -1; + + if (conf->decommission_in_progress) { + defrag->defrag_status = GF_DEFRAG_STATUS_FAILED; + goto out; + } + } + + ret = 0; out: - if (free_entries) - gf_dirent_free (&entries); + if (free_entries) + gf_dirent_free(&entries); - loc_wipe (&entry_loc); + loc_wipe(&entry_loc); - if (dict) - dict_unref(dict); + if (fd) + fd_unref(fd); - if (fd) - fd_unref (fd); + return ret; +} +int +dht_init_local_subvols_and_nodeuuids(xlator_t *this, dht_conf_t *conf, + loc_t *loc) +{ + dict_t *dict = NULL; + uuid_t *uuid_ptr = NULL; + int ret = -1; + int i = 0; + int j = 0; + + /* Find local subvolumes */ + ret = syncop_getxattr(this, loc, &dict, GF_REBAL_FIND_LOCAL_SUBVOL, NULL, + NULL); + if (ret && (ret != -ENODATA)) { + gf_msg(this->name, GF_LOG_ERROR, -ret, 0, + "local " + "subvolume determination failed with error: %d", + -ret); + ret = -1; + goto out; + } + + if (!ret) + goto out; + + ret = syncop_getxattr(this, loc, &dict, GF_REBAL_OLD_FIND_LOCAL_SUBVOL, + NULL, NULL); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, 0, + "local " + "subvolume determination failed with error: %d", + -ret); + ret = -1; + goto out; + } + ret = 0; + +out: + if (ret) { return ret; + } + + for (i = 0; i < conf->local_subvols_cnt; i++) { + gf_msg(this->name, GF_LOG_INFO, 0, 0, + "local subvol: " + "%s", + conf->local_subvols[i]->name); + + for (j = 0; j < conf->local_nodeuuids[i].count; j++) { + uuid_ptr = &(conf->local_nodeuuids[i].elements[j].uuid); + gf_msg(this->name, GF_LOG_INFO, 0, 0, "node uuid : %s", + uuid_utoa(*uuid_ptr)); + } + } + return ret; } +/* Functions for the rebalance estimates feature */ -int -gf_defrag_start_crawl (void *data) +uint64_t +gf_defrag_subvol_file_size(xlator_t *this, loc_t *root_loc) { - xlator_t *this = NULL; - dht_conf_t *conf = NULL; - gf_defrag_info_t *defrag = NULL; - int ret = -1; - loc_t loc = {0,}; - struct iatt iatt = {0,}; - struct iatt parent = {0,}; - dict_t *fix_layout = NULL; - dict_t *migrate_data = NULL; - dict_t *status = NULL; - glusterfs_ctx_t *ctx = NULL; - - this = data; - if (!this) - goto out; + int ret = -1; + struct statvfs buf = { + 0, + }; + + ret = syncop_statfs(this, root_loc, &buf, NULL, NULL); + if (ret) { + /* Aargh! */ + return 0; + } + return ((buf.f_blocks - buf.f_bfree) * buf.f_frsize); +} - ctx = this->ctx; - if (!ctx) - goto out; +uint64_t +gf_defrag_total_file_size(xlator_t *this, loc_t *root_loc) +{ + dht_conf_t *conf = NULL; + int i = 0; + uint64_t size_files = 0; + uint64_t total_size = 0; - conf = this->private; - if (!conf) - goto out; + conf = this->private; + if (!conf) { + return 0; + } + + for (i = 0; i < conf->local_subvols_cnt; i++) { + size_files = gf_defrag_subvol_file_size(conf->local_subvols[i], + root_loc); + total_size += size_files; + gf_msg(this->name, GF_LOG_INFO, 0, 0, + "local subvol: %s," + "cnt = %" PRIu64, + conf->local_subvols[i]->name, size_files); + } + + gf_msg(this->name, GF_LOG_INFO, 0, 0, "Total size files = %" PRIu64, + total_size); + + return total_size; +} - defrag = conf->defrag; - if (!defrag) - goto out; +static void * +dht_file_counter_thread(void *args) +{ + gf_defrag_info_t *defrag = NULL; + loc_t root_loc = { + 0, + }; + struct timespec time_to_wait = { + 0, + }; + uint64_t tmp_size = 0; + + if (!args) + return NULL; - gettimeofday (&defrag->start_time, NULL); - dht_build_root_inode (this, &defrag->root_inode); - if (!defrag->root_inode) - goto out; + defrag = (gf_defrag_info_t *)args; + dht_build_root_loc(defrag->root_inode, &root_loc); - dht_build_root_loc (defrag->root_inode, &loc); + while (defrag->defrag_status == GF_DEFRAG_STATUS_STARTED) { + timespec_now(&time_to_wait); + time_to_wait.tv_sec += 600; - /* fix-layout on '/' first */ + pthread_mutex_lock(&defrag->fc_mutex); + pthread_cond_timedwait(&defrag->fc_wakeup_cond, &defrag->fc_mutex, + &time_to_wait); - ret = syncop_lookup (this, &loc, NULL, &iatt, NULL, &parent); + pthread_mutex_unlock(&defrag->fc_mutex); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "look up on / failed"); - ret = -1; - goto out; + if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED) + break; + + tmp_size = gf_defrag_total_file_size(defrag->this, &root_loc); + + gf_log("dht", GF_LOG_INFO, "tmp data size =%" PRIu64, tmp_size); + + if (!tmp_size) { + gf_msg("dht", GF_LOG_ERROR, 0, 0, + "Failed to get " + "the total data size. Unable to estimate " + "time to complete rebalance."); + } else { + g_totalsize = tmp_size; + gf_msg_debug("dht", 0, "total data size =%" PRIu64, g_totalsize); } + } - fix_layout = dict_new (); - if (!fix_layout) { - ret = -1; - goto out; + return NULL; +} + +int +gf_defrag_estimates_cleanup(xlator_t *this, gf_defrag_info_t *defrag, + pthread_t filecnt_thread) +{ + int ret = -1; + + /* Wake up the filecounter thread. + * By now the defrag status will no longer be + * GF_DEFRAG_STATUS_STARTED so the thread will exit the loop. + */ + pthread_mutex_lock(&defrag->fc_mutex); + { + pthread_cond_broadcast(&defrag->fc_wakeup_cond); + } + pthread_mutex_unlock(&defrag->fc_mutex); + + ret = pthread_join(filecnt_thread, NULL); + if (ret) { + gf_msg("dht", GF_LOG_ERROR, ret, 0, + "file_counter_thread: pthread_join failed."); + ret = -1; + } + return ret; +} + +int +gf_defrag_estimates_init(xlator_t *this, loc_t *loc, pthread_t *filecnt_thread) +{ + int ret = -1; + dht_conf_t *conf = NULL; + gf_defrag_info_t *defrag = NULL; + + conf = this->private; + defrag = conf->defrag; + + g_totalsize = gf_defrag_total_file_size(this, loc); + if (!g_totalsize) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, + "Failed to get " + "the total data size. Unable to estimate " + "time to complete rebalance."); + goto out; + } + + ret = gf_thread_create(filecnt_thread, NULL, dht_file_counter_thread, + (void *)defrag, "dhtfcnt"); + + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, ret, 0, + "Failed to " + "create the file counter thread "); + ret = -1; + goto out; + } + ret = 0; +out: + return ret; +} + +/* Init and cleanup functions for parallel file migration*/ +int +gf_defrag_parallel_migration_init(xlator_t *this, gf_defrag_info_t *defrag, + pthread_t **tid_array, int *thread_index) +{ + int ret = -1; + int thread_spawn_count = 0; + int index = 0; + pthread_t *tid = NULL; + + if (!defrag) + goto out; + + /* Initialize global entry queue */ + defrag->queue = GF_CALLOC(1, sizeof(struct dht_container), + gf_dht_mt_container_t); + + if (!defrag->queue) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, 0, + "Failed to initialise migration queue"); + ret = -1; + goto out; + } + + INIT_LIST_HEAD(&(defrag->queue[0].list)); + + thread_spawn_count = MAX(MAX_REBAL_THREADS, 4); + + gf_msg_debug(this->name, 0, "thread_spawn_count: %d", thread_spawn_count); + + tid = GF_CALLOC(thread_spawn_count, sizeof(pthread_t), + gf_common_mt_pthread_t); + if (!tid) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, 0, + "Failed to create migration threads"); + ret = -1; + goto out; + } + defrag->current_thread_count = thread_spawn_count; + + /*Spawn Threads Here*/ + while (index < thread_spawn_count) { + ret = gf_thread_create(&(tid[index]), NULL, gf_defrag_task, + (void *)defrag, "dhtmig%d", (index + 1) & 0x3ff); + if (ret != 0) { + gf_msg("DHT", GF_LOG_ERROR, ret, 0, "Thread[%d] creation failed. ", + index); + ret = -1; + goto out; + } else { + gf_log("DHT", GF_LOG_INFO, + "Thread[%d] " + "creation successful", + index); } + index++; + } + + ret = 0; +out: + *thread_index = index; + *tid_array = tid; + + return ret; +} + +int +gf_defrag_parallel_migration_cleanup(gf_defrag_info_t *defrag, + pthread_t *tid_array, int thread_index) +{ + int ret = -1; + int i = 0; + + if (!defrag) + goto out; + + /* Wake up all migration threads */ + pthread_mutex_lock(&defrag->dfq_mutex); + { + defrag->crawl_done = 1; + + pthread_cond_broadcast(&defrag->parallel_migration_cond); + pthread_cond_broadcast(&defrag->df_wakeup_thread); + } + pthread_mutex_unlock(&defrag->dfq_mutex); + + /*Wait for all the threads to complete their task*/ + for (i = 0; i < thread_index; i++) { + pthread_join(tid_array[i], NULL); + } + + GF_FREE(tid_array); + + /* Cleanup the migration queue */ + if (defrag->queue) { + gf_dirent_free(defrag->queue[0].df_entry); + INIT_LIST_HEAD(&(defrag->queue[0].list)); + } + + GF_FREE(defrag->queue); + + ret = 0; +out: + return ret; +} + +int +gf_defrag_start_crawl(void *data) +{ + xlator_t *this = NULL; + dht_conf_t *conf = NULL; + gf_defrag_info_t *defrag = NULL; + dict_t *fix_layout = NULL; + dict_t *migrate_data = NULL; + dict_t *status = NULL; + glusterfs_ctx_t *ctx = NULL; + call_frame_t *statfs_frame = NULL; + xlator_t *old_THIS = NULL; + int ret = -1; + loc_t loc = { + 0, + }; + struct iatt iatt = { + 0, + }; + struct iatt parent = { + 0, + }; + int thread_index = 0; + pthread_t *tid = NULL; + pthread_t filecnt_thread; + gf_boolean_t fc_thread_started = _gf_false; + + this = data; + if (!this) + goto exit; + + ctx = this->ctx; + if (!ctx) + goto exit; + + conf = this->private; + if (!conf) + goto exit; + + defrag = conf->defrag; + if (!defrag) + goto exit; + + defrag->start_time = gf_time(); + + dht_build_root_inode(this, &defrag->root_inode); + if (!defrag->root_inode) + goto out; + + dht_build_root_loc(defrag->root_inode, &loc); + + /* fix-layout on '/' first */ + + ret = syncop_lookup(this, &loc, &iatt, &parent, NULL, NULL); + + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_REBALANCE_START_FAILED, + "Failed to start rebalance: look up on / failed"); + ret = -1; + goto out; + } + + old_THIS = THIS; + THIS = this; + + statfs_frame = create_frame(this, this->ctx->pool); + if (!statfs_frame) { + gf_msg(this->name, GF_LOG_ERROR, DHT_MSG_NO_MEMORY, ENOMEM, + "Insufficient memory. Frame creation failed"); + ret = -1; + goto out; + } + + /* async statfs update for honoring min-free-disk */ + dht_get_du_info(statfs_frame, this, &loc); + THIS = old_THIS; + + fix_layout = dict_new(); + if (!fix_layout) { + ret = -1; + goto out; + } + + /* + * Unfortunately, we can't do special xattrs (like fix.layout) and + * real ones in the same call currently, and changing it seems + * riskier than just doing two calls. + */ + + gf_log(this->name, GF_LOG_INFO, "%s using commit hash %u", __func__, + conf->vol_commit_hash); + + ret = dict_set_uint32(fix_layout, conf->commithash_xattr_name, + conf->vol_commit_hash); + if (ret) { + gf_log(this->name, GF_LOG_ERROR, "Failed to set %s", + conf->commithash_xattr_name); + defrag->total_failures++; + ret = -1; + goto out; + } + + ret = syncop_setxattr(this, &loc, fix_layout, 0, NULL, NULL); + if (ret) { + gf_log(this->name, GF_LOG_ERROR, + "Failed to set commit hash on %s. " + "Rebalance cannot proceed.", + loc.path); + defrag->total_failures++; + ret = -1; + goto out; + } + + /* We now return to our regularly scheduled program. */ + + ret = dict_set_str(fix_layout, GF_XATTR_FIX_LAYOUT_KEY, "yes"); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_REBALANCE_START_FAILED, + "Failed to start rebalance:" + "Failed to set dictionary value: key = %s", + GF_XATTR_FIX_LAYOUT_KEY); + defrag->total_failures++; + ret = -1; + goto out; + } + + defrag->new_commit_hash = conf->vol_commit_hash; - ret = dict_set_str (fix_layout, GF_XATTR_FIX_LAYOUT_KEY, "yes"); + ret = syncop_setxattr(this, &loc, fix_layout, 0, NULL, NULL); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_REBALANCE_FAILED, + "fix layout on %s failed", loc.path); + defrag->total_failures++; + ret = -1; + goto out; + } + + if (defrag->cmd != GF_DEFRAG_CMD_START_LAYOUT_FIX) { + /* We need to migrate files */ + + migrate_data = dict_new(); + if (!migrate_data) { + defrag->total_failures++; + ret = -1; + goto out; + } + ret = dict_set_str( + migrate_data, GF_XATTR_FILE_MIGRATE_KEY, + (defrag->cmd == GF_DEFRAG_CMD_START_FORCE) ? "force" : "non-force"); if (ret) { - gf_log (this->name, GF_LOG_ERROR, "Failed to set dict str"); - goto out; + defrag->total_failures++; + ret = -1; + goto out; } - ret = syncop_setxattr (this, &loc, fix_layout, 0); + ret = dht_init_local_subvols_and_nodeuuids(this, conf, &loc); if (ret) { - gf_log (this->name, GF_LOG_ERROR, "fix layout on %s failed", - loc.path); - defrag->total_failures++; - ret = -1; - goto out; + ret = -1; + goto out; } - if (defrag->cmd != GF_DEFRAG_CMD_START_LAYOUT_FIX) { - migrate_data = dict_new (); - if (!migrate_data) { - ret = -1; - goto out; - } - if (defrag->cmd == GF_DEFRAG_CMD_START_FORCE) - ret = dict_set_str (migrate_data, - "distribute.migrate-data", "force"); - else - ret = dict_set_str (migrate_data, - "distribute.migrate-data", - "non-force"); - if (ret) - goto out; + /* Initialise the structures required for parallel migration */ + ret = gf_defrag_parallel_migration_init(this, defrag, &tid, + &thread_index); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, "Aborting rebalance."); + goto out; } - ret = gf_defrag_fix_layout (this, defrag, &loc, fix_layout, - migrate_data); - if ((defrag->defrag_status != GF_DEFRAG_STATUS_STOPPED) && - (defrag->defrag_status != GF_DEFRAG_STATUS_FAILED)) { - defrag->defrag_status = GF_DEFRAG_STATUS_COMPLETE; + + ret = gf_defrag_estimates_init(this, &loc, &filecnt_thread); + if (ret) { + /* Not a fatal error. Allow the rebalance to proceed*/ + ret = 0; + } else { + fc_thread_started = _gf_true; } + } + ret = gf_defrag_fix_layout(this, defrag, &loc, fix_layout, migrate_data); + if (ret) { + defrag->total_failures++; + ret = -1; + goto out; + } + if (gf_defrag_settle_hash(this, defrag, &loc, fix_layout) != 0) { + defrag->total_failures++; + ret = -1; + goto out; + } + gf_log("DHT", GF_LOG_INFO, "crawling file-system completed"); out: - LOCK (&defrag->lock); - { - status = dict_new (); - gf_defrag_status_get (defrag, status); - if (ctx->notify) - ctx->notify (GF_EN_DEFRAG_STATUS, status); - if (status) - dict_unref (status); - defrag->is_exiting = 1; - } - UNLOCK (&defrag->lock); - if (defrag) { - GF_FREE (defrag); - conf->defrag = NULL; - } - - return ret; + /* We are here means crawling the entire file system is done + or something failed. Set defrag->crawl_done flag to intimate + the migrator threads to exhaust the defrag->queue and terminate*/ + + if (ret) { + defrag->defrag_status = GF_DEFRAG_STATUS_FAILED; + } + + gf_defrag_parallel_migration_cleanup(defrag, tid, thread_index); + + if ((defrag->defrag_status != GF_DEFRAG_STATUS_STOPPED) && + (defrag->defrag_status != GF_DEFRAG_STATUS_FAILED)) { + defrag->defrag_status = GF_DEFRAG_STATUS_COMPLETE; + } + + if (fc_thread_started) { + gf_defrag_estimates_cleanup(this, defrag, filecnt_thread); + } + + dht_send_rebalance_event(this, defrag->cmd, defrag->defrag_status); + + status = dict_new(); + LOCK(&defrag->lock); + { + gf_defrag_status_get(conf, status); + if (ctx && ctx->notify) + ctx->notify(GF_EN_DEFRAG_STATUS, status); + if (status) + dict_unref(status); + defrag->is_exiting = 1; + } + UNLOCK(&defrag->lock); + + GF_FREE(defrag); + conf->defrag = NULL; + + if (migrate_data) + dict_unref(migrate_data); + + if (statfs_frame) { + STACK_DESTROY(statfs_frame->root); + } +exit: + return ret; } - static int -gf_defrag_done (int ret, call_frame_t *sync_frame, void *data) +gf_defrag_done(int ret, call_frame_t *sync_frame, void *data) { - gf_listener_stop (sync_frame->this); + gf_listener_stop(sync_frame->this); - STACK_DESTROY (sync_frame->root); - kill (getpid(), SIGTERM); - return 0; + STACK_DESTROY(sync_frame->root); + kill(getpid(), SIGTERM); + return 0; } void * -gf_defrag_start (void *data) +gf_defrag_start(void *data) { - int ret = -1; - call_frame_t *frame = NULL; - dht_conf_t *conf = NULL; - gf_defrag_info_t *defrag = NULL; - xlator_t *this = NULL; - - this = data; - conf = this->private; - if (!conf) - goto out; + int ret = -1; + call_frame_t *frame = NULL; + dht_conf_t *conf = NULL; + gf_defrag_info_t *defrag = NULL; + xlator_t *this = NULL; + xlator_t *old_THIS = NULL; - defrag = conf->defrag; - if (!defrag) - goto out; + this = data; + conf = this->private; + if (!conf) + goto out; - frame = create_frame (this, this->ctx->pool); - if (!frame) - goto out; + defrag = conf->defrag; + if (!defrag) + goto out; - frame->root->pid = GF_CLIENT_PID_DEFRAG; + frame = create_frame(this, this->ctx->pool); + if (!frame) + goto out; - defrag->pid = frame->root->pid; + frame->root->pid = GF_CLIENT_PID_DEFRAG; - defrag->defrag_status = GF_DEFRAG_STATUS_STARTED; + defrag->pid = frame->root->pid; - ret = synctask_new (this->ctx->env, gf_defrag_start_crawl, - gf_defrag_done, frame, this); + defrag->defrag_status = GF_DEFRAG_STATUS_STARTED; - if (ret) - gf_log (this->name, GF_LOG_ERROR, "Could not create" - " task for rebalance"); + old_THIS = THIS; + THIS = this; + ret = synctask_new(this->ctx->env, gf_defrag_start_crawl, gf_defrag_done, + frame, this); + + if (ret) + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_REBALANCE_START_FAILED, + "Could not create task for rebalance"); + THIS = old_THIS; out: - return NULL; + return NULL; } -int -gf_defrag_status_get (gf_defrag_info_t *defrag, dict_t *dict) +uint64_t +gf_defrag_get_estimates_based_on_size(dht_conf_t *conf) { - int ret = 0; - uint64_t files = 0; - uint64_t size = 0; - uint64_t lookup = 0; - uint64_t failures = 0; - uint64_t skipped = 0; - char *status = ""; - double elapsed = 0; - struct timeval end = {0,}; - - - if (!defrag) - goto out; + gf_defrag_info_t *defrag = NULL; + double rate_processed = 0; + uint64_t total_processed = 0; + uint64_t tmp_count = 0; + uint64_t time_to_complete = 0; + double elapsed = 0; - ret = 0; - if (defrag->defrag_status == GF_DEFRAG_STATUS_NOT_STARTED) - goto out; + defrag = conf->defrag; - files = defrag->total_files; - size = defrag->total_data; - lookup = defrag->num_files_lookedup; - failures = defrag->total_failures; - skipped = defrag->skipped; + if (!g_totalsize) + goto out; - gettimeofday (&end, NULL); + elapsed = gf_time() - defrag->start_time; - elapsed = end.tv_sec - defrag->start_time.tv_sec; + /* Don't calculate the estimates for the first 10 minutes. + * It is unlikely to be accurate and estimates are not required + * if the process finishes in less than 10 mins. + */ - if (!dict) - goto log; + if (elapsed < ESTIMATE_START_INTERVAL) { + gf_msg(THIS->name, GF_LOG_INFO, 0, 0, + "Rebalance estimates will not be available for the " + "first %d seconds.", + ESTIMATE_START_INTERVAL); - ret = dict_set_uint64 (dict, "files", files); - if (ret) - gf_log (THIS->name, GF_LOG_WARNING, - "failed to set file count"); + goto out; + } - ret = dict_set_uint64 (dict, "size", size); - if (ret) - gf_log (THIS->name, GF_LOG_WARNING, - "failed to set size of xfer"); + total_processed = defrag->size_processed; - ret = dict_set_uint64 (dict, "lookups", lookup); - if (ret) - gf_log (THIS->name, GF_LOG_WARNING, - "failed to set lookedup file count"); + /* rate at which files processed */ + rate_processed = (total_processed) / elapsed; + tmp_count = g_totalsize; - ret = dict_set_int32 (dict, "status", defrag->defrag_status); - if (ret) - gf_log (THIS->name, GF_LOG_WARNING, - "failed to set status"); - if (elapsed) { - ret = dict_set_double (dict, "run-time", elapsed); - if (ret) - gf_log (THIS->name, GF_LOG_WARNING, - "failed to set run-time"); - } + if (rate_processed) { + time_to_complete = (tmp_count) / rate_processed; + + } else { + gf_msg(THIS->name, GF_LOG_ERROR, 0, 0, + "Unable to calculate estimated time for rebalance"); + } + + gf_log(THIS->name, GF_LOG_INFO, + "TIME: (size) total_processed=%" PRIu64 " tmp_cnt = %" PRIu64 + "," + "rate_processed=%f, elapsed = %f", + total_processed, tmp_count, rate_processed, elapsed); + +out: + return time_to_complete; +} - ret = dict_set_uint64 (dict, "failures", failures); - if (ret) - gf_log (THIS->name, GF_LOG_WARNING, - "failed to set failure count"); +int +gf_defrag_status_get(dht_conf_t *conf, dict_t *dict) +{ + int ret = 0; + uint64_t files = 0; + uint64_t size = 0; + uint64_t lookup = 0; + uint64_t failures = 0; + uint64_t skipped = 0; + char *status = ""; + double elapsed = 0; + uint64_t time_to_complete = 0; + uint64_t time_left = 0; + gf_defrag_info_t *defrag = conf->defrag; + + if (!defrag) + goto out; + + ret = 0; + if (defrag->defrag_status == GF_DEFRAG_STATUS_NOT_STARTED) + goto out; + + files = defrag->total_files; + size = defrag->total_data; + lookup = defrag->num_files_lookedup; + failures = defrag->total_failures; + skipped = defrag->skipped; + + elapsed = gf_time() - defrag->start_time; + + /* The rebalance is still in progress */ + + if (defrag->defrag_status == GF_DEFRAG_STATUS_STARTED) { + time_to_complete = gf_defrag_get_estimates_based_on_size(conf); + + if (time_to_complete && (time_to_complete > elapsed)) + time_left = time_to_complete - elapsed; + + gf_log(THIS->name, GF_LOG_INFO, + "TIME: Estimated total time to complete (size)= %" PRIu64 + " seconds, seconds left = %" PRIu64 "", + time_to_complete, time_left); + } + + if (!dict) + goto log; + + ret = dict_set_uint64(dict, "files", files); + if (ret) + gf_log(THIS->name, GF_LOG_WARNING, "failed to set file count"); + + ret = dict_set_uint64(dict, "size", size); + if (ret) + gf_log(THIS->name, GF_LOG_WARNING, "failed to set size of xfer"); + + ret = dict_set_uint64(dict, "lookups", lookup); + if (ret) + gf_log(THIS->name, GF_LOG_WARNING, "failed to set lookedup file count"); + + ret = dict_set_int32(dict, "status", defrag->defrag_status); + if (ret) + gf_log(THIS->name, GF_LOG_WARNING, "failed to set status"); + + ret = dict_set_double(dict, "run-time", elapsed); + if (ret) + gf_log(THIS->name, GF_LOG_WARNING, "failed to set run-time"); + + ret = dict_set_uint64(dict, "failures", failures); + if (ret) + gf_log(THIS->name, GF_LOG_WARNING, "failed to set failure count"); + + ret = dict_set_uint64(dict, "skipped", skipped); + if (ret) + gf_log(THIS->name, GF_LOG_WARNING, "failed to set skipped file count"); + + ret = dict_set_uint64(dict, "time-left", time_left); + if (ret) + gf_log(THIS->name, GF_LOG_WARNING, "failed to set time-left"); - ret = dict_set_uint64 (dict, "skipped", skipped); - if (ret) - gf_log (THIS->name, GF_LOG_WARNING, - "failed to set skipped file count"); log: - switch (defrag->defrag_status) { + switch (defrag->defrag_status) { case GF_DEFRAG_STATUS_NOT_STARTED: - status = "not started"; - break; + status = "not started"; + break; case GF_DEFRAG_STATUS_STARTED: - status = "in progress"; - break; + status = "in progress"; + break; case GF_DEFRAG_STATUS_STOPPED: - status = "stopped"; - break; + status = "stopped"; + break; case GF_DEFRAG_STATUS_COMPLETE: - status = "completed"; - break; + status = "completed"; + break; case GF_DEFRAG_STATUS_FAILED: - status = "failed"; - break; + status = "failed"; + break; default: - break; - } - - gf_log (THIS->name, GF_LOG_INFO, "Rebalance is %s. Time taken is %.2f " - "secs", status, elapsed); - gf_log (THIS->name, GF_LOG_INFO, "Files migrated: %"PRIu64", size: %" - PRIu64", lookups: %"PRIu64", failures: %"PRIu64", skipped: " - "%"PRIu64, files, size, lookup, failures, skipped); - - + break; + } + + gf_msg(THIS->name, GF_LOG_INFO, 0, DHT_MSG_REBALANCE_STATUS, + "Rebalance is %s. Time taken is %.2f secs", status, elapsed); + gf_msg(THIS->name, GF_LOG_INFO, 0, DHT_MSG_REBALANCE_STATUS, + "Files migrated: %" PRIu64 ", size: %" PRIu64 ", lookups: %" PRIu64 + ", failures: %" PRIu64 + ", skipped: " + "%" PRIu64, + files, size, lookup, failures, skipped); out: - return 0; + return 0; } int -gf_defrag_stop (gf_defrag_info_t *defrag, gf_defrag_status_t status, - dict_t *output) +gf_defrag_stop(dht_conf_t *conf, gf_defrag_status_t status, dict_t *output) { - /* TODO: set a variable 'stop_defrag' here, it should be checked - in defrag loop */ - int ret = -1; - GF_ASSERT (defrag); + /* TODO: set a variable 'stop_defrag' here, it should be checked + in defrag loop */ + int ret = -1; + gf_defrag_info_t *defrag = conf->defrag; - if (defrag->defrag_status == GF_DEFRAG_STATUS_NOT_STARTED) { - goto out; - } + GF_ASSERT(defrag); - gf_log ("", GF_LOG_INFO, "Received stop command on rebalance"); - defrag->defrag_status = status; + if (defrag->defrag_status == GF_DEFRAG_STATUS_NOT_STARTED) { + goto out; + } - if (output) - gf_defrag_status_get (defrag, output); - ret = 0; + gf_msg("", GF_LOG_INFO, 0, DHT_MSG_REBALANCE_STOPPED, + "Received stop command on rebalance"); + defrag->defrag_status = status; + + if (output) + gf_defrag_status_get(conf, output); + ret = 0; out: - gf_log ("", GF_LOG_DEBUG, "Returning %d", ret); - return ret; + gf_msg_debug("", 0, "Returning %d", ret); + return ret; } diff --git a/xlators/cluster/dht/src/dht-rename.c b/xlators/cluster/dht/src/dht-rename.c index 925538cc80c..d9dbf50492f 100644 --- a/xlators/cluster/dht/src/dht-rename.c +++ b/xlators/cluster/dht/src/dht-rename.c @@ -11,999 +11,1987 @@ /* TODO: link(oldpath, newpath) fails if newpath already exists. DHT should * delete the newpath if it gets EEXISTS from link() call. */ -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" -#include "xlator.h" #include "dht-common.h" -#include "defaults.h" +#include "dht-lock.h" +#include <glusterfs/defaults.h> +int +dht_rename_unlock(call_frame_t *frame, xlator_t *this); +int32_t +dht_rename_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata); int -dht_rename_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *stbuf, - struct iatt *preoldparent, struct iatt *postoldparent, - struct iatt *prenewparent, struct iatt *postnewparent, - dict_t *xdata) -{ - dht_local_t *local = NULL; - int this_call_cnt = 0; - call_frame_t *prev = NULL; - - local = frame->local; - prev = cookie; - - if (op_ret == -1) { - /* TODO: undo the damage */ - - gf_log (this->name, GF_LOG_INFO, - "rename %s -> %s on %s failed (%s)", - local->loc.path, local->loc2.path, - prev->this->name, strerror (op_errno)); - - local->op_ret = op_ret; - local->op_errno = op_errno; - goto unwind; - } - /* TODO: construct proper stbuf for dir */ - /* - * FIXME: is this the correct way to build stbuf and - * parent bufs? - */ - dht_iatt_merge (this, &local->stbuf, stbuf, prev->this); - dht_iatt_merge (this, &local->preoldparent, preoldparent, - prev->this); - dht_iatt_merge (this, &local->postoldparent, postoldparent, - prev->this); - dht_iatt_merge (this, &local->preparent, prenewparent, - prev->this); - dht_iatt_merge (this, &local->postparent, postnewparent, - prev->this); +dht_rename_unlock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + dht_local_t *local = NULL; + local = frame->local; -unwind: - this_call_cnt = dht_frame_return (frame); - if (is_last_call (this_call_cnt)) { - WIPE (&local->preoldparent); - WIPE (&local->postoldparent); - WIPE (&local->preparent); - WIPE (&local->postparent); - - DHT_STRIP_PHASE1_FLAGS (&local->stbuf); - DHT_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno, - &local->stbuf, &local->preoldparent, - &local->postoldparent, - &local->preparent, &local->postparent, xdata); - } + dht_set_fixed_dir_stat(&local->preoldparent); + dht_set_fixed_dir_stat(&local->postoldparent); + dht_set_fixed_dir_stat(&local->preparent); + dht_set_fixed_dir_stat(&local->postparent); - return 0; + if (IA_ISREG(local->stbuf.ia_type)) + DHT_STRIP_PHASE1_FLAGS(&local->stbuf); + + DHT_STACK_UNWIND(rename, frame, local->op_ret, local->op_errno, + &local->stbuf, &local->preoldparent, &local->postoldparent, + &local->preparent, &local->postparent, local->xattr); + return 0; } +static void +dht_rename_dir_unlock_src(call_frame_t *frame, xlator_t *this) +{ + dht_local_t *local = NULL; + + local = frame->local; + dht_unlock_namespace(frame, &local->lock[0]); + return; +} + +static void +dht_rename_dir_unlock_dst(call_frame_t *frame, xlator_t *this) +{ + dht_local_t *local = NULL; + int op_ret = -1; + char src_gfid[GF_UUID_BUF_SIZE] = {0}; + char dst_gfid[GF_UUID_BUF_SIZE] = {0}; + + local = frame->local; + + /* Unlock entrylk */ + dht_unlock_entrylk_wrapper(frame, &local->lock[1].ns.directory_ns); + + /* Unlock inodelk */ + op_ret = dht_unlock_inodelk(frame, local->lock[1].ns.parent_layout.locks, + local->lock[1].ns.parent_layout.lk_count, + dht_rename_unlock_cbk); + if (op_ret < 0) { + uuid_utoa_r(local->loc.inode->gfid, src_gfid); + + if (local->loc2.inode) + uuid_utoa_r(local->loc2.inode->gfid, dst_gfid); + + if (IA_ISREG(local->stbuf.ia_type)) + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_UNLOCKING_FAILED, + "winding unlock inodelk failed " + "rename (%s:%s:%s %s:%s:%s), " + "stale locks left on bricks", + local->loc.path, src_gfid, local->src_cached->name, + local->loc2.path, dst_gfid, + local->dst_cached ? local->dst_cached->name : NULL); + else + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_UNLOCKING_FAILED, + "winding unlock inodelk failed " + "rename (%s:%s %s:%s), " + "stale locks left on bricks", + local->loc.path, src_gfid, local->loc2.path, dst_gfid); + dht_rename_unlock_cbk(frame, NULL, this, 0, 0, NULL); + } + + return; +} + +static int +dht_rename_dir_unlock(call_frame_t *frame, xlator_t *this) +{ + dht_rename_dir_unlock_src(frame, this); + dht_rename_dir_unlock_dst(frame, this); + return 0; +} int -dht_rename_hashed_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *stbuf, - struct iatt *preoldparent, - struct iatt *postoldparent, - struct iatt *prenewparent, - struct iatt *postnewparent, dict_t *xdata) -{ - dht_conf_t *conf = NULL; - dht_local_t *local = NULL; - int call_cnt = 0; - call_frame_t *prev = NULL; - int i = 0; - - conf = this->private; - local = frame->local; - prev = cookie; - - if (op_ret == -1) { - /* TODO: undo the damage */ - - gf_log (this->name, GF_LOG_INFO, - "rename %s -> %s on %s failed (%s)", - local->loc.path, local->loc2.path, - prev->this->name, strerror (op_errno)); - - local->op_ret = op_ret; - local->op_errno = op_errno; - goto unwind; - } - /* TODO: construct proper stbuf for dir */ - /* - * FIXME: is this the correct way to build stbuf and - * parent bufs? +dht_rename_dir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *stbuf, + struct iatt *preoldparent, struct iatt *postoldparent, + struct iatt *prenewparent, struct iatt *postnewparent, + dict_t *xdata) +{ + dht_conf_t *conf = NULL; + dht_local_t *local = NULL; + int this_call_cnt = 0; + xlator_t *prev = NULL; + int i = 0; + char gfid[GF_UUID_BUF_SIZE] = {0}; + int subvol_cnt = -1; + + conf = this->private; + local = frame->local; + prev = cookie; + subvol_cnt = dht_subvol_cnt(this, prev); + local->ret_cache[subvol_cnt] = op_ret; + + if (op_ret == -1) { + gf_uuid_unparse(local->loc.inode->gfid, gfid); + + gf_msg(this->name, GF_LOG_INFO, op_errno, DHT_MSG_RENAME_FAILED, + "Rename %s -> %s on %s failed, (gfid = %s)", local->loc.path, + local->loc2.path, prev->name, gfid); + + local->op_ret = op_ret; + local->op_errno = op_errno; + goto unwind; + } + /* TODO: construct proper stbuf for dir */ + /* + * FIXME: is this the correct way to build stbuf and + * parent bufs? + */ + dht_iatt_merge(this, &local->stbuf, stbuf); + dht_iatt_merge(this, &local->preoldparent, preoldparent); + dht_iatt_merge(this, &local->postoldparent, postoldparent); + dht_iatt_merge(this, &local->preparent, prenewparent); + dht_iatt_merge(this, &local->postparent, postnewparent); + +unwind: + this_call_cnt = dht_frame_return(frame); + if (is_last_call(this_call_cnt)) { + /* We get here with local->call_cnt == 0. Which means + * we are the only one executing this code, there is + * no contention. Therefore it's safe to manipulate or + * deref local->call_cnt directly (without locking). */ - dht_iatt_merge (this, &local->stbuf, stbuf, prev->this); - dht_iatt_merge (this, &local->preoldparent, preoldparent, - prev->this); - dht_iatt_merge (this, &local->postoldparent, postoldparent, - prev->this); - dht_iatt_merge (this, &local->preparent, prenewparent, - prev->this); - dht_iatt_merge (this, &local->postparent, postnewparent, - prev->this); - - call_cnt = local->call_cnt = conf->subvolume_cnt - 1; - - if (!local->call_cnt) - goto unwind; - - for (i = 0; i < conf->subvolume_cnt; i++) { - if (conf->subvolumes[i] == local->dst_hashed) + if (local->ret_cache[conf->subvolume_cnt] == 0) { + /* count errant subvols in last field of ret_cache */ + for (i = 0; i < conf->subvolume_cnt; i++) { + if (local->ret_cache[i] != 0) + ++local->ret_cache[conf->subvolume_cnt]; + } + if (local->ret_cache[conf->subvolume_cnt]) { + /* undoing the damage: + * for all subvolumes, where rename + * succeeded, we perform the reverse operation + */ + for (i = 0; i < conf->subvolume_cnt; i++) { + if (local->ret_cache[i] == 0) + ++local->call_cnt; + } + for (i = 0; i < conf->subvolume_cnt; i++) { + if (local->ret_cache[i]) continue; - STACK_WIND (frame, dht_rename_dir_cbk, - conf->subvolumes[i], - conf->subvolumes[i]->fops->rename, - &local->loc, &local->loc2, NULL); - if (!--call_cnt) - break; - } + STACK_WIND(frame, dht_rename_dir_cbk, conf->subvolumes[i], + conf->subvolumes[i]->fops->rename, &local->loc2, + &local->loc, NULL); + } - return 0; -unwind: - WIPE (&local->preoldparent); - WIPE (&local->postoldparent); - WIPE (&local->preparent); - WIPE (&local->postparent); + return 0; + } + } - DHT_STRIP_PHASE1_FLAGS (&local->stbuf); - DHT_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno, - &local->stbuf, &local->preoldparent, - &local->postoldparent, - &local->preparent, &local->postparent, NULL); + WIPE(&local->preoldparent); + WIPE(&local->postoldparent); + WIPE(&local->preparent); + WIPE(&local->postparent); - return 0; + dht_rename_dir_unlock(frame, this); + } + + return 0; } +int +dht_rename_hashed_dir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *stbuf, + struct iatt *preoldparent, struct iatt *postoldparent, + struct iatt *prenewparent, struct iatt *postnewparent, + dict_t *xdata) +{ + dht_conf_t *conf = NULL; + dht_local_t *local = NULL; + int call_cnt = 0; + xlator_t *prev = NULL; + int i = 0; + char gfid[GF_UUID_BUF_SIZE] = {0}; + + conf = this->private; + local = frame->local; + prev = cookie; + + if (op_ret == -1) { + gf_uuid_unparse(local->loc.inode->gfid, gfid); + + gf_msg(this->name, GF_LOG_INFO, op_errno, DHT_MSG_RENAME_FAILED, + "rename %s -> %s on %s failed, (gfid = %s) ", local->loc.path, + local->loc2.path, prev->name, gfid); + + local->op_ret = op_ret; + local->op_errno = op_errno; + goto unwind; + } + /* TODO: construct proper stbuf for dir */ + /* + * FIXME: is this the correct way to build stbuf and + * parent bufs? + */ + dht_iatt_merge(this, &local->stbuf, stbuf); + dht_iatt_merge(this, &local->preoldparent, preoldparent); + dht_iatt_merge(this, &local->postoldparent, postoldparent); + dht_iatt_merge(this, &local->preparent, prenewparent); + dht_iatt_merge(this, &local->postparent, postnewparent); + + call_cnt = local->call_cnt = conf->subvolume_cnt - 1; + + if (!local->call_cnt) + goto unwind; + + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->subvolumes[i] == local->dst_hashed) + continue; + STACK_WIND_COOKIE( + frame, dht_rename_dir_cbk, conf->subvolumes[i], conf->subvolumes[i], + conf->subvolumes[i]->fops->rename, &local->loc, &local->loc2, NULL); + if (!--call_cnt) + break; + } + + return 0; +unwind: + WIPE(&local->preoldparent); + WIPE(&local->postoldparent); + WIPE(&local->preparent); + WIPE(&local->postparent); + + dht_rename_dir_unlock(frame, this); + return 0; +} int -dht_rename_dir_do (call_frame_t *frame, xlator_t *this) +dht_rename_dir_do(call_frame_t *frame, xlator_t *this) { - dht_local_t *local = NULL; + dht_local_t *local = NULL; - local = frame->local; + local = frame->local; - if (local->op_ret == -1) - goto err; + if (local->op_ret == -1) + goto err; - local->op_ret = 0; + local->op_ret = 0; - STACK_WIND (frame, dht_rename_hashed_dir_cbk, - local->dst_hashed, - local->dst_hashed->fops->rename, - &local->loc, &local->loc2, NULL); - return 0; + STACK_WIND_COOKIE(frame, dht_rename_hashed_dir_cbk, local->dst_hashed, + local->dst_hashed, local->dst_hashed->fops->rename, + &local->loc, &local->loc2, NULL); + return 0; err: - DHT_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno, NULL, NULL, - NULL, NULL, NULL, NULL); - return 0; + dht_rename_dir_unlock(frame, this); + return 0; } - int -dht_rename_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, gf_dirent_t *entries, - dict_t *xdata) -{ - dht_local_t *local = NULL; - int this_call_cnt = -1; - call_frame_t *prev = NULL; - - local = frame->local; - prev = cookie; - - if (op_ret > 2) { - gf_log (this->name, GF_LOG_TRACE, - "readdir on %s for %s returned %d entries", - prev->this->name, local->loc.path, op_ret); - local->op_ret = -1; - local->op_errno = ENOTEMPTY; - } +dht_rename_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, gf_dirent_t *entries, + dict_t *xdata) +{ + dht_local_t *local = NULL; + int this_call_cnt = -1; + xlator_t *prev = NULL; - this_call_cnt = dht_frame_return (frame); + local = frame->local; + prev = cookie; - if (is_last_call (this_call_cnt)) { - dht_rename_dir_do (frame, this); - } + if (op_ret > 2) { + gf_msg_trace(this->name, 0, "readdir on %s for %s returned %d entries", + prev->name, local->loc.path, op_ret); + local->op_ret = -1; + local->op_errno = ENOTEMPTY; + } - return 0; -} + this_call_cnt = dht_frame_return(frame); + if (is_last_call(this_call_cnt)) { + dht_rename_dir_do(frame, this); + } + + return 0; +} int -dht_rename_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, fd_t *fd, dict_t *xdata) +dht_rename_opendir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, fd_t *fd, dict_t *xdata) { - dht_local_t *local = NULL; - int this_call_cnt = -1; - call_frame_t *prev = NULL; - + dht_local_t *local = NULL; + int this_call_cnt = -1; + xlator_t *prev = NULL; + char gfid[GF_UUID_BUF_SIZE] = {0}; - local = frame->local; - prev = cookie; + local = frame->local; + prev = cookie; - if (op_ret == -1) { - gf_log (this->name, GF_LOG_INFO, - "opendir on %s for %s failed (%s)", - prev->this->name, local->loc.path, - strerror (op_errno)); - goto err; - } + if (op_ret == -1) { + gf_uuid_unparse(local->loc.inode->gfid, gfid); + gf_msg(this->name, GF_LOG_INFO, op_errno, DHT_MSG_OPENDIR_FAILED, + "opendir on %s for %s failed,(gfid = %s) ", prev->name, + local->loc.path, gfid); + goto err; + } - STACK_WIND (frame, dht_rename_readdir_cbk, - prev->this, prev->this->fops->readdir, - local->fd, 4096, 0, NULL); + fd_bind(fd); + STACK_WIND_COOKIE(frame, dht_rename_readdir_cbk, prev, prev, + prev->fops->readdir, local->fd, 4096, 0, NULL); - return 0; + return 0; err: - this_call_cnt = dht_frame_return (frame); + this_call_cnt = dht_frame_return(frame); - if (is_last_call (this_call_cnt)) { - dht_rename_dir_do (frame, this); - } + if (is_last_call(this_call_cnt)) { + dht_rename_dir_do(frame, this); + } - return 0; + return 0; } - int -dht_rename_dir (call_frame_t *frame, xlator_t *this) +dht_rename_dir_lock2_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - dht_conf_t *conf = NULL; - dht_local_t *local = NULL; - int i = 0; - int op_errno = -1; + dht_local_t *local = NULL; + char src_gfid[GF_UUID_BUF_SIZE] = {0}; + char dst_gfid[GF_UUID_BUF_SIZE] = {0}; + dht_conf_t *conf = NULL; + int i = 0; + + local = frame->local; + conf = this->private; + + if (op_ret < 0) { + uuid_utoa_r(local->loc.inode->gfid, src_gfid); + + if (local->loc2.inode) + uuid_utoa_r(local->loc2.inode->gfid, dst_gfid); + + gf_msg(this->name, GF_LOG_WARNING, op_errno, DHT_MSG_INODE_LK_ERROR, + "acquiring entrylk after inodelk failed" + "rename (%s:%s:%s %s:%s:%s)", + local->loc.path, src_gfid, local->src_cached->name, + local->loc2.path, dst_gfid, + local->dst_cached ? local->dst_cached->name : NULL); + + local->op_ret = -1; + local->op_errno = op_errno; + goto err; + } + + local->fd = fd_create(local->loc.inode, frame->root->pid); + if (!local->fd) { + op_errno = ENOMEM; + goto err; + } + + local->op_ret = 0; + + if (!local->dst_cached) { + dht_rename_dir_do(frame, this); + return 0; + } + for (i = 0; i < conf->subvolume_cnt; i++) { + STACK_WIND_COOKIE(frame, dht_rename_opendir_cbk, conf->subvolumes[i], + conf->subvolumes[i], + conf->subvolumes[i]->fops->opendir, &local->loc2, + local->fd, NULL); + } - conf = frame->this->private; - local = frame->local; + return 0; - local->call_cnt = conf->subvolume_cnt; +err: + /* No harm in calling an extra unlock */ + dht_rename_dir_unlock(frame, this); + return 0; +} - for (i = 0; i < conf->subvolume_cnt; i++) { - if (!conf->subvolume_status[i]) { - gf_log (this->name, GF_LOG_INFO, - "one of the subvolumes down (%s)", - conf->subvolumes[i]->name); - op_errno = ENOTCONN; - goto err; - } - } +int +dht_rename_dir_lock1_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + dht_local_t *local = NULL; + char src_gfid[GF_UUID_BUF_SIZE] = {0}; + char dst_gfid[GF_UUID_BUF_SIZE] = {0}; + int ret = 0; + loc_t *loc = NULL; + xlator_t *subvol = NULL; + + local = frame->local; + + if (op_ret < 0) { + uuid_utoa_r(local->loc.inode->gfid, src_gfid); + + if (local->loc2.inode) + uuid_utoa_r(local->loc2.inode->gfid, dst_gfid); + + gf_msg(this->name, GF_LOG_WARNING, op_errno, DHT_MSG_INODE_LK_ERROR, + "acquiring entrylk after inodelk failed" + "rename (%s:%s:%s %s:%s:%s)", + local->loc.path, src_gfid, local->src_cached->name, + local->loc2.path, dst_gfid, + local->dst_cached ? local->dst_cached->name : NULL); + + local->op_ret = -1; + local->op_errno = op_errno; + goto err; + } + + if (local->current == &local->lock[0]) { + loc = &local->loc2; + subvol = local->dst_hashed; + local->current = &local->lock[1]; + } else { + loc = &local->loc; + subvol = local->src_hashed; + local->current = &local->lock[0]; + } + ret = dht_protect_namespace(frame, loc, subvol, &local->current->ns, + dht_rename_dir_lock2_cbk); + if (ret < 0) { + op_errno = EINVAL; + goto err; + } + + return 0; +err: + /* No harm in calling an extra unlock */ + dht_rename_dir_unlock(frame, this); + return 0; +} - local->fd = fd_create (local->loc.inode, frame->root->pid); - if (!local->fd) { - op_errno = ENOMEM; - goto err; +/* + * If the hashed subvolumes of both source and dst are the different, + * lock in dictionary order of hashed subvol->name. This is important + * in case the parent directory is the same for both src and dst to + * prevent inodelk deadlocks when racing with a fix-layout op on the parent. + * + * If the hashed subvols are the same, use the gfid/name to determine + * the order of taking locks to prevent entrylk deadlocks when the parent + * dirs are the same. + * + */ +static int +dht_order_rename_lock(call_frame_t *frame, loc_t **loc, xlator_t **subvol) +{ + int ret = 0; + int op_ret = 0; + dht_local_t *local = NULL; + char *src = NULL; + char *dst = NULL; + + local = frame->local; + + if (local->src_hashed->name == local->dst_hashed->name) { + ret = 0; + } else { + ret = strcmp(local->src_hashed->name, local->dst_hashed->name); + } + + if (ret == 0) { + /* hashed subvols are the same for src and dst */ + /* Entrylks need to be ordered*/ + + src = alloca(GF_UUID_BNAME_BUF_SIZE + strlen(local->loc.name) + 1); + if (!src) { + gf_msg(frame->this->name, GF_LOG_ERROR, ENOMEM, 0, + "Insufficient memory for src"); + op_ret = -1; + goto out; } - local->op_ret = 0; + if (!gf_uuid_is_null(local->loc.pargfid)) + uuid_utoa_r(local->loc.pargfid, src); + else if (local->loc.parent) + uuid_utoa_r(local->loc.parent->gfid, src); + else + src[0] = '\0'; - if (!local->dst_cached) { - dht_rename_dir_do (frame, this); - return 0; - } + strcat(src, local->loc.name); - for (i = 0; i < conf->subvolume_cnt; i++) { - STACK_WIND (frame, dht_rename_opendir_cbk, - conf->subvolumes[i], - conf->subvolumes[i]->fops->opendir, - &local->loc2, local->fd, NULL); + dst = alloca(GF_UUID_BNAME_BUF_SIZE + strlen(local->loc2.name) + 1); + if (!dst) { + gf_msg(frame->this->name, GF_LOG_ERROR, ENOMEM, 0, + "Insufficient memory for dst"); + op_ret = -1; + goto out; } - return 0; + if (!gf_uuid_is_null(local->loc2.pargfid)) + uuid_utoa_r(local->loc2.pargfid, dst); + else if (local->loc2.parent) + uuid_utoa_r(local->loc2.parent->gfid, dst); + else + dst[0] = '\0'; + + strcat(dst, local->loc2.name); + ret = strcmp(src, dst); + } + + if (ret <= 0) { + /*inodelk in dictionary order of hashed subvol names*/ + /*entrylk in dictionary order of gfid/basename */ + local->current = &local->lock[0]; + *loc = &local->loc; + *subvol = local->src_hashed; + + } else { + local->current = &local->lock[1]; + *loc = &local->loc2; + *subvol = local->dst_hashed; + } + + op_ret = 0; + +out: + return op_ret; +} + +int +dht_rename_dir(call_frame_t *frame, xlator_t *this) +{ + dht_conf_t *conf = NULL; + dht_local_t *local = NULL; + loc_t *loc = NULL; + xlator_t *subvol = NULL; + int i = 0; + int ret = 0; + int op_errno = -1; + + conf = frame->this->private; + local = frame->local; + + local->ret_cache = GF_CALLOC(conf->subvolume_cnt + 1, sizeof(int), + gf_dht_ret_cache_t); + + if (local->ret_cache == NULL) { + op_errno = ENOMEM; + goto err; + } + + local->call_cnt = conf->subvolume_cnt; + + for (i = 0; i < conf->subvolume_cnt; i++) { + if (!conf->subvolume_status[i]) { + gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_RENAME_FAILED, + "Rename dir failed: subvolume down (%s)", + conf->subvolumes[i]->name); + op_errno = ENOTCONN; + goto err; + } + } + + /* Locks on src and dst needs to ordered which otherwise might cause + * deadlocks when rename (src, dst) and rename (dst, src) is done from + * two different clients + */ + ret = dht_order_rename_lock(frame, &loc, &subvol); + if (ret) { + op_errno = ENOMEM; + goto err; + } + + /* Rename must take locks on src to avoid lookup selfheal from + * recreating src on those subvols where the rename was successful. + * The locks can't be issued parallel as two different clients might + * attempt same rename command and be in dead lock. + */ + ret = dht_protect_namespace(frame, loc, subvol, &local->current->ns, + dht_rename_dir_lock1_cbk); + if (ret < 0) { + op_errno = EINVAL; + goto err; + } + + return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (rename, frame, -1, op_errno, NULL, NULL, NULL, NULL, - NULL, NULL); - return 0; + DHT_STACK_UNWIND(rename, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL, + NULL); + return 0; +} + +static int +dht_rename_track_for_changelog(xlator_t *this, dict_t *xattr, loc_t *oldloc, + loc_t *newloc) +{ + int ret = -1; + dht_changelog_rename_info_t *info = NULL; + char *name = NULL; + int len1 = 0; + int len2 = 0; + int size = 0; + + if (!xattr || !oldloc || !newloc || !this) + return ret; + + len1 = strlen(oldloc->name) + 1; + len2 = strlen(newloc->name) + 1; + size = sizeof(dht_changelog_rename_info_t) + len1 + len2; + + info = GF_CALLOC(size, sizeof(char), gf_common_mt_char); + if (!info) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, + "Failed to calloc memory"); + return ret; + } + + gf_uuid_copy(info->old_pargfid, oldloc->pargfid); + gf_uuid_copy(info->new_pargfid, newloc->pargfid); + + info->oldname_len = len1; + info->newname_len = len2; + strncpy(info->buffer, oldloc->name, len1); + name = info->buffer + len1; + strncpy(name, newloc->name, len2); + + ret = dict_set_bin(xattr, DHT_CHANGELOG_RENAME_OP_KEY, info, size); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, + "Failed to set dictionary value: key = %s," + " path = %s", + DHT_CHANGELOG_RENAME_OP_KEY, oldloc->name); + GF_FREE(info); + } + + return ret; +} + +#define DHT_MARKER_DONT_ACCOUNT(xattr) \ + do { \ + int tmp = -1; \ + if (!xattr) { \ + xattr = dict_new(); \ + if (!xattr) \ + break; \ + } \ + tmp = dict_set_str(xattr, GLUSTERFS_MARKER_DONT_ACCOUNT_KEY, "yes"); \ + if (tmp) { \ + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, \ + "Failed to set dictionary value: key = %s," \ + " path = %s", \ + GLUSTERFS_MARKER_DONT_ACCOUNT_KEY, local->loc.path); \ + } \ + } while (0) + +#define DHT_CHANGELOG_TRACK_AS_RENAME(xattr, oldloc, newloc) \ + do { \ + int tmp = -1; \ + if (!xattr) { \ + xattr = dict_new(); \ + if (!xattr) { \ + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, \ + "Failed to create dictionary to " \ + "track rename"); \ + break; \ + } \ + } \ + \ + tmp = dht_rename_track_for_changelog(this, xattr, oldloc, newloc); \ + \ + if (tmp) { \ + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, \ + "Failed to set dictionary value: key = %s," \ + " path = %s", \ + DHT_CHANGELOG_RENAME_OP_KEY, (oldloc)->path); \ + } \ + } while (0) + +int +dht_rename_unlock(call_frame_t *frame, xlator_t *this) +{ + dht_local_t *local = NULL; + int op_ret = -1; + char src_gfid[GF_UUID_BUF_SIZE] = {0}; + char dst_gfid[GF_UUID_BUF_SIZE] = {0}; + dht_ilock_wrap_t inodelk_wrapper = { + 0, + }; + + local = frame->local; + inodelk_wrapper.locks = local->rename_inodelk_backward_compatible; + inodelk_wrapper.lk_count = local->rename_inodelk_bc_count; + + op_ret = dht_unlock_inodelk_wrapper(frame, &inodelk_wrapper); + if (op_ret < 0) { + uuid_utoa_r(local->loc.inode->gfid, src_gfid); + + if (local->loc2.inode) + uuid_utoa_r(local->loc2.inode->gfid, dst_gfid); + + if (IA_ISREG(local->stbuf.ia_type)) + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_UNLOCKING_FAILED, + "winding unlock inodelk failed " + "rename (%s:%s:%s %s:%s:%s), " + "stale locks left on bricks", + local->loc.path, src_gfid, local->src_cached->name, + local->loc2.path, dst_gfid, + local->dst_cached ? local->dst_cached->name : NULL); + else + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_UNLOCKING_FAILED, + "winding unlock inodelk failed " + "rename (%s:%s %s:%s), " + "stale locks left on bricks", + local->loc.path, src_gfid, local->loc2.path, dst_gfid); + } + + dht_unlock_namespace(frame, &local->lock[0]); + dht_unlock_namespace(frame, &local->lock[1]); + + dht_rename_unlock_cbk(frame, NULL, this, local->op_ret, local->op_errno, + NULL); + return 0; } -#define DHT_MARK_FOP_INTERNAL(xattr) do { \ - int tmp = -1; \ - if (!xattr) { \ - xattr = dict_new (); \ - if (!xattr) \ - break; \ - } \ - tmp = dict_set_str (xattr, GLUSTERFS_INTERNAL_FOP_KEY, "yes"); \ - if (tmp) { \ - gf_log (this->name, GF_LOG_ERROR, "Failed to set" \ - " internal dict key for %s", local->loc.path); \ - } \ - }while (0) - -#define DHT_MARKER_DONT_ACCOUNT(xattr) do { \ - int tmp = -1; \ - if (!xattr) { \ - xattr = dict_new (); \ - if (!xattr) \ - break; \ - } \ - tmp = dict_set_str (xattr, GLUSTERFS_MARKER_DONT_ACCOUNT_KEY, \ - "yes"); \ - if (tmp) { \ - gf_log (this->name, GF_LOG_ERROR, "Failed to set" \ - " marker dont account key for %s", local->loc.path); \ - } \ - }while (0) int -dht_rename_done (call_frame_t *frame, xlator_t *this) +dht_rename_done(call_frame_t *frame, xlator_t *this) { - dht_local_t *local = NULL; + dht_local_t *local = NULL; - local = frame->local; + local = frame->local; - if (local->linked == _gf_true) { - local->linked = _gf_false; - dht_linkfile_attr_heal (frame, this); - } - DHT_STRIP_PHASE1_FLAGS (&local->stbuf); - DHT_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno, - &local->stbuf, &local->preoldparent, - &local->postoldparent, &local->preparent, - &local->postparent, NULL); + if (local->linked == _gf_true) { + local->linked = _gf_false; + dht_linkfile_attr_heal(frame, this); + } - return 0; + dht_rename_unlock(frame, this); + return 0; } int -dht_rename_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) +dht_rename_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - dht_local_t *local = NULL; - call_frame_t *prev = NULL; - int this_call_cnt = 0; + dht_local_t *local = NULL; + xlator_t *prev = NULL; + int this_call_cnt = 0; - local = frame->local; - prev = cookie; + local = frame->local; + prev = cookie; - if (!local) { - gf_log (this->name, GF_LOG_ERROR, - "!local, should not happen"); - goto out; - } + FRAME_SU_UNDO(frame, dht_local_t); + if (!local) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INVALID_VALUE, + "!local, should not happen"); + goto out; + } - this_call_cnt = dht_frame_return (frame); + this_call_cnt = dht_frame_return(frame); - if (op_ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "%s: unlink on %s failed (%s)", - local->loc.path, prev->this->name, strerror (op_errno)); - } + if (op_ret == -1) { + gf_msg(this->name, GF_LOG_WARNING, op_errno, DHT_MSG_UNLINK_FAILED, + "%s: Rename: unlink on %s failed ", local->loc.path, prev->name); + } - WIPE (&local->preoldparent); - WIPE (&local->postoldparent); - WIPE (&local->preparent); - WIPE (&local->postparent); + WIPE(&local->preoldparent); + WIPE(&local->postoldparent); + WIPE(&local->preparent); + WIPE(&local->postparent); - if (is_last_call (this_call_cnt)) { - dht_rename_done (frame, this); - } + if (is_last_call(this_call_cnt)) { + dht_rename_done(frame, this); + } out: - return 0; + return 0; } - int -dht_rename_cleanup (call_frame_t *frame) +dht_rename_cleanup(call_frame_t *frame) { - dht_local_t *local = NULL; - xlator_t *this = NULL; - xlator_t *src_hashed = NULL; - xlator_t *src_cached = NULL; - xlator_t *dst_hashed = NULL; - xlator_t *dst_cached = NULL; - int call_cnt = 0; - dict_t *xattr = NULL; + dht_local_t *local = NULL; + xlator_t *this = NULL; + xlator_t *src_hashed = NULL; + xlator_t *src_cached = NULL; + xlator_t *dst_hashed = NULL; + xlator_t *dst_cached = NULL; + int call_cnt = 0; + dict_t *xattr = NULL; + char gfid[GF_UUID_BUF_SIZE] = {0}; - local = frame->local; - this = frame->this; + local = frame->local; + this = frame->this; - src_hashed = local->src_hashed; - src_cached = local->src_cached; - dst_hashed = local->dst_hashed; - dst_cached = local->dst_cached; + src_hashed = local->src_hashed; + src_cached = local->src_cached; + dst_hashed = local->dst_hashed; + dst_cached = local->dst_cached; - if (src_cached == dst_cached) - goto nolinks; + if (src_cached == dst_cached) + goto nolinks; - if (dst_hashed != src_hashed && dst_hashed != src_cached) - call_cnt++; + if (local->linked && (dst_hashed != src_hashed) && + (dst_hashed != src_cached)) { + call_cnt++; + } - if (src_cached != dst_hashed) - call_cnt++; + if (local->added_link && (src_cached != dst_hashed)) { + call_cnt++; + } - local->call_cnt = call_cnt; + local->call_cnt = call_cnt; - if (!call_cnt) - goto nolinks; + if (!call_cnt) + goto nolinks; - DHT_MARK_FOP_INTERNAL (xattr); + DHT_MARK_FOP_INTERNAL(xattr); - if (dst_hashed != src_hashed && dst_hashed != src_cached) { - dict_t *xattr_new = NULL; + gf_uuid_unparse(local->loc.inode->gfid, gfid); - gf_log (this->name, GF_LOG_TRACE, - "unlinking linkfile %s @ %s => %s", - local->loc.path, dst_hashed->name, src_cached->name); + if (local->linked && (dst_hashed != src_hashed) && + (dst_hashed != src_cached)) { + dict_t *xattr_new = NULL; - xattr_new = dict_copy_with_ref (xattr, NULL); + gf_msg_trace(this->name, 0, + "unlinking linkfile %s @ %s => %s, (gfid = %s)", + local->loc.path, dst_hashed->name, src_cached->name, gfid); + xattr_new = dict_copy_with_ref(xattr, NULL); - DHT_MARKER_DONT_ACCOUNT(xattr_new); + DHT_MARKER_DONT_ACCOUNT(xattr_new); - STACK_WIND (frame, dht_rename_unlink_cbk, - dst_hashed, dst_hashed->fops->unlink, - &local->loc, 0, xattr_new); + FRAME_SU_DO(frame, dht_local_t); + STACK_WIND_COOKIE(frame, dht_rename_unlink_cbk, dst_hashed, dst_hashed, + dst_hashed->fops->unlink, &local->loc, 0, xattr_new); - dict_unref (xattr_new); - xattr_new = NULL; - } + dict_unref(xattr_new); + xattr_new = NULL; + } - if (src_cached != dst_hashed) { - dict_t *xattr_new = NULL; + if (local->added_link && (src_cached != dst_hashed)) { + dict_t *xattr_new = NULL; - gf_log (this->name, GF_LOG_TRACE, - "unlinking link %s => %s (%s)", local->loc.path, - local->loc2.path, src_cached->name); + gf_msg_trace(this->name, 0, "unlinking link %s => %s (%s), (gfid = %s)", + local->loc.path, local->loc2.path, src_cached->name, gfid); - xattr_new = dict_copy_with_ref (xattr, NULL); + xattr_new = dict_copy_with_ref(xattr, NULL); - if (uuid_compare (local->loc.pargfid, - local->loc2.pargfid) == 0) { - DHT_MARKER_DONT_ACCOUNT(xattr_new); - } - - STACK_WIND (frame, dht_rename_unlink_cbk, - src_cached, src_cached->fops->unlink, - &local->loc2, 0, xattr_new); - - dict_unref (xattr_new); - xattr_new = NULL; + if (gf_uuid_compare(local->loc.pargfid, local->loc2.pargfid) == 0) { + DHT_MARKER_DONT_ACCOUNT(xattr_new); } + /* * + * The link to file is created using root permission. + * Hence deletion should happen using root. Otherwise + * it will fail. + */ + FRAME_SU_DO(frame, dht_local_t); + STACK_WIND_COOKIE(frame, dht_rename_unlink_cbk, src_cached, src_cached, + src_cached->fops->unlink, &local->loc2, 0, xattr_new); - if (xattr) - dict_unref (xattr); + dict_unref(xattr_new); + xattr_new = NULL; + } - return 0; + if (xattr) + dict_unref(xattr); -nolinks: - WIPE (&local->preoldparent); - WIPE (&local->postoldparent); - WIPE (&local->preparent); - WIPE (&local->postparent); + return 0; - DHT_STRIP_PHASE1_FLAGS (&local->stbuf); - DHT_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno, - &local->stbuf, &local->preoldparent, - &local->postoldparent, &local->preparent, - &local->postparent, NULL); +nolinks: + WIPE(&local->preoldparent); + WIPE(&local->postoldparent); + WIPE(&local->preparent); + WIPE(&local->postparent); - return 0; + dht_rename_unlock(frame, this); + return 0; } - int -dht_rename_links_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *stbuf, - struct iatt *preparent, struct iatt *postparent, - dict_t *xdata) -{ - call_frame_t *prev = NULL; - dht_local_t *local = NULL; - - prev = cookie; - local = frame->local; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "link/file %s on %s failed (%s)", - local->loc.path, prev->this->name, strerror (op_errno)); - } +dht_rename_unlink(call_frame_t *frame, xlator_t *this) +{ + dht_local_t *local = NULL; + xlator_t *src_hashed = NULL; + xlator_t *src_cached = NULL; + xlator_t *dst_hashed = NULL; + xlator_t *dst_cached = NULL; + xlator_t *rename_subvol = NULL; + dict_t *xattr = NULL; - if (local->linked == _gf_true) { - local->linked = _gf_false; - dht_linkfile_attr_heal (frame, this); - } - DHT_STACK_DESTROY (frame); + local = frame->local; - return 0; -} + src_hashed = local->src_hashed; + src_cached = local->src_cached; + dst_hashed = local->dst_hashed; + dst_cached = local->dst_cached; + local->call_cnt = 0; -int -dht_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *stbuf, - struct iatt *preoldparent, struct iatt *postoldparent, - struct iatt *prenewparent, struct iatt *postnewparent, - dict_t *xdata) -{ - dht_local_t *local = NULL; - call_frame_t *prev = NULL; - xlator_t *src_hashed = NULL; - xlator_t *src_cached = NULL; - xlator_t *dst_hashed = NULL; - xlator_t *dst_cached = NULL; - xlator_t *rename_subvol = NULL; - call_frame_t *link_frame = NULL; - dht_local_t *link_local = NULL; - dict_t *xattr = NULL; - - local = frame->local; - prev = cookie; - - src_hashed = local->src_hashed; - src_cached = local->src_cached; - dst_hashed = local->dst_hashed; - dst_cached = local->dst_cached; - - if (local->linked == _gf_true) - FRAME_SU_UNDO (frame, dht_local_t); - if (op_ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "%s: rename on %s failed (%s)", local->loc.path, - prev->this->name, strerror (op_errno)); - local->op_ret = op_ret; - local->op_errno = op_errno; - goto cleanup; - } + /* NOTE: rename_subvol is the same subvolume from which dht_rename_cbk + * is called. since rename has already happened on rename_subvol, + * unlink shouldn't be sent for oldpath (either linkfile or cached-file) + * on rename_subvol. */ + if (src_cached == dst_cached) + rename_subvol = src_cached; + else + rename_subvol = dst_hashed; - if ((src_cached == dst_cached) && (dst_hashed != dst_cached)) { - link_frame = copy_frame (frame); - if (!link_frame) { - goto err; - } + /* TODO: delete files in background */ - /* fop value sent as maxvalue because it is not used - anywhere in this case */ - link_local = dht_local_init (link_frame, &local->loc2, NULL, - GF_FOP_MAXVALUE); - if (!link_local) { - goto err; - } + if (src_cached != dst_hashed && src_cached != dst_cached) + local->call_cnt++; - if (link_local->loc.inode) - inode_unref (link_local->loc.inode); - link_local->loc.inode = inode_ref (local->loc.inode); - uuid_copy (link_local->gfid, local->loc.inode->gfid); + if (src_hashed != rename_subvol && src_hashed != src_cached) + local->call_cnt++; - dht_linkfile_create (link_frame, dht_rename_links_create_cbk, - this, src_cached, dst_hashed, - &link_local->loc); - } + if (dst_cached && dst_cached != dst_hashed && dst_cached != src_cached) + local->call_cnt++; -err: - /* Merge attrs only from src_cached. In case there of src_cached != - * dst_hashed, this ignores linkfile attrs. */ - if (prev->this == src_cached) { - dht_iatt_merge (this, &local->stbuf, stbuf, prev->this); - dht_iatt_merge (this, &local->preoldparent, preoldparent, - prev->this); - dht_iatt_merge (this, &local->postoldparent, postoldparent, - prev->this); - dht_iatt_merge (this, &local->preparent, prenewparent, - prev->this); - dht_iatt_merge (this, &local->postparent, postnewparent, - prev->this); - } + if (local->call_cnt == 0) + goto unwind; + DHT_MARK_FOP_INTERNAL(xattr); - /* NOTE: rename_subvol is the same subvolume from which dht_rename_cbk - * is called. since rename has already happened on rename_subvol, - * unlink should not be sent for oldpath (either linkfile or cached-file) - * on rename_subvol. */ - if (src_cached == dst_cached) - rename_subvol = src_cached; - else - rename_subvol = dst_hashed; + if (src_cached != dst_hashed && src_cached != dst_cached) { + dict_t *xattr_new = NULL; - /* TODO: delete files in background */ + xattr_new = dict_copy_with_ref(xattr, NULL); - if (src_cached != dst_hashed && src_cached != dst_cached) - local->call_cnt++; + gf_msg_trace(this->name, 0, "deleting old src datafile %s @ %s", + local->loc.path, src_cached->name); - if (src_hashed != rename_subvol && src_hashed != src_cached) - local->call_cnt++; + if (gf_uuid_compare(local->loc.pargfid, local->loc2.pargfid) == 0) { + DHT_MARKER_DONT_ACCOUNT(xattr_new); + } - if (dst_cached && dst_cached != dst_hashed && dst_cached != src_cached) - local->call_cnt++; + DHT_CHANGELOG_TRACK_AS_RENAME(xattr_new, &local->loc, &local->loc2); + STACK_WIND_COOKIE(frame, dht_rename_unlink_cbk, src_cached, src_cached, + src_cached->fops->unlink, &local->loc, 0, xattr_new); - if (local->call_cnt == 0) - goto unwind; + dict_unref(xattr_new); + xattr_new = NULL; + } - DHT_MARK_FOP_INTERNAL (xattr); + if (src_hashed != rename_subvol && src_hashed != src_cached) { + dict_t *xattr_new = NULL; - if (src_cached != dst_hashed && src_cached != dst_cached) { - dict_t *xattr_new = NULL; + xattr_new = dict_copy_with_ref(xattr, NULL); - xattr_new = dict_copy_with_ref (xattr, NULL); + gf_msg_trace(this->name, 0, "deleting old src linkfile %s @ %s", + local->loc.path, src_hashed->name); - gf_log (this->name, GF_LOG_TRACE, - "deleting old src datafile %s @ %s", - local->loc.path, src_cached->name); + DHT_MARKER_DONT_ACCOUNT(xattr_new); - if (uuid_compare (local->loc.pargfid, - local->loc2.pargfid) == 0) { - DHT_MARKER_DONT_ACCOUNT(xattr_new); - } + STACK_WIND_COOKIE(frame, dht_rename_unlink_cbk, src_hashed, src_hashed, + src_hashed->fops->unlink, &local->loc, 0, xattr_new); - STACK_WIND (frame, dht_rename_unlink_cbk, - src_cached, src_cached->fops->unlink, - &local->loc, 0, xattr_new); + dict_unref(xattr_new); + xattr_new = NULL; + } - dict_unref (xattr_new); - xattr_new = NULL; - } + if (dst_cached && (dst_cached != dst_hashed) && + (dst_cached != src_cached)) { + gf_msg_trace(this->name, 0, "deleting old dst datafile %s @ %s", + local->loc2.path, dst_cached->name); - if (src_hashed != rename_subvol && src_hashed != src_cached) { - dict_t *xattr_new = NULL; + STACK_WIND_COOKIE(frame, dht_rename_unlink_cbk, dst_cached, dst_cached, + dst_cached->fops->unlink, &local->loc2, 0, xattr); + } + if (xattr) + dict_unref(xattr); + return 0; - xattr_new = dict_copy_with_ref (xattr, NULL); +unwind: + WIPE(&local->preoldparent); + WIPE(&local->postoldparent); + WIPE(&local->preparent); + WIPE(&local->postparent); - gf_log (this->name, GF_LOG_TRACE, - "deleting old src linkfile %s @ %s", - local->loc.path, src_hashed->name); + dht_rename_done(frame, this); - DHT_MARKER_DONT_ACCOUNT(xattr_new); + return 0; +} - STACK_WIND (frame, dht_rename_unlink_cbk, - src_hashed, src_hashed->fops->unlink, - &local->loc, 0, xattr_new); +int +dht_rename_links_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *stbuf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + xlator_t *prev = NULL; + dht_local_t *local = NULL; + call_frame_t *main_frame = NULL; + + prev = cookie; + local = frame->local; + main_frame = local->main_frame; + + /* TODO: Handle this case in lookup-optimize */ + if (op_ret == -1) { + gf_msg(this->name, GF_LOG_WARNING, op_errno, DHT_MSG_CREATE_LINK_FAILED, + "link/file %s on %s failed", local->loc.path, prev->name); + } + + if (local->linked == _gf_true) { + local->linked = _gf_false; + dht_linkfile_attr_heal(frame, this); + } + + dht_rename_unlink(main_frame, this); + DHT_STACK_DESTROY(frame); + return 0; +} - dict_unref (xattr_new); - xattr_new = NULL; +int +dht_rename_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *stbuf, + struct iatt *preoldparent, struct iatt *postoldparent, + struct iatt *prenewparent, struct iatt *postnewparent, + dict_t *xdata) +{ + dht_local_t *local = NULL; + xlator_t *prev = NULL; + xlator_t *src_cached = NULL; + xlator_t *dst_hashed = NULL; + xlator_t *dst_cached = NULL; + call_frame_t *link_frame = NULL; + dht_local_t *link_local = NULL; + + local = frame->local; + prev = cookie; + + src_cached = local->src_cached; + dst_hashed = local->dst_hashed; + dst_cached = local->dst_cached; + + if (local->linked == _gf_true) + FRAME_SU_UNDO(frame, dht_local_t); + + /* It is a critical failure iff we fail to rename the cached file + * if the rename of the linkto failed, it is not a critical failure, + * and we do not want to lose the created hard link for the new + * name as that could have been read by other clients. + * + * NOTE: If another client is attempting the same oldname -> newname + * rename, and finds both file names as existing, and are hard links + * to each other, then FUSE would send in an unlink for oldname. In + * this time duration if we treat the linkto as a critical error and + * unlink the newname we created, we would have effectively lost the + * file to rename operations. + * + * Repercussions of treating this as a non-critical error is that + * we could leave behind a stale linkto file and/or not create the new + * linkto file, the second case would be rectified by a subsequent + * lookup, the first case by a rebalance, like for all stale linkto + * files */ + + if (op_ret == -1) { + /* Critical failure: unable to rename the cached file */ + if (prev == src_cached) { + gf_msg(this->name, GF_LOG_WARNING, op_errno, DHT_MSG_RENAME_FAILED, + "%s: Rename on %s failed, (gfid = %s) ", local->loc.path, + prev->name, + local->loc.inode ? uuid_utoa(local->loc.inode->gfid) : ""); + local->op_ret = op_ret; + local->op_errno = op_errno; + goto cleanup; + } else { + /* Non-critical failure, unable to rename the linkto + * file + */ + gf_msg(this->name, GF_LOG_INFO, op_errno, DHT_MSG_RENAME_FAILED, + "%s: Rename (linkto file) on %s failed, " + "(gfid = %s) ", + local->loc.path, prev->name, + local->loc.inode ? uuid_utoa(local->loc.inode->gfid) : ""); } - - if (dst_cached - && (dst_cached != dst_hashed) - && (dst_cached != src_cached)) { - gf_log (this->name, GF_LOG_TRACE, - "deleting old dst datafile %s @ %s", - local->loc2.path, dst_cached->name); - - STACK_WIND (frame, dht_rename_unlink_cbk, - dst_cached, dst_cached->fops->unlink, - &local->loc2, 0, xattr); + } + if (xdata) { + if (!local->xattr) + local->xattr = dict_ref(xdata); + else + local->xattr = dict_copy_with_ref(xdata, local->xattr); + } + + /* Merge attrs only from src_cached. In case there of src_cached != + * dst_hashed, this ignores linkfile attrs. */ + if (prev == src_cached) { + dht_iatt_merge(this, &local->stbuf, stbuf); + dht_iatt_merge(this, &local->preoldparent, preoldparent); + dht_iatt_merge(this, &local->postoldparent, postoldparent); + dht_iatt_merge(this, &local->preparent, prenewparent); + dht_iatt_merge(this, &local->postparent, postnewparent); + } + + /* Create the linkto file for the dst file */ + if ((src_cached == dst_cached) && (dst_hashed != dst_cached)) { + link_frame = copy_frame(frame); + if (!link_frame) { + goto unlink; } - if (xattr) - dict_unref (xattr); - return 0; -unwind: - WIPE (&local->preoldparent); - WIPE (&local->postoldparent); - WIPE (&local->preparent); - WIPE (&local->postparent); - if (xattr) - dict_unref (xattr); + /* fop value sent as maxvalue because it is not used + * anywhere in this case */ + link_local = dht_local_init(link_frame, &local->loc2, NULL, + GF_FOP_MAXVALUE); + if (!link_local) { + goto unlink; + } - dht_rename_done (frame, this); + if (link_local->loc.inode) + inode_unref(link_local->loc.inode); + link_local->loc.inode = inode_ref(local->loc.inode); + link_local->main_frame = frame; + link_local->stbuf = local->stbuf; + gf_uuid_copy(link_local->gfid, local->loc.inode->gfid); + dht_linkfile_create(link_frame, dht_rename_links_create_cbk, this, + src_cached, dst_hashed, &link_local->loc); return 0; + } + +unlink: + + if (link_frame) { + DHT_STACK_DESTROY(link_frame); + } + dht_rename_unlink(frame, this); + return 0; cleanup: - if (xattr) - dict_unref (xattr); - dht_rename_cleanup (frame); + dht_rename_cleanup(frame); - return 0; + return 0; } +int +dht_do_rename(call_frame_t *frame) +{ + dht_local_t *local = NULL; + xlator_t *dst_hashed = NULL; + xlator_t *src_cached = NULL; + xlator_t *dst_cached = NULL; + xlator_t *this = NULL; + xlator_t *rename_subvol = NULL; + + local = frame->local; + this = frame->this; + + dst_hashed = local->dst_hashed; + dst_cached = local->dst_cached; + src_cached = local->src_cached; + + if (src_cached == dst_cached) + rename_subvol = src_cached; + else + rename_subvol = dst_hashed; + + if ((src_cached != dst_hashed) && (rename_subvol == dst_hashed)) { + DHT_MARKER_DONT_ACCOUNT(local->xattr_req); + } + + if (rename_subvol == src_cached) { + DHT_CHANGELOG_TRACK_AS_RENAME(local->xattr_req, &local->loc, + &local->loc2); + } + + gf_msg_trace(this->name, 0, "renaming %s => %s (%s)", local->loc.path, + local->loc2.path, rename_subvol->name); + + if (local->linked == _gf_true) + FRAME_SU_DO(frame, dht_local_t); + STACK_WIND_COOKIE(frame, dht_rename_cbk, rename_subvol, rename_subvol, + rename_subvol->fops->rename, &local->loc, &local->loc2, + local->xattr_req); + return 0; +} int -dht_do_rename (call_frame_t *frame) +dht_rename_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *stbuf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - dht_local_t *local = NULL; - xlator_t *dst_hashed = NULL; - xlator_t *src_cached = NULL; - xlator_t *dst_cached = NULL; - xlator_t *this = NULL; - xlator_t *rename_subvol = NULL; - dict_t *dict = NULL; + dht_local_t *local = NULL; + xlator_t *prev = NULL; + local = frame->local; + prev = cookie; - local = frame->local; - this = frame->this; + if (op_ret == -1) { + gf_msg_debug(this->name, 0, "link/file on %s failed (%s)", prev->name, + strerror(op_errno)); + local->op_ret = -1; + local->op_errno = op_errno; + local->added_link = _gf_false; + } else + dht_iatt_merge(this, &local->stbuf, stbuf); - dst_hashed = local->dst_hashed; - dst_cached = local->dst_cached; - src_cached = local->src_cached; + if (local->op_ret == -1) + goto cleanup; - if (src_cached == dst_cached) - rename_subvol = src_cached; - else - rename_subvol = dst_hashed; + dht_do_rename(frame); - if ((src_cached != dst_hashed) && (rename_subvol == dst_hashed)) { - DHT_MARKER_DONT_ACCOUNT(dict); - } - - gf_log (this->name, GF_LOG_TRACE, - "renaming %s => %s (%s)", - local->loc.path, local->loc2.path, rename_subvol->name); + return 0; - if (local->linked == _gf_true) - FRAME_SU_DO (frame, dht_local_t); - STACK_WIND (frame, dht_rename_cbk, - rename_subvol, rename_subvol->fops->rename, - &local->loc, &local->loc2, dict); +cleanup: + dht_rename_cleanup(frame); - return 0; + return 0; } - int -dht_rename_links_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *stbuf, - struct iatt *preparent, struct iatt *postparent, - dict_t *xdata) -{ - dht_local_t *local = NULL; - call_frame_t *prev = NULL; - int this_call_cnt = 0; - - - local = frame->local; - prev = cookie; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "link/file on %s failed (%s)", - prev->this->name, strerror (op_errno)); - local->op_ret = -1; - if (op_errno != ENOENT) - local->op_errno = op_errno; - } else if (local->src_cached == prev->this) { - /* merge of attr returned only from linkfile creation */ - dht_iatt_merge (this, &local->stbuf, stbuf, prev->this); - } +dht_rename_linkto_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *stbuf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + dht_local_t *local = NULL; + xlator_t *prev = NULL; + xlator_t *src_cached = NULL; + dict_t *xattr = NULL; - this_call_cnt = dht_frame_return (frame); - if (is_last_call (this_call_cnt)) { - if (local->op_ret == -1) - goto cleanup; + local = frame->local; + DHT_MARK_FOP_INTERNAL(xattr); + prev = cookie; + src_cached = local->src_cached; - dht_do_rename (frame); - } + if (op_ret == -1) { + gf_msg_debug(this->name, 0, "link/file on %s failed (%s)", prev->name, + strerror(op_errno)); + local->op_ret = -1; + local->op_errno = op_errno; + } - return 0; + /* If linkto creation failed move to failure cleanup code, + * instead of continuing with creating the link file */ + if (local->op_ret != 0) { + goto cleanup; + } + + gf_msg_trace(this->name, 0, "link %s => %s (%s)", local->loc.path, + local->loc2.path, src_cached->name); + if (gf_uuid_compare(local->loc.pargfid, local->loc2.pargfid) == 0) { + DHT_MARKER_DONT_ACCOUNT(xattr); + } + + local->added_link = _gf_true; + + STACK_WIND_COOKIE(frame, dht_rename_link_cbk, src_cached, src_cached, + src_cached->fops->link, &local->loc, &local->loc2, xattr); + + if (xattr) + dict_unref(xattr); + + return 0; cleanup: - dht_rename_cleanup (frame); + dht_rename_cleanup(frame); - return 0; -} + if (xattr) + dict_unref(xattr); + return 0; +} int -dht_rename_unlink_links_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *preparent, struct iatt *postparent, - dict_t *xdata) +dht_rename_unlink_links_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) { - dht_local_t *local = NULL; - call_frame_t *prev = NULL; + dht_local_t *local = NULL; + xlator_t *prev = NULL; + local = frame->local; + prev = cookie; - local = frame->local; - prev = cookie; + if ((op_ret == -1) && (op_errno != ENOENT)) { + gf_msg_debug(this->name, 0, "unlink of %s on %s failed (%s)", + local->loc2.path, prev->name, strerror(op_errno)); + local->op_ret = -1; + local->op_errno = op_errno; + } - if ((op_ret == -1) && (op_errno != ENOENT)) { - gf_log (this->name, GF_LOG_DEBUG, - "unlink of %s on %s failed (%s)", - local->loc2.path, prev->this->name, - strerror (op_errno)); - local->op_ret = -1; - local->op_errno = op_errno; - } + if (local->op_ret == -1) + goto cleanup; - if (local->op_ret == -1) - goto cleanup; + dht_do_rename(frame); - dht_do_rename (frame); - - return 0; + return 0; cleanup: - dht_rename_cleanup (frame); + dht_rename_cleanup(frame); - return 0; + return 0; } - int -dht_rename_create_links (call_frame_t *frame) +dht_rename_create_links(call_frame_t *frame) { - dht_local_t *local = NULL; - xlator_t *this = NULL; - xlator_t *src_hashed = NULL; - xlator_t *src_cached = NULL; - xlator_t *dst_hashed = NULL; - xlator_t *dst_cached = NULL; - int call_cnt = 0; - dict_t *xattr = NULL; + dht_local_t *local = NULL; + xlator_t *this = NULL; + xlator_t *src_hashed = NULL; + xlator_t *src_cached = NULL; + xlator_t *dst_hashed = NULL; + xlator_t *dst_cached = NULL; + int call_cnt = 0; + dict_t *xattr = NULL; + local = frame->local; + this = frame->this; - local = frame->local; - this = frame->this; + src_hashed = local->src_hashed; + src_cached = local->src_cached; + dst_hashed = local->dst_hashed; + dst_cached = local->dst_cached; - src_hashed = local->src_hashed; - src_cached = local->src_cached; - dst_hashed = local->dst_hashed; - dst_cached = local->dst_cached; + DHT_MARK_FOP_INTERNAL(xattr); - DHT_MARK_FOP_INTERNAL (xattr); + if (src_cached == dst_cached) { + dict_t *xattr_new = NULL; - if (src_cached == dst_cached) { - dict_t *xattr_new = NULL; + if (dst_hashed == dst_cached) + goto nolinks; - if (dst_hashed == dst_cached) - goto nolinks; + xattr_new = dict_copy_with_ref(xattr, NULL); - xattr_new = dict_copy_with_ref (xattr, NULL); + gf_msg_trace(this->name, 0, "unlinking dst linkfile %s @ %s", + local->loc2.path, dst_hashed->name); - gf_log (this->name, GF_LOG_TRACE, - "unlinking dst linkfile %s @ %s", - local->loc2.path, dst_hashed->name); + DHT_MARKER_DONT_ACCOUNT(xattr_new); - DHT_MARKER_DONT_ACCOUNT(xattr_new); + STACK_WIND_COOKIE(frame, dht_rename_unlink_links_cbk, dst_hashed, + dst_hashed, dst_hashed->fops->unlink, &local->loc2, 0, + xattr_new); - STACK_WIND (frame, dht_rename_unlink_links_cbk, - dst_hashed, dst_hashed->fops->unlink, - &local->loc2, 0, xattr_new); + dict_unref(xattr_new); + if (xattr) + dict_unref(xattr); - dict_unref (xattr_new); - return 0; + return 0; + } + + if (src_cached != dst_hashed) { + /* needed to create the link file */ + call_cnt++; + if (dst_hashed != src_hashed) + /* needed to create the linkto file */ + call_cnt++; + } + + /* We should not have any failures post the link creation, as this + * introduces the newname into the namespace. Clients could have cached + * the existence of the newname and may start taking actions based on + * the same. Hence create the linkto first, and then attempt the link. + * + * NOTE: If another client is attempting the same oldname -> newname + * rename, and finds both file names as existing, and are hard links + * to each other, then FUSE would send in an unlink for oldname. In + * this time duration if we treat the linkto as a critical error and + * unlink the newname we created, we would have effectively lost the + * file to rename operations. */ + if (dst_hashed != src_hashed && src_cached != dst_hashed) { + gf_msg_trace(this->name, 0, "linkfile %s @ %s => %s", local->loc.path, + dst_hashed->name, src_cached->name); + + memcpy(local->gfid, local->loc.inode->gfid, 16); + dht_linkfile_create(frame, dht_rename_linkto_cbk, this, src_cached, + dst_hashed, &local->loc); + } else if (src_cached != dst_hashed) { + dict_t *xattr_new = NULL; + + xattr_new = dict_copy_with_ref(xattr, NULL); + + gf_msg_trace(this->name, 0, "link %s => %s (%s)", local->loc.path, + local->loc2.path, src_cached->name); + if (gf_uuid_compare(local->loc.pargfid, local->loc2.pargfid) == 0) { + DHT_MARKER_DONT_ACCOUNT(xattr_new); } - if (dst_hashed != src_hashed && dst_hashed != src_cached) - call_cnt++; - - if (src_cached != dst_hashed) - call_cnt++; - - local->call_cnt = call_cnt; + local->added_link = _gf_true; - if (dst_hashed != src_hashed && dst_hashed != src_cached) { - gf_log (this->name, GF_LOG_TRACE, - "linkfile %s @ %s => %s", - local->loc.path, dst_hashed->name, src_cached->name); - memcpy (local->gfid, local->loc.inode->gfid, 16); - dht_linkfile_create (frame, dht_rename_links_cbk, this, - src_cached, dst_hashed, &local->loc); - } + STACK_WIND_COOKIE(frame, dht_rename_link_cbk, src_cached, src_cached, + src_cached->fops->link, &local->loc, &local->loc2, + xattr_new); - if (src_cached != dst_hashed) { - dict_t *xattr_new = NULL; + dict_unref(xattr_new); + } - xattr_new = dict_copy_with_ref (xattr, NULL); +nolinks: + if (!call_cnt) { + /* skip to next step */ + dht_do_rename(frame); + } + if (xattr) + dict_unref(xattr); + + return 0; +} - gf_log (this->name, GF_LOG_TRACE, - "link %s => %s (%s)", local->loc.path, - local->loc2.path, src_cached->name); - if (uuid_compare (local->loc.pargfid, - local->loc2.pargfid) == 0) { - DHT_MARKER_DONT_ACCOUNT(xattr_new); +int +dht_rename_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, + struct iatt *stbuf, dict_t *xattr, + struct iatt *postparent) +{ + dht_local_t *local = NULL; + int call_cnt = 0; + dht_conf_t *conf = NULL; + char gfid_local[GF_UUID_BUF_SIZE] = {0}; + char gfid_server[GF_UUID_BUF_SIZE] = {0}; + int child_index = -1; + gf_boolean_t is_src = _gf_false; + loc_t *loc = NULL; + + child_index = (long)cookie; + + local = frame->local; + conf = this->private; + + is_src = (child_index == 0); + if (is_src) + loc = &local->loc; + else + loc = &local->loc2; + + if (op_ret >= 0) { + if (is_src) + local->src_cached = dht_subvol_get_cached(this, local->loc.inode); + else { + if (loc->inode) + gf_uuid_unparse(loc->inode->gfid, gfid_local); + + gf_msg_debug(this->name, 0, + "dst_cached before lookup: %s, " + "(path:%s)(gfid:%s),", + local->loc2.path, + local->dst_cached ? local->dst_cached->name : NULL, + local->dst_cached ? gfid_local : NULL); + + local->dst_cached = dht_subvol_get_cached(this, + local->loc2_copy.inode); + + gf_uuid_unparse(stbuf->ia_gfid, gfid_local); + + gf_msg_debug(this->name, GF_LOG_WARNING, + "dst_cached after lookup: %s, " + "(path:%s)(gfid:%s)", + local->loc2.path, + local->dst_cached ? local->dst_cached->name : NULL, + local->dst_cached ? gfid_local : NULL); + + if ((local->loc2.inode == NULL) || + gf_uuid_compare(stbuf->ia_gfid, local->loc2.inode->gfid)) { + if (local->loc2.inode != NULL) { + inode_unlink(local->loc2.inode, local->loc2.parent, + local->loc2.name); + inode_unref(local->loc2.inode); } - STACK_WIND (frame, dht_rename_links_cbk, - src_cached, src_cached->fops->link, - &local->loc, &local->loc2, xattr_new); + local->loc2.inode = inode_link(local->loc2_copy.inode, + local->loc2_copy.parent, + local->loc2_copy.name, stbuf); + gf_uuid_copy(local->loc2.gfid, stbuf->ia_gfid); + } + } + } + + if (op_ret < 0) { + if (is_src) { + /* The meaning of is_linkfile is overloaded here. For locking + * to work properly both rebalance and rename should acquire + * lock on datafile. The reason for sending this lookup is to + * find out whether we've acquired a lock on data file. + * Between the lookup before rename and this rename, the + * file could be migrated by a rebalance process and now this + * file this might be a linkto file. We verify that by sending + * this lookup. However, if this lookup fails we cannot really + * say whether we've acquired lock on a datafile or linkto file. + * So, we act conservatively and _assume_ + * that this is a linkfile and fail the rename operation. + */ + local->is_linkfile = _gf_true; + local->op_errno = op_errno; + } else { + if (local->dst_cached) + gf_msg_debug(this->name, op_errno, + "file %s (gfid:%s) was present " + "(hashed-subvol=%s, " + "cached-subvol=%s) before rename," + " but lookup failed", + local->loc2.path, + uuid_utoa(local->loc2.inode->gfid), + local->dst_hashed->name, local->dst_cached->name); + if (dht_inode_missing(op_errno)) + local->dst_cached = NULL; + } + } else if (is_src && xattr && + check_is_linkfile(inode, stbuf, xattr, conf->link_xattr_name)) { + local->is_linkfile = _gf_true; + /* Found linkto file instead of data file, passdown ENOENT + * based on the above comment */ + local->op_errno = ENOENT; + } + + if (!local->is_linkfile && (op_ret >= 0) && + gf_uuid_compare(loc->gfid, stbuf->ia_gfid)) { + gf_uuid_unparse(loc->gfid, gfid_local); + gf_uuid_unparse(stbuf->ia_gfid, gfid_server); + + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_GFID_MISMATCH, + "path:%s, received a different gfid, local_gfid= %s" + " server_gfid: %s", + local->loc.path, gfid_local, gfid_server); + + /* Will passdown ENOENT anyway since the file we sent on + * rename is replaced with a different file */ + local->op_errno = ENOENT; + /* Since local->is_linkfile is used here to detect failure, + * marking this to true */ + local->is_linkfile = _gf_true; + } + + call_cnt = dht_frame_return(frame); + if (is_last_call(call_cnt)) { + if (local->is_linkfile) { + local->op_ret = -1; + goto fail; + } - dict_unref (xattr_new); - } + dht_rename_create_links(frame); + } -nolinks: - if (!call_cnt) { - /* skip to next step */ - dht_do_rename (frame); - } - if (xattr) - dict_unref (xattr); + return 0; +fail: + dht_rename_unlock(frame, this); + return 0; +} - return 0; +int +dht_rename_file_lock1_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + dht_local_t *local = NULL; + char src_gfid[GF_UUID_BUF_SIZE] = {0}; + char dst_gfid[GF_UUID_BUF_SIZE] = {0}; + int ret = 0; + loc_t *loc = NULL; + xlator_t *subvol = NULL; + + local = frame->local; + + if (op_ret < 0) { + uuid_utoa_r(local->loc.inode->gfid, src_gfid); + + if (local->loc2.inode) + uuid_utoa_r(local->loc2.inode->gfid, dst_gfid); + + gf_msg(this->name, GF_LOG_WARNING, op_errno, DHT_MSG_INODE_LK_ERROR, + "protecting namespace of %s failed" + "rename (%s:%s:%s %s:%s:%s)", + local->current == &local->lock[0] ? local->loc.path + : local->loc2.path, + local->loc.path, src_gfid, local->src_hashed->name, + local->loc2.path, dst_gfid, + local->dst_hashed ? local->dst_hashed->name : NULL); + + local->op_ret = -1; + local->op_errno = op_errno; + goto err; + } + + if (local->current == &local->lock[0]) { + loc = &local->loc2; + subvol = local->dst_hashed; + local->current = &local->lock[1]; + } else { + loc = &local->loc; + subvol = local->src_hashed; + local->current = &local->lock[0]; + } + + ret = dht_protect_namespace(frame, loc, subvol, &local->current->ns, + dht_rename_lock_cbk); + if (ret < 0) { + op_errno = EINVAL; + goto err; + } + + return 0; +err: + /* No harm in calling an extra unlock */ + dht_rename_unlock(frame, this); + return 0; } +int32_t +dht_rename_file_protect_namespace(call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xdata) +{ + dht_local_t *local = NULL; + char src_gfid[GF_UUID_BUF_SIZE] = {0}; + char dst_gfid[GF_UUID_BUF_SIZE] = {0}; + int ret = 0; + loc_t *loc = NULL; + xlator_t *subvol = NULL; + + local = frame->local; + + if (op_ret < 0) { + uuid_utoa_r(local->loc.inode->gfid, src_gfid); + + if (local->loc2.inode) + uuid_utoa_r(local->loc2.inode->gfid, dst_gfid); + + gf_msg(this->name, GF_LOG_WARNING, op_errno, DHT_MSG_INODE_LK_ERROR, + "acquiring inodelk failed " + "rename (%s:%s:%s %s:%s:%s)", + local->loc.path, src_gfid, local->src_cached->name, + local->loc2.path, dst_gfid, + local->dst_cached ? local->dst_cached->name : NULL); + + local->op_ret = -1; + local->op_errno = op_errno; + + goto err; + } + + /* Locks on src and dst needs to ordered which otherwise might cause + * deadlocks when rename (src, dst) and rename (dst, src) is done from + * two different clients + */ + ret = dht_order_rename_lock(frame, &loc, &subvol); + if (ret) { + local->op_errno = ENOMEM; + goto err; + } + + ret = dht_protect_namespace(frame, loc, subvol, &local->current->ns, + dht_rename_file_lock1_cbk); + if (ret < 0) { + op_errno = EINVAL; + goto err; + } + + return 0; -int -dht_rename (call_frame_t *frame, xlator_t *this, - loc_t *oldloc, loc_t *newloc, dict_t *xdata) -{ - xlator_t *src_cached = NULL; - xlator_t *src_hashed = NULL; - xlator_t *dst_cached = NULL; - xlator_t *dst_hashed = NULL; - int op_errno = -1; - int ret = -1; - dht_local_t *local = NULL; - - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (oldloc, err); - VALIDATE_OR_GOTO (newloc, err); - - src_hashed = dht_subvol_get_hashed (this, oldloc); - if (!src_hashed) { - gf_log (this->name, GF_LOG_INFO, - "no subvolume in layout for path=%s", - oldloc->path); - op_errno = EINVAL; - goto err; - } +err: + /* Its fine to call unlock even when no locks are acquired, as we check + * for lock->locked before winding a unlock call. + */ + dht_rename_unlock(frame, this); - src_cached = dht_subvol_get_cached (this, oldloc->inode); - if (!src_cached) { - gf_log (this->name, GF_LOG_INFO, - "no cached subvolume for path=%s", oldloc->path); - op_errno = EINVAL; - goto err; - } + return 0; +} - dst_hashed = dht_subvol_get_hashed (this, newloc); - if (!dst_hashed) { - gf_log (this->name, GF_LOG_INFO, - "no subvolume in layout for path=%s", - newloc->path); - op_errno = EINVAL; - goto err; +int32_t +dht_rename_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + dht_local_t *local = NULL; + char src_gfid[GF_UUID_BUF_SIZE] = {0}; + char dst_gfid[GF_UUID_BUF_SIZE] = {0}; + dict_t *xattr_req = NULL; + dht_conf_t *conf = NULL; + int i = 0; + xlator_t *subvol = NULL; + dht_lock_t *lock = NULL; + + local = frame->local; + conf = this->private; + + if (op_ret < 0) { + uuid_utoa_r(local->loc.inode->gfid, src_gfid); + + if (local->loc2.inode) + uuid_utoa_r(local->loc2.inode->gfid, dst_gfid); + + gf_msg(this->name, GF_LOG_WARNING, op_errno, DHT_MSG_INODE_LK_ERROR, + "protecting namespace of %s failed. " + "rename (%s:%s:%s %s:%s:%s)", + local->current == &local->lock[0] ? local->loc.path + : local->loc2.path, + local->loc.path, src_gfid, local->src_hashed->name, + local->loc2.path, dst_gfid, + local->dst_hashed ? local->dst_hashed->name : NULL); + + local->op_ret = -1; + local->op_errno = op_errno; + + goto done; + } + + xattr_req = dict_new(); + if (xattr_req == NULL) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto done; + } + + op_ret = dict_set_uint32(xattr_req, conf->link_xattr_name, 256); + if (op_ret < 0) { + local->op_ret = -1; + local->op_errno = -op_ret; + goto done; + } + + /* dst_cached might've changed. This normally happens for two reasons: + * 1. rebalance migrated dst + * 2. Another parallel rename was done overwriting dst + * + * Doing a lookup on local->loc2 when dst exists, but is associated + * with a different gfid will result in an ESTALE error. So, do a fresh + * lookup with a new inode on dst-path and handle change of dst-cached + * in the cbk. Also, to identify dst-cached changes we do a lookup on + * "this" rather than the subvol. + */ + loc_copy(&local->loc2_copy, &local->loc2); + inode_unref(local->loc2_copy.inode); + local->loc2_copy.inode = inode_new(local->loc.inode->table); + + /* Why not use local->lock.locks[?].loc for lookup post lock phase + * --------------------------------------------------------------- + * "layout.parent_layout.locks[?].loc" does not have the name and pargfid + * populated. + * Reason: If we had populated the name and pargfid, server might + * resolve to a successful lookup even if there is a file with same name + * with a different gfid(unlink & create) as server does name based + * resolution on first priority. And this can result in operating on a + * different inode entirely. + * + * Now consider a scenario where source file was renamed by some other + * client to a new name just before this lock was granted. So if a + * lookup would be done on local->lock[0].layout.parent_layout.locks[?].loc, + * server will send success even if the entry was renamed (since server will + * do a gfid based resolution). So once a lock is granted, make sure the + * file exists with the name that the client requested with. + * */ + + local->call_cnt = 2; + for (i = 0; i < 2; i++) { + if (i == 0) { + lock = local->rename_inodelk_backward_compatible[0]; + if (gf_uuid_compare(local->loc.gfid, lock->loc.gfid) == 0) + subvol = lock->xl; + else { + lock = local->rename_inodelk_backward_compatible[1]; + subvol = lock->xl; + } + } else { + subvol = this; } - if (newloc->inode) - dst_cached = dht_subvol_get_cached (this, newloc->inode); + STACK_WIND_COOKIE(frame, dht_rename_lookup_cbk, (void *)(long)i, subvol, + subvol->fops->lookup, + (i == 0) ? &local->loc : &local->loc2_copy, + xattr_req); + } - local = dht_local_init (frame, oldloc, NULL, GF_FOP_RENAME); - if (!local) { - op_errno = ENOMEM; - goto err; - } - /* cached_subvol will be set from dht_local_init, reset it to NULL, - as the logic of handling rename is different */ - local->cached_subvol = NULL; - - ret = loc_copy (&local->loc2, newloc); - if (ret == -1) { - op_errno = ENOMEM; - goto err; - } + dict_unref(xattr_req); + return 0; - local->src_hashed = src_hashed; - local->src_cached = src_cached; - local->dst_hashed = dst_hashed; - local->dst_cached = dst_cached; +done: + /* Its fine to call unlock even when no locks are acquired, as we check + * for lock->locked before winding a unlock call. + */ + dht_rename_unlock(frame, this); - gf_log (this->name, GF_LOG_TRACE, - "renaming %s (hash=%s/cache=%s) => %s (hash=%s/cache=%s)", - oldloc->path, src_hashed->name, src_cached->name, - newloc->path, dst_hashed->name, - dst_cached ? dst_cached->name : "<nul>"); + if (xattr_req) + dict_unref(xattr_req); - if (IA_ISDIR (oldloc->inode->ia_type)) { - dht_rename_dir (frame, this); - } else { - local->op_ret = 0; - dht_rename_create_links (frame); + return 0; +} + +int +dht_rename_lock(call_frame_t *frame) +{ + dht_local_t *local = NULL; + int count = 1, ret = -1; + dht_lock_t **lk_array = NULL; + + local = frame->local; + + if (local->dst_cached) + count++; + + lk_array = GF_CALLOC(count, sizeof(*lk_array), gf_common_mt_pointer); + if (lk_array == NULL) + goto err; + + lk_array[0] = dht_lock_new(frame->this, local->src_cached, &local->loc, + F_WRLCK, DHT_FILE_MIGRATE_DOMAIN, NULL, + FAIL_ON_ANY_ERROR); + if (lk_array[0] == NULL) + goto err; + + if (local->dst_cached) { + /* dst might be removed by the time inodelk reaches bricks, + * which can result in ESTALE errors. POSIX imposes no + * restriction for dst to be present for renames to be + * successful. So, we'll ignore ESTALE errors. As far as + * synchronization on dst goes, we'll achieve the same by + * holding entrylk on parent directory of dst in the namespace + * of basename(dst). Also, there might not be quorum in cluster + * xlators like EC/disperse on errno, in which case they return + * EIO. For eg., in a disperse (4 + 2), 3 might return success + * and three might return ESTALE. Disperse, having no Quorum + * unwinds inodelk with EIO. So, ignore EIO too. + */ + lk_array[1] = dht_lock_new(frame->this, local->dst_cached, &local->loc2, + F_WRLCK, DHT_FILE_MIGRATE_DOMAIN, NULL, + IGNORE_ENOENT_ESTALE_EIO); + if (lk_array[1] == NULL) + goto err; + } + + local->rename_inodelk_backward_compatible = lk_array; + local->rename_inodelk_bc_count = count; + + /* retaining inodelks for the sake of backward compatibility. Please + * make sure to remove this inodelk once all of 3.10, 3.12 and 3.13 + * reach EOL. Better way of getting synchronization would be to acquire + * entrylks on src and dst parent directories in the namespace of + * basenames of src and dst + */ + ret = dht_blocking_inodelk(frame, lk_array, count, + dht_rename_file_protect_namespace); + if (ret < 0) { + local->rename_inodelk_backward_compatible = NULL; + local->rename_inodelk_bc_count = 0; + goto err; + } + + return 0; +err: + if (lk_array != NULL) { + int tmp_count = 0, i = 0; + + for (i = 0; (i < count) && (lk_array[i]); i++, tmp_count++) + ; + + dht_lock_array_free(lk_array, tmp_count); + GF_FREE(lk_array); + } + + return -1; +} + +int +dht_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) +{ + xlator_t *src_cached = NULL; + xlator_t *src_hashed = NULL; + xlator_t *dst_cached = NULL; + xlator_t *dst_hashed = NULL; + int op_errno = -1; + int ret = -1; + dht_local_t *local = NULL; + char gfid[GF_UUID_BUF_SIZE] = {0}; + char newgfid[GF_UUID_BUF_SIZE] = {0}; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(oldloc, err); + VALIDATE_OR_GOTO(newloc, err); + + gf_uuid_unparse(oldloc->inode->gfid, gfid); + + src_hashed = dht_subvol_get_hashed(this, oldloc); + if (!src_hashed) { + gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_RENAME_FAILED, + "No hashed subvolume in layout for path=%s," + "(gfid = %s)", + oldloc->path, gfid); + op_errno = EINVAL; + goto err; + } + + src_cached = dht_subvol_get_cached(this, oldloc->inode); + if (!src_cached) { + gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_RENAME_FAILED, + "No cached subvolume for path = %s," + "(gfid = %s)", + oldloc->path, gfid); + + op_errno = EINVAL; + goto err; + } + + dst_hashed = dht_subvol_get_hashed(this, newloc); + if (!dst_hashed) { + gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_RENAME_FAILED, + "No hashed subvolume in layout for path=%s", newloc->path); + op_errno = EINVAL; + goto err; + } + + if (newloc->inode) + dst_cached = dht_subvol_get_cached(this, newloc->inode); + + local = dht_local_init(frame, oldloc, NULL, GF_FOP_RENAME); + if (!local) { + op_errno = ENOMEM; + goto err; + } + /* cached_subvol will be set from dht_local_init, reset it to NULL, + as the logic of handling rename is different */ + local->cached_subvol = NULL; + + ret = loc_copy(&local->loc2, newloc); + if (ret == -1) { + op_errno = ENOMEM; + goto err; + } + + local->src_hashed = src_hashed; + local->src_cached = src_cached; + local->dst_hashed = dst_hashed; + local->dst_cached = dst_cached; + if (xdata) + local->xattr_req = dict_ref(xdata); + + if (newloc->inode) + gf_uuid_unparse(newloc->inode->gfid, newgfid); + + gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_RENAME_INFO, + "renaming %s (%s) (hash=%s/cache=%s) => %s (%s) " + "(hash=%s/cache=%s) ", + oldloc->path, gfid, src_hashed->name, src_cached->name, newloc->path, + newloc->inode ? newgfid : NULL, dst_hashed->name, + dst_cached ? dst_cached->name : "<nul>"); + + if (IA_ISDIR(oldloc->inode->ia_type)) { + dht_rename_dir(frame, this); + } else { + local->op_ret = 0; + ret = dht_rename_lock(frame); + if (ret < 0) { + op_errno = ENOMEM; + goto err; } + } - return 0; + return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (rename, frame, -1, op_errno, NULL, NULL, NULL, NULL, - NULL, NULL); + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND(rename, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL, + NULL); - return 0; + return 0; +} + +int +dht_pt_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) +{ + gf_boolean_t free_xdata = _gf_false; + + /* Just a pass through */ + if (!IA_ISDIR(oldloc->inode->ia_type)) { + if (!xdata) { + free_xdata = _gf_true; + } + DHT_CHANGELOG_TRACK_AS_RENAME(xdata, oldloc, newloc); + } + default_rename(frame, this, oldloc, newloc, xdata); + if (free_xdata && xdata) { + dict_unref(xdata); + xdata = NULL; + } + return 0; } diff --git a/xlators/cluster/dht/src/dht-selfheal.c b/xlators/cluster/dht/src/dht-selfheal.c index 0e65275442a..3e24065227c 100644 --- a/xlators/cluster/dht/src/dht-selfheal.c +++ b/xlators/cluster/dht/src/dht-selfheal.c @@ -8,1041 +8,2593 @@ cases as published by the Free Software Foundation. */ -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - - -#include "glusterfs.h" -#include "xlator.h" -#include "dht-common.h" -#include "glusterfs-acl.h" - -#define DHT_SET_LAYOUT_RANGE(layout,i,srt,chunk,cnt,path) do { \ - layout->list[i].start = srt; \ - layout->list[i].stop = srt + chunk - 1; \ - \ - gf_log (this->name, GF_LOG_TRACE, \ - "gave fix: %u - %u on %s for %s", \ - layout->list[i].start, layout->list[i].stop, \ - layout->list[i].xlator->name, path); \ - } while (0) - -#define DHT_RESET_LAYOUT_RANGE(layout) do { \ - int cnt = 0; \ - for (cnt = 0; cnt < layout->cnt; cnt++ ) { \ - layout->list[cnt].start = 0; \ - layout->list[cnt].stop = 0; \ - } \ - } while (0) +#include "dht-lock.h" + +#define DHT_SET_LAYOUT_RANGE(layout, i, srt, chunk, path) \ + do { \ + layout->list[i].start = srt; \ + layout->list[i].stop = srt + chunk - 1; \ + layout->list[i].commit_hash = layout->commit_hash; \ + \ + gf_msg_trace(this->name, 0, \ + "gave fix: 0x%x - 0x%x, with commit-hash 0x%x" \ + " on %s for %s", \ + layout->list[i].start, layout->list[i].stop, \ + layout->list[i].commit_hash, \ + layout->list[i].xlator->name, path); \ + } while (0) + +#define DHT_RESET_LAYOUT_RANGE(layout) \ + do { \ + int cnt = 0; \ + for (cnt = 0; cnt < layout->cnt; cnt++) { \ + layout->list[cnt].start = 0; \ + layout->list[cnt].stop = 0; \ + } \ + } while (0) + +static int +dht_selfheal_layout_lock(call_frame_t *frame, dht_layout_t *layout, + gf_boolean_t newdir, dht_selfheal_layout_t healer, + dht_need_heal_t should_heal); static uint32_t -dht_overlap_calc (dht_layout_t *old, int o, dht_layout_t *new, int n) +dht_overlap_calc(dht_layout_t *old, int o, dht_layout_t *new, int n) { - if (o >= old->cnt || n >= new->cnt) - return 0; + if (o >= old->cnt || n >= new->cnt) + return 0; - if (old->list[o].err > 0 || new->list[n].err > 0) - return 0; + if (old->list[o].err > 0 || new->list[n].err > 0) + return 0; - if (old->list[o].start == old->list[o].stop) { - return 0; - } + if (old->list[o].start == old->list[o].stop) { + return 0; + } - if (new->list[n].start == new->list[n].stop) { - return 0; - } + if (new->list[n].start == new->list[n].stop) { + return 0; + } - if ((old->list[o].start > new->list[n].stop) || - (old->list[o].stop < new->list[n].start)) - return 0; + if ((old->list[o].start > new->list[n].stop) || + (old->list[o].stop < new->list[n].start)) + return 0; - return min (old->list[o].stop, new->list[n].stop) - - max (old->list[o].start, new->list[n].start) + 1; + return min(old->list[o].stop, new->list[n].stop) - + max(old->list[o].start, new->list[n].start) + 1; } +int +dht_selfheal_unlock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + DHT_STACK_DESTROY(frame); + return 0; +} int -dht_selfheal_dir_finish (call_frame_t *frame, xlator_t *this, int ret) +dht_selfheal_dir_finish(call_frame_t *frame, xlator_t *this, int ret, + int invoke_cbk) { - dht_local_t *local = NULL; + dht_local_t *local = NULL, *lock_local = NULL; + call_frame_t *lock_frame = NULL; + int lock_count = 0; - local = frame->local; - local->selfheal.dir_cbk (frame, NULL, frame->this, ret, - local->op_errno, NULL); + local = frame->local; - return 0; -} + /* Unlock entrylk */ + dht_unlock_entrylk_wrapper(frame, &local->lock[0].ns.directory_ns); + + /* Unlock inodelk */ + lock_count = dht_lock_count(local->lock[0].ns.parent_layout.locks, + local->lock[0].ns.parent_layout.lk_count); + if (lock_count == 0) + goto done; + + lock_frame = copy_frame(frame); + if (lock_frame == NULL) { + goto done; + } + + lock_local = dht_local_init(lock_frame, &local->loc, NULL, + lock_frame->root->op); + if (lock_local == NULL) { + goto done; + } + + lock_local->lock[0].ns.parent_layout.locks = local->lock[0] + .ns.parent_layout.locks; + lock_local->lock[0] + .ns.parent_layout.lk_count = local->lock[0].ns.parent_layout.lk_count; + local->lock[0].ns.parent_layout.locks = NULL; + local->lock[0].ns.parent_layout.lk_count = 0; + + dht_unlock_inodelk(lock_frame, lock_local->lock[0].ns.parent_layout.locks, + lock_local->lock[0].ns.parent_layout.lk_count, + dht_selfheal_unlock_cbk); + lock_frame = NULL; + +done: + if (invoke_cbk) + local->selfheal.dir_cbk(frame, NULL, frame->this, ret, local->op_errno, + NULL); + if (lock_frame != NULL) { + DHT_STACK_DESTROY(lock_frame); + } + + return 0; +} int -dht_selfheal_dir_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, dict_t *xdata) +dht_refresh_layout_done(call_frame_t *frame) { - dht_local_t *local = NULL; - call_frame_t *prev = NULL; - xlator_t *subvol = NULL; - int i = 0; - dht_layout_t *layout = NULL; - int err = 0; - int this_call_cnt = 0; - - local = frame->local; - layout = local->selfheal.layout; - prev = cookie; - subvol = prev->this; - - if (op_ret == 0) - err = 0; - else - err = op_errno; + int ret = -1; + dht_layout_t *refreshed = NULL, *heal = NULL; + dht_local_t *local = NULL; + dht_need_heal_t should_heal = NULL; + dht_selfheal_layout_t healer = NULL; - for (i = 0; i < layout->cnt; i++) { - if (layout->list[i].xlator == subvol) { - layout->list[i].err = err; - break; - } - } + local = frame->local; - this_call_cnt = dht_frame_return (frame); + refreshed = local->selfheal.refreshed_layout; + heal = local->selfheal.layout; - if (is_last_call (this_call_cnt)) { - dht_selfheal_dir_finish (frame, this, 0); - } + healer = local->selfheal.healer; + should_heal = local->selfheal.should_heal; - return 0; -} + ret = dht_layout_sort(refreshed); + if (ret == -1) { + gf_smsg(frame->this->name, GF_LOG_WARNING, 0, + DHT_MSG_LAYOUT_SORT_FAILED, NULL); + goto err; + } + if (should_heal(frame, &heal, &refreshed)) { + healer(frame, &local->loc, heal); + } else { + local->selfheal.layout = NULL; + local->selfheal.refreshed_layout = NULL; + local->selfheal.layout = refreshed; + + dht_layout_unref(frame->this, heal); + + dht_selfheal_dir_finish(frame, frame->this, 0, 1); + } + + return 0; + +err: + dht_selfheal_dir_finish(frame, frame->this, -1, 1); + return 0; +} int -dht_selfheal_dir_xattr_persubvol (call_frame_t *frame, loc_t *loc, - dht_layout_t *layout, int i, - xlator_t *req_subvol) +dht_refresh_layout_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, + struct iatt *stbuf, dict_t *xattr, + struct iatt *postparent) { - xlator_t *subvol = NULL; - dict_t *xattr = NULL; - int ret = 0; - xlator_t *this = NULL; - int32_t *disk_layout = NULL; - dht_local_t *local = NULL; - dht_conf_t *conf = NULL; - data_t *data = NULL; - - local = frame->local; - if (req_subvol) - subvol = req_subvol; - else - subvol = layout->list[i].xlator; - this = frame->this; + dht_local_t *local = NULL; + int this_call_cnt = 0; + xlator_t *prev = NULL; + dht_layout_t *layout = NULL; + char gfid[GF_UUID_BUF_SIZE] = { + 0, + }; + + GF_VALIDATE_OR_GOTO("dht", frame, err); + GF_VALIDATE_OR_GOTO("dht", this, err); + GF_VALIDATE_OR_GOTO("dht", frame->local, err); + GF_VALIDATE_OR_GOTO("dht", this->private, err); + + local = frame->local; + prev = cookie; + + layout = local->selfheal.refreshed_layout; + + LOCK(&frame->lock); + { + op_ret = dht_layout_merge(this, layout, prev, op_ret, op_errno, xattr); + + dht_iatt_merge(this, &local->stbuf, stbuf); + + if (op_ret == -1) { + gf_uuid_unparse(local->loc.gfid, gfid); + local->op_errno = op_errno; + gf_smsg(this->name, GF_LOG_ERROR, op_errno, + DHT_MSG_FILE_LOOKUP_FAILED, "path=%s", local->loc.path, + "name=%s", prev->name, "gfid=%s", gfid, NULL); + + goto unlock; + } - GF_VALIDATE_OR_GOTO ("", this, err); - GF_VALIDATE_OR_GOTO (this->name, layout, err); - GF_VALIDATE_OR_GOTO (this->name, local, err); - GF_VALIDATE_OR_GOTO (this->name, subvol, err); - VALIDATE_OR_GOTO (this->private, err); + local->op_ret = 0; + } +unlock: + UNLOCK(&frame->lock); - conf = this->private; + this_call_cnt = dht_frame_return(frame); - xattr = get_new_dict (); - if (!xattr) { - goto err; + if (is_last_call(this_call_cnt)) { + if (local->op_ret == 0) { + local->refresh_layout_done(frame); + } else { + goto err; } + } - ret = dht_disk_layout_extract (this, layout, i, &disk_layout); - if (ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "%s: (subvol %s) failed to extract disk layout", - loc->path, subvol->name); - goto err; - } + return 0; - ret = dict_set_bin (xattr, conf->xattr_name, disk_layout, 4 * 4); - if (ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "%s: (subvol %s) failed to set xattr dictionary", - loc->path, subvol->name); - goto err; - } - disk_layout = NULL; +err: + if (local) { + local->refresh_layout_unlock(frame, this, -1, 1); + } + return 0; +} - gf_log (this->name, GF_LOG_TRACE, - "setting hash range %u - %u (type %d) on subvolume %s for %s", - layout->list[i].start, layout->list[i].stop, - layout->type, subvol->name, loc->path); - - dict_ref (xattr); - if (local->xattr) { - data = dict_get (local->xattr, QUOTA_LIMIT_KEY); - if (data) { - ret = dict_add (xattr, QUOTA_LIMIT_KEY, data); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "Failed to " - "set quota limit key on %s",loc->path); - } - } +int +dht_refresh_layout(call_frame_t *frame) +{ + int call_cnt = 0; + int i = 0, ret = -1; + dht_conf_t *conf = NULL; + dht_local_t *local = NULL; + xlator_t *this = NULL; + char gfid[GF_UUID_BUF_SIZE] = { + 0, + }; + + GF_VALIDATE_OR_GOTO("dht", frame, out); + GF_VALIDATE_OR_GOTO("dht", frame->local, out); + + this = frame->this; + conf = this->private; + local = frame->local; + + call_cnt = conf->subvolume_cnt; + local->call_cnt = call_cnt; + local->op_ret = -1; + + if (local->selfheal.refreshed_layout) { + dht_layout_unref(this, local->selfheal.refreshed_layout); + local->selfheal.refreshed_layout = NULL; + } + + local->selfheal.refreshed_layout = dht_layout_new(this, + conf->subvolume_cnt); + if (!local->selfheal.refreshed_layout) { + gf_uuid_unparse(local->loc.gfid, gfid); + gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_MEM_ALLOC_FAILED, + "path=%s", local->loc.path, "gfid=%s", gfid, NULL); + goto out; + } + + if (local->xattr != NULL) { + dict_del(local->xattr, conf->xattr_name); + } + + if (local->xattr_req == NULL) { + gf_uuid_unparse(local->loc.gfid, gfid); + local->xattr_req = dict_new(); + if (local->xattr_req == NULL) { + gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY, + "path=%s", local->loc.path, "gfid=%s", gfid, NULL); + goto out; } - if (!uuid_is_null (local->gfid)) - uuid_copy (loc->gfid, local->gfid); + } + + if (dict_get(local->xattr_req, conf->xattr_name) == 0) { + ret = dict_set_uint32(local->xattr_req, conf->xattr_name, 4 * 4); + if (ret) + gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED, + "path=%s", local->loc.path, "key=%s", conf->xattr_name, + NULL); + } - STACK_WIND (frame, dht_selfheal_dir_xattr_cbk, - subvol, subvol->fops->setxattr, - loc, xattr, 0, NULL); + for (i = 0; i < call_cnt; i++) { + STACK_WIND_COOKIE(frame, dht_refresh_layout_cbk, conf->subvolumes[i], + conf->subvolumes[i], + conf->subvolumes[i]->fops->lookup, &local->loc, + local->xattr_req); + } - dict_unref (xattr); + return 0; - return 0; +out: + if (local) { + local->refresh_layout_unlock(frame, this, -1, 1); + } + return 0; +} + +int32_t +dht_selfheal_layout_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + dht_local_t *local = NULL; + + local = frame->local; + + if (!local) { + goto err; + } + + if (op_ret < 0) { + local->op_errno = op_errno; + goto err; + } + + local->refresh_layout_unlock = dht_selfheal_dir_finish; + local->refresh_layout_done = dht_refresh_layout_done; + + dht_refresh_layout(frame); + return 0; err: - if (xattr) - dict_destroy (xattr); + dht_selfheal_dir_finish(frame, this, -1, 1); + return 0; +} - GF_FREE (disk_layout); +gf_boolean_t +dht_should_heal_layout(call_frame_t *frame, dht_layout_t **heal, + dht_layout_t **ondisk) +{ + gf_boolean_t fixit = _gf_true; + dht_local_t *local = NULL; + int heal_missing_dirs = 0; + + local = frame->local; + + if ((heal == NULL) || (*heal == NULL) || (ondisk == NULL) || + (*ondisk == NULL)) + goto out; + + dht_layout_anomalies( + frame->this, &local->loc, *ondisk, &local->selfheal.hole_cnt, + &local->selfheal.overlaps_cnt, &local->selfheal.missing_cnt, + &local->selfheal.down, &local->selfheal.misc, NULL); + + /* Directories might've been created as part of this self-heal. We've to + * sync non-layout xattrs and set range 0-0 on new directories + */ + heal_missing_dirs = local->selfheal.force_mkdir + ? local->selfheal.force_mkdir + : dht_layout_missing_dirs(*heal); + + if ((local->selfheal.hole_cnt == 0) && + (local->selfheal.overlaps_cnt == 0) && heal_missing_dirs) { + dht_layout_t *tmp = NULL; + + /* Just added a brick and need to set 0-0 range on this brick. + * But ondisk layout is well-formed. So, swap layouts "heal" and + * "ondisk". Now "ondisk" layout will be used for healing + * xattrs. If there are any non-participating subvols in + * "ondisk" layout, dht_selfheal_dir_xattr_persubvol will set + * 0-0 and non-layout xattrs. This way we won't end up in + * "corrupting" already set and well-formed "ondisk" layout. + */ + tmp = *heal; + *heal = *ondisk; + *ondisk = tmp; + + /* Current selfheal code, heals non-layout xattrs only after + * an add-brick. In fact non-layout xattrs are considered as + * secondary citizens which are healed only if layout xattrs + * need to be healed. This is wrong, since for eg., quota can be + * set when layout is well-formed, but a node is down. Also, + * just for healing non-layout xattrs, we don't need locking. + * This issue is _NOT FIXED_ by this patch. + */ + } + + fixit = (local->selfheal.hole_cnt || local->selfheal.overlaps_cnt || + heal_missing_dirs); - dht_selfheal_dir_xattr_cbk (frame, subvol, frame->this, - -1, ENOMEM, NULL); - return 0; +out: + return fixit; } int -dht_fix_dir_xattr (call_frame_t *frame, loc_t *loc, dht_layout_t *layout) +dht_layout_span(dht_layout_t *layout) { - dht_local_t *local = NULL; - int i = 0; - int count = 0; - xlator_t *this = NULL; - dht_conf_t *conf = NULL; - dht_layout_t *dummy = NULL; + int i = 0, count = 0; - local = frame->local; - this = frame->this; - conf = this->private; + for (i = 0; i < layout->cnt; i++) { + if (layout->list[i].err) + continue; - gf_log (this->name, GF_LOG_DEBUG, - "writing the new range for all subvolumes"); + if (layout->list[i].start != layout->list[i].stop) + count++; + } - local->call_cnt = count = conf->subvolume_cnt; + return count; +} - for (i = 0; i < layout->cnt; i++) { - dht_selfheal_dir_xattr_persubvol (frame, loc, layout, i, NULL); +int +dht_decommissioned_bricks_in_layout(xlator_t *this, dht_layout_t *layout) +{ + dht_conf_t *conf = NULL; + int count = 0, i = 0, j = 0; + + if ((this == NULL) || (layout == NULL)) + goto out; - if (--count == 0) - goto out; + conf = this->private; + + for (i = 0; i < layout->cnt; i++) { + for (j = 0; j < conf->subvolume_cnt; j++) { + if (conf->decommissioned_bricks[j] && + conf->decommissioned_bricks[j] == layout->list[i].xlator) { + count++; + } } - /* if we are here, subvolcount > layout_count. subvols-per-directory - * option might be set here. We need to clear out layout from the - * non-participating subvolumes, else it will result in overlaps */ - dummy = dht_layout_new (this, 1); - if (!dummy) - goto out; - for (i = 0; i < conf->subvolume_cnt; i++) { - if (_gf_false == - dht_is_subvol_in_layout (layout, conf->subvolumes[i])) { - dht_selfheal_dir_xattr_persubvol (frame, loc, dummy, 0, - conf->subvolumes[i]); - if (--count == 0) - break; - } + } + +out: + return count; +} + +dht_distribution_type_t +dht_distribution_type(xlator_t *this, dht_layout_t *layout) +{ + dht_distribution_type_t type = GF_DHT_EQUAL_DISTRIBUTION; + int i = 0; + uint32_t start_range = 0, range = 0, diff = 0; + + if ((this == NULL) || (layout == NULL) || (layout->cnt < 1)) { + goto out; + } + + for (i = 0; i < layout->cnt; i++) { + if (start_range == 0) { + start_range = layout->list[i].stop - layout->list[i].start; + continue; + } + + range = layout->list[i].stop - layout->list[i].start; + diff = (range >= start_range) ? range - start_range + : start_range - range; + + if ((range != 0) && (diff > layout->cnt)) { + type = GF_DHT_WEIGHTED_DISTRIBUTION; + break; } + } - dht_layout_unref (this, dummy); out: - return 0; + return type; } -int -dht_selfheal_dir_xattr (call_frame_t *frame, loc_t *loc, dht_layout_t *layout) +gf_boolean_t +dht_should_fix_layout(call_frame_t *frame, dht_layout_t **inmem, + dht_layout_t **ondisk) { - dht_local_t *local = NULL; - int missing_xattr = 0; - int i = 0; - xlator_t *this = NULL; - dht_conf_t *conf = NULL; - dht_layout_t *dummy = NULL; + gf_boolean_t fixit = _gf_true; - local = frame->local; - this = frame->this; - conf = this->private; + dht_local_t *local = NULL; + int layout_span = 0; + int decommissioned_bricks = 0; + dht_conf_t *conf = NULL; + dht_distribution_type_t inmem_dist_type = 0; + dht_distribution_type_t ondisk_dist_type = 0; - for (i = 0; i < layout->cnt; i++) { - if (layout->list[i].err != -1 || !layout->list[i].stop) { - /* err != -1 would mean xattr present on the directory - * or the directory is non existent. - * !layout->list[i].stop would mean layout absent - */ + conf = frame->this->private; - continue; - } - missing_xattr++; - } - /* Also account for subvolumes with no-layout. Used for zero'ing out - * the layouts and for setting quota key's if present */ - for (i = 0; i < conf->subvolume_cnt; i++) { - if (_gf_false == - dht_is_subvol_in_layout (layout, conf->subvolumes[i])) { - missing_xattr++; - } - } - gf_log (this->name, GF_LOG_TRACE, - "%d subvolumes missing xattr for %s", - missing_xattr, loc->path); + local = frame->local; - if (missing_xattr == 0) { - dht_selfheal_dir_finish (frame, this, 0); - return 0; - } + if ((inmem == NULL) || (*inmem == NULL) || (ondisk == NULL) || + (*ondisk == NULL)) + goto out; - local->call_cnt = missing_xattr; - for (i = 0; i < layout->cnt; i++) { - if (layout->list[i].err != -1 || !layout->list[i].stop) - continue; + dht_layout_anomalies(frame->this, &local->loc, *ondisk, + &local->selfheal.hole_cnt, + &local->selfheal.overlaps_cnt, NULL, + &local->selfheal.down, &local->selfheal.misc, NULL); - dht_selfheal_dir_xattr_persubvol (frame, loc, layout, i, NULL); + if (local->selfheal.down || local->selfheal.misc) { + fixit = _gf_false; + goto out; + } - if (--missing_xattr == 0) - break; - } - dummy = dht_layout_new (this, 1); - if (!dummy) - goto out; - for (i = 0; i < conf->subvolume_cnt && missing_xattr; i++) { - if (_gf_false == - dht_is_subvol_in_layout (layout, conf->subvolumes[i])) { - dht_selfheal_dir_xattr_persubvol (frame, loc, dummy, 0, - conf->subvolumes[i]); - missing_xattr--; - } - } + if (local->selfheal.hole_cnt || local->selfheal.overlaps_cnt) + goto out; + + /* If commit hashes are being updated, let it through */ + if ((*inmem)->commit_hash != (*ondisk)->commit_hash) + goto out; + + layout_span = dht_layout_span(*ondisk); + + decommissioned_bricks = dht_decommissioned_bricks_in_layout(frame->this, + *ondisk); + inmem_dist_type = dht_distribution_type(frame->this, *inmem); + ondisk_dist_type = dht_distribution_type(frame->this, *ondisk); + + if ((decommissioned_bricks == 0) && + (layout_span == + (conf->subvolume_cnt - conf->decommission_subvols_cnt)) && + (inmem_dist_type == ondisk_dist_type)) + fixit = _gf_false; - dht_layout_unref (this, dummy); out: - return 0; + + return fixit; } -int -dht_selfheal_dir_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, struct iatt *statpre, - struct iatt *statpost, dict_t *xdata) +static int +dht_selfheal_layout_lock(call_frame_t *frame, dht_layout_t *layout, + gf_boolean_t newdir, dht_selfheal_layout_t healer, + dht_need_heal_t should_heal) { - dht_local_t *local = NULL; - dht_layout_t *layout = NULL; - int this_call_cnt = 0; + dht_local_t *local = NULL; + int count = 1, ret = -1, i = 0; + dht_lock_t **lk_array = NULL; + dht_conf_t *conf = NULL; + dht_layout_t *tmp = NULL; + char gfid[GF_UUID_BUF_SIZE] = {0}; - local = frame->local; - layout = local->selfheal.layout; + GF_VALIDATE_OR_GOTO("dht", frame, err); + GF_VALIDATE_OR_GOTO(frame->this->name, frame->local, err); - this_call_cnt = dht_frame_return (frame); + local = frame->local; - if (is_last_call (this_call_cnt)) { - dht_selfheal_dir_xattr (frame, &local->loc, layout); - } + conf = frame->this->private; - return 0; -} + local->selfheal.healer = healer; + local->selfheal.should_heal = should_heal; + tmp = local->selfheal.layout; + local->selfheal.layout = dht_layout_ref(frame->this, layout); + dht_layout_unref(frame->this, tmp); -int -dht_selfheal_dir_setattr (call_frame_t *frame, loc_t *loc, struct iatt *stbuf, - int32_t valid, dht_layout_t *layout) -{ - int missing_attr = 0; - int i = 0; - dht_local_t *local = NULL; - xlator_t *this = NULL; + if (!newdir) { + count = conf->subvolume_cnt; - local = frame->local; - this = frame->this; + lk_array = GF_CALLOC(count, sizeof(*lk_array), gf_common_mt_char); + if (lk_array == NULL) { + gf_uuid_unparse(local->stbuf.ia_gfid, gfid); + gf_smsg("dht", GF_LOG_ERROR, ENOMEM, DHT_MSG_MEM_ALLOC_FAILED, + "lk_array-gfid=%s", gfid, "path=%s", local->loc.path, NULL); + goto err; + } - for (i = 0; i < layout->cnt; i++) { - if (layout->list[i].err == -1) - missing_attr++; + for (i = 0; i < count; i++) { + lk_array[i] = dht_lock_new( + frame->this, conf->subvolumes[i], &local->loc, F_WRLCK, + DHT_LAYOUT_HEAL_DOMAIN, NULL, FAIL_ON_ANY_ERROR); + if (lk_array[i] == NULL) { + gf_uuid_unparse(local->stbuf.ia_gfid, gfid); + gf_smsg(THIS->name, GF_LOG_ERROR, ENOMEM, + DHT_MSG_MEM_ALLOC_FAILED, "lk_array-gfid=%s", gfid, + "path=%s", local->loc.path, NULL); + goto err; + } + } + } else { + count = 1; + lk_array = GF_CALLOC(count, sizeof(*lk_array), gf_common_mt_char); + if (lk_array == NULL) { + gf_uuid_unparse(local->stbuf.ia_gfid, gfid); + gf_smsg(THIS->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_MEM_ALLOC_FAILED, + "lk_array-gfid=%s", gfid, "path=%s", local->loc.path, NULL); + goto err; } - if (missing_attr == 0) { - dht_selfheal_dir_xattr (frame, loc, layout); - return 0; + lk_array[0] = dht_lock_new(frame->this, local->hashed_subvol, + &local->loc, F_WRLCK, DHT_LAYOUT_HEAL_DOMAIN, + NULL, FAIL_ON_ANY_ERROR); + if (lk_array[0] == NULL) { + gf_uuid_unparse(local->stbuf.ia_gfid, gfid); + gf_smsg(THIS->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_MEM_ALLOC_FAILED, + "lk_array-gfid=%s", gfid, "path=%s", local->loc.path, NULL); + goto err; } + } - if (!uuid_is_null (local->gfid)) - uuid_copy (loc->gfid, local->gfid); + local->lock[0].layout.my_layout.locks = lk_array; + local->lock[0].layout.my_layout.lk_count = count; - local->call_cnt = missing_attr; - for (i = 0; i < layout->cnt; i++) { - if (layout->list[i].err == -1) { - gf_log (this->name, GF_LOG_TRACE, - "setattr for %s on subvol %s", - loc->path, layout->list[i].xlator->name); - - STACK_WIND (frame, dht_selfheal_dir_setattr_cbk, - layout->list[i].xlator, - layout->list[i].xlator->fops->setattr, - loc, stbuf, valid, NULL); - } + ret = dht_blocking_inodelk(frame, lk_array, count, + dht_selfheal_layout_lock_cbk); + if (ret < 0) { + local->lock[0].layout.my_layout.locks = NULL; + local->lock[0].layout.my_layout.lk_count = 0; + goto err; + } + + return 0; +err: + if (lk_array != NULL) { + dht_lock_array_free(lk_array, count); + GF_FREE(lk_array); + } + + return -1; +} + +static int +dht_selfheal_dir_xattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) +{ + dht_local_t *local = NULL; + xlator_t *subvol = NULL; + struct iatt *stbuf = NULL; + int i = 0; + int ret = 0; + dht_layout_t *layout = NULL; + int err = 0; + int this_call_cnt = 0; + char gfid[GF_UUID_BUF_SIZE] = {0}; + + local = frame->local; + layout = local->selfheal.layout; + subvol = cookie; + + if (op_ret == 0) { + err = 0; + } else { + gf_uuid_unparse(local->loc.gfid, gfid); + gf_smsg(this->name, GF_LOG_ERROR, op_errno, + DHT_MSG_DIR_SELFHEAL_XATTR_FAILED, "name=%s", subvol->name, + "path=%s", local->loc.path, "gfid=%s", gfid, NULL); + err = op_errno; + } + + ret = dict_get_bin(xdata, DHT_IATT_IN_XDATA_KEY, (void **)&stbuf); + if (ret < 0) { + gf_uuid_unparse(local->loc.gfid, gfid); + gf_msg_debug(this->name, 0, + "key = %s not present in dict" + ", path:%s gfid:%s", + DHT_IATT_IN_XDATA_KEY, local->loc.path, gfid); + } + + for (i = 0; i < layout->cnt; i++) { + if (layout->list[i].xlator == subvol) { + layout->list[i].err = err; + break; } + } - return 0; + LOCK(&frame->lock); + { + dht_iatt_merge(this, &local->stbuf, stbuf); + } + UNLOCK(&frame->lock); + + this_call_cnt = dht_frame_return(frame); + + if (is_last_call(this_call_cnt)) { + dht_selfheal_dir_finish(frame, this, 0, 1); + } + + return 0; } +/* Code is required to set user xattr to local->xattr + */ int -dht_selfheal_dir_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, - inode_t *inode, struct iatt *stbuf, - struct iatt *preparent, struct iatt *postparent, - dict_t *xdata) +dht_set_user_xattr(dict_t *dict, char *k, data_t *v, void *data) { - dht_local_t *local = NULL; - dht_layout_t *layout = NULL; - call_frame_t *prev = NULL; - xlator_t *subvol = NULL; - int i = 0; - int this_call_cnt = 0; - - - local = frame->local; - layout = local->selfheal.layout; - prev = cookie; - subvol = prev->this; - - if ((op_ret == 0) || ((op_ret == -1) && (op_errno == EEXIST))) { - for (i = 0; i < layout->cnt; i++) { - if (layout->list[i].xlator == subvol) { - layout->list[i].err = -1; - break; - } - } - } + dict_t *set_xattr = data; + int ret = -1; - if (op_ret) { - gf_log (this->name, ((op_errno == EEXIST) ? GF_LOG_DEBUG : - GF_LOG_WARNING), - "selfhealing directory %s failed: %s", - local->loc.path, strerror (op_errno)); - goto out; + ret = dict_set(set_xattr, k, v); + return ret; +} + +static int +dht_selfheal_dir_xattr_persubvol(call_frame_t *frame, loc_t *loc, + dht_layout_t *layout, int i, + xlator_t *req_subvol) +{ + xlator_t *subvol = NULL; + dict_t *xattr = NULL; + dict_t *xdata = NULL; + int ret = 0; + xlator_t *this = NULL; + int32_t *disk_layout = NULL; + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + data_t *data = NULL; + char gfid[GF_UUID_BUF_SIZE] = {0}; + + local = frame->local; + if (req_subvol) + subvol = req_subvol; + else + subvol = layout->list[i].xlator; + this = frame->this; + + GF_VALIDATE_OR_GOTO("", this, err); + GF_VALIDATE_OR_GOTO(this->name, layout, err); + GF_VALIDATE_OR_GOTO(this->name, local, err); + GF_VALIDATE_OR_GOTO(this->name, subvol, err); + VALIDATE_OR_GOTO(this->private, err); + + conf = this->private; + + xattr = dict_new(); + if (!xattr) { + goto err; + } + + xdata = dict_new(); + if (!xdata) + goto err; + + ret = dict_set_str(xdata, GLUSTERFS_INTERNAL_FOP_KEY, "yes"); + if (ret < 0) { + gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED, + "path=%s", loc->path, "key=%s", GLUSTERFS_INTERNAL_FOP_KEY, + "gfid=%s", gfid, NULL); + goto err; + } + + ret = dict_set_int8(xdata, DHT_IATT_IN_XDATA_KEY, 1); + if (ret < 0) { + gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED, + "path=%s", loc->path, "key=%s", DHT_IATT_IN_XDATA_KEY, + "gfid=%s", gfid, NULL); + goto err; + } + + gf_uuid_unparse(loc->inode->gfid, gfid); + + ret = dht_disk_layout_extract(this, layout, i, &disk_layout); + if (ret == -1) { + gf_smsg(this->name, GF_LOG_WARNING, 0, + DHT_MSG_DIR_SELFHEAL_XATTR_FAILED, + "extract-disk-layout-failed, path=%s", loc->path, "subvol=%s", + subvol->name, "gfid=%s", gfid, NULL); + goto err; + } + + ret = dict_set_bin(xattr, conf->xattr_name, disk_layout, 4 * 4); + if (ret == -1) { + gf_smsg(this->name, GF_LOG_WARNING, 0, + DHT_MSG_DIR_SELFHEAL_XATTR_FAILED, "path=%s", loc->path, + "subvol=%s", subvol->name, + "set-xattr-dictionary-failed" + "gfid=%s", + gfid, NULL); + goto err; + } + disk_layout = NULL; + + gf_msg_trace(this->name, 0, + "setting hash range 0x%x - 0x%x (type %d) on subvolume %s" + " for %s", + layout->list[i].start, layout->list[i].stop, layout->type, + subvol->name, loc->path); + + if (local->xattr) { + data = dict_get(local->xattr, QUOTA_LIMIT_KEY); + if (data) { + ret = dict_add(xattr, QUOTA_LIMIT_KEY, data); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, + "path=%s", loc->path, "key=%s", QUOTA_LIMIT_KEY, NULL); + } + } + data = dict_get(local->xattr, QUOTA_LIMIT_OBJECTS_KEY); + if (data) { + ret = dict_add(xattr, QUOTA_LIMIT_OBJECTS_KEY, data); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, + "path=%s", loc->path, "key=%s", QUOTA_LIMIT_OBJECTS_KEY, + NULL); + } } + } - dht_iatt_merge (this, &local->stbuf, stbuf, prev->this); - dht_iatt_merge (this, &local->preparent, preparent, prev->this); - dht_iatt_merge (this, &local->postparent, postparent, prev->this); + if (!gf_uuid_is_null(local->gfid)) + gf_uuid_copy(loc->gfid, local->gfid); + STACK_WIND_COOKIE(frame, dht_selfheal_dir_xattr_cbk, (void *)subvol, subvol, + subvol->fops->setxattr, loc, xattr, 0, xdata); + + dict_unref(xattr); + dict_unref(xdata); + + return 0; + +err: + if (xattr) + dict_unref(xattr); + if (xdata) + dict_unref(xdata); + + GF_FREE(disk_layout); + + dht_selfheal_dir_xattr_cbk(frame, (void *)subvol, frame->this, -1, ENOMEM, + NULL); + return 0; +} + +static int +dht_fix_dir_xattr(call_frame_t *frame, loc_t *loc, dht_layout_t *layout) +{ + dht_local_t *local = NULL; + int i = 0; + int count = 0; + xlator_t *this = NULL; + dht_conf_t *conf = NULL; + dht_layout_t *dummy = NULL; + + local = frame->local; + this = frame->this; + conf = this->private; + + gf_msg_debug(this->name, 0, "%s: Writing the new range for all subvolumes", + loc->path); + + local->call_cnt = count = conf->subvolume_cnt; + + if (gf_log_get_loglevel() >= GF_LOG_DEBUG) + dht_log_new_layout_for_dir_selfheal(this, loc, layout); + + for (i = 0; i < layout->cnt; i++) { + dht_selfheal_dir_xattr_persubvol(frame, loc, layout, i, NULL); + + if (--count == 0) + goto out; + } + /* if we are here, subvolcount > layout_count. subvols-per-directory + * option might be set here. We need to clear out layout from the + * non-participating subvolumes, else it will result in overlaps */ + dummy = dht_layout_new(this, 1); + if (!dummy) + goto out; + dummy->commit_hash = layout->commit_hash; + for (i = 0; i < conf->subvolume_cnt; i++) { + if (_gf_false == dht_is_subvol_in_layout(layout, conf->subvolumes[i])) { + dht_selfheal_dir_xattr_persubvol(frame, loc, dummy, 0, + conf->subvolumes[i]); + if (--count == 0) + break; + } + } + + dht_layout_unref(this, dummy); out: - this_call_cnt = dht_frame_return (frame); + return 0; +} - if (is_last_call (this_call_cnt)) { - dht_selfheal_dir_setattr (frame, &local->loc, &local->stbuf, 0xffffff, layout); +static int +dht_selfheal_dir_xattr(call_frame_t *frame, loc_t *loc, dht_layout_t *layout) +{ + dht_local_t *local = NULL; + int missing_xattr = 0; + int i = 0; + xlator_t *this = NULL; + dht_conf_t *conf = NULL; + dht_layout_t *dummy = NULL; + char gfid[GF_UUID_BUF_SIZE] = { + 0, + }; + + local = frame->local; + this = frame->this; + conf = this->private; + + for (i = 0; i < layout->cnt; i++) { + if (layout->list[i].err != -1 || !layout->list[i].stop) { + /* err != -1 would mean xattr present on the directory + * or the directory is non existent. + * !layout->list[i].stop would mean layout absent + */ + + continue; + } + missing_xattr++; + } + /* Also account for subvolumes with no-layout. Used for zero'ing out + * the layouts and for setting quota key's if present */ + for (i = 0; i < conf->subvolume_cnt; i++) { + if (_gf_false == dht_is_subvol_in_layout(layout, conf->subvolumes[i])) { + missing_xattr++; } + } + gf_msg_trace(this->name, 0, "%d subvolumes missing xattr for %s", + missing_xattr, loc->path); + if (missing_xattr == 0) { + dht_selfheal_dir_finish(frame, this, 0, 1); return 0; + } + + local->call_cnt = missing_xattr; + + if (gf_log_get_loglevel() >= GF_LOG_DEBUG) + dht_log_new_layout_for_dir_selfheal(this, loc, layout); + + for (i = 0; i < layout->cnt; i++) { + if (layout->list[i].err != -1 || !layout->list[i].stop) + continue; + + dht_selfheal_dir_xattr_persubvol(frame, loc, layout, i, NULL); + + if (--missing_xattr == 0) + break; + } + dummy = dht_layout_new(this, 1); + if (!dummy) { + gf_uuid_unparse(loc->gfid, gfid); + gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_DUMMY_ALLOC_FAILED, + "path=%s", loc->path, "gfid=%s", gfid, NULL); + goto out; + } + for (i = 0; i < conf->subvolume_cnt && missing_xattr; i++) { + if (_gf_false == dht_is_subvol_in_layout(layout, conf->subvolumes[i])) { + dht_selfheal_dir_xattr_persubvol(frame, loc, dummy, 0, + conf->subvolumes[i]); + missing_xattr--; + } + } + + dht_layout_unref(this, dummy); +out: + return 0; } -void -dht_selfheal_dir_mkdir_setacl (dict_t *xattr, dict_t *dict) +int +dht_selfheal_dir_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *statpre, + struct iatt *statpost, dict_t *xdata) { - data_t *acl_default = NULL; - data_t *acl_access = NULL; - xlator_t *this = NULL; - int ret = -1; + dht_local_t *local = NULL; + dht_layout_t *layout = NULL; + int this_call_cnt = 0, ret = -1; - GF_ASSERT (xattr); - GF_ASSERT (dict); + local = frame->local; + layout = local->selfheal.layout; - this = THIS; - GF_ASSERT (this); + this_call_cnt = dht_frame_return(frame); - acl_default = dict_get (xattr, POSIX_ACL_DEFAULT_XATTR); + if (is_last_call(this_call_cnt)) { + if (!local->heal_layout) { + gf_msg_trace(this->name, 0, "Skip heal layout for %s gfid = %s ", + local->loc.path, uuid_utoa(local->gfid)); - if (!acl_default) { - gf_log (this->name, GF_LOG_DEBUG, - "ACL_DEFAULT xattr not present"); - goto cont; + dht_selfheal_dir_finish(frame, this, 0, 1); + return 0; } - ret = dict_set (dict, POSIX_ACL_DEFAULT_XATTR, acl_default); - if (ret) - gf_log (this->name, GF_LOG_WARNING, - "Could not set ACL_DEFAULT xattr"); -cont: - acl_access = dict_get (xattr, POSIX_ACL_ACCESS_XATTR); - if (!acl_access) { - gf_log (this->name, GF_LOG_DEBUG, - "ACL_ACCESS xattr not present"); - goto out; + ret = dht_selfheal_layout_lock(frame, layout, _gf_false, + dht_selfheal_dir_xattr, + dht_should_heal_layout); + + if (ret < 0) { + dht_selfheal_dir_finish(frame, this, -1, 1); } - ret = dict_set (dict, POSIX_ACL_ACCESS_XATTR, acl_access); - if (ret) - gf_log (this->name, GF_LOG_WARNING, - "Could not set ACL_ACCESS xattr"); + } -out: - return; + return 0; } int -dht_selfheal_dir_mkdir (call_frame_t *frame, loc_t *loc, - dht_layout_t *layout, int force) +dht_selfheal_dir_setattr(call_frame_t *frame, loc_t *loc, struct iatt *stbuf, + int32_t valid, dht_layout_t *layout) { - int missing_dirs = 0; - int i = 0; - int ret = -1; - dht_local_t *local = NULL; - xlator_t *this = NULL; - dict_t *dict = NULL; + int missing_attr = 0; + int i = 0, ret = -1; + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + xlator_t *this = NULL; + int cnt = 0; + + local = frame->local; + this = frame->this; + conf = this->private; + + /* We need to heal the attrs if: + * 1. Any directories were missing - the newly created dirs will need + * to have the correct attrs set + * 2. An existing dir does not have the correct permissions -they may + * have been changed when a brick was down. + */ + + for (i = 0; i < layout->cnt; i++) { + if (layout->list[i].err == -1) + missing_attr++; + } + + if ((missing_attr == 0) && (local->need_attrheal == 0)) { + if (!local->heal_layout) { + gf_msg_trace(this->name, 0, "Skip heal layout for %s gfid = %s ", + loc->path, uuid_utoa(loc->gfid)); + dht_selfheal_dir_finish(frame, this, 0, 1); + return 0; + } + ret = dht_selfheal_layout_lock(frame, layout, _gf_false, + dht_selfheal_dir_xattr, + dht_should_heal_layout); + + if (ret < 0) { + dht_selfheal_dir_finish(frame, this, -1, 1); + } - local = frame->local; - this = frame->this; + return 0; + } + + cnt = local->call_cnt = conf->subvolume_cnt; + + for (i = 0; i < cnt; i++) { + STACK_WIND(frame, dht_selfheal_dir_setattr_cbk, layout->list[i].xlator, + layout->list[i].xlator->fops->setattr, loc, stbuf, valid, + NULL); + } + + return 0; +} +static int +dht_selfheal_dir_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, + struct iatt *stbuf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + dht_local_t *local = NULL; + dht_layout_t *layout = NULL; + xlator_t *prev = NULL; + xlator_t *subvol = NULL; + int i = 0, ret = -1; + int this_call_cnt = 0; + char gfid[GF_UUID_BUF_SIZE] = {0}; + + local = frame->local; + layout = local->selfheal.layout; + prev = cookie; + subvol = prev; + + if ((op_ret == 0) || ((op_ret == -1) && (op_errno == EEXIST))) { for (i = 0; i < layout->cnt; i++) { - if (layout->list[i].err == ENOENT || force) - missing_dirs++; + if (layout->list[i].xlator == subvol) { + layout->list[i].err = -1; + break; + } } + } + + if (op_ret) { + gf_uuid_unparse(local->loc.gfid, gfid); + gf_smsg(this->name, + ((op_errno == EEXIST) ? GF_LOG_DEBUG : GF_LOG_WARNING), + op_errno, DHT_MSG_DIR_SELFHEAL_FAILED, "path=%s", + local->loc.path, "gfid=%s", gfid, NULL); + goto out; + } + dht_iatt_merge(this, &local->preparent, preparent); + dht_iatt_merge(this, &local->postparent, postparent); + ret = 0; - if (missing_dirs == 0) { - dht_selfheal_dir_setattr (frame, loc, &local->stbuf, 0xffffffff, layout); - return 0; - } +out: + this_call_cnt = dht_frame_return(frame); - local->call_cnt = missing_dirs; - if (!uuid_is_null (local->gfid)) { - dict = dict_new (); - if (!dict) - return -1; - - ret = dict_set_static_bin (dict, "gfid-req", local->gfid, 16); - if (ret) - gf_log (this->name, GF_LOG_WARNING, - "%s: failed to set gfid in dict", loc->path); - } else if (local->params) { - /* Send the dictionary from higher layers directly */ - dict = dict_ref (local->params); - } - /* Set acls */ - if (local->xattr && dict) - dht_selfheal_dir_mkdir_setacl (local->xattr, dict); + if (is_last_call(this_call_cnt)) { + dht_selfheal_dir_finish(frame, this, ret, 0); + dht_selfheal_dir_setattr(frame, &local->loc, &local->stbuf, 0xffffff, + layout); + } + return 0; +} + +static int +dht_selfheal_dir_mkdir_lookup_done(call_frame_t *frame, xlator_t *this) +{ + dht_local_t *local = NULL; + int i = 0; + dict_t *dict = NULL; + dht_layout_t *layout = NULL; + loc_t *loc = NULL; + int cnt = 0; + int ret = -1; + + VALIDATE_OR_GOTO(this->private, err); + + local = frame->local; + layout = local->layout; + loc = &local->loc; + + if (!gf_uuid_is_null(local->gfid)) { + dict = dict_new(); if (!dict) - gf_log (this->name, GF_LOG_WARNING, - "dict is NULL, need to make sure gfids are same"); + return -1; - for (i = 0; i < layout->cnt; i++) { - if (layout->list[i].err == ENOENT || force) { - gf_log (this->name, GF_LOG_DEBUG, - "creating directory %s on subvol %s", - loc->path, layout->list[i].xlator->name); - - STACK_WIND (frame, dht_selfheal_dir_mkdir_cbk, - layout->list[i].xlator, - layout->list[i].xlator->fops->mkdir, - loc, - st_mode_from_ia (local->stbuf.ia_prot, - local->stbuf.ia_type), - 0, dict); - } + ret = dict_set_gfuuid(dict, "gfid-req", local->gfid, true); + if (ret) + gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED, + "path=%s", loc->path, "key=gfid-req", NULL); + } else if (local->params) { + /* Send the dictionary from higher layers directly */ + + dict = dict_ref(local->params); + } + /* Code to update all extended attributed from local->xattr + to dict + */ + dht_dir_set_heal_xattr(this, local, dict, local->xattr, NULL, NULL); + + if (!dict) { + gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_IS_NULL, NULL); + dict = dict_new(); + if (!dict) + return -1; + } + ret = dict_set_flag(dict, GF_INTERNAL_CTX_KEY, GF_DHT_HEAL_DIR); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, "key=%s", + GF_INTERNAL_CTX_KEY, "path=%s", loc->path, NULL); + /* We can still continue. As heal can still happen + * unless quota limits have reached for the dir. + */ + } + + cnt = layout->cnt; + for (i = 0; i < cnt; i++) { + if (layout->list[i].err == ESTALE || layout->list[i].err == ENOENT || + local->selfheal.force_mkdir) { + gf_msg_debug(this->name, 0, "Creating directory %s on subvol %s", + loc->path, layout->list[i].xlator->name); + + STACK_WIND_COOKIE( + frame, dht_selfheal_dir_mkdir_cbk, layout->list[i].xlator, + layout->list[i].xlator, layout->list[i].xlator->fops->mkdir, + loc, + st_mode_from_ia(local->stbuf.ia_prot, local->stbuf.ia_type), 0, + dict); } + } - if (dict) - dict_unref (dict); + if (dict) + dict_unref(dict); - return 0; -} + return 0; +err: + dht_selfheal_dir_finish(frame, this, -1, 1); + return 0; +} -int -dht_selfheal_layout_alloc_start (xlator_t *this, loc_t *loc, - dht_layout_t *layout) +static int +dht_selfheal_dir_mkdir_lookup_cbk(call_frame_t *frame, void *cookie, + xlator_t *this, int op_ret, int op_errno, + inode_t *inode, struct iatt *stbuf, + dict_t *xattr, struct iatt *postparent) { - int start = 0; - uint32_t hashval = 0; - int ret = 0; + dht_local_t *local = NULL; + int i = 0; + int this_call_cnt = 0; + int missing_dirs = 0; + dht_layout_t *layout = NULL; + xlator_t *prev = 0; + loc_t *loc = NULL; + char gfid_local[GF_UUID_BUF_SIZE] = {0}; + int index = -1; + + VALIDATE_OR_GOTO(this->private, err); + + local = frame->local; + layout = local->layout; + loc = &local->loc; + prev = cookie; + + if (!gf_uuid_is_null(local->gfid)) + gf_uuid_unparse(local->gfid, gfid_local); + + LOCK(&frame->lock); + { + index = dht_layout_index_for_subvol(layout, prev); + if ((op_ret < 0) && (op_errno == ENOENT || op_errno == ESTALE)) { + local->selfheal.hole_cnt = !local->selfheal.hole_cnt + ? 1 + : local->selfheal.hole_cnt + 1; + /* the status might have changed. Update the layout with the + * new status + */ + if (index >= 0) { + layout->list[index].err = op_errno; + } + } + + if (!op_ret) { + dht_iatt_merge(this, &local->stbuf, stbuf); + if (prev == local->mds_subvol) { + dict_unref(local->xattr); + local->xattr = dict_ref(xattr); + } + /* the status might have changed. Update the layout with the + * new status + */ + if (index >= 0) { + layout->list[index].err = -1; + } + } + } + UNLOCK(&frame->lock); + + this_call_cnt = dht_frame_return(frame); + + if (is_last_call(this_call_cnt)) { + if (local->selfheal.hole_cnt == layout->cnt) { + gf_msg_debug(this->name, op_errno, + "Lookup failed, an rmdir could have " + "deleted this entry %s", + loc->name); + local->op_errno = op_errno; + goto err; + } else { + for (i = 0; i < layout->cnt; i++) { + if (layout->list[i].err == ENOENT || + layout->list[i].err == ESTALE || + local->selfheal.force_mkdir) + missing_dirs++; + } + + if (missing_dirs == 0) { + dht_selfheal_dir_finish(frame, this, 0, 0); + dht_selfheal_dir_setattr(frame, loc, &local->stbuf, 0xffffffff, + layout); + return 0; + } - ret = dht_hash_compute (this, layout->type, loc->path, &hashval); - if (ret == 0) { - start = (hashval % layout->cnt); + local->call_cnt = missing_dirs; + dht_selfheal_dir_mkdir_lookup_done(frame, this); } + } + + return 0; - return start; +err: + dht_selfheal_dir_finish(frame, this, -1, 1); + return 0; } -static inline int -dht_get_layout_count (xlator_t *this, dht_layout_t *layout, int new_layout) +static int +dht_selfheal_dir_mkdir_lock_cbk(call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xdata) { - int i = 0; - int j = 0; - int err = 0; - int count = 0; - dht_conf_t *conf = NULL; - - /* Gets in use only for replace-brick, remove-brick */ - conf = this->private; - for (i = 0; i < layout->cnt; i++) { - for (j = 0; j < conf->subvolume_cnt; j++) { - if (conf->decommissioned_bricks[j] && - conf->decommissioned_bricks[j] == layout->list[i].xlator) { - layout->list[i].err = EINVAL; - break; - } - } + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + int i = 0; + int ret = -1; + xlator_t *mds_subvol = NULL; + + VALIDATE_OR_GOTO(this->private, err); + + conf = this->private; + local = frame->local; + mds_subvol = local->mds_subvol; + + local->call_cnt = conf->subvolume_cnt; + + if (op_ret < 0) { + if (op_errno == EINVAL) { + local->call_cnt = 1; + dht_selfheal_dir_mkdir_lookup_done(frame, this); + return 0; } - for (i = 0; i < layout->cnt; i++) { - err = layout->list[i].err; - if (err == -1 || err == 0 || err == ENOENT) { - /* Setting list[i].err = -1 is an indication for - dht_selfheal_layout_new_directory() to assign - a range. We set it to -1 based on any one of - the three criteria: - - - err == -1 already, which means directory - existed but layout was not set on it. - - - err == 0, which means directory exists and - has an old layout piece which will be - overwritten now. - - - err == ENOENT, which means directory does - not exist (possibly racing with mkdir or - finishing half done mkdir). The missing - directory will be attempted to be recreated. - - It is important to note that it is safe - to race with mkdir() as self-heal and - mkdir are idempotent operations. Both will - strive to set the directory and layouts to - the same final state. - */ - count++; - if (!err) - layout->list[i].err = -1; - } + gf_smsg(this->name, GF_LOG_WARNING, op_errno, DHT_MSG_ENTRYLK_ERROR, + "path=%s", local->loc.path, NULL); + + local->op_errno = op_errno; + goto err; + } + + /* After getting locks, perform lookup again to ensure that the + directory was not deleted by a racing rmdir + */ + if (!local->xattr_req) + local->xattr_req = dict_new(); + + ret = dict_set_int32(local->xattr_req, "list-xattr", 1); + if (ret) + gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, "path=%s", + local->loc.path, NULL); + + for (i = 0; i < conf->subvolume_cnt; i++) { + if (mds_subvol && conf->subvolumes[i] == mds_subvol) { + STACK_WIND_COOKIE(frame, dht_selfheal_dir_mkdir_lookup_cbk, + conf->subvolumes[i], conf->subvolumes[i], + conf->subvolumes[i]->fops->lookup, &local->loc, + local->xattr_req); + } else { + STACK_WIND_COOKIE(frame, dht_selfheal_dir_mkdir_lookup_cbk, + conf->subvolumes[i], conf->subvolumes[i], + conf->subvolumes[i]->fops->lookup, &local->loc, + NULL); } + } + + return 0; - /* no subvolume has enough space, but can't stop directory creation */ - if (!count || !new_layout) { - for (i = 0; i < layout->cnt; i++) { - err = layout->list[i].err; - if (err == ENOSPC) { - layout->list[i].err = -1; - count++; - } +err: + dht_selfheal_dir_finish(frame, this, -1, 1); + return 0; +} + +static int +dht_selfheal_dir_mkdir(call_frame_t *frame, loc_t *loc, dht_layout_t *layout, + int force) +{ + int missing_dirs = 0; + int i = 0; + int op_errno = 0; + int ret = -1; + dht_local_t *local = NULL; + xlator_t *this = NULL; + dht_conf_t *conf = NULL; + + local = frame->local; + this = frame->this; + conf = this->private; + + local->selfheal.force_mkdir = force; + local->selfheal.hole_cnt = 0; + + for (i = 0; i < layout->cnt; i++) { + if (layout->list[i].err == ENOENT || force) + missing_dirs++; + } + + if (missing_dirs == 0) { + /* We don't need to create any directories. Proceed to heal the + * attrs and xattrs + */ + if (!__is_root_gfid(local->stbuf.ia_gfid)) { + if (local->need_xattr_heal) { + local->need_xattr_heal = 0; + ret = dht_dir_xattr_heal(this, local, &op_errno); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, op_errno, + DHT_MSG_DIR_XATTR_HEAL_FAILED, "path=%s", + local->loc.path, "gfid=%s", local->gfid, NULL); } + } else { + if (!gf_uuid_is_null(local->gfid)) + gf_uuid_copy(loc->gfid, local->gfid); + + ret = dht_common_mark_mdsxattr(frame, NULL, 0); + if (!ret) + return 0; + + gf_smsg(this->name, GF_LOG_INFO, 0, DHT_MSG_SET_XATTR_FAILED, + "path=%s", local->loc.path, "gfid=%s", local->gfid, + NULL); + } + } + dht_selfheal_dir_setattr(frame, loc, &local->stbuf, 0xffffffff, layout); + return 0; + } + + /* MDS xattr is populated only while DHT is having more than one + subvol.In case of graph switch while adding more dht subvols need to + consider hash subvol as a MDS to avoid MDS check failure at the time + of running fop on directory + */ + if (!dict_get(local->xattr, conf->mds_xattr_key) && + (conf->subvolume_cnt > 1)) { + if (local->hashed_subvol == NULL) { + local->hashed_subvol = dht_subvol_get_hashed(this, loc); + if (local->hashed_subvol == NULL) { + local->op_errno = EINVAL; + gf_smsg(this->name, GF_LOG_WARNING, local->op_errno, + DHT_MSG_HASHED_SUBVOL_GET_FAILED, "gfid=%s", + loc->pargfid, "name=%s", loc->name, "path=%s", + loc->path, NULL); + goto err; + } } + ret = dht_inode_ctx_mdsvol_set(local->inode, this, + local->hashed_subvol); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SET_INODE_CTX_FAILED, + "Failed to set hashed subvol for %s on inode vol is %s", + local->loc.path, + local->hashed_subvol ? local->hashed_subvol->name : "NULL"); + goto err; + } + } + + if (local->hashed_subvol == NULL) { + local->hashed_subvol = dht_subvol_get_hashed(this, loc); + if (local->hashed_subvol == NULL) { + local->op_errno = EINVAL; + gf_smsg(this->name, GF_LOG_WARNING, local->op_errno, + DHT_MSG_HASHED_SUBVOL_GET_FAILED, "gfid=%s", loc->pargfid, + "name=%s", loc->name, "path=%s", loc->path, NULL); + goto err; + } + } + + local->current = &local->lock[0]; + ret = dht_protect_namespace(frame, loc, local->hashed_subvol, + &local->current->ns, + dht_selfheal_dir_mkdir_lock_cbk); - /* if layout->spread_cnt is set, check if it is <= available - * subvolumes (down brick and decommissioned bricks are considered - * un-availbale). Else return count (available up bricks) */ - count = ((layout->spread_cnt && - (layout->spread_cnt <= count)) ? - layout->spread_cnt : ((count) ? count : 1)); + if (ret < 0) + goto err; + + return 0; +err: + return -1; +} - return count; +static int +dht_selfheal_layout_alloc_start(xlator_t *this, loc_t *loc, + dht_layout_t *layout) +{ + int start = 0; + uint32_t hashval = 0; + int ret = 0; + const char *str = NULL; + dht_conf_t *conf = NULL; + char buf[UUID_CANONICAL_FORM_LEN + 1] = { + 0, + }; + + conf = this->private; + + if (conf->randomize_by_gfid) { + str = uuid_utoa_r(loc->gfid, buf); + } else { + str = loc->path; + } + + ret = dht_hash_compute(this, layout->type, str, &hashval); + if (ret == 0) { + start = (hashval % layout->cnt); + } + + return start; } +static int +dht_get_layout_count(xlator_t *this, dht_layout_t *layout, int new_layout) +{ + int i = 0; + int j = 0; + int err = 0; + int count = 0; + dht_conf_t *conf = NULL; + + /* Gets in use only for replace-brick, remove-brick */ + conf = this->private; + for (i = 0; i < layout->cnt; i++) { + for (j = 0; j < conf->subvolume_cnt; j++) { + if (conf->decommissioned_bricks[j] && + conf->decommissioned_bricks[j] == layout->list[i].xlator) { + layout->list[i].err = EINVAL; + break; + } + } + } + + for (i = 0; i < layout->cnt; i++) { + err = layout->list[i].err; + if (err == -1 || err == 0 || err == ENOENT) { + /* Take this with a pinch of salt. The behaviour seems + * to be slightly different when this function is + * invoked from mkdir codepath. For eg., err == 0 in + * mkdir codepath means directory created but xattr + * is not set yet. + */ + + /* Setting list[i].err = -1 is an indication for + dht_selfheal_layout_new_directory() to assign + a range. We set it to -1 based on any one of + the three criteria: + + - err == -1 already, which means directory + existed but layout was not set on it. + + - err == 0, which means directory exists and + has an old layout piece which will be + overwritten now. + + - err == ENOENT, which means directory does + not exist (possibly racing with mkdir or + finishing half done mkdir). The missing + directory will be attempted to be recreated. + */ + count++; + if (!err) + layout->list[i].err = -1; + } + } -void dht_selfheal_layout_new_directory (call_frame_t *frame, loc_t *loc, - dht_layout_t *new_layout); + /* no subvolume has enough space, but can't stop directory creation */ + if (!count || !new_layout) { + for (i = 0; i < layout->cnt; i++) { + err = layout->list[i].err; + if (err == ENOSPC) { + layout->list[i].err = -1; + count++; + } + } + } + + /* if layout->spread_cnt is set, check if it is <= available + * subvolumes (down brick and decommissioned bricks are considered + * un-available). Else return count (available up bricks) */ + count = ((layout->spread_cnt && (layout->spread_cnt <= count)) + ? layout->spread_cnt + : ((count) ? count : 1)); -void dht_layout_entry_swap (dht_layout_t *layout, int i, int j); -void dht_layout_range_swap (dht_layout_t *layout, int i, int j); + return count; +} + +void +dht_selfheal_layout_new_directory(call_frame_t *frame, loc_t *loc, + dht_layout_t *new_layout); + +void +dht_layout_range_swap(dht_layout_t *layout, int i, int j); /* * It's a bit icky using local variables in a macro, but it makes the rest * of the code a lot clearer. */ -#define OV_ENTRY(x,y) table[x*new->cnt+y] +#define OV_ENTRY(x, y) table[x * new->cnt + y] -void -dht_selfheal_layout_maximize_overlap (call_frame_t *frame, loc_t *loc, - dht_layout_t *new, dht_layout_t *old) +static void +dht_selfheal_layout_maximize_overlap(call_frame_t *frame, loc_t *loc, + dht_layout_t *new, dht_layout_t *old) { - int i = 0; - int j = 0; - uint32_t curr_overlap = 0; - uint32_t max_overlap = 0; - int max_overlap_idx = -1; - uint32_t overlap = 0; - uint32_t *table = NULL; - - dht_layout_sort_volname (old); - /* Now both old_layout->list[] and new_layout->list[] - are match the same xlators/subvolumes. i.e, - old_layout->[i] and new_layout->[i] are referring - to the same subvolumes - */ - - /* Build a table of overlaps between new[i] and old[j]. */ - table = alloca(sizeof(overlap)*old->cnt*new->cnt); - if (!table) { - return; + int i = 0; + int j = 0; + uint32_t curr_overlap = 0; + uint32_t max_overlap = 0; + int max_overlap_idx = -1; + uint32_t overlap = 0; + uint32_t *table = NULL; + + dht_layout_sort_volname(old); + /* Now both old_layout->list[] and new_layout->list[] + are match the same xlators/subvolumes. i.e, + old_layout->[i] and new_layout->[i] are referring + to the same subvolumes + */ + + /* Build a table of overlaps between new[i] and old[j]. */ + table = alloca(sizeof(overlap) * old->cnt * new->cnt); + if (!table) { + return; + } + memset(table, 0, sizeof(overlap) * old->cnt * new->cnt); + for (i = 0; i < new->cnt; ++i) { + for (j = 0; j < old->cnt; ++j) { + OV_ENTRY(i, j) = dht_overlap_calc(old, j, new, i); } - memset(table,0,sizeof(overlap)*old->cnt*new->cnt); - for (i = 0; i < new->cnt; ++i) { - for (j = 0; j < old->cnt; ++j) { - OV_ENTRY(i,j) = dht_overlap_calc(old,j,new,i); - } + } + + for (i = 0; i < new->cnt; i++) { + if (new->list[i].err > 0) { + /* Subvol might be marked for decommission + with EINVAL, or some other serious error + marked with positive errno. + */ + continue; } - for (i = 0; i < new->cnt; i++) { - if (new->list[i].err > 0) { - /* Subvol might be marked for decommission - with EINVAL, or some other serious error - marked with positive errno. - */ - continue; - } - - max_overlap = 0; - max_overlap_idx = i; - for (j = (i + 1); j < new->cnt; ++j) { - if (new->list[j].err > 0) { - /* Subvol might be marked for decommission - with EINVAL, or some other serious error - marked with positive errno. - */ - continue; - } - /* Calculate the overlap now. */ - curr_overlap = OV_ENTRY(i,i) + OV_ENTRY(j,j); - /* Calculate the overlap after the proposed swap. */ - overlap = OV_ENTRY(i,j) + OV_ENTRY(j,i); - /* Are we better than status quo? */ - if (overlap > curr_overlap) { - overlap -= curr_overlap; - /* Are we better than the previous choice? */ - if (overlap > max_overlap) { - max_overlap = overlap; - max_overlap_idx = j; - } - } + max_overlap = 0; + max_overlap_idx = i; + for (j = (i + 1); j < new->cnt; ++j) { + if (new->list[j].err > 0) { + /* Subvol might be marked for decommission + with EINVAL, or some other serious error + marked with positive errno. + */ + continue; + } + /* Calculate the overlap now. */ + curr_overlap = OV_ENTRY(i, i) + OV_ENTRY(j, j); + /* Calculate the overlap after the proposed swap. */ + overlap = OV_ENTRY(i, j) + OV_ENTRY(j, i); + /* Are we better than status quo? */ + if (overlap > curr_overlap) { + overlap -= curr_overlap; + /* Are we better than the previous choice? */ + if (overlap > max_overlap) { + max_overlap = overlap; + max_overlap_idx = j; } + } + } - if (max_overlap_idx != i) { - dht_layout_range_swap (new, i, max_overlap_idx); - /* Need to swap the table values too. */ - for (j = 0; j < old->cnt; ++j) { - overlap = OV_ENTRY(i,j); - OV_ENTRY(i,j) = OV_ENTRY(max_overlap_idx,j); - OV_ENTRY(max_overlap_idx,j) = overlap; - } - } - } + if (max_overlap_idx != i) { + dht_layout_range_swap(new, i, max_overlap_idx); + /* Need to swap the table values too. */ + for (j = 0; j < old->cnt; ++j) { + overlap = OV_ENTRY(i, j); + OV_ENTRY(i, j) = OV_ENTRY(max_overlap_idx, j); + OV_ENTRY(max_overlap_idx, j) = overlap; + } + } + } } - -dht_layout_t * -dht_fix_layout_of_directory (call_frame_t *frame, loc_t *loc, - dht_layout_t *layout) +static dht_layout_t * +dht_fix_layout_of_directory(call_frame_t *frame, loc_t *loc, + dht_layout_t *layout) { - int i = 0; - xlator_t *this = NULL; - dht_layout_t *new_layout = NULL; - dht_conf_t *priv = NULL; - dht_local_t *local = NULL; - uint32_t subvol_down = 0; - int ret = 0; - - this = frame->this; - priv = this->private; - local = frame->local; - - if (layout->type == DHT_HASH_TYPE_DM_USER) { - gf_log (THIS->name, GF_LOG_DEBUG, "leaving %s alone", - loc->path); - goto done; + int i = 0; + xlator_t *this = NULL; + dht_layout_t *new_layout = NULL; + dht_conf_t *priv = NULL; + dht_local_t *local = NULL; + uint32_t subvol_down = 0; + gf_boolean_t maximize_overlap = _gf_true; + char gfid[GF_UUID_BUF_SIZE] = {0}; + + this = frame->this; + priv = this->private; + local = frame->local; + + if (layout->type == DHT_HASH_TYPE_DM_USER) { + gf_msg_debug(THIS->name, 0, "leaving %s alone", loc->path); + goto done; + } + + new_layout = dht_layout_new(this, priv->subvolume_cnt); + if (!new_layout) { + gf_uuid_unparse(loc->gfid, gfid); + gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_MEM_ALLOC_FAILED, + "new_layout, path=%s", loc->path, "gfid=%s", gfid, NULL); + goto done; + } + + /* If a subvolume is down, do not re-write the layout. */ + dht_layout_anomalies(this, loc, layout, NULL, NULL, NULL, &subvol_down, + NULL, NULL); + + if (subvol_down) { + gf_uuid_unparse(loc->gfid, gfid); + gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_LAYOUT_FIX_FAILED, + "subvol-down=%u", subvol_down, "Skipping-fix-layout", "path=%s", + loc->path, "gfid=%s", gfid, NULL); + GF_FREE(new_layout); + return NULL; + } + + for (i = 0; i < new_layout->cnt; i++) { + if (layout->list[i].err != ENOSPC) + new_layout->list[i].err = layout->list[i].err; + else + new_layout->list[i].err = -1; + + new_layout->list[i].xlator = layout->list[i].xlator; + } + + new_layout->commit_hash = layout->commit_hash; + + if (priv->du_stats) { + for (i = 0; i < priv->subvolume_cnt; ++i) { + gf_smsg(this->name, GF_LOG_DEBUG, 0, DHT_MSG_SUBVOL_INFO, + "index=%d", i, "name=%s", priv->subvolumes[i]->name, + "chunks=%u", priv->du_stats[i].chunks, "path=%s", loc->path, + NULL); + + /* Maximize overlap if the bricks are all the same + * size. + * This is probably not going to be very common on + * live setups but will benefit our regression tests + */ + if (i && (priv->du_stats[i].chunks != priv->du_stats[0].chunks)) { + maximize_overlap = _gf_false; + } } + } else { + gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_NO_DISK_USAGE_STATUS, + NULL); + } + + /* First give it a layout as though it is a new directory. This + ensures rotation to kick in */ + dht_layout_sort_volname(new_layout); + dht_selfheal_layout_new_directory(frame, loc, new_layout); + + /* Maximize overlap if weighted-rebalance is disabled */ + if (!priv->do_weighting) + maximize_overlap = _gf_true; + + /* Now selectively re-assign ranges only when it helps */ + if (maximize_overlap) { + dht_selfheal_layout_maximize_overlap(frame, loc, new_layout, layout); + } +done: + if (new_layout) { + /* Make sure the extra 'ref' for existing layout is removed */ + dht_layout_unref(this, local->layout); - new_layout = dht_layout_new (this, priv->subvolume_cnt); - if (!new_layout) - goto done; + local->layout = new_layout; + } - /* If a subvolume is down, do not re-write the layout. */ - ret = dht_layout_anomalies (this, loc, layout, NULL, NULL, NULL, - &subvol_down, NULL, NULL); + return local->layout; +} - if (subvol_down || (ret == -1)) { - gf_log (this->name, GF_LOG_WARNING, "%u subvolume(s) are down" - ". Skipping fix layout.", subvol_down); - GF_FREE (new_layout); - return NULL; - } +/* + * Having to call this 2x for each entry in the layout is pretty horrible, but + * that's what all of this layout-sorting nonsense gets us. + */ +static uint32_t +dht_get_chunks_from_xl(xlator_t *parent, xlator_t *child) +{ + dht_conf_t *priv = parent->private; + xlator_list_t *trav; + uint32_t index = 0; - for (i = 0; i < new_layout->cnt; i++) { - if (layout->list[i].err != ENOSPC) - new_layout->list[i].err = layout->list[i].err; - else - new_layout->list[i].err = -1; + if (!priv->du_stats) { + return 0; + } - new_layout->list[i].xlator = layout->list[i].xlator; + for (trav = parent->children; trav; trav = trav->next) { + if (trav->xlator == child) { + return priv->du_stats[index].chunks; } + ++index; + } - /* First give it a layout as though it is a new directory. This - ensures rotation to kick in */ - dht_layout_sort_volname (new_layout); - dht_selfheal_layout_new_directory (frame, loc, new_layout); + return 0; +} - /* Now selectively re-assign ranges only when it helps */ - dht_selfheal_layout_maximize_overlap (frame, loc, new_layout, layout); +void +dht_selfheal_layout_new_directory(call_frame_t *frame, loc_t *loc, + dht_layout_t *layout) +{ + xlator_t *this = NULL; + double chunk = 0; + int i = 0; + uint32_t start = 0; + int bricks_to_use = 0; + int err = 0; + int start_subvol = 0; + uint32_t curr_size; + uint32_t range_size; + uint64_t total_size = 0; + int real_i; + dht_conf_t *priv; + gf_boolean_t weight_by_size; + int bricks_used = 0; + + this = frame->this; + priv = this->private; + weight_by_size = priv->do_weighting; + + bricks_to_use = dht_get_layout_count(this, layout, 1); + GF_ASSERT(bricks_to_use > 0); + + bricks_used = 0; + for (i = 0; i < layout->cnt; ++i) { + err = layout->list[i].err; + if ((err != -1) && (err != ENOENT)) { + continue; + } + curr_size = dht_get_chunks_from_xl(this, layout->list[i].xlator); + if (!curr_size) { + weight_by_size = _gf_false; + break; + } + total_size += curr_size; + if (++bricks_used >= bricks_to_use) { + break; + } + } + + if (weight_by_size && total_size) { + /* We know total_size is not zero. */ + chunk = ((double)0xffffffff) / ((double)total_size); + gf_msg_debug(this->name, 0, + "chunk size = 0xffffffff / %" PRIu64 " = %f", total_size, + chunk); + } else { + weight_by_size = _gf_false; + chunk = ((unsigned long)0xffffffff) / bricks_to_use; + } + + start_subvol = dht_selfheal_layout_alloc_start(this, loc, layout); + + /* clear out the range, as we are re-computing here */ + DHT_RESET_LAYOUT_RANGE(layout); + + /* + * OK, what's this "real_i" stuff about? This used to be two loops - + * from start_subvol to layout->cnt-1, then from 0 to start_subvol-1. + * That way is practically an open invitation to bugs when only one + * of the loops is updated. Using real_i and modulo operators to make + * it one loop avoids this problem. Remember, folks: it's everyone's + * responsibility to help stamp out copy/paste abuse. + */ + bricks_used = 0; + for (real_i = 0; real_i < layout->cnt; real_i++) { + i = (real_i + start_subvol) % layout->cnt; + err = layout->list[i].err; + if ((err != -1) && (err != ENOENT)) { + continue; + } + if (weight_by_size) { + curr_size = dht_get_chunks_from_xl(this, layout->list[i].xlator); + if (!curr_size) { + continue; + } + } else { + curr_size = 1; + } + range_size = chunk * curr_size; + gf_msg_debug(this->name, 0, "assigning range size 0x%x to %s", + range_size, layout->list[i].xlator->name); + DHT_SET_LAYOUT_RANGE(layout, i, start, range_size, loc->path); + if (++bricks_used >= bricks_to_use) { + layout->list[i].stop = 0xffffffff; + goto done; + } + start += range_size; + } done: - if (new_layout) { - /* Now that the new layout has all the proper layout, change the - inode context */ - dht_layout_set (this, loc->inode, new_layout); - - /* Make sure the extra 'ref' for existing layout is removed */ - dht_layout_unref (this, local->layout); + return; +} - local->layout = new_layout; +static int +dht_selfheal_dir_getafix(call_frame_t *frame, loc_t *loc, dht_layout_t *layout) +{ + dht_local_t *local = NULL; + uint32_t holes = 0; + int ret = -1; + int i = -1; + uint32_t overlaps = 0; + + local = frame->local; + + holes = local->selfheal.hole_cnt; + overlaps = local->selfheal.overlaps_cnt; + + if (holes || overlaps) { + /* If the layout has anomalies which would change the hash + * ranges, then we need to reset the commit_hash for this + * directory, as the layout would change and things may not + * be in place as expected */ + layout->commit_hash = DHT_LAYOUT_HASH_INVALID; + dht_selfheal_layout_new_directory(frame, loc, layout); + ret = 0; + } + + for (i = 0; i < layout->cnt; i++) { + /* directory not present */ + if (layout->list[i].err == ENOENT) { + ret = 0; + break; } + } - return local->layout; -} + /* TODO: give a fix to these non-virgins */ + return ret; +} -void -dht_selfheal_layout_new_directory (call_frame_t *frame, loc_t *loc, - dht_layout_t *layout) +int +dht_selfheal_new_directory(call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk, + dht_layout_t *layout) { - xlator_t *this = NULL; - uint32_t chunk = 0; - int i = 0; - uint32_t start = 0; - int cnt = 0; - int err = 0; - int start_subvol = 0; - - this = frame->this; - - cnt = dht_get_layout_count (this, layout, 1); - - chunk = ((unsigned long) 0xffffffff) / ((cnt) ? cnt : 1); - - start_subvol = dht_selfheal_layout_alloc_start (this, loc, layout); - - /* clear out the range, as we are re-computing here */ - DHT_RESET_LAYOUT_RANGE (layout); - for (i = start_subvol; i < layout->cnt; i++) { - err = layout->list[i].err; - if (err == -1 || err == ENOENT) { - DHT_SET_LAYOUT_RANGE(layout, i, start, chunk, - cnt, loc->path); - if (--cnt == 0) { - layout->list[i].stop = 0xffffffff; - goto done; - } - start += chunk; - } - } + dht_local_t *local = NULL; + int ret = 0; + inode_t *linked_inode = NULL, *inode = NULL; + loc_t *loc = NULL; + char pgfid[GF_UUID_BUF_SIZE] = {0}; + char gfid[GF_UUID_BUF_SIZE] = {0}; + int32_t op_errno = EIO; - for (i = 0; i < start_subvol; i++) { - err = layout->list[i].err; - if (err == -1 || err == ENOENT) { - DHT_SET_LAYOUT_RANGE(layout, i, start, chunk, - cnt, loc->path); - if (--cnt == 0) { - layout->list[i].stop = 0xffffffff; - goto done; - } - start += chunk; - } - } + local = frame->local; -done: - return; + loc = &local->loc; + + gf_uuid_unparse(local->stbuf.ia_gfid, gfid); + gf_uuid_unparse(loc->parent->gfid, pgfid); + + linked_inode = inode_link(loc->inode, loc->parent, loc->name, + &local->stbuf); + if (!linked_inode) { + gf_smsg(frame->this->name, GF_LOG_WARNING, 0, DHT_MSG_LINK_INODE_FAILED, + "pgfid=%s", pgfid, "name=%s", loc->name, "gfid=%s", gfid, NULL); + ret = -1; + goto out; + } + + inode = loc->inode; + loc->inode = linked_inode; + inode_unref(inode); + + local->selfheal.dir_cbk = dir_cbk; + local->selfheal.layout = dht_layout_ref(frame->this, layout); + + dht_layout_sort_volname(layout); + dht_selfheal_layout_new_directory(frame, &local->loc, layout); + + op_errno = ENOMEM; + ret = dht_selfheal_layout_lock(frame, layout, _gf_true, + dht_selfheal_dir_xattr, + dht_should_heal_layout); + +out: + if (ret < 0) { + dir_cbk(frame, NULL, frame->this, -1, op_errno, NULL); + } + + return 0; } int -dht_selfheal_dir_getafix (call_frame_t *frame, loc_t *loc, - dht_layout_t *layout) +dht_fix_directory_layout(call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk, + dht_layout_t *layout) { - dht_local_t *local = NULL; - uint32_t holes = 0; - int ret = -1; - int i = -1; - uint32_t overlaps = 0; + dht_local_t *local = NULL; + dht_layout_t *tmp_layout = NULL; + int ret = 0; - local = frame->local; + local = frame->local; - holes = local->selfheal.hole_cnt; - overlaps = local->selfheal.overlaps_cnt; + local->selfheal.dir_cbk = dir_cbk; + local->selfheal.layout = dht_layout_ref(frame->this, layout); - if (holes || overlaps) { - dht_selfheal_layout_new_directory (frame, loc, layout); - ret = 0; - } + /* No layout sorting required here */ + tmp_layout = dht_fix_layout_of_directory(frame, &local->loc, layout); + if (!tmp_layout) { + return -1; + } - for (i = 0; i < layout->cnt; i++) { - /* directory not present */ - if (layout->list[i].err == ENOENT) { - ret = 0; - break; - } - } + ret = dht_selfheal_layout_lock(frame, tmp_layout, _gf_false, + dht_fix_dir_xattr, dht_should_fix_layout); - /* TODO: give a fix to these non-virgins */ - - return ret; + return ret; } int -dht_selfheal_new_directory (call_frame_t *frame, - dht_selfheal_dir_cbk_t dir_cbk, - dht_layout_t *layout) +dht_selfheal_directory(call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk, + loc_t *loc, dht_layout_t *layout) { - dht_local_t *local = NULL; + dht_local_t *local = NULL; + xlator_t *this = NULL; + uint32_t down = 0; + uint32_t misc = 0; + int ret = 0; + char pgfid[GF_UUID_BUF_SIZE] = {0}; + char gfid[GF_UUID_BUF_SIZE] = {0}; + inode_t *linked_inode = NULL, *inode = NULL; + + local = frame->local; + this = frame->this; + + local->selfheal.dir_cbk = dir_cbk; + local->selfheal.layout = dht_layout_ref(this, layout); + + if (local->need_attrheal) { + if (__is_root_gfid(local->stbuf.ia_gfid)) { + local->stbuf.ia_gid = local->prebuf.ia_gid; + local->stbuf.ia_uid = local->prebuf.ia_uid; + + local->stbuf.ia_ctime = local->prebuf.ia_ctime; + local->stbuf.ia_ctime_nsec = local->prebuf.ia_ctime_nsec; + local->stbuf.ia_prot = local->prebuf.ia_prot; + + } else if (!IA_ISINVAL(local->mds_stbuf.ia_type)) { + local->stbuf = local->mds_stbuf; + } + } + + if (!__is_root_gfid(local->stbuf.ia_gfid)) { + gf_uuid_unparse(local->stbuf.ia_gfid, gfid); + gf_uuid_unparse(loc->parent->gfid, pgfid); + + linked_inode = inode_link(loc->inode, loc->parent, loc->name, + &local->stbuf); + if (!linked_inode) { + gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_LINK_INODE_FAILED, + "pgfid=%s", pgfid, "name=%s", loc->name, "gfid=%s", gfid, + NULL); + ret = 0; + goto sorry_no_fix; + } - local = frame->local; + inode = loc->inode; + loc->inode = linked_inode; + inode_unref(inode); + } + + if (local->need_xattr_heal && (local->mds_xattr)) { + dht_dir_set_heal_xattr(this, local, local->xattr, local->mds_xattr, + NULL, NULL); + dict_unref(local->mds_xattr); + local->mds_xattr = NULL; + } + + dht_layout_anomalies(this, loc, layout, &local->selfheal.hole_cnt, + &local->selfheal.overlaps_cnt, + &local->selfheal.missing_cnt, &local->selfheal.down, + &local->selfheal.misc, NULL); + + down = local->selfheal.down; + misc = local->selfheal.misc; + + if (down) { + gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_SELFHEAL_FAILED, + "path=%s", loc->path, "subvol-down=%d", down, "Not-fixing", + "gfid=%s", gfid, NULL); + ret = 0; + goto sorry_no_fix; + } + + if (misc) { + gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_SELFHEAL_FAILED, + "path=%s", loc->path, "misc=%d", misc, "unrecoverable-errors", + "gfid=%s", gfid, NULL); + + ret = 0; + goto sorry_no_fix; + } + + dht_layout_sort_volname(layout); + local->heal_layout = _gf_true; + + /* Ignore return value as it can be inferred from result of + * dht_layout_anomalies + */ + dht_selfheal_dir_getafix(frame, loc, layout); + + if (!(local->selfheal.hole_cnt || local->selfheal.overlaps_cnt || + local->selfheal.missing_cnt)) { + local->heal_layout = _gf_false; + } + + ret = dht_selfheal_dir_mkdir(frame, loc, layout, 0); + if (ret < 0) { + ret = 0; + goto sorry_no_fix; + } + + return 0; - local->selfheal.dir_cbk = dir_cbk; - local->selfheal.layout = dht_layout_ref (frame->this, layout); +sorry_no_fix: + /* TODO: need to put appropriate local->op_errno */ + dht_selfheal_dir_finish(frame, this, ret, 1); - dht_layout_sort_volname (layout); - dht_selfheal_layout_new_directory (frame, &local->loc, layout); - dht_selfheal_dir_xattr (frame, &local->loc, layout); - return 0; + return 0; } int -dht_fix_directory_layout (call_frame_t *frame, - dht_selfheal_dir_cbk_t dir_cbk, - dht_layout_t *layout) +dht_selfheal_restore(call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk, + loc_t *loc, dht_layout_t *layout) { - dht_local_t *local = NULL; - dht_layout_t *tmp_layout = NULL; + int ret = 0; + dht_local_t *local = NULL; - local = frame->local; + local = frame->local; - local->selfheal.dir_cbk = dir_cbk; - local->selfheal.layout = dht_layout_ref (frame->this, layout); + local->selfheal.dir_cbk = dir_cbk; + local->selfheal.layout = dht_layout_ref(frame->this, layout); - /* No layout sorting required here */ - tmp_layout = dht_fix_layout_of_directory (frame, &local->loc, layout); - if (!tmp_layout) { - return -1; + ret = dht_selfheal_dir_mkdir(frame, loc, layout, 1); + + return ret; +} + +int +dht_dir_heal_xattrs(void *data) +{ + call_frame_t *frame = NULL; + dht_local_t *local = NULL; + xlator_t *subvol = NULL; + xlator_t *mds_subvol = NULL; + xlator_t *this = NULL; + dht_conf_t *conf = NULL; + dict_t *user_xattr = NULL; + dict_t *internal_xattr = NULL; + dict_t *mds_xattr = NULL; + dict_t *xdata = NULL; + int call_cnt = 0; + int ret = -1; + int uret = 0; + int uflag = 0; + int i = 0; + int xattr_hashed = 0; + char gfid[GF_UUID_BUF_SIZE] = {0}; + int32_t allzero[1] = {0}; + + GF_VALIDATE_OR_GOTO("dht", data, out); + + frame = data; + local = frame->local; + this = frame->this; + GF_VALIDATE_OR_GOTO("dht", this, out); + GF_VALIDATE_OR_GOTO(this->name, local, out); + mds_subvol = local->mds_subvol; + conf = this->private; + GF_VALIDATE_OR_GOTO(this->name, conf, out); + gf_uuid_unparse(local->loc.gfid, gfid); + + if (!mds_subvol) { + gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_NO_MDS_SUBVOL, "path=%s", + local->loc.path, "gfid=%s", gfid, NULL); + goto out; + } + + if ((local->loc.inode && gf_uuid_is_null(local->loc.inode->gfid)) || + gf_uuid_is_null(local->loc.gfid)) { + gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_GFID_NOT_PRESENT, + "skip-heal path=%s", local->loc.path, "gfid=%s", gfid, NULL); + goto out; + } + + internal_xattr = dict_new(); + if (!internal_xattr) { + gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_CREATE_FAILED, + "dictionary", NULL); + goto out; + } + xdata = dict_new(); + if (!xdata) { + gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_CREATE_FAILED, + "dictionary", NULL); + goto out; + } + + call_cnt = conf->subvolume_cnt; + + user_xattr = dict_new(); + if (!user_xattr) { + gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_CREATE_FAILED, + "dictionary", NULL); + goto out; + } + + ret = syncop_listxattr(local->mds_subvol, &local->loc, &mds_xattr, NULL, + NULL); + if (ret < 0) { + gf_smsg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_LIST_XATTRS_FAILED, + "path=%s", local->loc.path, "name=%s", local->mds_subvol->name, + NULL); + } + + if (!mds_xattr) + goto out; + + dht_dir_set_heal_xattr(this, local, user_xattr, mds_xattr, &uret, &uflag); + + /* To set quota related xattr need to set GLUSTERFS_INTERNAL_FOP_KEY + * key value to 1 + */ + if (dict_get(user_xattr, QUOTA_LIMIT_KEY) || + dict_get(user_xattr, QUOTA_LIMIT_OBJECTS_KEY)) { + ret = dict_set_int32(xdata, GLUSTERFS_INTERNAL_FOP_KEY, 1); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, + "key=%s", GLUSTERFS_INTERNAL_FOP_KEY, "path=%s", + local->loc.path, NULL); + goto out; } - dht_fix_dir_xattr (frame, &local->loc, tmp_layout); + } + if (uret <= 0 && !uflag) + goto out; + + for (i = 0; i < call_cnt; i++) { + subvol = conf->subvolumes[i]; + if (subvol == mds_subvol) + continue; + if (uret || uflag) { + /* Custom xattr heal is required - let posix handle it */ + ret = dict_set_int8(xdata, "sync_backend_xattrs", _gf_true); + if (ret) { + gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED, + "path=%s", local->loc.path, "key=%s", + "sync_backend_xattrs", NULL); + goto out; + } + + ret = syncop_setxattr(subvol, &local->loc, user_xattr, 0, xdata, + NULL); + if (ret) { + xattr_hashed = 1; + gf_smsg(this->name, GF_LOG_ERROR, -ret, + DHT_MSG_DIR_XATTR_HEAL_FAILED, + "set-user-xattr-failed path=%s", local->loc.path, + "subvol=%s", subvol->name, "gfid=%s", gfid, NULL); + } else { + dict_del(xdata, "sync_backend_xattrs"); + } + } + } + /* After heal all custom xattr reset internal MDS xattr to 0 */ + if (!xattr_hashed) { + ret = dht_dict_set_array(internal_xattr, conf->mds_xattr_key, allzero, + 1); + if (ret) { + gf_smsg(this->name, GF_LOG_WARNING, ENOMEM, DHT_MSG_DICT_SET_FAILED, + "key=%s", conf->mds_xattr_key, "path=%s", local->loc.path, + NULL); + goto out; + } + ret = syncop_setxattr(mds_subvol, &local->loc, internal_xattr, 0, NULL, + NULL); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, -ret, + DHT_MSG_DIR_XATTR_HEAL_FAILED, "path=%s", local->loc.path, + "subvol=%s", mds_subvol->name, "gfid=%s", gfid, NULL); + } + } - return 0; +out: + if (user_xattr) + dict_unref(user_xattr); + if (mds_xattr) + dict_unref(mds_xattr); + if (internal_xattr) + dict_unref(internal_xattr); + if (xdata) + dict_unref(xdata); + return 0; } +int +dht_dir_heal_xattrs_done(int ret, call_frame_t *sync_frame, void *data) +{ + DHT_STACK_DESTROY(sync_frame); + return 0; +} int -dht_selfheal_directory (call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk, - loc_t *loc, dht_layout_t *layout) +dht_dir_attr_heal(void *data) { - dht_local_t *local = NULL; - uint32_t down = 0; - uint32_t misc = 0; - int ret = 0; - xlator_t *this = NULL; - - local = frame->local; - this = frame->this; - - dht_layout_anomalies (this, loc, layout, - &local->selfheal.hole_cnt, - &local->selfheal.overlaps_cnt, - NULL, &local->selfheal.down, - &local->selfheal.misc, NULL); - - down = local->selfheal.down; - misc = local->selfheal.misc; - - local->selfheal.dir_cbk = dir_cbk; - local->selfheal.layout = dht_layout_ref (this, layout); - - if (down) { - gf_log (this->name, GF_LOG_WARNING, - "%d subvolumes down -- not fixing", down); - ret = 0; - goto sorry_no_fix; + call_frame_t *frame = NULL; + dht_local_t *local = NULL; + xlator_t *subvol = NULL; + xlator_t *mds_subvol = NULL; + xlator_t *this = NULL; + dht_conf_t *conf = NULL; + int call_cnt = 0; + int ret = -1; + int i = 0; + char gfid[GF_UUID_BUF_SIZE] = {0}; + + GF_VALIDATE_OR_GOTO("dht", data, out); + + frame = data; + local = frame->local; + this = frame->this; + GF_VALIDATE_OR_GOTO("dht", this, out); + GF_VALIDATE_OR_GOTO("dht", local, out); + conf = this->private; + GF_VALIDATE_OR_GOTO("dht", conf, out); + + mds_subvol = local->mds_subvol; + call_cnt = conf->subvolume_cnt; + + if (!__is_root_gfid(local->stbuf.ia_gfid) && (!mds_subvol)) { + gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_NO_MDS_SUBVOL, "path=%s", + local->loc.path, "gfid=%s", gfid, NULL); + goto out; + } + + if (!__is_root_gfid(local->stbuf.ia_gfid)) { + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->subvolumes[i] == mds_subvol) { + if (!conf->subvolume_status[i]) { + gf_smsg(this->name, GF_LOG_ERROR, 0, + DHT_MSG_MDS_DOWN_UNABLE_TO_SET, "path=%s", + local->loc.path, "gfid=%s", gfid, NULL); + goto out; + } + } } - - if (misc) { - gf_log (this->name, GF_LOG_WARNING, - "%d subvolumes have unrecoverable errors", misc); - ret = 0; - goto sorry_no_fix; + } + + for (i = 0; i < call_cnt; i++) { + subvol = conf->subvolumes[i]; + if (!subvol || subvol == mds_subvol) + continue; + if (__is_root_gfid(local->stbuf.ia_gfid)) { + ret = syncop_setattr( + subvol, &local->loc, &local->stbuf, + (GF_SET_ATTR_UID | GF_SET_ATTR_GID | GF_SET_ATTR_MODE), NULL, + NULL, NULL, NULL); + } else { + ret = syncop_setattr( + subvol, &local->loc, &local->mds_stbuf, + (GF_SET_ATTR_UID | GF_SET_ATTR_GID | GF_SET_ATTR_MODE), NULL, + NULL, NULL, NULL); } - dht_layout_sort_volname (layout); - ret = dht_selfheal_dir_getafix (frame, loc, layout); + if (ret) { + gf_uuid_unparse(local->loc.gfid, gfid); - if (ret == -1) { - gf_log (this->name, GF_LOG_WARNING, - "not able to form layout for the directory"); - goto sorry_no_fix; + gf_smsg(this->name, GF_LOG_ERROR, -ret, + DHT_MSG_DIR_ATTR_HEAL_FAILED, "path=%s", local->loc.path, + "subvol=%s", subvol->name, "gfid=%s", gfid, NULL); } + } +out: + return 0; +} - dht_selfheal_dir_mkdir (frame, loc, layout, 0); +int +dht_dir_attr_heal_done(int ret, call_frame_t *sync_frame, void *data) +{ + DHT_STACK_DESTROY(sync_frame); + return 0; +} - return 0; +/* EXIT: dht_update_commit_hash_for_layout */ +static int +dht_update_commit_hash_for_layout_done(call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xdata) +{ + dht_local_t *local = NULL; -sorry_no_fix: - /* TODO: need to put appropriate local->op_errno */ - dht_selfheal_dir_finish (frame, this, ret); + local = frame->local; - return 0; + /* preserve oldest error */ + if (op_ret && !local->op_ret) { + local->op_ret = op_ret; + local->op_errno = op_errno; + } + + DHT_STACK_UNWIND(setxattr, frame, local->op_ret, local->op_errno, NULL); + + return 0; } +static int +dht_update_commit_hash_for_layout_unlock(call_frame_t *frame, xlator_t *this) +{ + dht_local_t *local = NULL; + int ret = 0; + + local = frame->local; + + ret = dht_unlock_inodelk(frame, local->lock[0].layout.my_layout.locks, + local->lock[0].layout.my_layout.lk_count, + dht_update_commit_hash_for_layout_done); + if (ret < 0) { + /* preserve oldest error, just ... */ + if (!local->op_ret) { + local->op_errno = errno; + local->op_ret = -1; + } + + gf_smsg(this->name, GF_LOG_WARNING, errno, DHT_MSG_WIND_UNLOCK_FAILED, + "path=%s", local->loc.path, NULL); -int -dht_selfheal_restore (call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk, - loc_t *loc, dht_layout_t *layout) + dht_update_commit_hash_for_layout_done(frame, NULL, this, 0, 0, NULL); + } + + return 0; +} + +static int +dht_update_commit_hash_for_layout_cbk(call_frame_t *frame, void *cookie, + xlator_t *this, int op_ret, int op_errno, + dict_t *xdata) { - int ret = 0; - dht_local_t *local = NULL; + dht_local_t *local = NULL; + int this_call_cnt = 0; - local = frame->local; + local = frame->local; - local->selfheal.dir_cbk = dir_cbk; - local->selfheal.layout = dht_layout_ref (frame->this, layout); + LOCK(&frame->lock); + /* store first failure, just because */ + if (op_ret && !local->op_ret) { + local->op_ret = op_ret; + local->op_errno = op_errno; + } + UNLOCK(&frame->lock); - ret = dht_selfheal_dir_mkdir (frame, loc, layout, 1); + this_call_cnt = dht_frame_return(frame); - return ret; + if (is_last_call(this_call_cnt)) { + dht_update_commit_hash_for_layout_unlock(frame, this); + } + + return 0; } -int -dht_dir_attr_heal (void *data) +static int +dht_update_commit_hash_for_layout_resume(call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xdata) { - call_frame_t *frame = NULL; - dht_local_t *local = NULL; - xlator_t *subvol = NULL; - xlator_t *this = NULL; - dht_conf_t *conf = NULL; - int call_cnt = 0; - int ret = -1; - int i = 0; - - GF_VALIDATE_OR_GOTO ("dht", data, out); - - frame = data; - local = frame->local; - this = frame->this; - GF_VALIDATE_OR_GOTO ("dht", this, out); - GF_VALIDATE_OR_GOTO ("dht", local, out); - conf = this->private; - GF_VALIDATE_OR_GOTO ("dht", conf, out); - - call_cnt = conf->subvolume_cnt; - - for (i = 0; i < call_cnt; i++) { - subvol = conf->subvolumes[i]; - if (!subvol || (subvol == dht_first_up_subvol (this))) - continue; - ret = syncop_setattr (subvol, &local->loc, &local->stbuf, - (GF_SET_ATTR_UID | GF_SET_ATTR_GID), - NULL, NULL); - if (ret) { - gf_log ("dht", GF_LOG_ERROR, "Failed to set uid/gid on" - " %s on %s subvol (%s)", local->loc.path, - subvol->name, strerror (-ret)); - } + dht_local_t *local = NULL; + int count = 1, ret = -1, i = 0, j = 0; + dht_conf_t *conf = NULL; + dht_layout_t *layout = NULL; + int32_t *disk_layout = NULL; + dict_t **xattr = NULL; + + local = frame->local; + conf = frame->this->private; + count = conf->local_subvols_cnt; + layout = local->layout; + + if (op_ret < 0) { + goto err_done; + } + + /* We precreate the xattr list as we cannot change call count post the + * first wind as we may never continue from there. So we finish prep + * work before winding the setxattrs */ + xattr = GF_CALLOC(count, sizeof(*xattr), gf_common_mt_char); + if (!xattr) { + local->op_errno = errno; + + gf_smsg(this->name, GF_LOG_WARNING, errno, DHT_MSG_COMMIT_HASH_FAILED, + "allocation-failed path=%s", local->loc.path, NULL); + + goto err; + } + + for (i = 0; i < count; i++) { + /* find the layout index for the subvolume */ + ret = dht_layout_index_for_subvol(layout, conf->local_subvols[i]); + if (ret < 0) { + local->op_errno = ENOENT; + + gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_COMMIT_HASH_FAILED, + "path=%s", local->loc.path, "subvol=%s", + conf->local_subvols[i]->name, "find-disk-layout-failed", + NULL); + + goto err; } -out: - return 0; + j = ret; + + /* update the commit hash for the layout */ + layout->list[j].commit_hash = layout->commit_hash; + + /* extract the current layout */ + ret = dht_disk_layout_extract(this, layout, j, &disk_layout); + if (ret == -1) { + local->op_errno = errno; + + gf_smsg(this->name, GF_LOG_WARNING, errno, + DHT_MSG_COMMIT_HASH_FAILED, "path=%s", local->loc.path, + "subvol=%s", conf->local_subvols[i]->name, + "extract-disk-layout-failed", NULL); + + goto err; + } + + xattr[i] = dict_new(); + if (!xattr[i]) { + local->op_errno = errno; + + gf_smsg(this->name, GF_LOG_WARNING, errno, + DHT_MSG_COMMIT_HASH_FAILED, "path=%s Allocation-failed", + local->loc.path, NULL); + + goto err; + } + + ret = dict_set_bin(xattr[i], conf->xattr_name, disk_layout, 4 * 4); + if (ret != 0) { + local->op_errno = ENOMEM; + + gf_smsg(this->name, GF_LOG_WARNING, 0, + DHT_MSG_DIR_SELFHEAL_XATTR_FAILED, "path=%s", + local->loc.path, "subvol=%s", conf->local_subvols[i]->name, + "set-xattr-failed", NULL); + + goto err; + } + disk_layout = NULL; + + gf_msg_trace(this->name, 0, + "setting commit hash %u on subvolume %s" + " for %s", + layout->list[j].commit_hash, conf->local_subvols[i]->name, + local->loc.path); + } + + /* wind the setting of the commit hash across the local subvols */ + local->call_cnt = count; + local->op_ret = 0; + local->op_errno = 0; + for (i = 0; i < count; i++) { + STACK_WIND(frame, dht_update_commit_hash_for_layout_cbk, + conf->local_subvols[i], + conf->local_subvols[i]->fops->setxattr, &local->loc, + xattr[i], 0, NULL); + } + for (i = 0; i < count; i++) + dict_unref(xattr[i]); + GF_FREE(xattr); + + return 0; +err: + if (xattr) { + for (i = 0; i < count; i++) { + if (xattr[i]) + dict_unref(xattr[i]); + } + + GF_FREE(xattr); + } + + GF_FREE(disk_layout); + + local->op_ret = -1; + + dht_update_commit_hash_for_layout_unlock(frame, this); + + return 0; +err_done: + local->op_ret = -1; + + dht_update_commit_hash_for_layout_done(frame, NULL, this, 0, 0, NULL); + + return 0; } +/* ENTER: dht_update_commit_hash_for_layout (see EXIT above) + * This function is invoked from rebalance only. + * As a result, the check here is simple enough to see if defrag is present + * in the conf, as other data would be populated appropriately if so. + * If ever this was to be used in other code paths, checks would need to + * change. + * + * Functional details: + * - Lock the inodes on the subvols that we want the commit hash updated + * - Update each layout with the inode layout, modified to take in the new + * commit hash. + * - Unlock and return. + */ int -dht_dir_attr_heal_done (int ret, call_frame_t *sync_frame, void *data) +dht_update_commit_hash_for_layout(call_frame_t *frame) { - DHT_STACK_DESTROY (sync_frame); - return 0; + dht_local_t *local = NULL; + int count = 1, ret = -1, i = 0; + dht_lock_t **lk_array = NULL; + dht_conf_t *conf = NULL; + + GF_VALIDATE_OR_GOTO("dht", frame, err); + GF_VALIDATE_OR_GOTO(frame->this->name, frame->local, err); + + local = frame->local; + conf = frame->this->private; + + if (!conf->defrag) + goto err; + + count = conf->local_subvols_cnt; + lk_array = GF_CALLOC(count, sizeof(*lk_array), gf_common_mt_char); + if (lk_array == NULL) + goto err; + + for (i = 0; i < count; i++) { + lk_array[i] = dht_lock_new(frame->this, conf->local_subvols[i], + &local->loc, F_WRLCK, DHT_LAYOUT_HEAL_DOMAIN, + NULL, FAIL_ON_ANY_ERROR); + if (lk_array[i] == NULL) + goto err; + } + + local->lock[0].layout.my_layout.locks = lk_array; + local->lock[0].layout.my_layout.lk_count = count; + + ret = dht_blocking_inodelk(frame, lk_array, count, + dht_update_commit_hash_for_layout_resume); + if (ret < 0) { + local->lock[0].layout.my_layout.locks = NULL; + local->lock[0].layout.my_layout.lk_count = 0; + goto err; + } + + return 0; +err: + if (lk_array != NULL) { + dht_lock_array_free(lk_array, count); + GF_FREE(lk_array); + } + + return -1; } diff --git a/xlators/cluster/dht/src/dht-shared.c b/xlators/cluster/dht/src/dht-shared.c index f2e7467abe7..bb72b0ffbb5 100644 --- a/xlators/cluster/dht/src/dht-shared.c +++ b/xlators/cluster/dht/src/dht-shared.c @@ -8,16 +8,14 @@ cases as published by the Free Software Foundation. */ - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - /* TODO: add NS locking */ - -#include "statedump.h" +#include <glusterfs/statedump.h> #include "dht-common.h" +#include "dht-messages.h" + +#ifndef MAX +#define MAX(a, b) (((a) > (b)) ? (a) : (b)) +#endif /* TODO: - use volumename in xattr instead of "dht" @@ -25,756 +23,1082 @@ - handle all cases in self heal layout reconstruction - complete linkfile selfheal */ -struct volume_options options[]; -void -dht_layout_dump (dht_layout_t *layout, const char *prefix) +static void +dht_layout_dump(dht_layout_t *layout, const char *prefix) { - - char key[GF_DUMP_MAX_BUF_LEN]; - int i = 0; - - if (!layout) - goto out; - if (!prefix) - goto out; - - gf_proc_dump_build_key(key, prefix, "cnt"); - gf_proc_dump_write(key, "%d", layout->cnt); - gf_proc_dump_build_key(key, prefix, "preset"); - gf_proc_dump_write(key, "%d", layout->preset); - gf_proc_dump_build_key(key, prefix, "gen"); - gf_proc_dump_write(key, "%d", layout->gen); - if (layout->type != IA_INVAL) { - gf_proc_dump_build_key(key, prefix, "inode type"); - gf_proc_dump_write(key, "%d", layout->type); - } - - if (!IA_ISDIR (layout->type)) - goto out; - - for (i = 0; i < layout->cnt; i++) { - gf_proc_dump_build_key(key, prefix,"list[%d].err", i); - gf_proc_dump_write(key, "%d", layout->list[i].err); - gf_proc_dump_build_key(key, prefix,"list[%d].start", i); - gf_proc_dump_write(key, "%u", layout->list[i].start); - gf_proc_dump_build_key(key, prefix,"list[%d].stop", i); - gf_proc_dump_write(key, "%u", layout->list[i].stop); - if (layout->list[i].xlator) { - gf_proc_dump_build_key(key, prefix, - "list[%d].xlator.type", i); - gf_proc_dump_write(key, "%s", - layout->list[i].xlator->type); - gf_proc_dump_build_key(key, prefix, - "list[%d].xlator.name", i); - gf_proc_dump_write(key, "%s", - layout->list[i].xlator->name); - } + char key[GF_DUMP_MAX_BUF_LEN]; + int i = 0; + + if (!layout) + goto out; + + gf_proc_dump_build_key(key, prefix, "cnt"); + gf_proc_dump_write(key, "%d", layout->cnt); + gf_proc_dump_build_key(key, prefix, "preset"); + gf_proc_dump_write(key, "%d", layout->preset); + gf_proc_dump_build_key(key, prefix, "gen"); + gf_proc_dump_write(key, "%d", layout->gen); + if (layout->type != IA_INVAL) { + gf_proc_dump_build_key(key, prefix, "inode type"); + gf_proc_dump_write(key, "%d", layout->type); + } + + if (!IA_ISDIR(layout->type)) + goto out; + + for (i = 0; i < layout->cnt; i++) { + gf_proc_dump_build_key(key, prefix, "list[%d].err", i); + gf_proc_dump_write(key, "%d", layout->list[i].err); + gf_proc_dump_build_key(key, prefix, "list[%d].start", i); + gf_proc_dump_write(key, "0x%x", layout->list[i].start); + gf_proc_dump_build_key(key, prefix, "list[%d].stop", i); + gf_proc_dump_write(key, "0x%x", layout->list[i].stop); + if (layout->list[i].xlator) { + gf_proc_dump_build_key(key, prefix, "list[%d].xlator.type", i); + gf_proc_dump_write(key, "%s", layout->list[i].xlator->type); + gf_proc_dump_build_key(key, prefix, "list[%d].xlator.name", i); + gf_proc_dump_write(key, "%s", layout->list[i].xlator->name); } + } out: - return; + return; } - int32_t -dht_priv_dump (xlator_t *this) +dht_priv_dump(xlator_t *this) { - char key_prefix[GF_DUMP_MAX_BUF_LEN]; - char key[GF_DUMP_MAX_BUF_LEN]; - int i = 0; - dht_conf_t *conf = NULL; - int ret = -1; + char key_prefix[GF_DUMP_MAX_BUF_LEN]; + char key[GF_DUMP_MAX_BUF_LEN]; + int i = 0; + dht_conf_t *conf = NULL; + int ret = -1; - if (!this) - goto out; + if (!this) + goto out; - conf = this->private; - if (!conf) - goto out; + conf = this->private; + if (!conf) + goto out; - ret = TRY_LOCK(&conf->subvolume_lock); - if (ret != 0) { - return ret; + ret = TRY_LOCK(&conf->subvolume_lock); + if (ret != 0) { + return ret; + } + + gf_proc_dump_add_section("xlator.cluster.dht.%s.priv", this->name); + gf_proc_dump_build_key(key_prefix, "xlator.cluster.dht", "%s.priv", + this->name); + gf_proc_dump_write("subvol_cnt", "%d", conf->subvolume_cnt); + for (i = 0; i < conf->subvolume_cnt; i++) { + snprintf(key, sizeof(key), "subvolumes[%d]", i); + gf_proc_dump_write(key, "%s.%s", conf->subvolumes[i]->type, + conf->subvolumes[i]->name); + if (conf->file_layouts && conf->file_layouts[i]) { + snprintf(key, sizeof(key), "file_layouts[%d]", i); + dht_layout_dump(conf->file_layouts[i], key); } - - gf_proc_dump_add_section("xlator.cluster.dht.%s.priv", this->name); - gf_proc_dump_build_key(key_prefix,"xlator.cluster.dht","%s.priv", - this->name); - gf_proc_dump_write("subvol_cnt","%d", conf->subvolume_cnt); + if (conf->dir_layouts && conf->dir_layouts[i]) { + snprintf(key, sizeof(key), "dir_layouts[%d]", i); + dht_layout_dump(conf->dir_layouts[i], key); + } + if (conf->subvolume_status) { + snprintf(key, sizeof(key), "subvolume_status[%d]", i); + gf_proc_dump_write(key, "%d", (int)conf->subvolume_status[i]); + } + } + + gf_proc_dump_write("search_unhashed", "%d", conf->search_unhashed); + gf_proc_dump_write("gen", "%d", conf->gen); + gf_proc_dump_write("min_free_disk", "%lf", conf->min_free_disk); + gf_proc_dump_write("min_free_inodes", "%lf", conf->min_free_inodes); + gf_proc_dump_write("disk_unit", "%c", conf->disk_unit); + gf_proc_dump_write("refresh_interval", "%d", conf->refresh_interval); + gf_proc_dump_write("unhashed_sticky_bit", "%d", conf->unhashed_sticky_bit); + gf_proc_dump_write("use-readdirp", "%d", conf->use_readdirp); + + if (conf->du_stats && conf->subvolume_status) { for (i = 0; i < conf->subvolume_cnt; i++) { - snprintf (key, sizeof (key), "subvolumes[%d]", i); - gf_proc_dump_write(key, "%s.%s", conf->subvolumes[i]->type, - conf->subvolumes[i]->name); - if (conf->file_layouts && conf->file_layouts[i]){ - snprintf (key, sizeof (key), "file_layouts[%d]", i); - dht_layout_dump(conf->file_layouts[i], key); - } - if (conf->dir_layouts && conf->dir_layouts[i]) { - snprintf (key, sizeof (key), "dir_layouts[%d]", i); - dht_layout_dump(conf->dir_layouts[i], key); - } - if (conf->subvolume_status) { - - snprintf (key, sizeof (key), "subvolume_status[%d]", i); - gf_proc_dump_write(key, "%d", - (int)conf->subvolume_status[i]); - } + if (!conf->subvolume_status[i]) + continue; - } + snprintf(key, sizeof(key), "subvolumes[%d]", i); + gf_proc_dump_write(key, "%s", conf->subvolumes[i]->name); + + snprintf(key, sizeof(key), "du_stats[%d].avail_percent", i); + gf_proc_dump_write(key, "%lf", conf->du_stats[i].avail_percent); + + snprintf(key, sizeof(key), "du_stats[%d].avail_space", i); + gf_proc_dump_write(key, "%" PRIu64, conf->du_stats[i].avail_space); + + snprintf(key, sizeof(key), "du_stats[%d].avail_inodes", i); + gf_proc_dump_write(key, "%lf", conf->du_stats[i].avail_inodes); - gf_proc_dump_write("search_unhashed", "%d", conf->search_unhashed); - gf_proc_dump_write("gen", "%d", conf->gen); - gf_proc_dump_write("min_free_disk", "%lf", conf->min_free_disk); - gf_proc_dump_write("min_free_inodes", "%lf", conf->min_free_inodes); - gf_proc_dump_write("disk_unit", "%c", conf->disk_unit); - gf_proc_dump_write("refresh_interval", "%d", conf->refresh_interval); - gf_proc_dump_write("unhashed_sticky_bit", "%d", conf->unhashed_sticky_bit); - - if (conf->du_stats) { - for (i = 0; i < conf->subvolume_cnt; i++) { - if (!conf->subvolume_status[i]) - continue; - - snprintf (key, sizeof (key), "subvolumes[%d]", i); - gf_proc_dump_write (key, "%s", - conf->subvolumes[i]->name); - - snprintf (key, sizeof (key), - "du_stats[%d].avail_percent", i); - gf_proc_dump_write (key, "%lf", - conf->du_stats[i].avail_percent); - - snprintf (key, sizeof (key), "du_stats[%d].avail_space", - i); - gf_proc_dump_write (key, "%lu", - conf->du_stats[i].avail_space); - - snprintf (key, sizeof (key), - "du_stats[%d].avail_inodes", i); - gf_proc_dump_write (key, "%lf", - conf->du_stats[i].avail_inodes); - - snprintf (key, sizeof (key), "du_stats[%d].log", i); - gf_proc_dump_write (key, "%lu", - conf->du_stats[i].log); - } + snprintf(key, sizeof(key), "du_stats[%d].log", i); + gf_proc_dump_write(key, "%" PRIu32, conf->du_stats[i].log); } + } - if (conf->last_stat_fetch.tv_sec) - gf_proc_dump_write("last_stat_fetch", "%s", - ctime(&conf->last_stat_fetch.tv_sec)); + if (conf->last_stat_fetch) + gf_proc_dump_write("last_stat_fetch", "%s", + ctime(&conf->last_stat_fetch)); - UNLOCK(&conf->subvolume_lock); + UNLOCK(&conf->subvolume_lock); out: - return ret; + return ret; } int32_t -dht_inodectx_dump (xlator_t *this, inode_t *inode) +dht_inodectx_dump(xlator_t *this, inode_t *inode) { - int ret = -1; - dht_layout_t *layout = NULL; + int ret = -1; + dht_layout_t *layout = NULL; - if (!this) - goto out; - if (!inode) - goto out; + if (!this) + goto out; + if (!inode) + goto out; - ret = dht_inode_ctx_layout_get (inode, this, &layout); + ret = dht_inode_ctx_layout_get(inode, this, &layout); - if ((ret != 0) || !layout) - return ret; + if ((ret != 0) || !layout) + return ret; - gf_proc_dump_add_section("xlator.cluster.dht.%s.inode", this->name); - dht_layout_dump(layout, "layout"); + gf_proc_dump_add_section("xlator.cluster.dht.%s.inode", this->name); + dht_layout_dump(layout, "layout"); out: - return ret; + return ret; } void -dht_fini (xlator_t *this) +dht_fini(xlator_t *this) { - int i = 0; - dht_conf_t *conf = NULL; + int i = 0; + dht_conf_t *conf = NULL; + + GF_VALIDATE_OR_GOTO("dht", this, out); + + conf = this->private; + this->private = NULL; + if (conf) { + if (conf->file_layouts) { + for (i = 0; i < conf->subvolume_cnt; i++) { + GF_FREE(conf->file_layouts[i]); + } + GF_FREE(conf->file_layouts); + } - GF_VALIDATE_OR_GOTO ("dht", this, out); + dict_unref(conf->leaf_to_subvol); - conf = this->private; - this->private = NULL; - if (conf) { - if (conf->file_layouts) { - for (i = 0; i < conf->subvolume_cnt; i++) { - GF_FREE (conf->file_layouts[i]); - } - GF_FREE (conf->file_layouts); - } + /* allocated in dht_init_subvolumes() */ + GF_FREE(conf->subvolumes); + GF_FREE(conf->subvolume_status); + GF_FREE(conf->last_event); + GF_FREE(conf->subvol_up_time); + GF_FREE(conf->du_stats); + GF_FREE(conf->decommissioned_bricks); - GF_FREE (conf->subvolumes); + /* allocated in dht_init() */ + GF_FREE(conf->mds_xattr_key); + GF_FREE(conf->link_xattr_name); + GF_FREE(conf->commithash_xattr_name); + GF_FREE(conf->wild_xattr_name); - GF_FREE (conf->subvolume_status); + /* allocated in dht_init_regex() */ + if (conf->rsync_regex_valid) + regfree(&conf->rsync_regex); + if (conf->extra_regex_valid) + regfree(&conf->extra_regex); - GF_FREE (conf); - } + synclock_destroy(&conf->link_lock); + + if (conf->lock_pool) + mem_pool_destroy(conf->lock_pool); + + GF_FREE(conf); + } out: - return; + return; } int32_t -mem_acct_init (xlator_t *this) +mem_acct_init(xlator_t *this) { - int ret = -1; + int ret = -1; - GF_VALIDATE_OR_GOTO ("dht", this, out); + GF_VALIDATE_OR_GOTO("dht", this, out); - ret = xlator_mem_acct_init (this, gf_dht_mt_end + 1); + ret = xlator_mem_acct_init(this, gf_dht_mt_end + 1); - if (ret != 0) { - gf_log (this->name, GF_LOG_ERROR, "Memory accounting init" - "failed"); - return ret; - } -out: + if (ret != 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_NO_MEMORY, + "Memory accounting init failed"); return ret; + } +out: + return ret; } - -int -dht_parse_decommissioned_bricks (xlator_t *this, dht_conf_t *conf, - const char *bricks) +static int +dht_parse_decommissioned_bricks(xlator_t *this, dht_conf_t *conf, + const char *bricks) { - int i = 0; - int ret = -1; - char *tmpstr = NULL; - char *dup_brick = NULL; - char *node = NULL; - - if (!conf || !bricks) - goto out; - - dup_brick = gf_strdup (bricks); - node = strtok_r (dup_brick, ",", &tmpstr); - while (node) { - for (i = 0; i < conf->subvolume_cnt; i++) { - if (!strcmp (conf->subvolumes[i]->name, node)) { - conf->decommissioned_bricks[i] = - conf->subvolumes[i]; - conf->decommission_subvols_cnt++; - gf_log (this->name, GF_LOG_INFO, - "decommissioning subvolume %s", - conf->subvolumes[i]->name); - break; - } - } - if (i == conf->subvolume_cnt) { - /* Wrong node given. */ - goto out; - } - node = strtok_r (NULL, ",", &tmpstr); + int i = 0; + int ret = -1; + char *tmpstr = NULL; + char *dup_brick = NULL; + char *node = NULL; + + if (!conf || !bricks) + goto out; + + dup_brick = gf_strdup(bricks); + if (dup_brick == NULL) { + goto out; + } + + node = strtok_r(dup_brick, ",", &tmpstr); + while (node) { + for (i = 0; i < conf->subvolume_cnt; i++) { + if (!strcmp(conf->subvolumes[i]->name, node)) { + conf->decommissioned_bricks[i] = conf->subvolumes[i]; + conf->decommission_subvols_cnt++; + gf_msg(this->name, GF_LOG_INFO, 0, + DHT_MSG_SUBVOL_DECOMMISSION_INFO, + "decommissioning subvolume %s", + conf->subvolumes[i]->name); + break; + } + } + if (i == conf->subvolume_cnt) { + /* Wrong node given. */ + goto out; } + node = strtok_r(NULL, ",", &tmpstr); + } - ret = 0; - conf->decommission_in_progress = 1; + ret = 0; + conf->decommission_in_progress = 1; out: - GF_FREE (dup_brick); + GF_FREE(dup_brick); - return ret; + return ret; } - -int -dht_decommissioned_remove (xlator_t *this, dht_conf_t *conf) +static void +dht_decommissioned_remove(xlator_t *this, dht_conf_t *conf) { - int i = 0; - int ret = -1; - - if (!conf) - goto out; + int i = 0; - for (i = 0; i < conf->subvolume_cnt; i++) { - if (conf->decommissioned_bricks[i]) { - conf->decommissioned_bricks[i] = NULL; - conf->decommission_subvols_cnt--; - } + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->decommissioned_bricks[i]) { + conf->decommissioned_bricks[i] = NULL; + conf->decommission_subvols_cnt--; } - - ret = 0; -out: - - return ret; + } } -void -dht_init_regex (xlator_t *this, dict_t *odict, char *name, - regex_t *re, gf_boolean_t *re_valid) + +static void +dht_init_regex(xlator_t *this, dict_t *odict, char *name, regex_t *re, + gf_boolean_t *re_valid, dht_conf_t *conf) { - char *temp_str; + char *temp_str = NULL; - if (dict_get_str (odict, name, &temp_str) != 0) { - if (strcmp(name,"rsync-hash-regex")) { - return; - } - temp_str = "^\\.(.+)\\.[^.]+$"; + if (dict_get_str(odict, name, &temp_str) != 0) { + if (strcmp(name, "rsync-hash-regex")) { + return; } + temp_str = "^\\.(.+)\\.[^.]+$"; + } + LOCK(&conf->lock); + { if (*re_valid) { - regfree(re); - *re_valid = _gf_false; + regfree(re); + *re_valid = _gf_false; } - if (!strcmp(temp_str,"none")) { - return; + if (!strcmp(temp_str, "none")) { + goto unlock; } - if (regcomp(re,temp_str,REG_EXTENDED) == 0) { - gf_log (this->name, GF_LOG_INFO, - "using regex %s = %s", name, temp_str); - *re_valid = _gf_true; - } - else { - gf_log (this->name, GF_LOG_WARNING, - "compiling regex %s failed", temp_str); + if (regcomp(re, temp_str, REG_EXTENDED) == 0) { + gf_msg_debug(this->name, 0, "using regex %s = %s", name, temp_str); + *re_valid = _gf_true; + } else { + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_REGEX_INFO, + "compiling regex %s failed", temp_str); } + } +unlock: + UNLOCK(&conf->lock); } int -dht_reconfigure (xlator_t *this, dict_t *options) +dht_set_subvol_range(xlator_t *this) { - dht_conf_t *conf = NULL; - char *temp_str = NULL; - gf_boolean_t search_unhashed; - int ret = -1; - - GF_VALIDATE_OR_GOTO ("dht", this, out); - GF_VALIDATE_OR_GOTO ("dht", options, out); - - conf = this->private; - if (!conf) - return 0; - - if (dict_get_str (options, "lookup-unhashed", &temp_str) == 0) { - /* If option is not "auto", other options _should_ be boolean*/ - if (strcasecmp (temp_str, "auto")) { - if (!gf_string2boolean (temp_str, &search_unhashed)) { - gf_log(this->name, GF_LOG_DEBUG, "Reconfigure:" - " lookup-unhashed reconfigured (%s)", - temp_str); - conf->search_unhashed = search_unhashed; - } else { - gf_log(this->name, GF_LOG_ERROR, "Reconfigure:" - " lookup-unhashed should be boolean," - " not (%s), defaulting to (%d)", - temp_str, conf->search_unhashed); - ret = -1; - goto out; - } - } else { - gf_log(this->name, GF_LOG_DEBUG, "Reconfigure:" - " lookup-unhashed reconfigured auto "); - conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_AUTO; - } - } + int ret = -1; + dht_conf_t *conf = NULL; - GF_OPTION_RECONF ("min-free-disk", conf->min_free_disk, options, - percent_or_size, out); - /* option can be any one of percent or bytes */ - conf->disk_unit = 0; - if (conf->min_free_disk < 100.0) - conf->disk_unit = 'p'; + conf = this->private; - GF_OPTION_RECONF ("min-free-inodes", conf->min_free_inodes, options, - percent, out); + if (!conf) + goto out; - GF_OPTION_RECONF ("directory-layout-spread", conf->dir_spread_cnt, - options, uint32, out); + conf->leaf_to_subvol = dict_new(); + if (!conf->leaf_to_subvol) + goto out; - GF_OPTION_RECONF ("readdir-optimize", conf->readdir_optimize, options, - bool, out); - if (conf->defrag) { - GF_OPTION_RECONF ("rebalance-stats", conf->defrag->stats, - options, bool, out); + ret = glusterfs_reachable_leaves(this, conf->leaf_to_subvol); + +out: + return ret; +} + +static int +dht_configure_throttle(xlator_t *this, dht_conf_t *conf, char *temp_str) +{ + int rebal_thread_count = 0; + int ret = 0; + + pthread_mutex_lock(&conf->defrag->dfq_mutex); + { + if (!strcasecmp(temp_str, "lazy")) { + conf->defrag->recon_thread_count = 1; + } else if (!strcasecmp(temp_str, "normal")) { + conf->defrag->recon_thread_count = 2; + } else if (!strcasecmp(temp_str, "aggressive")) { + conf->defrag->recon_thread_count = MAX(MAX_REBAL_THREADS - 4, 4); + } else if ((gf_string2int(temp_str, &rebal_thread_count) == 0)) { + if ((rebal_thread_count > 0) && + (rebal_thread_count <= MAX_REBAL_THREADS)) { + conf->defrag->recon_thread_count = rebal_thread_count; + pthread_mutex_unlock(&conf->defrag->dfq_mutex); + gf_msg(this->name, GF_LOG_INFO, 0, 0, + "rebal thread count configured to %d", + rebal_thread_count); + goto out; + } else { + pthread_mutex_unlock(&conf->defrag->dfq_mutex); + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INVALID_OPTION, + "Invalid option: Reconfigure: " + "rebal-throttle should be " + "within range of 0 and maximum number of" + " cores available"); + ret = -1; + goto out; + } + } else { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INVALID_OPTION, + "Invalid option: Reconfigure: " + "rebal-throttle should be {lazy|normal|aggressive}" + " or a number up to the number of cores available," + " not (%s), defaulting to (%d)", + temp_str, conf->dthrottle); + ret = -1; } + } + pthread_mutex_unlock(&conf->defrag->dfq_mutex); + +out: + return ret; +} + +int +dht_reconfigure(xlator_t *this, dict_t *options) +{ + dht_conf_t *conf = NULL; + char *temp_str = NULL; + gf_boolean_t search_unhashed; + int ret = -1; + + GF_VALIDATE_OR_GOTO("dht", this, out); + GF_VALIDATE_OR_GOTO("dht", options, out); + + conf = this->private; + if (!conf) + return 0; - if (dict_get_str (options, "decommissioned-bricks", &temp_str) == 0) { - ret = dht_parse_decommissioned_bricks (this, conf, temp_str); - if (ret == -1) - goto out; + if (dict_get_str(options, "lookup-unhashed", &temp_str) == 0) { + /* If option is not "auto", other options _should_ be boolean*/ + if (strcasecmp(temp_str, "auto")) { + if (!gf_string2boolean(temp_str, &search_unhashed)) { + gf_msg_debug(this->name, 0, + "Reconfigure: " + "lookup-unhashed reconfigured(%s)", + temp_str); + conf->search_unhashed = search_unhashed; + } else { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INVALID_OPTION, + "Invalid option: Reconfigure: " + "lookup-unhashed should be boolean," + " not (%s), defaulting to (%d)", + temp_str, conf->search_unhashed); + ret = -1; + goto out; + } } else { - ret = dht_decommissioned_remove (this, conf); - if (ret == -1) - goto out; + gf_msg_debug(this->name, 0, + "Reconfigure:" + " lookup-unhashed reconfigured auto "); + conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_AUTO; } + } + + GF_OPTION_RECONF("lookup-optimize", conf->lookup_optimize, options, bool, + out); + + GF_OPTION_RECONF("min-free-disk", conf->min_free_disk, options, + percent_or_size, out); + /* option can be any one of percent or bytes */ + conf->disk_unit = 0; + if (conf->min_free_disk < 100.0) + conf->disk_unit = 'p'; + + GF_OPTION_RECONF("min-free-inodes", conf->min_free_inodes, options, percent, + out); - dht_init_regex (this, options, "rsync-hash-regex", - &conf->rsync_regex, &conf->rsync_regex_valid); - dht_init_regex (this, options, "extra-hash-regex", - &conf->extra_regex, &conf->extra_regex_valid); + GF_OPTION_RECONF("directory-layout-spread", conf->dir_spread_cnt, options, + uint32, out); - ret = 0; + GF_OPTION_RECONF("readdir-optimize", conf->readdir_optimize, options, bool, + out); + GF_OPTION_RECONF("randomize-hash-range-by-gfid", conf->randomize_by_gfid, + options, bool, out); + + GF_OPTION_RECONF("lock-migration", conf->lock_migration_enabled, options, + bool, out); + + GF_OPTION_RECONF("force-migration", conf->force_migration, options, bool, + out); + + if (conf->defrag) { + if (dict_get_str(options, "rebal-throttle", &temp_str) == 0) { + ret = dht_configure_throttle(this, conf, temp_str); + if (ret == -1) + goto out; + } + } + + if (conf->defrag) { + conf->defrag->lock_migration_enabled = conf->lock_migration_enabled; + } + + if (conf->defrag) { + GF_OPTION_RECONF("rebalance-stats", conf->defrag->stats, options, bool, + out); + } + + if (dict_get_str(options, "decommissioned-bricks", &temp_str) == 0) { + ret = dht_parse_decommissioned_bricks(this, conf, temp_str); + if (ret == -1) + goto out; + } else { + dht_decommissioned_remove(this, conf); + } + + dht_init_regex(this, options, "rsync-hash-regex", &conf->rsync_regex, + &conf->rsync_regex_valid, conf); + dht_init_regex(this, options, "extra-hash-regex", &conf->extra_regex, + &conf->extra_regex_valid, conf); + + GF_OPTION_RECONF("weighted-rebalance", conf->do_weighting, options, bool, + out); + + GF_OPTION_RECONF("use-readdirp", conf->use_readdirp, options, bool, out); + ret = 0; out: - return ret; + return ret; } static int -gf_defrag_pattern_list_fill (xlator_t *this, gf_defrag_info_t *defrag, char *data) +gf_defrag_pattern_list_fill(xlator_t *this, gf_defrag_info_t *defrag, + char *data) { - int ret = -1; - char *tmp_str = NULL; - char *tmp_str1 = NULL; - char *dup_str = NULL; - char *num = NULL; - char *pattern_str = NULL; - char *pattern = NULL; - gf_defrag_pattern_list_t *temp_list = NULL; - gf_defrag_pattern_list_t *pattern_list = NULL; - - if (!this || !defrag || !data) - goto out; - - /* Get the pattern for pattern list. "pattern:<optional-size>" - * eg: *avi, *pdf:10MB, *:1TB - */ - pattern_str = strtok_r (data, ",", &tmp_str); - while (pattern_str) { - dup_str = gf_strdup (pattern_str); - pattern_list = GF_CALLOC (1, sizeof (gf_defrag_pattern_list_t), - 1); - if (!pattern_list) { - goto out; - } - pattern = strtok_r (dup_str, ":", &tmp_str1); - num = strtok_r (NULL, ":", &tmp_str1); - if (!pattern) - goto out; - if (!num) { - if (gf_string2bytesize_uint64(pattern, &pattern_list->size) - == 0) { - pattern = "*"; - } - } else if (gf_string2bytesize_uint64 (num, &pattern_list->size) != 0) { - gf_log (this->name, GF_LOG_ERROR, - "invalid number format \"%s\"", num); - goto out; - } - memcpy (pattern_list->path_pattern, pattern, strlen (dup_str)); - - if (!defrag->defrag_pattern) - temp_list = NULL; - else - temp_list = defrag->defrag_pattern; - - pattern_list->next = temp_list; - - defrag->defrag_pattern = pattern_list; - pattern_list = NULL; - - GF_FREE (dup_str); - dup_str = NULL; - - pattern_str = strtok_r (NULL, ",", &tmp_str); + int ret = -1; + char *tmp_str = NULL; + char *tmp_str1 = NULL; + char *dup_str = NULL; + char *num = NULL; + char *pattern_str = NULL; + char *pattern = NULL; + gf_defrag_pattern_list_t *temp_list = NULL; + gf_defrag_pattern_list_t *pattern_list = NULL; + + if (!this || !defrag || !data) + goto out; + + /* Get the pattern for pattern list. "pattern:<optional-size>" + * eg: *avi, *pdf:10MB, *:1TB + */ + pattern_str = strtok_r(data, ",", &tmp_str); + while (pattern_str) { + dup_str = gf_strdup(pattern_str); + if (!dup_str) + goto out; + pattern_list = GF_CALLOC(1, sizeof(gf_defrag_pattern_list_t), 1); + if (!pattern_list) { + goto out; + } + pattern = strtok_r(dup_str, ":", &tmp_str1); + num = strtok_r(NULL, ":", &tmp_str1); + if (!pattern) + goto out; + if (!num) { + if (gf_string2bytesize_uint64(pattern, &pattern_list->size) == 0) { + pattern = "*"; + } + } else if (gf_string2bytesize_uint64(num, &pattern_list->size) != 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INVALID_OPTION, + "Invalid option. Defrag pattern:" + " Invalid number format \"%s\"", + num); + goto out; } + memcpy(pattern_list->path_pattern, pattern, strlen(dup_str)); + + if (!defrag->defrag_pattern) + temp_list = NULL; + else + temp_list = defrag->defrag_pattern; + + pattern_list->next = temp_list; + + defrag->defrag_pattern = pattern_list; + pattern_list = NULL; + + GF_FREE(dup_str); + dup_str = NULL; - ret = 0; + pattern_str = strtok_r(NULL, ",", &tmp_str); + } + + ret = 0; out: - if (ret) - GF_FREE (pattern_list); - GF_FREE (dup_str); + if (ret) + GF_FREE(pattern_list); + GF_FREE(dup_str); - return ret; + return ret; } -int -dht_init (xlator_t *this) +static int +dht_init_methods(xlator_t *this) { - dht_conf_t *conf = NULL; - char *temp_str = NULL; - int ret = -1; - int i = 0; - gf_defrag_info_t *defrag = NULL; - int cmd = 0; - char *node_uuid = NULL; + int ret = -1; + dht_conf_t *conf = NULL; + dht_methods_t *methods = NULL; + GF_VALIDATE_OR_GOTO("dht", this, err); - GF_VALIDATE_OR_GOTO ("dht", this, err); + conf = this->private; + methods = &(conf->methods); - if (!this->children) { - gf_log (this->name, GF_LOG_CRITICAL, - "Distribute needs more than one subvolume"); - return -1; - } + methods->migration_get_dst_subvol = dht_migration_get_dst_subvol; + methods->migration_other = NULL; + methods->layout_search = dht_layout_search; - if (!this->parents) { - gf_log (this->name, GF_LOG_WARNING, - "dangling volume. check volfile"); - } + ret = 0; +err: + return ret; +} - conf = GF_CALLOC (1, sizeof (*conf), gf_dht_mt_dht_conf_t); - if (!conf) { - goto err; - } +int +dht_init(xlator_t *this) +{ + dht_conf_t *conf = NULL; + char *temp_str = NULL; + int ret = -1; + int i = 0; + gf_defrag_info_t *defrag = NULL; + int cmd = 0; + char *node_uuid = NULL; + uint32_t commit_hash = 0; + + GF_VALIDATE_OR_GOTO("dht", this, err); + + if (!this->children) { + gf_msg(this->name, GF_LOG_CRITICAL, 0, DHT_MSG_INVALID_CONFIGURATION, + "Distribute needs more than one subvolume"); + return -1; + } - ret = dict_get_int32 (this->options, "rebalance-cmd", &cmd); + if (!this->parents) { + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_INVALID_CONFIGURATION, + "dangling volume. check volfile"); + } - if (cmd) { - defrag = GF_CALLOC (1, sizeof (gf_defrag_info_t), - gf_defrag_info_mt); + conf = GF_CALLOC(1, sizeof(*conf), gf_dht_mt_dht_conf_t); + if (!conf) { + goto err; + } - GF_VALIDATE_OR_GOTO (this->name, defrag, err); + LOCK_INIT(&conf->subvolume_lock); + LOCK_INIT(&conf->layout_lock); + LOCK_INIT(&conf->lock); + synclock_init(&conf->link_lock, SYNC_LOCK_DEFAULT); - LOCK_INIT (&defrag->lock); + /* We get the commit-hash to set only for rebalance process */ + if (dict_get_uint32(this->options, "commit-hash", &commit_hash) == 0) { + gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_COMMIT_HASH_INFO, + "%s using commit hash %u", __func__, commit_hash); + conf->vol_commit_hash = commit_hash; + conf->vch_forced = _gf_true; + } - defrag->is_exiting = 0; + ret = dict_get_int32(this->options, "rebalance-cmd", &cmd); - conf->defrag = defrag; + if (cmd) { + defrag = GF_CALLOC(1, sizeof(gf_defrag_info_t), gf_defrag_info_mt); - ret = dict_get_str (this->options, "node-uuid", &node_uuid); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "node-uuid not " - "specified"); - goto err; - } + GF_VALIDATE_OR_GOTO(this->name, defrag, err); - if (uuid_parse (node_uuid, defrag->node_uuid)) { - gf_log (this->name, GF_LOG_ERROR, "Cannot parse " - "glusterd node uuid"); - goto err; - } + LOCK_INIT(&defrag->lock); - defrag->cmd = cmd; + defrag->is_exiting = 0; - defrag->stats = _gf_false; + conf->defrag = defrag; + defrag->this = this; + + ret = dict_get_str(this->options, "node-uuid", &node_uuid); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INVALID_CONFIGURATION, + "Invalid volume configuration: " + "node-uuid not specified"); + goto err; } - conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_ON; - if (dict_get_str (this->options, "lookup-unhashed", &temp_str) == 0) { - /* If option is not "auto", other options _should_ be boolean */ - if (strcasecmp (temp_str, "auto")) - gf_string2boolean (temp_str, &conf->search_unhashed); - else - conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_AUTO; + if (gf_uuid_parse(node_uuid, defrag->node_uuid)) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INVALID_OPTION, + "Invalid option:" + " Cannot parse glusterd node uuid"); + goto err; } - GF_OPTION_INIT ("unhashed-sticky-bit", conf->unhashed_sticky_bit, bool, - err); + defrag->cmd = cmd; - GF_OPTION_INIT ("use-readdirp", conf->use_readdirp, bool, err); + defrag->stats = _gf_false; - GF_OPTION_INIT ("min-free-disk", conf->min_free_disk, percent_or_size, - err); + defrag->queue = NULL; - GF_OPTION_INIT ("min-free-inodes", conf->min_free_inodes, percent, - err); + defrag->crawl_done = 0; - conf->dir_spread_cnt = conf->subvolume_cnt; - GF_OPTION_INIT ("directory-layout-spread", conf->dir_spread_cnt, - uint32, err); + defrag->global_error = 0; - GF_OPTION_INIT ("assert-no-child-down", conf->assert_no_child_down, - bool, err); + defrag->q_entry_count = 0; - GF_OPTION_INIT ("readdir-optimize", conf->readdir_optimize, bool, err); + defrag->wakeup_crawler = 0; - if (defrag) { - GF_OPTION_INIT ("rebalance-stats", defrag->stats, bool, err); - if (dict_get_str (this->options, "rebalance-filter", &temp_str) - == 0) { - if (gf_defrag_pattern_list_fill (this, defrag, temp_str) - == -1) { - gf_log (this->name, GF_LOG_ERROR, "Cannot parse" - " rebalance-filter (%s)", temp_str); - goto err; - } - } - } + pthread_mutex_init(&defrag->dfq_mutex, 0); + pthread_cond_init(&defrag->parallel_migration_cond, 0); + pthread_cond_init(&defrag->rebalance_crawler_alarm, 0); + pthread_cond_init(&defrag->df_wakeup_thread, 0); + + pthread_mutex_init(&defrag->fc_mutex, 0); + pthread_cond_init(&defrag->fc_wakeup_cond, 0); + + defrag->global_error = 0; + } - /* option can be any one of percent or bytes */ - conf->disk_unit = 0; - if (conf->min_free_disk < 100) - conf->disk_unit = 'p'; + conf->use_fallocate = 1; - ret = dht_init_subvolumes (this, conf); - if (ret == -1) { + conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_ON; + if (dict_get_str(this->options, "lookup-unhashed", &temp_str) == 0) { + /* If option is not "auto", other options _should_ be boolean */ + if (strcasecmp(temp_str, "auto")) { + gf_boolean_t search_unhashed_bool; + ret = gf_string2boolean(temp_str, &search_unhashed_bool); + if (ret == -1) { goto err; + } + conf->search_unhashed = search_unhashed_bool + ? GF_DHT_LOOKUP_UNHASHED_ON + : GF_DHT_LOOKUP_UNHASHED_OFF; + } else { + conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_AUTO; } + } - if (dict_get_str (this->options, "decommissioned-bricks", &temp_str) == 0) { - ret = dht_parse_decommissioned_bricks (this, conf, temp_str); - if (ret == -1) - goto err; - } + GF_OPTION_INIT("lookup-optimize", conf->lookup_optimize, bool, err); - dht_init_regex (this, this->options, "rsync-hash-regex", - &conf->rsync_regex, &conf->rsync_regex_valid); - dht_init_regex (this, this->options, "extra-hash-regex", - &conf->extra_regex, &conf->extra_regex_valid); + GF_OPTION_INIT("unhashed-sticky-bit", conf->unhashed_sticky_bit, bool, err); - ret = dht_layouts_init (this, conf); - if (ret == -1) { - goto err; - } + GF_OPTION_INIT("use-readdirp", conf->use_readdirp, bool, err); + + GF_OPTION_INIT("min-free-disk", conf->min_free_disk, percent_or_size, err); + + GF_OPTION_INIT("min-free-inodes", conf->min_free_inodes, percent, err); + + conf->dir_spread_cnt = conf->subvolume_cnt; + GF_OPTION_INIT("directory-layout-spread", conf->dir_spread_cnt, uint32, + err); + + GF_OPTION_INIT("assert-no-child-down", conf->assert_no_child_down, bool, + err); + + GF_OPTION_INIT("readdir-optimize", conf->readdir_optimize, bool, err); - LOCK_INIT (&conf->subvolume_lock); - LOCK_INIT (&conf->layout_lock); + GF_OPTION_INIT("lock-migration", conf->lock_migration_enabled, bool, err); - conf->gen = 1; + GF_OPTION_INIT("force-migration", conf->force_migration, bool, err); + + if (defrag) { + defrag->lock_migration_enabled = conf->lock_migration_enabled; + + GF_OPTION_INIT("rebalance-stats", defrag->stats, bool, err); + if (dict_get_str(this->options, "rebalance-filter", &temp_str) == 0) { + if (gf_defrag_pattern_list_fill(this, defrag, temp_str) == -1) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INVALID_OPTION, + "Invalid option:" + " Cannot parse rebalance-filter (%s)", + temp_str); - this->local_pool = mem_pool_new (dht_local_t, 512); - if (!this->local_pool) { - gf_log (this->name, GF_LOG_ERROR, - "failed to create local_t's memory pool"); goto err; + } } - - GF_OPTION_INIT ("xattr-name", conf->xattr_name, str, err); - gf_asprintf (&conf->link_xattr_name, "%s."DHT_LINKFILE_STR, - conf->xattr_name); - gf_asprintf (&conf->wild_xattr_name, "%s*", conf->xattr_name); - if (!conf->link_xattr_name || !conf->wild_xattr_name) { + } + + /* option can be any one of percent or bytes */ + conf->disk_unit = 0; + if (conf->min_free_disk < 100) + conf->disk_unit = 'p'; + + ret = dht_init_subvolumes(this, conf); + if (ret == -1) { + goto err; + } + + if (cmd) { + ret = dht_init_local_subvolumes(this, conf); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, + DHT_MSG_INIT_LOCAL_SUBVOL_FAILED, + "dht_init_local_subvolumes failed"); + goto err; + } + } + + if (dict_get_str(this->options, "decommissioned-bricks", &temp_str) == 0) { + ret = dht_parse_decommissioned_bricks(this, conf, temp_str); + if (ret == -1) + goto err; + } + + dht_init_regex(this, this->options, "rsync-hash-regex", &conf->rsync_regex, + &conf->rsync_regex_valid, conf); + dht_init_regex(this, this->options, "extra-hash-regex", &conf->extra_regex, + &conf->extra_regex_valid, conf); + + ret = dht_layouts_init(this, conf); + if (ret == -1) { + goto err; + } + + conf->gen = 1; + + this->local_pool = mem_pool_new(dht_local_t, 512); + if (!this->local_pool) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY, + " DHT initialisation failed. " + "failed to create local_t's memory pool"); + goto err; + } + + GF_OPTION_INIT("randomize-hash-range-by-gfid", conf->randomize_by_gfid, + bool, err); + + if (defrag) { + GF_OPTION_INIT("rebal-throttle", temp_str, str, err); + if (temp_str) { + ret = dht_configure_throttle(this, conf, temp_str); + if (ret == -1) goto err; } + } - this->private = conf; + GF_OPTION_INIT("xattr-name", conf->xattr_name, str, err); + gf_asprintf(&conf->mds_xattr_key, "%s." DHT_MDS_STR, conf->xattr_name); + gf_asprintf(&conf->link_xattr_name, "%s." DHT_LINKFILE_STR, + conf->xattr_name); + gf_asprintf(&conf->commithash_xattr_name, "%s." DHT_COMMITHASH_STR, + conf->xattr_name); + gf_asprintf(&conf->wild_xattr_name, "%s*", conf->xattr_name); + if (!conf->link_xattr_name || !conf->wild_xattr_name) { + goto err; + } - return 0; + GF_OPTION_INIT("weighted-rebalance", conf->do_weighting, bool, err); + + conf->lock_pool = mem_pool_new(dht_lock_t, 512); + if (!conf->lock_pool) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INIT_FAILED, + "failed to create lock mem_pool, failing " + "initialization"); + goto err; + } + + this->private = conf; + + if (dht_set_subvol_range(this)) + goto err; + + if (dht_init_methods(this)) + goto err; + + return 0; err: - if (conf) { - if (conf->file_layouts) { - for (i = 0; i < conf->subvolume_cnt; i++) { - GF_FREE (conf->file_layouts[i]); - } - GF_FREE (conf->file_layouts); - } + if (conf) { + if (conf->file_layouts) { + for (i = 0; i < conf->subvolume_cnt; i++) { + GF_FREE(conf->file_layouts[i]); + } + GF_FREE(conf->file_layouts); + } - GF_FREE (conf->subvolumes); + GF_FREE(conf->subvolumes); - GF_FREE (conf->subvolume_status); + GF_FREE(conf->subvolume_status); - GF_FREE (conf->du_stats); + GF_FREE(conf->du_stats); - GF_FREE (conf->defrag); + GF_FREE(conf->defrag); - GF_FREE (conf->xattr_name); - GF_FREE (conf->link_xattr_name); - GF_FREE (conf->wild_xattr_name); + GF_FREE(conf->xattr_name); + GF_FREE(conf->link_xattr_name); + GF_FREE(conf->wild_xattr_name); + GF_FREE(conf->mds_xattr_key); - GF_FREE (conf); - } + if (conf->lock_pool) + mem_pool_destroy(conf->lock_pool); - return -1; -} + GF_FREE(conf); + } + return -1; +} -struct volume_options options[] = { - { .key = {"lookup-unhashed"}, - .value = {"auto", "yes", "no", "enable", "disable", "1", "0", - "on", "off"}, - .type = GF_OPTION_TYPE_STR, - .default_value = "on", - .description = "This option if set to ON, does a lookup through " - "all the sub-volumes, in case a lookup didn't return any result " - "from the hash subvolume. If set to OFF, it does not do a lookup " - "on the remaining subvolumes." - }, - { .key = {"min-free-disk"}, - .type = GF_OPTION_TYPE_PERCENT_OR_SIZET, - .default_value = "10%", - .description = "Percentage/Size of disk space, after which the " - "process starts balancing out the cluster, and logs will appear " - "in log files", - }, - { .key = {"min-free-inodes"}, - .type = GF_OPTION_TYPE_PERCENT, - .default_value = "5%", - .description = "after system has only N% of inodes, warnings " - "starts to appear in log files", - }, - { .key = {"unhashed-sticky-bit"}, - .type = GF_OPTION_TYPE_BOOL, - .default_value = "off", - }, - { .key = {"use-readdirp"}, - .type = GF_OPTION_TYPE_BOOL, - .default_value = "on", - .description = "This option if set to ON, forces the use of " - "readdirp, and hence also displays the stats of the files." - }, - { .key = {"assert-no-child-down"}, - .type = GF_OPTION_TYPE_BOOL, - .default_value = "off", - .description = "This option if set to ON, in the event of " - "CHILD_DOWN, will call exit." - }, - { .key = {"directory-layout-spread"}, - .type = GF_OPTION_TYPE_INT, - .min = 1, - .validate = GF_OPT_VALIDATE_MIN, - .description = "Specifies the directory layout spread. Takes number " - "of subvolumes as default value." - }, - { .key = {"decommissioned-bricks"}, - .type = GF_OPTION_TYPE_ANY, - .description = "This option if set to ON, decommissions " - "the brick, so that no new data is allowed to be created " - "on that brick." - }, - { .key = {"rebalance-cmd"}, - .type = GF_OPTION_TYPE_INT, - }, - { .key = {"node-uuid"}, - .type = GF_OPTION_TYPE_STR, - }, - { .key = {"rebalance-stats"}, - .type = GF_OPTION_TYPE_BOOL, - .default_value = "off", - .description = "This option if set to ON displays and logs the " - " time taken for migration of each file, during the rebalance " - "process. If set to OFF, the rebalance logs will only display the " - "time spent in each directory." - }, - { .key = {"readdir-optimize"}, - .type = GF_OPTION_TYPE_BOOL, - .default_value = "off", - .description = "This option if set to ON enables the optimization " - "that allows DHT to requests non-first subvolumes to filter out " - "directory entries." - }, - { .key = {"rsync-hash-regex"}, - .type = GF_OPTION_TYPE_STR, - /* Setting a default here doesn't work. See dht_init_regex. */ - .description = "Regular expression for stripping temporary-file " - "suffix and prefix used by rsync, to prevent relocation when the " - "file is renamed." - }, - { .key = {"extra-hash-regex"}, - .type = GF_OPTION_TYPE_STR, - /* Setting a default here doesn't work. See dht_init_regex. */ - .description = "Regular expression for stripping temporary-file " - "suffix and prefix used by an application, to prevent relocation when " - "the file is renamed." - }, - { .key = {"rebalance-filter"}, - .type = GF_OPTION_TYPE_STR, - }, - - { .key = {"xattr-name"}, - .type = GF_OPTION_TYPE_STR, - .default_value = "trusted.glusterfs.dht", - .description = "Base for extended attributes used by this " - "translator instance, to avoid conflicts with others above or " - "below it." - }, - - /* NUFA option */ - { .key = {"local-volume-name"}, - .type = GF_OPTION_TYPE_XLATOR - }, - - /* switch option */ - { .key = {"pattern.switch.case"}, - .type = GF_OPTION_TYPE_ANY - }, - - { .key = {NULL} }, +struct volume_options dht_options[] = { + { + .key = {"lookup-unhashed"}, + .value = {"auto", "yes", "no", "enable", "disable", "1", "0", "on", + "off"}, + .type = GF_OPTION_TYPE_STR, + .default_value = "on", + .description = + "This option if set to ON, does a lookup through " + "all the sub-volumes, in case a lookup didn't return any result " + "from the hash subvolume. If set to OFF, it does not do a lookup " + "on the remaining subvolumes.", + .op_version = {1}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE, + .level = OPT_STATUS_BASIC, + }, + {.key = {"lookup-optimize"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", + .description = + "This option if set to ON enables the optimization " + "of -ve lookups, by not doing a lookup on non-hashed subvolumes for " + "files, in case the hashed subvolume does not return any result. " + "This option disregards the lookup-unhashed setting, when enabled.", + .op_version = {GD_OP_VERSION_3_7_2}, + .level = OPT_STATUS_ADVANCED, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC}, + {.key = {"min-free-disk"}, + .type = GF_OPTION_TYPE_PERCENT_OR_SIZET, + .default_value = "10%", + .description = + "Percentage/Size of disk space, after which the " + "process starts balancing out the cluster, and logs will appear " + "in log files", + .op_version = {1}, + .level = OPT_STATUS_BASIC, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC}, + {.key = {"min-free-inodes"}, + .type = GF_OPTION_TYPE_PERCENT, + .default_value = "5%", + .description = "after system has only N% of inodes, warnings " + "starts to appear in log files", + .op_version = {1}, + .level = OPT_STATUS_BASIC, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC}, + { + .key = {"unhashed-sticky-bit"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + }, + {.key = {"use-readdirp"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", + .description = "This option if set to ON, forces the use of " + "readdirp, and hence also displays the stats of the files.", + .level = OPT_STATUS_ADVANCED, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC}, + {.key = {"assert-no-child-down"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "This option if set to ON, in the event of " + "CHILD_DOWN, will call exit."}, + { + .key = {"directory-layout-spread"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .validate = GF_OPT_VALIDATE_MIN, + .description = "Specifies the directory layout spread. Takes number " + "of subvolumes as default value.", + + .op_version = {2}, + }, + { + .key = {"decommissioned-bricks"}, + .type = GF_OPTION_TYPE_ANY, + .description = + "This option if set to ON, decommissions " + "the brick, so that no new data is allowed to be created " + "on that brick.", + .level = OPT_STATUS_ADVANCED, + }, + { + .key = {"rebalance-cmd"}, + .type = GF_OPTION_TYPE_INT, + }, + { + .key = {"commit-hash"}, + .type = GF_OPTION_TYPE_INT, + }, + { + .key = {"node-uuid"}, + .type = GF_OPTION_TYPE_STR, + }, + { + .key = {"rebalance-stats"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = + "This option if set to ON displays and logs the " + " time taken for migration of each file, during the rebalance " + "process. If set to OFF, the rebalance logs will only display the " + "time spent in each directory.", + .op_version = {2}, + .level = OPT_STATUS_BASIC, + }, + {.key = {"readdir-optimize"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = + "This option if set to ON enables the optimization " + "that allows DHT to requests non-first subvolumes to filter out " + "directory entries.", + .op_version = {1}, + .level = OPT_STATUS_ADVANCED, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC}, + {.key = {"rsync-hash-regex"}, + .type = GF_OPTION_TYPE_STR, + /* Setting a default here doesn't work. See dht_init_regex. */ + .description = + "Regular expression for stripping temporary-file " + "suffix and prefix used by rsync, to prevent relocation when the " + "file is renamed.", + .op_version = {3}, + .level = OPT_STATUS_BASIC, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC}, + {.key = {"extra-hash-regex"}, + .type = GF_OPTION_TYPE_STR, + /* Setting a default here doesn't work. See dht_init_regex. */ + .description = + "Regular expression for stripping temporary-file " + "suffix and prefix used by an application, to prevent relocation when " + "the file is renamed.", + .op_version = {3}, + .level = OPT_STATUS_BASIC, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC}, + { + .key = {"rebalance-filter"}, + .type = GF_OPTION_TYPE_STR, + }, + + { + .key = {"xattr-name"}, + .type = GF_OPTION_TYPE_STR, + .default_value = "trusted.glusterfs.dht", + .description = + "Base for extended attributes used by this " + "translator instance, to avoid conflicts with others above or " + "below it.", + .op_version = {3}, + }, + + {.key = {"weighted-rebalance"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", + .description = + "When enabled, files will be allocated to bricks " + "with a probability proportional to their size. Otherwise, all " + "bricks will have the same probability (legacy behavior).", + .op_version = {GD_OP_VERSION_3_6_0}, + .level = OPT_STATUS_BASIC, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC}, + + /* NUFA option */ + {.key = {"local-volume-name"}, .type = GF_OPTION_TYPE_XLATOR}, + + /* switch option */ + {.key = {"pattern.switch.case"}, .type = GF_OPTION_TYPE_ANY}, + + { + .key = {"randomize-hash-range-by-gfid"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = + "Use gfid of directory to determine the subvolume " + "from which hash ranges are allocated starting with 0. " + "Note that we still use a directory/file's name to determine the " + "subvolume to which it hashes", + .op_version = {GD_OP_VERSION_3_6_0}, + }, + + {.key = {"rebal-throttle"}, + .type = GF_OPTION_TYPE_STR, + .default_value = "normal", + .description = " Sets the maximum number of parallel file migrations " + "allowed on a node during the rebalance operation. The" + " default value is normal and allows a max of " + "[($(processing units) - 4) / 2), 2] files to be " + "migrated at a time. Lazy will allow only one file to " + "be migrated at a time and aggressive will allow " + "max of [($(processing units) - 4) / 2), 4]", + .op_version = {GD_OP_VERSION_3_7_0}, + .level = OPT_STATUS_BASIC, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC + + }, + + {.key = {"lock-migration"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = " If enabled this feature will migrate the posix locks" + " associated with a file during rebalance", + .op_version = {GD_OP_VERSION_3_8_0}, + .level = OPT_STATUS_ADVANCED, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC}, + + {.key = {"force-migration"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "If disabled, rebalance will not migrate files that " + "are being written to by an application", + .op_version = {GD_OP_VERSION_4_0_0}, + .level = OPT_STATUS_ADVANCED, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC}, + + {.key = {NULL}}, }; + +#define NUM_DHT_OPTIONS (sizeof(dht_options) / sizeof(dht_options[0])) + +extern struct volume_options options[NUM_DHT_OPTIONS] + __attribute__((alias("dht_options"))); diff --git a/xlators/cluster/dht/src/dht.c b/xlators/cluster/dht/src/dht.c index fc0ca2f7735..53de8292704 100644 --- a/xlators/cluster/dht/src/dht.c +++ b/xlators/cluster/dht/src/dht.c @@ -8,82 +8,116 @@ cases as published by the Free Software Foundation. */ +#include "dht-common.h" -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif +struct xlator_fops dht_pt_fops = { + /* we need to keep mkdir to make sure we + have layout on new directory */ + .mkdir = dht_pt_mkdir, + .getxattr = dht_pt_getxattr, + .fgetxattr = dht_pt_fgetxattr, -#include "statedump.h" -#include "dht-common.h" + /* required to trace fop properly in changelog */ + .rename = dht_pt_rename, -class_methods_t class_methods = { - .init = dht_init, - .fini = dht_fini, - .reconfigure = dht_reconfigure, - .notify = dht_notify + /* FIXME: commenting the '.lookup()' below made some of + the failing tests to pass. I would remove the below + line, but keeping it here as a reminder for people + to check for issues if they find concerns with DHT + pass-through logic */ + /* + .lookup = dht_lookup, + .readdir = dht_readdir, + .readdirp = dht_readdirp, + */ + /* Keeping above as commented, mainly to support the + usecase of a gluster volume getting to 1x(anytype), + due to remove-brick (shrinking) exercise. In that case, + we would need above fops to be available, so we can + handle the case of dangling linkto files (if any) */ }; struct xlator_fops fops = { - .lookup = dht_lookup, - .mknod = dht_mknod, - .create = dht_create, + .ipc = dht_ipc, + .lookup = dht_lookup, + .mknod = dht_mknod, + .create = dht_create, - .open = dht_open, - .statfs = dht_statfs, - .opendir = dht_opendir, - .readdir = dht_readdir, - .readdirp = dht_readdirp, - .fsyncdir = dht_fsyncdir, - .symlink = dht_symlink, - .unlink = dht_unlink, - .link = dht_link, - .mkdir = dht_mkdir, - .rmdir = dht_rmdir, - .rename = dht_rename, - .entrylk = dht_entrylk, - .fentrylk = dht_fentrylk, + .open = dht_open, + .statfs = dht_statfs, + .opendir = dht_opendir, + .readdir = dht_readdir, + .readdirp = dht_readdirp, + .fsyncdir = dht_fsyncdir, + .symlink = dht_symlink, + .unlink = dht_unlink, + .link = dht_link, + .mkdir = dht_mkdir, + .rmdir = dht_rmdir, + .rename = dht_rename, + .entrylk = dht_entrylk, + .fentrylk = dht_fentrylk, - /* Inode read operations */ - .stat = dht_stat, - .fstat = dht_fstat, - .access = dht_access, - .readlink = dht_readlink, - .getxattr = dht_getxattr, - .fgetxattr = dht_fgetxattr, - .readv = dht_readv, - .flush = dht_flush, - .fsync = dht_fsync, - .inodelk = dht_inodelk, - .finodelk = dht_finodelk, - .lk = dht_lk, + /* Inode read operations */ + .stat = dht_stat, + .fstat = dht_fstat, + .access = dht_access, + .readlink = dht_readlink, + .getxattr = dht_getxattr, + .fgetxattr = dht_fgetxattr, + .readv = dht_readv, + .flush = dht_flush, + .fsync = dht_fsync, + .inodelk = dht_inodelk, + .finodelk = dht_finodelk, + .lk = dht_lk, + .lease = dht_lease, - /* Inode write operations */ - .fremovexattr = dht_fremovexattr, - .removexattr = dht_removexattr, - .setxattr = dht_setxattr, - .fsetxattr = dht_fsetxattr, - .truncate = dht_truncate, - .ftruncate = dht_ftruncate, - .writev = dht_writev, - .xattrop = dht_xattrop, - .fxattrop = dht_fxattrop, - .setattr = dht_setattr, - .fsetattr = dht_fsetattr, - .fallocate = dht_fallocate, - .discard = dht_discard, - .zerofill = dht_zerofill, + /* Inode write operations */ + .fremovexattr = dht_fremovexattr, + .removexattr = dht_removexattr, + .setxattr = dht_setxattr, + .fsetxattr = dht_fsetxattr, + .truncate = dht_truncate, + .ftruncate = dht_ftruncate, + .writev = dht_writev, + .xattrop = dht_xattrop, + .fxattrop = dht_fxattrop, + .setattr = dht_setattr, + .fsetattr = dht_fsetattr, + .fallocate = dht_fallocate, + .discard = dht_discard, + .zerofill = dht_zerofill, }; struct xlator_dumpops dumpops = { - .priv = dht_priv_dump, - .inodectx = dht_inodectx_dump, + .priv = dht_priv_dump, + .inodectx = dht_inodectx_dump, }; - struct xlator_cbks cbks = { -// .release = dht_release, -// .releasedir = dht_releasedir, - .forget = dht_forget + .release = dht_release, + // .releasedir = dht_releasedir, + .forget = dht_forget, +}; + +extern int32_t +mem_acct_init(xlator_t *this); + +extern struct volume_options dht_options[]; + +xlator_api_t xlator_api = { + .init = dht_init, + .fini = dht_fini, + .notify = dht_notify, + .reconfigure = dht_reconfigure, + .mem_acct_init = mem_acct_init, + .op_version = {1}, /* Present from the initial version */ + .dumpops = &dumpops, + .fops = &fops, + .cbks = &cbks, + .options = dht_options, + .identifier = "distribute", + .pass_through_fops = &dht_pt_fops, + .category = GF_MAINTAINED, }; -; diff --git a/xlators/cluster/dht/src/nufa.c b/xlators/cluster/dht/src/nufa.c index e934acdf00a..3648a564840 100644 --- a/xlators/cluster/dht/src/nufa.c +++ b/xlators/cluster/dht/src/nufa.c @@ -8,669 +8,650 @@ cases as published by the Free Software Foundation. */ - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - #include "dht-common.h" /* TODO: all 'TODO's in dht.c holds good */ -extern struct volume_options options[]; +extern struct volume_options dht_options[]; int -nufa_local_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, - inode_t *inode, struct iatt *stbuf, dict_t *xattr, - struct iatt *postparent) +nufa_local_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, + struct iatt *stbuf, dict_t *xattr, + struct iatt *postparent) { - xlator_t *subvol = NULL; - char is_linkfile = 0; - char is_dir = 0; - dht_conf_t *conf = NULL; - dht_local_t *local = NULL; - loc_t *loc = NULL; - int i = 0; - call_frame_t *prev = NULL; - int call_cnt = 0; - int ret = 0; - - conf = this->private; - - prev = cookie; - local = frame->local; - loc = &local->loc; - - if (ENTRY_MISSING (op_ret, op_errno)) { - if (conf->search_unhashed) { - local->op_errno = ENOENT; - dht_lookup_everywhere (frame, this, loc); - return 0; - } + xlator_t *subvol = NULL; + char is_linkfile = 0; + char is_dir = 0; + dht_conf_t *conf = NULL; + dht_local_t *local = NULL; + loc_t *loc = NULL; + int i = 0; + xlator_t *prev = NULL; + int call_cnt = 0; + int ret = 0; + + conf = this->private; + + prev = cookie; + local = frame->local; + loc = &local->loc; + + if (ENTRY_MISSING(op_ret, op_errno)) { + if (conf->search_unhashed) { + local->op_errno = ENOENT; + dht_lookup_everywhere(frame, this, loc); + return 0; } - - if (op_ret == -1) - goto out; - - is_linkfile = check_is_linkfile (inode, stbuf, xattr, - conf->link_xattr_name); - is_dir = check_is_dir (inode, stbuf, xattr); - - if (!is_dir && !is_linkfile) { - /* non-directory and not a linkfile */ - ret = dht_layout_preset (this, prev->this, inode); - if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "could not set pre-set layout for subvol %s", - prev->this->name); - op_ret = -1; - op_errno = EINVAL; - goto err; - } - - goto out; + } + + if (op_ret == -1) + goto out; + + is_linkfile = check_is_linkfile(inode, stbuf, xattr, conf->link_xattr_name); + is_dir = check_is_dir(inode, stbuf, xattr); + + if (!is_dir && !is_linkfile) { + /* non-directory and not a linkfile */ + ret = dht_layout_preset(this, prev, inode); + if (ret < 0) { + gf_msg_debug(this->name, 0, + "could not set pre-set layout for subvol" + " %s", + prev->name); + op_ret = -1; + op_errno = EINVAL; + goto err; } - if (is_dir) { - call_cnt = conf->subvolume_cnt; - local->call_cnt = call_cnt; - - local->inode = inode_ref (inode); - local->xattr = dict_ref (xattr); - - local->op_ret = 0; - local->op_errno = 0; - - local->layout = dht_layout_new (this, conf->subvolume_cnt); - if (!local->layout) { - op_ret = -1; - op_errno = ENOMEM; - goto err; - } - - for (i = 0; i < call_cnt; i++) { - STACK_WIND (frame, dht_lookup_dir_cbk, - conf->subvolumes[i], - conf->subvolumes[i]->fops->lookup, - &local->loc, local->xattr_req); - } - } + goto out; + } + + if (is_dir) { + call_cnt = conf->subvolume_cnt; + local->call_cnt = call_cnt; - if (is_linkfile) { - subvol = dht_linkfile_subvol (this, inode, stbuf, xattr); + local->inode = inode_ref(inode); + local->xattr = dict_ref(xattr); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "linkfile not having link subvolume. path=%s", - loc->path); - dht_lookup_everywhere (frame, this, loc); - return 0; - } + local->op_ret = 0; + local->op_errno = 0; - STACK_WIND (frame, dht_lookup_linkfile_cbk, - subvol, subvol->fops->lookup, - &local->loc, local->xattr_req); + local->layout = dht_layout_new(this, conf->subvolume_cnt); + if (!local->layout) { + op_ret = -1; + op_errno = ENOMEM; + goto err; } - return 0; + for (i = 0; i < call_cnt; i++) { + STACK_WIND_COOKIE(frame, dht_lookup_dir_cbk, conf->subvolumes[i], + conf->subvolumes[i], + conf->subvolumes[i]->fops->lookup, &local->loc, + local->xattr_req); + } + } -out: - if (!local->hashed_subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no subvolume in layout for path=%s", - local->loc.path); - local->op_errno = ENOENT; - dht_lookup_everywhere (frame, this, loc); - return 0; + if (is_linkfile) { + subvol = dht_linkfile_subvol(this, inode, stbuf, xattr); + + if (!subvol) { + gf_msg_debug(this->name, 0, + "linkfile has no link subvolume. path=%s", loc->path); + dht_lookup_everywhere(frame, this, loc); + return 0; } - STACK_WIND (frame, dht_lookup_cbk, - local->hashed_subvol, local->hashed_subvol->fops->lookup, - &local->loc, local->xattr_req); + STACK_WIND_COOKIE(frame, dht_lookup_linkfile_cbk, subvol, subvol, + subvol->fops->lookup, &local->loc, local->xattr_req); + } + + return 0; +out: + if (!local->hashed_subvol) { + gf_msg_debug(this->name, 0, "no subvolume in layout for path=%s", + local->loc.path); + local->op_errno = ENOENT; + dht_lookup_everywhere(frame, this, loc); return 0; + } + + STACK_WIND_COOKIE(frame, dht_lookup_cbk, local->hashed_subvol, + local->hashed_subvol, local->hashed_subvol->fops->lookup, + &local->loc, local->xattr_req); + + return 0; err: - DHT_STACK_UNWIND (lookup, frame, op_ret, op_errno, - inode, stbuf, xattr, postparent); - return 0; + DHT_STACK_UNWIND(lookup, frame, op_ret, op_errno, inode, stbuf, xattr, + postparent); + return 0; } int -nufa_lookup (call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *xattr_req) +nufa_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req) { - xlator_t *hashed_subvol = NULL; - xlator_t *subvol = NULL; - dht_local_t *local = NULL; - dht_conf_t *conf = NULL; - int ret = -1; - int op_errno = -1; - dht_layout_t *layout = NULL; - int i = 0; - int call_cnt = 0; - - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); - - conf = this->private; - - local = dht_local_init (frame, loc, NULL, GF_FOP_LOOKUP); - if (!local) { - op_errno = ENOMEM; - goto err; + xlator_t *hashed_subvol = NULL; + xlator_t *subvol = NULL; + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + int ret = -1; + int op_errno = -1; + dht_layout_t *layout = NULL; + int i = 0; + int call_cnt = 0; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(loc, err); + VALIDATE_OR_GOTO(loc->inode, err); + VALIDATE_OR_GOTO(loc->path, err); + + conf = this->private; + + local = dht_local_init(frame, loc, NULL, GF_FOP_LOOKUP); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + if (xattr_req) { + local->xattr_req = dict_ref(xattr_req); + } else { + local->xattr_req = dict_new(); + } + + hashed_subvol = dht_subvol_get_hashed(this, &local->loc); + + local->hashed_subvol = hashed_subvol; + + if (is_revalidate(loc)) { + layout = local->layout; + if (!layout) { + gf_msg_debug(this->name, 0, + "revalidate lookup without cache. " + "path=%s", + loc->path); + op_errno = EINVAL; + goto err; } - if (xattr_req) { - local->xattr_req = dict_ref (xattr_req); - } else { - local->xattr_req = dict_new (); + if (layout->gen && (layout->gen < conf->gen)) { + gf_msg_debug(this->name, 0, "incomplete layout failure for path=%s", + loc->path); + dht_layout_unref(this, local->layout); + goto do_fresh_lookup; } - hashed_subvol = dht_subvol_get_hashed (this, &local->loc); - - local->hashed_subvol = hashed_subvol; - - if (is_revalidate (loc)) { - layout = local->layout; - if (!layout) { - gf_log (this->name, GF_LOG_DEBUG, - "revalidate without cache. path=%s", - loc->path); - op_errno = EINVAL; - goto err; - } - - if (layout->gen && (layout->gen < conf->gen)) { - gf_log (this->name, GF_LOG_DEBUG, - "incomplete layout failure for path=%s", - loc->path); - dht_layout_unref (this, local->layout); - goto do_fresh_lookup; - } - - local->inode = inode_ref (loc->inode); - - local->call_cnt = layout->cnt; - call_cnt = local->call_cnt; - - /* NOTE: we don't require 'trusted.glusterfs.dht.linkto' attribute, - * revalidates directly go to the cached-subvolume. - */ - ret = dict_set_uint32 (local->xattr_req, - conf->xattr_name, 4 * 4); - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, - "Failed to set dict value."); - op_errno = -1; - goto err; - } - - for (i = 0; i < layout->cnt; i++) { - subvol = layout->list[i].xlator; - - STACK_WIND (frame, dht_revalidate_cbk, - subvol, subvol->fops->lookup, - loc, local->xattr_req); - - if (!--call_cnt) - break; - } - } else { - do_fresh_lookup: - ret = dict_set_uint32 (local->xattr_req, - conf->xattr_name, 4 * 4); - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, - "Failed to set dict value."); - op_errno = -1; - goto err; - } - - ret = dict_set_uint32 (local->xattr_req, - conf->link_xattr_name, 256); - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, - "Failed to set dict value."); - op_errno = -1; - goto err; - } - - /* Send it to only local volume */ - STACK_WIND (frame, nufa_local_lookup_cbk, - (xlator_t *)conf->private, - ((xlator_t *)conf->private)->fops->lookup, - loc, local->xattr_req); - } + local->inode = inode_ref(loc->inode); - return 0; + local->call_cnt = layout->cnt; + call_cnt = local->call_cnt; -err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, - NULL); - return 0; -} + /* NOTE: we don't require 'trusted.glusterfs.dht.linkto' attribute, + * revalidates directly go to the cached-subvolume. + */ + ret = dict_set_uint32(local->xattr_req, conf->xattr_name, 4 * 4); + if (ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, + "Failed to set dict value."); + op_errno = -1; + goto err; + } -int -nufa_create_linkfile_create_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int op_ret, int op_errno, - inode_t *inode, struct iatt *stbuf, - struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) -{ - dht_local_t *local = NULL; + for (i = 0; i < layout->cnt; i++) { + subvol = layout->list[i].xlator; + + STACK_WIND_COOKIE(frame, dht_revalidate_cbk, subvol, subvol, + subvol->fops->lookup, loc, local->xattr_req); - local = frame->local; + if (!--call_cnt) + break; + } + } else { + do_fresh_lookup: + ret = dict_set_uint32(local->xattr_req, conf->xattr_name, 4 * 4); + if (ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, + "Failed to set dict value."); + op_errno = -1; + goto err; + } - if (op_ret == -1) - goto err; + ret = dict_set_uint32(local->xattr_req, conf->link_xattr_name, 256); + if (ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, + "Failed to set dict value."); + op_errno = -1; + goto err; + } - STACK_WIND (frame, dht_create_cbk, - local->cached_subvol, local->cached_subvol->fops->create, - &local->loc, local->flags, local->mode, local->umask, - local->fd, local->params); + /* Send it to only local volume */ + STACK_WIND_COOKIE( + frame, nufa_local_lookup_cbk, ((xlator_t *)conf->private), + ((xlator_t *)conf->private), + ((xlator_t *)conf->private)->fops->lookup, loc, local->xattr_req); + } - return 0; + return 0; err: - DHT_STACK_UNWIND (create, frame, -1, op_errno, - NULL, NULL, NULL, NULL, NULL, NULL); - return 0; + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND(lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL); + return 0; } int -nufa_create (call_frame_t *frame, xlator_t *this, - loc_t *loc, int32_t flags, mode_t mode, - mode_t umask, fd_t *fd, dict_t *params) +nufa_create_linkfile_create_cbk(call_frame_t *frame, void *cookie, + xlator_t *this, int op_ret, int op_errno, + inode_t *inode, struct iatt *stbuf, + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) { - dht_local_t *local = NULL; - dht_conf_t *conf = NULL; - xlator_t *subvol = NULL; - xlator_t *avail_subvol = NULL; - int op_errno = -1; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); + dht_local_t *local = NULL; - conf = this->private; + local = frame->local; - dht_get_du_info (frame, this, loc); + if (op_ret == -1) + goto err; - local = dht_local_init (frame, loc, fd, GF_FOP_CREATE); - if (!local) { - op_errno = ENOMEM; - goto err; - } + STACK_WIND_COOKIE(frame, dht_create_cbk, local->cached_subvol, + local->cached_subvol, local->cached_subvol->fops->create, + &local->loc, local->flags, local->mode, local->umask, + local->fd, local->params); - subvol = dht_subvol_get_hashed (this, loc); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no subvolume in layout for path=%s", - loc->path); - op_errno = ENOENT; - goto err; - } + return 0; - avail_subvol = conf->private; - if (dht_is_subvol_filled (this, (xlator_t *)conf->private)) { - avail_subvol = - dht_free_disk_available_subvol (this, - (xlator_t *)conf->private, - local); - } +err: + DHT_STACK_UNWIND(create, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL, + NULL); + return 0; +} - if (subvol != avail_subvol) { - /* create a link file instead of actual file */ - local->params = dict_ref (params); - local->mode = mode; - local->flags = flags; - local->umask = umask; - local->cached_subvol = avail_subvol; - dht_linkfile_create (frame, nufa_create_linkfile_create_cbk, - this, avail_subvol, subvol, loc); - return 0; - } +int +nufa_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + mode_t mode, mode_t umask, fd_t *fd, dict_t *params) +{ + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + xlator_t *subvol = NULL; + xlator_t *avail_subvol = NULL; + int op_errno = -1; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(loc, err); + + conf = this->private; + + dht_get_du_info(frame, this, loc); + + local = dht_local_init(frame, loc, fd, GF_FOP_CREATE); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + subvol = dht_subvol_get_hashed(this, loc); + if (!subvol) { + gf_msg_debug(this->name, 0, "no subvolume in layout for path=%s", + loc->path); + op_errno = ENOENT; + goto err; + } + + avail_subvol = conf->private; + if (dht_is_subvol_filled(this, (xlator_t *)conf->private)) { + avail_subvol = dht_free_disk_available_subvol( + this, (xlator_t *)conf->private, local); + } + + if (subvol != avail_subvol) { + /* create a link file instead of actual file */ + local->params = dict_ref(params); + local->mode = mode; + local->flags = flags; + local->umask = umask; + local->cached_subvol = avail_subvol; + dht_linkfile_create(frame, nufa_create_linkfile_create_cbk, this, + avail_subvol, subvol, loc); + return 0; + } - gf_log (this->name, GF_LOG_TRACE, - "creating %s on %s", loc->path, subvol->name); + gf_msg_trace(this->name, 0, "creating %s on %s", loc->path, subvol->name); - STACK_WIND (frame, dht_create_cbk, - subvol, subvol->fops->create, - loc, flags, mode, umask, fd, params); + STACK_WIND_COOKIE(frame, dht_create_cbk, subvol, subvol, + subvol->fops->create, loc, flags, mode, umask, fd, + params); - return 0; + return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (create, frame, -1, op_errno, - NULL, NULL, NULL, NULL, NULL, NULL); + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND(create, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL, + NULL); - return 0; + return 0; } int -nufa_mknod_linkfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, inode_t *inode, - struct iatt *stbuf, struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) +nufa_mknod_linkfile_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, + struct iatt *stbuf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - dht_local_t *local = NULL; + dht_local_t *local = NULL; - local = frame->local; - if (!local || !local->cached_subvol) { - op_errno = EINVAL; - op_ret = -1; - goto err; - } + local = frame->local; + if (!local || !local->cached_subvol) { + op_errno = EINVAL; + op_ret = -1; + goto err; + } - if (op_ret >= 0) { - STACK_WIND_COOKIE (frame, dht_newfile_cbk, - (void *)local->cached_subvol, local->cached_subvol, - local->cached_subvol->fops->mknod, - &local->loc, local->mode, local->rdev, - local->umask, local->params); + if (op_ret >= 0) { + STACK_WIND_COOKIE( + frame, dht_newfile_cbk, (void *)local->cached_subvol, + local->cached_subvol, local->cached_subvol->fops->mknod, + &local->loc, local->mode, local->rdev, local->umask, local->params); - return 0; - } + return 0; + } err: - WIPE (postparent); - WIPE (preparent); + WIPE(postparent); + WIPE(preparent); - DHT_STACK_UNWIND (link, frame, op_ret, op_errno, - inode, stbuf, preparent, postparent, xdata); - return 0; + DHT_STACK_UNWIND(link, frame, op_ret, op_errno, inode, stbuf, preparent, + postparent, xdata); + return 0; } - int -nufa_mknod (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, dev_t rdev, mode_t umask, dict_t *params) +nufa_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + dev_t rdev, mode_t umask, dict_t *params) { - dht_local_t *local = NULL; - dht_conf_t *conf = NULL; - xlator_t *subvol = NULL; - xlator_t *avail_subvol = NULL; - int op_errno = -1; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - - conf = this->private; - - dht_get_du_info (frame, this, loc); - - local = dht_local_init (frame, loc, NULL, GF_FOP_MKNOD); - if (!local) { - op_errno = ENOMEM; - goto err; - } - - subvol = dht_subvol_get_hashed (this, loc); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no subvolume in layout for path=%s", - loc->path); - op_errno = ENOENT; - goto err; - } - - /* Consider the disksize in consideration */ - avail_subvol = conf->private; - if (dht_is_subvol_filled (this, (xlator_t *)conf->private)) { - avail_subvol = - dht_free_disk_available_subvol (this, - (xlator_t *)conf->private, - local); - } - - if (avail_subvol != subvol) { - /* Create linkfile first */ - - local->params = dict_ref (params); - local->mode = mode; - local->umask = umask; - local->rdev = rdev; - local->cached_subvol = avail_subvol; + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + xlator_t *subvol = NULL; + xlator_t *avail_subvol = NULL; + int op_errno = -1; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(loc, err); + + conf = this->private; + + dht_get_du_info(frame, this, loc); + + local = dht_local_init(frame, loc, NULL, GF_FOP_MKNOD); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + subvol = dht_subvol_get_hashed(this, loc); + if (!subvol) { + gf_msg_debug(this->name, 0, "no subvolume in layout for path=%s", + loc->path); + op_errno = ENOENT; + goto err; + } + + /* Consider the disksize in consideration */ + avail_subvol = conf->private; + if (dht_is_subvol_filled(this, (xlator_t *)conf->private)) { + avail_subvol = dht_free_disk_available_subvol( + this, (xlator_t *)conf->private, local); + } + + if (avail_subvol != subvol) { + /* Create linkfile first */ + + local->params = dict_ref(params); + local->mode = mode; + local->umask = umask; + local->rdev = rdev; + local->cached_subvol = avail_subvol; + + dht_linkfile_create(frame, nufa_mknod_linkfile_cbk, this, avail_subvol, + subvol, loc); + return 0; + } - dht_linkfile_create (frame, nufa_mknod_linkfile_cbk, this, - avail_subvol, subvol, loc); - return 0; - } + gf_msg_trace(this->name, 0, "creating %s on %s", loc->path, subvol->name); - gf_log (this->name, GF_LOG_TRACE, - "creating %s on %s", loc->path, subvol->name); + STACK_WIND_COOKIE(frame, dht_newfile_cbk, (void *)subvol, subvol, + subvol->fops->mknod, loc, mode, rdev, umask, params); - STACK_WIND_COOKIE (frame, dht_newfile_cbk, (void *)subvol, subvol, - subvol->fops->mknod, loc, mode, rdev, umask, - params); - - return 0; + return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (mknod, frame, -1, op_errno, - NULL, NULL, NULL, NULL, NULL); + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND(mknod, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL); - return 0; + return 0; } - gf_boolean_t -same_first_part (char *str1, char term1, char *str2, char term2) +same_first_part(char *str1, char term1, char *str2, char term2) { - gf_boolean_t ended1; - gf_boolean_t ended2; - - for (;;) { - ended1 = ((*str1 == '\0') || (*str1 == term1)); - ended2 = ((*str2 == '\0') || (*str2 == term2)); - if (ended1 && ended2) { - return _gf_true; - } - if (ended1 || ended2 || (*str1 != *str2)) { - return _gf_false; - } - ++str1; - ++str2; + gf_boolean_t ended1; + gf_boolean_t ended2; + + for (;;) { + ended1 = ((*str1 == '\0') || (*str1 == term1)); + ended2 = ((*str2 == '\0') || (*str2 == term2)); + if (ended1 && ended2) { + return _gf_true; + } + if (ended1 || ended2 || (*str1 != *str2)) { + return _gf_false; } + ++str1; + ++str2; + } } typedef struct nufa_args { - xlator_t *this; - char *volname; - gf_boolean_t addr_match; + xlator_t *this; + char *volname; + gf_boolean_t addr_match; } nufa_args_t; static void -nufa_find_local_brick (xlator_t *xl, void *data) +nufa_find_local_brick(xlator_t *xl, void *data) { - nufa_args_t *args = data; - xlator_t *this = args->this; - char *local_volname = args->volname; - gf_boolean_t addr_match = args->addr_match; - char *brick_host = NULL; - dht_conf_t *conf = this->private; - int ret = -1; - - /*This means a local subvol was already found. We pick the first brick - * that is local*/ - if (conf->private) - return; - - if (strcmp (xl->name, local_volname) == 0) { - conf->private = xl; - gf_log (this->name, GF_LOG_INFO, "Using specified subvol %s", - local_volname); - return; - } - - if (!addr_match) - return; - - ret = dict_get_str (xl->options, "remote-host", &brick_host); - if ((ret == 0) && - (gf_is_same_address (local_volname, brick_host) || - gf_is_local_addr (brick_host))) { - conf->private = xl; - gf_log (this->name, GF_LOG_INFO, "Using the first local " - "subvol %s", xl->name); - return; - } - + nufa_args_t *args = data; + xlator_t *this = args->this; + char *local_volname = args->volname; + gf_boolean_t addr_match = args->addr_match; + char *brick_host = NULL; + dht_conf_t *conf = this->private; + int ret = -1; + + /*This means a local subvol was already found. We pick the first brick + * that is local*/ + if (conf->private) + return; + + if (strcmp(xl->name, local_volname) == 0) { + conf->private = xl; + gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_SUBVOL_INFO, + "Using specified subvol %s", local_volname); + return; + } + + if (!addr_match) + return; + + ret = dict_get_str(xl->options, "remote-host", &brick_host); + if ((ret == 0) && (gf_is_same_address(local_volname, brick_host) || + gf_is_local_addr(brick_host))) { + conf->private = xl; + gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_SUBVOL_INFO, + "Using the first local " + "subvol %s", + xl->name); + return; + } } static void -nufa_to_dht (xlator_t *this) +nufa_to_dht(xlator_t *this) { - GF_ASSERT (this); - GF_ASSERT (this->fops); + GF_ASSERT(this); + GF_ASSERT(this->fops); - this->fops->lookup = dht_lookup; - this->fops->create = dht_create; - this->fops->mknod = dht_mknod; + this->fops->lookup = dht_lookup; + this->fops->create = dht_create; + this->fops->mknod = dht_mknod; } int -nufa_find_local_subvol (xlator_t *this, - void (*fn) (xlator_t *each, void* data), void *data) +nufa_find_local_subvol(xlator_t *this, void (*fn)(xlator_t *each, void *data), + void *data) { - int ret = -1; - dht_conf_t *conf = this->private; - xlator_list_t *trav = NULL; - xlator_t *parent = NULL; - xlator_t *candidate = NULL; - - xlator_foreach_depth_first (this, fn, data); - if (!conf->private) { - gf_log (this->name, GF_LOG_ERROR, "Couldn't find a local " - "brick"); - return -1; + int ret = -1; + dht_conf_t *conf = this->private; + xlator_list_t *trav = NULL; + xlator_t *parent = NULL; + xlator_t *candidate = NULL; + + xlator_foreach_depth_first(this, fn, data); + if (!conf->private) { + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_BRICK_ERROR, + "Couldn't find a local " + "brick"); + return -1; + } + + candidate = conf->private; + trav = candidate->parents; + while (trav) { + parent = trav->xlator; + if (strcmp(parent->type, "cluster/nufa") == 0) { + gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_SUBVOL_INFO, + "Found local subvol, " + "%s", + candidate->name); + ret = 0; + conf->private = candidate; + break; } - candidate = conf->private; - trav = candidate->parents; - while (trav) { - - parent = trav->xlator; - if (strcmp (parent->type, "cluster/nufa") == 0) { - gf_log (this->name, GF_LOG_INFO, "Found local subvol, " - "%s", candidate->name); - ret = 0; - conf->private = candidate; - break; - } - - candidate = parent; - trav = parent->parents; - } + candidate = parent; + trav = parent->parents; + } - return ret; + return ret; } int -nufa_init (xlator_t *this) +nufa_init(xlator_t *this) { - data_t *data = NULL; - char *local_volname = NULL; - int ret = -1; - char my_hostname[256]; - gf_boolean_t addr_match = _gf_false; - nufa_args_t args = {0, }; - - ret = dht_init(this); - if (ret) { - return ret; - } - - if ((data = dict_get (this->options, "local-volume-name"))) { - local_volname = data->data; - - } else { - addr_match = _gf_true; - local_volname = "localhost"; - ret = gethostname (my_hostname, 256); - if (ret == 0) - local_volname = my_hostname; - - else - gf_log (this->name, GF_LOG_WARNING, - "could not find hostname (%s)", - strerror (errno)); - - } - - args.this = this; - args.volname = local_volname; - args.addr_match = addr_match; - ret = nufa_find_local_subvol (this, nufa_find_local_brick, &args); - if (ret) { - gf_log (this->name, GF_LOG_INFO, - "Unable to find local subvolume, switching " - "to dht mode"); - nufa_to_dht (this); - } - return 0; + data_t *data = NULL; + char *local_volname = NULL; + int ret = -1; + char my_hostname[256]; + gf_boolean_t addr_match = _gf_false; + nufa_args_t args = { + 0, + }; + + ret = dht_init(this); + if (ret) { + return ret; + } + + if ((data = dict_get(this->options, "local-volume-name"))) { + local_volname = data->data; + + } else { + addr_match = _gf_true; + local_volname = "localhost"; + ret = gethostname(my_hostname, 256); + if (ret == 0) + local_volname = my_hostname; + + else + gf_msg(this->name, GF_LOG_WARNING, errno, + DHT_MSG_GET_HOSTNAME_FAILED, "could not find hostname"); + } + + args.this = this; + args.volname = local_volname; + args.addr_match = addr_match; + ret = nufa_find_local_subvol(this, nufa_find_local_brick, &args); + if (ret) { + gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_SUBVOL_INFO, + "Unable to find local subvolume, switching " + "to dht mode"); + nufa_to_dht(this); + } + return 0; } - -class_methods_t class_methods = { - .init = nufa_init, - .fini = dht_fini, - .reconfigure = dht_reconfigure, - .notify = dht_notify +dht_methods_t dht_methods = { + .migration_get_dst_subvol = dht_migration_get_dst_subvol, + .layout_search = dht_layout_search, }; - struct xlator_fops fops = { - .lookup = nufa_lookup, - .create = nufa_create, - .mknod = nufa_mknod, - - .stat = dht_stat, - .fstat = dht_fstat, - .truncate = dht_truncate, - .ftruncate = dht_ftruncate, - .access = dht_access, - .readlink = dht_readlink, - .setxattr = dht_setxattr, - .getxattr = dht_getxattr, - .removexattr = dht_removexattr, - .open = dht_open, - .readv = dht_readv, - .writev = dht_writev, - .flush = dht_flush, - .fsync = dht_fsync, - .statfs = dht_statfs, - .lk = dht_lk, - .opendir = dht_opendir, - .readdir = dht_readdir, - .readdirp = dht_readdirp, - .fsyncdir = dht_fsyncdir, - .symlink = dht_symlink, - .unlink = dht_unlink, - .link = dht_link, - .mkdir = dht_mkdir, - .rmdir = dht_rmdir, - .rename = dht_rename, - .inodelk = dht_inodelk, - .finodelk = dht_finodelk, - .entrylk = dht_entrylk, - .fentrylk = dht_fentrylk, - .xattrop = dht_xattrop, - .fxattrop = dht_fxattrop, - .setattr = dht_setattr, + .lookup = nufa_lookup, + .create = nufa_create, + .mknod = nufa_mknod, + + .stat = dht_stat, + .fstat = dht_fstat, + .truncate = dht_truncate, + .ftruncate = dht_ftruncate, + .access = dht_access, + .readlink = dht_readlink, + .setxattr = dht_setxattr, + .getxattr = dht_getxattr, + .removexattr = dht_removexattr, + .open = dht_open, + .readv = dht_readv, + .writev = dht_writev, + .flush = dht_flush, + .fsync = dht_fsync, + .statfs = dht_statfs, + .lk = dht_lk, + .opendir = dht_opendir, + .readdir = dht_readdir, + .readdirp = dht_readdirp, + .fsyncdir = dht_fsyncdir, + .symlink = dht_symlink, + .unlink = dht_unlink, + .link = dht_link, + .mkdir = dht_mkdir, + .rmdir = dht_rmdir, + .rename = dht_rename, + .inodelk = dht_inodelk, + .finodelk = dht_finodelk, + .entrylk = dht_entrylk, + .fentrylk = dht_fentrylk, + .xattrop = dht_xattrop, + .fxattrop = dht_fxattrop, + .setattr = dht_setattr, }; - -struct xlator_cbks cbks = { - .forget = dht_forget +struct xlator_cbks cbks = {.forget = dht_forget}; +extern int32_t +mem_acct_init(xlator_t *this); + +xlator_api_t xlator_api = { + .init = nufa_init, + .fini = dht_fini, + .notify = dht_notify, + .reconfigure = dht_reconfigure, + .mem_acct_init = mem_acct_init, + .op_version = {1}, /* Present from the initial version */ + .fops = &fops, + .cbks = &cbks, + .options = dht_options, + .identifier = "nufa", + .category = GF_TECH_PREVIEW, }; diff --git a/xlators/cluster/dht/src/switch.c b/xlators/cluster/dht/src/switch.c index 2717ce97541..207d109a025 100644 --- a/xlators/cluster/dht/src/switch.c +++ b/xlators/cluster/dht/src/switch.c @@ -8,12 +8,6 @@ cases as published by the Free Software Foundation. */ - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - #include "dht-common.h" #include "dht-mem-types.h" @@ -22,883 +16,876 @@ #include <fnmatch.h> #include <string.h> -extern struct volume_options options[]; +extern struct volume_options dht_options[]; struct switch_sched_array { - xlator_t *xl; - int32_t eligible; - int32_t considered; + xlator_t *xl; + int32_t eligible; + int32_t considered; }; /* Select one of this struct based on the path's pattern match */ struct switch_struct { - struct switch_struct *next; - struct switch_sched_array *array; - int32_t node_index; /* Index of the node in - this pattern. */ - int32_t num_child; /* Total num of child nodes - with this pattern. */ - char path_pattern[256]; + struct switch_struct *next; + struct switch_sched_array *array; + int32_t node_index; /* Index of the node in + this pattern. */ + int32_t num_child; /* Total num of child nodes + with this pattern. */ + char path_pattern[256]; }; /* TODO: all 'TODO's in dht.c holds good */ /* This function should return child node as '*:subvolumes' is inserterd */ static int32_t -gf_switch_valid_child (xlator_t *this, const char *child) +gf_switch_valid_child(xlator_t *this, const char *child) { - xlator_list_t *children = NULL; - int32_t ret = 0; - - children = this->children; - while (children) { - if (!strcmp (child, children->xlator->name)) { - ret = 1; - break; - } - children = children->next; + xlator_list_t *children = NULL; + int32_t ret = 0; + + children = this->children; + while (children) { + if (!strcmp(child, children->xlator->name)) { + ret = 1; + break; } + children = children->next; + } - return ret; + return ret; } static xlator_t * -get_switch_matching_subvol (const char *path, dht_conf_t *conf, - xlator_t *hashed_subvol) +get_switch_matching_subvol(const char *path, dht_conf_t *conf, + xlator_t *hashed_subvol) { - struct switch_struct *cond = NULL; - struct switch_struct *trav = NULL; - char *pathname = NULL; - int idx = 0; - xlator_t *subvol = NULL; - - cond = conf->private; - subvol = hashed_subvol; - if (!cond) - goto out; - - pathname = gf_strdup (path); - if (!pathname) - goto out; - - trav = cond; - while (trav) { - if (fnmatch (trav->path_pattern, - pathname, FNM_NOESCAPE) == 0) { - for (idx = 0; idx < trav->num_child; idx++) { - if (trav->array[idx].xl == hashed_subvol) - goto out; - } - idx = trav->node_index++; - trav->node_index %= trav->num_child; - subvol = trav->array[idx].xl; - goto out; - } - trav = trav->next; + struct switch_struct *cond = NULL; + struct switch_struct *trav = NULL; + char *pathname = NULL; + int idx = 0; + xlator_t *subvol = NULL; + + cond = conf->private; + subvol = hashed_subvol; + if (!cond) + goto out; + + pathname = gf_strdup(path); + if (!pathname) + goto out; + + trav = cond; + while (trav) { + if (fnmatch(trav->path_pattern, pathname, FNM_NOESCAPE) == 0) { + for (idx = 0; idx < trav->num_child; idx++) { + if (trav->array[idx].xl == hashed_subvol) + goto out; + } + idx = trav->node_index++; + trav->node_index %= trav->num_child; + subvol = trav->array[idx].xl; + goto out; } + trav = trav->next; + } out: - GF_FREE (pathname); + GF_FREE(pathname); - return subvol; + return subvol; } - int -switch_local_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, - inode_t *inode, struct iatt *stbuf, dict_t *xattr, - struct iatt *postparent) +switch_local_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, + struct iatt *stbuf, dict_t *xattr, + struct iatt *postparent) { - xlator_t *subvol = NULL; - char is_linkfile = 0; - char is_dir = 0; - dht_conf_t *conf = NULL; - dht_local_t *local = NULL; - loc_t *loc = NULL; - int i = 0; - call_frame_t *prev = NULL; - int call_cnt = 0; - int ret = 0; - - conf = this->private; - - prev = cookie; - local = frame->local; - loc = &local->loc; - - if (ENTRY_MISSING (op_ret, op_errno)) { - if (conf->search_unhashed) { - local->op_errno = ENOENT; - dht_lookup_everywhere (frame, this, loc); - return 0; - } + xlator_t *subvol = NULL; + char is_linkfile = 0; + char is_dir = 0; + dht_conf_t *conf = NULL; + dht_local_t *local = NULL; + loc_t *loc = NULL; + int i = 0; + xlator_t *prev = NULL; + int call_cnt = 0; + int ret = 0; + + conf = this->private; + + prev = cookie; + local = frame->local; + loc = &local->loc; + + if (ENTRY_MISSING(op_ret, op_errno)) { + if (conf->search_unhashed) { + local->op_errno = ENOENT; + dht_lookup_everywhere(frame, this, loc); + return 0; } - - if (op_ret == -1) - goto out; - - is_linkfile = check_is_linkfile (inode, stbuf, xattr, - conf->link_xattr_name); - is_dir = check_is_dir (inode, stbuf, xattr); - - if (!is_dir && !is_linkfile) { - /* non-directory and not a linkfile */ - - ret = dht_layout_preset (this, prev->this, inode); - if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "could not set pre-set layout for subvol %s", - prev->this->name); - op_ret = -1; - op_errno = EINVAL; - goto err; - } - - goto out; + } + + if (op_ret == -1) + goto out; + + is_linkfile = check_is_linkfile(inode, stbuf, xattr, conf->link_xattr_name); + is_dir = check_is_dir(inode, stbuf, xattr); + + if (!is_dir && !is_linkfile) { + /* non-directory and not a linkfile */ + + ret = dht_layout_preset(this, prev, inode); + if (ret < 0) { + gf_msg_debug(this->name, 0, + "could not set pre-set layout " + "for subvol %s", + prev->name); + op_ret = -1; + op_errno = EINVAL; + goto err; } - if (is_dir) { - call_cnt = conf->subvolume_cnt; - local->call_cnt = call_cnt; + goto out; + } - local->inode = inode_ref (inode); - local->xattr = dict_ref (xattr); + if (is_dir) { + call_cnt = conf->subvolume_cnt; + local->call_cnt = call_cnt; - local->op_ret = 0; - local->op_errno = 0; + local->inode = inode_ref(inode); + local->xattr = dict_ref(xattr); - local->layout = dht_layout_new (this, conf->subvolume_cnt); - if (!local->layout) { - op_ret = -1; - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_DEBUG, - "memory allocation failed :("); - goto err; - } + local->op_ret = 0; + local->op_errno = 0; - for (i = 0; i < call_cnt; i++) { - STACK_WIND (frame, dht_lookup_dir_cbk, - conf->subvolumes[i], - conf->subvolumes[i]->fops->lookup, - &local->loc, local->xattr_req); - } + local->layout = dht_layout_new(this, conf->subvolume_cnt); + if (!local->layout) { + op_ret = -1; + op_errno = ENOMEM; + gf_msg_debug(this->name, 0, "memory allocation failed :("); + goto err; } - if (is_linkfile) { - subvol = dht_linkfile_subvol (this, inode, stbuf, xattr); + for (i = 0; i < call_cnt; i++) { + STACK_WIND_COOKIE(frame, dht_lookup_dir_cbk, conf->subvolumes[i], + conf->subvolumes[i], + conf->subvolumes[i]->fops->lookup, &local->loc, + local->xattr_req); + } + } - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "linkfile not having link subvolume. path=%s", - loc->path); - dht_lookup_everywhere (frame, this, loc); - return 0; - } + if (is_linkfile) { + subvol = dht_linkfile_subvol(this, inode, stbuf, xattr); - STACK_WIND (frame, dht_lookup_linkfile_cbk, - subvol, subvol->fops->lookup, - &local->loc, local->xattr_req); + if (!subvol) { + gf_msg_debug(this->name, 0, + "linkfile has no link subvolume.path=%s", loc->path); + dht_lookup_everywhere(frame, this, loc); + return 0; } - return 0; + STACK_WIND_COOKIE(frame, dht_lookup_linkfile_cbk, subvol, subvol, + subvol->fops->lookup, &local->loc, local->xattr_req); + } + + return 0; out: - if (!local->hashed_subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no subvolume in layout for path=%s", - local->loc.path); - local->op_errno = ENOENT; - dht_lookup_everywhere (frame, this, loc); - return 0; - } + if (!local->hashed_subvol) { + gf_msg_debug(this->name, 0, "no subvolume in layout for path=%s", + local->loc.path); + local->op_errno = ENOENT; + dht_lookup_everywhere(frame, this, loc); + return 0; + } - STACK_WIND (frame, dht_lookup_cbk, - local->hashed_subvol, local->hashed_subvol->fops->lookup, - &local->loc, local->xattr_req); + STACK_WIND_COOKIE(frame, dht_lookup_cbk, local->hashed_subvol, + local->hashed_subvol, local->hashed_subvol->fops->lookup, + &local->loc, local->xattr_req); - return 0; + return 0; err: - DHT_STACK_UNWIND (lookup, frame, op_ret, op_errno, - inode, stbuf, xattr, NULL); - return 0; + DHT_STACK_UNWIND(lookup, frame, op_ret, op_errno, inode, stbuf, xattr, + NULL); + return 0; } int -switch_lookup (call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *xattr_req) +switch_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, + dict_t *xattr_req) { - xlator_t *hashed_subvol = NULL; - xlator_t *cached_subvol = NULL; - xlator_t *subvol = NULL; - dht_local_t *local = NULL; - dht_conf_t *conf = NULL; - int ret = -1; - int op_errno = -1; - dht_layout_t *layout = NULL; - int i = 0; - int call_cnt = 0; - - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); - - conf = this->private; - - local = dht_local_init (frame, loc, NULL, GF_FOP_LOOKUP); - if (!local) { - op_errno = ENOMEM; - goto err; + xlator_t *hashed_subvol = NULL; + xlator_t *cached_subvol = NULL; + xlator_t *subvol = NULL; + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + int ret = -1; + int op_errno = -1; + dht_layout_t *layout = NULL; + int i = 0; + int call_cnt = 0; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(loc, err); + VALIDATE_OR_GOTO(loc->inode, err); + VALIDATE_OR_GOTO(loc->path, err); + + conf = this->private; + + local = dht_local_init(frame, loc, NULL, GF_FOP_LOOKUP); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + if (xattr_req) { + local->xattr_req = dict_ref(xattr_req); + } else { + local->xattr_req = dict_new(); + } + + hashed_subvol = dht_subvol_get_hashed(this, &local->loc); + cached_subvol = local->cached_subvol; + + local->hashed_subvol = hashed_subvol; + + if (is_revalidate(loc)) { + layout = local->layout; + if (!layout) { + gf_msg_debug(this->name, 0, + "revalidate lookup without cache. path=%s", loc->path); + op_errno = EINVAL; + goto err; } - if (xattr_req) { - local->xattr_req = dict_ref (xattr_req); - } else { - local->xattr_req = dict_new (); + if (layout->gen && (layout->gen < conf->gen)) { + gf_msg_debug(this->name, 0, "incomplete layout failure for path=%s", + loc->path); + dht_layout_unref(this, local->layout); + goto do_fresh_lookup; } - hashed_subvol = dht_subvol_get_hashed (this, &local->loc); - cached_subvol = local->cached_subvol; + local->inode = inode_ref(loc->inode); - local->hashed_subvol = hashed_subvol; - - if (is_revalidate (loc)) { - layout = local->layout; - if (!layout) { - gf_log (this->name, GF_LOG_DEBUG, - "revalidate without cache. path=%s", - loc->path); - op_errno = EINVAL; - goto err; - } + local->call_cnt = layout->cnt; + call_cnt = local->call_cnt; - if (layout->gen && (layout->gen < conf->gen)) { - gf_log (this->name, GF_LOG_DEBUG, - "incomplete layout failure for path=%s", - loc->path); - dht_layout_unref (this, local->layout); - goto do_fresh_lookup; - } - - local->inode = inode_ref (loc->inode); - - local->call_cnt = layout->cnt; - call_cnt = local->call_cnt; + /* NOTE: we don't require 'trusted.glusterfs.dht.linkto' + * attribute, revalidates directly go to the cached-subvolume. + */ + ret = dict_set_uint32(local->xattr_req, conf->xattr_name, 4 * 4); + if (ret < 0) + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED, + "failed to set dict value for %s", conf->xattr_name); - /* NOTE: we don't require 'trusted.glusterfs.dht.linkto' - * attribute, revalidates directly go to the cached-subvolume. - */ - ret = dict_set_uint32 (local->xattr_req, - conf->xattr_name, 4 * 4); - if (ret < 0) - gf_log (this->name, GF_LOG_WARNING, - "failed to set dict value for %s", - conf->xattr_name); + for (i = 0; i < layout->cnt; i++) { + subvol = layout->list[i].xlator; - for (i = 0; i < layout->cnt; i++) { - subvol = layout->list[i].xlator; + STACK_WIND_COOKIE(frame, dht_revalidate_cbk, subvol, subvol, + subvol->fops->lookup, loc, local->xattr_req); - STACK_WIND (frame, dht_revalidate_cbk, - subvol, subvol->fops->lookup, - loc, local->xattr_req); + if (!--call_cnt) + break; + } + } else { + do_fresh_lookup: + ret = dict_set_uint32(local->xattr_req, conf->xattr_name, 4 * 4); + if (ret < 0) + gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED, + "failed to set dict value for %s", conf->xattr_name); + + ret = dict_set_uint32(local->xattr_req, conf->link_xattr_name, 256); + if (ret < 0) + gf_msg(this->name, GF_LOG_WARNING, EINVAL, DHT_MSG_DICT_SET_FAILED, + "failed to set dict value for %s", conf->link_xattr_name); + + if (!hashed_subvol) { + gf_msg_debug(this->name, 0, + "no subvolume in layout for path=%s, " + "checking on all the subvols to see if " + "it is a directory", + loc->path); + call_cnt = conf->subvolume_cnt; + local->call_cnt = call_cnt; + + local->layout = dht_layout_new(this, conf->subvolume_cnt); + if (!local->layout) { + op_errno = ENOMEM; + goto err; + } + + for (i = 0; i < call_cnt; i++) { + STACK_WIND_COOKIE(frame, dht_lookup_dir_cbk, + conf->subvolumes[i], conf->subvolumes[i], + conf->subvolumes[i]->fops->lookup, + &local->loc, local->xattr_req); + } + return 0; + } - if (!--call_cnt) - break; - } + /* */ + cached_subvol = get_switch_matching_subvol(loc->path, conf, + hashed_subvol); + if (cached_subvol == hashed_subvol) { + STACK_WIND_COOKIE(frame, dht_lookup_cbk, hashed_subvol, + hashed_subvol, hashed_subvol->fops->lookup, loc, + local->xattr_req); } else { - do_fresh_lookup: - ret = dict_set_uint32 (local->xattr_req, - conf->xattr_name, 4 * 4); - if (ret < 0) - gf_log (this->name, GF_LOG_WARNING, - "failed to set dict value for %s", - conf->xattr_name); - - ret = dict_set_uint32 (local->xattr_req, - conf->link_xattr_name, 256); - if (ret < 0) - gf_log (this->name, GF_LOG_WARNING, - "failed to set dict value for %s", - conf->link_xattr_name); - - if (!hashed_subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no subvolume in layout for path=%s, " - "checking on all the subvols to see if " - "it is a directory", loc->path); - call_cnt = conf->subvolume_cnt; - local->call_cnt = call_cnt; - - local->layout = dht_layout_new (this, - conf->subvolume_cnt); - if (!local->layout) { - op_errno = ENOMEM; - goto err; - } - - for (i = 0; i < call_cnt; i++) { - STACK_WIND (frame, dht_lookup_dir_cbk, - conf->subvolumes[i], - conf->subvolumes[i]->fops->lookup, - &local->loc, local->xattr_req); - } - return 0; - } - - /* */ - cached_subvol = get_switch_matching_subvol (loc->path, conf, - hashed_subvol); - if (cached_subvol == hashed_subvol) { - STACK_WIND (frame, dht_lookup_cbk, - hashed_subvol, - hashed_subvol->fops->lookup, - loc, local->xattr_req); - } else { - STACK_WIND (frame, switch_local_lookup_cbk, - cached_subvol, - cached_subvol->fops->lookup, - loc, local->xattr_req); - } + STACK_WIND_COOKIE(frame, switch_local_lookup_cbk, cached_subvol, + cached_subvol, cached_subvol->fops->lookup, loc, + local->xattr_req); } + } - return 0; + return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (lookup, frame, -1, op_errno, - NULL, NULL, NULL, NULL); - return 0; + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND(lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL); + return 0; } int -switch_create_linkfile_create_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int op_ret, int op_errno, - inode_t *inode, struct iatt *stbuf, - struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) +switch_create_linkfile_create_cbk(call_frame_t *frame, void *cookie, + xlator_t *this, int op_ret, int op_errno, + inode_t *inode, struct iatt *stbuf, + struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - dht_local_t *local = NULL; + dht_local_t *local = NULL; - local = frame->local; + local = frame->local; - if (op_ret == -1) - goto err; + if (op_ret == -1) + goto err; - STACK_WIND (frame, dht_create_cbk, - local->cached_subvol, local->cached_subvol->fops->create, - &local->loc, local->flags, local->mode, local->umask, - local->fd, local->params); + STACK_WIND_COOKIE(frame, dht_create_cbk, local->cached_subvol, + local->cached_subvol, local->cached_subvol->fops->create, + &local->loc, local->flags, local->mode, local->umask, + local->fd, local->params); - return 0; + return 0; err: - DHT_STACK_UNWIND (create, frame, -1, op_errno, - NULL, NULL, NULL, NULL, NULL, NULL); - return 0; + DHT_STACK_UNWIND(create, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL, + NULL); + return 0; } int -switch_create (call_frame_t *frame, xlator_t *this, - loc_t *loc, int32_t flags, mode_t mode, - mode_t umask, fd_t *fd, dict_t *params) +switch_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + mode_t mode, mode_t umask, fd_t *fd, dict_t *params) { - dht_local_t *local = NULL; - dht_conf_t *conf = NULL; - xlator_t *subvol = NULL; - xlator_t *avail_subvol = NULL; - int op_errno = -1; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - - conf = this->private; - - dht_get_du_info (frame, this, loc); - - local = dht_local_init (frame, loc, fd, GF_FOP_CREATE); - if (!local) { - op_errno = ENOMEM; - goto err; - } - - subvol = dht_subvol_get_hashed (this, loc); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no subvolume in layout for path=%s", - loc->path); - op_errno = ENOENT; - goto err; - } - - avail_subvol = get_switch_matching_subvol (loc->path, conf, subvol); - if (dht_is_subvol_filled (this, avail_subvol)) { - avail_subvol = - dht_free_disk_available_subvol (this, avail_subvol, - local); - } - - if (subvol != avail_subvol) { - /* create a link file instead of actual file */ - local->mode = mode; - local->flags = flags; - local->umask = umask; - local->cached_subvol = avail_subvol; - dht_linkfile_create (frame, switch_create_linkfile_create_cbk, - this, avail_subvol, subvol, loc); - return 0; - } + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + xlator_t *subvol = NULL; + xlator_t *avail_subvol = NULL; + int op_errno = -1; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(loc, err); + + conf = this->private; + + dht_get_du_info(frame, this, loc); + + local = dht_local_init(frame, loc, fd, GF_FOP_CREATE); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + subvol = dht_subvol_get_hashed(this, loc); + if (!subvol) { + gf_msg_debug(this->name, 0, "no subvolume in layout for path=%s", + loc->path); + op_errno = ENOENT; + goto err; + } + + avail_subvol = get_switch_matching_subvol(loc->path, conf, subvol); + if (dht_is_subvol_filled(this, avail_subvol)) { + avail_subvol = dht_free_disk_available_subvol(this, avail_subvol, + local); + } + + if (subvol != avail_subvol) { + /* create a link file instead of actual file */ + local->mode = mode; + local->flags = flags; + local->umask = umask; + local->cached_subvol = avail_subvol; + dht_linkfile_create(frame, switch_create_linkfile_create_cbk, this, + avail_subvol, subvol, loc); + return 0; + } - gf_log (this->name, GF_LOG_TRACE, - "creating %s on %s", loc->path, subvol->name); + gf_msg_trace(this->name, 0, "creating %s on %s", loc->path, subvol->name); - STACK_WIND (frame, dht_create_cbk, - subvol, subvol->fops->create, - loc, flags, mode, umask, fd, params); + STACK_WIND_COOKIE(frame, dht_create_cbk, subvol, subvol, + subvol->fops->create, loc, flags, mode, umask, fd, + params); - return 0; + return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (create, frame, -1, op_errno, - NULL, NULL, NULL, NULL, NULL, NULL); + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND(create, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL, + NULL); - return 0; + return 0; } int -switch_mknod_linkfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, inode_t *inode, - struct iatt *stbuf, struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) +switch_mknod_linkfile_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, + struct iatt *stbuf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - dht_local_t *local = NULL; + dht_local_t *local = NULL; - local = frame->local; - if (!local || !local->cached_subvol) { - op_errno = EINVAL; - op_ret = -1; - goto err; - } + local = frame->local; + if (!local || !local->cached_subvol) { + op_errno = EINVAL; + op_ret = -1; + goto err; + } - if (op_ret >= 0) { - STACK_WIND_COOKIE (frame, dht_newfile_cbk, - (void *)local->cached_subvol, local->cached_subvol, - local->cached_subvol->fops->mknod, - &local->loc, local->mode, local->rdev, - local->umask, local->params); + if (op_ret >= 0) { + STACK_WIND_COOKIE( + frame, dht_newfile_cbk, (void *)local->cached_subvol, + local->cached_subvol, local->cached_subvol->fops->mknod, + &local->loc, local->mode, local->rdev, local->umask, local->params); - return 0; - } -err: - DHT_STACK_UNWIND (link, frame, op_ret, op_errno, - inode, stbuf, preparent, postparent, xdata); return 0; + } +err: + DHT_STACK_UNWIND(link, frame, op_ret, op_errno, inode, stbuf, preparent, + postparent, xdata); + return 0; } - int -switch_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, - dev_t rdev, mode_t umask, dict_t *params) +switch_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + dev_t rdev, mode_t umask, dict_t *params) { - dht_local_t *local = NULL; - dht_conf_t *conf = NULL; - xlator_t *subvol = NULL; - xlator_t *avail_subvol = NULL; - int op_errno = -1; - - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); - VALIDATE_OR_GOTO (loc, err); - - conf = this->private; - - dht_get_du_info (frame, this, loc); - - local = dht_local_init (frame, loc, NULL, GF_FOP_MKNOD); - if (!local) { - op_errno = ENOMEM; - goto err; - } - - subvol = dht_subvol_get_hashed (this, loc); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no subvolume in layout for path=%s", - loc->path); - op_errno = ENOENT; - goto err; - } - - /* Consider the disksize in consideration */ - avail_subvol = get_switch_matching_subvol (loc->path, conf, subvol); - if (dht_is_subvol_filled (this, avail_subvol)) { - avail_subvol = - dht_free_disk_available_subvol (this, avail_subvol, - local); - } - - if (avail_subvol != subvol) { - /* Create linkfile first */ - - local->params = dict_ref (params); - local->mode = mode; - local->umask = umask; - local->rdev = rdev; - local->cached_subvol = avail_subvol; + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + xlator_t *subvol = NULL; + xlator_t *avail_subvol = NULL; + int op_errno = -1; + + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); + VALIDATE_OR_GOTO(loc, err); + + conf = this->private; + + dht_get_du_info(frame, this, loc); + + local = dht_local_init(frame, loc, NULL, GF_FOP_MKNOD); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + subvol = dht_subvol_get_hashed(this, loc); + if (!subvol) { + gf_msg_debug(this->name, 0, "no subvolume in layout for path=%s", + loc->path); + op_errno = ENOENT; + goto err; + } + + /* Consider the disksize in consideration */ + avail_subvol = get_switch_matching_subvol(loc->path, conf, subvol); + if (dht_is_subvol_filled(this, avail_subvol)) { + avail_subvol = dht_free_disk_available_subvol(this, avail_subvol, + local); + } + + if (avail_subvol != subvol) { + /* Create linkfile first */ + + local->params = dict_ref(params); + local->mode = mode; + local->umask = umask; + local->rdev = rdev; + local->cached_subvol = avail_subvol; + + dht_linkfile_create(frame, switch_mknod_linkfile_cbk, this, + avail_subvol, subvol, loc); + return 0; + } - dht_linkfile_create (frame, switch_mknod_linkfile_cbk, - this, avail_subvol, subvol, loc); - return 0; - } + gf_msg_trace(this->name, 0, "creating %s on %s", loc->path, subvol->name); - gf_log (this->name, GF_LOG_TRACE, - "creating %s on %s", loc->path, subvol->name); + STACK_WIND_COOKIE(frame, dht_newfile_cbk, (void *)subvol, subvol, + subvol->fops->mknod, loc, mode, rdev, umask, params); - STACK_WIND_COOKIE (frame, dht_newfile_cbk, (void *)subvol, subvol, - subvol->fops->mknod, loc, mode, rdev, umask, - params); - - return 0; + return 0; err: - op_errno = (op_errno == -1) ? errno : op_errno; - DHT_STACK_UNWIND (mknod, frame, -1, op_errno, - NULL, NULL, NULL, NULL, NULL); + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND(mknod, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL); - return 0; + return 0; } - void -switch_fini (xlator_t *this) +switch_fini(xlator_t *this) { - dht_conf_t *conf = NULL; - struct switch_struct *trav = NULL; - struct switch_struct *prev = NULL; - - conf = this->private; - - if (conf) { - trav = (struct switch_struct *)conf->private; - conf->private = NULL; - while (trav) { - GF_FREE (trav->array); - prev = trav; - trav = trav->next; - GF_FREE (prev); - } + dht_conf_t *conf = NULL; + struct switch_struct *trav = NULL; + struct switch_struct *prev = NULL; + + conf = this->private; + + if (conf) { + trav = (struct switch_struct *)conf->private; + conf->private = NULL; + while (trav) { + GF_FREE(trav->array); + prev = trav; + trav = trav->next; + GF_FREE(prev); } + } - dht_fini(this); + dht_fini(this); } int -set_switch_pattern (xlator_t *this, dht_conf_t *conf, - const char *pattern_str) +set_switch_pattern(xlator_t *this, dht_conf_t *conf, const char *pattern_str) { - int flag = 0; - int idx = 0; - int index = 0; - int child_count = 0; - char *tmp = NULL; - char *tmp1 = NULL; - char *child = NULL; - char *tmp_str = NULL; - char *tmp_str1 = NULL; - char *dup_str = NULL; - char *dup_childs = NULL; - char *switch_str = NULL; - char *pattern = NULL; - char *childs = NULL; - char *option_string = NULL; - struct switch_struct *switch_buf = NULL; - struct switch_struct *switch_opt = NULL; - struct switch_struct *trav = NULL; - struct switch_sched_array *switch_buf_array = NULL; - xlator_list_t *trav_xl = NULL; - - trav_xl = this->children; - while (trav_xl) { - index++; - trav_xl = trav_xl->next; + int flag = 0; + int idx = 0; + int index = 0; + int child_count = 0; + char *tmp = NULL; + char *tmp1 = NULL; + char *child = NULL; + char *tmp_str = NULL; + char *tmp_str1 = NULL; + char *dup_str = NULL; + char *dup_childs = NULL; + char *switch_str = NULL; + char *pattern = NULL; + char *childs = NULL; + char *option_string = NULL; + size_t pattern_length; + struct switch_struct *switch_buf = NULL; + struct switch_struct *switch_opt = NULL; + struct switch_struct *trav = NULL; + struct switch_sched_array *switch_buf_array = NULL; + xlator_list_t *trav_xl = NULL; + + trav_xl = this->children; + while (trav_xl) { + index++; + trav_xl = trav_xl->next; + } + child_count = index; + switch_buf_array = GF_CALLOC((index + 1), sizeof(struct switch_sched_array), + gf_switch_mt_switch_sched_array); + if (!switch_buf_array) + goto err; + + trav_xl = this->children; + index = 0; + + while (trav_xl) { + switch_buf_array[index].xl = trav_xl->xlator; + switch_buf_array[index].eligible = 1; + trav_xl = trav_xl->next; + index++; + } + + /* *jpg:child1,child2;*mpg:child3;*:child4,child5,child6 */ + + /* Get the pattern for considering switch case. + "option block-size *avi:10MB" etc */ + option_string = gf_strdup(pattern_str); + if (option_string == NULL) { + goto err; + } + switch_str = strtok_r(option_string, ";", &tmp_str); + while (switch_str) { + dup_str = gf_strdup(switch_str); + if (dup_str == NULL) { + goto err; } - child_count = index; - switch_buf_array = GF_CALLOC ((index + 1), - sizeof (struct switch_sched_array), - gf_switch_mt_switch_sched_array); - if (!switch_buf_array) - goto err; - - trav_xl = this->children; - index = 0; - - while (trav_xl) { - switch_buf_array[index].xl = trav_xl->xlator; - switch_buf_array[index].eligible = 1; - trav_xl = trav_xl->next; - index++; + switch_opt = GF_CALLOC(1, sizeof(struct switch_struct), + gf_switch_mt_switch_struct); + if (!switch_opt) { + GF_FREE(dup_str); + goto err; } - /* *jpg:child1,child2;*mpg:child3;*:child4,child5,child6 */ - - /* Get the pattern for considering switch case. - "option block-size *avi:10MB" etc */ - option_string = gf_strdup (pattern_str); - switch_str = strtok_r (option_string, ";", &tmp_str); - while (switch_str) { - dup_str = gf_strdup (switch_str); - switch_opt = GF_CALLOC (1, sizeof (struct switch_struct), - gf_switch_mt_switch_struct); - if (!switch_opt) { - GF_FREE (dup_str); - goto err; - } + pattern = strtok_r(dup_str, ":", &tmp_str1); + childs = strtok_r(NULL, ":", &tmp_str1); + if (strncmp(pattern, "*", 2) == 0) { + gf_msg("switch", GF_LOG_INFO, 0, DHT_MSG_SWITCH_PATTERN_INFO, + "'*' pattern will be taken by default " + "for all the unconfigured child nodes," + " hence neglecting current option"); + switch_str = strtok_r(NULL, ";", &tmp_str); + GF_FREE(switch_opt); + switch_opt = NULL; + GF_FREE(dup_str); + continue; + } + GF_FREE(dup_str); + + pattern_length = strlen(pattern); + if (pattern_length >= (sizeof(switch_opt->path_pattern))) { + gf_msg(this->name, GF_LOG_ERROR, 0, + DHT_MSG_SET_SWITCH_PATTERN_ERROR, "Pattern (%s) too long", + pattern); + goto err; + } + memcpy(switch_opt->path_pattern, pattern, pattern_length); + switch_opt->path_pattern[pattern_length] = '\0'; - pattern = strtok_r (dup_str, ":", &tmp_str1); - childs = strtok_r (NULL, ":", &tmp_str1); - if (strncmp (pattern, "*", 2) == 0) { - gf_log ("switch", GF_LOG_INFO, - "'*' pattern will be taken by default " - "for all the unconfigured child nodes," - " hence neglecting current option"); - switch_str = strtok_r (NULL, ";", &tmp_str); - GF_FREE (switch_opt); - GF_FREE (dup_str); - continue; - } - GF_FREE (dup_str); - memcpy (switch_opt->path_pattern, pattern, strlen (pattern)); - if (childs) { - dup_childs = gf_strdup (childs); - child = strtok_r (dup_childs, ",", &tmp); - while (child) { - if (gf_switch_valid_child (this, child)) { - idx++; - child = strtok_r (NULL, ",", &tmp); - } else { - gf_log (this->name, GF_LOG_ERROR, - "%s is not a subvolume of %s. " - "pattern can only be scheduled " - "only to a subvolume of %s", - child, this->name, this->name); - goto err; - } - } - GF_FREE (dup_childs); - child = strtok_r (childs, ",", &tmp1); - switch_opt->num_child = idx; - switch_opt->array = GF_CALLOC (1, (idx * - sizeof (struct switch_sched_array)), - gf_switch_mt_switch_sched_array); - if (!switch_opt->array) - goto err; - idx = 0; - while (child) { - for (index = 0; index < child_count; index++) { - if (strcmp (switch_buf_array[index].xl->name, - child) == 0) { - gf_log ("switch", GF_LOG_DEBUG, - "'%s' pattern will be " - "scheduled to \"%s\"", - switch_opt->path_pattern, child); - /* - if (switch_buf_array[index-1].considered) { - gf_log ("switch", GF_LOG_DEBUG, - "ambiguity found, exiting"); - return -1; - } - */ - switch_opt->array[idx].xl = switch_buf_array[index].xl; - switch_buf_array[index].considered = 1; - idx++; - break; - } - } - child = strtok_r (NULL, ",", &tmp1); - } + if (childs) { + dup_childs = gf_strdup(childs); + if (dup_childs == NULL) { + goto err; + } + child = strtok_r(dup_childs, ",", &tmp); + while (child) { + if (gf_switch_valid_child(this, child)) { + idx++; + child = strtok_r(NULL, ",", &tmp); } else { - /* error */ - gf_log ("switch", GF_LOG_ERROR, - "Check \"scheduler.switch.case\" " - "option in unify volume. Exiting"); - goto err; + gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SUBVOL_ERROR, + "%s is not a subvolume of %s. " + "pattern can only be scheduled " + "only to a subvolume of %s", + child, this->name, this->name); + GF_FREE(dup_childs); + goto err; } - - /* Link it to the main structure */ - if (switch_buf) { - /* there are already few entries */ - trav = switch_buf; - while (trav->next) - trav = trav->next; - trav->next = switch_opt; - } else { - /* First entry */ - switch_buf = switch_opt; + } + GF_FREE(dup_childs); + child = strtok_r(childs, ",", &tmp1); + switch_opt->num_child = idx; + switch_opt->array = GF_CALLOC( + 1, (idx * sizeof(struct switch_sched_array)), + gf_switch_mt_switch_sched_array); + if (!switch_opt->array) + goto err; + idx = 0; + while (child) { + for (index = 0; index < child_count; index++) { + if (strcmp(switch_buf_array[index].xl->name, child) == 0) { + gf_msg_debug("switch", 0, + "'%s' pattern will be " + "scheduled to \"%s\"", + switch_opt->path_pattern, child); + /* + if (switch_buf_array[index-1].considered) { + gf_msg_debug ("switch", 0, + "ambiguity found, exiting"); + return -1; + } + */ + switch_opt->array[idx].xl = switch_buf_array[index].xl; + switch_buf_array[index].considered = 1; + idx++; + break; + } } - switch_opt = NULL; - switch_str = strtok_r (NULL, ";", &tmp_str); + child = strtok_r(NULL, ",", &tmp1); + } + } else { + /* error */ + gf_msg("switch", GF_LOG_ERROR, 0, DHT_MSG_SET_SWITCH_PATTERN_ERROR, + "Check \"scheduler.switch.case\" " + "option in unify volume. Exiting"); + goto err; } - /* Now, all the pattern based considerations done, so for all the - * remaining pattern, '*' to all the remaining child nodes - */ - { - for (index=0; index < child_count; index++) { - /* check for considered flag */ - if (switch_buf_array[index].considered) - continue; - flag++; - } - if (!flag) { - gf_log ("switch", GF_LOG_ERROR, - "No nodes left for pattern '*'. Exiting"); - goto err; - } - switch_opt = GF_CALLOC (1, sizeof (struct switch_struct), - gf_switch_mt_switch_struct); - if (!switch_opt) - goto err; - - /* Add the '*' pattern to the array */ - memcpy (switch_opt->path_pattern, "*", 2); - switch_opt->num_child = flag; - switch_opt->array = - GF_CALLOC (1, - flag * sizeof (struct switch_sched_array), - gf_switch_mt_switch_sched_array); - if (!switch_opt->array) - goto err; - flag = 0; - for (index=0; index < child_count; index++) { - /* check for considered flag */ - if (switch_buf_array[index].considered) - continue; - gf_log ("switch", GF_LOG_DEBUG, - "'%s' pattern will be scheduled to \"%s\"", - switch_opt->path_pattern, - switch_buf_array[index].xl->name); - switch_opt->array[flag].xl = - switch_buf_array[index].xl; - switch_buf_array[index].considered = 1; - flag++; - } - if (switch_buf) { - /* there are already few entries */ - trav = switch_buf; - while (trav->next) - trav = trav->next; - trav->next = switch_opt; - } else { - /* First entry */ - switch_buf = switch_opt; - } - switch_opt = NULL; + /* Link it to the main structure */ + if (switch_buf) { + /* there are already few entries */ + trav = switch_buf; + while (trav->next) + trav = trav->next; + trav->next = switch_opt; + } else { + /* First entry */ + switch_buf = switch_opt; + } + switch_opt = NULL; + switch_str = strtok_r(NULL, ";", &tmp_str); + } + + /* Now, all the pattern based considerations done, so for all the + * remaining pattern, '*' to all the remaining child nodes + */ + { + for (index = 0; index < child_count; index++) { + /* check for considered flag */ + if (switch_buf_array[index].considered) + continue; + flag++; } - /* */ - conf->private = switch_buf; + if (!flag) { + gf_msg("switch", GF_LOG_ERROR, 0, DHT_MSG_SET_SWITCH_PATTERN_ERROR, + "No nodes left for pattern '*'. Exiting"); + goto err; + } + switch_opt = GF_CALLOC(1, sizeof(struct switch_struct), + gf_switch_mt_switch_struct); + if (!switch_opt) + goto err; + + /* Add the '*' pattern to the array */ + memcpy(switch_opt->path_pattern, "*", 2); + switch_opt->num_child = flag; + switch_opt->array = GF_CALLOC(1, + flag * sizeof(struct switch_sched_array), + gf_switch_mt_switch_sched_array); + if (!switch_opt->array) + goto err; + flag = 0; + for (index = 0; index < child_count; index++) { + /* check for considered flag */ + if (switch_buf_array[index].considered) + continue; + gf_msg_debug("switch", 0, + "'%s'" + " pattern will be scheduled to \"%s\"", + switch_opt->path_pattern, + switch_buf_array[index].xl->name); + + switch_opt->array[flag].xl = switch_buf_array[index].xl; + switch_buf_array[index].considered = 1; + flag++; + } + if (switch_buf) { + /* there are already few entries */ + trav = switch_buf; + while (trav->next) + trav = trav->next; + trav->next = switch_opt; + } else { + /* First entry */ + switch_buf = switch_opt; + } + switch_opt = NULL; + } + /* */ + conf->private = switch_buf; - return 0; + GF_FREE(option_string); + return 0; err: - GF_FREE (switch_buf_array); - GF_FREE (switch_opt); + GF_FREE(switch_buf_array); + GF_FREE(switch_opt); + GF_FREE(option_string); - if (switch_buf) { - trav = switch_buf; - while (trav) { - GF_FREE (trav->array); - switch_opt = trav; - trav = trav->next; - GF_FREE (switch_opt); - } + if (switch_buf) { + trav = switch_buf; + while (trav) { + GF_FREE(trav->array); + switch_opt = trav; + trav = trav->next; + GF_FREE(switch_opt); } - return -1; + } + return -1; } - int32_t -switch_init (xlator_t *this) +switch_init(xlator_t *this) { - dht_conf_t *conf = NULL; - data_t *data = NULL; - int ret = -1; + dht_conf_t *conf = NULL; + data_t *data = NULL; + int ret = -1; + + ret = dht_init(this); + if (ret) { + return ret; + } + conf = this->private; - ret = dht_init(this); + data = dict_get(this->options, "pattern.switch.case"); + if (data) { + /* TODO: */ + ret = set_switch_pattern(this, conf, data->data); if (ret) { - return ret; - } - conf = this->private; - - data = dict_get (this->options, "pattern.switch.case"); - if (data) { - /* TODO: */ - ret = set_switch_pattern (this, conf, data->data); - if (ret) { - goto err; - } + goto err; } + } - this->private = conf; - return 0; + this->private = conf; + return 0; err: - dht_fini(this); - return -1; + dht_fini(this); + return -1; } - -class_methods_t class_methods = { - .init = switch_init, - .fini = switch_fini, - .reconfigure = dht_reconfigure, - .notify = dht_notify -}; - - struct xlator_fops fops = { - .lookup = switch_lookup, - .create = switch_create, - .mknod = switch_mknod, - - .stat = dht_stat, - .fstat = dht_fstat, - .truncate = dht_truncate, - .ftruncate = dht_ftruncate, - .access = dht_access, - .readlink = dht_readlink, - .setxattr = dht_setxattr, - .getxattr = dht_getxattr, - .removexattr = dht_removexattr, - .open = dht_open, - .readv = dht_readv, - .writev = dht_writev, - .flush = dht_flush, - .fsync = dht_fsync, - .statfs = dht_statfs, - .lk = dht_lk, - .opendir = dht_opendir, - .readdir = dht_readdir, - .readdirp = dht_readdirp, - .fsyncdir = dht_fsyncdir, - .symlink = dht_symlink, - .unlink = dht_unlink, - .link = dht_link, - .mkdir = dht_mkdir, - .rmdir = dht_rmdir, - .rename = dht_rename, - .inodelk = dht_inodelk, - .finodelk = dht_finodelk, - .entrylk = dht_entrylk, - .fentrylk = dht_fentrylk, - .xattrop = dht_xattrop, - .fxattrop = dht_fxattrop, - .setattr = dht_setattr, + .lookup = switch_lookup, + .create = switch_create, + .mknod = switch_mknod, + + .stat = dht_stat, + .fstat = dht_fstat, + .truncate = dht_truncate, + .ftruncate = dht_ftruncate, + .access = dht_access, + .readlink = dht_readlink, + .setxattr = dht_setxattr, + .getxattr = dht_getxattr, + .removexattr = dht_removexattr, + .open = dht_open, + .readv = dht_readv, + .writev = dht_writev, + .flush = dht_flush, + .fsync = dht_fsync, + .statfs = dht_statfs, + .lk = dht_lk, + .opendir = dht_opendir, + .readdir = dht_readdir, + .readdirp = dht_readdirp, + .fsyncdir = dht_fsyncdir, + .symlink = dht_symlink, + .unlink = dht_unlink, + .link = dht_link, + .mkdir = dht_mkdir, + .rmdir = dht_rmdir, + .rename = dht_rename, + .inodelk = dht_inodelk, + .finodelk = dht_finodelk, + .entrylk = dht_entrylk, + .fentrylk = dht_fentrylk, + .xattrop = dht_xattrop, + .fxattrop = dht_fxattrop, + .setattr = dht_setattr, }; - -struct xlator_cbks cbks = { - .forget = dht_forget +struct xlator_cbks cbks = {.forget = dht_forget}; +extern int32_t +mem_acct_init(xlator_t *this); + +xlator_api_t xlator_api = { + .init = switch_init, + .fini = switch_fini, + .notify = dht_notify, + .reconfigure = dht_reconfigure, + .mem_acct_init = mem_acct_init, + .op_version = {1}, /* Present from the initial version */ + .fops = &fops, + .cbks = &cbks, + .options = dht_options, + .identifier = "switch", + .category = GF_TECH_PREVIEW, }; diff --git a/xlators/cluster/dht/src/unittest/dht_layout_mock.c b/xlators/cluster/dht/src/unittest/dht_layout_mock.c index aa19ddc575d..771452963d1 100644 --- a/xlators/cluster/dht/src/unittest/dht_layout_mock.c +++ b/xlators/cluster/dht/src/unittest/dht_layout_mock.c @@ -7,57 +7,67 @@ later), or the GNU General Public License, version 2 (GPLv2), in all cases as published by the Free Software Foundation. */ -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" -#include "xlator.h" +#include <glusterfs/glusterfs.h> +#include <glusterfs/xlator.h> #include "dht-common.h" -#include "byte-order.h" +#include <glusterfs/byte-order.h> int -dht_hash_compute (xlator_t *this, int type, const char *name, uint32_t *hash_p) +dht_hash_compute(xlator_t *this, int type, const char *name, uint32_t *hash_p) { return 0; } int -dht_inode_ctx_layout_get (inode_t *inode, xlator_t *this, dht_layout_t **layout) +dht_inode_ctx_layout_get(inode_t *inode, xlator_t *this, dht_layout_t **layout) { return 0; } int -dht_inode_ctx_layout_set (inode_t *inode, xlator_t *this, - dht_layout_t *layout_int) +dht_inode_ctx_layout_set(inode_t *inode, xlator_t *this, + dht_layout_t *layout_int) { return 0; } int -dict_get_ptr (dict_t *this, char *key, void **ptr) +dict_get_ptr(dict_t *this, char *key, void **ptr) { return 0; } int -dict_get_ptr_and_len (dict_t *this, char *key, void **ptr, int *len) +dict_get_ptr_and_len(dict_t *this, char *key, void **ptr, int *len) { return 0; } -int _gf_log (const char *domain, const char *file, - const char *function, int32_t line, gf_loglevel_t level, - const char *fmt, ...) +int +_gf_log(const char *domain, const char *file, const char *function, + int32_t line, gf_loglevel_t level, const char *fmt, ...) { return 0; } -int _gf_log_callingfn (const char *domain, const char *file, - const char *function, int32_t line, gf_loglevel_t level, - const char *fmt, ...) +int +_gf_log_callingfn(const char *domain, const char *file, const char *function, + int32_t line, gf_loglevel_t level, const char *fmt, ...) +{ + return 0; +} + +void +gf_uuid_unparse(const uuid_t uu, char *out) +{ + // could call a will-return function here + // to place the correct data in *out +} + +int +_gf_msg(const char *domain, const char *file, const char *function, + int32_t line, gf_loglevel_t level, int errnum, int trace, + uint64_t msgid, const char *fmt, ...) { return 0; } diff --git a/xlators/cluster/dht/src/unittest/dht_layout_unittest.c b/xlators/cluster/dht/src/unittest/dht_layout_unittest.c index b5233d235d0..c94a1d0a2e1 100644 --- a/xlators/cluster/dht/src/unittest/dht_layout_unittest.c +++ b/xlators/cluster/dht/src/unittest/dht_layout_unittest.c @@ -9,15 +9,15 @@ */ #include "dht-common.h" -#include "logging.h" -#include "xlator.h" +#include <glusterfs/logging.h> +#include <glusterfs/xlator.h> +#include <inttypes.h> #include <stdarg.h> #include <stddef.h> #include <setjmp.h> -#include <inttypes.h> -#include <cmockery/pbc.h> -#include <cmockery/cmockery.h> +#include <cmocka_pbc.h> +#include <cmocka.h> /* * Helper functions @@ -33,16 +33,17 @@ helper_xlator_init(uint32_t num_types) xl = test_calloc(1, sizeof(xlator_t)); assert_non_null(xl); - xl->mem_acct.num_types = num_types; - xl->mem_acct.rec = test_calloc(num_types, sizeof(struct mem_acct_rec)); - assert_non_null(xl->mem_acct.rec); + xl->mem_acct->num_types = num_types; + xl->mem_acct = test_calloc(sizeof(struct mem_acct) + + sizeof(struct mem_acct_rec) + num_types); + assert_non_null(xl->mem_acct); xl->ctx = test_calloc(1, sizeof(glusterfs_ctx_t)); assert_non_null(xl->ctx); for (i = 0; i < num_types; i++) { - ret = LOCK_INIT(&(xl->mem_acct.rec[i].lock)); - assert_false(ret); + ret = LOCK_INIT(&(xl->mem_acct.rec[i].lock)); + assert_false(ret); } ENSURE(num_types == xl->mem_acct.num_types); @@ -57,8 +58,8 @@ helper_xlator_destroy(xlator_t *xl) int i, ret; for (i = 0; i < xl->mem_acct.num_types; i++) { - ret = LOCK_DESTROY(&(xl->mem_acct.rec[i].lock)); - assert_int_equal(ret, 0); + ret = LOCK_DESTROY(&(xl->mem_acct.rec[i].lock)); + assert_int_equal(ret, 0); } free(xl->mem_acct.rec); @@ -75,7 +76,7 @@ test_dht_layout_new(void **state) { xlator_t *xl; dht_layout_t *layout; - dht_conf_t *conf; + dht_conf_t *conf; int cnt; expect_assert_failure(dht_layout_new(NULL, 0)); @@ -89,7 +90,7 @@ test_dht_layout_new(void **state) assert_non_null(layout); assert_int_equal(layout->type, DHT_HASH_TYPE_DM); assert_int_equal(layout->cnt, cnt); - assert_int_equal(layout->ref, 1); + assert_int_equal(GF_ATOMIC_GET(layout->ref), 1); assert_int_equal(layout->gen, 0); assert_int_equal(layout->spread_cnt, 0); free(layout); @@ -106,7 +107,7 @@ test_dht_layout_new(void **state) assert_non_null(layout); assert_int_equal(layout->type, DHT_HASH_TYPE_DM); assert_int_equal(layout->cnt, cnt); - assert_int_equal(layout->ref, 1); + assert_int_equal(GF_ATOMIC_GET(layout->ref), 1); assert_int_equal(layout->gen, conf->gen); assert_int_equal(layout->spread_cnt, conf->dir_spread_cnt); free(layout); @@ -115,10 +116,12 @@ test_dht_layout_new(void **state) helper_xlator_destroy(xl); } -int main(void) { - const UnitTest tests[] = { +int +main(void) +{ + const struct CMUnitTest xlator_dht_layout_tests[] = { unit_test(test_dht_layout_new), }; - return run_tests(tests, "xlator_dht_layout"); + return cmocka_run_group_tests(xlator_dht_layout_tests, NULL, NULL); } |
