diff options
Diffstat (limited to 'xlators/cluster/afr')
31 files changed, 13577 insertions, 17086 deletions
diff --git a/xlators/cluster/afr/src/Makefile.am b/xlators/cluster/afr/src/Makefile.am index e192b599b..ea5a90abb 100644 --- a/xlators/cluster/afr/src/Makefile.am +++ b/xlators/cluster/afr/src/Makefile.am @@ -1,27 +1,38 @@ xlator_LTLIBRARIES = afr.la pump.la xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster -afr_common_source = afr-dir-read.c afr-dir-write.c afr-inode-read.c afr-inode-write.c afr-open.c afr-transaction.c afr-self-heal-data.c afr-self-heal-common.c afr-self-heal-metadata.c afr-self-heal-entry.c afr-self-heal-algorithm.c afr-lk-common.c $(top_builddir)/xlators/lib/src/libxlator.c +afr_common_source = afr-dir-read.c afr-dir-write.c afr-inode-read.c \ + afr-inode-write.c afr-open.c afr-transaction.c afr-lk-common.c \ + afr-read-txn.c \ + $(top_builddir)/xlators/lib/src/libxlator.c -afr_la_LDFLAGS = -module -avoidversion -afr_la_SOURCES = $(afr_common_source) afr.c +AFR_SELFHEAL_SOURCES = afr-self-heal-common.c afr-self-heal-data.c \ + afr-self-heal-entry.c afr-self-heal-metadata.c afr-self-heald.c \ + afr-self-heal-name.c + +afr_la_LDFLAGS = -module -avoid-version +afr_la_SOURCES = $(afr_common_source) $(AFR_SELFHEAL_SOURCES) afr.c afr_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la -pump_la_LDFLAGS = -module -avoidversion -pump_la_SOURCES = $(afr_common_source) pump.c +pump_la_LDFLAGS = -module -avoid-version +pump_la_SOURCES = $(afr_common_source) $(AFR_SELFHEAL_SOURCES) pump.c pump_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la -noinst_HEADERS = afr.h afr-transaction.h afr-inode-write.h afr-inode-read.h afr-dir-read.h afr-dir-write.h afr-self-heal.h afr-self-heal-common.h afr-self-heal-algorithm.h pump.h afr-mem-types.h afr-common.c $(top_builddir)/xlators/lib/src/libxlator.h +noinst_HEADERS = afr.h afr-transaction.h afr-inode-write.h afr-inode-read.h \ + afr-dir-read.h afr-dir-write.h afr-self-heal.h afr-mem-types.h \ + afr-common.c afr-self-heald.h pump.h \ + $(top_builddir)/xlators/lib/src/libxlator.h + +AM_CPPFLAGS = $(GF_CPPFLAGS) \ + -I$(top_srcdir)/libglusterfs/src -I$(top_srcdir)/xlators/lib/src \ + -I$(top_srcdir)/rpc/rpc-lib/src -AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \ - -I$(top_srcdir)/libglusterfs/src -I$(top_srcdir)/contrib/md5 -shared -nostartfiles $(GF_CFLAGS) \ - -I$(top_srcdir)/xlators/lib/src +AM_CFLAGS = -Wall $(GF_CFLAGS) CLEANFILES = uninstall-local: rm -f $(DESTDIR)$(xlatordir)/replicate.so - rm -f $(DESTDIR)$(xlatordir)/pump.so install-data-hook: ln -sf afr.so $(DESTDIR)$(xlatordir)/replicate.so diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index 9e0328b2c..6bd231600 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2007-2010 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #include <libgen.h> @@ -44,6 +35,7 @@ #include "compat.h" #include "byte-order.h" #include "statedump.h" +#include "inode.h" #include "fd.h" @@ -53,662 +45,861 @@ #include "afr-dir-write.h" #include "afr-transaction.h" #include "afr-self-heal.h" -#include "afr-self-heal-common.h" -#include "pump.h" +#include "afr-self-heald.h" -#define AFR_ICTX_OPENDIR_DONE_MASK 0x0000000200000000ULL -#define AFR_ICTX_SPLIT_BRAIN_MASK 0x0000000100000000ULL -#define AFR_ICTX_READ_CHILD_MASK 0x00000000FFFFFFFFULL -int -afr_lookup_done_success_action (call_frame_t *frame, xlator_t *this, - gf_boolean_t fail_conflict); -void -afr_children_copy (int32_t *dst, int32_t *src, unsigned int child_count) +call_frame_t * +afr_copy_frame (call_frame_t *base) { - int i = 0; + afr_local_t *local = NULL; + call_frame_t *frame = NULL; + int op_errno = 0; + + frame = copy_frame (base); + if (!frame) + return NULL; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) { + AFR_STACK_DESTROY (frame); + return NULL; + } - for (i = 0; i < child_count; i++) - dst[i] = src[i]; + return frame; } -void -afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req, const char *path) +/* + * INODE CTX 64-bit VALUE FORMAT FOR SMALL (<= 16) SUBVOL COUNTS: + * + * |<---------- 64bit ------------>| + * 63 32 31 16 15 0 + * | EVENT_GEN | DATA | METADATA | + * + * + * METADATA (bit-0 .. bit-15): bitmap representing subvolumes from which + * metadata can be attempted to be read. + * + * bit-0 => priv->subvolumes[0] + * bit-1 => priv->subvolumes[1] + * ... etc. till bit-15 + * + * DATA (bit-16 .. bit-31): bitmap representing subvolumes from which data + * can be attempted to be read. + * + * bit-16 => priv->subvolumes[0] + * bit-17 => priv->subvolumes[1] + * ... etc. till bit-31 + * + * EVENT_GEN (bit-32 .. bit-63): event generation (i.e priv->event_generation) + * when DATA and METADATA was last updated. + * + * If EVENT_GEN is < priv->event_generation, + * or is 0, it means afr_inode_refresh() needs + * to be called to recalculate the bitmaps. + */ + +int +__afr_inode_read_subvol_get_small (inode_t *inode, xlator_t *this, + unsigned char *data, unsigned char *metadata, + int *event_p) { - int i = 0; - afr_private_t *priv = NULL; - int ret = 0; + afr_private_t *priv = NULL; + int ret = -1; + uint16_t datamap = 0; + uint16_t metadatamap = 0; + uint32_t event = 0; + uint64_t val = 0; + int i = 0; - priv = this->private; + priv = this->private; - for (i = 0; i < priv->child_count; i++) { - ret = dict_set_uint64 (xattr_req, priv->pending_key[i], - 3 * sizeof(int32_t)); - if (ret < 0) - gf_log (this->name, GF_LOG_WARNING, - "%s: Unable to set dict value for %s", - path, priv->pending_key[i]); - /* 3 = data+metadata+entry */ - } + ret = __inode_ctx_get (inode, this, &val); + if (ret < 0) + return ret; + + metadatamap = (val & 0x000000000000ffff); + datamap = (val & 0x00000000ffff0000) >> 16; + event = (val & 0xffffffff00000000) >> 32; + + for (i = 0; i < priv->child_count; i++) { + if (metadata) + metadata[i] = (metadatamap >> i) & 1; + if (data) + data[i] = (datamap >> i) & 1; + } + + if (event_p) + *event_p = event; + return ret; } + int -afr_errno_count (int32_t *children, int *child_errno, - unsigned int child_count, int32_t op_errno) +__afr_inode_read_subvol_set_small (inode_t *inode, xlator_t *this, + unsigned char *data, unsigned char *metadata, + int event) { - int i = 0; - int errno_count = 0; - int child = 0; + afr_private_t *priv = NULL; + uint16_t datamap = 0; + uint16_t metadatamap = 0; + uint64_t val = 0; + int i = 0; - for (i = 0; i < child_count; i++) { - if (children) { - child = children[i]; - if (child == -1) - break; - } else { - child = i; - } - if (child_errno[child] == op_errno) - errno_count++; - } - return errno_count; + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (data[i]) + datamap |= (1 << i); + if (metadata[i]) + metadatamap |= (1 << i); + } + + val = ((uint64_t) metadatamap) | + (((uint64_t) datamap) << 16) | + (((uint64_t) event) << 32); + + return __inode_ctx_set (inode, this, &val); } -int32_t -afr_set_dict_gfid (dict_t *dict, uuid_t gfid) + +int +__afr_inode_read_subvol_reset_small (inode_t *inode, xlator_t *this) { - int ret = 0; - uuid_t *pgfid = NULL; + int ret = -1; + uint16_t datamap = 0; + uint16_t metadatamap = 0; + uint32_t event = 0; + uint64_t val = 0; - GF_ASSERT (gfid); + ret = __inode_ctx_get (inode, this, &val); + (void) ret; - pgfid = GF_CALLOC (1, sizeof (uuid_t), gf_common_mt_char); - if (!pgfid) { - ret = -1; - goto out; - } + metadatamap = (val & 0x000000000000ffff) >> 0; + datamap = (val & 0x00000000ffff0000) >> 16; + event = 0; - uuid_copy (*pgfid, gfid); + val = ((uint64_t) metadatamap) | + (((uint64_t) datamap) << 16) | + (((uint64_t) event) << 32); - ret = dict_set_dynptr (dict, "gfid-req", pgfid, sizeof (uuid_t)); - if (ret) - gf_log (THIS->name, GF_LOG_DEBUG, "gfid set failed"); + return __inode_ctx_set (inode, this, &val); +} -out: - if (ret && pgfid) - GF_FREE (pgfid); - return ret; +int +__afr_inode_read_subvol_get (inode_t *inode, xlator_t *this, + unsigned char *data, unsigned char *metadata, + int *event_p) +{ + afr_private_t *priv = NULL; + int ret = -1; + + priv = this->private; + + if (priv->child_count <= 16) + ret = __afr_inode_read_subvol_get_small (inode, this, data, + metadata, event_p); + else + /* TBD: allocate structure with array and read from it */ + ret = -1; + + return ret; } -afr_inode_ctx_t* -afr_inode_ctx_get_from_addr (uint64_t addr, int32_t child_count) + +int +__afr_inode_read_subvol_set (inode_t *inode, xlator_t *this, unsigned char *data, + unsigned char *metadata, int event) { - int ret = -1; - afr_inode_ctx_t *ctx = NULL; - size_t size = 0; + afr_private_t *priv = NULL; + int ret = -1; - GF_ASSERT (child_count > 0); + priv = this->private; - if (!addr) { - ctx = GF_CALLOC (1, sizeof (*ctx), - gf_afr_mt_inode_ctx_t); - if (!ctx) - goto out; - size = sizeof (*ctx->fresh_children); - ctx->fresh_children = GF_CALLOC (child_count, size, - gf_afr_mt_int32_t); - if (!ctx->fresh_children) - goto out; - } else { - ctx = (afr_inode_ctx_t*) (long) addr; - } - ret = 0; -out: - if (ret && ctx) { - if (ctx->fresh_children) - GF_FREE (ctx->fresh_children); - GF_FREE (ctx); - ctx = NULL; - } - return ctx; + if (priv->child_count <= 16) + ret = __afr_inode_read_subvol_set_small (inode, this, data, + metadata, event); + else + ret = -1; + + return ret; } -void -afr_inode_get_ctx (xlator_t *this, inode_t *inode, afr_inode_params_t *params) + +int +__afr_inode_read_subvol_reset (inode_t *inode, xlator_t *this) { - GF_ASSERT (inode); - GF_ASSERT (params); + afr_private_t *priv = NULL; + int ret = -1; - int ret = 0; - afr_inode_ctx_t *ctx = NULL; - afr_private_t *priv = NULL; - int i = 0; - uint64_t ctx_addr = 0; - int32_t read_child = -1; - int32_t *fresh_children = NULL; + priv = this->private; - priv = this->private; - LOCK (&inode->lock); - { - ret = __inode_ctx_get (inode, this, &ctx_addr); - if (ret < 0) - goto unlock; - ctx = afr_inode_ctx_get_from_addr (ctx_addr, priv->child_count); - if (!ctx) - goto unlock; - switch (params->mask_type) { - case AFR_ICTX_READ_CHILD_MASK: - fresh_children = params->u.read_ctx.fresh_children; - read_child = (int32_t)(ctx->masks & - AFR_ICTX_READ_CHILD_MASK); - params->u.read_ctx.read_child = read_child; - if (!fresh_children) - goto unlock; - for (i = 0; i < priv->child_count; i++) - fresh_children[i] = ctx->fresh_children[i]; - break; - case AFR_ICTX_OPENDIR_DONE_MASK: - params->u.value = ctx->masks & - AFR_ICTX_OPENDIR_DONE_MASK; - break; - case AFR_ICTX_SPLIT_BRAIN_MASK: - params->u.value = ctx->masks & AFR_ICTX_SPLIT_BRAIN_MASK; - break; - } - } -unlock: - UNLOCK (&inode->lock); + if (priv->child_count <= 16) + ret = __afr_inode_read_subvol_reset_small (inode, this); + else + ret = -1; + + return ret; } -uint64_t -afr_is_split_brain (xlator_t *this, inode_t *inode) + +int +afr_inode_read_subvol_get (inode_t *inode, xlator_t *this, unsigned char *data, + unsigned char *metadata, int *event_p) { - afr_inode_params_t params = {0}; + int ret = -1; + + LOCK(&inode->lock); + { + ret = __afr_inode_read_subvol_get (inode, this, data, + metadata, event_p); + } + UNLOCK(&inode->lock); - params.mask_type = AFR_ICTX_SPLIT_BRAIN_MASK; - afr_inode_get_ctx (this, inode, ¶ms); - return params.u.value; + return ret; } -gf_boolean_t -afr_is_opendir_done (xlator_t *this, inode_t *inode) + +int +afr_inode_read_subvol_set (inode_t *inode, xlator_t *this, unsigned char *data, + unsigned char *metadata, int event) { - afr_inode_params_t params = {0}; + int ret = -1; + + LOCK(&inode->lock); + { + ret = __afr_inode_read_subvol_set (inode, this, data, metadata, + event); + } + UNLOCK(&inode->lock); - params.mask_type = AFR_ICTX_OPENDIR_DONE_MASK; - afr_inode_get_ctx (this, inode, ¶ms); - return params.u.value; + return ret; } -int32_t -afr_inode_get_read_ctx (xlator_t *this, inode_t *inode, int32_t *fresh_children) +int +afr_inode_read_subvol_reset (inode_t *inode, xlator_t *this) { - afr_inode_params_t params = {0}; + int ret = -1; - params.mask_type = AFR_ICTX_READ_CHILD_MASK; - params.u.read_ctx.fresh_children = fresh_children; - afr_inode_get_ctx (this, inode, ¶ms); - return params.u.read_ctx.read_child; + LOCK(&inode->lock); + { + ret = __afr_inode_read_subvol_reset (inode, this); + } + UNLOCK(&inode->lock); + + return ret; } -void -afr_inode_ctx_set_read_ctx (afr_inode_ctx_t *ctx, int32_t read_child, - int32_t *fresh_children, int32_t child_count) + +int +afr_accused_fill (xlator_t *this, dict_t *xdata, unsigned char *accused, + afr_transaction_type type) { - uint64_t rest_of_mask = 0; - uint64_t mask = 0; - int i = 0; + afr_private_t *priv = NULL; + int i = 0; + int idx = afr_index_for_transaction_type (type); + void *pending_raw = NULL; + int pending[3]; + int ret = 0; - rest_of_mask = (~AFR_ICTX_READ_CHILD_MASK & ctx->masks); - mask = (AFR_ICTX_READ_CHILD_MASK & read_child); - ctx->masks = rest_of_mask | mask; + priv = this->private; - /* avoid memcpy as int, int32_t are used interchangeably - */ - for (i = 0; i < child_count; i++) { - if (fresh_children) - ctx->fresh_children[i] = fresh_children[i]; - else - ctx->fresh_children[i] = -1; - } + for (i = 0; i < priv->child_count; i++) { + ret = dict_get_ptr (xdata, priv->pending_key[i], + &pending_raw); + if (ret) /* no pending flags */ + continue; + memcpy (pending, pending_raw, sizeof(pending)); + + if (ntoh32 (pending[idx])) + accused[i] = 1; + } + + return 0; } -void -afr_inode_ctx_set_opendir_done (afr_inode_ctx_t *ctx) + +int +afr_accuse_smallfiles (xlator_t *this, struct afr_reply *replies, + unsigned char *data_accused) { - uint64_t rest_of_mask = 0; - uint64_t mask = 0; + int i = 0; + afr_private_t *priv = NULL; + uint64_t maxsize = 0; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (data_accused[i]) + continue; + if (replies[i].poststat.ia_size > maxsize) + maxsize = replies[i].poststat.ia_size; + } - rest_of_mask = (~AFR_ICTX_OPENDIR_DONE_MASK & ctx->masks); - mask = (0xFFFFFFFFFFFFFFFFULL & AFR_ICTX_OPENDIR_DONE_MASK); - ctx->masks = rest_of_mask | mask; + for (i = 0; i < priv->child_count; i++) { + if (data_accused[i]) + continue; + if (replies[i].poststat.ia_size < maxsize) + data_accused[i] = 1; + } + + return 0; } -void -afr_inode_ctx_set_splitbrain (afr_inode_ctx_t *ctx, gf_boolean_t set) -{ - uint64_t rest_of_mask = 0; - uint64_t mask = 0; - if (set) { - rest_of_mask = (~AFR_ICTX_SPLIT_BRAIN_MASK & ctx->masks); - mask = (0xFFFFFFFFFFFFFFFFULL & AFR_ICTX_SPLIT_BRAIN_MASK); - ctx->masks = rest_of_mask | mask; - } else { - ctx->masks = (~AFR_ICTX_SPLIT_BRAIN_MASK & ctx->masks); - } +int +afr_replies_interpret (call_frame_t *frame, xlator_t *this, inode_t *inode) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + struct afr_reply *replies = NULL; + int event_generation = 0; + int i = 0; + unsigned char *data_accused = NULL; + unsigned char *metadata_accused = NULL; + unsigned char *data_readable = NULL; + unsigned char *metadata_readable = NULL; + int ret = 0; + + local = frame->local; + priv = this->private; + replies = local->replies; + event_generation = local->event_generation; + + data_accused = alloca0 (priv->child_count); + data_readable = alloca0 (priv->child_count); + metadata_accused = alloca0 (priv->child_count); + metadata_readable = alloca0 (priv->child_count); + + for (i = 0; i < priv->child_count; i++) { + data_readable[i] = 1; + metadata_readable[i] = 1; + } + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) { + data_readable[i] = 0; + metadata_readable[i] = 0; + continue; + } + + if (replies[i].op_ret == -1) { + data_readable[i] = 0; + metadata_readable[i] = 0; + continue; + } + + afr_accused_fill (this, replies[i].xdata, data_accused, + (inode->ia_type == IA_IFDIR) ? + AFR_ENTRY_TRANSACTION : AFR_DATA_TRANSACTION); + + afr_accused_fill (this, replies[i].xdata, + metadata_accused, AFR_METADATA_TRANSACTION); + + } + + if (inode->ia_type != IA_IFDIR) + afr_accuse_smallfiles (this, replies, data_accused); + + for (i = 0; i < priv->child_count; i++) { + if (data_accused[i]) { + data_readable[i] = 0; + ret = 1; + } + if (metadata_accused[i]) { + metadata_readable[i] = 0; + ret = 1; + } + } + + afr_inode_read_subvol_set (inode, this, data_readable, + metadata_readable, event_generation); + return ret; } -void -afr_inode_set_ctx (xlator_t *this, inode_t *inode, afr_inode_params_t *params) -{ - GF_ASSERT (inode); - GF_ASSERT (params); - int ret = 0; - afr_inode_ctx_t *ctx = NULL; - afr_private_t *priv = NULL; - uint64_t ctx_addr = 0; - gf_boolean_t set = _gf_false; - int32_t read_child = -1; - int32_t *fresh_children = NULL; - priv = this->private; - LOCK (&inode->lock); - { - ret = __inode_ctx_get (inode, this, &ctx_addr); - if (ret < 0) - ctx_addr = 0; - ctx = afr_inode_ctx_get_from_addr (ctx_addr, priv->child_count); - if (!ctx) - goto unlock; - switch (params->mask_type) { - case AFR_ICTX_READ_CHILD_MASK: - read_child = params->u.read_ctx.read_child; - fresh_children = params->u.read_ctx.fresh_children; - afr_inode_ctx_set_read_ctx (ctx, read_child, - fresh_children, - priv->child_count); - break; - case AFR_ICTX_OPENDIR_DONE_MASK: - afr_inode_ctx_set_opendir_done (ctx); - break; - case AFR_ICTX_SPLIT_BRAIN_MASK: - set = params->u.value; - afr_inode_ctx_set_splitbrain (ctx, set); - break; - } - ret = __inode_ctx_put (inode, this, (uint64_t)ctx); - if (ret) { - gf_log_callingfn (this->name, GF_LOG_ERROR, "failed to " - "set the inode ctx (%s)", - uuid_utoa (inode->gfid)); - } - } -unlock: - UNLOCK (&inode->lock); +int +afr_refresh_selfheal_done (int ret, call_frame_t *heal, void *opaque) +{ + if (heal) + STACK_DESTROY (heal->root); + return 0; } -void -afr_set_split_brain (xlator_t *this, inode_t *inode, gf_boolean_t set) +int +afr_inode_refresh_err (call_frame_t *frame, xlator_t *this) { - afr_inode_params_t params = {0}; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; + int err = 0; + + local = frame->local; + priv = this->private; - params.mask_type = AFR_ICTX_SPLIT_BRAIN_MASK; - params.u.value = set; - afr_inode_set_ctx (this, inode, ¶ms); + for (i = 0; i < priv->child_count; i++) { + if (local->replies[i].valid && !local->replies[i].op_ret) { + err = 0; + goto ret; + } + } + + err = afr_final_errno (local, priv); +ret: + return -err; } -void -afr_set_opendir_done (xlator_t *this, inode_t *inode) + +int +afr_refresh_selfheal_wrap (void *opaque) { - afr_inode_params_t params = {0}; + call_frame_t *frame = opaque; + afr_local_t *local = NULL; + xlator_t *this = NULL; + int err = 0; + + local = frame->local; + this = frame->this; + + afr_selfheal (frame->this, local->refreshinode->gfid); - params.mask_type = AFR_ICTX_OPENDIR_DONE_MASK; - afr_inode_set_ctx (this, inode, ¶ms); + afr_selfheal_unlocked_discover (frame, local->refreshinode, + local->refreshinode->gfid, + local->replies); + + afr_replies_interpret (frame, this, local->refreshinode); + + err = afr_inode_refresh_err (frame, this); + + afr_replies_wipe (local, this->private); + + local->refreshfn (frame, this, err); + + return 0; } -void -afr_inode_set_read_ctx (xlator_t *this, inode_t *inode, int32_t read_child, - int32_t *fresh_children) + +gf_boolean_t +afr_selfheal_enabled (xlator_t *this) { - afr_inode_params_t params = {0}; + afr_private_t *priv = NULL; + gf_boolean_t data = _gf_false; - GF_ASSERT (read_child >= 0); - GF_ASSERT (fresh_children); + priv = this->private; - params.mask_type = AFR_ICTX_READ_CHILD_MASK; - params.u.read_ctx.read_child = read_child; - params.u.read_ctx.fresh_children = fresh_children; - afr_inode_set_ctx (this, inode, ¶ms); + gf_string2boolean (priv->data_self_heal, &data); + + return data || priv->metadata_self_heal || priv->entry_self_heal; } -gf_boolean_t -afr_is_source_child (int32_t *sources, int32_t child_count, int32_t child) + + +int +afr_inode_refresh_done (call_frame_t *frame, xlator_t *this) { - gf_boolean_t source_xattrs = _gf_false; + call_frame_t *heal = NULL; + afr_local_t *local = NULL; + int ret = 0; + int err = 0; - GF_ASSERT (child < child_count); + local = frame->local; - if ((child >= 0) && (child < child_count) && - sources[child]) { - source_xattrs = _gf_true; - } - return source_xattrs; + ret = afr_replies_interpret (frame, this, local->refreshinode); + + err = afr_inode_refresh_err (frame, this); + + afr_replies_wipe (local, this->private); + + if (ret && afr_selfheal_enabled (this)) { + heal = copy_frame (frame); + if (heal) + heal->root->pid = -1; + ret = synctask_new (this->ctx->env, afr_refresh_selfheal_wrap, + afr_refresh_selfheal_done, heal, frame); + if (ret) + goto refresh_done; + } else { + refresh_done: + local->refreshfn (frame, this, err); + } + + return 0; } -gf_boolean_t -afr_is_child_present (int32_t *success_children, int32_t child_count, - int32_t child) + +int +afr_inode_refresh_subvol_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, + struct iatt *buf, dict_t *xdata, struct iatt *par) { - gf_boolean_t success_child = _gf_false; - int i = 0; + afr_local_t *local = NULL; + int call_child = (long) cookie; + int call_count = 0; - GF_ASSERT (child < child_count); + local = frame->local; - for (i = 0; i < child_count; i++) { - if (success_children[i] == -1) - break; - if (child == success_children[i]) { - success_child = _gf_true; - break; - } - } - return success_child; + local->replies[call_child].valid = 1; + local->replies[call_child].op_ret = op_ret; + local->replies[call_child].op_errno = op_errno; + if (op_ret != -1) { + local->replies[call_child].poststat = *buf; + local->replies[call_child].postparent = *par; + local->replies[call_child].xdata = dict_ref (xdata); + } + + call_count = afr_frame_return (frame); + + if (call_count == 0) + afr_inode_refresh_done (frame, this); + + return 0; } -gf_boolean_t -afr_is_read_child (int32_t *success_children, int32_t *sources, - int32_t child_count, int32_t child) + +int +afr_inode_refresh_subvol (call_frame_t *frame, xlator_t *this, int i, + inode_t *inode, dict_t *xdata) { - gf_boolean_t success_child = _gf_false; - gf_boolean_t source = _gf_false; + loc_t loc = {0, }; + afr_private_t *priv = NULL; - GF_ASSERT (success_children); - GF_ASSERT (child_count > 0); + priv = this->private; - success_child = afr_is_child_present (success_children, child_count, - child); - if (!success_child) - goto out; - if (NULL == sources) { - source = _gf_true; - goto out; - } - source = afr_is_source_child (sources, child_count, child); -out: - return (success_child && source); + loc.inode = inode; + uuid_copy (loc.gfid, inode->gfid); + + STACK_WIND_COOKIE (frame, afr_inode_refresh_subvol_cbk, + (void *) (long) i, priv->children[i], + priv->children[i]->fops->lookup, &loc, xdata); + return 0; } -/* If sources is NULL the xattrs are assumed to be of source for all - * success_children. - */ + int -afr_select_read_child_from_policy (int32_t *success_children, int32_t child_count, - int32_t prev_read_child, - int32_t config_read_child, int32_t *sources) +afr_inode_refresh_do (call_frame_t *frame, xlator_t *this) { - int32_t read_child = -1; - int i = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + int i = 0; + dict_t *xdata = NULL; - GF_ASSERT (success_children); + priv = this->private; + local = frame->local; - read_child = prev_read_child; - if (afr_is_read_child (success_children, sources, child_count, - read_child)) - goto out; + afr_replies_wipe (local, priv); - read_child = config_read_child; - if (afr_is_read_child (success_children, sources, child_count, - read_child)) - goto out; + xdata = dict_new (); + if (!xdata) { + afr_inode_refresh_done (frame, this); + return 0; + } - for (i = 0; i < child_count; i++) { - read_child = success_children[i]; - if (read_child < 0) - break; - if (afr_is_read_child (success_children, sources, child_count, - read_child)) - goto out; - } - read_child = -1; + if (afr_xattr_req_prepare (this, xdata) != 0) { + dict_unref (xdata); + afr_inode_refresh_done (frame, this); + return 0; + } -out: - return read_child; + local->call_count = AFR_COUNT (local->child_up, priv->child_count); + + call_count = local->call_count; + for (i = 0; i < priv->child_count; i++) { + if (!local->child_up[i]) + continue; + + afr_inode_refresh_subvol (frame, this, i, local->refreshinode, + xdata); + + if (!--call_count) + break; + } + + dict_unref (xdata); + + return 0; } -/* This function should be used when all the success_children are sources - */ -void -afr_set_read_ctx_from_policy (xlator_t *this, inode_t *inode, - int32_t *fresh_children, int32_t prev_read_child, - int32_t config_read_child) -{ - int read_child = -1; - afr_private_t *priv = NULL; - priv = this->private; - read_child = afr_select_read_child_from_policy (fresh_children, - priv->child_count, - prev_read_child, - config_read_child, - NULL); - afr_inode_set_read_ctx (this, inode, read_child, fresh_children); -} - -/* afr_next_call_child () - * This is a common function used by all the read-type fops - * This function should not be called with the inode's read_children array. - * The fop's handler should make a copy of the inode's read_children, - * preferred read_child into the local vars, because while this function is - * in execution there is a chance for inode's read_ctx to change. - */ -int32_t -afr_next_call_child (int32_t *fresh_children, unsigned char *child_up, - size_t child_count, int32_t *last_index, - int32_t read_child) +int +afr_inode_refresh (call_frame_t *frame, xlator_t *this, inode_t *inode, + afr_inode_refresh_cbk_t refreshfn) { - int next_index = 0; - int32_t next_call_child = -1; + afr_local_t *local = NULL; - GF_ASSERT (last_index); + local = frame->local; - next_index = *last_index; -retry: - next_index++; - if ((next_index >= child_count) || - (fresh_children[next_index] == -1)) - goto out; - if ((fresh_children[next_index] == read_child) || - (!child_up[fresh_children[next_index]])) - goto retry; - *last_index = next_index; - next_call_child = fresh_children[next_index]; -out: - return next_call_child; + local->refreshfn = refreshfn; + + if (local->refreshinode) { + inode_unref (local->refreshinode); + local->refreshinode = NULL; + } + + local->refreshinode = inode_ref (inode); + + afr_inode_refresh_do (frame, this); + + return 0; } - /* This function should not be called with the inode's read_children array. - * The fop's handler should make a copy of the inode's read_children, - * preferred read_child into the local vars, because while this function is - * in execution there is a chance for inode's read_ctx to change. - */ -int32_t -afr_get_call_child (xlator_t *this, unsigned char *child_up, int32_t read_child, - int32_t *fresh_children, - int32_t *call_child, int32_t *last_index) + +int +afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req) { - int ret = 0; - afr_private_t *priv = NULL; - int i = 0; + int i = 0; + afr_private_t *priv = NULL; + int ret = 0; - GF_ASSERT (child_up); - GF_ASSERT (call_child); - GF_ASSERT (last_index); - GF_ASSERT (fresh_children); - GF_ASSERT (read_child >= 0); + priv = this->private; - priv = this->private; - *call_child = -1; - *last_index = -1; + for (i = 0; i < priv->child_count; i++) { + ret = dict_set_uint64 (xattr_req, priv->pending_key[i], + AFR_NUM_CHANGE_LOGS * sizeof(int)); + if (ret < 0) + gf_log (this->name, GF_LOG_WARNING, + "Unable to set dict value for %s", + priv->pending_key[i]); + /* 3 = data+metadata+entry */ + } + ret = dict_set_uint64 (xattr_req, AFR_DIRTY, + AFR_NUM_CHANGE_LOGS * sizeof(int)); + if (ret) { + gf_log (this->name, GF_LOG_DEBUG, "failed to set dirty " + "query flag"); + } - if (child_up[read_child]) { - *call_child = read_child; - } else { - for (i = 0; i < priv->child_count; i++) { - if (fresh_children[i] == -1) - break; - if (child_up[fresh_children[i]]) { - *call_child = fresh_children[i]; - ret = 0; - break; - } - } + return ret; +} - if (*call_child == -1) { - ret = -ENOTCONN; - goto out; - } +int +afr_lookup_xattr_req_prepare (afr_local_t *local, xlator_t *this, + dict_t *xattr_req, loc_t *loc) +{ + int ret = -ENOMEM; + + local->xattr_req = dict_new (); + if (!local->xattr_req) + goto out; + if (xattr_req) + dict_copy (xattr_req, local->xattr_req); - *last_index = i; + ret = afr_xattr_req_prepare (this, local->xattr_req); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "%s: Unable to prepare xattr_req", loc->path); + } + + ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_INODELK_COUNT, 0); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "%s: Unable to set dict value for %s", + loc->path, GLUSTERFS_INODELK_COUNT); } + ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_ENTRYLK_COUNT, 0); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "%s: Unable to set dict value for %s", + loc->path, GLUSTERFS_ENTRYLK_COUNT); + } + + ret = dict_set_uint32 (local->xattr_req, GLUSTERFS_PARENT_ENTRYLK, 0); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "%s: Unable to set dict value for %s", + loc->path, GLUSTERFS_PARENT_ENTRYLK); + } + + ret = 0; out: - gf_log (this->name, GF_LOG_DEBUG, "Returning %d, call_child: %d, " - "last_index: %d", ret, *call_child, *last_index); return ret; } -void -afr_reset_xattr (dict_t **xattr, unsigned int child_count) + +int +afr_hash_child (inode_t *inode, int32_t child_count, int hashmode) { - unsigned int i = 0; + uuid_t gfid_copy = {0,}; + pid_t pid; - if (!xattr) - goto out; - for (i = 0; i < child_count; i++) { - if (xattr[i]) { - dict_unref (xattr[i]); - xattr[i] = NULL; - } + if (!hashmode) { + return -1; } -out: - return; + + if (inode) { + uuid_copy (gfid_copy, inode->gfid); + } + + if (hashmode > 1) { + /* + * Why getpid? Because it's one of the cheapest calls + * available - faster than gethostname etc. - and returns a + * constant-length value that's sure to be shorter than a UUID. + * It's still very unlikely to be the same across clients, so + * it still provides good mixing. We're not trying for + * perfection here. All we need is a low probability that + * multiple clients won't converge on the same subvolume. + */ + pid = getpid(); + memcpy (gfid_copy, &pid, sizeof(pid)); + } + + return SuperFastHash((char *)gfid_copy, + sizeof(gfid_copy)) % child_count; } -void -afr_local_sh_cleanup (afr_local_t *local, xlator_t *this) + +int +afr_read_subvol_select_by_policy (inode_t *inode, xlator_t *this, + unsigned char *readable) { - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int i = 0; + afr_private_t *priv = NULL; + int read_subvol = -1; + int i = 0; + priv = this->private; - sh = &local->self_heal; - priv = this->private; + /* first preference - explicitly specified or local subvolume */ + if (priv->read_child >= 0 && readable[priv->read_child]) + return priv->read_child; - if (sh->buf) - GF_FREE (sh->buf); + /* second preference - use hashed mode */ + read_subvol = afr_hash_child (inode, priv->child_count, + priv->hash_mode); + if (read_subvol >= 0 && readable[read_subvol]) + return read_subvol; - if (sh->parentbufs) - GF_FREE (sh->parentbufs); + for (i = 0; i < priv->child_count; i++) { + if (readable[i]) + return i; + } - if (sh->inode) - inode_unref (sh->inode); + /* no readable subvolumes, either split brain or all subvols down */ - if (sh->xattr) { - afr_reset_xattr (sh->xattr, priv->child_count); - GF_FREE (sh->xattr); - } + return -1; +} - if (sh->child_errno) - GF_FREE (sh->child_errno); - if (sh->pending_matrix) { - for (i = 0; i < priv->child_count; i++) { - GF_FREE (sh->pending_matrix[i]); - } - GF_FREE (sh->pending_matrix); - } - - if (sh->delta_matrix) { - for (i = 0; i < priv->child_count; i++) { - GF_FREE (sh->delta_matrix[i]); - } - GF_FREE (sh->delta_matrix); - } +int +afr_inode_read_subvol_type_get (inode_t *inode, xlator_t *this, + unsigned char *readable, int *event_p, + int type) +{ + int ret = -1; - if (sh->sources) - GF_FREE (sh->sources); + if (type == AFR_METADATA_TRANSACTION) + ret = afr_inode_read_subvol_get (inode, this, 0, readable, + event_p); + else + ret = afr_inode_read_subvol_get (inode, this, readable, 0, + event_p); + return ret; +} - if (sh->success) - GF_FREE (sh->success); - if (sh->locked_nodes) - GF_FREE (sh->locked_nodes); +int +afr_read_subvol_get (inode_t *inode, xlator_t *this, int *subvol_p, + int *event_p, afr_transaction_type type) +{ + afr_private_t *priv = NULL; + unsigned char *data_readable = NULL; + unsigned char *metadata_readable = NULL; + unsigned char *readable = NULL; + unsigned char *intersection = NULL; + int subvol = -1; + int event = 0; - if (sh->healing_fd && !sh->healing_fd_opened) { - fd_unref (sh->healing_fd); - sh->healing_fd = NULL; - } + priv = this->private; - if (sh->linkname) - GF_FREE ((char *)sh->linkname); + readable = alloca0 (priv->child_count); + data_readable = alloca0 (priv->child_count); + metadata_readable = alloca0 (priv->child_count); + intersection = alloca0 (priv->child_count); - if (sh->success_children) - GF_FREE (sh->success_children); + afr_inode_read_subvol_type_get (inode, this, readable, &event, type); - if (sh->fresh_children) - GF_FREE (sh->fresh_children); + afr_inode_read_subvol_get (inode, this, data_readable, metadata_readable, + &event); - if (sh->fresh_parent_dirs) - GF_FREE (sh->fresh_parent_dirs); + AFR_INTERSECT (intersection, data_readable, metadata_readable, + priv->child_count); - loc_wipe (&sh->parent_loc); + if (AFR_COUNT (intersection, priv->child_count) > 0) + subvol = afr_read_subvol_select_by_policy (inode, this, + intersection); + else + subvol = afr_read_subvol_select_by_policy (inode, this, + readable); + if (subvol_p) + *subvol_p = subvol; + if (event_p) + *event_p = event; + return subvol; } void afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this) { - int i = 0; - afr_private_t * priv = NULL; + afr_private_t *priv = NULL; + int i = 0; priv = this->private; - for (i = 0; i < priv->child_count; i++) { - if (local->pending && local->pending[i]) - GF_FREE (local->pending[i]); - } - - GF_FREE (local->pending); + afr_matrix_cleanup (local->pending, priv->child_count); - if (local->internal_lock.locked_nodes) - GF_FREE (local->internal_lock.locked_nodes); + GF_FREE (local->internal_lock.locked_nodes); - if (local->internal_lock.inode_locked_nodes) - GF_FREE (local->internal_lock.inode_locked_nodes); - - if (local->internal_lock.entry_locked_nodes) - GF_FREE (local->internal_lock.entry_locked_nodes); + for (i = 0; local->internal_lock.inodelk[i].domain; i++) { + GF_FREE (local->internal_lock.inodelk[i].locked_nodes); + } - if (local->internal_lock.lower_locked_nodes) - GF_FREE (local->internal_lock.lower_locked_nodes); + GF_FREE (local->internal_lock.lower_locked_nodes); + afr_entry_lockee_cleanup (&local->internal_lock); - GF_FREE (local->transaction.child_errno); - GF_FREE (local->child_errno); + GF_FREE (local->transaction.pre_op); + GF_FREE (local->transaction.eager_lock); GF_FREE (local->transaction.basename); GF_FREE (local->transaction.new_basename); loc_wipe (&local->transaction.parent_loc); loc_wipe (&local->transaction.new_parent_loc); + } void +afr_replies_wipe (afr_local_t *local, afr_private_t *priv) +{ + int i; + + if (!local->replies) + return; + + for (i = 0; i < priv->child_count; i++) { + if (local->replies[i].xdata) { + dict_unref (local->replies[i].xdata); + local->replies[i].xdata = NULL; + } + } + + memset (local->replies, 0, sizeof(*local->replies) * priv->child_count); +} + +void +afr_remove_eager_lock_stub (afr_local_t *local) +{ + LOCK (&local->fd->lock); + { + list_del_init (&local->transaction.eager_locked); + } + UNLOCK (&local->fd->lock); +} + +void afr_local_cleanup (afr_local_t *local, xlator_t *this) { afr_private_t * priv = NULL; @@ -716,7 +907,11 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this) if (!local) return; - afr_local_sh_cleanup (local, this); + syncbarrier_destroy (&local->barrier); + + if (local->transaction.eager_lock_on && + !list_empty (&local->transaction.eager_locked)) + afr_remove_eager_lock_stub (local); afr_local_transaction_cleanup (local, this); @@ -731,49 +926,36 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this) if (local->xattr_req) dict_unref (local->xattr_req); - if (local->child_up) - GF_FREE (local->child_up); + if (local->dict) + dict_unref (local->dict); - if (local->fresh_children) - GF_FREE (local->fresh_children); + afr_replies_wipe (local, priv); + GF_FREE(local->replies); - { /* lookup */ - if (local->cont.lookup.xattrs) { - afr_reset_xattr (local->cont.lookup.xattrs, - priv->child_count); - GF_FREE (local->cont.lookup.xattrs); - local->cont.lookup.xattrs = NULL; - } + GF_FREE (local->child_up); - if (local->cont.lookup.xattr) { - dict_unref (local->cont.lookup.xattr); - } + GF_FREE (local->read_attempted); - if (local->cont.lookup.inode) { - inode_unref (local->cont.lookup.inode); - } + GF_FREE (local->readable); - if (local->cont.lookup.postparents) - GF_FREE (local->cont.lookup.postparents); + if (local->inode) + inode_unref (local->inode); - if (local->cont.lookup.bufs) - GF_FREE (local->cont.lookup.bufs); + if (local->parent) + inode_unref (local->parent); - if (local->cont.lookup.success_children) - GF_FREE (local->cont.lookup.success_children); + if (local->parent2) + inode_unref (local->parent2); - if (local->cont.lookup.sources) - GF_FREE (local->cont.lookup.sources); - } + if (local->refreshinode) + inode_unref (local->refreshinode); { /* getxattr */ - if (local->cont.getxattr.name) - GF_FREE (local->cont.getxattr.name); + GF_FREE (local->cont.getxattr.name); } { /* lk */ - if (local->cont.lk.locked_nodes) - GF_FREE (local->cont.lk.locked_nodes); + GF_FREE (local->cont.lk.locked_nodes); } { /* create */ @@ -807,18 +989,40 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this) dict_unref (local->cont.setxattr.dict); } + { /* fsetxattr */ + if (local->cont.fsetxattr.dict) + dict_unref (local->cont.fsetxattr.dict); + } + { /* removexattr */ GF_FREE (local->cont.removexattr.name); } - + { /* xattrop */ + if (local->cont.xattrop.xattr) + dict_unref (local->cont.xattrop.xattr); + } + { /* fxattrop */ + if (local->cont.fxattrop.xattr) + dict_unref (local->cont.fxattrop.xattr); + } { /* symlink */ GF_FREE (local->cont.symlink.linkpath); } { /* opendir */ - if (local->cont.opendir.checksum) - GF_FREE (local->cont.opendir.checksum); + GF_FREE (local->cont.opendir.checksum); } + + { /* readdirp */ + if (local->cont.readdir.dict) + dict_unref (local->cont.readdir.dict); + } + + if (local->xdata_req) + dict_unref (local->xdata_req); + + if (local->xdata_rsp) + dict_unref (local->xdata_rsp); } @@ -840,1302 +1044,1060 @@ afr_frame_return (call_frame_t *frame) } -/** - * up_children_count - return the number of children that are up - */ - -int -afr_up_children_count (int child_count, unsigned char *child_up) -{ - int i = 0; - int ret = 0; - - for (i = 0; i < child_count; i++) - if (child_up[i]) - ret++; - return ret; -} - gf_boolean_t -afr_is_fresh_lookup (loc_t *loc, xlator_t *this) +afr_is_entry_possibly_under_txn (afr_local_t *local, xlator_t *this) { - uint64_t ctx = 0; - int32_t ret = 0; + int i = 0; + int tmp = 0; + afr_private_t *priv = NULL; - GF_ASSERT (loc); - GF_ASSERT (this); - GF_ASSERT (loc->inode); + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].xdata) + continue; + if (dict_get_int32 (local->replies[i].xdata, + GLUSTERFS_PARENT_ENTRYLK, + &tmp) == 0) + if (tmp) + return _gf_true; + } - ret = inode_ctx_get (loc->inode, this, &ctx); - if (0 == ret) - return _gf_false; - return _gf_true; + return _gf_false; } -void -afr_update_loc_gfids (loc_t *loc, struct iatt *buf, struct iatt *postparent) -{ - GF_ASSERT (loc); - GF_ASSERT (buf); - uuid_copy (loc->gfid, buf->ia_gfid); - if (postparent) - uuid_copy (loc->pargfid, postparent->ia_gfid); +/* + * Quota size xattrs are not maintained by afr. There is a + * possibility that they differ even when both the directory changelog xattrs + * suggest everything is fine. So if there is at least one 'source' check among + * the sources which has the maximum quota size. Otherwise check among all the + * available ones for maximum quota size. This way if there is a source and + * stale copies it always votes for the 'source'. + * */ + +static void +afr_handle_quota_size (call_frame_t *frame, xlator_t *this) +{ + unsigned char *readable = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + struct afr_reply *replies = NULL; + int i = 0; + uint64_t size = 0; + uint64_t max_size = 0; + int readable_cnt = 0; + + local = frame->local; + priv = this->private; + replies = local->replies; + + readable = alloca0 (priv->child_count); + + afr_inode_read_subvol_get (local->inode, this, readable, 0, 0); + + readable_cnt = AFR_COUNT (readable, priv->child_count); + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid || replies[i].op_ret == -1) + continue; + if (readable_cnt && !readable[i]) + continue; + if (!replies[i].xdata) + continue; + if (dict_get_uint64 (replies[i].xdata, QUOTA_SIZE_KEY, &size)) + continue; + if (size > max_size) + max_size = size; + } + + if (!max_size) + return; + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid || replies[i].op_ret == -1) + continue; + if (readable_cnt && !readable[i]) + continue; + if (!replies[i].xdata) + continue; + if (dict_set_uint64 (replies[i].xdata, QUOTA_SIZE_KEY, max_size)) + continue; + } } -void -afr_lookup_build_response_params (afr_local_t *local, xlator_t *this) + +static void +afr_lookup_done (call_frame_t *frame, xlator_t *this) { - int32_t read_child = -1; - struct iatt *buf = NULL; - struct iatt *postparent = NULL; - dict_t **xattr = NULL; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int i = -1; + int op_errno = 0; + int read_subvol = 0; + unsigned char *readable = NULL; + int event = 0; + struct afr_reply *replies = NULL; + uuid_t read_gfid = {0, }; + gf_boolean_t locked_entry = _gf_false; + gf_boolean_t can_interpret = _gf_true; - GF_ASSERT (local); + priv = this->private; + local = frame->local; + replies = local->replies; - buf = &local->cont.lookup.buf; - postparent = &local->cont.lookup.postparent; - xattr = &local->cont.lookup.xattr; + locked_entry = afr_is_entry_possibly_under_txn (local, this); - read_child = afr_inode_get_read_ctx (this, local->cont.lookup.inode, - NULL); - gf_log (this->name, GF_LOG_DEBUG, "Building lookup response from %d", - read_child); - *xattr = dict_ref (local->cont.lookup.xattrs[read_child]); - *buf = local->cont.lookup.bufs[read_child]; - *postparent = local->cont.lookup.postparents[read_child]; + readable = alloca0 (priv->child_count); - if (IA_INVAL == local->cont.lookup.inode->ia_type) { - /* fix for RT #602 */ - local->cont.lookup.inode->ia_type = buf->ia_type; - } -} + afr_inode_read_subvol_get (local->loc.parent, this, readable, + NULL, &event); -static void -afr_lookup_update_lk_counts (afr_local_t *local, xlator_t *this, - int child_index, dict_t *xattr) -{ - uint32_t inodelk_count = 0; - uint32_t entrylk_count = 0; - int ret = -1; + /* First, check if we have an ESTALE from somewhere, + If so, propagate that so that a revalidate can be + issued + */ + op_errno = afr_final_errno (frame->local, this->private); + local->op_errno = op_errno; + if (op_errno == ESTALE) { + local->op_errno = op_errno; + local->op_ret = -1; + goto unwind; + } + + read_subvol = -1; + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) + continue; + + if (locked_entry && replies[i].op_ret == -1 && + replies[i].op_errno == ENOENT) { + /* Second, check entry is still + "underway" in creation */ + local->op_ret = -1; + local->op_errno = ENOENT; + read_subvol = i; + goto unwind; + } + + if (replies[i].op_ret == -1) + continue; + + if (read_subvol == -1 || !readable[read_subvol]) { + read_subvol = i; + uuid_copy (read_gfid, replies[i].poststat.ia_gfid); + local->op_ret = 0; + } + } + + if (read_subvol == -1) + goto unwind; + /* We now have a read_subvol, which is readable[] (if there + were any). Next we look for GFID mismatches. We don't + consider a GFID mismatch as an error if read_subvol is + readable[] but the mismatching GFID subvol is not. + */ + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid || replies[i].op_ret == -1) { + if (priv->child_up[i]) + can_interpret = _gf_false; + continue; + } + + if (!uuid_compare (replies[i].poststat.ia_gfid, + read_gfid)) + continue; + + can_interpret = _gf_false; + + if (locked_entry) + continue; + + /* Now GFIDs mismatch. It's OK as long as this subvol + is not readable[] but read_subvol is */ + if (readable[read_subvol] && !readable[i]) + continue; + + /* LOG ERROR */ + local->op_ret = -1; + local->op_errno = EIO; + goto unwind; + } + + /* Forth, for the finalized GFID, pick the best subvolume + to return stats from. + */ + if (can_interpret) { + /* It is safe to call afr_replies_interpret() because we have + a response from all the UP subvolumes and all of them resolved + to the same GFID + */ + if (afr_replies_interpret (frame, this, local->inode)) { + read_subvol = afr_data_subvol_get (local->inode, this, + 0, 0); + afr_inode_read_subvol_reset (local->inode, this); + goto cant_interpret; + } else { + read_subvol = afr_data_subvol_get (local->inode, this, + 0, 0); + } + } else { + cant_interpret: + if (read_subvol == -1) + dict_del (replies[0].xdata, GF_CONTENT_KEY); + else + dict_del (replies[read_subvol].xdata, GF_CONTENT_KEY); + } + + afr_handle_quota_size (frame, this); + +unwind: + if (read_subvol == -1) + read_subvol = 0; + + AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, + local->inode, &local->replies[read_subvol].poststat, + local->replies[read_subvol].xdata, + &local->replies[read_subvol].postparent); +} - GF_ASSERT (local); - GF_ASSERT (this); - GF_ASSERT (xattr); - GF_ASSERT (child_index >= 0); +/* + * During a lookup, some errors are more "important" than + * others in that they must be given higher priority while + * returning to the user. + * + * The hierarchy is ESTALE > ENOENT > others + */ - ret = dict_get_uint32 (xattr, GLUSTERFS_INODELK_COUNT, - &inodelk_count); - if (ret == 0) - local->inodelk_count += inodelk_count; +int +afr_higher_errno (int32_t old_errno, int32_t new_errno) +{ + if (old_errno == ENODATA || new_errno == ENODATA) + return ENODATA; + if (old_errno == ESTALE || new_errno == ESTALE) + return ESTALE; + if (old_errno == ENOENT || new_errno == ENOENT) + return ENOENT; - ret = dict_get_uint32 (xattr, GLUSTERFS_ENTRYLK_COUNT, - &entrylk_count); - if (ret == 0) - local->entrylk_count += entrylk_count; + return new_errno; } -static void -afr_lookup_set_self_heal_data_by_xattr (afr_local_t *local, xlator_t *this, - dict_t *xattr) -{ - GF_ASSERT (local); - GF_ASSERT (this); - GF_ASSERT (xattr); - if (afr_sh_has_metadata_pending (xattr, this)) { - local->self_heal.need_metadata_self_heal = _gf_true; - gf_log(this->name, GF_LOG_DEBUG, - "metadata self-heal is pending for %s.", - local->loc.path); - } +int +afr_final_errno (afr_local_t *local, afr_private_t *priv) +{ + int i = 0; + int op_errno = 0; + int tmp_errno = 0; - if (afr_sh_has_entry_pending (xattr, this)) { - local->self_heal.need_entry_self_heal = _gf_true; - gf_log(this->name, GF_LOG_DEBUG, - "entry self-heal is pending for %s.", local->loc.path); - } + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid) + continue; + if (local->replies[i].op_ret == 0) + continue; + tmp_errno = local->replies[i].op_errno; + op_errno = afr_higher_errno (op_errno, tmp_errno); + } - if (afr_sh_has_data_pending (xattr, this)) { - local->self_heal.need_data_self_heal = _gf_true; - gf_log(this->name, GF_LOG_DEBUG, - "data self-heal is pending for %s.", local->loc.path); - } + return op_errno; } -static void -afr_detect_self_heal_by_iatt (afr_local_t *local, xlator_t *this, - struct iatt *buf, struct iatt *lookup_buf) +static int +get_pathinfo_host (char *pathinfo, char *hostname, size_t size) { - if (PERMISSION_DIFFERS (buf, lookup_buf)) { - /* mismatching permissions */ - gf_log (this->name, GF_LOG_INFO, - "permissions differ for %s ", local->loc.path); - local->self_heal.need_metadata_self_heal = _gf_true; - } + char *start = NULL; + char *end = NULL; + int ret = -1; + int i = 0; - if (OWNERSHIP_DIFFERS (buf, lookup_buf)) { - /* mismatching permissions */ - local->self_heal.need_metadata_self_heal = _gf_true; - gf_log (this->name, GF_LOG_INFO, - "ownership differs for %s ", local->loc.path); - } + if (!pathinfo) + goto out; - if (SIZE_DIFFERS (buf, lookup_buf) - && IA_ISREG (buf->ia_type)) { - gf_log (this->name, GF_LOG_INFO, - "size differs for %s ", local->loc.path); - local->self_heal.need_data_self_heal = _gf_true; - } + start = strchr (pathinfo, ':'); + if (!start) + goto out; + end = strrchr (pathinfo, ':'); + if (start == end) + goto out; - if (uuid_compare (buf->ia_gfid, lookup_buf->ia_gfid)) { - /* mismatching gfid */ - gf_log (this->name, GF_LOG_WARNING, - "%s: gfid different on subvolume", local->loc.path); - } + memset (hostname, 0, size); + i = 0; + while (++start != end) + hostname[i++] = *start; + ret = 0; +out: + return ret; } -static void -afr_detect_self_heal_by_lookup_status (afr_local_t *local, xlator_t *this) +int +afr_local_pathinfo (char *pathinfo, gf_boolean_t *local) { - GF_ASSERT (local); - GF_ASSERT (this); + int ret = 0; + char pathinfohost[1024] = {0}; + char localhost[1024] = {0}; + xlator_t *this = THIS; - if ((local->success_count > 0) && (local->enoent_count > 0)) { - local->self_heal.need_metadata_self_heal = _gf_true; - local->self_heal.need_data_self_heal = _gf_true; - local->self_heal.need_entry_self_heal = _gf_true; - local->self_heal.need_gfid_self_heal = _gf_true; - local->self_heal.need_missing_entry_self_heal = _gf_true; - gf_log(this->name, GF_LOG_INFO, - "entries are missing in lookup of %s.", - local->loc.path); - //If all self-heals are needed no need to check for other rules + *local = _gf_false; + ret = get_pathinfo_host (pathinfo, pathinfohost, sizeof (pathinfohost)); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Invalid pathinfo: %s", + pathinfo); goto out; } - if ((local->success_count > 0) && - afr_is_split_brain (this, local->cont.lookup.inode) && - IA_ISREG (local->cont.lookup.inode->ia_type)) { - local->self_heal.need_data_self_heal = _gf_true; - local->self_heal.need_gfid_self_heal = _gf_true; - local->self_heal.need_missing_entry_self_heal = _gf_true; - gf_log (this->name, GF_LOG_WARNING, - "split brain detected during lookup of %s.", - local->loc.path); + ret = gethostname (localhost, sizeof (localhost)); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "gethostname() failed, " + "reason: %s", strerror (errno)); + goto out; } + if (!strcmp (localhost, pathinfohost)) + *local = _gf_true; out: - return; + return ret; } -gf_boolean_t -afr_can_self_heal_proceed (afr_self_heal_t *sh, afr_private_t *priv) +static int32_t +afr_local_discovery_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata) { - GF_ASSERT (sh); - GF_ASSERT (priv); - - return (sh->need_gfid_self_heal - || sh->need_missing_entry_self_heal - || (priv->data_self_heal && sh->need_data_self_heal) - || (priv->metadata_self_heal && sh->need_metadata_self_heal) - || (priv->entry_self_heal && sh->need_entry_self_heal)); -} + int ret = 0; + char *pathinfo = NULL; + gf_boolean_t is_local = _gf_false; + afr_private_t *priv = NULL; + int32_t child_index = -1; -afr_transaction_type -afr_transaction_type_get (ia_type_t ia_type) -{ - afr_transaction_type type = AFR_METADATA_TRANSACTION; + if (op_ret != 0) { + goto out; + } - GF_ASSERT (ia_type != IA_INVAL); + priv = this->private; + child_index = (int32_t)(long)cookie; - if (IA_ISDIR (ia_type)) { - type = AFR_ENTRY_TRANSACTION; - } else if (IA_ISREG (ia_type)) { - type = AFR_DATA_TRANSACTION; + ret = dict_get_str (dict, GF_XATTR_PATHINFO_KEY, &pathinfo); + if (ret != 0) { + goto out; } - return type; -} - -int -afr_lookup_select_read_child (afr_local_t *local, xlator_t *this, - int32_t *read_child) -{ - ia_type_t ia_type = IA_INVAL; - int32_t source = -1; - int ret = -1; - dict_t **xattrs = NULL; - int32_t *success_children = NULL; - struct iatt *bufs = NULL; - afr_transaction_type type = AFR_METADATA_TRANSACTION; - - GF_ASSERT (local); - GF_ASSERT (this); - GF_ASSERT (local->success_count > 0); - bufs = local->cont.lookup.bufs; - success_children = local->cont.lookup.success_children; - /*We can take the success_children[0] only because we already - *handle the conflicting children other wise, we could select the - *read_child based on wrong file type - */ - ia_type = local->cont.lookup.bufs[success_children[0]].ia_type; - type = afr_transaction_type_get (ia_type); - xattrs = local->cont.lookup.xattrs; - source = afr_lookup_select_read_child_by_txn_type (this, local, xattrs, - type); - if (source < 0) { - gf_log (this->name, GF_LOG_DEBUG, "failed to select source " - "for %s", local->loc.path); + ret = afr_local_pathinfo (pathinfo, &is_local); + if (ret) { goto out; } - gf_log (this->name, GF_LOG_DEBUG, "Source selected as %d for %s", - source, local->loc.path); - *read_child = source; - ret = 0; + /* + * Note that one local subvolume will override another here. The only + * way to avoid that would be to retain extra information about whether + * the previous read_child is local, and it's just not worth it. Even + * the slowest local subvolume is far preferable to a remote one. + */ + if (is_local) { + gf_log (this->name, GF_LOG_INFO, + "selecting local read_child %s", + priv->children[child_index]->name); + priv->read_child = child_index; + } out: - return ret; -} - -static inline gf_boolean_t -afr_is_self_heal_running (afr_local_t *local) -{ - GF_ASSERT (local); - return ((local->inodelk_count > 0) || (local->entrylk_count > 0)); + STACK_DESTROY(frame->root); + return 0; } static void -afr_launch_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode, - gf_boolean_t is_background, ia_type_t ia_type, - void (*gfid_sh_success_cbk) (call_frame_t *sh_frame, - xlator_t *this), - int (*unwind) (call_frame_t *frame, xlator_t *this, - int32_t op_ret, int32_t op_errno)) +afr_attempt_local_discovery (xlator_t *this, int32_t child_index) { - afr_local_t *local = NULL; - char sh_type_str[256] = {0,}; - - GF_ASSERT (frame); - GF_ASSERT (this); - GF_ASSERT (inode); + call_frame_t *newframe = NULL; + loc_t tmploc = {0,}; + afr_private_t *priv = this->private; - local = frame->local; - local->self_heal.background = is_background; - local->self_heal.type = ia_type; - local->self_heal.unwind = unwind; - local->self_heal.gfid_sh_success_cbk = gfid_sh_success_cbk; - - afr_self_heal_type_str_get (&local->self_heal, - sh_type_str, - sizeof (sh_type_str)); - - gf_log (this->name, GF_LOG_INFO, - "background %s self-heal triggered. path: %s", - sh_type_str, local->loc.path); + newframe = create_frame(this,this->ctx->pool); + if (!newframe) { + return; + } - afr_self_heal (frame, this, inode); + tmploc.gfid[sizeof(tmploc.gfid)-1] = 1; + STACK_WIND_COOKIE (newframe, afr_local_discovery_cbk, + (void *)(long)child_index, + priv->children[child_index], + priv->children[child_index]->fops->getxattr, + &tmploc, GF_XATTR_PATHINFO_KEY, NULL); } + int -afr_gfid_missing_count (const char *xlator_name, int32_t *success_children, - struct iatt *bufs, unsigned int child_count, - const char *path) +afr_lookup_selfheal_wrap (void *opaque) { - int gfid_miss_count = 0; - int i = 0; - struct iatt *child1 = NULL; + call_frame_t *frame = opaque; + afr_local_t *local = NULL; + xlator_t *this = NULL; + inode_t *inode = NULL; - for (i = 0; i < child_count; i++) { - if (success_children[i] == -1) - break; - child1 = &bufs[success_children[i]]; - if (uuid_is_null (child1->ia_gfid)) { - gf_log (xlator_name, GF_LOG_DEBUG, "%s: gfid is null" - " on subvolume %d", path, success_children[i]); - gfid_miss_count++; - } - } + local = frame->local; + this = frame->this; - return gfid_miss_count; -} + afr_selfheal_name (frame->this, local->loc.pargfid, local->loc.name); -static int -afr_lookup_gfid_missing_count (afr_local_t *local, xlator_t *this) -{ - int32_t *success_children = NULL; - afr_private_t *priv = NULL; - struct iatt *bufs = NULL; - int miss_count = 0; + afr_replies_wipe (local, this->private); - priv = this->private; - bufs = local->cont.lookup.bufs; - success_children = local->cont.lookup.success_children; - - miss_count = afr_gfid_missing_count (this->name, success_children, - bufs, priv->child_count, - local->loc.path); - return miss_count; -} + inode = afr_selfheal_unlocked_lookup_on (frame, local->loc.parent, + local->loc.name, local->replies, + local->child_up); + if (inode) + inode_unref (inode); + afr_lookup_done (frame, this); -gf_boolean_t -afr_conflicting_iattrs (struct iatt *bufs, int32_t *success_children, - unsigned int child_count, const char *path, - const char *xlator_name) -{ - gf_boolean_t conflicting = _gf_false; - int i = 0; - struct iatt *child1 = NULL; - struct iatt *child2 = NULL; - uuid_t *gfid = NULL; - - for (i = 0; i < child_count; i++) { - if (success_children[i] == -1) - break; - child1 = &bufs[success_children[i]]; - if ((!gfid) && (!uuid_is_null (child1->ia_gfid))) - gfid = &child1->ia_gfid; - - if (i == 0) - continue; - - child2 = &bufs[success_children[i-1]]; - if (FILETYPE_DIFFERS (child1, child2)) { - gf_log (xlator_name, GF_LOG_WARNING, "%s: filetype " - "differs on subvolumes (%d, %d)", path, - success_children[i-1], success_children[i]); - conflicting = _gf_true; - goto out; - } - if (!gfid || uuid_is_null (child1->ia_gfid)) - continue; - if (uuid_compare (*gfid, child1->ia_gfid)) { - gf_log (xlator_name, GF_LOG_WARNING, "%s: gfid differs" - " on subvolume %d", path, success_children[i]); - conflicting = _gf_true; - goto out; - } - } -out: - return conflicting; + return 0; } -/* afr_update_gfid_from_iatts: This function should be called only if the - * iatts are not conflicting. - */ -void -afr_update_gfid_from_iatts (uuid_t uuid, struct iatt *bufs, - int32_t *success_children, unsigned int child_count) -{ - uuid_t *gfid = NULL; - int i = 0; - int child = 0; - for (i = 0; i < child_count; i++) { - child = success_children[i]; - if (child == -1) - break; - if ((!gfid) && (!uuid_is_null (bufs[child].ia_gfid))) { - gfid = &bufs[child].ia_gfid; - } else if (gfid && (!uuid_is_null (bufs[child].ia_gfid))) { - if (uuid_compare (*gfid, bufs[child].ia_gfid)) { - GF_ASSERT (0); - goto out; - } - } - } - if (gfid && (!uuid_is_null (*gfid))) - uuid_copy (uuid, *gfid); -out: - return; +int +afr_lookup_entry_heal (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + call_frame_t *heal = NULL; + int i = 0, first = -1; + gf_boolean_t need_heal = _gf_false; + struct afr_reply *replies = NULL; + int ret = 0; + + local = frame->local; + replies = local->replies; + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) + continue; + + if (first == -1) { + first = i; + continue; + } + + if (replies[i].op_ret != replies[first].op_ret) { + need_heal = _gf_true; + break; + } + + if (uuid_compare (replies[i].poststat.ia_gfid, + replies[first].poststat.ia_gfid)) { + need_heal = _gf_true; + break; + } + } + + if (need_heal) { + heal = copy_frame (frame); + if (heal) + heal->root->pid = -1; + ret = synctask_new (this->ctx->env, afr_lookup_selfheal_wrap, + afr_refresh_selfheal_done, heal, frame); + if (ret) + goto lookup_done; + } else { + lookup_done: + afr_lookup_done (frame, this); + } + + return ret; } -static gf_boolean_t -afr_lookup_conflicting_entries (afr_local_t *local, xlator_t *this) -{ - afr_private_t *priv = NULL; - gf_boolean_t conflict = _gf_false; - priv = this->private; - conflict = afr_conflicting_iattrs (local->cont.lookup.bufs, - local->cont.lookup.success_children, - priv->child_count, local->loc.path, - this->name); - return conflict; -} - -static void -afr_lookup_set_self_heal_data (afr_local_t *local, xlator_t *this) +int +afr_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, struct iatt *buf, + dict_t *xdata, struct iatt *postparent) { - int i = 0; - struct iatt *bufs = NULL; - dict_t **xattr = NULL; - afr_private_t *priv = NULL; - int32_t child1 = -1; - int32_t child2 = -1; + afr_local_t * local = NULL; + int call_count = -1; + int child_index = -1; - priv = this->private; - afr_detect_self_heal_by_lookup_status (local, this); + child_index = (long) cookie; - if (afr_lookup_gfid_missing_count (local, this)) - local->self_heal.need_gfid_self_heal = _gf_true; + local = frame->local; - if (_gf_true == afr_lookup_conflicting_entries (local, this)) - local->self_heal.need_missing_entry_self_heal = _gf_true; - else - afr_update_gfid_from_iatts (local->self_heal.sh_gfid_req, - local->cont.lookup.bufs, - local->cont.lookup.success_children, - priv->child_count); + local->replies[child_index].valid = 1; + local->replies[child_index].op_ret = op_ret; + local->replies[child_index].op_errno = op_errno; + if (op_ret != -1) { + local->replies[child_index].poststat = *buf; + local->replies[child_index].postparent = *postparent; + if (xdata) + local->replies[child_index].xdata = dict_ref (xdata); + } - bufs = local->cont.lookup.bufs; - for (i = 1; i < local->success_count; i++) { - child1 = local->cont.lookup.success_children[i-1]; - child2 = local->cont.lookup.success_children[i]; - afr_detect_self_heal_by_iatt (local, this, - &bufs[child1], &bufs[child2]); + call_count = afr_frame_return (frame); + if (call_count == 0) { + afr_lookup_entry_heal (frame, this); } - xattr = local->cont.lookup.xattrs; - for (i = 0; i < local->success_count; i++) { - child1 = local->cont.lookup.success_children[i]; - afr_lookup_set_self_heal_data_by_xattr (local, this, - xattr[child1]); - } + return 0; } -int -afr_self_heal_lookup_unwind (call_frame_t *frame, xlator_t *this, - int32_t op_ret, int32_t op_errno) + + +static void +afr_discover_done (call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int i = -1; + int op_errno = 0; + int read_subvol = 0; + priv = this->private; local = frame->local; - if (op_ret == -1) { - local->op_ret = -1; - if (afr_error_more_important (local->op_errno, op_errno)) - local->op_errno = op_errno; + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid) + continue; + if (local->replies[i].op_ret == 0) + local->op_ret = 0; + } - goto out; - } else { - local->op_ret = 0; - } + op_errno = afr_final_errno (frame->local, this->private); - afr_lookup_done_success_action (frame, this, _gf_true); -out: - AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, - local->cont.lookup.inode, &local->cont.lookup.buf, - local->cont.lookup.xattr, - &local->cont.lookup.postparent); + if (local->op_ret < 0) { + local->op_errno = op_errno; + local->op_ret = -1; + goto unwind; + } - return 0; -} + afr_replies_interpret (frame, this, local->inode); -//TODO: At the moment only lookup needs this, so not doing any checks, in the -// future we will have to do fop specific operations -void -afr_post_gfid_sh_success (call_frame_t *sh_frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_local_t *sh_local = NULL; - afr_private_t *priv = NULL; - afr_self_heal_t *sh = NULL; - int i = 0; - struct iatt *lookup_bufs = NULL; - struct iatt *lookup_parentbufs = NULL; - - sh_local = sh_frame->local; - sh = &sh_local->self_heal; - local = sh->orig_frame->local; - lookup_bufs = local->cont.lookup.bufs; - lookup_parentbufs = local->cont.lookup.postparents; - priv = this->private; + read_subvol = afr_data_subvol_get (local->inode, this, 0, 0); + if (read_subvol == -1) { + gf_log (this->name, GF_LOG_WARNING, "no read subvols for %s", + local->loc.path); - memcpy (lookup_bufs, sh->buf, priv->child_count * sizeof (*sh->buf)); - memcpy (lookup_parentbufs, sh->parentbufs, - priv->child_count * sizeof (*sh->parentbufs)); + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid || + local->replies[i].op_ret == -1) + continue; + read_subvol = i; + break; + } + } - afr_reset_xattr (local->cont.lookup.xattrs, priv->child_count); - if (local->cont.lookup.xattr) { - dict_unref (local->cont.lookup.xattr); - local->cont.lookup.xattr = NULL; - } +unwind: + if (read_subvol == -1) + read_subvol = 0; - for (i = 0; i < priv->child_count; i++) { - if (sh->xattr[i]) - local->cont.lookup.xattrs[i] = dict_ref (sh->xattr[i]); - } - - afr_reset_children (local->cont.lookup.success_children, - priv->child_count); - afr_children_copy (local->cont.lookup.success_children, - sh->fresh_children, priv->child_count); + AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, + local->inode, &local->replies[read_subvol].poststat, + local->replies[read_subvol].xdata, + &local->replies[read_subvol].postparent); } -static void -afr_lookup_perform_self_heal_if_needed (call_frame_t *frame, xlator_t *this, - gf_boolean_t *sh_launched) + +int +afr_discover_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, struct iatt *buf, + dict_t *xdata, struct iatt *postparent) { - size_t up_count = 0; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; + afr_local_t * local = NULL; + int call_count = -1; + int child_index = -1; - GF_ASSERT (sh_launched); - *sh_launched = _gf_false; - priv = this->private; - local = frame->local; + child_index = (long) cookie; - up_count = afr_up_children_count (priv->child_count, local->child_up); - if (up_count == 1) { - gf_log (this->name, GF_LOG_DEBUG, - "Only 1 child up - do not attempt to detect self heal"); - goto out; - } + local = frame->local; - afr_lookup_set_self_heal_data (local, this); - if (afr_can_self_heal_proceed (&local->self_heal, priv)) { - if (afr_is_self_heal_running (local)) - goto out; + local->replies[child_index].valid = 1; + local->replies[child_index].op_ret = op_ret; + local->replies[child_index].op_errno = op_errno; + if (op_ret != -1) { + local->replies[child_index].poststat = *buf; + local->replies[child_index].postparent = *postparent; + if (xdata) + local->replies[child_index].xdata = dict_ref (xdata); + } - afr_launch_self_heal (frame, this, local->cont.lookup.inode, - _gf_true, local->cont.lookup.buf.ia_type, - afr_post_gfid_sh_success, - afr_self_heal_lookup_unwind); - *sh_launched = _gf_true; - } -out: - return; -} + if (local->do_discovery && (op_ret == 0)) + afr_attempt_local_discovery (this, child_index); -void -afr_get_fresh_children (int32_t *success_children, int32_t *sources, - int32_t *fresh_children, unsigned int child_count) -{ - unsigned int i = 0; - unsigned int j = 0; - - GF_ASSERT (success_children); - GF_ASSERT (sources); - GF_ASSERT (fresh_children); - - afr_reset_children (fresh_children, child_count); - for (i = 0; i < child_count; i++) { - if (success_children[i] == -1) - break; - if (afr_is_read_child (success_children, sources, child_count, - success_children[i])) { - fresh_children[j] = success_children[i]; - j++; - } + call_count = afr_frame_return (frame); + if (call_count == 0) { + afr_discover_done (frame, this); } + + return 0; } -static int -afr_lookup_set_read_ctx (afr_local_t *local, xlator_t *this, int32_t read_child) + +int +afr_discover_do (call_frame_t *frame, xlator_t *this, int err) { - afr_private_t *priv = NULL; + int ret = 0; + int i = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = 0; - GF_ASSERT (read_child >= 0); + local = frame->local; + priv = this->private; - priv = this->private; - afr_get_fresh_children (local->cont.lookup.success_children, - local->cont.lookup.sources, - local->fresh_children, priv->child_count); - afr_inode_set_read_ctx (this, local->cont.lookup.inode, read_child, - local->fresh_children); + if (err) { + local->op_errno = -err; + ret = -1; + goto out; + } - return 0; -} + call_count = local->call_count = AFR_COUNT (local->child_up, + priv->child_count); -int -afr_lookup_done_success_action (call_frame_t *frame, xlator_t *this, - gf_boolean_t fail_conflict) -{ - int32_t read_child = -1; - int32_t ret = -1; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - - local = frame->local; - priv = this->private; - - if (local->loc.parent == NULL) - fail_conflict = _gf_true; - - if (afr_conflicting_iattrs (local->cont.lookup.bufs, - local->cont.lookup.success_children, - priv->child_count, local->loc.path, - this->name)) { - if (fail_conflict == _gf_false) { - ret = 0; - } else { - local->op_ret = -1; - local->op_errno = EIO; - } - goto out; - } - - ret = afr_lookup_select_read_child (local, this, &read_child); + ret = afr_lookup_xattr_req_prepare (local, this, local->xattr_req, + &local->loc); if (ret) { - local->op_ret = -1; - local->op_errno = EIO; + local->op_errno = -ret; + ret = -1; goto out; } - ret = afr_lookup_set_read_ctx (local, this, read_child); - if (ret) { - local->op_ret = -1; - local->op_errno = EIO; - goto out; - } - - afr_lookup_build_response_params (local, this); - if (afr_is_fresh_lookup (&local->loc, this)) { - afr_update_loc_gfids (&local->loc, - &local->cont.lookup.buf, - &local->cont.lookup.postparent); + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_discover_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->lookup, + &local->loc, local->xattr_req); + if (!--call_count) + break; + } } - ret = 0; + return 0; out: - return ret; + AFR_STACK_UNWIND (lookup, frame, -1, local->op_errno, 0, 0, 0, 0); + return 0; } -static void -afr_lookup_done (call_frame_t *frame, xlator_t *this) -{ - int unwind = 1; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - int ret = -1; - gf_boolean_t sh_launched = _gf_false; - int gfid_miss_count = 0; - int enotconn_count = 0; - int up_children_count = 0; - priv = this->private; - local = frame->local; - - if (local->op_ret < 0) - goto unwind; - gfid_miss_count = afr_lookup_gfid_missing_count (local, this); - up_children_count = afr_up_children_count (priv->child_count, - local->child_up); - enotconn_count = priv->child_count - up_children_count; - if ((gfid_miss_count == local->success_count) && - (enotconn_count > 0)) { - local->op_ret = -1; - local->op_errno = EIO; - gf_log (this->name, GF_LOG_ERROR, "Failing lookup for %s, " - "LOOKUP on a file without gfid is not allowed when " - "some of the children are down", local->loc.path); - goto unwind; - } +int +afr_discover (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req) +{ + int op_errno = ENOMEM; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int event = 0; - ret = afr_lookup_done_success_action (frame, this, _gf_false); - if (ret) - goto unwind; - uuid_copy (local->self_heal.sh_gfid_req, local->cont.lookup.gfid_req); + priv = this->private; - afr_lookup_perform_self_heal_if_needed (frame, this, &sh_launched); - if (sh_launched) { - unwind = 0; - goto unwind; - } + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; - unwind: - if (unwind) { - AFR_STACK_UNWIND (lookup, frame, local->op_ret, - local->op_errno, local->cont.lookup.inode, - &local->cont.lookup.buf, - local->cont.lookup.xattr, - &local->cont.lookup.postparent); + if (!local->call_count) { + op_errno = ENOTCONN; + goto out; } -} -/* - * During a lookup, some errors are more "important" than - * others in that they must be given higher priority while - * returning to the user. - * - * The hierarchy is ESTALE > ENOENT > others - * - */ - -gf_boolean_t -afr_error_more_important (int32_t old_errno, int32_t new_errno) -{ - gf_boolean_t ret = _gf_true; + if (__is_root_gfid (loc->inode->gfid)) { + if (!this->itable) + this->itable = loc->inode->table; + if (!priv->root_inode) + priv->root_inode = inode_ref (loc->inode); - /* Nothing should ever overwrite ESTALE */ - if (old_errno == ESTALE) - ret = _gf_false; - - /* Nothing should overwrite ENOENT, except ESTALE */ - else if ((old_errno == ENOENT) && (new_errno != ESTALE)) - ret = _gf_false; + if (priv->choose_local && !priv->did_discovery) { + /* Logic to detect which subvolumes of AFR are + local, in order to prefer them for reads + */ + local->do_discovery = _gf_true; + priv->did_discovery = _gf_true; + } + } - return ret; -} + local->op = GF_FOP_LOOKUP; -int32_t -afr_resultant_errno_get (int32_t *children, - int *child_errno, unsigned int child_count) -{ - int i = 0; - int32_t op_errno = 0; - int child = 0; + loc_copy (&local->loc, loc); - for (i = 0; i < child_count; i++) { - if (children) { - child = children[i]; - if (child == -1) - break; - } else { - child = i; - } - if (afr_error_more_important (op_errno, child_errno[child])) - op_errno = child_errno[child]; - } - return op_errno; -} + local->inode = inode_ref (loc->inode); -static void -afr_lookup_handle_error (afr_local_t *local, int32_t op_ret, int32_t op_errno) -{ - GF_ASSERT (local); - if (op_errno == ENOENT) - local->enoent_count++; + if (xattr_req) + /* If xattr_req was null, afr_lookup_xattr_req_prepare() will + allocate one for us */ + local->xattr_req = dict_ref (xattr_req); - if (afr_error_more_important (local->op_errno, op_errno)) - local->op_errno = op_errno; + if (uuid_is_null (loc->inode->gfid)) { + afr_discover_do (frame, this, 0); + return 0; + } - if (local->op_errno == ESTALE) { - local->op_ret = -1; - } -} + afr_read_subvol_get (loc->inode, this, NULL, &event, + AFR_DATA_TRANSACTION); -static void -afr_set_root_inode_on_first_lookup (afr_local_t *local, xlator_t *this, - inode_t *inode) -{ - afr_private_t *priv = NULL; - GF_ASSERT (inode); + if (event != local->event_generation) + afr_inode_refresh (frame, this, loc->inode, afr_discover_do); + else + afr_discover_do (frame, this, 0); - if (inode->ino != 1) - goto out; - if (!afr_is_fresh_lookup (&local->loc, this)) - goto out; - priv = this->private; - if ((priv->first_lookup)) { - gf_log (this->name, GF_LOG_INFO, "added root inode"); - priv->root_inode = inode_ref (inode); - priv->first_lookup = 0; - } + return 0; out: - return; + AFR_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL); + return 0; } -static void -afr_lookup_cache_args (afr_local_t *local, int child_index, dict_t *xattr, - struct iatt *buf, struct iatt *postparent) -{ - GF_ASSERT (child_index >= 0); - local->cont.lookup.xattrs[child_index] = dict_ref (xattr); - local->cont.lookup.postparents[child_index] = *postparent; - local->cont.lookup.bufs[child_index] = *buf; -} - -static void -afr_lookup_handle_first_success (afr_local_t *local, xlator_t *this, - inode_t *inode, struct iatt *buf) -{ - local->cont.lookup.inode = inode_ref (inode); - local->cont.lookup.buf = *buf; - afr_set_root_inode_on_first_lookup (local, this, inode); -} - -static void -afr_lookup_handle_success (afr_local_t *local, xlator_t *this, int32_t child_index, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, dict_t *xattr, - struct iatt *postparent) -{ - if (local->success_count == 0) { - if (local->op_errno != ESTALE) { - local->op_ret = op_ret; - local->op_errno = 0; - } - afr_lookup_handle_first_success (local, this, inode, buf); - } - afr_lookup_update_lk_counts (local, this, - child_index, xattr); - - afr_lookup_cache_args (local, child_index, xattr, - buf, postparent); - local->cont.lookup.success_children[local->success_count] = child_index; - local->success_count++; -} int -afr_lookup_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *buf, dict_t *xattr, - struct iatt *postparent) +afr_lookup_do (call_frame_t *frame, xlator_t *this, int err) { - afr_local_t * local = NULL; - int call_count = -1; - int child_index = -1; + int ret = 0; + int i = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = 0; - child_index = (long) cookie; + local = frame->local; + priv = this->private; - LOCK (&frame->lock); - { - local = frame->local; + if (err < 0) { + local->op_errno = -err; + ret = -1; + goto out; + } - if (op_ret == -1) { - afr_lookup_handle_error (local, op_ret, op_errno); - goto unlock; - } - afr_lookup_handle_success (local, this, child_index, op_ret, - op_errno, inode, buf, xattr, - postparent); - - } -unlock: - UNLOCK (&frame->lock); + call_count = local->call_count = AFR_COUNT (local->child_up, + priv->child_count); - call_count = afr_frame_return (frame); - if (call_count == 0) { - afr_lookup_done (frame, this); + ret = afr_lookup_xattr_req_prepare (local, this, local->xattr_req, + &local->loc); + if (ret) { + local->op_errno = -ret; + ret = -1; + goto out; } - return 0; + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_lookup_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->lookup, + &local->loc, local->xattr_req); + if (!--call_count) + break; + } + } + return 0; +out: + AFR_STACK_UNWIND (lookup, frame, -1, local->op_errno, 0, 0, 0, 0); + return 0; } +/* + * afr_lookup() + * + * The goal here is to figure out what the element getting looked up is. + * i.e what is the GFID, inode type and a conservative estimate of the + * inode attributes are. + * + * As we lookup, operations may be underway on the entry name and the + * inode. In lookup() we are primarily concerned only with the entry + * operations. If the entry is getting unlinked or renamed, we detect + * what operation is underway by querying for on-going transactions and + * pending self-healing on the entry through xdata. + * + * If the entry is a file/dir, it may need self-heal and/or in a + * split-brain condition. Lookup is not the place to worry about these + * conditions. Outcast marking will naturally handle them in the read + * paths. + * + * Here is a brief goal of what we are trying to achieve: + * + * - LOOKUP on all subvolumes concurrently, querying on-going transaction + * and pending self-heal info from the servers. + * + * - If all servers reply the same inode type and GFID, the overall call + * MUST be a success. + * + * - If inode types or GFIDs mismatch, and there IS either an on-going + * transaction or pending self-heal, inspect what the nature of the + * transaction or pending heal is, and select the appropriate subvolume's + * reply as the winner. + * + * - If inode types or GFIDs mismatch, and there are no on-going transactions + * or pending self-heal on the entry name on any of the servers, fail the + * lookup with EIO. Something has gone wrong beyond reasonable action. + */ + int -afr_lookup_cont_init (afr_local_t *local, unsigned int child_count) +afr_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req) { - int ret = -ENOMEM; - struct iatt *iatts = NULL; - int32_t *success_children = NULL; + afr_local_t *local = NULL; + int32_t op_errno = 0; + int event = 0; - GF_ASSERT (local); - local->cont.lookup.xattrs = GF_CALLOC (child_count, - sizeof (*local->cont.lookup.xattr), - gf_afr_mt_dict_t); - if (NULL == local->cont.lookup.xattrs) - goto out; + if (!loc->parent) { + afr_discover (frame, this, loc, xattr_req); + return 0; + } - iatts = GF_CALLOC (child_count, sizeof (*iatts), gf_afr_mt_iatt); - if (NULL == iatts) - goto out; - local->cont.lookup.postparents = iatts; + if (__is_root_gfid (loc->parent->gfid)) { + if (!strcmp (loc->name, GF_REPLICATE_TRASH_DIR)) { + op_errno = EPERM; + goto out; + } + } - iatts = GF_CALLOC (child_count, sizeof (*iatts), gf_afr_mt_iatt); - if (NULL == iatts) - goto out; - local->cont.lookup.bufs = iatts; - - success_children = afr_fresh_children_create (child_count); - if (NULL == success_children) - goto out; - local->cont.lookup.success_children = success_children; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; - local->fresh_children = afr_fresh_children_create (child_count); - if (NULL == local->fresh_children) + if (!local->call_count) { + op_errno = ENOTCONN; goto out; + } - ret = 0; -out: - return ret; -} + local->op = GF_FOP_LOOKUP; -int -afr_lookup (call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *xattr_req) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - void *gfid_req = NULL; - int ret = -1; - int i = 0; - int call_count = 0; - uint64_t ctx = 0; - int32_t op_errno = 0; + loc_copy (&local->loc, loc); - priv = this->private; + local->inode = inode_ref (loc->inode); - ALLOC_OR_GOTO (local, afr_local_t, out); + if (xattr_req) + /* If xattr_req was null, afr_lookup_xattr_req_prepare() will + allocate one for us */ + local->xattr_req = dict_ref (xattr_req); - local->op_ret = -1; + afr_read_subvol_get (loc->parent, this, NULL, &event, + AFR_DATA_TRANSACTION); - frame->local = local; + if (event != local->event_generation) + afr_inode_refresh (frame, this, loc->parent, afr_lookup_do); + else + afr_lookup_do (frame, this, 0); - if (!strcmp (loc->path, "/" GF_REPLICATE_TRASH_DIR)) { - op_errno = ENOENT; - goto out; - } + return 0; +out: + AFR_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL); - loc_copy (&local->loc, loc); + return 0; +} - ret = inode_ctx_get (loc->inode, this, &ctx); - if (ret == 0) { - /* lookup is a revalidate */ - local->read_child_index = afr_inode_get_read_ctx (this, - loc->inode, - NULL); - } else { - LOCK (&priv->read_child_lock); - { - local->read_child_index = (++priv->read_child_rr) - % (priv->child_count); - } - UNLOCK (&priv->read_child_lock); - } +/* {{{ open */ - if (loc->parent) - local->cont.lookup.parent_ino = loc->parent->ino; +afr_fd_ctx_t * +__afr_fd_ctx_get (fd_t *fd, xlator_t *this) +{ + uint64_t ctx = 0; + int ret = 0; + afr_fd_ctx_t *fd_ctx = NULL; - local->child_up = memdup (priv->child_up, priv->child_count); - if (NULL == local->child_up) { - op_errno = ENOMEM; - goto out; - } + ret = __fd_ctx_get (fd, this, &ctx); - ret = afr_lookup_cont_init (local, priv->child_count); if (ret < 0) { - op_errno = -ret; - goto out; - } - - local->call_count = afr_up_children_count (priv->child_count, - local->child_up); - call_count = local->call_count; + ret = __afr_fd_ctx_set (this, fd); + if (ret < 0) + goto out; - if (local->call_count == 0) { - ret = -1; - op_errno = ENOTCONN; - goto out; + ret = __fd_ctx_get (fd, this, &ctx); + if (ret < 0) + goto out; } - /* By default assume ENOTCONN. On success it will be set to 0. */ - local->op_errno = ENOTCONN; + fd_ctx = (afr_fd_ctx_t *)(long) ctx; +out: + return fd_ctx; +} - if (xattr_req == NULL) - local->xattr_req = dict_new (); - else - local->xattr_req = dict_ref (xattr_req); - afr_xattr_req_prepare (this, local->xattr_req, loc->path); - ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_INODELK_COUNT, 0); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "%s: Unable to set dict value for %s", - loc->path, GLUSTERFS_INODELK_COUNT); - } - ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_ENTRYLK_COUNT, 0); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "%s: Unable to set dict value for %s", - loc->path, GLUSTERFS_ENTRYLK_COUNT); - } - - ret = dict_get_ptr (xattr_req, "gfid-req", &gfid_req); - if (ret) { - gf_log (this->name, GF_LOG_DEBUG, - "failed to get the gfid from dict"); - } else { - uuid_copy (local->cont.lookup.gfid_req, gfid_req); - } - if (local->loc.parent != NULL) - dict_del (xattr_req, "gfid-req"); +afr_fd_ctx_t * +afr_fd_ctx_get (fd_t *fd, xlator_t *this) +{ + afr_fd_ctx_t *fd_ctx = NULL; - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_lookup_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->lookup, - loc, local->xattr_req); - if (!--call_count) - break; - } + LOCK(&fd->lock); + { + fd_ctx = __afr_fd_ctx_get (fd, this); } + UNLOCK(&fd->lock); - ret = 0; -out: - if (ret == -1) - AFR_STACK_UNWIND (lookup, frame, -1, op_errno, - NULL, NULL, NULL, NULL); - - return 0; + return fd_ctx; } -/* {{{ open */ - int -afr_fd_ctx_set (xlator_t *this, fd_t *fd) +__afr_fd_ctx_set (xlator_t *this, fd_t *fd) { afr_private_t * priv = NULL; int ret = -1; uint64_t ctx = 0; afr_fd_ctx_t * fd_ctx = NULL; + int i = 0; VALIDATE_OR_GOTO (this->private, out); VALIDATE_OR_GOTO (fd, out); priv = this->private; - LOCK (&fd->lock); - { - ret = __fd_ctx_get (fd, this, &ctx); + ret = __fd_ctx_get (fd, this, &ctx); - if (ret == 0) - goto unlock; + if (ret == 0) + goto out; - fd_ctx = GF_CALLOC (1, sizeof (afr_fd_ctx_t), - gf_afr_mt_afr_fd_ctx_t); - if (!fd_ctx) { - ret = -ENOMEM; - goto unlock; - } + fd_ctx = GF_CALLOC (1, sizeof (afr_fd_ctx_t), + gf_afr_mt_afr_fd_ctx_t); + if (!fd_ctx) { + ret = -ENOMEM; + goto out; + } - fd_ctx->pre_op_done = GF_CALLOC (sizeof (*fd_ctx->pre_op_done), - priv->child_count, - gf_afr_mt_char); - if (!fd_ctx->pre_op_done) { - ret = -ENOMEM; - goto unlock; - } + for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) { + fd_ctx->pre_op_done[i] = GF_CALLOC (sizeof (*fd_ctx->pre_op_done[i]), + priv->child_count, + gf_afr_mt_int32_t); + if (!fd_ctx->pre_op_done[i]) { + ret = -ENOMEM; + goto out; + } + } + + fd_ctx->opened_on = GF_CALLOC (sizeof (*fd_ctx->opened_on), + priv->child_count, + gf_afr_mt_int32_t); + if (!fd_ctx->opened_on) { + ret = -ENOMEM; + goto out; + } - fd_ctx->pre_op_piggyback = GF_CALLOC (sizeof (*fd_ctx->pre_op_piggyback), - priv->child_count, - gf_afr_mt_char); - if (!fd_ctx->pre_op_piggyback) { - ret = -ENOMEM; - goto unlock; - } + for (i = 0; i < priv->child_count; i++) { + if (fd_is_anonymous (fd)) + fd_ctx->opened_on[i] = AFR_FD_OPENED; + else + fd_ctx->opened_on[i] = AFR_FD_NOT_OPENED; + } - fd_ctx->opened_on = GF_CALLOC (sizeof (*fd_ctx->opened_on), - priv->child_count, - gf_afr_mt_char); - if (!fd_ctx->opened_on) { - ret = -ENOMEM; - goto unlock; - } + fd_ctx->lock_piggyback = GF_CALLOC (sizeof (*fd_ctx->lock_piggyback), + priv->child_count, + gf_afr_mt_char); + if (!fd_ctx->lock_piggyback) { + ret = -ENOMEM; + goto out; + } - fd_ctx->up_count = priv->up_count; - fd_ctx->down_count = priv->down_count; + fd_ctx->lock_acquired = GF_CALLOC (sizeof (*fd_ctx->lock_acquired), + priv->child_count, + gf_afr_mt_char); + if (!fd_ctx->lock_acquired) { + ret = -ENOMEM; + goto out; + } - fd_ctx->locked_on = GF_CALLOC (sizeof (*fd_ctx->locked_on), - priv->child_count, - gf_afr_mt_char); - if (!fd_ctx->locked_on) { - ret = -ENOMEM; - goto unlock; - } + pthread_mutex_init (&fd_ctx->delay_lock, NULL); - ret = __fd_ctx_set (fd, this, (uint64_t)(long) fd_ctx); - if (ret) - gf_log (this->name, GF_LOG_DEBUG, - "failed to set fd ctx (%p)", fd); + INIT_LIST_HEAD (&fd_ctx->eager_locked); - INIT_LIST_HEAD (&fd_ctx->entries); - } -unlock: - UNLOCK (&fd->lock); + ret = __fd_ctx_set (fd, this, (uint64_t)(long) fd_ctx); + if (ret) + gf_log (this->name, GF_LOG_DEBUG, + "failed to set fd ctx (%p)", fd); out: return ret; } -/* {{{ flush */ int -afr_flush_unwind (call_frame_t *frame, xlator_t *this) +afr_fd_ctx_set (xlator_t *this, fd_t *fd) { - afr_local_t * local = NULL; - call_frame_t *main_frame = NULL; - - local = frame->local; + int ret = -1; - LOCK (&frame->lock); + LOCK (&fd->lock); { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - AFR_STACK_UNWIND (flush, main_frame, - local->op_ret, local->op_errno); + ret = __afr_fd_ctx_set (this, fd); } + UNLOCK (&fd->lock); - return 0; + return ret; } +/* {{{ flush */ int -afr_flush_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) +afr_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - int call_count = -1; - int child_index = (long) cookie; - int need_unwind = 0; + afr_local_t *local = NULL; + int call_count = -1; local = frame->local; - priv = this->private; LOCK (&frame->lock); { - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - } - local->success_count++; - - if (local->success_count == priv->wait_count) { - need_unwind = 1; - } - } - - local->op_errno = op_errno; + local->op_ret = op_ret; + if (!local->xdata_rsp && xdata) + local->xdata_rsp = dict_ref (xdata); + } else { + local->op_errno = op_errno; + } } UNLOCK (&frame->lock); - if (need_unwind) - afr_flush_unwind (frame, this); + call_count = afr_frame_return (frame); - call_count = afr_frame_return (frame); - - if (call_count == 0) { - local->transaction.resume (frame, this); - } + if (call_count == 0) + AFR_STACK_UNWIND (flush, frame, local->op_ret, + local->op_errno, local->xdata_rsp); return 0; } - -int -afr_flush_wind (call_frame_t *frame, xlator_t *this) +static int +afr_flush_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int i = 0; - int call_count = -1; + int i = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; - local = frame->local; priv = this->private; - - call_count = afr_up_children_count (priv->child_count, local->child_up); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; + local = frame->local; + call_count = local->call_count; for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_flush_wind_cbk, + STACK_WIND_COOKIE (frame, afr_flush_cbk, (void *) (long) i, priv->children[i], priv->children[i]->fops->flush, - local->fd); - + local->fd, xdata); if (!--call_count) break; + } } return 0; } - int -afr_flush_done (call_frame_t *frame, xlator_t *this) +afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { - afr_local_t *local = NULL; + afr_local_t *local = NULL; + call_stub_t *stub = NULL; + int op_errno = ENOMEM; - local = frame->local; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; - local->transaction.unwind (frame, this); + if (!local->call_count) { + op_errno = ENOTCONN; + goto out; + } - AFR_STACK_DESTROY (frame); + local->fd = fd_ref(fd); - return 0; -} - - -int -afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t * transaction_frame = NULL; - int ret = -1; - int op_ret = -1; - int op_errno = 0; - int call_count = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - - ALLOC_OR_GOTO (local, afr_local_t, out); - - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + stub = fop_flush_stub (frame, afr_flush_wrapper, fd, xdata); + if (!stub) goto out; - } - - call_count = afr_up_children_count (priv->child_count, local->child_up); - - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - op_errno = ENOMEM; - goto out; - } - transaction_frame->local = local; + afr_delayed_changelog_wake_resume (this, fd, stub); - local->op = GF_FOP_FLUSH; - - local->transaction.fop = afr_flush_wind; - local->transaction.done = afr_flush_done; - local->transaction.unwind = afr_flush_unwind; - - local->fd = fd_ref (fd); - - local->transaction.main_frame = frame; - local->transaction.start = 0; - local->transaction.len = 0; - - afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); - - - op_ret = 0; + return 0; out: - if (op_ret == -1) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - - AFR_STACK_UNWIND (flush, frame, op_ret, op_errno); - } - + AFR_STACK_UNWIND (flush, frame, -1, op_errno, NULL); return 0; } @@ -2148,6 +2110,7 @@ afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd) uint64_t ctx = 0; afr_fd_ctx_t *fd_ctx = NULL; int ret = 0; + int i = 0; ret = fd_ctx_get (fd, this, &ctx); if (ret < 0) @@ -2156,17 +2119,22 @@ afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd) fd_ctx = (afr_fd_ctx_t *)(long) ctx; if (fd_ctx) { - if (fd_ctx->pre_op_done) - GF_FREE (fd_ctx->pre_op_done); + //no need to take any locks + if (!list_empty (&fd_ctx->eager_locked)) + gf_log (this->name, GF_LOG_WARNING, "%s: Stale " + "Eager-lock stubs found", + uuid_utoa (fd->inode->gfid)); + + for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) + GF_FREE (fd_ctx->pre_op_done[i]); + + GF_FREE (fd_ctx->opened_on); - if (fd_ctx->opened_on) - GF_FREE (fd_ctx->opened_on); + GF_FREE (fd_ctx->lock_piggyback); - if (fd_ctx->locked_on) - GF_FREE (fd_ctx->locked_on); + GF_FREE (fd_ctx->lock_acquired); - if (fd_ctx->pre_op_piggyback) - GF_FREE (fd_ctx->pre_op_piggyback); + pthread_mutex_destroy (&fd_ctx->delay_lock); GF_FREE (fd_ctx); } @@ -2179,24 +2147,8 @@ out: int afr_release (xlator_t *this, fd_t *fd) { - afr_locked_fd_t *locked_fd = NULL; - afr_locked_fd_t *tmp = NULL; - afr_private_t *priv = NULL; - - priv = this->private; - afr_cleanup_fd_ctx (this, fd); - list_for_each_entry_safe (locked_fd, tmp, &priv->saved_fds, - list) { - - if (locked_fd->fd == fd) { - list_del_init (&locked_fd->list); - GF_FREE (locked_fd); - } - - } - return 0; } @@ -2204,51 +2156,87 @@ afr_release (xlator_t *this, fd_t *fd) /* {{{ fsync */ int +afr_fsync_unwind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + AFR_STACK_UNWIND (fsync, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + return 0; +} + +int afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf) + struct iatt *postbuf, dict_t *xdata) { afr_local_t *local = NULL; int call_count = -1; int child_index = (long) cookie; - int read_child = 0; + int read_subvol = 0; + call_stub_t *stub = NULL; local = frame->local; - read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); + read_subvol = afr_data_subvol_get (local->inode, this, 0, 0); LOCK (&frame->lock); { - if (child_index == read_child) { - local->read_child_returned = _gf_true; - } - if (op_ret == 0) { - local->op_ret = 0; + if (local->op_ret == -1) { + local->op_ret = 0; - if (local->success_count == 0) { - local->cont.fsync.prebuf = *prebuf; - local->cont.fsync.postbuf = *postbuf; - } + local->cont.inode_wfop.prebuf = *prebuf; + local->cont.inode_wfop.postbuf = *postbuf; - if (child_index == read_child) { - local->cont.fsync.prebuf = *prebuf; - local->cont.fsync.postbuf = *postbuf; + if (xdata) + local->xdata_rsp = dict_ref (xdata); } - local->success_count++; - } - - local->op_errno = op_errno; + if (child_index == read_subvol) { + local->cont.inode_wfop.prebuf = *prebuf; + local->cont.inode_wfop.postbuf = *postbuf; + if (xdata) { + if (local->xdata_rsp) + dict_unref (local->xdata_rsp); + local->xdata_rsp = dict_ref (xdata); + } + } + } else { + local->op_errno = op_errno; + } } UNLOCK (&frame->lock); call_count = afr_frame_return (frame); if (call_count == 0) { - AFR_STACK_UNWIND (fsync, frame, local->op_ret, local->op_errno, - &local->cont.fsync.prebuf, - &local->cont.fsync.postbuf); + /* Make a stub out of the frame, and register it + with the waking up post-op. When the call-stub resumes, + we are guaranteed that there was no post-op pending + (i.e changelogs were unset in the server). This is an + essential "guarantee", that fsync() returns only after + completely finishing EVERYTHING, including the delayed + post-op. This guarantee is expected by FUSE graph switching + for example. + */ + stub = fop_fsync_cbk_stub (frame, afr_fsync_unwind_cbk, + local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, + local->xdata_rsp); + if (!stub) { + AFR_STACK_UNWIND (fsync, frame, -1, ENOMEM, 0, 0, 0); + return 0; + } + + /* If no new unstable writes happened between the + time we cleared the unstable write witness flag in afr_fsync + and now, calling afr_delayed_changelog_wake_up() should + wake up and skip over the fsync phase and go straight to + afr_changelog_post_op_now() + */ + afr_delayed_changelog_wake_resume (this, local->fd, stub); } return 0; @@ -2256,36 +2244,34 @@ afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int -afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, - int32_t datasync) +afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, + dict_t *xdata) { - afr_private_t *priv = NULL; + afr_private_t *priv = NULL; afr_local_t *local = NULL; - int ret = -1; int i = 0; int32_t call_count = 0; - int32_t op_ret = -1; - int32_t op_errno = 0; + int32_t op_errno = ENOMEM; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + priv = this->private; - priv = this->private; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; - ALLOC_OR_GOTO (local, afr_local_t, out); + call_count = local->call_count; + if (!call_count) { + op_errno = ENOTCONN; + goto out; + } - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } + local->fd = fd_ref (fd); - call_count = local->call_count; - frame->local = local; + if (afr_fd_has_witnessed_unstable_write (this, fd)) { + /* don't care. we only wanted to CLEAR the bit */ + } - local->fd = fd_ref (fd); - local->cont.fsync.ino = fd->inode->ino; + local->inode = inode_ref (fd->inode); for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { @@ -2293,17 +2279,16 @@ afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, (void *) (long) i, priv->children[i], priv->children[i]->fops->fsync, - fd, datasync); + fd, datasync, xdata); if (!--call_count) break; } } - op_ret = 0; + return 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (fsync, frame, op_ret, op_errno, NULL, NULL); - } + AFR_STACK_UNWIND (fsync, frame, -1, op_errno, NULL, NULL, NULL); + return 0; } @@ -2311,9 +2296,9 @@ out: /* {{{ fsync */ -int32_t -afr_fsyncdir_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) +int +afr_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { afr_local_t *local = NULL; int call_count = -1; @@ -2322,10 +2307,13 @@ afr_fsyncdir_cbk (call_frame_t *frame, void *cookie, LOCK (&frame->lock); { - if (op_ret == 0) + if (op_ret == 0) { local->op_ret = 0; - - local->op_errno = op_errno; + if (!local->xdata_rsp && xdata) + local->xdata_rsp = dict_ref (xdata); + } else { + local->op_errno = op_errno; + } } UNLOCK (&frame->lock); @@ -2333,57 +2321,49 @@ afr_fsyncdir_cbk (call_frame_t *frame, void *cookie, if (call_count == 0) AFR_STACK_UNWIND (fsyncdir, frame, local->op_ret, - local->op_errno); + local->op_errno, local->xdata_rsp); return 0; } -int32_t -afr_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, - int32_t datasync) +int +afr_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, + dict_t *xdata) { - afr_private_t *priv = NULL; + afr_private_t *priv = NULL; afr_local_t *local = NULL; - int ret = -1; int i = 0; int32_t call_count = 0; - int32_t op_ret = -1; - int32_t op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; + int32_t op_errno = ENOMEM; - ALLOC_OR_GOTO (local, afr_local_t, out); + priv = this->private; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; call_count = local->call_count; - frame->local = local; + if (!call_count) { + op_errno = ENOTCONN; + goto out; + } for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { STACK_WIND (frame, afr_fsyncdir_cbk, priv->children[i], priv->children[i]->fops->fsyncdir, - fd, datasync); + fd, datasync, xdata); if (!--call_count) break; } } - op_ret = 0; + return 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (fsyncdir, frame, op_ret, op_errno); - } + AFR_STACK_UNWIND (fsyncdir, frame, -1, op_errno, NULL); + return 0; } @@ -2394,7 +2374,7 @@ out: int32_t afr_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *xattr) + dict_t *xattr, dict_t *xdata) { afr_local_t *local = NULL; int call_count = -1; @@ -2403,8 +2383,15 @@ afr_xattrop_cbk (call_frame_t *frame, void *cookie, LOCK (&frame->lock); { - if (op_ret == 0) + if (op_ret == 0) { + if (!local->cont.xattrop.xattr) + local->cont.xattrop.xattr = dict_ref (xattr); + + if (!local->xdata_rsp && xdata) + local->xdata_rsp = dict_ref (xdata); + local->op_ret = 0; + } local->op_errno = op_errno; } @@ -2414,7 +2401,7 @@ afr_xattrop_cbk (call_frame_t *frame, void *cookie, if (call_count == 0) AFR_STACK_UNWIND (xattrop, frame, local->op_ret, local->op_errno, - xattr); + local->cont.xattrop.xattr, local->xdata_rsp); return 0; } @@ -2422,49 +2409,41 @@ afr_xattrop_cbk (call_frame_t *frame, void *cookie, int32_t afr_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, - gf_xattrop_flags_t optype, dict_t *xattr) + gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; - int ret = -1; int i = 0; int32_t call_count = 0; - int32_t op_ret = -1; - int32_t op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + int32_t op_errno = ENOMEM; priv = this->private; - ALLOC_OR_GOTO (local, afr_local_t, out); - - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; call_count = local->call_count; - frame->local = local; + if (!call_count) { + op_errno = ENOTCONN; + goto out; + } for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { STACK_WIND (frame, afr_xattrop_cbk, priv->children[i], priv->children[i]->fops->xattrop, - loc, optype, xattr); + loc, optype, xattr, xdata); if (!--call_count) break; } } - op_ret = 0; + return 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (xattrop, frame, op_ret, op_errno, NULL); - } + AFR_STACK_UNWIND (xattrop, frame, -1, op_errno, NULL, NULL); + return 0; } @@ -2475,7 +2454,7 @@ out: int32_t afr_fxattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *xattr) + dict_t *xattr, dict_t *xdata) { afr_local_t *local = NULL; @@ -2485,8 +2464,14 @@ afr_fxattrop_cbk (call_frame_t *frame, void *cookie, LOCK (&frame->lock); { - if (op_ret == 0) + if (op_ret == 0) { + if (!local->cont.fxattrop.xattr) + local->cont.fxattrop.xattr = dict_ref (xattr); + + if (!local->xdata_rsp && xdata) + local->xdata_rsp = dict_ref (xdata); local->op_ret = 0; + } local->op_errno = op_errno; } @@ -2496,7 +2481,7 @@ afr_fxattrop_cbk (call_frame_t *frame, void *cookie, if (call_count == 0) AFR_STACK_UNWIND (fxattrop, frame, local->op_ret, local->op_errno, - xattr); + local->cont.fxattrop.xattr, local->xdata_rsp); return 0; } @@ -2504,49 +2489,41 @@ afr_fxattrop_cbk (call_frame_t *frame, void *cookie, int32_t afr_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd, - gf_xattrop_flags_t optype, dict_t *xattr) + gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; - int ret = -1; int i = 0; int32_t call_count = 0; - int32_t op_ret = -1; int32_t op_errno = 0; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - priv = this->private; - ALLOC_OR_GOTO (local, afr_local_t, out); - - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) goto out; - } call_count = local->call_count; - frame->local = local; + if (!call_count) { + op_errno = ENOTCONN; + goto out; + } for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { STACK_WIND (frame, afr_fxattrop_cbk, priv->children[i], priv->children[i]->fops->fxattrop, - fd, optype, xattr); + fd, optype, xattr, xdata); if (!--call_count) break; } } - op_ret = 0; + return 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (fxattrop, frame, op_ret, op_errno, NULL); - } + AFR_STACK_UNWIND (fxattrop, frame, -1, op_errno, NULL, NULL); + return 0; } @@ -2554,8 +2531,8 @@ out: int32_t -afr_inodelk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) +afr_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { afr_local_t *local = NULL; @@ -2576,7 +2553,7 @@ afr_inodelk_cbk (call_frame_t *frame, void *cookie, if (call_count == 0) AFR_STACK_UNWIND (inodelk, frame, local->op_ret, - local->op_errno); + local->op_errno, xdata); return 0; } @@ -2584,57 +2561,50 @@ afr_inodelk_cbk (call_frame_t *frame, void *cookie, int32_t afr_inodelk (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, int32_t cmd, struct gf_flock *flock) + const char *volume, loc_t *loc, int32_t cmd, + struct gf_flock *flock, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; - int ret = -1; int i = 0; int32_t call_count = 0; - int32_t op_ret = -1; - int32_t op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + int32_t op_errno = ENOMEM; priv = this->private; - ALLOC_OR_GOTO (local, afr_local_t, out); - - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) goto out; - } call_count = local->call_count; - frame->local = local; + if (!call_count) { + op_errno = ENOMEM; + goto out; + } for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { STACK_WIND (frame, afr_inodelk_cbk, priv->children[i], priv->children[i]->fops->inodelk, - volume, loc, cmd, flock); + volume, loc, cmd, flock, xdata); if (!--call_count) break; } } - op_ret = 0; + return 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (inodelk, frame, op_ret, op_errno); - } + AFR_STACK_UNWIND (inodelk, frame, -1, op_errno, NULL); + return 0; } int32_t -afr_finodelk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) +afr_finodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { afr_local_t *local = NULL; @@ -2655,66 +2625,57 @@ afr_finodelk_cbk (call_frame_t *frame, void *cookie, if (call_count == 0) AFR_STACK_UNWIND (finodelk, frame, local->op_ret, - local->op_errno); + local->op_errno, xdata); return 0; } int32_t -afr_finodelk (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, int32_t cmd, struct gf_flock *flock) +afr_finodelk (call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, + int32_t cmd, struct gf_flock *flock, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; - int ret = -1; int i = 0; int32_t call_count = 0; - int32_t op_ret = -1; - int32_t op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + int32_t op_errno = ENOMEM; priv = this->private; - ALLOC_OR_GOTO (local, afr_local_t, out); - - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; call_count = local->call_count; - frame->local = local; + if (!call_count) { + op_errno = ENOTCONN; + goto out; + } for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { STACK_WIND (frame, afr_finodelk_cbk, priv->children[i], priv->children[i]->fops->finodelk, - volume, fd, cmd, flock); + volume, fd, cmd, flock, xdata); if (!--call_count) break; } } - op_ret = 0; + return 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (finodelk, frame, op_ret, op_errno); - } + AFR_STACK_UNWIND (finodelk, frame, -1, op_errno, NULL); + return 0; } int32_t -afr_entrylk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) - +afr_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { afr_local_t *local = NULL; int call_count = -1; @@ -2734,67 +2695,59 @@ afr_entrylk_cbk (call_frame_t *frame, void *cookie, if (call_count == 0) AFR_STACK_UNWIND (entrylk, frame, local->op_ret, - local->op_errno); + local->op_errno, xdata); return 0; } -int32_t -afr_entrylk (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, - const char *basename, entrylk_cmd cmd, entrylk_type type) +int +afr_entrylk (call_frame_t *frame, xlator_t *this, const char *volume, + loc_t *loc, const char *basename, entrylk_cmd cmd, + entrylk_type type, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; - int ret = -1; int i = 0; int32_t call_count = 0; - int32_t op_ret = -1; int32_t op_errno = 0; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - priv = this->private; - ALLOC_OR_GOTO (local, afr_local_t, out); - - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; call_count = local->call_count; - frame->local = local; + if (!call_count) { + op_errno = ENOTCONN; + goto out; + } for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { STACK_WIND (frame, afr_entrylk_cbk, priv->children[i], priv->children[i]->fops->entrylk, - volume, loc, basename, cmd, type); + volume, loc, basename, cmd, type, xdata); if (!--call_count) break; } } - op_ret = 0; + return 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (entrylk, frame, op_ret, op_errno); - } + AFR_STACK_UNWIND (entrylk, frame, -1, op_errno, NULL); + return 0; } -int32_t -afr_fentrylk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) +int +afr_fentrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { afr_local_t *local = NULL; @@ -2815,156 +2768,148 @@ afr_fentrylk_cbk (call_frame_t *frame, void *cookie, if (call_count == 0) AFR_STACK_UNWIND (fentrylk, frame, local->op_ret, - local->op_errno); + local->op_errno, xdata); return 0; } -int32_t -afr_fentrylk (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, - const char *basename, entrylk_cmd cmd, entrylk_type type) +int +afr_fentrylk (call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, + const char *basename, entrylk_cmd cmd, entrylk_type type, + dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; - int ret = -1; int i = 0; int32_t call_count = 0; - int32_t op_ret = -1; - int32_t op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + int32_t op_errno = ENOMEM; priv = this->private; - ALLOC_OR_GOTO (local, afr_local_t, out); - - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; call_count = local->call_count; - frame->local = local; + if (!call_count) { + op_errno = ENOTCONN; + goto out; + } for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { STACK_WIND (frame, afr_fentrylk_cbk, priv->children[i], priv->children[i]->fops->fentrylk, - volume, fd, basename, cmd, type); + volume, fd, basename, cmd, type, xdata); if (!--call_count) break; } } - op_ret = 0; + return 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (fentrylk, frame, op_ret, op_errno); - } + AFR_STACK_UNWIND (fentrylk, frame, -1, op_errno, NULL); + return 0; } -int32_t -afr_statfs_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - struct statvfs *statvfs) + +int +afr_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct statvfs *statvfs, dict_t *xdata) { afr_local_t *local = NULL; int call_count = 0; + struct statvfs *buf = NULL; LOCK (&frame->lock); { local = frame->local; - if (op_ret == 0) { - local->op_ret = op_ret; - - if (local->cont.statfs.buf_set) { - if (statvfs->f_bavail < local->cont.statfs.buf.f_bavail) - local->cont.statfs.buf = *statvfs; - } else { - local->cont.statfs.buf = *statvfs; - local->cont.statfs.buf_set = 1; - } - } - - if (op_ret == -1) + if (op_ret != 0) { local->op_errno = op_errno; - + goto unlock; + } + + local->op_ret = op_ret; + + buf = &local->cont.statfs.buf; + if (local->cont.statfs.buf_set) { + if (statvfs->f_bavail < buf->f_bavail) { + *buf = *statvfs; + if (xdata) { + if (local->xdata_rsp) + dict_unref (local->xdata_rsp); + local->xdata_rsp = dict_ref (xdata); + } + } + } else { + *buf = *statvfs; + local->cont.statfs.buf_set = 1; + if (xdata) + local->xdata_rsp = dict_ref (xdata); + } } +unlock: UNLOCK (&frame->lock); call_count = afr_frame_return (frame); if (call_count == 0) AFR_STACK_UNWIND (statfs, frame, local->op_ret, local->op_errno, - &local->cont.statfs.buf); + &local->cont.statfs.buf, local->xdata_rsp); return 0; } -int32_t -afr_statfs (call_frame_t *frame, xlator_t *this, - loc_t *loc) +int +afr_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { - afr_private_t * priv = NULL; - int child_count = 0; afr_local_t * local = NULL; + afr_private_t *priv = NULL; int i = 0; - int ret = -1; int call_count = 0; - int32_t op_ret = -1; - int32_t op_errno = 0; + int32_t op_errno = ENOMEM; - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - VALIDATE_OR_GOTO (loc, out); + priv = this->private; - priv = this->private; - child_count = priv->child_count; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; - ALLOC_OR_GOTO (local, afr_local_t, out); - - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } - - frame->local = local; call_count = local->call_count; + if (!call_count) { + op_errno = ENOTCONN; + goto out; + } - for (i = 0; i < child_count; i++) { + for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { STACK_WIND (frame, afr_statfs_cbk, priv->children[i], priv->children[i]->fops->statfs, - loc); + loc, xdata); if (!--call_count) break; } } - op_ret = 0; + return 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (statfs, frame, op_ret, op_errno, NULL); - } + AFR_STACK_UNWIND (statfs, frame, -1, op_errno, NULL, NULL); + return 0; } int32_t afr_lk_unlock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct gf_flock *lock) + int32_t op_ret, int32_t op_errno, struct gf_flock *lock, + dict_t *xdata) { afr_local_t * local = NULL; int call_count = -1; @@ -2974,7 +2919,7 @@ afr_lk_unlock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (call_count == 0) AFR_STACK_UNWIND (lk, frame, local->op_ret, local->op_errno, - lock); + lock, xdata); return 0; } @@ -2996,7 +2941,7 @@ afr_lk_unlock (call_frame_t *frame, xlator_t *this) if (call_count == 0) { AFR_STACK_UNWIND (lk, frame, local->op_ret, local->op_errno, - &local->cont.lk.ret_flock); + &local->cont.lk.ret_flock, NULL); return 0; } @@ -3010,7 +2955,7 @@ afr_lk_unlock (call_frame_t *frame, xlator_t *this) priv->children[i], priv->children[i]->fops->lk, local->fd, F_SETLK, - &local->cont.lk.user_flock); + &local->cont.lk.user_flock, NULL); if (!--call_count) break; @@ -3023,7 +2968,7 @@ afr_lk_unlock (call_frame_t *frame, xlator_t *this) int32_t afr_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct gf_flock *lock) + int32_t op_ret, int32_t op_errno, struct gf_flock *lock, dict_t *xdata) { afr_local_t *local = NULL; afr_private_t *priv = NULL; @@ -3058,30 +3003,15 @@ afr_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, priv->children[child_index], priv->children[child_index]->fops->lk, local->fd, local->cont.lk.cmd, - &local->cont.lk.user_flock); + &local->cont.lk.user_flock, xdata); } else if (local->op_ret == -1) { /* all nodes have gone down */ AFR_STACK_UNWIND (lk, frame, -1, ENOTCONN, - &local->cont.lk.ret_flock); + &local->cont.lk.ret_flock, NULL); } else { - /* locking has succeeded on all nodes that are up */ - - /* temporarily - ret = afr_mark_locked_nodes (this, local->fd, - local->cont.lk.locked_nodes); - if (ret) - gf_log (this->name, GF_LOG_DEBUG, - "Could not save locked nodes info in fdctx"); - - ret = afr_save_locked_fd (this, local->fd); - if (ret) - gf_log (this->name, GF_LOG_DEBUG, - "Could not save locked fd"); - - */ AFR_STACK_UNWIND (lk, frame, local->op_ret, local->op_errno, - &local->cont.lk.ret_flock); + &local->cont.lk.ret_flock, NULL); } return 0; @@ -3090,24 +3020,18 @@ afr_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int afr_lk (call_frame_t *frame, xlator_t *this, - fd_t *fd, int32_t cmd, struct gf_flock *flock) + fd_t *fd, int32_t cmd, struct gf_flock *flock, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; int i = 0; - int32_t op_ret = -1; - int32_t op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + int32_t op_errno = ENOMEM; priv = this->private; - ALLOC_OR_GOTO (local, afr_local_t, out); - AFR_LOCAL_INIT (local, priv); - - frame->local = local; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; local->cont.lk.locked_nodes = GF_CALLOC (priv->child_count, sizeof (*local->cont.lk.locked_nodes), @@ -3126,32 +3050,18 @@ afr_lk (call_frame_t *frame, xlator_t *this, STACK_WIND_COOKIE (frame, afr_lk_cbk, (void *) (long) 0, priv->children[i], priv->children[i]->fops->lk, - fd, cmd, flock); + fd, cmd, flock, xdata); - op_ret = 0; + return 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (lk, frame, op_ret, op_errno, NULL); - } + AFR_STACK_UNWIND (lk, frame, -1, op_errno, NULL, NULL); + return 0; } int afr_forget (xlator_t *this, inode_t *inode) { - uint64_t ctx_addr = 0; - afr_inode_ctx_t *ctx = NULL; - - inode_ctx_get (inode, this, &ctx_addr); - - if (!ctx_addr) - goto out; - - ctx = (afr_inode_ctx_t *)(long)ctx_addr; - if (ctx->fresh_children) - GF_FREE (ctx->fresh_children); - GF_FREE (ctx); -out: return 0; } @@ -3170,41 +3080,22 @@ afr_priv_dump (xlator_t *this) GF_ASSERT (priv); snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.%s", this->type, this->name); gf_proc_dump_add_section(key_prefix); - gf_proc_dump_build_key(key, key_prefix, "child_count"); - gf_proc_dump_write(key, "%u", priv->child_count); - gf_proc_dump_build_key(key, key_prefix, "read_child_rr"); - gf_proc_dump_write(key, "%u", priv->read_child_rr); + gf_proc_dump_write("child_count", "%u", priv->child_count); for (i = 0; i < priv->child_count; i++) { - gf_proc_dump_build_key(key, key_prefix, "child_up[%d]", i); + sprintf (key, "child_up[%d]", i); gf_proc_dump_write(key, "%d", priv->child_up[i]); - gf_proc_dump_build_key(key, key_prefix, - "pending_key[%d]", i); + sprintf (key, "pending_key[%d]", i); gf_proc_dump_write(key, "%s", priv->pending_key[i]); } - gf_proc_dump_build_key(key, key_prefix, "data_self_heal"); - gf_proc_dump_write(key, "%d", priv->data_self_heal); - gf_proc_dump_build_key(key, key_prefix, "metadata_self_heal"); - gf_proc_dump_write(key, "%d", priv->metadata_self_heal); - gf_proc_dump_build_key(key, key_prefix, "entry_self_heal"); - gf_proc_dump_write(key, "%d", priv->entry_self_heal); - gf_proc_dump_build_key(key, key_prefix, "data_change_log"); - gf_proc_dump_write(key, "%d", priv->data_change_log); - gf_proc_dump_build_key(key, key_prefix, "metadata_change_log"); - gf_proc_dump_write(key, "%d", priv->metadata_change_log); - gf_proc_dump_build_key(key, key_prefix, "entry_change_log"); - gf_proc_dump_write(key, "%d", priv->entry_change_log); - gf_proc_dump_build_key(key, key_prefix, "read_child"); - gf_proc_dump_write(key, "%d", priv->read_child); - gf_proc_dump_build_key(key, key_prefix, "favorite_child"); - gf_proc_dump_write(key, "%d", priv->favorite_child); - gf_proc_dump_build_key(key, key_prefix, "data_lock_server_count"); - gf_proc_dump_write(key, "%u", priv->data_lock_server_count); - gf_proc_dump_build_key(key, key_prefix, "metadata_lock_server_count"); - gf_proc_dump_write(key, "%u", priv->metadata_lock_server_count); - gf_proc_dump_build_key(key, key_prefix, "entry_lock_server_count"); - gf_proc_dump_write(key, "%u", priv->entry_lock_server_count); - gf_proc_dump_build_key(key, key_prefix, "wait_count"); - gf_proc_dump_write(key, "%u", priv->wait_count); + gf_proc_dump_write("data_self_heal", "%s", priv->data_self_heal); + gf_proc_dump_write("metadata_self_heal", "%d", priv->metadata_self_heal); + gf_proc_dump_write("entry_self_heal", "%d", priv->entry_self_heal); + gf_proc_dump_write("data_change_log", "%d", priv->data_change_log); + gf_proc_dump_write("metadata_change_log", "%d", priv->metadata_change_log); + gf_proc_dump_write("entry-change_log", "%d", priv->entry_change_log); + gf_proc_dump_write("read_child", "%d", priv->read_child); + gf_proc_dump_write("favorite_child", "%d", priv->favorite_child); + gf_proc_dump_write("wait_count", "%u", priv->wait_count); return 0; } @@ -3234,24 +3125,35 @@ find_child_index (xlator_t *this, xlator_t *child) int32_t afr_notify (xlator_t *this, int32_t event, - void *data, ...) + void *data, void *data2) { afr_private_t *priv = NULL; int i = -1; int up_children = 0; int down_children = 0; int propagate = 0; - int had_heard_from_all = 0; int have_heard_from_all = 0; int idx = -1; int ret = -1; + int call_psh = 0; + int up_child = -1; + dict_t *input = NULL; + dict_t *output = NULL; priv = this->private; if (!priv) return 0; + /* + * We need to reset this in case children come up in "staggered" + * fashion, so that we discover a late-arriving local subvolume. Note + * that we could end up issuing N lookups to the first subvolume, and + * O(N^2) overall, but N is small for AFR so it shouldn't be an issue. + */ + priv->did_discovery = _gf_false; + had_heard_from_all = 1; for (i = 0; i < priv->child_count; i++) { if (!priv->last_event[i]) { @@ -3262,7 +3164,7 @@ afr_notify (xlator_t *this, int32_t event, /* parent xlators dont need to know about every child_up, child_down * because of afr ha. If all subvolumes go down, child_down has * to be triggered. In that state when 1 subvolume comes up child_up - * needs to be triggered. dht optimises revalidate lookup by sending + * needs to be triggered. dht optimizes revalidate lookup by sending * it only to one of its subvolumes. When child up/down happens * for afr's subvolumes dht should be notified by child_modified. The * subsequent revalidate lookup happens on all the dht's subvolumes @@ -3279,9 +3181,20 @@ afr_notify (xlator_t *this, int32_t event, case GF_EVENT_CHILD_UP: LOCK (&priv->lock); { + /* + * This only really counts if the child was never up + * (value = -1) or had been down (value = 0). See + * comment at GF_EVENT_CHILD_DOWN for a more detailed + * explanation. + */ + if (priv->child_up[idx] != 1) { + priv->up_count++; + priv->event_generation++; + } priv->child_up[idx] = 1; - priv->up_count++; + call_psh = 1; + up_child = idx; for (i = 0; i < priv->child_count; i++) if (priv->child_up[i] == 1) up_children++; @@ -3302,8 +3215,23 @@ afr_notify (xlator_t *this, int32_t event, case GF_EVENT_CHILD_DOWN: LOCK (&priv->lock); { + /* + * If a brick is down when we start, we'll get a + * CHILD_DOWN to indicate its initial state. There + * was never a CHILD_UP in this case, so if we + * increment "down_count" the difference between than + * and "up_count" will no longer be the number of + * children that are currently up. This has serious + * implications e.g. for quorum enforcement, so we + * don't increment these values unless the event + * represents an actual state transition between "up" + * (value = 1) and anything else. + */ + if (priv->child_up[idx] == 1) { + priv->down_count++; + priv->event_generation++; + } priv->child_up[idx] = 0; - priv->down_count++; for (i = 0; i < priv->child_count; i++) if (priv->child_up[i] == 0) @@ -3328,7 +3256,20 @@ afr_notify (xlator_t *this, int32_t event, priv->last_event[idx] = event; } UNLOCK (&priv->lock); + break; + + case GF_EVENT_TRANSLATOR_OP: + input = data; + output = data2; + if (!had_heard_from_all) { + ret = -1; + goto out; + } + ret = afr_xl_op (this, input, output); + goto out; + break; + default: propagate = 1; break; @@ -3355,6 +3296,7 @@ afr_notify (xlator_t *this, int32_t event, LOCK (&priv->lock); { + up_children = AFR_COUNT (priv->child_up, priv->child_count); for (i = 0; i < priv->child_count; i++) { if (priv->last_event[i] == GF_EVENT_CHILD_UP) { event = GF_EVENT_CHILD_UP; @@ -3375,52 +3317,69 @@ afr_notify (xlator_t *this, int32_t event, if (propagate) ret = default_notify (this, event, data); + if (call_psh && priv->shd.iamshd) { + afr_selfheal_childup (this, up_child); + } out: return ret; } -int -afr_first_up_child (unsigned char *child_up, size_t child_count) -{ - int ret = -1; - int i = 0; - - GF_ASSERT (child_up); - - for (i = 0; i < child_count; i++) { - if (child_up[i]) { - ret = i; - break; - } - } - - return ret; -} int -AFR_LOCAL_INIT (afr_local_t *local, afr_private_t *priv) +afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno) { local->op_ret = -1; local->op_errno = EUCLEAN; - local->call_count = afr_up_children_count (priv->child_count, - priv->child_up); - if (local->call_count == 0) { - gf_log (THIS->name, GF_LOG_INFO, "no subvolumes up"); - return -ENOTCONN; - } + syncbarrier_init (&local->barrier); - local->child_up = GF_CALLOC (sizeof (*local->child_up), - priv->child_count, + local->child_up = GF_CALLOC (priv->child_count, + sizeof (*local->child_up), gf_afr_mt_char); if (!local->child_up) { - return -ENOMEM; + if (op_errno) + *op_errno = ENOMEM; + goto out; } memcpy (local->child_up, priv->child_up, sizeof (*local->child_up) * priv->child_count); - - return 0; + local->call_count = AFR_COUNT (local->child_up, priv->child_count); + if (local->call_count == 0) { + gf_log (THIS->name, GF_LOG_INFO, "no subvolumes up"); + if (op_errno) + *op_errno = ENOTCONN; + goto out; + } + local->event_generation = priv->event_generation; + + local->read_attempted = GF_CALLOC (priv->child_count, sizeof (char), + gf_afr_mt_char); + if (!local->read_attempted) { + if (op_errno) + *op_errno = ENOMEM; + goto out; + } + + local->readable = GF_CALLOC (priv->child_count, sizeof (char), + gf_afr_mt_char); + if (!local->readable) { + if (op_errno) + *op_errno = ENOMEM; + goto out; + } + + local->replies = GF_CALLOC(priv->child_count, sizeof(*local->replies), + gf_afr_mt_reply_t); + if (!local->replies) { + if (op_errno) + *op_errno = ENOMEM; + goto out; + } + + return 0; +out: + return -1; } int @@ -3429,16 +3388,6 @@ afr_internal_lock_init (afr_internal_lock_t *lk, size_t child_count, { int ret = -ENOMEM; - lk->inode_locked_nodes = GF_CALLOC (sizeof (*lk->inode_locked_nodes), - child_count, gf_afr_mt_char); - if (NULL == lk->inode_locked_nodes) - goto out; - - lk->entry_locked_nodes = GF_CALLOC (sizeof (*lk->entry_locked_nodes), - child_count, gf_afr_mt_char); - if (NULL == lk->entry_locked_nodes) - goto out; - lk->locked_nodes = GF_CALLOC (sizeof (*lk->locked_nodes), child_count, gf_afr_mt_char); if (NULL == lk->locked_nodes) @@ -3458,119 +3407,219 @@ out: return ret; } +void +afr_matrix_cleanup (int32_t **matrix, unsigned int m) +{ + int i = 0; + + if (!matrix) + goto out; + for (i = 0; i < m; i++) { + GF_FREE (matrix[i]); + } + + GF_FREE (matrix); +out: + return; +} + +int32_t** +afr_matrix_create (unsigned int m, unsigned int n) +{ + int32_t **matrix = NULL; + int i = 0; + + matrix = GF_CALLOC (sizeof (*matrix), m, gf_afr_mt_int32_t); + if (!matrix) + goto out; + + for (i = 0; i < m; i++) { + matrix[i] = GF_CALLOC (sizeof (*matrix[i]), n, + gf_afr_mt_int32_t); + if (!matrix[i]) + goto out; + } + return matrix; +out: + afr_matrix_cleanup (matrix, m); + return NULL; +} + +int +afr_inodelk_init (afr_inodelk_t *lk, char *dom, size_t child_count) +{ + int ret = -ENOMEM; + + lk->domain = dom; + lk->locked_nodes = GF_CALLOC (sizeof (*lk->locked_nodes), + child_count, gf_afr_mt_char); + if (NULL == lk->locked_nodes) + goto out; + ret = 0; +out: + return ret; +} + int -afr_transaction_local_init (afr_local_t *local, afr_private_t *priv) +afr_transaction_local_init (afr_local_t *local, xlator_t *this) { - int i; - int child_up_count = 0; - int ret = -ENOMEM; + int child_up_count = 0; + int ret = -ENOMEM; + afr_private_t *priv = NULL; + priv = this->private; ret = afr_internal_lock_init (&local->internal_lock, priv->child_count, AFR_TRANSACTION_LK); if (ret < 0) goto out; + if ((local->transaction.type == AFR_DATA_TRANSACTION) || + (local->transaction.type == AFR_METADATA_TRANSACTION)) { + ret = afr_inodelk_init (&local->internal_lock.inodelk[0], + this->name, priv->child_count); + if (ret < 0) + goto out; + } + ret = -ENOMEM; - child_up_count = afr_up_children_count (priv->child_count, local->child_up); + child_up_count = AFR_COUNT (local->child_up, priv->child_count); if (priv->optimistic_change_log && child_up_count == priv->child_count) local->optimistic_change_log = 1; - local->first_up_child = afr_first_up_child (local->child_up, - priv->child_count); + local->pre_op_compat = priv->pre_op_compat; + + local->transaction.eager_lock = + GF_CALLOC (sizeof (*local->transaction.eager_lock), + priv->child_count, + gf_afr_mt_int32_t); - local->child_errno = GF_CALLOC (sizeof (*local->child_errno), - priv->child_count, - gf_afr_mt_int32_t); - if (!local->child_errno) + if (!local->transaction.eager_lock) goto out; - local->pending = GF_CALLOC (sizeof (*local->pending), - priv->child_count, - gf_afr_mt_int32_t); + local->transaction.pre_op = GF_CALLOC (sizeof (*local->transaction.pre_op), + priv->child_count, + gf_afr_mt_char); + if (!local->transaction.pre_op) + goto out; - if (!local->pending) + local->transaction.fop_subvols = GF_CALLOC (sizeof (*local->transaction.fop_subvols), + priv->child_count, + gf_afr_mt_char); + if (!local->transaction.fop_subvols) goto out; - local->fresh_children = afr_fresh_children_create (priv->child_count); - if (!local->fresh_children) + local->transaction.failed_subvols = GF_CALLOC (sizeof (*local->transaction.failed_subvols), + priv->child_count, + gf_afr_mt_char); + if (!local->transaction.failed_subvols) goto out; - for (i = 0; i < priv->child_count; i++) { - local->pending[i] = GF_CALLOC (sizeof (*local->pending[i]), - 3, /* data + metadata + entry */ - gf_afr_mt_int32_t); - if (!local->pending[i]) - goto out; - } + local->pending = afr_matrix_create (priv->child_count, + AFR_NUM_CHANGE_LOGS); + if (!local->pending) + goto out; - local->transaction.child_errno = - GF_CALLOC (sizeof (*local->transaction.child_errno), - priv->child_count, - gf_afr_mt_int32_t); - local->transaction.erase_pending = 1; + INIT_LIST_HEAD (&local->transaction.eager_locked); ret = 0; out: return ret; } + void -afr_reset_children (int32_t *fresh_children, int32_t child_count) +afr_set_low_priority (call_frame_t *frame) { - unsigned int i = 0; - for (i = 0; i < child_count; i++) - fresh_children[i] = -1; + frame->root->pid = LOW_PRIO_PROC_PID; } -int32_t* -afr_fresh_children_create (int32_t child_count) + +gf_boolean_t +afr_have_quorum (char *logname, afr_private_t *priv) { - int32_t *fresh_children = NULL; - int i = 0; + unsigned int quorum = 0; - GF_ASSERT (child_count > 0); + GF_VALIDATE_OR_GOTO(logname,priv,out); + + quorum = priv->quorum_count; + if (quorum != AFR_QUORUM_AUTO) { + return (priv->up_count >= (priv->down_count + quorum)); + } + + quorum = priv->child_count / 2 + 1; + if (priv->up_count >= (priv->down_count + quorum)) { + return _gf_true; + } + + /* + * Special case for even numbers of nodes: if we have exactly half + * and that includes the first ("senior-most") node, then that counts + * as quorum even if it wouldn't otherwise. This supports e.g. N=2 + * while preserving the critical property that there can only be one + * such group. + */ + if ((priv->child_count % 2) == 0) { + quorum = priv->child_count / 2; + if (priv->up_count >= (priv->down_count + quorum)) { + if (priv->child_up[0]) { + return _gf_true; + } + } + } - fresh_children = GF_CALLOC (child_count, sizeof (*fresh_children), - gf_afr_mt_int32_t); - if (NULL == fresh_children) - goto out; - for (i = 0; i < child_count; i++) - fresh_children[i] = -1; out: - return fresh_children; + return _gf_false; } void -afr_fresh_children_add_child (int32_t *fresh_children, int32_t child, - int32_t child_count) +afr_priv_destroy (afr_private_t *priv) { - gf_boolean_t child_found = _gf_false; - int i = 0; + int i = 0; - for (i = 0; i < child_count; i++) { - if (fresh_children[i] == -1) - break; - if (fresh_children[i] == child) { - child_found = _gf_true; - break; - } - } - if (!child_found) { - GF_ASSERT (i < child_count); - fresh_children[i] = child; - } + if (!priv) + goto out; + inode_unref (priv->root_inode); + GF_FREE (priv->last_event); + if (priv->pending_key) { + for (i = 0; i < priv->child_count; i++) + GF_FREE (priv->pending_key[i]); + } + GF_FREE (priv->pending_key); + GF_FREE (priv->children); + GF_FREE (priv->child_up); + LOCK_DESTROY (&priv->lock); + + GF_FREE (priv); +out: + return; } int -afr_get_children_count (int32_t *fresh_children, unsigned int child_count) +xlator_subvolume_count (xlator_t *this) { - int count = 0; int i = 0; + xlator_list_t *list = NULL; - for (i = 0; i < child_count; i++) { - if (fresh_children[i] == -1) - break; - count++; - } - return count; + for (list = this->children; list; list = list->next) + i++; + return i; +} + + +void +afr_handle_open_fd_count (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_fd_ctx_t *fd_ctx = NULL; + + local = frame->local; + + if (!local->fd) + return; + + fd_ctx = afr_fd_ctx_get (local->fd, this); + if (!fd_ctx) + return; + + fd_ctx->open_fd_count = local->open_fd_count; } diff --git a/xlators/cluster/afr/src/afr-dir-read.c b/xlators/cluster/afr/src/afr-dir-read.c index 4632da7d0..fa1da3958 100644 --- a/xlators/cluster/afr/src/afr-dir-read.c +++ b/xlators/cluster/afr/src/afr-dir-read.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2007-2010 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ @@ -46,709 +37,384 @@ #include "checksum.h" #include "afr.h" -#include "afr-self-heal.h" -#include "afr-self-heal-common.h" - -int -afr_examine_dir_sh_unwind (call_frame_t *frame, xlator_t *this, int32_t op_ret, - int32_t op_errno) -{ - afr_local_t *local = NULL; - - local = frame->local; - - afr_set_opendir_done (this, local->fd->inode); - - AFR_STACK_UNWIND (opendir, frame, local->op_ret, - local->op_errno, local->fd); - - return 0; -} - - -gf_boolean_t -__checksums_differ (uint32_t *checksum, int child_count, - unsigned char *child_up) -{ - int ret = _gf_false; - int i = 0; - uint32_t cksum = 0; - gf_boolean_t activate_check = _gf_false; - - for (i = 0; i < child_count; i++) { - if (!child_up[i]) - continue; - if (_gf_false == activate_check) { - cksum = checksum[i]; - activate_check = _gf_true; - continue; - } - - if (cksum != checksum[i]) { - ret = _gf_true; - break; - } - - cksum = checksum[i]; - } - - return ret; -} - - -int32_t -afr_examine_dir_readdir_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - gf_dirent_t *entries) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - afr_self_heal_t * sh = NULL; - gf_dirent_t * entry = NULL; - gf_dirent_t * tmp = NULL; - int child_index = 0; - uint32_t entry_cksum = 0; - int call_count = 0; - off_t last_offset = 0; - char sh_type_str[256] = {0,}; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - child_index = (long) cookie; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_INFO, - "%s: failed to do opendir on %s", - local->loc.path, priv->children[child_index]->name); - local->op_ret = -1; - local->op_ret = op_errno; - goto out; - } - - if (op_ret == 0) { - gf_log (this->name, GF_LOG_DEBUG, - "%s: no entries found in %s", - local->loc.path, priv->children[child_index]->name); - goto out; - } - - list_for_each_entry_safe (entry, tmp, &entries->list, list) { - entry_cksum = gf_rsync_weak_checksum (entry->d_name, - strlen (entry->d_name)); - local->cont.opendir.checksum[child_index] ^= entry_cksum; - } - - list_for_each_entry (entry, &entries->list, list) { - last_offset = entry->d_off; - } - - /* read more entries */ - - STACK_WIND_COOKIE (frame, afr_examine_dir_readdir_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->readdir, - local->fd, 131072, last_offset); - - return 0; - -out: - call_count = afr_frame_return (frame); - - if (call_count == 0) { - if (__checksums_differ (local->cont.opendir.checksum, - priv->child_count, - local->child_up)) { - - sh->need_entry_self_heal = _gf_true; - sh->forced_merge = _gf_true; - sh->type = local->fd->inode->ia_type; - sh->background = _gf_false; - sh->unwind = afr_examine_dir_sh_unwind; - - afr_self_heal_type_str_get(&local->self_heal, - sh_type_str, - sizeof(sh_type_str)); - gf_log (this->name, GF_LOG_INFO, - "%s self-heal triggered. path: %s, " - "reason: checksums of directory differ," - " forced merge option set", - sh_type_str, local->loc.path); - - afr_self_heal (frame, this, local->fd->inode); - } else { - afr_set_opendir_done (this, local->fd->inode); - - AFR_STACK_UNWIND (opendir, frame, local->op_ret, - local->op_errno, local->fd); - } - } - - return 0; -} - - -int -afr_examine_dir (call_frame_t *frame, xlator_t *this) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - int i = 0; - int call_count = 0; - - local = frame->local; - priv = this->private; - - local->cont.opendir.checksum = GF_CALLOC (priv->child_count, - sizeof (*local->cont.opendir.checksum), - gf_afr_mt_int32_t); - - call_count = afr_up_children_count (priv->child_count, local->child_up); - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_examine_dir_readdir_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->readdir, - local->fd, 131072, 0); - - if (!--call_count) - break; - } - } - - return 0; -} +#include "afr-transaction.h" int32_t afr_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - fd_t *fd) + fd_t *fd, dict_t *xdata) { - afr_private_t *priv = NULL; afr_local_t *local = NULL; - int32_t up_children_count = 0; - int ret = -1; int call_count = -1; + int32_t child_index = 0; + afr_fd_ctx_t *fd_ctx = NULL; - priv = this->private; local = frame->local; - - up_children_count = afr_up_children_count (priv->child_count, - local->child_up); + fd_ctx = local->fd_ctx; + child_index = (long) cookie; LOCK (&frame->lock); { - if (op_ret >= 0) + if (op_ret == -1) { + local->op_errno = op_errno; + fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED; + } else { local->op_ret = op_ret; - - local->op_errno = op_errno; + fd_ctx->opened_on[child_index] = AFR_FD_OPENED; + if (!local->xdata_rsp && xdata) + local->xdata_rsp = dict_ref (xdata); + } } UNLOCK (&frame->lock); call_count = afr_frame_return (frame); - if (call_count == 0) { - if (local->op_ret != 0) - goto out; - - ret = afr_fd_ctx_set (this, local->fd); - if (ret) { - local->op_ret = -1; - local->op_errno = -1; - gf_log (this->name, GF_LOG_ERROR, - "failed to set fd ctx for fd %p", - local->fd); - goto out; - } - if (!afr_is_opendir_done (this, local->fd->inode) && - up_children_count > 1) { - - /* - * This is the first opendir on this inode. We need - * to check if the directory's entries are the same - * on all subvolumes. This is needed in addition - * to regular entry self-heal because the readdir - * call is sent only to the first subvolume, and - * thus files that exist only there will never be healed - * otherwise (assuming changelog shows no anamolies). - */ - - gf_log (this->name, GF_LOG_TRACE, - "reading contents of directory %s looking for mismatch", - local->loc.path); - - afr_examine_dir (frame, this); - - } else { - /* do the unwind */ - goto out; - } - } - - return 0; - -out: - AFR_STACK_UNWIND (opendir, frame, local->op_ret, - local->op_errno, local->fd); - + if (call_count == 0) + AFR_STACK_UNWIND (opendir, frame, local->op_ret, + local->op_errno, local->fd, NULL); return 0; } -int32_t -afr_opendir (call_frame_t *frame, xlator_t *this, - loc_t *loc, fd_t *fd) +int +afr_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd) { afr_private_t * priv = NULL; afr_local_t * local = NULL; - int child_count = 0; int i = 0; - int ret = -1; int call_count = -1; - int32_t op_ret = -1; - int32_t op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + int32_t op_errno = ENOMEM; + afr_fd_ctx_t *fd_ctx = NULL; priv = this->private; - child_count = priv->child_count; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; - ALLOC_OR_GOTO (local, afr_local_t, out); - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } + fd_ctx = afr_fd_ctx_get (fd, this); + if (!fd_ctx) + goto out; loc_copy (&local->loc, loc); - frame->local = local; local->fd = fd_ref (fd); + local->fd_ctx = fd_ctx; call_count = local->call_count; - for (i = 0; i < child_count; i++) { + for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { - STACK_WIND (frame, afr_opendir_cbk, - priv->children[i], - priv->children[i]->fops->opendir, - loc, fd); + STACK_WIND_COOKIE (frame, afr_opendir_cbk, + (void*) (long) i, + priv->children[i], + priv->children[i]->fops->opendir, + loc, fd, NULL); if (!--call_count) break; } } - op_ret = 0; + return 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (opendir, frame, op_ret, op_errno, fd); - } - + AFR_STACK_UNWIND (opendir, frame, -1, op_errno, fd, NULL); return 0; } -/** - * Common algorithm for directory read calls: - * - * - Try the fop on the first child that is up - * - if we have failed due to ENOTCONN: - * try the next child - * - * Applicable to: readdir - */ +#define BACKEND_D_OFF_BITS 63 +#define PRESENT_D_OFF_BITS 63 +#define ONE 1ULL +#define MASK (~0ULL) +#define PRESENT_MASK (MASK >> (64 - PRESENT_D_OFF_BITS)) +#define BACKEND_MASK (MASK >> (64 - BACKEND_D_OFF_BITS)) -struct entry_name { - char *name; - struct list_head list; -}; +#define TOP_BIT (ONE << (PRESENT_D_OFF_BITS - 1)) +#define SHIFT_BITS (max (0, (BACKEND_D_OFF_BITS - PRESENT_D_OFF_BITS + 1))) - -static gf_boolean_t -remembered_name (const char *name, struct list_head *entries) +static uint64_t +afr_bits_for (uint64_t num) { - struct entry_name *e = NULL; - gf_boolean_t ret = _gf_false; + uint64_t bits = 0, ctrl = 1; - list_for_each_entry (e, entries, list) { - if (!strcmp (name, e->name)) { - ret = _gf_true; - goto out; - } - } + while (ctrl < num) { + ctrl *= 2; + bits ++; + } -out: - return ret; + return bits; } - -static void -afr_remember_entries (gf_dirent_t *entries, fd_t *fd) +int +afr_itransform (xlator_t *this, int subvol, uint64_t x, uint64_t *y_p) { - struct entry_name *n = NULL; - gf_dirent_t *entry = NULL; - int ret = 0; - uint64_t ctx = 0; - afr_fd_ctx_t *fd_ctx = NULL; - - ret = fd_ctx_get (fd, THIS, &ctx); - if (ret < 0) { - gf_log (THIS->name, GF_LOG_INFO, - "could not get fd ctx for fd=%p", fd); - return; + afr_private_t *conf = NULL; + int cnt = 0; + int max = 0; + uint64_t y = 0; + uint64_t hi_mask = 0; + uint64_t off_mask = 0; + int max_bits = 0; + + if (x == ((uint64_t) -1)) { + y = (uint64_t) -1; + goto out; } - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - - list_for_each_entry (entry, &entries->list, list) { - n = GF_CALLOC (1, sizeof (*n), gf_afr_mt_entry_name); - n->name = gf_strdup (entry->d_name); - INIT_LIST_HEAD (&n->list); + conf = this->private; + if (!conf) + goto out; - list_add (&n->list, &fd_ctx->entries); - } -} + max = conf->child_count; + cnt = subvol; + if (max == 1) { + y = x; + goto out; + } -static off_t -afr_filter_entries (gf_dirent_t *entries, fd_t *fd) -{ - gf_dirent_t *entry = NULL; - gf_dirent_t *tmp = NULL; - int ret = 0; - uint64_t ctx = 0; - afr_fd_ctx_t *fd_ctx = NULL; - off_t offset = 0; - - ret = fd_ctx_get (fd, THIS, &ctx); - if (ret < 0) { - gf_log (THIS->name, GF_LOG_INFO, - "could not get fd ctx for fd=%p", fd); - return -1; - } + max_bits = afr_bits_for (max); - fd_ctx = (afr_fd_ctx_t *)(long) ctx; + hi_mask = ~(PRESENT_MASK >> (max_bits + 1)); - list_for_each_entry_safe (entry, tmp, &entries->list, list) { - offset = entry->d_off; - - if (remembered_name (entry->d_name, &fd_ctx->entries)) { - list_del (&entry->list); - GF_FREE (entry); - } + if (x & hi_mask) { + /* HUGE d_off */ + off_mask = MASK << max_bits; + y = TOP_BIT | ((x >> SHIFT_BITS) & off_mask) | cnt; + } else { + /* small d_off */ + y = ((x * max) + cnt); } - return offset; +out: + if (y_p) + *y_p = y; + + return 0; } -static void -afr_forget_entries (fd_t *fd) +int +afr_deitransform (xlator_t *this, uint64_t y, int *subvol_p, + uint64_t *x_p) { - struct entry_name *entry = NULL; - struct entry_name *tmp = NULL; - int ret = 0; - uint64_t ctx = 0; - afr_fd_ctx_t *fd_ctx = NULL; - - ret = fd_ctx_get (fd, THIS, &ctx); - if (ret < 0) { - gf_log (THIS->name, GF_LOG_INFO, - "could not get fd ctx for fd=%p", fd); - return; - } - - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - - list_for_each_entry_safe (entry, tmp, &fd_ctx->entries, list) { - GF_FREE (entry->name); - list_del (&entry->list); - GF_FREE (entry); - } -} + afr_private_t *conf = NULL; + int cnt = 0; + int max = 0; + uint64_t x = 0; + int subvol = 0; + int max_bits = 0; + uint64_t off_mask = 0; + uint64_t host_mask = 0; + + if (!this->private) + return -1; + conf = this->private; + max = conf->child_count; -int32_t -afr_readdir_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - gf_dirent_t *entries) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - gf_dirent_t * entry = NULL; - gf_dirent_t * tmp = NULL; - int child_index = -1; + if (max == 1) { + x = y; + cnt = 0; + goto out; + } - priv = this->private; - local = frame->local; - child_index = (long) cookie; + if (y & TOP_BIT) { + /* HUGE d_off */ + max_bits = afr_bits_for (max); + off_mask = (MASK << max_bits); + host_mask = ~(off_mask); - if (op_ret == -1) - goto out; + x = ((y & ~TOP_BIT) & off_mask) << SHIFT_BITS; - list_for_each_entry_safe (entry, tmp, &entries->list, list) { - if ((local->fd->inode == local->fd->inode->table->root) - && !strcmp (entry->d_name, GF_REPLICATE_TRASH_DIR)) { - list_del_init (&entry->list); - GF_FREE (entry); - } + cnt = y & host_mask; + } else { + /* small d_off */ + cnt = y % max; + x = y / max; } out: - AFR_STACK_UNWIND (readdir, frame, op_ret, op_errno, entries); + subvol = cnt; + + if (subvol_p) + *subvol_p = subvol; + + if (x_p) + *x_p = x; return 0; } -int32_t -afr_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, gf_dirent_t *entries) +static void +afr_readdir_transform_entries (gf_dirent_t *subvol_entries, int subvol, + gf_dirent_t *entries, fd_t *fd) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - xlator_t ** children = NULL; - int32_t next_call_child = -1; - int ret = 0; - gf_dirent_t * entry = NULL; - gf_dirent_t * tmp = NULL; - int32_t *last_index = NULL; - int32_t read_child = -1; - int32_t *fresh_children = NULL; - uint64_t ctx = 0; - afr_fd_ctx_t *fd_ctx = NULL; - off_t offset = 0; - int32_t call_child = -1; - - priv = this->private; - children = priv->children; - - local = frame->local; - - read_child = (long) cookie; - last_index = &local->cont.readdir.last_index; - fresh_children = local->fresh_children; - - /* the value of the last_index changes if afr_next_call_child is - * called. So to find the call_child of this callback use last_index - * before the next_call_child call. - */ - if (*last_index == -1) - call_child = read_child; - else - call_child = fresh_children[*last_index]; - - if (priv->strict_readdir) { - ret = fd_ctx_get (local->fd, this, &ctx); - if (ret < 0) { - gf_log (this->name, GF_LOG_INFO, - "could not get fd ctx for fd=%p", local->fd); - op_ret = -1; - op_errno = -ret; - goto out; + afr_private_t *priv = NULL; + gf_dirent_t *entry = NULL; + gf_dirent_t *tmp = NULL; + unsigned char *data_readable = NULL; + unsigned char *metadata_readable = NULL; + int gen = 0; + + priv = THIS->private; + + data_readable = alloca0 (priv->child_count); + metadata_readable = alloca0 (priv->child_count); + + list_for_each_entry_safe (entry, tmp, &subvol_entries->list, list) { + if (__is_root_gfid (fd->inode->gfid) && + !strcmp (entry->d_name, GF_REPLICATE_TRASH_DIR)) { + continue; } - fd_ctx = (afr_fd_ctx_t *)(long) ctx; + list_del_init (&entry->list); + afr_itransform (THIS, subvol, entry->d_off, &entry->d_off); + list_add_tail (&entry->list, &entries->list); - if (op_ret == -1) { - next_call_child = afr_next_call_child (fresh_children, - local->child_up, - priv->child_count, - last_index, - read_child); - if (next_call_child < 0) - goto out; - gf_log (this->name, GF_LOG_TRACE, - "starting readdir afresh on child %d, offset %"PRId64, - next_call_child, (uint64_t) 0); - - fd_ctx->failed_over = _gf_true; - - STACK_WIND_COOKIE (frame, afr_readdirp_cbk, - (void *) (long) read_child, - children[next_call_child], - children[next_call_child]->fops->readdirp, - local->fd, - local->cont.readdir.size, 0); - return 0; - } - } + if (entry->inode) { + gen = 0; + afr_inode_read_subvol_get (entry->inode, THIS, + data_readable, + metadata_readable, &gen); - if (op_ret != -1) { - list_for_each_entry_safe (entry, tmp, &entries->list, list) { - if ((local->fd->inode == local->fd->inode->table->root) - && !strcmp (entry->d_name, GF_REPLICATE_TRASH_DIR)) { - list_del_init (&entry->list); - GF_FREE (entry); - } - } - } + if (gen != priv->event_generation || + !data_readable[subvol] || + !metadata_readable[subvol]) { - if (priv->strict_readdir) { - if (fd_ctx->failed_over) { - if (list_empty (&entries->list)) { - gf_log (this->name, GF_LOG_DEBUG, - "no entries found"); - goto out; - } - - offset = afr_filter_entries (entries, local->fd); - - afr_remember_entries (entries, local->fd); - - if (list_empty (&entries->list)) { - /* All the entries we got were duplicate. We - shouldn't send an empty list now, because - that'll make the application stop reading. So - try to get more entries */ - - gf_log (this->name, GF_LOG_TRACE, - "trying to fetch non-duplicate entries " - "from offset %"PRId64", child %s", - offset, children[call_child]->name); - - STACK_WIND_COOKIE (frame, afr_readdirp_cbk, - (void *) (long) read_child, - children[call_child], - children[call_child]->fops->readdirp, - local->fd, local->cont.readdir.size, offset); - return 0; - } - } else { - afr_remember_entries (entries, local->fd); - } + inode_unref (entry->inode); + entry->inode = NULL; + } + } } - -out: - AFR_STACK_UNWIND (readdirp, frame, op_ret, op_errno, entries); - - return 0; } + int32_t -afr_do_readdir (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t offset, int whichop) +afr_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, gf_dirent_t *subvol_entries, + dict_t *xdata) { - afr_private_t * priv = NULL; - xlator_t ** children = NULL; - int call_child = 0; - afr_local_t *local = NULL; - uint64_t ctx = 0; - afr_fd_ctx_t *fd_ctx = NULL; - int ret = -1; - int32_t op_ret = -1; - int32_t op_errno = 0; - uint64_t read_child = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - children = priv->children; - - ALLOC_OR_GOTO (local, afr_local_t, out); - frame->local = local; - - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } + afr_local_t *local = NULL; + gf_dirent_t entries; - local->fresh_children = afr_fresh_children_create (priv->child_count); - if (!local->fresh_children) { - op_errno = ENOMEM; - goto out; - } + INIT_LIST_HEAD (&entries.list); - read_child = afr_inode_get_read_ctx (this, fd->inode, - local->fresh_children); - op_ret = afr_get_call_child (this, local->child_up, read_child, - local->fresh_children, - &call_child, - &local->cont.readdir.last_index); - if (op_ret < 0) { - op_errno = -op_ret; - op_ret = -1; - goto out; - } + local = frame->local; - local->fd = fd_ref (fd); - local->cont.readdir.size = size; + if (op_ret < 0 && !local->cont.readdir.offset) { + /* failover only if this was first readdir, detected + by offset == 0 */ + local->op_ret = op_ret; + local->op_errno = op_errno; - if (priv->strict_readdir) { - ret = fd_ctx_get (fd, this, &ctx); - if (ret < 0) { - gf_log (this->name, GF_LOG_INFO, - "could not get fd ctx for fd=%p", fd); - op_errno = -ret; - goto out; - } + afr_read_txn_continue (frame, this, (long) cookie); + return 0; + } - fd_ctx = (afr_fd_ctx_t *)(long) ctx; + if (op_ret >= 0) + afr_readdir_transform_entries (subvol_entries, (long) cookie, + &entries, local->fd); - if (fd_ctx->last_tried != call_child) { - gf_log (this->name, GF_LOG_TRACE, - "first up child has changed from %d to %d, " - "restarting readdir from offset 0", - fd_ctx->last_tried, call_child); + AFR_STACK_UNWIND (readdir, frame, op_ret, op_errno, &entries, xdata); - fd_ctx->failed_over = _gf_true; - offset = 0; - } + return 0; +} - fd_ctx->last_tried = call_child; - } - if (whichop == GF_FOP_READDIR) +int +afr_readdir_wind (call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + priv = this->private; + local = frame->local; + + if (subvol == -1) { + AFR_STACK_UNWIND (readdir, frame, local->op_ret, + local->op_errno, 0, 0); + return 0; + } + + if (local->op == GF_FOP_READDIR) STACK_WIND_COOKIE (frame, afr_readdir_cbk, - (void *) (long) call_child, - children[call_child], - children[call_child]->fops->readdir, fd, - size, offset); + (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->readdir, + local->fd, local->cont.readdir.size, + local->cont.readdir.offset, + local->xdata_req); else - STACK_WIND_COOKIE (frame, afr_readdirp_cbk, - (void *) (long) call_child, - children[call_child], - children[call_child]->fops->readdirp, fd, - size, offset); + STACK_WIND_COOKIE (frame, afr_readdir_cbk, + (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->readdirp, + local->fd, local->cont.readdir.size, + local->cont.readdir.offset, + local->xdata_req); + return 0; +} - op_ret = 0; + +int +afr_do_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, int whichop, dict_t *dict) +{ + afr_local_t *local = NULL; + int32_t op_errno = 0; + int subvol = -1; + + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; + + local->op = whichop; + local->fd = fd_ref (fd); + local->cont.readdir.size = size; + local->cont.readdir.offset = offset; + local->xdata_req = (dict)? dict_ref (dict) : NULL; + + if (offset == 0) { + /* First readdir has option of failing over and selecting + an appropriate read subvolume */ + afr_read_txn (frame, this, fd->inode, afr_readdir_wind, + AFR_DATA_TRANSACTION); + } else { + /* But continued readdirs MUST stick to the same subvolume + without an option to failover */ + afr_deitransform (this, offset, &subvol, + (uint64_t *)&local->cont.readdir.offset); + afr_readdir_wind (frame, this, subvol); + } + + return 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (readdir, frame, op_ret, op_errno, NULL); - } + AFR_STACK_UNWIND (readdir, frame, -1, op_errno, NULL, NULL); return 0; } int32_t afr_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t offset) + off_t offset, dict_t *xdata) { - afr_do_readdir (frame, this, fd, size, offset, GF_FOP_READDIR); + afr_do_readdir (frame, this, fd, size, offset, GF_FOP_READDIR, xdata); + return 0; } int32_t afr_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t offset) + off_t offset, dict_t *dict) { - afr_do_readdir (frame, this, fd, size, offset, GF_FOP_READDIRP); + afr_do_readdir (frame, this, fd, size, offset, GF_FOP_READDIRP, dict); + return 0; } @@ -756,7 +422,6 @@ afr_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, int32_t afr_releasedir (xlator_t *this, fd_t *fd) { - afr_forget_entries (fd); afr_cleanup_fd_ctx (this, fd); return 0; diff --git a/xlators/cluster/afr/src/afr-dir-read.h b/xlators/cluster/afr/src/afr-dir-read.h index 8bad2a0c0..09456d159 100644 --- a/xlators/cluster/afr/src/afr-dir-read.h +++ b/xlators/cluster/afr/src/afr-dir-read.h @@ -1,20 +1,11 @@ /* - Copyright (c) 2007-2010 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef __DIR_READ_H__ @@ -23,23 +14,23 @@ int32_t afr_opendir (call_frame_t *frame, xlator_t *this, - loc_t *loc, fd_t *fd); + loc_t *loc, fd_t *fd, dict_t *xdata); int32_t afr_releasedir (xlator_t *this, fd_t *fd); int32_t afr_readdir (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t offset); + fd_t *fd, size_t size, off_t offset, dict_t *xdata); int32_t afr_readdirp (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t offset); + fd_t *fd, size_t size, off_t offset, dict_t *dict); int32_t afr_checksum (call_frame_t *frame, xlator_t *this, - loc_t *loc, int32_t flags); + loc_t *loc, int32_t flags, dict_t *xdata); #endif /* __DIR_READ_H__ */ diff --git a/xlators/cluster/afr/src/afr-dir-write.c b/xlators/cluster/afr/src/afr-dir-write.c index f9be71fae..465dde54f 100644 --- a/xlators/cluster/afr/src/afr-dir-write.c +++ b/xlators/cluster/afr/src/afr-dir-write.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2007-2010 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ @@ -43,72 +34,218 @@ #include "common-utils.h" #include "compat-errno.h" #include "compat.h" +#include "byte-order.h" #include "afr.h" #include "afr-transaction.h" - void -afr_build_parent_loc (loc_t *parent, loc_t *child) +afr_mark_entry_pending_changelog (call_frame_t *frame, xlator_t *this); + +int +afr_build_parent_loc (loc_t *parent, loc_t *child, int32_t *op_errno) { - char *tmp = NULL; + int ret = -1; + char *child_path = NULL; if (!child->parent) { - //this should never be called with root as the child - GF_ASSERT (0); - loc_copy (parent, child); - return; + if (op_errno) + *op_errno = EINVAL; + goto out; } - tmp = gf_strdup (child->path); - parent->path = gf_strdup (dirname (tmp)); - GF_FREE (tmp); + child_path = gf_strdup (child->path); + if (!child_path) { + if (op_errno) + *op_errno = ENOMEM; + goto out; + } - parent->name = strrchr (parent->path, '/'); - if (parent->name) - parent->name++; + parent->path = gf_strdup (dirname (child_path)); + if (!parent->path) { + if (op_errno) + *op_errno = ENOMEM; + goto out; + } + + parent->inode = inode_ref (child->parent); + uuid_copy (parent->gfid, child->pargfid); - parent->inode = inode_ref (child->parent); - parent->parent = inode_parent (parent->inode, 0, NULL); - parent->ino = parent->inode->ino; + ret = 0; +out: + GF_FREE (child_path); - if (!uuid_is_null (child->pargfid)) - uuid_copy (parent->gfid, child->pargfid); + return ret; } -/* {{{ create */ -int -afr_create_unwind (call_frame_t *frame, xlator_t *this) +static void +__afr_dir_write_finalize (call_frame_t *frame, xlator_t *this) { - call_frame_t *main_frame = NULL; - afr_local_t *local = NULL; - struct iatt *unwind_buf = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int inode_read_subvol = -1; + int parent_read_subvol = -1; + int parent2_read_subvol = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + if (local->inode) { + afr_replies_interpret (frame, this, local->inode); + inode_read_subvol = afr_data_subvol_get (local->inode, this, + NULL, NULL); + } + if (local->parent) + parent_read_subvol = afr_data_subvol_get (local->parent, this, + NULL, NULL); + if (local->parent2) + parent2_read_subvol = afr_data_subvol_get (local->parent2, this, + NULL, NULL); + + local->op_ret = -1; + local->op_errno = afr_final_errno (local, priv); + + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid) + continue; + if (local->replies[i].op_ret < 0) { + if (local->inode) + afr_inode_read_subvol_reset (local->inode, + this); + if (local->parent) + afr_inode_read_subvol_reset (local->parent, + this); + if (local->parent2) + afr_inode_read_subvol_reset (local->parent2, + this); + continue; + } + + if (local->op_ret == -1) { + local->op_ret = local->replies[i].op_ret; + local->op_errno = local->replies[i].op_errno; + + local->cont.dir_fop.buf = + local->replies[i].poststat; + local->cont.dir_fop.preparent = + local->replies[i].preparent; + local->cont.dir_fop.postparent = + local->replies[i].postparent; + local->cont.dir_fop.prenewparent = + local->replies[i].preparent2; + local->cont.dir_fop.postnewparent = + local->replies[i].postparent2; + if (local->replies[i].xdata) + local->xdata_rsp = + dict_ref (local->replies[i].xdata); + continue; + } + + if (i == inode_read_subvol) { + local->cont.dir_fop.buf = + local->replies[i].poststat; + if (local->replies[i].xdata) { + if (local->xdata_rsp) + dict_unref (local->xdata_rsp); + local->xdata_rsp = + dict_ref (local->replies[i].xdata); + } + } + + if (i == parent_read_subvol) { + local->cont.dir_fop.preparent = + local->replies[i].preparent; + local->cont.dir_fop.postparent = + local->replies[i].postparent; + } + + if (i == parent2_read_subvol) { + local->cont.dir_fop.prenewparent = + local->replies[i].preparent2; + local->cont.dir_fop.postnewparent = + local->replies[i].postparent2; + } + } +} + + +static void +__afr_dir_write_fill (call_frame_t *frame, xlator_t *this, int child_index, + int op_ret, int op_errno, struct iatt *poststat, + struct iatt *preparent, struct iatt *postparent, + struct iatt *preparent2, struct iatt *postparent2, + dict_t *xdata) +{ + afr_local_t *local = NULL; + afr_fd_ctx_t *fd_ctx = NULL; local = frame->local; + fd_ctx = local->fd_ctx; + + local->replies[child_index].valid = 1; + local->replies[child_index].op_ret = op_ret; + local->replies[child_index].op_errno = op_errno; + + if (op_ret >= 0) { + if (poststat) + local->replies[child_index].poststat = *poststat; + if (preparent) + local->replies[child_index].preparent = *preparent; + if (postparent) + local->replies[child_index].postparent = *postparent; + if (preparent2) + local->replies[child_index].preparent2 = *preparent2; + if (postparent2) + local->replies[child_index].postparent2 = *postparent2; + if (xdata) + local->replies[child_index].xdata = dict_ref (xdata); + + if (fd_ctx) + fd_ctx->opened_on[child_index] = AFR_FD_OPENED; + } else { + if (op_errno != ENOTEMPTY) + afr_transaction_fop_failed (frame, this, child_index); + if (fd_ctx) + fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED; + } + + return; +} - LOCK (&frame->lock); - { - if (local->transaction.main_frame) { - main_frame = local->transaction.main_frame; - } - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - if (local->cont.create.read_child_buf.ia_ino) { - unwind_buf = &local->cont.create.read_child_buf; - } else { - unwind_buf = &local->cont.create.buf; - } - - AFR_STACK_UNWIND (create, main_frame, - local->op_ret, local->op_errno, - local->cont.create.fd, - local->cont.create.inode, - unwind_buf, &local->cont.create.preparent, - &local->cont.create.postparent); + +static int +__afr_dir_write_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *buf, + struct iatt *preparent, struct iatt *postparent, + struct iatt *preparent2, struct iatt *postparent2, + dict_t *xdata) +{ + afr_local_t *local = NULL; + int child_index = (long) cookie; + int call_count = -1; + + local = frame->local; + + LOCK (&frame->lock); + { + __afr_dir_write_fill (frame, this, child_index, op_ret, + op_errno, buf, preparent, postparent, + preparent2, postparent2, xdata); + } + UNLOCK (&frame->lock); + call_count = afr_frame_return (frame); + + if (call_count == 0) { + __afr_dir_write_finalize (frame, this); + + if (afr_txn_nothing_failed (frame, this)) + local->transaction.unwind (frame, this); + + afr_mark_entry_pending_changelog (frame, this); + + local->transaction.resume (frame, this); } return 0; @@ -116,223 +253,269 @@ afr_create_unwind (call_frame_t *frame, xlator_t *this) int -afr_create_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - fd_t *fd, inode_t *inode, struct iatt *buf, - struct iatt *preparent, struct iatt *postparent) +afr_mark_new_entry_changelog_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int op_ret, int op_errno, + dict_t *xattr, dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - uint64_t ctx = 0; - afr_fd_ctx_t *fd_ctx = NULL; - int ret = 0; - int call_count = -1; - int child_index = -1; - int32_t *fresh_children = NULL; + int call_count = 0; - local = frame->local; - priv = this->private; + call_count = afr_frame_return (frame); - child_index = (long) cookie; + if (call_count == 0) + AFR_STACK_DESTROY (frame); - LOCK (&frame->lock); - { - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); + return 0; +} - if (op_ret != -1) { - local->op_ret = op_ret; - ret = afr_fd_ctx_set (this, fd); +void +afr_mark_new_entry_changelog (call_frame_t *frame, xlator_t *this) +{ + call_frame_t *new_frame = NULL; + afr_local_t *local = NULL; + afr_local_t *new_local = NULL; + afr_private_t *priv = NULL; + dict_t *xattr = NULL; + int32_t **changelog = NULL; + int i = 0; + int idx = 0; + int op_errno = ENOMEM; + unsigned char *pending = NULL; + int call_count = 0; - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, - "could not set ctx on fd=%p", fd); + local = frame->local; + priv = this->private; - local->op_ret = -1; - local->op_errno = -ret; - goto unlock; - } + new_frame = copy_frame (frame); + if (!new_frame) + goto out; - ret = fd_ctx_get (fd, this, &ctx); + new_local = AFR_FRAME_INIT (new_frame, op_errno); + if (!new_local) + goto out; - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, - "could not get fd ctx for fd=%p", fd); - local->op_ret = -1; - local->op_errno = -ret; - goto unlock; - } + changelog = afr_matrix_create (priv->child_count, AFR_NUM_CHANGE_LOGS); + if (!changelog) + goto out; - fd_ctx = (afr_fd_ctx_t *)(long) ctx; + xattr = dict_new (); + if (!xattr) + goto out; - fd_ctx->opened_on[child_index] = 1; - fd_ctx->flags = local->cont.create.flags; + idx = afr_index_for_transaction_type (AFR_DATA_TRANSACTION); - if (local->success_count == 0) - local->cont.create.buf = *buf; + pending = alloca0 (priv->child_count); - if (child_index == local->read_child_index) { - local->cont.create.read_child_buf = *buf; - local->cont.create.preparent = *preparent; - local->cont.create.postparent = *postparent; - } + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i] && + !local->transaction.failed_subvols[i]) { + call_count ++; + continue; + } - local->cont.create.inode = inode; + changelog[i][idx] = hton32(1); + pending[i] = 1; + } - fresh_children = local->fresh_children; - fresh_children[local->success_count] = child_index; - local->success_count++; - } + new_local->pending = changelog; + uuid_copy (new_local->loc.gfid, local->cont.dir_fop.buf.ia_gfid); + new_local->loc.inode = inode_ref (local->inode); - local->op_errno = op_errno; - } -unlock: - UNLOCK (&frame->lock); + afr_set_pending_dict (priv, xattr, changelog); - call_count = afr_frame_return (frame); + new_local->call_count = call_count; - if (call_count == 0) { - afr_set_read_ctx_from_policy (this, inode, - local->fresh_children, - local->read_child_index, - priv->read_child); - local->transaction.unwind (frame, this); + for (i = 0; i < priv->child_count; i++) { + if (pending[i]) + continue; - local->transaction.resume (frame, this); + STACK_WIND_COOKIE (new_frame, afr_mark_new_entry_changelog_cbk, + (void *) (long) i, priv->children[i], + priv->children[i]->fops->xattrop, + &new_local->loc, GF_XATTROP_ADD_ARRAY, + xattr, NULL); + if (!--call_count) + break; } - return 0; + new_frame = NULL; +out: + if (new_frame) + AFR_STACK_DESTROY (new_frame); + if (xattr) + dict_unref (xattr); + return; } -int -afr_create_wind (call_frame_t *frame, xlator_t *this) +void +afr_mark_entry_pending_changelog (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; + int pre_op_count = 0; + int failed_count = 0; local = frame->local; - priv = this->private; + priv = this->private; - call_count = afr_up_children_count (priv->child_count, local->child_up); + if (local->op_ret < 0) + return; - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } + if (local->op != GF_FOP_CREATE && local->op != GF_FOP_MKNOD) + return; - local->call_count = call_count; + pre_op_count = AFR_COUNT (local->transaction.pre_op, priv->child_count); + failed_count = AFR_COUNT (local->transaction.failed_subvols, + priv->child_count); - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_create_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->create, - &local->loc, - local->cont.create.flags, - local->cont.create.mode, - local->cont.create.fd, - local->cont.create.params); - if (!--call_count) - break; - } - } + if (pre_op_count == priv->child_count && !failed_count) + return; - return 0; + afr_mark_new_entry_changelog (frame, this); + + return; } +/* {{{ create */ + int -afr_create_done (call_frame_t *frame, xlator_t *this) +afr_create_unwind (call_frame_t *frame, xlator_t *this) { - afr_local_t * local = NULL; + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; local = frame->local; - local->transaction.unwind (frame, this); + main_frame = afr_transaction_detach_fop_frame (frame); - AFR_STACK_DESTROY (frame); + if (!main_frame) + return 0; + AFR_STACK_UNWIND (create, main_frame, local->op_ret, local->op_errno, + local->cont.create.fd, local->inode, + &local->cont.dir_fop.buf, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, local->xdata_rsp); return 0; } int -afr_create (call_frame_t *frame, xlator_t *this, - loc_t *loc, int32_t flags, mode_t mode, - fd_t *fd, dict_t *params) +afr_create_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + fd_t *fd, inode_t *inode, struct iatt *buf, + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t * transaction_frame = NULL; - int ret = -1; - int op_ret = -1; - int op_errno = 0; + return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, buf, + preparent, postparent, NULL, NULL, xdata); +} - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); +int +afr_create_wind (call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + local = frame->local; priv = this->private; - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - goto out; - } + STACK_WIND_COOKIE (frame, afr_create_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->create, + &local->loc, local->cont.create.flags, + local->cont.create.mode, local->umask, + local->cont.create.fd, local->xdata_req); + return 0; +} - ALLOC_OR_GOTO (local, afr_local_t, out); - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; +int +afr_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; + + priv = this->private; + + QUORUM_CHECK(create,out); + + transaction_frame = copy_frame (frame); + if (!transaction_frame) goto out; - } - transaction_frame->local = local; + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) + goto out; loc_copy (&local->loc, loc); - LOCK (&priv->read_child_lock); - { - local->read_child_index = (++priv->read_child_rr) - % (priv->child_count); - } - UNLOCK (&priv->read_child_lock); + local->fd_ctx = afr_fd_ctx_get (fd, this); + if (!local->fd_ctx) + goto out; + local->inode = inode_ref (loc->inode); + local->parent = inode_ref (loc->parent); + + local->op = GF_FOP_CREATE; local->cont.create.flags = flags; local->cont.create.mode = mode; local->cont.create.fd = fd_ref (fd); - if (params) - local->cont.create.params = dict_ref (params); + local->umask = umask; + + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); - if (loc->parent) - local->cont.create.parent_ino = loc->parent->ino; + if (!local->xdata_req) + goto out; - local->transaction.fop = afr_create_wind; - local->transaction.done = afr_create_done; + local->transaction.wind = afr_create_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; local->transaction.unwind = afr_create_unwind; - afr_build_parent_loc (&local->transaction.parent_loc, loc); + ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, + &op_errno); + if (ret) + goto out; local->transaction.main_frame = frame; local->transaction.basename = AFR_BASENAME (loc->path); + int_lock = &local->internal_lock; + + int_lock->lockee_count = 0; + ret = afr_init_entry_lockee (&int_lock->lockee[0], local, + &local->transaction.parent_loc, + local->transaction.basename, + priv->child_count); + if (ret) + goto out; - afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + int_lock->lockee_count++; + ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - op_ret = 0; + return 0; out: - if (op_ret == -1) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (create, frame, op_ret, op_errno, - NULL, NULL, NULL, NULL, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (create, frame, -1, op_errno, NULL, NULL, NULL, NULL, + NULL, NULL); return 0; } @@ -345,33 +528,17 @@ afr_mknod_unwind (call_frame_t *frame, xlator_t *this) { call_frame_t *main_frame = NULL; afr_local_t *local = NULL; - struct iatt *unwind_buf = NULL; local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) { - main_frame = local->transaction.main_frame; - } - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - if (local->cont.mknod.read_child_buf.ia_ino) { - unwind_buf = &local->cont.mknod.read_child_buf; - } else { - unwind_buf = &local->cont.mknod.buf; - } - - AFR_STACK_UNWIND (mknod, main_frame, - local->op_ret, local->op_errno, - local->cont.mknod.inode, - unwind_buf, &local->cont.mknod.preparent, - &local->cont.mknod.postparent); - } + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; + AFR_STACK_UNWIND (mknod, main_frame, local->op_ret, local->op_errno, + local->inode, &local->cont.dir_fop.buf, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, local->xdata_rsp); return 0; } @@ -380,183 +547,107 @@ int afr_mknod_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, inode_t *inode, struct iatt *buf, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int call_count = -1; - int child_index = -1; - int32_t *fresh_children = NULL; - - local = frame->local; - priv = this->private; - - child_index = (long) cookie; - - LOCK (&frame->lock); - { - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - - if (op_ret != -1) { - local->op_ret = op_ret; - - if (local->success_count == 0) - local->cont.mknod.buf = *buf; - - if (child_index == local->read_child_index) { - local->cont.mknod.read_child_buf = *buf; - local->cont.mknod.preparent = *preparent; - local->cont.mknod.postparent = *postparent; - } - - local->cont.mknod.inode = inode; - - fresh_children = local->fresh_children; - fresh_children[local->success_count] = child_index; - local->success_count++; - } - - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - afr_set_read_ctx_from_policy (this, inode, - local->fresh_children, - local->read_child_index, - priv->read_child); - local->transaction.unwind (frame, this); - - local->transaction.resume (frame, this); - } - - return 0; + return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, buf, + preparent, postparent, NULL, NULL, xdata); } -int32_t -afr_mknod_wind (call_frame_t *frame, xlator_t *this) +int +afr_mknod_wind (call_frame_t *frame, xlator_t *this, int subvol) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; local = frame->local; priv = this->private; - call_count = afr_up_children_count (priv->child_count, local->child_up); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_mknod_wind_cbk, (void *) (long) i, - priv->children[i], - priv->children[i]->fops->mknod, - &local->loc, local->cont.mknod.mode, - local->cont.mknod.dev, - local->cont.mknod.params); - if (!--call_count) - break; - } - } - + STACK_WIND_COOKIE (frame, afr_mknod_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->mknod, + &local->loc, local->cont.mknod.mode, + local->cont.mknod.dev, local->umask, + local->xdata_req); return 0; } - int -afr_mknod_done (call_frame_t *frame, xlator_t *this) +afr_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + dev_t dev, mode_t umask, dict_t *xdata) { - afr_local_t * local = NULL; - - local = frame->local; - - local->transaction.unwind (frame, this); - AFR_STACK_DESTROY (frame); - - return 0; -} - - -int -afr_mknod (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, dev_t dev, dict_t *params) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t * transaction_frame = NULL; - int ret = -1; - int op_ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; priv = this->private; - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - goto out; - } - - ALLOC_OR_GOTO (local, afr_local_t, out); + QUORUM_CHECK(mknod,out); - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + transaction_frame = copy_frame (frame); + if (!transaction_frame) goto out; - } - transaction_frame->local = local; + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) + goto out; loc_copy (&local->loc, loc); + local->inode = inode_ref (loc->inode); + local->parent = inode_ref (loc->parent); - LOCK (&priv->read_child_lock); - { - local->read_child_index = (++priv->read_child_rr) - % (priv->child_count); - } - UNLOCK (&priv->read_child_lock); - + local->op = GF_FOP_MKNOD; local->cont.mknod.mode = mode; local->cont.mknod.dev = dev; - if (params) - local->cont.mknod.params = dict_ref (params); + local->umask = umask; + + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); - if (loc->parent) - local->cont.mknod.parent_ino = loc->parent->ino; + if (!local->xdata_req) + goto out; - local->transaction.fop = afr_mknod_wind; - local->transaction.done = afr_mknod_done; + local->transaction.wind = afr_mknod_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; local->transaction.unwind = afr_mknod_unwind; - afr_build_parent_loc (&local->transaction.parent_loc, loc); + ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, + &op_errno); + if (ret) + goto out; local->transaction.main_frame = frame; local->transaction.basename = AFR_BASENAME (loc->path); + int_lock = &local->internal_lock; + + int_lock->lockee_count = 0; + ret = afr_init_entry_lockee (&int_lock->lockee[0], local, + &local->transaction.parent_loc, + local->transaction.basename, + priv->child_count); + if (ret) + goto out; - afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + int_lock->lockee_count++; + ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - op_ret = 0; + return 0; out: - if (op_ret == -1) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (mknod, frame, op_ret, op_errno, - NULL, NULL, NULL, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (mknod, frame, -1, op_errno, NULL, NULL, NULL, NULL, + NULL); return 0; } @@ -570,33 +661,17 @@ afr_mkdir_unwind (call_frame_t *frame, xlator_t *this) { call_frame_t *main_frame = NULL; afr_local_t *local = NULL; - struct iatt *unwind_buf = NULL; local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) { - main_frame = local->transaction.main_frame; - } - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - if (local->cont.mkdir.read_child_buf.ia_ino) { - unwind_buf = &local->cont.mkdir.read_child_buf; - } else { - unwind_buf = &local->cont.mkdir.buf; - } - - AFR_STACK_UNWIND (mkdir, main_frame, - local->op_ret, local->op_errno, - local->cont.mkdir.inode, - unwind_buf, &local->cont.mkdir.preparent, - &local->cont.mkdir.postparent); - } + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; + AFR_STACK_UNWIND (mkdir, main_frame, local->op_ret, local->op_errno, + local->inode, &local->cont.dir_fop.buf, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, local->xdata_rsp); return 0; } @@ -605,184 +680,106 @@ int afr_mkdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, inode_t *inode, struct iatt *buf, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int call_count = -1; - int child_index = -1; - int32_t *fresh_children = NULL; - - local = frame->local; - priv = this->private; - - child_index = (long) cookie; - - LOCK (&frame->lock); - { - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - - if (op_ret != -1) { - local->op_ret = op_ret; - - if (local->success_count == 0) - local->cont.mkdir.buf = *buf; - - if (child_index == local->read_child_index) { - local->cont.mkdir.read_child_buf = *buf; - local->cont.mkdir.preparent = *preparent; - local->cont.mkdir.postparent = *postparent; - } - - local->cont.mkdir.inode = inode; - - fresh_children = local->fresh_children; - fresh_children[local->success_count] = child_index; - local->success_count++; - } - - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - afr_set_read_ctx_from_policy (this, inode, - local->fresh_children, - local->read_child_index, - priv->read_child); - local->transaction.unwind (frame, this); - - local->transaction.resume (frame, this); - } - - return 0; + return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, buf, + preparent, postparent, NULL, NULL, xdata); } int -afr_mkdir_wind (call_frame_t *frame, xlator_t *this) +afr_mkdir_wind (call_frame_t *frame, xlator_t *this, int subvol) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; local = frame->local; priv = this->private; - call_count = afr_up_children_count (priv->child_count, local->child_up); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_mkdir_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->mkdir, - &local->loc, local->cont.mkdir.mode, - local->cont.mkdir.params); - if (!--call_count) - break; - } - } - + STACK_WIND_COOKIE (frame, afr_mkdir_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->mkdir, &local->loc, + local->cont.mkdir.mode, local->umask, + local->xdata_req); return 0; } int -afr_mkdir_done (call_frame_t *frame, xlator_t *this) +afr_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + mode_t umask, dict_t *xdata) { - afr_local_t * local = NULL; - - local = frame->local; - - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); - - return 0; -} - - -int -afr_mkdir (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, dict_t *params) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t * transaction_frame = NULL; - int ret = -1; - int op_ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; priv = this->private; - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - goto out; - } - - ALLOC_OR_GOTO (local, afr_local_t, out); + QUORUM_CHECK(mkdir,out); - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + transaction_frame = copy_frame (frame); + if (!transaction_frame) goto out; - } - transaction_frame->local = local; + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) + goto out; loc_copy (&local->loc, loc); - - LOCK (&priv->read_child_lock); - { - local->read_child_index = (++priv->read_child_rr) - % (priv->child_count); - } - UNLOCK (&priv->read_child_lock); + local->inode = inode_ref (loc->inode); + local->parent = inode_ref (loc->parent); local->cont.mkdir.mode = mode; - if (params) - local->cont.mkdir.params = dict_ref (params); + local->umask = umask; + + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); - if (loc->parent) - local->cont.mkdir.parent_ino = loc->parent->ino; + if (!local->xdata_req) + goto out; - local->transaction.fop = afr_mkdir_wind; - local->transaction.done = afr_mkdir_done; + local->op = GF_FOP_MKDIR; + local->transaction.wind = afr_mkdir_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; local->transaction.unwind = afr_mkdir_unwind; - afr_build_parent_loc (&local->transaction.parent_loc, loc); + ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, + &op_errno); + if (ret) + goto out; local->transaction.main_frame = frame; local->transaction.basename = AFR_BASENAME (loc->path); + int_lock = &local->internal_lock; + + int_lock->lockee_count = 0; + ret = afr_init_entry_lockee (&int_lock->lockee[0], local, + &local->transaction.parent_loc, + local->transaction.basename, + priv->child_count); + if (ret) + goto out; - afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + int_lock->lockee_count++; + ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - op_ret = 0; + return 0; out: - if (op_ret == -1) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - - AFR_STACK_UNWIND (mkdir, frame, op_ret, op_errno, - NULL, NULL, NULL, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (mkdir, frame, -1, op_errno, NULL, NULL, NULL, NULL, + NULL); return 0; } @@ -796,33 +793,17 @@ afr_link_unwind (call_frame_t *frame, xlator_t *this) { call_frame_t *main_frame = NULL; afr_local_t *local = NULL; - struct iatt *unwind_buf = NULL; local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) { - main_frame = local->transaction.main_frame; - } - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - if (local->cont.link.read_child_buf.ia_ino) { - unwind_buf = &local->cont.link.read_child_buf; - } else { - unwind_buf = &local->cont.link.buf; - } - - AFR_STACK_UNWIND (link, main_frame, - local->op_ret, local->op_errno, - local->cont.link.inode, - unwind_buf, &local->cont.link.preparent, - &local->cont.link.postparent); - } + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; + AFR_STACK_UNWIND (link, main_frame, local->op_ret, local->op_errno, + local->inode, &local->cont.dir_fop.buf, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, local->xdata_rsp); return 0; } @@ -831,182 +812,105 @@ int afr_link_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, inode_t *inode, struct iatt *buf, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int call_count = -1; - int child_index = -1; - int32_t *fresh_children = NULL; - - local = frame->local; - priv = this->private; - - child_index = (long) cookie; - - LOCK (&frame->lock); - { - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - - if (op_ret != -1) { - local->op_ret = op_ret; - - if (local->success_count == 0) { - local->cont.link.buf = *buf; - } - - if (child_index == local->read_child_index) { - local->cont.link.read_child_buf = *buf; - local->cont.link.preparent = *preparent; - local->cont.link.postparent = *postparent; - } - - local->cont.link.inode = inode; - - fresh_children = local->fresh_children; - fresh_children[local->success_count] = child_index; - local->success_count++; - } - - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - afr_set_read_ctx_from_policy (this, inode, - local->fresh_children, - local->read_child_index, - priv->read_child); - local->transaction.unwind (frame, this); - - local->transaction.resume (frame, this); - } - - return 0; + return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, buf, + preparent, postparent, NULL, NULL, xdata); } int -afr_link_wind (call_frame_t *frame, xlator_t *this) +afr_link_wind (call_frame_t *frame, xlator_t *this, int subvol) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; local = frame->local; priv = this->private; - call_count = afr_up_children_count (priv->child_count, local->child_up); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_link_wind_cbk, (void *) (long) i, - priv->children[i], - priv->children[i]->fops->link, - &local->loc, - &local->newloc); - - if (!--call_count) - break; - } - } - + STACK_WIND_COOKIE (frame, afr_link_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->link, + &local->loc, &local->newloc, local->xdata_req); return 0; } int -afr_link_done (call_frame_t *frame, xlator_t *this) +afr_link (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) { - afr_local_t * local = frame->local; - - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); - - return 0; -} - - -int -afr_link (call_frame_t *frame, xlator_t *this, - loc_t *oldloc, loc_t *newloc) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t * transaction_frame = NULL; - int ret = -1; - int op_ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; priv = this->private; + QUORUM_CHECK(link,out); + transaction_frame = copy_frame (frame); - if (!transaction_frame) { + if (!transaction_frame) goto out; - } - ALLOC_OR_GOTO (local, afr_local_t, out); - - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } - - transaction_frame->local = local; + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) + goto out; loc_copy (&local->loc, oldloc); loc_copy (&local->newloc, newloc); - LOCK (&priv->read_child_lock); - { - local->read_child_index = (++priv->read_child_rr) - % (priv->child_count); - } - UNLOCK (&priv->read_child_lock); + local->inode = inode_ref (oldloc->inode); + local->parent = inode_ref (newloc->parent); - local->cont.link.ino = oldloc->inode->ino; + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); - if (oldloc->parent) - local->cont.link.parent_ino = newloc->parent->ino; + if (!local->xdata_req) + goto out; - local->transaction.fop = afr_link_wind; - local->transaction.done = afr_link_done; + local->op = GF_FOP_LINK; + + local->transaction.wind = afr_link_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; local->transaction.unwind = afr_link_unwind; - afr_build_parent_loc (&local->transaction.parent_loc, oldloc); + ret = afr_build_parent_loc (&local->transaction.parent_loc, newloc, + &op_errno); + if (ret) + goto out; local->transaction.main_frame = frame; - local->transaction.basename = AFR_BASENAME (oldloc->path); - local->transaction.new_basename = AFR_BASENAME (newloc->path); + local->transaction.basename = AFR_BASENAME (newloc->path); + int_lock = &local->internal_lock; + + int_lock->lockee_count = 0; + ret = afr_init_entry_lockee (&int_lock->lockee[0], local, + &local->transaction.parent_loc, + local->transaction.basename, + priv->child_count); + if (ret) + goto out; - afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + int_lock->lockee_count++; + ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - op_ret = 0; + return 0; out: - if (op_ret == -1) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (link, frame, op_ret, op_errno, - NULL, NULL, NULL, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (link, frame, -1, op_errno, NULL, NULL, NULL, NULL, + NULL); return 0; } @@ -1020,33 +924,17 @@ afr_symlink_unwind (call_frame_t *frame, xlator_t *this) { call_frame_t *main_frame = NULL; afr_local_t *local = NULL; - struct iatt *unwind_buf = NULL; local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) { - main_frame = local->transaction.main_frame; - } - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - if (local->cont.symlink.read_child_buf.ia_ino) { - unwind_buf = &local->cont.symlink.read_child_buf; - } else { - unwind_buf = &local->cont.symlink.buf; - } - - AFR_STACK_UNWIND (symlink, main_frame, - local->op_ret, local->op_errno, - local->cont.symlink.inode, - unwind_buf, &local->cont.symlink.preparent, - &local->cont.symlink.postparent); - } + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; + AFR_STACK_UNWIND (symlink, main_frame, local->op_ret, local->op_errno, + local->inode, &local->cont.dir_fop.buf, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, local->xdata_rsp); return 0; } @@ -1055,184 +943,106 @@ int afr_symlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, inode_t *inode, struct iatt *buf, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int call_count = -1; - int child_index = -1; - int32_t *fresh_children = NULL; - - local = frame->local; - priv = this->private; - - child_index = (long) cookie; - - LOCK (&frame->lock); - { - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - - if (op_ret != -1) { - local->op_ret = op_ret; - - if (local->success_count == 0) - local->cont.symlink.buf = *buf; - - if (child_index == local->read_child_index) { - local->cont.symlink.read_child_buf = *buf; - local->cont.symlink.preparent = *preparent; - local->cont.symlink.postparent = *postparent; - } - - local->cont.symlink.inode = inode; - - fresh_children = local->fresh_children; - fresh_children[local->success_count] = child_index; - local->success_count++; - } - - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - afr_set_read_ctx_from_policy (this, inode, - local->fresh_children, - local->read_child_index, - priv->read_child); - local->transaction.unwind (frame, this); - - local->transaction.resume (frame, this); - } - - return 0; + return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, buf, + preparent, postparent, NULL, NULL, xdata); } int -afr_symlink_wind (call_frame_t *frame, xlator_t *this) +afr_symlink_wind (call_frame_t *frame, xlator_t *this, int subvol) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; local = frame->local; priv = this->private; - call_count = afr_up_children_count (priv->child_count, local->child_up); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_symlink_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->symlink, - local->cont.symlink.linkpath, - &local->loc, - local->cont.symlink.params); - - if (!--call_count) - break; - - } - } - - return 0; -} - - -int -afr_symlink_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t * local = frame->local; - - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); - + STACK_WIND_COOKIE (frame, afr_symlink_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->symlink, + local->cont.symlink.linkpath, &local->loc, + local->umask, local->xdata_req); return 0; } int -afr_symlink (call_frame_t *frame, xlator_t *this, - const char *linkpath, loc_t *loc, dict_t *params) +afr_symlink (call_frame_t *frame, xlator_t *this, const char *linkpath, + loc_t *loc, mode_t umask, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t * transaction_frame = NULL; - int ret = -1; - int op_ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; priv = this->private; - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - goto out; - } + QUORUM_CHECK(symlink,out); - ALLOC_OR_GOTO (local, afr_local_t, out); - - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + transaction_frame = copy_frame (frame); + if (!transaction_frame) goto out; - } - transaction_frame->local = local; + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) + goto out; loc_copy (&local->loc, loc); - - LOCK (&priv->read_child_lock); - { - local->read_child_index = (++priv->read_child_rr) - % (priv->child_count); - } - UNLOCK (&priv->read_child_lock); + local->inode = inode_ref (loc->inode); + local->parent = inode_ref (loc->parent); local->cont.symlink.linkpath = gf_strdup (linkpath); - if (params) - local->cont.symlink.params = dict_ref (params); + local->umask = umask; - if (loc->parent) - local->cont.symlink.parent_ino = loc->parent->ino; + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); - local->transaction.fop = afr_symlink_wind; - local->transaction.done = afr_symlink_done; + if (!local->xdata_req) + goto out; + + local->op = GF_FOP_SYMLINK; + local->transaction.wind = afr_symlink_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; local->transaction.unwind = afr_symlink_unwind; - afr_build_parent_loc (&local->transaction.parent_loc, loc); + ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, + &op_errno); + if (ret) + goto out; local->transaction.main_frame = frame; local->transaction.basename = AFR_BASENAME (loc->path); + int_lock = &local->internal_lock; + + int_lock->lockee_count = 0; + ret = afr_init_entry_lockee (&int_lock->lockee[0], local, + &local->transaction.parent_loc, + local->transaction.basename, + priv->child_count); + if (ret) + goto out; - afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + int_lock->lockee_count++; + ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - op_ret = 0; + return 0; out: - if (op_ret == -1) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (symlink, frame, op_ret, op_errno, - NULL, NULL, NULL, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (symlink, frame, -1, op_errno, NULL, NULL, NULL, + NULL, NULL); return 0; } @@ -1245,35 +1055,19 @@ afr_rename_unwind (call_frame_t *frame, xlator_t *this) { call_frame_t *main_frame = NULL; afr_local_t *local = NULL; - struct iatt *unwind_buf = NULL; local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) { - main_frame = local->transaction.main_frame; - } - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - if (local->cont.rename.read_child_buf.ia_ino) { - unwind_buf = &local->cont.rename.read_child_buf; - } else { - unwind_buf = &local->cont.rename.buf; - } - - AFR_STACK_UNWIND (rename, main_frame, - local->op_ret, local->op_errno, - unwind_buf, - &local->cont.rename.preoldparent, - &local->cont.rename.postoldparent, - &local->cont.rename.prenewparent, - &local->cont.rename.postnewparent); - } + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; + AFR_STACK_UNWIND (rename, main_frame, local->op_ret, local->op_errno, + &local->cont.dir_fop.buf, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, + &local->cont.dir_fop.prenewparent, + &local->cont.dir_fop.postnewparent, local->xdata_rsp); return 0; } @@ -1282,174 +1076,135 @@ int afr_rename_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *buf, struct iatt *preoldparent, struct iatt *postoldparent, - struct iatt *prenewparent, struct iatt *postnewparent) + struct iatt *prenewparent, struct iatt *postnewparent, + dict_t *xdata) { - afr_local_t * local = NULL; - int call_count = -1; - int child_index = -1; - - local = frame->local; - - child_index = (long) cookie; - - LOCK (&frame->lock); - { - if (afr_fop_failed (op_ret, op_errno) && op_errno != ENOTEMPTY) - afr_transaction_fop_failed (frame, this, child_index); - - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - - if (buf) { - local->cont.rename.buf = *buf; - } - - local->success_count++; - } - - if (child_index == local->read_child_index) { - local->cont.rename.read_child_buf = *buf; - - local->cont.rename.preoldparent = *preoldparent; - local->cont.rename.postoldparent = *postoldparent; - local->cont.rename.prenewparent = *prenewparent; - local->cont.rename.postnewparent = *postnewparent; - } - } - - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - local->transaction.unwind (frame, this); - local->transaction.resume (frame, this); - } - - return 0; + return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, buf, + preoldparent, postoldparent, prenewparent, + postnewparent, xdata); } -int32_t -afr_rename_wind (call_frame_t *frame, xlator_t *this) +int +afr_rename_wind (call_frame_t *frame, xlator_t *this, int subvol) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; - - local = frame->local; - priv = this->private; - call_count = afr_up_children_count (priv->child_count, local->child_up); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_rename_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->rename, - &local->loc, - &local->newloc); - if (!--call_count) - break; - } - } - - return 0; -} - - -int -afr_rename_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t * local = frame->local; - - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); + local = frame->local; + priv = this->private; + STACK_WIND_COOKIE (frame, afr_rename_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->rename, + &local->loc, &local->newloc, local->xdata_req); return 0; } int -afr_rename (call_frame_t *frame, xlator_t *this, - loc_t *oldloc, loc_t *newloc) +afr_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t * transaction_frame = NULL; - int ret = -1; - int op_ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; + int nlockee = 0; priv = this->private; - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - goto out; - } - - ALLOC_OR_GOTO (local, afr_local_t, out); + QUORUM_CHECK(rename,out); - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } + transaction_frame = copy_frame (frame); + if (!transaction_frame) + op_errno = ENOMEM; - transaction_frame->local = local; + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) + goto out; loc_copy (&local->loc, oldloc); loc_copy (&local->newloc, newloc); - local->read_child_index = afr_inode_get_read_ctx (this, oldloc->inode, NULL); + local->inode = inode_ref (oldloc->inode); + local->parent = inode_ref (oldloc->parent); + local->parent2 = inode_ref (newloc->parent); - local->cont.rename.ino = oldloc->inode->ino; + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); - if (oldloc->parent) - local->cont.rename.oldparent_ino = oldloc->parent->ino; - if (newloc->parent) - local->cont.rename.newparent_ino = newloc->parent->ino; + if (!local->xdata_req) + goto out; - local->transaction.fop = afr_rename_wind; - local->transaction.done = afr_rename_done; + local->op = GF_FOP_RENAME; + local->transaction.wind = afr_rename_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; local->transaction.unwind = afr_rename_unwind; - afr_build_parent_loc (&local->transaction.parent_loc, oldloc); - afr_build_parent_loc (&local->transaction.new_parent_loc, newloc); + ret = afr_build_parent_loc (&local->transaction.parent_loc, oldloc, + &op_errno); + if (ret) + goto out; + ret = afr_build_parent_loc (&local->transaction.new_parent_loc, newloc, + &op_errno); + if (ret) + goto out; local->transaction.main_frame = frame; local->transaction.basename = AFR_BASENAME (oldloc->path); local->transaction.new_basename = AFR_BASENAME (newloc->path); + int_lock = &local->internal_lock; + + int_lock->lockee_count = nlockee = 0; + ret = afr_init_entry_lockee (&int_lock->lockee[nlockee], local, + &local->transaction.new_parent_loc, + local->transaction.new_basename, + priv->child_count); + if (ret) + goto out; - afr_transaction (transaction_frame, this, AFR_ENTRY_RENAME_TRANSACTION); + nlockee++; + ret = afr_init_entry_lockee (&int_lock->lockee[nlockee], local, + &local->transaction.parent_loc, + local->transaction.basename, + priv->child_count); + if (ret) + goto out; - op_ret = 0; -out: - if (op_ret == -1) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); + nlockee++; + if (local->newloc.inode && IA_ISDIR (local->newloc.inode->ia_type)) { + ret = afr_init_entry_lockee (&int_lock->lockee[nlockee], local, + &local->newloc, + NULL, + priv->child_count); + if (ret) + goto out; + + nlockee++; + } + qsort (int_lock->lockee, nlockee, sizeof (*int_lock->lockee), + afr_entry_lockee_cmp); + int_lock->lockee_count = nlockee; - AFR_STACK_UNWIND (rename, frame, op_ret, op_errno, - NULL, NULL, NULL, NULL, NULL); + ret = afr_transaction (transaction_frame, this, AFR_ENTRY_RENAME_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; } + return 0; +out: + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + + AFR_STACK_UNWIND (rename, frame, -1, op_errno, NULL, NULL, NULL, NULL, + NULL, NULL); return 0; } @@ -1465,22 +1220,13 @@ afr_unlink_unwind (call_frame_t *frame, xlator_t *this) local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) { - main_frame = local->transaction.main_frame; - } - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - AFR_STACK_UNWIND (unlink, main_frame, - local->op_ret, local->op_errno, - &local->cont.unlink.preparent, - &local->cont.unlink.postparent); - } + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; + AFR_STACK_UNWIND (unlink, main_frame, local->op_ret, local->op_errno, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, local->xdata_rsp); return 0; } @@ -1488,163 +1234,103 @@ afr_unlink_unwind (call_frame_t *frame, xlator_t *this) int afr_unlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - int call_count = -1; - int child_index = (long) cookie; - - local = frame->local; - priv = this->private; - - LOCK (&frame->lock); - { - if (child_index == local->read_child_index) { - local->read_child_returned = _gf_true; - } - - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - local->cont.unlink.preparent = *preparent; - local->cont.unlink.postparent = *postparent; - } - - if (child_index == local->read_child_index) { - local->cont.unlink.preparent = *preparent; - local->cont.unlink.postparent = *postparent; - } - - local->success_count++; - } - - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - local->transaction.unwind (frame, this); - - local->transaction.resume (frame, this); - } - - return 0; + return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, NULL, + preparent, postparent, NULL, NULL, xdata); } -int32_t -afr_unlink_wind (call_frame_t *frame, xlator_t *this) +int +afr_unlink_wind (call_frame_t *frame, xlator_t *this, int subvol) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; local = frame->local; priv = this->private; - call_count = afr_up_children_count (priv->child_count, local->child_up); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_unlink_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->unlink, - &local->loc); - - if (!--call_count) - break; - } - } - + STACK_WIND_COOKIE (frame, afr_unlink_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->unlink, + &local->loc, local->xflag, local->xdata_req); return 0; } -int32_t -afr_unlink_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t * local = frame->local; - - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); - - return 0; -} - - -int32_t -afr_unlink (call_frame_t *frame, xlator_t *this, - loc_t *loc) +int +afr_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, + dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t * transaction_frame = NULL; - int ret = -1; - int op_ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; priv = this->private; + QUORUM_CHECK(unlink,out); + transaction_frame = copy_frame (frame); - if (!transaction_frame) { + if (!transaction_frame) goto out; - } - ALLOC_OR_GOTO (local, afr_local_t, out); + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) + goto out; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } + loc_copy (&local->loc, loc); + local->xflag = xflag; - transaction_frame->local = local; + local->inode = inode_ref (loc->inode); + local->parent = inode_ref (loc->parent); - loc_copy (&local->loc, loc); + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); - if (loc->parent) - local->cont.unlink.parent_ino = loc->parent->ino; + if (!local->xdata_req) + goto out; - local->transaction.fop = afr_unlink_wind; - local->transaction.done = afr_unlink_done; + local->op = GF_FOP_UNLINK; + local->transaction.wind = afr_unlink_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; local->transaction.unwind = afr_unlink_unwind; - afr_build_parent_loc (&local->transaction.parent_loc, loc); + ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, + &op_errno); + if (ret) + goto out; local->transaction.main_frame = frame; local->transaction.basename = AFR_BASENAME (loc->path); + int_lock = &local->internal_lock; + + int_lock->lockee_count = 0; + ret = afr_init_entry_lockee (&int_lock->lockee[0], local, + &local->transaction.parent_loc, + local->transaction.basename, + priv->child_count); + if (ret) + goto out; - afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + int_lock->lockee_count++; + ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - op_ret = 0; + return 0; out: - if (op_ret == -1) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (unlink, frame, op_ret, op_errno, - NULL, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (unlink, frame, -1, op_errno, NULL, NULL, NULL); return 0; } @@ -1662,22 +1348,13 @@ afr_rmdir_unwind (call_frame_t *frame, xlator_t *this) local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) { - main_frame = local->transaction.main_frame; - } - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - AFR_STACK_UNWIND (rmdir, main_frame, - local->op_ret, local->op_errno, - &local->cont.rmdir.preparent, - &local->cont.rmdir.postparent); - } + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; + AFR_STACK_UNWIND (rmdir, main_frame, local->op_ret, local->op_errno, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, local->xdata_rsp); return 0; } @@ -1685,165 +1362,117 @@ afr_rmdir_unwind (call_frame_t *frame, xlator_t *this) int afr_rmdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *preparent, - struct iatt *postparent) + struct iatt *postparent, dict_t *xdata) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - int call_count = -1; - int child_index = (long) cookie; - int read_child = 0; - - local = frame->local; - priv = this->private; - - LOCK (&frame->lock); - { - if (child_index == read_child) { - local->read_child_returned = _gf_true; - } - - if (afr_fop_failed (op_ret, op_errno) && (op_errno != ENOTEMPTY)) - afr_transaction_fop_failed (frame, this, child_index); - - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - local->cont.rmdir.preparent = *preparent; - local->cont.rmdir.postparent = *postparent; - - } - - if (child_index == read_child) { - local->cont.rmdir.preparent = *preparent; - local->cont.rmdir.postparent = *postparent; - } - - local->success_count++; - } - - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - local->transaction.unwind (frame, this); - local->transaction.resume (frame, this); - } - - return 0; + return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, NULL, + preparent, postparent, NULL, NULL, xdata); } int -afr_rmdir_wind (call_frame_t *frame, xlator_t *this) +afr_rmdir_wind (call_frame_t *frame, xlator_t *this, int subvol) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; local = frame->local; priv = this->private; - call_count = afr_up_children_count (priv->child_count, local->child_up); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_rmdir_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->rmdir, - &local->loc, local->cont.rmdir.flags); - - if (!--call_count) - break; - } - } - - return 0; -} - - -int -afr_rmdir_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t * local = frame->local; - - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); - + STACK_WIND_COOKIE (frame, afr_rmdir_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->rmdir, + &local->loc, local->cont.rmdir.flags, local->xdata_req); return 0; } int -afr_rmdir (call_frame_t *frame, xlator_t *this, - loc_t *loc, int flags) +afr_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, + dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t * transaction_frame = NULL; - int ret = -1; - int op_ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; + int nlockee = 0; priv = this->private; + QUORUM_CHECK(rmdir,out); + transaction_frame = copy_frame (frame); - if (!transaction_frame) { + if (!transaction_frame) goto out; - } - ALLOC_OR_GOTO (local, afr_local_t, out); + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) + goto out; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } - transaction_frame->local = local; + loc_copy (&local->loc, loc); + local->inode = inode_ref (loc->inode); + local->parent = inode_ref (loc->parent); local->cont.rmdir.flags = flags; - loc_copy (&local->loc, loc); - if (loc->parent) - local->cont.rmdir.parent_ino = loc->parent->ino; + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); + + if (!local->xdata_req) + goto out; - local->transaction.fop = afr_rmdir_wind; - local->transaction.done = afr_rmdir_done; + local->op = GF_FOP_RMDIR; + local->transaction.wind = afr_rmdir_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; local->transaction.unwind = afr_rmdir_unwind; - afr_build_parent_loc (&local->transaction.parent_loc, loc); + ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, + &op_errno); + if (ret) + goto out; local->transaction.main_frame = frame; local->transaction.basename = AFR_BASENAME (loc->path); + int_lock = &local->internal_lock; + + int_lock->lockee_count = nlockee = 0; + ret = afr_init_entry_lockee (&int_lock->lockee[nlockee], local, + &local->transaction.parent_loc, + local->transaction.basename, + priv->child_count); + if (ret) + goto out; - afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + nlockee++; + ret = afr_init_entry_lockee (&int_lock->lockee[nlockee], local, + &local->loc, + NULL, + priv->child_count); + if (ret) + goto out; - op_ret = 0; -out: - if (op_ret == -1) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (rmdir, frame, op_ret, op_errno, - NULL, NULL); + nlockee++; + qsort (int_lock->lockee, nlockee, sizeof (*int_lock->lockee), + afr_entry_lockee_cmp); + int_lock->lockee_count = nlockee; + + ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; } + return 0; +out: + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + + AFR_STACK_UNWIND (rmdir, frame, -1, op_errno, NULL, NULL, NULL); return 0; } diff --git a/xlators/cluster/afr/src/afr-dir-write.h b/xlators/cluster/afr/src/afr-dir-write.h index fb754d371..02f0a3682 100644 --- a/xlators/cluster/afr/src/afr-dir-write.h +++ b/xlators/cluster/afr/src/afr-dir-write.h @@ -1,20 +1,11 @@ /* - Copyright (c) 2007-2010 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef __DIR_WRITE_H__ @@ -23,38 +14,34 @@ int32_t afr_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, mode_t mode, - fd_t *fd, dict_t *params); + mode_t umask, fd_t *fd, dict_t *xdata); int32_t afr_mknod (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, dev_t dev, dict_t *params); + loc_t *loc, mode_t mode, dev_t dev, mode_t umask, dict_t *xdata); int32_t afr_mkdir (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, dict_t *params); + loc_t *loc, mode_t mode, mode_t umask, dict_t *xdata); int32_t afr_unlink (call_frame_t *frame, xlator_t *this, - loc_t *loc); + loc_t *loc, int xflag, dict_t *xdata); int32_t afr_rmdir (call_frame_t *frame, xlator_t *this, - loc_t *loc, int flags); + loc_t *loc, int flags, dict_t *xdata); int32_t afr_link (call_frame_t *frame, xlator_t *this, - loc_t *oldloc, loc_t *newloc); + loc_t *oldloc, loc_t *newloc, dict_t *xdata); int32_t afr_rename (call_frame_t *frame, xlator_t *this, - loc_t *oldloc, loc_t *newloc); + loc_t *oldloc, loc_t *newloc, dict_t *xdata); int afr_symlink (call_frame_t *frame, xlator_t *this, - const char *linkpath, loc_t *oldloc, dict_t *params); - -int32_t -afr_setdents (call_frame_t *frame, xlator_t *this, - fd_t *fd, int32_t flags, dir_entry_t *entries, int32_t count); + const char *linkpath, loc_t *oldloc, mode_t umask, dict_t *params); #endif /* __DIR_WRITE_H__ */ diff --git a/xlators/cluster/afr/src/afr-inode-read.c b/xlators/cluster/afr/src/afr-inode-read.c index a42f5ea73..01e078c13 100644 --- a/xlators/cluster/afr/src/afr-inode-read.c +++ b/xlators/cluster/afr/src/afr-inode-read.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2007-2010 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ @@ -44,247 +35,153 @@ #include "compat-errno.h" #include "compat.h" -#include "afr.h" +#include "afr-transaction.h" -/** - * Common algorithm for inode read calls: - * - * - Try the fop on the first child that is up - * - if we have failed due to ENOTCONN: - * try the next child - * - * Applicable to: access, stat, fstat, readlink, getxattr - */ - /* {{{ access */ -int32_t -afr_access_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno) +int +afr_access_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - xlator_t ** children = NULL; - int unwind = 1; - int32_t *last_index = NULL; - int32_t next_call_child = -1; - int32_t read_child = -1; - int32_t *fresh_children = NULL; - - priv = this->private; - children = priv->children; + afr_local_t *local = NULL; local = frame->local; - read_child = (long) cookie; + if (op_ret < 0) { + local->op_ret = op_ret; + local->op_errno = op_errno; - if (op_ret == -1) { - last_index = &local->cont.access.last_index; - fresh_children = local->fresh_children; - next_call_child = afr_next_call_child (fresh_children, - local->child_up, - priv->child_count, - last_index, read_child); - if (next_call_child < 0) - goto out; - - unwind = 0; + afr_read_txn_continue (frame, this, (long) cookie); + return 0; + } - STACK_WIND_COOKIE (frame, afr_access_cbk, - (void *) (long) read_child, - children[next_call_child], - children[next_call_child]->fops->access, - &local->loc, local->cont.access.mask); - } - -out: - if (unwind) { - AFR_STACK_UNWIND (access, frame, op_ret, op_errno); - } + AFR_STACK_UNWIND (access, frame, op_ret, op_errno, xdata); return 0; } -int32_t -afr_access (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask) +int +afr_access_wind (call_frame_t *frame, xlator_t *this, int subvol) { - afr_private_t *priv = NULL; - xlator_t **children = NULL; - int call_child = 0; - afr_local_t *local = NULL; - int32_t op_ret = -1; - int32_t op_errno = 0; - int32_t read_child = -1; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv->children, out); - - children = priv->children; - - ALLOC_OR_GOTO (local, afr_local_t, out); - frame->local = local; - - op_ret = AFR_LOCAL_INIT (local, priv); - if (op_ret < 0) { - op_errno = -op_ret; - goto out; - } - - local->fresh_children = afr_fresh_children_create (priv->child_count); - if (!local->fresh_children) { - op_errno = ENOMEM; - goto out; - } + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + + priv = this->private; + local = frame->local; + + if (subvol == -1) { + AFR_STACK_UNWIND (access, frame, local->op_ret, + local->op_errno, 0); + return 0; + } + + STACK_WIND_COOKIE (frame, afr_access_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->access, + &local->loc, local->cont.access.mask, + local->xdata_req); + return 0; +} +int +afr_access (call_frame_t *frame, xlator_t *this, loc_t *loc, + int mask, dict_t *xdata) +{ + afr_local_t *local = NULL; + int op_errno = 0; - read_child = afr_inode_get_read_ctx (this, loc->inode, - local->fresh_children); - op_ret = afr_get_call_child (this, local->child_up, read_child, - local->fresh_children, - &call_child, - &local->cont.access.last_index); - if (op_ret < 0) { - op_errno = -op_ret; - op_ret = -1; - goto out; - } + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; - loc_copy (&local->loc, loc); - local->cont.access.mask = mask; + local->op = GF_FOP_ACCESS; + loc_copy (&local->loc, loc); + local->cont.access.mask = mask; + if (xdata) + local->xdata_req = dict_ref (xdata); - STACK_WIND_COOKIE (frame, afr_access_cbk, - (void *) (long) call_child, - children[call_child], - children[call_child]->fops->access, - loc, mask); + afr_read_txn (frame, this, loc->inode, afr_access_wind, + AFR_METADATA_TRANSACTION); - op_ret = 0; + return 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (access, frame, op_ret, op_errno); - } + AFR_STACK_UNWIND (access, frame, -1, op_errno, NULL); + return 0; } - /* }}} */ /* {{{ stat */ -int32_t +int afr_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - struct iatt *buf) + struct iatt *buf, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - xlator_t ** children = NULL; - int unwind = 1; - int32_t *last_index = NULL; - int32_t next_call_child = -1; - int32_t read_child = -1; - int32_t *fresh_children = NULL; - - priv = this->private; - children = priv->children; - - read_child = (long) cookie; + afr_local_t *local = NULL; local = frame->local; - if (op_ret == -1) { - last_index = &local->cont.stat.last_index; - fresh_children = local->fresh_children; - next_call_child = afr_next_call_child (fresh_children, - local->child_up, - priv->child_count, - last_index, read_child); - if (next_call_child < 0) - goto out; + if (op_ret < 0) { + local->op_ret = op_ret; + local->op_errno = op_errno; - unwind = 0; + afr_read_txn_continue (frame, this, (long) cookie); + return 0; + } - STACK_WIND_COOKIE (frame, afr_stat_cbk, - (void *) (long) read_child, - children[next_call_child], - children[next_call_child]->fops->stat, - &local->loc); - } - -out: - if (unwind) { - AFR_STACK_UNWIND (stat, frame, op_ret, op_errno, buf); - } + AFR_STACK_UNWIND (stat, frame, op_ret, op_errno, buf, xdata); return 0; } -int32_t -afr_stat (call_frame_t *frame, xlator_t *this, loc_t *loc) +int +afr_stat_wind (call_frame_t *frame, xlator_t *this, int subvol) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - xlator_t **children = NULL; - int call_child = 0; - int32_t op_ret = -1; - int32_t op_errno = 0; - int32_t read_child = -1; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv->children, out); - - children = priv->children; - - ALLOC_OR_GOTO (local, afr_local_t, out); - frame->local = local; - op_ret = AFR_LOCAL_INIT (local, priv); - if (op_ret < 0) { - op_errno = -op_ret; - goto out; - } + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + + priv = this->private; + local = frame->local; + + if (subvol == -1) { + AFR_STACK_UNWIND (stat, frame, local->op_ret, local->op_errno, + 0, 0); + return 0; + } + + STACK_WIND_COOKIE (frame, afr_stat_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->stat, + &local->loc, local->xdata_req); + return 0; +} - local->fresh_children = afr_fresh_children_create (priv->child_count); - if (!local->fresh_children) { - op_errno = ENOMEM; - goto out; - } +int +afr_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + afr_local_t *local = NULL; + int op_errno = 0; - read_child = afr_inode_get_read_ctx (this, loc->inode, - local->fresh_children); - op_ret = afr_get_call_child (this, local->child_up, read_child, - local->fresh_children, - &call_child, - &local->cont.stat.last_index); - if (op_ret < 0) { - op_errno = -op_ret; - op_ret = -1; - goto out; - } - loc_copy (&local->loc, loc); + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; - local->cont.stat.ino = loc->inode->ino; + local->op = GF_FOP_STAT; + loc_copy (&local->loc, loc); + if (xdata) + local->xdata_req = dict_ref (xdata); - STACK_WIND_COOKIE (frame, afr_stat_cbk, (void *) (long) call_child, - children[call_child], - children[call_child]->fops->stat, - loc); + afr_read_txn (frame, this, loc->inode, afr_stat_wind, + AFR_DATA_TRANSACTION); - op_ret = 0; + return 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (stat, frame, op_ret, op_errno, NULL); - } + AFR_STACK_UNWIND (stat, frame, -1, op_errno, NULL, NULL); return 0; } @@ -294,121 +191,76 @@ out: /* {{{ fstat */ -int32_t +int afr_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *buf) + int32_t op_ret, int32_t op_errno, struct iatt *buf, + dict_t *xdata) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - xlator_t **children = NULL; - int unwind = 1; - int32_t *last_index = NULL; - int32_t next_call_child = -1; - int32_t read_child = -1; - int32_t *fresh_children = NULL; - - priv = this->private; - children = priv->children; + afr_local_t *local = NULL; local = frame->local; - read_child = (long) cookie; + if (op_ret < 0) { + local->op_ret = op_ret; + local->op_errno = op_errno; - if (op_ret == -1) { - last_index = &local->cont.fstat.last_index; - fresh_children = local->fresh_children; - next_call_child = afr_next_call_child (fresh_children, - local->child_up, - priv->child_count, - last_index, read_child); - if (next_call_child < 0) - goto out; + afr_read_txn_continue (frame, this, (long) cookie); + return 0; + } - unwind = 0; + AFR_STACK_UNWIND (fstat, frame, op_ret, op_errno, buf, xdata); - STACK_WIND_COOKIE (frame, afr_fstat_cbk, - (void *) (long) read_child, - children[next_call_child], - children[next_call_child]->fops->fstat, - local->fd); - } + return 0; +} -out: - if (unwind) { - AFR_STACK_UNWIND (fstat, frame, op_ret, op_errno, buf); - } - return 0; +int +afr_fstat_wind (call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + + priv = this->private; + local = frame->local; + + if (subvol == -1) { + AFR_STACK_UNWIND (fstat, frame, local->op_ret, local->op_errno, + 0, 0); + return 0; + } + + STACK_WIND_COOKIE (frame, afr_fstat_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->fstat, + local->fd, local->xdata_req); + return 0; } int32_t afr_fstat (call_frame_t *frame, xlator_t *this, - fd_t *fd) + fd_t *fd, dict_t *xdata) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - xlator_t **children = NULL; - int call_child = 0; - int32_t op_ret = -1; - int32_t op_errno = 0; - int32_t read_child = 0; + afr_local_t *local = NULL; + int op_errno = 0; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (fd, out); - VALIDATE_OR_GOTO (this->private, out); + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; - priv = this->private; - VALIDATE_OR_GOTO (priv->children, out); + local->op = GF_FOP_FSTAT; + local->fd = fd_ref (fd); + if (xdata) + local->xdata_req = dict_ref (xdata); - children = priv->children; - - VALIDATE_OR_GOTO (fd->inode, out); - - ALLOC_OR_GOTO (local, afr_local_t, out); - frame->local = local; - - op_ret = AFR_LOCAL_INIT (local, priv); - if (op_ret < 0) { - op_errno = -op_ret; - goto out; - } - - local->fresh_children = afr_fresh_children_create (priv->child_count); - if (!local->fresh_children) { - op_errno = ENOMEM; - goto out; - } - - read_child = afr_inode_get_read_ctx (this, fd->inode, - local->fresh_children); - - - - op_ret = afr_get_call_child (this, local->child_up, read_child, - local->fresh_children, - &call_child, - &local->cont.fstat.last_index); - if (op_ret < 0) { - op_errno = -op_ret; - op_ret = -1; - goto out; - } - - local->cont.fstat.ino = fd->inode->ino; - local->fd = fd_ref (fd); + afr_fix_open (fd, this); - STACK_WIND_COOKIE (frame, afr_fstat_cbk, (void *) (long) call_child, - children[call_child], - children[call_child]->fops->fstat, - fd); + afr_read_txn (frame, this, fd->inode, afr_fstat_wind, + AFR_DATA_TRANSACTION); - op_ret = 0; + return 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (fstat, frame, op_ret, op_errno, NULL); - } + AFR_STACK_UNWIND (fstat, frame, -1, op_errno, NULL, NULL); return 0; } @@ -417,118 +269,77 @@ out: /* {{{ readlink */ -int32_t +int afr_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - const char *buf, struct iatt *sbuf) + const char *buf, struct iatt *sbuf, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - xlator_t ** children = NULL; - int unwind = 1; - int32_t *last_index = NULL; - int32_t next_call_child = -1; - int32_t read_child = -1; - int32_t *fresh_children = NULL; + afr_local_t *local = NULL; - priv = this->private; - children = priv->children; + local = frame->local; - local = frame->local; + if (op_ret < 0) { + local->op_ret = -1; + local->op_errno = op_errno; - read_child = (long) cookie; - - if (op_ret == -1) { - last_index = &local->cont.readlink.last_index; - fresh_children = local->fresh_children; - next_call_child = afr_next_call_child (fresh_children, - local->child_up, - priv->child_count, - last_index, read_child); - if (next_call_child < 0) - goto out; - - unwind = 0; - STACK_WIND_COOKIE (frame, afr_readlink_cbk, - (void *) (long) read_child, - children[next_call_child], - children[next_call_child]->fops->readlink, - &local->loc, - local->cont.readlink.size); - } + afr_read_txn_continue (frame, this, (long) cookie); + return 0; + } -out: - if (unwind) { - AFR_STACK_UNWIND (readlink, frame, op_ret, op_errno, buf, sbuf); - } + AFR_STACK_UNWIND (readlink, frame, op_ret, op_errno, + buf, sbuf, xdata); + return 0; +} - return 0; +int +afr_readlink_wind (call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; + + if (subvol == -1) { + AFR_STACK_UNWIND (readlink, frame, local->op_ret, + local->op_errno, 0, 0, 0); + return 0; + } + + STACK_WIND_COOKIE (frame, afr_readlink_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->readlink, + &local->loc, local->cont.readlink.size, + local->xdata_req); + return 0; } -int32_t +int afr_readlink (call_frame_t *frame, xlator_t *this, - loc_t *loc, size_t size) + loc_t *loc, size_t size, dict_t *xdata) { - afr_private_t *priv = NULL; - xlator_t **children = NULL; - int call_child = 0; - afr_local_t *local = NULL; - int32_t op_ret = -1; + afr_local_t * local = NULL; int32_t op_errno = 0; - int32_t read_child = -1; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv->children, out); - - children = priv->children; - - ALLOC_OR_GOTO (local, afr_local_t, out); - frame->local = local; - op_ret = AFR_LOCAL_INIT (local, priv); - if (op_ret < 0) { - op_errno = -op_ret; - goto out; - } - - local->fresh_children = afr_fresh_children_create (priv->child_count); - if (!local->fresh_children) { - op_errno = ENOMEM; - goto out; - } - read_child = afr_inode_get_read_ctx (this, loc->inode, - local->fresh_children); - op_ret = afr_get_call_child (this, local->child_up, read_child, - local->fresh_children, - &call_child, - &local->cont.readlink.last_index); - if (op_ret < 0) { - op_errno = -op_ret; - op_ret = -1; - goto out; - } + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; + local->op = GF_FOP_READLINK; loc_copy (&local->loc, loc); + local->cont.readlink.size = size; + if (xdata) + local->xdata_req = dict_ref (xdata); - local->cont.readlink.size = size; - local->cont.readlink.ino = loc->inode->ino; - - STACK_WIND_COOKIE (frame, afr_readlink_cbk, - (void *) (long) call_child, - children[call_child], - children[call_child]->fops->readlink, - loc, size); + afr_read_txn (frame, this, loc->inode, afr_readlink_wind, + AFR_DATA_TRANSACTION); - op_ret = 0; -out: - if (op_ret == -1) { - AFR_STACK_UNWIND (readlink, frame, op_ret, op_errno, NULL, NULL); - } return 0; +out: + AFR_STACK_UNWIND(readlink, frame, -1, op_errno, 0, 0, 0); + + return 0; } @@ -542,7 +353,7 @@ struct _xattr_key { }; -void +int __gather_xattr_keys (dict_t *dict, char *key, data_t *value, void *data) { @@ -554,18 +365,19 @@ __gather_xattr_keys (dict_t *dict, char *key, data_t *value, xkey = GF_CALLOC (1, sizeof (*xkey), gf_afr_mt_xattr_key); if (!xkey) - return; + return -1; xkey->key = key; INIT_LIST_HEAD (&xkey->list); list_add_tail (&xkey->list, list); } + return 0; } void -__filter_xattrs (dict_t *dict) +afr_filter_xattrs (dict_t *dict) { struct list_head keys = {0,}; struct _xattr_key *key = NULL; @@ -586,85 +398,669 @@ __filter_xattrs (dict_t *dict) } - -int32_t +int afr_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *dict) + dict_t *dict, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - xlator_t ** children = NULL; - int unwind = 1; - int32_t *last_index = NULL; - int32_t next_call_child = -1; - int32_t read_child = -1; - int32_t *fresh_children = NULL; + afr_local_t *local = NULL; + + local = frame->local; + + if (op_ret < 0) { + local->op_ret = op_ret; + local->op_errno = op_errno; + + afr_read_txn_continue (frame, this, (long) cookie); + return 0; + } + + if (dict) + afr_filter_xattrs (dict); + + AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict, xdata); + + return 0; +} + + +int +afr_getxattr_wind (call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; + + if (subvol == -1) { + AFR_STACK_UNWIND (getxattr, frame, local->op_ret, + local->op_errno, NULL, NULL); + return 0; + } + + STACK_WIND_COOKIE (frame, afr_getxattr_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->getxattr, + &local->loc, local->cont.getxattr.name, + local->xdata_req); + return 0; +} + + +int32_t +afr_getxattr_unwind (call_frame_t *frame, int op_ret, int op_errno, + dict_t *dict, dict_t *xdata) + +{ + AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict, xdata); + return 0; +} + +int32_t +afr_fgetxattr_clrlk_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *dict, dict_t *xdata) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + xlator_t **children = NULL; + dict_t *xattr = NULL; + char *tmp_report = NULL; + char lk_summary[1024] = {0,}; + int serz_len = 0; + int32_t callcnt = 0; + long int cky = 0; + int ret = 0; priv = this->private; children = priv->children; local = frame->local; + cky = (long) cookie; - read_child = (long) cookie; + LOCK (&frame->lock); + { + callcnt = --local->call_count; + if (op_ret == -1) + local->replies[cky].op_errno = op_errno; + + if (!local->dict) + local->dict = dict_new (); + if (local->dict) { + ret = dict_get_str (dict, local->cont.getxattr.name, + &tmp_report); + if (ret) + goto unlock; + ret = dict_set_dynstr (local->dict, + children[cky]->name, + gf_strdup (tmp_report)); + if (ret) + goto unlock; + } + } +unlock: + UNLOCK (&frame->lock); - if (op_ret == -1) { - last_index = &local->cont.getxattr.last_index; - fresh_children = local->fresh_children; - next_call_child = afr_next_call_child (fresh_children, - local->child_up, - priv->child_count, - last_index, read_child); - if (next_call_child < 0) - goto out; + if (!callcnt) { + xattr = dict_new (); + if (!xattr) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + ret = dict_serialize_value_with_delim (local->dict, + lk_summary, + &serz_len, '\n'); + if (ret) { + op_ret = -1; + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "Error serializing dictionary"); + goto unwind; + } + if (serz_len == -1) + snprintf (lk_summary, sizeof (lk_summary), + "No locks cleared."); + ret = dict_set_dynstr (xattr, local->cont.getxattr.name, + gf_strdup (lk_summary)); + if (ret) { + op_ret = -1; + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "Error setting dictionary"); + goto unwind; + } + + unwind: + // Updating child_errno with more recent 'events' + op_errno = afr_final_errno (local, priv); + + AFR_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, xattr, + xdata); + if (xattr) + dict_unref (xattr); + } + + return ret; +} + +int32_t +afr_getxattr_clrlk_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *dict, dict_t *xdata) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + xlator_t **children = NULL; + dict_t *xattr = NULL; + char *tmp_report = NULL; + char lk_summary[1024] = {0,}; + int serz_len = 0; + int32_t callcnt = 0; + long int cky = 0; + int ret = 0; + + priv = this->private; + children = priv->children; + + local = frame->local; + cky = (long) cookie; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + if (op_ret == -1) + local->replies[cky].op_errno = op_errno; + + if (!local->dict) + local->dict = dict_new (); + if (local->dict) { + ret = dict_get_str (dict, local->cont.getxattr.name, + &tmp_report); + if (ret) + goto unlock; + ret = dict_set_dynstr (local->dict, + children[cky]->name, + gf_strdup (tmp_report)); + if (ret) + goto unlock; + } + } +unlock: + UNLOCK (&frame->lock); + + if (!callcnt) { + xattr = dict_new (); + if (!xattr) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + ret = dict_serialize_value_with_delim (local->dict, + lk_summary, + &serz_len, '\n'); + if (ret) { + op_ret = -1; + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "Error serializing dictionary"); + goto unwind; + } + if (serz_len == -1) + snprintf (lk_summary, sizeof (lk_summary), + "No locks cleared."); + ret = dict_set_dynstr (xattr, local->cont.getxattr.name, + gf_strdup (lk_summary)); + if (ret) { + op_ret = -1; + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "Error setting dictionary"); + goto unwind; + } + + unwind: + // Updating child_errno with more recent 'events' + op_errno = afr_final_errno (local, priv); + + AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, xattr, xdata); + + if (xattr) + dict_unref (xattr); + } + + return ret; +} + +/** + * node-uuid cbk uses next child querying mechanism + */ +int32_t +afr_getxattr_node_uuid_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *dict, dict_t *xdata) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + xlator_t **children = NULL; + int unwind = 1; + int curr_call_child = 0; + + priv = this->private; + children = priv->children; + + local = frame->local; + + if (op_ret == -1) { /** query the _next_ child */ + + /** + * _current_ becomes _next_ + * If done with all childs and yet no success; give up ! + */ + curr_call_child = (int) ((long)cookie); + if (++curr_call_child == priv->child_count) + goto unwind; + + gf_log (this->name, GF_LOG_WARNING, + "op_ret (-1): Re-querying afr-child (%d/%d)", + curr_call_child, priv->child_count); unwind = 0; - STACK_WIND_COOKIE (frame, afr_getxattr_cbk, - (void *) (long) read_child, - children[next_call_child], - children[next_call_child]->fops->getxattr, + STACK_WIND_COOKIE (frame, afr_getxattr_node_uuid_cbk, + (void *) (long) curr_call_child, + children[curr_call_child], + children[curr_call_child]->fops->getxattr, &local->loc, - local->cont.getxattr.name); + local->cont.getxattr.name, + NULL); } -out: - if (unwind) { - if (op_ret >= 0 && dict) - __filter_xattrs (dict); + unwind: + if (unwind) + AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict, + NULL); - AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict); + return 0; +} + +int32_t +afr_getxattr_lockinfo_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *dict, dict_t *xdata) +{ + int call_cnt = 0, len = 0; + char *lockinfo_buf = NULL; + dict_t *lockinfo = NULL, *newdict = NULL; + afr_local_t *local = NULL; + + LOCK (&frame->lock); + { + local = frame->local; + + call_cnt = --local->call_count; + + if ((op_ret < 0) || (!dict && !xdata)) { + goto unlock; + } + + if (xdata) { + if (!local->xdata_rsp) { + local->xdata_rsp = dict_new (); + if (!local->xdata_rsp) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto unlock; + } + } + } + + if (!dict) { + goto unlock; + } + + op_ret = dict_get_ptr_and_len (dict, GF_XATTR_LOCKINFO_KEY, + (void **)&lockinfo_buf, &len); + + if (!lockinfo_buf) { + goto unlock; + } + + if (!local->dict) { + local->dict = dict_new (); + if (!local->dict) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto unlock; + } + } } +unlock: + UNLOCK (&frame->lock); + + if (lockinfo_buf != NULL) { + lockinfo = dict_new (); + if (lockinfo == NULL) { + local->op_ret = -1; + local->op_errno = ENOMEM; + } else { + op_ret = dict_unserialize (lockinfo_buf, len, + &lockinfo); + + if (lockinfo && local->dict) { + dict_copy (lockinfo, local->dict); + } + } + } + + if (xdata && local->xdata_rsp) { + dict_copy (xdata, local->xdata_rsp); + } + + if (!call_cnt) { + newdict = dict_new (); + if (!newdict) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto unwind; + } + + len = dict_serialized_length (local->dict); + if (len == 0) { + goto unwind; + } + + lockinfo_buf = GF_CALLOC (1, len, gf_common_mt_char); + if (!lockinfo_buf) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto unwind; + } + + op_ret = dict_serialize (local->dict, lockinfo_buf); + if (op_ret < 0) { + local->op_ret = -1; + local->op_errno = -op_ret; + } + + op_ret = dict_set_dynptr (newdict, GF_XATTR_LOCKINFO_KEY, + (void *)lockinfo_buf, len); + if (op_ret < 0) { + local->op_ret = -1; + local->op_errno = -op_ret; + goto unwind; + } + + unwind: + AFR_STACK_UNWIND (getxattr, frame, op_ret, + op_errno, newdict, + local->xdata_rsp); + } + + dict_unref (lockinfo); return 0; } int32_t -afr_getxattr_unwind (call_frame_t *frame, - int op_ret, int op_errno, dict_t *dict) - +afr_fgetxattr_lockinfo_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *dict, dict_t *xdata) { - AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict); + int call_cnt = 0, len = 0; + char *lockinfo_buf = NULL; + dict_t *lockinfo = NULL, *newdict = NULL; + afr_local_t *local = NULL; + + LOCK (&frame->lock); + { + local = frame->local; + + call_cnt = --local->call_count; + + if ((op_ret < 0) || (!dict && !xdata)) { + goto unlock; + } + + if (xdata) { + if (!local->xdata_rsp) { + local->xdata_rsp = dict_new (); + if (!local->xdata_rsp) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto unlock; + } + } + } + + if (!dict) { + goto unlock; + } + + op_ret = dict_get_ptr_and_len (dict, GF_XATTR_LOCKINFO_KEY, + (void **)&lockinfo_buf, &len); + + if (!lockinfo_buf) { + goto unlock; + } + + if (!local->dict) { + local->dict = dict_new (); + if (!local->dict) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto unlock; + } + } + } +unlock: + UNLOCK (&frame->lock); + + if (lockinfo_buf != NULL) { + lockinfo = dict_new (); + if (lockinfo == NULL) { + local->op_ret = -1; + local->op_errno = ENOMEM; + } else { + op_ret = dict_unserialize (lockinfo_buf, len, + &lockinfo); + + if (lockinfo && local->dict) { + dict_copy (lockinfo, local->dict); + } + } + } + + if (xdata && local->xdata_rsp) { + dict_copy (xdata, local->xdata_rsp); + } + + if (!call_cnt) { + newdict = dict_new (); + if (!newdict) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto unwind; + } + + len = dict_serialized_length (local->dict); + if (len <= 0) { + goto unwind; + } + + lockinfo_buf = GF_CALLOC (1, len, gf_common_mt_char); + if (!lockinfo_buf) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto unwind; + } + + op_ret = dict_serialize (local->dict, lockinfo_buf); + if (op_ret < 0) { + local->op_ret = -1; + local->op_errno = -op_ret; + } + + op_ret = dict_set_dynptr (newdict, GF_XATTR_LOCKINFO_KEY, + (void *)lockinfo_buf, len); + if (op_ret < 0) { + local->op_ret = -1; + local->op_errno = -op_ret; + goto unwind; + } + + unwind: + AFR_STACK_UNWIND (fgetxattr, frame, op_ret, + op_errno, newdict, + local->xdata_rsp); + } + + dict_unref (lockinfo); + return 0; } int32_t +afr_fgetxattr_pathinfo_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *dict, dict_t *xdata) +{ + afr_local_t *local = NULL; + int32_t callcnt = 0; + int ret = 0; + char *xattr = NULL; + char *xattr_serz = NULL; + char xattr_cky[1024] = {0,}; + dict_t *nxattr = NULL; + long cky = 0; + int32_t padding = 0; + int32_t tlen = 0; + + if (!frame || !frame->local || !this) { + gf_log ("", GF_LOG_ERROR, "possible NULL deref"); + goto out; + } + + local = frame->local; + cky = (long) cookie; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + + if (op_ret < 0) { + local->op_errno = op_errno; + } else { + local->op_ret = op_ret; + if (!local->xdata_rsp && xdata) + local->xdata_rsp = dict_ref (xdata); + } + + if (!dict || (op_ret < 0)) + goto out; + + if (!local->dict) + local->dict = dict_new (); + + if (local->dict) { + ret = dict_get_str (dict, + local->cont.getxattr.name, + &xattr); + if (ret) + goto out; + + xattr = gf_strdup (xattr); + + (void)snprintf (xattr_cky, 1024, "%s-%ld", + local->cont.getxattr.name, cky); + ret = dict_set_dynstr (local->dict, + xattr_cky, xattr); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Cannot set xattr cookie key"); + goto out; + } + + local->cont.getxattr.xattr_len + += strlen (xattr) + 1; + } + } +out: + UNLOCK (&frame->lock); + + if (!callcnt) { + if (!local->cont.getxattr.xattr_len) + goto unwind; + + nxattr = dict_new (); + if (!nxattr) + goto unwind; + + /* extra bytes for decorations (brackets and <>'s) */ + padding += strlen (this->name) + + strlen (AFR_PATHINFO_HEADER) + 4; + local->cont.getxattr.xattr_len += (padding + 2); + + xattr_serz = GF_CALLOC (local->cont.getxattr.xattr_len, + sizeof (char), gf_common_mt_char); + + if (!xattr_serz) + goto unwind; + + /* the xlator info */ + (void) sprintf (xattr_serz, "(<"AFR_PATHINFO_HEADER"%s> ", + this->name); + + /* actual series of pathinfo */ + ret = dict_serialize_value_with_delim (local->dict, + xattr_serz + + strlen (xattr_serz), + &tlen, ' '); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Error serializing" + " dictionary"); + goto unwind; + } + + /* closing part */ + *(xattr_serz + padding + tlen) = ')'; + *(xattr_serz + padding + tlen + 1) = '\0'; + + ret = dict_set_dynstr (nxattr, local->cont.getxattr.name, + xattr_serz); + if (ret) + gf_log (this->name, GF_LOG_ERROR, "Cannot set pathinfo" + " key in dict"); + + unwind: + AFR_STACK_UNWIND (fgetxattr, frame, local->op_ret, + local->op_errno, nxattr, local->xdata_rsp); + + if (nxattr) + dict_unref (nxattr); + } + + return ret; +} + +int32_t afr_getxattr_pathinfo_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *dict) + dict_t *dict, dict_t *xdata) { - afr_local_t *local = NULL; - int32_t callcnt = 0; - int ret = 0; - char *pathinfo = NULL; - char *pathinfo_serz = NULL; - char pathinfo_cky[1024] = {0,}; - dict_t *xattr = NULL; - long cky = 0; - int32_t padding = 0; - int32_t tlen = 0; + afr_local_t *local = NULL; + int32_t callcnt = 0; + int ret = 0; + char *xattr = NULL; + char *xattr_serz = NULL; + char xattr_cky[1024] = {0,}; + dict_t *nxattr = NULL; + long cky = 0; + int32_t padding = 0; + int32_t tlen = 0; if (!frame || !frame->local || !this) { - gf_log (this->name, GF_LOG_ERROR, "possible NULL deref"); + gf_log ("", GF_LOG_ERROR, "possible NULL deref"); goto out; } @@ -675,6 +1071,14 @@ afr_getxattr_pathinfo_cbk (call_frame_t *frame, void *cookie, { callcnt = --local->call_count; + if (op_ret < 0) { + local->op_errno = op_errno; + } else { + local->op_ret = op_ret; + if (!local->xdata_rsp && xdata) + local->xdata_rsp = dict_ref (xdata); + } + if (!dict || (op_ret < 0)) goto out; @@ -682,142 +1086,341 @@ afr_getxattr_pathinfo_cbk (call_frame_t *frame, void *cookie, local->dict = dict_new (); if (local->dict) { - ret = dict_get_str (dict, GF_XATTR_PATHINFO_KEY, &pathinfo); + ret = dict_get_str (dict, + local->cont.getxattr.name, + &xattr); if (ret) goto out; - pathinfo = gf_strdup (pathinfo); + xattr = gf_strdup (xattr); - snprintf (pathinfo_cky, 1024, "%s-%ld", GF_XATTR_PATHINFO_KEY, cky); - ret = dict_set_dynstr (local->dict, pathinfo_cky, pathinfo); + (void)snprintf (xattr_cky, 1024, "%s-%ld", + local->cont.getxattr.name, cky); + ret = dict_set_dynstr (local->dict, + xattr_cky, xattr); if (ret) { - gf_log (this->name, GF_LOG_ERROR, "Cannot set pathinfo cookie key"); + gf_log (this->name, GF_LOG_ERROR, + "Cannot set xattr cookie key"); goto out; } - local->cont.getxattr.pathinfo_len += strlen (pathinfo) + 1; + local->cont.getxattr.xattr_len += strlen (xattr) + 1; } } out: UNLOCK (&frame->lock); if (!callcnt) { - if (!local->cont.getxattr.pathinfo_len) + if (!local->cont.getxattr.xattr_len) goto unwind; - xattr = dict_new (); - if (!xattr) + nxattr = dict_new (); + if (!nxattr) goto unwind; /* extra bytes for decorations (brackets and <>'s) */ - padding = strlen (this->name) + strlen (AFR_PATHINFO_HEADER) + 4; - local->cont.getxattr.pathinfo_len += (padding + 2); + padding += strlen (this->name) + strlen (AFR_PATHINFO_HEADER) + 4; + local->cont.getxattr.xattr_len += (padding + 2); - pathinfo_serz = GF_CALLOC (local->cont.getxattr.pathinfo_len, sizeof (char), - gf_common_mt_char); + xattr_serz = GF_CALLOC (local->cont.getxattr.xattr_len, + sizeof (char), gf_common_mt_char); - if (!pathinfo_serz) + if (!xattr_serz) goto unwind; /* the xlator info */ - sprintf (pathinfo_serz, "(<"AFR_PATHINFO_HEADER"%s> ", this->name); + (void) sprintf (xattr_serz, "(<"AFR_PATHINFO_HEADER"%s> ", + this->name); /* actual series of pathinfo */ - ret = dict_serialize_value_with_delim (local->dict, pathinfo_serz + strlen (pathinfo_serz), + ret = dict_serialize_value_with_delim (local->dict, + xattr_serz + strlen (xattr_serz), &tlen, ' '); if (ret) { - gf_log (this->name, GF_LOG_ERROR, "Error serializing dictionary"); + gf_log (this->name, GF_LOG_ERROR, "Error serializing" + " dictionary"); goto unwind; } /* closing part */ - *(pathinfo_serz + padding + tlen) = ')'; - *(pathinfo_serz + padding + tlen + 1) = '\0'; + *(xattr_serz + padding + tlen) = ')'; + *(xattr_serz + padding + tlen + 1) = '\0'; - ret = dict_set_dynstr (xattr, GF_XATTR_PATHINFO_KEY, pathinfo_serz); + ret = dict_set_dynstr (nxattr, local->cont.getxattr.name, + xattr_serz); if (ret) - gf_log (this->name, GF_LOG_ERROR, "Cannot set pathinfo key in dict"); + gf_log (this->name, GF_LOG_ERROR, "Cannot set pathinfo" + " key in dict"); unwind: - AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, xattr); + AFR_STACK_UNWIND (getxattr, frame, local->op_ret, + local->op_errno, nxattr, local->xdata_rsp); - if (local->dict) - dict_unref (local->dict); - - if (xattr) - dict_unref (xattr); + if (nxattr) + dict_unref (nxattr); } return ret; } +static int +afr_aggregate_stime_xattr (dict_t *this, char *key, data_t *value, void *data) +{ + int ret = 0; + + if (fnmatch (GF_XATTR_STIME_PATTERN, key, FNM_NOESCAPE) == 0) + ret = gf_get_max_stime (THIS, data, key, value); + + return ret; +} + +int32_t +afr_common_getxattr_stime_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *dict, dict_t *xdata) +{ + afr_local_t *local = NULL; + int32_t callcnt = 0; + + if (!frame || !frame->local || !this) { + gf_log ("", GF_LOG_ERROR, "possible NULL deref"); + goto out; + } + + local = frame->local; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + + if (!dict || (op_ret < 0)) { + local->op_errno = op_errno; + goto cleanup; + } + + if (!local->dict) + local->dict = dict_copy_with_ref (dict, NULL); + else + dict_foreach (dict, afr_aggregate_stime_xattr, + local->dict); + local->op_ret = 0; + } + +cleanup: + UNLOCK (&frame->lock); + + if (!callcnt) { + AFR_STACK_UNWIND (getxattr, frame, local->op_ret, + local->op_errno, local->dict, xdata); + } + +out: + return 0; +} + + +static gf_boolean_t +afr_is_special_xattr (const char *name, fop_getxattr_cbk_t *cbk, + gf_boolean_t is_fgetxattr) +{ + gf_boolean_t is_spl = _gf_true; + + GF_ASSERT (cbk); + if (!cbk || !name) { + is_spl = _gf_false; + goto out; + } + + if (!strcmp (name, GF_XATTR_PATHINFO_KEY) || + !strcmp (name, GF_XATTR_USER_PATHINFO_KEY)) { + if (is_fgetxattr) { + *cbk = afr_fgetxattr_pathinfo_cbk; + } else { + *cbk = afr_getxattr_pathinfo_cbk; + } + } else if (!strncmp (name, GF_XATTR_CLRLK_CMD, + strlen (GF_XATTR_CLRLK_CMD))) { + if (is_fgetxattr) { + *cbk = afr_fgetxattr_clrlk_cbk; + } else { + *cbk = afr_getxattr_clrlk_cbk; + } + } else if (!strncmp (name, GF_XATTR_LOCKINFO_KEY, + strlen (GF_XATTR_LOCKINFO_KEY))) { + if (is_fgetxattr) { + *cbk = afr_fgetxattr_lockinfo_cbk; + } else { + *cbk = afr_getxattr_lockinfo_cbk; + } + } else if (fnmatch (GF_XATTR_STIME_PATTERN, name, FNM_NOESCAPE) == 0) { + *cbk = afr_common_getxattr_stime_cbk; + } else { + is_spl = _gf_false; + } + +out: + return is_spl; +} + +static void +afr_getxattr_all_subvols (xlator_t *this, call_frame_t *frame, + const char *name, loc_t *loc, + fop_getxattr_cbk_t cbk) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int i = 0; + int call_count = 0; + + priv = this->private; + + local = frame->local; + //local->call_count set in afr_local_init + call_count = local->call_count; + + //If up-children count is 0, afr_local_init would have failed already + //and the call would have unwound so not handling it here. + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, cbk, + (void *) (long) i, priv->children[i], + priv->children[i]->fops->getxattr, + loc, name, NULL); + if (!--call_count) + break; + } + } + return; +} + int32_t afr_getxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, const char *name) + loc_t *loc, const char *name, dict_t *xdata) { - afr_private_t *priv = NULL; - xlator_t **children = NULL; - int call_child = 0; - afr_local_t *local = NULL; - xlator_list_t *trav = NULL; - xlator_t **sub_volumes = NULL; - int i = 0; - int32_t op_ret = -1; - int32_t op_errno = 0; - int32_t read_child = -1; - - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + afr_private_t *priv = NULL; + xlator_t **children = NULL; + afr_local_t *local = NULL; + xlator_list_t *trav = NULL; + xlator_t **sub_volumes = NULL; + int i = 0; + int32_t op_errno = 0; + int ret = -1; + fop_getxattr_cbk_t cbk = NULL; + int afr_xtime_gauge[MCNT_MAX] = {0,}; + + + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; priv = this->private; - VALIDATE_OR_GOTO (priv->children, out); children = priv->children; - ALLOC_OR_GOTO (local, afr_local_t, out); - frame->local = local; + loc_copy (&local->loc, loc); + + local->op = GF_FOP_GETXATTR; + + if (xdata) + local->xdata_req = dict_ref (xdata); + + if (!name) + goto no_name; + + local->cont.getxattr.name = gf_strdup (name); - op_ret = AFR_LOCAL_INIT (local, priv); - if (op_ret < 0) { - op_errno = -op_ret; + if (!local->cont.getxattr.name) { + op_errno = ENOMEM; + goto out; + } + + if (!strncmp (name, AFR_XATTR_PREFIX, + strlen (AFR_XATTR_PREFIX))) { + gf_log (this->name, GF_LOG_INFO, + "%s: no data present for key %s", + loc->path, name); + op_errno = ENODATA; goto out; } + if ((strcmp (GF_XATTR_MARKER_KEY, name) == 0) + && (GF_CLIENT_PID_GSYNCD == frame->root->pid)) { - loc_copy (&local->loc, loc); - if (name) - local->cont.getxattr.name = gf_strdup (name); + local->marker.call_count = priv->child_count; + sub_volumes = alloca ( priv->child_count * sizeof (xlator_t *)); + for (i = 0, trav = this->children; trav ; + trav = trav->next, i++) { + + *(sub_volumes + i) = trav->xlator; + } + + if (cluster_getmarkerattr (frame, this, loc, name, + local, afr_getxattr_unwind, + sub_volumes, + priv->child_count, + MARKER_UUID_TYPE, + marker_uuid_default_gauge, + priv->vol_uuid)) { - if (name) { - if (!strncmp (name, AFR_XATTR_PREFIX, - strlen (AFR_XATTR_PREFIX))) { gf_log (this->name, GF_LOG_INFO, - "%s: no data present for key %s", + "%s: failed to get marker attr (%s)", loc->path, name); - op_errno = ENODATA; + op_errno = EINVAL; goto out; } - if ((strcmp (GF_XATTR_MARKER_KEY, name) == 0) - && (-1 == frame->root->pid)) { + return 0; + } + + /* + * if we are doing getxattr with pathinfo as the key then we + * collect information from all childs + */ + if (afr_is_special_xattr (name, &cbk, 0)) { + afr_getxattr_all_subvols (this, frame, name, loc, cbk); + return 0; + } + + if (XATTR_IS_NODE_UUID (name)) { + i = 0; + STACK_WIND_COOKIE (frame, afr_getxattr_node_uuid_cbk, + (void *) (long) i, + children[i], + children[i]->fops->getxattr, + loc, name, xdata); + return 0; + } + + if (*priv->vol_uuid) { + if ((match_uuid_local (name, priv->vol_uuid) == 0) + && (GF_CLIENT_PID_GSYNCD == frame->root->pid)) { local->marker.call_count = priv->child_count; - sub_volumes = alloca ( priv->child_count * sizeof (xlator_t *)); + sub_volumes = alloca ( priv->child_count + * sizeof (xlator_t *)); for (i = 0, trav = this->children; trav ; trav = trav->next, i++) { *(sub_volumes + i) = trav->xlator; + } - if (cluster_getmarkerattr (frame, this, loc, name, - local, afr_getxattr_unwind, + /* don't err out on getting ENOTCONN (brick down) + * from a subset of the bricks + */ + memcpy (afr_xtime_gauge, marker_xtime_default_gauge, + sizeof (afr_xtime_gauge)); + afr_xtime_gauge[MCNT_NOTFOUND] = 0; + afr_xtime_gauge[MCNT_ENOTCONN] = 0; + if (cluster_getmarkerattr (frame, this, loc, + name, local, + afr_getxattr_unwind, sub_volumes, priv->child_count, - MARKER_UUID_TYPE, + MARKER_XTIME_TYPE, + afr_xtime_gauge, priv->vol_uuid)) { - gf_log (this->name, GF_LOG_INFO, "%s: failed to get marker attr (%s)", loc->path, name); @@ -827,86 +1430,150 @@ afr_getxattr (call_frame_t *frame, xlator_t *this, return 0; } + } - /* - * if we are doing getxattr with pathinfo as the key then we - * collect information from all childs - */ - if (strncmp (name, GF_XATTR_PATHINFO_KEY, - strlen (GF_XATTR_PATHINFO_KEY)) == 0) { - - local->call_count = priv->child_count; - for (i = 0; i < priv->child_count; i++) { - STACK_WIND_COOKIE (frame, afr_getxattr_pathinfo_cbk, - (void *) (long) i, - children[i], children[i]->fops->getxattr, - loc, name); - } +no_name: - return 0; - } + afr_read_txn (frame, this, local->loc.inode, afr_getxattr_wind, + AFR_METADATA_TRANSACTION); - if (*priv->vol_uuid) { - if ((match_uuid_local (name, priv->vol_uuid) == 0) - && (-1 == frame->root->pid)) { + ret = 0; +out: + if (ret < 0) + AFR_STACK_UNWIND (getxattr, frame, -1, op_errno, NULL, NULL); + return 0; +} - local->marker.call_count = priv->child_count; +/* {{{ fgetxattr */ - sub_volumes = alloca ( priv->child_count * sizeof (xlator_t *)); - for (i = 0, trav = this->children; trav ; - trav = trav->next, i++) { - *(sub_volumes + i) = trav->xlator; +int32_t +afr_fgetxattr_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *dict, dict_t *xdata) +{ + afr_local_t *local = NULL; - } + local = frame->local; - if (cluster_getmarkerattr (frame, this, loc, - name, local, - afr_getxattr_unwind, - sub_volumes, - priv->child_count, - MARKER_XTIME_TYPE, - priv->vol_uuid)) { - gf_log (this->name, GF_LOG_INFO, - "%s: failed to get marker attr (%s)", - loc->path, name); - op_errno = EINVAL; - goto out; - } + if (op_ret < 0) { + local->op_ret = -1; + local->op_errno = op_errno; - return 0; - } + afr_read_txn_continue (frame, this, (long) cookie); + return 0; + } + + if (dict) + afr_filter_xattrs (dict); + + AFR_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, dict, xdata); + + return 0; +} + +int +afr_fgetxattr_wind (call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; + + if (subvol == -1) { + AFR_STACK_UNWIND (fgetxattr, frame, local->op_ret, + local->op_errno, NULL, NULL); + return 0; + } + + STACK_WIND_COOKIE (frame, (void *) (long) subvol, afr_fgetxattr_cbk, + priv->children[subvol], + priv->children[subvol]->fops->fgetxattr, + local->fd, local->cont.getxattr.name, + local->xdata_req); + return 0; +} + + +static void +afr_fgetxattr_all_subvols (xlator_t *this, call_frame_t *frame, + fop_fgetxattr_cbk_t cbk) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int i = 0; + int call_count = 0; + + priv = this->private; + + local = frame->local; + //local->call_count set in afr_local_init + call_count = local->call_count; + + //If up-children count is 0, afr_local_init would have failed already + //and the call would have unwound so not handling it here. + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->fgetxattr, + local->fd, local->cont.getxattr.name, + NULL); + if (!--call_count) + break; } } - local->fresh_children = afr_fresh_children_create (priv->child_count); - if (!local->fresh_children) { - op_errno = ENOMEM; - goto out; - } + return; +} - read_child = afr_inode_get_read_ctx (this, loc->inode, local->fresh_children); - op_ret = afr_get_call_child (this, local->child_up, read_child, - local->fresh_children, - &call_child, - &local->cont.getxattr.last_index); - if (op_ret < 0) { - op_errno = -op_ret; - op_ret = -1; - goto out; + +int +afr_fgetxattr (call_frame_t *frame, xlator_t *this, + fd_t *fd, const char *name, dict_t *xdata) +{ + afr_local_t *local = NULL; + int32_t op_errno = 0; + fop_fgetxattr_cbk_t cbk = NULL; + + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; + + local->op = GF_FOP_FGETXATTR; + local->fd = fd_ref (fd); + if (name) { + local->cont.getxattr.name = gf_strdup (name); + if (!local->cont.getxattr.name) { + op_errno = ENOMEM; + goto out; + } + } + if (xdata) + local->xdata_req = dict_ref (xdata); + + /* pathinfo gets handled only in getxattr(), but we need to handle + * lockinfo. + * If we are doing fgetxattr with lockinfo as the key then we + * collect information from all children. + */ + if (afr_is_special_xattr (name, &cbk, 1)) { + afr_fgetxattr_all_subvols (this, frame, cbk); + return 0; } - STACK_WIND_COOKIE (frame, afr_getxattr_cbk, - (void *) (long) call_child, - children[call_child], - children[call_child]->fops->getxattr, - loc, name); + afr_fix_open (fd, this); - op_ret = 0; + afr_read_txn (frame, this, fd->inode, afr_fgetxattr_wind, + AFR_METADATA_TRANSACTION); + + return 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, NULL); - } + AFR_STACK_UNWIND (fgetxattr, frame, -1, op_errno, NULL, NULL); + return 0; } @@ -915,140 +1582,84 @@ out: /* {{{ readv */ -/** - * read algorithm: - * - * if the user has specified a read subvolume, use it - * otherwise - - * use the inode number to hash it to one of the subvolumes, and - * read from there (to balance read load) - * - * if any of the above read's fail, try the children in sequence - * beginning at the beginning - */ - -int32_t +int afr_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iovec *vector, int32_t count, struct iatt *buf, - struct iobref *iobref) + struct iobref *iobref, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - xlator_t ** children = NULL; - int unwind = 1; - int32_t *last_index = NULL; - int32_t next_call_child = -1; - int32_t *fresh_children = NULL; - int32_t read_child = -1; + afr_local_t *local = NULL; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + local = frame->local; - priv = this->private; - VALIDATE_OR_GOTO (priv->children, out); - - children = priv->children; - - local = frame->local; + if (op_ret < 0) { + local->op_ret = -1; + local->op_errno = op_errno; - read_child = (long) cookie; - - if (op_ret == -1) { - last_index = &local->cont.readv.last_index; - fresh_children = local->fresh_children; - next_call_child = afr_next_call_child (fresh_children, - local->child_up, - priv->child_count, - last_index, read_child); - if (next_call_child < 0) - goto out; - - unwind = 0; + afr_read_txn_continue (frame, this, (long) cookie); + return 0; + } - STACK_WIND_COOKIE (frame, afr_readv_cbk, - (void *) (long) read_child, - children[next_call_child], - children[next_call_child]->fops->readv, - local->fd, local->cont.readv.size, - local->cont.readv.offset); - } + AFR_STACK_UNWIND (readv, frame, op_ret, op_errno, + vector, count, buf, iobref, xdata); + return 0; +} -out: - if (unwind) { - AFR_STACK_UNWIND (readv, frame, op_ret, op_errno, - vector, count, buf, iobref); - } - return 0; +int +afr_readv_wind (call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; + + if (subvol == -1) { + AFR_STACK_UNWIND (readv, frame, local->op_ret, local->op_errno, + 0, 0, 0, 0, 0); + return 0; + } + + STACK_WIND_COOKIE (frame, afr_readv_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->readv, + local->fd, local->cont.readv.size, + local->cont.readv.offset, local->cont.readv.flags, + local->xdata_req); + return 0; } -int32_t -afr_readv (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t offset) +int +afr_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata) { - afr_private_t * priv = NULL; afr_local_t * local = NULL; - xlator_t ** children = NULL; - int call_child = 0; - int32_t op_ret = -1; int32_t op_errno = 0; - int32_t read_child = -1; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - VALIDATE_OR_GOTO (fd, out); - priv = this->private; - children = priv->children; - - ALLOC_OR_GOTO (local, afr_local_t, out); - frame->local = local; - op_ret = AFR_LOCAL_INIT (local, priv); - if (op_ret < 0) { - op_errno = -op_ret; - goto out; - } + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; - local->fresh_children = afr_fresh_children_create (priv->child_count); - if (!local->fresh_children) { - op_errno = ENOMEM; - goto out; - } - - read_child = afr_inode_get_read_ctx (this, fd->inode, local->fresh_children); - op_ret = afr_get_call_child (this, local->child_up, read_child, - local->fresh_children, - &call_child, - &local->cont.readv.last_index); - if (op_ret < 0) { - op_errno = -op_ret; - op_ret = -1; - goto out; - } - - local->fd = fd_ref (fd); + local->op = GF_FOP_READ; + local->fd = fd_ref (fd); + local->cont.readv.size = size; + local->cont.readv.offset = offset; + local->cont.readv.flags = flags; + if (xdata) + local->xdata_req = dict_ref (xdata); - local->cont.readv.ino = fd->inode->ino; - local->cont.readv.size = size; - local->cont.readv.offset = offset; + afr_fix_open (fd, this); - STACK_WIND_COOKIE (frame, afr_readv_cbk, - (void *) (long) call_child, - children[call_child], - children[call_child]->fops->readv, - fd, size, offset); + afr_read_txn (frame, this, fd->inode, afr_readv_wind, + AFR_DATA_TRANSACTION); - op_ret = 0; -out: - if (op_ret == -1) { - AFR_STACK_UNWIND (readv, frame, op_ret, op_errno, NULL, 0, NULL, - NULL); - } return 0; +out: + AFR_STACK_UNWIND(readv, frame, -1, op_errno, 0, 0, 0, 0, 0); + + return 0; } /* }}} */ diff --git a/xlators/cluster/afr/src/afr-inode-read.h b/xlators/cluster/afr/src/afr-inode-read.h index 06acaac84..e4091a793 100644 --- a/xlators/cluster/afr/src/afr-inode-read.h +++ b/xlators/cluster/afr/src/afr-inode-read.h @@ -1,20 +1,11 @@ /* - Copyright (c) 2007-2010 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef __INODE_READ_H__ @@ -22,26 +13,30 @@ int32_t afr_access (call_frame_t *frame, xlator_t *this, - loc_t *loc, int32_t mask); + loc_t *loc, int32_t mask, dict_t *xdata); int32_t afr_stat (call_frame_t *frame, xlator_t *this, - loc_t *loc); + loc_t *loc, dict_t *xdata); int32_t afr_fstat (call_frame_t *frame, xlator_t *this, - fd_t *fd); + fd_t *fd, dict_t *xdata); int32_t afr_readlink (call_frame_t *frame, xlator_t *this, - loc_t *loc, size_t size); + loc_t *loc, size_t size, dict_t *xdata); int32_t afr_readv (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t offset); + fd_t *fd, size_t size, off_t offset, uint32_t flags, dict_t *xdata); int32_t afr_getxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, const char *name); + loc_t *loc, const char *name, dict_t *xdata); + +int32_t +afr_fgetxattr (call_frame_t *frame, xlator_t *this, + fd_t *fd, const char *name, dict_t *xdata); #endif /* __INODE_READ_H__ */ diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c index 6d67f4429..3dacfc8dd 100644 --- a/xlators/cluster/afr/src/afr-inode-write.c +++ b/xlators/cluster/afr/src/afr-inode-write.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2007-2010 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ @@ -46,141 +37,303 @@ #include "afr.h" #include "afr-transaction.h" +//#include "afr-self-heal-common.h" -/* {{{ writev */ -int -afr_writev_unwind (call_frame_t *frame, xlator_t *this) +static void +__afr_inode_write_finalize (call_frame_t *frame, xlator_t *this) { - afr_local_t * local = NULL; - call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int read_subvol = 0; + int i = 0; + + local = frame->local; + priv = this->private; + + if (local->inode) { + if (local->transaction.type == AFR_METADATA_TRANSACTION) + read_subvol = afr_metadata_subvol_get (local->inode, this, + NULL, NULL); + else + read_subvol = afr_data_subvol_get (local->inode, this, + NULL, NULL); + } + + local->op_ret = -1; + local->op_errno = afr_final_errno (local, priv); + + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid) + continue; + if (local->replies[i].op_ret < 0) { + afr_inode_read_subvol_reset (local->inode, this); + continue; + } + + /* Order of checks in the compound conditional + below is important. + + - Highest precedence: largest op_ret + - Next precendence: if all op_rets are equal, read subvol + - Least precedence: any succeeded subvol + */ + if ((local->op_ret < local->replies[i].op_ret) || + ((local->op_ret == local->replies[i].op_ret) && + (i == read_subvol))) { + + local->op_ret = local->replies[i].op_ret; + local->op_errno = local->replies[i].op_errno; + + local->cont.inode_wfop.prebuf = + local->replies[i].prestat; + local->cont.inode_wfop.postbuf = + local->replies[i].poststat; + + if (local->replies[i].xdata) { + if (local->xdata_rsp) + dict_unref (local->xdata_rsp); + local->xdata_rsp = + dict_ref (local->replies[i].xdata); + } + } + } +} - local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); +static void +__afr_inode_write_fill (call_frame_t *frame, xlator_t *this, int child_index, + int op_ret, int op_errno, + struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) +{ + afr_local_t *local = NULL; - if (main_frame) { - AFR_STACK_UNWIND (writev, main_frame, - local->op_ret, local->op_errno, - &local->cont.writev.prebuf, - &local->cont.writev.postbuf); - } - return 0; + local = frame->local; + + local->replies[child_index].valid = 1; + local->replies[child_index].op_ret = op_ret; + local->replies[child_index].op_errno = op_errno; + + if (op_ret >= 0) { + if (prebuf) + local->replies[child_index].prestat = *prebuf; + if (postbuf) + local->replies[child_index].poststat = *postbuf; + if (xdata) + local->replies[child_index].xdata = dict_ref (xdata); + } else { + afr_transaction_fop_failed (frame, this, child_index); + } + + return; } -int -afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf) +static int +__afr_inode_write_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - afr_local_t * local = NULL; + afr_local_t *local = NULL; int child_index = (long) cookie; - int call_count = -1; - int read_child = 0; + int call_count = -1; local = frame->local; - read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); - LOCK (&frame->lock); { - if (child_index == read_child) { - local->read_child_returned = _gf_true; - } - - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - local->cont.writev.prebuf = *prebuf; - local->cont.writev.postbuf = *postbuf; - } - - if (child_index == read_child) { - local->cont.writev.prebuf = *prebuf; - local->cont.writev.postbuf = *postbuf; - } - } - - local->op_errno = op_errno; + __afr_inode_write_fill (frame, this, child_index, op_ret, + op_errno, prebuf, postbuf, xdata); } UNLOCK (&frame->lock); call_count = afr_frame_return (frame); if (call_count == 0) { - local->transaction.unwind (frame, this); + __afr_inode_write_finalize (frame, this); + + if (afr_txn_nothing_failed (frame, this)) + local->transaction.unwind (frame, this); local->transaction.resume (frame, this); } + return 0; } +/* {{{ writev */ -int -afr_writev_wind (call_frame_t *frame, xlator_t *this) +void +afr_writev_copy_outvars (call_frame_t *src_frame, call_frame_t *dst_frame) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int i = 0; - int call_count = -1; + afr_local_t *src_local = NULL; + afr_local_t *dst_local = NULL; + + src_local = src_frame->local; + dst_local = dst_frame->local; + + dst_local->op_ret = src_local->op_ret; + dst_local->op_errno = src_local->op_errno; + dst_local->cont.inode_wfop.prebuf = src_local->cont.inode_wfop.prebuf; + dst_local->cont.inode_wfop.postbuf = src_local->cont.inode_wfop.postbuf; + if (src_local->xdata_rsp) + dst_local->xdata_rsp = dict_ref (src_local->xdata_rsp); +} +void +afr_writev_unwind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; local = frame->local; - priv = this->private; - call_count = afr_up_children_count (priv->child_count, local->child_up); + AFR_STACK_UNWIND (writev, frame, + local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, + local->xdata_rsp); +} - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; + +int +afr_transaction_writev_unwind (call_frame_t *frame, xlator_t *this) +{ + call_frame_t *fop_frame = NULL; + + fop_frame = afr_transaction_detach_fop_frame (frame); + + if (fop_frame) { + afr_writev_copy_outvars (frame, fop_frame); + afr_writev_unwind (fop_frame, this); } + return 0; +} - local->call_count = call_count; +static void +afr_writev_handle_short_writes (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; + local = frame->local; + priv = this->private; + /* + * We already have the best case result of the writev calls staged + * as the return value. Any writev that returns some value less + * than the best case is now out of sync, so mark the fop as + * failed. Note that fops that have returned with errors have + * already been marked as failed. + */ for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_writev_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->writev, - local->fd, - local->cont.writev.vector, - local->cont.writev.count, - local->cont.writev.offset, - local->cont.writev.iobref); - - if (!--call_count) - break; - } + if ((!local->replies[i].valid) || + (local->replies[i].op_ret == -1)) + continue; + + if (local->replies[i].op_ret < local->op_ret) + afr_transaction_fop_failed(frame, this, i); } +} + +int +afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + afr_local_t * local = NULL; + call_frame_t *fop_frame = NULL; + int child_index = (long) cookie; + int call_count = -1; + int ret = 0; + uint32_t open_fd_count = 0; + uint32_t write_is_append = 0; + + local = frame->local; + + LOCK (&frame->lock); + { + __afr_inode_write_fill (frame, this, child_index, op_ret, + op_errno, prebuf, postbuf, xdata); + if (op_ret == -1 || !xdata) + goto unlock; + + write_is_append = 0; + ret = dict_get_uint32 (xdata, GLUSTERFS_WRITE_IS_APPEND, + &write_is_append); + if (ret || !write_is_append) + local->append_write = _gf_false; + + ret = dict_get_uint32 (xdata, GLUSTERFS_OPEN_FD_COUNT, + &open_fd_count); + if (ret == -1) + goto unlock; + if ((open_fd_count > local->open_fd_count)) { + local->open_fd_count = open_fd_count; + local->update_open_fd_count = _gf_true; + } + } +unlock: + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + if (call_count == 0) { + if (!local->stable_write && !local->append_write) + /* An appended write removes the necessity to + fsync() the file. This is because self-heal + has the logic to check for larger file when + the xattrs are not reliably pointing at + a stale file. + */ + afr_fd_report_unstable_write (this, local->fd); + + __afr_inode_write_finalize (frame, this); + + afr_writev_handle_short_writes (frame, this); + + if (local->update_open_fd_count) + afr_handle_open_fd_count (frame, this); + + if (!afr_txn_nothing_failed (frame, this)) { + //Don't unwind until post-op is complete + local->transaction.resume (frame, this); + } else { + /* + * Generally inode-write fops do transaction.unwind then + * transaction.resume, but writev needs to make sure that + * delayed post-op frame is placed in fdctx before unwind + * happens. This prevents the race of flush doing the + * changelog wakeup first in fuse thread and then this + * writev placing its delayed post-op frame in fdctx. + * This helps flush make sure all the delayed post-ops are + * completed. + */ + + fop_frame = afr_transaction_detach_fop_frame (frame); + afr_writev_copy_outvars (frame, fop_frame); + local->transaction.resume (frame, this); + afr_writev_unwind (fop_frame, this); + } + } return 0; } int -afr_writev_done (call_frame_t *frame, xlator_t *this) +afr_writev_wind (call_frame_t *frame, xlator_t *this, int subvol) { afr_local_t *local = NULL; + afr_private_t *priv = NULL; local = frame->local; + priv = this->private; - iobref_unref (local->cont.writev.iobref); - local->cont.writev.iobref = NULL; - - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); - + STACK_WIND_COOKIE (frame, afr_writev_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->writev, + local->fd, local->cont.writev.vector, + local->cont.writev.count, local->cont.writev.offset, + local->cont.writev.flags, local->cont.writev.iobref, + local->xdata_req); return 0; } @@ -188,32 +341,39 @@ afr_writev_done (call_frame_t *frame, xlator_t *this) int afr_do_writev (call_frame_t *frame, xlator_t *this) { - call_frame_t * transaction_frame = NULL; - afr_local_t * local = NULL; - int op_ret = -1; - int op_errno = 0; - - local = frame->local; + call_frame_t *transaction_frame = NULL; + afr_local_t *local = NULL; + int ret = -1; + int op_errno = ENOMEM; transaction_frame = copy_frame (frame); - if (!transaction_frame) { - op_errno = ENOMEM; + if (!transaction_frame) goto out; - } + local = frame->local; transaction_frame->local = local; - frame->local = NULL; + frame->local = NULL; - local->op = GF_FOP_WRITE; + if (!AFR_FRAME_INIT (frame, op_errno)) + goto out; - local->success_count = 0; + local->op = GF_FOP_WRITE; - local->transaction.fop = afr_writev_wind; - local->transaction.done = afr_writev_done; - local->transaction.unwind = afr_writev_unwind; + local->transaction.wind = afr_writev_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_transaction_writev_unwind; local->transaction.main_frame = frame; + if (local->fd->flags & O_APPEND) { + /* + * Backend vfs ignores the 'offset' for append mode fd so + * locking just the region provided for the writev does not + * give consistency gurantee. The actual write may happen at a + * completely different range than the one provided by the + * offset, len in the fop. So lock the entire file. + */ local->transaction.start = 0; local->transaction.len = 0; } else { @@ -222,16 +382,18 @@ afr_do_writev (call_frame_t *frame, xlator_t *this) local->cont.writev.count); } - afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - op_ret = 0; + return 0; out: - if (op_ret == -1) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (writev, frame, op_ret, op_errno, NULL, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (writev, frame, -1, op_errno, NULL, NULL, NULL); return 0; } @@ -239,59 +401,67 @@ out: int afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, int32_t count, off_t offset, - struct iobref *iobref) + uint32_t flags, struct iobref *iobref, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - int ret = -1; - int op_ret = -1; - int op_errno = 0; - uint64_t ctx = 0; - afr_fd_ctx_t *fd_ctx = NULL; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int op_errno = ENOMEM; priv = this->private; - ALLOC_OR_GOTO (local, afr_local_t, out); + QUORUM_CHECK(writev,out); - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } - - frame->local = local; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; - local->cont.writev.vector = iov_dup (vector, count); + local->cont.writev.vector = iov_dup (vector, count); + if (!local->cont.writev.vector) + goto out; local->cont.writev.count = count; local->cont.writev.offset = offset; - local->cont.writev.ino = fd->inode->ino; + local->cont.writev.flags = flags; local->cont.writev.iobref = iobref_ref (iobref); - local->fd = fd_ref (fd); + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); - ret = fd_ctx_get (fd, this, &ctx); - if (ret < 0) { - goto out; - } + if (!local->xdata_req) + goto out; - fd_ctx = (afr_fd_ctx_t *)(long) ctx; + local->fd = fd_ref (fd); + local->inode = inode_ref (fd->inode); - if (fd_ctx->up_count < priv->up_count) { - local->openfd_flush_cbk = afr_do_writev; - afr_openfd_flush (frame, this, fd); - } else { - afr_do_writev (frame, this); - } + if (dict_set_uint32 (local->xdata_req, GLUSTERFS_OPEN_FD_COUNT, 4)) { + op_errno = ENOMEM; + goto out; + } + + if (dict_set_uint32 (local->xdata_req, GLUSTERFS_WRITE_IS_APPEND, 4)) { + op_errno = ENOMEM; + goto out; + } + + /* Set append_write to be true speculatively. If on any + server it turns not be true, we unset it in the + callback. + */ + local->append_write = _gf_true; + + /* detect here, but set it in writev_wind_cbk *after* the unstable + write is performed + */ + local->stable_write = !!((fd->flags|flags)&(O_SYNC|O_DSYNC)); - op_ret = 0; + afr_fix_open (fd, this); + + afr_do_writev (frame, this); + + return 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (writev, frame, op_ret, op_errno, NULL, NULL); - } + AFR_STACK_UNWIND (writev, frame, -1, op_errno, NULL, NULL, NULL); return 0; } @@ -309,21 +479,13 @@ afr_truncate_unwind (call_frame_t *frame, xlator_t *this) local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - AFR_STACK_UNWIND (truncate, main_frame, local->op_ret, - local->op_errno, - &local->cont.truncate.prebuf, - &local->cont.truncate.postbuf); - } + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; + AFR_STACK_UNWIND (truncate, main_frame, local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, local->xdata_rsp); return 0; } @@ -331,175 +493,99 @@ afr_truncate_unwind (call_frame_t *frame, xlator_t *this) int afr_truncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf) + struct iatt *postbuf, dict_t *xdata) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - int child_index = (long) cookie; - int read_child = 0; - int call_count = -1; - int need_unwind = 0; - - local = frame->local; - priv = this->private; - - read_child = afr_inode_get_read_ctx (this, local->loc.inode, NULL); - - LOCK (&frame->lock); - { - if (child_index == read_child) { - local->read_child_returned = _gf_true; - } - - if (afr_fop_failed (op_ret, op_errno) && op_errno != EFBIG) - afr_transaction_fop_failed (frame, this, child_index); - - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - local->cont.truncate.prebuf = *prebuf; - local->cont.truncate.postbuf = *postbuf; - } - - if (child_index == read_child) { - local->cont.truncate.prebuf = *prebuf; - local->cont.truncate.postbuf = *postbuf; - } + afr_local_t *local = NULL; - local->success_count++; + local = frame->local; - if ((local->success_count >= priv->wait_count) - && local->read_child_returned) { - need_unwind = 1; - } - } - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); - - if (need_unwind) - local->transaction.unwind (frame, this); - - call_count = afr_frame_return (frame); + if (op_ret == 0 && prebuf->ia_size != postbuf->ia_size) + local->stable_write = _gf_false; - if (call_count == 0) { - local->transaction.resume (frame, this); - } - - return 0; + return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, + prebuf, postbuf, xdata); } -int32_t -afr_truncate_wind (call_frame_t *frame, xlator_t *this) +int +afr_truncate_wind (call_frame_t *frame, xlator_t *this, int subvol) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; local = frame->local; priv = this->private; - call_count = afr_up_children_count (priv->child_count, local->child_up); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_truncate_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->truncate, - &local->loc, - local->cont.truncate.offset); - - if (!--call_count) - break; - } - } - - return 0; -} - - -int -afr_truncate_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - - local = frame->local; - - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); - + STACK_WIND_COOKIE (frame, afr_truncate_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->truncate, + &local->loc, local->cont.truncate.offset, + local->xdata_req); return 0; } int afr_truncate (call_frame_t *frame, xlator_t *this, - loc_t *loc, off_t offset) + loc_t *loc, off_t offset, dict_t *xdata) { afr_private_t * priv = NULL; afr_local_t * local = NULL; call_frame_t *transaction_frame = NULL; - int ret = -1; - int op_ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + int ret = -1; + int op_errno = ENOMEM; priv = this->private; - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - goto out; - } - - ALLOC_OR_GOTO (local, afr_local_t, out); + QUORUM_CHECK(truncate,out); - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + transaction_frame = copy_frame (frame); + if (!transaction_frame) goto out; - } - - transaction_frame->local = local; - local->op_ret = -1; + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) + goto out; local->cont.truncate.offset = offset; - local->cont.truncate.ino = loc->inode->ino; + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); - local->transaction.fop = afr_truncate_wind; - local->transaction.done = afr_truncate_done; + if (!local->xdata_req) + goto out; + + local->transaction.wind = afr_truncate_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; local->transaction.unwind = afr_truncate_unwind; loc_copy (&local->loc, loc); + local->inode = inode_ref (loc->inode); + + local->op = GF_FOP_TRUNCATE; local->transaction.main_frame = frame; local->transaction.start = offset; local->transaction.len = 0; - afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + /* Set it true speculatively, will get reset in afr_truncate_wind_cbk + if truncate was not a NOP */ + local->stable_write = _gf_true; - op_ret = 0; -out: - if (op_ret == -1) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (truncate, frame, op_ret, op_errno, NULL, NULL); + ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; } + return 0; +out: + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + + AFR_STACK_UNWIND (truncate, frame, -1, op_errno, NULL, NULL, NULL); return 0; } @@ -517,20 +603,13 @@ afr_ftruncate_unwind (call_frame_t *frame, xlator_t *this) local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; - if (main_frame) { - AFR_STACK_UNWIND (ftruncate, main_frame, local->op_ret, - local->op_errno, - &local->cont.ftruncate.prebuf, - &local->cont.ftruncate.postbuf); - } + AFR_STACK_UNWIND (ftruncate, main_frame, local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, local->xdata_rsp); return 0; } @@ -538,994 +617,1138 @@ afr_ftruncate_unwind (call_frame_t *frame, xlator_t *this) int afr_ftruncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf) + struct iatt *postbuf, dict_t *xdata) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - int child_index = (long) cookie; - int call_count = -1; - int need_unwind = 0; - int read_child = 0; - - local = frame->local; - priv = this->private; - - read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); - - LOCK (&frame->lock); - { - if (child_index == read_child) { - local->read_child_returned = _gf_true; - } - - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); + afr_local_t *local = NULL; - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - local->cont.ftruncate.prebuf = *prebuf; - local->cont.ftruncate.postbuf = *postbuf; - } + local = frame->local; - if (child_index == read_child) { - local->cont.ftruncate.prebuf = *prebuf; - local->cont.ftruncate.postbuf = *postbuf; - } + if (op_ret == 0 && prebuf->ia_size != postbuf->ia_size) + local->stable_write = _gf_false; - local->success_count++; + return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, + prebuf, postbuf, xdata); +} - if ((local->success_count >= priv->wait_count) - && local->read_child_returned) { - need_unwind = 1; - } - } - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); - if (need_unwind) - local->transaction.unwind (frame, this); +int +afr_ftruncate_wind (call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - call_count = afr_frame_return (frame); - - if (call_count == 0) { - local->transaction.resume (frame, this); - } + local = frame->local; + priv = this->private; + STACK_WIND_COOKIE (frame, afr_ftruncate_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->ftruncate, + local->fd, local->cont.ftruncate.offset, + local->xdata_req); return 0; } int -afr_ftruncate_wind (call_frame_t *frame, xlator_t *this) +afr_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata) { - afr_local_t *local = NULL; afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; - local = frame->local; priv = this->private; - call_count = afr_up_children_count (priv->child_count, local->child_up); + QUORUM_CHECK(ftruncate,out); - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } + transaction_frame = copy_frame (frame); + if (!frame) + goto out; - local->call_count = call_count; + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) + goto out; - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_ftruncate_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->ftruncate, - local->fd, local->cont.ftruncate.offset); - - if (!--call_count) - break; - } - } + local->cont.ftruncate.offset = offset; + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); - return 0; -} + if (!local->xdata_req) + goto out; + local->fd = fd_ref (fd); + local->inode = inode_ref (fd->inode); -int -afr_ftruncate_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; + local->op = GF_FOP_FTRUNCATE; - local = frame->local; + local->transaction.wind = afr_ftruncate_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_ftruncate_unwind; - local->transaction.unwind (frame, this); + local->transaction.main_frame = frame; - AFR_STACK_DESTROY (frame); + local->transaction.start = local->cont.ftruncate.offset; + local->transaction.len = 0; + + afr_fix_open (fd, this); + + /* Set it true speculatively, will get reset in afr_ftruncate_wind_cbk + if truncate was not a NOP */ + local->stable_write = _gf_true; + + ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + return 0; +out: + AFR_STACK_UNWIND (ftruncate, frame, -1, op_errno, NULL, NULL, NULL); return 0; } +/* }}} */ + +/* {{{ setattr */ int -afr_do_ftruncate (call_frame_t *frame, xlator_t *this) +afr_setattr_unwind (call_frame_t *frame, xlator_t *this) { - call_frame_t * transaction_frame = NULL; - afr_local_t * local = NULL; - int op_ret = -1; - int op_errno = 0; + afr_local_t *local = NULL; + call_frame_t *main_frame = NULL; local = frame->local; - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - goto out; - } - - transaction_frame->local = local; - frame->local = NULL; + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; - local->op = GF_FOP_FTRUNCATE; + AFR_STACK_UNWIND (setattr, main_frame, local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, + local->xdata_rsp); + return 0; +} - local->transaction.fop = afr_ftruncate_wind; - local->transaction.done = afr_ftruncate_done; - local->transaction.unwind = afr_ftruncate_unwind; - local->transaction.main_frame = frame; +int +afr_setattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, + struct iatt *preop, struct iatt *postop, dict_t *xdata) +{ + return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, + preop, postop, xdata); +} - local->transaction.start = local->cont.ftruncate.offset; - local->transaction.len = 0; - afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); +int +afr_setattr_wind (call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - op_ret = 0; -out: - if (op_ret == -1) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (ftruncate, frame, op_ret, op_errno, NULL, NULL); - } + local = frame->local; + priv = this->private; + STACK_WIND_COOKIE (frame, afr_setattr_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->setattr, + &local->loc, &local->cont.setattr.in_buf, + local->cont.setattr.valid, local->xdata_req); return 0; } int -afr_ftruncate (call_frame_t *frame, xlator_t *this, - fd_t *fd, off_t offset) +afr_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *buf, + int32_t valid, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t *transaction_frame = NULL; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; int ret = -1; - int op_ret = -1; - int op_errno = 0; - uint64_t ctx = 0; - afr_fd_ctx_t *fd_ctx = NULL; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + int op_errno = ENOMEM; priv = this->private; - ALLOC_OR_GOTO (local, afr_local_t, out); - ret = AFR_LOCAL_INIT (local, priv); + QUORUM_CHECK(setattr,out); - if (ret < 0) { - op_errno = -ret; + transaction_frame = copy_frame (frame); + if (!transaction_frame) goto out; - } - frame->local = local; + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) + goto out; - local->cont.ftruncate.offset = offset; - local->cont.ftruncate.ino = fd->inode->ino; + local->cont.setattr.in_buf = *buf; + local->cont.setattr.valid = valid; + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); - local->fd = fd_ref (fd); + if (!local->xdata_req) + goto out; - ret = fd_ctx_get (fd, this, &ctx); - if (ret < 0) { - goto out; - } + local->transaction.wind = afr_setattr_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_setattr_unwind; - fd_ctx = (afr_fd_ctx_t *)(long) ctx; + loc_copy (&local->loc, loc); + local->inode = inode_ref (loc->inode); - if (fd_ctx->up_count < priv->up_count) { - local->openfd_flush_cbk = afr_do_ftruncate; - afr_openfd_flush (frame, this, fd); - } else { - afr_do_ftruncate (frame, this); + local->op = GF_FOP_SETATTR; + + local->transaction.main_frame = frame; + local->transaction.start = LLONG_MAX - 1; + local->transaction.len = 0; + + ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; } - op_ret = 0; + return 0; out: - if (op_ret == -1) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (ftruncate, frame, op_ret, op_errno, NULL, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (setattr, frame, -1, op_errno, NULL, NULL, NULL); return 0; } -/* }}} */ - -/* {{{ setattr */ +/* {{{ fsetattr */ int -afr_setattr_unwind (call_frame_t *frame, xlator_t *this) +afr_fsetattr_unwind (call_frame_t *frame, xlator_t *this) { afr_local_t * local = NULL; call_frame_t *main_frame = NULL; local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - AFR_STACK_UNWIND (setattr, main_frame, local->op_ret, - local->op_errno, - &local->cont.setattr.preop_buf, - &local->cont.setattr.postop_buf); - } + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; + AFR_STACK_UNWIND (fsetattr, main_frame, local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, local->xdata_rsp); return 0; } int -afr_setattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *preop, struct iatt *postop) +afr_fsetattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *preop, struct iatt *postop, dict_t *xdata) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - int child_index = (long) cookie; - int read_child = 0; - int call_count = -1; - int need_unwind = 0; + return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, + preop, postop, xdata); +} + + +int +afr_fsetattr_wind (call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; local = frame->local; - priv = this->private; + priv = this->private; - read_child = afr_inode_get_read_ctx (this, local->loc.inode, NULL); + STACK_WIND_COOKIE (frame, afr_fsetattr_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->fsetattr, + local->fd, &local->cont.fsetattr.in_buf, + local->cont.fsetattr.valid, local->xdata_req); + return 0; +} - LOCK (&frame->lock); - { - if (child_index == read_child) { - local->read_child_returned = _gf_true; - } - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); +int +afr_fsetattr (call_frame_t *frame, xlator_t *this, + fd_t *fd, struct iatt *buf, int32_t valid, dict_t *xdata) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; + + priv = this->private; + + QUORUM_CHECK(fsetattr,out); - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - local->cont.setattr.preop_buf = *preop; - local->cont.setattr.postop_buf = *postop; - } + transaction_frame = copy_frame (frame); + if (!transaction_frame) + goto out; - if (child_index == read_child) { - local->cont.setattr.preop_buf = *preop; - local->cont.setattr.postop_buf = *postop; - } + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) + goto out; - local->success_count++; + local->cont.fsetattr.in_buf = *buf; + local->cont.fsetattr.valid = valid; + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); - if ((local->success_count >= priv->wait_count) - && local->read_child_returned) { - need_unwind = 1; - } - } - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); + if (!local->xdata_req) + goto out; - if (need_unwind) - local->transaction.unwind (frame, this); + local->transaction.wind = afr_fsetattr_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_fsetattr_unwind; - call_count = afr_frame_return (frame); + local->fd = fd_ref (fd); + local->inode = inode_ref (fd->inode); - if (call_count == 0) { - local->transaction.resume (frame, this); + local->op = GF_FOP_FSETATTR; + + afr_fix_open (fd, this); + + local->transaction.main_frame = frame; + local->transaction.start = LLONG_MAX - 1; + local->transaction.len = 0; + + ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; } + return 0; +out: + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + + AFR_STACK_UNWIND (fsetattr, frame, -1, op_errno, NULL, NULL, NULL); return 0; } -int32_t -afr_setattr_wind (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; - - local = frame->local; - priv = this->private; +/* {{{ setxattr */ - call_count = afr_up_children_count (priv->child_count, local->child_up); - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } +int +afr_setxattr_unwind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + call_frame_t *main_frame = NULL; - local->call_count = call_count; + local = frame->local; - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_setattr_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->setattr, - &local->loc, - &local->cont.setattr.in_buf, - local->cont.setattr.valid); - - if (!--call_count) - break; - } - } + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; + AFR_STACK_UNWIND (setxattr, main_frame, local->op_ret, local->op_errno, + local->xdata_rsp); return 0; } int -afr_setattr_done (call_frame_t *frame, xlator_t *this) +afr_setxattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_local_t *local = NULL; + return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, + NULL, NULL, xdata); +} - local = frame->local; - local->transaction.unwind (frame, this); +int +afr_setxattr_wind (call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - AFR_STACK_DESTROY (frame); + local = frame->local; + priv = this->private; + STACK_WIND_COOKIE (frame, afr_setxattr_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->setxattr, + &local->loc, local->cont.setxattr.dict, + local->cont.setxattr.flags, local->xdata_req); return 0; } int -afr_setattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, struct iatt *buf, int32_t valid) +afr_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, + int32_t flags, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; call_frame_t *transaction_frame = NULL; - int ret = -1; - int op_ret = -1; - int op_errno = 0; + int ret = -1; + int op_errno = EINVAL; + + GF_IF_INTERNAL_XATTR_GOTO ("trusted.afr.*", dict, + op_errno, out); - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.afr.*", dict, + op_errno, out); priv = this->private; + QUORUM_CHECK(setxattr,out); + transaction_frame = copy_frame (frame); - if (!transaction_frame) { + if (!transaction_frame) goto out; - } - - ALLOC_OR_GOTO (local, afr_local_t, out); - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) goto out; - } - - transaction_frame->local = local; - local->op_ret = -1; + local->cont.setxattr.dict = dict_ref (dict); + local->cont.setxattr.flags = flags; + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); - local->cont.setattr.ino = loc->inode->ino; + if (!local->xdata_req) + goto out; - local->cont.setattr.in_buf = *buf; - local->cont.setattr.valid = valid; - - local->transaction.fop = afr_setattr_wind; - local->transaction.done = afr_setattr_done; - local->transaction.unwind = afr_setattr_unwind; + local->transaction.wind = afr_setxattr_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_setxattr_unwind; loc_copy (&local->loc, loc); + local->inode = inode_ref (loc->inode); local->transaction.main_frame = frame; local->transaction.start = LLONG_MAX - 1; local->transaction.len = 0; - afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + local->op = GF_FOP_SETXATTR; - op_ret = 0; -out: - if (op_ret == -1) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (setattr, frame, op_ret, op_errno, NULL, NULL); + ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; } + return 0; +out: + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + + AFR_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL); + return 0; } -/* {{{ fsetattr */ +/* {{{ fsetxattr */ + int -afr_fsetattr_unwind (call_frame_t *frame, xlator_t *this) +afr_fsetxattr_unwind (call_frame_t *frame, xlator_t *this) { - afr_local_t * local = NULL; - call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; + call_frame_t *main_frame = NULL; local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - AFR_STACK_UNWIND (fsetattr, main_frame, local->op_ret, - local->op_errno, - &local->cont.fsetattr.preop_buf, - &local->cont.fsetattr.postop_buf); - } + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; + AFR_STACK_UNWIND (fsetxattr, main_frame, local->op_ret, local->op_errno, + local->xdata_rsp); return 0; } int -afr_fsetattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *preop, struct iatt *postop) +afr_fsetxattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - int child_index = (long) cookie; - int read_child = 0; - int call_count = -1; - int need_unwind = 0; + return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, + NULL, NULL, xdata); +} + + +int +afr_fsetxattr_wind (call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; local = frame->local; - priv = this->private; + priv = this->private; - read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); + STACK_WIND_COOKIE (frame, afr_fsetxattr_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->fsetxattr, + local->fd, local->cont.fsetxattr.dict, + local->cont.fsetxattr.flags, local->xdata_req); + return 0; +} - LOCK (&frame->lock); - { - if (child_index == read_child) { - local->read_child_returned = _gf_true; - } - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); +int +afr_fsetxattr (call_frame_t *frame, xlator_t *this, + fd_t *fd, dict_t *dict, int32_t flags, dict_t *xdata) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - local->cont.fsetattr.preop_buf = *preop; - local->cont.fsetattr.postop_buf = *postop; - } + GF_IF_INTERNAL_XATTR_GOTO ("trusted.afr.*", dict, + op_errno, out); - if (child_index == read_child) { - local->cont.fsetattr.preop_buf = *preop; - local->cont.fsetattr.postop_buf = *postop; - } + GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.afr.*", dict, + op_errno, out); - local->success_count++; + priv = this->private; - if ((local->success_count >= priv->wait_count) - && local->read_child_returned) { - need_unwind = 1; - } - } - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); + QUORUM_CHECK(fsetxattr,out); - if (need_unwind) - local->transaction.unwind (frame, this); + transaction_frame = copy_frame (frame); + if (!transaction_frame) + goto out; - call_count = afr_frame_return (frame); + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) + goto out; - if (call_count == 0) { - local->transaction.resume (frame, this); + local->cont.fsetxattr.dict = dict_ref (dict); + local->cont.fsetxattr.flags = flags; + + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); + + if (!local->xdata_req) + goto out; + + local->transaction.wind = afr_fsetxattr_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_fsetxattr_unwind; + + local->fd = fd_ref (fd); + local->inode = inode_ref (fd->inode); + + local->op = GF_FOP_FSETXATTR; + + local->transaction.main_frame = frame; + local->transaction.start = LLONG_MAX - 1; + local->transaction.len = 0; + + ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; } + return 0; +out: + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + + AFR_STACK_UNWIND (fsetxattr, frame, -1, op_errno, NULL); return 0; } +/* }}} */ -int32_t -afr_fsetattr_wind (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; - local = frame->local; - priv = this->private; +/* {{{ removexattr */ - call_count = afr_up_children_count (priv->child_count, local->child_up); - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } +int +afr_removexattr_unwind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + call_frame_t *main_frame = NULL; - local->call_count = call_count; + local = frame->local; - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_fsetattr_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fsetattr, - local->fd, - &local->cont.fsetattr.in_buf, - local->cont.fsetattr.valid); - - if (!--call_count) - break; - } - } + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; + AFR_STACK_UNWIND (removexattr, main_frame, local->op_ret, local->op_errno, + local->xdata_rsp); return 0; } int -afr_fsetattr_done (call_frame_t *frame, xlator_t *this) +afr_removexattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_local_t *local = NULL; + return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, + NULL, NULL, xdata); +} - local = frame->local; - local->transaction.unwind (frame, this); +int +afr_removexattr_wind (call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - AFR_STACK_DESTROY (frame); + local = frame->local; + priv = this->private; + STACK_WIND_COOKIE (frame, afr_removexattr_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->removexattr, + &local->loc, local->cont.removexattr.name, + local->xdata_req); return 0; } int -afr_fsetattr (call_frame_t *frame, xlator_t *this, - fd_t *fd, struct iatt *buf, int32_t valid) +afr_removexattr (call_frame_t *frame, xlator_t *this, + loc_t *loc, const char *name, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t *transaction_frame = NULL; - int ret = -1; - int op_ret = -1; - int op_errno = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; + + GF_IF_NATIVE_XATTR_GOTO ("trusted.afr.*", + name, op_errno, out); - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + GF_IF_NATIVE_XATTR_GOTO ("trusted.glusterfs.afr.*", + name, op_errno, out); priv = this->private; + QUORUM_CHECK(removexattr,out); + transaction_frame = copy_frame (frame); - if (!transaction_frame) { + if (!transaction_frame) goto out; - } - ALLOC_OR_GOTO (local, afr_local_t, out); + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) + goto out; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } - - transaction_frame->local = local; + local->cont.removexattr.name = gf_strdup (name); - local->op_ret = -1; + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); - local->cont.fsetattr.ino = fd->inode->ino; + if (!local->xdata_req) + goto out; - local->cont.fsetattr.in_buf = *buf; - local->cont.fsetattr.valid = valid; + local->transaction.wind = afr_removexattr_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_removexattr_unwind; - local->transaction.fop = afr_fsetattr_wind; - local->transaction.done = afr_fsetattr_done; - local->transaction.unwind = afr_fsetattr_unwind; + loc_copy (&local->loc, loc); + local->inode = inode_ref (loc->inode); - local->fd = fd_ref (fd); + local->op = GF_FOP_REMOVEXATTR; local->transaction.main_frame = frame; local->transaction.start = LLONG_MAX - 1; local->transaction.len = 0; - afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - op_ret = 0; + return 0; out: - if (op_ret == -1) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (fsetattr, frame, op_ret, op_errno, NULL, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (removexattr, frame, -1, op_errno, NULL); return 0; } - -/* {{{ setxattr */ - - +/* ffremovexattr */ int -afr_setxattr_unwind (call_frame_t *frame, xlator_t *this) +afr_fremovexattr_unwind (call_frame_t *frame, xlator_t *this) { afr_local_t * local = NULL; call_frame_t *main_frame = NULL; local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; - if (main_frame) { - AFR_STACK_UNWIND (setxattr, main_frame, - local->op_ret, local->op_errno) - } + AFR_STACK_UNWIND (fremovexattr, main_frame, local->op_ret, local->op_errno, + local->xdata_rsp); return 0; } int -afr_setxattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) +afr_fremovexattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - int call_count = -1; - int need_unwind = 0; - - local = frame->local; - priv = this->private; - - LOCK (&frame->lock); - { - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - } - local->success_count++; - - if (local->success_count == priv->child_count) { - need_unwind = 1; - } - } - - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); + return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, + NULL, NULL, xdata); +} - if (need_unwind) - local->transaction.unwind (frame, this); - call_count = afr_frame_return (frame); +int +afr_fremovexattr_wind (call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - if (call_count == 0) { - local->transaction.resume (frame, this); - } + local = frame->local; + priv = this->private; + STACK_WIND_COOKIE (frame, afr_fremovexattr_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->fremovexattr, + local->fd, local->cont.removexattr.name, + local->xdata_req); return 0; } int -afr_setxattr_wind (call_frame_t *frame, xlator_t *this) +afr_fremovexattr (call_frame_t *frame, xlator_t *this, fd_t *fd, + const char *name, dict_t *xdata) { + afr_private_t *priv = NULL; afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; - local = frame->local; - priv = this->private; + GF_IF_NATIVE_XATTR_GOTO ("trusted.afr.*", + name, op_errno, out); - call_count = afr_up_children_count (priv->child_count, local->child_up); + GF_IF_NATIVE_XATTR_GOTO ("trusted.glusterfs.afr.*", + name, op_errno, out); - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } + priv = this->private; - local->call_count = call_count; + QUORUM_CHECK(fremovexattr, out); - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_setxattr_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->setxattr, - &local->loc, - local->cont.setxattr.dict, - local->cont.setxattr.flags); - - if (!--call_count) - break; - } + transaction_frame = copy_frame (frame); + if (!transaction_frame) + goto out; + + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) + goto out; + + local->cont.removexattr.name = gf_strdup (name); + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); + + if (!local->xdata_req) + goto out; + + local->transaction.wind = afr_fremovexattr_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_fremovexattr_unwind; + + local->fd = fd_ref (fd); + local->inode = inode_ref (fd->inode); + + local->op = GF_FOP_FREMOVEXATTR; + + local->transaction.main_frame = frame; + local->transaction.start = LLONG_MAX - 1; + local->transaction.len = 0; + + ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; } + return 0; +out: + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + + AFR_STACK_UNWIND (fremovexattr, frame, -1, op_errno, NULL); + return 0; } int -afr_setxattr_done (call_frame_t *frame, xlator_t *this) +afr_fallocate_unwind (call_frame_t *frame, xlator_t *this) { - afr_local_t * local = frame->local; + afr_local_t * local = NULL; + call_frame_t *main_frame = NULL; - local->transaction.unwind (frame, this); + local = frame->local; - AFR_STACK_DESTROY (frame); + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; + AFR_STACK_UNWIND (fallocate, main_frame, local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, local->xdata_rsp); return 0; } + int -afr_setxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *dict, int32_t flags) +afr_fallocate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t *transaction_frame = NULL; - int ret = -1; - int op_ret = -1; - int op_errno = 0; + return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, + prebuf, postbuf, xdata); +} + - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); +int +afr_fallocate_wind (call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + local = frame->local; priv = this->private; - ALLOC_OR_GOTO (local, afr_local_t, out); + STACK_WIND_COOKIE (frame, afr_fallocate_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->fallocate, + local->fd, local->cont.fallocate.mode, + local->cont.fallocate.offset, + local->cont.fallocate.len, local->xdata_req); + return 0; +} - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } + +int +afr_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, + off_t offset, size_t len, dict_t *xdata) +{ + afr_private_t *priv = NULL; + call_frame_t *transaction_frame = NULL; + afr_local_t *local = NULL; + int ret = -1; + int op_errno = ENOMEM; + + priv = this->private; + + QUORUM_CHECK(fallocate,out); transaction_frame = copy_frame (frame); - if (!transaction_frame) { + if (!transaction_frame) goto out; - } - transaction_frame->local = local; + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) + goto out; - local->op_ret = -1; + local->cont.fallocate.mode = mode; + local->cont.fallocate.offset = offset; + local->cont.fallocate.len = len; - local->cont.setxattr.dict = dict_ref (dict); - local->cont.setxattr.flags = flags; + local->fd = fd_ref (fd); + local->inode = inode_ref (fd->inode); - local->transaction.fop = afr_setxattr_wind; - local->transaction.done = afr_setxattr_done; - local->transaction.unwind = afr_setxattr_unwind; + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); - loc_copy (&local->loc, loc); + if (!local->xdata_req) + goto out; + + local->op = GF_FOP_FALLOCATE; + + local->transaction.wind = afr_fallocate_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_fallocate_unwind; local->transaction.main_frame = frame; - local->transaction.start = LLONG_MAX - 1; + + local->transaction.start = local->cont.fallocate.offset; local->transaction.len = 0; - afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + afr_fix_open (fd, this); - op_ret = 0; -out: - if (op_ret == -1) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (setxattr, frame, op_ret, op_errno); + ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; } + return 0; +out: + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + + AFR_STACK_UNWIND (fallocate, frame, -1, op_errno, NULL, NULL, NULL); return 0; } -/* }}} */ -/* {{{ removexattr */ +/* }}} */ +/* {{{ discard */ int -afr_removexattr_unwind (call_frame_t *frame, xlator_t *this) +afr_discard_unwind (call_frame_t *frame, xlator_t *this) { afr_local_t * local = NULL; call_frame_t *main_frame = NULL; local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; - if (main_frame) { - AFR_STACK_UNWIND (removexattr, main_frame, - local->op_ret, local->op_errno) - } + AFR_STACK_UNWIND (discard, main_frame, local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, local->xdata_rsp); return 0; } int -afr_removexattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) +afr_discard_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - int call_count = -1; - int need_unwind = 0; - - local = frame->local; - priv = this->private; - - LOCK (&frame->lock); - { - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - } - local->success_count++; - - if (local->success_count == priv->wait_count) { - need_unwind = 1; - } - } - - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); + return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, + prebuf, postbuf, xdata); +} - if (need_unwind) - local->transaction.unwind (frame, this); - call_count = afr_frame_return (frame); +int +afr_discard_wind (call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - if (call_count == 0) { - local->transaction.resume (frame, this); - } + local = frame->local; + priv = this->private; + STACK_WIND_COOKIE (frame, afr_discard_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->discard, + local->fd, local->cont.discard.offset, + local->cont.discard.len, local->xdata_req); return 0; } -int32_t -afr_removexattr_wind (call_frame_t *frame, xlator_t *this) +int +afr_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) { + afr_private_t *priv = NULL; afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; - local = frame->local; priv = this->private; - call_count = afr_up_children_count (priv->child_count, local->child_up); + QUORUM_CHECK(discard, out); - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } + transaction_frame = copy_frame (frame); + if (!transaction_frame) + goto out; - local->call_count = call_count; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_removexattr_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->removexattr, - &local->loc, - local->cont.removexattr.name); - - if (!--call_count) - break; - } + local->cont.discard.offset = offset; + local->cont.discard.len = len; + + local->fd = fd_ref (fd); + local->inode = inode_ref (fd->inode); + + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); + + if (!local->xdata_req) + goto out; + + local->op = GF_FOP_DISCARD; + + local->transaction.wind = afr_discard_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_discard_unwind; + + local->transaction.main_frame = frame; + + local->transaction.start = local->cont.discard.offset; + local->transaction.len = 0; + + afr_fix_open (fd, this); + + ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; } + return 0; +out: + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + + AFR_STACK_UNWIND (discard, frame, -1, op_errno, NULL, NULL, NULL); return 0; } +/* {{{ zerofill */ + int -afr_removexattr_done (call_frame_t *frame, xlator_t *this) +afr_zerofill_unwind (call_frame_t *frame, xlator_t *this) { - afr_local_t * local = frame->local; + afr_local_t * local = NULL; + call_frame_t *main_frame = NULL; - local->transaction.unwind (frame, this); + local = frame->local; - AFR_STACK_DESTROY (frame); + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; + AFR_STACK_UNWIND (discard, main_frame, local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, local->xdata_rsp); return 0; } int -afr_removexattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, const char *name) +afr_zerofill_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t *transaction_frame = NULL; - int ret = -1; - int op_ret = -1; - int op_errno = 0; + return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, + prebuf, postbuf, xdata); +} - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - VALIDATE_OR_GOTO (loc, out); +int +afr_zerofill_wind (call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + local = frame->local; priv = this->private; - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - goto out; - } + STACK_WIND_COOKIE (frame, afr_zerofill_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->zerofill, + local->fd, local->cont.zerofill.offset, + local->cont.zerofill.len, local->xdata_req); + return 0; +} - ALLOC_OR_GOTO (local, afr_local_t, out); +int +afr_zerofill (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } + priv = this->private; - transaction_frame->local = local; + QUORUM_CHECK(discard, out); - local->op_ret = -1; + transaction_frame = copy_frame (frame); + if (!transaction_frame) + goto out; - local->cont.removexattr.name = gf_strdup (name); + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; - local->transaction.fop = afr_removexattr_wind; - local->transaction.done = afr_removexattr_done; - local->transaction.unwind = afr_removexattr_unwind; + local->cont.zerofill.offset = offset; + local->cont.zerofill.len = len; - loc_copy (&local->loc, loc); + local->fd = fd_ref (fd); + local->inode = inode_ref (fd->inode); + + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); + + if (!local->xdata_req) + goto out; + + local->op = GF_FOP_ZEROFILL; + + local->transaction.wind = afr_zerofill_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_zerofill_unwind; local->transaction.main_frame = frame; - local->transaction.start = LLONG_MAX - 1; - local->transaction.len = 0; - afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + local->transaction.start = local->cont.discard.offset; + local->transaction.len = len; - op_ret = 0; -out: - if (op_ret == -1) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (removexattr, frame, op_ret, op_errno); + afr_fix_open (fd, this); + + ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; } + return 0; +out: + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + + AFR_STACK_UNWIND (zerofill, frame, -1, op_errno, NULL, NULL, NULL); return 0; } + +/* }}} */ + + diff --git a/xlators/cluster/afr/src/afr-inode-write.h b/xlators/cluster/afr/src/afr-inode-write.h index d336f30e1..7b1fc5528 100644 --- a/xlators/cluster/afr/src/afr-inode-write.h +++ b/xlators/cluster/afr/src/afr-inode-write.h @@ -1,20 +1,11 @@ /* - Copyright (c) 2007-2010 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef __INODE_WRITE_H__ @@ -22,51 +13,70 @@ int32_t afr_chmod (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode); + loc_t *loc, mode_t mode, dict_t *xdata); int32_t afr_chown (call_frame_t *frame, xlator_t *this, - loc_t *loc, uid_t uid, gid_t gid); + loc_t *loc, uid_t uid, gid_t gid, dict_t *xdata); int afr_fchown (call_frame_t *frame, xlator_t *this, - fd_t *fd, uid_t uid, gid_t gid); + fd_t *fd, uid_t uid, gid_t gid, dict_t *xdata); int32_t afr_fchmod (call_frame_t *frame, xlator_t *this, - fd_t *fd, mode_t mode); + fd_t *fd, mode_t mode, dict_t *xdata); int32_t -afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, +afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, int32_t count, off_t offset, - struct iobref *iobref); + uint32_t flags, struct iobref *iobref, dict_t *xdata); int32_t afr_truncate (call_frame_t *frame, xlator_t *this, - loc_t *loc, off_t offset); + loc_t *loc, off_t offset, dict_t *xdata); int32_t afr_ftruncate (call_frame_t *frame, xlator_t *this, - fd_t *fd, off_t offset); + fd_t *fd, off_t offset, dict_t *xdata); int32_t afr_utimens (call_frame_t *frame, xlator_t *this, - loc_t *loc, struct timespec tv[2]); + loc_t *loc, struct timespec tv[2], dict_t *xdata); int afr_setattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, struct iatt *buf, int32_t valid); + loc_t *loc, struct iatt *buf, int32_t valid, dict_t *xdata); int afr_fsetattr (call_frame_t *frame, xlator_t *this, - fd_t *fd, struct iatt *buf, int32_t valid); + fd_t *fd, struct iatt *buf, int32_t valid, dict_t *xdata); int32_t afr_setxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *dict, int32_t flags); + loc_t *loc, dict_t *dict, int32_t flags, dict_t *xdata); + +int32_t +afr_fsetxattr (call_frame_t *frame, xlator_t *this, + fd_t *fd, dict_t *dict, int32_t flags, dict_t *xdata); int32_t afr_removexattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, const char *name); + loc_t *loc, const char *name, dict_t *xdata); + +int32_t +afr_fremovexattr (call_frame_t *frame, xlator_t *this, + fd_t *fd, const char *name, dict_t *xdata); +int +afr_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata); + +int +afr_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, + off_t offset, size_t len, dict_t *xdata); + +int +afr_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + off_t len, dict_t *xdata); #endif /* __INODE_WRITE_H__ */ diff --git a/xlators/cluster/afr/src/afr-lk-common.c b/xlators/cluster/afr/src/afr-lk-common.c index 0f514c73d..a2a758f35 100644 --- a/xlators/cluster/afr/src/afr-lk-common.c +++ b/xlators/cluster/afr/src/afr-lk-common.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2007-2010 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #include "dict.h" @@ -31,8 +22,69 @@ #define LOCKED_YES 0x1 /* for DATA, METADATA, ENTRY and higher_path */ #define LOCKED_LOWER 0x2 /* for lower path */ +#define AFR_TRACE_INODELK_IN(frame, this, params ...) \ + do { \ + afr_private_t *_priv = this->private; \ + if (!_priv->inodelk_trace) \ + break; \ + afr_trace_inodelk_in (frame, this, params); \ + } while (0); + +#define AFR_TRACE_INODELK_OUT(frame, this, params ...) \ + do { \ + afr_private_t *_priv = this->private; \ + if (!_priv->inodelk_trace) \ + break; \ + afr_trace_inodelk_out (frame, this, params); \ + } while (0); + +#define AFR_TRACE_ENTRYLK_IN(frame, this, params ...) \ + do { \ + afr_private_t *_priv = this->private; \ + if (!_priv->entrylk_trace) \ + break; \ + afr_trace_entrylk_in (frame, this, params); \ + } while (0); + +#define AFR_TRACE_ENTRYLK_OUT(frame, this, params ...) \ + do { \ + afr_private_t *_priv = this->private; \ + if (!_priv->entrylk_trace) \ + break; \ + afr_trace_entrylk_out (frame, this, params); \ + } while (0); + int -afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index); +afr_entry_lockee_cmp (const void *l1, const void *l2) +{ + const afr_entry_lockee_t *r1 = l1; + const afr_entry_lockee_t *r2 = l2; + int ret = 0; + uuid_t gfid1 = {0}; + uuid_t gfid2 = {0}; + + loc_gfid ((loc_t*)&r1->loc, gfid1); + loc_gfid ((loc_t*)&r2->loc, gfid2); + ret = uuid_compare (gfid1, gfid2); + /*Entrylks with NULL basename are the 'smallest'*/ + if (ret == 0) { + if (!r1->basename) + return -1; + if (!r2->basename) + return 1; + ret = strcmp (r1->basename, r2->basename); + } + + if (ret <= 0) + return -1; + else + return 1; +} + +int afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index); + +static int +afr_copy_locked_nodes (call_frame_t *frame, xlator_t *this); static uint64_t afr_lock_number = 1; @@ -57,12 +109,13 @@ afr_set_lock_number (call_frame_t *frame, xlator_t *this) } void -afr_set_lk_owner (call_frame_t *frame, xlator_t *this) +afr_set_lk_owner (call_frame_t *frame, xlator_t *this, void *lk_owner) { gf_log (this->name, GF_LOG_TRACE, "Setting lk-owner=%llu", - (unsigned long long) (unsigned long)frame->root); - frame->root->lk_owner = (uint64_t) (unsigned long)frame->root; + (unsigned long long) (unsigned long)lk_owner); + + set_lk_owner_from_ptr (&frame->root->lk_owner, lk_owner); } static int @@ -88,8 +141,7 @@ is_afr_lock_selfheal (afr_local_t *local) } int32_t -internal_lock_count (call_frame_t *frame, xlator_t *this, - afr_fd_ctx_t *fd_ctx) +internal_lock_count (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; afr_private_t *priv = NULL; @@ -99,17 +151,9 @@ internal_lock_count (call_frame_t *frame, xlator_t *this, local = frame->local; priv = this->private; - if (fd_ctx) { - GF_ASSERT (local->fd); - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i] && fd_ctx->opened_on[i]) - ++call_count; - } - } else { - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) - ++call_count; - } + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) + ++call_count; } return call_count; @@ -117,7 +161,7 @@ internal_lock_count (call_frame_t *frame, xlator_t *this, static void afr_print_inodelk (char *str, int size, int cmd, - struct gf_flock *flock, uint64_t owner) + struct gf_flock *flock, gf_lkowner_t *owner) { char *cmd_str = NULL; char *type_str = NULL; @@ -165,11 +209,11 @@ afr_print_inodelk (char *str, int size, int cmd, } snprintf (str, size, "lock=INODELK, cmd=%s, type=%s, " - "start=%llu, len=%llu, pid=%llu, lk-owner=%llu", + "start=%llu, len=%llu, pid=%llu, lk-owner=%s", cmd_str, type_str, (unsigned long long) flock->l_start, (unsigned long long) flock->l_len, (unsigned long long) flock->l_pid, - (unsigned long long) owner); + lkowner_utoa (owner)); } @@ -185,11 +229,11 @@ afr_print_lockee (char *str, int size, loc_t *loc, fd_t *fd, void afr_print_entrylk (char *str, int size, const char *basename, - uint64_t owner) + gf_lkowner_t *owner) { - snprintf (str, size, "Basename=%s, lk-owner=%llu", + snprintf (str, size, "Basename=%s, lk-owner=%s", basename ? basename : "<nul>", - (unsigned long long)owner); + lkowner_utoa (owner)); } static void @@ -243,27 +287,20 @@ afr_set_lock_call_type (afr_lock_call_type_t lock_call_type, } static void -afr_trace_inodelk_out (call_frame_t *frame, afr_lock_call_type_t lock_call_type, +afr_trace_inodelk_out (call_frame_t *frame, xlator_t *this, + afr_lock_call_type_t lock_call_type, afr_lock_op_type_t lk_op_type, struct gf_flock *flock, int op_ret, int op_errno, int32_t child_index) { - xlator_t *this = NULL; afr_internal_lock_t *int_lock = NULL; afr_local_t *local = NULL; - afr_private_t *priv = NULL; char lockee[256]; char lock_call_type_str[256]; char verdict[16]; - this = THIS; local = frame->local; int_lock = &local->internal_lock; - priv = this->private; - - if (!priv->inodelk_trace) { - return; - } afr_print_lockee (lockee, 256, &local->loc, local->fd, child_index); @@ -272,39 +309,31 @@ afr_trace_inodelk_out (call_frame_t *frame, afr_lock_call_type_t lock_call_type, afr_print_verdict (op_ret, op_errno, verdict); gf_log (this->name, GF_LOG_INFO, - "[%s %s] [%s] Lockee={%s} Number={%llu}", + "[%s %s] [%s] lk-owner=%s Lockee={%s} Number={%llu}", lock_call_type_str, lk_op_type == AFR_LOCK_OP ? "LOCK REPLY" : "UNLOCK REPLY", - verdict, - lockee, + verdict, lkowner_utoa (&frame->root->lk_owner), lockee, (unsigned long long) int_lock->lock_number); } static void -afr_trace_inodelk_in (call_frame_t *frame, afr_lock_call_type_t lock_call_type, +afr_trace_inodelk_in (call_frame_t *frame, xlator_t *this, + afr_lock_call_type_t lock_call_type, afr_lock_op_type_t lk_op_type, struct gf_flock *flock, int32_t cmd, int32_t child_index) { - xlator_t *this = NULL; afr_local_t *local = NULL; afr_internal_lock_t *int_lock = NULL; - afr_private_t *priv = NULL; char lock[256]; char lockee[256]; char lock_call_type_str[256]; - this = THIS; local = frame->local; int_lock = &local->internal_lock; - priv = this->private; - if (!priv->inodelk_trace) { - return; - } - - afr_print_inodelk (lock, 256, cmd, flock, frame->root->lk_owner); + afr_print_inodelk (lock, 256, cmd, flock, &frame->root->lk_owner); afr_print_lockee (lockee, 256, &local->loc, local->fd, child_index); afr_set_lock_call_type (lock_call_type, lock_call_type_str, int_lock); @@ -319,20 +348,21 @@ afr_trace_inodelk_in (call_frame_t *frame, afr_lock_call_type_t lock_call_type, } static void -afr_trace_entrylk_in (call_frame_t *frame, afr_lock_call_type_t lock_call_type, +afr_trace_entrylk_in (call_frame_t *frame, xlator_t *this, + afr_lock_call_type_t lock_call_type, afr_lock_op_type_t lk_op_type, const char *basename, - int32_t child_index) + int32_t cookie) { - xlator_t *this = NULL; afr_local_t *local = NULL; afr_internal_lock_t *int_lock = NULL; afr_private_t *priv = NULL; + int child_index = 0; + int lockee_no = 0; char lock[256]; char lockee[256]; char lock_call_type_str[256]; - this = THIS; local = frame->local; int_lock = &local->internal_lock; priv = this->private; @@ -340,36 +370,41 @@ afr_trace_entrylk_in (call_frame_t *frame, afr_lock_call_type_t lock_call_type, if (!priv->entrylk_trace) { return; } + lockee_no = cookie / priv->child_count; + child_index = cookie % priv->child_count; - afr_print_entrylk (lock, 256, basename, frame->root->lk_owner); - afr_print_lockee (lockee, 256, &local->loc, local->fd, child_index); + afr_print_entrylk (lock, 256, basename, &frame->root->lk_owner); + afr_print_lockee (lockee, 256, &int_lock->lockee[lockee_no].loc, local->fd, + child_index); afr_set_lock_call_type (lock_call_type, lock_call_type_str, int_lock); gf_log (this->name, GF_LOG_INFO, - "[%s %s] Lock={%s} Lockee={%s} Number={%llu}", + "[%s %s] Lock={%s} Lockee={%s} Number={%llu}, Cookie={%d}", lock_call_type_str, lk_op_type == AFR_LOCK_OP ? "LOCK REQUEST" : "UNLOCK REQUEST", lock, lockee, - (unsigned long long) int_lock->lock_number); + (unsigned long long) int_lock->lock_number, + cookie); } static void -afr_trace_entrylk_out (call_frame_t *frame, afr_lock_call_type_t lock_call_type, - afr_lock_op_type_t lk_op_type, const char *basename, int op_ret, - int op_errno, int32_t child_index) +afr_trace_entrylk_out (call_frame_t *frame, xlator_t *this, + afr_lock_call_type_t lock_call_type, + afr_lock_op_type_t lk_op_type, const char *basename, + int op_ret, int op_errno, int32_t cookie) { - xlator_t *this = NULL; afr_internal_lock_t *int_lock = NULL; afr_local_t *local = NULL; afr_private_t *priv = NULL; + int lockee_no = 0; + int child_index = 0; char lock[256]; char lockee[256]; char lock_call_type_str[256]; char verdict[16]; - this = THIS; local = frame->local; int_lock = &local->internal_lock; priv = this->private; @@ -377,20 +412,25 @@ afr_trace_entrylk_out (call_frame_t *frame, afr_lock_call_type_t lock_call_type, if (!priv->entrylk_trace) { return; } + lockee_no = cookie / priv->child_count; + child_index = cookie % priv->child_count; - afr_print_lockee (lockee, 256, &local->loc, local->fd, child_index); + afr_print_entrylk (lock, 256, basename, &frame->root->lk_owner); + afr_print_lockee (lockee, 256, &int_lock->lockee[lockee_no].loc, local->fd, + child_index); afr_set_lock_call_type (lock_call_type, lock_call_type_str, int_lock); afr_print_verdict (op_ret, op_errno, verdict); gf_log (this->name, GF_LOG_INFO, - "[%s %s] [%s] Lock={%s} Lockee={%s} Number={%llu}", + "[%s %s] [%s] Lock={%s} Lockee={%s} Number={%llu} Cookie={%d}", lock_call_type_str, lk_op_type == AFR_LOCK_OP ? "LOCK REPLY" : "UNLOCK REPLY", verdict, lock, lockee, - (unsigned long long) int_lock->lock_number); + (unsigned long long) int_lock->lock_number, + cookie); } @@ -443,6 +483,47 @@ is_afr_lock_transaction (afr_local_t *local) return ret; } +int +afr_init_entry_lockee (afr_entry_lockee_t *lockee, afr_local_t *local, + loc_t *loc, char *basename, int child_count) +{ + int ret = -1; + + loc_copy (&lockee->loc, loc); + lockee->basename = (basename)? gf_strdup (basename): NULL; + if (basename && !lockee->basename) + goto out; + + lockee->locked_count = 0; + lockee->locked_nodes = GF_CALLOC (child_count, + sizeof (*lockee->locked_nodes), + gf_afr_mt_afr_node_character); + + if (!lockee->locked_nodes) + goto out; + + ret = 0; +out: + return ret; + +} + +void +afr_entry_lockee_cleanup (afr_internal_lock_t *int_lock) +{ + int i = 0; + + for (i = 0; i < int_lock->lockee_count; i++) { + loc_wipe (&int_lock->lockee[i].loc); + if (int_lock->lockee[i].basename) + GF_FREE (int_lock->lockee[i].basename); + if (int_lock->lockee[i].locked_nodes) + GF_FREE (int_lock->lockee[i].locked_nodes); + } + + return; +} + static int initialize_entrylk_variables (call_frame_t *frame, xlator_t *this) { @@ -460,8 +541,13 @@ initialize_entrylk_variables (call_frame_t *frame, xlator_t *this) int_lock->lock_op_ret = -1; int_lock->lock_op_errno = 0; - for (i = 0; i < priv->child_count; i++) { - int_lock->entry_locked_nodes[i] = 0; + for (i = 0; i < AFR_LOCKEE_COUNT_MAX; i++) { + if (!int_lock->lockee[i].locked_nodes) + break; + int_lock->lockee[i].locked_count = 0; + memset (int_lock->lockee[i].locked_nodes, 0, + sizeof (*int_lock->lockee[i].locked_nodes) * + priv->child_count); } return 0; @@ -473,37 +559,37 @@ initialize_inodelk_variables (call_frame_t *frame, xlator_t *this) afr_local_t *local = NULL; afr_internal_lock_t *int_lock = NULL; afr_private_t *priv = NULL; - int i = 0; + afr_inodelk_t *inodelk = NULL; priv = this->private; local = frame->local; int_lock = &local->internal_lock; - int_lock->inodelk_lock_count = 0; - int_lock->lock_op_ret = -1; - int_lock->lock_op_errno = 0; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); - for (i = 0; i < priv->child_count; i++) { - int_lock->inode_locked_nodes[i] = 0; - } + inodelk->lock_count = 0; + int_lock->lk_attempted_count = 0; + int_lock->lock_op_ret = -1; + int_lock->lock_op_errno = 0; + + memset (inodelk->locked_nodes, 0, + sizeof (*inodelk->locked_nodes) * priv->child_count); + memset (int_lock->locked_nodes, 0, + sizeof (*int_lock->locked_nodes) * priv->child_count); return 0; } -loc_t * -lower_path (loc_t *l1, const char *b1, loc_t *l2, const char *b2) +int +afr_lockee_locked_nodes_count (afr_internal_lock_t *int_lock) { - int ret = 0; - - ret = strcmp (l1->path, l2->path); + int call_count = 0; + int i = 0; - if (ret == 0) - ret = strcmp (b1, b2); + for (i = 0; i < int_lock->lockee_count; i++) + call_count += int_lock->lockee[i].locked_count; - if (ret <= 0) - return l1; - else - return l2; + return call_count; } int @@ -524,7 +610,7 @@ afr_locked_nodes_count (unsigned char *locked_nodes, int child_count) /* FIXME: What if UNLOCK fails */ static int32_t afr_unlock_common_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) + int32_t op_ret, int32_t op_errno, dict_t *xdata) { afr_local_t *local = NULL; afr_internal_lock_t *int_lock = NULL; @@ -550,23 +636,37 @@ afr_unlock_common_cbk (call_frame_t *frame, void *cookie, xlator_t *this, static int32_t afr_unlock_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_local_t *local = NULL; + afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; + int32_t child_index = (long)cookie; + afr_private_t *priv = NULL; local = frame->local; + int_lock = &local->internal_lock; - afr_trace_inodelk_out (frame, AFR_INODELK_TRANSACTION, + AFR_TRACE_INODELK_OUT (frame, this, AFR_INODELK_TRANSACTION, AFR_UNLOCK_OP, NULL, op_ret, - op_errno, (long) cookie); + op_errno, child_index); + + priv = this->private; if (op_ret < 0 && op_errno != ENOTCONN && op_errno != EBADFD) { - gf_log (this->name, GF_LOG_ERROR, - "%s: unlock failed %s", - local->loc.path, strerror (op_errno)); + gf_log (this->name, GF_LOG_INFO, "%s: unlock failed on subvolume %s " + "with lock owner %s", local->loc.path, + priv->children[child_index]->name, + lkowner_utoa (&frame->root->lk_owner)); } - afr_unlock_common_cbk (frame, cookie, this, op_ret, op_errno); + + inodelk = afr_get_inodelk (int_lock, int_lock->domain); + inodelk->locked_nodes[child_index] &= LOCKED_NO; + if (local->transaction.eager_lock) + local->transaction.eager_lock[child_index] = 0; + + afr_unlock_common_cbk (frame, cookie, this, op_ret, op_errno, xdata); return 0; @@ -576,21 +676,30 @@ static int afr_unlock_inodelk (call_frame_t *frame, xlator_t *this) { afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; afr_local_t *local = NULL; afr_private_t *priv = NULL; struct gf_flock flock = {0,}; + struct gf_flock full_flock = {0,}; + struct gf_flock *flock_use = NULL; int call_count = 0; int i = 0; + int piggyback = 0; + afr_fd_ctx_t *fd_ctx = NULL; + local = frame->local; int_lock = &local->internal_lock; priv = this->private; - flock.l_start = int_lock->lk_flock.l_start; - flock.l_len = int_lock->lk_flock.l_len; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); + + flock.l_start = inodelk->flock.l_start; + flock.l_len = inodelk->flock.l_len; flock.l_type = F_UNLCK; - call_count = afr_locked_nodes_count (int_lock->inode_locked_nodes, + full_flock.l_type = F_UNLCK; + call_count = afr_locked_nodes_count (inodelk->locked_nodes, priv->child_count); int_lock->lk_call_count = call_count; @@ -602,55 +711,107 @@ afr_unlock_inodelk (call_frame_t *frame, xlator_t *this) goto out; } + if (local->fd) + fd_ctx = afr_fd_ctx_get (local->fd, this); + for (i = 0; i < priv->child_count; i++) { - if (int_lock->inode_locked_nodes[i] & LOCKED_YES) { - if (local->fd) { - afr_trace_inodelk_in (frame, AFR_INODELK_TRANSACTION, - AFR_UNLOCK_OP, &flock, F_SETLK, i); - - STACK_WIND_COOKIE (frame, afr_unlock_inodelk_cbk, - (void *) (long)i, - priv->children[i], - priv->children[i]->fops->finodelk, - this->name, local->fd, - F_SETLK, &flock); + if ((inodelk->locked_nodes[i] & LOCKED_YES) != LOCKED_YES) + continue; - if (!--call_count) - break; + if (local->fd) { + flock_use = &flock; + if (!local->transaction.eager_lock[i]) { + goto wind; + } - } else { - afr_trace_inodelk_in (frame, AFR_INODELK_TRANSACTION, - AFR_UNLOCK_OP, &flock, F_SETLK, i); + piggyback = 0; - STACK_WIND_COOKIE (frame, afr_unlock_inodelk_cbk, - (void *) (long)i, - priv->children[i], - priv->children[i]->fops->inodelk, - this->name, &local->loc, - F_SETLK, &flock); + LOCK (&local->fd->lock); + { + if (fd_ctx->lock_piggyback[i]) { + fd_ctx->lock_piggyback[i]--; + piggyback = 1; + } else { + fd_ctx->lock_acquired[i]--; + } + } + UNLOCK (&local->fd->lock); + if (piggyback) { + afr_unlock_inodelk_cbk (frame, (void *) (long) i, + this, 1, 0, NULL); if (!--call_count) break; - + continue; } - } + flock_use = &full_flock; + wind: + AFR_TRACE_INODELK_IN (frame, this, + AFR_INODELK_TRANSACTION, + AFR_UNLOCK_OP, flock_use, F_SETLK, + i); - } + STACK_WIND_COOKIE (frame, afr_unlock_inodelk_cbk, + (void *) (long)i, + priv->children[i], + priv->children[i]->fops->finodelk, + int_lock->domain, local->fd, + F_SETLK, flock_use, NULL); + if (!--call_count) + break; + + } else { + AFR_TRACE_INODELK_IN (frame, this, + AFR_INODELK_TRANSACTION, + AFR_UNLOCK_OP, &flock, F_SETLK, i); + + STACK_WIND_COOKIE (frame, afr_unlock_inodelk_cbk, + (void *) (long)i, + priv->children[i], + priv->children[i]->fops->inodelk, + int_lock->domain, &local->loc, + F_SETLK, &flock, NULL); + + if (!--call_count) + break; + } + } out: return 0; } static int32_t afr_unlock_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_trace_entrylk_out (frame, AFR_ENTRYLK_TRANSACTION, - AFR_UNLOCK_OP, NULL, op_ret, - op_errno, (long) cookie); + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_internal_lock_t *int_lock = NULL; + int32_t child_index = 0; + int lockee_no = 0; - afr_unlock_common_cbk (frame, cookie, this, op_ret, op_errno); + priv = this->private; + lockee_no = (int)((long) cookie) / priv->child_count; + child_index = (int) ((long) cookie) % priv->child_count; + + local = frame->local; + int_lock = &local->internal_lock; + + AFR_TRACE_ENTRYLK_OUT (frame, this, AFR_ENTRYLK_TRANSACTION, + AFR_UNLOCK_OP, + int_lock->lockee[lockee_no].basename, op_ret, + op_errno, (int) ((long)cookie)); + + if (op_ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "%s: unlock failed on %d, reason: %s", + local->loc.path, child_index, strerror (op_errno)); + } + + int_lock->lockee[lockee_no].locked_nodes[child_index] &= LOCKED_NO; + afr_unlock_common_cbk (frame, cookie, this, op_ret, op_errno, NULL); return 0; } @@ -658,24 +819,22 @@ afr_unlock_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, static int afr_unlock_entrylk (call_frame_t *frame, xlator_t *this) { - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - const char *basename = NULL; - loc_t *loc = NULL; - int call_count = 0; - int i = -1; + afr_internal_lock_t *int_lock = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + int index = 0; + int lockee_no = 0; + int copies = 0; + int i = -1; local = frame->local; int_lock = &local->internal_lock; priv = this->private; + copies = priv->child_count; - basename = int_lock->lk_basename; - if (int_lock->lk_loc) - loc = int_lock->lk_loc; + call_count = afr_lockee_locked_nodes_count (int_lock); - call_count = afr_locked_nodes_count (int_lock->entry_locked_nodes, - priv->child_count); int_lock->lk_call_count = call_count; if (!call_count){ @@ -685,18 +844,23 @@ afr_unlock_entrylk (call_frame_t *frame, xlator_t *this) goto out; } - for (i = 0; i < priv->child_count; i++) { - if (int_lock->entry_locked_nodes[i] & LOCKED_YES) { - afr_trace_entrylk_in (frame, AFR_ENTRYLK_NB_TRANSACTION, - AFR_UNLOCK_OP, basename, i); + for (i = 0; i < int_lock->lockee_count * priv->child_count; i++) { + lockee_no = i / copies; + index = i % copies; + if (int_lock->lockee[lockee_no].locked_nodes[index] & LOCKED_YES) { + AFR_TRACE_ENTRYLK_IN (frame, this, AFR_ENTRYLK_NB_TRANSACTION, + AFR_UNLOCK_OP, + int_lock->lockee[lockee_no].basename, + i); STACK_WIND_COOKIE (frame, afr_unlock_entrylk_cbk, (void *) (long) i, - priv->children[i], - priv->children[i]->fops->entrylk, - this->name, - loc, basename, - ENTRYLK_UNLOCK, ENTRYLK_WRLCK); + priv->children[index], + priv->children[index]->fops->entrylk, + int_lock->domain, + &int_lock->lockee[lockee_no].loc, + int_lock->lockee[lockee_no].basename, + ENTRYLK_UNLOCK, ENTRYLK_WRLCK, NULL); if (!--call_count) break; @@ -710,17 +874,21 @@ out: static int32_t afr_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int done = 0; - int child_index = (long) cookie; + afr_internal_lock_t *int_lock = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int cky = (long) cookie; + int child_index = 0; + int lockee_no = 0; + priv = this->private; local = frame->local; int_lock = &local->internal_lock; - priv = this->private; + + child_index = ((int)cky) % priv->child_count; + lockee_no = ((int)cky) / priv->child_count; LOCK (&frame->lock); { @@ -732,13 +900,13 @@ afr_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, "please load features/locks xlator on server"); local->op_ret = op_ret; int_lock->lock_op_ret = op_ret; - done = 1; } - local->child_up[child_index] = 0; local->op_errno = op_errno; int_lock->lock_op_errno = op_errno; } + + int_lock->lk_attempted_count++; } UNLOCK (&frame->lock); @@ -747,11 +915,17 @@ afr_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, afr_unlock (frame, this); } else { if (op_ret == 0) { - int_lock->locked_nodes[child_index] - |= LOCKED_YES; - int_lock->lock_count++; + if (local->transaction.type == AFR_ENTRY_TRANSACTION || + local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) { + int_lock->lockee[lockee_no].locked_nodes[child_index] |= LOCKED_YES; + int_lock->lockee[lockee_no].locked_count++; + int_lock->entrylk_lock_count++; + } else { + int_lock->locked_nodes[child_index] |= LOCKED_YES; + int_lock->lock_count++; + } } - afr_lock_blocking (frame, this, child_index + 1); + afr_lock_blocking (frame, this, cky + 1); } return 0; @@ -759,104 +933,26 @@ afr_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, static int32_t afr_blocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_trace_inodelk_out (frame, AFR_INODELK_TRANSACTION, + AFR_TRACE_INODELK_OUT (frame, this, AFR_INODELK_TRANSACTION, AFR_LOCK_OP, NULL, op_ret, op_errno, (long) cookie); - afr_lock_cbk (frame, cookie, this, op_ret, op_errno); + afr_lock_cbk (frame, cookie, this, op_ret, op_errno, xdata); return 0; } static int32_t -afr_lock_lower_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - afr_internal_lock_t *int_lock = NULL; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - loc_t *lower = NULL; - loc_t *higher = NULL; - const char *lower_name = NULL; - const char *higher_name = NULL; - int child_index = (long) cookie; - - priv = this->private; - local = frame->local; - int_lock = &local->internal_lock; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - if (op_errno == ENOSYS) { - /* return ENOTSUP */ - - gf_log (this->name, GF_LOG_ERROR, - "subvolume does not support locking. " - "please load features/locks xlator on server"); - - local->op_ret = op_ret; - } - - local->child_up[child_index] = 0; - local->op_errno = op_errno; - } - } - UNLOCK (&frame->lock); - - if (op_ret != 0) { - afr_unlock (frame, this); - goto out; - } else { - int_lock->lower_locked_nodes[child_index] |= LOCKED_LOWER; - int_lock->lock_count++; - } - - /* The lower path has been locked. Now lock the higher path */ - - lower = lower_path (&local->transaction.parent_loc, - local->transaction.basename, - &local->transaction.new_parent_loc, - local->transaction.new_basename); - - lower_name = (lower == &local->transaction.parent_loc ? - local->transaction.basename : - local->transaction.new_basename); - - higher = (lower == &local->transaction.parent_loc ? - &local->transaction.new_parent_loc : - &local->transaction.parent_loc); - - higher_name = (higher == &local->transaction.parent_loc ? - local->transaction.basename : - local->transaction.new_basename); - - afr_trace_entrylk_in (frame, AFR_ENTRYLK_TRANSACTION, - AFR_LOCK_OP, higher_name, child_index); - - - STACK_WIND_COOKIE (frame, afr_lock_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->entrylk, - this->name, higher, higher_name, - ENTRYLK_LOCK, ENTRYLK_WRLCK); - -out: - return 0; -} - -static int32_t afr_blocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_trace_entrylk_out (frame, AFR_ENTRYLK_TRANSACTION, + AFR_TRACE_ENTRYLK_OUT (frame, this, AFR_ENTRYLK_TRANSACTION, AFR_LOCK_OP, NULL, op_ret, op_errno, (long)cookie); - afr_lock_cbk (frame, cookie, this, op_ret, op_errno); + afr_lock_cbk (frame, cookie, this, op_ret, op_errno, xdata); return 0; } @@ -864,6 +960,7 @@ static int afr_copy_locked_nodes (call_frame_t *frame, xlator_t *this) { afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; afr_local_t *local = NULL; afr_private_t *priv = NULL; @@ -874,18 +971,16 @@ afr_copy_locked_nodes (call_frame_t *frame, xlator_t *this) switch (local->transaction.type) { case AFR_DATA_TRANSACTION: case AFR_METADATA_TRANSACTION: - memcpy (int_lock->inode_locked_nodes, - int_lock->locked_nodes, - priv->child_count); - int_lock->inodelk_lock_count = int_lock->lock_count; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); + memcpy (inodelk->locked_nodes, int_lock->locked_nodes, + sizeof (*inodelk->locked_nodes) * priv->child_count); + inodelk->lock_count = int_lock->lock_count; break; case AFR_ENTRY_RENAME_TRANSACTION: case AFR_ENTRY_TRANSACTION: - memcpy (int_lock->entry_locked_nodes, - int_lock->locked_nodes, - priv->child_count); - int_lock->entrylk_lock_count = int_lock->lock_count; + /*entrylk_count is being used in both non-blocking and blocking + * modes */ break; } @@ -893,28 +988,67 @@ afr_copy_locked_nodes (call_frame_t *frame, xlator_t *this) } +static inline gf_boolean_t +afr_is_entrylk (afr_internal_lock_t *int_lock, + afr_transaction_type trans_type) +{ + gf_boolean_t is_entrylk = _gf_false; + + if ((int_lock->transaction_lk_type == AFR_SELFHEAL_LK) && + int_lock->selfheal_lk_type == AFR_ENTRY_SELF_HEAL_LK) { + + is_entrylk = _gf_true; + + } else if ((int_lock->transaction_lk_type == AFR_TRANSACTION_LK) && + (trans_type == AFR_ENTRY_TRANSACTION || + trans_type == AFR_ENTRY_RENAME_TRANSACTION)) { + + is_entrylk = _gf_true; + + } else { + is_entrylk = _gf_false; + } + + return is_entrylk; +} + +static gf_boolean_t +_is_lock_wind_needed (afr_local_t *local, int child_index) +{ + if (!local->child_up[child_index]) + return _gf_false; + + return _gf_true; +} + int -afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index) +afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie) { afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; afr_local_t *local = NULL; afr_private_t *priv = NULL; - afr_fd_ctx_t *fd_ctx = NULL; - loc_t *lower = NULL; - loc_t *higher = NULL; - const char *lower_name = NULL; - const char *higher_name = NULL; struct gf_flock flock = {0,}; uint64_t ctx = 0; int ret = 0; + int child_index = 0; + int lockee_no = 0; + gf_boolean_t is_entrylk = _gf_false; - local = frame->local; - int_lock = &local->internal_lock; - priv = this->private; + local = frame->local; + int_lock = &local->internal_lock; + priv = this->private; + child_index = cookie % priv->child_count; + lockee_no = cookie / priv->child_count; + is_entrylk = afr_is_entrylk (int_lock, local->transaction.type); - flock.l_start = int_lock->lk_flock.l_start; - flock.l_len = int_lock->lk_flock.l_len; - flock.l_type = int_lock->lk_flock.l_type; + + if (!is_entrylk) { + inodelk = afr_get_inodelk (int_lock, int_lock->domain); + flock.l_start = inodelk->flock.l_start; + flock.l_len = inodelk->flock.l_len; + flock.l_type = inodelk->flock.l_type; + } if (local->fd) { ret = fd_ctx_get (local->fd, this, &ctx); @@ -933,46 +1067,26 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index) return 0; } - - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - - /* skip over children that or down - or don't have the fd open */ - - while ((child_index < priv->child_count) - && (!local->child_up[child_index] - || !fd_ctx->opened_on[child_index])) - - child_index++; - } else { - /* skip over children that are down */ - while ((child_index < priv->child_count) - && !local->child_up[child_index]) - child_index++; } - if ((child_index == priv->child_count) && - int_lock->lock_count == 0) { - - gf_log (this->name, GF_LOG_INFO, - "unable to lock on even one child"); - - local->op_ret = -1; - int_lock->lock_op_ret = -1; + if (int_lock->lk_expected_count == int_lock->lk_attempted_count) { + if ((is_entrylk && int_lock->entrylk_lock_count == 0) || + (!is_entrylk && int_lock->lock_count == 0)) { + gf_log (this->name, GF_LOG_INFO, + "unable to lock on even one child"); - afr_copy_locked_nodes (frame, this); + local->op_ret = -1; + int_lock->lock_op_ret = -1; - afr_unlock(frame, this); + afr_copy_locked_nodes (frame, this); - return 0; + afr_unlock(frame, this); + return 0; + } } - if ((child_index == priv->child_count) - || (int_lock->lock_count == - afr_up_children_count (priv->child_count, - local->child_up))) { - + if (int_lock->lk_expected_count == int_lock->lk_attempted_count) { /* we're done locking */ gf_log (this->name, GF_LOG_DEBUG, @@ -985,12 +1099,18 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index) return 0; } + if (!_is_lock_wind_needed (local, child_index)) { + afr_lock_blocking (frame, this, cookie + 1); + return 0; + } + switch (local->transaction.type) { case AFR_DATA_TRANSACTION: case AFR_METADATA_TRANSACTION: if (local->fd) { - afr_trace_inodelk_in (frame, AFR_INODELK_TRANSACTION, + AFR_TRACE_INODELK_IN (frame, this, + AFR_INODELK_TRANSACTION, AFR_LOCK_OP, &flock, F_SETLKW, child_index); @@ -998,11 +1118,12 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index) (void *) (long) child_index, priv->children[child_index], priv->children[child_index]->fops->finodelk, - this->name, local->fd, - F_SETLKW, &flock); + int_lock->domain, local->fd, + F_SETLKW, &flock, NULL); } else { - afr_trace_inodelk_in (frame, AFR_INODELK_TRANSACTION, + AFR_TRACE_INODELK_IN (frame, this, + AFR_INODELK_TRANSACTION, AFR_LOCK_OP, &flock, F_SETLKW, child_index); @@ -1010,79 +1131,50 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index) (void *) (long) child_index, priv->children[child_index], priv->children[child_index]->fops->inodelk, - this->name, &local->loc, - F_SETLKW, &flock); + int_lock->domain, &local->loc, + F_SETLKW, &flock, NULL); } break; case AFR_ENTRY_RENAME_TRANSACTION: - { - lower = lower_path (&local->transaction.parent_loc, - local->transaction.basename, - &local->transaction.new_parent_loc, - local->transaction.new_basename); - - lower_name = (lower == &local->transaction.parent_loc ? - local->transaction.basename : - local->transaction.new_basename); - - higher = (lower == &local->transaction.parent_loc ? - &local->transaction.new_parent_loc : - &local->transaction.parent_loc); - - higher_name = (higher == &local->transaction.parent_loc ? - local->transaction.basename : - local->transaction.new_basename); - - afr_trace_entrylk_in (frame, AFR_ENTRYLK_TRANSACTION, - AFR_LOCK_OP, lower_name, child_index); - - - STACK_WIND_COOKIE (frame, afr_lock_lower_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->entrylk, - this->name, lower, lower_name, - ENTRYLK_LOCK, ENTRYLK_WRLCK); - - break; - } - case AFR_ENTRY_TRANSACTION: + /*Accounting for child_index increments on 'down' + *and 'fd-less' children */ + if (local->fd) { - afr_trace_entrylk_in (frame, AFR_ENTRYLK_TRANSACTION, - AFR_LOCK_OP, local->transaction.basename, - child_index); + AFR_TRACE_ENTRYLK_IN (frame, this, AFR_ENTRYLK_TRANSACTION, + AFR_LOCK_OP, + int_lock->lockee[lockee_no].basename, + cookie); STACK_WIND_COOKIE (frame, afr_blocking_entrylk_cbk, - (void *) (long) child_index, + (void *) (long) cookie, priv->children[child_index], priv->children[child_index]->fops->fentrylk, - this->name, local->fd, - local->transaction.basename, - ENTRYLK_LOCK, ENTRYLK_WRLCK); + int_lock->domain, local->fd, + int_lock->lockee[lockee_no].basename, + ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL); } else { - afr_trace_entrylk_in (frame, AFR_ENTRYLK_TRANSACTION, + AFR_TRACE_ENTRYLK_IN (frame, this, + AFR_ENTRYLK_TRANSACTION, AFR_LOCK_OP, local->transaction.basename, child_index); STACK_WIND_COOKIE (frame, afr_blocking_entrylk_cbk, - (void *) (long) child_index, + (void *) (long) cookie, priv->children[child_index], priv->children[child_index]->fops->entrylk, - this->name, - &local->transaction.parent_loc, - local->transaction.basename, - ENTRYLK_LOCK, ENTRYLK_WRLCK); + int_lock->domain, + &int_lock->lockee[lockee_no].loc, + int_lock->lockee[lockee_no].basename, + ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL); } break; } return 0; - - } int32_t @@ -1091,6 +1183,7 @@ afr_blocking_lock (call_frame_t *frame, xlator_t *this) afr_internal_lock_t *int_lock = NULL; afr_local_t *local = NULL; afr_private_t *priv = NULL; + int up_count = 0; priv = this->private; local = frame->local; @@ -1104,6 +1197,10 @@ afr_blocking_lock (call_frame_t *frame, xlator_t *this) case AFR_ENTRY_RENAME_TRANSACTION: case AFR_ENTRY_TRANSACTION: + up_count = AFR_COUNT (local->child_up, priv->child_count); + int_lock->lk_call_count = int_lock->lk_expected_count + = (int_lock->lockee_count * + up_count); initialize_entrylk_variables (frame, this); break; } @@ -1115,59 +1212,68 @@ afr_blocking_lock (call_frame_t *frame, xlator_t *this) static int32_t afr_nonblocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) + int32_t op_ret, int32_t op_errno, dict_t *xdata) { afr_internal_lock_t *int_lock = NULL; afr_local_t *local = NULL; - afr_private_t *priv = NULL; int call_count = 0; int child_index = (long) cookie; + int copies = 0; + int index = 0; + int lockee_no = 0; + afr_private_t *priv = NULL; + + priv = this->private; + + copies = priv->child_count; + index = child_index % copies; + lockee_no = child_index / copies; local = frame->local; int_lock = &local->internal_lock; - priv = this->private; - afr_trace_entrylk_out (frame, AFR_ENTRYLK_TRANSACTION, - AFR_LOCK_OP, NULL, op_ret, + AFR_TRACE_ENTRYLK_OUT (frame, this, AFR_ENTRYLK_TRANSACTION, + AFR_LOCK_OP, + int_lock->lockee[lockee_no].basename, op_ret, op_errno, (long) cookie); - LOCK (&frame->lock); - { - call_count = --int_lock->lk_call_count; - } - UNLOCK (&frame->lock); - - if (op_ret < 0 ) { - if (op_errno == ENOSYS) { + LOCK (&frame->lock); + { + if (op_ret < 0 ) { + if (op_errno == ENOSYS) { /* return ENOTSUP */ - gf_log (this->name, GF_LOG_ERROR, - "subvolume does not support locking. " - "please load features/locks xlator on server"); - local->op_ret = op_ret; - int_lock->lock_op_ret = op_ret; + gf_log (this->name, GF_LOG_ERROR, + "subvolume does not support locking. " + "please load features/locks xlator on server"); + local->op_ret = op_ret; + int_lock->lock_op_ret = op_ret; + + int_lock->lock_op_errno = op_errno; + local->op_errno = op_errno; + } + } else if (op_ret == 0) { + int_lock->lockee[lockee_no].locked_nodes[index] |= \ + LOCKED_YES; + int_lock->lockee[lockee_no].locked_count++; + int_lock->entrylk_lock_count++; + } - local->child_up[child_index] = 0; - int_lock->lock_op_errno = op_errno; - local->op_errno = op_errno; - } - } else if (op_ret == 0) { - int_lock->entry_locked_nodes[child_index] - |= LOCKED_YES; - int_lock->entrylk_lock_count++; + call_count = --int_lock->lk_call_count; } + UNLOCK (&frame->lock); if (call_count == 0) { gf_log (this->name, GF_LOG_TRACE, "Last locking reply received"); - /* all locks successfull. Proceed to call FOP */ + /* all locks successful. Proceed to call FOP */ if (int_lock->entrylk_lock_count == - afr_up_children_count (priv->child_count, local->child_up)) { + int_lock->lk_expected_count) { gf_log (this->name, GF_LOG_TRACE, "All servers locked. Calling the cbk"); int_lock->lock_op_ret = 0; int_lock->lock_cbk (frame, this); } - /* Not all locks were successfull. Unlock and try locking + /* Not all locks were successful. Unlock and try locking again, this time with serially blocking locks */ else { gf_log (this->name, GF_LOG_TRACE, @@ -1184,31 +1290,26 @@ afr_nonblocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this) { - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - afr_fd_ctx_t *fd_ctx = NULL; - const char *basename = NULL; - loc_t *loc = NULL; - int32_t call_count = 0; + afr_internal_lock_t *int_lock = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_fd_ctx_t *fd_ctx = NULL; + int copies = 0; + int index = 0; + int lockee_no = 0; + int32_t call_count = 0; int i = 0; - uint64_t ctx = 0; - int ret = 0; local = frame->local; int_lock = &local->internal_lock; priv = this->private; + copies = priv->child_count; initialize_entrylk_variables (frame, this); - basename = int_lock->lk_basename; - if (int_lock->lk_loc) - loc = int_lock->lk_loc; - if (local->fd) { - ret = fd_ctx_get (local->fd, this, &ctx); - - if (ret < 0) { + fd_ctx = afr_fd_ctx_get (local->fd, this); + if (!fd_ctx) { gf_log (this->name, GF_LOG_INFO, "unable to get fd ctx for fd=%p", local->fd); @@ -1218,13 +1319,13 @@ afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this) local->op_errno = EINVAL; int_lock->lock_op_errno = EINVAL; + afr_unlock (frame, this); return -1; } - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - - call_count = internal_lock_count (frame, this, fd_ctx); + call_count = int_lock->lockee_count * internal_lock_count (frame, this); int_lock->lk_call_count = call_count; + int_lock->lk_expected_count = call_count; if (!call_count) { gf_log (this->name, GF_LOG_INFO, @@ -1235,41 +1336,52 @@ afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this) /* Send non-blocking entrylk calls only on up children and where the fd has been opened */ - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i] && fd_ctx->opened_on[i]) { - afr_trace_entrylk_in (frame, AFR_ENTRYLK_NB_TRANSACTION, - AFR_LOCK_OP, basename, i); + for (i = 0; i < int_lock->lockee_count*priv->child_count; i++) { + index = i%copies; + lockee_no = i/copies; + if (local->child_up[index]) { + AFR_TRACE_ENTRYLK_IN (frame, this, AFR_ENTRYLK_NB_TRANSACTION, + AFR_LOCK_OP, + int_lock->lockee[lockee_no].basename, + i); STACK_WIND_COOKIE (frame, afr_nonblocking_entrylk_cbk, (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fentrylk, + priv->children[index], + priv->children[index]->fops->fentrylk, this->name, local->fd, - basename, - ENTRYLK_LOCK_NB, ENTRYLK_WRLCK); + int_lock->lockee[lockee_no].basename, + ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, + NULL); + if (!--call_count) + break; } } } else { - GF_ASSERT (loc); - - call_count = internal_lock_count (frame, this, NULL); + call_count = int_lock->lockee_count * internal_lock_count (frame, this); int_lock->lk_call_count = call_count; + int_lock->lk_expected_count = call_count; - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - afr_trace_entrylk_in (frame, AFR_ENTRYLK_NB_TRANSACTION, - AFR_LOCK_OP, basename, i); + for (i = 0; i < int_lock->lockee_count*priv->child_count; i++) { + index = i%copies; + lockee_no = i/copies; + if (local->child_up[index]) { + AFR_TRACE_ENTRYLK_IN (frame, this, AFR_ENTRYLK_NB_TRANSACTION, + AFR_LOCK_OP, + int_lock->lockee[lockee_no].basename, + i); STACK_WIND_COOKIE (frame, afr_nonblocking_entrylk_cbk, (void *) (long) i, - priv->children[i], - priv->children[i]->fops->entrylk, - this->name, loc, basename, - ENTRYLK_LOCK_NB, ENTRYLK_WRLCK); + priv->children[index], + priv->children[index]->fops->entrylk, + this->name, &int_lock->lockee[lockee_no].loc, + int_lock->lockee[lockee_no].basename, + ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, + NULL); if (!--call_count) break; - } } } @@ -1279,58 +1391,75 @@ out: int32_t afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) + int32_t op_ret, int32_t op_errno, dict_t *xdata) { afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; afr_local_t *local = NULL; - afr_private_t *priv = NULL; int call_count = 0; int child_index = (long) cookie; + afr_fd_ctx_t *fd_ctx = NULL; + local = frame->local; int_lock = &local->internal_lock; - priv = this->private; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); - afr_trace_inodelk_out (frame, AFR_INODELK_NB_TRANSACTION, + AFR_TRACE_INODELK_OUT (frame, this, AFR_INODELK_NB_TRANSACTION, AFR_LOCK_OP, NULL, op_ret, op_errno, (long) cookie); + if (local->fd) + fd_ctx = afr_fd_ctx_get (local->fd, this); + LOCK (&frame->lock); { + if (op_ret < 0) { + if (op_errno == ENOSYS) { + /* return ENOTSUP */ + gf_log (this->name, GF_LOG_ERROR, + "subvolume does not support locking. " + "please load features/locks xlator on " + "server"); + local->op_ret = op_ret; + int_lock->lock_op_ret = op_ret; + int_lock->lock_op_errno = op_errno; + local->op_errno = op_errno; + } + if (local->transaction.eager_lock) + local->transaction.eager_lock[child_index] = 0; + } else { + inodelk->locked_nodes[child_index] |= LOCKED_YES; + inodelk->lock_count++; + + if (local->transaction.eager_lock && + local->transaction.eager_lock[child_index] && + local->fd) { + /* piggybacked */ + if (op_ret == 1) { + /* piggybacked */ + } else if (op_ret == 0) { + /* lock acquired from server */ + fd_ctx->lock_acquired[child_index]++; + } + } + } + call_count = --int_lock->lk_call_count; } UNLOCK (&frame->lock); - if (op_ret < 0 ) { - if (op_errno == ENOSYS) { - /* return ENOTSUP */ - gf_log (this->name, GF_LOG_ERROR, - "subvolume does not support locking. " - "please load features/locks xlator on server"); - local->op_ret = op_ret; - int_lock->lock_op_ret = op_ret; - local->child_up[child_index] = 0; - int_lock->lock_op_errno = op_errno; - local->op_errno = op_errno; - } - } else if (op_ret == 0) { - int_lock->inode_locked_nodes[child_index] - |= LOCKED_YES; - int_lock->inodelk_lock_count++; - } - if (call_count == 0) { gf_log (this->name, GF_LOG_TRACE, "Last inode locking reply received"); - /* all locks successfull. Proceed to call FOP */ - if (int_lock->inodelk_lock_count == - afr_up_children_count (priv->child_count, local->child_up)) { + /* all locks successful. Proceed to call FOP */ + if (inodelk->lock_count == int_lock->lk_expected_count) { gf_log (this->name, GF_LOG_TRACE, "All servers locked. Calling the cbk"); int_lock->lock_op_ret = 0; int_lock->lock_cbk (frame, this); } - /* Not all locks were successfull. Unlock and try locking + /* Not all locks were successful. Unlock and try locking again, this time with serially blocking locks */ else { gf_log (this->name, GF_LOG_TRACE, @@ -1348,29 +1477,35 @@ int afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this) { afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; afr_local_t *local = NULL; afr_private_t *priv = NULL; afr_fd_ctx_t *fd_ctx = NULL; - int32_t call_count = 0; - uint64_t ctx = 0; - int i = 0; - int ret = 0; - struct gf_flock flock = {0,}; + int32_t call_count = 0; + int i = 0; + int ret = 0; + struct gf_flock flock = {0,}; + struct gf_flock full_flock = {0,}; + struct gf_flock *flock_use = NULL; + int piggyback = 0; local = frame->local; int_lock = &local->internal_lock; priv = this->private; - flock.l_start = int_lock->lk_flock.l_start; - flock.l_len = int_lock->lk_flock.l_len; - flock.l_type = int_lock->lk_flock.l_type; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); + + flock.l_start = inodelk->flock.l_start; + flock.l_len = inodelk->flock.l_len; + flock.l_type = inodelk->flock.l_type; + + full_flock.l_type = inodelk->flock.l_type; initialize_inodelk_variables (frame, this); if (local->fd) { - ret = fd_ctx_get (local->fd, this, &ctx); - - if (ret < 0) { + fd_ctx = afr_fd_ctx_get (local->fd, this); + if (!fd_ctx) { gf_log (this->name, GF_LOG_INFO, "unable to get fd ctx for fd=%p", local->fd); @@ -1380,14 +1515,14 @@ afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this) local->op_errno = EINVAL; int_lock->lock_op_errno = EINVAL; + afr_unlock (frame, this); ret = -1; goto out; } - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - - call_count = internal_lock_count (frame, this, fd_ctx); + call_count = internal_lock_count (frame, this); int_lock->lk_call_count = call_count; + int_lock->lk_expected_count = call_count; if (!call_count) { gf_log (this->name, GF_LOG_INFO, @@ -1399,258 +1534,77 @@ afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this) /* Send non-blocking inodelk calls only on up children and where the fd has been opened */ for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i] && fd_ctx->opened_on[i]) { - afr_trace_inodelk_in (frame, AFR_INODELK_NB_TRANSACTION, - AFR_LOCK_OP, &flock, F_SETLK, i); - - STACK_WIND_COOKIE (frame, afr_nonblocking_inodelk_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->finodelk, - this->name, local->fd, - F_SETLK, &flock); - - if (!--call_count) - break; + if (!local->child_up[i]) + continue; + flock_use = &flock; + if (!local->transaction.eager_lock_on) { + goto wind; } - } - } else { - call_count = internal_lock_count (frame, this, NULL); - int_lock->lk_call_count = call_count; + piggyback = 0; + local->transaction.eager_lock[i] = 1; - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - afr_trace_inodelk_in (frame, AFR_INODELK_NB_TRANSACTION, - AFR_LOCK_OP, &flock, F_SETLK, i); + afr_set_delayed_post_op (frame, this); - STACK_WIND_COOKIE (frame, afr_nonblocking_inodelk_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->inodelk, - this->name, &local->loc, - F_SETLK, &flock); + LOCK (&local->fd->lock); + { + if (fd_ctx->lock_acquired[i]) { + fd_ctx->lock_piggyback[i]++; + piggyback = 1; + } + } + UNLOCK (&local->fd->lock); + if (piggyback) { + /* (op_ret == 1) => indicate piggybacked lock */ + afr_nonblocking_inodelk_cbk (frame, (void *) (long) i, + this, 1, 0, NULL); if (!--call_count) break; - + continue; } - } - } - -out: - return ret; -} - -static int -__is_lower_locked (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - int count = 0; - int i = 0; - - local = frame->local; - int_lock = &local->internal_lock; - priv = this->private; - - for (i = 0; i < priv->child_count; i++) { - if (int_lock->lower_locked_nodes[i] & LOCKED_LOWER) - count++; - } - - return count; - -} - -static int -__is_higher_locked (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - int count = 0; - int i = 0; - - local = frame->local; - int_lock = &local->internal_lock; - priv = this->private; - - for (i = 0; i < priv->child_count; i++) { - if (int_lock->locked_nodes[i] & LOCKED_YES) - count++; - } - - return count; - -} - -static int -afr_unlock_lower_entrylk (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - const char *basename = NULL; - loc_t *loc = NULL; - int call_count = 0; - int i = -1; + flock_use = &full_flock; + wind: + AFR_TRACE_INODELK_IN (frame, this, + AFR_INODELK_NB_TRANSACTION, + AFR_LOCK_OP, flock_use, F_SETLK, i); - local = frame->local; - int_lock = &local->internal_lock; - priv = this->private; - - basename = int_lock->lk_basename; - if (int_lock->lk_loc) - loc = int_lock->lk_loc; - - call_count = __is_lower_locked (frame, this); - int_lock->lk_call_count = call_count; + STACK_WIND_COOKIE (frame, afr_nonblocking_inodelk_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->finodelk, + int_lock->domain, local->fd, + F_SETLK, flock_use, NULL); - if (!call_count){ - gf_log (this->name, GF_LOG_TRACE, - "No internal locks unlocked"); - int_lock->lock_cbk (frame, this); - goto out; - } + if (!--call_count) + break; + } + } else { + call_count = internal_lock_count (frame, this); + int_lock->lk_call_count = call_count; + int_lock->lk_expected_count = call_count; - for (i = 0; i < priv->child_count; i++) { - if (int_lock->lower_locked_nodes[i] & LOCKED_LOWER) { - afr_trace_entrylk_in (frame, AFR_ENTRYLK_NB_TRANSACTION, - AFR_UNLOCK_OP, basename, i); + for (i = 0; i < priv->child_count; i++) { + if (!local->child_up[i]) + continue; + AFR_TRACE_INODELK_IN (frame, this, + AFR_INODELK_NB_TRANSACTION, + AFR_LOCK_OP, &flock, F_SETLK, i); - STACK_WIND_COOKIE (frame, afr_unlock_entrylk_cbk, + STACK_WIND_COOKIE (frame, afr_nonblocking_inodelk_cbk, (void *) (long) i, priv->children[i], - priv->children[i]->fops->entrylk, - this->name, - loc, basename, - ENTRYLK_UNLOCK, ENTRYLK_WRLCK); + priv->children[i]->fops->inodelk, + int_lock->domain, &local->loc, + F_SETLK, &flock, NULL); if (!--call_count) break; - } } - out: - return 0; - -} - - -static int -afr_post_unlock_higher_cbk (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - - local = frame->local; - - local->transaction.done (frame, this); - return 0; -} - -static int -afr_post_unlock_lower_cbk (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - loc_t *lower = NULL; - loc_t *higher = NULL; - const char *lower_name = NULL; - const char *higher_name = NULL; - - local = frame->local; - int_lock = &local->internal_lock; - - lower = lower_path (&local->transaction.parent_loc, - local->transaction.basename, - &local->transaction.new_parent_loc, - local->transaction.new_basename); - - lower_name = (lower == &local->transaction.parent_loc ? - local->transaction.basename : - local->transaction.new_basename); - - higher = (lower == &local->transaction.parent_loc ? - &local->transaction.new_parent_loc : - &local->transaction.parent_loc); - - higher_name = (higher == &local->transaction.parent_loc ? - local->transaction.basename : - local->transaction.new_basename); - - if (__is_higher_locked (frame, this)) { - gf_log (this->name, GF_LOG_DEBUG, - "unlocking higher"); - int_lock->lk_basename = higher_name; - int_lock->lk_loc = higher; - int_lock->lock_cbk = afr_post_unlock_higher_cbk; - - afr_unlock_entrylk (frame, this); - } else - local->transaction.done (frame, this); - - return 0; -} - -static int -afr_rename_unlock (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - loc_t *lower = NULL; - loc_t *higher = NULL; - const char *lower_name = NULL; - const char *higher_name = NULL; - - local = frame->local; - int_lock = &local->internal_lock; - - lower = lower_path (&local->transaction.parent_loc, - local->transaction.basename, - &local->transaction.new_parent_loc, - local->transaction.new_basename); - - lower_name = (lower == &local->transaction.parent_loc ? - local->transaction.basename : - local->transaction.new_basename); - - higher = (lower == &local->transaction.parent_loc ? - &local->transaction.new_parent_loc : - &local->transaction.parent_loc); - - higher_name = (higher == &local->transaction.parent_loc ? - local->transaction.basename : - local->transaction.new_basename); - - - if (__is_lower_locked (frame, this)) { - gf_log (this->name, GF_LOG_DEBUG, - "unlocking lower"); - int_lock->lk_basename = lower_name; - int_lock->lk_loc = lower; - int_lock->lock_cbk = afr_post_unlock_lower_cbk; - - afr_unlock_lower_entrylk (frame, this); - } else - afr_post_unlock_lower_cbk (frame, this); - - return 0; -} - -static int -afr_rename_transaction (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - - local = frame->local; - - return (local->transaction.type == - AFR_ENTRY_RENAME_TRANSACTION); - + return ret; } int32_t @@ -1664,10 +1618,8 @@ afr_unlock (call_frame_t *frame, xlator_t *this) if (is_afr_lock_transaction (local)) afr_unlock_inodelk (frame, this); else - if (!afr_rename_transaction (frame, this)) - afr_unlock_entrylk (frame, this); - else - afr_rename_unlock (frame, this); + afr_unlock_entrylk (frame, this); + } else { if (is_afr_lock_selfheal (local)) afr_unlock_inodelk (frame, this); @@ -1679,485 +1631,37 @@ afr_unlock (call_frame_t *frame, xlator_t *this) } int -afr_mark_locked_nodes (xlator_t *this, fd_t *fd, - unsigned char *locked_nodes) +afr_lk_transfer_datalock (call_frame_t *dst, call_frame_t *src, char *dom, + unsigned int child_count) { - afr_private_t *priv = NULL; - afr_fd_ctx_t *fdctx = NULL; - uint64_t tmp = 0; - int ret = 0; - - priv = this->private; - - ret = afr_fd_ctx_set (this, fd); - if (ret) - goto out; - - ret = fd_ctx_get (fd, this, &tmp); - if (ret) { - gf_log (this->name, GF_LOG_INFO, - "failed to get the fd ctx"); + afr_local_t *dst_local = NULL; + afr_local_t *src_local = NULL; + afr_internal_lock_t *dst_lock = NULL; + afr_internal_lock_t *src_lock = NULL; + afr_inodelk_t *dst_inodelk = NULL; + afr_inodelk_t *src_inodelk = NULL; + int ret = -1; + + src_local = src->local; + src_lock = &src_local->internal_lock; + src_inodelk = afr_get_inodelk (src_lock, dom); + dst_local = dst->local; + dst_lock = &dst_local->internal_lock; + dst_inodelk = afr_get_inodelk (dst_lock, dom); + if (!dst_inodelk || !src_inodelk) goto out; - } - fdctx = (afr_fd_ctx_t *) (long) tmp; - - GF_ASSERT (fdctx->locked_on); - - memcpy (fdctx->locked_on, locked_nodes, - priv->child_count); - -out: - return ret; -} - -static int -__is_fd_saved (xlator_t *this, fd_t *fd) -{ - afr_locked_fd_t *locked_fd = NULL; - afr_private_t *priv = NULL; - int found = 0; - - priv = this->private; - - list_for_each_entry (locked_fd, &priv->saved_fds, list) { - if (locked_fd->fd == fd) { - found = 1; - break; - } - } - - return found; -} - -static int -__afr_save_locked_fd (xlator_t *this, fd_t *fd) -{ - afr_private_t *priv = NULL; - afr_locked_fd_t *locked_fd = NULL; - int ret = 0; - - priv = this->private; - - locked_fd = GF_CALLOC (1, sizeof (*locked_fd), - gf_afr_mt_locked_fd); - if (!locked_fd) { - ret = -1; - goto out; - } - - locked_fd->fd = fd; - INIT_LIST_HEAD (&locked_fd->list); - - list_add_tail (&locked_fd->list, &priv->saved_fds); - -out: - return ret; -} - -int -afr_save_locked_fd (xlator_t *this, fd_t *fd) -{ - afr_private_t *priv = NULL; - int ret = 0; - - priv = this->private; - - pthread_mutex_lock (&priv->mutex); - { - if (__is_fd_saved (this, fd)) { - gf_log (this->name, GF_LOG_DEBUG, - "fd=%p already saved", fd); - goto unlock; - } - - ret = __afr_save_locked_fd (this, fd); - if (ret) { - gf_log (this->name, GF_LOG_INFO, - "fd=%p could not be saved", fd); - goto unlock; - } - } -unlock: - pthread_mutex_unlock (&priv->mutex); - - return ret; -} - -static int -afr_lock_recovery_cleanup (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_locked_fd_t *locked_fd = NULL; - - local = frame->local; - - locked_fd = local->locked_fd; - - STACK_DESTROY (frame->root); - afr_local_cleanup (local, this); - - afr_save_locked_fd (this, locked_fd->fd); - - return 0; - -} - -static int -afr_get_source_lock_recovery (xlator_t *this, fd_t *fd) -{ - afr_fd_ctx_t *fdctx = NULL; - afr_private_t *priv = NULL; - uint64_t tmp = 0; - int i = 0; - int source_child = -1; - int ret = 0; - - priv = this->private; - - ret = fd_ctx_get (fd, this, &tmp); - if (ret) - goto out; - - fdctx = (afr_fd_ctx_t *) (long) tmp; - - for (i = 0; i < priv->child_count; i++) { - if (fdctx->locked_on[i]) { - gf_log (this->name, GF_LOG_DEBUG, - "Found lock recovery source=%d", i); - source_child = i; - break; - } - } - -out: - return source_child; - -} - -int32_t -afr_get_locks_fd_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct gf_flock *lock); -int32_t -afr_recover_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct gf_flock *lock) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int32_t source_child = 0; - struct gf_flock flock = {0,}; - - local = frame->local; - priv = this->private; - - if (op_ret) { - gf_log (this->name, GF_LOG_INFO, - "lock recovery failed"); - goto cleanup; - } - - source_child = local->source_child; - - memcpy (&flock, lock, sizeof (*lock)); - - STACK_WIND_COOKIE (frame, afr_get_locks_fd_cbk, - (void *) (long) source_child, - priv->children[source_child], - priv->children[source_child]->fops->lk, - local->fd, F_GETLK_FD, &flock); - - return 0; - -cleanup: - afr_lock_recovery_cleanup (frame, this); - return 0; -} - -int -afr_recover_lock (call_frame_t *frame, xlator_t *this, - struct gf_flock *flock) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int32_t lock_recovery_child = 0; - - priv = this->private; - local = frame->local; - - lock_recovery_child = local->lock_recovery_child; - - frame->root->lk_owner = flock->l_owner; - - STACK_WIND_COOKIE (frame, afr_recover_lock_cbk, - (void *) (long) lock_recovery_child, - priv->children[lock_recovery_child], - priv->children[lock_recovery_child]->fops->lk, - local->fd, F_SETLK, flock); - - return 0; -} - -static int -is_afr_lock_eol (struct gf_flock *lock) -{ - int ret = 0; - - if ((lock->l_type == GF_LK_EOL)) - ret = 1; - - return ret; -} - -int32_t -afr_get_locks_fd_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct gf_flock *lock) -{ - if (op_ret) { - gf_log (this->name, GF_LOG_INFO, - "Failed to get locks on fd"); - goto cleanup; - } - - gf_log (this->name, GF_LOG_DEBUG, - "Got a lock on fd"); - - if (is_afr_lock_eol (lock)) { - gf_log (this->name, GF_LOG_INFO, - "Reached EOL on locks on fd"); - goto cleanup; - } - - afr_recover_lock (frame, this, lock); - - return 0; - -cleanup: - afr_lock_recovery_cleanup (frame, this); - - return 0; -} - -static int -afr_lock_recovery (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - fd_t *fd = NULL; - int ret = 0; - int32_t source_child = 0; - struct gf_flock flock = {0,}; - - priv = this->private; - local = frame->local; - - fd = local->fd; - - source_child = afr_get_source_lock_recovery (this, fd); - if (source_child < 0) { - gf_log (this->name, GF_LOG_ERROR, - "Could not recover locks due to lock " - "split brain"); - ret = -1; - goto out; - } - - local->source_child = source_child; - - /* the flock can be zero filled as we're querying incrementally - the locks held on the fd. - */ - STACK_WIND_COOKIE (frame, afr_get_locks_fd_cbk, - (void *) (long) source_child, - priv->children[source_child], - priv->children[source_child]->fops->lk, - local->fd, F_GETLK_FD, &flock); - -out: - return ret; -} - - -static int -afr_mark_fd_opened (xlator_t *this, fd_t *fd, int32_t child_index) -{ - afr_fd_ctx_t *fdctx = NULL; - uint64_t tmp = 0; - int ret = 0; - - ret = fd_ctx_get (fd, this, &tmp); - if (ret) - goto out; - - fdctx = (afr_fd_ctx_t *) (long) tmp; - - fdctx->opened_on[child_index] = 1; - -out: - return ret; -} - -int32_t -afr_lock_recovery_preopen_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd) -{ - int32_t child_index = (long )cookie; - int ret = 0; - - if (op_ret) { - gf_log (this->name, GF_LOG_INFO, - "Reopen during lock-recovery failed"); - goto cleanup; - } - - gf_log (this->name, GF_LOG_DEBUG, - "Open succeeded => proceed to recover locks"); - - ret = afr_lock_recovery (frame, this); - if (ret) { - gf_log (this->name, GF_LOG_INFO, - "Lock recovery failed"); - goto cleanup; - } - - ret = afr_mark_fd_opened (this, fd, child_index); - if (ret) { - gf_log (this->name, GF_LOG_INFO, - "Marking fd open failed"); - goto cleanup; - } - - return 0; - -cleanup: - afr_lock_recovery_cleanup (frame, this); - return 0; -} - -static int -afr_lock_recovery_preopen (call_frame_t *frame, xlator_t *this) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - uint64_t tmp = 0; - afr_fd_ctx_t *fdctx = NULL; - loc_t loc = {0,}; - int32_t child_index = 0; - int ret = 0; - - priv = this->private; - local = frame->local; - - GF_ASSERT (local && local->fd); - - ret = fd_ctx_get (local->fd, this, &tmp); - fdctx = (afr_fd_ctx_t *) (long) tmp; - GF_ASSERT (fdctx); - - child_index = local->lock_recovery_child; - - inode_path (local->fd->inode, NULL, (char **)&loc.path); - loc.name = strrchr (loc.path, '/'); - loc.inode = inode_ref (local->fd->inode); - loc.parent = inode_parent (local->fd->inode, 0, NULL); - - - STACK_WIND_COOKIE (frame, afr_lock_recovery_preopen_cbk, - (void *)(long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->open, - &loc, fdctx->flags, local->fd, - fdctx->wbflags); - - return 0; -} - -static int -is_fd_opened (fd_t *fd, int32_t child_index) -{ - afr_fd_ctx_t *fdctx = NULL; - uint64_t tmp = 0; - int ret = 0; - - ret = fd_ctx_get (fd, THIS, &tmp); - if (ret) - goto out; - - fdctx = (afr_fd_ctx_t *) (long) tmp; - - if (fdctx->opened_on[child_index]) - ret = 1; - -out: - return ret; -} - -int -afr_attempt_lock_recovery (xlator_t *this, int32_t child_index) -{ - call_frame_t *frame = NULL; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_locked_fd_t *locked_fd = NULL; - afr_locked_fd_t *tmp = NULL; - int ret = 0; - struct list_head locks_list = {0,}; - - - priv = this->private; - - if (list_empty (&priv->saved_fds)) - goto out; - - frame = create_frame (this, this->ctx->pool); - if (!frame) { - ret = -1; - goto out; - } - - local = GF_CALLOC (1, sizeof (*local), - gf_afr_mt_afr_local_t); - if (!local) { - ret = -1; - goto out; - } - - AFR_LOCAL_INIT (local, priv); - if (!local) { - ret = -1; - goto out; - } - - frame->local = local; - - INIT_LIST_HEAD (&locks_list); - - pthread_mutex_lock (&priv->mutex); - { - list_splice_init (&priv->saved_fds, &locks_list); - } - pthread_mutex_unlock (&priv->mutex); - - list_for_each_entry_safe (locked_fd, tmp, - &locks_list, list) { - - list_del_init (&locked_fd->list); - - local->fd = fd_ref (locked_fd->fd); - local->lock_recovery_child = child_index; - local->locked_fd = locked_fd; - - if (!is_fd_opened (locked_fd->fd, child_index)) { - gf_log (this->name, GF_LOG_DEBUG, - "attempting open before lock " - "recovery"); - afr_lock_recovery_preopen (frame, this); - } else { - gf_log (this->name, GF_LOG_DEBUG, - "attempting lock recovery " - "without a preopen"); - afr_lock_recovery (frame, this); - } - } - + if (src_inodelk->locked_nodes) { + memcpy (dst_inodelk->locked_nodes, src_inodelk->locked_nodes, + sizeof (*dst_inodelk->locked_nodes) * child_count); + memset (src_inodelk->locked_nodes, 0, + sizeof (*src_inodelk->locked_nodes) * child_count); + } + + dst_lock->transaction_lk_type = src_lock->transaction_lk_type; + dst_lock->selfheal_lk_type = src_lock->selfheal_lk_type; + dst_inodelk->lock_count = src_inodelk->lock_count; + src_inodelk->lock_count = 0; + ret = 0; out: return ret; } diff --git a/xlators/cluster/afr/src/afr-mem-types.h b/xlators/cluster/afr/src/afr-mem-types.h index 80e10482f..05df90cc0 100644 --- a/xlators/cluster/afr/src/afr-mem-types.h +++ b/xlators/cluster/afr/src/afr-mem-types.h @@ -1,20 +1,11 @@ /* - Copyright (c) 2008-2010 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ @@ -26,7 +17,6 @@ enum gf_afr_mem_types_ { gf_afr_mt_iovec = gf_common_mt_end + 1, gf_afr_mt_afr_fd_ctx_t, - gf_afr_mt_afr_local_t, gf_afr_mt_afr_private_t, gf_afr_mt_int32_t, gf_afr_mt_char, @@ -43,6 +33,16 @@ enum gf_afr_mem_types_ { gf_afr_mt_pump_priv, gf_afr_mt_locked_fd, gf_afr_mt_inode_ctx_t, + gf_afr_fd_paused_call_t, + gf_afr_mt_crawl_data_t, + gf_afr_mt_brick_pos_t, + gf_afr_mt_shd_bool_t, + gf_afr_mt_shd_timer_t, + gf_afr_mt_shd_event_t, + gf_afr_mt_time_t, + gf_afr_mt_pos_data_t, + gf_afr_mt_reply_t, + gf_afr_mt_subvol_healer_t, gf_afr_mt_end }; #endif diff --git a/xlators/cluster/afr/src/afr-open.c b/xlators/cluster/afr/src/afr-open.c index 66146a92d..f86aa7fd8 100644 --- a/xlators/cluster/afr/src/afr-open.c +++ b/xlators/cluster/afr/src/afr-open.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2007-2010 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #include <libgen.h> @@ -52,19 +43,31 @@ #include "afr-dir-read.h" #include "afr-dir-write.h" #include "afr-transaction.h" -#include "afr-self-heal.h" -#include "afr-self-heal-common.h" + + +gf_boolean_t +afr_is_fd_fixable (fd_t *fd) +{ + if (!fd || !fd->inode) + return _gf_false; + else if (fd_is_anonymous (fd)) + return _gf_false; + else if (uuid_is_null (fd->inode->gfid)) + return _gf_false; + + return _gf_true; +} int afr_open_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf) + struct iatt *postbuf, dict_t *xdata) { afr_local_t * local = frame->local; AFR_STACK_UNWIND (open, frame, local->op_ret, local->op_errno, - local->fd); + local->fd, xdata); return 0; } @@ -72,583 +75,263 @@ afr_open_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int afr_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, - fd_t *fd) + fd_t *fd, dict_t *xdata) { afr_local_t * local = NULL; - uint64_t ctx = 0; - afr_fd_ctx_t *fd_ctx = NULL; - int ret = 0; int call_count = -1; int child_index = (long) cookie; + afr_fd_ctx_t *fd_ctx = NULL; local = frame->local; + fd_ctx = local->fd_ctx; LOCK (&frame->lock); { if (op_ret == -1) { local->op_errno = op_errno; - } - - if (op_ret >= 0) { + fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED; + } else { local->op_ret = op_ret; - local->success_count++; - - ret = afr_fd_ctx_set (this, fd); - - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, - "could not set fd ctx for fd=%p", fd); - - local->op_ret = -1; - local->op_errno = -ret; - goto unlock; - } - - ret = fd_ctx_get (fd, this, &ctx); - - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, - "could not get fd ctx for fd=%p", fd); - local->op_ret = -1; - local->op_errno = -ret; - goto unlock; - } - - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - - fd_ctx->opened_on[child_index] = 1; - fd_ctx->flags = local->cont.open.flags; - fd_ctx->wbflags = local->cont.open.wbflags; + fd_ctx->opened_on[child_index] = AFR_FD_OPENED; + if (!local->xdata_rsp && xdata) + local->xdata_rsp = dict_ref (xdata); } } -unlock: UNLOCK (&frame->lock); call_count = afr_frame_return (frame); if (call_count == 0) { - if ((local->cont.open.flags & O_TRUNC) - && (local->op_ret >= 0)) { + if ((fd_ctx->flags & O_TRUNC) && (local->op_ret >= 0)) { STACK_WIND (frame, afr_open_ftruncate_cbk, this, this->fops->ftruncate, - fd, 0); + fd, 0, NULL); } else { AFR_STACK_UNWIND (open, frame, local->op_ret, - local->op_errno, local->fd); + local->op_errno, local->fd, + local->xdata_rsp); } } return 0; } - int afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, - fd_t *fd, int32_t wbflags) + fd_t *fd, dict_t *xdata) { afr_private_t * priv = NULL; afr_local_t * local = NULL; int i = 0; - int ret = -1; int32_t call_count = 0; - int32_t op_ret = -1; int32_t op_errno = 0; - int32_t wind_flags = flags & (~O_TRUNC); + afr_fd_ctx_t *fd_ctx = NULL; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - VALIDATE_OR_GOTO (loc, out); + //We can't let truncation to happen outside transaction. priv = this->private; - if (afr_is_split_brain (this, loc->inode)) { - /* self-heal failed */ - gf_log (this->name, GF_LOG_WARNING, - "failed to open as split brain seen, returning EIO"); - op_errno = EIO; - goto out; + if (flags & (O_CREAT|O_TRUNC)) { + QUORUM_CHECK(open,out); } - ALLOC_OR_GOTO (local, afr_local_t, out); + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } + fd_ctx = afr_fd_ctx_get (fd, this); + if (!fd_ctx) { + op_errno = ENOMEM; + goto out; + } - frame->local = local; - call_count = local->call_count; - - loc_copy (&local->loc, loc); + local->fd = fd_ref (fd); + local->fd_ctx = fd_ctx; + fd_ctx->flags = flags; - local->cont.open.flags = flags; - local->cont.open.wbflags = wbflags; + call_count = local->call_count; - local->fd = fd_ref (fd); + local->cont.open.flags = flags; for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { STACK_WIND_COOKIE (frame, afr_open_cbk, (void *) (long) i, priv->children[i], priv->children[i]->fops->open, - loc, wind_flags, fd, wbflags); - + loc, (flags & ~O_TRUNC), fd, xdata); if (!--call_count) break; } } - op_ret = 0; + return 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (open, frame, op_ret, op_errno, fd); - } + AFR_STACK_UNWIND (open, frame, -1, op_errno, fd, NULL); return 0; } - int -afr_openfd_sh_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - afr_fd_ctx_t *fd_ctx = NULL; - uint64_t ctx = 0; - int ret = 0; - int call_count = 0; - int child_index = (long) cookie; - - priv = this->private; - local = frame->local; - int_lock = &local->internal_lock; - - LOCK (&frame->lock); - { - if (op_ret >= 0) { - ret = fd_ctx_get (fd, this, &ctx); - - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "failed to get fd context, %p", fd); - goto out; - } - - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - - fd_ctx->opened_on[child_index] = 1; - - gf_log (this->name, GF_LOG_TRACE, - "fd for %s opened successfully on subvolume %s", - local->loc.path, priv->children[child_index]->name); - } - } -out: - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - int_lock->lock_cbk = local->transaction.done; - local->transaction.resume (frame, this); - } - - return 0; -} - - -static int -__unopened_count (int child_count, unsigned int *opened_on, unsigned char *child_up) -{ - int i = 0; - int count = 0; - - for (i = 0; i < child_count; i++) { - if (!opened_on[i] && child_up[i]) - count++; - } - - return count; -} - - -int -afr_openfd_sh_unwind (call_frame_t *frame, xlator_t *this, int32_t op_ret, - int32_t op_errno) +afr_openfd_fix_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd, + dict_t *xdata) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - uint64_t ctx = 0; afr_fd_ctx_t *fd_ctx = NULL; - int abandon = 0; - int ret = 0; - int i = 0; - int call_count = 0; + int call_count = 0; + int child_index = (long) cookie; - priv = this->private; - local = frame->local; + priv = this->private; + local = frame->local; - /* - * Some subvolumes might have come up on which we never - * opened this fd in the first place. Re-open fd's on those - * subvolumes now. - */ - - ret = fd_ctx_get (local->fd, this, &ctx); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "failed to get fd context %p (%s)", - local->fd, local->loc.path); - abandon = 1; - goto out; + if (op_ret >= 0) { + gf_log (this->name, GF_LOG_DEBUG, "fd for %s opened " + "successfully on subvolume %s", local->loc.path, + priv->children[child_index]->name); + } else { + gf_log (this->name, GF_LOG_ERROR, "Failed to open %s " + "on subvolume %s", local->loc.path, + priv->children[child_index]->name); } - fd_ctx = (afr_fd_ctx_t *)(long) ctx; + fd_ctx = local->fd_ctx; LOCK (&local->fd->lock); { - call_count = __unopened_count (priv->child_count, - fd_ctx->opened_on, - local->child_up); - for (i = 0; i < priv->child_count; i++) { - fd_ctx->pre_op_done[i] = 0; - fd_ctx->pre_op_piggyback[i] = 0; + if (op_ret >= 0) { + fd_ctx->opened_on[child_index] = AFR_FD_OPENED; + } else { + fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED; } } UNLOCK (&local->fd->lock); - if (call_count == 0) { - gf_log (this->name, GF_LOG_WARNING, - "fd not open on any subvolume %p (%s)", - local->fd, local->loc.path); - abandon = 1; - goto out; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (!fd_ctx->opened_on[i] && local->child_up[i]) { - gf_log (this->name, GF_LOG_TRACE, - "opening fd for %s on subvolume %s", - local->loc.path, priv->children[i]->name); - - STACK_WIND_COOKIE (frame, afr_openfd_sh_open_cbk, - (void *)(long) i, - priv->children[i], - priv->children[i]->fops->open, - &local->loc, fd_ctx->flags, local->fd, - fd_ctx->wbflags); - - if (!--call_count) - break; - } - } - -out: - if (abandon) - local->transaction.resume (frame, this); + call_count = afr_frame_return (frame); + if (call_count == 0) + AFR_STACK_DESTROY (frame); return 0; } static int -afr_prepare_loc (call_frame_t *frame, fd_t *fd) -{ - afr_local_t *local = NULL; - char *name = NULL; - char *path = NULL; - int ret = 0; - - if ((!fd) || (!fd->inode)) - return -1; - - local = frame->local; - ret = inode_path (fd->inode, NULL, (char **)&path); - if (ret <= 0) { - gf_log (frame->this->name, GF_LOG_DEBUG, - "Unable to get path for gfid: %s", - uuid_utoa (fd->inode->gfid)); - return -1; - } - - if (local->loc.path) { - if (strcmp (path, local->loc.path)) - gf_log (frame->this->name, GF_LOG_DEBUG, - "overwriting old loc->path %s with %s", - local->loc.path, path); - GF_FREE ((char *)local->loc.path); - } - local->loc.path = path; - - name = strrchr (local->loc.path, '/'); - if (name) - name++; - local->loc.name = name; - - if (local->loc.inode) { - inode_unref (local->loc.inode); - } - local->loc.inode = inode_ref (fd->inode); - - if (local->loc.parent) { - inode_unref (local->loc.parent); - } - - local->loc.parent = inode_parent (local->loc.inode, 0, NULL); - - return 0; -} - - -int -afr_openfd_sh (call_frame_t *frame, xlator_t *this) +afr_fd_ctx_need_open (fd_t *fd, xlator_t *this, unsigned char *need_open) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - char sh_type_str[256] = {0,}; - - local = frame->local; - sh = &local->self_heal; - - GF_ASSERT (local->loc.path); - /* forcibly trigger missing-entries self-heal */ - - sh->need_missing_entry_self_heal = _gf_true; - sh->need_gfid_self_heal = _gf_true; - sh->data_lock_held = _gf_true; - sh->need_data_self_heal = _gf_true; - sh->type = local->fd->inode->ia_type; - sh->background = _gf_false; - sh->unwind = afr_openfd_sh_unwind; - - afr_self_heal_type_str_get(&local->self_heal, - sh_type_str, - sizeof(sh_type_str)); - gf_log (this->name, GF_LOG_INFO, "%s self-heal triggered. " - "path: %s, reason: Replicate up down flush, data lock is held", - sh_type_str, local->loc.path); - - afr_self_heal (frame, this, local->fd->inode); - - return 0; + afr_fd_ctx_t *fd_ctx = NULL; + afr_private_t *priv = NULL; + int i = 0; + int count = 0; + + priv = this->private; + + fd_ctx = afr_fd_ctx_get (fd, this); + if (!fd_ctx) + return 0; + + LOCK (&fd->lock); + { + for (i = 0; i < priv->child_count; i++) { + if (fd_ctx->opened_on[i] == AFR_FD_NOT_OPENED && + priv->child_up[i]) { + fd_ctx->opened_on[i] = AFR_FD_OPENING; + need_open[i] = 1; + count++; + } else { + need_open[i] = 0; + } + } + } + UNLOCK (&fd->lock); + + return count; } -int -afr_openfd_flush_done (call_frame_t *frame, xlator_t *this) +void +afr_fix_open (fd_t *fd, xlator_t *this) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - - uint64_t ctx; - afr_fd_ctx_t * fd_ctx = NULL; - - int _ret = -1; + afr_private_t *priv = NULL; + int i = 0; + call_frame_t *frame = NULL; + afr_local_t *local = NULL; + int ret = -1; + int32_t op_errno = 0; + afr_fd_ctx_t *fd_ctx = NULL; + unsigned char *need_open = NULL; + int call_count = 0; priv = this->private; - local = frame->local; - - LOCK (&local->fd->lock); - { - _ret = __fd_ctx_get (local->fd, this, &ctx); - if (_ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "failed to get fd context %p (%s)", - local->fd, local->loc.path); - goto out; - } - - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - - fd_ctx->down_count = priv->down_count; - fd_ctx->up_count = priv->up_count; - } -out: - UNLOCK (&local->fd->lock); - - afr_local_transaction_cleanup (local, this); - - gf_log (this->name, GF_LOG_TRACE, - "The up/down flush is over"); - - fd_unref (local->fd); - local->openfd_flush_cbk (frame, this); - - return 0; -} - - - -int -afr_openfd_xaction (call_frame_t *frame, xlator_t *this, fd_t *fd) -{ - afr_local_t * local = NULL; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - local = frame->local; - - local->op = GF_FOP_FLUSH; - - local->transaction.fop = afr_openfd_sh; - local->transaction.done = afr_openfd_flush_done; - - local->transaction.start = 0; - local->transaction.len = 0; - - gf_log (this->name, GF_LOG_TRACE, - "doing up/down flush on fd=%p", fd); - - afr_transaction (frame, this, AFR_DATA_TRANSACTION); - -out: - return 0; -} - - - -int -afr_openfd_xaction_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int ret = 0; - uint64_t ctx = 0; - afr_fd_ctx_t *fd_ctx = NULL; - int call_count = 0; - int child_index = (long) cookie; - - priv = this->private; - local = frame->local; - int_lock = &local->internal_lock; - LOCK (&frame->lock); - { - if (op_ret >= 0) { - ret = fd_ctx_get (fd, this, &ctx); - - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "failed to get fd context %p (%s)", - fd, local->loc.path); - goto out; - } - - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - - fd_ctx->opened_on[child_index] = 1; - - gf_log (this->name, GF_LOG_TRACE, - "fd for %s opened successfully on subvolume %s", - local->loc.path, priv->children[child_index]->name); - } - } -out: - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); + if (!afr_is_fd_fixable (fd)) + goto out; - if (call_count == 0) { - afr_openfd_xaction (frame, this, local->fd); - } + fd_ctx = afr_fd_ctx_get (fd, this); + if (!fd_ctx) + goto out; - return 0; -} + need_open = alloca0 (priv->child_count); + call_count = afr_fd_ctx_need_open (fd, this, need_open); + if (!call_count) + goto out; -int -afr_openfd_flush (call_frame_t *frame, xlator_t *this, fd_t *fd) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - uint64_t ctx = 0; - afr_fd_ctx_t *fd_ctx = NULL; - int no_open = 0; - int ret = 0; - int i = 0; - int call_count = 0; + frame = create_frame (this, this->ctx->pool); + if (!frame) + goto out; - priv = this->private; - local = frame->local; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; - /* - * If the file is already deleted while the fd is open, no need to - * perform the openfd flush, call the flush_cbk and get out. - */ - ret = afr_prepare_loc (frame, fd); - if (ret < 0) { - local->openfd_flush_cbk (frame, this); + local->loc.inode = inode_ref (fd->inode); + ret = loc_path (&local->loc, NULL); + if (ret < 0) goto out; - } - - /* - * Some subvolumes might have come up on which we never - * opened this fd in the first place. Re-open fd's on those - * subvolumes now. - */ local->fd = fd_ref (fd); + local->fd_ctx = fd_ctx; - ret = fd_ctx_get (fd, this, &ctx); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "failed to get fd context %p (%s)", - fd, local->loc.path); - no_open = 1; - goto out; - } + local->call_count = call_count; - fd_ctx = (afr_fd_ctx_t *)(long) ctx; + gf_log (this->name, GF_LOG_DEBUG, "need open count: %d", + call_count); - LOCK (&local->fd->lock); - { - call_count = __unopened_count (priv->child_count, - fd_ctx->opened_on, - local->child_up); - } - UNLOCK (&local->fd->lock); - - if (call_count == 0) { - gf_log (this->name, GF_LOG_WARNING, - "fd not open on any subvolume %p (%s)", - fd, local->loc.path); - no_open = 1; - goto out; - } + for (i = 0; i < priv->child_count; i++) { + if (!need_open[i]) + continue; - local->call_count = call_count; + if (IA_IFDIR == fd->inode->ia_type) { + gf_log (this->name, GF_LOG_DEBUG, + "opening fd for dir %s on subvolume %s", + local->loc.path, priv->children[i]->name); - for (i = 0; i < priv->child_count; i++) { - if (!fd_ctx->opened_on[i] && local->child_up[i]) { - gf_log (this->name, GF_LOG_TRACE, - "opening fd for %s on subvolume %s", + STACK_WIND_COOKIE (frame, afr_openfd_fix_open_cbk, + (void*) (long) i, + priv->children[i], + priv->children[i]->fops->opendir, + &local->loc, local->fd, + NULL); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "opening fd for file %s on subvolume %s", local->loc.path, priv->children[i]->name); - STACK_WIND_COOKIE (frame, afr_openfd_xaction_open_cbk, + STACK_WIND_COOKIE (frame, afr_openfd_fix_open_cbk, (void *)(long) i, priv->children[i], priv->children[i]->fops->open, - &local->loc, fd_ctx->flags, fd, - fd_ctx->wbflags); - - if (!--call_count) - break; + &local->loc, + fd_ctx->flags & (~O_TRUNC), + local->fd, NULL); } + + if (!--call_count) + break; } + return; out: - if (no_open) - afr_openfd_xaction (frame, this, fd); - - return 0; + if (frame) + AFR_STACK_DESTROY (frame); } diff --git a/xlators/cluster/afr/src/afr-read-txn.c b/xlators/cluster/afr/src/afr-read-txn.c new file mode 100644 index 000000000..186f68c33 --- /dev/null +++ b/xlators/cluster/afr/src/afr-read-txn.c @@ -0,0 +1,239 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include "afr.h" +#include "afr-transaction.h" + +int +afr_read_txn_next_subvol (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; + int subvol = -1; + + local = frame->local; + priv = this->private; + + + for (i = 0; i < priv->child_count; i++) { + if (!local->readable[i]) { + /* don't even bother trying here. + just mark as attempted and move on. */ + local->read_attempted[i] = 1; + continue; + } + + if (!local->read_attempted[i]) { + subvol = i; + break; + } + } + + /* If no more subvols were available for reading, we leave + @subvol as -1, which is an indication we have run out of + readable subvols. */ + if (subvol != -1) + local->read_attempted[subvol] = 1; + local->readfn (frame, this, subvol); + + return 0; +} + + +int +afr_read_txn_refresh_done (call_frame_t *frame, xlator_t *this, int err) +{ + afr_local_t *local = NULL; + int read_subvol = 0; + int event_generation = 0; + inode_t *inode = NULL; + int ret = -1; + + local = frame->local; + inode = local->inode; + + if (err) { + local->op_errno = -err; + local->op_ret = -1; + read_subvol = -1; + goto readfn; + } + + ret = afr_inode_read_subvol_type_get (inode, this, local->readable, + &event_generation, + local->transaction.type); + + if (ret == -1 || !event_generation) { + /* Even after refresh, we don't have a good + read subvolume. Time to bail */ + local->op_ret = -1; + local->op_errno = EIO; + read_subvol = -1; + goto readfn; + } + + read_subvol = afr_read_subvol_select_by_policy (inode, this, + local->readable); + + if (read_subvol == -1) { + local->op_ret = -1; + local->op_errno = EIO; + goto readfn; + } + + if (local->read_attempted[read_subvol]) { + afr_read_txn_next_subvol (frame, this); + return 0; + } + + local->read_attempted[read_subvol] = 1; +readfn: + local->readfn (frame, this, read_subvol); + + return 0; +} + + +int +afr_read_txn_continue (call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + + local = frame->local; + + if (!local->refreshed) { + local->refreshed = _gf_true; + afr_inode_refresh (frame, this, local->inode, + afr_read_txn_refresh_done); + } else { + afr_read_txn_next_subvol (frame, this); + } + + return 0; +} + + +/* afr_read_txn_wipe: + + clean internal variables in @local in order to make + it possible to call afr_read_txn() multiple times from + the same frame +*/ + +void +afr_read_txn_wipe (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; + + local = frame->local; + priv = this->private; + + local->readfn = NULL; + + if (local->inode) + inode_unref (local->inode); + + for (i = 0; i < priv->child_count; i++) { + local->read_attempted[i] = 0; + local->readable[i] = 0; + } +} + + +/* + afr_read_txn: + + This is the read transaction function. The way it works: + + - Determine read-subvolume from inode ctx. + + - If read-subvolume's generation was stale, refresh ctx once by + calling afr_inode_refresh() + + Else make an attempt to read on read-subvolume. + + - If attempted read on read-subvolume fails, refresh ctx once + by calling afr_inode_refresh() + + - After ctx refresh, query read-subvolume freshly and attempt + read once. + + - If read fails, try every other readable[] subvolume before + finally giving up. readable[] elements are set by afr_inode_refresh() + based on dirty and pending flags. + + - If file is in split brain in the backend, generation will be + kept 0 by afr_inode_refresh() and readable[] will be set 0 for + all elements. Therefore reads always fail. +*/ + +int +afr_read_txn (call_frame_t *frame, xlator_t *this, inode_t *inode, + afr_read_txn_wind_t readfn, afr_transaction_type type) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int read_subvol = -1; + int event_generation = 0; + int ret = -1; + + priv = this->private; + local = frame->local; + + afr_read_txn_wipe (frame, this); + + local->readfn = readfn; + local->inode = inode_ref (inode); + + local->transaction.type = type; + ret = afr_inode_read_subvol_type_get (inode, this, local->readable, + &event_generation, type); + if (ret == -1) + /* very first transaction on this inode */ + goto refresh; + + if (local->event_generation != event_generation) + /* servers have disconnected / reconnected, and possibly + rebooted, very likely changing the state of freshness + of copies */ + goto refresh; + + read_subvol = afr_read_subvol_select_by_policy (inode, this, + local->readable); + + if (read_subvol < 0 || read_subvol > priv->child_count) { + gf_log (this->name, GF_LOG_WARNING, "Unreadable subvolume %d " + "found with event generation %d", read_subvol, + event_generation); + goto refresh; + } + + if (!local->child_up[read_subvol]) { + /* should never happen, just in case */ + gf_log (this->name, GF_LOG_WARNING, "subvolume %d is the " + "read subvolume in this generation, but is not up", + read_subvol); + goto refresh; + } + + local->read_attempted[read_subvol] = 1; + + local->readfn (frame, this, read_subvol); + + return 0; + +refresh: + afr_inode_refresh (frame, this, inode, afr_read_txn_refresh_done); + + return 0; +} diff --git a/xlators/cluster/afr/src/afr-self-heal-algorithm.c b/xlators/cluster/afr/src/afr-self-heal-algorithm.c deleted file mode 100644 index 5621db866..000000000 --- a/xlators/cluster/afr/src/afr-self-heal-algorithm.c +++ /dev/null @@ -1,1090 +0,0 @@ -/* - Copyright (c) 2010 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. -*/ - - -#include "glusterfs.h" -#include "afr.h" -#include "xlator.h" -#include "dict.h" -#include "xlator.h" -#include "hashfn.h" -#include "logging.h" -#include "stack.h" -#include "list.h" -#include "call-stub.h" -#include "defaults.h" -#include "common-utils.h" -#include "compat-errno.h" -#include "compat.h" -#include "byte-order.h" -#include "md5.h" - -#include "afr-transaction.h" -#include "afr-self-heal.h" -#include "afr-self-heal-common.h" -#include "afr-self-heal-algorithm.h" - -/* - This file contains the various self-heal algorithms -*/ - - -/* - The "full" algorithm. Copies the entire file from - source to sinks. -*/ - - -static void -sh_full_private_cleanup (call_frame_t *frame, xlator_t *this) -{ - afr_local_t * local = NULL; - afr_self_heal_t * sh = NULL; - afr_sh_algo_full_private_t *sh_priv = NULL; - - local = frame->local; - sh = &local->self_heal; - - sh_priv = sh->private; - - if (sh_priv) - GF_FREE (sh_priv); -} - - -static int -sh_full_loop_driver (call_frame_t *frame, xlator_t *this, gf_boolean_t is_first_call); - -static int -sh_full_loop_driver_done (call_frame_t *frame, xlator_t *this) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - afr_self_heal_t *sh = NULL; - afr_sh_algo_full_private_t *sh_priv = NULL; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - sh_priv = sh->private; - - sh_full_private_cleanup (frame, this); - if (sh->op_failed) { - gf_log (this->name, GF_LOG_INFO, - "full self-heal aborting on %s", - local->loc.path); - - local->self_heal.algo_abort_cbk (frame, this); - } else { - gf_log (this->name, GF_LOG_INFO, - "full self-heal completed on %s", - local->loc.path); - - local->self_heal.algo_completion_cbk (frame, this); - } - return 0; -} - -static int -sh_full_loop_return (call_frame_t *rw_frame, xlator_t *this, off_t offset) -{ - afr_local_t * rw_local = NULL; - afr_self_heal_t * rw_sh = NULL; - call_frame_t *sh_frame = NULL; - afr_local_t * sh_local = NULL; - afr_self_heal_t *sh = NULL; - afr_sh_algo_full_private_t *sh_priv = NULL; - - rw_local = rw_frame->local; - rw_sh = &rw_local->self_heal; - - sh_frame = rw_sh->sh_frame; - sh_local = sh_frame->local; - sh = &sh_local->self_heal; - sh_priv = sh->private; - - AFR_STACK_DESTROY (rw_frame); - - sh_full_loop_driver (sh_frame, this, _gf_false); - - return 0; -} - - -static int -sh_full_write_cbk (call_frame_t *rw_frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf) -{ - afr_private_t * priv = NULL; - afr_local_t * rw_local = NULL; - afr_self_heal_t *rw_sh = NULL; - call_frame_t *sh_frame = NULL; - afr_local_t * sh_local = NULL; - afr_self_heal_t *sh = NULL; - int child_index = (long) cookie; - int call_count = 0; - - priv = this->private; - - rw_local = rw_frame->local; - rw_sh = &rw_local->self_heal; - - sh_frame = rw_sh->sh_frame; - sh_local = sh_frame->local; - sh = &sh_local->self_heal; - - gf_log (this->name, GF_LOG_TRACE, - "wrote %d bytes of data from %s to child %d, offset %"PRId64"", - op_ret, sh_local->loc.path, child_index, - rw_sh->offset - op_ret); - - LOCK (&sh_frame->lock); - { - if (op_ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "write to %s failed on subvolume %s (%s)", - sh_local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - - sh->op_failed = 1; - } - } - UNLOCK (&sh_frame->lock); - - call_count = afr_frame_return (rw_frame); - - if (call_count == 0) { - sh_full_loop_return (rw_frame, this, rw_sh->offset - op_ret); - } - - return 0; -} - - -static int -sh_full_read_cbk (call_frame_t *rw_frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - struct iovec *vector, int32_t count, struct iatt *buf, - struct iobref *iobref) -{ - afr_private_t * priv = NULL; - afr_local_t * rw_local = NULL; - afr_self_heal_t *rw_sh = NULL; - call_frame_t *sh_frame = NULL; - afr_local_t * sh_local = NULL; - afr_self_heal_t *sh = NULL; - int i = 0; - int call_count = 0; - off_t offset = (long) cookie; - - priv = this->private; - rw_local = rw_frame->local; - rw_sh = &rw_local->self_heal; - - sh_frame = rw_sh->sh_frame; - sh_local = sh_frame->local; - sh = &sh_local->self_heal; - - call_count = sh->active_sinks; - - rw_local->call_count = call_count; - - gf_log (this->name, GF_LOG_TRACE, - "read %d bytes of data from %s, offset %"PRId64"", - op_ret, sh_local->loc.path, offset); - - if (op_ret <= 0) { - gf_log (this->name, GF_LOG_ERROR, - "read from %s failed on subvolume %s (%s)", - sh_local->loc.path, - priv->children[sh->source]->name, - strerror (op_errno)); - sh->op_failed = 1; - sh_full_loop_return (rw_frame, this, offset); - return 0; - } - - rw_sh->offset += op_ret; - - if (sh->file_has_holes) { - if (iov_0filled (vector, count) == 0) { - /* the iter function depends on the - sh->offset already being updated - above - */ - gf_log (this->name, GF_LOG_DEBUG, - "block has all 0 filled"); - sh_full_loop_return (rw_frame, this, offset); - goto out; - } - } - - for (i = 0; i < priv->child_count; i++) { - if (sh->sources[i] || !sh_local->child_up[i]) - continue; - - /* this is a sink, so write to it */ - - STACK_WIND_COOKIE (rw_frame, sh_full_write_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->writev, - sh->healing_fd, vector, count, offset, - iobref); - - if (!--call_count) - break; - } - -out: - return 0; -} - - -static int -sh_full_read_write (call_frame_t *frame, xlator_t *this, off_t offset) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - afr_local_t * rw_local = NULL; - afr_self_heal_t *rw_sh = NULL; - afr_self_heal_t *sh = NULL; - call_frame_t *rw_frame = NULL; - int32_t op_errno = 0; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - rw_frame = copy_frame (frame); - if (!rw_frame) - goto out; - - ALLOC_OR_GOTO (rw_local, afr_local_t, out); - - rw_frame->local = rw_local; - rw_sh = &rw_local->self_heal; - - rw_sh->offset = offset; - rw_sh->sh_frame = frame; - - STACK_WIND_COOKIE (rw_frame, sh_full_read_cbk, - (void *) (long) offset, - priv->children[sh->source], - priv->children[sh->source]->fops->readv, - sh->healing_fd, sh->block_size, - offset); - return 0; - -out: - sh->op_failed = 1; - - sh_full_loop_driver (frame, this, _gf_false); - - return 0; -} - - -static int -sh_full_loop_driver (call_frame_t *frame, xlator_t *this, gf_boolean_t is_first_call) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - afr_self_heal_t *sh = NULL; - afr_sh_algo_full_private_t *sh_priv = NULL; - gf_boolean_t is_driver_done = _gf_false; - blksize_t block_size = 0; - off_t offset = 0; - int loop = 0; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - sh_priv = sh->private; - - LOCK (&sh_priv->lock); - { - if (_gf_false == is_first_call) - sh_priv->loops_running--; - offset = sh_priv->offset; - block_size = sh->block_size; - while ((sh->op_failed == 0) && - (sh_priv->loops_running < priv->data_self_heal_window_size) - && (sh_priv->offset < sh->file_size)) { - - loop++; - gf_log (this->name, GF_LOG_TRACE, - "spawning a loop for offset %"PRId64, - sh_priv->offset); - - sh_priv->offset += sh->block_size; - sh_priv->loops_running++; - - if (_gf_false == is_first_call) - break; - - } - if (0 == sh_priv->loops_running) { - is_driver_done = _gf_true; - } - } - UNLOCK (&sh_priv->lock); - - while (loop--) { - if (sh->op_failed) { - // op failed in other loop, stop spawning more loops - sh_full_loop_driver (frame, this, _gf_false); - } else { - sh_full_read_write (frame, this, offset); - offset += block_size; - } - } - - if (is_driver_done) { - sh_full_loop_driver_done (frame, this); - } - - return 0; -} - - -int -afr_sh_algo_full (call_frame_t *frame, xlator_t *this) -{ - afr_local_t * local = NULL; - afr_self_heal_t * sh = NULL; - afr_sh_algo_full_private_t *sh_priv = NULL; - - local = frame->local; - sh = &local->self_heal; - - sh_priv = GF_CALLOC (1, sizeof (*sh_priv), - gf_afr_mt_afr_private_t); - if (!sh_priv) - goto out; - - LOCK_INIT (&sh_priv->lock); - - sh->private = sh_priv; - - local->call_count = 0; - - sh_full_loop_driver (frame, this, _gf_true); -out: - return 0; -} - - -/* - * The "diff" algorithm. Copies only those blocks whose checksums - * don't match with those of source. - */ - - -static void -sh_diff_private_cleanup (call_frame_t *frame, xlator_t *this) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - afr_self_heal_t * sh = NULL; - afr_sh_algo_diff_private_t *sh_priv = NULL; - int i = 0; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - sh_priv = sh->private; - - for (i = 0; i < priv->data_self_heal_window_size; i++) { - if (sh_priv->loops[i]) { - if (sh_priv->loops[i]->write_needed) - GF_FREE (sh_priv->loops[i]->write_needed); - - if (sh_priv->loops[i]->checksum) - GF_FREE (sh_priv->loops[i]->checksum); - - GF_FREE (sh_priv->loops[i]); - } - } - - if (sh_priv) { - if (sh_priv->loops) - GF_FREE (sh_priv->loops); - - GF_FREE (sh_priv); - } - - -} - - -static uint32_t -__make_cookie (int loop_index, int child_index) -{ - uint32_t ret = ((loop_index << 16) | child_index); - return ret; -} - - -static int -__loop_index (uint32_t cookie) -{ - return ((cookie & 0xFFFF0000) >> 16); -} - - -static int -__child_index (uint32_t cookie) -{ - return (cookie & 0x0000FFFF); -} - - -static void -sh_diff_loop_state_reset (struct sh_diff_loop_state *loop_state, int child_count) -{ - loop_state->active = _gf_false; -// loop_state->offset = 0; - - memset (loop_state->write_needed, - 0, sizeof (*loop_state->write_needed) * child_count); - - memset (loop_state->checksum, - 0, MD5_DIGEST_LEN * child_count); -} - - -static int -sh_diff_number_of_writes_needed (unsigned char *write_needed, int child_count) -{ - int writes = 0; - int i = 0; - - for (i = 0; i < child_count; i++) { - if (write_needed[i]) - writes++; - } - - return writes; -} - - -static int -sh_diff_loop_driver_done (call_frame_t *frame, xlator_t *this) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - afr_self_heal_t * sh = NULL; - afr_sh_algo_diff_private_t *sh_priv = NULL; - int32_t total_blocks = 0; - int32_t diff_blocks = 0; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - sh_priv = sh->private; - total_blocks = sh_priv->total_blocks; - diff_blocks = sh_priv->diff_blocks; - - sh_diff_private_cleanup (frame, this); - if (sh->op_failed) { - gf_log (this->name, GF_LOG_INFO, - "diff self-heal aborting on %s", - local->loc.path); - - local->self_heal.algo_abort_cbk (frame, this); - } else { - gf_log (this->name, GF_LOG_INFO, - "diff self-heal on %s: completed. " - "(%d blocks of %d were different (%.2f%%))", - local->loc.path, diff_blocks, total_blocks, - ((diff_blocks * 1.0)/total_blocks) * 100); - - local->self_heal.algo_completion_cbk (frame, this); - } - - return 0; -} - -static int -sh_diff_loop_driver (call_frame_t *frame, xlator_t *this, - gf_boolean_t is_first_call, - struct sh_diff_loop_state *loop_state); - -static int -sh_diff_loop_return (call_frame_t *rw_frame, xlator_t *this, - struct sh_diff_loop_state *loop_state) -{ - afr_private_t * priv = NULL; - afr_local_t * rw_local = NULL; - afr_self_heal_t * rw_sh = NULL; - call_frame_t *sh_frame = NULL; - afr_local_t * sh_local = NULL; - afr_self_heal_t *sh = NULL; - afr_sh_algo_diff_private_t *sh_priv = NULL; - - priv = this->private; - - rw_local = rw_frame->local; - rw_sh = &rw_local->self_heal; - - sh_frame = rw_sh->sh_frame; - sh_local = sh_frame->local; - sh = &sh_local->self_heal; - sh_priv = sh->private; - - gf_log (this->name, GF_LOG_TRACE, - "loop for offset %"PRId64" returned", loop_state->offset); - - AFR_STACK_DESTROY (rw_frame); - - sh_diff_loop_driver (sh_frame, this, _gf_false, loop_state); - - return 0; -} - - -static int -sh_diff_write_cbk (call_frame_t *rw_frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *buf, - struct iatt *postbuf) -{ - afr_private_t * priv = NULL; - afr_local_t * rw_local = NULL; - afr_self_heal_t * rw_sh = NULL; - call_frame_t *sh_frame = NULL; - afr_local_t * sh_local = NULL; - afr_self_heal_t *sh = NULL; - afr_sh_algo_diff_private_t *sh_priv = NULL; - struct sh_diff_loop_state *loop_state = NULL; - int call_count = 0; - int child_index = 0; - int loop_index = 0; - - priv = this->private; - rw_local = rw_frame->local; - rw_sh = &rw_local->self_heal; - - sh_frame = rw_sh->sh_frame; - sh_local = sh_frame->local; - sh = &sh_local->self_heal; - sh_priv = sh->private; - - child_index = __child_index ((uint32_t) (long) cookie); - loop_index = __loop_index ((uint32_t) (long) cookie); - loop_state = sh_priv->loops[loop_index]; - - gf_log (this->name, GF_LOG_TRACE, - "wrote %d bytes of data from %s to child %d, offset %"PRId64"", - op_ret, sh_local->loc.path, child_index, - loop_state->offset); - - LOCK (&sh_frame->lock); - { - if (op_ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "write to %s failed on subvolume %s (%s)", - sh_local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - - sh->op_failed = 1; - } - } - UNLOCK (&sh_frame->lock); - - call_count = afr_frame_return (rw_frame); - - if (call_count == 0) { - sh_diff_loop_return (rw_frame, this, loop_state); - } - - return 0; -} - - -static int -sh_diff_read_cbk (call_frame_t *rw_frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - struct iovec *vector, int32_t count, struct iatt *buf, - struct iobref *iobref) -{ - afr_private_t * priv = NULL; - afr_local_t * rw_local = NULL; - afr_self_heal_t * rw_sh = NULL; - afr_sh_algo_diff_private_t * sh_priv = NULL; - call_frame_t *sh_frame = NULL; - afr_local_t * sh_local = NULL; - afr_self_heal_t *sh = NULL; - int loop_index = 0; - struct sh_diff_loop_state *loop_state = NULL; - uint32_t wcookie = 0; - int i = 0; - int call_count = 0; - - priv = this->private; - rw_local = rw_frame->local; - rw_sh = &rw_local->self_heal; - - sh_frame = rw_sh->sh_frame; - sh_local = sh_frame->local; - sh = &sh_local->self_heal; - sh_priv = sh->private; - - loop_index = __loop_index ((uint32_t) (long) cookie); - loop_state = sh_priv->loops[loop_index]; - - call_count = sh_diff_number_of_writes_needed (loop_state->write_needed, - priv->child_count); - - rw_local->call_count = call_count; - - gf_log (this->name, GF_LOG_TRACE, - "read %d bytes of data from %s, offset %"PRId64"", - op_ret, sh_local->loc.path, loop_state->offset); - - if ((op_ret <= 0) || - (call_count == 0)) { - sh_diff_loop_return (rw_frame, this, loop_state); - - return 0; - } - - if (sh->file_has_holes) { - if (iov_0filled (vector, count) == 0) { - gf_log (this->name, GF_LOG_DEBUG, "0 filled block"); - sh_diff_loop_return (rw_frame, this, loop_state); - goto out; - } - } - - for (i = 0; i < priv->child_count; i++) { - if (loop_state->write_needed[i]) { - wcookie = __make_cookie (loop_index, i); - - STACK_WIND_COOKIE (rw_frame, sh_diff_write_cbk, - (void *) (long) wcookie, - priv->children[i], - priv->children[i]->fops->writev, - sh->healing_fd, vector, count, - loop_state->offset, iobref); - - if (!--call_count) - break; - } - } - -out: - return 0; -} - - -static int -sh_diff_read (call_frame_t *rw_frame, xlator_t *this, - int loop_index) -{ - afr_private_t * priv = NULL; - afr_local_t * rw_local = NULL; - afr_self_heal_t * rw_sh = NULL; - afr_sh_algo_diff_private_t * sh_priv = NULL; - struct sh_diff_loop_state *loop_state = NULL; - call_frame_t *sh_frame = NULL; - afr_local_t * sh_local = NULL; - afr_self_heal_t *sh = NULL; - uint32_t cookie = 0; - - priv = this->private; - rw_local = rw_frame->local; - rw_sh = &rw_local->self_heal; - - sh_frame = rw_sh->sh_frame; - sh_local = sh_frame->local; - sh = &sh_local->self_heal; - sh_priv = sh->private; - - loop_state = sh_priv->loops[loop_index]; - - cookie = __make_cookie (loop_index, sh->source); - - STACK_WIND_COOKIE (rw_frame, sh_diff_read_cbk, - (void *) (long) cookie, - priv->children[sh->source], - priv->children[sh->source]->fops->readv, - sh->healing_fd, sh_priv->block_size, - loop_state->offset); - - return 0; -} - - -static int -sh_diff_checksum_cbk (call_frame_t *rw_frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - uint32_t weak_checksum, uint8_t *strong_checksum) -{ - afr_private_t * priv = NULL; - afr_local_t * rw_local = NULL; - afr_self_heal_t *rw_sh = NULL; - call_frame_t *sh_frame = NULL; - afr_local_t * sh_local = NULL; - afr_self_heal_t *sh = NULL; - afr_sh_algo_diff_private_t * sh_priv = NULL; - int loop_index = 0; - int child_index = 0; - struct sh_diff_loop_state *loop_state = NULL; - int call_count = 0; - int i = 0; - int write_needed = 0; - - priv = this->private; - - rw_local = rw_frame->local; - rw_sh = &rw_local->self_heal; - - sh_frame = rw_sh->sh_frame; - sh_local = sh_frame->local; - sh = &sh_local->self_heal; - - sh_priv = sh->private; - - child_index = __child_index ((uint32_t) (long) cookie); - loop_index = __loop_index ((uint32_t) (long) cookie); - - loop_state = sh_priv->loops[loop_index]; - - if (op_ret < 0) { - gf_log (this->name, GF_LOG_ERROR, - "checksum on %s failed on subvolume %s (%s)", - sh_local->loc.path, priv->children[child_index]->name, - strerror (op_errno)); - - sh->op_failed = 1; - } else { - memcpy (loop_state->checksum + child_index * MD5_DIGEST_LEN, - strong_checksum, - MD5_DIGEST_LEN); - } - - call_count = afr_frame_return (rw_frame); - - if (call_count == 0) { - for (i = 0; i < priv->child_count; i++) { - if (sh->sources[i] || !sh_local->child_up[i]) - continue; - - if (memcmp (loop_state->checksum + (i * MD5_DIGEST_LEN), - loop_state->checksum + (sh->source * MD5_DIGEST_LEN), - MD5_DIGEST_LEN)) { - /* - Checksums differ, so this block - must be written to this sink - */ - - gf_log (this->name, GF_LOG_DEBUG, - "checksum on subvolume %s at offset %" - PRId64" differs from that on source", - priv->children[i]->name, loop_state->offset); - - write_needed = loop_state->write_needed[i] = 1; - } - } - - LOCK (&sh_priv->lock); - { - sh_priv->total_blocks++; - if (write_needed) - sh_priv->diff_blocks++; - } - UNLOCK (&sh_priv->lock); - - if (write_needed && !sh->op_failed) { - sh_diff_read (rw_frame, this, loop_index); - } else { - sh->offset += sh_priv->block_size; - - sh_diff_loop_return (rw_frame, this, loop_state); - } - } - - return 0; -} - - -static int -sh_diff_find_unused_loop (afr_sh_algo_diff_private_t *sh_priv, int max) -{ - int i = 0; - - LOCK (&sh_priv->lock); - { - for (i = 0; i < max; i++) { - if (sh_priv->loops[i]->active == _gf_false) { - sh_priv->loops[i]->active = _gf_true; - break; - } - } - } - UNLOCK (&sh_priv->lock); - - if (i == max) { - gf_log ("[sh-diff]", GF_LOG_ERROR, - "no free loops found! This shouldn't happen. Please" - " report this to gluster-devel@nongnu.org"); - } - - return i; -} - - -static int -sh_diff_checksum (call_frame_t *frame, xlator_t *this, off_t offset) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - afr_local_t * rw_local = NULL; - afr_self_heal_t * sh = NULL; - afr_self_heal_t * rw_sh = NULL; - afr_sh_algo_diff_private_t * sh_priv = NULL; - call_frame_t *rw_frame = NULL; - uint32_t cookie = 0; - int loop_index = 0; - struct sh_diff_loop_state *loop_state = NULL; - int32_t op_errno = 0; - int call_count = 0; - int i = 0; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - sh_priv = sh->private; - - rw_frame = copy_frame (frame); - if (!rw_frame) - goto out; - - ALLOC_OR_GOTO (rw_local, afr_local_t, out); - - rw_frame->local = rw_local; - rw_sh = &rw_local->self_heal; - - rw_sh->offset = sh->offset; - rw_sh->sh_frame = frame; - - call_count = sh->active_sinks + 1; /* sinks and source */ - - rw_local->call_count = call_count; - - loop_index = sh_diff_find_unused_loop (sh_priv, priv->data_self_heal_window_size); - - loop_state = sh_priv->loops[loop_index]; - loop_state->offset = offset; - - /* we need to send both the loop index and child index, - so squeeze them both into a 32-bit number */ - - cookie = __make_cookie (loop_index, sh->source); - - STACK_WIND_COOKIE (rw_frame, sh_diff_checksum_cbk, - (void *) (long) cookie, - priv->children[sh->source], - priv->children[sh->source]->fops->rchecksum, - sh->healing_fd, - offset, sh_priv->block_size); - - for (i = 0; i < priv->child_count; i++) { - if (sh->sources[i] || !local->child_up[i]) - continue; - - cookie = __make_cookie (loop_index, i); - - STACK_WIND_COOKIE (rw_frame, sh_diff_checksum_cbk, - (void *) (long) cookie, - priv->children[i], - priv->children[i]->fops->rchecksum, - sh->healing_fd, - offset, sh_priv->block_size); - - if (!--call_count) - break; - } - - return 0; - -out: - sh->op_failed = 1; - - sh_diff_loop_driver (frame, this, _gf_false, loop_state); - - return 0; -} - - -static int -sh_diff_loop_driver (call_frame_t *frame, xlator_t *this, - gf_boolean_t is_first_call, - struct sh_diff_loop_state *loop_state) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - afr_self_heal_t * sh = NULL; - afr_sh_algo_diff_private_t *sh_priv = NULL; - gf_boolean_t is_driver_done = _gf_false; - blksize_t block_size = 0; - int loop = 0; - off_t offset = 0; - char sh_type_str[256] = {0,}; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - sh_priv = sh->private; - - afr_self_heal_type_str_get(sh, sh_type_str, sizeof(sh_type_str)); - - LOCK (&sh_priv->lock); - { - if (loop_state) - sh_diff_loop_state_reset (loop_state, priv->child_count); - if (_gf_false == is_first_call) - sh_priv->loops_running--; - offset = sh_priv->offset; - block_size = sh_priv->block_size; - while ((0 == sh->op_failed) && - (sh_priv->loops_running < priv->data_self_heal_window_size) - && (sh_priv->offset < sh->file_size)) { - - loop++; - gf_log (this->name, GF_LOG_TRACE, - "spawning a loop for offset %"PRId64, - sh_priv->offset); - - sh_priv->offset += sh_priv->block_size; - sh_priv->loops_running++; - - if (_gf_false == is_first_call) - break; - - } - if (0 == sh_priv->loops_running) { - is_driver_done = _gf_true; - } - } - UNLOCK (&sh_priv->lock); - - while (loop--) { - if (sh->op_failed) { - // op failed in other loop, stop spawning more loops - sh_diff_loop_driver (frame, this, _gf_false, NULL); - } else { - sh_diff_checksum (frame, this, offset); - offset += block_size; - } - } - - if (is_driver_done) { - sh_diff_loop_driver_done (frame, this); - } - return 0; -} - - -int -afr_sh_algo_diff (call_frame_t *frame, xlator_t *this) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - afr_self_heal_t * sh = NULL; - afr_sh_algo_diff_private_t *sh_priv = NULL; - int i = 0; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - sh_priv = GF_CALLOC (1, sizeof (*sh_priv), - gf_afr_mt_afr_private_t); - if (!sh_priv) - goto err; - - sh_priv->block_size = this->ctx->page_size; - - sh->private = sh_priv; - - LOCK_INIT (&sh_priv->lock); - - local->call_count = 0; - - sh_priv->loops = GF_CALLOC (priv->data_self_heal_window_size, - sizeof (*sh_priv->loops), - gf_afr_mt_sh_diff_loop_state); - if (!sh_priv->loops) - goto err; - - for (i = 0; i < priv->data_self_heal_window_size; i++) { - sh_priv->loops[i] = GF_CALLOC (1, sizeof (*sh_priv->loops[i]), - gf_afr_mt_sh_diff_loop_state); - if (!sh_priv->loops[i]) - goto err; - - sh_priv->loops[i]->checksum = GF_CALLOC (priv->child_count, - MD5_DIGEST_LEN, gf_afr_mt_uint8_t); - if (!sh_priv->loops[i]->checksum) - goto err; - - sh_priv->loops[i]->write_needed = GF_CALLOC (priv->child_count, - sizeof (*sh_priv->loops[i]->write_needed), - gf_afr_mt_char); - if (!sh_priv->loops[i]->write_needed) - goto err; - - } - - sh_diff_loop_driver (frame, this, _gf_true, NULL); - - return 0; -err: - if (sh_priv) { - if (sh_priv->loops) { - for (i = 0; i < priv->data_self_heal_window_size; i++) { - if (sh_priv->loops[i]->write_needed) - GF_FREE (sh_priv->loops[i]->write_needed); - if (sh_priv->loops[i]->checksum) - GF_FREE (sh_priv->loops[i]->checksum); - if (sh_priv->loops[i]) - GF_FREE (sh_priv->loops[i]); - } - - GF_FREE (sh_priv->loops); - } - - GF_FREE (sh_priv); - } - return 0; -} - - -struct afr_sh_algorithm afr_self_heal_algorithms[] = { - {.name = "full", .fn = afr_sh_algo_full}, - {.name = "diff", .fn = afr_sh_algo_diff}, - {0, 0}, -}; diff --git a/xlators/cluster/afr/src/afr-self-heal-algorithm.h b/xlators/cluster/afr/src/afr-self-heal-algorithm.h deleted file mode 100644 index bff1be63a..000000000 --- a/xlators/cluster/afr/src/afr-self-heal-algorithm.h +++ /dev/null @@ -1,60 +0,0 @@ -/* - Copyright (c) 2010 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. -*/ - -#ifndef __AFR_SELF_HEAL_ALGORITHM_H__ -#define __AFR_SELF_HEAL_ALGORITHM_H__ - - -typedef int (*afr_sh_algo_fn) (call_frame_t *frame, - xlator_t *this); - -struct afr_sh_algorithm { - const char *name; - afr_sh_algo_fn fn; -}; - -extern struct afr_sh_algorithm afr_self_heal_algorithms[3]; - -typedef struct { - gf_lock_t lock; - unsigned int loops_running; - off_t offset; -} afr_sh_algo_full_private_t; - -struct sh_diff_loop_state { - off_t offset; - unsigned char *write_needed; - uint8_t *checksum; - gf_boolean_t active; -}; - -typedef struct { - size_t block_size; - - gf_lock_t lock; - unsigned int loops_running; - off_t offset; - - int32_t total_blocks; - int32_t diff_blocks; - - struct sh_diff_loop_state **loops; -} afr_sh_algo_diff_private_t; - -#endif /* __AFR_SELF_HEAL_ALGORITHM_H__ */ diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c index 65d2c52b0..4dac83113 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.c +++ b/xlators/cluster/afr/src/afr-self-heal-common.c @@ -1,2168 +1,1009 @@ /* - Copyright (c) 2008-2010 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ -#include "glusterfs.h" -#include "xlator.h" -#include "byte-order.h" + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif #include "afr.h" -#include "afr-transaction.h" -#include "afr-self-heal-common.h" #include "afr-self-heal.h" -#include "pump.h" +#include "byte-order.h" -//Intersection[child]=1 if child is part of intersection -void -afr_children_intersection_get (int32_t *set1, int32_t *set2, - int *intersection, unsigned int child_count) + +int +afr_selfheal_post_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xattr, dict_t *xdata) { - int i = 0; - - memset (intersection, 0, sizeof (*intersection) * child_count); - for (i = 0; i < child_count; i++) { - intersection[i] = afr_is_child_present (set1, child_count, i) - && afr_is_child_present (set2, child_count, - i); - } + afr_local_t *local = NULL; + + local = frame->local; + + syncbarrier_wake (&local->barrier); + + return 0; } -/** - * select_source - select a source and return it - */ int -afr_sh_select_source (int sources[], int child_count) +afr_selfheal_post_op (call_frame_t *frame, xlator_t *this, inode_t *inode, + int subvol, dict_t *xattr) { - int i = 0; - for (i = 0; i < child_count; i++) - if (sources[i]) - return i; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + loc_t loc = {0, }; - return -1; -} + priv = this->private; + local = frame->local; + loc.inode = inode_ref (inode); + uuid_copy (loc.gfid, inode->gfid); -/** - * sink_count - return number of sinks in sources array - */ + STACK_WIND (frame, afr_selfheal_post_op_cbk, priv->children[subvol], + priv->children[subvol]->fops->xattrop, &loc, + GF_XATTROP_ADD_ARRAY, xattr, NULL); -int -afr_sh_sink_count (int sources[], int child_count) -{ - int i = 0; - int sinks = 0; - for (i = 0; i < child_count; i++) - if (!sources[i]) - sinks++; - return sinks; + syncbarrier_wait (&local->barrier, 1); + + return 0; } -int -afr_sh_source_count (int sources[], int child_count) + +dict_t * +afr_selfheal_output_xattr (xlator_t *this, afr_transaction_type type, + int *output_dirty, int **output_matrix, int subvol) { - int i = 0; - int nsource = 0; + dict_t *xattr = NULL; + afr_private_t *priv = NULL; + int j = 0; + int idx = 0; + int ret = 0; + int *raw = 0; - for (i = 0; i < child_count; i++) - if (sources[i]) - nsource++; - return nsource; -} + priv = this->private; + idx = afr_index_for_transaction_type (type); -void -afr_sh_set_error (afr_self_heal_t *sh, int32_t op_errno) -{ - sh->op_ret = -1; - if (afr_error_more_important (sh->op_errno, op_errno)) - sh->op_errno = op_errno; -} + xattr = dict_new (); + if (!xattr) + return NULL; -void -afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this) -{ - afr_private_t * priv = this->private; - char *buf = NULL; - char *ptr = NULL; - int i = 0; - int j = 0; - - /* 10 digits per entry + 1 space + '[' and ']' */ - buf = GF_MALLOC (priv->child_count * 11 + 8, gf_afr_mt_char); - - for (i = 0; i < priv->child_count; i++) { - ptr = buf; - ptr += sprintf (ptr, "[ "); - for (j = 0; j < priv->child_count; j++) { - ptr += sprintf (ptr, "%d ", pending_matrix[i][j]); - } - sprintf (ptr, "]"); - gf_log (this->name, GF_LOG_TRACE, - "pending_matrix: %s", buf); - } - - GF_FREE (buf); -} + if (output_dirty[subvol]) { + /* clear dirty */ + raw = GF_CALLOC (sizeof(int), AFR_NUM_CHANGE_LOGS, gf_afr_mt_int32_t); + if (!raw) + goto err; -void -afr_init_pending_matrix (int32_t **pending_matrix, size_t child_count) -{ - int i = 0; - int j = 0; + raw[idx] = hton32 (output_dirty[subvol]); + ret = dict_set_bin (xattr, AFR_DIRTY, raw, + sizeof(int) * AFR_NUM_CHANGE_LOGS); + if (ret) + goto err; + } - GF_ASSERT (pending_matrix); + /* clear/set pending */ + for (j = 0; j < priv->child_count; j++) { + if (!output_matrix[subvol][j]) + continue; - for (i = 0; i < child_count; i++) { - for (j = 0; j < child_count; j++) { - pending_matrix[i][j] = 0; - } - } -} + raw = GF_CALLOC (sizeof(int), AFR_NUM_CHANGE_LOGS, + gf_afr_mt_int32_t); + if (!raw) + goto err; -void -afr_mark_ignorant_subvols_as_pending (int32_t **pending_matrix, - unsigned char *ignorant_subvols, - size_t child_count) -{ - int i = 0; - int j = 0; - - GF_ASSERT (pending_matrix); - GF_ASSERT (ignorant_subvols); - - for (i = 0; i < child_count; i++) { - if (ignorant_subvols[i]) { - for (j = 0; j < child_count; j++) { - if (!ignorant_subvols[j]) - pending_matrix[j][i] += 1; - } - } - } + raw[idx] = hton32 (output_matrix[subvol][j]); + + ret = dict_set_bin (xattr, priv->pending_key[j], + raw, sizeof(int) * AFR_NUM_CHANGE_LOGS); + if (ret) + goto err; + } + + return xattr; +err: + if (xattr) + dict_unref (xattr); + return NULL; } + int -afr_build_pending_matrix (char **pending_key, int32_t **pending_matrix, - dict_t *xattr[], afr_transaction_type type, - size_t child_count) -{ - /* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. */ - int32_t pending[3] = {0,}; - void *pending_raw = NULL; - int ret = -1; - int i = 0; - int j = 0; - int k = 0; - unsigned char *ignorant_subvols = NULL; - - ignorant_subvols = GF_CALLOC (sizeof (*ignorant_subvols), child_count, - gf_afr_mt_char); - if (NULL == ignorant_subvols) - goto out; - - afr_init_pending_matrix (pending_matrix, child_count); - - for (i = 0; i < child_count; i++) { - pending_raw = NULL; - - for (j = 0; j < child_count; j++) { - ret = dict_get_ptr (xattr[i], pending_key[j], - &pending_raw); - - if (ret != 0) { - /* - * There is no xattr present. This means this - * subvolume should be considered an 'ignorant' - * subvolume. - */ - - ignorant_subvols[i] = 1; - continue; - } - - memcpy (pending, pending_raw, sizeof(pending)); - k = afr_index_for_transaction_type (type); - - pending_matrix[i][j] = ntoh32 (pending[k]); - } - } - - afr_mark_ignorant_subvols_as_pending (pending_matrix, - ignorant_subvols, - child_count); - GF_FREE (ignorant_subvols); -out: - return ret; +afr_selfheal_undo_pending (call_frame_t *frame, xlator_t *this, inode_t *inode, + unsigned char *sources, unsigned char *sinks, + unsigned char *healed_sinks, afr_transaction_type type, + struct afr_reply *replies, unsigned char *locked_on) +{ + afr_private_t *priv = NULL; + int i = 0; + int j = 0; + unsigned char *pending = NULL; + int *input_dirty = NULL; + int **input_matrix = NULL; + int *output_dirty = NULL; + int **output_matrix = NULL; + dict_t *xattr = NULL; + + priv = this->private; + + pending = alloca0 (priv->child_count); + + input_dirty = alloca0 (priv->child_count * sizeof (int)); + input_matrix = ALLOC_MATRIX (priv->child_count, int); + output_dirty = alloca0 (priv->child_count * sizeof (int)); + output_matrix = ALLOC_MATRIX (priv->child_count, int); + + afr_selfheal_extract_xattr (this, replies, type, input_dirty, + input_matrix); + + for (i = 0; i < priv->child_count; i++) + if (sinks[i] && !healed_sinks[i]) + pending[i] = 1; + + for (i = 0; i < priv->child_count; i++) { + for (j = 0; j < priv->child_count; j++) { + if (pending[j]) + output_matrix[i][j] = 1; + else + output_matrix[i][j] = -input_matrix[i][j]; + } + } + + for (i = 0; i < priv->child_count; i++) { + if (!pending[i]) + output_dirty[i] = -input_dirty[i]; + } + + for (i = 0; i < priv->child_count; i++) { + if (!locked_on[i]) + /* perform post-op only on subvols we had locked + and inspected on. + */ + continue; + + xattr = afr_selfheal_output_xattr (this, type, output_dirty, + output_matrix, i); + if (!xattr) { + gf_log (this->name, GF_LOG_ERROR, + "unable to allocate xdata for subvol %d", i); + continue; + } + + afr_selfheal_post_op (frame, this, inode, i, xattr); + + dict_unref (xattr); + } + + return 0; } -typedef enum { - AFR_NODE_INNOCENT, - AFR_NODE_FOOL, - AFR_NODE_WISE, - AFR_NODE_INVALID = -1, -} afr_node_type; -typedef struct { - afr_node_type type; - int wisdom; -} afr_node_character; +void +afr_replies_copy (struct afr_reply *dst, struct afr_reply *src, int count) +{ + int i = 0; + dict_t *xdata = NULL; + + if (dst == src) + return; + + for (i = 0; i < count; i++) { + dst[i].valid = src[i].valid; + dst[i].op_ret = src[i].op_ret; + dst[i].op_errno = src[i].op_errno; + dst[i].prestat = src[i].prestat; + dst[i].poststat = src[i].poststat; + dst[i].preparent = src[i].preparent; + dst[i].postparent = src[i].postparent; + dst[i].preparent2 = src[i].preparent2; + dst[i].postparent2 = src[i].postparent2; + if (src[i].xdata) + xdata = dict_ref (src[i].xdata); + else + xdata = NULL; + if (dst[i].xdata) + dict_unref (dst[i].xdata); + dst[i].xdata = xdata; + memcpy (dst[i].checksum, src[i].checksum, + MD5_DIGEST_LENGTH); + } +} -static int -afr_sh_is_innocent (int32_t *array, int child_count) +int +afr_selfheal_fill_dirty (xlator_t *this, int *dirty, int subvol, + int idx, dict_t *xdata) { - int i = 0; - int ret = 1; /* innocent until proven guilty */ + void *pending_raw = NULL; + int pending[3] = {0, }; - for (i = 0; i < child_count; i++) { - if (array[i]) { - ret = 0; - break; - } - } + if (dict_get_ptr (xdata, AFR_DIRTY, &pending_raw)) + return -1; - return ret; -} + if (!pending_raw) + return -1; + memcpy (pending, pending_raw, sizeof(pending)); -static int -afr_sh_is_fool (int32_t *array, int i, int child_count) -{ - return array[i]; /* fool if accuses itself */ + dirty[subvol] = ntoh32 (pending[idx]); + + return 0; } -static int -afr_sh_is_wise (int32_t *array, int i, int child_count) +int +afr_selfheal_fill_matrix (xlator_t *this, int **matrix, int subvol, + int idx, dict_t *xdata) { - return !array[i]; /* wise if does not accuse itself */ -} + int i = 0; + void *pending_raw = NULL; + int pending[3] = {0, }; + afr_private_t *priv = NULL; + priv = this->private; -static int -afr_sh_all_nodes_innocent (afr_node_character *characters, - int child_count) -{ - int i = 0; - int ret = 1; + for (i = 0; i < priv->child_count; i++) { + if (dict_get_ptr (xdata, priv->pending_key[i], &pending_raw)) + continue; + + if (!pending_raw) + continue; - for (i = 0; i < child_count; i++) { - if (characters[i].type != AFR_NODE_INNOCENT) { - ret = 0; - break; - } - } + memcpy (pending, pending_raw, sizeof(pending)); - return ret; + matrix[subvol][i] = ntoh32 (pending[idx]); + } + + return 0; } -static int -afr_sh_wise_nodes_exist (afr_node_character *characters, int child_count) +int +afr_selfheal_extract_xattr (xlator_t *this, struct afr_reply *replies, + afr_transaction_type type, int *dirty, int **matrix) { - int i = 0; - int ret = 0; + afr_private_t *priv = NULL; + int i = 0; + dict_t *xdata = NULL; + int idx = -1; + + idx = afr_index_for_transaction_type (type); - for (i = 0; i < child_count; i++) { - if (characters[i].type == AFR_NODE_WISE) { - ret = 1; - break; - } - } + priv = this->private; - return ret; + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].xdata) + continue; + + xdata = replies[i].xdata; + + afr_selfheal_fill_dirty (this, dirty, i, idx, xdata); + afr_selfheal_fill_matrix (this, matrix, i, idx, xdata); + } + + return 0; } + /* - * The 'wisdom' of a wise node is 0 if any other wise node accuses it. - * It is 1 if no other wise node accuses it. - * Only wise nodes with wisdom 1 are sources. + * This function determines if a self-heal is required for a given inode, + * and if needed, in what direction. + * + * locked_on[] is the array representing servers which have been locked and + * from which xattrs have been fetched for analysis. + * + * The output of the function is by filling the arrays sources[] and sinks[]. + * + * sources[i] is set if i'th server is an eligible source for a selfheal. + * + * sinks[i] is set if i'th server needs to be healed. + * + * if sources[0..N] are all set, there is no need for a selfheal. + * + * if sinks[0..N] are all set, the inode is in split brain. * - * If no nodes with wisdom 1 exist, a split-brain has occured. */ -static void -afr_sh_compute_wisdom (int32_t *pending_matrix[], - afr_node_character characters[], int child_count) -{ - int i = 0; - int j = 0; - - for (i = 0; i < child_count; i++) { - if (characters[i].type == AFR_NODE_WISE) { - characters[i].wisdom = 1; - - for (j = 0; j < child_count; j++) { - if ((characters[j].type == AFR_NODE_WISE) - && pending_matrix[j][i]) { - - characters[i].wisdom = 0; - } - } - } - } +int +afr_selfheal_find_direction (call_frame_t *frame, xlator_t *this, + struct afr_reply *replies, + afr_transaction_type type, unsigned char *locked_on, + unsigned char *sources, unsigned char *sinks) +{ + afr_private_t *priv = NULL; + int i = 0; + int j = 0; + int *dirty = NULL; + int **matrix = NULL; + char *accused = NULL; + + priv = this->private; + + dirty = alloca0 (priv->child_count * sizeof (int)); + accused = alloca0 (priv->child_count); + matrix = ALLOC_MATRIX(priv->child_count, int); + + /* First construct the pending matrix for further analysis */ + afr_selfheal_extract_xattr (this, replies, type, dirty, matrix); + + /* Next short list all accused to exclude them from being sources */ + for (i = 0; i < priv->child_count; i++) { + for (j = 0; j < priv->child_count; j++) { + if (matrix[i][j]) + accused[j] = 1; + } + } + + /* Short list all non-accused as sources */ + memset (sources, 0, priv->child_count); + for (i = 0; i < priv->child_count; i++) { + if (!accused[i] && locked_on[i]) + sources[i] = 1; + } + + /* Everyone accused by sources are sinks */ + memset (sinks, 0, priv->child_count); + for (i = 0; i < priv->child_count; i++) { + if (!sources[i]) + continue; + for (j = 0; j < priv->child_count; j++) { + if (matrix[i][j]) + sinks[j] = 1; + } + } + + /* If any source has 'dirty' bit, pick first + 'dirty' source and make everybody else sinks */ + for (i = 0; i < priv->child_count; i++) { + if (sources[i] && dirty[i]) { + for (j = 0; j < priv->child_count; j++) { + if (j != i) { + sources[j] = 0; + sinks[j] = 1; + } + } + break; + } + } + + /* If no sources, all locked nodes are sinks - split brain */ + if (AFR_COUNT (sources, priv->child_count) == 0) { + for (i = 0; i < priv->child_count; i++) { + if (locked_on[i]) + sinks[i] = 1; + } + } + + return 0; } -static int -afr_sh_wise_nodes_conflict (afr_node_character *characters, - int child_count) +int +afr_selfheal_discover_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, + struct iatt *buf, dict_t *xdata, struct iatt *parbuf) { - int i = 0; - int ret = 1; + afr_local_t *local = NULL; + int i = -1; - for (i = 0; i < child_count; i++) { - if ((characters[i].type == AFR_NODE_WISE) - && characters[i].wisdom == 1) { + local = frame->local; + i = (long) cookie; - /* There is atleast one bona-fide wise node */ - ret = 0; - break; - } - } + local->replies[i].valid = 1; + local->replies[i].op_ret = op_ret; + local->replies[i].op_errno = op_errno; + if (buf) + local->replies[i].poststat = *buf; + if (parbuf) + local->replies[i].postparent = *parbuf; + if (xdata) + local->replies[i].xdata = dict_ref (xdata); - return ret; + syncbarrier_wake (&local->barrier); + + return 0; } -static int -afr_sh_mark_wisest_as_sources (int sources[], - afr_node_character *characters, - int child_count) +inode_t * +afr_selfheal_unlocked_lookup_on (call_frame_t *frame, inode_t *parent, + const char *name, struct afr_reply *replies, + unsigned char *lookup_on) { - int nsources = 0; - int i = 0; + loc_t loc = {0, }; + dict_t *xattr_req = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + inode_t *inode = NULL; - for (i = 0; i < child_count; i++) { - if (characters[i].wisdom == 1) { - sources[i] = 1; - nsources++; - } - } + local = frame->local; + priv = frame->this->private; - return nsources; -} + xattr_req = dict_new (); + if (!xattr_req) + return NULL; -static void -afr_compute_witness_of_fools (int32_t *witnesses, int32_t **pending_matrix, - afr_node_character *characters, - int32_t child_count) -{ - int i = 0; - int j = 0; - int witness = 0; - - GF_ASSERT (witnesses); - GF_ASSERT (pending_matrix); - GF_ASSERT (characters); - GF_ASSERT (child_count > 0); - - for (i = 0; i < child_count; i++) { - if (characters[i].type != AFR_NODE_FOOL) - continue; - - witness = 0; - for (j = 0; j < child_count; j++) { - if (i == j) - continue; - witness += pending_matrix[i][j]; - } - witnesses[i] = witness; - } -} + if (afr_xattr_req_prepare (frame->this, xattr_req) != 0) { + dict_destroy (xattr_req); + return NULL; + } -static int32_t -afr_find_biggest_witness_among_fools (int32_t *witnesses, - afr_node_character *characters, - int32_t child_count) -{ - int i = 0; - int biggest_witness = -1; + inode = inode_new (parent->table); + if (!inode) { + dict_destroy (xattr_req); + return NULL; + } - GF_ASSERT (witnesses); - GF_ASSERT (characters); - GF_ASSERT (child_count > 0); + loc.parent = inode_ref (parent); + uuid_copy (loc.pargfid, parent->gfid); + loc.name = name; + loc.inode = inode_ref (inode); - for (i = 0; i < child_count; i++) { - if (characters[i].type != AFR_NODE_FOOL) - continue; + AFR_ONLIST (lookup_on, frame, afr_selfheal_discover_cbk, lookup, &loc, + xattr_req); - if (biggest_witness < witnesses[i]) - biggest_witness = witnesses[i]; - } - return biggest_witness; -} + afr_replies_copy (replies, local->replies, priv->child_count); -int -afr_mark_fool_as_source_by_witness (int32_t *sources, int32_t *witnesses, - afr_node_character *characters, - int32_t child_count, int32_t witness) -{ - int i = 0; - int nsources = 0; - - GF_ASSERT (sources); - GF_ASSERT (witnesses); - GF_ASSERT (characters); - GF_ASSERT (child_count > 0); - - for (i = 0; i < child_count; i++) { - if (characters[i].type != AFR_NODE_FOOL) - continue; - - if (witness == witnesses[i]) { - sources[i] = 1; - nsources++; - } - } - return nsources; -} + loc_wipe (&loc); + dict_unref (xattr_req); -static int -afr_mark_biggest_of_fools_as_source (int32_t *sources, int32_t **pending_matrix, - afr_node_character *characters, - int child_count) -{ - int32_t biggest_witness = 0; - int nsources = 0; - int32_t *witnesses = NULL; - - GF_ASSERT (child_count > 0); - - witnesses = GF_CALLOC (child_count, sizeof (*witnesses), - gf_afr_mt_int32_t); - if (NULL == witnesses) { - nsources = -1; - goto out; - } - - afr_compute_witness_of_fools (witnesses, pending_matrix, characters, - child_count); - biggest_witness = afr_find_biggest_witness_among_fools (witnesses, - characters, - child_count); - nsources = afr_mark_fool_as_source_by_witness (sources, witnesses, - characters, child_count, - biggest_witness); -out: - if (witnesses) - GF_FREE (witnesses); - return nsources; + return inode; } -int -afr_mark_child_as_source_by_uid (int32_t *sources, struct iatt *bufs, - int32_t *valid_children, int child_count, - uint32_t uid) -{ - int i = 0; - int nsources = 0; - int child = 0; - - GF_ASSERT (bufs); - GF_ASSERT (valid_children); - GF_ASSERT (sources); - GF_ASSERT (child_count > 0); - - for (i = 0; i < child_count; i++) { - if (-1 == valid_children[i]) - continue; - - child = valid_children[i]; - if (uid == bufs[child].ia_uid) { - sources[child] = 1; - nsources++; - } - } - return nsources; -} int -afr_get_child_with_lowest_uid (struct iatt *bufs, int32_t *valid_children, - int child_count) +afr_selfheal_unlocked_discover_on (call_frame_t *frame, inode_t *inode, + uuid_t gfid, struct afr_reply *replies, + unsigned char *discover_on) { - int i = 0; - int smallest = -1; - int child = 0; - - GF_ASSERT (bufs); - GF_ASSERT (valid_children); - GF_ASSERT (child_count > 0); - - for (i = 0; i < child_count; i++) { - if (-1 == valid_children[i]) - continue; - child = valid_children[i]; - if ((smallest == -1) || - (bufs[child].ia_uid < bufs[smallest].ia_uid)) { - smallest = child; - } - } - return smallest; -} + loc_t loc = {0, }; + dict_t *xattr_req = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; -static int -afr_sh_mark_lowest_uid_as_source (struct iatt *bufs, int32_t *valid_children, - int child_count, int32_t *sources) -{ - int nsources = 0; - int smallest = 0; - - smallest = afr_get_child_with_lowest_uid (bufs, valid_children, - child_count); - if (smallest < 0) { - nsources = -1; - goto out; - } - nsources = afr_mark_child_as_source_by_uid (sources, bufs, - valid_children, child_count, - bufs[smallest].ia_uid); -out: - return nsources; -} + local = frame->local; + priv = frame->this->private; -char * -afr_get_character_str (afr_node_type type) -{ - char *character = NULL; - - switch (type) { - case AFR_NODE_INNOCENT: - character = "innocent"; - break; - case AFR_NODE_FOOL: - character = "fool"; - break; - case AFR_NODE_WISE: - character = "wise"; - break; - default: - character = "invalid"; - break; - } - return character; -} + xattr_req = dict_new (); + if (!xattr_req) + return -ENOMEM; -afr_node_type -afr_find_child_character_type (int32_t *pending_row, int32_t child, - int32_t child_count, const char *xlator_name) -{ - afr_node_type type = AFR_NODE_INVALID; - - GF_ASSERT (pending_row); - GF_ASSERT (child_count > 0); - GF_ASSERT ((child >= 0) && (child < child_count)); - - if (afr_sh_is_innocent (pending_row, child_count)) - type = AFR_NODE_INNOCENT; - else if (afr_sh_is_fool (pending_row, child, child_count)) - type = AFR_NODE_FOOL; - else if (afr_sh_is_wise (pending_row, child, child_count)) - type = AFR_NODE_WISE; - else - GF_ASSERT (0); - - gf_log (xlator_name, GF_LOG_DEBUG, "child %d character %s", - child, afr_get_character_str (type)); - return type; -} + if (afr_xattr_req_prepare (frame->this, xattr_req) != 0) { + dict_destroy (xattr_req); + return -ENOMEM; + } -int -afr_build_sources (xlator_t *this, dict_t **xattr, struct iatt *bufs, - int32_t **pending_matrix, int32_t *sources, - int32_t *success_children, afr_transaction_type type) -{ - afr_private_t *priv = NULL; - afr_self_heal_type sh_type = AFR_SELF_HEAL_INVALID; - int nsources = -1; + loc.inode = inode_ref (inode); + uuid_copy (loc.gfid, gfid); - priv = this->private; + AFR_ONLIST (discover_on, frame, afr_selfheal_discover_cbk, lookup, &loc, + xattr_req); - if (afr_get_children_count (success_children, priv->child_count) == 0) - goto out; + afr_replies_copy (replies, local->replies, priv->child_count); - afr_build_pending_matrix (priv->pending_key, pending_matrix, - xattr, type, priv->child_count); + loc_wipe (&loc); + dict_unref (xattr_req); - sh_type = afr_self_heal_type_for_transaction (type); - if (AFR_SELF_HEAL_INVALID == sh_type) - goto out; + return 0; +} - afr_sh_print_pending_matrix (pending_matrix, this); +int +afr_selfheal_unlocked_discover (call_frame_t *frame, inode_t *inode, + uuid_t gfid, struct afr_reply *replies) +{ + afr_private_t *priv = NULL; - nsources = afr_mark_sources (sources, pending_matrix, bufs, - priv->child_count, sh_type, - success_children, this->name); -out: - return nsources; + priv = frame->this->private; + + return afr_selfheal_unlocked_discover_on (frame, inode, gfid, replies, + priv->child_up); } -/** - * mark_sources: Mark all 'source' nodes and return number of source - * nodes found - * - * A node (a row in the pending matrix) belongs to one of - * three categories: - * - * M is the pending matrix. - * - * 'innocent' - M[i] is all zeroes - * 'fool' - M[i] has i'th element = 1 (self-reference) - * 'wise' - M[i] has i'th element = 0, others are 1 or 0. - * - * All 'innocent' nodes are sinks. If all nodes are innocent, no self-heal is - * needed. - * - * A 'wise' node can be a source. If two 'wise' nodes conflict, it is - * a split-brain. If one wise node refers to the other but the other doesn't - * refer back, the referrer is a source. - * - * All fools are sinks, unless there are no 'wise' nodes. In that case, - * one of the fools is made a source. - */ int -afr_mark_sources (int32_t *sources, int32_t **pending_matrix, struct iatt *bufs, - int32_t child_count, afr_self_heal_type type, - int32_t *valid_children, const char *xlator_name) +afr_selfheal_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) { - /* stores the 'characters' (innocent, fool, wise) of the nodes */ - - afr_node_character *characters = NULL; - int i = 0; - int nsources = -1; - xlator_t *this = NULL; - - characters = GF_CALLOC (sizeof (afr_node_character), - child_count, gf_afr_mt_afr_node_character); - if (!characters) - goto out; - - this = THIS; - - /* start clean */ - for (i = 0; i < child_count; i++) { - sources[i] = 0; - } - - nsources = 0; - for (i = 0; i < child_count; i++) { - characters[i].type = - afr_find_child_character_type (pending_matrix[i], i, - child_count, - xlator_name); - if (AFR_NODE_INVALID == characters[i].type) - gf_log (xlator_name, GF_LOG_WARNING, - "child %d had invalid xattrs", i); - } - - if ((type == AFR_SELF_HEAL_METADATA) - && afr_sh_all_nodes_innocent (characters, child_count)) { - - nsources = afr_sh_mark_lowest_uid_as_source (bufs, - valid_children, - child_count, - sources); - goto out; - } - - if (afr_sh_wise_nodes_exist (characters, child_count)) { - afr_sh_compute_wisdom (pending_matrix, characters, child_count); - - if (afr_sh_wise_nodes_conflict (characters, child_count)) { - /* split-brain */ - gf_log (this->name, GF_LOG_INFO, - "split-brain possible, no source detected"); - nsources = -1; - - } else { - nsources = afr_sh_mark_wisest_as_sources (sources, - characters, - child_count); - } - } else { - nsources = afr_mark_biggest_of_fools_as_source (sources, - pending_matrix, - characters, - child_count); - } + afr_local_t *local = NULL; + int i = 0; -out: - if (nsources == 0) { - for (i = 0; i < child_count; i++) { - if (valid_children[i] != -1) - sources[valid_children[i]] = 1; - } - } - if (characters) - GF_FREE (characters); - - gf_log (this->name, GF_LOG_DEBUG, "Number of sources: %d", nsources); - return nsources; -} + local = frame->local; + i = (long) cookie; -void -afr_sh_pending_to_delta (afr_private_t *priv, dict_t **xattr, - int32_t *delta_matrix[], int success[], - int child_count, afr_transaction_type type) -{ - /* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. */ - int32_t pending[3] = {0,}; - void *pending_raw = NULL; - int ret = 0; - int i = 0; - int j = 0; - int k = 0; - - /* start clean */ - for (i = 0; i < child_count; i++) { - for (j = 0; j < child_count; j++) { - delta_matrix[i][j] = 0; - } - } - - for (i = 0; i < child_count; i++) { - if (pending_raw) - pending_raw = NULL; - - for (j = 0; j < child_count; j++) { - ret = dict_get_ptr (xattr[i], priv->pending_key[j], - &pending_raw); - if (ret < 0) - gf_log (THIS->name, GF_LOG_DEBUG, - "Unable to get dict value."); - if (!success[j]) - continue; - - k = afr_index_for_transaction_type (type); - - if (pending_raw != NULL) { - memcpy (pending, pending_raw, sizeof(pending)); - delta_matrix[i][j] = -(ntoh32 (pending[k])); - } else { - delta_matrix[i][j] = 0; - } - - } - } + local->replies[i].valid = 1; + local->replies[i].op_ret = op_ret; + local->replies[i].op_errno = op_errno; + + syncbarrier_wake (&local->barrier); + + return 0; } int -afr_sh_delta_to_xattr (afr_private_t *priv, - int32_t *delta_matrix[], dict_t *xattr[], - int child_count, afr_transaction_type type) +afr_selfheal_locked_fill (call_frame_t *frame, xlator_t *this, + unsigned char *locked_on) { - int i = 0; - int j = 0; - int k = 0; - int ret = 0; - int32_t *pending = NULL; - - for (i = 0; i < child_count; i++) { - if (!xattr[i]) - continue; - - for (j = 0; j < child_count; j++) { - pending = GF_CALLOC (sizeof (int32_t), 3, - gf_afr_mt_int32_t); - - if (!pending) - continue; - /* 3 = data+metadata+entry */ - - k = afr_index_for_transaction_type (type); - - pending[k] = hton32 (delta_matrix[i][j]); - - ret = dict_set_bin (xattr[i], priv->pending_key[j], - pending, - 3 * sizeof (int32_t)); - if (ret < 0) - gf_log (THIS->name, GF_LOG_WARNING, - "Unable to set dict value."); - } - } - return 0; + int i = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int count = 0; + + local = frame->local; + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (local->replies[i].valid && local->replies[i].op_ret == 0) { + locked_on[i] = 1; + count++; + } else { + locked_on[i] = 0; + } + } + + return count; } int -afr_sh_has_metadata_pending (dict_t *xattr, xlator_t *this) +afr_selfheal_tryinodelk (call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, off_t off, size_t size, + unsigned char *locked_on) { - /* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. */ - int32_t pending[3] = {0,}; - void *pending_raw = NULL; - afr_private_t *priv = NULL; - int ret = -1; - int i = 0; - int j = 0; + loc_t loc = {0,}; + struct gf_flock flock = {0, }; - priv = this->private; + loc.inode = inode_ref (inode); + uuid_copy (loc.gfid, inode->gfid); - for (i = 0; i < priv->child_count; i++) { - ret = dict_get_ptr (xattr, priv->pending_key[i], - &pending_raw); + flock.l_type = F_WRLCK; + flock.l_start = off; + flock.l_len = size; - if (ret != 0) - return 0; + AFR_ONALL (frame, afr_selfheal_lock_cbk, inodelk, dom, + &loc, F_SETLK, &flock, NULL); - memcpy (pending, pending_raw, sizeof(pending)); - j = afr_index_for_transaction_type (AFR_METADATA_TRANSACTION); + loc_wipe (&loc); - if (pending[j]) - return 1; - } - - return 0; + return afr_selfheal_locked_fill (frame, this, locked_on); } int -afr_sh_has_data_pending (dict_t *xattr, xlator_t *this) +afr_selfheal_inodelk (call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, off_t off, size_t size, + unsigned char *locked_on) { - /* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. */ - int32_t pending[3] = {0,}; - void *pending_raw = NULL; - afr_private_t *priv = NULL; - int ret = -1; - int i = 0; - int j = 0; + loc_t loc = {0,}; + struct gf_flock flock = {0, }; + afr_local_t *local = NULL; + int i = 0; + afr_private_t *priv = NULL; + + priv = this->private; + local = frame->local; + + loc.inode = inode_ref (inode); + uuid_copy (loc.gfid, inode->gfid); - priv = this->private; + flock.l_type = F_WRLCK; + flock.l_start = off; + flock.l_len = size; - for (i = 0; i < priv->child_count; i++) { - ret = dict_get_ptr (xattr, priv->pending_key[i], - &pending_raw); + AFR_ONALL (frame, afr_selfheal_lock_cbk, inodelk, dom, + &loc, F_SETLK, &flock, NULL); - if (ret != 0) - return 0; + for (i = 0; i < priv->child_count; i++) { + if (local->replies[i].op_ret == -1 && + local->replies[i].op_errno == EAGAIN) { + afr_selfheal_locked_fill (frame, this, locked_on); + afr_selfheal_uninodelk (frame, this, inode, dom, off, + size, locked_on); - memcpy (pending, pending_raw, sizeof(pending)); - j = afr_index_for_transaction_type (AFR_DATA_TRANSACTION); + AFR_SEQ (frame, afr_selfheal_lock_cbk, inodelk, dom, + &loc, F_SETLKW, &flock, NULL); + break; + } + } - if (pending[j]) - return 1; - } + loc_wipe (&loc); - return 0; + return afr_selfheal_locked_fill (frame, this, locked_on); } int -afr_sh_has_entry_pending (dict_t *xattr, xlator_t *this) +afr_selfheal_uninodelk (call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, off_t off, size_t size, + const unsigned char *locked_on) { - /* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. */ - int32_t pending[3] = {0,}; - void *pending_raw = NULL; - afr_private_t *priv = NULL; - int ret = -1; - int i = 0; - int j = 0; + loc_t loc = {0,}; + struct gf_flock flock = {0, }; - priv = this->private; - for (i = 0; i < priv->child_count; i++) { - ret = dict_get_ptr (xattr, priv->pending_key[i], - &pending_raw); + loc.inode = inode_ref (inode); + uuid_copy (loc.gfid, inode->gfid); - if (ret != 0) - return 0; + flock.l_type = F_UNLCK; + flock.l_start = off; + flock.l_len = size; - memcpy (pending, pending_raw, sizeof(pending)); - j = afr_index_for_transaction_type (AFR_ENTRY_TRANSACTION); + AFR_ONLIST (locked_on, frame, afr_selfheal_lock_cbk, inodelk, + dom, &loc, F_SETLK, &flock, NULL); - if (pending[j]) - return 1; - } + loc_wipe (&loc); - return 0; + return 0; } -/** - * is_matrix_zero - return true if pending matrix is all zeroes - */ - int -afr_sh_is_matrix_zero (int32_t *pending_matrix[], int child_count) +afr_selfheal_tryentrylk (call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, const char *name, unsigned char *locked_on) { - int i = 0; - int j = 0; - - for (i = 0; i < child_count; i++) - for (j = 0; j < child_count; j++) - if (pending_matrix[i][j]) - return 0; - return 1; + loc_t loc = {0,}; + + loc.inode = inode_ref (inode); + uuid_copy (loc.gfid, inode->gfid); + + AFR_ONALL (frame, afr_selfheal_lock_cbk, entrylk, dom, + &loc, name, ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, NULL); + + loc_wipe (&loc); + + return afr_selfheal_locked_fill (frame, this, locked_on); } int -afr_sh_missing_entries_done (call_frame_t *frame, xlator_t *this) +afr_selfheal_entrylk (call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, const char *name, unsigned char *locked_on) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int i = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - -// memset (sh->child_errno, 0, sizeof (int) * priv->child_count); - memset (sh->buf, 0, sizeof (struct iatt) * priv->child_count); - - for (i = 0; i < priv->child_count; i++) { - sh->locked_nodes[i] = 0; - } - - for (i = 0; i < priv->child_count; i++) { - if (sh->xattr[i]) - dict_unref (sh->xattr[i]); - sh->xattr[i] = NULL; - } - - if (local->govinda_gOvinda || sh->op_failed) { - gf_log (this->name, GF_LOG_INFO, - "split brain found, aborting selfheal of %s", - local->loc.path); - sh->op_failed = 1; - sh->completion_cbk (frame, this); - } else { - gf_log (this->name, GF_LOG_TRACE, - "proceeding to metadata check on %s", - local->loc.path); - afr_self_heal_metadata (frame, this); - } - - return 0; -} + loc_t loc = {0,}; + afr_local_t *local = NULL; + int i = 0; + afr_private_t *priv = NULL; + priv = this->private; + local = frame->local; -static int -afr_sh_missing_entries_finish (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; + loc.inode = inode_ref (inode); + uuid_copy (loc.gfid, inode->gfid); - local = frame->local; - int_lock = &local->internal_lock; + AFR_ONALL (frame, afr_selfheal_lock_cbk, entrylk, dom, &loc, + name, ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, NULL); - int_lock->lock_cbk = afr_sh_missing_entries_done; - afr_unlock (frame, this); + for (i = 0; i < priv->child_count; i++) { + if (local->replies[i].op_ret == -1 && + local->replies[i].op_errno == EAGAIN) { + afr_selfheal_locked_fill (frame, this, locked_on); + afr_selfheal_unentrylk (frame, this, inode, dom, name, + locked_on); - return 0; -} + AFR_SEQ (frame, afr_selfheal_lock_cbk, entrylk, dom, + &loc, name, ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL); + break; + } + } -static void -afr_sh_common_lookup_resp_handler (call_frame_t *frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *buf, - dict_t *xattr, struct iatt *postparent) -{ - int child_index = 0; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - priv = this->private; - sh = &local->self_heal; - child_index = (long) cookie; - - LOCK (&frame->lock); - { - if (op_ret == 0) { - sh->buf[child_index] = *buf; - sh->parentbuf = *postparent; - sh->parentbufs[child_index] = *postparent; - sh->success_children[sh->success_count] = child_index; - sh->success_count++; - sh->xattr[child_index] = dict_ref (xattr); - } else { - gf_log (this->name, GF_LOG_ERROR, - "path %s on subvolume %s => -1 (%s)", - local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - local->self_heal.child_errno[child_index] = op_errno; - } - } - UNLOCK (&frame->lock); - return; -} + loc_wipe (&loc); -gf_boolean_t -afr_valid_ia_type (ia_type_t ia_type) -{ - switch (ia_type) { - case IA_IFSOCK: - case IA_IFREG: - case IA_IFBLK: - case IA_IFCHR: - case IA_IFIFO: - case IA_IFLNK: - case IA_IFDIR: - return _gf_true; - default: - return _gf_false; - } - return _gf_false; + return afr_selfheal_locked_fill (frame, this, locked_on); } -void -afr_sh_call_entry_impunge_recreate (call_frame_t *frame, xlator_t *this, - int child_index, struct iatt *buf, - struct iatt *postparent, - afr_impunge_done_cbk_t impunge_done) -{ - call_frame_t *impunge_frame = NULL; - afr_local_t *local = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *sh = NULL; - afr_self_heal_t *impunge_sh = NULL; - int32_t op_errno = 0; - - impunge_frame = copy_frame (frame); - if (!impunge_frame) { - op_errno = ENOMEM; - goto out; - } - - ALLOC_OR_GOTO (impunge_local, afr_local_t, out); - - local = frame->local; - sh = &local->self_heal; - impunge_frame->local = impunge_local; - impunge_sh = &impunge_local->self_heal; - impunge_sh->sh_frame = frame; - impunge_sh->active_source = sh->source; - impunge_sh->impunging_entry_mode = st_mode_from_ia (buf->ia_prot, - buf->ia_type); - impunge_sh->impunge_ret_child = child_index; - loc_copy (&impunge_local->loc, &local->loc); - sh->impunge_done = impunge_done; - impunge_local->call_count = 1; - afr_sh_entry_impunge_create (impunge_frame, this, child_index, buf, - postparent); - return; -out: - gf_log (this->name, GF_LOG_ERROR, "impunge of %s failed, reason: %s", - local->loc.path, strerror (op_errno)); - impunge_done (frame, this, child_index, -1, op_errno); -} int -afr_sh_create_entry_cbk (call_frame_t *frame, xlator_t *this, int child, - int32_t op_ret, int32_t op_errno) +afr_selfheal_unentrylk (call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, const char *name, unsigned char *locked_on) { - int call_count = 0; - afr_local_t *local = NULL; - - local = frame->local; - - if (op_ret == -1) - gf_log (this->name, GF_LOG_ERROR, - "create entry %s failed, on child %d reason, %s", - local->loc.path, child, strerror (op_errno)); - call_count = afr_frame_return (frame); - if (call_count == 0) - afr_sh_missing_entries_finish (frame, this); - return 0; -} + loc_t loc = {0,}; -static int -sh_missing_entries_create (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int type = 0; - afr_private_t *priv = NULL; - int enoent_count = 0; - int i = 0; - struct iatt *buf = NULL; - struct iatt *postparent = NULL; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - enoent_count = afr_errno_count (NULL, sh->child_errno, - priv->child_count, ENOENT); - if (enoent_count == 0) { - gf_log (this->name, GF_LOG_INFO, - "no missing files - %s. proceeding to metadata check", - local->loc.path); - /* proceed to next step - metadata self-heal */ - afr_sh_missing_entries_finish (frame, this); - return 0; - } - - buf = &sh->buf[sh->source]; - postparent = &sh->parentbufs[sh->source]; - - type = buf->ia_type; - if (!afr_valid_ia_type (type)) { - gf_log (this->name, GF_LOG_ERROR, - "%s: unknown file type: 0%o", local->loc.path, type); - local->govinda_gOvinda = 1; - afr_sh_missing_entries_finish (frame, this); - goto out; - } - - local->call_count = enoent_count; - for (i = 0; i < priv->child_count; i++) { - //If !child_up errno will be zero - if (sh->child_errno[i] != ENOENT) - continue; - afr_sh_call_entry_impunge_recreate (frame, this, i, - buf, postparent, - afr_sh_create_entry_cbk); - enoent_count--; - } - GF_ASSERT (enoent_count == 0); -out: - return 0; -} + loc.inode = inode_ref (inode); + uuid_copy (loc.gfid, inode->gfid); -void -afr_sh_missing_entries_lookup_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int32_t op_errno = 0; - ia_type_t ia_type = IA_INVAL; - int32_t nsources = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - if (afr_get_children_count (sh->success_children, - priv->child_count) == 0) { - op_errno = afr_resultant_errno_get (NULL, sh->child_errno, - priv->child_count); - goto out; - } - - if (afr_gfid_missing_count (this->name, sh->success_children, - sh->buf, priv->child_count, - local->loc.path) || - afr_conflicting_iattrs (sh->buf, sh->success_children, - priv->child_count, local->loc.path, - this->name)) { - //this can happen if finding the fresh parent dir failed - local->govinda_gOvinda = 1; - sh->op_failed = 1; - op_errno = EIO; - goto out; - } - - //now No chance for the ia_type to conflict - ia_type = sh->buf[sh->success_children[0]].ia_type; - nsources = afr_build_sources (this, sh->xattr, sh->buf, - sh->pending_matrix, sh->sources, - sh->success_children, - afr_transaction_type_get (ia_type)); - if (nsources < 0) { - gf_log (this->name, GF_LOG_INFO, "No sources for dir of %s," - " in missing entry self-heal, continuing with the rest" - " of the self-heals", local->loc.path); - op_errno = EIO; - goto out; - } - - afr_get_fresh_children (sh->success_children, sh->sources, - sh->fresh_children, priv->child_count); - sh->source = sh->fresh_children[0]; - if (sh->source == -1) { - gf_log (this->name, GF_LOG_DEBUG, "No active sources found."); - op_errno = EIO; - goto out; - } - - if (sh->gfid_sh_success_cbk) - sh->gfid_sh_success_cbk (frame, this); - sh_missing_entries_create (frame, this); - return; -out: - afr_sh_set_error (sh, op_errno); - afr_sh_missing_entries_finish (frame, this); - return; + AFR_ONLIST (locked_on, frame, afr_selfheal_lock_cbk, entrylk, + dom, &loc, name, ENTRYLK_UNLOCK, ENTRYLK_WRLCK, NULL); + + loc_wipe (&loc); + + return 0; } -static int -afr_sh_missing_entries_lookup_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, - int32_t op_errno, inode_t *inode, - struct iatt *buf, dict_t *xattr, - struct iatt *postparent) + +gf_boolean_t +afr_is_pending_set (xlator_t *this, dict_t *xdata, int type) { - int call_count = 0; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; + int idx = -1; + afr_private_t *priv = NULL; + void *pending_raw = NULL; + int *pending_int = NULL; + int i = 0; - local = frame->local; - sh = &local->self_heal; - priv = this->private; + priv = this->private; + idx = afr_index_for_transaction_type (type); - afr_sh_common_lookup_resp_handler (frame, cookie, this, op_ret, - op_errno, inode, buf, xattr, - postparent); - call_count = afr_frame_return (frame); + if (dict_get_ptr (xdata, AFR_DIRTY, &pending_raw) == 0) { + if (pending_raw) { + pending_int = pending_raw; - if (call_count == 0) - afr_sh_missing_entries_lookup_done (frame, this); + if (ntoh32 (pending_int[idx])) + return _gf_true; + } + } - return 0; -} + for (i = 0; i < priv->child_count; i++) { + if (dict_get_ptr (xdata, priv->pending_key[i], + &pending_raw)) + continue; + if (!pending_raw) + continue; + pending_int = pending_raw; -int -afr_sh_remove_entry_cbk (call_frame_t *frame, xlator_t *this, int child, - int32_t op_ret, int32_t op_errno) -{ - int call_count = 0; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - sh = &local->self_heal; - - GF_ASSERT (sh->post_remove_call); - if ((op_ret == -1) && (op_errno != ENOENT)) { - gf_log (this->name, GF_LOG_ERROR, - "purge entry %s failed, on child %d reason, %s", - local->loc.path, child, strerror (op_errno)); - LOCK (&frame->lock); - { - afr_sh_set_error (sh, EIO); - sh->op_failed = 1; - } - UNLOCK (&frame->lock); - } - call_count = afr_frame_return (frame); - if (call_count == 0) - sh->post_remove_call (frame, this); - return 0; -} + if (ntoh32 (pending_int[idx])) + return _gf_true; + } -void -afr_sh_call_entry_expunge_remove (call_frame_t *frame, xlator_t *this, - int child_index, struct iatt *buf, - afr_expunge_done_cbk_t expunge_done) -{ - call_frame_t *expunge_frame = NULL; - afr_local_t *local = NULL; - afr_local_t *expunge_local = NULL; - afr_self_heal_t *sh = NULL; - afr_self_heal_t *expunge_sh = NULL; - int32_t op_errno = 0; - - expunge_frame = copy_frame (frame); - if (!expunge_frame) { - goto out; - } - - ALLOC_OR_GOTO (expunge_local, afr_local_t, out); - - local = frame->local; - sh = &local->self_heal; - expunge_frame->local = expunge_local; - expunge_sh = &expunge_local->self_heal; - expunge_sh->sh_frame = frame; - loc_copy (&expunge_local->loc, &local->loc); - sh->expunge_done = expunge_done; - afr_sh_entry_expunge_remove (expunge_frame, this, child_index, buf); - return; -out: - gf_log (this->name, GF_LOG_ERROR, "Expunge of %s failed, reason: %s", - local->loc.path, strerror (op_errno)); - expunge_done (frame, this, child_index, -1, op_errno); + return _gf_false; } -void -afr_sh_remove_stale_lookup_info (afr_self_heal_t *sh, int32_t *success_children, - int32_t *fresh_children, - unsigned int child_count) -{ - int i = 0; - - for (i = 0; i < child_count; i++) { - if (afr_is_child_present (success_children, child_count, i) && - !afr_is_child_present (fresh_children, child_count, i)) { - sh->child_errno[i] = ENOENT; - GF_ASSERT (sh->xattr[i]); - dict_unref (sh->xattr[i]); - sh->xattr[i] = NULL; - } - } -} -int -afr_sh_purge_stale_entries_done (call_frame_t *frame, xlator_t *this) +gf_boolean_t +afr_is_data_set (xlator_t *this, dict_t *xdata) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - if (sh->op_failed) { - afr_sh_missing_entries_finish (frame, this); - } else { - if (afr_gfid_missing_count (this->name, sh->fresh_children, - sh->buf, priv->child_count, - local->loc.path)) { - afr_sh_common_lookup (frame, this, &local->loc, - afr_sh_missing_entries_lookup_cbk, - _gf_true); - } else { - //No need to set gfid so goto missing entries lookup done - //Behave as if you have done the lookup - afr_sh_remove_stale_lookup_info (sh, - sh->success_children, - sh->fresh_children, - priv->child_count); - afr_children_copy (sh->success_children, - sh->fresh_children, - priv->child_count); - afr_sh_missing_entries_lookup_done (frame, this); - } - } - return 0; + return afr_is_pending_set (this, xdata, AFR_DATA_TRANSACTION); } gf_boolean_t -afr_sh_purge_entry_condition (afr_local_t *local, afr_private_t *priv, - int child) +afr_is_metadata_set (xlator_t *this, dict_t *xdata) { - afr_self_heal_t *sh = NULL; - - sh = &local->self_heal; - - if (local->child_up[child] && - (!afr_is_child_present (sh->fresh_parent_dirs, priv->child_count, - child)) - && (sh->child_errno[child] != ENOENT)) - return _gf_true; - - return _gf_false; + return afr_is_pending_set (this, xdata, AFR_METADATA_TRANSACTION); } gf_boolean_t -afr_sh_purge_stale_entry_condition (afr_local_t *local, afr_private_t *priv, - int child) +afr_is_entry_set (xlator_t *this, dict_t *xdata) { - afr_self_heal_t *sh = NULL; - - sh = &local->self_heal; - - if (local->child_up[child] && - (!afr_is_child_present (sh->fresh_children, priv->child_count, - child)) - && (sh->child_errno[child] != ENOENT)) - return _gf_true; - - return _gf_false; + return afr_is_pending_set (this, xdata, AFR_ENTRY_TRANSACTION); } -void -afr_sh_purge_entry_common (call_frame_t *frame, xlator_t *this, - gf_boolean_t purge_condition (afr_local_t *local, - afr_private_t *priv, - int child)) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - afr_self_heal_t *sh = NULL; - int i = 0; - int call_count = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - for (i = 0; i < priv->child_count; i++) { - if (purge_condition (local, priv, i)) - call_count++; - } - - if (call_count == 0) { - sh->post_remove_call (frame, this); - goto out; - } - - local->call_count = call_count; - for (i = 0; i < priv->child_count; i++) { - if (!purge_condition (local, priv, i)) - continue; - afr_sh_call_entry_expunge_remove (frame, this, - (long) i, &sh->buf[i], - afr_sh_remove_entry_cbk); - } -out: - return; -} void -afr_sh_purge_entry (call_frame_t *frame, xlator_t *this) +afr_inode_link (inode_t *inode, struct iatt *iatt) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; + inode_t *linked_inode = NULL; + + linked_inode = inode_link (inode, NULL, NULL, iatt); - local = frame->local; - sh = &local->self_heal; - sh->post_remove_call = afr_sh_missing_entries_finish; + uuid_copy (inode->gfid, iatt->ia_gfid); + inode->ia_type = iatt->ia_type; - afr_sh_purge_entry_common (frame, this, afr_sh_purge_entry_condition); + if (linked_inode) { + inode_lookup (linked_inode); + inode_unref (linked_inode); + } } -void -afr_sh_purge_stale_entry (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int i = 0; - local = frame->local; - sh = &local->self_heal; - priv = this->private; +/* + * This function inspects the looked up replies (in an unlocked manner) + * and decides whether a locked verification and possible healing is + * required or not. It updates the three booleans for each type + * of healing. If the boolean flag gets set to FALSE, then we are sure + * no healing is required. If the boolean flag gets set to TRUE then + * we have to proceed with locked reinspection. + */ - sh->post_remove_call = afr_sh_purge_stale_entries_done; +int +afr_selfheal_unlocked_inspect (call_frame_t *frame, xlator_t *this, + inode_t *inode, uuid_t gfid, + gf_boolean_t *data_selfheal, + gf_boolean_t *metadata_selfheal, + gf_boolean_t *entry_selfheal) +{ + afr_private_t *priv = NULL; + int i = 0; + int valid_cnt = 0; + struct iatt first = {0, }; + struct afr_reply *replies = NULL; + int ret = -1; + + priv = this->private; + + replies = alloca0 (sizeof (*replies) * priv->child_count); + + ret = afr_selfheal_unlocked_discover (frame, inode, gfid, replies); + if (ret) + return ret; + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) + continue; + if (replies[i].op_ret == -1) + continue; + + if (afr_is_data_set (this, replies[i].xdata)) + *data_selfheal = _gf_true; + + if (afr_is_metadata_set (this, replies[i].xdata)) + *metadata_selfheal = _gf_true; - for (i = 0; i < priv->child_count; i++) { - if (afr_is_child_present (sh->fresh_children, - priv->child_count, i)) - continue; + if (afr_is_entry_set (this, replies[i].xdata)) + *entry_selfheal = _gf_true; - if ((!local->child_up[i]) || sh->child_errno[i] != 0) - continue; + valid_cnt ++; + if (valid_cnt == 1) { + first = replies[i].poststat; + continue; + } - GF_ASSERT (!uuid_is_null (sh->entrybuf.ia_gfid) || - uuid_is_null (sh->buf[i].ia_gfid)); + if (!IA_EQUAL (first, replies[i].poststat, type)) { + gf_log (this->name, GF_LOG_ERROR, + "TYPE mismatch %d vs %d on %s for gfid:%s", + (int) first.ia_type, + (int) replies[i].poststat.ia_type, + priv->children[i]->name, + uuid_utoa (replies[i].poststat.ia_gfid)); + return -EIO; + } - if ((sh->entrybuf.ia_type != sh->buf[i].ia_type) || - (uuid_compare (sh->buf[i].ia_gfid, - sh->entrybuf.ia_gfid))) - continue; + if (!IA_EQUAL (first, replies[i].poststat, uid)) { + gf_log (this->name, GF_LOG_DEBUG, + "UID mismatch %d vs %d on %s for gfid:%s", + (int) first.ia_uid, + (int) replies[i].poststat.ia_uid, + priv->children[i]->name, + uuid_utoa (replies[i].poststat.ia_gfid)); - afr_fresh_children_add_child (sh->fresh_children, - i, priv->child_count); + *metadata_selfheal = _gf_true; + } - } - afr_sh_purge_entry_common (frame, this, - afr_sh_purge_stale_entry_condition); -} + if (!IA_EQUAL (first, replies[i].poststat, gid)) { + gf_log (this->name, GF_LOG_DEBUG, + "GID mismatch %d vs %d on %s for gfid:%s", + (int) first.ia_uid, + (int) replies[i].poststat.ia_uid, + priv->children[i]->name, + uuid_utoa (replies[i].poststat.ia_gfid)); -void -afr_sh_save_child_iatts_from_policy (int32_t *children, struct iatt *bufs, - struct iatt *save, - unsigned int child_count) -{ - int i = 0; - int child = 0; - gf_boolean_t saved = _gf_false; - - GF_ASSERT (save); - //if iatt buf with gfid exists sets it - for (i = 0; i < child_count; i++) { - child = children[i]; - if (child == -1) - break; - *save = bufs[child]; - saved = _gf_true; - if (!uuid_is_null (save->ia_gfid)) - break; - } - GF_ASSERT (saved); -} + *metadata_selfheal = _gf_true; + } -void -afr_sh_children_lookup_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int32_t fresh_child_enoents = 0; - int32_t fresh_parent_count = 0; - int32_t op_errno = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - if (afr_get_children_count (sh->success_children, - priv->child_count) == 0) { - op_errno = afr_resultant_errno_get (NULL, sh->child_errno, - priv->child_count); - goto fail; - } - - //make intersection of (success_children & fresh_parent_dirs) fresh_children - //the other success_children will be added to it if they are not stale - afr_children_intersection_get (sh->success_children, - sh->fresh_parent_dirs, - sh->sources, priv->child_count); - afr_get_fresh_children (sh->success_children, sh->sources, - sh->fresh_children, priv->child_count); - memset (sh->sources, 0, sizeof (*sh->sources) * priv->child_count); - - fresh_parent_count = afr_get_children_count (sh->fresh_parent_dirs, - priv->child_count); - //we need the enoent count of the subvols present in fresh_parent_dirs - fresh_child_enoents = afr_errno_count (sh->fresh_parent_dirs, - sh->child_errno, - priv->child_count, ENOENT); - if (fresh_child_enoents == fresh_parent_count) { - afr_sh_set_error (sh, ENOENT); - sh->op_failed = 1; - afr_sh_purge_entry (frame, this); - } else if (!afr_conflicting_iattrs (sh->buf, sh->fresh_children, - priv->child_count, local->loc.path, - this->name)) { - afr_sh_save_child_iatts_from_policy (sh->fresh_children, - sh->buf, &sh->entrybuf, - priv->child_count); - afr_update_gfid_from_iatts (sh->sh_gfid_req, sh->buf, - sh->fresh_children, - priv->child_count); - afr_sh_purge_stale_entry (frame, this); - } else { - op_errno = EIO; - local->govinda_gOvinda = 1; - goto fail; - } - - return; - -fail: - afr_sh_set_error (sh, op_errno); - afr_sh_missing_entries_finish (frame, this); - return; -} + if (!IA_EQUAL (first, replies[i].poststat, prot)) { + gf_log (this->name, GF_LOG_DEBUG, + "MODE mismatch %d vs %d on %s for gfid:%s", + (int) st_mode_from_ia (first.ia_prot, 0), + (int) st_mode_from_ia (replies[i].poststat.ia_prot, 0), + priv->children[i]->name, + uuid_utoa (replies[i].poststat.ia_gfid)); -static int -afr_sh_children_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, dict_t *xattr, - struct iatt *postparent) -{ - int call_count = 0; + *metadata_selfheal = _gf_true; + } - afr_sh_common_lookup_resp_handler (frame, cookie, this, op_ret, - op_errno, inode, buf, xattr, - postparent); - call_count = afr_frame_return (frame); + if (IA_ISREG(first.ia_type) && + !IA_EQUAL (first, replies[i].poststat, size)) { + gf_log (this->name, GF_LOG_DEBUG, + "SIZE mismatch %lld vs %lld on %s for gfid:%s", + (long long) first.ia_size, + (long long) replies[i].poststat.ia_size, + priv->children[i]->name, + uuid_utoa (replies[i].poststat.ia_gfid)); - if (call_count == 0) - afr_sh_children_lookup_done (frame, this); + *data_selfheal = _gf_true; + } + } - return 0; -} + if (valid_cnt > 0) + afr_inode_link (inode, &first); -static int -afr_sh_find_fresh_parents (call_frame_t *frame, xlator_t *this) -{ - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - int enoent_count = 0; - int nsources = 0; - int source = -1; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - /* If We can't find a fresh parent directory here, - * we wont know which subvol is correct without finding a parent dir - * upwards which has correct xattrs, for that we may have to - * do lookups till root, we dont wanna do that, - * instead make sure that if there are conflicting gfid - * parent dirs, self-heal thus lookup is failed with EIO. - * if there are missing entries we dont know whether to delete or - * create so fail with EIO, - * If there are conflicting xattr fail with EIO. - */ - if (afr_get_children_count (sh->success_children, - priv->child_count) == 0) { - gf_log (this->name, GF_LOG_ERROR, "Parent dir lookup failed " - "for %s, in missing entry self-heal, continuing with " - "the rest of the self-heals", local->loc.path); - goto out; - } - - enoent_count = afr_errno_count (NULL, sh->child_errno, - priv->child_count, ENOENT); - if (enoent_count > 0) { - gf_log (this->name, GF_LOG_INFO, "Parent dir missing for %s," - " in missing entry self-heal, continuing with the rest" - " of the self-heals", local->loc.path); - goto out; - } - - if (afr_conflicting_iattrs (sh->buf, sh->success_children, - priv->child_count, sh->parent_loc.path, - this->name)) { - gf_log (this->name, GF_LOG_INFO, "conflicting stat info for " - "parent dirs of %s", local->loc.path); - goto out; - } - - nsources = afr_build_sources (this, sh->xattr, sh->buf, - sh->pending_matrix, sh->sources, - sh->success_children, - AFR_ENTRY_TRANSACTION); - if (nsources < 0) { - gf_log (this->name, GF_LOG_INFO, "No sources for dir of %s," - " in missing entry self-heal, continuing with the rest" - " of the self-heals", local->loc.path); - goto out; - } - - source = afr_sh_select_source (sh->sources, priv->child_count); - if (source == -1) { - GF_ASSERT (0); - gf_log (this->name, GF_LOG_DEBUG, "No active sources found."); - goto out; - } - afr_get_fresh_children (sh->success_children, sh->sources, - sh->fresh_parent_dirs, priv->child_count); - afr_sh_common_lookup (frame, this, &local->loc, - afr_sh_children_lookup_cbk, _gf_false); - return 0; + if (valid_cnt < 2) + return -ENOTCONN; -out: - afr_sh_set_error (sh, EIO); - sh->op_failed = 1; - afr_sh_missing_entries_finish (frame, this); - return 0; + return 0; } -int -afr_sh_conflicting_entry_lookup_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *buf, - dict_t *xattr, struct iatt *postparent) -{ - int call_count = 0; - afr_sh_common_lookup_resp_handler (frame, cookie, this, op_ret, - op_errno, inode, buf, xattr, - postparent); - call_count = afr_frame_return (frame); - - if (call_count == 0) - afr_sh_find_fresh_parents (frame, this); - - return 0; -} - -void -afr_sh_common_reset (afr_self_heal_t *sh, unsigned int child_count) +inode_t * +afr_inode_find (xlator_t *this, uuid_t gfid) { - int i = 0; - - for (i = 0; i < child_count; i++) { - memset (&sh->buf[i], 0, sizeof (sh->buf[i])); - memset (&sh->parentbufs[i], 0, sizeof (sh->parentbufs[i])); - sh->child_errno[i] = 0; - } - memset (&sh->parentbuf, 0, sizeof (sh->parentbuf)); - sh->success_count = 0; - afr_reset_children (sh->success_children, child_count); - afr_reset_children (sh->fresh_children, child_count); - afr_reset_xattr (sh->xattr, child_count); -} + inode_table_t *table = NULL; + inode_t *inode = NULL; -/* afr self-heal state will be lost if this call is made - * please check the afr_sh_common_reset that is called in this function - */ -int -afr_sh_common_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, - afr_lookup_cbk_t lookup_cbk, gf_boolean_t set_gfid) -{ - afr_local_t *local = NULL; - int i = 0; - int call_count = 0; - afr_private_t *priv = NULL; - dict_t *xattr_req = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - priv = this->private; - sh = &local->self_heal; - - call_count = afr_up_children_count (priv->child_count, - local->child_up); - - local->call_count = call_count; - - xattr_req = dict_new(); - - if (xattr_req) { - afr_xattr_req_prepare (this, xattr_req, loc->path); - if (set_gfid) { - gf_log (this->name, GF_LOG_DEBUG, - "looking up %s with gfid: %s", - local->loc.path, uuid_utoa (sh->sh_gfid_req)); - GF_ASSERT (!uuid_is_null (sh->sh_gfid_req)); - afr_set_dict_gfid (xattr_req, sh->sh_gfid_req); - } - } - - afr_sh_common_reset (sh, priv->child_count); - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - gf_log (this->name, GF_LOG_DEBUG, - "looking up %s on subvolume %s", - local->loc.path, priv->children[i]->name); - - STACK_WIND_COOKIE (frame, - lookup_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->lookup, - loc, xattr_req); - - if (!--call_count) - break; - } - } - - if (xattr_req) - dict_unref (xattr_req); - - return 0; -} + table = this->itable; + if (!table) + return NULL; + inode = inode_find (table, gfid); + if (inode) + return inode; + inode = inode_new (table); + if (!inode) + return NULL; -int -afr_sh_post_nb_entrylk_conflicting_sh_cbk (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - int_lock = &local->internal_lock; - sh = &local->self_heal; - - if (int_lock->lock_op_ret < 0) { - gf_log (this->name, GF_LOG_INFO, - "Non blocking entrylks failed."); - afr_sh_missing_entries_done (frame, this); - } else { - - gf_log (this->name, GF_LOG_DEBUG, - "Non blocking entrylks done. Proceeding to FOP"); - afr_sh_common_lookup (frame, this, &sh->parent_loc, - afr_sh_conflicting_entry_lookup_cbk, - _gf_false); - } - - return 0; -} + uuid_copy (inode->gfid, gfid); -int -afr_sh_post_nb_entrylk_gfid_sh_cbk (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - - local = frame->local; - int_lock = &local->internal_lock; - - if (int_lock->lock_op_ret < 0) { - gf_log (this->name, GF_LOG_INFO, - "Non blocking entrylks failed."); - afr_sh_missing_entries_done (frame, this); - } else { - gf_log (this->name, GF_LOG_DEBUG, - "Non blocking entrylks done. Proceeding to FOP"); - afr_sh_common_lookup (frame, this, &local->loc, - afr_sh_missing_entries_lookup_cbk, - _gf_true); - } - - return 0; + return inode; } -int -afr_sh_entrylk (call_frame_t *frame, xlator_t *this, loc_t *loc, - char *base_name, afr_lock_cbk_t lock_cbk) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - local = frame->local; - int_lock = &local->internal_lock; - sh = &local->self_heal; - - int_lock->transaction_lk_type = AFR_SELFHEAL_LK; - int_lock->selfheal_lk_type = AFR_ENTRY_SELF_HEAL_LK; - - afr_set_lock_number (frame, this); +call_frame_t * +afr_frame_create (xlator_t *this) +{ + call_frame_t *frame = NULL; + afr_local_t *local = NULL; + int op_errno = 0; + pid_t pid = -1; - int_lock->lk_basename = base_name; - int_lock->lk_loc = loc; - int_lock->lock_cbk = lock_cbk; + frame = create_frame (this, this->ctx->pool); + if (!frame) + return NULL; - afr_nonblocking_entrylk (frame, this); + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) { + STACK_DESTROY (frame->root); + return NULL; + } - return 0; -} + syncopctx_setfspid (&pid); -static int -afr_self_heal_parent_entrylk (call_frame_t *frame, xlator_t *this, - afr_lock_cbk_t lock_cbk) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - - local = frame->local; - int_lock = &local->internal_lock; - sh = &local->self_heal; - priv = this->private; - - gf_log (this->name, GF_LOG_TRACE, - "attempting to recreate missing entries for path=%s", - local->loc.path); - - GF_ASSERT (local->loc.parent); - afr_build_parent_loc (&sh->parent_loc, &local->loc); - - afr_sh_entrylk (frame, this, &sh->parent_loc, NULL, - lock_cbk); - return 0; -} + frame->root->pid = pid; -static int -afr_self_heal_conflicting_entries (call_frame_t *frame, xlator_t *this) -{ - afr_self_heal_parent_entrylk (frame, this, - afr_sh_post_nb_entrylk_conflicting_sh_cbk); - return 0; -} + afr_set_lk_owner (frame, this, frame->root); -static int -afr_self_heal_gfids (call_frame_t *frame, xlator_t *this) -{ - afr_self_heal_parent_entrylk (frame, this, - afr_sh_post_nb_entrylk_gfid_sh_cbk); - return 0; + return frame; } -afr_local_t *afr_local_copy (afr_local_t *l, xlator_t *this) -{ - afr_private_t *priv = NULL; - afr_local_t *lc = NULL; - afr_self_heal_t *sh = NULL; - afr_self_heal_t *shc = NULL; - - priv = this->private; - - sh = &l->self_heal; - - lc = GF_CALLOC (1, sizeof (afr_local_t), - gf_afr_mt_afr_local_t); - if (!lc) - goto out; - - shc = &lc->self_heal; - - shc->unwind = sh->unwind; - shc->gfid_sh_success_cbk = sh->gfid_sh_success_cbk; - shc->need_missing_entry_self_heal = sh->need_missing_entry_self_heal; - shc->need_gfid_self_heal = sh->need_gfid_self_heal; - shc->need_data_self_heal = sh->need_data_self_heal; - shc->need_metadata_self_heal = sh->need_metadata_self_heal; - shc->need_entry_self_heal = sh->need_entry_self_heal; - shc->forced_merge = sh->forced_merge; - shc->healing_fd_opened = sh->healing_fd_opened; - shc->data_lock_held = sh->data_lock_held; - if (sh->healing_fd && !sh->healing_fd_opened) - shc->healing_fd = fd_ref (sh->healing_fd); - else - shc->healing_fd = sh->healing_fd; - shc->background = sh->background; - shc->type = sh->type; - - uuid_copy (shc->sh_gfid_req, sh->sh_gfid_req); - if (l->loc.path) - loc_copy (&lc->loc, &l->loc); - - lc->child_up = memdup (l->child_up, priv->child_count); - if (l->xattr_req) - lc->xattr_req = dict_ref (l->xattr_req); - - if (l->cont.lookup.inode) - lc->cont.lookup.inode = inode_ref (l->cont.lookup.inode); - if (l->cont.lookup.xattr) - lc->cont.lookup.xattr = dict_ref (l->cont.lookup.xattr); - if (l->internal_lock.inode_locked_nodes) - lc->internal_lock.inode_locked_nodes = - memdup (l->internal_lock.inode_locked_nodes, - priv->child_count); - else - lc->internal_lock.inode_locked_nodes = - GF_CALLOC (sizeof (*l->internal_lock.inode_locked_nodes), - priv->child_count, - gf_afr_mt_char); - if (l->internal_lock.entry_locked_nodes) - lc->internal_lock.entry_locked_nodes = - memdup (l->internal_lock.entry_locked_nodes, - priv->child_count); - else - lc->internal_lock.entry_locked_nodes = - GF_CALLOC (sizeof (*l->internal_lock.entry_locked_nodes), - priv->child_count, - gf_afr_mt_char); - if (l->internal_lock.locked_nodes) - lc->internal_lock.locked_nodes = - memdup (l->internal_lock.locked_nodes, - priv->child_count); - else - lc->internal_lock.locked_nodes = - GF_CALLOC (sizeof (*l->internal_lock.locked_nodes), - priv->child_count, - gf_afr_mt_char); - - lc->internal_lock.inodelk_lock_count = - l->internal_lock.inodelk_lock_count; - lc->internal_lock.entrylk_lock_count = - l->internal_lock.entrylk_lock_count; -out: - return lc; -} +/* + * This is the entry point for healing a given GFID + */ int -afr_self_heal_completion_cbk (call_frame_t *bgsh_frame, xlator_t *this) +afr_selfheal (xlator_t *this, uuid_t gfid) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - afr_self_heal_t * sh = NULL; - char sh_type_str[256] = {0,}; - gf_boolean_t split_brain = _gf_false; - - priv = this->private; - local = bgsh_frame->local; - sh = &local->self_heal; - - if (local->govinda_gOvinda) - split_brain = _gf_true; - - afr_set_split_brain (this, sh->inode, split_brain); - - afr_self_heal_type_str_get (sh, sh_type_str, - sizeof(sh_type_str)); - if (sh->op_failed) { - gf_log (this->name, GF_LOG_ERROR, "background %s self-heal " - "failed on %s", sh_type_str, local->loc.path); - } else { - gf_log (this->name, GF_LOG_INFO, "background %s self-heal " - "completed on %s", sh_type_str, local->loc.path); - } - - FRAME_SU_UNDO (bgsh_frame, afr_local_t); - - if (!sh->unwound) { - sh->unwind (sh->orig_frame, this, sh->op_ret, sh->op_errno); - } - - if (sh->background) { - LOCK (&priv->lock); - { - priv->background_self_heals_started--; - } - UNLOCK (&priv->lock); - } - - AFR_STACK_DESTROY (bgsh_frame); - - return 0; -} + inode_t *inode = NULL; + call_frame_t *frame = NULL; + int ret = -1; + gf_boolean_t data_selfheal = _gf_false; + gf_boolean_t metadata_selfheal = _gf_false; + gf_boolean_t entry_selfheal = _gf_false; -int -afr_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int i = 0; - - call_frame_t *sh_frame = NULL; - afr_local_t *sh_local = NULL; - - local = frame->local; - priv = this->private; - - GF_ASSERT (local->loc.path); - - if (local->self_heal.background) { - LOCK (&priv->lock); - { - if (priv->background_self_heals_started - < priv->background_self_heal_count) { - priv->background_self_heals_started++; - - - } else { - local->self_heal.background = _gf_false; - } - } - UNLOCK (&priv->lock); - } - - gf_log (this->name, GF_LOG_TRACE, - "performing self heal on %s (metadata=%d data=%d entry=%d)", - local->loc.path, - local->self_heal.need_metadata_self_heal, - local->self_heal.need_data_self_heal, - local->self_heal.need_entry_self_heal); - - sh_frame = copy_frame (frame); - afr_set_lk_owner (sh_frame, this); - - sh_local = afr_local_copy (local, this); - sh_frame->local = sh_local; - sh = &sh_local->self_heal; - - sh->inode = inode_ref (inode); - - sh->orig_frame = frame; - - sh->completion_cbk = afr_self_heal_completion_cbk; - - sh->buf = GF_CALLOC (priv->child_count, sizeof (struct iatt), - gf_afr_mt_iatt); - sh->parentbufs = GF_CALLOC (priv->child_count, sizeof (struct iatt), - gf_afr_mt_iatt); - sh->child_errno = GF_CALLOC (priv->child_count, sizeof (int), - gf_afr_mt_int); - sh->success = GF_CALLOC (priv->child_count, sizeof (int), - gf_afr_mt_int); - sh->xattr = GF_CALLOC (priv->child_count, sizeof (dict_t *), - gf_afr_mt_dict_t); - sh->sources = GF_CALLOC (sizeof (*sh->sources), priv->child_count, - gf_afr_mt_int); - sh->locked_nodes = GF_CALLOC (sizeof (*sh->locked_nodes), - priv->child_count, - gf_afr_mt_int); - - sh->pending_matrix = GF_CALLOC (sizeof (int32_t *), priv->child_count, - gf_afr_mt_int32_t); - - for (i = 0; i < priv->child_count; i++) { - sh->pending_matrix[i] = GF_CALLOC (sizeof (int32_t), - priv->child_count, - gf_afr_mt_int32_t); - } - - sh->delta_matrix = GF_CALLOC (sizeof (int32_t *), priv->child_count, - gf_afr_mt_int32_t); - for (i = 0; i < priv->child_count; i++) { - sh->delta_matrix[i] = GF_CALLOC (sizeof (int32_t), - priv->child_count, - gf_afr_mt_int32_t); - } - sh->success_children = afr_fresh_children_create (priv->child_count); - sh->fresh_children = afr_fresh_children_create (priv->child_count); - sh->fresh_parent_dirs = afr_fresh_children_create (priv->child_count); - - - FRAME_SU_DO (sh_frame, afr_local_t); - if (sh->need_missing_entry_self_heal) { - afr_self_heal_conflicting_entries (sh_frame, this); - } else if (sh->need_gfid_self_heal) { - GF_ASSERT (!uuid_is_null (sh->sh_gfid_req)); - afr_self_heal_gfids (sh_frame, this); - } else { - gf_log (this->name, GF_LOG_TRACE, - "proceeding to metadata check on %s", - local->loc.path); - - afr_sh_missing_entries_done (sh_frame, this); - } - - return 0; -} + inode = afr_inode_find (this, gfid); + if (!inode) + goto out; -void -afr_self_heal_type_str_get (afr_self_heal_t *self_heal_p, char *str, - size_t size) -{ - GF_ASSERT (str && (size > strlen (" missing-entry gfid " - "meta-data data entry"))); + frame = afr_frame_create (this); + if (!frame) + goto out; - if (self_heal_p->need_metadata_self_heal) { - snprintf (str, size, " meta-data"); - } + ret = afr_selfheal_unlocked_inspect (frame, this, inode, gfid, + &data_selfheal, + &metadata_selfheal, + &entry_selfheal); + if (ret) + goto out; - if (self_heal_p->need_data_self_heal) { - snprintf (str + strlen(str), size - strlen(str), " data"); - } + if (data_selfheal) + afr_selfheal_data (frame, this, inode); - if (self_heal_p->need_entry_self_heal) { - snprintf (str + strlen(str), size - strlen(str), " entry"); - } + if (metadata_selfheal) + afr_selfheal_metadata (frame, this, inode); - if (self_heal_p->need_missing_entry_self_heal) { - snprintf (str + strlen(str), size - strlen(str), - " missing-entry"); - } + if (entry_selfheal) + afr_selfheal_entry (frame, this, inode); - if (self_heal_p->need_gfid_self_heal) { - snprintf (str + strlen(str), size - strlen(str), " gfid"); - } -} + inode_forget (inode, 1); +out: + if (inode) + inode_unref (inode); + if (frame) + AFR_STACK_DESTROY (frame); -afr_self_heal_type -afr_self_heal_type_for_transaction (afr_transaction_type type) -{ - afr_self_heal_type sh_type = AFR_SELF_HEAL_INVALID; - - switch (type) { - case AFR_DATA_TRANSACTION: - sh_type = AFR_SELF_HEAL_DATA; - break; - case AFR_METADATA_TRANSACTION: - sh_type = AFR_SELF_HEAL_METADATA; - break; - case AFR_ENTRY_TRANSACTION: - sh_type = AFR_SELF_HEAL_ENTRY; - break; - case AFR_ENTRY_RENAME_TRANSACTION: - GF_ASSERT (0); - break; - } - return sh_type; + return ret; } diff --git a/xlators/cluster/afr/src/afr-self-heal-common.h b/xlators/cluster/afr/src/afr-self-heal-common.h deleted file mode 100644 index f0091dc02..000000000 --- a/xlators/cluster/afr/src/afr-self-heal-common.h +++ /dev/null @@ -1,98 +0,0 @@ -/* - Copyright (c) 2008-2010 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. -*/ - -#ifndef __AFR_SELF_HEAL_COMMON_H__ -#define __AFR_SELF_HEAL_COMMON_H__ - -#define FILE_HAS_HOLES(buf) (((buf)->ia_size) > ((buf)->ia_blocks * 512)) - -typedef enum { - AFR_SELF_HEAL_ENTRY, - AFR_SELF_HEAL_METADATA, - AFR_SELF_HEAL_DATA, - AFR_SELF_HEAL_INVALID = -1, -} afr_self_heal_type; - -typedef int -(*afr_lookup_cbk_t) (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, dict_t *xattr, - struct iatt *postparent); -int -afr_sh_select_source (int sources[], int child_count); - -int -afr_sh_sink_count (int sources[], int child_count); - -int -afr_sh_source_count (int sources[], int child_count); - -void -afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this); - -int -afr_build_pending_matrix (char **pending_key, int32_t **pending_matrix, - dict_t *xattr[], afr_transaction_type type, - size_t child_count); - -void -afr_sh_pending_to_delta (afr_private_t *priv, dict_t **xattr, - int32_t *delta_matrix[], int success[], - int child_count, afr_transaction_type type); - -int -afr_mark_sources (int32_t *sources, int32_t **pending_matrix, struct iatt *bufs, - int32_t child_count, afr_self_heal_type type, - int32_t *valid_children, const char *xlator_name); - -int -afr_sh_delta_to_xattr (afr_private_t *priv, - int32_t *delta_matrix[], dict_t *xattr[], - int child_count, afr_transaction_type type); - -int -afr_sh_is_matrix_zero (int32_t *pending_matrix[], int child_count); - -void -afr_self_heal_type_str_get (afr_self_heal_t *self_heal_p, char *str, - size_t size); - -afr_self_heal_type -afr_self_heal_type_for_transaction (afr_transaction_type type); - -int -afr_build_sources (xlator_t *xlator, dict_t **xattr, struct iatt *bufs, - int32_t **pending_matrix, int32_t *sources, - int32_t *success_children, afr_transaction_type type); -void -afr_sh_common_reset (afr_self_heal_t *sh, unsigned int child_count); -int -afr_sh_common_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, - afr_lookup_cbk_t lookup_cbk, gf_boolean_t set_gfid); -int -afr_sh_entry_expunge_remove (call_frame_t *expunge_frame, xlator_t *this, - int active_src, struct iatt *buf); -int -afr_sh_entrylk (call_frame_t *frame, xlator_t *this, loc_t *loc, - char *base_name, afr_lock_cbk_t lock_cbk); -int -afr_sh_entry_impunge_create (call_frame_t *impunge_frame, xlator_t *this, - int child_index, struct iatt *buf, - struct iatt *postparent); -#endif /* __AFR_SELF_HEAL_COMMON_H__ */ diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c index dd0599397..c0548d995 100644 --- a/xlators/cluster/afr/src/afr-self-heal-data.c +++ b/xlators/cluster/afr/src/afr-self-heal-data.c @@ -1,1198 +1,635 @@ /* - Copyright (c) 2008-2010 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ -#include <libgen.h> -#include <unistd.h> -#include <fnmatch.h> -#include <sys/time.h> -#include <stdlib.h> -#include <signal.h> #ifndef _CONFIG_H #define _CONFIG_H #include "config.h" #endif -#include "glusterfs.h" #include "afr.h" -#include "dict.h" -#include "xlator.h" -#include "hashfn.h" -#include "logging.h" -#include "stack.h" -#include "list.h" -#include "call-stub.h" -#include "defaults.h" -#include "common-utils.h" -#include "compat-errno.h" -#include "compat.h" -#include "byte-order.h" - -#include "afr-transaction.h" #include "afr-self-heal.h" -#include "afr-self-heal-common.h" -#include "afr-self-heal-algorithm.h" - - -int -afr_sh_data_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - /* - TODO: cleanup sh->* - */ - - if (sh->healing_fd && !sh->healing_fd_opened) { - /* unref only if we created the fd ourselves */ - - fd_unref (sh->healing_fd); - sh->healing_fd = NULL; - } - - /* for (i = 0; i < priv->child_count; i++) */ - /* sh->locked_nodes[i] = 0; */ - - sh->completion_cbk (frame, this); - - return 0; -} - - -int -afr_sh_data_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int call_count = 0; - int child_index = (long) cookie; - - local = frame->local; - priv = this->private; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - gf_log (this->name, GF_LOG_INFO, - "flush or setattr failed on %s on subvolume %s: %s", - local->loc.path, priv->children[child_index]->name, - strerror (op_errno)); - } - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - afr_sh_data_done (frame, this); - } - - return 0; -} - - -int -afr_sh_data_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *statpre, - struct iatt *statpost) -{ - afr_sh_data_flush_cbk (frame, cookie, this, op_ret, op_errno); - - return 0; -} - - -int -afr_sh_data_close (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - afr_self_heal_t *sh = NULL; - int i = 0; - int call_count = 0; - int source = 0; - int32_t valid = 0; - struct iatt stbuf = {0,}; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - source = sh->source; - - valid |= (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME); - - stbuf.ia_atime = sh->buf[source].ia_atime; - stbuf.ia_atime_nsec = sh->buf[source].ia_atime_nsec; - stbuf.ia_mtime = sh->buf[source].ia_mtime; - stbuf.ia_mtime_nsec = sh->buf[source].ia_mtime_nsec; - - if (sh->healing_fd_opened) { - /* not our job to close the fd */ - - afr_sh_data_done (frame, this); - return 0; - } - - if (!sh->healing_fd) { - afr_sh_data_done (frame, this); - return 0; - } - - call_count = (sh->active_sinks + 1) * 2; - local->call_count = call_count; - - /* closed source */ - gf_log (this->name, GF_LOG_TRACE, - "closing fd of %s on %s", - local->loc.path, priv->children[sh->source]->name); - - STACK_WIND_COOKIE (frame, afr_sh_data_flush_cbk, - (void *) (long) sh->source, - priv->children[sh->source], - priv->children[sh->source]->fops->flush, - sh->healing_fd); - call_count--; - - STACK_WIND_COOKIE (frame, afr_sh_data_setattr_cbk, - (void *) (long) sh->source, - priv->children[sh->source], - priv->children[sh->source]->fops->setattr, - &local->loc, &stbuf, valid); - - call_count--; - - if (call_count == 0) - return 0; - - for (i = 0; i < priv->child_count; i++) { - if (sh->sources[i] || !local->child_up[i]) - continue; - - gf_log (this->name, GF_LOG_TRACE, - "closing fd of %s on %s", - local->loc.path, priv->children[i]->name); - - STACK_WIND_COOKIE (frame, afr_sh_data_flush_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->flush, - sh->healing_fd); - - call_count--; - - STACK_WIND_COOKIE (frame, afr_sh_data_setattr_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->setattr, - &local->loc, &stbuf, valid); - - if (!--call_count) - break; - } - - return 0; -} - - -int -afr_sh_data_unlck_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - afr_local_t * local = NULL; - int call_count = 0; - int child_index = (long) cookie; - - local = frame->local; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - gf_log (this->name, GF_LOG_INFO, - "locking inode of %s on child %d failed: %s", - local->loc.path, child_index, - strerror (op_errno)); - } else { - gf_log (this->name, GF_LOG_TRACE, - "inode of %s on child %d locked", - local->loc.path, child_index); - } - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - afr_sh_data_close (frame, this); - } - - return 0; -} - - -int -afr_sh_data_unlock (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_internal_lock_t *int_lock = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - int_lock = &local->internal_lock; - sh = &local->self_heal; - - GF_ASSERT (!sh->data_lock_held); - - int_lock->lock_cbk = afr_sh_data_close; - afr_unlock (frame, this); - - return 0; -} - - -int -afr_sh_data_finish (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - sh = &local->self_heal; - - gf_log (this->name, GF_LOG_DEBUG, - "finishing data selfheal of %s", local->loc.path); - - if (!sh->data_lock_held) - afr_sh_data_unlock (frame, this); - else - afr_sh_data_close (frame, this); - - return 0; -} - - -int -afr_sh_data_erase_pending_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, - int32_t op_errno, dict_t *xattr) -{ - afr_local_t *local = NULL; - int call_count = 0; - long i = 0; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - - local = frame->local; - priv = this->private; - sh = &local->self_heal; - i = (long)cookie; - - afr_fresh_children_add_child (sh->fresh_children, i, priv->child_count); - call_count = afr_frame_return (frame); - - if (call_count == 0) { - afr_inode_set_read_ctx (this, sh->inode, sh->source, - sh->fresh_children); - afr_sh_data_finish (frame, this); - } - - return 0; -} - - -int -afr_sh_data_erase_pending (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int call_count = 0; - int i = 0; - dict_t **erase_xattr = NULL; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - afr_sh_pending_to_delta (priv, sh->xattr, sh->delta_matrix, sh->success, - priv->child_count, AFR_DATA_TRANSACTION); - - erase_xattr = GF_CALLOC (sizeof (*erase_xattr), priv->child_count, - gf_afr_mt_dict_t); - - for (i = 0; i < priv->child_count; i++) { - if (sh->xattr[i]) { - call_count++; - - erase_xattr[i] = get_new_dict(); - dict_ref (erase_xattr[i]); - } - } - - afr_sh_delta_to_xattr (priv, sh->delta_matrix, erase_xattr, - priv->child_count, AFR_DATA_TRANSACTION); - - local->call_count = call_count; - for (i = 0; i < priv->child_count; i++) { - if (!erase_xattr[i]) - continue; - - gf_log (this->name, GF_LOG_TRACE, - "erasing pending flags from %s on %s", - local->loc.path, priv->children[i]->name); - - STACK_WIND_COOKIE (frame, afr_sh_data_erase_pending_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fxattrop, - sh->healing_fd, - GF_XATTROP_ADD_ARRAY, erase_xattr[i]); - if (!--call_count) - break; - } +#include "byte-order.h" - for (i = 0; i < priv->child_count; i++) { - if (erase_xattr[i]) { - dict_unref (erase_xattr[i]); - } - } - GF_FREE (erase_xattr); +enum { + AFR_SELFHEAL_DATA_FULL = 0, + AFR_SELFHEAL_DATA_DIFF, +}; - return 0; -} - -int -afr_sh_data_trim_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf) +#define HAS_HOLES(i) ((i->ia_blocks * 512) < (i->ia_size)) +static int +__checksum_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, uint32_t weak, uint8_t *strong, + dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - int call_count = 0; - int child_index = 0; - - priv = this->private; - local = frame->local; - - child_index = (long) cookie; - - LOCK (&frame->lock); - { - if (op_ret == -1) - gf_log (this->name, GF_LOG_INFO, - "ftruncate of %s on subvolume %s failed (%s)", - local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - else - gf_log (this->name, GF_LOG_TRACE, - "ftruncate of %s on subvolume %s completed", - local->loc.path, - priv->children[child_index]->name); - } - UNLOCK (&frame->lock); + afr_local_t *local = NULL; + int i = (long) cookie; - call_count = afr_frame_return (frame); + local = frame->local; - if (call_count == 0) { - afr_sh_data_erase_pending (frame, this); - } + local->replies[i].valid = 1; + local->replies[i].op_ret = op_ret; + local->replies[i].op_errno = op_errno; + if (strong) + memcpy (local->replies[i].checksum, strong, MD5_DIGEST_LENGTH); - return 0; + syncbarrier_wake (&local->barrier); + return 0; } -int -afr_sh_data_trim_sinks (call_frame_t *frame, xlator_t *this) +static int +attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *pre, struct iatt *post, + dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - afr_self_heal_t *sh = NULL; - int *sources = NULL; - int call_count = 0; - int i = 0; - - - priv = this->private; - local = frame->local; - sh = &local->self_heal; + int i = (long) cookie; + afr_local_t *local = NULL; - sources = sh->sources; - call_count = sh->active_sinks; + local = frame->local; - local->call_count = call_count; + local->replies[i].valid = 1; + local->replies[i].op_ret = op_ret; + local->replies[i].op_errno = op_errno; + if (pre) + local->replies[i].prestat = *pre; + if (post) + local->replies[i].poststat = *post; + if (xdata) + local->replies[i].xdata = dict_ref (xdata); - for (i = 0; i < priv->child_count; i++) { - if (sources[i] || !local->child_up[i]) - continue; + syncbarrier_wake (&local->barrier); - STACK_WIND_COOKIE (frame, afr_sh_data_trim_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->ftruncate, - sh->healing_fd, sh->file_size); - - if (!--call_count) - break; - } - - return 0; + return 0; } -static struct afr_sh_algorithm * -sh_algo_from_name (xlator_t *this, char *name) +static gf_boolean_t +__afr_selfheal_data_checksums_match (call_frame_t *frame, xlator_t *this, + fd_t *fd, int source, + unsigned char *healed_sinks, + off_t offset, size_t size) { - int i = 0; - - while (afr_self_heal_algorithms[i].name) { - if (!strcmp (name, afr_self_heal_algorithms[i].name)) { - return &afr_self_heal_algorithms[i]; - } - - i++; - } - - return NULL; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + unsigned char *wind_subvols = NULL; + int i = 0; + + priv = this->private; + local = frame->local; + + wind_subvols = alloca0 (priv->child_count); + for (i = 0; i < priv->child_count; i++) { + if (i == source || healed_sinks[i]) + wind_subvols[i] = 1; + } + + AFR_ONLIST (wind_subvols, frame, __checksum_cbk, rchecksum, fd, + offset, size, NULL); + + if (!local->replies[source].valid || local->replies[source].op_ret != 0) + return _gf_false; + + for (i = 0; i < priv->child_count; i++) { + if (i == source) + continue; + if (memcmp (local->replies[source].checksum, + local->replies[i].checksum, + MD5_DIGEST_LENGTH)) + return _gf_false; + } + + return _gf_true; } static int -sh_zero_byte_files_exist (afr_self_heal_t *sh, int child_count) +__afr_selfheal_data_read_write (call_frame_t *frame, xlator_t *this, fd_t *fd, + int source, unsigned char *healed_sinks, + off_t offset, size_t size, + struct afr_reply *replies) { - int i; - int ret = 0; - - for (i = 0; i < child_count; i++) { - if (sh->buf[i].ia_size == 0) { - ret = 1; - break; - } - } - - return ret; + struct iovec *iovec = NULL; + int count = 0; + struct iobref *iobref = NULL; + int ret = 0; + int i = 0; + afr_private_t *priv = NULL; + + priv = this->private; + + ret = syncop_readv (priv->children[source], fd, size, offset, 0, + &iovec, &count, &iobref); + if (ret <= 0) + return ret; + + for (i = 0; i < priv->child_count; i++) { + if (!healed_sinks[i]) + continue; + + /* + * TODO: Use fiemap() and discard() to heal holes + * in the future. + * + * For now, + * + * - if the source had any holes at all, + * AND + * - if we are writing past the original file size + * of the sink + * AND + * - is NOT the last block of the source file. if + * the block contains EOF, it has to be written + * in order to set the file size even if the + * last block is 0-filled. + * AND + * - if the read buffer is filled with only 0's + * + * then, skip writing to this source. We don't depend + * on the write to happen to update the size as we + * have performed an ftruncate() upfront anyways. + */ +#define is_last_block(o,b,s) ((s >= o) && (s <= (o + b))) + if (HAS_HOLES ((&replies[source].poststat)) && + offset >= replies[i].poststat.ia_size && + !is_last_block (offset, size, + replies[source].poststat.ia_size) && + (iov_0filled (iovec, count) == 0)) + continue; + + ret = syncop_writev (priv->children[i], fd, iovec, count, + offset, iobref, 0); + if (ret != iov_length (iovec, count)) { + /* write() failed on this sink. unset the corresponding + member in sinks[] (which is healed_sinks[] in the + caller) so that this server does NOT get considered + as successfully healed. + */ + healed_sinks[i] = 0; + } + } + if (iobref) + iobref_unref (iobref); + + return ret; } -struct afr_sh_algorithm * -afr_sh_data_pick_algo (call_frame_t *frame, xlator_t *this) +static int +afr_selfheal_data_block (call_frame_t *frame, xlator_t *this, fd_t *fd, + int source, unsigned char *healed_sinks, off_t offset, + size_t size, int type, struct afr_reply *replies) { - afr_private_t * priv = NULL; - struct afr_sh_algorithm * algo = NULL; - afr_local_t * local = NULL; - afr_self_heal_t * sh = NULL; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - algo = sh_algo_from_name (this, priv->data_self_heal_algorithm); - - if (algo == NULL) { - /* option not set, so fall back on heuristics */ - - if ((local->enoent_count != 0) - || sh_zero_byte_files_exist (sh, priv->child_count) - || (sh->file_size <= (priv->data_self_heal_window_size * - this->ctx->page_size))) { - - /* - * If the file does not exist on one of the subvolumes, - * or a zero-byte file exists (created by entry self-heal) - * the entire content has to be copied anyway, so there - * is no benefit from using the "diff" algorithm. - * - * If the file size is about the same as page size, - * the entire file can be read and written with a few - * (pipelined) STACK_WINDs, which will be faster - * than "diff" which has to read checksums and then - * read and write. - */ - - algo = sh_algo_from_name (this, "full"); - - } else { - algo = sh_algo_from_name (this, "diff"); - } - } - - return algo; + int ret = -1; + int sink_count = 0; + afr_private_t *priv = NULL; + unsigned char *data_lock = NULL; + + priv = this->private; + sink_count = AFR_COUNT (healed_sinks, priv->child_count); + data_lock = alloca0 (priv->child_count); + + ret = afr_selfheal_inodelk (frame, this, fd->inode, this->name, + offset, size, data_lock); + { + if (ret < sink_count) { + ret = -ENOTCONN; + goto unlock; + } + + if (type == AFR_SELFHEAL_DATA_DIFF && + __afr_selfheal_data_checksums_match (frame, this, fd, source, + healed_sinks, offset, size)) { + ret = 0; + goto unlock; + } + + ret = __afr_selfheal_data_read_write (frame, this, fd, source, + healed_sinks, offset, size, + replies); + } +unlock: + afr_selfheal_uninodelk (frame, this, fd->inode, this->name, + offset, size, data_lock); + return ret; } -int -afr_sh_data_sync_prepare (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int active_sinks = 0; - int source = 0; - int i = 0; - struct afr_sh_algorithm *sh_algo = NULL; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - source = sh->source; - - for (i = 0; i < priv->child_count; i++) { - if (sh->sources[i] == 0 && local->child_up[i] == 1) { - active_sinks++; - sh->success[i] = 1; - } - } - sh->success[source] = 1; - - if (active_sinks == 0) { - gf_log (this->name, GF_LOG_INFO, - "no active sinks for performing self-heal on file %s", - local->loc.path); - afr_sh_data_finish (frame, this); - return 0; - } - sh->active_sinks = active_sinks; - gf_log (this->name, GF_LOG_DEBUG, - "self-healing file %s from subvolume %s to %d other", - local->loc.path, priv->children[source]->name, active_sinks); - - sh->algo_completion_cbk = afr_sh_data_trim_sinks; - sh->algo_abort_cbk = afr_sh_data_finish; - - sh_algo = afr_sh_data_pick_algo (frame, this); - - sh_algo->fn (frame, this); - - return 0; -} - - -int -afr_sh_data_fix (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_local_t * orig_local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int nsources = 0; - int source = 0; - int i = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - nsources = afr_build_sources (this, sh->xattr, sh->buf, sh->pending_matrix, - sh->sources, sh->success_children, - AFR_DATA_TRANSACTION); - if (nsources == 0) { - gf_log (this->name, GF_LOG_DEBUG, - "No self-heal needed for %s", - local->loc.path); - - afr_sh_data_finish (frame, this); - return 0; - } - - if ((nsources == -1) - && (priv->favorite_child != -1) - && (sh->child_errno[priv->favorite_child] == 0)) { - - gf_log (this->name, GF_LOG_DEBUG, - "Picking favorite child %s as authentic source to " - "resolve conflicting data of %s", - priv->children[priv->favorite_child]->name, - local->loc.path); - - sh->sources[priv->favorite_child] = 1; - - nsources = afr_sh_source_count (sh->sources, - priv->child_count); - } - - if (nsources == -1) { - gf_log (this->name, GF_LOG_ERROR, - "Unable to self-heal contents of '%s' (possible " - "split-brain). Please delete the file from all but " - "the preferred subvolume.", local->loc.path); - - local->govinda_gOvinda = 1; - - afr_sh_data_finish (frame, this); - return 0; - } - - source = afr_sh_select_source (sh->sources, priv->child_count); - - if (source == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "No active sources found."); - - afr_sh_data_finish (frame, this); - return 0; - } - - sh->source = source; - sh->block_size = 65536; /* TODO: make it configurable or use macro */ - sh->file_size = sh->buf[source].ia_size; - - if (FILE_HAS_HOLES (&sh->buf[source])) - sh->file_has_holes = 1; - - orig_local = sh->orig_frame->local; - orig_local->cont.lookup.buf.ia_size = sh->buf[source].ia_size; - - /* detect changes not visible through pending flags -- JIC */ - for (i = 0; i < priv->child_count; i++) { - if (i == source || sh->child_errno[i]) - continue; - - if (SIZE_DIFFERS (&sh->buf[i], &sh->buf[source])) - sh->sources[i] = 0; - } - - afr_reset_children (sh->fresh_children, priv->child_count); - afr_get_fresh_children (sh->success_children, sh->sources, - sh->fresh_children, priv->child_count); - afr_inode_set_read_ctx (this, sh->inode, sh->source, - sh->fresh_children); - - /* - quick-read might have read the file, so send xattr from - the source subvolume (http://bugs.gluster.com/cgi-bin/bugzilla3/show_bug.cgi?id=815) - */ - - dict_unref (orig_local->cont.lookup.xattr); - if (orig_local->cont.lookup.xattrs) - orig_local->cont.lookup.xattr = dict_ref (orig_local->cont.lookup.xattrs[sh->source]); - - if (sh->background) { - sh->unwind (sh->orig_frame, this, sh->op_ret, sh->op_errno); - sh->unwound = _gf_true; - } - - afr_sh_data_sync_prepare (frame, this); - - return 0; -} - -static void -afr_destroy_pending_matrix (int32_t **pending_matrix, int32_t child_count) +static int +afr_selfheal_data_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, + unsigned char *healed_sinks) { - int i = 0; - GF_ASSERT (child_count > 0); - if (pending_matrix) { - for (i = 0; i < child_count; i++) { - if (pending_matrix[i]) - GF_FREE (pending_matrix[i]); - } - GF_FREE (pending_matrix); - } + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; + + local = frame->local; + priv = this->private; + + AFR_ONLIST (healed_sinks, frame, attr_cbk, fsync, fd, 0, NULL); + + for (i = 0; i < priv->child_count; i++) + if (healed_sinks[i] && local->replies[i].op_ret != 0) + /* fsync() failed. Do NOT consider this server + as successfully healed. Mark it so. + */ + healed_sinks[i] = 0; + return 0; } -static int32_t** -afr_create_pending_matrix (int32_t child_count) -{ - gf_boolean_t cleanup = _gf_false; - int32_t **pending_matrix = NULL; - int i = 0; - - GF_ASSERT (child_count > 0); - - pending_matrix = GF_CALLOC (sizeof (*pending_matrix), child_count, - gf_afr_mt_int32_t); - if (NULL == pending_matrix) - goto out; - for (i = 0; i < child_count; i++) { - pending_matrix[i] = GF_CALLOC (sizeof (**pending_matrix), - child_count, - gf_afr_mt_int32_t); - if (NULL == pending_matrix[i]) { - cleanup = _gf_true; - goto out; - } - } -out: - if (_gf_true == cleanup) { - afr_destroy_pending_matrix (pending_matrix, child_count); - pending_matrix = NULL; - } - return pending_matrix; -} -int -afr_lookup_select_read_child_by_txn_type (xlator_t *this, afr_local_t *local, - dict_t **xattr, - afr_transaction_type txn_type) +static int +afr_selfheal_data_restore_time (call_frame_t *frame, xlator_t *this, + inode_t *inode, int source, + unsigned char *healed_sinks, + struct afr_reply *replies) { - afr_private_t *priv = NULL; - int read_child = -1; - int ret = -1; - int32_t **pending_matrix = NULL; - int32_t *sources = NULL; - int32_t *success_children = NULL; - struct iatt *bufs = NULL; - int32_t nsources = 0; - int32_t prev_read_child = -1; - int32_t config_read_child = -1; - afr_self_heal_t *sh = NULL; - - priv = this->private; - bufs = local->cont.lookup.bufs; - success_children = local->cont.lookup.success_children; - sh = &local->self_heal; - - pending_matrix = afr_create_pending_matrix (priv->child_count); - if (NULL == pending_matrix) - goto out; - - sources = GF_CALLOC (sizeof (*sources), priv->child_count, - gf_afr_mt_int32_t); - if (NULL == sources) - goto out; - - nsources = afr_build_sources (this, xattr, bufs, pending_matrix, - sources, success_children, txn_type); - if (nsources < 0) { - ret = -1; - goto out; - } + loc_t loc = {0, }; - prev_read_child = local->read_child_index; - config_read_child = priv->read_child; - read_child = afr_select_read_child_from_policy (success_children, - priv->child_count, - prev_read_child, - config_read_child, - sources); - ret = 0; - local->cont.lookup.sources = sources; -out: - afr_destroy_pending_matrix (pending_matrix, priv->child_count); - if (-1 == ret) { - if (sources) - GF_FREE (sources); - } - gf_log (this->name, GF_LOG_DEBUG, "returning read_child: %d", read_child); - return read_child; -} - -int -afr_sh_data_fstat_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - struct iatt *buf) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int call_count = -1; - int child_index = (long) cookie; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - LOCK (&frame->lock); - { - if (op_ret != -1) { - gf_log (this->name, GF_LOG_TRACE, - "fstat of %s on %s succeeded", - local->loc.path, - priv->children[child_index]->name); - - sh->buf[child_index] = *buf; - sh->success_children[sh->success_count] = child_index; - sh->success_count++; - } - } - UNLOCK (&frame->lock); + loc.inode = inode_ref (inode); + uuid_copy (loc.gfid, inode->gfid); - call_count = afr_frame_return (frame); + AFR_ONLIST (healed_sinks, frame, attr_cbk, setattr, &loc, + &replies[source].poststat, + (GF_SET_ATTR_ATIME|GF_SET_ATTR_MTIME), NULL); - if (call_count == 0) { - afr_sh_data_fix (frame, this); - } + loc_wipe (&loc); - return 0; + return 0; } - -int -afr_sh_data_fstat (call_frame_t *frame, xlator_t *this) +static int +afr_data_self_heal_type_get (afr_private_t *priv, unsigned char *healed_sinks, + int source, struct afr_reply *replies) { - afr_self_heal_t *sh = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int call_count = 0; + int type = AFR_SELFHEAL_DATA_FULL; int i = 0; - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - call_count = afr_up_children_count (priv->child_count, - local->child_up); - - local->call_count = call_count; - - afr_reset_children (sh->success_children, priv->child_count); - sh->success_count = 0; - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_sh_data_fstat_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fstat, - sh->healing_fd); - - if (!--call_count) + if (priv->data_self_heal_algorithm == NULL) { + type = AFR_SELFHEAL_DATA_FULL; + for (i = 0; i < priv->child_count; i++) { + if (!healed_sinks[i] && i != source) + continue; + if (replies[i].poststat.ia_size) { + type = AFR_SELFHEAL_DATA_DIFF; break; + } } + } else if (strcmp (priv->data_self_heal_algorithm, "full") == 0) { + type = AFR_SELFHEAL_DATA_FULL; + } else if (strcmp (priv->data_self_heal_algorithm, "diff") == 0) { + type = AFR_SELFHEAL_DATA_DIFF; } - - return 0; + return type; } - -int -afr_sh_data_fxattrop_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *xattr) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int call_count = -1; - int child_index = (long) cookie; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - LOCK (&frame->lock); - { - if (op_ret != -1) { - gf_log (this->name, GF_LOG_TRACE, - "fxattrop of %s on %s succeeded", - local->loc.path, - priv->children[child_index]->name); - - sh->xattr[child_index] = dict_ref (xattr); - } - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - afr_sh_data_fstat (frame, this); - } - - return 0; -} - - -int -afr_sh_data_fxattrop (call_frame_t *frame, xlator_t *this) +static int +afr_selfheal_data_do (call_frame_t *frame, xlator_t *this, fd_t *fd, + int source, unsigned char *healed_sinks, + struct afr_reply *replies) { - afr_self_heal_t *sh = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - dict_t *xattr_req = NULL; - int32_t *zero_pending = NULL; - int call_count = 0; - int i = 0; - int ret = 0; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - call_count = afr_up_children_count (priv->child_count, - local->child_up); - - local->call_count = call_count; - - xattr_req = dict_new(); - if (!xattr_req) { - ret = -1; - goto out; - } - - for (i = 0; i < priv->child_count; i++) { - zero_pending = GF_CALLOC (3, sizeof (*zero_pending), - gf_afr_mt_int32_t); - if (!zero_pending) { - ret = -1; - goto out; - } - ret = dict_set_dynptr (xattr_req, priv->pending_key[i], - zero_pending, - 3 * sizeof (*zero_pending)); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "Unable to set dict value"); - goto out; - } else { - zero_pending = NULL; - } - } - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_sh_data_fxattrop_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fxattrop, - sh->healing_fd, GF_XATTROP_ADD_ARRAY, - xattr_req); - - if (!--call_count) - break; - } - } + afr_private_t *priv = NULL; + int i = 0; + off_t off = 0; + size_t block = 128 * 1024; + int type = AFR_SELFHEAL_DATA_FULL; + int ret = -1; + call_frame_t *iter_frame = NULL; + char *sinks_str = NULL; + char *p = NULL; + + priv = this->private; + + sinks_str = alloca0 (priv->child_count * 8); + p = sinks_str; + for (i = 0; i < priv->child_count; i++) { + if (!healed_sinks[i]) + continue; + p += sprintf (p, "%d ", i); + } + + gf_log (this->name, GF_LOG_INFO, "performing data selfheal on %s. " + "source=%d sinks=%s", + uuid_utoa (fd->inode->gfid), source, sinks_str); + + type = afr_data_self_heal_type_get (priv, healed_sinks, source, + replies); + + iter_frame = afr_copy_frame (frame); + if (!iter_frame) + return -ENOMEM; + + for (off = 0; off < replies[source].poststat.ia_size; off += block) { + ret = afr_selfheal_data_block (iter_frame, this, fd, source, + healed_sinks, off, block, type, + replies); + if (ret < 0) + goto out; + + AFR_STACK_RESET (iter_frame); + } + + afr_selfheal_data_restore_time (frame, this, fd->inode, source, + healed_sinks, replies); + + ret = afr_selfheal_data_fsync (frame, this, fd, healed_sinks); out: - if (xattr_req) - dict_unref (xattr_req); - - if (ret) { - if (zero_pending) - GF_FREE (zero_pending); - sh->op_failed = 1; - afr_sh_data_done (frame, this); - } - - return 0; + if (iter_frame) + AFR_STACK_DESTROY (iter_frame); + return ret; } -int -afr_sh_data_lock_rec (call_frame_t *frame, xlator_t *this); -int -afr_sh_data_post_nonblocking_inodelk_cbk (call_frame_t *frame, xlator_t *this) +static int +__afr_selfheal_truncate_sinks (call_frame_t *frame, xlator_t *this, + fd_t *fd, unsigned char *healed_sinks, + struct afr_reply *replies, uint64_t size) { - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - int_lock = &local->internal_lock; - sh = &local->self_heal; - - if (int_lock->lock_op_ret < 0) { - gf_log (this->name, GF_LOG_ERROR, "Non Blocking data inodelks " - "failed for %s.", local->loc.path); - sh->op_failed = 1; - afr_sh_data_done (frame, this); - } else { - - gf_log (this->name, GF_LOG_DEBUG, "Non Blocking data inodelks " - "done for %s. Proceeding to FOP", local->loc.path); - afr_sh_data_fxattrop (frame, this); - } - - return 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + unsigned char *larger_sinks = 0; + int i = 0; + + local = frame->local; + priv = this->private; + + larger_sinks = alloca0 (priv->child_count); + for (i = 0; i < priv->child_count; i++) { + if (healed_sinks[i] && replies[i].poststat.ia_size > size) + larger_sinks[i] = 1; + } + + AFR_ONLIST (larger_sinks, frame, attr_cbk, ftruncate, fd, size, NULL); + + for (i = 0; i < priv->child_count; i++) + if (healed_sinks[i] && local->replies[i].op_ret == -1) + /* truncate() failed. Do NOT consider this server + as successfully healed. Mark it so. + */ + healed_sinks[i] = 0; + return 0; } -int -afr_sh_data_lock_rec (call_frame_t *frame, xlator_t *this) +/* + * If by chance there are multiple sources with differing sizes, select + * the largest file as the source. + * + * This can only happen if data was directly modified in the backend. + */ +static int +__afr_selfheal_data_finalize_source (xlator_t *this, unsigned char *sources, + unsigned char *sinks, + unsigned char *healed_sinks, + unsigned char *locked_on, + struct afr_reply *replies) { - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - int_lock = &local->internal_lock; - sh = &local->self_heal; - - int_lock->transaction_lk_type = AFR_SELFHEAL_LK; - int_lock->selfheal_lk_type = AFR_DATA_SELF_HEAL_LK; - - afr_set_lock_number (frame, this); - - int_lock->lk_flock.l_start = 0; - int_lock->lk_flock.l_len = 0; - int_lock->lk_flock.l_type = F_WRLCK; - int_lock->lock_cbk = afr_sh_data_post_nonblocking_inodelk_cbk; - - afr_nonblocking_inodelk (frame, this); - - return 0; + int i = 0; + afr_private_t *priv = NULL; + uint64_t size = 0; + int source = -1; + int locked_count = 0; + int sources_count = 0; + int healed_sinks_count = 0; + + priv = this->private; + + locked_count = AFR_COUNT (locked_on, priv->child_count); + sources_count = AFR_COUNT (sources, priv->child_count); + healed_sinks_count = AFR_COUNT (healed_sinks, priv->child_count); + + if (locked_count == healed_sinks_count || !sources_count) { + /* split brain */ + return -EIO; + } + + for (i = 0; i < priv->child_count; i++) { + if (!sources[i]) + continue; + if (size <= replies[i].poststat.ia_size) { + size = replies[i].poststat.ia_size; + source = i; + } + } + + for (i = 0; i < priv->child_count; i++) { + if (!sources[i]) + continue; + if (replies[i].poststat.ia_size < size) { + sources[i] = 0; + sinks[i] = 1; + } + } + + return source; } - -int -afr_sh_data_lock (call_frame_t *frame, xlator_t *this) +/* + * __afr_selfheal_data_prepare: + * + * This function inspects the on-disk xattrs and determines which subvols + * are sources and sinks. + * + * The return value is the index of the subvolume to be used as the source + * for self-healing, or -1 if no healing is necessary/split brain. + */ +static int +__afr_selfheal_data_prepare (call_frame_t *frame, xlator_t *this, fd_t *fd, + unsigned char *locked_on, unsigned char *sources, + unsigned char *sinks, unsigned char *healed_sinks, + struct afr_reply *replies) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - afr_self_heal_t * sh = NULL; - - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - if (sh->data_lock_held) { - /* caller has held the lock already, - so skip locking */ - - afr_sh_data_fxattrop (frame, this); - return 0; - } - - return afr_sh_data_lock_rec (frame, this); + int ret = -1; + int source = -1; + afr_private_t *priv = NULL; + int i = 0; + + priv = this->private; + + ret = afr_selfheal_unlocked_discover (frame, fd->inode, fd->inode->gfid, + replies); + if (ret) + return ret; + + ret = afr_selfheal_find_direction (frame, this, replies, + AFR_DATA_TRANSACTION, + locked_on, sources, sinks); + if (ret) + return ret; + + source = __afr_selfheal_data_finalize_source (this, sources, sinks, + healed_sinks, locked_on, + replies); + if (source < 0) + return -EIO; + + for (i = 0; i < priv->child_count; i++) + /* Initialize the healed_sinks[] array optimistically to + the intersection of to-be-healed (i.e sinks[]) and + the list of servers which are up (i.e locked_on[]). + + As we encounter failures in the healing process, we + will unmark the respective servers in the healed_sinks[] + array. + */ + healed_sinks[i] = sinks[i] && locked_on[i]; + + return source; } -int -afr_sh_data_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd) +static int +__afr_selfheal_data (call_frame_t *frame, xlator_t *this, fd_t *fd, + unsigned char *locked_on) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int call_count = 0; - int child_index = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - child_index = (long) cookie; - - /* TODO: some of the open's might fail. - In that case, modify cleanup fn to send flush on those - fd's which are already open */ - - LOCK (&frame->lock); - { - if (op_ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "open of %s failed on child %s (%s)", - local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - sh->op_failed = 1; - } - - gf_log (this->name, GF_LOG_TRACE, - "open of %s succeeded on child %s", - local->loc.path, - priv->children[child_index]->name); - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - if (sh->op_failed) { - afr_sh_data_finish (frame, this); - return 0; - } - - gf_log (this->name, GF_LOG_TRACE, - "fd for %s opened, commencing sync", - local->loc.path); - - afr_sh_data_lock (frame, this); - } - - return 0; + afr_private_t *priv = NULL; + int ret = -1; + unsigned char *sources = NULL; + unsigned char *sinks = NULL; + unsigned char *data_lock = NULL; + unsigned char *healed_sinks = NULL; + struct afr_reply *locked_replies = NULL; + int source = -1; + gf_boolean_t compat = _gf_false; + unsigned char *compat_lock = NULL; + + priv = this->private; + + sources = alloca0 (priv->child_count); + sinks = alloca0 (priv->child_count); + healed_sinks = alloca0 (priv->child_count); + data_lock = alloca0 (priv->child_count); + compat_lock = alloca0 (priv->child_count); + + locked_replies = alloca0 (sizeof (*locked_replies) * priv->child_count); + + ret = afr_selfheal_inodelk (frame, this, fd->inode, this->name, 0, 0, + data_lock); + { + if (ret < 2) { + ret = -ENOTCONN; + goto unlock; + } + + ret = __afr_selfheal_data_prepare (frame, this, fd, data_lock, + sources, sinks, healed_sinks, + locked_replies); + if (ret < 0) + goto unlock; + + source = ret; + + ret = __afr_selfheal_truncate_sinks (frame, this, fd, healed_sinks, + locked_replies, + locked_replies[source].poststat.ia_size); + if (ret < 0) + goto unlock; + + ret = 0; + + /* Locking from (LLONG_MAX - 2) to (LLONG_MAX - 1) is for + compatibility with older self-heal clients which do not + hold a lock in the @priv->sh_domain domain to guard + against concurrent ongoing self-heals + */ + afr_selfheal_inodelk (frame, this, fd->inode, this->name, + LLONG_MAX - 2, 1, compat_lock); + compat = _gf_true; + } +unlock: + afr_selfheal_uninodelk (frame, this, fd->inode, this->name, 0, 0, + data_lock); + if (ret < 0) + goto out; + + ret = afr_selfheal_data_do (frame, this, fd, source, healed_sinks, + locked_replies); + if (ret) + goto out; + + ret = afr_selfheal_undo_pending (frame, this, fd->inode, sources, sinks, + healed_sinks, AFR_DATA_TRANSACTION, + locked_replies, data_lock); +out: + if (compat) + afr_selfheal_uninodelk (frame, this, fd->inode, this->name, + LLONG_MAX - 2, 1, compat_lock); + return ret; } -int -afr_sh_data_open (call_frame_t *frame, xlator_t *this) +static fd_t * +afr_selfheal_data_open (xlator_t *this, inode_t *inode) { - int i = 0; - int call_count = 0; - fd_t *fd = NULL; - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - if (sh->healing_fd_opened) { - /* caller has opened the fd for us already, so skip open */ - - afr_sh_data_lock (frame, this); - return 0; - } + loc_t loc = {0,}; + int ret = 0; + fd_t *fd = NULL; - call_count = afr_up_children_count (priv->child_count, local->child_up); - local->call_count = call_count; + fd = fd_create (inode, 0); + if (!fd) + return NULL; - fd = fd_create (local->loc.inode, frame->root->pid); - sh->healing_fd = fd; + loc.inode = inode_ref (inode); + uuid_copy (loc.gfid, inode->gfid); - /* open sinks */ - for (i = 0; i < priv->child_count; i++) { - if(!local->child_up[i]) - continue; + ret = syncop_open (this, &loc, O_RDWR|O_LARGEFILE, fd); + if (ret) { + fd_unref (fd); + fd = NULL; + } else { + fd_bind (fd); + } - STACK_WIND_COOKIE (frame, afr_sh_data_open_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->open, - &local->loc, - O_RDWR|O_LARGEFILE, fd, 0); + loc_wipe (&loc); - if (!--call_count) - break; - } - - return 0; + return fd; } int -afr_self_heal_data (call_frame_t *frame, xlator_t *this) +afr_selfheal_data (call_frame_t *frame, xlator_t *this, inode_t *inode) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = this->private; - - local = frame->local; - sh = &local->self_heal; - - if (sh->need_data_self_heal && priv->data_self_heal) { - afr_sh_data_open (frame, this); - } else { - gf_log (this->name, GF_LOG_TRACE, - "not doing data self heal on %s", - local->loc.path); - afr_sh_data_done (frame, this); - } - - return 0; + afr_private_t *priv = NULL; + unsigned char *locked_on = NULL; + int ret = 0; + fd_t *fd = NULL; + + priv = this->private; + + fd = afr_selfheal_data_open (this, inode); + if (!fd) + return -EIO; + + locked_on = alloca0 (priv->child_count); + + ret = afr_selfheal_tryinodelk (frame, this, inode, priv->sh_domain, 0, 0, + locked_on); + { + if (ret < 2) { + /* Either less than two subvols available, or another + selfheal (from another server) is in progress. Skip + for now in any case there isn't anything to do. + */ + ret = -ENOTCONN; + goto unlock; + } + + ret = __afr_selfheal_data (frame, this, fd, locked_on); + } +unlock: + afr_selfheal_uninodelk (frame, this, inode, priv->sh_domain, 0, 0, locked_on); + + if (fd) + fd_unref (fd); + + return ret; } diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c index 474d0b0ab..9605d69f4 100644 --- a/xlators/cluster/afr/src/afr-self-heal-entry.c +++ b/xlators/cluster/afr/src/afr-self-heal-entry.c @@ -1,2382 +1,631 @@ /* - Copyright (c) 2008-2010 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ -#include <libgen.h> -#include <unistd.h> -#include <fnmatch.h> -#include <sys/time.h> -#include <stdlib.h> -#include <signal.h> #ifndef _CONFIG_H #define _CONFIG_H #include "config.h" #endif -#include "glusterfs.h" -#include "inode.h" #include "afr.h" -#include "dict.h" -#include "xlator.h" -#include "hashfn.h" -#include "logging.h" -#include "stack.h" -#include "list.h" -#include "call-stub.h" -#include "defaults.h" -#include "common-utils.h" -#include "compat-errno.h" -#include "compat.h" +#include "afr-self-heal.h" #include "byte-order.h" - #include "afr-transaction.h" -#include "afr-self-heal.h" -#include "afr-self-heal-common.h" - -int -afr_sh_entry_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - /* - TODO: cleanup sh->* - */ - - if (sh->healing_fd) - fd_unref (sh->healing_fd); - sh->healing_fd = NULL; - - /* for (i = 0; i < priv->child_count; i++) { */ - /* sh->locked_nodes[i] = 0; */ - /* } */ - - sh->completion_cbk (frame, this); - - return 0; -} - - -int -afr_sh_entry_unlock (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_internal_lock_t *int_lock = NULL; - - local = frame->local; - int_lock = &local->internal_lock; - - int_lock->lock_cbk = afr_sh_entry_done; - afr_unlock (frame, this); - - return 0; -} - - -int -afr_sh_entry_finish (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - - local = frame->local; - - gf_log (this->name, GF_LOG_TRACE, - "finishing entry selfheal of %s", local->loc.path); - - afr_sh_entry_unlock (frame, this); - - return 0; -} - - -int -afr_sh_entry_erase_pending_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, - int32_t op_errno, dict_t *xattr) -{ - long i = 0; - int call_count = 0; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_local_t *orig_local = NULL; - call_frame_t *orig_frame = NULL; - afr_private_t *priv = NULL; - int32_t read_child = -1; - - local = frame->local; - priv = this->private; - sh = &local->self_heal; - i = (long)cookie; - - - afr_fresh_children_add_child (sh->fresh_children, i, priv->child_count); - if (op_ret == -1) { - gf_log (this->name, GF_LOG_INFO, - "%s: failed to erase pending xattrs on %s (%s)", - local->loc.path, priv->children[i]->name, - strerror (op_errno)); - } - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - if (sh->source == -1) { - //this happens if the forced merge option is set - read_child = sh->fresh_children[0]; - } else { - read_child = sh->source; - } - afr_inode_set_read_ctx (this, sh->inode, read_child, - sh->fresh_children); - orig_frame = sh->orig_frame; - orig_local = orig_frame->local; - - if (sh->source != -1) { - orig_local->cont.lookup.buf.ia_nlink = sh->buf[sh->source].ia_nlink; - } - - afr_sh_entry_finish (frame, this); - } - - return 0; -} - - -int -afr_sh_entry_erase_pending (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int call_count = 0; - int i = 0; - dict_t **erase_xattr = NULL; - int need_unwind = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - afr_sh_pending_to_delta (priv, sh->xattr, sh->delta_matrix, sh->success, - priv->child_count, AFR_ENTRY_TRANSACTION); - - erase_xattr = GF_CALLOC (sizeof (*erase_xattr), priv->child_count, - gf_afr_mt_dict_t); - - for (i = 0; i < priv->child_count; i++) { - if (sh->xattr[i]) { - call_count++; - - erase_xattr[i] = get_new_dict(); - dict_ref (erase_xattr[i]); - } - } - - if (call_count == 0) - need_unwind = 1; - - afr_sh_delta_to_xattr (priv, sh->delta_matrix, erase_xattr, - priv->child_count, AFR_ENTRY_TRANSACTION); - - local->call_count = call_count; - for (i = 0; i < priv->child_count; i++) { - if (!erase_xattr[i]) - continue; - - gf_log (this->name, GF_LOG_TRACE, - "erasing pending flags from %s on %s", - local->loc.path, priv->children[i]->name); - - STACK_WIND_COOKIE (frame, afr_sh_entry_erase_pending_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->xattrop, - &local->loc, - GF_XATTROP_ADD_ARRAY, erase_xattr[i]); - if (!--call_count) - break; - } - - for (i = 0; i < priv->child_count; i++) { - if (erase_xattr[i]) { - dict_unref (erase_xattr[i]); - } - } - GF_FREE (erase_xattr); - - if (need_unwind) - afr_sh_entry_finish (frame, this); - - return 0; -} - static int -next_active_source (call_frame_t *frame, xlator_t *this, - int current_active_source) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int source = -1; - int next_active_source = -1; - int i = 0; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - source = sh->source; - - if (source != -1) { - if (current_active_source != source) - next_active_source = source; - goto out; - } - - /* - the next active sink becomes the source for the - 'conservative decision' of merging all entries - */ - - for (i = 0; i < priv->child_count; i++) { - if ((sh->sources[i] == 0) - && (local->child_up[i] == 1) - && (i > current_active_source)) { - - next_active_source = i; - break; - } - } -out: - return next_active_source; -} - - - -static int -next_active_sink (call_frame_t *frame, xlator_t *this, - int current_active_sink) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int next_active_sink = -1; - int i = 0; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - /* - the next active sink becomes the source for the - 'conservative decision' of merging all entries - */ - - for (i = 0; i < priv->child_count; i++) { - if ((sh->sources[i] == 0) - && (local->child_up[i] == 1) - && (i > current_active_sink)) { - - next_active_sink = i; - break; - } - } - - return next_active_sink; -} - - -int -build_child_loc (xlator_t *this, loc_t *child, loc_t *parent, char *name) -{ - int ret = -1; - - if (!child) { - goto out; - } - - if (strcmp (parent->path, "/") == 0) - ret = gf_asprintf ((char **)&child->path, "/%s", name); - else - ret = gf_asprintf ((char **)&child->path, "%s/%s", parent->path, - name); - - if (-1 == ret) { - gf_log (this->name, GF_LOG_ERROR, - "asprintf failed while setting child path"); - } - - if (!child->path) { - goto out; - } - - child->name = strrchr (child->path, '/'); - if (child->name) - child->name++; - - child->parent = inode_ref (parent->inode); - child->inode = inode_new (parent->inode->table); - - if (!child->inode) { - goto out; - } - - ret = 0; -out: - if (ret == -1) - loc_wipe (child); - - return ret; -} - - -int -afr_sh_entry_impunge_all (call_frame_t *frame, xlator_t *this); - -int -afr_sh_entry_impunge_subvol (call_frame_t *frame, xlator_t *this, - int active_src); - -int -afr_sh_entry_expunge_all (call_frame_t *frame, xlator_t *this); - -int -afr_sh_entry_expunge_subvol (call_frame_t *frame, xlator_t *this, - int active_src); +afr_selfheal_entry_delete (call_frame_t *frame, xlator_t *this, inode_t *dir, + const char *name, inode_t *inode, int child, + struct afr_reply *replies) +{ + afr_private_t *priv = NULL; + xlator_t *subvol = NULL; + int ret = 0; + loc_t loc = {0, }; + char g[64]; + + priv = this->private; + + subvol = priv->children[child]; + + loc.parent = inode_ref (dir); + uuid_copy (loc.pargfid, dir->gfid); + loc.name = name; + loc.inode = inode_ref (inode); + + if (replies[child].valid && replies[child].op_ret == 0) { + switch (replies[child].poststat.ia_type) { + case IA_IFDIR: + gf_log (this->name, GF_LOG_WARNING, + "expunging dir %s/%s (%s) on %s", + uuid_utoa (dir->gfid), name, + uuid_utoa_r (replies[child].poststat.ia_gfid, g), + subvol->name); + ret = syncop_rmdir (subvol, &loc, 1); + break; + default: + gf_log (this->name, GF_LOG_WARNING, + "expunging file %s/%s (%s) on %s", + uuid_utoa (dir->gfid), name, + uuid_utoa_r (replies[child].poststat.ia_gfid, g), + subvol->name); + ret = syncop_unlink (subvol, &loc); + break; + } + } + + loc_wipe (&loc); + + return ret; +} + + +int +afr_selfheal_recreate_entry (call_frame_t *frame, xlator_t *this, int dst, + int source, inode_t *dir, const char *name, + inode_t *inode, struct afr_reply *replies) +{ + int ret = 0; + loc_t loc = {0,}; + loc_t srcloc = {0,}; + afr_private_t *priv = NULL; + dict_t *xdata = NULL; + struct iatt *iatt = NULL; + char *linkname = NULL; + mode_t mode = 0; + struct iatt newent = {0,}; + + priv = this->private; + + xdata = dict_new(); + if (!xdata) + return -ENOMEM; + + loc.parent = inode_ref (dir); + uuid_copy (loc.pargfid, dir->gfid); + loc.name = name; + loc.inode = inode_ref (inode); + + ret = afr_selfheal_entry_delete (frame, this, dir, name, inode, dst, + replies); + if (ret) + goto out; + + ret = dict_set_static_bin (xdata, "gfid-req", + replies[source].poststat.ia_gfid, 16); + if (ret) + goto out; + + iatt = &replies[source].poststat; + + srcloc.inode = inode_ref (inode); + uuid_copy (srcloc.gfid, iatt->ia_gfid); + + mode = st_mode_from_ia (iatt->ia_prot, iatt->ia_type); + + switch (iatt->ia_type) { + case IA_IFDIR: + ret = syncop_mkdir (priv->children[dst], &loc, mode, xdata, 0); + break; + case IA_IFLNK: + ret = syncop_lookup (priv->children[dst], &srcloc, 0, 0, 0, 0); + if (ret == 0) { + ret = syncop_link (priv->children[dst], &srcloc, &loc); + } else { + ret = syncop_readlink (priv->children[source], &srcloc, + &linkname, 4096); + if (ret <= 0) + goto out; + ret = syncop_symlink (priv->children[dst], &loc, linkname, + xdata, NULL); + } + break; + default: + ret = dict_set_int32 (xdata, GLUSTERFS_INTERNAL_FOP_KEY, 1); + if (ret) + goto out; + ret = syncop_mknod (priv->children[dst], &loc, mode, + iatt->ia_rdev, xdata, &newent); + if (ret == 0 && iatt->ia_size && !newent.ia_size) { + /* New entry created. Mark @dst pending on all sources */ + ret = 1; + } + break; + } -int -afr_sh_entry_expunge_entry_done (call_frame_t *frame, xlator_t *this, - int active_src, int32_t op_ret, - int32_t op_errno) -{ - int call_count = 0; - - call_count = afr_frame_return (frame); - - if (call_count == 0) - afr_sh_entry_expunge_subvol (frame, this, active_src); - - return 0; -} - -int -afr_sh_entry_expunge_parent_setattr_cbk (call_frame_t *expunge_frame, - void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *preop, struct iatt *postop) -{ - afr_private_t *priv = NULL; - afr_local_t *expunge_local = NULL; - afr_self_heal_t *expunge_sh = NULL; - call_frame_t *frame = NULL; - int active_src = (long) cookie; - afr_self_heal_t *sh = NULL; - afr_local_t *local = NULL; - - priv = this->private; - expunge_local = expunge_frame->local; - expunge_sh = &expunge_local->self_heal; - frame = expunge_sh->sh_frame; - local = frame->local; - sh = &local->self_heal; - - if (op_ret != 0) { - gf_log (this->name, GF_LOG_ERROR, - "setattr on parent directory of %s on subvolume %s failed: %s", - expunge_local->loc.path, - priv->children[active_src]->name, strerror (op_errno)); - } - - AFR_STACK_DESTROY (expunge_frame); - sh->expunge_done (frame, this, active_src, op_ret, op_errno); - - return 0; -} - - -int -afr_sh_entry_expunge_remove_cbk (call_frame_t *expunge_frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *preparent, - struct iatt *postparent) -{ - afr_private_t *priv = NULL; - afr_local_t *expunge_local = NULL; - afr_self_heal_t *expunge_sh = NULL; - int active_src = 0; - call_frame_t *frame = NULL; - int32_t valid = 0; - - priv = this->private; - expunge_local = expunge_frame->local; - expunge_sh = &expunge_local->self_heal; - frame = expunge_sh->sh_frame; - - active_src = (long) cookie; - - if (op_ret == 0) { - gf_log (this->name, GF_LOG_DEBUG, - "removed %s on %s", - expunge_local->loc.path, - priv->children[active_src]->name); - } else { - gf_log (this->name, GF_LOG_INFO, - "removing %s on %s failed (%s)", - expunge_local->loc.path, - priv->children[active_src]->name, - strerror (op_errno)); - } - - valid = GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME; - afr_build_parent_loc (&expunge_sh->parent_loc, &expunge_local->loc); - - STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_parent_setattr_cbk, - (void *) (long) active_src, - priv->children[active_src], - priv->children[active_src]->fops->setattr, - &expunge_sh->parent_loc, - &expunge_sh->parentbuf, - valid); - - return 0; -} - - -int -afr_sh_entry_expunge_unlink (call_frame_t *expunge_frame, xlator_t *this, - int active_src) -{ - afr_private_t *priv = NULL; - afr_local_t *expunge_local = NULL; - - priv = this->private; - expunge_local = expunge_frame->local; - - gf_log (this->name, GF_LOG_TRACE, - "expunging file %s on %s", - expunge_local->loc.path, priv->children[active_src]->name); - - STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_remove_cbk, - (void *) (long) active_src, - priv->children[active_src], - priv->children[active_src]->fops->unlink, - &expunge_local->loc); - - return 0; -} - - - -int -afr_sh_entry_expunge_rmdir (call_frame_t *expunge_frame, xlator_t *this, - int active_src) -{ - afr_private_t *priv = NULL; - afr_local_t *expunge_local = NULL; - - priv = this->private; - expunge_local = expunge_frame->local; - - gf_log (this->name, GF_LOG_DEBUG, - "expunging directory %s on %s", - expunge_local->loc.path, priv->children[active_src]->name); - - STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_remove_cbk, - (void *) (long) active_src, - priv->children[active_src], - priv->children[active_src]->fops->rmdir, - &expunge_local->loc, 1); - - return 0; -} - - -int -afr_sh_entry_expunge_remove (call_frame_t *expunge_frame, xlator_t *this, - int active_src, struct iatt *buf) -{ - afr_private_t *priv = NULL; - afr_local_t *expunge_local = NULL; - afr_self_heal_t *expunge_sh = NULL; - call_frame_t *frame = NULL; - int type = 0; - afr_self_heal_t *sh = NULL; - afr_local_t *local = NULL; - - priv = this->private; - expunge_local = expunge_frame->local; - expunge_sh = &expunge_local->self_heal; - frame = expunge_sh->sh_frame; - local = frame->local; - sh = &local->self_heal; - - type = buf->ia_type; - - switch (type) { - case IA_IFSOCK: - case IA_IFREG: - case IA_IFBLK: - case IA_IFCHR: - case IA_IFIFO: - case IA_IFLNK: - afr_sh_entry_expunge_unlink (expunge_frame, this, active_src); - break; - case IA_IFDIR: - afr_sh_entry_expunge_rmdir (expunge_frame, this, active_src); - break; - default: - gf_log (this->name, GF_LOG_ERROR, - "%s has unknown file type on %s: 0%o", - expunge_local->loc.path, - priv->children[active_src]->name, type); - goto out; - break; - } - - return 0; out: - AFR_STACK_DESTROY (expunge_frame); - sh->expunge_done (frame, this, active_src, -1, EINVAL); - - return 0; -} - - -int -afr_sh_entry_expunge_lookup_cbk (call_frame_t *expunge_frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *buf, dict_t *x, - struct iatt *postparent) -{ - afr_private_t *priv = NULL; - afr_local_t *expunge_local = NULL; - afr_self_heal_t *expunge_sh = NULL; - call_frame_t *frame = NULL; - int active_src = 0; - afr_self_heal_t *sh = NULL; - afr_local_t *local = NULL; - - priv = this->private; - expunge_local = expunge_frame->local; - expunge_sh = &expunge_local->self_heal; - frame = expunge_sh->sh_frame; - active_src = (long) cookie; - local = frame->local; - sh = &local->self_heal; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "lookup of %s on %s failed (%s)", - expunge_local->loc.path, - priv->children[active_src]->name, - strerror (op_errno)); - goto out; - } - - afr_sh_entry_expunge_remove (expunge_frame, this, active_src, buf); - - return 0; -out: - AFR_STACK_DESTROY (expunge_frame); - sh->expunge_done (frame, this, active_src, op_ret, op_errno); - - return 0; -} - - -int -afr_sh_entry_expunge_purge (call_frame_t *expunge_frame, xlator_t *this, - int active_src) -{ - afr_private_t *priv = NULL; - afr_local_t *expunge_local = NULL; - - priv = this->private; - expunge_local = expunge_frame->local; - - gf_log (this->name, GF_LOG_TRACE, - "looking up %s on %s", - expunge_local->loc.path, priv->children[active_src]->name); - - STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_lookup_cbk, - (void *) (long) active_src, - priv->children[active_src], - priv->children[active_src]->fops->lookup, - &expunge_local->loc, 0); - - return 0; -} - -int -afr_sh_entry_expunge_entry_cbk (call_frame_t *expunge_frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *buf, dict_t *x, - struct iatt *postparent) -{ - afr_private_t *priv = NULL; - afr_local_t *expunge_local = NULL; - afr_self_heal_t *expunge_sh = NULL; - int source = 0; - call_frame_t *frame = NULL; - int active_src = 0; - int need_expunge = 0; - afr_self_heal_t *sh = NULL; - afr_local_t *local = NULL; - - priv = this->private; - expunge_local = expunge_frame->local; - expunge_sh = &expunge_local->self_heal; - frame = expunge_sh->sh_frame; - active_src = expunge_sh->active_source; - source = (long) cookie; - local = frame->local; - sh = &local->self_heal; - - if (op_ret == -1 && op_errno == ENOENT) - need_expunge = 1; - else if (op_ret == -1) - goto out; - - if (!uuid_is_null (expunge_sh->entrybuf.ia_gfid) && - !uuid_is_null (buf->ia_gfid) && - (uuid_compare (expunge_sh->entrybuf.ia_gfid, buf->ia_gfid) != 0)) { - char uuidbuf1[64]; - char uuidbuf2[64]; - gf_log (this->name, GF_LOG_DEBUG, - "entry %s found on %s with mismatching gfid (%s/%s)", - expunge_local->loc.path, - priv->children[source]->name, - uuid_utoa_r (expunge_sh->entrybuf.ia_gfid, uuidbuf1), - uuid_utoa_r (buf->ia_gfid, uuidbuf2)); - need_expunge = 1; - } - - if (need_expunge) { - gf_log (this->name, GF_LOG_INFO, - "missing entry %s on %s", - expunge_local->loc.path, - priv->children[source]->name); - - if (postparent) - expunge_sh->parentbuf = *postparent; - - afr_sh_entry_expunge_purge (expunge_frame, this, active_src); - - return 0; - } - -out: - if (op_ret == 0) { - gf_log (this->name, GF_LOG_TRACE, - "%s exists under %s", - expunge_local->loc.path, - priv->children[source]->name); - } else { - gf_log (this->name, GF_LOG_INFO, - "looking up %s under %s failed (%s)", - expunge_local->loc.path, - priv->children[source]->name, - strerror (op_errno)); - } - - AFR_STACK_DESTROY (expunge_frame); - sh->expunge_done (frame, this, active_src, op_ret, op_errno); - - return 0; -} - - -int -afr_sh_entry_expunge_entry (call_frame_t *frame, xlator_t *this, - gf_dirent_t *entry) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int ret = -1; - call_frame_t *expunge_frame = NULL; - afr_local_t *expunge_local = NULL; - afr_self_heal_t *expunge_sh = NULL; - int active_src = 0; - int source = 0; - int op_errno = 0; - char *name = NULL; - int op_ret = -1; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - active_src = sh->active_source; - source = sh->source; - sh->expunge_done = afr_sh_entry_expunge_entry_done; - - name = entry->d_name; - - if ((strcmp (name, ".") == 0) - || (strcmp (name, "..") == 0) - || ((strcmp (local->loc.path, "/") == 0) - && (strcmp (name, GF_REPLICATE_TRASH_DIR) == 0))) { - - gf_log (this->name, GF_LOG_TRACE, - "skipping inspection of %s under %s", - name, local->loc.path); - op_ret = 0; - goto out; - } - - gf_log (this->name, GF_LOG_TRACE, - "inspecting existance of %s under %s", - name, local->loc.path); - - expunge_frame = copy_frame (frame); - if (!expunge_frame) { - op_errno = ENOMEM; - goto out; - } - - ALLOC_OR_GOTO (expunge_local, afr_local_t, out); - - expunge_frame->local = expunge_local; - expunge_sh = &expunge_local->self_heal; - expunge_sh->sh_frame = frame; - expunge_sh->active_source = active_src; - expunge_sh->entrybuf = entry->d_stat; - - ret = build_child_loc (this, &expunge_local->loc, &local->loc, name); - if (ret != 0) { - op_errno = EINVAL; - goto out; - } - - gf_log (this->name, GF_LOG_TRACE, - "looking up %s on %s", expunge_local->loc.path, - priv->children[source]->name); - - STACK_WIND_COOKIE (expunge_frame, - afr_sh_entry_expunge_entry_cbk, - (void *) (long) source, - priv->children[source], - priv->children[source]->fops->lookup, - &expunge_local->loc, 0); - - ret = 0; -out: - if (ret == -1) - sh->expunge_done (frame, this, active_src, op_ret, op_errno); - - return 0; -} - - -int -afr_sh_entry_expunge_readdir_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - gf_dirent_t *entries) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - gf_dirent_t *entry = NULL; - off_t last_offset = 0; - int active_src = 0; - int entry_count = 0; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - active_src = sh->active_source; - - if (op_ret <= 0) { - if (op_ret < 0) { - gf_log (this->name, GF_LOG_INFO, - "readdir of %s on subvolume %s failed (%s)", - local->loc.path, - priv->children[active_src]->name, - strerror (op_errno)); - } else { - gf_log (this->name, GF_LOG_TRACE, - "readdir of %s on subvolume %s complete", - local->loc.path, - priv->children[active_src]->name); - } - - afr_sh_entry_expunge_all (frame, this); - return 0; - } - - list_for_each_entry (entry, &entries->list, list) { - last_offset = entry->d_off; - entry_count++; - } - - gf_log (this->name, GF_LOG_TRACE, - "readdir'ed %d entries from %s", - entry_count, priv->children[active_src]->name); - - sh->offset = last_offset; - local->call_count = entry_count; - - list_for_each_entry (entry, &entries->list, list) { - afr_sh_entry_expunge_entry (frame, this, entry); - } - - return 0; -} - -int -afr_sh_entry_expunge_subvol (call_frame_t *frame, xlator_t *this, - int active_src) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - STACK_WIND (frame, afr_sh_entry_expunge_readdir_cbk, - priv->children[active_src], - priv->children[active_src]->fops->readdirp, - sh->healing_fd, sh->block_size, sh->offset); - - return 0; + if (xdata) + dict_unref (xdata); + loc_wipe (&loc); + loc_wipe (&srcloc); + return ret; } -int -afr_sh_entry_expunge_all (call_frame_t *frame, xlator_t *this) +static int +afr_selfheal_newentry_mark (call_frame_t *frame, xlator_t *this, inode_t *inode, + int source, struct afr_reply *replies, + unsigned char *sources, unsigned char *newentry) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int active_src = -1; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; + int ret = 0; + int i = 0; + afr_private_t *priv = NULL; + dict_t *xattr = NULL; + int **changelog = NULL; + int idx = 0; - sh->offset = 0; + priv = this->private; - if (sh->source == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "no active sources for %s to expunge entries", - local->loc.path); - goto out; - } + idx = afr_index_for_transaction_type (AFR_DATA_TRANSACTION); - active_src = next_active_sink (frame, this, sh->active_source); - sh->active_source = active_src; + uuid_copy (inode->gfid, replies[source].poststat.ia_gfid); - if (sh->op_failed) { - goto out; - } + changelog = afr_matrix_create (priv->child_count, AFR_NUM_CHANGE_LOGS); - if (active_src == -1) { - /* completed creating missing files on all subvolumes */ - goto out; - } + xattr = dict_new(); + if (!xattr) + return -ENOMEM; - gf_log (this->name, GF_LOG_TRACE, - "expunging entries of %s on %s to other sinks", - local->loc.path, priv->children[active_src]->name); + for (i = 0; i < priv->child_count; i++) { + if (!newentry[i]) + continue; + changelog[i][idx] = hton32(1); + } - afr_sh_entry_expunge_subvol (frame, this, active_src); + afr_set_pending_dict (priv, xattr, changelog); - return 0; -out: - afr_sh_entry_impunge_all (frame, this); - return 0; + for (i = 0; i < priv->child_count; i++) { + if (!sources[i]) + continue; + afr_selfheal_post_op (frame, this, inode, i, xattr); + } + dict_unref (xattr); + return ret; } -int -afr_sh_entry_impunge_entry_done (call_frame_t *frame, xlator_t *this, - int active_src, int32_t op_ret, - int32_t op_errno) -{ - int call_count = 0; - - call_count = afr_frame_return (frame); - - if (call_count == 0) - afr_sh_entry_impunge_subvol (frame, this, active_src); - - return 0; +static int +__afr_selfheal_heal_dirent (call_frame_t *frame, xlator_t *this, fd_t *fd, + char *name, inode_t *inode, int source, + unsigned char *sources, unsigned char *healed_sinks, + unsigned char *locked_on, struct afr_reply *replies) +{ + int ret = 0; + afr_private_t *priv = NULL; + int i = 0; + unsigned char *newentry = NULL; + + priv = this->private; + newentry = alloca0 (priv->child_count); + + if (!replies[source].valid) + return -EIO; + + for (i = 0; i < priv->child_count; i++) { + if (!healed_sinks[i]) + continue; + if (replies[source].op_ret == -1 && + replies[source].op_errno == ENOENT) { + ret = afr_selfheal_entry_delete (frame, this, fd->inode, + name, inode, i, replies); + } else { + if (!uuid_compare (replies[i].poststat.ia_gfid, + replies[source].poststat.ia_gfid)) + continue; + + ret = afr_selfheal_recreate_entry (frame, this, i, source, + fd->inode, name, inode, + replies); + if (ret > 0) { + newentry[i] = 1; + ret = 0; + } + } + if (ret < 0) + break; + } + + if (AFR_COUNT (newentry, priv->child_count)) + afr_selfheal_newentry_mark (frame, this, inode, source, replies, + sources, newentry); + return ret; } -int -afr_sh_entry_impunge_setattr_cbk (call_frame_t *impunge_frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *preop, struct iatt *postop) +static int +__afr_selfheal_merge_dirent (call_frame_t *frame, xlator_t *this, fd_t *fd, + char *name, inode_t *inode, unsigned char *sources, + unsigned char *healed_sinks, unsigned char *locked_on, + struct afr_reply *replies) { - int call_count = 0; - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_self_heal_t *impunge_sh = NULL; - call_frame_t *frame = NULL; - int active_src = 0; - int child_index = 0; - int32_t impunge_ret_child = 0; - - priv = this->private; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - frame = impunge_sh->sh_frame; - local = frame->local; - sh = &local->self_heal; - active_src = impunge_sh->active_source; - child_index = (long) cookie; - - if (op_ret == 0) { - gf_log (this->name, GF_LOG_TRACE, - "setattr done for %s on %s", - impunge_local->loc.path, - priv->children[child_index]->name); - } else { - gf_log (this->name, GF_LOG_INFO, - "setattr (%s) on %s failed (%s)", - impunge_local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - } - - LOCK (&impunge_frame->lock); - { - call_count = --impunge_local->call_count; - } - UNLOCK (&impunge_frame->lock); - - if (call_count == 0) { - impunge_ret_child = impunge_sh->impunge_ret_child; - AFR_STACK_DESTROY (impunge_frame); - sh->impunge_done (frame, this, impunge_ret_child, op_ret, - op_errno); - } - - return 0; -} + int ret = 0; + afr_private_t *priv = NULL; + int i = 0; + int source = -1; + priv = this->private; -int -afr_sh_entry_impunge_xattrop_cbk (call_frame_t *impunge_frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - dict_t *xattr) -{ - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - int child_index = 0; - struct iatt stbuf = {0}; - int32_t valid = 0; - - priv = this->private; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - - child_index = (long) cookie; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_INFO, - "%s: failed to perform xattrop on %s (%s)", - impunge_local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - } - - gf_log (this->name, GF_LOG_TRACE, - "setting ownership of %s on %s to %d/%d", - impunge_local->loc.path, - priv->children[child_index]->name, - impunge_local->cont.lookup.buf.ia_uid, - impunge_local->cont.lookup.buf.ia_gid); - - stbuf.ia_atime = impunge_local->cont.lookup.buf.ia_atime; - stbuf.ia_atime_nsec = impunge_local->cont.lookup.buf.ia_atime_nsec; - stbuf.ia_mtime = impunge_local->cont.lookup.buf.ia_mtime; - stbuf.ia_mtime_nsec = impunge_local->cont.lookup.buf.ia_mtime_nsec; - - stbuf.ia_uid = impunge_local->cont.lookup.buf.ia_uid; - stbuf.ia_gid = impunge_local->cont.lookup.buf.ia_gid; - - valid = GF_SET_ATTR_UID | GF_SET_ATTR_GID | - GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME; - - STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_setattr_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->setattr, - &impunge_local->loc, - &stbuf, valid); - return 0; -} - + for (i = 0; i < priv->child_count; i++) { + if (replies[i].valid && replies[i].op_ret == 0) { + source = i; + break; + } + } -int -afr_sh_entry_impunge_parent_setattr_cbk (call_frame_t *setattr_frame, - void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *preop, struct iatt *postop) -{ - loc_t *parent_loc = cookie; + if (source == -1) { + /* entry got deleted in the mean time? */ + return 0; + } - if (op_ret != 0) { - gf_log (this->name, GF_LOG_INFO, - "setattr on parent directory (%s) failed: %s", - parent_loc->path, strerror (op_errno)); - } + for (i = 0; i < priv->child_count; i++) { + if (i == source || !healed_sinks[i]) + continue; - loc_wipe (parent_loc); + if (replies[i].op_errno != ENOENT) + continue; - GF_FREE (parent_loc); + ret = afr_selfheal_recreate_entry (frame, this, i, source, + fd->inode, name, inode, + replies); + } - AFR_STACK_DESTROY (setattr_frame); - return 0; + return ret; } -int -afr_sh_entry_impunge_newfile_cbk (call_frame_t *impunge_frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *stbuf, - struct iatt *preparent, - struct iatt *postparent) +static int +__afr_selfheal_entry_dirent (call_frame_t *frame, xlator_t *this, fd_t *fd, + char *name, inode_t *inode, int source, + unsigned char *sources, unsigned char *healed_sinks, + unsigned char *locked_on, struct afr_reply *replies) { - int call_count = 0; - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - call_frame_t *frame = NULL; - int active_src = 0; - int child_index = 0; - int32_t *pending_array = NULL; - dict_t *xattr = NULL; - int ret = 0; - int idx = 0; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - call_frame_t *setattr_frame = NULL; - int32_t valid = 0; - loc_t *parent_loc = NULL; - struct iatt parentbuf = {0,}; - int32_t impunge_ret_child = 0; - - priv = this->private; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - frame = impunge_sh->sh_frame; - local = frame->local; - sh = &local->self_heal; - active_src = impunge_sh->active_source; - - child_index = (long) cookie; - - if (op_ret == -1) { - ret = -1; - gf_log (this->name, GF_LOG_ERROR, - "creation of %s on %s failed (%s)", - impunge_local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - goto out; - } - - inode->ia_type = stbuf->ia_type; - - xattr = dict_new (); - if (!xattr) { - ret = -1; - goto out; - } - - pending_array = (int32_t*) GF_CALLOC (3, sizeof (*pending_array), - gf_afr_mt_int32_t); - - if (!pending_array) { - ret = -1; - goto out; - } - idx = afr_index_for_transaction_type (AFR_METADATA_TRANSACTION); - pending_array[idx] = hton32 (1); - if (IA_ISDIR (stbuf->ia_type)) - idx = afr_index_for_transaction_type (AFR_ENTRY_TRANSACTION); - else - idx = afr_index_for_transaction_type (AFR_DATA_TRANSACTION); - pending_array[idx] = hton32 (1); - - ret = dict_set_dynptr (xattr, priv->pending_key[child_index], - pending_array, - 3 * sizeof (*pending_array)); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "Unable to set dict value."); - } else { - pending_array = NULL; - } - - valid = GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME; - parentbuf = impunge_sh->parentbuf; - setattr_frame = copy_frame (impunge_frame); - - parent_loc = GF_CALLOC (1, sizeof (*parent_loc), - gf_afr_mt_loc_t); - if (!parent_loc) { - ret = -1; - goto out; - } - afr_build_parent_loc (parent_loc, &impunge_local->loc); - - STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_xattrop_cbk, - (void *) (long) child_index, - priv->children[active_src], - priv->children[active_src]->fops->xattrop, - &impunge_local->loc, GF_XATTROP_ADD_ARRAY, xattr); - - STACK_WIND_COOKIE (setattr_frame, afr_sh_entry_impunge_parent_setattr_cbk, - (void *) (long) parent_loc, - priv->children[child_index], - priv->children[child_index]->fops->setattr, - parent_loc, &parentbuf, valid); + int ret = -1; -out: - if (xattr) - dict_unref (xattr); - - if (ret) { - if (pending_array) - GF_FREE (pending_array); - - LOCK (&impunge_frame->lock); - { - call_count = --impunge_local->call_count; - } - UNLOCK (&impunge_frame->lock); - - if (call_count == 0) { - impunge_ret_child = impunge_sh->impunge_ret_child; - AFR_STACK_DESTROY (impunge_frame); - sh->impunge_done (frame, this, impunge_ret_child, -1, - op_errno); - } - } - - return 0; + if (source < 0) + ret = __afr_selfheal_merge_dirent (frame, this, fd, name, inode, + sources, healed_sinks, + locked_on, replies); + else + ret = __afr_selfheal_heal_dirent (frame, this, fd, name, inode, + source, sources, healed_sinks, + locked_on, replies); + return ret; } -int -afr_sh_entry_impunge_mknod (call_frame_t *impunge_frame, xlator_t *this, - int child_index, struct iatt *stbuf) -{ - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - dict_t *dict = NULL; - int ret = 0; - - priv = this->private; - impunge_local = impunge_frame->local; - - gf_log (this->name, GF_LOG_DEBUG, - "creating missing file %s on %s", - impunge_local->loc.path, - priv->children[child_index]->name); - - dict = dict_new (); - if (!dict) - gf_log (this->name, GF_LOG_ERROR, "Out of memory"); - - ret = afr_set_dict_gfid (dict, stbuf->ia_gfid); - if (ret) - gf_log (this->name, GF_LOG_INFO, "%s: gfid set failed", - impunge_local->loc.path); - - STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->mknod, - &impunge_local->loc, - st_mode_from_ia (stbuf->ia_prot, stbuf->ia_type), - makedev (ia_major (stbuf->ia_rdev), - ia_minor (stbuf->ia_rdev)), dict); - - if (dict) - dict_unref (dict); - - return 0; -} - - - -int -afr_sh_entry_impunge_mkdir (call_frame_t *impunge_frame, xlator_t *this, - int child_index, struct iatt *stbuf) -{ - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - dict_t *dict = NULL; - - int ret = 0; - - priv = this->private; - impunge_local = impunge_frame->local; - - dict = dict_new (); - if (!dict) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - return 0; - } - - ret = afr_set_dict_gfid (dict, stbuf->ia_gfid); - if (ret) - gf_log (this->name, GF_LOG_INFO, "%s: gfid set failed", - impunge_local->loc.path); - - gf_log (this->name, GF_LOG_DEBUG, - "creating missing directory %s on %s", - impunge_local->loc.path, - priv->children[child_index]->name); - - STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->mkdir, - &impunge_local->loc, - st_mode_from_ia (stbuf->ia_prot, stbuf->ia_type), - dict); - - if (dict) - dict_unref (dict); - - return 0; +static int +afr_selfheal_entry_dirent (call_frame_t *frame, xlator_t *this, fd_t *fd, + int source, unsigned char *sources, + unsigned char *healed_sinks, char *name) +{ + afr_private_t *priv = NULL; + int ret = 0; + unsigned char *locked_on = NULL; + struct afr_reply *replies = NULL; + inode_t *inode = NULL; + + priv = this->private; + + locked_on = alloca0 (priv->child_count); + + replies = alloca0 (priv->child_count * sizeof(*replies)); + + ret = afr_selfheal_entrylk (frame, this, fd->inode, this->name, + name, locked_on); + { + if (ret < 2) { + ret = -ENOTCONN; + goto unlock; + } + + inode = afr_selfheal_unlocked_lookup_on (frame, fd->inode, name, + replies, locked_on); + if (!inode) { + ret = -ENOMEM; + goto unlock; + } + + ret = __afr_selfheal_entry_dirent (frame, this, fd, name, inode, + source, sources, healed_sinks, + locked_on, replies); + } +unlock: + afr_selfheal_unentrylk (frame, this, fd->inode, this->name, name, + locked_on); + if (inode) + inode_unref (inode); + return ret; } -int -afr_sh_entry_impunge_symlink (call_frame_t *impunge_frame, xlator_t *this, - int child_index, const char *linkname) +static int +afr_selfheal_entry_do_subvol (call_frame_t *frame, xlator_t *this, fd_t *fd, + int child, int source, unsigned char *sources, + unsigned char *healed_sinks) { - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - dict_t *dict = NULL; - struct iatt *buf = NULL; - int ret = 0; - call_frame_t *frame = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_self_heal_t *impunge_sh = NULL; - int32_t impunge_ret_child = 0; - - priv = this->private; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - frame = impunge_sh->sh_frame; - local = frame->local; - sh = &local->self_heal; - - buf = &impunge_local->cont.symlink.buf; - - dict = dict_new (); - if (!dict) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - impunge_ret_child = impunge_sh->impunge_ret_child; - AFR_STACK_DESTROY (impunge_frame); - sh->impunge_done (impunge_frame, this, impunge_ret_child, -1, - ENOMEM); - goto out; - } - - ret = afr_set_dict_gfid (dict, buf->ia_gfid); - if (ret) - gf_log (this->name, GF_LOG_INFO, - "%s: dict set gfid failed", - impunge_local->loc.path); - - gf_log (this->name, GF_LOG_DEBUG, - "creating missing symlink %s -> %s on %s", - impunge_local->loc.path, linkname, - priv->children[child_index]->name); - - STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->symlink, - linkname, &impunge_local->loc, dict); - - if (dict) - dict_unref (dict); -out: - return 0; -} + int ret = 0; + gf_dirent_t entries; + gf_dirent_t *entry = NULL; + off_t offset = 0; + call_frame_t *iter_frame = NULL; + xlator_t *subvol = NULL; + afr_private_t *priv = NULL; + priv = this->private; + subvol = priv->children[child]; -int -afr_sh_entry_impunge_symlink_unlink_cbk (call_frame_t *impunge_frame, - void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *preparent, - struct iatt *postparent) -{ - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - int child_index = -1; - call_frame_t *frame = NULL; - int call_count = -1; - int active_src = -1; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int32_t impunge_ret_child = 0; - - priv = this->private; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - frame = impunge_sh->sh_frame; - local = frame->local; - sh = &local->self_heal; - active_src = impunge_sh->active_source; - - child_index = (long) cookie; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_INFO, - "unlink of %s on %s failed (%s)", - impunge_local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - goto out; - } - - afr_sh_entry_impunge_symlink (impunge_frame, this, child_index, - impunge_sh->linkname); - - return 0; -out: - LOCK (&impunge_frame->lock); - { - call_count = --impunge_local->call_count; - } - UNLOCK (&impunge_frame->lock); - - if (call_count == 0) { - impunge_ret_child = impunge_sh->impunge_ret_child; - AFR_STACK_DESTROY (impunge_frame); - sh->impunge_done (frame, this, impunge_ret_child, op_ret, - op_errno); - } - - return 0; -} + INIT_LIST_HEAD (&entries.list); + iter_frame = afr_copy_frame (frame); + if (!iter_frame) + return -ENOMEM; -int -afr_sh_entry_impunge_symlink_unlink (call_frame_t *impunge_frame, xlator_t *this, - int child_index) -{ - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; + while ((ret = syncop_readdir (subvol, fd, 131072, offset, &entries))) { + if (ret > 0) + ret = 0; + list_for_each_entry (entry, &entries.list, list) { + offset = entry->d_off; - priv = this->private; - impunge_local = impunge_frame->local; + if (!strcmp (entry->d_name, ".") || + !strcmp (entry->d_name, "..")) + continue; - gf_log (this->name, GF_LOG_DEBUG, - "unlinking symlink %s with wrong target on %s", - impunge_local->loc.path, - priv->children[child_index]->name); + if (__is_root_gfid (fd->inode->gfid) && + !strcmp (entry->d_name, GF_REPLICATE_TRASH_DIR)) + continue; - STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_symlink_unlink_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->unlink, - &impunge_local->loc); + ret = afr_selfheal_entry_dirent (iter_frame, this, fd, + source, sources, + healed_sinks, + entry->d_name); + AFR_STACK_RESET (iter_frame); - return 0; -} + if (ret) + break; + } + gf_dirent_free (&entries); + if (ret) + break; + } -int -afr_sh_entry_impunge_readlink_sink_cbk (call_frame_t *impunge_frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - const char *linkname, struct iatt *sbuf) -{ - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - int child_index = -1; - call_frame_t *frame = NULL; - int call_count = -1; - int active_src = -1; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int32_t impunge_ret_child = 0; - - priv = this->private; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - frame = impunge_sh->sh_frame; - local = frame->local; - sh = &local->self_heal; - active_src = impunge_sh->active_source; - - child_index = (long) cookie; - - if ((op_ret == -1) && (op_errno != ENOENT)) { - gf_log (this->name, GF_LOG_INFO, - "readlink of %s on %s failed (%s)", - impunge_local->loc.path, - priv->children[active_src]->name, - strerror (op_errno)); - goto out; - } - - /* symlink doesn't exist on the sink */ - - if ((op_ret == -1) && (op_errno == ENOENT)) { - afr_sh_entry_impunge_symlink (impunge_frame, this, - child_index, impunge_sh->linkname); - return 0; - } - - - /* symlink exists on the sink, so check if targets match */ - - if (strcmp (linkname, impunge_sh->linkname) == 0) { - /* targets match, nothing to do */ - - goto out; - } else { - /* - * Hah! Sneaky wolf in sheep's clothing! - */ - afr_sh_entry_impunge_symlink_unlink (impunge_frame, this, - child_index); - return 0; - } - -out: - LOCK (&impunge_frame->lock); - { - call_count = --impunge_local->call_count; - } - UNLOCK (&impunge_frame->lock); - - if (call_count == 0) { - impunge_ret_child = impunge_sh->impunge_ret_child; - AFR_STACK_DESTROY (impunge_frame); - sh->impunge_done (frame, this, impunge_ret_child, op_ret, - op_errno); - } - - return 0; + AFR_STACK_DESTROY (iter_frame); + return ret; } - -int -afr_sh_entry_impunge_readlink_sink (call_frame_t *impunge_frame, xlator_t *this, - int child_index) +static int +afr_selfheal_entry_do (call_frame_t *frame, xlator_t *this, fd_t *fd, + int source, unsigned char *sources, + unsigned char *healed_sinks, + struct afr_reply *locked_replies) { - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; + int i = 0; + afr_private_t *priv = NULL; + int ret = 0; - priv = this->private; - impunge_local = impunge_frame->local; + priv = this->private; - gf_log (this->name, GF_LOG_DEBUG, - "checking symlink target of %s on %s", - impunge_local->loc.path, priv->children[child_index]->name); + gf_log (this->name, GF_LOG_INFO, "performing entry selfheal on %s", + uuid_utoa (fd->inode->gfid)); - STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_readlink_sink_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->readlink, - &impunge_local->loc, 4096); - - return 0; + for (i = 0; i < priv->child_count; i++) { + if (i != source && !healed_sinks[i]) + continue; + ret = afr_selfheal_entry_do_subvol (frame, this, fd, i, source, + sources, healed_sinks); + if (ret) + break; + } + return ret; } -int -afr_sh_entry_impunge_readlink_cbk (call_frame_t *impunge_frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - const char *linkname, struct iatt *sbuf) +static int +__afr_selfheal_entry_finalize_source (xlator_t *this, unsigned char *sources, + unsigned char *sinks, + unsigned char *locked_on, + struct afr_reply *replies) { - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - int child_index = -1; - call_frame_t *frame = NULL; - int call_count = -1; - int active_src = -1; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int32_t impunge_ret_child = 0; - - priv = this->private; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - frame = impunge_sh->sh_frame; - local = frame->local; - sh = &local->self_heal; - active_src = impunge_sh->active_source; - - child_index = (long) cookie; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_INFO, - "readlink of %s on %s failed (%s)", - impunge_local->loc.path, - priv->children[active_src]->name, - strerror (op_errno)); - goto out; - } - - impunge_sh->linkname = gf_strdup (linkname); - afr_sh_entry_impunge_readlink_sink (impunge_frame, this, child_index); - - return 0; - -out: - LOCK (&impunge_frame->lock); - { - call_count = --impunge_local->call_count; - } - UNLOCK (&impunge_frame->lock); - - if (call_count == 0) { - impunge_ret_child = impunge_sh->impunge_ret_child; - AFR_STACK_DESTROY (impunge_frame); - sh->impunge_done (frame, this, impunge_ret_child, op_ret, - op_errno); - } - - return 0; -} + int i = 0; + afr_private_t *priv = NULL; + int source = -1; + int locked_count = 0; + int sources_count = 0; + int sinks_count = 0; + priv = this->private; -int -afr_sh_entry_impunge_readlink (call_frame_t *impunge_frame, xlator_t *this, - int child_index, struct iatt *stbuf) -{ - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - int active_src = -1; - - priv = this->private; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - active_src = impunge_sh->active_source; - impunge_local->cont.symlink.buf = *stbuf; - - STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_readlink_cbk, - (void *) (long) child_index, - priv->children[active_src], - priv->children[active_src]->fops->readlink, - &impunge_local->loc, 4096); - - return 0; -} + locked_count = AFR_COUNT (locked_on, priv->child_count); + sources_count = AFR_COUNT (sources, priv->child_count); + sinks_count = AFR_COUNT (sinks, priv->child_count); -int -afr_sh_entry_impunge_create (call_frame_t *impunge_frame, xlator_t *this, - int child_index, struct iatt *buf, - struct iatt *postparent) -{ - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - afr_private_t *priv = NULL; - ia_type_t type = IA_INVAL; - int ret = 0; - int active_src = 0; - - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - impunge_sh->parentbuf = *postparent; - active_src = impunge_sh->active_source; - impunge_local->cont.lookup.buf = *buf; - afr_update_loc_gfids (&impunge_local->loc, buf, postparent); - - type = buf->ia_type; - - switch (type) { - case IA_IFSOCK: - case IA_IFREG: - case IA_IFBLK: - case IA_IFCHR: - case IA_IFIFO: - afr_sh_entry_impunge_mknod (impunge_frame, this, - child_index, buf); - break; - case IA_IFLNK: - afr_sh_entry_impunge_readlink (impunge_frame, this, - child_index, buf); - break; - case IA_IFDIR: - afr_sh_entry_impunge_mkdir (impunge_frame, this, - child_index, buf); - break; - default: - gf_log (this->name, GF_LOG_ERROR, - "%s has unknown file type on %s: 0%o", - impunge_local->loc.path, - priv->children[active_src]->name, type); - ret = -1; - break; - } - - return ret; -} + if (locked_count == sinks_count || !sources_count) { + return -1; + } -int -afr_sh_entry_impunge_recreate_lookup_cbk (call_frame_t *impunge_frame, - void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *buf, - dict_t *xattr,struct iatt *postparent) -{ - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *impunge_sh = NULL; - afr_self_heal_t *sh = NULL; - int active_src = 0; - int child_index = 0; - call_frame_t *frame = NULL; - int call_count = 0; - int ret = 0; - int32_t impunge_ret_child = 0; - - priv = this->private; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - frame = impunge_sh->sh_frame; - local = frame->local; - sh = &local->self_heal; - - child_index = (long) cookie; - - active_src = impunge_sh->active_source; - - if (op_ret != 0) { - gf_log (this->name, GF_LOG_DEBUG, - "looking up %s on %s (for %s) failed (%s)", - impunge_local->loc.path, - priv->children[active_src]->name, - priv->children[child_index]->name, - strerror (op_errno)); - goto out; - } - - ret = afr_sh_entry_impunge_create (impunge_frame, this, child_index, buf, - postparent); - if (ret) - goto out; - - return 0; + for (i = 0; i < priv->child_count; i++) { + if (sources[i]) { + source = i; + break; + } + } -out: - LOCK (&impunge_frame->lock); - { - call_count = --impunge_local->call_count; - } - UNLOCK (&impunge_frame->lock); - - if (call_count == 0) { - impunge_ret_child = impunge_sh->impunge_ret_child; - AFR_STACK_DESTROY (impunge_frame); - sh->impunge_done (frame, this, impunge_ret_child, op_ret, - op_errno); - } - - return 0; + return source; } -int -afr_sh_entry_impunge_recreate (call_frame_t *impunge_frame, xlator_t *this, - int child_index) +static int +__afr_selfheal_entry_prepare (call_frame_t *frame, xlator_t *this, fd_t *fd, + unsigned char *locked_on, unsigned char *sources, + unsigned char *sinks, unsigned char *healed_sinks, + struct afr_reply *replies, int *source_p) { - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - int active_src = 0; + int ret = -1; + int source = -1; + afr_private_t *priv = NULL; + int i = 0; + priv = this->private; - priv = this->private; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; + ret = afr_selfheal_unlocked_discover (frame, fd->inode, fd->inode->gfid, + replies); + if (ret) + return ret; - active_src = impunge_sh->active_source; + ret = afr_selfheal_find_direction (frame, this, replies, + AFR_ENTRY_TRANSACTION, + locked_on, sources, sinks); + if (ret) + return ret; - STACK_WIND_COOKIE (impunge_frame, - afr_sh_entry_impunge_recreate_lookup_cbk, - (void *) (long) child_index, - priv->children[active_src], - priv->children[active_src]->fops->lookup, - &impunge_local->loc, 0); + source = __afr_selfheal_entry_finalize_source (this, sources, sinks, + locked_on, replies); + if (source < 0) { + /* If source is < 0 (typically split-brain), we perform a + conservative merge of entries rather than erroring out */ + } + *source_p = source; - return 0; -} + for (i = 0; i < priv->child_count; i++) + /* Initialize the healed_sinks[] array optimistically to + the intersection of to-be-healed (i.e sinks[]) and + the list of servers which are up (i.e locked_on[]). + As we encounter failures in the healing process, we + will unmark the respective servers in the healed_sinks[] + array. + */ + healed_sinks[i] = sinks[i] && locked_on[i]; -int -afr_sh_entry_impunge_entry_cbk (call_frame_t *impunge_frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *buf, dict_t *x, - struct iatt *postparent) -{ - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - int call_count = 0; - int child_index = 0; - call_frame_t *frame = NULL; - int active_src = 0; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int32_t impunge_ret_child = 0; - - priv = this->private; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - frame = impunge_sh->sh_frame; - local = frame->local; - sh = &local->self_heal; - child_index = (long) cookie; - active_src = impunge_sh->active_source; - - if ((op_ret == -1 && op_errno == ENOENT) - || (IA_ISLNK (impunge_sh->impunging_entry_mode))) { - - /* - * A symlink's target might have changed, so - * always go down the recreate path for them. - */ - - /* decrease call_count in recreate-callback */ - - gf_log (this->name, GF_LOG_TRACE, - "missing entry %s on %s", - impunge_local->loc.path, - priv->children[child_index]->name); - - afr_sh_entry_impunge_recreate (impunge_frame, this, - child_index); - return 0; - } - - if (op_ret == 0) { - gf_log (this->name, GF_LOG_TRACE, - "%s exists under %s", - impunge_local->loc.path, - priv->children[child_index]->name); - - impunge_sh->parentbuf = *postparent; - } else { - gf_log (this->name, GF_LOG_WARNING, - "looking up %s under %s failed (%s)", - impunge_local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - } - - LOCK (&impunge_frame->lock); - { - call_count = --impunge_local->call_count; - } - UNLOCK (&impunge_frame->lock); - - if (call_count == 0) { - impunge_ret_child = impunge_sh->impunge_ret_child; - AFR_STACK_DESTROY (impunge_frame); - sh->impunge_done (frame, this, impunge_ret_child, op_ret, - op_errno); - } - - return 0; + return ret; } -int -afr_sh_entry_impunge_entry (call_frame_t *frame, xlator_t *this, - gf_dirent_t *entry) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int ret = -1; - call_frame_t *impunge_frame = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - int active_src = 0; - int i = 0; - int call_count = 0; - int op_errno = 0; - int op_ret = -1; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - active_src = sh->active_source; - sh->impunge_done = afr_sh_entry_impunge_entry_done; - - if ((strcmp (entry->d_name, ".") == 0) - || (strcmp (entry->d_name, "..") == 0) - || ((strcmp (local->loc.path, "/") == 0) - && (strcmp (entry->d_name, GF_REPLICATE_TRASH_DIR) == 0))) { - - gf_log (this->name, GF_LOG_TRACE, - "skipping inspection of %s under %s", - entry->d_name, local->loc.path); - op_ret = 0; - goto out; - } - - gf_log (this->name, GF_LOG_TRACE, - "inspecting existance of %s under %s", - entry->d_name, local->loc.path); - - impunge_frame = copy_frame (frame); - if (!impunge_frame) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - op_errno = ENOMEM; - goto out; - } - - ALLOC_OR_GOTO (impunge_local, afr_local_t, out); - - impunge_frame->local = impunge_local; - impunge_sh = &impunge_local->self_heal; - impunge_sh->sh_frame = frame; - impunge_sh->active_source = active_src; - impunge_sh->impunge_ret_child = active_src; - - impunge_sh->impunging_entry_mode = - st_mode_from_ia (entry->d_stat.ia_prot, entry->d_stat.ia_type); - - ret = build_child_loc (this, &impunge_local->loc, &local->loc, entry->d_name); - if (ret != 0) { - op_errno = ENOMEM; - goto out; - } - - for (i = 0; i < priv->child_count; i++) { - if (i == active_src) - continue; - if (local->child_up[i] == 0) - continue; - if (sh->sources[i] == 1) - continue; - call_count++; - } - - impunge_local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (i == active_src) - continue; - if (local->child_up[i] == 0) - continue; - if (sh->sources[i] == 1) - continue; - - gf_log (this->name, GF_LOG_TRACE, - "looking up %s on %s", impunge_local->loc.path, - priv->children[i]->name); - - STACK_WIND_COOKIE (impunge_frame, - afr_sh_entry_impunge_entry_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->lookup, - &impunge_local->loc, 0); - - if (!--call_count) - break; - } - - ret = 0; +static int +__afr_selfheal_entry (call_frame_t *frame, xlator_t *this, fd_t *fd, + unsigned char *locked_on) +{ + afr_private_t *priv = NULL; + int ret = -1; + unsigned char *sources = NULL; + unsigned char *sinks = NULL; + unsigned char *data_lock = NULL; + unsigned char *healed_sinks = NULL; + struct afr_reply *locked_replies = NULL; + int source = -1; + + priv = this->private; + + sources = alloca0 (priv->child_count); + sinks = alloca0 (priv->child_count); + healed_sinks = alloca0 (priv->child_count); + data_lock = alloca0 (priv->child_count); + + locked_replies = alloca0 (sizeof (*locked_replies) * priv->child_count); + + ret = afr_selfheal_entrylk (frame, this, fd->inode, this->name, NULL, + data_lock); + { + if (ret < 2) { + ret = -ENOTCONN; + goto unlock; + } + + ret = __afr_selfheal_entry_prepare (frame, this, fd, data_lock, + sources, sinks, healed_sinks, + locked_replies, &source); + } +unlock: + afr_selfheal_unentrylk (frame, this, fd->inode, this->name, NULL, + data_lock); + if (ret < 0) + goto out; + + ret = afr_selfheal_entry_do (frame, this, fd, source, sources, + healed_sinks, locked_replies); + if (ret) + goto out; + + ret = afr_selfheal_undo_pending (frame, this, fd->inode, sources, sinks, + healed_sinks, AFR_ENTRY_TRANSACTION, + locked_replies, data_lock); out: - if (ret == -1) - sh->impunge_done (frame, this, active_src, op_ret, op_errno); - - return 0; -} - - -int -afr_sh_entry_impunge_readdir_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - gf_dirent_t *entries) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - gf_dirent_t *entry = NULL; - off_t last_offset = 0; - int active_src = 0; - int entry_count = 0; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - active_src = sh->active_source; - - if (op_ret <= 0) { - if (op_ret < 0) { - gf_log (this->name, GF_LOG_INFO, - "readdir of %s on subvolume %s failed (%s)", - local->loc.path, - priv->children[active_src]->name, - strerror (op_errno)); - } else { - gf_log (this->name, GF_LOG_TRACE, - "readdir of %s on subvolume %s complete", - local->loc.path, - priv->children[active_src]->name); - } - - afr_sh_entry_impunge_all (frame, this); - return 0; - } - - list_for_each_entry (entry, &entries->list, list) { - last_offset = entry->d_off; - entry_count++; - } - - gf_log (this->name, GF_LOG_TRACE, - "readdir'ed %d entries from %s", - entry_count, priv->children[active_src]->name); - - sh->offset = last_offset; - local->call_count = entry_count; - - list_for_each_entry (entry, &entries->list, list) { - afr_sh_entry_impunge_entry (frame, this, entry); - } - - return 0; -} - - -int -afr_sh_entry_impunge_subvol (call_frame_t *frame, xlator_t *this, - int active_src) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - STACK_WIND (frame, afr_sh_entry_impunge_readdir_cbk, - priv->children[active_src], - priv->children[active_src]->fops->readdirp, - sh->healing_fd, sh->block_size, sh->offset); - - return 0; -} - - -int -afr_sh_entry_impunge_all (call_frame_t *frame, xlator_t *this) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int active_src = -1; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - sh->offset = 0; - - active_src = next_active_source (frame, this, sh->active_source); - sh->active_source = active_src; - - if (sh->op_failed) { - afr_sh_entry_finish (frame, this); - return 0; - } - - if (active_src == -1) { - /* completed creating missing files on all subvolumes */ - afr_sh_entry_erase_pending (frame, this); - return 0; - } - - gf_log (this->name, GF_LOG_TRACE, - "impunging entries of %s on %s to other sinks", - local->loc.path, priv->children[active_src]->name); - - afr_sh_entry_impunge_subvol (frame, this, active_src); - - return 0; -} - - -int -afr_sh_entry_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int call_count = 0; - int child_index = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - child_index = (long) cookie; - - /* TODO: some of the open's might fail. - In that case, modify cleanup fn to send flush on those - fd's which are already open */ - - LOCK (&frame->lock); - { - if (op_ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "opendir of %s failed on child %s (%s)", - local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - sh->op_failed = 1; - } - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - if (sh->op_failed) { - afr_sh_entry_finish (frame, this); - return 0; - } - gf_log (this->name, GF_LOG_TRACE, - "fd for %s opened, commencing sync", - local->loc.path); - - sh->active_source = -1; - afr_sh_entry_expunge_all (frame, this); - } - - return 0; -} - - -int -afr_sh_entry_open (call_frame_t *frame, xlator_t *this) -{ - int i = 0; - int call_count = 0; - - int source = -1; - int *sources = NULL; - - fd_t *fd = NULL; - - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - source = local->self_heal.source; - sources = local->self_heal.sources; - - sh->block_size = 65536; //131072 - sh->offset = 0; - - call_count = sh->active_sinks; - if (source != -1) - call_count++; - - local->call_count = call_count; - - fd = fd_create (local->loc.inode, frame->root->pid); - sh->healing_fd = fd; - - if (source != -1) { - gf_log (this->name, GF_LOG_TRACE, - "opening directory %s on subvolume %s (source)", - local->loc.path, priv->children[source]->name); - - /* open source */ - STACK_WIND_COOKIE (frame, afr_sh_entry_opendir_cbk, - (void *) (long) source, - priv->children[source], - priv->children[source]->fops->opendir, - &local->loc, fd); - call_count--; - } - - /* open sinks */ - for (i = 0; i < priv->child_count; i++) { - if (sources[i] || !local->child_up[i]) - continue; - - gf_log (this->name, GF_LOG_TRACE, - "opening directory %s on subvolume %s (sink)", - local->loc.path, priv->children[i]->name); - - STACK_WIND_COOKIE (frame, afr_sh_entry_opendir_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->opendir, - &local->loc, fd); - - if (!--call_count) - break; - } - - return 0; -} - - -int -afr_sh_entry_sync_prepare (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int active_sinks = 0; - int source = 0; - int i = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - source = sh->source; - - for (i = 0; i < priv->child_count; i++) { - if (sh->sources[i] == 0 && local->child_up[i] == 1) { - active_sinks++; - sh->success[i] = 1; - } - } - if (source != -1) - sh->success[source] = 1; - - if (active_sinks == 0) { - gf_log (this->name, GF_LOG_TRACE, - "no active sinks for self-heal on dir %s", - local->loc.path); - afr_sh_entry_finish (frame, this); - return 0; - } - if (source == -1 && active_sinks < 2) { - gf_log (this->name, GF_LOG_TRACE, - "cannot sync with 0 sources and 1 sink on dir %s", - local->loc.path); - afr_sh_entry_finish (frame, this); - return 0; - } - sh->active_sinks = active_sinks; - - if (source != -1) - gf_log (this->name, GF_LOG_DEBUG, - "self-healing directory %s from subvolume %s to " - "%d other", - local->loc.path, priv->children[source]->name, - active_sinks); - else - gf_log (this->name, GF_LOG_DEBUG, - "no active sources for %s found. " - "merging all entries as a conservative decision", - local->loc.path); - - afr_sh_entry_open (frame, this); - - return 0; + return ret; } -int -afr_sh_entry_fix (call_frame_t *frame, xlator_t *this) +static fd_t * +afr_selfheal_data_opendir (xlator_t *this, inode_t *inode) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int source = 0; - - int nsources = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - if (sh->forced_merge) { - sh->source = -1; - goto heal; - } + loc_t loc = {0,}; + int ret = 0; + fd_t *fd = NULL; - nsources = afr_build_sources (this, sh->xattr, sh->buf, - sh->pending_matrix, sh->sources, - sh->success_children, - AFR_ENTRY_TRANSACTION); - if (nsources == 0) { - gf_log (this->name, GF_LOG_TRACE, - "No self-heal needed for %s", - local->loc.path); + fd = fd_create (inode, 0); + if (!fd) + return NULL; - afr_sh_entry_finish (frame, this); - return 0; - } + loc.inode = inode_ref (inode); + uuid_copy (loc.gfid, inode->gfid); - source = afr_sh_select_source (sh->sources, priv->child_count); + ret = syncop_opendir (this, &loc, fd); + if (ret) { + fd_unref (fd); + fd = NULL; + } else { + fd_bind (fd); + } - sh->source = source; + loc_wipe (&loc); - afr_reset_children (sh->fresh_children, priv->child_count); - afr_get_fresh_children (sh->success_children, sh->sources, - sh->fresh_children, priv->child_count); - afr_inode_set_read_ctx (this, sh->inode, sh->source, - sh->fresh_children); - - -heal: - afr_sh_entry_sync_prepare (frame, this); - - return 0; + return fd; } int -afr_sh_entry_lookup_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *buf, dict_t *xattr, - struct iatt *postparent) +afr_selfheal_entry (call_frame_t *frame, xlator_t *this, inode_t *inode) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - int call_count = -1; - int child_index = (long) cookie; - - local = frame->local; - sh = &local->self_heal; + afr_private_t *priv = NULL; + unsigned char *locked_on = NULL; + fd_t *fd = NULL; + int ret = 0; - LOCK (&frame->lock); - { - if (op_ret != -1) { - sh->xattr[child_index] = dict_ref (xattr); - sh->buf[child_index] = *buf; - sh->success_children[sh->success_count] = child_index; - sh->success_count++; - } - } - UNLOCK (&frame->lock); + priv = this->private; - call_count = afr_frame_return (frame); + fd = afr_selfheal_data_opendir (this, inode); + if (!fd) + return -EIO; - if (call_count == 0) { - afr_sh_entry_fix (frame, this); - } - - return 0; -} - -int -afr_sh_post_nonblocking_entry_cbk (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - int_lock = &local->internal_lock; - sh = &local->self_heal; - - if (int_lock->lock_op_ret < 0) { - gf_log (this->name, GF_LOG_ERROR, "Non Blocking entrylks " - "failed for %s.", local->loc.path); - sh->op_failed = 1; - afr_sh_entry_done (frame, this); - } else { - - gf_log (this->name, GF_LOG_DEBUG, "Non Blocking entrylks done " - "for %s. Proceeding to FOP", local->loc.path); - afr_sh_common_lookup (frame, this, &local->loc, - afr_sh_entry_lookup_cbk, _gf_false); - } - - return 0; -} - -int -afr_self_heal_entry (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; + locked_on = alloca0 (priv->child_count); + ret = afr_selfheal_tryentrylk (frame, this, inode, priv->sh_domain, NULL, + locked_on); + { + if (ret < 2) { + /* Either less than two subvols available, or another + selfheal (from another server) is in progress. Skip + for now in any case there isn't anything to do. + */ + ret = -ENOTCONN; + goto unlock; + } - priv = this->private; - local = frame->local; + ret = __afr_selfheal_entry (frame, this, fd, locked_on); + } +unlock: + afr_selfheal_unentrylk (frame, this, inode, priv->sh_domain, NULL, locked_on); - if (local->self_heal.need_entry_self_heal && priv->entry_self_heal) { - afr_sh_entrylk (frame, this, &local->loc, NULL, - afr_sh_post_nonblocking_entry_cbk); - } else { - gf_log (this->name, GF_LOG_TRACE, - "proceeding to completion on %s", - local->loc.path); - afr_sh_entry_done (frame, this); - } + if (fd) + fd_unref (fd); - return 0; + return ret; } diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c index f0b9e44fc..b31a33237 100644 --- a/xlators/cluster/afr/src/afr-self-heal-metadata.c +++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c @@ -1,697 +1,281 @@ /* - Copyright (c) 2008-2010 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ -#include <libgen.h> -#include <unistd.h> -#include <fnmatch.h> -#include <sys/time.h> -#include <stdlib.h> -#include <signal.h> #ifndef _CONFIG_H #define _CONFIG_H #include "config.h" #endif -#include "glusterfs.h" #include "afr.h" -#include "dict.h" -#include "xlator.h" -#include "hashfn.h" -#include "logging.h" -#include "stack.h" -#include "list.h" -#include "call-stub.h" -#include "defaults.h" -#include "common-utils.h" -#include "compat-errno.h" -#include "compat.h" -#include "byte-order.h" - -#include "afr-transaction.h" #include "afr-self-heal.h" -#include "afr-self-heal-common.h" - - -int -afr_sh_metadata_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int i = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - -// memset (sh->child_errno, 0, sizeof (int) * priv->child_count); - memset (sh->buf, 0, sizeof (struct iatt) * priv->child_count); - memset (sh->success, 0, sizeof (int) * priv->child_count); - -/* for (i = 0; i < priv->child_count; i++) { */ -/* sh->locked_nodes[i] = 1; */ -/* } */ - - for (i = 0; i < priv->child_count; i++) { - if (sh->xattr[i]) - dict_unref (sh->xattr[i]); - sh->xattr[i] = NULL; - } - - if (local->govinda_gOvinda) { - gf_log (this->name, GF_LOG_INFO, - "split-brain detected, aborting selfheal of %s", - local->loc.path); - sh->op_failed = 1; - sh->completion_cbk (frame, this); - } else { - if (IA_ISREG (sh->type)) { - gf_log (this->name, GF_LOG_DEBUG, - "proceeding to data check on %s", - local->loc.path); - afr_self_heal_data (frame, this); - return 0; - } - - if (IA_ISDIR (sh->type)) { - gf_log (this->name, GF_LOG_DEBUG, - "proceeding to entry check on %s", - local->loc.path); - afr_self_heal_entry (frame, this); - return 0; - } - sh->completion_cbk (frame, this); - } - - return 0; -} - - -int -afr_sh_metadata_unlck_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - int call_count = 0; - - call_count = afr_frame_return (frame); - - if (call_count == 0) - afr_sh_metadata_done (frame, this); - - return 0; -} - -int -afr_sh_inode_unlock (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - - local = frame->local; - int_lock = &local->internal_lock; - - int_lock->lock_cbk = afr_sh_metadata_done; - afr_unlock (frame, this); - - return 0; -} - -int -afr_sh_metadata_finish (call_frame_t *frame, xlator_t *this) -{ - afr_sh_inode_unlock (frame, this); - - return 0; -} - - -int -afr_sh_metadata_erase_pending_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, - int32_t op_errno, dict_t *xattr) -{ - afr_local_t *local = NULL; - int call_count = 0; - long i = 0; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - - local = frame->local; - priv = this->private; - sh = &local->self_heal; - i = (long)cookie; - - if ((!IA_ISREG (sh->buf[sh->source].ia_type)) && - (!IA_ISDIR (sh->buf[sh->source].ia_type))) { - afr_fresh_children_add_child (sh->fresh_children, i, - priv->child_count); - } - call_count = afr_frame_return (frame); - - if (call_count == 0) { - if ((!IA_ISREG (sh->buf[sh->source].ia_type)) && - (!IA_ISDIR (sh->buf[sh->source].ia_type))) { - afr_inode_set_read_ctx (this, sh->inode, sh->source, - sh->fresh_children); - } - afr_sh_metadata_finish (frame, this); - } - - return 0; -} - - -int -afr_sh_metadata_erase_pending (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int call_count = 0; - int i = 0; - dict_t **erase_xattr = NULL; - - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - afr_sh_pending_to_delta (priv, sh->xattr, sh->delta_matrix, - sh->success, priv->child_count, - AFR_METADATA_TRANSACTION); - - erase_xattr = GF_CALLOC (sizeof (*erase_xattr), priv->child_count, - gf_afr_mt_dict_t); - if (!erase_xattr) - return -ENOMEM; - - for (i = 0; i < priv->child_count; i++) { - if (sh->xattr[i]) { - call_count++; - - erase_xattr[i] = get_new_dict(); - dict_ref (erase_xattr[i]); - } - } - - afr_sh_delta_to_xattr (priv, sh->delta_matrix, erase_xattr, - priv->child_count, AFR_METADATA_TRANSACTION); - - local->call_count = call_count; - - if (call_count == 0) { - gf_log (this->name, GF_LOG_INFO, - "metadata of %s not healed on any subvolume", - local->loc.path); - - afr_sh_metadata_finish (frame, this); - } - - for (i = 0; i < priv->child_count; i++) { - if (!erase_xattr[i]) - continue; - - gf_log (this->name, GF_LOG_TRACE, - "erasing pending flags from %s on %s", - local->loc.path, priv->children[i]->name); - - STACK_WIND_COOKIE (frame, afr_sh_metadata_erase_pending_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->xattrop, - &local->loc, - GF_XATTROP_ADD_ARRAY, erase_xattr[i]); - if (!--call_count) - break; - } - - for (i = 0; i < priv->child_count; i++) { - if (erase_xattr[i]) { - dict_unref (erase_xattr[i]); - } - } - GF_FREE (erase_xattr); - - return 0; -} - - -int -afr_sh_metadata_sync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int call_count = 0; - int child_index = 0; - - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - child_index = (long) cookie; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - gf_log (this->name, GF_LOG_INFO, - "setting attributes failed for %s on %s (%s)", - local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - - sh->success[child_index] = 0; - } - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) - afr_sh_metadata_erase_pending (frame, this); - - return 0; -} - - -int -afr_sh_metadata_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *preop, struct iatt *postop) -{ - afr_sh_metadata_sync_cbk (frame, cookie, this, op_ret, op_errno); - - return 0; -} - - -int -afr_sh_metadata_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - afr_sh_metadata_sync_cbk (frame, cookie, this, op_ret, op_errno); - - return 0; -} - - -int -afr_sh_metadata_sync (call_frame_t *frame, xlator_t *this, dict_t *xattr) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int source = 0; - int active_sinks = 0; - int call_count = 0; - int i = 0; - - struct iatt stbuf = {0,}; - int32_t valid = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - source = sh->source; - active_sinks = sh->active_sinks; - - /* - * 2 calls per sink - setattr, setxattr - */ - if (xattr) - call_count = active_sinks * 2; - else - call_count = active_sinks; - - local->call_count = call_count; - - stbuf.ia_atime = sh->buf[source].ia_atime; - stbuf.ia_atime_nsec = sh->buf[source].ia_atime_nsec; - stbuf.ia_mtime = sh->buf[source].ia_mtime; - stbuf.ia_mtime_nsec = sh->buf[source].ia_mtime_nsec; - - stbuf.ia_uid = sh->buf[source].ia_uid; - stbuf.ia_gid = sh->buf[source].ia_gid; - - stbuf.ia_type = sh->buf[source].ia_type; - stbuf.ia_prot = sh->buf[source].ia_prot; - - valid = GF_SET_ATTR_MODE | - GF_SET_ATTR_UID | GF_SET_ATTR_GID | - GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME; - - for (i = 0; i < priv->child_count; i++) { - if (call_count == 0) { - break; - } - if (sh->sources[i] || !local->child_up[i]) - continue; - - gf_log (this->name, GF_LOG_DEBUG, - "self-healing metadata of %s from %s to %s", - local->loc.path, priv->children[source]->name, - priv->children[i]->name); - - STACK_WIND_COOKIE (frame, afr_sh_metadata_setattr_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->setattr, - &local->loc, &stbuf, valid); - - call_count--; - - if (!xattr) - continue; - - STACK_WIND_COOKIE (frame, afr_sh_metadata_xattr_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->setxattr, - &local->loc, xattr, 0); - call_count--; - } - - return 0; -} - - -int -afr_sh_metadata_getxattr_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xattr) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int source = 0; - - int i; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - source = sh->source; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "getxattr of %s failed on subvolume %s (%s). proceeding without xattr", - local->loc.path, priv->children[source]->name, - strerror (op_errno)); - - afr_sh_metadata_sync (frame, this, NULL); - } else { - for (i = 0; i < priv->child_count; i++) { - dict_del (xattr, priv->pending_key[i]); - } - - afr_sh_metadata_sync (frame, this, xattr); - } +#include "byte-order.h" - return 0; -} +#define AFR_HEAL_ATTR (GF_SET_ATTR_UID|GF_SET_ATTR_GID|GF_SET_ATTR_MODE) int -afr_sh_metadata_sync_prepare (call_frame_t *frame, xlator_t *this) +afr_selfheal_metadata_do (call_frame_t *frame, xlator_t *this, inode_t *inode, + int source, unsigned char *healed_sinks, + struct afr_reply *locked_replies) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int active_sinks = 0; - int source = 0; - int i = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - source = sh->source; - - for (i = 0; i < priv->child_count; i++) { - if (sh->sources[i] == 0 && local->child_up[i] == 1) { - active_sinks++; - sh->success[i] = 1; - } - } - sh->success[source] = 1; - - if (active_sinks == 0) { - gf_log (this->name, GF_LOG_DEBUG, - "no active sinks for performing self-heal on file %s", - local->loc.path); - afr_sh_metadata_finish (frame, this); - return 0; - } - sh->active_sinks = active_sinks; - - gf_log (this->name, GF_LOG_TRACE, - "syncing metadata of %s from subvolume %s to %d active sinks", - local->loc.path, priv->children[source]->name, active_sinks); - - STACK_WIND (frame, afr_sh_metadata_getxattr_cbk, - priv->children[source], - priv->children[source]->fops->getxattr, - &local->loc, NULL); - - return 0; + int ret = -1; + loc_t loc = {0,}; + dict_t *xattr = NULL; + dict_t *old_xattr = NULL; + afr_private_t *priv = NULL; + int i = 0; + + priv = this->private; + + loc.inode = inode_ref (inode); + uuid_copy (loc.gfid, inode->gfid); + + gf_log (this->name, GF_LOG_INFO, "performing metadata selfheal on %s", + uuid_utoa (inode->gfid)); + + ret = syncop_getxattr (priv->children[source], &loc, &xattr, NULL); + if (ret < 0) { + loc_wipe (&loc); + return -EIO; + } + + afr_filter_xattrs (xattr); + dict_del (xattr, GF_SELINUX_XATTR_KEY); + + for (i = 0; i < priv->child_count; i++) { + if (!healed_sinks[i]) + continue; + + ret = syncop_setattr (priv->children[i], &loc, + &locked_replies[source].poststat, + AFR_HEAL_ATTR, NULL, NULL); + if (ret) + healed_sinks[i] = 0; + + old_xattr = NULL; + ret = syncop_getxattr (priv->children[i], &loc, &old_xattr, 0); + if (old_xattr) { + dict_del (old_xattr, GF_SELINUX_XATTR_KEY); + afr_filter_xattrs (old_xattr); + ret = syncop_removexattr (priv->children[i], &loc, "", + old_xattr); + } + + ret = syncop_setxattr (priv->children[i], &loc, xattr, 0); + if (ret) + healed_sinks[i] = 0; + } + + loc_wipe (&loc); + if (xattr) + dict_unref (xattr); + + return 0; } -int -afr_sh_metadata_fix (call_frame_t *frame, xlator_t *this) +/* + * Look for mismatching uid/gid or mode even if xattrs don't say so, and + * pick one arbitrarily as winner. + */ + +static int +__afr_selfheal_metadata_finalize_source (xlator_t *this, unsigned char *sources, + unsigned char *sinks, + unsigned char *locked_on, + struct afr_reply *replies) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int nsources = 0; - int source = 0; - int i = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - nsources = afr_build_sources (this, sh->xattr, sh->buf, - sh->pending_matrix, sh->sources, - sh->success_children, - AFR_METADATA_TRANSACTION); - if (nsources == 0) { - gf_log (this->name, GF_LOG_TRACE, - "No self-heal needed for %s", - local->loc.path); - - afr_sh_metadata_finish (frame, this); - return 0; - } - - if ((nsources == -1) - && (priv->favorite_child != -1) - && (sh->child_errno[priv->favorite_child] == 0)) { - - gf_log (this->name, GF_LOG_WARNING, - "Picking favorite child %s as authentic source to resolve conflicting metadata of %s", - priv->children[priv->favorite_child]->name, - local->loc.path); - - sh->sources[priv->favorite_child] = 1; - - nsources = afr_sh_source_count (sh->sources, - priv->child_count); - } - - if (nsources == -1) { - gf_log (this->name, GF_LOG_ERROR, - "Unable to self-heal permissions/ownership of '%s' " - "(possible split-brain). Please fix the file on " - "all backend volumes", local->loc.path); - - local->govinda_gOvinda = 1; - - afr_sh_metadata_finish (frame, this); - return 0; - } - - source = afr_sh_select_source (sh->sources, priv->child_count); - - if (source == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "No active sources found."); - - afr_sh_metadata_finish (frame, this); - return 0; - } - - sh->source = source; - - /* detect changes not visible through pending flags -- JIC */ - for (i = 0; i < priv->child_count; i++) { - if (i == source || sh->child_errno[i]) - continue; - - if (PERMISSION_DIFFERS (&sh->buf[i], &sh->buf[source])) - sh->sources[i] = 0; - - if (OWNERSHIP_DIFFERS (&sh->buf[i], &sh->buf[source])) - sh->sources[i] = 0; - } - - if ((!IA_ISREG (sh->buf[source].ia_type)) && - (!IA_ISDIR (sh->buf[source].ia_type))) { - afr_reset_children (sh->fresh_children, - priv->child_count); - afr_get_fresh_children (sh->success_children, sh->sources, - sh->fresh_children, priv->child_count); - afr_inode_set_read_ctx (this, sh->inode, sh->source, - sh->fresh_children); - } - - afr_sh_metadata_sync_prepare (frame, this); - - return 0; + int i = 0; + afr_private_t *priv = NULL; + struct iatt first = {0, }; + int source = -1; + int locked_count = 0; + int sources_count = 0; + int sinks_count = 0; + + priv = this->private; + + locked_count = AFR_COUNT (locked_on, priv->child_count); + sources_count = AFR_COUNT (sources, priv->child_count); + sinks_count = AFR_COUNT (sinks, priv->child_count); + + if (locked_count == sinks_count || !sources_count) { + if (!priv->metadata_splitbrain_forced_heal) { + return -EIO; + } + /* Metadata split brain, select one subvol + arbitrarily */ + for (i = 0; i < priv->child_count; i++) { + if (locked_on[i] && sinks[i]) { + sources[i] = 1; + sinks[i] = 0; + break; + } + } + } + + for (i = 0; i < priv->child_count; i++) { + if (!sources[i]) + continue; + if (source == -1) { + source = i; + first = replies[i].poststat; + } + } + + for (i = 0; i < priv->child_count; i++) { + if (!sources[i]) + continue; + if (!IA_EQUAL (first, replies[i].poststat, type) || + !IA_EQUAL (first, replies[i].poststat, uid) || + !IA_EQUAL (first, replies[i].poststat, gid) || + !IA_EQUAL (first, replies[i].poststat, prot)) { + sources[i] = 0; + sinks[i] = 1; + } + } + + return source; } -int -afr_sh_metadata_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *buf, dict_t *xattr, - struct iatt *postparent) +static int +__afr_selfheal_metadata_prepare (call_frame_t *frame, xlator_t *this, inode_t *inode, + unsigned char *locked_on, unsigned char *sources, + unsigned char *sinks, unsigned char *healed_sinks, + struct afr_reply *replies) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int call_count = 0; - int child_index = 0; - - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - child_index = (long) cookie; - - LOCK (&frame->lock); - { - if (op_ret == 0) { - gf_log (this->name, GF_LOG_TRACE, - "path %s on subvolume %s is of mode 0%o", - local->loc.path, - priv->children[child_index]->name, - buf->ia_type); - - sh->buf[child_index] = *buf; - if (xattr) - sh->xattr[child_index] = dict_ref (xattr); - sh->success_children[sh->success_count] = child_index; - sh->success_count++; - } else { - gf_log (this->name, GF_LOG_INFO, - "path %s on subvolume %s => -1 (%s)", - local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - - sh->child_errno[child_index] = op_errno; - } - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) - afr_sh_metadata_fix (frame, this); - - return 0; + int ret = -1; + int source = -1; + afr_private_t *priv = NULL; + int i = 0; + + priv = this->private; + + ret = afr_selfheal_unlocked_discover (frame, inode, inode->gfid, + replies); + if (ret) + return ret; + + ret = afr_selfheal_find_direction (frame, this, replies, + AFR_METADATA_TRANSACTION, + locked_on, sources, sinks); + if (ret) + return ret; + + source = __afr_selfheal_metadata_finalize_source (this, sources, sinks, + locked_on, replies); + if (source < 0) + return -EIO; + + for (i = 0; i < priv->child_count; i++) + /* Initialize the healed_sinks[] array optimistically to + the intersection of to-be-healed (i.e sinks[]) and + the list of servers which are up (i.e locked_on[]). + + As we encounter failures in the healing process, we + will unmark the respective servers in the healed_sinks[] + array. + */ + healed_sinks[i] = sinks[i] && locked_on[i]; + + return source; } -int -afr_sh_metadata_post_nonblocking_inodelk_cbk (call_frame_t *frame, - xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - - local = frame->local; - int_lock = &local->internal_lock; - - if (int_lock->lock_op_ret < 0) { - gf_log (this->name, GF_LOG_ERROR, "Non Blocking metadata " - "inodelks failed for %s.", local->loc.path); - gf_log (this->name, GF_LOG_ERROR, "Metadata self-heal " - "failed for %s.", local->loc.path); - afr_sh_metadata_done (frame, this); - } else { - - gf_log (this->name, GF_LOG_DEBUG, "Non Blocking metadata " - "inodelks done for %s. Proceeding to FOP", - local->loc.path); - afr_sh_common_lookup (frame, this, &local->loc, - afr_sh_metadata_lookup_cbk, _gf_false); - } - - return 0; -} -int -afr_sh_metadata_lock (call_frame_t *frame, xlator_t *this) +static int +__afr_selfheal_metadata (call_frame_t *frame, xlator_t *this, inode_t *inode, + unsigned char *locked_on) { - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - - local = frame->local; - int_lock = &local->internal_lock; - - int_lock->transaction_lk_type = AFR_SELFHEAL_LK; - int_lock->selfheal_lk_type = AFR_METADATA_SELF_HEAL_LK; - - afr_set_lock_number (frame, this); - - int_lock->lk_flock.l_start = 0; - int_lock->lk_flock.l_len = 0; - int_lock->lk_flock.l_type = F_WRLCK; - int_lock->lock_cbk = afr_sh_metadata_post_nonblocking_inodelk_cbk; - - afr_nonblocking_inodelk (frame, this); - - return 0; + afr_private_t *priv = NULL; + int ret = -1; + unsigned char *sources = NULL; + unsigned char *sinks = NULL; + unsigned char *data_lock = NULL; + unsigned char *healed_sinks = NULL; + struct afr_reply *locked_replies = NULL; + int source = -1; + + priv = this->private; + + sources = alloca0 (priv->child_count); + sinks = alloca0 (priv->child_count); + healed_sinks = alloca0 (priv->child_count); + data_lock = alloca0 (priv->child_count); + + locked_replies = alloca0 (sizeof (*locked_replies) * priv->child_count); + + ret = afr_selfheal_inodelk (frame, this, inode, this->name, + LLONG_MAX - 1, 0, data_lock); + { + if (ret < 2) { + ret = -ENOTCONN; + goto unlock; + } + + ret = __afr_selfheal_metadata_prepare (frame, this, inode, data_lock, + sources, sinks, healed_sinks, + locked_replies); + if (ret < 0) + goto unlock; + + source = ret; + ret = 0; + } +unlock: + afr_selfheal_uninodelk (frame, this, inode, this->name, + LLONG_MAX -1, 0, data_lock); + if (ret < 0) + goto out; + + ret = afr_selfheal_metadata_do (frame, this, inode, source, healed_sinks, + locked_replies); + if (ret) + goto out; + + ret = afr_selfheal_undo_pending (frame, this, inode, sources, sinks, + healed_sinks, AFR_METADATA_TRANSACTION, + locked_replies, data_lock); +out: + return ret; } int -afr_self_heal_metadata (call_frame_t *frame, xlator_t *this) +afr_selfheal_metadata (call_frame_t *frame, xlator_t *this, inode_t *inode) { - afr_local_t *local = NULL; - afr_private_t *priv = this->private; - - - local = frame->local; - - if (local->self_heal.need_metadata_self_heal && priv->metadata_self_heal) { - afr_sh_metadata_lock (frame, this); - } else { - afr_sh_metadata_done (frame, this); - } - - return 0; + afr_private_t *priv = NULL; + unsigned char *locked_on = NULL; + int ret = 0; + + priv = this->private; + + locked_on = alloca0 (priv->child_count); + + ret = afr_selfheal_tryinodelk (frame, this, inode, priv->sh_domain, 0, 0, + locked_on); + { + if (ret < 2) { + /* Either less than two subvols available, or another + selfheal (from another server) is in progress. Skip + for now in any case there isn't anything to do. + */ + ret = -ENOTCONN; + goto unlock; + } + + ret = __afr_selfheal_metadata (frame, this, inode, locked_on); + } +unlock: + afr_selfheal_uninodelk (frame, this, inode, priv->sh_domain, 0, 0, locked_on); + + return ret; } diff --git a/xlators/cluster/afr/src/afr-self-heal-name.c b/xlators/cluster/afr/src/afr-self-heal-name.c new file mode 100644 index 000000000..ce80b8da3 --- /dev/null +++ b/xlators/cluster/afr/src/afr-self-heal-name.c @@ -0,0 +1,457 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "afr.h" +#include "afr-self-heal.h" + + +int +__afr_selfheal_assign_gfid (call_frame_t *frame, xlator_t *this, inode_t *parent, + uuid_t pargfid, const char *bname, inode_t *inode, + struct afr_reply *replies, int gfid_idx) +{ + int i = 0; + afr_private_t *priv = NULL; + dict_t *xdata = NULL; + int ret = 0; + loc_t loc = {0, }; + + priv = this->private; + + uuid_copy (parent->gfid, pargfid); + + xdata = dict_new (); + if (!xdata) { + return -ENOMEM; + } + + ret = dict_set_static_bin (xdata, "gfid-req", + replies[gfid_idx].poststat.ia_gfid, 16); + if (ret) { + dict_destroy (xdata); + return -ENOMEM; + } + + loc.parent = inode_ref (parent); + loc.inode = inode_ref (inode); + uuid_copy (loc.pargfid, pargfid); + loc.name = bname; + + for (i = 0; i < priv->child_count; i++) { + if (replies[i].op_ret == 0 || replies[i].op_errno != ENODATA) + continue; + + ret = syncop_lookup (priv->children[i], &loc, xdata, 0, 0, 0); + } + + loc_wipe (&loc); + dict_unref (xdata); + + return ret; +} + + +int +__afr_selfheal_name_impunge (call_frame_t *frame, xlator_t *this, inode_t *parent, + uuid_t pargfid, const char *bname, inode_t *inode, + struct afr_reply *replies, int gfid_idx) +{ + int i = 0; + afr_private_t *priv = NULL; + int ret = 0; + + priv = this->private; + + uuid_copy (parent->gfid, pargfid); + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) + continue; + + if (uuid_compare (replies[i].poststat.ia_gfid, + replies[gfid_idx].poststat.ia_gfid) == 0) + continue; + + ret |= afr_selfheal_recreate_entry (frame, this, i, gfid_idx, + parent, bname, inode, replies); + } + + return ret; +} + + +int +__afr_selfheal_name_expunge (call_frame_t *frame, xlator_t *this, inode_t *parent, + uuid_t pargfid, const char *bname, inode_t *inode, + struct afr_reply *replies) +{ + loc_t loc = {0, }; + int i = 0; + afr_private_t *priv = NULL; + char g[64]; + int ret = 0; + + priv = this->private; + + loc.parent = inode_ref (parent); + uuid_copy (loc.pargfid, pargfid); + loc.name = bname; + loc.inode = inode_ref (inode); + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) + continue; + + if (replies[i].op_ret) + continue; + + switch (replies[i].poststat.ia_type) { + case IA_IFDIR: + gf_log (this->name, GF_LOG_WARNING, + "expunging dir %s/%s (%s) on %s", + uuid_utoa (pargfid), bname, + uuid_utoa_r (replies[i].poststat.ia_gfid, g), + priv->children[i]->name); + ret |= syncop_rmdir (priv->children[i], &loc, 1); + break; + default: + gf_log (this->name, GF_LOG_WARNING, + "expunging file %s/%s (%s) on %s", + uuid_utoa (pargfid), bname, + uuid_utoa_r (replies[i].poststat.ia_gfid, g), + priv->children[i]->name); + ret |= syncop_unlink (priv->children[i], &loc); + break; + } + } + + loc_wipe (&loc); + + return ret; + +} + + +int +__afr_selfheal_name_do (call_frame_t *frame, xlator_t *this, inode_t *parent, + uuid_t pargfid, const char *bname, inode_t *inode, + unsigned char *sources, unsigned char *sinks, + unsigned char *healed_sinks, int source, + unsigned char *locked_on, struct afr_reply *replies) +{ + int i = 0; + afr_private_t *priv = NULL; + uuid_t gfid = {0, }; + int gfid_idx = -1; + gf_boolean_t source_is_empty = _gf_true; + gf_boolean_t need_heal = _gf_false; + int first_idx = -1; + char g1[64],g2[64]; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) + continue; + + if (first_idx == -1) { + first_idx = i; + continue; + } + + if (replies[i].op_ret != replies[first_idx].op_ret) + need_heal = _gf_true; + + if (uuid_compare (replies[i].poststat.ia_gfid, + replies[first_idx].poststat.ia_gfid)) + need_heal = _gf_true; + } + + if (!need_heal) + return 0; + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) + continue; + + if (!replies[i].op_ret && (source == -1 || sources[i])) { + source_is_empty = _gf_false; + break; + } + } + + if (source_is_empty) { + return __afr_selfheal_name_expunge (frame, this, parent, pargfid, + bname, inode, replies); + } + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) + continue; + + if (uuid_is_null (replies[i].poststat.ia_gfid)) + continue; + + if (uuid_is_null (gfid)) { + uuid_copy (gfid, replies[i].poststat.ia_gfid); + gfid_idx = i; + continue; + } + + if (sources[i] || source == -1) { + if (gfid_idx != -1 && + (sources[gfid_idx] || source == -1) && + uuid_compare (gfid, replies[i].poststat.ia_gfid)) { + gf_log (this->name, GF_LOG_WARNING, + "GFID mismatch for <gfid:%s>/%s " + "%s on %s and %s on %s", + uuid_utoa (pargfid), bname, + uuid_utoa_r (replies[i].poststat.ia_gfid, g1), + priv->children[i]->name, + uuid_utoa_r (replies[gfid_idx].poststat.ia_gfid, g2), + priv->children[gfid_idx]->name); + return -1; + } + + uuid_copy (gfid, replies[i].poststat.ia_gfid); + gfid_idx = i; + continue; + } + } + + if (gfid_idx == -1) + return -1; + + __afr_selfheal_assign_gfid (frame, this, parent, pargfid, bname, inode, + replies, gfid_idx); + + return __afr_selfheal_name_impunge (frame, this, parent, pargfid, + bname, inode, replies, gfid_idx); +} + + +int +__afr_selfheal_name_finalize_source (xlator_t *this, unsigned char *sources, + unsigned char *sinks, unsigned char *locked_on, + struct afr_reply *replies) +{ + int i = 0; + afr_private_t *priv = NULL; + int source = -1; + int locked_count = 0; + int sources_count = 0; + int sinks_count = 0; + + priv = this->private; + + locked_count = AFR_COUNT (locked_on, priv->child_count); + sources_count = AFR_COUNT (sources, priv->child_count); + sinks_count = AFR_COUNT (sinks, priv->child_count); + + if (locked_count == sinks_count || !sources_count) { + return -1; + } + + for (i = 0; i < priv->child_count; i++) { + if (sources[i]) { + source = i; + break; + } + } + + return source; +} + + +int +__afr_selfheal_name_prepare (call_frame_t *frame, xlator_t *this, inode_t *parent, + uuid_t pargfid, unsigned char *locked_on, + unsigned char *sources, unsigned char *sinks, + unsigned char *healed_sinks, struct afr_reply *replies, + int *source_p) +{ + int ret = -1; + int source = -1; + afr_private_t *priv = NULL; + int i = 0; + + priv = this->private; + + ret = afr_selfheal_unlocked_discover (frame, parent, pargfid, replies); + if (ret) + return ret; + + ret = afr_selfheal_find_direction (frame, this, replies, + AFR_ENTRY_TRANSACTION, + locked_on, sources, sinks); + if (ret) + return ret; + + source = __afr_selfheal_name_finalize_source (this, sources, sinks, + locked_on, replies); + if (source < 0) { + /* If source is < 0 (typically split-brain), we perform a + conservative merge of entries rather than erroring out */ + } + *source_p = source; + + for (i = 0; i < priv->child_count; i++) + /* Initialize the healed_sinks[] array optimistically to + the intersection of to-be-healed (i.e sinks[]) and + the list of servers which are up (i.e locked_on[]). + + As we encounter failures in the healing process, we + will unmark the respective servers in the healed_sinks[] + array. + */ + healed_sinks[i] = sinks[i] && locked_on[i]; + + return ret; +} + + +int +afr_selfheal_name_do (call_frame_t *frame, xlator_t *this, inode_t *parent, + uuid_t pargfid, const char *bname) +{ + afr_private_t *priv = NULL; + unsigned char *sources = NULL; + unsigned char *sinks = NULL; + unsigned char *healed_sinks = NULL; + unsigned char *locked_on = NULL; + int source = -1; + struct afr_reply *replies = NULL; + int ret = -1; + inode_t *inode = NULL; + + priv = this->private; + + locked_on = alloca0 (priv->child_count); + sources = alloca0 (priv->child_count); + sinks = alloca0 (priv->child_count); + healed_sinks = alloca0 (priv->child_count); + + replies = alloca0 (priv->child_count * sizeof(*replies)); + + ret = afr_selfheal_entrylk (frame, this, parent, this->name, bname, + locked_on); + { + if (ret < 2) { + ret = -ENOTCONN; + goto unlock; + } + + ret = __afr_selfheal_name_prepare (frame, this, parent, pargfid, + locked_on, sources, sinks, + healed_sinks, replies, + &source); + if (ret) + goto unlock; + + inode = afr_selfheal_unlocked_lookup_on (frame, parent, bname, + replies, locked_on); + if (!inode) { + ret = -ENOMEM; + goto unlock; + } + + ret = __afr_selfheal_name_do (frame, this, parent, pargfid, bname, + inode, sources, sinks, healed_sinks, + source, locked_on, replies); + } +unlock: + afr_selfheal_unentrylk (frame, this, parent, this->name, bname, + locked_on); + if (inode) + inode_unref (inode); + + return ret; +} + + +int +afr_selfheal_name_unlocked_inspect (call_frame_t *frame, xlator_t *this, + inode_t *parent, uuid_t pargfid, + const char *bname, gf_boolean_t *need_heal) +{ + afr_private_t *priv = NULL; + int i = 0; + struct afr_reply *replies = NULL; + inode_t *inode = NULL; + int first_idx = -1; + + priv = this->private; + + replies = alloca0 (sizeof (*replies) * priv->child_count); + + inode = afr_selfheal_unlocked_lookup_on (frame, parent, bname, + replies, priv->child_up); + if (!inode) + return -ENOMEM; + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) + continue; + + if (first_idx == -1) { + first_idx = i; + continue; + } + + if (replies[i].op_ret != replies[first_idx].op_ret) + *need_heal = _gf_true; + + if (uuid_compare (replies[i].poststat.ia_gfid, + replies[first_idx].poststat.ia_gfid)) + *need_heal = _gf_true; + } + + if (inode) + inode_unref (inode); + return 0; +} + +int +afr_selfheal_name (xlator_t *this, uuid_t pargfid, const char *bname) +{ + inode_t *parent = NULL; + call_frame_t *frame = NULL; + int ret = -1; + gf_boolean_t need_heal = _gf_false; + + parent = afr_inode_find (this, pargfid); + if (!parent) + goto out; + + frame = afr_frame_create (this); + if (!frame) + goto out; + + ret = afr_selfheal_name_unlocked_inspect (frame, this, parent, pargfid, + bname, &need_heal); + if (ret) + goto out; + + if (need_heal) + afr_selfheal_name_do (frame, this, parent, pargfid, bname); +out: + if (parent) + inode_unref (parent); + if (frame) + AFR_STACK_DESTROY (frame); + + return ret; +} diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h index b2df92821..a1b972ac3 100644 --- a/xlators/cluster/afr/src/afr-self-heal.h +++ b/xlators/cluster/afr/src/afr-self-heal.h @@ -1,58 +1,167 @@ /* - Copyright (c) 2008-2010 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ -#ifndef __AFR_SELF_HEAL_H__ -#define __AFR_SELF_HEAL_H__ -#include <sys/stat.h> +#ifndef _AFR_SELFHEAL_H +#define _AFR_SELFHEAL_H + + +/* Perform fop on all UP subvolumes and wait for all callbacks to return */ + +#define AFR_ONALL(frame, rfn, fop, args ...) do { \ + afr_local_t *__local = frame->local; \ + afr_private_t *__priv = frame->this->private; \ + int __i = 0, __count = 0; \ + \ + afr_replies_wipe (__local, __priv); \ + \ + for (__i = 0; __i < __priv->child_count; __i++) { \ + if (!__priv->child_up[__i]) continue; \ + STACK_WIND_COOKIE (frame, rfn, (void *)(long) __i, \ + __priv->children[__i], \ + __priv->children[__i]->fops->fop, args); \ + __count++; \ + } \ + syncbarrier_wait (&__local->barrier, __count); \ + } while (0) + + +/* Perform fop on all subvolumes represented by list[] array and wait + for all callbacks to return */ + +#define AFR_ONLIST(list, frame, rfn, fop, args ...) do { \ + afr_local_t *__local = frame->local; \ + afr_private_t *__priv = frame->this->private; \ + int __i = 0, __count = 0; \ + \ + afr_replies_wipe (__local, __priv); \ + \ + for (__i = 0; __i < __priv->child_count; __i++) { \ + if (!list[__i]) continue; \ + STACK_WIND_COOKIE (frame, rfn, (void *)(long) __i, \ + __priv->children[__i], \ + __priv->children[__i]->fops->fop, args); \ + __count++; \ + } \ + syncbarrier_wait (&__local->barrier, __count); \ + } while (0) + + +#define AFR_SEQ(frame, rfn, fop, args ...) do { \ + afr_local_t *__local = frame->local; \ + afr_private_t *__priv = frame->this->private; \ + int __i = 0; \ + \ + afr_replies_wipe (__local, __priv); \ + \ + for (__i = 0; __i < __priv->child_count; __i++) { \ + if (!__priv->child_up[__i]) continue; \ + STACK_WIND_COOKIE (frame, rfn, (void *)(long) __i, \ + __priv->children[__i], \ + __priv->children[__i]->fops->fop, args); \ + syncbarrier_wait (&__local->barrier, 1); \ + } \ + } while (0) -#define FILETYPE_DIFFERS(buf1,buf2) ((buf1)->ia_type != (buf2)->ia_type) -#define PERMISSION_DIFFERS(buf1,buf2) (st_mode_from_ia ((buf1)->ia_prot, (buf1)->ia_type) != st_mode_from_ia ((buf2)->ia_prot, (buf2)->ia_type)) -#define OWNERSHIP_DIFFERS(buf1,buf2) (((buf1)->ia_uid != (buf2)->ia_uid) || ((buf1)->ia_gid != (buf2)->ia_gid)) -#define SIZE_DIFFERS(buf1,buf2) ((buf1)->ia_size != (buf2)->ia_size) -#define SIZE_GREATER(buf1,buf2) ((buf1)->ia_size > (buf2)->ia_size) +#define ALLOC_MATRIX(n, type) ({type **__ptr = NULL; \ + int __i; \ + __ptr = alloca0 (n * sizeof(type *)); \ + for (__i = 0; __i < n; __i++) __ptr[__i] = alloca0 (n * sizeof(type)); \ + __ptr;}) + + +#define IA_EQUAL(f,s,field) (memcmp (&(f.ia_##field), &(s.ia_##field), sizeof (s.ia_##field)) == 0) + int -afr_sh_has_metadata_pending (dict_t *xattr, xlator_t *this); +afr_selfheal (xlator_t *this, uuid_t gfid); + int -afr_sh_has_entry_pending (dict_t *xattr, xlator_t *this); +afr_selfheal_name (xlator_t *this, uuid_t gfid, const char *name); + int -afr_sh_has_data_pending (dict_t *xattr, xlator_t *this); +afr_selfheal_data (call_frame_t *frame, xlator_t *this, inode_t *inode); int -afr_self_heal_entry (call_frame_t *frame, xlator_t *this); +afr_selfheal_metadata (call_frame_t *frame, xlator_t *this, inode_t *inode); int -afr_self_heal_data (call_frame_t *frame, xlator_t *this); +afr_selfheal_entry (call_frame_t *frame, xlator_t *this, inode_t *inode); + + +int +afr_selfheal_inodelk (call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, off_t off, size_t size, + unsigned char *locked_on); int -afr_self_heal_metadata (call_frame_t *frame, xlator_t *this); +afr_selfheal_tryinodelk (call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, off_t off, size_t size, + unsigned char *locked_on); int -afr_self_heal_get_source (xlator_t *this, afr_local_t *local, dict_t **xattr); +afr_selfheal_uninodelk (call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, off_t off, size_t size, + const unsigned char *locked_on); int -afr_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode); +afr_selfheal_entrylk (call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, const char *name, unsigned char *locked_on); int -afr_lookup_select_read_child_by_txn_type (xlator_t *this, afr_local_t *local, - dict_t **xattr, - afr_transaction_type txn_type); -#endif /* __AFR_SELF_HEAL_H__ */ +afr_selfheal_tryentrylk (call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, const char *name, unsigned char *locked_on); + +int +afr_selfheal_unentrylk (call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, const char *name, unsigned char *locked_on); + +int +afr_selfheal_unlocked_discover (call_frame_t *frame, inode_t *inode, + uuid_t gfid, struct afr_reply *replies); + +inode_t * +afr_selfheal_unlocked_lookup_on (call_frame_t *frame, inode_t *parent, + const char *name, struct afr_reply *replies, + unsigned char *lookup_on); + +int +afr_selfheal_find_direction (call_frame_t *frame, xlator_t *this, + struct afr_reply *replies, + afr_transaction_type type, unsigned char *locked_on, + unsigned char *sources, unsigned char *sinks); + +int +afr_selfheal_extract_xattr (xlator_t *this, struct afr_reply *replies, + afr_transaction_type type, int *dirty, int **matrix); + +int +afr_selfheal_undo_pending (call_frame_t *frame, xlator_t *this, inode_t *inode, + unsigned char *sources, unsigned char *sinks, + unsigned char *healed_sinks, afr_transaction_type type, + struct afr_reply *replies, unsigned char *locked_on); + +int +afr_selfheal_recreate_entry (call_frame_t *frame, xlator_t *this, int dst, + int source, inode_t *dir, const char *name, + inode_t *inode, struct afr_reply *replies); + +int +afr_selfheal_post_op (call_frame_t *frame, xlator_t *this, inode_t *inode, + int subvol, dict_t *xattr); + +call_frame_t * +afr_frame_create (xlator_t *this); + +inode_t * +afr_inode_find (xlator_t *this, uuid_t gfid); + +#endif /* !_AFR_SELFHEAL_H */ diff --git a/xlators/cluster/afr/src/afr-self-heald.c b/xlators/cluster/afr/src/afr-self-heald.c new file mode 100644 index 000000000..4bfe909bc --- /dev/null +++ b/xlators/cluster/afr/src/afr-self-heald.c @@ -0,0 +1,1256 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "afr.h" +#include "afr-self-heal.h" +#include "afr-self-heald.h" +#include "protocol-common.h" + +#define SHD_INODE_LRU_LIMIT 2048 +#define AFR_EH_HEALED_LIMIT 1024 +#define AFR_EH_HEAL_FAIL_LIMIT 1024 +#define AFR_EH_SPLIT_BRAIN_LIMIT 1024 +#define AFR_STATISTICS_HISTORY_SIZE 50 + + +#define ASSERT_LOCAL(this, healer) \ + if (!afr_shd_is_subvol_local(this, healer->subvol)) { \ + healer->local = _gf_false; \ + if (safe_break (healer)) { \ + break; \ + } else { \ + continue; \ + } \ + } else { \ + healer->local = _gf_true; \ + } + + +#define NTH_INDEX_HEALER(this, n) &((((afr_private_t *)this->private))->shd.index_healers[n]) +#define NTH_FULL_HEALER(this, n) &((((afr_private_t *)this->private))->shd.full_healers[n]) + +int afr_shd_gfid_to_path (xlator_t *this, xlator_t *subvol, uuid_t gfid, char **path_p); + +char * +afr_subvol_name (xlator_t *this, int subvol) +{ + afr_private_t *priv = NULL; + + priv = this->private; + if (subvol < 0 || subvol > priv->child_count) + return NULL; + + return priv->children[subvol]->name; +} + + +void +afr_destroy_crawl_event_data (void *data) +{ + return; +} + + +void +afr_destroy_shd_event_data (void *data) +{ + shd_event_t *shd_event = data; + + if (!shd_event) + return; + GF_FREE (shd_event->path); + + return; +} + + +gf_boolean_t +afr_shd_is_subvol_local (xlator_t *this, int subvol) +{ + char *pathinfo = NULL; + afr_private_t *priv = NULL; + dict_t *xattr = NULL; + int ret = 0; + gf_boolean_t is_local = _gf_false; + loc_t loc = {0, }; + + priv = this->private; + + loc.inode = this->itable->root; + uuid_copy (loc.gfid, loc.inode->gfid); + + ret = syncop_getxattr (priv->children[subvol], &loc, &xattr, + GF_XATTR_PATHINFO_KEY); + if (ret) + return _gf_false; + if (!xattr) + return _gf_false; + + ret = dict_get_str (xattr, GF_XATTR_PATHINFO_KEY, &pathinfo); + if (ret) + return _gf_false; + + afr_local_pathinfo (pathinfo, &is_local); + + gf_log (this->name, GF_LOG_DEBUG, "subvol %s is %slocal", + priv->children[subvol]->name, is_local? "" : "not "); + + return is_local; +} + + +int +__afr_shd_healer_wait (struct subvol_healer *healer) +{ + afr_private_t *priv = NULL; + struct timespec wait_till = {0, }; + int ret = 0; + + priv = healer->this->private; + +disabled_loop: + wait_till.tv_sec = time (NULL) + 60; + + while (!healer->rerun) { + ret = pthread_cond_timedwait (&healer->cond, + &healer->mutex, + &wait_till); + if (ret == ETIMEDOUT) + break; + } + + ret = healer->rerun; + healer->rerun = 0; + + if (!priv->shd.enabled) + goto disabled_loop; + + return ret; +} + + +int +afr_shd_healer_wait (struct subvol_healer *healer) +{ + int ret = 0; + + pthread_mutex_lock (&healer->mutex); + { + ret = __afr_shd_healer_wait (healer); + } + pthread_mutex_unlock (&healer->mutex); + + return ret; +} + + +gf_boolean_t +safe_break (struct subvol_healer *healer) +{ + gf_boolean_t ret = _gf_false; + + pthread_mutex_lock (&healer->mutex); + { + if (healer->rerun) + goto unlock; + + healer->running = _gf_false; + ret = _gf_true; + } +unlock: + pthread_mutex_unlock (&healer->mutex); + + return ret; +} + + +inode_t * +afr_shd_inode_find (xlator_t *this, xlator_t *subvol, uuid_t gfid) +{ + inode_t *inode = NULL; + int ret = 0; + loc_t loc = {0, }; + struct iatt iatt = {0, }; + + inode = inode_find (this->itable, gfid); + if (inode) + goto out; + + loc.inode = inode_new (this->itable); + if (!loc.inode) + goto out; + uuid_copy (loc.gfid, gfid); + + ret = syncop_lookup (subvol, &loc, NULL, &iatt, NULL, NULL); + if (ret < 0) + goto out; + + inode = inode_link (loc.inode, NULL, NULL, &iatt); + if (inode) + inode_lookup (inode); +out: + loc_wipe (&loc); + return inode; +} + + +fd_t * +afr_shd_index_opendir (xlator_t *this, int child) +{ + fd_t *fd = NULL; + afr_private_t *priv = NULL; + xlator_t *subvol = NULL; + loc_t rootloc = {0, }; + inode_t *inode = NULL; + int ret = 0; + dict_t *xattr = NULL; + void *index_gfid = NULL; + + priv = this->private; + subvol = priv->children[child]; + + rootloc.inode = inode_ref (this->itable->root); + uuid_copy (rootloc.gfid, rootloc.inode->gfid); + + ret = syncop_getxattr (subvol, &rootloc, &xattr, + GF_XATTROP_INDEX_GFID); + if (ret || !xattr) { + errno = -ret; + goto out; + } + + ret = dict_get_ptr (xattr, GF_XATTROP_INDEX_GFID, &index_gfid); + if (ret) + goto out; + + gf_log (this->name, GF_LOG_DEBUG, "index-dir gfid for %s: %s", + subvol->name, uuid_utoa (index_gfid)); + + inode = afr_shd_inode_find (this, subvol, index_gfid); + if (!inode) + goto out; + fd = fd_anonymous (inode); +out: + loc_wipe (&rootloc); + if (xattr) + dict_unref (xattr); + return fd; +} + + +int +afr_shd_index_purge (xlator_t *subvol, inode_t *inode, char *name) +{ + loc_t loc = {0, }; + int ret = 0; + + loc.parent = inode_ref (inode); + loc.name = name; + + ret = syncop_unlink (subvol, &loc); + + loc_wipe (&loc); + return ret; +} + + +int +afr_shd_selfheal_name (struct subvol_healer *healer, int child, uuid_t parent, + const char *bname) +{ + int ret = -1; + + ret = afr_selfheal_name (THIS, parent, bname); + + return ret; +} + +int +afr_shd_selfheal (struct subvol_healer *healer, int child, uuid_t gfid) +{ + int ret = 0; + eh_t *eh = NULL; + afr_private_t *priv = NULL; + afr_self_heald_t *shd = NULL; + shd_event_t *shd_event = NULL; + char *path = NULL; + xlator_t *subvol = NULL; + xlator_t *this = NULL; + crawl_event_t *crawl_event = NULL; + + this = healer->this; + priv = this->private; + shd = &priv->shd; + crawl_event = &healer->crawl_event; + + subvol = priv->children[child]; + + ret = afr_selfheal (this, gfid); + + if (ret == -EIO) { + eh = shd->split_brain; + crawl_event->split_brain_count++; + } else if (ret < 0) { + eh = shd->heal_failed; + crawl_event->heal_failed_count++; + } else if (ret == 0) { + eh = shd->healed; + crawl_event->healed_count++; + } + + afr_shd_gfid_to_path (this, subvol, gfid, &path); + if (!path) + return ret; + + if (eh) { + shd_event = GF_CALLOC (1, sizeof(*shd_event), + gf_afr_mt_shd_event_t); + if (!shd_event) { + GF_FREE (path); + return ret; + } + + shd_event->child = child; + shd_event->path = path; + + if (eh_save_history (eh, shd_event) < 0) { + GF_FREE (shd_event); + GF_FREE (path); + } + } + return ret; +} + + +void +afr_shd_sweep_prepare (struct subvol_healer *healer) +{ + crawl_event_t *event = NULL; + + event = &healer->crawl_event; + + event->healed_count = 0; + event->split_brain_count = 0; + event->heal_failed_count = 0; + + time (&event->start_time); + event->end_time = 0; +} + + +void +afr_shd_sweep_done (struct subvol_healer *healer) +{ + crawl_event_t *event = NULL; + crawl_event_t *history = NULL; + afr_self_heald_t *shd = NULL; + + event = &healer->crawl_event; + shd = &(((afr_private_t *)healer->this->private)->shd); + + time (&event->end_time); + history = memdup (event, sizeof (*event)); + event->start_time = 0; + + if (!history) + return; + + if (eh_save_history (shd->statistics[healer->subvol], history) < 0) + GF_FREE (history); +} + + +int +afr_shd_index_sweep (struct subvol_healer *healer) +{ + xlator_t *this = NULL; + int child = -1; + fd_t *fd = NULL; + xlator_t *subvol = NULL; + afr_private_t *priv = NULL; + off_t offset = 0; + gf_dirent_t entries; + gf_dirent_t *entry = NULL; + uuid_t gfid; + int ret = 0; + int count = 0; + + this = healer->this; + child = healer->subvol; + priv = this->private; + subvol = priv->children[child]; + + fd = afr_shd_index_opendir (this, child); + if (!fd) { + gf_log (this->name, GF_LOG_WARNING, + "unable to opendir index-dir on %s", subvol->name); + return -errno; + } + + INIT_LIST_HEAD (&entries.list); + + while ((ret = syncop_readdir (subvol, fd, 131072, offset, &entries))) { + if (ret > 0) + ret = 0; + list_for_each_entry (entry, &entries.list, list) { + offset = entry->d_off; + + if (!priv->shd.enabled) { + ret = -EBUSY; + break; + } + + if (!strcmp (entry->d_name, ".") || + !strcmp (entry->d_name, "..")) + continue; + + gf_log (this->name, GF_LOG_DEBUG, "got entry: %s", + entry->d_name); + + ret = uuid_parse (entry->d_name, gfid); + if (ret) + continue; + + ret = afr_shd_selfheal (healer, child, gfid); + if (ret == 0) + count++; + + if (ret == -ENOENT || ret == -ESTALE) { + afr_shd_index_purge (subvol, fd->inode, + entry->d_name); + ret = 0; + } + } + + gf_dirent_free (&entries); + if (ret) + break; + } + + if (fd) + fd_unref (fd); + if (!ret) + ret = count; + return ret; +} + + +int +afr_shd_full_sweep (struct subvol_healer *healer, inode_t *inode) +{ + fd_t *fd = NULL; + xlator_t *this = NULL; + xlator_t *subvol = NULL; + afr_private_t *priv = NULL; + off_t offset = 0; + gf_dirent_t entries; + gf_dirent_t *entry = NULL; + int ret = 0; + + this = healer->this; + priv = this->private; + subvol = priv->children[healer->subvol]; + + fd = fd_anonymous (inode); + if (!fd) + return -errno; + + INIT_LIST_HEAD (&entries.list); + + while ((ret = syncop_readdirp (subvol, fd, 131072, offset, 0, &entries))) { + if (ret < 0) + break; + + ret = gf_link_inodes_from_dirent (this, fd->inode, &entries); + if (ret) + break; + + list_for_each_entry (entry, &entries.list, list) { + offset = entry->d_off; + + if (!priv->shd.enabled) { + ret = -EBUSY; + break; + } + + if (!strcmp (entry->d_name, ".") || + !strcmp (entry->d_name, "..")) + continue; + + afr_shd_selfheal_name (healer, healer->subvol, + inode->gfid, entry->d_name); + + afr_shd_selfheal (healer, healer->subvol, + entry->d_stat.ia_gfid); + + if (entry->d_stat.ia_type == IA_IFDIR) { + ret = afr_shd_full_sweep (healer, entry->inode); + if (ret) + break; + } + } + + gf_dirent_free (&entries); + if (ret) + break; + } + + if (fd) + fd_unref (fd); + return ret; +} + + +void * +afr_shd_index_healer (void *data) +{ + struct subvol_healer *healer = NULL; + xlator_t *this = NULL; + int ret = 0; + + healer = data; + THIS = this = healer->this; + + for (;;) { + afr_shd_healer_wait (healer); + + ASSERT_LOCAL(this, healer); + + do { + gf_log (this->name, GF_LOG_DEBUG, + "starting index sweep on subvol %s", + afr_subvol_name (this, healer->subvol)); + + afr_shd_sweep_prepare (healer); + + ret = afr_shd_index_sweep (healer); + + afr_shd_sweep_done (healer); + /* + As long as at least one gfid was + healed, keep retrying. We may have + just healed a directory and thereby + created entries for other gfids which + could not be healed thus far. + */ + + gf_log (this->name, GF_LOG_DEBUG, + "finished index sweep on subvol %s", + afr_subvol_name (this, healer->subvol)); + /* + Give a pause before retrying to avoid a busy loop + in case the only entry in index is because of + an ongoing I/O. + */ + sleep (1); + } while (ret > 0); + } + + return NULL; +} + + +void * +afr_shd_full_healer (void *data) +{ + struct subvol_healer *healer = NULL; + xlator_t *this = NULL; + int run = 0; + + healer = data; + THIS = this = healer->this; + + for (;;) { + pthread_mutex_lock (&healer->mutex); + { + run = __afr_shd_healer_wait (healer); + if (!run) + healer->running = _gf_false; + } + pthread_mutex_unlock (&healer->mutex); + + if (!run) + break; + + ASSERT_LOCAL(this, healer); + + gf_log (this->name, GF_LOG_INFO, + "starting full sweep on subvol %s", + afr_subvol_name (this, healer->subvol)); + + afr_shd_sweep_prepare (healer); + + afr_shd_full_sweep (healer, this->itable->root); + + afr_shd_sweep_done (healer); + + gf_log (this->name, GF_LOG_INFO, + "finished full sweep on subvol %s", + afr_subvol_name (this, healer->subvol)); + } + + return NULL; +} + + +int +afr_shd_healer_init (xlator_t *this, struct subvol_healer *healer) +{ + int ret = 0; + + ret = pthread_mutex_init (&healer->mutex, NULL); + if (ret) + goto out; + + ret = pthread_cond_init (&healer->cond, NULL); + if (ret) + goto out; + + healer->this = this; + healer->running = _gf_false; + healer->rerun = _gf_false; + healer->local = _gf_false; +out: + return ret; +} + + +int +afr_shd_healer_spawn (xlator_t *this, struct subvol_healer *healer, + void *(threadfn)(void *)) +{ + int ret = 0; + + pthread_mutex_lock (&healer->mutex); + { + if (healer->running) { + pthread_cond_signal (&healer->cond); + } else { + ret = gf_thread_create (&healer->thread, NULL, + threadfn, healer); + if (ret) + goto unlock; + healer->running = 1; + } + + healer->rerun = 1; + } +unlock: + pthread_mutex_unlock (&healer->mutex); + + return ret; +} + + +int +afr_shd_full_healer_spawn (xlator_t *this, int subvol) +{ + return afr_shd_healer_spawn (this, NTH_FULL_HEALER (this, subvol), + afr_shd_full_healer); +} + + +int +afr_shd_index_healer_spawn (xlator_t *this, int subvol) +{ + return afr_shd_healer_spawn (this, NTH_INDEX_HEALER (this, subvol), + afr_shd_index_healer); +} + + +int +afr_shd_dict_add_crawl_event (xlator_t *this, dict_t *output, + crawl_event_t *crawl_event) +{ + int ret = 0; + uint64_t count = 0; + char key[256] = {0}; + int xl_id = 0; + uint64_t healed_count = 0; + uint64_t split_brain_count = 0; + uint64_t heal_failed_count = 0; + char *start_time_str = 0; + char *end_time_str = NULL; + char *crawl_type = NULL; + int progress = -1; + int child = -1; + + child = crawl_event->child; + healed_count = crawl_event->healed_count; + split_brain_count = crawl_event->split_brain_count; + heal_failed_count = crawl_event->heal_failed_count; + crawl_type = crawl_event->crawl_type; + + if (!crawl_event->start_time) + goto out; + + start_time_str = gf_strdup (ctime (&crawl_event->start_time)); + + if (crawl_event->end_time) + end_time_str = gf_strdup (ctime (&crawl_event->end_time)); + + ret = dict_get_int32 (output, this->name, &xl_id); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "xl does not have id"); + goto out; + } + + snprintf (key, sizeof (key), "statistics-%d-%d-count", xl_id, child); + ret = dict_get_uint64 (output, key, &count); + + + snprintf (key, sizeof (key), "statistics_healed_cnt-%d-%d-%"PRIu64, + xl_id, child, count); + ret = dict_set_uint64(output, key, healed_count); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Could not add statistics_healed_count to outout"); + goto out; + } + + snprintf (key, sizeof (key), "statistics_sb_cnt-%d-%d-%"PRIu64, + xl_id, child, count); + ret = dict_set_uint64 (output, key, split_brain_count); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Could not add statistics_split_brain_count to outout"); + goto out; + } + + snprintf (key, sizeof (key), "statistics_crawl_type-%d-%d-%"PRIu64, + xl_id, child, count); + ret = dict_set_str (output, key, crawl_type); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Could not add statistics_crawl_type to output"); + goto out; + } + + snprintf (key, sizeof (key), "statistics_heal_failed_cnt-%d-%d-%"PRIu64, + xl_id, child, count); + ret = dict_set_uint64 (output, key, heal_failed_count); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Could not add statistics_healed_failed_count to outout"); + goto out; + } + + snprintf (key, sizeof (key), "statistics_strt_time-%d-%d-%"PRIu64, + xl_id, child, count); + ret = dict_set_dynstr (output, key, start_time_str); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Could not add statistics_crawl_start_time to outout"); + goto out; + } else { + start_time_str = NULL; + } + + if (!end_time_str) + progress = 1; + else + progress = 0; + + snprintf (key, sizeof (key), "statistics_end_time-%d-%d-%"PRIu64, + xl_id, child, count); + if (!end_time_str) + end_time_str = gf_strdup ("Could not determine the end time"); + ret = dict_set_dynstr (output, key, end_time_str); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Could not add statistics_crawl_end_time to outout"); + goto out; + } else { + end_time_str = NULL; + } + + snprintf (key, sizeof (key), "statistics_inprogress-%d-%d-%"PRIu64, + xl_id, child, count); + + ret = dict_set_int32 (output, key, progress); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Could not add statistics_inprogress to outout"); + goto out; + } + + snprintf (key, sizeof (key), "statistics-%d-%d-count", xl_id, child); + ret = dict_set_uint64 (output, key, count + 1); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Could not increment the counter."); + goto out; + } +out: + GF_FREE (start_time_str); + GF_FREE (end_time_str); + return ret; +} + + +int +afr_shd_dict_add_path (xlator_t *this, dict_t *output, int child, char *path, + struct timeval *tv) +{ + int ret = -1; + uint64_t count = 0; + char key[256] = {0}; + int xl_id = 0; + + ret = dict_get_int32 (output, this->name, &xl_id); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "xl does not have id"); + goto out; + } + + snprintf (key, sizeof (key), "%d-%d-count", xl_id, child); + ret = dict_get_uint64 (output, key, &count); + + snprintf (key, sizeof (key), "%d-%d-%"PRIu64, xl_id, child, count); + ret = dict_set_dynstr (output, key, path); + + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "%s: Could not add to output", + path); + goto out; + } + + if (tv) { + snprintf (key, sizeof (key), "%d-%d-%"PRIu64"-time", xl_id, + child, count); + ret = dict_set_uint32 (output, key, tv->tv_sec); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "%s: Could not set time", + path); + goto out; + } + } + + snprintf (key, sizeof (key), "%d-%d-count", xl_id, child); + + ret = dict_set_uint64 (output, key, count + 1); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Could not increment count"); + goto out; + } + + ret = 0; +out: + return ret; +} + + +int +afr_shd_gfid_to_path (xlator_t *this, xlator_t *subvol, uuid_t gfid, char **path_p) +{ + loc_t loc = {0,}; + char *path = NULL; + dict_t *xattr = NULL; + int ret = 0; + + uuid_copy (loc.gfid, gfid); + loc.inode = inode_new (this->itable); + + ret = syncop_getxattr (subvol, &loc, &xattr, GFID_TO_PATH_KEY); + loc_wipe (&loc); + if (ret) + return ret; + + ret = dict_get_str (xattr, GFID_TO_PATH_KEY, &path); + if (ret || !path) + return -EINVAL; + + *path_p = gf_strdup (path); + if (!*path_p) + return -ENOMEM; + return 0; +} + + +int +afr_shd_gather_index_entries (xlator_t *this, int child, dict_t *output) +{ + fd_t *fd = NULL; + xlator_t *subvol = NULL; + afr_private_t *priv = NULL; + off_t offset = 0; + gf_dirent_t entries; + gf_dirent_t *entry = NULL; + uuid_t gfid; + int ret = 0; + int count = 0; + char *path = NULL; + + priv = this->private; + subvol = priv->children[child]; + + fd = afr_shd_index_opendir (this, child); + if (!fd) { + gf_log (this->name, GF_LOG_WARNING, + "unable to opendir index-dir on %s", subvol->name); + return -errno; + } + + INIT_LIST_HEAD (&entries.list); + + while ((ret = syncop_readdir (subvol, fd, 131072, offset, &entries))) { + if (ret > 0) + ret = 0; + list_for_each_entry (entry, &entries.list, list) { + offset = entry->d_off; + + if (!strcmp (entry->d_name, ".") || + !strcmp (entry->d_name, "..")) + continue; + + gf_log (this->name, GF_LOG_DEBUG, "got entry: %s", + entry->d_name); + + ret = uuid_parse (entry->d_name, gfid); + if (ret) + continue; + + path = NULL; + ret = afr_shd_gfid_to_path (this, subvol, gfid, &path); + + if (ret == -ENOENT || ret == -ESTALE) { + afr_shd_index_purge (subvol, fd->inode, + entry->d_name); + ret = 0; + continue; + } + + ret = afr_shd_dict_add_path (this, output, child, path, + NULL); + } + + gf_dirent_free (&entries); + if (ret) + break; + } + + if (fd) + fd_unref (fd); + if (!ret) + ret = count; + return ret; +} + + +int +afr_add_shd_event (circular_buffer_t *cb, void *data) +{ + dict_t *output = NULL; + xlator_t *this = THIS; + afr_private_t *priv = NULL; + afr_self_heald_t *shd = NULL; + shd_event_t *shd_event = NULL; + char *path = NULL; + + output = data; + priv = this->private; + shd = &priv->shd; + shd_event = cb->data; + + if (!shd->index_healers[shd_event->child].local) + return 0; + + path = gf_strdup (shd_event->path); + if (!path) + return -ENOMEM; + + afr_shd_dict_add_path (this, output, shd_event->child, path, + &cb->tv); + return 0; +} + +int +afr_add_crawl_event (circular_buffer_t *cb, void *data) +{ + dict_t *output = NULL; + xlator_t *this = THIS; + afr_private_t *priv = NULL; + afr_self_heald_t *shd = NULL; + crawl_event_t *crawl_event = NULL; + + output = data; + priv = this->private; + shd = &priv->shd; + crawl_event = cb->data; + + if (!shd->index_healers[crawl_event->child].local) + return 0; + + afr_shd_dict_add_crawl_event (this, output, crawl_event); + + return 0; +} + + +int +afr_selfheal_daemon_init (xlator_t *this) +{ + afr_private_t *priv = NULL; + afr_self_heald_t *shd = NULL; + int ret = -1; + int i = 0; + + priv = this->private; + shd = &priv->shd; + + this->itable = inode_table_new (SHD_INODE_LRU_LIMIT, this); + if (!this->itable) + goto out; + + shd->index_healers = GF_CALLOC (sizeof(*shd->index_healers), + priv->child_count, + gf_afr_mt_subvol_healer_t); + if (!shd->index_healers) + goto out; + + for (i = 0; i < priv->child_count; i++) { + shd->index_healers[i].subvol = i; + ret = afr_shd_healer_init (this, &shd->index_healers[i]); + if (ret) + goto out; + } + + shd->full_healers = GF_CALLOC (sizeof(*shd->full_healers), + priv->child_count, + gf_afr_mt_subvol_healer_t); + if (!shd->full_healers) + goto out; + for (i = 0; i < priv->child_count; i++) { + shd->full_healers[i].subvol = i; + ret = afr_shd_healer_init (this, &shd->full_healers[i]); + if (ret) + goto out; + } + + shd->healed = eh_new (AFR_EH_HEALED_LIMIT, _gf_false, + afr_destroy_shd_event_data); + if (!shd->healed) + goto out; + + shd->heal_failed = eh_new (AFR_EH_HEAL_FAIL_LIMIT, _gf_false, + afr_destroy_shd_event_data); + if (!shd->heal_failed) + goto out; + + shd->split_brain = eh_new (AFR_EH_SPLIT_BRAIN_LIMIT, _gf_false, + afr_destroy_shd_event_data); + if (!shd->split_brain) + goto out; + + shd->statistics = GF_CALLOC (sizeof(eh_t *), priv->child_count, + gf_common_mt_eh_t); + if (!shd->statistics) + goto out; + + for (i = 0; i < priv->child_count ; i++) { + shd->statistics[i] = eh_new (AFR_STATISTICS_HISTORY_SIZE, + _gf_false, + afr_destroy_crawl_event_data); + if (!shd->statistics[i]) + goto out; + shd->full_healers[i].crawl_event.child = i; + shd->full_healers[i].crawl_event.crawl_type = "FULL"; + shd->index_healers[i].crawl_event.child = i; + shd->index_healers[i].crawl_event.crawl_type = "INDEX"; + } + + ret = 0; +out: + return ret; +} + + +int +afr_selfheal_childup (xlator_t *this, int subvol) +{ + afr_shd_index_healer_spawn (this, subvol); + + return 0; +} + + +int64_t +afr_shd_get_index_count (xlator_t *this, int i) +{ + afr_private_t *priv = NULL; + xlator_t *subvol = NULL; + uint64_t count = 0; + loc_t rootloc = {0, }; + dict_t *xattr = NULL; + int ret = -1; + + priv = this->private; + subvol = priv->children[i]; + + rootloc.inode = inode_ref (this->itable->root); + uuid_copy (rootloc.gfid, rootloc.inode->gfid); + + ret = syncop_getxattr (subvol, &rootloc, &xattr, + GF_XATTROP_INDEX_COUNT); + loc_wipe (&rootloc); + + if (ret < 0) + return -1; + + ret = dict_get_uint64 (xattr, GF_XATTROP_INDEX_COUNT, &count); + if (ret) + return -1; + return count; +} + + +int +afr_xl_op (xlator_t *this, dict_t *input, dict_t *output) +{ + gf_xl_afr_op_t op = GF_AFR_OP_INVALID; + int ret = 0; + int xl_id = 0; + afr_private_t *priv = NULL; + afr_self_heald_t *shd = NULL; + struct subvol_healer *healer = NULL; + int i = 0; + char key[64]; + int op_ret = 0; + int64_t cnt = 0; + + priv = this->private; + shd = &priv->shd; + + for (i = 0; i < priv->child_count; i++) + if (priv->child_up[i] == -1) + goto out; + + ret = dict_get_int32 (input, "xl-op", (int32_t*)&op); + if (ret) + goto out; + ret = dict_get_int32 (input, this->name, &xl_id); + if (ret) + goto out; + ret = dict_set_int32 (output, this->name, xl_id); + if (ret) + goto out; + switch (op) { + case GF_AFR_OP_HEAL_INDEX: + op_ret = -1; + + for (i = 0; i < priv->child_count; i++) { + healer = &shd->index_healers[i]; + snprintf (key, 64, "%d-%d-status", xl_id, i); + + if (!priv->child_up[i]) { + ret = dict_set_str (output, key, + "Brick is not connected"); + } else if (AFR_COUNT (priv->child_up, + priv->child_count) < 2) { + ret = dict_set_str (output, key, + "< 2 bricks in replica are up"); + } else if (!afr_shd_is_subvol_local (this, healer->subvol)) { + ret = dict_set_str (output, key, + "Brick is remote"); + } else { + ret = dict_set_str (output, key, + "Started self-heal"); + afr_shd_index_healer_spawn (this, i); + op_ret = 0; + } + } + break; + case GF_AFR_OP_HEAL_FULL: + op_ret = -1; + + for (i = 0; i < priv->child_count; i++) { + healer = &shd->full_healers[i]; + snprintf (key, 64, "%d-%d-status", xl_id, i); + + if (!priv->child_up[i]) { + ret = dict_set_str (output, key, + "Brick is not connected"); + } else if (AFR_COUNT (priv->child_up, + priv->child_count) < 2) { + ret = dict_set_str (output, key, + "< 2 bricks in replica are up"); + } else if (!afr_shd_is_subvol_local (this, healer->subvol)) { + ret = dict_set_str (output, key, + "Brick is remote"); + } else { + ret = dict_set_str (output, key, + "Started self-heal"); + afr_shd_full_healer_spawn (this, i); + op_ret = 0; + } + } + break; + case GF_AFR_OP_INDEX_SUMMARY: + for (i = 0; i < priv->child_count; i++) + if (shd->index_healers[i].local) + afr_shd_gather_index_entries (this, i, output); + break; + case GF_AFR_OP_HEALED_FILES: + eh_dump (shd->healed, output, afr_add_shd_event); + break; + case GF_AFR_OP_HEAL_FAILED_FILES: + eh_dump (shd->heal_failed, output, afr_add_shd_event); + break; + case GF_AFR_OP_SPLIT_BRAIN_FILES: + eh_dump (shd->split_brain, output, afr_add_shd_event); + break; + case GF_AFR_OP_STATISTICS: + for (i = 0; i < priv->child_count; i++) { + eh_dump (shd->statistics[i], output, + afr_add_crawl_event); + afr_shd_dict_add_crawl_event (this, output, + &shd->index_healers[i].crawl_event); + afr_shd_dict_add_crawl_event (this, output, + &shd->full_healers[i].crawl_event); + } + break; + case GF_AFR_OP_STATISTICS_HEAL_COUNT: + case GF_AFR_OP_STATISTICS_HEAL_COUNT_PER_REPLICA: + op_ret = -1; + + for (i = 0; i < priv->child_count; i++) { + if (!priv->child_up[i]) { + snprintf (key, 64, "%d-%d-status", xl_id, i); + ret = dict_set_str (output, key, + "Brick is not connected"); + } else { + snprintf (key, 64, "%d-%d-hardlinks", xl_id, i); + cnt = afr_shd_get_index_count (this, i); + if (cnt >= 0) { + ret = dict_set_uint64 (output, key, cnt); + } + op_ret = 0; + } + } + +// ret = _do_crawl_op_on_local_subvols (this, INDEX_TO_BE_HEALED, +// STATISTICS_TO_BE_HEALED, +// output); + break; + + default: + gf_log (this->name, GF_LOG_ERROR, "Unknown set op %d", op); + break; + } +out: + dict_del (output, this->name); + return op_ret; +} diff --git a/xlators/cluster/afr/src/afr-self-heald.h b/xlators/cluster/afr/src/afr-self-heald.h new file mode 100644 index 000000000..10e229ee7 --- /dev/null +++ b/xlators/cluster/afr/src/afr-self-heald.h @@ -0,0 +1,72 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + + +#ifndef _AFR_SELF_HEALD_H +#define _AFR_SELF_HEALD_H + +#include <pthread.h> + + +typedef struct { + int child; + char *path; +} shd_event_t; + +typedef struct { + int child; + uint64_t healed_count; + uint64_t split_brain_count; + uint64_t heal_failed_count; + + /* If start_time is 0, it means crawler is not in progress + and stats are not valid */ + time_t start_time; + /* If start_time is NOT 0 and end_time is 0, it means + cralwer is in progress */ + time_t end_time; + char *crawl_type; +} crawl_event_t; + +struct subvol_healer { + xlator_t *this; + int subvol; + gf_boolean_t local; + gf_boolean_t running; + gf_boolean_t rerun; + crawl_event_t crawl_event; + pthread_mutex_t mutex; + pthread_cond_t cond; + pthread_t thread; +}; + +typedef struct { + gf_boolean_t iamshd; + gf_boolean_t enabled; + struct subvol_healer *index_healers; + struct subvol_healer *full_healers; + + eh_t *healed; + eh_t *heal_failed; + eh_t *split_brain; + eh_t **statistics; +} afr_self_heald_t; + + +int +afr_selfheal_childup (xlator_t *this, int subvol); + +int +afr_selfheal_daemon_init (xlator_t *this); + +int +afr_xl_op (xlator_t *this, dict_t *input, dict_t *output); + +#endif /* !_AFR_SELF_HEALD_H */ diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c index 9d6e98898..205ff759e 100644 --- a/xlators/cluster/afr/src/afr-transaction.c +++ b/xlators/cluster/afr/src/afr-transaction.c @@ -1,186 +1,173 @@ /* - Copyright (c) 2007-2010 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #include "dict.h" #include "byte-order.h" #include "common-utils.h" +#include "timer.h" #include "afr.h" #include "afr-transaction.h" #include <signal.h> +gf_boolean_t +afr_changelog_pre_op_uninherit (call_frame_t *frame, xlator_t *this); -#define LOCKED_NO 0x0 /* no lock held */ -#define LOCKED_YES 0x1 /* for DATA, METADATA, ENTRY and higher_path - of RENAME */ -#define LOCKED_LOWER 0x2 /* for lower_path of RENAME */ +gf_boolean_t +afr_changelog_pre_op_update (call_frame_t *frame, xlator_t *this); +int +afr_changelog_do (call_frame_t *frame, xlator_t *this, dict_t *xattr, + afr_changelog_resume_t changelog_resume); -afr_fd_ctx_t * -afr_fd_ctx_get (fd_t *fd, xlator_t *this) -{ - uint64_t ctx = 0; - afr_fd_ctx_t *fd_ctx = NULL; - int ret = 0; - ret = fd_ctx_get (fd, this, &ctx); +int +__afr_txn_write_fop (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; + int i = 0; - if (ret < 0) - goto out; + local = frame->local; + priv = this->private; - fd_ctx = (afr_fd_ctx_t *)(long) ctx; + call_count = AFR_COUNT (local->transaction.pre_op, priv->child_count); -out: - return fd_ctx; -} + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + local->call_count = call_count; -static void -afr_pid_save (call_frame_t *frame) -{ - afr_local_t * local = NULL; + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i]) { + local->transaction.wind (frame, this, i); - local = frame->local; + if (!--call_count) + break; + } + } - local->saved_pid = frame->root->pid; + return 0; } -static void -afr_pid_restore (call_frame_t *frame) +int +__afr_txn_write_done (call_frame_t *frame, xlator_t *this) { - afr_local_t * local = NULL; + afr_local_t *local = NULL; local = frame->local; - frame->root->pid = local->saved_pid; -} + local->transaction.unwind (frame, this); + AFR_STACK_DESTROY (frame); -static void -__mark_all_pending (int32_t *pending[], int child_count, - afr_transaction_type type) -{ - int i = 0; - int j = 0; - - for (i = 0; i < child_count; i++) { - j = afr_index_for_transaction_type (type); - pending[i][j] = hton32 (1); - } + return 0; } -static void -__mark_child_dead (int32_t *pending[], int child_count, int child, - afr_transaction_type type) +call_frame_t* +afr_transaction_detach_fop_frame (call_frame_t *frame) { - int j = 0; + afr_local_t * local = NULL; + call_frame_t *fop_frame = NULL; - j = afr_index_for_transaction_type (type); + local = frame->local; + + LOCK (&frame->lock); + { + fop_frame = local->transaction.main_frame; + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); - pending[child][j] = 0; + return fop_frame; } static void -__mark_pre_op_done_on_fd (call_frame_t *frame, xlator_t *this, int child_index) +afr_save_lk_owner (call_frame_t *frame) { - afr_local_t *local = NULL; - afr_fd_ctx_t *fd_ctx = NULL; + afr_local_t * local = NULL; local = frame->local; - if (!local->fd) - return; - - fd_ctx = afr_fd_ctx_get (local->fd, this); - - if (!fd_ctx) - goto out; - - LOCK (&local->fd->lock); - { - if (local->transaction.type == AFR_DATA_TRANSACTION) - fd_ctx->pre_op_done[child_index]++; - } - UNLOCK (&local->fd->lock); -out: - return; + local->saved_lk_owner = frame->root->lk_owner; } static void -__mark_pre_op_undone_on_fd (call_frame_t *frame, xlator_t *this, int child_index) +afr_restore_lk_owner (call_frame_t *frame) { - afr_local_t *local = NULL; - afr_fd_ctx_t *fd_ctx = NULL; + afr_local_t * local = NULL; local = frame->local; - if (!local->fd) - return; - - fd_ctx = afr_fd_ctx_get (local->fd, this); - - if (!fd_ctx) - goto out; - - LOCK (&local->fd->lock); - { - if (local->transaction.type == AFR_DATA_TRANSACTION) - fd_ctx->pre_op_done[child_index]--; - } - UNLOCK (&local->fd->lock); -out: - return; + frame->root->lk_owner = local->saved_lk_owner; } - -static void -__mark_down_children (int32_t *pending[], int child_count, - unsigned char *child_up, afr_transaction_type type) +void +__mark_all_success (call_frame_t *frame, xlator_t *this) { - int i = 0; - int j = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int i; - for (i = 0; i < child_count; i++) { - j = afr_index_for_transaction_type (type); + local = frame->local; + priv = this->private; - if (!child_up[i]) - pending[i][j] = 0; - } + for (i = 0; i < priv->child_count; i++) { + local->transaction.failed_subvols[i] = 0; + } } -static void -__mark_all_success (int32_t *pending[], int child_count, - afr_transaction_type type) +int +afr_transaction_perform_fop (call_frame_t *frame, xlator_t *this) { - int i; - int j; + afr_local_t *local = NULL; + fd_t *fd = NULL; - for (i = 0; i < child_count; i++) { - j = afr_index_for_transaction_type (type); - pending[i][j] = hton32 (-1); - } + local = frame->local; + fd = local->fd; + + /* Perform fops with the lk-owner from top xlator. + * Eg: lk-owner of posix-lk and flush should be same, + * flush cant clear the posix-lks without that lk-owner. + */ + afr_save_lk_owner (frame); + frame->root->lk_owner = + local->transaction.main_frame->root->lk_owner; + + if (local->pre_op_compat) + /* old mode, pre-op was done as afr_changelog_do() + just now, before OP */ + afr_changelog_pre_op_update (frame, this); + + /* The wake up needs to happen independent of + what type of fop arrives here. If it was + a write, then it has already inherited the + lock and changelog. If it was not a write, + then the presumption of the optimization (of + optimizing for successive write operations) + fails. + */ + if (fd) + afr_delayed_changelog_wake_up (this, fd); + local->transaction.fop (frame, this); + + return 0; } @@ -215,39 +202,7 @@ __changelog_enabled (afr_private_t *priv, afr_transaction_type type) static int -__changelog_needed_pre_op (call_frame_t *frame, xlator_t *this) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - - int op_ret = 0; - - priv = this->private; - local = frame->local; - - if (__changelog_enabled (priv, local->transaction.type)) { - switch (local->op) { - - case GF_FOP_WRITE: - case GF_FOP_FTRUNCATE: - op_ret = 1; - break; - - case GF_FOP_FLUSH: - op_ret = 0; - break; - - default: - op_ret = 1; - } - } - - return op_ret; -} - - -static int -__changelog_needed_post_op (call_frame_t *frame, xlator_t *this) +__fop_changelog_needed (call_frame_t *frame, xlator_t *this) { afr_private_t * priv = NULL; afr_local_t * local = NULL; @@ -279,63 +234,30 @@ __changelog_needed_post_op (call_frame_t *frame, xlator_t *this) } -static int -afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending) +int +afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int **pending) { int i = 0; int ret = 0; + int pending_zero[AFR_NUM_CHANGE_LOGS] = {0, }; for (i = 0; i < priv->child_count; i++) { + if (!memcmp (pending_zero, pending[i], sizeof (pending_zero))) + /* don't set xattrs for non-pending servers */ + continue; + ret = dict_set_static_bin (xattr, priv->pending_key[i], - pending[i], 3 * sizeof (int32_t)); + pending[i], + AFR_NUM_CHANGE_LOGS * sizeof (int)); /* 3 = data+metadata+entry */ - if (ret < 0) - goto out; - } - -out: - return ret; -} - - -static int -afr_set_piggyback_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending, - afr_transaction_type type) -{ - int i = 0; - int ret = 0; - int *arr = NULL; - int index = 0; - size_t pending_xattr_size = 3 * sizeof (int32_t); - /* 3 = data+metadata+entry */ - - index = afr_index_for_transaction_type (type); - - for (i = 0; i < priv->child_count; i++) { - arr = GF_CALLOC (1, pending_xattr_size, - gf_afr_mt_char); - if (!arr) { - ret = -1; - goto out; - } - - memcpy (arr, pending[i], pending_xattr_size); - - arr[index]++; - - ret = dict_set_bin (xattr, priv->pending_key[i], - arr, pending_xattr_size); - - if (ret < 0) - goto out; + if (ret) + break; } -out: return ret; } - int afr_lock_server_count (afr_private_t *priv, afr_transaction_type type) { @@ -361,378 +283,529 @@ afr_lock_server_count (afr_private_t *priv, afr_transaction_type type) /* {{{ pending */ -int32_t -afr_changelog_post_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xattr) + +int +afr_changelog_post_op_done (call_frame_t *frame, xlator_t *this) { + afr_local_t *local = NULL; + afr_private_t *priv = NULL; afr_internal_lock_t *int_lock = NULL; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - int child_index = 0; - int call_count = -1; - priv = this->private; - local = frame->local; + local = frame->local; + priv = this->private; int_lock = &local->internal_lock; - child_index = (long) cookie; - - if (op_ret == 1) { - } - - if (op_ret == 0) { - __mark_pre_op_undone_on_fd (frame, this, child_index); - } - - LOCK (&frame->lock); - { - call_count = --local->call_count; - } - UNLOCK (&frame->lock); + if (local->transaction.resume_stub) { + call_resume (local->transaction.resume_stub); + local->transaction.resume_stub = NULL; + } - if (call_count == 0) { - if (afr_lock_server_count (priv, local->transaction.type) == 0) { - local->transaction.done (frame, this); - } else { - int_lock->lock_cbk = local->transaction.done; - afr_unlock (frame, this); - } - } + if (afr_lock_server_count (priv, local->transaction.type) == 0) { + local->transaction.done (frame, this); + } else { + int_lock->lock_cbk = local->transaction.done; + afr_unlock (frame, this); + } - return 0; + return 0; } -void -afr_update_read_child (call_frame_t *frame, xlator_t *this, inode_t *inode, - afr_transaction_type type) +afr_inodelk_t* +afr_get_inodelk (afr_internal_lock_t *int_lock, char *dom) { - int curr_read_child = -1; - int new_read_child = -1; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - int **pending = NULL; - int idx = 0; - int32_t *fresh_children = NULL; - size_t success_count = 0; - - idx = afr_index_for_transaction_type (type); - - priv = this->private; - local = frame->local; - curr_read_child = afr_inode_get_read_ctx (this, inode, NULL); - pending = local->pending; + afr_inodelk_t *inodelk = NULL; + int i = 0; - GF_ASSERT (curr_read_child >= 0); - - if (pending[curr_read_child][idx] != 0) - goto out; - - fresh_children = afr_fresh_children_create (priv->child_count); - if (!fresh_children) - goto out; - - for (new_read_child = 0; new_read_child < priv->child_count; - new_read_child++) { + for (i = 0; int_lock->inodelk[i].domain; i++) { + inodelk = &int_lock->inodelk[i]; + if (strcmp (dom, inodelk->domain) == 0) + return inodelk; + } + return NULL; +} - if (!priv->child_up[new_read_child]) - /* child is down */ - continue; +unsigned char* +afr_locked_nodes_get (afr_transaction_type type, afr_internal_lock_t *int_lock) +{ + unsigned char *locked_nodes = NULL; + afr_inodelk_t *inodelk = NULL; + switch (type) { + case AFR_DATA_TRANSACTION: + case AFR_METADATA_TRANSACTION: + inodelk = afr_get_inodelk (int_lock, int_lock->domain); + locked_nodes = inodelk->locked_nodes; + break; - if (pending[new_read_child][idx] == 0) - /* op just failed */ - continue; - fresh_children[success_count] = new_read_child; - success_count++; + case AFR_ENTRY_TRANSACTION: + case AFR_ENTRY_RENAME_TRANSACTION: + /*Because same set of subvols participate in all lockee + * entities*/ + locked_nodes = int_lock->lockee[0].locked_nodes; + break; } - - afr_inode_set_read_ctx (this, inode, fresh_children[0], - fresh_children); -out: - if (fresh_children) - GF_FREE (fresh_children); - return; + return locked_nodes; } int -afr_changelog_post_op (call_frame_t *frame, xlator_t *this) +afr_changelog_call_count (afr_transaction_type type, + unsigned char *pre_op_subvols, + unsigned int child_count) { - afr_private_t * priv = this->private; - afr_internal_lock_t *int_lock = NULL; - int ret = 0; - int i = 0; int call_count = 0; - afr_local_t * local = NULL; - afr_fd_ctx_t *fdctx = NULL; - dict_t **xattr = NULL; - int piggyback = 0; - int index = 0; - int nothing_failed = 1; + call_count = AFR_COUNT(pre_op_subvols, child_count); - local = frame->local; - int_lock = &local->internal_lock; + if (type == AFR_ENTRY_RENAME_TRANSACTION) + call_count *= 2; - __mark_down_children (local->pending, priv->child_count, - local->child_up, local->transaction.type); + return call_count; +} - if (local->fd) - afr_update_read_child (frame, this, local->fd->inode, - local->transaction.type); - xattr = alloca (priv->child_count * sizeof (*xattr)); - memset (xattr, 0, (priv->child_count * sizeof (*xattr))); - for (i = 0; i < priv->child_count; i++) { - xattr[i] = get_new_dict (); - dict_ref (xattr[i]); - } +gf_boolean_t +afr_txn_nothing_failed (call_frame_t *frame, xlator_t *this) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int i = 0; - call_count = afr_up_children_count (priv->child_count, local->child_up); + local = frame->local; + priv = this->private; - if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) { - call_count *= 2; + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.failed_subvols[i]) + return _gf_false; } - local->call_count = call_count; - - if (local->fd) - fdctx = afr_fd_ctx_get (local->fd, this); + return _gf_true; +} - if (call_count == 0) { - /* no child is up */ - for (i = 0; i < priv->child_count; i++) { - dict_unref (xattr[i]); - } - int_lock->lock_cbk = local->transaction.done; - afr_unlock (frame, this); - return 0; - } - - /* check if something has failed, to handle piggybacking */ - nothing_failed = 1; - index = afr_index_for_transaction_type (local->transaction.type); - for (i = 0; i < priv->child_count; i++) { - if (local->pending[i][index] == 0) { - nothing_failed = 0; - break; - } - } +void +afr_handle_symmetric_errors (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int op_errno = 0; + int i_errno = 0; + gf_boolean_t matching_errors = _gf_true; + int i = 0; + + priv = this->private; + local = frame->local; + + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid) + continue; + if (local->replies[i].op_ret != -1) { + /* Operation succeeded on at least on subvol, + so it is not a failed-everywhere situation. + */ + matching_errors = _gf_false; + break; + } + i_errno = local->replies[i].op_errno; + + if (i_errno == ENOTCONN) { + /* ENOTCONN is not a symmetric error. We do not + know if the operation was performed on the + backend or not. + */ + matching_errors = _gf_false; + break; + } + + if (!op_errno) { + op_errno = i_errno; + } else if (op_errno != i_errno) { + /* Mismatching op_errno's */ + matching_errors = _gf_false; + break; + } + } + + if (matching_errors) + __mark_all_success (frame, this); +} - index = afr_index_for_transaction_type (local->transaction.type); - if (local->optimistic_change_log && - local->transaction.type != AFR_DATA_TRANSACTION) { - /* if nothing_failed, then local->pending[..] == {0 .. 0} */ - for (i = 0; i < priv->child_count; i++) - local->pending[i][index]++; - } - for (i = 0; i < priv->child_count; i++) { - if (!local->child_up[i]) - continue; +int +afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this) +{ + afr_private_t * priv = this->private; + int i = 0; + int ret = 0; + int idx = 0; + afr_local_t * local = NULL; + dict_t *xattr = NULL; + int nothing_failed = 1; + gf_boolean_t need_undirty = _gf_false; - ret = afr_set_pending_dict (priv, xattr[i], - local->pending); + local = frame->local; + idx = afr_index_for_transaction_type (local->transaction.type); - if (ret < 0) - gf_log (this->name, GF_LOG_INFO, - "failed to set pending entry"); + nothing_failed = afr_txn_nothing_failed (frame, this); + if (afr_changelog_pre_op_uninherit (frame, this)) + need_undirty = _gf_false; + else + need_undirty = _gf_true; - switch (local->transaction.type) { - case AFR_DATA_TRANSACTION: - { - if (!fdctx) { - STACK_WIND (frame, afr_changelog_post_op_cbk, - priv->children[i], - priv->children[i]->fops->xattrop, - &local->loc, - GF_XATTROP_ADD_ARRAY, xattr[i]); - break; - } + if (nothing_failed && !need_undirty) { + afr_changelog_post_op_done (frame, this); + goto out; + } + + xattr = dict_new (); + if (!xattr) { + local->op_ret = -1; + local->op_errno = ENOMEM; + afr_changelog_post_op_done (frame, this); + goto out; + } + + if (need_undirty) { + local->dirty[idx] = hton32(-1); + + ret = dict_set_static_bin (xattr, AFR_DIRTY, local->dirty, + sizeof(int) * AFR_NUM_CHANGE_LOGS); + if (ret) { + local->op_ret = -1; + local->op_errno = ENOMEM; + afr_changelog_post_op_done (frame, this); + goto out; + } + + } + + if (!nothing_failed) { + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.failed_subvols[i]) + local->pending[i][idx] = hton32(1); + } + ret = afr_set_pending_dict (priv, xattr, local->pending); + if (ret < 0) { + local->op_ret = -1; + local->op_errno = ENOMEM; + afr_changelog_post_op_done (frame, this); + goto out; + } + + } + + afr_changelog_do (frame, this, xattr, afr_changelog_post_op_done); +out: + if (xattr) + dict_unref (xattr); - LOCK (&local->fd->lock); - { - piggyback = 0; - if (fdctx->pre_op_piggyback[i]) { - fdctx->pre_op_piggyback[i]--; - piggyback = 1; - } - } - UNLOCK (&local->fd->lock); + return 0; +} - if (piggyback && !nothing_failed) - ret = afr_set_piggyback_dict (priv, xattr[i], - local->pending, - local->transaction.type); - if (nothing_failed && piggyback) { - afr_changelog_post_op_cbk (frame, (void *)(long)i, - this, 1, 0, xattr[i]); - } else { - STACK_WIND_COOKIE (frame, - afr_changelog_post_op_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fxattrop, - local->fd, - GF_XATTROP_ADD_ARRAY, xattr[i]); - } - } - break; - case AFR_METADATA_TRANSACTION: - { - if (nothing_failed) { - afr_changelog_post_op_cbk (frame, (void *)(long)i, - this, 1, 0, xattr[i]); - break; - } +gf_boolean_t +afr_changelog_pre_op_uninherit (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + fd_t *fd = NULL; + int i = 0; + gf_boolean_t ret = _gf_false; + afr_fd_ctx_t *fd_ctx = NULL; + int type = 0; + + local = frame->local; + priv = this->private; + fd = local->fd; + + type = afr_index_for_transaction_type (local->transaction.type); + if (type != AFR_DATA_TRANSACTION) + return !local->transaction.dirtied; + + if (!fd) + return !local->transaction.dirtied; + + fd_ctx = afr_fd_ctx_get (fd, this); + if (!fd_ctx) + return _gf_false; + + if (local->transaction.no_uninherit) + return _gf_false; + + /* This function must be idempotent. So check if we + were called before and return the same answer again. + + It is important to keep this function idempotent for + the call in afr_changelog_post_op_safe() to not have + side effects on the call from afr_changelog_post_op_now() + */ + if (local->transaction.uninherit_done) + return local->transaction.uninherit_value; + + LOCK(&fd->lock); + { + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i] != + fd_ctx->pre_op_done[type][i]) { + ret = !local->transaction.dirtied; + goto unlock; + } + } + + if (fd_ctx->inherited[type]) { + ret = _gf_true; + fd_ctx->inherited[type]--; + } else if (fd_ctx->on_disk[type]) { + ret = _gf_false; + fd_ctx->on_disk[type]--; + } else { + /* ASSERT */ + ret = _gf_false; + } + + if (!fd_ctx->inherited[type] && !fd_ctx->on_disk[type]) { + for (i = 0; i < priv->child_count; i++) + fd_ctx->pre_op_done[type][i] = 0; + } + } +unlock: + UNLOCK(&fd->lock); + + local->transaction.uninherit_done = _gf_true; + local->transaction.uninherit_value = ret; + + return ret; +} - if (local->fd) - STACK_WIND (frame, afr_changelog_post_op_cbk, - priv->children[i], - priv->children[i]->fops->fxattrop, - local->fd, - GF_XATTROP_ADD_ARRAY, xattr[i]); - else - STACK_WIND (frame, afr_changelog_post_op_cbk, - priv->children[i], - priv->children[i]->fops->xattrop, - &local->loc, - GF_XATTROP_ADD_ARRAY, xattr[i]); - } - break; - case AFR_ENTRY_RENAME_TRANSACTION: - { - if (nothing_failed) { - afr_changelog_post_op_cbk (frame, (void *)(long)i, - this, 1, 0, xattr[i]); - } else { - STACK_WIND_COOKIE (frame, afr_changelog_post_op_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->xattrop, - &local->transaction.new_parent_loc, - GF_XATTROP_ADD_ARRAY, xattr[i]); - } - call_count--; - } +gf_boolean_t +afr_changelog_pre_op_inherit (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + fd_t *fd = NULL; + int i = 0; + gf_boolean_t ret = _gf_false; + afr_fd_ctx_t *fd_ctx = NULL; + int type = 0; + + local = frame->local; + priv = this->private; + fd = local->fd; + + if (local->transaction.type != AFR_DATA_TRANSACTION) + return _gf_false; + + type = afr_index_for_transaction_type (local->transaction.type); + + if (!fd) + return _gf_false; + + fd_ctx = afr_fd_ctx_get (fd, this); + if (!fd_ctx) + return _gf_false; + + LOCK(&fd->lock); + { + if (!fd_ctx->on_disk[type]) { + /* nothing to inherit yet */ + ret = _gf_false; + goto unlock; + } + + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i] != + fd_ctx->pre_op_done[type][i]) { + /* either inherit exactly, or don't */ + ret = _gf_false; + goto unlock; + } + } + + fd_ctx->inherited[type]++; + + ret = _gf_true; + + local->transaction.inherited = _gf_true; + } +unlock: + UNLOCK(&fd->lock); + + return ret; +} - /* - set it again because previous stack_wind - might have already returned (think of case - where subvolume is posix) and would have - used the dict as placeholder for return - value - */ - ret = afr_set_pending_dict (priv, xattr[i], - local->pending); +gf_boolean_t +afr_changelog_pre_op_update (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + fd_t *fd = NULL; + afr_fd_ctx_t *fd_ctx = NULL; + int i = 0; + gf_boolean_t ret = _gf_false; + int type = 0; + + local = frame->local; + priv = this->private; + fd = local->fd; + + if (!fd) + return _gf_false; + + fd_ctx = afr_fd_ctx_get (fd, this); + if (!fd_ctx) + return _gf_false; + + if (local->transaction.inherited) + /* was already inherited in afr_changelog_pre_op */ + return _gf_false; + + if (!local->transaction.dirtied) + return _gf_false; + + if (!afr_txn_nothing_failed (frame, this)) + return _gf_false; + + type = afr_index_for_transaction_type (local->transaction.type); + + ret = _gf_false; + + LOCK(&fd->lock); + { + if (!fd_ctx->on_disk[type]) { + for (i = 0; i < priv->child_count; i++) + fd_ctx->pre_op_done[type][i] = + local->transaction.pre_op[i]; + } else { + for (i = 0; i < priv->child_count; i++) + if (fd_ctx->pre_op_done[type][i] != + local->transaction.pre_op[i]) { + local->transaction.no_uninherit = 1; + goto unlock; + } + } + fd_ctx->on_disk[type]++; + + ret = _gf_true; + } +unlock: + UNLOCK(&fd->lock); + + return ret; +} - if (ret < 0) - gf_log (this->name, GF_LOG_INFO, - "failed to set pending entry"); - /* fall through */ +int +afr_changelog_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xattr, dict_t *xdata) +{ + afr_local_t *local = NULL; + int call_count = -1; - case AFR_ENTRY_TRANSACTION: - { - if (nothing_failed) { - afr_changelog_post_op_cbk (frame, (void *)(long)i, - this, 1, 0, xattr[i]); - break; - } + local = frame->local; - if (local->fd) - STACK_WIND (frame, afr_changelog_post_op_cbk, - priv->children[i], - priv->children[i]->fops->fxattrop, - local->fd, - GF_XATTROP_ADD_ARRAY, xattr[i]); - else - STACK_WIND (frame, afr_changelog_post_op_cbk, - priv->children[i], - priv->children[i]->fops->xattrop, - &local->transaction.parent_loc, - GF_XATTROP_ADD_ARRAY, xattr[i]); - } - break; - } + if (op_ret == -1) + afr_transaction_fop_failed (frame, this, (long) cookie); - if (!--call_count) - break; - } + call_count = afr_frame_return (frame); - for (i = 0; i < priv->child_count; i++) { - dict_unref (xattr[i]); - } + if (call_count == 0) + local->transaction.changelog_resume (frame, this); return 0; } -int32_t -afr_changelog_pre_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xattr) +int +afr_changelog_do (call_frame_t *frame, xlator_t *this, dict_t *xattr, + afr_changelog_resume_t changelog_resume) { - afr_local_t * local = NULL; - afr_private_t * priv = this->private; - loc_t * loc = NULL; - int call_count = -1; - int child_index = (long) cookie; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; + int call_count = 0; - local = frame->local; - loc = &local->loc; + local = frame->local; + priv = this->private; - LOCK (&frame->lock); - { - if (op_ret == 1) { - /* special op_ret for piggyback */ - } + call_count = afr_changelog_call_count (local->transaction.type, + local->transaction.pre_op, + priv->child_count); - if (op_ret == 0) { - __mark_pre_op_done_on_fd (frame, this, child_index); - } + if (call_count == 0) { + changelog_resume (frame, this); + return 0; + } + + local->call_count = call_count; - if (op_ret == -1) { - local->child_up[child_index] = 0; + local->transaction.changelog_resume = changelog_resume; - if (op_errno == ENOTSUP) { - gf_log (this->name, GF_LOG_ERROR, - "xattrop not supported by %s", - priv->children[child_index]->name); - local->op_ret = -1; + for (i = 0; i < priv->child_count; i++) { + if (!local->transaction.pre_op[i]) + continue; - } else if (!child_went_down (op_ret, op_errno)) { - gf_log (this->name, GF_LOG_ERROR, - "xattrop failed on child %s: %s", - priv->children[child_index]->name, - strerror (op_errno)); + switch (local->transaction.type) { + case AFR_DATA_TRANSACTION: + case AFR_METADATA_TRANSACTION: + if (!local->fd) { + STACK_WIND_COOKIE (frame, afr_changelog_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->xattrop, + &local->loc, + GF_XATTROP_ADD_ARRAY, xattr, + NULL); + } else { + STACK_WIND_COOKIE (frame, afr_changelog_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->fxattrop, + local->fd, + GF_XATTROP_ADD_ARRAY, xattr, + NULL); } - local->op_errno = op_errno; - } + break; + case AFR_ENTRY_RENAME_TRANSACTION: - call_count = --local->call_count; - } - UNLOCK (&frame->lock); + STACK_WIND_COOKIE (frame, afr_changelog_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->xattrop, + &local->transaction.new_parent_loc, + GF_XATTROP_ADD_ARRAY, xattr, + NULL); + call_count--; - if (call_count == 0) { - if ((local->op_ret == -1) && - (local->op_errno == ENOTSUP)) { - local->transaction.resume (frame, this); - } else { - __mark_all_success (local->pending, priv->child_count, - local->transaction.type); + /* fall through */ - afr_pid_restore (frame); + case AFR_ENTRY_TRANSACTION: + if (local->fd) + STACK_WIND_COOKIE (frame, afr_changelog_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->fxattrop, + local->fd, + GF_XATTROP_ADD_ARRAY, xattr, + NULL); + else + STACK_WIND_COOKIE (frame, afr_changelog_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->xattrop, + &local->transaction.parent_loc, + GF_XATTROP_ADD_ARRAY, xattr, + NULL); + break; + } - local->transaction.fop (frame, this); - } + if (!--call_count) + break; } - return 0; + return 0; } @@ -743,199 +816,122 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this) int i = 0; int ret = 0; int call_count = 0; - dict_t **xattr = NULL; - afr_fd_ctx_t *fdctx = NULL; + int op_errno = 0; afr_local_t *local = NULL; - int piggyback = 0; + afr_internal_lock_t *int_lock = NULL; + unsigned char *locked_nodes = NULL; + unsigned char *pending_subvols = NULL; + int idx = -1; + gf_boolean_t pre_nop = _gf_true; + dict_t *xdata_req = NULL; local = frame->local; + int_lock = &local->internal_lock; + idx = afr_index_for_transaction_type (local->transaction.type); - xattr = alloca (priv->child_count * sizeof (*xattr)); - memset (xattr, 0, (priv->child_count * sizeof (*xattr))); + locked_nodes = afr_locked_nodes_get (local->transaction.type, int_lock); - for (i = 0; i < priv->child_count; i++) { - xattr[i] = get_new_dict (); - dict_ref (xattr[i]); - } + pending_subvols = alloca0 (priv->child_count); - call_count = afr_up_children_count (priv->child_count, - local->child_up); + for (i = 0; i < priv->child_count; i++) { + if (locked_nodes[i]) { + local->transaction.pre_op[i] = 1; + call_count++; + } else { + pending_subvols[i] = 1; + } + } - if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) { - call_count *= 2; - } + /* TBD: quorum check w/ call_count */ if (call_count == 0) { - /* no child is up */ - for (i = 0; i < priv->child_count; i++) { - dict_unref (xattr[i]); - } - - local->internal_lock.lock_cbk = - local->transaction.done; - afr_unlock (frame, this); - return 0; - } - - local->call_count = call_count; - - __mark_all_pending (local->pending, priv->child_count, - local->transaction.type); - - if (local->fd) - fdctx = afr_fd_ctx_get (local->fd, this); - - for (i = 0; i < priv->child_count; i++) { - if (!local->child_up[i]) - continue; - ret = afr_set_pending_dict (priv, xattr[i], - local->pending); - - if (ret < 0) - gf_log (this->name, GF_LOG_INFO, - "failed to set pending entry"); + op_errno = ENOTCONN; + goto err; + } + + xdata_req = dict_new(); + if (!xdata_req) { + op_errno = ENOMEM; + goto err; + } + + pre_nop = _gf_true; + + if (afr_changelog_pre_op_inherit (frame, this)) + goto next; + + if (call_count < priv->child_count) { + /* For subvols we are not performing operation on, + mark them as pending up-front along with the FOP + so that we can safely defer unmarking dirty until + later. + */ + for (i = 0; i < priv->child_count; i++) { + if (pending_subvols[i]) + local->pending[i][idx] = hton32(1); + } + ret = afr_set_pending_dict (priv, xdata_req, + local->pending); + if (ret < 0) { + op_errno = ENOMEM; + goto err; + } + pre_nop = _gf_false; + } + + if (call_count > 1 && + (local->transaction.type == AFR_DATA_TRANSACTION || + !local->optimistic_change_log)) { + + /* If we are performing change on only one subvol, no + need to mark dirty, because we are setting the pending + counts already anyways + */ + local->dirty[idx] = hton32(1); + + ret = dict_set_static_bin (xdata_req, AFR_DIRTY, local->dirty, + sizeof(int) * AFR_NUM_CHANGE_LOGS); + if (ret) { + op_errno = ENOMEM; + goto err; + } + + pre_nop = _gf_false; + local->transaction.dirtied = 1; + } + + if (pre_nop) + goto next; + + if (!local->pre_op_compat) { + dict_copy (xdata_req, local->xdata_req); + goto next; + } + + afr_changelog_do (frame, this, xdata_req, afr_transaction_perform_fop); + + if (xdata_req) + dict_unref (xdata_req); + + return 0; +next: + afr_transaction_perform_fop (frame, this); + + if (xdata_req) + dict_unref (xdata_req); + return 0; +err: + local->internal_lock.lock_cbk = local->transaction.done; + local->op_ret = -1; + local->op_errno = op_errno; - switch (local->transaction.type) { - case AFR_DATA_TRANSACTION: - { - if (!fdctx) { - STACK_WIND_COOKIE (frame, - afr_changelog_pre_op_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->xattrop, - &(local->loc), - GF_XATTROP_ADD_ARRAY, xattr[i]); - break; - } - - LOCK (&local->fd->lock); - { - piggyback = 0; - if (fdctx->pre_op_done[i]) { - fdctx->pre_op_piggyback[i]++; - piggyback = 1; - fdctx->hit++; - } else { - fdctx->miss++; - } - } - UNLOCK (&local->fd->lock); - - if (piggyback) - afr_changelog_pre_op_cbk (frame, (void *)(long)i, - this, 1, 0, xattr[i]); - else - STACK_WIND_COOKIE (frame, - afr_changelog_pre_op_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fxattrop, - local->fd, - GF_XATTROP_ADD_ARRAY, xattr[i]); - } - break; - case AFR_METADATA_TRANSACTION: - { - if (local->optimistic_change_log) { - afr_changelog_pre_op_cbk (frame, (void *)(long)i, - this, 1, 0, xattr[i]); - break; - } - - if (local->fd) - STACK_WIND_COOKIE (frame, - afr_changelog_pre_op_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fxattrop, - local->fd, - GF_XATTROP_ADD_ARRAY, xattr[i]); - else - STACK_WIND_COOKIE (frame, - afr_changelog_pre_op_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->xattrop, - &(local->loc), - GF_XATTROP_ADD_ARRAY, xattr[i]); - } - break; - - case AFR_ENTRY_RENAME_TRANSACTION: - { - if (local->optimistic_change_log) { - afr_changelog_pre_op_cbk (frame, (void *)(long)i, - this, 1, 0, xattr[i]); - } else { - STACK_WIND_COOKIE (frame, - afr_changelog_pre_op_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->xattrop, - &local->transaction.new_parent_loc, - GF_XATTROP_ADD_ARRAY, xattr[i]); - } - - call_count--; - } - - - /* - set it again because previous stack_wind - might have already returned (think of case - where subvolume is posix) and would have - used the dict as placeholder for return - value - */ - - ret = afr_set_pending_dict (priv, xattr[i], - local->pending); - - if (ret < 0) - gf_log (this->name, GF_LOG_INFO, - "failed to set pending entry"); - - /* fall through */ - - case AFR_ENTRY_TRANSACTION: - { - if (local->optimistic_change_log) { - afr_changelog_pre_op_cbk (frame, (void *)(long)i, - this, 1, 0, xattr[i]); - break; - } - - if (local->fd) - STACK_WIND_COOKIE (frame, - afr_changelog_pre_op_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fxattrop, - local->fd, - GF_XATTROP_ADD_ARRAY, xattr[i]); - else - STACK_WIND_COOKIE (frame, - afr_changelog_pre_op_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->xattrop, - &local->transaction.parent_loc, - GF_XATTROP_ADD_ARRAY, xattr[i]); - } - break; - } - - if (!--call_count) - break; - } + afr_unlock (frame, this); - for (i = 0; i < priv->child_count; i++) { - dict_unref (xattr[i]); - } + if (xdata_req) + dict_unref (xdata_req); - return 0; + return 0; } @@ -1084,12 +1080,14 @@ int afr_set_transaction_flock (afr_local_t *local) { afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; int_lock = &local->internal_lock; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); - int_lock->lk_flock.l_len = local->transaction.len; - int_lock->lk_flock.l_start = local->transaction.start; - int_lock->lk_flock.l_type = F_WRLCK; + inodelk->flock.l_len = local->transaction.len; + inodelk->flock.l_start = local->transaction.start; + inodelk->flock.l_type = F_WRLCK; return 0; } @@ -1104,6 +1102,7 @@ afr_lock_rec (call_frame_t *frame, xlator_t *this) int_lock = &local->internal_lock; int_lock->transaction_lk_type = AFR_TRANSACTION_LK; + int_lock->domain = this->name; switch (local->transaction.type) { case AFR_DATA_TRANSACTION: @@ -1117,8 +1116,8 @@ afr_lock_rec (call_frame_t *frame, xlator_t *this) case AFR_ENTRY_RENAME_TRANSACTION: - int_lock->lock_cbk = afr_post_blocking_rename_cbk; - afr_blocking_lock (frame, this); + int_lock->lock_cbk = afr_post_nonblocking_entrylk_cbk; + afr_nonblocking_entrylk (frame, this); break; case AFR_ENTRY_TRANSACTION: @@ -1140,12 +1139,6 @@ afr_lock_rec (call_frame_t *frame, xlator_t *this) int afr_lock (call_frame_t *frame, xlator_t *this) { - afr_pid_save (frame); - - frame->root->pid = (long) frame->root; - - afr_set_lk_owner (frame, this); - afr_set_lock_number (frame, this); return afr_lock_rec (frame, this); @@ -1157,47 +1150,427 @@ afr_lock (call_frame_t *frame, xlator_t *this) int afr_internal_lock_finish (call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; + if (__fop_changelog_needed (frame, this)) { + afr_changelog_pre_op (frame, this); + } else { + afr_transaction_perform_fop (frame, this); + } + + return 0; +} + + +void +afr_set_delayed_post_op (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + /* call this function from any of the related optimizations + which benefit from delaying post op are enabled, namely: + + - changelog piggybacking + - eager locking + */ + + priv = this->private; + if (!priv) + return; + + if (!priv->post_op_delay_secs) + return; - priv = this->private; local = frame->local; + if (!local->transaction.eager_lock_on) + return; - if (__changelog_needed_pre_op (frame, this)) { - afr_changelog_pre_op (frame, this); - } else { - __mark_all_success (local->pending, priv->child_count, - local->transaction.type); + if (!local) + return; + + if (!local->fd) + return; + + if (local->op == GF_FOP_WRITE) + local->delayed_post_op = _gf_true; +} + +gf_boolean_t +afr_are_multiple_fds_opened (fd_t *fd, xlator_t *this) +{ + afr_fd_ctx_t *fd_ctx = NULL; + + if (!fd) { + /* If false is returned, it may keep on taking eager-lock + * which may lead to starvation, so return true to avoid that. + */ + gf_log_callingfn (this->name, GF_LOG_ERROR, "Invalid fd"); + return _gf_true; + } + /* Lets say mount1 has eager-lock(full-lock) and after the eager-lock + * is taken mount2 opened the same file, it won't be able to + * perform any data operations until mount1 releases eager-lock. + * To avoid such scenario do not enable eager-lock for this transaction + * if open-fd-count is > 1 + */ + + fd_ctx = afr_fd_ctx_get (fd, this); + if (!fd_ctx) + return _gf_true; + + if (fd_ctx->open_fd_count > 1) + return _gf_true; + + return _gf_false; +} + + +gf_boolean_t +is_afr_delayed_changelog_post_op_needed (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + gf_boolean_t res = _gf_false; + + local = frame->local; + if (!local) + goto out; + + if (!local->delayed_post_op) + goto out; + + //Mark pending changelog ASAP + if (!afr_txn_nothing_failed (frame, this)) + goto out; + + if (local->fd && afr_are_multiple_fds_opened (local->fd, this)) + goto out; - afr_pid_restore (frame); + res = _gf_true; +out: + return res; +} + + +void +afr_delayed_changelog_post_op (xlator_t *this, call_frame_t *frame, fd_t *fd, + call_stub_t *stub); + +void +afr_delayed_changelog_wake_up_cbk (void *data) +{ + fd_t *fd = NULL; + + fd = data; + + afr_delayed_changelog_wake_up (THIS, fd); +} - local->transaction.fop (frame, this); + +/* SET operation */ +int +afr_fd_report_unstable_write (xlator_t *this, fd_t *fd) +{ + afr_fd_ctx_t *fdctx = NULL; + + fdctx = afr_fd_ctx_get (fd, this); + + LOCK(&fd->lock); + { + fdctx->witnessed_unstable_write = _gf_true; } + UNLOCK(&fd->lock); return 0; } +/* TEST and CLEAR operation */ +gf_boolean_t +afr_fd_has_witnessed_unstable_write (xlator_t *this, fd_t *fd) +{ + afr_fd_ctx_t *fdctx = NULL; + gf_boolean_t witness = _gf_false; + + fdctx = afr_fd_ctx_get (fd, this); + if (!fdctx) + return _gf_true; + + LOCK(&fd->lock); + { + if (fdctx->witnessed_unstable_write) { + witness = _gf_true; + fdctx->witnessed_unstable_write = _gf_false; + } + } + UNLOCK (&fd->lock); + + return witness; +} + + +int +afr_changelog_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *pre, + struct iatt *post, dict_t *xdata) +{ + afr_private_t *priv = NULL; + int child_index = (long) cookie; + int call_count = -1; + afr_local_t *local = NULL; + + priv = this->private; + local = frame->local; + + if (op_ret != 0) { + /* Failure of fsync() is as good as failure of previous + write(). So treat it like one. + */ + gf_log (this->name, GF_LOG_WARNING, + "fsync(%s) failed on subvolume %s. Transaction was %s", + uuid_utoa (local->fd->inode->gfid), + priv->children[child_index]->name, + gf_fop_list[local->op]); + + afr_transaction_fop_failed (frame, this, child_index); + } + + call_count = afr_frame_return (frame); + + if (call_count == 0) + afr_changelog_post_op_now (frame, this); + + return 0; +} + + +int +afr_changelog_fsync (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + int i = 0; + int call_count = 0; + afr_private_t *priv = NULL; + dict_t *xdata = NULL; + GF_UNUSED int ret = -1; + + local = frame->local; + priv = this->private; + + call_count = AFR_COUNT (local->transaction.pre_op, priv->child_count); + + if (!call_count) { + /* will go straight to unlock */ + afr_changelog_post_op_now (frame, this); + return 0; + } + + local->call_count = call_count; + + xdata = dict_new(); + if (xdata) + ret = dict_set_int32 (xdata, "batch-fsync", 1); + + for (i = 0; i < priv->child_count; i++) { + if (!local->transaction.pre_op[i]) + continue; + + STACK_WIND_COOKIE (frame, afr_changelog_fsync_cbk, + (void *) (long) i, priv->children[i], + priv->children[i]->fops->fsync, local->fd, + 1, xdata); + if (!--call_count) + break; + } + + if (xdata) + dict_unref (xdata); + + return 0; +} + + +int +afr_changelog_post_op_safe (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; + + if (!local->fd || local->transaction.type != AFR_DATA_TRANSACTION) { + afr_changelog_post_op_now (frame, this); + return 0; + } + + if (afr_changelog_pre_op_uninherit (frame, this) && + afr_txn_nothing_failed (frame, this)) { + /* just detected that this post-op is about to + be optimized away as a new write() has + already piggybacked on this frame's changelog. + */ + afr_changelog_post_op_now (frame, this); + return 0; + } + + /* Calling afr_changelog_post_op_now() now will result in + issuing ->[f]xattrop(). + + Performing a hard POST-OP (->[f]xattrop() FOP) is a more + responsible operation that what it might appear on the surface. + + The changelog of a file (in the xattr of the file on the server) + stores information (pending count) about the state of the file + on the OTHER server. This changelog is blindly trusted, and must + therefore be updated in such a way it remains trustworthy. This + implies that decrementing the pending count (essentially "clearing + the dirty flag") must be done STRICTLY after we are sure that the + operation on the other server has reached stable storage. + + While the backend filesystem on that server will eventually flush + it to stable storage, we (being in userspace) have no mechanism + to get notified when the write became "stable". + + This means we need take matter into our own hands and issue an + fsync() EVEN IF THE APPLICATION WAS PERFORMING UNSTABLE WRITES, + and get an acknowledgement for it. And we need to wait for the + fsync() acknowledgement before initiating the hard POST-OP. + + However if the FD itself was opened in O_SYNC or O_DSYNC then + we are already guaranteed that the writes were made stable as + part of the FOP itself. The same holds true for NFS stable + writes which happen on an anonymous FD with O_DSYNC or O_SYNC + flag set in the writev() @flags param. For all other write types, + mark a flag in the fdctx whenever an unstable write is witnessed. + */ + + if (!afr_fd_has_witnessed_unstable_write (this, local->fd)) { + afr_changelog_post_op_now (frame, this); + return 0; + } + + /* Check whether users want durability and perform fsync/post-op + * accordingly. + */ + if (priv->ensure_durability) { + /* Time to fsync() */ + afr_changelog_fsync (frame, this); + } else { + afr_changelog_post_op_now (frame, this); + } + + return 0; +} + + +void +afr_delayed_changelog_post_op (xlator_t *this, call_frame_t *frame, fd_t *fd, + call_stub_t *stub) +{ + afr_fd_ctx_t *fd_ctx = NULL; + call_frame_t *prev_frame = NULL; + struct timespec delta = {0, }; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + + priv = this->private; + + fd_ctx = afr_fd_ctx_get (fd, this); + if (!fd_ctx) + goto out; + + delta.tv_sec = priv->post_op_delay_secs; + delta.tv_nsec = 0; + + pthread_mutex_lock (&fd_ctx->delay_lock); + { + prev_frame = fd_ctx->delay_frame; + fd_ctx->delay_frame = NULL; + if (fd_ctx->delay_timer) + gf_timer_call_cancel (this->ctx, fd_ctx->delay_timer); + fd_ctx->delay_timer = NULL; + if (!frame) + goto unlock; + fd_ctx->delay_timer = gf_timer_call_after (this->ctx, delta, + afr_delayed_changelog_wake_up_cbk, + fd); + fd_ctx->delay_frame = frame; + } +unlock: + pthread_mutex_unlock (&fd_ctx->delay_lock); + +out: + if (prev_frame) { + local = prev_frame->local; + local->transaction.resume_stub = stub; + afr_changelog_post_op_now (prev_frame, this); + } else if (stub) { + call_resume (stub); + } +} + + +void +afr_changelog_post_op (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + + local = frame->local; + + if (is_afr_delayed_changelog_post_op_needed (frame, this)) + afr_delayed_changelog_post_op (this, frame, local->fd, NULL); + else + afr_changelog_post_op_safe (frame, this); +} + + + +/* Wake up the sleeping/delayed post-op, and also register + a stub to have it resumed after this transaction + completely finishes. + + The @stub gets saved in @local and gets resumed in + afr_local_cleanup() + */ +void +afr_delayed_changelog_wake_resume (xlator_t *this, fd_t *fd, call_stub_t *stub) +{ + afr_delayed_changelog_post_op (this, NULL, fd, stub); +} + + +void +afr_delayed_changelog_wake_up (xlator_t *this, fd_t *fd) +{ + afr_delayed_changelog_post_op (this, NULL, fd, NULL); +} + int afr_transaction_resume (call_frame_t *frame, xlator_t *this) { - afr_internal_lock_t *int_lock = NULL; afr_local_t *local = NULL; - afr_private_t *priv = NULL; local = frame->local; - int_lock = &local->internal_lock; - priv = this->private; - if (__changelog_needed_post_op (frame, this)) { + if (local->transaction.eager_lock_on) { + /* We don't need to retain "local" in the + fd list anymore, writes to all subvols + are finished by now */ + afr_remove_eager_lock_stub (local); + } + + afr_restore_lk_owner (frame); + + afr_handle_symmetric_errors (frame, this); + + if (!local->pre_op_compat) + /* new mode, pre-op was done along + with OP */ + afr_changelog_pre_op_update (frame, this); + + if (__fop_changelog_needed (frame, this)) { afr_changelog_post_op (frame, this); } else { - if (afr_lock_server_count (priv, local->transaction.type) == 0) { - local->transaction.done (frame, this); - } else { - int_lock->lock_cbk = local->transaction.done; - afr_unlock (frame, this); - } + afr_changelog_post_op_done (frame, this); } return 0; @@ -1209,16 +1582,96 @@ afr_transaction_resume (call_frame_t *frame, xlator_t *this) */ void -afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this, int child_index) +afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this, + int child_index) { afr_local_t * local = NULL; - afr_private_t * priv = NULL; local = frame->local; - priv = this->private; - __mark_child_dead (local->pending, priv->child_count, - child_index, local->transaction.type); + local->transaction.failed_subvols[child_index] = 1; +} + + + + static gf_boolean_t +afr_locals_overlap (afr_local_t *local1, afr_local_t *local2) +{ + uint64_t start1 = local1->transaction.start; + uint64_t start2 = local2->transaction.start; + uint64_t end1 = 0; + uint64_t end2 = 0; + + if (local1->transaction.len) + end1 = start1 + local1->transaction.len - 1; + else + end1 = ULLONG_MAX; + + if (local2->transaction.len) + end2 = start2 + local2->transaction.len - 1; + else + end2 = ULLONG_MAX; + + return ((end1 >= start2) && (end2 >= start1)); +} + +void +afr_transaction_eager_lock_init (afr_local_t *local, xlator_t *this) +{ + afr_private_t *priv = NULL; + afr_fd_ctx_t *fdctx = NULL; + afr_local_t *each = NULL; + + priv = this->private; + + if (!local->fd) + return; + + if (local->transaction.type != AFR_DATA_TRANSACTION) + return; + + if (!priv->eager_lock) + return; + + fdctx = afr_fd_ctx_get (local->fd, this); + if (!fdctx) + return; + + if (afr_are_multiple_fds_opened (local->fd, this)) + return; + /* + * Once full file lock is acquired in eager-lock phase, overlapping + * writes do not compete for inode-locks, instead are transferred to the + * next writes. Because of this overlapping writes are not ordered. + * This can cause inconsistencies in replication. + * Example: + * Two overlapping writes w1, w2 are sent in parallel on same fd + * in two threads t1, t2. + * Both threads can execute afr_writev_wind in the following manner. + * t1 winds w1 on brick-0 + * t2 winds w2 on brick-0 + * t2 winds w2 on brick-1 + * t1 winds w1 on brick-1 + * + * This check makes sure the locks are not transferred for + * overlapping writes. + */ + LOCK (&local->fd->lock); + { + list_for_each_entry (each, &fdctx->eager_locked, + transaction.eager_locked) { + if (afr_locals_overlap (each, local)) { + local->transaction.eager_lock_on = _gf_false; + goto unlock; + } + } + + local->transaction.eager_lock_on = _gf_true; + list_add_tail (&local->transaction.eager_locked, + &fdctx->eager_locked); + } +unlock: + UNLOCK (&local->fd->lock); } @@ -1227,20 +1680,43 @@ afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type) { afr_local_t * local = NULL; afr_private_t * priv = NULL; + fd_t *fd = NULL; + int ret = -1; local = frame->local; priv = this->private; - afr_transaction_local_init (local, priv); - local->transaction.resume = afr_transaction_resume; local->transaction.type = type; + ret = afr_transaction_local_init (local, this); + if (ret < 0) + goto out; + + afr_transaction_eager_lock_init (local, this); + + if (local->fd && local->transaction.eager_lock_on) + afr_set_lk_owner (frame, this, local->fd); + else + afr_set_lk_owner (frame, this, frame->root); + + if (!local->transaction.eager_lock_on && local->loc.inode) { + fd = fd_lookup (local->loc.inode, frame->root->pid); + if (fd == NULL) + fd = fd_lookup_anonymous (local->loc.inode); + + if (fd) { + afr_delayed_changelog_wake_up (this, fd); + fd_unref (fd); + } + } + if (afr_lock_server_count (priv, local->transaction.type) == 0) { afr_internal_lock_finish (frame, this); } else { afr_lock (frame, this); } - - return 0; + ret = 0; +out: + return ret; } diff --git a/xlators/cluster/afr/src/afr-transaction.h b/xlators/cluster/afr/src/afr-transaction.h index 167ae9ff6..77cc8eed0 100644 --- a/xlators/cluster/afr/src/afr-transaction.h +++ b/xlators/cluster/afr/src/afr-transaction.h @@ -1,25 +1,18 @@ /* - Copyright (c) 2007-2010 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef __TRANSACTION_H__ #define __TRANSACTION_H__ +#include "afr.h" + void afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this, int child_index); @@ -27,7 +20,34 @@ afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this, int afr_lock_server_count (afr_private_t *priv, afr_transaction_type type); +afr_inodelk_t* +afr_get_inodelk (afr_internal_lock_t *int_lock, char *dom); + int32_t afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type); +int +afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending); + +void +afr_set_delayed_post_op (call_frame_t *frame, xlator_t *this); + +void +afr_delayed_changelog_wake_up (xlator_t *this, fd_t *fd); + +void +__mark_all_success (call_frame_t *frame, xlator_t *this); + +gf_boolean_t +afr_txn_nothing_failed (call_frame_t *frame, xlator_t *this); + +int afr_read_txn (call_frame_t *frame, xlator_t *this, inode_t *inode, + afr_read_txn_wind_t readfn, afr_transaction_type type); + +int afr_read_txn_continue (call_frame_t *frame, xlator_t *this, int subvol); + +int __afr_txn_write_fop (call_frame_t *frame, xlator_t *this); +int __afr_txn_write_done (call_frame_t *frame, xlator_t *this); +call_frame_t *afr_transaction_detach_fop_frame (call_frame_t *frame); + #endif /* __TRANSACTION_H__ */ diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c index 0cbbbc44d..5e12910b7 100644 --- a/xlators/cluster/afr/src/afr.c +++ b/xlators/cluster/afr/src/afr.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2007-2010 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #include <libgen.h> @@ -37,8 +28,13 @@ notify (xlator_t *this, int32_t event, void *data, ...) { int ret = -1; + va_list ap; + void *data2 = NULL; - ret = afr_notify (this, event, data); + va_start (ap, data); + data2 = va_arg (ap, dict_t*); + va_end (ap); + ret = afr_notify (this, event, data, data2); return ret; } @@ -62,284 +58,147 @@ mem_acct_init (xlator_t *this) return ret; } + int -validate_options (xlator_t *this, char **op_errstr) +xlator_subvolume_index (xlator_t *this, xlator_t *subvol) { - int ret = 0; - volume_opt_list_t *vol_opt = NULL; - volume_opt_list_t *tmp; + int index = -1; + int i = 0; + xlator_list_t *list = NULL; - if (!this) { - gf_log (this->name, GF_LOG_DEBUG, "'this' not a valid ptr"); - ret =-1; - goto out; - } + list = this->children; - if (list_empty (&this->volume_options)) - goto out; - - vol_opt = list_entry (this->volume_options.next, - volume_opt_list_t, list); - list_for_each_entry_safe (vol_opt, tmp, &this->volume_options, list) { - ret = validate_xlator_volume_options_attacherr (this, - vol_opt->given_opt, - op_errstr); + while (list) { + if (subvol == list->xlator || + strcmp (subvol->name, list->xlator->name) == 0) { + index = i; + break; + } + list = list->next; + i++; } -out: - - return ret; + return index; } +void +fix_quorum_options (xlator_t *this, afr_private_t *priv, char *qtype) +{ + if (priv->quorum_count && strcmp(qtype,"fixed")) { + gf_log(this->name,GF_LOG_WARNING, + "quorum-type %s overriding quorum-count %u", + qtype, priv->quorum_count); + } + if (!strcmp(qtype,"none")) { + priv->quorum_count = 0; + } + else if (!strcmp(qtype,"auto")) { + priv->quorum_count = AFR_QUORUM_AUTO; + } +} int reconfigure (xlator_t *this, dict_t *options) { - - gf_boolean_t metadata_self_heal; /* on/off */ - gf_boolean_t entry_self_heal; - gf_boolean_t data_self_heal; - gf_boolean_t data_change_log; /* on/off */ - gf_boolean_t metadata_change_log; /* on/off */ - gf_boolean_t entry_change_log; /* on/off */ - gf_boolean_t strict_readdir; - - afr_private_t * priv = NULL; - xlator_list_t * trav = NULL; - - char * read_subvol = NULL; - char * self_heal = NULL; - char * change_log = NULL; - char * str_readdir = NULL; - char * self_heal_algo = NULL; - - int32_t background_count = 0; - int32_t window_size = 0; - - int read_ret = -1; - int dict_ret = -1; - int flag = 1; - int ret = 0; - int temp_ret = -1; + afr_private_t *priv = NULL; + xlator_t *read_subvol = NULL; + int read_subvol_index = -1; + int ret = -1; + int index = -1; + char *qtype = NULL; priv = this->private; - dict_ret = dict_get_int32 (options, "background-self-heal-count", - &background_count); - if (dict_ret == 0) { - gf_log (this->name, GF_LOG_DEBUG, - "Reconfiguring background self-heal count to %d", - background_count); + GF_OPTION_RECONF ("afr-dirty-xattr", + priv->afr_dirty, options, str, + out); - priv->background_self_heal_count = background_count; - } + GF_OPTION_RECONF ("metadata-splitbrain-forced-heal", + priv->metadata_splitbrain_forced_heal, options, bool, + out); - dict_ret = dict_get_str (options, "metadata-self-heal", - &self_heal); - if (dict_ret == 0) { - temp_ret = gf_string2boolean (self_heal, &metadata_self_heal); - if (temp_ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "Reconfiguration Invalid 'option metadata" - "-self-heal %s'. Defaulting to old value.", - self_heal); - ret = -1; - goto out; - } + GF_OPTION_RECONF ("background-self-heal-count", + priv->background_self_heal_count, options, uint32, + out); - priv->metadata_self_heal = metadata_self_heal; - gf_log (this->name, GF_LOG_DEBUG, - "Reconfiguring 'option metadata" - "-self-heal %s'.", - self_heal); - } + GF_OPTION_RECONF ("metadata-self-heal", + priv->metadata_self_heal, options, bool, out); - dict_ret = dict_get_str (options, "data-self-heal", &self_heal); - if (dict_ret == 0) { - temp_ret = gf_string2boolean (self_heal, &data_self_heal); - if (temp_ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "Reconfiguration Invalid 'option data" - "-self-heal %s'. Defaulting to old value.", - self_heal); - ret = -1; - goto out; - } + GF_OPTION_RECONF ("data-self-heal", priv->data_self_heal, options, str, + out); - priv->data_self_heal = data_self_heal; - gf_log (this->name, GF_LOG_DEBUG, - "Reconfiguring 'option data" - "-self-heal %s'.", self_heal); - } + GF_OPTION_RECONF ("entry-self-heal", priv->entry_self_heal, options, + bool, out); - dict_ret = dict_get_str (options, "entry-self-heal", - &self_heal); - if (dict_ret == 0) { - temp_ret = gf_string2boolean (self_heal, &entry_self_heal); - if (temp_ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "Reconfiguration Invalid 'option data" - "-self-heal %s'. Defaulting to old value.", - self_heal); - ret = -1; - goto out; - } + GF_OPTION_RECONF ("data-self-heal-window-size", + priv->data_self_heal_window_size, options, + uint32, out); - priv->entry_self_heal = entry_self_heal; - gf_log (this->name, GF_LOG_DEBUG, - "Reconfiguring 'option entry" - "-self-heal %s'.", self_heal); - } + GF_OPTION_RECONF ("data-change-log", priv->data_change_log, options, + bool, out); + GF_OPTION_RECONF ("metadata-change-log", + priv->metadata_change_log, options, bool, out); - dict_ret = dict_get_str (options, "strict-readdir", - &str_readdir); - if (dict_ret == 0) { - temp_ret = gf_string2boolean (str_readdir, &strict_readdir); - if (temp_ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "Invalid 'option strict-readdir %s'. " - "Defaulting to old value.", - str_readdir); - ret = -1; - goto out; - } - - priv->strict_readdir = strict_readdir; - gf_log (this->name, GF_LOG_DEBUG, - "Reconfiguring 'option strict" - "-readdir %s'.", str_readdir); - } + GF_OPTION_RECONF ("entry-change-log", priv->entry_change_log, options, + bool, out); - dict_ret = dict_get_int32 (options, "data-self-heal-window-size", - &window_size); - if (dict_ret == 0) { - gf_log (this->name, GF_LOG_DEBUG, - "Reconfiguring, Setting data self-heal window size to %d", - window_size); + GF_OPTION_RECONF ("data-self-heal-algorithm", + priv->data_self_heal_algorithm, options, str, out); - priv->data_self_heal_window_size = window_size; - } - else { - priv->data_self_heal_window_size = 16; - } + GF_OPTION_RECONF ("read-subvolume", read_subvol, options, xlator, out); + GF_OPTION_RECONF ("read-hash-mode", priv->hash_mode, + options, uint32, out); - dict_ret = dict_get_str (options, "data-change-log", &change_log); - if (dict_ret == 0) { - temp_ret = gf_string2boolean (change_log, &data_change_log); - if (temp_ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "Reconfiguration Invalid 'option data-" - "change-log %s'. Defaulting to old value.", - change_log); - ret = -1; + if (read_subvol) { + index = xlator_subvolume_index (this, read_subvol); + if (index == -1) { + gf_log (this->name, GF_LOG_ERROR, "%s not a subvolume", + read_subvol->name); goto out; } - - priv->data_change_log = data_change_log; - gf_log (this->name, GF_LOG_DEBUG, - "Reconfiguring 'option data-" - "change-log %s'.", change_log); + priv->read_child = index; } - dict_ret = dict_get_str (options, "metadata-change-log", - &change_log); - if (dict_ret == 0) { - temp_ret = gf_string2boolean (change_log, - &metadata_change_log); - if (temp_ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "Invalid 'option metadata-change-log %s'. " - "Defaulting to metadata-change-log as 'off'.", - change_log); - ret = -1; - goto out; - } + GF_OPTION_RECONF ("read-subvolume-index",read_subvol_index, options,int32,out); - priv->metadata_change_log = metadata_change_log; - gf_log (this->name, GF_LOG_DEBUG, - "Reconfiguring 'option metadata-" - "change-log %s'.", change_log); - } - - dict_ret = dict_get_str (options, "entry-change-log", - &change_log); - if (dict_ret == 0) { - temp_ret = gf_string2boolean (change_log, &entry_change_log); - if (temp_ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "Invalid 'option entry-change-log %s'. " - "Defaulting to entry-change-log as 'on'.", - change_log); - ret = -1; + if (read_subvol_index >-1) { + index=read_subvol_index; + if (index >= priv->child_count) { + gf_log (this->name, GF_LOG_ERROR, "%d not a subvolume-index", + index); goto out; } - - priv->entry_change_log = entry_change_log; - gf_log (this->name, GF_LOG_DEBUG, - "Reconfiguring 'option entry-" - "change-log %s'.", change_log); + priv->read_child = index; } - dict_ret = dict_get_str (options, "data-self-heal-algorithm", - &self_heal_algo); - if (dict_ret == 0) { - /* Handling both strcmp cases - s1 > s2 and s1 < s2 */ - - if (!strcmp (self_heal_algo, "full")) { - priv->data_self_heal_algorithm = self_heal_algo; - gf_log (this->name, GF_LOG_DEBUG, - "Reconfiguring 'option data-self" - "heal-algorithm %s'.", self_heal_algo); - goto next; - } + GF_OPTION_RECONF ("pre-op-compat", priv->pre_op_compat, options, bool, out); - if (!strcmp (self_heal_algo, "diff")) { - priv->data_self_heal_algorithm = self_heal_algo; - gf_log (this->name, GF_LOG_DEBUG, - "Reconfiguring 'option data-self" - "heal-algorithm %s'.", self_heal_algo); - goto next; - } + GF_OPTION_RECONF ("eager-lock", priv->eager_lock, options, bool, out); + GF_OPTION_RECONF ("quorum-type", qtype, options, str, out); + GF_OPTION_RECONF ("quorum-count", priv->quorum_count, options, + uint32, out); + fix_quorum_options(this,priv,qtype); - gf_log (this->name, GF_LOG_WARNING, - "Invalid self-heal algorithm %s," - "defaulting back to old value", - self_heal_algo); - ret = -1; - goto out; - } + GF_OPTION_RECONF ("post-op-delay-secs", priv->post_op_delay_secs, options, + uint32, out); - read_ret = dict_get_str (options, "read-subvolume", &read_subvol); + GF_OPTION_RECONF (AFR_SH_READDIR_SIZE_KEY, priv->sh_readdir_size, + options, size, out); + /* Reset this so we re-discover in case the topology changed. */ + GF_OPTION_RECONF ("ensure-durability", priv->ensure_durability, options, + bool, out); - if (read_ret < 0) - goto next;// No need to traverse, hence set the next option + GF_OPTION_RECONF ("self-heal-daemon", priv->shd.enabled, options, + bool, out); - trav = this->children; - flag = 0; - while (trav) { - if (!read_ret && !strcmp (read_subvol, trav->xlator->name)) { - gf_log (this->name, GF_LOG_DEBUG, - "Subvolume '%s' specified as read child.", - trav->xlator->name); - - flag = 1; - break; - } + GF_OPTION_RECONF ("iam-self-heal-daemon", priv->shd.iamshd, options, + bool, out); - trav = trav->next; - } - - if (flag == 0 ) { - gf_log (this->name, GF_LOG_ERROR, - "Invalid 'option read-subvolume %s', no such subvolume" - , read_subvol); - ret = -1; - goto out; - } + priv->did_discovery = _gf_false; -next: + ret = 0; out: return ret; @@ -354,39 +213,20 @@ static const char *favorite_child_warning_str = "You have specified subvolume '% "subvolumes. All versions of the file except that on '%s' " "WILL BE LOST."; -static const char *no_lock_servers_warning_str = "You have set lock-server-count = 0. " - "This means correctness is NO LONGER GUARANTEED in all cases. If two or more " - "applications write to the same region of a file, there is a possibility that " - "its copies will be INCONSISTENT. Set it to a value greater than 0 unless you " - "are ABSOLUTELY SURE of what you are doing and WILL NOT HOLD GlusterFS " - "RESPONSIBLE for inconsistent data. If you are in doubt, set it to a value " - "greater than 0."; int32_t init (xlator_t *this) { - afr_private_t * priv = NULL; - int child_count = 0; - xlator_list_t * trav = NULL; - int i = 0; - int ret = -1; - int op_errno = 0; - char * read_subvol = NULL; - char * fav_child = NULL; - char * self_heal = NULL; - char * algo = NULL; - char * change_log = NULL; - char * strict_readdir = NULL; - char * inodelk_trace = NULL; - char * entrylk_trace = NULL; - char * def_val = NULL; - int32_t background_count = 0; - int32_t lock_server_count = 1; - int32_t window_size = 0; - int fav_ret = -1; - int read_ret = -1; - int dict_ret = -1; - + afr_private_t *priv = NULL; + int child_count = 0; + xlator_list_t *trav = NULL; + int i = 0; + int ret = -1; + GF_UNUSED int op_errno = 0; + xlator_t *read_subvol = NULL; + int read_subvol_index = -1; + xlator_t *fav_child = NULL; + char *qtype = NULL; if (!this->children) { gf_log (this->name, GF_LOG_ERROR, @@ -400,288 +240,111 @@ init (xlator_t *this) "Volume is dangling."); } - ALLOC_OR_GOTO (this->private, afr_private_t, out); + this->private = GF_CALLOC (1, sizeof (afr_private_t), + gf_afr_mt_afr_private_t); + if (!this->private) + goto out; priv = this->private; + LOCK_INIT (&priv->lock); - read_ret = dict_get_str (this->options, "read-subvolume", &read_subvol); - priv->read_child = -1; - - fav_ret = dict_get_str (this->options, "favorite-child", &fav_child); - priv->favorite_child = -1; - - priv->background_self_heal_count = 16; + child_count = xlator_subvolume_count (this); - dict_ret = dict_get_int32 (this->options, "background-self-heal-count", - &background_count); - if (dict_ret == 0) { - gf_log (this->name, GF_LOG_DEBUG, - "Setting background self-heal count to %d", - background_count); + priv->child_count = child_count; - priv->background_self_heal_count = background_count; - } + priv->read_child = -1; - /* Default values */ - - priv->data_self_heal = 1; - priv->metadata_self_heal = 1; - priv->entry_self_heal = 1; - - dict_ret = dict_get_str (this->options, "data-self-heal", &self_heal); - if (dict_ret == 0) { - ret = gf_string2boolean (self_heal, &priv->data_self_heal); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "Invalid 'option data-self-heal %s'. " - "Defaulting to data-self-heal as 'on'", - self_heal); - priv->data_self_heal = 1; - } - } + GF_OPTION_INIT ("afr-dirty-xattr", priv->afr_dirty, str, out); - if (xlator_get_volopt_info (&this->volume_options, - "data-self-heal-algorithm", &def_val, NULL)) { - gf_log (this->name, GF_LOG_ERROR, "Default value of " - " data-self-heal-algorithm not found"); - ret = -1; - goto out; - } else { - priv->data_self_heal_algorithm = def_val; - } - dict_ret = dict_get_str (this->options, "data-self-heal-algorithm", - &algo); - if (dict_ret == 0) { - priv->data_self_heal_algorithm = gf_strdup (algo); - } + GF_OPTION_INIT ("metadata-splitbrain-forced-heal", + priv->metadata_splitbrain_forced_heal, bool, out); - if (xlator_get_volopt_info (&this->volume_options, - "data-self-heal-window-size",&def_val, - NULL)) { - gf_log (this->name, GF_LOG_ERROR, "Default value of " - "data-self-heal-window-size not found"); - ret = -1; - goto out; - } else { - if (gf_string2int32 (def_val, - (int *)&priv->data_self_heal_window_size)) { - gf_log (this->name, GF_LOG_ERROR, "Default value of " - "data-self-heal-window-size corrupt"); - ret = -1; + GF_OPTION_INIT ("read-subvolume", read_subvol, xlator, out); + if (read_subvol) { + priv->read_child = xlator_subvolume_index (this, read_subvol); + if (priv->read_child == -1) { + gf_log (this->name, GF_LOG_ERROR, "%s not a subvolume", + read_subvol->name); goto out; } } - - dict_ret = dict_get_int32 (this->options, "data-self-heal-window-size", - &window_size); - if (dict_ret == 0) { - gf_log (this->name, GF_LOG_DEBUG, - "Setting data self-heal window size to %d", - window_size); - - priv->data_self_heal_window_size = window_size; - } - - dict_ret = dict_get_str (this->options, "metadata-self-heal", - &self_heal); - if (dict_ret == 0) { - ret = gf_string2boolean (self_heal, &priv->metadata_self_heal); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "Invalid 'option metadata-self-heal %s'. " - "Defaulting to metadata-self-heal as 'on'.", - self_heal); - priv->metadata_self_heal = 1; - } - } - - dict_ret = dict_get_str (this->options, "entry-self-heal", &self_heal); - if (dict_ret == 0) { - ret = gf_string2boolean (self_heal, &priv->entry_self_heal); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "Invalid 'option entry-self-heal %s'. " - "Defaulting to entry-self-heal as 'on'.", - self_heal); - priv->entry_self_heal = 1; - } - } - - /* Change log options */ - - priv->data_change_log = 1; - priv->metadata_change_log = 1; - priv->entry_change_log = 1; - priv->optimistic_change_log = 1; - - dict_ret = dict_get_str (this->options, "data-change-log", - &change_log); - if (dict_ret == 0) { - ret = gf_string2boolean (change_log, &priv->data_change_log); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "Invalid 'option data-change-log %s'. " - "Defaulting to data-change-log as 'on'.", - change_log); - priv->data_change_log = 1; - } - } - - dict_ret = dict_get_str (this->options, "metadata-change-log", - &change_log); - if (dict_ret == 0) { - ret = gf_string2boolean (change_log, - &priv->metadata_change_log); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "Invalid 'option metadata-change-log %s'. " - "Defaulting to metadata-change-log as 'off'.", - change_log); - priv->metadata_change_log = 0; - } - } - - dict_ret = dict_get_str (this->options, "entry-change-log", - &change_log); - if (dict_ret == 0) { - ret = gf_string2boolean (change_log, &priv->entry_change_log); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "Invalid 'option entry-change-log %s'. " - "Defaulting to entry-change-log as 'on'.", - change_log); - priv->entry_change_log = 1; - } - } - - dict_ret = dict_get_str (this->options, "optimistic-change-log", - &change_log); - if (dict_ret == 0) { - ret = gf_string2boolean (change_log, &priv->optimistic_change_log); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "Invalid 'option optimistic-change-log %s'. " - "Defaulting to optimistic-change-log as 'on'.", - change_log); - priv->optimistic_change_log = 1; + GF_OPTION_INIT ("read-subvolume-index",read_subvol_index,int32,out); + if (read_subvol_index > -1) { + if (read_subvol_index >= priv->child_count) { + gf_log (this->name, GF_LOG_ERROR, "%d not a subvolume-index", + read_subvol_index); + goto out; } + priv->read_child = read_subvol_index; } + GF_OPTION_INIT ("choose-local", priv->choose_local, bool, out); - /* Locking options */ - - priv->inodelk_trace = 0; - priv->entrylk_trace = 0; - - dict_ret = dict_get_str (this->options, "inodelk-trace", - &inodelk_trace); - if (dict_ret == 0) { - ret = gf_string2boolean (inodelk_trace, &priv->inodelk_trace); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "Invalid 'option inodelk-trace %s' ", - inodelk_trace); + GF_OPTION_INIT ("read-hash-mode", priv->hash_mode, uint32, out); - priv->inodelk_trace = 0; + priv->favorite_child = -1; + GF_OPTION_INIT ("favorite-child", fav_child, xlator, out); + if (fav_child) { + priv->favorite_child = xlator_subvolume_index (this, fav_child); + if (priv->favorite_child == -1) { + gf_log (this->name, GF_LOG_ERROR, "%s not a subvolume", + fav_child->name); + goto out; } + gf_log (this->name, GF_LOG_WARNING, + favorite_child_warning_str, fav_child->name, + fav_child->name, fav_child->name); } - dict_ret = dict_get_str (this->options, "entrylk-trace", - &entrylk_trace); - if (dict_ret == 0) { - ret = gf_string2boolean (entrylk_trace, &priv->entrylk_trace); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "Invalid 'option entrylk-trace %s' ", - inodelk_trace); + GF_OPTION_INIT ("background-self-heal-count", + priv->background_self_heal_count, uint32, out); - priv->entrylk_trace = 0; - } - } + GF_OPTION_INIT ("data-self-heal", priv->data_self_heal, str, out); + GF_OPTION_INIT ("data-self-heal-algorithm", + priv->data_self_heal_algorithm, str, out); - priv->data_lock_server_count = 1; - priv->metadata_lock_server_count = 0; - priv->entry_lock_server_count = 1; + GF_OPTION_INIT ("data-self-heal-window-size", + priv->data_self_heal_window_size, uint32, out); - dict_ret = dict_get_int32 (this->options, "data-lock-server-count", - &lock_server_count); - if (dict_ret == 0) { - gf_log (this->name, GF_LOG_DEBUG, - "Setting data lock server count to %d.", - lock_server_count); + GF_OPTION_INIT ("metadata-self-heal", priv->metadata_self_heal, bool, + out); - if (lock_server_count == 0) - gf_log (this->name, GF_LOG_WARNING, "%s", - no_lock_servers_warning_str); + GF_OPTION_INIT ("entry-self-heal", priv->entry_self_heal, bool, out); - priv->data_lock_server_count = lock_server_count; - } + GF_OPTION_INIT ("data-change-log", priv->data_change_log, bool, out); + GF_OPTION_INIT ("metadata-change-log", priv->metadata_change_log, bool, + out); - dict_ret = dict_get_int32 (this->options, - "metadata-lock-server-count", - &lock_server_count); - if (dict_ret == 0) { - gf_log (this->name, GF_LOG_DEBUG, - "Setting metadata lock server count to %d.", - lock_server_count); - priv->metadata_lock_server_count = lock_server_count; - } + GF_OPTION_INIT ("entry-change-log", priv->entry_change_log, bool, out); + GF_OPTION_INIT ("optimistic-change-log", priv->optimistic_change_log, + bool, out); - dict_ret = dict_get_int32 (this->options, "entry-lock-server-count", - &lock_server_count); - if (dict_ret == 0) { - gf_log (this->name, GF_LOG_DEBUG, - "Setting entry lock server count to %d.", - lock_server_count); + GF_OPTION_INIT ("inodelk-trace", priv->inodelk_trace, bool, out); - priv->entry_lock_server_count = lock_server_count; - } + GF_OPTION_INIT ("entrylk-trace", priv->entrylk_trace, bool, out); - priv->strict_readdir = _gf_false; - - dict_ret = dict_get_str (this->options, "strict-readdir", - &strict_readdir); - if (dict_ret == 0) { - ret = gf_string2boolean (strict_readdir, &priv->strict_readdir); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "Invalid 'option strict-readdir %s'. " - "Defaulting to strict-readdir as 'off'.", - strict_readdir); - } - } + GF_OPTION_INIT ("pre-op-compat", priv->pre_op_compat, bool, out); - trav = this->children; - while (trav) { - if (!read_ret && !strcmp (read_subvol, trav->xlator->name)) { - gf_log (this->name, GF_LOG_DEBUG, - "Subvolume '%s' specified as read child.", - trav->xlator->name); + GF_OPTION_INIT ("eager-lock", priv->eager_lock, bool, out); + GF_OPTION_INIT ("quorum-type", qtype, str, out); + GF_OPTION_INIT ("quorum-count", priv->quorum_count, uint32, out); + GF_OPTION_INIT (AFR_SH_READDIR_SIZE_KEY, priv->sh_readdir_size, size, + out); + fix_quorum_options(this,priv,qtype); - priv->read_child = child_count; - } + GF_OPTION_INIT ("post-op-delay-secs", priv->post_op_delay_secs, uint32, out); + GF_OPTION_INIT ("ensure-durability", priv->ensure_durability, bool, + out); - if (fav_ret == 0 && !strcmp (fav_child, trav->xlator->name)) { - gf_log (this->name, GF_LOG_WARNING, - favorite_child_warning_str, trav->xlator->name, - trav->xlator->name, trav->xlator->name); - priv->favorite_child = child_count; - } + GF_OPTION_INIT ("self-heal-daemon", priv->shd.enabled, bool, out); - child_count++; - trav = trav->next; - } + GF_OPTION_INIT ("iam-self-heal-daemon", priv->shd.iamshd, bool, out); priv->wait_count = 1; - priv->child_count = child_count; - - LOCK_INIT (&priv->lock); - LOCK_INIT (&priv->read_child_lock); - priv->child_up = GF_CALLOC (sizeof (unsigned char), child_count, gf_afr_mt_char); if (!priv->child_up) { @@ -720,8 +383,6 @@ init (xlator_t *this) AFR_XATTR_PREFIX, trav->xlator->name); if (-1 == ret) { - gf_log (this->name, GF_LOG_ERROR, - "asprintf failed to set pending key"); ret = -ENOMEM; goto out; } @@ -730,6 +391,13 @@ init (xlator_t *this) i++; } + ret = gf_asprintf (&priv->sh_domain, AFR_SH_DATA_DOMAIN_FMT, + this->name); + if (-1 == ret) { + ret = -ENOMEM; + goto out; + } + priv->last_event = GF_CALLOC (child_count, sizeof (*priv->last_event), gf_afr_mt_int32_t); if (!priv->last_event) { @@ -737,12 +405,22 @@ init (xlator_t *this) goto out; } - LOCK_INIT (&priv->root_inode_lk); - priv->first_lookup = 1; - priv->root_inode = NULL; + ret = afr_selfheal_daemon_init (this); + if (ret) { + ret = -ENOMEM; + goto out; + } - pthread_mutex_init (&priv->mutex, NULL); - INIT_LIST_HEAD (&priv->saved_fds); + /* keep more local here as we may need them for self-heal etc */ + this->local_pool = mem_pool_new (afr_local_t, 512); + if (!this->local_pool) { + ret = -1; + gf_log (this->name, GF_LOG_ERROR, + "failed to create local_t's memory pool"); + goto out; + } + + priv->root_inode = NULL; ret = 0; out: @@ -753,6 +431,13 @@ out: int fini (xlator_t *this) { + afr_private_t *priv = NULL; + + priv = this->private; + this->private = NULL; + afr_priv_destroy (priv); + if (this->itable);//I dont see any destroy func + return 0; } @@ -771,6 +456,9 @@ struct xlator_fops fops = { .finodelk = afr_finodelk, .entrylk = afr_entrylk, .fentrylk = afr_fentrylk, + .fallocate = afr_fallocate, + .discard = afr_discard, + .zerofill = afr_zerofill, /* inode read */ .access = afr_access, @@ -778,6 +466,7 @@ struct xlator_fops fops = { .fstat = afr_fstat, .readlink = afr_readlink, .getxattr = afr_getxattr, + .fgetxattr = afr_fgetxattr, .readv = afr_readv, /* inode write */ @@ -785,9 +474,11 @@ struct xlator_fops fops = { .truncate = afr_truncate, .ftruncate = afr_ftruncate, .setxattr = afr_setxattr, + .fsetxattr = afr_fsetxattr, .setattr = afr_setattr, .fsetattr = afr_fsetattr, .removexattr = afr_removexattr, + .fremovexattr = afr_fremovexattr, /* dir read */ .opendir = afr_opendir, @@ -820,68 +511,239 @@ struct xlator_cbks cbks = { struct volume_options options[] = { { .key = {"read-subvolume" }, - .type = GF_OPTION_TYPE_XLATOR + .type = GF_OPTION_TYPE_XLATOR, + .description = "inode-read fops happen only on one of the bricks in " + "replicate. Afr will prefer the one specified using " + "this option if it is not stale. Option value must be " + "one of the xlator names of the children. " + "Ex: <volname>-client-0 till " + "<volname>-client-<number-of-bricks - 1>" + }, + { .key = {"read-subvolume-index" }, + .type = GF_OPTION_TYPE_INT, + .default_value = "-1", + .description = "inode-read fops happen only on one of the bricks in " + "replicate. AFR will prefer the one specified using " + "this option if it is not stale. allowed options" + " include -1 till replica-count - 1" + }, + { .key = {"read-hash-mode" }, + .type = GF_OPTION_TYPE_INT, + .min = 0, + .max = 2, + .default_value = "1", + .description = "inode-read fops happen only on one of the bricks in " + "replicate. AFR will prefer the one computed using " + "the method specified using this option" + "0 = first up server, " + "1 = hash by GFID of file (all clients use " + "same subvolume), " + "2 = hash by GFID of file and client PID", + }, + { .key = {"choose-local" }, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "true", + .description = "Choose a local subvolume (i.e. Brick) to read from" + " if read-subvolume is not explicitly set.", }, { .key = {"favorite-child"}, - .type = GF_OPTION_TYPE_XLATOR + .type = GF_OPTION_TYPE_XLATOR, + .description = "If a split-brain happens choose subvol/brick set by " + "this option as source." }, { .key = {"background-self-heal-count"}, .type = GF_OPTION_TYPE_INT, - .min = 0 + .min = 0, + .default_value = "16", + .validate = GF_OPT_VALIDATE_MIN, + .description = "This specifies the number of self-heals that can be " + " performed in background without blocking the fop" }, { .key = {"data-self-heal"}, - .type = GF_OPTION_TYPE_BOOL + .type = GF_OPTION_TYPE_STR, + .value = {"1", "on", "yes", "true", "enable", + "0", "off", "no", "false", "disable", + "open"}, + .default_value = "on", + .description = "Using this option we can enable/disable data " + "self-heal on the file. \"open\" means data " + "self-heal action will only be triggered by file " + "open operations." }, { .key = {"data-self-heal-algorithm"}, .type = GF_OPTION_TYPE_STR, - .default_value = "", .description = "Select between \"full\", \"diff\". The " "\"full\" algorithm copies the entire file from " "source to sink. The \"diff\" algorithm copies to " "sink only those blocks whose checksums don't match " - "with those of source.", - .value = { "diff", "full" } + "with those of source. If no option is configured " + "the option is chosen dynamically as follows: " + "If the file does not exist on one of the sinks " + "or empty file exists or if the source file size is " + "about the same as page size the entire file will " + "be read and written i.e \"full\" algo, " + "otherwise \"diff\" algo is chosen.", + .value = { "diff", "full"} }, { .key = {"data-self-heal-window-size"}, .type = GF_OPTION_TYPE_INT, .min = 1, .max = 1024, - .default_value = "16", + .default_value = "1", .description = "Maximum number blocks per file for which self-heal " "process would be applied simultaneously." }, { .key = {"metadata-self-heal"}, - .type = GF_OPTION_TYPE_BOOL + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", + .description = "Using this option we can enable/disable metadata " + "i.e. Permissions, ownerships, xattrs self-heal on " + "the file/directory." }, { .key = {"entry-self-heal"}, - .type = GF_OPTION_TYPE_BOOL + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", + .description = "Using this option we can enable/disable entry " + "self-heal on the directory." }, { .key = {"data-change-log"}, - .type = GF_OPTION_TYPE_BOOL + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", + .description = "Data fops like write/truncate will not perform " + "pre/post fop changelog operations in afr transaction " + "if this option is disabled" }, { .key = {"metadata-change-log"}, - .type = GF_OPTION_TYPE_BOOL + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", + .description = "Metadata fops like setattr/setxattr will not perform " + "pre/post fop changelog operations in afr transaction " + "if this option is disabled" }, { .key = {"entry-change-log"}, - .type = GF_OPTION_TYPE_BOOL + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", + .description = "Entry fops like create/unlink will not perform " + "pre/post fop changelog operations in afr transaction " + "if this option is disabled" }, { .key = {"optimistic-change-log"}, - .type = GF_OPTION_TYPE_BOOL + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", + .description = "Entry/Metadata fops will not perform " + "pre fop changelog operations in afr transaction " + "if this option is enabled." }, - { .key = {"data-lock-server-count"}, - .type = GF_OPTION_TYPE_INT, - .min = 0 + { .key = {"inodelk-trace"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "Enabling this option logs inode lock/unlocks" + }, + { .key = {"entrylk-trace"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "Enabling this option logs entry lock/unlocks" + }, + { .key = {"pre-op-compat"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", + .description = "Use separate pre-op xattrop() FOP rather than " + "overloading xdata of the OP" + }, + { .key = {"eager-lock"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", + .description = "Lock phase of a transaction has two sub-phases. " + "First is an attempt to acquire locks in parallel by " + "broadcasting non-blocking lock requests. If lock " + "acquisition fails on any server, then the held locks " + "are unlocked and revert to a blocking locked mode " + "sequentially on one server after another. If this " + "option is enabled the initial broadcasting lock " + "request attempt to acquire lock on the entire file. " + "If this fails, we revert back to the sequential " + "\"regional\" blocking lock as before. In the case " + "where such an \"eager\" lock is granted in the " + "non-blocking phase, it gives rise to an opportunity " + "for optimization. i.e, if the next write transaction " + "on the same FD arrives before the unlock phase of " + "the first transaction, it \"takes over\" the full " + "file lock. Similarly if yet another data transaction " + "arrives before the unlock phase of the \"optimized\" " + "transaction, that in turn \"takes over\" the lock as " + "well. The actual unlock now happens at the end of " + "the last \"optimized\" transaction." + + }, + { .key = {"self-heal-daemon"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", + .description = "This option applies to only self-heal-daemon. " + "Index directory crawl and automatic healing of files " + "will not be performed if this option is turned off." + }, + { .key = {"iam-self-heal-daemon"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "This option differentiates if the replicate " + "translator is running as part of self-heal-daemon " + "or not." }, - { .key = {"metadata-lock-server-count"}, + { .key = {"quorum-type"}, + .type = GF_OPTION_TYPE_STR, + .value = { "none", "auto", "fixed"}, + .default_value = "none", + .description = "If value is \"fixed\" only allow writes if " + "quorum-count bricks are present. If value is " + "\"auto\" only allow writes if more than half of " + "bricks, or exactly half including the first, are " + "present.", + }, + { .key = {"quorum-count"}, .type = GF_OPTION_TYPE_INT, - .min = 0 + .min = 1, + .max = INT_MAX, + .default_value = 0, + .description = "If quorum-type is \"fixed\" only allow writes if " + "this many bricks or present. Other quorum types " + "will OVERWRITE this value.", }, - { .key = {"entry-lock-server-count"}, + { .key = {"node-uuid"}, + .type = GF_OPTION_TYPE_STR, + .description = "Local glusterd uuid string, used in starting " + "self-heal-daemon so that it can crawl only on " + "local index directories.", + }, + { .key = {"post-op-delay-secs"}, .type = GF_OPTION_TYPE_INT, - .min = 0 + .min = 0, + .max = INT_MAX, + .default_value = "1", + .description = "Time interval induced artificially before " + "post-operation phase of the transaction to " + "enhance overlap of adjacent write operations.", + }, + { .key = {AFR_SH_READDIR_SIZE_KEY}, + .type = GF_OPTION_TYPE_SIZET, + .description = "readdirp size for performing entry self-heal", + .min = 1024, + .max = 131072, + .default_value = "1KB", }, - { .key = {"strict-readdir"}, + { .key = {"ensure-durability"}, .type = GF_OPTION_TYPE_BOOL, + .description = "Afr performs fsyncs for transactions if this " + "option is on to make sure the changelogs/data is " + "written to the disk", + .default_value = "on", }, + { .key = {"afr-dirty-xattr"}, + .type = GF_OPTION_TYPE_STR, + .default_value = AFR_DIRTY_DEFAULT, + }, + { .key = {"metadata-splitbrain-forced-heal"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + }, { .key = {NULL} }, }; diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index 915ab20b1..36042f7b2 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -1,20 +1,11 @@ /* - Copyright (c) 2008-2010 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ @@ -31,57 +22,47 @@ #include "afr-mem-types.h" #include "libxlator.h" +#include "timer.h" +#include "syncop.h" + +#include "afr-self-heald.h" #define AFR_XATTR_PREFIX "trusted.afr" #define AFR_PATHINFO_HEADER "REPLICATE:" +#define AFR_SH_READDIR_SIZE_KEY "self-heal-readdir-size" +#define AFR_SH_DATA_DOMAIN_FMT "%s:self-heal" +#define AFR_DIRTY_DEFAULT AFR_XATTR_PREFIX ".dirty" +#define AFR_DIRTY (((afr_private_t *) (THIS->private))->afr_dirty) -struct _pump_private; +#define AFR_LOCKEE_COUNT_MAX 3 +#define AFR_DOM_COUNT_MAX 3 +#define AFR_NUM_CHANGE_LOGS 3 /*data + metadata + entry*/ -typedef int (*afr_expunge_done_cbk_t) (call_frame_t *frame, xlator_t *this, - int child, int32_t op_error, - int32_t op_errno); +typedef int (*afr_lock_cbk_t) (call_frame_t *frame, xlator_t *this); -typedef int (*afr_impunge_done_cbk_t) (call_frame_t *frame, xlator_t *this, - int child, int32_t op_error, - int32_t op_errno); -typedef int (*afr_post_remove_call_t) (call_frame_t *frame, xlator_t *this); +typedef int (*afr_read_txn_wind_t) (call_frame_t *frame, xlator_t *this, int subvol); -typedef int (*afr_lock_cbk_t) (call_frame_t *frame, xlator_t *this); +typedef int (*afr_inode_refresh_cbk_t) (call_frame_t *frame, xlator_t *this, int err); -typedef struct afr_inode_params_ { - uint64_t mask_type; - union { - gf_boolean_t value; - struct { - int32_t read_child; - int32_t *fresh_children; - } read_ctx; - } u; -} afr_inode_params_t; +typedef int (*afr_changelog_resume_t) (call_frame_t *frame, xlator_t *this); -typedef struct afr_inode_ctx_ { - uint64_t masks; - int32_t *fresh_children;//increasing order of latency -} afr_inode_ctx_t; +#define alloca0(size) ({void *__ptr; __ptr = alloca(size); memset(__ptr, 0, size); __ptr;}) +#define AFR_COUNT(array,max) ({int __i; int __res = 0; for (__i = 0; __i < max; __i++) if (array[__i]) __res++; __res;}) +#define AFR_INTERSECT(dst,src1,src2,max) ({int __i; for (__i = 0; __i < max; __i++) dst[__i] = src1[__i] && src2[__i];}) typedef struct _afr_private { gf_lock_t lock; /* to guard access to child_count, etc */ unsigned int child_count; /* total number of children */ - unsigned int read_child_rr; /* round-robin index of the read_child */ - gf_lock_t read_child_lock; /* lock to protect above */ - xlator_t **children; - gf_lock_t root_inode_lk; - int first_lookup; inode_t *root_inode; unsigned char *child_up; char **pending_key; - gf_boolean_t data_self_heal; /* on/off */ + char *data_self_heal; /* on/off/open */ char * data_self_heal_algorithm; /* name of algorithm */ unsigned int data_self_heal_window_size; /* max number of pipelined read/writes */ @@ -95,141 +76,53 @@ typedef struct _afr_private { gf_boolean_t metadata_change_log; /* on/off */ gf_boolean_t entry_change_log; /* on/off */ + gf_boolean_t metadata_splitbrain_forced_heal; /* on/off */ int read_child; /* read-subvolume */ + unsigned int hash_mode; /* for when read_child is not set */ int favorite_child; /* subvolume to be preferred in resolving split-brain cases */ - unsigned int data_lock_server_count; - unsigned int metadata_lock_server_count; - unsigned int entry_lock_server_count; - gf_boolean_t inodelk_trace; gf_boolean_t entrylk_trace; - gf_boolean_t strict_readdir; - unsigned int wait_count; /* # of servers to wait for success */ uint64_t up_count; /* number of CHILD_UPs we have seen */ uint64_t down_count; /* number of CHILD_DOWNs we have seen */ - struct _pump_private *pump_private; /* Set if we are loaded as pump */ - int use_afr_in_pump; - - pthread_mutex_t mutex; - struct list_head saved_fds; /* list of fds on which locks have succeeded */ - gf_boolean_t optimistic_change_log; + gf_boolean_t optimistic_change_log; + gf_boolean_t eager_lock; + gf_boolean_t pre_op_compat; /* on/off */ + uint32_t post_op_delay_secs; + unsigned int quorum_count; char vol_uuid[UUID_SIZE + 1]; int32_t *last_event; -} afr_private_t; - -typedef struct { - /* External interface: These are variables (some optional) that - are set by whoever has triggered self-heal */ - - gf_boolean_t need_data_self_heal; - gf_boolean_t need_metadata_self_heal; - gf_boolean_t need_entry_self_heal; - gf_boolean_t need_gfid_self_heal; - gf_boolean_t need_missing_entry_self_heal; - - gf_boolean_t forced_merge; /* Is this a self-heal triggered to - forcibly merge the directories? */ - - gf_boolean_t healing_fd_opened; /* true if caller has already - opened fd */ - - gf_boolean_t data_lock_held; /* true if caller has already - acquired 0-0 lock */ - - fd_t *healing_fd; /* set if callers has opened fd */ - - gf_boolean_t background; /* do self-heal in background - if possible */ - - ia_type_t type; /* st_mode of the entry we're doing - self-heal on */ - inode_t *inode; /* inode on which the self-heal is - performed on */ - uuid_t sh_gfid_req; /* gfid self-heal needs to be done - with this gfid if it is not null */ - - /* Function to call to unwind. If self-heal is being done in the - background, this function will be called as soon as possible. */ - - int (*unwind) (call_frame_t *frame, xlator_t *this, int32_t op_ret, - int32_t op_errno); - - /* End of external interface members */ - - - /* array of stat's, one for each child */ - struct iatt *buf; - struct iatt *parentbufs; - struct iatt parentbuf; - struct iatt entrybuf; - - afr_expunge_done_cbk_t expunge_done; - afr_impunge_done_cbk_t impunge_done; - int32_t impunge_ret_child; - - /* array of xattr's, one for each child */ - dict_t **xattr; - - /* array containing if the lookups succeeded in the order of response - */ - int32_t *success_children; - int success_count; - /* array containing the fresh children found in the self-heal process */ - int32_t *fresh_children; - /* array containing the fresh children found in the parent lookup */ - int32_t *fresh_parent_dirs; - /* array of errno's, one for each child */ - int *child_errno; - - int32_t **pending_matrix; - int32_t **delta_matrix; - - int32_t op_ret; - int32_t op_errno; - - int *sources; - int source; - int active_source; - int active_sinks; - int *success; - unsigned char *locked_nodes; - int lock_count; - - mode_t impunging_entry_mode; - const char *linkname; - int op_failed; - - int file_has_holes; - blksize_t block_size; - off_t file_size; - off_t offset; - afr_post_remove_call_t post_remove_call; - - loc_t parent_loc; - - call_frame_t *orig_frame; - gf_boolean_t unwound; - - /* private data for the particular self-heal algorithm */ - void *private; - - int (*flush_self_heal_cbk) (call_frame_t *frame, xlator_t *this); - - int (*completion_cbk) (call_frame_t *frame, xlator_t *this); - int (*algo_completion_cbk) (call_frame_t *frame, xlator_t *this); - int (*algo_abort_cbk) (call_frame_t *frame, xlator_t *this); - void (*gfid_sh_success_cbk) (call_frame_t *sh_frame, xlator_t *this); - - call_frame_t *sh_frame; -} afr_self_heal_t; + /* @event_generation: Keeps count of number of events received which can + potentially impact consistency decisions. The events are CHILD_UP + and CHILD_DOWN, when we have to recalculate the freshness/staleness + of copies to detect if changes had happened while the other server + was down. CHILD_DOWN and CHILD_UP can also be received on network + disconnect/reconnects and not necessarily server going down/up. + Recalculating freshness/staleness on network events is equally + important as we might have had a network split brain. + */ + uint32_t event_generation; + + gf_boolean_t choose_local; + gf_boolean_t did_discovery; + uint64_t sh_readdir_size; + gf_boolean_t ensure_durability; + char *sh_domain; + char *afr_dirty; + + afr_self_heald_t shd; + + /* pump dependencies */ + void *pump_private; + gf_boolean_t use_afr_in_pump; +} afr_private_t; typedef enum { @@ -292,11 +185,31 @@ afr_index_for_transaction_type (afr_transaction_type type) return -1; /* make gcc happy */ } +typedef struct { + loc_t loc; + char *basename; + unsigned char *locked_nodes; + int locked_count; + +} afr_entry_lockee_t; + +int +afr_entry_lockee_cmp (const void *l1, const void *l2); + +typedef struct { + char *domain; /* Domain on which inodelk is taken */ + struct gf_flock flock; + unsigned char *locked_nodes; + int32_t lock_count; +} afr_inodelk_t; typedef struct { loc_t *lk_loc; - struct gf_flock lk_flock; + int lockee_count; + afr_entry_lockee_t lockee[AFR_LOCKEE_COUNT_MAX]; + + afr_inodelk_t inodelk[AFR_DOM_COUNT_MAX]; const char *lk_basename; const char *lower_basename; const char *higher_basename; @@ -305,86 +218,203 @@ typedef struct { unsigned char *locked_nodes; unsigned char *lower_locked_nodes; - unsigned char *inode_locked_nodes; - unsigned char *entry_locked_nodes; selfheal_lk_type_t selfheal_lk_type; transaction_lk_type_t transaction_lk_type; int32_t lock_count; - int32_t inodelk_lock_count; int32_t entrylk_lock_count; uint64_t lock_number; int32_t lk_call_count; + int32_t lk_expected_count; + int32_t lk_attempted_count; int32_t lock_op_ret; int32_t lock_op_errno; + afr_lock_cbk_t lock_cbk; + char *domain; /* Domain on which inode/entry lock/unlock in progress.*/ +} afr_internal_lock_t; - int (*lock_cbk) (call_frame_t *, xlator_t *); +struct afr_reply { + int valid; + int32_t op_ret; + int32_t op_errno; + dict_t *xdata; + struct iatt poststat; + struct iatt postparent; + struct iatt prestat; + struct iatt preparent; + struct iatt preparent2; + struct iatt postparent2; + uint8_t checksum[MD5_DIGEST_LENGTH]; +}; -} afr_internal_lock_t; +typedef enum { + AFR_FD_NOT_OPENED, + AFR_FD_OPENED, + AFR_FD_OPENING +} afr_fd_open_status_t; + +typedef struct { + unsigned int *pre_op_done[AFR_NUM_CHANGE_LOGS]; + int inherited[AFR_NUM_CHANGE_LOGS]; + int on_disk[AFR_NUM_CHANGE_LOGS]; + afr_fd_open_status_t *opened_on; /* which subvolumes the fd is open on */ + + unsigned int *lock_piggyback; + unsigned int *lock_acquired; + + int flags; + + /* used for delayed-post-op optimization */ + pthread_mutex_t delay_lock; + gf_timer_t *delay_timer; + call_frame_t *delay_frame; + + /* set if any write on this fd was a non stable write + (i.e, without O_SYNC or O_DSYNC) + */ + gf_boolean_t witnessed_unstable_write; + + /* @open_fd_count: + Number of open FDs queried from the server, as queried through + xdata in FOPs. Currently, used to decide if eager-locking must be + temporarily disabled. + */ + uint32_t open_fd_count; + + + /* list of frames currently in progress */ + struct list_head eager_locked; +} afr_fd_ctx_t; -typedef struct _afr_locked_fd { - fd_t *fd; - struct list_head list; -} afr_locked_fd_t; typedef struct _afr_local { - int uid; - int gid; + glusterfs_fop_t op; unsigned int call_count; - unsigned int success_count; - unsigned int enoent_count; - - unsigned int govinda_gOvinda; + /* @event_generation: copy of priv->event_generation taken at the + time of starting the transaction. The copy is made so that we + have a stable value through the various phases of the transaction. + */ + unsigned int event_generation; - unsigned int read_child_index; - unsigned char read_child_returned; - unsigned int first_up_child; + uint32_t open_fd_count; + gf_boolean_t update_open_fd_count; - pid_t saved_pid; + gf_lkowner_t saved_lk_owner; int32_t op_ret; int32_t op_errno; int32_t **pending; + int dirty[AFR_NUM_CHANGE_LOGS]; + loc_t loc; loc_t newloc; fd_t *fd; + afr_fd_ctx_t *fd_ctx; - glusterfs_fop_t fop; - + /* @child_up: copy of priv->child_up taken at the time of transaction + start. The copy is taken so that we have a stable child_up array + through the phases of the transaction as priv->child_up[i] can keep + changing through time. + */ unsigned char *child_up; - int32_t *fresh_children; //in the order of response - int32_t *child_errno; + /* @read_attempted: + array of flags representing subvolumes where read operations of + the read transaction have already been attempted. The array is + first pre-filled with down subvolumes, and as reads are performed + on other subvolumes, those are set as well. This way if the read + operation fails we do not retry on that subvolume again. + */ + unsigned char *read_attempted; - dict_t *xattr_req; + /* @readfn: - int32_t inodelk_count; - int32_t entrylk_count; + pointer to function which will perform the read operation on a given + subvolume. Used in read transactions. + */ - afr_internal_lock_t internal_lock; + afr_read_txn_wind_t readfn; + + /* @refreshed: + + the inode was "refreshed" (i.e, pending xattrs from all subvols + freshly inspected and inode ctx updated accordingly) as part of + this transaction already. + */ + gf_boolean_t refreshed; + + /* @inode: + + the inode on which the read txn is performed on. ref'ed and copied + from either fd->inode or loc.inode + */ + + inode_t *inode; + + /* @parent[2]: + + parent inode[s] on which directory transactions are performed. + */ - afr_locked_fd_t *locked_fd; - int32_t source_child; - int32_t lock_recovery_child; + inode_t *parent; + inode_t *parent2; + + /* @readable: + + array of flags representing servers from which a read can be + performed. This is the output of afr_inode_refresh() + */ + unsigned char *readable; + + afr_inode_refresh_cbk_t refreshfn; + + /* @refreshinode: + + Inode currently getting refreshed. + */ + inode_t *refreshinode; + + /* + @pre_op_compat: + + compatibility mode of pre-op. send a separate pre-op and + op operations as part of transaction, rather than combining + */ + + gf_boolean_t pre_op_compat; + + dict_t *xattr_req; + + afr_internal_lock_t internal_lock; dict_t *dict; + int optimistic_change_log; + gf_boolean_t delayed_post_op; + + /* Is the current writev() going to perform a stable write? + i.e, is fd->flags or @flags writev param have O_SYNC or + O_DSYNC? + */ + gf_boolean_t stable_write; - int (*openfd_flush_cbk) (call_frame_t *frame, xlator_t *this); + /* This write appended to the file. Nnot necessarily O_APPEND, + just means the offset of write was at the end of file. + */ + gf_boolean_t append_write; /* This struct contains the arguments for the "continuation" (scheme-like) of fops */ - int op; struct { struct { unsigned char buf_set; @@ -392,25 +422,7 @@ typedef struct _afr_local { } statfs; struct { - uuid_t gfid_req; - inode_t *inode; - struct iatt buf; - struct iatt postparent; - ino_t ino; - uint64_t gen; - ino_t parent_ino; - dict_t **xattrs; - dict_t *xattr; - struct iatt *postparents; - struct iatt *bufs; - int32_t read_child; - int32_t *sources; - int32_t *success_children; - } lookup; - - struct { int32_t flags; - int32_t wbflags; } open; struct { @@ -429,31 +441,28 @@ typedef struct _afr_local { struct { int last_index; - ino_t ino; } stat; struct { int last_index; - ino_t ino; } fstat; struct { size_t size; int last_index; - ino_t ino; } readlink; struct { char *name; int last_index; - long pathinfo_len; + long xattr_len; } getxattr; struct { - ino_t ino; size_t size; off_t offset; int last_index; + uint32_t flags; } readv; /* dir read */ @@ -471,59 +480,43 @@ typedef struct _afr_local { int32_t op_errno; size_t size; off_t offset; - + dict_t *dict; gf_boolean_t failed; int last_index; } readdir; /* inode write */ struct { - ino_t ino; struct iatt prebuf; struct iatt postbuf; + } inode_wfop; //common structure for all inode-write-fops + struct { int32_t op_ret; struct iovec *vector; struct iobref *iobref; int32_t count; off_t offset; + uint32_t flags; } writev; struct { - ino_t ino; - struct iatt prebuf; - struct iatt postbuf; - } fsync; - - struct { - ino_t ino; off_t offset; - struct iatt prebuf; - struct iatt postbuf; } truncate; struct { - ino_t ino; off_t offset; - struct iatt prebuf; - struct iatt postbuf; } ftruncate; struct { - ino_t ino; struct iatt in_buf; int32_t valid; - struct iatt preop_buf; - struct iatt postop_buf; } setattr; struct { - ino_t ino; struct iatt in_buf; int32_t valid; - struct iatt preop_buf; - struct iatt postop_buf; } fsetattr; struct { @@ -532,116 +525,87 @@ typedef struct _afr_local { } setxattr; struct { + dict_t *dict; + int32_t flags; + } fsetxattr; + + struct { char *name; } removexattr; + struct { + dict_t *xattr; + } xattrop; + + struct { + dict_t *xattr; + } fxattrop; + /* dir write */ struct { - ino_t ino; - uint64_t gen; - ino_t parent_ino; - fd_t *fd; - dict_t *params; - int32_t flags; - mode_t mode; inode_t *inode; struct iatt buf; struct iatt preparent; struct iatt postparent; - struct iatt read_child_buf; + struct iatt prenewparent; + struct iatt postnewparent; + } dir_fop; //common structure for all dir fops + + struct { + fd_t *fd; + dict_t *params; + int32_t flags; + mode_t mode; } create; struct { - ino_t ino; - uint64_t gen; - ino_t parent_ino; dev_t dev; mode_t mode; dict_t *params; - inode_t *inode; - struct iatt buf; - struct iatt preparent; - struct iatt postparent; - struct iatt read_child_buf; } mknod; struct { - ino_t ino; - uint64_t gen; - ino_t parent_ino; int32_t mode; dict_t *params; - inode_t *inode; - struct iatt buf; - struct iatt read_child_buf; - struct iatt preparent; - struct iatt postparent; } mkdir; struct { - ino_t parent_ino; - int32_t op_ret; - int32_t op_errno; - struct iatt preparent; - struct iatt postparent; - } unlink; - - struct { - int flags; - ino_t parent_ino; - int32_t op_ret; - int32_t op_errno; - struct iatt preparent; - struct iatt postparent; + int flags; } rmdir; struct { - ino_t oldparent_ino; - ino_t newparent_ino; - ino_t ino; - struct iatt buf; - struct iatt read_child_buf; - struct iatt preoldparent; - struct iatt prenewparent; - struct iatt postoldparent; - struct iatt postnewparent; - } rename; - - struct { - ino_t ino; - uint64_t gen; - ino_t parent_ino; - inode_t *inode; - struct iatt buf; - struct iatt read_child_buf; - struct iatt preparent; - struct iatt postparent; - } link; - - struct { - ino_t ino; - uint64_t gen; - ino_t parent_ino; - inode_t *inode; dict_t *params; - struct iatt buf; - struct iatt read_child_buf; char *linkpath; - struct iatt preparent; - struct iatt postparent; } symlink; + struct { + int32_t mode; + off_t offset; + size_t len; + } fallocate; + + struct { + off_t offset; + size_t len; + } discard; + struct { - int32_t flags; - dir_entry_t *entries; - int32_t count; - } setdents; + off_t offset; + off_t len; + struct iatt prebuf; + struct iatt postbuf; + } zerofill; + + } cont; struct { off_t start, len; + gf_boolean_t eager_lock_on; + int *eager_lock; + char *basename; char *new_basename; @@ -650,15 +614,67 @@ typedef struct _afr_local { afr_transaction_type type; - int success_count; - int erase_pending; - int failure_count; + /* stub to resume on destruction + of the transaction frame */ + call_stub_t *resume_stub; + + struct list_head eager_locked; + + unsigned char *pre_op; + + /* @fop_subvols: subvolumes on which FOP will be attempted */ + unsigned char *fop_subvols; + + /* @failed_subvols: subvolumes on which FOP failed. Always + a subset of @fop_subvols */ + unsigned char *failed_subvols; + + /* @dirtied: flag which indicates whether we set dirty flag + in the OP. Typically true when we are performing operation + on more than one subvol and optimistic changelog is disabled + + A 'true' value set in @dirtied flag means an 'undirtying' + has to be done in POST-OP phase. + */ + gf_boolean_t dirtied; + + /* @inherited: flag which indicates that the dirty flags + of the previous transaction were inherited + */ + gf_boolean_t inherited; + + /* + @no_uninherit: flag which indicates that a pre_op_uninherit() + must _not_ be attempted (and returned as failure) always. This + flag is set when a hard pre-op is performed, but not accounted + for it in fd_ctx->on_disk[]. Such transactions are "isolated" + from the pre-op piggybacking entirely and therefore uninherit + must not be attempted. + */ + gf_boolean_t no_uninherit; - int last_tried; - int32_t *child_errno; + /* @uninherit_done: + @uninherit_value: + + The above pair variables make pre_op_uninherit() idempotent. + Both are FALSE initially. The first call to pre_op_uninherit + sets @uninherit_done to TRUE and the return value to + @uninherit_value. Further calls will check for @uninherit_done + to be TRUE and if so will simply return @uninherit_value. + */ + gf_boolean_t uninherit_done; + gf_boolean_t uninherit_value; + + /* @changelog_resume: function to be called after changlogging + (either pre-op or post-op) is done + */ + + afr_changelog_resume_t changelog_resume; call_frame_t *main_frame; + int (*wind) (call_frame_t *frame, xlator_t *this, int subvol); + int (*fop) (call_frame_t *frame, xlator_t *this); int (*done) (call_frame_t *frame, xlator_t *this); @@ -670,85 +686,96 @@ typedef struct _afr_local { /* post-op hook */ } transaction; - afr_self_heal_t self_heal; + syncbarrier_t barrier; struct marker_str marker; -} afr_local_t; + /* extra data for fops */ + dict_t *xdata_req; + dict_t *xdata_rsp; -typedef struct { - unsigned int *pre_op_done; - unsigned int *opened_on; /* which subvolumes the fd is open on */ - unsigned int *pre_op_piggyback; + mode_t umask; + int xflag; + gf_boolean_t do_discovery; + struct afr_reply *replies; +} afr_local_t; - int flags; - int32_t wbflags; - uint64_t up_count; /* number of CHILD_UPs this fd has seen */ - uint64_t down_count; /* number of CHILD_DOWNs this fd has seen */ - int32_t last_tried; +/* did a call fail due to a child failing? */ +#define child_went_down(op_ret, op_errno) (((op_ret) < 0) && \ + ((op_errno == ENOTCONN) || \ + (op_errno == EBADFD))) - int hit, miss; - gf_boolean_t failed_over; - struct list_head entries; /* needed for readdir failover */ +int +afr_inode_read_subvol_get (inode_t *inode, xlator_t *this, + unsigned char *data_subvols, + unsigned char *metadata_subvols, + int *event_generation); +int +__afr_inode_read_subvol_get (inode_t *inode, xlator_t *this, + unsigned char *data_subvols, + unsigned char *metadata_subvols, + int *event_generation); - unsigned char *locked_on; /* which subvolumes locks have been successful */ -} afr_fd_ctx_t; +int +__afr_inode_read_subvol_set (inode_t *inode, xlator_t *this, + unsigned char *data_subvols, + unsigned char *metadata_subvol, + int event_generation); +int +afr_inode_read_subvol_set (inode_t *inode, xlator_t *this, + unsigned char *data_subvols, + unsigned char *metadata_subvols, + int event_generation); +int +afr_inode_read_subvol_reset (inode_t *inode, xlator_t *this); -/* try alloc and if it fails, goto label */ -#define ALLOC_OR_GOTO(var, type, label) do { \ - var = GF_CALLOC (sizeof (type), 1, \ - gf_afr_mt_##type); \ - if (!var) { \ - gf_log (this->name, GF_LOG_ERROR, \ - "out of memory :("); \ - op_errno = ENOMEM; \ - goto label; \ - } \ - } while (0); +int +afr_read_subvol_select_by_policy (inode_t *inode, xlator_t *this, + unsigned char *readable); +int +afr_inode_read_subvol_type_get (inode_t *inode, xlator_t *this, + unsigned char *readable, int *event_p, + int type); +int +afr_read_subvol_get (inode_t *inode, xlator_t *this, int *subvol_p, + int *event_p, afr_transaction_type type); -/* did a call fail due to a child failing? */ -#define child_went_down(op_ret, op_errno) (((op_ret) < 0) && \ - ((op_errno == ENOTCONN) || \ - (op_errno == EBADFD))) +#define afr_data_subvol_get(i, t, s, e) \ + afr_read_subvol_get(i, t, s, e, AFR_DATA_TRANSACTION) -#define afr_fop_failed(op_ret, op_errno) ((op_ret) == -1) +#define afr_metadata_subvol_get(i, t, s, e) \ + afr_read_subvol_get(i, t, s, e, AFR_METADATA_TRANSACTION) -/* have we tried all children? */ -#define all_tried(i, count) ((i) == (count) - 1) +int +afr_inode_refresh (call_frame_t *frame, xlator_t *this, inode_t *inode, + afr_inode_refresh_cbk_t cbk); int32_t -afr_set_dict_gfid (dict_t *dict, uuid_t gfid); +afr_notify (xlator_t *this, int32_t event, void *data, void *data2); int -pump_command_reply (call_frame_t *frame, xlator_t *this); +afr_init_entry_lockee (afr_entry_lockee_t *lockee, afr_local_t *local, + loc_t *loc, char *basename, int child_count); -int32_t -afr_notify (xlator_t *this, int32_t event, - void *data, ...); +void +afr_entry_lockee_cleanup (afr_internal_lock_t *int_lock); int afr_attempt_lock_recovery (xlator_t *this, int32_t child_index); int -afr_save_locked_fd (xlator_t *this, fd_t *fd); - -int afr_mark_locked_nodes (xlator_t *this, fd_t *fd, unsigned char *locked_nodes); void -afr_set_lk_owner (call_frame_t *frame, xlator_t *this); +afr_set_lk_owner (call_frame_t *frame, xlator_t *this, void *lk_owner); int afr_set_lock_number (call_frame_t *frame, xlator_t *this); - -loc_t * -lower_path (loc_t *l1, const char *b1, loc_t *l2, const char *b2); - int32_t afr_unlock (call_frame_t *frame, xlator_t *this); @@ -764,33 +791,30 @@ afr_blocking_lock (call_frame_t *frame, xlator_t *this); int afr_internal_lock_finish (call_frame_t *frame, xlator_t *this); +int +afr_lk_transfer_datalock (call_frame_t *dst, call_frame_t *src, char *dom, + unsigned int child_count); -int pump_start (call_frame_t *frame, xlator_t *this); +int +__afr_fd_ctx_set (xlator_t *this, fd_t *fd); int afr_fd_ctx_set (xlator_t *this, fd_t *fd); -int32_t -afr_inode_get_read_ctx (xlator_t *this, inode_t *inode, int32_t *fresh_children); - -void -afr_inode_set_read_ctx (xlator_t *this, inode_t *inode, int32_t read_child, - int32_t *fresh_children); +afr_fd_ctx_t * +afr_fd_ctx_get (fd_t *fd, xlator_t *this); -void -afr_build_parent_loc (loc_t *parent, loc_t *child); +int +afr_build_parent_loc (loc_t *parent, loc_t *child, int32_t *op_errno); int -afr_up_children_count (int child_count, unsigned char *child_up); +afr_locked_nodes_count (unsigned char *locked_nodes, int child_count); -gf_boolean_t -afr_is_fresh_lookup (loc_t *loc, xlator_t *this); +int +afr_replies_interpret (call_frame_t *frame, xlator_t *this, inode_t *inode); void -afr_update_loc_gfids (loc_t *loc, struct iatt *buf, struct iatt *postparent); - -int -afr_locked_nodes_count (unsigned char *locked_nodes, int child_count); +afr_replies_wipe (afr_local_t *local, afr_private_t *priv); void afr_local_cleanup (afr_local_t *local, xlator_t *this); @@ -798,21 +822,9 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this); int afr_frame_return (call_frame_t *frame); -uint64_t -afr_is_split_brain (xlator_t *this, inode_t *inode); - -void -afr_set_split_brain (xlator_t *this, inode_t *inode, gf_boolean_t set); - int afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, - fd_t *fd, int32_t wbflags); - -void -afr_set_opendir_done (xlator_t *this, inode_t *inode); - -gf_boolean_t -afr_is_opendir_done (xlator_t *this, inode_t *inode); + fd_t *fd, dict_t *xdata); void afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this); @@ -820,9 +832,6 @@ afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this); int afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd); -int -afr_openfd_flush (call_frame_t *frame, xlator_t *this, fd_t *fd); - #define AFR_STACK_UNWIND(fop, frame, params ...) \ do { \ afr_local_t *__local = NULL; \ @@ -833,22 +842,36 @@ afr_openfd_flush (call_frame_t *frame, xlator_t *this, fd_t *fd); frame->local = NULL; \ } \ STACK_UNWIND_STRICT (fop, frame, params); \ - afr_local_cleanup (__local, __this); \ - GF_FREE (__local); \ - } while (0); + if (__local) { \ + afr_local_cleanup (__local, __this); \ + mem_put (__local); \ + } \ + } while (0) -#define AFR_STACK_DESTROY(frame) \ - do { \ - afr_local_t *__local = NULL; \ - xlator_t *__this = NULL; \ - __local = frame->local; \ - __this = frame->this; \ - frame->local = NULL; \ - STACK_DESTROY (frame->root); \ - afr_local_cleanup (__local, __this); \ - GF_FREE (__local); \ +#define AFR_STACK_DESTROY(frame) \ + do { \ + afr_local_t *__local = NULL; \ + xlator_t *__this = NULL; \ + __local = frame->local; \ + __this = frame->this; \ + frame->local = NULL; \ + STACK_DESTROY (frame->root); \ + if (__local) { \ + afr_local_cleanup (__local, __this); \ + mem_put (__local); \ + } \ } while (0); +#define AFR_FRAME_INIT(frame, op_errno) \ + ({frame->local = mem_get0 (THIS->local_pool); \ + if (afr_local_init (frame->local, THIS->private, &op_errno)) { \ + afr_local_cleanup (frame->local, THIS); \ + mem_put (frame->local); \ + frame->local = NULL; }; \ + frame->local;}) + +#define AFR_STACK_RESET(frame) do { int opr; STACK_RESET (frame->root); AFR_FRAME_INIT(frame, opr);} while (0) + /* allocate and return a string that is the basename of argument */ static inline char * AFR_BASENAME (const char *str) @@ -861,84 +884,93 @@ AFR_BASENAME (const char *str) return __basename_str; } +call_frame_t * +afr_copy_frame (call_frame_t *base); + int -afr_transaction_local_init (afr_local_t *local, afr_private_t *priv); +afr_transaction_local_init (afr_local_t *local, xlator_t *this); int32_t afr_marker_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, const char *name,afr_local_t *local, afr_private_t *priv ); -int32_t * -afr_fresh_children_create (int32_t child_count); - int -AFR_LOCAL_INIT (afr_local_t *local, afr_private_t *priv); +afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno); int afr_internal_lock_init (afr_internal_lock_t *lk, size_t child_count, transaction_lk_type_t lk_type); int -afr_first_up_child (unsigned char *child_up, size_t child_count); +afr_higher_errno (int32_t old_errno, int32_t new_errno); + +int +afr_final_errno (afr_local_t *local, afr_private_t *priv); int -afr_select_read_child_from_policy (int32_t *fresh_children, int32_t child_count, - int32_t prev_read_child, - int32_t config_read_child, int32_t *sources); +afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req); void -afr_set_read_ctx_from_policy (xlator_t *this, inode_t *inode, - int32_t *fresh_children, int32_t prev_read_child, - int32_t config_read_child); +afr_fix_open (fd_t *fd, xlator_t *this); -int32_t -afr_get_call_child (xlator_t *this, unsigned char *child_up, int32_t read_child, - int32_t *fresh_children, - int32_t *call_child, int32_t *last_index); +afr_fd_ctx_t * +afr_fd_ctx_get (fd_t *fd, xlator_t *this); -int32_t -afr_next_call_child (int32_t *fresh_children, unsigned char *child_up, - size_t child_count, int32_t *last_index, - int32_t read_child); -void -afr_get_fresh_children (int32_t *success_children, int32_t *sources, - int32_t *fresh_children, unsigned int child_count); -void -afr_fresh_children_add_child (int32_t *fresh_children, int32_t child, - int32_t child_count); void -afr_reset_children (int32_t *fresh_children, int32_t child_count); -gf_boolean_t -afr_error_more_important (int32_t old_errno, int32_t new_errno); -int -afr_errno_count (int32_t *children, int *child_errno, - unsigned int child_count, int32_t op_errno); +afr_set_low_priority (call_frame_t *frame); int -afr_get_children_count (int32_t *fresh_children, unsigned int child_count); +afr_child_fd_ctx_set (xlator_t *this, fd_t *fd, int32_t child, + int flags); + gf_boolean_t -afr_is_child_present (int32_t *success_children, int32_t child_count, - int32_t child); +afr_have_quorum (char *logname, afr_private_t *priv); + void -afr_update_gfid_from_iatts (uuid_t uuid, struct iatt *bufs, - int32_t *success_children, - unsigned int child_count); +afr_matrix_cleanup (int32_t **pending, unsigned int m); + +int32_t** +afr_matrix_create (unsigned int m, unsigned int n); + void -afr_reset_xattr (dict_t **xattr, unsigned int child_count); +afr_filter_xattrs (dict_t *xattr); + +/* + * Special value indicating we should use the "auto" quorum method instead of + * a fixed value (including zero to turn off quorum enforcement). + */ +#define AFR_QUORUM_AUTO INT_MAX + +/* + * Having this as a macro will make debugging a bit weirder, but does reduce + * the probability of functions handling this check inconsistently. + */ +#define QUORUM_CHECK(_func,_label) do { \ + if (priv->quorum_count && !afr_have_quorum(this->name,priv)) { \ + gf_log(this->name,GF_LOG_WARNING, \ + "failing "#_func" due to lack of quorum"); \ + op_errno = EROFS; \ + goto _label; \ + } \ +} while (0); + +int +afr_fd_report_unstable_write (xlator_t *this, fd_t *fd); + gf_boolean_t -afr_conflicting_iattrs (struct iatt *bufs, int32_t *success_children, - unsigned int child_count, const char *path, - const char *xlator_name); +afr_fd_has_witnessed_unstable_write (xlator_t *this, fd_t *fd); + +void +afr_delayed_changelog_wake_resume (xlator_t *this, fd_t *fd, call_stub_t *stub); + int -afr_gfid_missing_count (const char *xlator_name, int32_t *children, - struct iatt *bufs, unsigned int child_count, - const char *path); +afr_inodelk_init (afr_inodelk_t *lk, char *dom, size_t child_count); + void -afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req, const char *path); +afr_handle_open_fd_count (call_frame_t *frame, xlator_t *this); + +int +afr_local_pathinfo (char *pathinfo, gf_boolean_t *is_local); + void -afr_children_copy (int32_t *dst, int32_t *src, unsigned int child_count); -afr_transaction_type -afr_transaction_type_get (ia_type_t ia_type); -int32_t -afr_resultant_errno_get (int32_t *children, - int *child_errno, unsigned int child_count); +afr_remove_eager_lock_stub (afr_local_t *local); #endif /* __AFR_H__ */ diff --git a/xlators/cluster/afr/src/pump.c b/xlators/cluster/afr/src/pump.c index 9fbeab2c3..eed509956 100644 --- a/xlators/cluster/afr/src/pump.c +++ b/xlators/cluster/afr/src/pump.c @@ -1,25 +1,17 @@ /* - Copyright (c) 2007-2010 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #include <unistd.h> #include <sys/time.h> #include <stdlib.h> +#include <fnmatch.h> #ifndef _CONFIG_H #define _CONFIG_H @@ -28,8 +20,130 @@ #include "afr-common.c" #include "defaults.c" +#include "glusterfs.h" +#include "pump.h" + + +static int +afr_set_dict_gfid (dict_t *dict, uuid_t gfid) +{ + int ret = 0; + uuid_t *pgfid = NULL; + + GF_ASSERT (gfid); + + pgfid = GF_CALLOC (1, sizeof (uuid_t), gf_common_mt_char); + if (!pgfid) { + ret = -1; + goto out; + } + + uuid_copy (*pgfid, gfid); + + ret = dict_set_dynptr (dict, "gfid-req", pgfid, sizeof (uuid_t)); + if (ret) + gf_log (THIS->name, GF_LOG_ERROR, "gfid set failed"); + +out: + if (ret && pgfid) + GF_FREE (pgfid); + return ret; +} + +static int +afr_set_root_gfid (dict_t *dict) +{ + uuid_t gfid; + int ret = 0; + + memset (gfid, 0, 16); + gfid[15] = 1; + + ret = afr_set_dict_gfid (dict, gfid); + + return ret; +} + +static int +afr_build_child_loc (xlator_t *this, loc_t *child, loc_t *parent, char *name) +{ + int ret = -1; + uuid_t pargfid = {0}; + + if (!child) + goto out; + + if (!uuid_is_null (parent->inode->gfid)) + uuid_copy (pargfid, parent->inode->gfid); + else if (!uuid_is_null (parent->gfid)) + uuid_copy (pargfid, parent->gfid); + + if (uuid_is_null (pargfid)) + goto out; + + if (strcmp (parent->path, "/") == 0) + ret = gf_asprintf ((char **)&child->path, "/%s", name); + else + ret = gf_asprintf ((char **)&child->path, "%s/%s", parent->path, + name); + + if (-1 == ret) { + gf_log (this->name, GF_LOG_ERROR, + "asprintf failed while setting child path"); + } + + child->name = strrchr (child->path, '/'); + if (child->name) + child->name++; + + child->parent = inode_ref (parent->inode); + child->inode = inode_new (parent->inode->table); + uuid_copy (child->pargfid, pargfid); + + if (!child->inode) { + ret = -1; + goto out; + } + + ret = 0; +out: + if ((ret == -1) && child) + loc_wipe (child); + + return ret; +} + +static void +afr_build_root_loc (xlator_t *this, loc_t *loc) +{ + afr_private_t *priv = NULL; + + priv = this->private; + loc->path = gf_strdup ("/"); + loc->name = ""; + loc->inode = inode_ref (priv->root_inode); + uuid_copy (loc->gfid, loc->inode->gfid); +} + +static void +afr_update_loc_gfids (loc_t *loc, struct iatt *buf, struct iatt *postparent) +{ + GF_ASSERT (loc); + GF_ASSERT (buf); + + uuid_copy (loc->gfid, buf->ia_gfid); + if (postparent) + uuid_copy (loc->pargfid, postparent->ia_gfid); +} static uint64_t pump_pid = 0; +static inline void +pump_fill_loc_info (loc_t *loc, struct iatt *iatt, struct iatt *parent) +{ + afr_update_loc_gfids (loc, iatt, parent); + uuid_copy (loc->inode->gfid, iatt->ia_gfid); +} + static int pump_mark_start_pending (xlator_t *this) { @@ -140,85 +254,17 @@ pump_set_resume_path (xlator_t *this, const char *path) LOCK (&pump_priv->resume_path_lock); { - pump_priv->resume_path = strdup (path); - if (!pump_priv->resume_path) - ret = -1; + strncpy (pump_priv->resume_path, path, strlen (path) + 1); } UNLOCK (&pump_priv->resume_path_lock); return ret; } -static void -build_child_loc (loc_t *parent, loc_t *child, char *path, char *name) -{ - child->path = path; - child->name = name; - - child->parent = inode_ref (parent->inode); - child->inode = inode_new (parent->inode->table); -} - -static char * -build_file_path (loc_t *loc, gf_dirent_t *entry) -{ - xlator_t *this = NULL; - char *file_path = NULL; - int pathlen = 0; - int total_size = 0; - - this = THIS; - - pathlen = STRLEN_0 (loc->path); - - if (IS_ROOT_PATH (loc->path)) { - total_size = pathlen + entry->d_len; - file_path = GF_CALLOC (1, total_size, gf_afr_mt_char); - if (!file_path) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - return NULL; - } - - gf_log (this->name, GF_LOG_TRACE, - "constructing file path of size=%d" - "pathlen=%d, d_len=%d", - total_size, pathlen, - entry->d_len); - - snprintf(file_path, total_size, "%s%s", loc->path, entry->d_name); - - } else { - total_size = pathlen + entry->d_len + 1; /* for the extra '/' in the path */ - file_path = GF_CALLOC (1, total_size + 1, gf_afr_mt_char); - if (!file_path) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - return NULL; - } - - gf_log (this->name, GF_LOG_TRACE, - "constructing file path of size=%d" - "pathlen=%d, d_len=%d", - total_size, pathlen, - entry->d_len); - - snprintf(file_path, total_size, "%s/%s", loc->path, entry->d_name); - } - - gf_log (this->name, GF_LOG_TRACE, - "path=%s and d_name=%s", loc->path, entry->d_name); - gf_log (this->name, GF_LOG_TRACE, - "constructed file_path=%s of size=%d", file_path, total_size); - - return file_path; -} - static int pump_save_path (xlator_t *this, const char *path) { afr_private_t *priv = NULL; - pump_private_t *pump_priv = NULL; pump_state_t state; dict_t *dict = NULL; loc_t loc = {0}; @@ -230,29 +276,30 @@ pump_save_path (xlator_t *this, const char *path) return 0; priv = this->private; - pump_priv = priv->pump_private; GF_ASSERT (priv->root_inode); - build_root_loc (priv->root_inode, &loc); + afr_build_root_loc (this, &loc); dict = dict_new (); dict_ret = dict_set_str (dict, PUMP_PATH, (char *)path); + if (dict_ret) + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to set the key %s", path, PUMP_PATH); ret = syncop_setxattr (PUMP_SOURCE_CHILD (this), &loc, dict, 0); if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, + gf_log (this->name, GF_LOG_INFO, "setxattr failed - could not save path=%s", path); } else { gf_log (this->name, GF_LOG_DEBUG, "setxattr succeeded - saved path=%s", path); - gf_log (this->name, GF_LOG_DEBUG, - "Saving path for status info"); } dict_unref (dict); + loc_wipe (&loc); return 0; } @@ -315,15 +362,9 @@ pump_get_resume_path (xlator_t *this) static int pump_update_resume_state (xlator_t *this, const char *path) { - afr_private_t *priv = NULL; - pump_private_t *pump_priv = NULL; - pump_state_t state; const char *resume_path = NULL; - priv = this->private; - pump_priv = priv->pump_private; - state = pump_get_state (); if (state == PUMP_STATE_RESUME) { @@ -351,16 +392,10 @@ pump_update_resume_state (xlator_t *this, const char *path) static gf_boolean_t is_pump_traversal_allowed (xlator_t *this, const char *path) { - afr_private_t *priv = NULL; - pump_private_t *pump_priv = NULL; - pump_state_t state; const char *resume_path = NULL; gf_boolean_t ret = _gf_true; - priv = this->private; - pump_priv = priv->pump_private; - state = pump_get_state (); if (state == PUMP_STATE_RESUME) { @@ -403,26 +438,22 @@ pump_save_file_stats (xlator_t *this, const char *path) static int gf_pump_traverse_directory (loc_t *loc) { - xlator_t *this = NULL; - afr_private_t *priv = NULL; - fd_t *fd = NULL; - - off_t offset = 0; - loc_t entry_loc; - gf_dirent_t *entry = NULL; - gf_dirent_t *tmp = NULL; - gf_dirent_t entries; - - struct iatt iatt, parent; - dict_t *xattr_rsp; - - char *file_path = NULL; - int ret = 0; - gf_boolean_t is_directory_empty = _gf_true; + xlator_t *this = NULL; + fd_t *fd = NULL; + off_t offset = 0; + loc_t entry_loc = {0}; + gf_dirent_t *entry = NULL; + gf_dirent_t *tmp = NULL; + gf_dirent_t entries; + struct iatt iatt = {0}; + struct iatt parent = {0}; + dict_t *xattr_rsp = NULL; + int ret = 0; + gf_boolean_t is_directory_empty = _gf_true; + gf_boolean_t free_entries = _gf_false; INIT_LIST_HEAD (&entries.list); this = THIS; - priv = this->private; GF_ASSERT (loc->inode); @@ -444,7 +475,8 @@ gf_pump_traverse_directory (loc_t *loc) "pump opendir on %s returned=%d", loc->path, ret); - while (syncop_readdirp (this, fd, 131072, offset, &entries)) { + while (syncop_readdirp (this, fd, 131072, offset, NULL, &entries)) { + free_entries = _gf_true; if (list_empty (&entries.list)) { gf_log (this->name, GF_LOG_TRACE, @@ -456,110 +488,108 @@ gf_pump_traverse_directory (loc_t *loc) gf_log (this->name, GF_LOG_DEBUG, "found readdir entry=%s", entry->d_name); - file_path = build_file_path (loc, entry); - if (!file_path) { - gf_log (this->name, GF_LOG_DEBUG, - "file path construction failed"); - goto out; + offset = entry->d_off; + if (uuid_is_null (entry->d_stat.ia_gfid)) { + gf_log (this->name, GF_LOG_WARNING, "%s/%s: No " + "gfid present skipping", + loc->path, entry->d_name); + continue; } + loc_wipe (&entry_loc); + ret = afr_build_child_loc (this, &entry_loc, loc, + entry->d_name); + if (ret) + goto out; - build_child_loc (loc, &entry_loc, file_path, entry->d_name); - - if (!IS_ENTRY_CWD (entry->d_name) && - !IS_ENTRY_PARENT (entry->d_name)) { - - is_directory_empty = _gf_false; - ret = syncop_lookup (this, &entry_loc, NULL, - &iatt, &xattr_rsp, &parent); + if ((strcmp (entry->d_name, ".") == 0) || + (strcmp (entry->d_name, "..") == 0)) + continue; - memcpy (entry_loc.inode->gfid, iatt.ia_gfid, 16); + is_directory_empty = _gf_false; + gf_log (this->name, GF_LOG_DEBUG, + "lookup %s => %"PRId64, + entry_loc.path, + iatt.ia_ino); - gf_log (this->name, GF_LOG_DEBUG, - "lookup %s => %"PRId64, - entry_loc.path, - iatt.ia_ino); + ret = syncop_lookup (this, &entry_loc, NULL, &iatt, + &xattr_rsp, &parent); - ret = syncop_lookup (this, &entry_loc, NULL, - &iatt, &xattr_rsp, &parent); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "%s: lookup failed", entry_loc.path); + continue; + } + ret = afr_selfheal_name (this, loc->gfid, entry->d_name); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "%s: name self-heal failed (%s/%s)", + entry_loc.path, uuid_utoa (loc->gfid), + entry->d_name); + continue; + } - gf_log (this->name, GF_LOG_DEBUG, - "second lookup ret=%d: %s => %"PRId64, - ret, - entry_loc.path, - iatt.ia_ino); + ret = afr_selfheal (this, iatt.ia_gfid); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "%s: self-heal failed (%s)", + entry_loc.path, uuid_utoa (iatt.ia_gfid)); + continue; + } - pump_update_resume_state (this, entry_loc.path); + pump_fill_loc_info (&entry_loc, &iatt, &parent); - pump_save_path (this, entry_loc.path); - pump_save_file_stats (this, entry_loc.path); + pump_update_resume_state (this, entry_loc.path); - ret = pump_check_and_update_status (this); - if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "Pump beginning to exit out"); - goto out; - } + pump_save_path (this, entry_loc.path); + pump_save_file_stats (this, entry_loc.path); - gf_log (this->name, GF_LOG_TRACE, - "type of file=%d, IFDIR=%d", - iatt.ia_type, IA_IFDIR); + ret = pump_check_and_update_status (this); + if (ret < 0) { + gf_log (this->name, GF_LOG_DEBUG, + "Pump beginning to exit out"); + goto out; + } - if (IA_ISDIR (iatt.ia_type)) { - if (is_pump_traversal_allowed (this, entry_loc.path)) { - gf_log (this->name, GF_LOG_TRACE, - "entering dir=%s", - entry->d_name); - gf_pump_traverse_directory (&entry_loc); - } - } - } - offset = entry->d_off; - loc_wipe (&entry_loc); + if (IA_ISDIR (iatt.ia_type)) { + if (is_pump_traversal_allowed (this, entry_loc.path)) { + gf_log (this->name, GF_LOG_TRACE, + "entering dir=%s", entry->d_name); + gf_pump_traverse_directory (&entry_loc); + } + } } gf_dirent_free (&entries); - gf_log (this->name, GF_LOG_TRACE, - "offset incremented to %d", + free_entries = _gf_false; + gf_log (this->name, GF_LOG_TRACE, "offset incremented to %d", (int32_t ) offset); } - if (is_directory_empty && IS_ROOT_PATH (loc->path)) { + ret = syncop_close (fd); + if (ret < 0) + gf_log (this->name, GF_LOG_DEBUG, "closing the fd failed"); + + if (is_directory_empty && (strcmp (loc->path, "/") == 0)) { pump_change_state (this, PUMP_STATE_RUNNING); gf_log (this->name, GF_LOG_INFO, "Empty source brick. " "Nothing to be done."); } out: + if (entry_loc.path) + loc_wipe (&entry_loc); + if (free_entries) + gf_dirent_free (&entries); return 0; - -} - -void -build_root_loc (inode_t *inode, loc_t *loc) -{ - loc->path = "/"; - loc->name = ""; - loc->inode = inode; - loc->ino = 1; - loc->inode->ino = 1; - memset (loc->inode->gfid, 0, 16); - loc->inode->gfid[15] = 1; - } static int pump_update_resume_path (xlator_t *this) { - afr_private_t *priv = NULL; - pump_private_t *pump_priv = NULL; - const char *resume_path = NULL; - priv = this->private; - pump_priv = priv->pump_private; - resume_path = pump_get_resume_path (this); if (resume_path) { @@ -580,10 +610,9 @@ pump_update_resume_path (xlator_t *this) static int32_t pump_xattr_cleaner (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) + int32_t op_ret, int32_t op_errno, dict_t *xdata) { afr_private_t *priv = NULL; - pump_private_t *pump_priv = NULL; loc_t loc = {0}; int i = 0; int ret = 0; @@ -591,24 +620,25 @@ pump_xattr_cleaner (call_frame_t *frame, void *cookie, xlator_t *this, int sink = 1; priv = this->private; - pump_priv = priv->pump_private; - build_root_loc (priv->root_inode, &loc); + afr_build_root_loc (this, &loc); ret = syncop_removexattr (priv->children[source], &loc, - PUMP_PATH); + PUMP_PATH, 0); ret = syncop_removexattr (priv->children[sink], &loc, - PUMP_SINK_COMPLETE); + PUMP_SINK_COMPLETE, 0); for (i = 0; i < priv->child_count; i++) { ret = syncop_removexattr (priv->children[i], &loc, - PUMP_SOURCE_COMPLETE); - if (ret) + PUMP_SOURCE_COMPLETE, 0); + if (ret) { gf_log (this->name, GF_LOG_DEBUG, "removexattr " - "failed with %s", strerror (errno)); + "failed with %s", strerror (-ret)); + } } + loc_wipe (&loc); return pump_command_reply (frame, this); } @@ -628,7 +658,7 @@ pump_complete_migration (xlator_t *this) GF_ASSERT (priv->root_inode); - build_root_loc (priv->root_inode, &loc); + afr_build_root_loc (this, &loc); dict = dict_new (); @@ -640,6 +670,10 @@ pump_complete_migration (xlator_t *this) pump_priv->pump_finished = _gf_true; dict_ret = dict_set_str (dict, PUMP_SOURCE_COMPLETE, "jargon"); + if (dict_ret) + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to set the key %s", + loc.path, PUMP_SOURCE_COMPLETE); ret = syncop_setxattr (PUMP_SOURCE_CHILD (this), &loc, dict, 0); if (ret < 0) { @@ -647,6 +681,10 @@ pump_complete_migration (xlator_t *this) "setxattr failed - while notifying source complete"); } dict_ret = dict_set_str (dict, PUMP_SINK_COMPLETE, "jargon"); + if (dict_ret) + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to set the key %s", + loc.path, PUMP_SINK_COMPLETE); ret = syncop_setxattr (PUMP_SINK_CHILD (this), &loc, dict, 0); if (ret < 0) { @@ -662,24 +700,11 @@ pump_complete_migration (xlator_t *this) call_resume (pump_priv->cleaner); } + loc_wipe (&loc); return 0; } static int -pump_set_root_gfid (dict_t *dict) -{ - uuid_t gfid; - int ret = 0; - - memset (gfid, 0, 16); - gfid[15] = 1; - - ret = afr_set_dict_gfid (dict, gfid); - - return ret; -} - -static int pump_lookup_sink (loc_t *loc) { xlator_t *this = NULL; @@ -692,7 +717,7 @@ pump_lookup_sink (loc_t *loc) xattr_req = dict_new (); - ret = pump_set_root_gfid (xattr_req); + ret = afr_set_root_gfid (xattr_req); if (ret) goto out; @@ -702,6 +727,7 @@ pump_lookup_sink (loc_t *loc) if (ret) { gf_log (this->name, GF_LOG_DEBUG, "Lookup on sink child failed"); + ret = -1; goto out; } @@ -731,7 +757,7 @@ pump_task (void *data) GF_ASSERT (priv->root_inode); - build_root_loc (priv->root_inode, &loc); + afr_build_root_loc (this, &loc); xattr_req = dict_new (); if (!xattr_req) { gf_log (this->name, GF_LOG_DEBUG, @@ -740,14 +766,13 @@ pump_task (void *data) goto out; } - pump_set_root_gfid (xattr_req); + afr_set_root_gfid (xattr_req); ret = syncop_lookup (this, &loc, xattr_req, &iatt, &xattr_rsp, &parent); gf_log (this->name, GF_LOG_TRACE, - "lookup: ino=%"PRId64", path=%s", - loc.ino, - loc.path); + "lookup: path=%s gfid=%s", + loc.path, uuid_utoa (loc.inode->gfid)); ret = pump_check_and_update_status (this); if (ret < 0) { @@ -756,7 +781,7 @@ pump_task (void *data) pump_update_resume_path (this); - pump_set_root_gfid (xattr_req); + afr_set_root_gfid (xattr_req); ret = pump_lookup_sink (&loc); if (ret) { pump_update_resume_path (this); @@ -770,6 +795,7 @@ out: if (xattr_req) dict_unref (xattr_req); + loc_wipe (&loc); return 0; } @@ -779,12 +805,10 @@ pump_task_completion (int ret, call_frame_t *sync_frame, void *data) { xlator_t *this = NULL; afr_private_t *priv = NULL; - pump_private_t *pump_priv = NULL; this = THIS; priv = this->private; - pump_priv = priv->pump_private; inode_unref (priv->root_inode); STACK_DESTROY (sync_frame->root); @@ -805,7 +829,7 @@ pump_start (call_frame_t *pump_frame, xlator_t *this) priv = this->private; pump_priv = priv->pump_private; - pump_frame->root->lk_owner = (uint64_t) (unsigned long)pump_frame->root; + afr_set_lk_owner (pump_frame, this, pump_frame->root); pump_pid = (uint64_t) (unsigned long)pump_frame->root; ret = synctask_new (pump_priv->env, pump_task, @@ -819,8 +843,8 @@ pump_start (call_frame_t *pump_frame, xlator_t *this) } gf_log (this->name, GF_LOG_DEBUG, - "setting pump as started lk_owner: %"PRIu64" %"PRIu64, - pump_frame->root->lk_owner, pump_pid); + "setting pump as started lk_owner: %s %"PRIu64, + lkowner_utoa (&pump_frame->root->lk_owner), pump_pid); priv->use_afr_in_pump = 1; out: @@ -854,7 +878,7 @@ pump_cmd_start_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, - int32_t op_errno) + int32_t op_errno, dict_t *xdata) { call_frame_t *prev = NULL; @@ -906,9 +930,9 @@ pump_initiate_sink_connect (call_frame_t *frame, xlator_t *this) GF_ASSERT (priv->root_inode); - build_root_loc (priv->root_inode, &loc); + afr_build_root_loc (this, &loc); - data = data_ref (dict_get (local->dict, PUMP_CMD_START)); + data = data_ref (dict_get (local->dict, RB_PUMP_CMD_START)); if (!data) { ret = -1; gf_log (this->name, GF_LOG_ERROR, @@ -947,7 +971,7 @@ pump_initiate_sink_connect (call_frame_t *frame, xlator_t *this) PUMP_SINK_CHILD(this)->fops->setxattr, &loc, dict, - 0); + 0, NULL); ret = 0; @@ -961,6 +985,7 @@ out: if (ret && clnt_cmd) GF_FREE (clnt_cmd); + loc_wipe (&loc); return ret; } @@ -980,7 +1005,7 @@ pump_cmd_start_getxattr_cbk (call_frame_t *frame, xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *dict) + dict_t *dict, dict_t *xdata) { afr_local_t *local = NULL; char *path = NULL; @@ -1047,6 +1072,7 @@ pump_execute_status (call_frame_t *frame, xlator_t *this) uint64_t number_files = 0; char filename[PATH_MAX]; + char summary[PATH_MAX+256]; char *dict_str = NULL; int32_t op_ret = 0; @@ -1075,16 +1101,19 @@ pump_execute_status (call_frame_t *frame, xlator_t *this) } if (pump_priv->pump_finished) { - snprintf (dict_str, PATH_MAX + 256, "Number of files migrated = %"PRIu64" Migration complete ", - number_files); + snprintf (summary, PATH_MAX+256, + "no_of_files=%"PRIu64, number_files); } else { - snprintf (dict_str, PATH_MAX + 256, "Number of files migrated = %"PRIu64" Current file= %s ", - number_files, filename); + snprintf (summary, PATH_MAX+256, + "no_of_files=%"PRIu64":current_file=%s", + number_files, filename); } + snprintf (dict_str, PATH_MAX+256, "status=%d:%s", + (pump_priv->pump_finished)?1:0, summary); dict = dict_new (); - ret = dict_set_dynstr (dict, PUMP_CMD_STATUS, dict_str); + ret = dict_set_dynstr (dict, RB_PUMP_CMD_STATUS, dict_str); if (ret < 0) { gf_log (this->name, GF_LOG_DEBUG, "dict_set_dynstr returned negative value"); @@ -1096,13 +1125,12 @@ pump_execute_status (call_frame_t *frame, xlator_t *this) out: - AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict); + AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict, NULL); if (dict) dict_unref (dict); - if (dict_str) - GF_FREE (dict_str); + GF_FREE (dict_str); return 0; } @@ -1144,14 +1172,14 @@ pump_execute_start (call_frame_t *frame, xlator_t *this) GF_ASSERT (priv->root_inode); - build_root_loc (priv->root_inode, &loc); + afr_build_root_loc (this, &loc); STACK_WIND (frame, pump_cmd_start_getxattr_cbk, PUMP_SOURCE_CHILD(this), PUMP_SOURCE_CHILD(this)->fops->getxattr, &loc, - PUMP_PATH); + PUMP_PATH, NULL); ret = 0; @@ -1161,6 +1189,7 @@ out: pump_command_reply (frame, this); } + loc_wipe (&loc); return 0; } @@ -1168,7 +1197,7 @@ static int pump_cleanup_helper (void *data) { call_frame_t *frame = data; - pump_xattr_cleaner (frame, 0, frame->this, 0, 0); + pump_xattr_cleaner (frame, 0, frame->this, 0, 0, NULL); return 0; } @@ -1194,14 +1223,6 @@ pump_execute_commit (call_frame_t *frame, xlator_t *this) pump_priv = priv->pump_private; local = frame->local; - - LOCK (&pump_priv->resume_path_lock); - { - pump_priv->number_files_pumped = 0; - pump_priv->current_file[0] = '\0'; - } - UNLOCK (&pump_priv->resume_path_lock); - local->op_ret = 0; if (pump_priv->pump_finished) { pump_change_state (this, PUMP_STATE_COMMIT); @@ -1258,7 +1279,7 @@ pump_execute_abort (call_frame_t *frame, xlator_t *this) } else { pump_priv->cleaner = fop_setxattr_cbk_stub (frame, pump_xattr_cleaner, - 0, 0); + 0, 0, NULL); } return 0; @@ -1271,7 +1292,7 @@ pump_command_status (xlator_t *this, dict_t *dict) int dict_ret = -1; int ret = _gf_true; - dict_ret = dict_get_str (dict, PUMP_CMD_STATUS, &cmd); + dict_ret = dict_get_str (dict, RB_PUMP_CMD_STATUS, &cmd); if (dict_ret < 0) { gf_log (this->name, GF_LOG_DEBUG, "Not a pump status command"); @@ -1295,7 +1316,7 @@ pump_command_pause (xlator_t *this, dict_t *dict) int dict_ret = -1; int ret = _gf_true; - dict_ret = dict_get_str (dict, PUMP_CMD_PAUSE, &cmd); + dict_ret = dict_get_str (dict, RB_PUMP_CMD_PAUSE, &cmd); if (dict_ret < 0) { gf_log (this->name, GF_LOG_DEBUG, "Not a pump pause command"); @@ -1319,7 +1340,7 @@ pump_command_commit (xlator_t *this, dict_t *dict) int dict_ret = -1; int ret = _gf_true; - dict_ret = dict_get_str (dict, PUMP_CMD_COMMIT, &cmd); + dict_ret = dict_get_str (dict, RB_PUMP_CMD_COMMIT, &cmd); if (dict_ret < 0) { gf_log (this->name, GF_LOG_DEBUG, "Not a pump commit command"); @@ -1343,7 +1364,7 @@ pump_command_abort (xlator_t *this, dict_t *dict) int dict_ret = -1; int ret = _gf_true; - dict_ret = dict_get_str (dict, PUMP_CMD_ABORT, &cmd); + dict_ret = dict_get_str (dict, RB_PUMP_CMD_ABORT, &cmd); if (dict_ret < 0) { gf_log (this->name, GF_LOG_DEBUG, "Not a pump abort command"); @@ -1367,7 +1388,7 @@ pump_command_start (xlator_t *this, dict_t *dict) int dict_ret = -1; int ret = _gf_true; - dict_ret = dict_get_str (dict, PUMP_CMD_START, &cmd); + dict_ret = dict_get_str (dict, RB_PUMP_CMD_START, &cmd); if (dict_ret < 0) { gf_log (this->name, GF_LOG_DEBUG, "Not a pump start command"); @@ -1384,135 +1405,22 @@ out: } -struct _xattr_key { - char *key; - struct list_head list; -}; - -static void -__gather_xattr_keys (dict_t *dict, char *key, data_t *value, - void *data) -{ - struct list_head * list = data; - struct _xattr_key * xkey = NULL; - - if (!strncmp (key, AFR_XATTR_PREFIX, - strlen (AFR_XATTR_PREFIX))) { - - xkey = GF_CALLOC (1, sizeof (*xkey), gf_afr_mt_xattr_key); - if (!xkey) - return; - - xkey->key = key; - INIT_LIST_HEAD (&xkey->list); - - list_add_tail (&xkey->list, list); - } -} - -static void -__filter_xattrs (dict_t *dict) -{ - struct list_head keys; - - struct _xattr_key *key; - struct _xattr_key *tmp; - - INIT_LIST_HEAD (&keys); - - dict_foreach (dict, __gather_xattr_keys, - (void *) &keys); - - list_for_each_entry_safe (key, tmp, &keys, list) { - dict_del (dict, key->key); - - list_del_init (&key->list); - - GF_FREE (key); - } -} - -int32_t -pump_getxattr_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *dict) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - xlator_t **children = NULL; - int unwind = 1; - int32_t *last_index = NULL; - int32_t next_call_child = -1; - int32_t read_child = -1; - int32_t *fresh_children = NULL; - - - priv = this->private; - children = priv->children; - - local = frame->local; - - read_child = (long) cookie; - - if (op_ret == -1) { - last_index = &local->cont.getxattr.last_index; - fresh_children = local->fresh_children; - next_call_child = afr_next_call_child (fresh_children, - local->child_up, - priv->child_count, - last_index, read_child); - if (next_call_child < 0) - goto out; - - unwind = 0; - STACK_WIND_COOKIE (frame, pump_getxattr_cbk, - (void *) (long) read_child, - children[next_call_child], - children[next_call_child]->fops->getxattr, - &local->loc, - local->cont.getxattr.name); - } - -out: - if (unwind) { - if (op_ret >= 0 && dict) - __filter_xattrs (dict); - - AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict); - } - - return 0; -} - -int32_t -pump_getxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, const char *name) +int +pump_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata) { - afr_private_t * priv = NULL; - xlator_t ** children = NULL; - int call_child = 0; - afr_local_t *local = NULL; - int32_t op_ret = -1; - int32_t op_errno = 0; - uint64_t read_child = 0; - - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv->children, out); - - children = priv->children; + afr_private_t *priv = NULL; + int op_errno = 0; + int ret = 0; - ALLOC_OR_GOTO (local, afr_local_t, out); - frame->local = local; + priv = this->private; - op_ret = AFR_LOCAL_INIT (local, priv); - if (op_ret < 0) { - op_errno = -op_ret; - goto out; + if (!priv->use_afr_in_pump) { + STACK_WIND (frame, default_getxattr_cbk, + FIRST_CHILD (this), + (FIRST_CHILD (this))->fops->getxattr, + loc, name, xdata); + return 0; } if (name) { @@ -1523,187 +1431,21 @@ pump_getxattr (call_frame_t *frame, xlator_t *this, goto out; } - if (!strcmp (name, PUMP_CMD_STATUS)) { + if (!strcmp (name, RB_PUMP_CMD_STATUS)) { gf_log (this->name, GF_LOG_DEBUG, "Hit pump command - status"); pump_execute_status (frame, this); - op_ret = 0; + ret = 0; goto out; } } - if (!priv->use_afr_in_pump) { - STACK_WIND (frame, default_getxattr_cbk, - FIRST_CHILD (this), - (FIRST_CHILD (this))->fops->getxattr, - loc, name); - return 0; - } - - local->fresh_children = GF_CALLOC (priv->child_count, - sizeof (*local->fresh_children), - gf_afr_mt_int32_t); - if (local->fresh_children) { - op_errno = ENOMEM; - goto out; - } + afr_getxattr (frame, this, loc, name, xdata); - read_child = afr_inode_get_read_ctx (this, loc->inode, local->fresh_children); - op_ret = afr_get_call_child (this, local->child_up, read_child, - local->fresh_children, - &call_child, - &local->cont.getxattr.last_index); - if (op_ret < 0) { - op_errno = -op_ret; - op_ret = -1; - goto out; - } - loc_copy (&local->loc, loc); - if (name) - local->cont.getxattr.name = gf_strdup (name); - - STACK_WIND_COOKIE (frame, pump_getxattr_cbk, - (void *) (long) call_child, - children[call_child], children[call_child]->fops->getxattr, - loc, name); - - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, NULL); - } - return 0; -} - -static int -afr_setxattr_unwind (call_frame_t *frame, xlator_t *this) -{ - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - call_frame_t *main_frame = NULL; - - local = frame->local; - priv = this->private; - - LOCK (&frame->lock); - { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - AFR_STACK_UNWIND (setxattr, main_frame, - local->op_ret, local->op_errno) - } - return 0; -} - -static int -afr_setxattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - - int call_count = -1; - int need_unwind = 0; - - local = frame->local; - priv = this->private; - - LOCK (&frame->lock); - { - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - } - local->success_count++; - - if (local->success_count == priv->child_count) { - need_unwind = 1; - } - } - - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); - - if (need_unwind) - local->transaction.unwind (frame, this); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - local->transaction.resume (frame, this); - } - - return 0; -} - -static int -afr_setxattr_wind (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - - int call_count = -1; - int i = 0; - - local = frame->local; - priv = this->private; - - call_count = afr_up_children_count (priv->child_count, local->child_up); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_setxattr_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->setxattr, - &local->loc, - local->cont.setxattr.dict, - local->cont.setxattr.flags); - - if (!--call_count) - break; - } - } - - return 0; -} - - -static int -afr_setxattr_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t * local = frame->local; - - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); - - return 0; -} - -int32_t -pump_setxattr_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno) -{ - STACK_UNWIND (frame, - op_ret, - op_errno); + if (ret < 0) + AFR_STACK_UNWIND (getxattr, frame, -1, op_errno, NULL, NULL); return 0; } @@ -1721,120 +1463,85 @@ pump_command_reply (call_frame_t *frame, xlator_t *this) gf_log (this->name, GF_LOG_INFO, "Command succeeded"); - dict_unref (local->dict); - AFR_STACK_UNWIND (setxattr, frame, local->op_ret, - local->op_errno); + local->op_errno, NULL); return 0; } int -pump_parse_command (call_frame_t *frame, xlator_t *this, - afr_local_t *local, dict_t *dict) +pump_parse_command (call_frame_t *frame, xlator_t *this, dict_t *dict, + int *op_errno_p) { - + afr_local_t *local = NULL; int ret = -1; + int op_errno = 0; if (pump_command_start (this, dict)) { - frame->local = local; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; local->dict = dict_ref (dict); ret = pump_execute_start (frame, this); } else if (pump_command_pause (this, dict)) { - frame->local = local; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; local->dict = dict_ref (dict); ret = pump_execute_pause (frame, this); } else if (pump_command_abort (this, dict)) { - frame->local = local; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; local->dict = dict_ref (dict); ret = pump_execute_abort (frame, this); } else if (pump_command_commit (this, dict)) { - frame->local = local; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; local->dict = dict_ref (dict); ret = pump_execute_commit (frame, this); } +out: + if (op_errno_p) + *op_errno_p = op_errno; return ret; } int -pump_setxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *dict, int32_t flags) +pump_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, + int32_t flags, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t *transaction_frame = NULL; - + afr_private_t *priv = NULL; int ret = -1; - - int op_ret = -1; int op_errno = 0; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.pump*", dict, op_errno, out); priv = this->private; - - ALLOC_OR_GOTO (local, afr_local_t, out); - - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } - - ret = pump_parse_command (frame, this, - local, dict); - if (ret >= 0) { - op_ret = 0; - goto out; - } - if (!priv->use_afr_in_pump) { STACK_WIND (frame, default_setxattr_cbk, FIRST_CHILD (this), (FIRST_CHILD (this))->fops->setxattr, - loc, dict, flags); + loc, dict, flags, xdata); return 0; } - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - goto out; - } - - transaction_frame->local = local; - - local->op_ret = -1; - - local->cont.setxattr.dict = dict_ref (dict); - local->cont.setxattr.flags = flags; - - local->transaction.fop = afr_setxattr_wind; - local->transaction.done = afr_setxattr_done; - local->transaction.unwind = afr_setxattr_unwind; - - loc_copy (&local->loc, loc); - - local->transaction.main_frame = frame; - local->transaction.start = LLONG_MAX - 1; - local->transaction.len = 0; + ret = pump_parse_command (frame, this, dict, &op_errno); + if (ret >= 0) + goto out; - afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + afr_setxattr (frame, this, loc, dict, flags, xdata); - op_ret = 0; + ret = 0; out: - if (op_ret == -1) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (setxattr, frame, op_ret, op_errno); + if (ret < 0) { + AFR_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL); } return 0; @@ -1868,7 +1575,7 @@ static int32_t pump_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, - off_t offset) + off_t offset, dict_t *xdata) { afr_private_t *priv = NULL; priv = this->private; @@ -1878,11 +1585,11 @@ pump_truncate (call_frame_t *frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->truncate, loc, - offset); + offset, xdata); return 0; } - afr_truncate (frame, this, loc, offset); + afr_truncate (frame, this, loc, offset, xdata); return 0; } @@ -1891,7 +1598,7 @@ static int32_t pump_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, - off_t offset) + off_t offset, dict_t *xdata) { afr_private_t *priv = NULL; priv = this->private; @@ -1901,11 +1608,11 @@ pump_ftruncate (call_frame_t *frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->ftruncate, fd, - offset); + offset, xdata); return 0; } - afr_ftruncate (frame, this, fd, offset); + afr_ftruncate (frame, this, fd, offset, xdata); return 0; } @@ -1914,7 +1621,7 @@ pump_ftruncate (call_frame_t *frame, int pump_mknod (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, dev_t rdev, dict_t *parms) + loc_t *loc, mode_t mode, dev_t rdev, mode_t umask, dict_t *xdata) { afr_private_t *priv = NULL; priv = this->private; @@ -1922,10 +1629,10 @@ pump_mknod (call_frame_t *frame, xlator_t *this, STACK_WIND (frame, default_mknod_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->mknod, - loc, mode, rdev, parms); + loc, mode, rdev, umask, xdata); return 0; } - afr_mknod (frame, this, loc, mode, rdev, parms); + afr_mknod (frame, this, loc, mode, rdev, umask, xdata); return 0; } @@ -1934,7 +1641,7 @@ pump_mknod (call_frame_t *frame, xlator_t *this, int pump_mkdir (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, dict_t *params) + loc_t *loc, mode_t mode, mode_t umask, dict_t *xdata) { afr_private_t *priv = NULL; priv = this->private; @@ -1942,10 +1649,10 @@ pump_mkdir (call_frame_t *frame, xlator_t *this, STACK_WIND (frame, default_mkdir_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->mkdir, - loc, mode, params); + loc, mode, umask, xdata); return 0; } - afr_mkdir (frame, this, loc, mode, params); + afr_mkdir (frame, this, loc, mode, umask, xdata); return 0; } @@ -1954,7 +1661,7 @@ pump_mkdir (call_frame_t *frame, xlator_t *this, static int32_t pump_unlink (call_frame_t *frame, xlator_t *this, - loc_t *loc) + loc_t *loc, int xflag, dict_t *xdata) { afr_private_t *priv = NULL; priv = this->private; @@ -1963,10 +1670,10 @@ pump_unlink (call_frame_t *frame, default_unlink_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->unlink, - loc); + loc, xflag, xdata); return 0; } - afr_unlink (frame, this, loc); + afr_unlink (frame, this, loc, xflag, xdata); return 0; } @@ -1974,7 +1681,7 @@ pump_unlink (call_frame_t *frame, static int pump_rmdir (call_frame_t *frame, xlator_t *this, - loc_t *loc, int flags) + loc_t *loc, int flags, dict_t *xdata) { afr_private_t *priv = NULL; @@ -1984,11 +1691,11 @@ pump_rmdir (call_frame_t *frame, xlator_t *this, STACK_WIND (frame, default_rmdir_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->rmdir, - loc, flags); + loc, flags, xdata); return 0; } - afr_rmdir (frame, this, loc, flags); + afr_rmdir (frame, this, loc, flags, xdata); return 0; } @@ -1997,7 +1704,7 @@ pump_rmdir (call_frame_t *frame, xlator_t *this, int pump_symlink (call_frame_t *frame, xlator_t *this, - const char *linkpath, loc_t *loc, dict_t *params) + const char *linkpath, loc_t *loc, mode_t umask, dict_t *xdata) { afr_private_t *priv = NULL; priv = this->private; @@ -2005,10 +1712,10 @@ pump_symlink (call_frame_t *frame, xlator_t *this, STACK_WIND (frame, default_symlink_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->symlink, - linkpath, loc, params); + linkpath, loc, umask, xdata); return 0; } - afr_symlink (frame, this, linkpath, loc, params); + afr_symlink (frame, this, linkpath, loc, umask, xdata); return 0; } @@ -2018,7 +1725,7 @@ static int32_t pump_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc, - loc_t *newloc) + loc_t *newloc, dict_t *xdata) { afr_private_t *priv = NULL; priv = this->private; @@ -2027,10 +1734,10 @@ pump_rename (call_frame_t *frame, default_rename_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->rename, - oldloc, newloc); + oldloc, newloc, xdata); return 0; } - afr_rename (frame, this, oldloc, newloc); + afr_rename (frame, this, oldloc, newloc, xdata); return 0; } @@ -2040,7 +1747,7 @@ static int32_t pump_link (call_frame_t *frame, xlator_t *this, loc_t *oldloc, - loc_t *newloc) + loc_t *newloc, dict_t *xdata) { afr_private_t *priv = NULL; priv = this->private; @@ -2049,10 +1756,10 @@ pump_link (call_frame_t *frame, default_link_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->link, - oldloc, newloc); + oldloc, newloc, xdata); return 0; } - afr_link (frame, this, oldloc, newloc); + afr_link (frame, this, oldloc, newloc, xdata); return 0; } @@ -2061,7 +1768,7 @@ pump_link (call_frame_t *frame, static int32_t pump_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, mode_t mode, - fd_t *fd, dict_t *params) + mode_t umask, fd_t *fd, dict_t *xdata) { afr_private_t *priv = NULL; priv = this->private; @@ -2069,10 +1776,10 @@ pump_create (call_frame_t *frame, xlator_t *this, STACK_WIND (frame, default_create_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->create, - loc, flags, mode, fd, params); + loc, flags, mode, umask, fd, xdata); return 0; } - afr_create (frame, this, loc, flags, mode, fd, params); + afr_create (frame, this, loc, flags, mode, umask, fd, xdata); return 0; } @@ -2082,8 +1789,7 @@ static int32_t pump_open (call_frame_t *frame, xlator_t *this, loc_t *loc, - int32_t flags, fd_t *fd, - int32_t wbflags) + int32_t flags, fd_t *fd, dict_t *xdata) { afr_private_t *priv = NULL; priv = this->private; @@ -2092,10 +1798,10 @@ pump_open (call_frame_t *frame, default_open_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->open, - loc, flags, fd, wbflags); + loc, flags, fd, xdata); return 0; } - afr_open (frame, this, loc, flags, fd, wbflags); + afr_open (frame, this, loc, flags, fd, xdata); return 0; } @@ -2107,8 +1813,8 @@ pump_writev (call_frame_t *frame, fd_t *fd, struct iovec *vector, int32_t count, - off_t off, - struct iobref *iobref) + off_t off, uint32_t flags, + struct iobref *iobref, dict_t *xdata) { afr_private_t *priv = NULL; priv = this->private; @@ -2120,20 +1826,20 @@ pump_writev (call_frame_t *frame, fd, vector, count, - off, - iobref); + off, flags, + iobref, xdata); return 0; } - afr_writev (frame, this, fd, vector, count, off, iobref); - return 0; + afr_writev (frame, this, fd, vector, count, off, flags, iobref, xdata); + return 0; } static int32_t pump_flush (call_frame_t *frame, xlator_t *this, - fd_t *fd) + fd_t *fd, dict_t *xdata) { afr_private_t *priv = NULL; priv = this->private; @@ -2142,10 +1848,10 @@ pump_flush (call_frame_t *frame, default_flush_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->flush, - fd); + fd, xdata); return 0; } - afr_flush (frame, this, fd); + afr_flush (frame, this, fd, xdata); return 0; } @@ -2155,7 +1861,7 @@ static int32_t pump_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, - int32_t flags) + int32_t flags, dict_t *xdata) { afr_private_t *priv = NULL; priv = this->private; @@ -2165,10 +1871,10 @@ pump_fsync (call_frame_t *frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->fsync, fd, - flags); + flags, xdata); return 0; } - afr_fsync (frame, this, fd, flags); + afr_fsync (frame, this, fd, flags, xdata); return 0; } @@ -2177,7 +1883,7 @@ pump_fsync (call_frame_t *frame, static int32_t pump_opendir (call_frame_t *frame, xlator_t *this, - loc_t *loc, fd_t *fd) + loc_t *loc, fd_t *fd, dict_t *xdata) { afr_private_t *priv = NULL; priv = this->private; @@ -2186,10 +1892,10 @@ pump_opendir (call_frame_t *frame, default_opendir_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->opendir, - loc, fd); + loc, fd, xdata); return 0; } - afr_opendir (frame, this, loc, fd); + afr_opendir (frame, this, loc, fd, xdata); return 0; } @@ -2199,7 +1905,7 @@ static int32_t pump_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, - int32_t flags) + int32_t flags, dict_t *xdata) { afr_private_t *priv = NULL; priv = this->private; @@ -2209,10 +1915,10 @@ pump_fsyncdir (call_frame_t *frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->fsyncdir, fd, - flags); + flags, xdata); return 0; } - afr_fsyncdir (frame, this, fd, flags); + afr_fsyncdir (frame, this, fd, flags, xdata); return 0; } @@ -2223,7 +1929,7 @@ pump_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, gf_xattrop_flags_t flags, - dict_t *dict) + dict_t *dict, dict_t *xdata) { afr_private_t *priv = NULL; priv = this->private; @@ -2234,10 +1940,10 @@ pump_xattrop (call_frame_t *frame, FIRST_CHILD(this)->fops->xattrop, loc, flags, - dict); + dict, xdata); return 0; } - afr_xattrop (frame, this, loc, flags, dict); + afr_xattrop (frame, this, loc, flags, dict, xdata); return 0; } @@ -2247,7 +1953,7 @@ pump_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd, gf_xattrop_flags_t flags, - dict_t *dict) + dict_t *dict, dict_t *xdata) { afr_private_t *priv = NULL; priv = this->private; @@ -2258,10 +1964,10 @@ pump_fxattrop (call_frame_t *frame, FIRST_CHILD(this)->fops->fxattrop, fd, flags, - dict); + dict, xdata); return 0; } - afr_fxattrop (frame, this, fd, flags, dict); + afr_fxattrop (frame, this, fd, flags, dict, xdata); return 0; } @@ -2271,9 +1977,17 @@ static int32_t pump_removexattr (call_frame_t *frame, xlator_t *this, loc_t *loc, - const char *name) + const char *name, dict_t *xdata) { - afr_private_t *priv = NULL; + afr_private_t *priv = NULL; + int op_errno = -1; + + VALIDATE_OR_GOTO (this, out); + + GF_IF_NATIVE_XATTR_GOTO ("trusted.glusterfs.pump*", + name, op_errno, out); + + op_errno = 0; priv = this->private; if (!priv->use_afr_in_pump) { STACK_WIND (frame, @@ -2281,10 +1995,14 @@ pump_removexattr (call_frame_t *frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->removexattr, loc, - name); + name, xdata); return 0; } - afr_removexattr (frame, this, loc, name); + afr_removexattr (frame, this, loc, name, xdata); + + out: + if (op_errno) + AFR_STACK_UNWIND (removexattr, frame, -1, op_errno, NULL); return 0; } @@ -2296,7 +2014,7 @@ pump_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t off) + off_t off, dict_t *xdata) { afr_private_t *priv = NULL; priv = this->private; @@ -2305,21 +2023,18 @@ pump_readdir (call_frame_t *frame, default_readdir_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->readdir, - fd, size, off); + fd, size, off, xdata); return 0; } - afr_readdir (frame, this, fd, size, off); + afr_readdir (frame, this, fd, size, off, xdata); return 0; } static int32_t -pump_readdirp (call_frame_t *frame, - xlator_t *this, - fd_t *fd, - size_t size, - off_t off) +pump_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, + size_t size, off_t off, dict_t *dict) { afr_private_t *priv = NULL; priv = this->private; @@ -2328,10 +2043,10 @@ pump_readdirp (call_frame_t *frame, default_readdirp_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->readdirp, - fd, size, off); + fd, size, off, dict); return 0; } - afr_readdirp (frame, this, fd, size, off); + afr_readdirp (frame, this, fd, size, off, dict); return 0; } @@ -2362,13 +2077,24 @@ pump_release (xlator_t *this, } +static int32_t +pump_forget (xlator_t *this, inode_t *inode) +{ + afr_private_t *priv = NULL; + + priv = this->private; + if (priv->use_afr_in_pump) + afr_forget (this, inode); + + return 0; +} static int32_t pump_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf, - int32_t valid) + int32_t valid, dict_t *xdata) { afr_private_t *priv = NULL; priv = this->private; @@ -2377,10 +2103,10 @@ pump_setattr (call_frame_t *frame, default_setattr_cbk, FIRST_CHILD (this), FIRST_CHILD (this)->fops->setattr, - loc, stbuf, valid); + loc, stbuf, valid, xdata); return 0; } - afr_setattr (frame, this, loc, stbuf, valid); + afr_setattr (frame, this, loc, stbuf, valid, xdata); return 0; } @@ -2391,7 +2117,7 @@ pump_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *stbuf, - int32_t valid) + int32_t valid, dict_t *xdata) { afr_private_t *priv = NULL; priv = this->private; @@ -2400,10 +2126,10 @@ pump_fsetattr (call_frame_t *frame, default_fsetattr_cbk, FIRST_CHILD (this), FIRST_CHILD (this)->fops->fsetattr, - fd, stbuf, valid); + fd, stbuf, valid, xdata); return 0; } - afr_fsetattr (frame, this, fd, stbuf, valid); + afr_fsetattr (frame, this, fd, stbuf, valid, xdata); return 0; } @@ -2452,7 +2178,7 @@ notify (xlator_t *this, int32_t event, child_xl = (xlator_t *) data; - ret = afr_notify (this, event, data); + ret = afr_notify (this, event, data, NULL); switch (event) { case GF_EVENT_CHILD_DOWN: @@ -2482,12 +2208,12 @@ int32_t init (xlator_t *this) { afr_private_t * priv = NULL; - pump_private_t *pump_priv = NULL; + pump_private_t *pump_priv = NULL; int child_count = 0; xlator_list_t * trav = NULL; int i = 0; int ret = -1; - int op_errno = 0; + GF_UNUSED int op_errno = 0; int source_child = 0; @@ -2503,26 +2229,36 @@ init (xlator_t *this) "Volume is dangling."); } - ALLOC_OR_GOTO (this->private, afr_private_t, out); + priv = GF_CALLOC (1, sizeof (afr_private_t), gf_afr_mt_afr_private_t); + if (!priv) + goto out; - priv = this->private; + LOCK_INIT (&priv->lock); + + child_count = xlator_subvolume_count (this); + if (child_count != 2) { + gf_log (this->name, GF_LOG_ERROR, + "There should be exactly 2 children - one source " + "and one sink"); + return -1; + } + priv->child_count = child_count; priv->read_child = source_child; priv->favorite_child = source_child; priv->background_self_heal_count = 0; - priv->data_self_heal = 1; + priv->data_self_heal = "on"; priv->metadata_self_heal = 1; priv->entry_self_heal = 1; - priv->data_self_heal_algorithm = ""; - priv->data_self_heal_window_size = 16; priv->data_change_log = 1; priv->metadata_change_log = 1; priv->entry_change_log = 1; priv->use_afr_in_pump = 1; + priv->sh_readdir_size = 65536; /* Locking options */ @@ -2531,31 +2267,6 @@ init (xlator_t *this) and the sink. */ - priv->data_lock_server_count = 2; - priv->metadata_lock_server_count = 2; - priv->entry_lock_server_count = 2; - - priv->strict_readdir = _gf_false; - - trav = this->children; - while (trav) { - child_count++; - trav = trav->next; - } - - priv->wait_count = 1; - - if (child_count != 2) { - gf_log (this->name, GF_LOG_ERROR, - "There should be exactly 2 children - one source " - "and one sink"); - return -1; - } - priv->child_count = child_count; - - LOCK_INIT (&priv->lock); - LOCK_INIT (&priv->read_child_lock); - priv->child_up = GF_CALLOC (sizeof (unsigned char), child_count, gf_afr_mt_char); if (!priv->child_up) { @@ -2589,7 +2300,8 @@ init (xlator_t *this) while (i < child_count) { priv->children[i] = trav->xlator; - ret = gf_asprintf (&priv->pending_key[i], "%s.%s", AFR_XATTR_PREFIX, + ret = gf_asprintf (&priv->pending_key[i], "%s.%s", + AFR_XATTR_PREFIX, trav->xlator->name); if (-1 == ret) { gf_log (this->name, GF_LOG_ERROR, @@ -2602,7 +2314,12 @@ init (xlator_t *this) i++; } - priv->first_lookup = 1; + ret = gf_asprintf (&priv->sh_domain, "%s-self-heal", this->name); + if (-1 == ret) { + op_errno = ENOMEM; + goto out; + } + priv->root_inode = NULL; priv->last_event = GF_CALLOC (child_count, sizeof (*priv->last_event), @@ -2627,13 +2344,12 @@ init (xlator_t *this) pump_priv->resume_path = GF_CALLOC (1, PATH_MAX, gf_afr_mt_char); if (!pump_priv->resume_path) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); + gf_log (this->name, GF_LOG_ERROR, "Out of memory"); ret = -1; goto out; } - pump_priv->env = syncenv_new (0); + pump_priv->env = this->ctx->env; if (!pump_priv->env) { gf_log (this->name, GF_LOG_ERROR, "Could not create new sync-environment"); @@ -2641,21 +2357,67 @@ init (xlator_t *this) goto out; } + /* keep more local here as we may need them for self-heal etc */ + this->local_pool = mem_pool_new (afr_local_t, 128); + if (!this->local_pool) { + ret = -1; + gf_log (this->name, GF_LOG_ERROR, + "failed to create local_t's memory pool"); + goto out; + } + priv->pump_private = pump_priv; + pump_priv = NULL; - pthread_mutex_init (&priv->mutex, NULL); - INIT_LIST_HEAD (&priv->saved_fds); + this->private = priv; + priv = NULL; pump_change_state (this, PUMP_STATE_ABORT); ret = 0; out: + + if (pump_priv) { + GF_FREE (pump_priv->resume_path); + LOCK_DESTROY (&pump_priv->resume_path_lock); + LOCK_DESTROY (&pump_priv->pump_state_lock); + GF_FREE (pump_priv); + } + + if (priv) { + GF_FREE (priv->child_up); + GF_FREE (priv->children); + GF_FREE (priv->pending_key); + GF_FREE (priv->last_event); + LOCK_DESTROY (&priv->lock); + GF_FREE (priv); + } + return ret; } int fini (xlator_t *this) { + afr_private_t * priv = NULL; + pump_private_t *pump_priv = NULL; + + priv = this->private; + this->private = NULL; + if (!priv) + goto out; + + pump_priv = priv->pump_private; + if (!pump_priv) + goto afr_priv; + + GF_FREE (pump_priv->resume_path); + LOCK_DESTROY (&pump_priv->resume_path_lock); + LOCK_DESTROY (&pump_priv->pump_state_lock); + GF_FREE (pump_priv); +afr_priv: + afr_priv_destroy (priv); +out: return 0; } @@ -2703,6 +2465,7 @@ struct xlator_dumpops dumpops = { struct xlator_cbks cbks = { .release = pump_release, .releasedir = pump_releasedir, + .forget = pump_forget, }; struct volume_options options[] = { diff --git a/xlators/cluster/afr/src/pump.h b/xlators/cluster/afr/src/pump.h index 9389d51bb..9d0b6db6a 100644 --- a/xlators/cluster/afr/src/pump.h +++ b/xlators/cluster/afr/src/pump.h @@ -1,20 +1,11 @@ /* - Copyright (c) 2007-2010 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef __PUMP_H__ @@ -26,16 +17,6 @@ #define CLIENT_CMD_CONNECT "trusted.glusterfs.client-connect" #define CLIENT_CMD_DISCONNECT "trusted.glusterfs.client-disconnect" -#define IS_ROOT_PATH(path) (!strcmp (path, "/")) -#define IS_ENTRY_CWD(entry) (!strcmp (entry, ".")) -#define IS_ENTRY_PARENT(entry) (!strcmp (entry, "..")) - -#define PUMP_CMD_START "trusted.glusterfs.pump.start" -#define PUMP_CMD_COMMIT "trusted.glusterfs.pump.commit" -#define PUMP_CMD_ABORT "trusted.glusterfs.pump.abort" -#define PUMP_CMD_PAUSE "trusted.glusterfs.pump.pause" -#define PUMP_CMD_STATUS "trusted.glusterfs.pump.status" - #define PUMP_SOURCE_COMPLETE "trusted.glusterfs.pump-source-complete" #define PUMP_SINK_COMPLETE "trusted.glusterfs.pump-sink-complete" @@ -54,7 +35,7 @@ typedef enum { typedef struct _pump_private { struct syncenv *env; /* The env pointer to the pump synctask */ - const char *resume_path; /* path to resume from the last pause */ + char *resume_path; /* path to resume from the last pause */ gf_lock_t resume_path_lock; /* Synchronize resume_path changes */ gf_lock_t pump_state_lock; /* Synchronize pump_state changes */ pump_state_t pump_state; /* State of pump */ @@ -94,4 +75,7 @@ pump_command_status (xlator_t *this, dict_t *dict); int pump_execute_status (call_frame_t *frame, xlator_t *this); +int +pump_command_reply (call_frame_t *frame, xlator_t *this); + #endif /* __PUMP_H__ */ |
