diff options
| author | Anand Avati <avati@redhat.com> | 2014-01-16 16:14:36 -0800 | 
|---|---|---|
| committer | Vijay Bellur <vbellur@redhat.com> | 2014-03-22 05:25:57 -0700 | 
| commit | 6d3739292b7b51d2ddbab75b5f884fb38925b943 (patch) | |
| tree | cf332a881a49c0904a7e023935750c2d080fc1c5 /xlators/cluster | |
| parent | eb87c96f49b3dd2c7460e58c54ce909c706cd475 (diff) | |
cluster/afr: refactor
- Remove client side self-healing completely (opendir, openfd, lookup)
- Re-work readdir-failover to work reliably in case of NFS
- Remove unused/dead lock recovery code
- Consistently use xdata in both calls and callbacks in all FOPs
- Per-inode event generation, used to force inode ctx refresh
- Implement dirty flag support (in place of pending counts)
- Eliminate inode ctx structure, use read subvol bits + event_generation
- Implement inode ctx refreshing based on event generation
- Provide backward compatibility in transactions
- remove unused variables and functions
- make code more consistent in style and pattern
- regularize and clean up inode-write transaction code
- regularize and clean up dir-write transaction code
- regularize and clean up common FOPs
- reorganize transaction framework code
- skip setting xattrs in pending dict if nothing is pending
- re-write self-healing code using syncops
- re-write simpler self-heal-daemon
Change-Id: I1e4080c9796c8a2815c2dab4be3073f389d614a8
BUG: 1021686
Signed-off-by: Anand Avati <avati@redhat.com>
Reviewed-on: http://review.gluster.org/6010
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Vijay Bellur <vbellur@redhat.com>
Diffstat (limited to 'xlators/cluster')
29 files changed, 8734 insertions, 18979 deletions
diff --git a/xlators/cluster/afr/src/Makefile.am b/xlators/cluster/afr/src/Makefile.am index 35d18a6c0da..ea5a90abbdb 100644 --- a/xlators/cluster/afr/src/Makefile.am +++ b/xlators/cluster/afr/src/Makefile.am @@ -2,24 +2,26 @@ xlator_LTLIBRARIES = afr.la pump.la  xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster  afr_common_source = afr-dir-read.c afr-dir-write.c afr-inode-read.c \ -	afr-inode-write.c afr-open.c afr-transaction.c afr-self-heal-data.c \ -	afr-self-heal-common.c afr-self-heal-metadata.c afr-self-heal-entry.c \ -	afr-self-heal-algorithm.c afr-lk-common.c afr-self-heald.c \ +	afr-inode-write.c afr-open.c afr-transaction.c afr-lk-common.c \ +	afr-read-txn.c \  	$(top_builddir)/xlators/lib/src/libxlator.c +AFR_SELFHEAL_SOURCES = afr-self-heal-common.c afr-self-heal-data.c \ +	afr-self-heal-entry.c afr-self-heal-metadata.c afr-self-heald.c \ +	afr-self-heal-name.c +  afr_la_LDFLAGS = -module -avoid-version -afr_la_SOURCES = $(afr_common_source) afr.c +afr_la_SOURCES = $(afr_common_source) $(AFR_SELFHEAL_SOURCES) afr.c  afr_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la  pump_la_LDFLAGS = -module -avoid-version -pump_la_SOURCES =  $(afr_common_source) pump.c +pump_la_SOURCES =  $(afr_common_source) $(AFR_SELFHEAL_SOURCES) pump.c  pump_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la  noinst_HEADERS = afr.h afr-transaction.h afr-inode-write.h afr-inode-read.h \ -	afr-dir-read.h afr-dir-write.h afr-self-heal.h afr-self-heal-common.h \ -	afr-self-heal-algorithm.h pump.h afr-mem-types.h afr-common.c \ -	afr-self-heald.h $(top_builddir)/xlators/lib/src/libxlator.h \ -	$(top_builddir)/glusterfsd/src/glusterfsd.h +	afr-dir-read.h afr-dir-write.h afr-self-heal.h afr-mem-types.h \ +	afr-common.c afr-self-heald.h pump.h \ +	$(top_builddir)/xlators/lib/src/libxlator.h  AM_CPPFLAGS = $(GF_CPPFLAGS) \  	-I$(top_srcdir)/libglusterfs/src -I$(top_srcdir)/xlators/lib/src \ @@ -31,7 +33,6 @@ CLEANFILES =  uninstall-local:  	rm -f 
$(DESTDIR)$(xlatordir)/replicate.so -	rm -f $(DESTDIR)$(xlatordir)/pump.so  install-data-hook:  	ln -sf afr.so $(DESTDIR)$(xlatordir)/replicate.so diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index 224d3054626..2bab0f8533d 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -45,787 +45,797 @@  #include "afr-dir-write.h"  #include "afr-transaction.h"  #include "afr-self-heal.h" -#include "afr-self-heal-common.h"  #include "afr-self-heald.h" -#include "pump.h" -#define AFR_ICTX_OPENDIR_DONE_MASK     0x0000000100000000ULL -#define AFR_ICTX_READ_CHILD_MASK       0x00000000FFFFFFFFULL -#define AFR_STATISTICS_HISTORY_SIZE    50 -int -afr_lookup_done_success_action (call_frame_t *frame, xlator_t *this, -                                gf_boolean_t fail_conflict); -void -afr_children_copy (int32_t *dst, int32_t *src, unsigned int child_count) -{ -        int     i = 0; -        for (i = 0; i < child_count; i++) -                dst[i] = src[i]; -} - -void -afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req, const char *path) +call_frame_t * +afr_copy_frame (call_frame_t *base)  { -        int             i           = 0; -        afr_private_t   *priv       = NULL; -        int             ret         = 0; +	afr_local_t *local = NULL; +	call_frame_t *frame = NULL; +	int op_errno = 0; -        priv   = this->private; +	frame = copy_frame (base); +	if (!frame) +		return NULL; +	local = AFR_FRAME_INIT (frame, op_errno); +	if (!local) { +		AFR_STACK_DESTROY (frame); +		return NULL; +	} -        for (i = 0; i < priv->child_count; i++) { -                ret = dict_set_uint64 (xattr_req, priv->pending_key[i], -                                       3 * sizeof(int32_t)); -                if (ret < 0) -                        gf_log (this->name, GF_LOG_WARNING, -                                "%s: Unable to set dict value for %s", -                                path, 
priv->pending_key[i]); -                /* 3 = data+metadata+entry */ -        } -        ret = dict_set_int32 (xattr_req, GF_GFIDLESS_LOOKUP, 1); -        if (ret) { -                gf_log (this->name, GF_LOG_DEBUG, "%s: failed to set gfidless " -                        "lookup", path); -        } +	return frame;  } +/* + * INODE CTX 64-bit VALUE FORMAT FOR SMALL (<= 16) SUBVOL COUNTS: + * + * |<----------   64bit   ------------>| + *  63           32 31    16 15       0 + * |   EVENT_GEN   |  DATA  | METADATA | + * + * + *  METADATA (bit-0 .. bit-15): bitmap representing subvolumes from which + *                              metadata can be attempted to be read. + * + *                              bit-0 => priv->subvolumes[0] + *                              bit-1 => priv->subvolumes[1] + *                              ... etc. till bit-15 + * + *  DATA (bit-16 .. bit-31): bitmap representing subvolumes from which data + *                           can be attempted to be read. + * + *                           bit-16 => priv->subvolumes[0] + *                           bit-17 => priv->subvolumes[1] + *                           ... etc. till bit-31 + * + *  EVENT_GEN (bit-32 .. bit-63): event generation (i.e priv->event_generation) + *                                when DATA and METADATA was last updated. + * + *                                If EVENT_GEN is < priv->event_generation, + *                                or is 0, it means afr_inode_refresh() needs + *                                to be called to recalculate the bitmaps. 
+ */ +  int -afr_lookup_xattr_req_prepare (afr_local_t *local, xlator_t *this, -                              dict_t *xattr_req, loc_t *loc, void **gfid_req) +__afr_inode_read_subvol_get_small (inode_t *inode, xlator_t *this, +				   unsigned char *data, unsigned char *metadata, +				   int *event_p)  { -        int     ret = -ENOMEM; +	afr_private_t *priv = NULL; +	int ret = -1; +	uint16_t datamap = 0; +	uint16_t metadatamap = 0; +	uint32_t event = 0; +	uint64_t val = 0; +	int i = 0; -        GF_ASSERT (gfid_req); +	priv = this->private; -        *gfid_req = NULL; -        local->xattr_req = dict_new (); -        if (!local->xattr_req) -                goto out; -        if (xattr_req) -                dict_copy (xattr_req, local->xattr_req); +	ret = __inode_ctx_get (inode, this, &val); +	if (ret < 0) +		return ret; -        afr_xattr_req_prepare (this, local->xattr_req, loc->path); -        ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_INODELK_COUNT, 0); -        if (ret < 0) { -                gf_log (this->name, GF_LOG_WARNING, -                        "%s: Unable to set dict value for %s", -                        loc->path, GLUSTERFS_INODELK_COUNT); -        } -        ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_ENTRYLK_COUNT, 0); -        if (ret < 0) { -                gf_log (this->name, GF_LOG_WARNING, -                        "%s: Unable to set dict value for %s", -                        loc->path, GLUSTERFS_ENTRYLK_COUNT); -        } +	metadatamap = (val & 0x000000000000ffff); +	datamap =     (val & 0x00000000ffff0000) >> 16; +	event =       (val & 0xffffffff00000000) >> 32; -        ret = dict_set_uint32 (local->xattr_req, GLUSTERFS_PARENT_ENTRYLK, 0); -        if (ret < 0) { -                gf_log (this->name, GF_LOG_WARNING, -                        "%s: Unable to set dict value for %s", -                        loc->path, GLUSTERFS_PARENT_ENTRYLK); -        } +	for (i = 0; i < priv->child_count; i++) { +		if (metadata) +			metadata[i] 
= (metadatamap >> i) & 1; +		if (data) +			data[i] = (datamap >> i) & 1; +	} -        ret = dict_get_ptr (local->xattr_req, "gfid-req", gfid_req); -        if (ret) { -                gf_log (this->name, GF_LOG_DEBUG, -                        "%s: failed to get the gfid from dict", loc->path); -                *gfid_req = NULL; -        } else { -                if (loc->parent != NULL) -                        dict_del (local->xattr_req, "gfid-req"); -        } -        ret = 0; -out: -        return ret; +	if (event_p) +		*event_p = event; +	return ret;  } -void -afr_lookup_save_gfid (uuid_t dst, void* new, const loc_t *loc) -{ -        inode_t  *inode = NULL; - -        inode = loc->inode; -        if (inode && !uuid_is_null (inode->gfid)) -                uuid_copy (dst, inode->gfid); -        else if (!uuid_is_null (loc->gfid)) -                uuid_copy (dst, loc->gfid); -        else if (new && !uuid_is_null (new)) -                uuid_copy (dst, new); -}  int -afr_errno_count (int32_t *children, int *child_errno, -                 unsigned int child_count, int32_t op_errno) -{ -        int i = 0; -        int errno_count = 0; -        int child = 0; +__afr_inode_read_subvol_set_small (inode_t *inode, xlator_t *this, +				   unsigned char *data, unsigned char *metadata, +				   int event) +{ +	afr_private_t *priv = NULL; +	uint16_t datamap = 0; +	uint16_t metadatamap = 0; +	uint64_t val = 0; +	int i = 0; + +	priv = this->private; + +	for (i = 0; i < priv->child_count; i++) { +		if (data[i]) +			datamap |= (1 << i); +		if (metadata[i]) +			metadatamap |= (1 << i); +	} -        for (i = 0; i < child_count; i++) { -                if (children) { -                        child = children[i]; -                        if (child == -1) -                                break; -                } else { -                        child = i; -                } -                if (child_errno[child] == op_errno) -                        errno_count++; -        } -      
  return errno_count; -} +	val = ((uint64_t) metadatamap) | +		(((uint64_t) datamap) << 16) | +		(((uint64_t) event) << 32); -int32_t -afr_set_dict_gfid (dict_t *dict, uuid_t gfid) -{ -        int ret       = 0; -        uuid_t *pgfid = NULL; +	return __inode_ctx_set (inode, this, &val); +} -        GF_ASSERT (gfid); -        pgfid = GF_CALLOC (1, sizeof (uuid_t), gf_common_mt_char); -        if (!pgfid) { -                ret = -1; -                goto out; -        } +int +__afr_inode_read_subvol_reset_small (inode_t *inode, xlator_t *this) +{ +	int ret = -1; +	uint16_t datamap = 0; +	uint16_t metadatamap = 0; +	uint32_t event = 0; +	uint64_t val = 0; -        uuid_copy (*pgfid, gfid); +	ret = __inode_ctx_get (inode, this, &val); +	(void) ret; -        ret = dict_set_dynptr (dict, "gfid-req", pgfid, sizeof (uuid_t)); -        if (ret) -                gf_log (THIS->name, GF_LOG_ERROR, "gfid set failed"); +	metadatamap = (val & 0x000000000000ffff) >> 0; +	datamap =     (val & 0x00000000ffff0000) >> 16; +	event = 0; -out: -        if (ret && pgfid) -                GF_FREE (pgfid); +	val = ((uint64_t) metadatamap) | +		(((uint64_t) datamap) << 16) | +		(((uint64_t) event) << 32); -        return ret; +	return __inode_ctx_set (inode, this, &val);  } -void -afr_inode_ctx_destroy (afr_inode_ctx_t *ctx) -{ -        if (!ctx) -                return; -        GF_FREE (ctx->fresh_children); -        GF_FREE (ctx); -} -afr_inode_ctx_t* -__afr_inode_ctx_get (inode_t *inode, xlator_t *this) +int +__afr_inode_read_subvol_get (inode_t *inode, xlator_t *this, +			     unsigned char *data, unsigned char *metadata, +			     int *event_p)  { -        int             ret      = 0; -        uint64_t        ctx_addr = 0; -        afr_inode_ctx_t *ctx     = NULL; -        afr_private_t   *priv    = NULL; +	afr_private_t *priv = NULL; +	int ret = -1; -        priv = this->private; -        ret = __inode_ctx_get (inode, this, &ctx_addr); -        if (ret < 0) -                ctx_addr 
= 0; -        if (ctx_addr != 0) { -                ctx = (afr_inode_ctx_t*) (long) ctx_addr; -                goto out; -        } -        ctx = GF_CALLOC (1, sizeof (*ctx), -                         gf_afr_mt_inode_ctx_t); -        if (!ctx) -                goto fail; -        ctx->fresh_children = GF_CALLOC (priv->child_count, -                                         sizeof (*ctx->fresh_children), -                                         gf_afr_mt_int32_t); -        if (!ctx->fresh_children) -                goto fail; -        ret = __inode_ctx_put (inode, this, (uint64_t)ctx); -        if (ret) { -                gf_log_callingfn (this->name, GF_LOG_ERROR, "failed to " -                                  "set the inode ctx (%s)", -                                  uuid_utoa (inode->gfid)); -                goto fail; -        } +	priv = this->private; -out: -        return ctx; +	if (priv->child_count <= 16) +		ret = __afr_inode_read_subvol_get_small (inode, this, data, +							 metadata, event_p); +	else +		/* TBD: allocate structure with array and read from it */ +		ret = -1; -fail: -        afr_inode_ctx_destroy (ctx); -        return NULL; +	return ret;  } -afr_inode_ctx_t* -afr_inode_ctx_get (inode_t *inode, xlator_t *this) + +int +__afr_inode_read_subvol_set (inode_t *inode, xlator_t *this, unsigned char *data, +			     unsigned char *metadata, int event)  { -        afr_inode_ctx_t *ctx = NULL; +	afr_private_t *priv = NULL; +	int ret = -1; -        LOCK (&inode->lock); -        { -                ctx = __afr_inode_ctx_get (inode, this); -        } -        UNLOCK (&inode->lock); -        return ctx; +	priv = this->private; + +	if (priv->child_count <= 16) +		ret = __afr_inode_read_subvol_set_small (inode, this, data, +							 metadata, event); +	else +		ret = -1; + +	return ret;  } -void -afr_inode_get_ctx_params (xlator_t *this, inode_t *inode, -                          afr_inode_params_t *params) + +int +__afr_inode_read_subvol_reset (inode_t 
*inode, xlator_t *this)  { -        GF_ASSERT (inode); -        GF_ASSERT (params); +	afr_private_t *priv = NULL; +	int ret = -1; -        afr_inode_ctx_t *ctx = NULL; -        afr_private_t   *priv = NULL; -        int             i = 0; -        int32_t         read_child = -1; -        int32_t         *fresh_children = NULL; +	priv = this->private; -        priv = this->private; -        LOCK (&inode->lock); -        { -                ctx = __afr_inode_ctx_get (inode, this); -                if (!ctx) -                        goto unlock; -                switch (params->op) { -                case AFR_INODE_GET_READ_CTX: -                        fresh_children = params->u.read_ctx.children; -                        read_child = (int32_t)(ctx->masks & -                                               AFR_ICTX_READ_CHILD_MASK); -                        params->u.read_ctx.read_child = read_child; -                        if (!fresh_children) -                                goto unlock; -                        for (i = 0; i < priv->child_count; i++) -                                fresh_children[i] = ctx->fresh_children[i]; -                        break; -                case AFR_INODE_GET_OPENDIR_DONE: -                        params->u.value = _gf_false; -                        if (ctx->masks & AFR_ICTX_OPENDIR_DONE_MASK) -                                params->u.value = _gf_true; -                        break; -                default: -                        GF_ASSERT (0); -                        break; -                } -        } -unlock: -        UNLOCK (&inode->lock); +	if (priv->child_count <= 16) +		ret = __afr_inode_read_subvol_reset_small (inode, this); +	else +		ret = -1; + +	return ret;  } -gf_boolean_t -afr_is_split_brain (xlator_t *this, inode_t *inode) + +int +afr_inode_read_subvol_get (inode_t *inode, xlator_t *this, unsigned char *data, +			   unsigned char *metadata, int *event_p)  { -        afr_inode_ctx_t *ctx = NULL; -        
gf_boolean_t    spb  = _gf_false; +	int ret = -1; -        ctx = afr_inode_ctx_get (inode, this); -        if (!ctx) -                goto out; -        if ((ctx->mdata_spb == SPB) || (ctx->data_spb == SPB)) -                spb = _gf_true; -out: -        return spb; +	LOCK(&inode->lock); +	{ +		ret = __afr_inode_read_subvol_get (inode, this, data, +						   metadata, event_p); +	} +	UNLOCK(&inode->lock); + +	return ret;  } -gf_boolean_t -afr_is_opendir_done (xlator_t *this, inode_t *inode) + +int +afr_inode_read_subvol_set (inode_t *inode, xlator_t *this, unsigned char *data, +			   unsigned char *metadata, int event)  { -        afr_inode_params_t params = {0}; +	int ret = -1; + +	LOCK(&inode->lock); +	{ +		ret = __afr_inode_read_subvol_set (inode, this, data, metadata, +						   event); +	} +	UNLOCK(&inode->lock); -        params.op = AFR_INODE_GET_OPENDIR_DONE; -        afr_inode_get_ctx_params (this, inode, &params); -        return params.u.value; +	return ret;  } -int32_t -afr_inode_get_read_ctx (xlator_t *this, inode_t *inode, int32_t *fresh_children) + +int +afr_inode_read_subvol_reset (inode_t *inode, xlator_t *this)  { -        afr_inode_params_t      params = {0}; +	int ret = -1; + +	LOCK(&inode->lock); +	{ +		ret = __afr_inode_read_subvol_reset (inode, this); +	} +	UNLOCK(&inode->lock); -        params.op = AFR_INODE_GET_READ_CTX; -        params.u.read_ctx.children = fresh_children; -        afr_inode_get_ctx_params (this, inode, &params); -        return params.u.read_ctx.read_child; +	return ret;  } -void -afr_inode_ctx_set_read_child (afr_inode_ctx_t *ctx, int32_t read_child) -{ -        uint64_t        remaining_mask = 0; -        uint64_t        mask         = 0; -        remaining_mask = (~AFR_ICTX_READ_CHILD_MASK & ctx->masks); -        mask = (AFR_ICTX_READ_CHILD_MASK & read_child); -        ctx->masks = remaining_mask | mask; -} +int +afr_accused_fill (xlator_t *this, dict_t *xdata, unsigned char *accused, +		  afr_transaction_type type) +{ +	
afr_private_t *priv = NULL; +	int i = 0; +	int idx = afr_index_for_transaction_type (type); +	void *pending_raw = NULL; +	int pending[3]; +	int ret = 0; + +	priv = this->private; + +	for (i = 0; i < priv->child_count; i++) { +		ret = dict_get_ptr (xdata, priv->pending_key[i], +				    &pending_raw); +		if (ret) /* no pending flags */ +			continue; +		memcpy (pending, pending_raw, sizeof(pending)); + +		if (ntoh32 (pending[idx])) +			accused[i] = 1; +	} -void -afr_inode_ctx_set_read_ctx (afr_inode_ctx_t *ctx, int32_t read_child, -                            int32_t *fresh_children, int32_t child_count) -{ -        int             i            = 0; - -        afr_inode_ctx_set_read_child (ctx, read_child); -        for (i = 0; i < child_count; i++) { -                if (fresh_children) -                        ctx->fresh_children[i] = fresh_children[i]; -                else -                        ctx->fresh_children[i] = -1; -        } +	return 0;  } -void -afr_inode_ctx_rm_stale_children (afr_inode_ctx_t *ctx, int32_t *stale_children, -                                 int32_t child_count) + +int +afr_accuse_smallfiles (xlator_t *this, struct afr_reply *replies, +		       unsigned char *data_accused)  { -        int             i            = 0; -        int32_t         read_child   = -1; +	int i = 0; +	afr_private_t *priv = NULL; +	uint64_t maxsize = 0; -        GF_ASSERT (stale_children); -        for (i = 0; i < child_count; i++) { -                if (stale_children[i] == -1) -                        break; -                afr_children_rm_child (ctx->fresh_children, -                                       stale_children[i], child_count); -        } -        read_child = (int32_t)(ctx->masks & AFR_ICTX_READ_CHILD_MASK); -        if (!afr_is_child_present (ctx->fresh_children, child_count, -                                   read_child)) -                afr_inode_ctx_set_read_child (ctx, ctx->fresh_children[0]); -} +	priv = this->private; -void 
-afr_inode_ctx_set_opendir_done (afr_inode_ctx_t *ctx) -{ -        uint64_t        remaining_mask = 0; -        uint64_t        mask = 0; +	for (i = 0; i < priv->child_count; i++) { +		if (data_accused[i]) +			continue; +		if (replies[i].poststat.ia_size > maxsize) +			maxsize = replies[i].poststat.ia_size; +	} -        remaining_mask = (~AFR_ICTX_OPENDIR_DONE_MASK & ctx->masks); -        mask = (0xFFFFFFFFFFFFFFFFULL & AFR_ICTX_OPENDIR_DONE_MASK); -        ctx->masks = remaining_mask | mask; +	for (i = 0; i < priv->child_count; i++) { +		if (data_accused[i]) +			continue; +		if (replies[i].poststat.ia_size < maxsize) +			data_accused[i] = 1; +	} + +	return 0;  } -void -afr_inode_set_ctx_params (xlator_t *this, inode_t *inode, -                          afr_inode_params_t *params) -{ -        GF_ASSERT (inode); -        GF_ASSERT (params); -        afr_inode_ctx_t *ctx            = NULL; -        afr_private_t   *priv           = NULL; -        int32_t         read_child      = -1; -        int32_t         *fresh_children = NULL; -        int32_t         *stale_children = NULL; +int +afr_replies_interpret (call_frame_t *frame, xlator_t *this, inode_t *inode) +{ +	afr_local_t *local = NULL; +	afr_private_t *priv = NULL; +	struct afr_reply *replies = NULL; +	int event_generation = 0; +	int i = 0; +	unsigned char *data_accused = NULL; +	unsigned char *metadata_accused = NULL; +	unsigned char *data_readable = NULL; +	unsigned char *metadata_readable = NULL; +	int ret = 0; -        priv = this->private; -        LOCK (&inode->lock); -        { -                ctx = __afr_inode_ctx_get (inode, this); -                if (!ctx) -                        goto unlock; -                switch (params->op) { -                case AFR_INODE_SET_READ_CTX: -                        read_child = params->u.read_ctx.read_child; -                        fresh_children = params->u.read_ctx.children; -                        afr_inode_ctx_set_read_ctx (ctx, read_child, -                
                                    fresh_children, -                                                    priv->child_count); -                        break; -                case AFR_INODE_RM_STALE_CHILDREN: -                        stale_children = params->u.read_ctx.children; -                        afr_inode_ctx_rm_stale_children (ctx, -                                                         stale_children, -                                                         priv->child_count); -                        break; -                case AFR_INODE_SET_OPENDIR_DONE: -                        afr_inode_ctx_set_opendir_done (ctx); -                        break; -                default: -                        GF_ASSERT (0); -                        break; -                } -        } -unlock: -        UNLOCK (&inode->lock); -} +	local = frame->local; +	priv = this->private; +	replies = local->replies; +	event_generation = local->event_generation; + +	data_accused = alloca0 (priv->child_count); +	data_readable = alloca0 (priv->child_count); +	metadata_accused = alloca0 (priv->child_count); +	metadata_readable = alloca0 (priv->child_count); + +	for (i = 0; i < priv->child_count; i++) { +		data_readable[i] = 1; +		metadata_readable[i] = 1; +	} -void -afr_set_split_brain (xlator_t *this, inode_t *inode, afr_spb_state_t mdata_spb, -                     afr_spb_state_t data_spb) -{ -        afr_inode_ctx_t *ctx = NULL; +	for (i = 0; i < priv->child_count; i++) { +		if (!replies[i].valid) { +			data_readable[i] = 0; +			metadata_readable[i] = 0; +			continue; +		} + +		if (replies[i].op_ret == -1) { +			data_readable[i] = 0; +			metadata_readable[i] = 0; +			continue; +		} + +		afr_accused_fill (this, replies[i].xdata, data_accused, +				  (inode->ia_type == IA_IFDIR) ? 
+				   AFR_ENTRY_TRANSACTION : AFR_DATA_TRANSACTION); + +		afr_accused_fill (this, replies[i].xdata, +				  metadata_accused, AFR_METADATA_TRANSACTION); + +	} -        ctx = afr_inode_ctx_get (inode, this); -        if (mdata_spb != DONT_KNOW) -                ctx->mdata_spb = mdata_spb; -        if (data_spb != DONT_KNOW) -                ctx->data_spb = data_spb; +	if (inode->ia_type != IA_IFDIR) +		afr_accuse_smallfiles (this, replies, data_accused); + +	for (i = 0; i < priv->child_count; i++) { +		if (data_accused[i]) { +			data_readable[i] = 0; +			ret = 1; +		} +		if (metadata_accused[i]) { +			metadata_readable[i] = 0; +			ret = 1; +		} +	} + +	afr_inode_read_subvol_set (inode, this, data_readable, +				   metadata_readable, event_generation); +	return ret;  } -void -afr_set_opendir_done (xlator_t *this, inode_t *inode) -{ -        afr_inode_params_t params = {0}; -        params.op = AFR_INODE_SET_OPENDIR_DONE; -        afr_inode_set_ctx_params (this, inode, &params); + +int +afr_refresh_selfheal_done (int ret, call_frame_t *heal, void *opaque) +{ +	if (heal) +		STACK_DESTROY (heal->root); +	return 0;  } -void -afr_inode_set_read_ctx (xlator_t *this, inode_t *inode, int32_t read_child, -                        int32_t *fresh_children) +int +afr_inode_refresh_err (call_frame_t *frame, xlator_t *this)  { -        afr_inode_params_t params = {0}; -        afr_private_t      *priv  = NULL; +	afr_local_t *local = NULL; +	afr_private_t *priv = NULL; +	int i = 0; +	int err = 0; -        priv = this->private; -        GF_ASSERT (read_child >= 0); -        GF_ASSERT (fresh_children); -        GF_ASSERT (afr_is_child_present (fresh_children, priv->child_count, -                                         read_child)); - -        params.op = AFR_INODE_SET_READ_CTX; -        params.u.read_ctx.read_child     = read_child; -        params.u.read_ctx.children = fresh_children; -        afr_inode_set_ctx_params (this, inode, &params); +	local = frame->local; +	priv = this->private; 
+ +	for (i = 0; i < priv->child_count; i++) { +		if (local->replies[i].valid && !local->replies[i].op_ret) { +			err = 0; +			goto ret; +		} +	} + +	err = afr_final_errno (local, priv); +ret: +	return -err;  } -void -afr_inode_rm_stale_children (xlator_t *this, inode_t *inode, -                             int32_t *stale_children) + +int +afr_refresh_selfheal_wrap (void *opaque)  { -        afr_inode_params_t params = {0}; +	call_frame_t *frame = opaque; +	afr_local_t *local = NULL; +	xlator_t *this = NULL; +	int err = 0; + +	local = frame->local; +	this = frame->this; -        GF_ASSERT (stale_children); +	afr_selfheal (frame->this, local->refreshinode->gfid); -        params.op = AFR_INODE_RM_STALE_CHILDREN; -        params.u.read_ctx.children = stale_children; -        afr_inode_set_ctx_params (this, inode, &params); +	afr_selfheal_unlocked_discover (frame, local->refreshinode, +					local->refreshinode->gfid, +					local->replies); + +	afr_replies_interpret (frame, this, local->refreshinode); + +	err = afr_inode_refresh_err (frame, this); + +	afr_replies_wipe (local, this->private); + +	local->refreshfn (frame, this, err); + +	return 0;  } +  gf_boolean_t -afr_is_source_child (int32_t *sources, int32_t child_count, int32_t child) +afr_selfheal_enabled (xlator_t *this)  { -        gf_boolean_t             source_xattrs = _gf_false; +	afr_private_t *priv = NULL; +	gf_boolean_t data = _gf_false; -        GF_ASSERT (child < child_count); +	priv = this->private; -        if ((child >= 0) && (child < child_count) && -             sources[child]) { -                source_xattrs = _gf_true; -        } -        return source_xattrs; +	gf_string2boolean (priv->data_self_heal, &data); + +	return data || priv->metadata_self_heal || priv->entry_self_heal;  } -gf_boolean_t -afr_is_child_present (int32_t *success_children, int32_t child_count, -                      int32_t child) + + +int +afr_inode_refresh_done (call_frame_t *frame, xlator_t *this)  { -        gf_boolean_t     
        success_child = _gf_false; -        int                      i = 0; +	call_frame_t *heal = NULL; +	afr_local_t *local = NULL; +	int ret = 0; +	int err = 0; -        GF_ASSERT (child < child_count); +	local = frame->local; -        for (i = 0; i < child_count; i++) { -                if (success_children[i] == -1) -                        break; -                if (child == success_children[i]) { -                        success_child = _gf_true; -                        break; -                } -        } -        return success_child; +	ret = afr_replies_interpret (frame, this, local->refreshinode); + +	err = afr_inode_refresh_err (frame, this); + +	afr_replies_wipe (local, this->private); + +	if (ret && afr_selfheal_enabled (this)) { +		heal = copy_frame (frame); +		if (heal) +			heal->root->pid = -1; +		ret = synctask_new (this->ctx->env, afr_refresh_selfheal_wrap, +				    afr_refresh_selfheal_done, heal, frame); +		if (ret) +			goto refresh_done; +	} else { +	refresh_done: +		local->refreshfn (frame, this, err); +	} + +	return 0;  } -gf_boolean_t -afr_is_read_child (int32_t *success_children, int32_t *sources, -                   int32_t child_count, int32_t child) + +int +afr_inode_refresh_subvol_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +			      int op_ret, int op_errno, inode_t *inode, +			      struct iatt *buf, dict_t *xdata, struct iatt *par)  { -        gf_boolean_t             success_child = _gf_false; -        gf_boolean_t             source        = _gf_false; +	afr_local_t *local = NULL; +	int call_child = (long) cookie; +	int call_count = 0; -        if (child < 0) { -                return _gf_false; -        } +	local = frame->local; -        GF_ASSERT (success_children); -        GF_ASSERT (child_count > 0); +	local->replies[call_child].valid = 1; +	local->replies[call_child].op_ret = op_ret; +	local->replies[call_child].op_errno = op_errno; +	if (op_ret != -1) { +		local->replies[call_child].poststat = *buf; +		
local->replies[call_child].postparent = *par; +		local->replies[call_child].xdata = dict_ref (xdata); +	} -        success_child = afr_is_child_present (success_children, child_count, -                                              child); -        if (!success_child) -                goto out; -        if (NULL == sources) { -                source = _gf_true; -                goto out; -        } -        source = afr_is_source_child (sources, child_count, child); -out: -        return (success_child && source); +	call_count = afr_frame_return (frame); + +	if (call_count == 0) +		afr_inode_refresh_done (frame, this); + +	return 0;  } -int32_t -afr_hash_child (int32_t *success_children, int32_t child_count, -                unsigned int hmode, uuid_t gfid) + +int +afr_inode_refresh_subvol (call_frame_t *frame, xlator_t *this, int i, +			  inode_t *inode, dict_t *xdata)  { -        uuid_t  gfid_copy = {0,}; -        pid_t pid; +	loc_t loc = {0, }; +	afr_private_t *priv = NULL; -        if (!hmode) { -                return -1; -        } +	priv = this->private; -        if (gfid) { -               uuid_copy(gfid_copy,gfid); -        } -        if (hmode > 1) { -                /* -                 * Why getpid?  Because it's one of the cheapest calls -                 * available - faster than gethostname etc. - and returns a -                 * constant-length value that's sure to be shorter than a UUID. -                 * It's still very unlikely to be the same across clients, so -                 * it still provides good mixing.  We're not trying for -                 * perfection here.  All we need is a low probability that -                 * multiple clients won't converge on the same subvolume. 
-                 */ -                pid = getpid(); -                memcpy (gfid_copy, &pid, sizeof(pid)); -        } +	loc.inode = inode; +	uuid_copy (loc.gfid, inode->gfid); -        return SuperFastHash((char *)gfid_copy, -                             sizeof(gfid_copy)) % child_count; +	STACK_WIND_COOKIE (frame, afr_inode_refresh_subvol_cbk, +			   (void *) (long) i, priv->children[i], +			   priv->children[i]->fops->lookup, &loc, xdata); +	return 0;  } -/* If sources is NULL the xattrs are assumed to be of source for all - * success_children. - */ +  int -afr_select_read_child_from_policy (int32_t *success_children, -                                   int32_t child_count, int32_t prev_read_child, -                                   int32_t config_read_child, int32_t *sources, -                                   unsigned int hmode, uuid_t gfid) +afr_inode_refresh_do (call_frame_t *frame, xlator_t *this)  { -        int32_t                  read_child   = -1; -        int                      i            = 0; +	afr_local_t *local = NULL; +	afr_private_t *priv = NULL; +	int call_count = 0; +	int i = 0; +	dict_t *xdata = NULL; -        GF_ASSERT (success_children); +	priv = this->private; +	local = frame->local; -        read_child = config_read_child; -        if (afr_is_read_child (success_children, sources, child_count, -                               read_child)) -                goto out; +	afr_replies_wipe (local, priv); -        read_child = prev_read_child; -        if (afr_is_read_child (success_children, sources, child_count, -                               read_child)) -                goto out; +	xdata = dict_new (); +	if (!xdata) { +		afr_inode_refresh_done (frame, this); +		return 0; +	} -        read_child = afr_hash_child (success_children, child_count, -                                     hmode, gfid); -        if (afr_is_read_child (success_children, sources, child_count, -                               read_child)) { -                goto 
out; -        } +	if (afr_xattr_req_prepare (this, xdata) != 0) { +		dict_unref (xdata); +		afr_inode_refresh_done (frame, this); +		return 0; +	} -        for (i = 0; i < child_count; i++) { -                read_child = success_children[i]; -                if (read_child < 0) -                        break; -                if (afr_is_read_child (success_children, sources, child_count, -                                       read_child)) -                        goto out; -        } -        read_child = -1; +	local->call_count = AFR_COUNT (local->child_up, priv->child_count); -out: -        return read_child; +	call_count = local->call_count; +	for (i = 0; i < priv->child_count; i++) { +		if (!local->child_up[i]) +			continue; + +		afr_inode_refresh_subvol (frame, this, i, local->refreshinode, +					  xdata); + +		if (!--call_count) +			break; +	} + +	dict_unref (xdata); + +	return 0;  } -/* This function should be used when all the success_children are sources - */ -void -afr_set_read_ctx_from_policy (xlator_t *this, inode_t *inode, -                              int32_t *fresh_children, int32_t prev_read_child, -                              int32_t config_read_child, uuid_t gfid) + +int +afr_inode_refresh (call_frame_t *frame, xlator_t *this, inode_t *inode, +		   afr_inode_refresh_cbk_t refreshfn)  { -        int                      read_child = -1; -        afr_private_t            *priv = NULL; +	afr_local_t *local = NULL; -        priv = this->private; -        read_child = afr_select_read_child_from_policy (fresh_children, -                                                        priv->child_count, -                                                        prev_read_child, -                                                        config_read_child, -                                                        NULL, -                                                        priv->hash_mode, gfid); -        if (read_child >= 0) -                
afr_inode_set_read_ctx (this, inode, read_child, -                                        fresh_children); +	local = frame->local; + +	local->refreshfn = refreshfn; + +	if (local->refreshinode) { +		inode_unref (local->refreshinode); +		local->refreshinode = NULL; +	} + +	local->refreshinode = inode_ref (inode); + +	afr_inode_refresh_do (frame, this); + +	return 0;  } -/* afr_next_call_child () - * This is a common function used by all the read-type fops - * This function should not be called with the inode's read_children array. - * The fop's handler should make a copy of the inode's read_children, - * preferred read_child into the local vars, because while this function is - * in execution there is a chance for inode's read_ctx to change. - */ -int32_t -afr_next_call_child (int32_t *fresh_children, unsigned char *child_up, -                     size_t child_count, int32_t *last_index, -                     int32_t read_child) + +int +afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req)  { -        int             next_index      = 0; -        int32_t         next_call_child = -1; +        int             i           = 0; +        afr_private_t   *priv       = NULL; +        int             ret         = 0; -        GF_ASSERT (last_index); +        priv   = this->private; -        next_index = *last_index; -retry: -        next_index++; -        if ((next_index >= child_count) || -           (fresh_children[next_index] == -1)) -                goto out; -        if ((fresh_children[next_index] == read_child) || -           (!child_up[fresh_children[next_index]])) -                goto retry; -        *last_index = next_index; -        next_call_child = fresh_children[next_index]; -out: -        return next_call_child; +        for (i = 0; i < priv->child_count; i++) { +                ret = dict_set_uint64 (xattr_req, priv->pending_key[i], +                                       AFR_NUM_CHANGE_LOGS * sizeof(int)); +                if (ret < 0) +               
         gf_log (this->name, GF_LOG_WARNING, +                                "Unable to set dict value for %s", +                                priv->pending_key[i]); +                /* 3 = data+metadata+entry */ +        } +        ret = dict_set_uint64 (xattr_req, AFR_DIRTY, +			       AFR_NUM_CHANGE_LOGS * sizeof(int)); +        if (ret) { +                gf_log (this->name, GF_LOG_DEBUG, "failed to set dirty " +                        "query flag"); +        } + +	return ret;  } - /* This function should not be called with the inode's read_children array. - * The fop's handler should make a copy of the inode's read_children, - * preferred read_child into the local vars, because while this function is - * in execution there is a chance for inode's read_ctx to change. - */ -int32_t -afr_get_call_child (xlator_t *this, unsigned char *child_up, int32_t read_child, -                    int32_t *fresh_children, -                    int32_t *call_child, int32_t *last_index) +int +afr_lookup_xattr_req_prepare (afr_local_t *local, xlator_t *this, +                              dict_t *xattr_req, loc_t *loc)  { -        int             ret   = 0; -        afr_private_t   *priv = NULL; -        int             i     = 0; - -        GF_ASSERT (child_up); -        GF_ASSERT (call_child); -        GF_ASSERT (last_index); -        GF_ASSERT (fresh_children); +        int     ret = -ENOMEM; -        if (read_child < 0) { -                ret = -EIO; +        local->xattr_req = dict_new (); +        if (!local->xattr_req)                  goto out; -        } -        priv = this->private; -        *call_child = -1; -        *last_index = -1; +        if (xattr_req) +                dict_copy (xattr_req, local->xattr_req); -        if (child_up[read_child]) { -                *call_child = read_child; -        } else { -                for (i = 0; i < priv->child_count; i++) { -                        if (fresh_children[i] == -1) -                                break; -    
                    if (child_up[fresh_children[i]]) { -                                *call_child = fresh_children[i]; -                                ret = 0; -                                break; -                        } -                } +        ret = afr_xattr_req_prepare (this, local->xattr_req); +	if (ret < 0) { +		gf_log (this->name, GF_LOG_WARNING, +			"%s: Unable to prepare xattr_req", loc->path); +	} -                if (*call_child == -1) { -                        ret = -ENOTCONN; -                        goto out; -                } +        ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_INODELK_COUNT, 0); +        if (ret < 0) { +                gf_log (this->name, GF_LOG_WARNING, +                        "%s: Unable to set dict value for %s", +                        loc->path, GLUSTERFS_INODELK_COUNT); +        } +        ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_ENTRYLK_COUNT, 0); +        if (ret < 0) { +                gf_log (this->name, GF_LOG_WARNING, +                        "%s: Unable to set dict value for %s", +                        loc->path, GLUSTERFS_ENTRYLK_COUNT); +        } -                *last_index = i; +        ret = dict_set_uint32 (local->xattr_req, GLUSTERFS_PARENT_ENTRYLK, 0); +        if (ret < 0) { +                gf_log (this->name, GF_LOG_WARNING, +                        "%s: Unable to set dict value for %s", +                        loc->path, GLUSTERFS_PARENT_ENTRYLK);          } + +        ret = 0;  out: -        gf_log (this->name, GF_LOG_DEBUG, "Returning %d, call_child: %d, " -                "last_index: %d", ret, *call_child, *last_index);          return ret;  } -void -afr_reset_xattr (dict_t **xattr, unsigned int child_count) + +int +afr_hash_child (inode_t *inode, int32_t child_count, int hashmode)  { -        unsigned int i = 0; +        uuid_t gfid_copy = {0,}; +        pid_t pid; -        if (!xattr) -                goto out; -        for (i = 0; i < child_count; i++) { -          
      if (xattr[i]) { -                        dict_unref (xattr[i]); -                        xattr[i] = NULL; -                } +        if (!hashmode) { +                return -1;          } -out: -        return; -} -void -afr_xattr_array_destroy (dict_t **xattr, unsigned int child_count) -{ -        afr_reset_xattr (xattr, child_count); -        GF_FREE (xattr); -} +        if (inode) { +               uuid_copy (gfid_copy, inode->gfid); +        } -void -afr_local_sh_cleanup (afr_local_t *local, xlator_t *this) -{ -        afr_self_heal_t *sh = NULL; -        afr_private_t   *priv = NULL; +        if (hashmode > 1) { +                /* +                 * Why getpid?  Because it's one of the cheapest calls +                 * available - faster than gethostname etc. - and returns a +                 * constant-length value that's sure to be shorter than a UUID. +                 * It's still very unlikely to be the same across clients, so +                 * it still provides good mixing.  We're not trying for +                 * perfection here.  All we need is a low probability that +                 * multiple clients won't converge on the same subvolume. 
+                 */ +                pid = getpid(); +                memcpy (gfid_copy, &pid, sizeof(pid)); +        } -        sh = &local->self_heal; -        priv = this->private; +        return SuperFastHash((char *)gfid_copy, +                             sizeof(gfid_copy)) % child_count; +} -        if (sh->data_sh_info && strcmp (sh->data_sh_info, "")) -                GF_FREE (sh->data_sh_info); -        if (sh->metadata_sh_info && strcmp (sh->metadata_sh_info, "")) -                GF_FREE (sh->metadata_sh_info); +int +afr_read_subvol_select_by_policy (inode_t *inode, xlator_t *this, +				  unsigned char *readable) +{ +	afr_private_t *priv = NULL; +	int read_subvol = -1; +	int i = 0; -        GF_FREE (sh->buf); +	priv = this->private; -        GF_FREE (sh->parentbufs); +	/* first preference - explicitly specified or local subvolume */ +	if (priv->read_child >= 0 && readable[priv->read_child]) +		return priv->read_child; -        if (sh->inode) -                inode_unref (sh->inode); +	/* second preference - use hashed mode */ +	read_subvol = afr_hash_child (inode, priv->child_count, +				      priv->hash_mode); +	if (read_subvol >= 0 && readable[read_subvol]) +		return read_subvol; -        afr_xattr_array_destroy (sh->xattr, priv->child_count); +	for (i = 0; i < priv->child_count; i++) { +		if (readable[i]) +			return i; +	} -        GF_FREE (sh->child_errno); +	/* no readable subvolumes, either split brain or all subvols down */ -        afr_matrix_cleanup (sh->pending_matrix, priv->child_count); -        afr_matrix_cleanup (sh->delta_matrix, priv->child_count); +	return -1; +} -        GF_FREE (sh->sources); -        GF_FREE (sh->success); +int +afr_inode_read_subvol_type_get (inode_t *inode, xlator_t *this, +				unsigned char *readable, int *event_p, +				int type) +{ +	int ret = -1; -        GF_FREE (sh->locked_nodes); +	if (type == AFR_METADATA_TRANSACTION) +		ret = afr_inode_read_subvol_get (inode, this, 0, readable, +						 event_p); +	else +		
ret = afr_inode_read_subvol_get (inode, this, readable, 0, +						 event_p); +	return ret; +} -        if (sh->healing_fd) { -                fd_unref (sh->healing_fd); -                sh->healing_fd = NULL; -        } -        GF_FREE ((char *)sh->linkname); +int +afr_read_subvol_get (inode_t *inode, xlator_t *this, int *subvol_p, +		     int *event_p, afr_transaction_type type) +{ +	afr_private_t *priv = NULL; +	unsigned char *data_readable = NULL; +	unsigned char *metadata_readable = NULL; +	unsigned char *readable = NULL; +	unsigned char *intersection = NULL; +	int subvol = -1; +	int event = 0; -        GF_FREE (sh->success_children); +	priv = this->private; -        GF_FREE (sh->fresh_children); +	readable = alloca0 (priv->child_count); +	data_readable = alloca0 (priv->child_count); +	metadata_readable = alloca0 (priv->child_count); +	intersection = alloca0 (priv->child_count); -        GF_FREE (sh->fresh_parent_dirs); +	afr_inode_read_subvol_type_get (inode, this, readable, &event, type); -        loc_wipe (&sh->parent_loc); -        loc_wipe (&sh->lookup_loc); +	afr_inode_read_subvol_get (inode, this, data_readable, metadata_readable, +				   &event); -        GF_FREE (sh->checksum); +	AFR_INTERSECT (intersection, data_readable, metadata_readable, +		       priv->child_count); -        GF_FREE (sh->write_needed); -        if (sh->healing_fd) -                fd_unref (sh->healing_fd); +	if (AFR_COUNT (intersection, priv->child_count) > 0) +		subvol = afr_read_subvol_select_by_policy (inode, this, +							   intersection); +	else +		subvol = afr_read_subvol_select_by_policy (inode, this, +							   readable); +	if (subvol_p) +		*subvol_p = subvol; +	if (event_p) +		*event_p = event; +	return subvol;  } @@ -838,8 +848,6 @@ afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this)          priv = this->private;          afr_matrix_cleanup (local->pending, priv->child_count); -        afr_matrix_cleanup (local->transaction.txn_changelog, -              
              priv->child_count);          GF_FREE (local->internal_lock.locked_nodes); @@ -860,7 +868,25 @@ afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this)          loc_wipe (&local->transaction.parent_loc);          loc_wipe (&local->transaction.new_parent_loc); -        GF_FREE (local->transaction.postop_piggybacked); +} + + +void +afr_replies_wipe (afr_local_t *local, afr_private_t *priv) +{ +	int i; + +	if (!local->replies) +		return; + +	for (i = 0; i < priv->child_count; i++) { +		if (local->replies[i].xdata) { +			dict_unref (local->replies[i].xdata); +			local->replies[i].xdata = NULL; +		} +	} + +	memset (local->replies, 0, sizeof(*local->replies) * priv->child_count);  } @@ -872,7 +898,7 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this)          if (!local)                  return; -        afr_local_sh_cleanup (local, this); +	syncbarrier_destroy (&local->barrier);          afr_local_transaction_cleanup (local, this); @@ -890,40 +916,26 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this)          if (local->dict)                  dict_unref (local->dict); +	afr_replies_wipe (local, priv);  	GF_FREE(local->replies);          GF_FREE (local->child_up); -        GF_FREE (local->child_errno); +        GF_FREE (local->read_attempted); -        GF_FREE (local->fresh_children); +        GF_FREE (local->readable); -        { /* lookup */ -                if (local->cont.lookup.xattrs) { -                        afr_reset_xattr (local->cont.lookup.xattrs, -                                         priv->child_count); -                        GF_FREE (local->cont.lookup.xattrs); -                        local->cont.lookup.xattrs = NULL; -                } +	if (local->inode) +		inode_unref (local->inode); -                if (local->cont.lookup.xattr) { -                        dict_unref (local->cont.lookup.xattr); -                } - -                if (local->cont.lookup.inode) { -                        inode_unref 
(local->cont.lookup.inode); -                } +	if (local->parent) +		inode_unref (local->parent); -                GF_FREE (local->cont.lookup.postparents); +	if (local->parent2) +		inode_unref (local->parent2); -                GF_FREE (local->cont.lookup.bufs); - -                GF_FREE (local->cont.lookup.success_children); - -                GF_FREE (local->cont.lookup.sources); -                afr_matrix_cleanup (local->cont.lookup.pending_matrix, -                                    priv->child_count); -        } +	if (local->refreshinode) +		inode_unref (local->refreshinode);          { /* getxattr */                  GF_FREE (local->cont.getxattr.name); @@ -1018,67 +1030,29 @@ afr_frame_return (call_frame_t *frame)          return call_count;  } -int -afr_set_elem_count_get (unsigned char *elems, int child_count) -{ -        int i   = 0; -        int ret = 0; - -        for (i = 0; i < child_count; i++) -                if (elems[i]) -                        ret++; -        return ret; -} - -/** - * up_children_count - return the number of children that are up - */ - -unsigned int -afr_up_children_count (unsigned char *child_up, unsigned int child_count) -{ -        return afr_set_elem_count_get (child_up, child_count); -} - -unsigned int -afr_locked_children_count (unsigned char *children, unsigned int child_count) -{ -        return afr_set_elem_count_get (children, child_count); -} - -unsigned int -afr_pre_op_done_children_count (unsigned char *pre_op, -                                unsigned int child_count) -{ -        return afr_set_elem_count_get (pre_op, child_count); -}  gf_boolean_t -afr_is_fresh_lookup (loc_t *loc, xlator_t *this) -{ -        uint64_t          ctx = 0; -        int32_t           ret = 0; - -        GF_ASSERT (loc); -        GF_ASSERT (this); -        GF_ASSERT (loc->inode); +afr_is_entry_possibly_under_txn (afr_local_t *local, xlator_t *this) +{ +	int i = 0; +	int tmp = 0; +	afr_private_t *priv = NULL; + +	priv = 
this->private; + +	for (i = 0; i < priv->child_count; i++) { +		if (!local->replies[i].xdata) +			continue; +		if (dict_get_int32 (local->replies[i].xdata, +				    GLUSTERFS_PARENT_ENTRYLK, +				    &tmp) == 0) +			if (tmp) +				return _gf_true; +	} -        ret = inode_ctx_get (loc->inode, this, &ctx); -        if (0 == ret) -                return _gf_false; -        return _gf_true; +	return _gf_false;  } -void -afr_update_loc_gfids (loc_t *loc, struct iatt *buf, struct iatt *postparent) -{ -        GF_ASSERT (loc); -        GF_ASSERT (buf); - -        uuid_copy (loc->gfid, buf->ia_gfid); -        if (postparent) -                uuid_copy (loc->pargfid, postparent->ia_gfid); -}  /*   * Quota size xattrs are not maintained by afr. There is a @@ -1090,1467 +1064,845 @@ afr_update_loc_gfids (loc_t *loc, struct iatt *buf, struct iatt *postparent)   * */  static void -afr_handle_quota_size (afr_local_t *local, xlator_t *this, -                       dict_t *rsp_dict) +afr_handle_quota_size (call_frame_t *frame, xlator_t *this)  { -        int32_t       *sources       = NULL; -        dict_t        *xattr         = NULL; -        data_t        *max_data      = NULL; -        int64_t       max_quota_size = -1; -        data_t        *data          = NULL; -        int64_t       *size          = NULL; -        int64_t       quota_size     = -1; -        afr_private_t *priv          = NULL; -        int           i              = 0; -        int           ret            = -1; -        gf_boolean_t  source_present = _gf_false; - -        priv    = this->private; -        sources = local->cont.lookup.sources; - -        if (rsp_dict == NULL) { -                gf_log_callingfn (this->name, GF_LOG_ERROR, "%s: Invalid " -                                  "response dictionary", local->loc.path); -                return; -        } - -        for (i = 0; i < priv->child_count; i++) { -                if (sources[i]) { -                        source_present = _gf_true; -     
                   break; -                } -        } - -        for (i = 0; i < priv->child_count; i++) { -                /* -                 * If there is at least one source lets check -                 * for maximum quota sizes among sources, otherwise take the -                 * maximum of the ones present to be on the safer side. -                 */ -                if (source_present && !sources[i]) -                        continue; - -                xattr = local->cont.lookup.xattrs[i]; -                if (!xattr) -                        continue; - -                data = dict_get (xattr, QUOTA_SIZE_KEY); -                if (!data) -                        continue; - -                size = (int64_t*)data->data; -                quota_size = ntoh64(*size); -                gf_log (this->name, GF_LOG_DEBUG, "%s: %d, size: %"PRId64, -                        local->loc.path, i, quota_size); -                if (quota_size > max_quota_size) { -                        if (max_data) -                                data_unref (max_data); - -                        max_quota_size = quota_size; -                        max_data = data_ref (data); -                } -        } +	unsigned char *readable = NULL; +	afr_local_t *local = NULL; +	afr_private_t *priv = NULL; +	struct afr_reply *replies = NULL; +	int i = 0; +	uint64_t size = 0; +	uint64_t max_size = 0; +	int readable_cnt = 0; -        if (max_data) { -                ret = dict_set (rsp_dict, QUOTA_SIZE_KEY, max_data); -                if (ret) { -                        gf_log (this->name, GF_LOG_ERROR, "%s: Failed to set " -                                "quota size", local->loc.path); -                } +	local = frame->local; +	priv = this->private; +	replies = local->replies; + +	readable = alloca0 (priv->child_count); + +	afr_inode_read_subvol_get (local->inode, this, readable, 0, 0); + +	readable_cnt = AFR_COUNT (readable, priv->child_count); + +	for (i = 0; i < priv->child_count; i++) 
{ +		if (!replies[i].valid || replies[i].op_ret == -1) +			continue; +		if (readable_cnt && !readable[i]) +			continue; +		if (!replies[i].xdata) +			continue; +		if (dict_get_uint64 (replies[i].xdata, QUOTA_SIZE_KEY, &size)) +			continue; +		if (size > max_size) +			max_size = size; +	} -                data_unref (max_data); -        } +	if (!max_size) +		return; + +	for (i = 0; i < priv->child_count; i++) { +		if (!replies[i].valid || replies[i].op_ret == -1) +			continue; +		if (readable_cnt && !readable[i]) +			continue; +		if (!replies[i].xdata) +			continue; +		if (dict_set_uint64 (replies[i].xdata, QUOTA_SIZE_KEY, max_size)) +			continue; +	}  } -int -afr_lookup_build_response_params (afr_local_t *local, xlator_t *this) -{ -        struct iatt     *buf = NULL; -        struct iatt     *postparent = NULL; -        dict_t          **xattr = NULL; -        int32_t         *success_children = NULL; -        int32_t         *sources = NULL; -        afr_private_t   *priv = NULL; -        int32_t         read_child = -1; -        int             ret = 0; -        int             i = 0; - -        GF_ASSERT (local); - -        buf = &local->cont.lookup.buf; -        postparent = &local->cont.lookup.postparent; -        xattr = &local->cont.lookup.xattr; -        priv = this->private; - -        read_child = afr_inode_get_read_ctx (this, local->cont.lookup.inode, -                                             local->fresh_children); -        if (read_child < 0) { -                ret = -1; -                goto out; -        } -        success_children = local->cont.lookup.success_children; -        sources = local->cont.lookup.sources; -        memset (sources, 0, sizeof (*sources) * priv->child_count); -        afr_children_intersection_get (local->fresh_children, success_children, -                                       sources, priv->child_count); -        if (!sources[read_child]) { -                read_child = -1; -                for (i = 0; i < 
priv->child_count; i++) { -                        if (sources[i]) { -                                read_child = i; -                                break; -                        } -                } -        } -        if (read_child < 0) { -                ret = -1; -                goto out; -        } - -        gf_log (this->name, GF_LOG_DEBUG, "Building lookup response from %d", -                read_child); -        if (!*xattr) -                *xattr = dict_ref (local->cont.lookup.xattrs[read_child]); - -        *buf = local->cont.lookup.bufs[read_child]; -        *postparent = local->cont.lookup.postparents[read_child]; - -        if (dict_get (local->xattr_req, QUOTA_SIZE_KEY)) -                afr_handle_quota_size (local, this, *xattr); - -        if (IA_INVAL == local->cont.lookup.inode->ia_type) { -                /* fix for RT #602 */ -                local->cont.lookup.inode->ia_type = buf->ia_type; -        } -out: -        return ret; -}  static void -afr_lookup_update_lk_counts (afr_local_t *local, xlator_t *this, -                            int child_index, dict_t *xattr) +afr_lookup_done (call_frame_t *frame, xlator_t *this)  { -        uint32_t inodelk_count = 0; -        uint32_t entrylk_count = 0; -        int      ret           = -1; -        uint32_t parent_entrylk = 0; - -        GF_ASSERT (local); -        GF_ASSERT (this); -        GF_ASSERT (xattr); -        GF_ASSERT (child_index >= 0); +        afr_private_t       *priv  = NULL; +        afr_local_t         *local = NULL; +	int                 i = -1; +	int                 op_errno = 0; +	int                 read_subvol = 0; +	unsigned char      *readable = NULL; +	int                 event = 0; +	struct afr_reply   *replies = NULL; +	uuid_t              read_gfid = {0, }; +	gf_boolean_t        locked_entry = _gf_false; +	gf_boolean_t        can_interpret = _gf_true; -        ret = dict_get_uint32 (xattr, GLUSTERFS_INODELK_COUNT, -                               &inodelk_count); 
-        if (ret == 0) -                local->inodelk_count += inodelk_count; +        priv  = this->private; +        local = frame->local; +	replies = local->replies; -        ret = dict_get_uint32 (xattr, GLUSTERFS_ENTRYLK_COUNT, -                               &entrylk_count); -        if (ret == 0) -                local->entrylk_count += entrylk_count; -        ret = dict_get_uint32 (xattr, GLUSTERFS_PARENT_ENTRYLK, -                               &parent_entrylk); -        if (!ret) -                local->cont.lookup.parent_entrylk += parent_entrylk; -} +	locked_entry = afr_is_entry_possibly_under_txn (local, this); -/* - * It's important to maintain a commutative property on do_*_self_heal and - * found*; once set, they must not be cleared by a subsequent iteration or - * call, so that they represent a logical OR of all iterations and calls - * regardless of child/key order.  That allows the caller to call us multiple - * times without having to use a separate variable as a "reduce" accumulator. 
- */ -static void -afr_lookup_set_self_heal_params_by_xattr (afr_local_t *local, xlator_t *this, -                                          dict_t *xattr) -{ -        afr_private_t *priv        = NULL; -        int            i           = 0; -        int            ret         = -1; -        void          *pending_raw = NULL; -        int32_t       *pending     = NULL; +	readable = alloca0 (priv->child_count); -        GF_ASSERT (local); -        GF_ASSERT (this); -        GF_ASSERT (xattr); +	afr_inode_read_subvol_get (local->loc.parent, this, readable, +				   NULL, &event); -        priv = this->private; - -        for (i = 0; i < priv->child_count; i++) { -                ret = dict_get_ptr (xattr, priv->pending_key[i], -                                    &pending_raw); -                if (ret != 0) { -                        continue; -                } -                pending = pending_raw; +	/* First, check if we have an ESTALE from somewhere, +	   If so, propagate that so that a revalidate can be +	   issued +	*/ +	op_errno = afr_final_errno (frame->local, this->private); +	local->op_errno = op_errno; +        if (op_errno == ESTALE) { +		local->op_errno = op_errno; +		local->op_ret = -1; +                goto unwind; +	} -                if (pending[AFR_METADATA_TRANSACTION]) { -                        gf_log(this->name, GF_LOG_DEBUG, -                               "metadata self-heal is pending for %s.", -                               local->loc.path); -                        local->self_heal.do_metadata_self_heal = _gf_true; -                } +	read_subvol = -1; +	for (i = 0; i < priv->child_count; i++) { +		if (!replies[i].valid) +			continue; + +		if (locked_entry && replies[i].op_ret == -1 && +		    replies[i].op_errno == ENOENT) { +			/* Second, check entry is still +			   "underway" in creation */ +			local->op_ret = -1; +			local->op_errno = ENOENT; +			read_subvol = i; +			goto unwind; +		} -                if 
(pending[AFR_ENTRY_TRANSACTION]) { -                        gf_log(this->name, GF_LOG_DEBUG, -                               "entry self-heal is pending for %s.", -                               local->loc.path); -                        local->self_heal.do_entry_self_heal = _gf_true; -                } +		if (replies[i].op_ret == -1) +			continue; -                if (pending[AFR_DATA_TRANSACTION]) { -                        gf_log(this->name, GF_LOG_DEBUG, -                               "data self-heal is pending for %s.", -                               local->loc.path); -                        local->self_heal.do_data_self_heal = _gf_true; -                } -        } -} +		if (read_subvol == -1 || !readable[read_subvol]) { +			read_subvol = i; +			uuid_copy (read_gfid, replies[i].poststat.ia_gfid); +			local->op_ret = 0; +		} +	} -void -afr_lookup_check_set_metadata_split_brain (afr_local_t *local, xlator_t *this) -{ -        int32_t                  *sources = NULL; -        afr_private_t            *priv = NULL; -        int32_t                  subvol_status = 0; -        int32_t                  *success_children   = NULL; -        dict_t                   **xattrs = NULL; -        struct iatt              *bufs = NULL; -        int32_t                  **pending_matrix = NULL; +	if (read_subvol == -1) +		goto unwind; +	/* We now have a read_subvol, which is readable[] (if there +	   were any). Next we look for GFID mismatches. We don't +	   consider a GFID mismatch as an error if read_subvol is +	   readable[] but the mismatching GFID subvol is not. 
+	*/ +	for (i = 0; i < priv->child_count; i++) { +		if (!replies[i].valid || replies[i].op_ret == -1) { +			if (priv->child_up[i]) +				can_interpret = _gf_false; +			continue; +		} -        priv = this->private; +		if (!uuid_compare (replies[i].poststat.ia_gfid, +				   read_gfid)) +			continue; -        sources = GF_CALLOC (priv->child_count, sizeof (*sources), -                             gf_afr_mt_int32_t); -        if (NULL == sources) -                goto out; -        success_children = local->cont.lookup.success_children; -        xattrs = local->cont.lookup.xattrs; -        bufs = local->cont.lookup.bufs; -        pending_matrix = local->cont.lookup.pending_matrix; -        afr_build_sources (this, xattrs, bufs, pending_matrix, -                           sources, success_children, AFR_METADATA_TRANSACTION, -                           &subvol_status, _gf_false); -        if (subvol_status & SPLIT_BRAIN) -                local->cont.lookup.possible_spb = _gf_true; -out: -        GF_FREE (sources); -} +		can_interpret = _gf_false; -static void -afr_detect_self_heal_by_iatt (afr_local_t *local, xlator_t *this, -                            struct iatt *buf, struct iatt *lookup_buf) -{ -        if (PERMISSION_DIFFERS (buf, lookup_buf)) { -                /* mismatching permissions */ -                gf_log (this->name, GF_LOG_DEBUG, -                        "permissions differ for %s ", local->loc.path); -                local->self_heal.do_metadata_self_heal = _gf_true; -        } +		if (locked_entry) +			continue; -        if (OWNERSHIP_DIFFERS (buf, lookup_buf)) { -                /* mismatching permissions */ -                local->self_heal.do_metadata_self_heal = _gf_true; -                gf_log (this->name, GF_LOG_DEBUG, -                        "ownership differs for %s ", local->loc.path); -        } +		/* Now GFIDs mismatch. 
It's OK as long as this subvol +		   is not readable[] but read_subvol is */ +		if (readable[read_subvol] && !readable[i]) +			continue; -        if (SIZE_DIFFERS (buf, lookup_buf) -            && IA_ISREG (buf->ia_type)) { -                gf_log (this->name, GF_LOG_DEBUG, -                        "size differs for %s ", local->loc.path); -                local->self_heal.do_data_self_heal = _gf_true; -        } +		/* LOG ERROR */ +		local->op_ret = -1; +		local->op_errno = EIO; +		goto unwind; +	} -        if (uuid_compare (buf->ia_gfid, lookup_buf->ia_gfid)) { -                /* mismatching gfid */ -                gf_log (this->name, GF_LOG_DEBUG, -                        "%s: gfid different on subvolume", local->loc.path); -        } -} +	/* Forth, for the finalized GFID, pick the best subvolume +	   to return stats from. +	*/ +	if (can_interpret) { +		/* It is safe to call afr_replies_interpret() because we have +		   a response from all the UP subvolumes and all of them resolved +		   to the same GFID +		*/ +		if (afr_replies_interpret (frame, this, local->inode)) { +			read_subvol = afr_data_subvol_get (local->inode, this, +							   0, 0); +			afr_inode_read_subvol_reset (local->inode, this); +			goto cant_interpret; +		} else { +			read_subvol = afr_data_subvol_get (local->inode, this, +							   0, 0); +		} +	} else { +	cant_interpret: +		if (read_subvol == -1) +			dict_del (replies[0].xdata, GF_CONTENT_KEY); +		else +			dict_del (replies[read_subvol].xdata, GF_CONTENT_KEY); +	} -static void -afr_detect_self_heal_by_split_brain_status (afr_local_t *local, xlator_t *this) -{ -        gf_boolean_t split_brain = _gf_false; -        afr_self_heal_t *sh = NULL; +	afr_handle_quota_size (frame, this); -        sh = &local->self_heal; +unwind: +	if (read_subvol == -1) +		read_subvol = 0; -        split_brain = afr_is_split_brain (this, local->cont.lookup.inode); -        split_brain = split_brain || local->cont.lookup.possible_spb; -        if 
((local->success_count > 0) && split_brain && -            IA_ISREG (local->cont.lookup.inode->ia_type)) { -                sh->force_confirm_spb = _gf_true; -                gf_log (this->name, GF_LOG_DEBUG, -                        "split brain detected during lookup of %s.", -                        local->loc.path); -        } +	AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, +			  local->inode, &local->replies[read_subvol].poststat, +			  local->replies[read_subvol].xdata, +			  &local->replies[read_subvol].postparent);  } -static void -afr_detect_self_heal_by_lookup_status (afr_local_t *local, xlator_t *this) -{ -        GF_ASSERT (local); -        GF_ASSERT (this); - -        if ((local->success_count > 0) && (local->enoent_count > 0)) { -                local->self_heal.do_metadata_self_heal = _gf_true; -                local->self_heal.do_data_self_heal     = _gf_true; -                local->self_heal.do_entry_self_heal    = _gf_true; -                local->self_heal.do_gfid_self_heal    = _gf_true; -                local->self_heal.do_missing_entry_self_heal    = _gf_true; -                gf_log(this->name, GF_LOG_DEBUG, -                       "entries are missing in lookup of %s.", -                       local->loc.path); -        } - -        return; -} +/* + * During a lookup, some errors are more "important" than + * others in that they must be given higher priority while + * returning to the user. 
+ * + * The hierarchy is ESTALE > ENOENT > others + */ -gf_boolean_t -afr_can_self_heal_proceed (afr_self_heal_t *sh, afr_private_t *priv) +int +afr_higher_errno (int32_t old_errno, int32_t new_errno)  { -        GF_ASSERT (sh); -        GF_ASSERT (priv); +	if (old_errno == ENODATA || new_errno == ENODATA) +		return ENODATA; +	if (old_errno == ESTALE || new_errno == ESTALE) +		return ESTALE; +	if (old_errno == ENOENT || new_errno == ENOENT) +		return ENOENT; -        if (sh->force_confirm_spb) -                return _gf_true; -        return (sh->do_gfid_self_heal -                || sh->do_missing_entry_self_heal -                || (afr_data_self_heal_enabled (priv->data_self_heal) && -                    sh->do_data_self_heal) -                || (priv->metadata_self_heal && sh->do_metadata_self_heal) -                || (priv->entry_self_heal && sh->do_entry_self_heal)); +	return new_errno;  } -afr_transaction_type -afr_transaction_type_get (ia_type_t ia_type) -{ -        afr_transaction_type    type = AFR_METADATA_TRANSACTION; -        GF_ASSERT (ia_type != IA_INVAL); +int +afr_final_errno (afr_local_t *local, afr_private_t *priv) +{ +	int i = 0; +	int op_errno = 0; +	int tmp_errno = 0; + +	for (i = 0; i < priv->child_count; i++) { +		if (!local->replies[i].valid) +			continue; +		if (local->replies[i].op_ret == 0) +			continue; +		tmp_errno = local->replies[i].op_errno; +		op_errno = afr_higher_errno (op_errno, tmp_errno); +	} -        if (IA_ISDIR (ia_type)) { -                type = AFR_ENTRY_TRANSACTION; -        } else if (IA_ISREG (ia_type)) { -                type = AFR_DATA_TRANSACTION; -        } -        return type; +	return op_errno;  } -int -afr_lookup_select_read_child (afr_local_t *local, xlator_t *this, -                              int32_t *read_child) +static int +get_pathinfo_host (char *pathinfo, char *hostname, size_t size)  { -        ia_type_t               ia_type        = IA_INVAL; -        int32_t                 source         = 
-1; -        int                     ret            = -1; -        dict_t                  **xattrs       = NULL; -        int32_t                 *success_children = NULL; -        afr_transaction_type    type           = AFR_METADATA_TRANSACTION; -        uuid_t                  *gfid          = NULL; - -        GF_ASSERT (local); -        GF_ASSERT (this); -        GF_ASSERT (local->success_count > 0); +        char    *start = NULL; +        char    *end = NULL; +        int     ret  = -1; +        int     i    = 0; -        success_children = local->cont.lookup.success_children; -        /*We can take the success_children[0] only because we already -         *handle the conflicting children other wise, we could select the -         *read_child based on wrong file type -         */ -        ia_type = local->cont.lookup.bufs[success_children[0]].ia_type; -        type = afr_transaction_type_get (ia_type); -        xattrs = local->cont.lookup.xattrs; -        gfid = &local->cont.lookup.buf.ia_gfid; -        source = afr_lookup_select_read_child_by_txn_type (this, local, xattrs, -                                                           type, *gfid); -        if (source < 0) { -                gf_log (this->name, GF_LOG_DEBUG, "failed to select source " -                        "for %s", local->loc.path); +        if (!pathinfo) +                goto out; + +        start = strchr (pathinfo, ':'); +        if (!start) +                goto out; +        end = strrchr (pathinfo, ':'); +        if (start == end)                  goto out; -        } -        gf_log (this->name, GF_LOG_DEBUG, "Source selected as %d for %s", -                source, local->loc.path); -        *read_child = source; +        memset (hostname, 0, size); +        i = 0; +        while (++start != end) +                hostname[i++] = *start;          ret = 0;  out:          return ret;  } -static inline gf_boolean_t -afr_is_transaction_running (afr_local_t *local) -{ -        GF_ASSERT 
(local->fop == GF_FOP_LOOKUP); -        return ((local->inodelk_count > 0) || (local->entrylk_count > 0)); -} - -void -afr_launch_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode, -                      gf_boolean_t background, ia_type_t ia_type, char *reason, -                      void (*gfid_sh_success_cbk) (call_frame_t *sh_frame, -                                                   xlator_t *this), -                      int (*unwind) (call_frame_t *frame, xlator_t *this, -                                     int32_t op_ret, int32_t op_errno, -                                     int32_t sh_failed)) -{ -        afr_local_t             *local = NULL; -        char                    sh_type_str[256] = {0,}; -        char                    *bg = ""; - -        GF_ASSERT (frame); -        GF_ASSERT (this); -        GF_ASSERT (inode); -        GF_ASSERT (ia_type != IA_INVAL); - -        local = frame->local; -        local->self_heal.background = background; -        local->self_heal.type       = ia_type; -        local->self_heal.unwind     = unwind; -        local->self_heal.gfid_sh_success_cbk     = gfid_sh_success_cbk; - -        afr_self_heal_type_str_get (&local->self_heal, -                                    sh_type_str, -                                    sizeof (sh_type_str)); - -        if (background) -                bg = "background"; -        gf_log (this->name, GF_LOG_DEBUG, -                "%s %s self-heal triggered. 
path: %s, reason: %s", bg, -                sh_type_str, local->loc.path, reason); - -        afr_self_heal (frame, this, inode); -} - -unsigned int -afr_gfid_missing_count (const char *xlator_name, int32_t *success_children, -                        struct iatt *bufs, unsigned int child_count, -                        const char *path) +int +afr_local_pathinfo (char *pathinfo, gf_boolean_t *local)  { -        unsigned int    gfid_miss_count   = 0; -        int             i              = 0; -        struct iatt     *child1        = NULL; +        int             ret   = 0; +        char            pathinfohost[1024] = {0}; +        char            localhost[1024] = {0}; +        xlator_t        *this = THIS; -        for (i = 0; i < child_count; i++) { -                if (success_children[i] == -1) -                        break; -                child1 = &bufs[success_children[i]]; -                if (uuid_is_null (child1->ia_gfid)) { -                        gf_log (xlator_name, GF_LOG_DEBUG, "%s: gfid is null" -                                " on subvolume %d", path, success_children[i]); -                        gfid_miss_count++; -                } +        *local = _gf_false; +        ret = get_pathinfo_host (pathinfo, pathinfohost, sizeof (pathinfohost)); +        if (ret) { +                gf_log (this->name, GF_LOG_ERROR, "Invalid pathinfo: %s", +                        pathinfo); +                goto out;          } -        return gfid_miss_count; -} - -static int -afr_lookup_gfid_missing_count (afr_local_t *local, xlator_t *this) -{ -        int32_t         *success_children = NULL; -        afr_private_t   *priv          = NULL; -        struct iatt     *bufs          = NULL; -        int             miss_count     = 0; - -        priv = this->private; -        bufs = local->cont.lookup.bufs; -        success_children = local->cont.lookup.success_children; - -        miss_count =  afr_gfid_missing_count (this->name, success_children, -           
                                   bufs, priv->child_count, -                                              local->loc.path); -        return miss_count; -} - -gf_boolean_t -afr_conflicting_iattrs (struct iatt *bufs, int32_t *success_children, -                        unsigned int child_count, const char *path, -                        const char *xlator_name) -{ -        gf_boolean_t    conflicting    = _gf_false; -        int             i              = 0; -        struct iatt     *child1        = NULL; -        struct iatt     *child2        = NULL; -        uuid_t          *gfid          = NULL; - -        for (i = 0; i < child_count; i++) { -                if (success_children[i] == -1) -                        break; -                child1 = &bufs[success_children[i]]; -                if ((!gfid) && (!uuid_is_null (child1->ia_gfid))) -                        gfid = &child1->ia_gfid; - -                if (i == 0) -                        continue; - -                child2 = &bufs[success_children[i-1]]; -                if (FILETYPE_DIFFERS (child1, child2)) { -                        gf_log (xlator_name, GF_LOG_DEBUG, "%s: filetype " -                                "differs on subvolumes (%d, %d)", path, -                                success_children[i-1], success_children[i]); -                        conflicting = _gf_true; -                        goto out; -                } -                if (!gfid || uuid_is_null (child1->ia_gfid)) -                        continue; -                if (uuid_compare (*gfid, child1->ia_gfid)) { -                       gf_log (xlator_name, GF_LOG_DEBUG, "%s: gfid differs" -                               " on subvolume %d", path, success_children[i]); -                       conflicting = _gf_true; -                       goto out; -                } +        ret = gethostname (localhost, sizeof (localhost)); +        if (ret) { +                gf_log (this->name, GF_LOG_ERROR, "gethostname() failed, " +        
                "reason: %s", strerror (errno)); +                goto out;          } -out: -        return conflicting; -} -/* afr_update_gfid_from_iatts: This function should be called only if the - * iatts are not conflicting. - */ -void -afr_update_gfid_from_iatts (uuid_t uuid, struct iatt *bufs, -                            int32_t *success_children, unsigned int child_count) -{ -        uuid_t          *gfid = NULL; -        int             i = 0; -        int             child = 0; - -        for (i = 0; i < child_count; i++) { -                child = success_children[i]; -                if (child == -1) -                        break; -                if ((!gfid) && (!uuid_is_null (bufs[child].ia_gfid))) { -                        gfid = &bufs[child].ia_gfid; -                } else if (gfid && (!uuid_is_null (bufs[child].ia_gfid))) { -                        if (uuid_compare (*gfid, bufs[child].ia_gfid)) { -                                GF_ASSERT (0); -                                goto out; -                        } -                } -        } -        if (gfid && (!uuid_is_null (*gfid))) -                uuid_copy (uuid, *gfid); +        if (!strcmp (localhost, pathinfohost)) +                *local = _gf_true;  out: -        return; -} - -static gf_boolean_t -afr_lookup_conflicting_entries (afr_local_t *local, xlator_t *this) -{ -        afr_private_t           *priv = NULL; -        gf_boolean_t            conflict = _gf_false; - -        priv = this->private; -        conflict =  afr_conflicting_iattrs (local->cont.lookup.bufs, -                                            local->cont.lookup.success_children, -                                            priv->child_count, local->loc.path, -                                            this->name); -        return conflict; -} - -gf_boolean_t -afr_open_only_data_self_heal (char *data_self_heal) -{ -        return !strcmp (data_self_heal, "open"); +        return ret;  } -gf_boolean_t 
-afr_data_self_heal_enabled (char *data_self_heal) +static int32_t +afr_local_discovery_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +			 int32_t op_ret, int32_t op_errno, dict_t *dict, +			 dict_t *xdata)  { -        gf_boolean_t    enabled = _gf_false; +        int              ret            = 0; +        char            *pathinfo       = NULL; +        gf_boolean_t     is_local        = _gf_false; +        afr_private_t   *priv           = NULL; +        int32_t          child_index    = -1; -        if (gf_string2boolean (data_self_heal, &enabled) == -1) { -                enabled = !strcmp (data_self_heal, "open"); -                GF_ASSERT (enabled); +        if (op_ret != 0) { +                goto out;          } -        return enabled; -} - -static void -afr_lookup_set_self_heal_params (afr_local_t *local, xlator_t *this) -{ -        int                     i = 0; -        struct iatt             *bufs = NULL; -        dict_t                  **xattr = NULL; -        afr_private_t           *priv = NULL; -        int32_t                 child1 = -1; -        int32_t                 child2 = -1; -        afr_self_heal_t         *sh = NULL; - -        priv  = this->private; -        sh = &local->self_heal; - -        afr_detect_self_heal_by_lookup_status (local, this); - -        if (afr_lookup_gfid_missing_count (local, this)) -                local->self_heal.do_gfid_self_heal    = _gf_true; - -        if (_gf_true == afr_lookup_conflicting_entries (local, this)) -                local->self_heal.do_missing_entry_self_heal    = _gf_true; -        else -                afr_update_gfid_from_iatts (local->self_heal.sh_gfid_req, -                                            local->cont.lookup.bufs, -                                            local->cont.lookup.success_children, -                                            priv->child_count); - -        bufs = local->cont.lookup.bufs; -        for (i = 1; i < local->success_count; i++) { -         
       child1 = local->cont.lookup.success_children[i-1]; -                child2 = local->cont.lookup.success_children[i]; -                afr_detect_self_heal_by_iatt (local, this, -                                              &bufs[child1], &bufs[child2]); -        } +	priv = this->private; +	child_index = (int32_t)(long)cookie; -        xattr = local->cont.lookup.xattrs; -        for (i = 0; i < local->success_count; i++) { -                child1 = local->cont.lookup.success_children[i]; -                afr_lookup_set_self_heal_params_by_xattr (local, this, -                                                          xattr[child1]); +        ret = dict_get_str (dict, GF_XATTR_PATHINFO_KEY, &pathinfo); +        if (ret != 0) { +                goto out;          } -        if (afr_open_only_data_self_heal (priv->data_self_heal)) -                sh->do_data_self_heal = _gf_false; -        if (sh->do_metadata_self_heal) -                afr_lookup_check_set_metadata_split_brain (local, this); -        afr_detect_self_heal_by_split_brain_status (local, this); -} - -int -afr_self_heal_lookup_unwind (call_frame_t *frame, xlator_t *this, -                             int32_t op_ret, int32_t op_errno, -                             int32_t sh_failed) -{ -        afr_local_t *local = NULL; -        int         ret    = -1; -        dict_t      *xattr = NULL; - -        local = frame->local; - -        if (op_ret == -1) { -                local->op_ret = -1; -		local->op_errno = afr_most_important_error(local->op_errno, -							   op_errno, _gf_true); +        ret = afr_local_pathinfo (pathinfo, &is_local); +        if (ret) {                  goto out; -        } else { -                local->op_ret = 0;          } -        afr_lookup_done_success_action (frame, this, _gf_true); -        xattr = local->cont.lookup.xattr; -        if (xattr) { -                ret = dict_set_int32 (xattr, "sh-failed", sh_failed); -                if (ret) -                        
gf_log (this->name, GF_LOG_ERROR, "%s: Failed to set " -                                "sh-failed to %d", local->loc.path, sh_failed); - -                if (local->self_heal.actual_sh_started == _gf_true && -                    sh_failed == 0) { -                        ret = dict_set_int32 (xattr, "actual-sh-done", 1); -                        if (ret) -                                gf_log(this->name, GF_LOG_ERROR, "%s: Failed to" -                                       " set actual-sh-done to %d", -                                       local->loc.path, -                                       local->self_heal.actual_sh_started); -                } +        /* +         * Note that one local subvolume will override another here.  The only +         * way to avoid that would be to retain extra information about whether +         * the previous read_child is local, and it's just not worth it.  Even +         * the slowest local subvolume is far preferable to a remote one. +         */ +        if (is_local) { +                gf_log (this->name, GF_LOG_INFO, +                        "selecting local read_child %s", +                        priv->children[child_index]->name); +                priv->read_child = child_index;          }  out: -        AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, -                          local->cont.lookup.inode, &local->cont.lookup.buf, -                          local->cont.lookup.xattr, -                          &local->cont.lookup.postparent); - +        STACK_DESTROY(frame->root);          return 0;  } -//TODO: At the moment only lookup needs this, so not doing any checks, in the -// future we will have to do fop specific operations -void -afr_post_gfid_sh_success (call_frame_t *sh_frame, xlator_t *this) +static void +afr_attempt_local_discovery (xlator_t *this, int32_t child_index)  { -        afr_local_t             *local = NULL; -        afr_local_t             *sh_local = NULL; -        afr_private_t 
          *priv = NULL; -        afr_self_heal_t         *sh = NULL; -        int                     i = 0; -        struct iatt             *lookup_bufs = NULL; -        struct iatt             *lookup_parentbufs = NULL; - -        sh_local = sh_frame->local; -        sh       = &sh_local->self_heal; -        local = sh->orig_frame->local; -        lookup_bufs = local->cont.lookup.bufs; -        lookup_parentbufs = local->cont.lookup.postparents; -        priv = this->private; - -        memcpy (lookup_bufs, sh->buf, priv->child_count * sizeof (*sh->buf)); -        memcpy (lookup_parentbufs, sh->parentbufs, -                priv->child_count * sizeof (*sh->parentbufs)); - -        afr_reset_xattr (local->cont.lookup.xattrs, priv->child_count); -        if (local->cont.lookup.xattr) { -                dict_unref (local->cont.lookup.xattr); -                local->cont.lookup.xattr = NULL; -        } +        call_frame_t    *newframe = NULL; +        loc_t            tmploc = {0,}; +        afr_private_t   *priv = this->private; -        for (i = 0; i < priv->child_count; i++) { -                if (sh->xattr[i]) -                        local->cont.lookup.xattrs[i] = dict_ref (sh->xattr[i]); +        newframe = create_frame(this,this->ctx->pool); +        if (!newframe) { +                return;          } -        afr_reset_children (local->cont.lookup.success_children, -                            priv->child_count); -        afr_children_copy (local->cont.lookup.success_children, -                           sh->fresh_children, priv->child_count); +        tmploc.gfid[sizeof(tmploc.gfid)-1] = 1; +        STACK_WIND_COOKIE (newframe, afr_local_discovery_cbk, +                           (void *)(long)child_index, +                           priv->children[child_index], +                           priv->children[child_index]->fops->getxattr, +                           &tmploc, GF_XATTR_PATHINFO_KEY, NULL);  } -static void -afr_lookup_perform_self_heal 
(call_frame_t *frame, xlator_t *this, -                              gf_boolean_t *sh_launched) -{ -        unsigned int         up_count = 0; -        afr_private_t       *priv    = NULL; -        afr_local_t         *local   = NULL; -        char                *reason  = NULL; - -        GF_ASSERT (sh_launched); -        *sh_launched = _gf_false; -        priv         = this->private; -        local        = frame->local; - -        up_count  = afr_up_children_count (local->child_up, priv->child_count); -        if (up_count == 1) { -                gf_log (this->name, GF_LOG_DEBUG, -                        "Only 1 child up - do not attempt to detect self heal"); -                goto out; -        } - -        afr_lookup_set_self_heal_params (local, this); -        if (afr_can_self_heal_proceed (&local->self_heal, priv)) { -                if  (afr_is_transaction_running (local) && -                     /*Forcefully call afr_launch_self_heal (which will go on to -                       fail) for SB files.This prevents stale data being served -                       due to race in  afr_is_transaction_running() when -                       multiple clients access the same SB file*/ -                     !local->cont.lookup.possible_spb && -                     (!local->attempt_self_heal)) -                        goto out; -                reason = "lookup detected pending operations"; -                afr_launch_self_heal (frame, this, local->cont.lookup.inode, -                                      !local->foreground_self_heal, -                                      local->cont.lookup.buf.ia_type, -                                      reason, afr_post_gfid_sh_success, -                                      afr_self_heal_lookup_unwind); -                *sh_launched = _gf_true; -        } -out: -        return; -} - -void -afr_get_fresh_children (int32_t *success_children, int32_t *sources, -                        int32_t *fresh_children, unsigned int 
child_count) +int +afr_lookup_selfheal_wrap (void *opaque)  { -        unsigned int i = 0; -        unsigned int j = 0; - -        GF_ASSERT (success_children); -        GF_ASSERT (sources); -        GF_ASSERT (fresh_children); +	call_frame_t *frame = opaque; +	afr_local_t *local = NULL; +	xlator_t *this = NULL; +	inode_t *inode = NULL; -        afr_reset_children (fresh_children, child_count); -        for (i = 0; i < child_count; i++) { -                if (success_children[i] == -1) -                        break; -                if (afr_is_read_child (success_children, sources, child_count, -                                       success_children[i])) { -                        fresh_children[j] = success_children[i]; -                        j++; -                } -        } -} +	local = frame->local; +	this = frame->this; -static int -afr_lookup_set_read_ctx (afr_local_t *local, xlator_t *this, int32_t read_child) -{ -        afr_private_t           *priv = NULL; +	afr_selfheal_name (frame->this, local->loc.pargfid, local->loc.name); -        GF_ASSERT (read_child >= 0); +	afr_replies_wipe (local, this->private); -        priv = this->private; -        afr_get_fresh_children (local->cont.lookup.success_children, -                                local->cont.lookup.sources, -                                local->fresh_children, priv->child_count); -        afr_inode_set_read_ctx (this, local->cont.lookup.inode, read_child, -                                local->fresh_children); +	inode = afr_selfheal_unlocked_lookup_on (frame, local->loc.parent, +						 local->loc.name, local->replies, +						 local->child_up); +	if (inode) +		inode_unref (inode); +	afr_lookup_done (frame, this); -        return 0; +	return 0;  } +  int -afr_lookup_done_success_action (call_frame_t *frame, xlator_t *this, -                                gf_boolean_t fail_conflict) +afr_lookup_entry_heal (call_frame_t *frame, xlator_t *this)  { -        int32_t             read_child = -1; 
-        int32_t             ret        = -1; -        afr_local_t         *local     = NULL; -        gf_boolean_t        fresh_lookup = _gf_false; +	afr_local_t *local = NULL; +	afr_private_t *priv = NULL; +	call_frame_t *heal = NULL; +	int i = 0, first = -1; +	gf_boolean_t need_heal = _gf_false; +	struct afr_reply *replies = NULL; +	int ret = 0; -        local   = frame->local; -        fresh_lookup = local->cont.lookup.fresh_lookup; +	local = frame->local; +	replies = local->replies; +	priv = this->private; -        if (local->loc.parent == NULL) -                fail_conflict = _gf_true; +	for (i = 0; i < priv->child_count; i++) { +		if (!replies[i].valid) +			continue; -        if (afr_lookup_conflicting_entries (local, this)) { -                if (fail_conflict == _gf_false) -                        ret = 0; -                goto out; -        } +		if (first == -1) { +			first = i; +			continue; +		} -        ret = afr_lookup_select_read_child (local, this, &read_child); -        if (!afr_is_transaction_running (local) || fresh_lookup) { -                if (read_child < 0) -                        goto out; +		if (replies[i].op_ret != replies[first].op_ret) { +			need_heal = _gf_true; +			break; +		} -                ret = afr_lookup_set_read_ctx (local, this, read_child); -                if (ret) -                        goto out; -        } +		if (uuid_compare (replies[i].poststat.ia_gfid, +				  replies[first].poststat.ia_gfid)) { +			need_heal = _gf_true; +			break; +		} +	} -        ret = afr_lookup_build_response_params (local, this); -        if (ret) -                goto out; -        afr_update_loc_gfids (&local->loc, -                              &local->cont.lookup.buf, -                              &local->cont.lookup.postparent); +	if (need_heal) { +		heal = copy_frame (frame); +		if (heal) +			heal->root->pid = -1; +		ret = synctask_new (this->ctx->env, afr_lookup_selfheal_wrap, +				    afr_refresh_selfheal_done, heal, frame); +		if 
(ret) +			goto lookup_done; +	} else { +	lookup_done: +		afr_lookup_done (frame, this); +	} -        ret = 0; -out: -        if (ret) { -                local->op_ret = -1; -                local->op_errno = EIO; -        } -        return ret; +	return ret;  } +  int -afr_lookup_get_latest_subvol (afr_local_t *local, xlator_t *this) +afr_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +		int op_ret, int op_errno, inode_t *inode, struct iatt *buf, +		dict_t *xdata, struct iatt *postparent)  { -        afr_private_t *priv = NULL; -        int32_t       *success_children = NULL; -        struct iatt   *bufs = NULL; -        int           i = 0; -        int           child = 0; -        int           lsubvol = -1; - -        priv = this->private; -        success_children = local->cont.lookup.success_children; -        bufs = local->cont.lookup.bufs; -        for (i = 0; i < priv->child_count; i++) { -                child = success_children[i]; -                if (child == -1) -                        break; -                if (uuid_is_null (bufs[child].ia_gfid)) -                        continue; -                if (lsubvol < 0) { -                        lsubvol = child; -                } else if (bufs[lsubvol].ia_ctime < bufs[child].ia_ctime) { -                        lsubvol = child; -                } else if ((bufs[lsubvol].ia_ctime == bufs[child].ia_ctime) && -                  (bufs[lsubvol].ia_ctime_nsec < bufs[child].ia_ctime_nsec)) { -                        lsubvol = child; -                } -        } -        return lsubvol; -} +        afr_local_t *   local = NULL; +        int             call_count      = -1; +        int             child_index     = -1; -void -afr_lookup_mark_other_entries_stale (afr_local_t *local, xlator_t *this, -                                     int subvol) -{ -        afr_private_t *priv = NULL; -        int32_t       *success_children = NULL; -        struct iatt   *bufs = NULL; -        int           
i = 0; -        int           child = 0; +	child_index = (long) cookie; -        priv = this->private; -        success_children = local->cont.lookup.success_children; -        bufs = local->cont.lookup.bufs; -        memcpy (local->fresh_children, success_children, -                sizeof (*success_children) * priv->child_count); -        for (i = 0; i < priv->child_count; i++) { -                child = local->fresh_children[i]; -                if (child == -1) -                        break; -                if (child == subvol) -                        continue; -                if (uuid_is_null (bufs[child].ia_gfid) && -                    (bufs[child].ia_type == bufs[subvol].ia_type)) -                        continue; -                afr_children_rm_child (success_children, child, -                                       priv->child_count); -                local->success_count--; -        } -        afr_reset_children (local->fresh_children, priv->child_count); -} +	local = frame->local; -void -afr_succeed_lookup_on_latest_iatt (afr_local_t *local, xlator_t *this) -{ -        int    lsubvol = 0; +	local->replies[child_index].valid = 1; +	local->replies[child_index].op_ret = op_ret; +	local->replies[child_index].op_errno = op_errno; +	if (op_ret != -1) { +		local->replies[child_index].poststat = *buf; +		local->replies[child_index].postparent = *postparent; +		if (xdata) +			local->replies[child_index].xdata = dict_ref (xdata); +	} -        if (!afr_lookup_conflicting_entries (local, this)) -                goto out; +        call_count = afr_frame_return (frame); +        if (call_count == 0) { +		afr_lookup_entry_heal (frame, this); +        } -        lsubvol = afr_lookup_get_latest_subvol (local, this); -        if (lsubvol < 0) -                goto out; -        afr_lookup_mark_other_entries_stale (local, this, lsubvol); -out: -        return; +	return 0;  } -gf_boolean_t -afr_is_entry_possibly_under_creation (afr_local_t *local, xlator_t *this) -{ -  
      /* -         * We need to perform this test in lookup done and treat ongoing -         * create/DELETE as ENOENT. -         * Reason: -        Multiple clients A, B and C are attempting 'mkdir -p /mnt/a/b/c' - -        1 Client A is in the middle of mkdir(/a). It has acquired lock. -          It has performed mkdir(/a) on one subvol, and second one is still -          in progress -        2 Client B performs a lookup, sees directory /a on one, -          ENOENT on the other, succeeds lookup. -        3 Client B performs lookup on /a/b on both subvols, both return ENOENT -          (one subvol because /a/b does not exist, another because /a -          itself does not exist) -        4 Client B proceeds to mkdir /a/b. It obtains entrylk on inode=/a with -          basename=b on one subvol, but fails on other subvol as /a is yet to -          be created by Client A. -        5 Client A finishes mkdir of /a on other subvol -        6 Client C also attempts to create /a/b, lookup returns ENOENT on -          both subvols. -        7 Client C tries to obtain entrylk on inode=/a with basename=b, -          obtains on one subvol (where B had failed), and waits for B to unlock -          on other subvol. -        8 Client B finishes mkdir() on one subvol with GFID-1 and completes -          transaction and unlocks -        9 Client C gets the lock on the second subvol. At this stage second -          subvol already has /a/b created from Client B, but Client C does not -          check that in the middle of mkdir transaction -        10 Client C attempts mkdir /a/b on both subvols. It succeeds on -           ONLY ONE (where Client B could not get lock because of -           missing parent /a dir) with GFID-2, and gets EEXIST from ONE subvol. -        This way we have /a/b in GFID mismatch. 
One subvol got GFID-1 because -        Client B performed transaction on only one subvol (because entrylk() -        could not be obtained on second subvol because of missing parent dir -- -        caused by premature/speculative succeeding of lookup() on /a when locks -        are detected). Other subvol gets GFID-2 from Client C because while -        it was waiting for entrylk() on both subvols, Client B was in the -        middle of creating mkdir() on only one subvol, and Client C does not -        "expect" this when it is between lock() and pre-op()/op() phase of the -        transaction. -         */ -	if (local->cont.lookup.parent_entrylk && local->enoent_count) -		return _gf_true; - -	return _gf_false; -}  static void -afr_lookup_done (call_frame_t *frame, xlator_t *this) +afr_discover_done (call_frame_t *frame, xlator_t *this)  { -        int                 unwind = 1;          afr_private_t       *priv  = NULL;          afr_local_t         *local = NULL; -        int                 ret = -1; -        gf_boolean_t        sh_launched = _gf_false; -        gf_boolean_t        fail_conflict = _gf_false; -        int                 gfid_miss_count = 0; -        int                 enotconn_count = 0; -        int                 up_children_count = 0; +	int                 i = -1; +	int                 op_errno = 0; +	int                 read_subvol = 0;          priv  = this->private;          local = frame->local; -	if (afr_is_entry_possibly_under_creation (local, this)) { -		local->op_ret = -1; -		local->op_errno = ENOENT; -		goto unwind; +	for (i = 0; i < priv->child_count; i++) { +		if (!local->replies[i].valid) +			continue; +		if (local->replies[i].op_ret == 0) +			local->op_ret = 0;  	} -        if (local->op_ret < 0) -                goto unwind; +	op_errno = afr_final_errno (frame->local, this->private); -        if (local->cont.lookup.parent_entrylk && local->success_count > 1) -                afr_succeed_lookup_on_latest_iatt (local, this); - 
-        gfid_miss_count = afr_lookup_gfid_missing_count (local, this); -        up_children_count = afr_up_children_count (local->child_up, -                                                   priv->child_count); -        enotconn_count = priv->child_count - up_children_count; -        if ((gfid_miss_count == local->success_count) && -            (enotconn_count > 0)) { -                local->op_ret = -1; -                local->op_errno = EIO; -                gf_log (this->name, GF_LOG_ERROR, "Failing lookup for %s, " -                        "LOOKUP on a file without gfid is not allowed when " -                        "some of the children are down", local->loc.path); -                goto unwind; -        } - -        if ((gfid_miss_count == local->success_count) && -            uuid_is_null (local->cont.lookup.gfid_req)) { -                local->op_ret = -1; -                local->op_errno = ENODATA; -                gf_log (this->name, GF_LOG_ERROR, "%s: No gfid present", -                        local->loc.path); +        if (local->op_ret < 0) { +		local->op_errno = op_errno; +		local->op_ret = -1;                  goto unwind; -        } +	} -        if (gfid_miss_count && uuid_is_null (local->cont.lookup.gfid_req)) -                fail_conflict = _gf_true; -        ret = afr_lookup_done_success_action (frame, this, fail_conflict); -        if (ret) -                goto unwind; -        uuid_copy (local->self_heal.sh_gfid_req, local->cont.lookup.gfid_req); +	afr_replies_interpret (frame, this, local->inode); -        afr_lookup_perform_self_heal (frame, this, &sh_launched); -        if (sh_launched) { -                unwind = 0; -                goto unwind; -        } +	read_subvol = afr_data_subvol_get (local->inode, this, 0, 0); +	if (read_subvol == -1) { +		gf_log (this->name, GF_LOG_WARNING, "no read subvols for %s", +			local->loc.path); - unwind: -         if (unwind) { -                 AFR_STACK_UNWIND (lookup, frame, local->op_ret, -        
                           local->op_errno, local->cont.lookup.inode, -                                   &local->cont.lookup.buf, -                                   local->cont.lookup.xattr, -                                   &local->cont.lookup.postparent); -        } -} +		for (i = 0; i < priv->child_count; i++) { +			if (!local->replies[i].valid || +			    local->replies[i].op_ret == -1) +				continue; +			read_subvol = i; +			break; +		} +	} -/* - * During a lookup, some errors are more "important" than - * others in that they must be given higher priority while - * returning to the user. - * - * The hierarchy is ESTALE > EIO > ENOENT > others - */ -int32_t -afr_most_important_error(int32_t old_errno, int32_t new_errno, -			 gf_boolean_t eio) -{ -	if (old_errno == ESTALE || new_errno == ESTALE) -		return ESTALE; -	if (eio && (old_errno == EIO || new_errno == EIO)) -		return EIO; -	if (old_errno == ENOENT || new_errno == ENOENT) -		return ENOENT; +unwind: +	if (read_subvol == -1) +		read_subvol = 0; -	return new_errno; +	AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, +			  local->inode, &local->replies[read_subvol].poststat, +			  local->replies[read_subvol].xdata, +			  &local->replies[read_subvol].postparent);  } -int32_t -afr_resultant_errno_get (int32_t *children, -                         int *child_errno, unsigned int child_count) -{ -        int     i = 0; -        int32_t op_errno = 0; -        int     child = 0; - -        for (i = 0; i < child_count; i++) { -                if (children) { -                        child = children[i]; -                        if (child == -1) -                                break; -                } else { -                        child = i; -                } -		op_errno = afr_most_important_error(op_errno, -						    child_errno[child], -						    _gf_false); -        } -        return op_errno; -} -static void -afr_lookup_handle_error (afr_local_t *local, int32_t op_ret,  int32_t op_errno) +int 
+afr_discover_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +		  int op_ret, int op_errno, inode_t *inode, struct iatt *buf, +		  dict_t *xdata, struct iatt *postparent)  { -        GF_ASSERT (local); -        if (op_errno == ENOENT) -                local->enoent_count++; +        afr_local_t *   local = NULL; +        int             call_count      = -1; +        int             child_index     = -1; -	local->op_errno = afr_most_important_error(local->op_errno, op_errno, -						   _gf_false); +	child_index = (long) cookie; -        if (local->op_errno == ESTALE) { -                local->op_ret = -1; -        } -} +	local = frame->local; -static void -afr_set_root_inode_on_first_lookup (afr_local_t *local, xlator_t *this, -                                    inode_t *inode) -{ -        afr_private_t           *priv = NULL; -        GF_ASSERT (inode); +	local->replies[child_index].valid = 1; +	local->replies[child_index].op_ret = op_ret; +	local->replies[child_index].op_errno = op_errno; +	if (op_ret != -1) { +		local->replies[child_index].poststat = *buf; +		local->replies[child_index].postparent = *postparent; +		if (xdata) +			local->replies[child_index].xdata = dict_ref (xdata); +	} -        if (!__is_root_gfid (inode->gfid)) -                goto out; -        if (!afr_is_fresh_lookup (&local->loc, this)) -                goto out; -        priv = this->private; -        if ((priv->first_lookup)) { -                gf_log (this->name, GF_LOG_INFO, "added root inode"); -                priv->root_inode = inode_ref (inode); -                priv->first_lookup = 0; +        if (local->do_discovery && (op_ret == 0)) +                afr_attempt_local_discovery (this, child_index); + +        call_count = afr_frame_return (frame); +        if (call_count == 0) { +               afr_discover_done (frame, this);          } -out: -        return; -} -static void -afr_lookup_cache_args (afr_local_t *local, int child_index, dict_t *xattr, -                   
    struct iatt *buf, struct iatt *postparent) -{ -        GF_ASSERT (child_index >= 0); -        local->cont.lookup.xattrs[child_index] = dict_ref (xattr); -        local->cont.lookup.postparents[child_index] = *postparent; -        local->cont.lookup.bufs[child_index] = *buf; +	return 0;  } -static void -afr_lookup_handle_first_success (afr_local_t *local, xlator_t *this, -                                 inode_t *inode, struct iatt *buf) -{ -        local->cont.lookup.inode      = inode_ref (inode); -        local->cont.lookup.buf        = *buf; -        afr_set_root_inode_on_first_lookup (local, this, inode); -} -static int32_t -afr_discovery_cbk (call_frame_t *frame, void *cookie, xlator_t *this, -                   int32_t op_ret, int32_t op_errno, dict_t *dict, -                   dict_t *xdata) +int +afr_discover_do (call_frame_t *frame, xlator_t *this, int err)  { -        int              ret            = 0; -        char            *pathinfo       = NULL; -        gf_boolean_t     is_local        = _gf_false; -        afr_private_t   *priv           = NULL; -        int32_t          child_index    = -1; +	int ret = 0; +	int i = 0; +	afr_local_t *local = NULL; +	afr_private_t *priv = NULL; +	int call_count = 0; -        if (op_ret != 0) { -                goto out; -        } +	local = frame->local; +	priv = this->private; -        ret = dict_get_str (dict, GF_XATTR_PATHINFO_KEY, &pathinfo); -        if (ret != 0) { -                goto out; -        } +	if (err) { +		local->op_errno = -err; +		ret = -1; +		goto out; +	} -        ret = afr_local_pathinfo (pathinfo, &is_local); +	call_count = local->call_count = AFR_COUNT (local->child_up, +						    priv->child_count); + +        ret = afr_lookup_xattr_req_prepare (local, this, local->xattr_req, +					    &local->loc);          if (ret) { +                local->op_errno = -ret; +		ret = -1;                  goto out;          } -        priv = this->private; -        /* -         * Note that one local 
subvolume will override another here.  The only -         * way to avoid that would be to retain extra information about whether -         * the previous read_child is local, and it's just not worth it.  Even -         * the slowest local subvolume is far preferable to a remote one. -         */ -        if (is_local) { -                child_index = (int32_t)(long)cookie; -                gf_log (this->name, GF_LOG_INFO, -                        "selecting local read_child %s", -                        priv->children[child_index]->name); -                priv->read_child = child_index; +        for (i = 0; i < priv->child_count; i++) { +                if (local->child_up[i]) { +                        STACK_WIND_COOKIE (frame, afr_discover_cbk, +                                           (void *) (long) i, +                                           priv->children[i], +                                           priv->children[i]->fops->lookup, +                                           &local->loc, local->xattr_req); +                        if (!--call_count) +                                break; +                }          } +	return 0;  out: -        STACK_DESTROY(frame->root); -        return 0; +	AFR_STACK_UNWIND (lookup, frame, -1, local->op_errno, 0, 0, 0, 0); +	return 0;  } -static void -afr_attempt_local_discovery (xlator_t *this, int32_t child_index) + +int +afr_discover (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req)  { -        call_frame_t    *newframe = NULL; -        loc_t            tmploc = {0,}; -        afr_private_t   *priv = this->private; +	int op_errno = ENOMEM; +	afr_private_t *priv = NULL; +	afr_local_t *local = NULL; +	int event = 0; -        newframe = create_frame(this,this->ctx->pool); -        if (!newframe) { -                return; -        } +	priv = this->private; -        tmploc.gfid[sizeof(tmploc.gfid)-1] = 1; -        STACK_WIND_COOKIE (newframe, afr_discovery_cbk, -                           (void 
*)(long)child_index, -                           priv->children[child_index], -                           priv->children[child_index]->fops->getxattr, -                           &tmploc, GF_XATTR_PATHINFO_KEY, NULL); -} - -static void -afr_lookup_handle_success (afr_local_t *local, xlator_t *this, int32_t child_index, -                           int32_t op_ret, int32_t op_errno, inode_t *inode, -                           struct iatt *buf, dict_t *xattr, -                           struct iatt *postparent) -{ -        afr_private_t   *priv   = this->private; +	local = AFR_FRAME_INIT (frame, op_errno); +	if (!local) +		goto out; -        if (local->success_count == 0) { -                if (local->op_errno != ESTALE) { -                        local->op_ret = op_ret; -                        local->op_errno = 0; -                } -                afr_lookup_handle_first_success (local, this, inode, buf); +        if (!local->call_count) { +                op_errno = ENOTCONN; +                goto out;          } -        afr_lookup_update_lk_counts (local, this, -                                     child_index, xattr); -        afr_lookup_cache_args (local, child_index, xattr, -                               buf, postparent); +	if (__is_root_gfid (loc->inode->gfid)) { +		if (!this->itable) +			this->itable = loc->inode->table; +		if (!priv->root_inode) +			priv->root_inode = inode_ref (loc->inode); -        if (local->do_discovery && (priv->read_child == (-1))) { -                afr_attempt_local_discovery(this,child_index); -        } +		if (priv->choose_local && !priv->did_discovery) { +			/* Logic to detect which subvolumes of AFR are +			   local, in order to prefer them for reads +			*/ +			local->do_discovery = _gf_true; +                        priv->did_discovery = _gf_true; +                } +	} -        local->cont.lookup.success_children[local->success_count] = child_index; -        local->success_count++; -} +        local->op = GF_FOP_LOOKUP; -int 
-afr_lookup_cbk (call_frame_t *frame, void *cookie, -                xlator_t *this,  int32_t op_ret,  int32_t op_errno, -                inode_t *inode,   struct iatt *buf, dict_t *xattr, -                struct iatt *postparent) -{ -        afr_local_t *   local = NULL; -        int             call_count      = -1; -        int             child_index     = -1; +        loc_copy (&local->loc, loc); -         child_index = (long) cookie; +	local->inode = inode_ref (loc->inode); -        LOCK (&frame->lock); -        { -                local = frame->local; +	if (xattr_req) +		/* If xattr_req was null, afr_lookup_xattr_req_prepare() will +		   allocate one for us */ +		local->xattr_req = dict_ref (xattr_req); -                if (op_ret == -1) { -                        afr_lookup_handle_error (local, op_ret, op_errno); -                        goto unlock; -                } -                afr_lookup_handle_success (local, this, child_index, op_ret, -                                           op_errno, inode, buf, xattr, -                                           postparent); +	if (uuid_is_null (loc->inode->gfid)) { +		afr_discover_do (frame, this, 0); +		return 0; +	} -         } -unlock: -        UNLOCK (&frame->lock); +	afr_read_subvol_get (loc->inode, this, NULL, &event, +			     AFR_DATA_TRANSACTION); -        call_count = afr_frame_return (frame); -        if (call_count == 0) { -               afr_lookup_done (frame, this); -        } +	if (event != local->event_generation) +		afr_inode_refresh (frame, this, loc->inode, afr_discover_do); +	else +		afr_discover_do (frame, this, 0); -         return 0; +	return 0; +out: +	AFR_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL); +	return 0;  } +  int -afr_lookup_cont_init (afr_local_t *local, unsigned int child_count) +afr_lookup_do (call_frame_t *frame, xlator_t *this, int err)  { -        int               ret            = -ENOMEM; -        struct iatt       *iatts         = NULL; -        
int32_t           *success_children = NULL; -        int32_t           *sources       = NULL; -        int32_t           **pending_matrix = NULL; - -        GF_ASSERT (local); -        local->cont.lookup.xattrs = GF_CALLOC (child_count, -                                               sizeof (*local->cont.lookup.xattr), -                                               gf_afr_mt_dict_t); -        if (NULL == local->cont.lookup.xattrs) -                goto out; - -        iatts = GF_CALLOC (child_count, sizeof (*iatts), gf_afr_mt_iatt); -        if (NULL == iatts) -                goto out; -        local->cont.lookup.postparents = iatts; +	int ret = 0; +	int i = 0; +	afr_local_t *local = NULL; +	afr_private_t *priv = NULL; +	int call_count = 0; -        iatts = GF_CALLOC (child_count, sizeof (*iatts), gf_afr_mt_iatt); -        if (NULL == iatts) -                goto out; -        local->cont.lookup.bufs = iatts; +	local = frame->local; +	priv = this->private; -        success_children = afr_children_create (child_count); -        if (NULL == success_children) -                goto out; -        local->cont.lookup.success_children = success_children; +	if (err < 0) { +		local->op_errno = -err; +		ret = -1; +		goto out; +	} -        local->fresh_children = afr_children_create (child_count); -        if (NULL == local->fresh_children) -                goto out; +	call_count = local->call_count = AFR_COUNT (local->child_up, +						    priv->child_count); -        sources = GF_CALLOC (sizeof (*sources), child_count, gf_afr_mt_int32_t); -        if (NULL == sources) -                goto out; -        local->cont.lookup.sources = sources; - -        pending_matrix = afr_matrix_create (child_count, child_count); -        if (NULL == pending_matrix) +        ret = afr_lookup_xattr_req_prepare (local, this, local->xattr_req, +					    &local->loc); +        if (ret) { +                local->op_errno = -ret; +		ret = -1;                  goto out; -        
local->cont.lookup.pending_matrix = pending_matrix; +        } -        ret = 0; +        for (i = 0; i < priv->child_count; i++) { +                if (local->child_up[i]) { +                        STACK_WIND_COOKIE (frame, afr_lookup_cbk, +                                           (void *) (long) i, +                                           priv->children[i], +                                           priv->children[i]->fops->lookup, +                                           &local->loc, local->xattr_req); +                        if (!--call_count) +                                break; +                } +        } +	return 0;  out: -        return ret; +	AFR_STACK_UNWIND (lookup, frame, -1, local->op_errno, 0, 0, 0, 0); +	return 0;  } +/* + * afr_lookup() + * + * The goal here is to figure out what the element getting looked up is. + * i.e what is the GFID, inode type and a conservative estimate of the + * inode attributes are. + * + * As we lookup, operations may be underway on the entry name and the + * inode. In lookup() we are primarily concerned only with the entry + * operations. If the entry is getting unlinked or renamed, we detect + * what operation is underway by querying for on-going transactions and + * pending self-healing on the entry through xdata. + * + * If the entry is a file/dir, it may need self-heal and/or in a + * split-brain condition. Lookup is not the place to worry about these + * conditions. Outcast marking will naturally handle them in the read + * paths. + * + * Here is a brief goal of what we are trying to achieve: + * + * - LOOKUP on all subvolumes concurrently, querying on-going transaction + *   and pending self-heal info from the servers. + * + * - If all servers reply the same inode type and GFID, the overall call + *   MUST be a success. 
+ * + * - If inode types or GFIDs mismatch, and there IS either an on-going + *   transaction or pending self-heal, inspect what the nature of the + *   transaction or pending heal is, and select the appropriate subvolume's + *   reply as the winner. + * + * - If inode types or GFIDs mismatch, and there are no on-going transactions + *   or pending self-heal on the entry name on any of the servers, fail the + *   lookup with EIO. Something has gone wrong beyond reasonable action. + */ +  int -afr_lookup (call_frame_t *frame, xlator_t *this, -            loc_t *loc, dict_t *xattr_req) +afr_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req)  { -        afr_private_t  *priv      = NULL; -        afr_local_t    *local     = NULL; -        void           *gfid_req  = NULL; -        int            ret        = -1; -        int            i          = 0; -        int            call_count = 0; -        uint64_t       ctx        = 0; -        int32_t        op_errno   = 0; -                       priv       = this->private; - -        AFR_LOCAL_ALLOC_OR_GOTO (local, out); +        afr_local_t   *local = NULL; +        int32_t        op_errno = 0; +	int            event = 0; -        local->op_ret = -1; +	if (!loc->parent) { +		afr_discover (frame, this, loc, xattr_req); +		return 0; +	} -        frame->local = local; -        local->fop = GF_FOP_LOOKUP; +	if (__is_root_gfid (loc->parent->gfid)) { +		if (!strcmp (loc->name, GF_REPLICATE_TRASH_DIR)) { +			op_errno = EPERM; +			goto out; +		} +	} -        loc_copy (&local->loc, loc); -        ret = loc_path (&local->loc, NULL); -        if (ret < 0) { -                op_errno = EINVAL; -                goto out; -        } +	local = AFR_FRAME_INIT (frame, op_errno); +	if (!local) +		goto out; -        if (local->loc.path && -            (strcmp (local->loc.path, "/" GF_REPLICATE_TRASH_DIR) == 0)) { -                op_errno = EPERM; -                ret = -1; +        if (!local->call_count) { +      
          op_errno = ENOTCONN;                  goto out;          } -        ret = inode_ctx_get (local->loc.inode, this, &ctx); -        if (ret == 0) { -                /* lookup is a revalidate */ +        local->op = GF_FOP_LOOKUP; -                local->read_child_index = afr_inode_get_read_ctx (this, -                                                               local->loc.inode, -                                                               NULL); -        } else { -                LOCK (&priv->read_child_lock); -                { -                        if (priv->hash_mode) { -                                local->read_child_index = -1; -                        } -                        else { -                                local->read_child_index = -                                        (++priv->read_child_rr) % -                                        (priv->child_count); -                        } -                } -                UNLOCK (&priv->read_child_lock); -                local->cont.lookup.fresh_lookup = _gf_true; -        } +        loc_copy (&local->loc, loc); -        local->child_up = memdup (priv->child_up, -                                  sizeof (*local->child_up) * priv->child_count); -        if (NULL == local->child_up) { -                op_errno = ENOMEM; -                goto out; -        } +	local->inode = inode_ref (loc->inode); -        ret = afr_lookup_cont_init (local, priv->child_count); -        if (ret < 0) { -                op_errno = -ret; -                goto out; -        } +	if (xattr_req) +		/* If xattr_req was null, afr_lookup_xattr_req_prepare() will +		   allocate one for us */ +		local->xattr_req = dict_ref (xattr_req); -        local->call_count = afr_up_children_count (local->child_up, -                                                   priv->child_count); -        call_count = local->call_count; -        if (local->call_count == 0) { -                ret      = -1; -                op_errno = 
ENOTCONN; -                goto out; -        } +	afr_read_subvol_get (loc->parent, this, NULL, &event, +			     AFR_DATA_TRANSACTION); -        /* By default assume ENOTCONN. On success it will be set to 0. */ -        local->op_errno = ENOTCONN; +	if (event != local->event_generation) +		afr_inode_refresh (frame, this, loc->parent, afr_lookup_do); +	else +		afr_lookup_do (frame, this, 0); -        ret = dict_get_int32 (xattr_req, "attempt-self-heal", -                              &local->attempt_self_heal); -        dict_del (xattr_req, "attempt-self-heal"); +	return 0; +out: +	AFR_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL); -        ret = dict_get_int32 (xattr_req, "foreground-self-heal", -                              &local->foreground_self_heal); -        dict_del (xattr_req, "foreground-self-heal"); +        return 0; +} -        ret = afr_lookup_xattr_req_prepare (local, this, xattr_req, &local->loc, -                                            &gfid_req); -        if (ret) { -                local->op_errno = -ret; -                goto out; -        } -        afr_lookup_save_gfid (local->cont.lookup.gfid_req, gfid_req, -                              &local->loc); -        local->fop = GF_FOP_LOOKUP; -        if (priv->choose_local && !priv->did_discovery) { -                if (gfid_req && __is_root_gfid(gfid_req)) { -                        local->do_discovery = _gf_true; -                        priv->did_discovery = _gf_true; -                } -        } -        for (i = 0; i < priv->child_count; i++) { -                if (local->child_up[i]) { -                        STACK_WIND_COOKIE (frame, afr_lookup_cbk, -                                           (void *) (long) i, -                                           priv->children[i], -                                           priv->children[i]->fops->lookup, -                                           &local->loc, local->xattr_req); -                        if 
(!--call_count) -                                break; -                } + +/* {{{ open */ + +afr_fd_ctx_t * +__afr_fd_ctx_get (fd_t *fd, xlator_t *this) +{ +        uint64_t       ctx = 0; +        int            ret = 0; +        afr_fd_ctx_t  *fd_ctx = NULL; + +        ret = __fd_ctx_get (fd, this, &ctx); + +        if (ret < 0) { +                ret = __afr_fd_ctx_set (this, fd); +                if (ret < 0) +                        goto out; + +                ret = __fd_ctx_get (fd, this, &ctx); +                if (ret < 0) +                        goto out;          } -        ret = 0; +        fd_ctx = (afr_fd_ctx_t *)(long) ctx;  out: -        if (ret) -                AFR_STACK_UNWIND (lookup, frame, -1, op_errno, -                                  NULL, NULL, NULL, NULL); - -        return 0; +        return fd_ctx;  } -/* {{{ open */ +afr_fd_ctx_t * +afr_fd_ctx_get (fd_t *fd, xlator_t *this) +{ +        afr_fd_ctx_t  *fd_ctx = NULL; + +        LOCK(&fd->lock); +        { +                fd_ctx = __afr_fd_ctx_get (fd, this); +        } +        UNLOCK(&fd->lock); + +        return fd_ctx; +} +  int  __afr_fd_ctx_set (xlator_t *this, fd_t *fd) @@ -2559,6 +1911,7 @@ __afr_fd_ctx_set (xlator_t *this, fd_t *fd)          int             ret    = -1;          uint64_t        ctx    = 0;          afr_fd_ctx_t *  fd_ctx = NULL; +	int             i = 0;          VALIDATE_OR_GOTO (this->private, out);          VALIDATE_OR_GOTO (fd, out); @@ -2577,21 +1930,15 @@ __afr_fd_ctx_set (xlator_t *this, fd_t *fd)                  goto out;          } -        fd_ctx->pre_op_done = GF_CALLOC (sizeof (*fd_ctx->pre_op_done), -                                         priv->child_count, -                                         gf_afr_mt_char); -        if (!fd_ctx->pre_op_done) { -                ret = -ENOMEM; -                goto out; -        } - -        fd_ctx->pre_op_piggyback = GF_CALLOC (sizeof (*fd_ctx->pre_op_piggyback), -                                       
       priv->child_count, -                                              gf_afr_mt_char); -        if (!fd_ctx->pre_op_piggyback) { -                ret = -ENOMEM; -                goto out; -        } +	for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) { +		fd_ctx->pre_op_done[i] = GF_CALLOC (sizeof (*fd_ctx->pre_op_done[i]), +						    priv->child_count, +						    gf_afr_mt_int32_t); +		if (!fd_ctx->pre_op_done[i]) { +			ret = -ENOMEM; +			goto out; +		} +	}          fd_ctx->opened_on = GF_CALLOC (sizeof (*fd_ctx->opened_on),                                         priv->child_count, @@ -2601,6 +1948,13 @@ __afr_fd_ctx_set (xlator_t *this, fd_t *fd)                  goto out;          } +	for (i = 0; i < priv->child_count; i++) { +		if (fd_is_anonymous (fd)) +			fd_ctx->opened_on[i] = AFR_FD_OPENED; +		else +			fd_ctx->opened_on[i] = AFR_FD_NOT_OPENED; +	} +          fd_ctx->lock_piggyback = GF_CALLOC (sizeof (*fd_ctx->lock_piggyback),                                              priv->child_count,                                              gf_afr_mt_char); @@ -2617,20 +1971,7 @@ __afr_fd_ctx_set (xlator_t *this, fd_t *fd)                  goto out;          } -        fd_ctx->up_count   = priv->up_count; -        fd_ctx->down_count = priv->down_count; - -        fd_ctx->locked_on = GF_CALLOC (sizeof (*fd_ctx->locked_on), -                                       priv->child_count, -                                       gf_afr_mt_char); -        if (!fd_ctx->locked_on) { -                ret = -ENOMEM; -                goto out; -        } -  	pthread_mutex_init (&fd_ctx->delay_lock, NULL); -        INIT_LIST_HEAD (&fd_ctx->entries); -        fd_ctx->call_child = -1;          INIT_LIST_HEAD (&fd_ctx->eager_locked); @@ -2660,32 +2001,31 @@ afr_fd_ctx_set (xlator_t *this, fd_t *fd)  /* {{{ flush */  int -afr_flush_cbk(call_frame_t *frame, void *cookie, xlator_t *this, -              int32_t op_ret, int32_t op_errno, dict_t *xdata) +afr_flush_cbk (call_frame_t *frame, 
void *cookie, xlator_t *this, +               int32_t op_ret, int32_t op_errno, dict_t *xdata)  { -        afr_local_t *   local = NULL; -        int call_count  = -1; +        afr_local_t *local = NULL; +        int call_count = -1;          local = frame->local;          LOCK (&frame->lock);          {                  if (op_ret != -1) { -                        if (local->success_count == 0) { -                                local->op_ret = op_ret; -                        } -                        local->success_count++; -                } - -                local->op_errno = op_errno; +			local->op_ret = op_ret; +			if (!local->xdata_rsp && xdata) +				local->xdata_rsp = dict_ref (xdata); +		} else { +			local->op_errno = op_errno; +		}          }          UNLOCK (&frame->lock);  	call_count = afr_frame_return (frame);  	if (call_count == 0) -		AFR_STACK_UNWIND(flush, frame, local->op_ret, -				 local->op_errno, NULL); +		AFR_STACK_UNWIND (flush, frame, local->op_ret, +				  local->op_errno, local->xdata_rsp);          return 0;  } @@ -2708,7 +2048,7 @@ afr_flush_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)                                             (void *) (long) i,                                             priv->children[i],                                             priv->children[i]->fops->flush, -                                           local->fd, NULL); +                                           local->fd, xdata);                          if (!--call_count)                                  break; @@ -2721,40 +2061,30 @@ afr_flush_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)  int  afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)  { -        afr_private_t *priv  = NULL;          afr_local_t   *local = NULL;          call_stub_t   *stub = NULL; -        int            ret        = -1; -        int            op_errno   = 0; - -        VALIDATE_OR_GOTO (frame, out); -      
  VALIDATE_OR_GOTO (this, out); -        VALIDATE_OR_GOTO (this->private, out); +        int            op_errno   = ENOMEM; -        priv = this->private; - -	AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); -	local = frame->local; +	local = AFR_FRAME_INIT (frame, op_errno); +	if (!local) +		goto out; -	ret = afr_local_init(local, priv, &op_errno); -	if (ret < 0) +	if (!local->call_count) { +		op_errno = ENOTCONN;  		goto out; +	}  	local->fd = fd_ref(fd); +          stub = fop_flush_stub (frame, afr_flush_wrapper, fd, xdata); -        if (!stub) { -                ret = -1; -                op_errno = ENOMEM; +        if (!stub)                  goto out; -        }          afr_delayed_changelog_wake_resume (this, fd, stub); -	ret = 0; +	return 0;  out: -	if (ret < 0) -		AFR_STACK_UNWIND(flush, frame, -1, op_errno, NULL); - +	AFR_STACK_UNWIND (flush, frame, -1, op_errno, NULL);          return 0;  } @@ -2767,6 +2097,7 @@ afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd)          uint64_t        ctx = 0;          afr_fd_ctx_t    *fd_ctx = NULL;          int             ret = 0; +	int             i = 0;          ret = fd_ctx_get (fd, this, &ctx);          if (ret < 0) @@ -2775,13 +2106,11 @@ afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd)          fd_ctx = (afr_fd_ctx_t *)(long) ctx;          if (fd_ctx) { -                GF_FREE (fd_ctx->pre_op_done); +		for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) +			GF_FREE (fd_ctx->pre_op_done[i]);                  GF_FREE (fd_ctx->opened_on); -                GF_FREE (fd_ctx->locked_on); - -                GF_FREE (fd_ctx->pre_op_piggyback);                  GF_FREE (fd_ctx->lock_piggyback);                  GF_FREE (fd_ctx->lock_acquired); @@ -2799,24 +2128,8 @@ out:  int  afr_release (xlator_t *this, fd_t *fd)  { -        afr_locked_fd_t *locked_fd = NULL; -        afr_locked_fd_t *tmp       = NULL; -        afr_private_t   *priv      = NULL; - -        priv = this->private; -          afr_cleanup_fd_ctx (this, fd); -        
list_for_each_entry_safe (locked_fd, tmp, &priv->saved_fds, -                                  list) { - -                if (locked_fd->fd == fd) { -                        list_del_init (&locked_fd->list); -                        GF_FREE (locked_fd); -                } - -        } -          return 0;  } @@ -2841,36 +2154,38 @@ afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,          afr_local_t *local = NULL;          int call_count = -1;          int child_index = (long) cookie; -        int read_child  = 0; +	int read_subvol = 0;  	call_stub_t *stub = NULL;          local = frame->local; -        read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); +	read_subvol = afr_data_subvol_get (local->inode, this, 0, 0);          LOCK (&frame->lock);          { -                if (child_index == read_child) { -                        local->read_child_returned = _gf_true; -                } -                  if (op_ret == 0) { -                        local->op_ret = 0; +                        if (local->op_ret == -1) { +				local->op_ret = 0; -                        if (local->success_count == 0) {                                  local->cont.inode_wfop.prebuf  = *prebuf;                                  local->cont.inode_wfop.postbuf = *postbuf; + +				if (xdata) +					local->xdata_rsp = dict_ref (xdata);                          } -                        if (child_index == read_child) { +                        if (child_index == read_subvol) {                                  local->cont.inode_wfop.prebuf  = *prebuf;                                  local->cont.inode_wfop.postbuf = *postbuf; +				if (xdata) { +					if (local->xdata_rsp) +						dict_unref (local->xdata_rsp); +					local->xdata_rsp = dict_ref (xdata); +				}                          } - -                        local->success_count++; -                } - -                local->op_errno = op_errno; +                } else { +			local->op_errno = op_errno; +		}  
        }          UNLOCK (&frame->lock); @@ -2890,7 +2205,7 @@ afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                                             local->op_ret, local->op_errno,                                             &local->cont.inode_wfop.prebuf,                                             &local->cont.inode_wfop.postbuf, -                                           xdata); +                                           local->xdata_rsp);  		if (!stub) {  			AFR_STACK_UNWIND (fsync, frame, -1, ENOMEM, 0, 0, 0);  			return 0; @@ -2910,37 +2225,35 @@ afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,  int -afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, -           int32_t datasync, dict_t *xdata) +afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, +	   dict_t *xdata)  { -        afr_private_t *priv = NULL; +	afr_private_t *priv = NULL;          afr_local_t *local = NULL; -        int ret = -1;          int i = 0;          int32_t call_count = 0; -        int32_t op_errno = 0; - -        VALIDATE_OR_GOTO (frame, out); -        VALIDATE_OR_GOTO (this, out); -        VALIDATE_OR_GOTO (this->private, out); - -        priv = this->private; +        int32_t op_errno = ENOMEM; -        AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); -        local = frame->local; +	priv = this->private; -        ret = afr_local_init (local, priv, &op_errno); -        if (ret < 0) -                goto out; +	local = AFR_FRAME_INIT (frame, op_errno); +	if (!local) +		goto out;          call_count = local->call_count; +	if (!call_count) { +		op_errno = ENOTCONN; +		goto out; +	} -        local->fd             = fd_ref (fd); +        local->fd = fd_ref (fd);  	if (afr_fd_has_witnessed_unstable_write (this, fd)) {  		/* don't care. 
we only wanted to CLEAR the bit */  	} +	local->inode = inode_ref (fd->inode); +          for (i = 0; i < priv->child_count; i++) {                  if (local->child_up[i]) {                          STACK_WIND_COOKIE (frame, afr_fsync_cbk, @@ -2953,10 +2266,10 @@ afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd,                  }          } -        ret = 0; +	return 0;  out: -        if (ret < 0) -                AFR_STACK_UNWIND (fsync, frame, -1, op_errno, NULL, NULL, NULL); +	AFR_STACK_UNWIND (fsync, frame, -1, op_errno, NULL, NULL, NULL); +          return 0;  } @@ -2964,10 +2277,9 @@ out:  /* {{{ fsync */ -int32_t -afr_fsyncdir_cbk (call_frame_t *frame, void *cookie, -                  xlator_t *this, int32_t op_ret, int32_t op_errno, -                  dict_t *xdata) +int +afr_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +		  int32_t op_ret, int32_t op_errno, dict_t *xdata)  {          afr_local_t *local = NULL;          int call_count = -1; @@ -2976,10 +2288,13 @@ afr_fsyncdir_cbk (call_frame_t *frame, void *cookie,          LOCK (&frame->lock);          { -                if (op_ret == 0) +                if (op_ret == 0) {                          local->op_ret = 0; - -                local->op_errno = op_errno; +			if (!local->xdata_rsp && xdata) +				local->xdata_rsp = dict_ref (xdata); +		} else { +			local->op_errno = op_errno; +		}          }          UNLOCK (&frame->lock); @@ -2987,37 +2302,33 @@ afr_fsyncdir_cbk (call_frame_t *frame, void *cookie,          if (call_count == 0)                  AFR_STACK_UNWIND (fsyncdir, frame, local->op_ret, -                                  local->op_errno, xdata); +				  local->op_errno, local->xdata_rsp);          return 0;  } -int32_t -afr_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, -              int32_t datasync, dict_t *xdata) +int +afr_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, +	      dict_t *xdata)  { -        afr_private_t *priv 
= NULL; +	afr_private_t *priv = NULL;          afr_local_t *local = NULL; -        int ret = -1;          int i = 0;          int32_t call_count = 0; -        int32_t op_errno = 0; +        int32_t op_errno = ENOMEM; -        VALIDATE_OR_GOTO (frame, out); -        VALIDATE_OR_GOTO (this, out); -        VALIDATE_OR_GOTO (this->private, out); +	priv = this->private; -        priv = this->private; - -        AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); -        local = frame->local; - -        ret = afr_local_init (local, priv, &op_errno); -        if (ret < 0) -                goto out; +        local = AFR_FRAME_INIT (frame, op_errno); +	if (!local) +		goto out;          call_count = local->call_count; +	if (!call_count) { +		op_errno = ENOTCONN; +		goto out; +	}          for (i = 0; i < priv->child_count; i++) {                  if (local->child_up[i]) { @@ -3030,10 +2341,10 @@ afr_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd,                  }          } -        ret = 0; +	return 0;  out: -        if (ret < 0) -                AFR_STACK_UNWIND (fsyncdir, frame, -1, op_errno, NULL); +	AFR_STACK_UNWIND (fsyncdir, frame, -1, op_errno, NULL); +          return 0;  } @@ -3056,6 +2367,10 @@ afr_xattrop_cbk (call_frame_t *frame, void *cookie,                  if (op_ret == 0) {                          if (!local->cont.xattrop.xattr)                                  local->cont.xattrop.xattr = dict_ref (xattr); + +			if (!local->xdata_rsp && xdata) +				local->xdata_rsp = dict_ref (xdata); +                          local->op_ret = 0;                  } @@ -3067,7 +2382,7 @@ afr_xattrop_cbk (call_frame_t *frame, void *cookie,          if (call_count == 0)                  AFR_STACK_UNWIND (xattrop, frame, local->op_ret, local->op_errno, -                local->cont.xattrop.xattr, xdata); +                local->cont.xattrop.xattr, local->xdata_rsp);          return 0;  } @@ -3079,25 +2394,21 @@ afr_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc,  
{          afr_private_t *priv = NULL;          afr_local_t *local  = NULL; -        int ret = -1;          int i = 0;          int32_t call_count = 0; -        int32_t op_errno = 0; - -        VALIDATE_OR_GOTO (frame, out); -        VALIDATE_OR_GOTO (this, out); -        VALIDATE_OR_GOTO (this->private, out); +        int32_t op_errno = ENOMEM;          priv = this->private; -        AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); -        local = frame->local; - -        ret = afr_local_init (local, priv, &op_errno); -        if (ret < 0) -                goto out; +	local = AFR_FRAME_INIT (frame, op_errno); +	if (!local) +		goto out;          call_count = local->call_count; +	if (!call_count) { +		op_errno = ENOTCONN; +		goto out; +	}          for (i = 0; i < priv->child_count; i++) {                  if (local->child_up[i]) { @@ -3110,10 +2421,10 @@ afr_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc,                  }          } -        ret = 0; +	return 0;  out: -        if (ret < 0) -                AFR_STACK_UNWIND (xattrop, frame, -1, op_errno, NULL, NULL); +	AFR_STACK_UNWIND (xattrop, frame, -1, op_errno, NULL, NULL); +          return 0;  } @@ -3138,6 +2449,8 @@ afr_fxattrop_cbk (call_frame_t *frame, void *cookie,                          if (!local->cont.fxattrop.xattr)                                  local->cont.fxattrop.xattr = dict_ref (xattr); +			if (!local->xdata_rsp && xdata) +				local->xdata_rsp = dict_ref (xdata);                          local->op_ret = 0;                  } @@ -3149,7 +2462,7 @@ afr_fxattrop_cbk (call_frame_t *frame, void *cookie,          if (call_count == 0)                  AFR_STACK_UNWIND (fxattrop, frame, local->op_ret, local->op_errno, -                                  local->cont.fxattrop.xattr, xdata); +                                  local->cont.fxattrop.xattr, local->xdata_rsp);          return 0;  } @@ -3161,25 +2474,21 @@ afr_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd,  {          
afr_private_t *priv = NULL;          afr_local_t *local  = NULL; -        int ret = -1;          int i = 0;          int32_t call_count = 0;          int32_t op_errno = 0; -        VALIDATE_OR_GOTO (frame, out); -        VALIDATE_OR_GOTO (this, out); -        VALIDATE_OR_GOTO (this->private, out); -          priv = this->private; -        AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); -        local = frame->local; - -        ret = afr_local_init (local, priv, &op_errno); -        if (ret < 0) +	local = AFR_FRAME_INIT (frame, op_errno); +	if (!local)                  goto out;          call_count = local->call_count; +	if (!call_count) { +		op_errno = ENOTCONN; +		goto out; +	}          for (i = 0; i < priv->child_count; i++) {                  if (local->child_up[i]) { @@ -3192,10 +2501,10 @@ afr_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd,                  }          } -        ret = 0; +	return 0;  out: -        if (ret < 0) -                AFR_STACK_UNWIND (fxattrop, frame, -1, op_errno, NULL, NULL); +	AFR_STACK_UNWIND (fxattrop, frame, -1, op_errno, NULL, NULL); +          return 0;  } @@ -3203,8 +2512,8 @@ out:  int32_t -afr_inodelk_cbk (call_frame_t *frame, void *cookie, -                 xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) +afr_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +		 int32_t op_ret, int32_t op_errno, dict_t *xdata)  {          afr_local_t *local = NULL; @@ -3238,25 +2547,21 @@ afr_inodelk (call_frame_t *frame, xlator_t *this,  {          afr_private_t *priv = NULL;          afr_local_t *local  = NULL; -        int ret = -1;          int i = 0;          int32_t call_count = 0; -        int32_t op_errno = 0; - -        VALIDATE_OR_GOTO (frame, out); -        VALIDATE_OR_GOTO (this, out); -        VALIDATE_OR_GOTO (this->private, out); +        int32_t op_errno = ENOMEM;          priv = this->private; -        AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); -        local = frame->local; - -      
  ret = afr_local_init (local, priv, &op_errno); -        if (ret < 0) +        local = AFR_FRAME_INIT (frame, op_errno); +        if (!local)                  goto out;          call_count = local->call_count; +	if (!call_count) { +		op_errno = ENOMEM; +		goto out; +	}          for (i = 0; i < priv->child_count; i++) {                  if (local->child_up[i]) { @@ -3270,18 +2575,17 @@ afr_inodelk (call_frame_t *frame, xlator_t *this,                  }          } -        ret = 0; +	return 0;  out: -        if (ret < 0) -                AFR_STACK_UNWIND (inodelk, frame, -1, op_errno, NULL); +	AFR_STACK_UNWIND (inodelk, frame, -1, op_errno, NULL); +          return 0;  }  int32_t -afr_finodelk_cbk (call_frame_t *frame, void *cookie, -                  xlator_t *this, int32_t op_ret, int32_t op_errno, -                  dict_t *xdata) +afr_finodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +		  int32_t op_ret, int32_t op_errno, dict_t *xdata)  {          afr_local_t *local = NULL; @@ -3309,31 +2613,26 @@ afr_finodelk_cbk (call_frame_t *frame, void *cookie,  int32_t -afr_finodelk (call_frame_t *frame, xlator_t *this, -              const char *volume, fd_t *fd, int32_t cmd, struct gf_flock *flock, -              dict_t *xdata) +afr_finodelk (call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, +	      int32_t cmd, struct gf_flock *flock, dict_t *xdata)  {          afr_private_t *priv = NULL;          afr_local_t *local  = NULL; -        int ret = -1;          int i = 0;          int32_t call_count = 0; -        int32_t op_errno = 0; - -        VALIDATE_OR_GOTO (frame, out); -        VALIDATE_OR_GOTO (this, out); -        VALIDATE_OR_GOTO (this->private, out); +        int32_t op_errno = ENOMEM;          priv = this->private; -        AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); -        local = frame->local; - -        ret = afr_local_init (local, priv, &op_errno); -        if (ret < 0) -                goto out; +	local = AFR_FRAME_INIT 
(frame, op_errno); +	if (!local) +		goto out;          call_count = local->call_count; +	if (!call_count) { +		op_errno = ENOTCONN; +		goto out; +	}          for (i = 0; i < priv->child_count; i++) {                  if (local->child_up[i]) { @@ -3347,10 +2646,10 @@ afr_finodelk (call_frame_t *frame, xlator_t *this,                  }          } -        ret = 0; +	return 0;  out: -        if (ret < 0) -                AFR_STACK_UNWIND (finodelk, frame, -1, op_errno, NULL); +	AFR_STACK_UNWIND (finodelk, frame, -1, op_errno, NULL); +          return 0;  } @@ -3383,33 +2682,28 @@ afr_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,  } -int32_t -afr_entrylk (call_frame_t *frame, xlator_t *this, -             const char *volume, loc_t *loc, -             const char *basename, entrylk_cmd cmd, entrylk_type type, -             dict_t *xdata) +int +afr_entrylk (call_frame_t *frame, xlator_t *this, const char *volume, +	     loc_t *loc, const char *basename, entrylk_cmd cmd, +	     entrylk_type type, dict_t *xdata)  {          afr_private_t *priv = NULL;          afr_local_t *local  = NULL; -        int ret = -1;          int i = 0;          int32_t call_count = 0;          int32_t op_errno = 0; -        VALIDATE_OR_GOTO (frame, out); -        VALIDATE_OR_GOTO (this, out); -        VALIDATE_OR_GOTO (this->private, out); -          priv = this->private; -        AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); -        local = frame->local; - -        ret = afr_local_init (local, priv, &op_errno); -        if (ret < 0) -                goto out; +	local = AFR_FRAME_INIT (frame, op_errno); +	if (!local) +		goto out;          call_count = local->call_count; +	if (!call_count) { +		op_errno = ENOTCONN; +		goto out; +	}          for (i = 0; i < priv->child_count; i++) {                  if (local->child_up[i]) { @@ -3423,18 +2717,18 @@ afr_entrylk (call_frame_t *frame, xlator_t *this,                  }          } -        ret = 0; +	return 0;  out: -        if (ret 
< 0) -                AFR_STACK_UNWIND (entrylk, frame, -1, op_errno, NULL); +	AFR_STACK_UNWIND (entrylk, frame, -1, op_errno, NULL); +          return 0;  } -int32_t -afr_fentrylk_cbk (call_frame_t *frame, void *cookie, -                  xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) +int +afr_fentrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +		  int32_t op_ret, int32_t op_errno, dict_t *xdata)  {          afr_local_t *local = NULL; @@ -3461,33 +2755,28 @@ afr_fentrylk_cbk (call_frame_t *frame, void *cookie,  } -int32_t -afr_fentrylk (call_frame_t *frame, xlator_t *this, -              const char *volume, fd_t *fd, -              const char *basename, entrylk_cmd cmd, -              entrylk_type type, dict_t *xdata) +int +afr_fentrylk (call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, +              const char *basename, entrylk_cmd cmd, entrylk_type type, +	      dict_t *xdata)  {          afr_private_t *priv = NULL;          afr_local_t *local  = NULL; -        int ret = -1;          int i = 0;          int32_t call_count = 0; -        int32_t op_errno = 0; - -        VALIDATE_OR_GOTO (frame, out); -        VALIDATE_OR_GOTO (this, out); -        VALIDATE_OR_GOTO (this->private, out); +        int32_t op_errno = ENOMEM;          priv = this->private; -        AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); -        local = frame->local; - -        ret = afr_local_init (local, priv, &op_errno); -        if (ret < 0) -                goto out; +        local = AFR_FRAME_INIT (frame, op_errno); +	if (!local) +		goto out;          call_count = local->call_count; +	if (!call_count) { +		op_errno = ENOTCONN; +		goto out; +	}          for (i = 0; i < priv->child_count; i++) {                  if (local->child_up[i]) { @@ -3501,82 +2790,85 @@ afr_fentrylk (call_frame_t *frame, xlator_t *this,                  }          } -        ret = 0; +	return 0;  out: -        if (ret < 0) -                AFR_STACK_UNWIND (fentrylk, 
frame, -1, op_errno, NULL); +	AFR_STACK_UNWIND (fentrylk, frame, -1, op_errno, NULL); +          return 0;  } -int32_t -afr_statfs_cbk (call_frame_t *frame, void *cookie, -                xlator_t *this, int32_t op_ret, int32_t op_errno, -                struct statvfs *statvfs, dict_t *xdata) + +int +afr_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, +		int op_errno, struct statvfs *statvfs, dict_t *xdata)  {          afr_local_t *local = NULL;          int call_count = 0; +	struct statvfs *buf = NULL;          LOCK (&frame->lock);          {                  local = frame->local; -                if (op_ret == 0) { -                        local->op_ret   = op_ret; - -                        if (local->cont.statfs.buf_set) { -                                if (statvfs->f_bavail < local->cont.statfs.buf.f_bavail) -                                        local->cont.statfs.buf = *statvfs; -                        } else { -                                local->cont.statfs.buf = *statvfs; -                                local->cont.statfs.buf_set = 1; -                        } -                } - -                if (op_ret == -1) +                if (op_ret != 0) {                          local->op_errno = op_errno; +			goto unlock; +		} +		local->op_ret = op_ret; + +		buf = &local->cont.statfs.buf; +		if (local->cont.statfs.buf_set) { +			if (statvfs->f_bavail < buf->f_bavail) { +				*buf = *statvfs; +				if (xdata) { +					if (local->xdata_rsp) +						dict_unref (local->xdata_rsp); +					local->xdata_rsp = dict_ref (xdata); +				} +			} +		} else { +			*buf = *statvfs; +			local->cont.statfs.buf_set = 1; +			if (xdata) +				local->xdata_rsp = dict_ref (xdata); +		}          } +unlock:          UNLOCK (&frame->lock);          call_count = afr_frame_return (frame);          if (call_count == 0)                  AFR_STACK_UNWIND (statfs, frame, local->op_ret, local->op_errno, -                                  
&local->cont.statfs.buf, xdata); +                                  &local->cont.statfs.buf, local->xdata_rsp);          return 0;  } -int32_t -afr_statfs (call_frame_t *frame, xlator_t *this, -            loc_t *loc, dict_t *xdata) +int +afr_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)  { -        afr_private_t *  priv        = NULL; -        int              child_count = 0;          afr_local_t   *  local       = NULL; +	afr_private_t   *priv        = NULL;          int              i           = 0; -        int              ret = -1;          int              call_count = 0; -        int32_t          op_errno    = 0; - -        VALIDATE_OR_GOTO (this, out); -        VALIDATE_OR_GOTO (this->private, out); -        VALIDATE_OR_GOTO (loc, out); - -        priv = this->private; -        child_count = priv->child_count; +        int32_t          op_errno    = ENOMEM; -        AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); -        local = frame->local; +	priv = this->private; -        ret = afr_local_init (local, priv, &op_errno); -        if (ret < 0) -                goto out; +	local = AFR_FRAME_INIT (frame, op_errno); +	if (!local) +		goto out;          call_count = local->call_count; +	if (!call_count) { +		op_errno = ENOTCONN; +		goto out; +	} -        for (i = 0; i < child_count; i++) { +        for (i = 0; i < priv->child_count; i++) {                  if (local->child_up[i]) {                          STACK_WIND (frame, afr_statfs_cbk,                                      priv->children[i], @@ -3587,10 +2879,10 @@ afr_statfs (call_frame_t *frame, xlator_t *this,                  }          } -        ret = 0; +	return 0;  out: -        if (ret < 0) -                AFR_STACK_UNWIND (statfs, frame, -1, op_errno, NULL, NULL); +	AFR_STACK_UNWIND (statfs, frame, -1, op_errno, NULL, NULL); +          return 0;  } @@ -3699,21 +2991,6 @@ afr_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                  AFR_STACK_UNWIND (lk, 
frame, -1, ENOTCONN,                                    &local->cont.lk.ret_flock, NULL);          } else { -                /* locking has succeeded on all nodes that are up */ - -                /* temporarily -                   ret = afr_mark_locked_nodes (this, local->fd, -                   local->cont.lk.locked_nodes); -                   if (ret) -                   gf_log (this->name, GF_LOG_DEBUG, -                   "Could not save locked nodes info in fdctx"); - -                   ret = afr_save_locked_fd (this, local->fd); -                   if (ret) -                   gf_log (this->name, GF_LOG_DEBUG, -                   "Could not save locked fd"); - -                */                  AFR_STACK_UNWIND (lk, frame, local->op_ret, local->op_errno,                                    &local->cont.lk.ret_flock, NULL);          } @@ -3729,20 +3006,12 @@ afr_lk (call_frame_t *frame, xlator_t *this,          afr_private_t *priv = NULL;          afr_local_t *local = NULL;          int i = 0; -        int32_t op_errno = 0; -        int     ret      = -1; - -        VALIDATE_OR_GOTO (frame, out); -        VALIDATE_OR_GOTO (this, out); -        VALIDATE_OR_GOTO (this->private, out); +        int32_t op_errno = ENOMEM;          priv = this->private; -        AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); -        local = frame->local; - -        ret = afr_local_init (local, priv, &op_errno); -        if (ret < 0) +        local = AFR_FRAME_INIT (frame, op_errno); +        if (!local)                  goto out;          local->cont.lk.locked_nodes = GF_CALLOC (priv->child_count, @@ -3764,28 +3033,16 @@ afr_lk (call_frame_t *frame, xlator_t *this,                             priv->children[i]->fops->lk,                             fd, cmd, flock, xdata); -        ret = 0; +	return 0;  out: -        if (ret < 0) -                AFR_STACK_UNWIND (lk, frame, -1, op_errno, NULL, NULL); +	AFR_STACK_UNWIND (lk, frame, -1, op_errno, NULL, NULL); +          return 0;  } 
 int  afr_forget (xlator_t *this, inode_t *inode)  { -        uint64_t        ctx_addr = 0; -        afr_inode_ctx_t *ctx     = NULL; - -        inode_ctx_get (inode, this, &ctx_addr); - -        if (!ctx_addr) -                goto out; - -        ctx = (afr_inode_ctx_t *)(long)ctx_addr; -        GF_FREE (ctx->fresh_children); -        GF_FREE (ctx); -out:          return 0;  } @@ -3805,7 +3062,6 @@ afr_priv_dump (xlator_t *this)          snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.%s", this->type, this->name);          gf_proc_dump_add_section(key_prefix);          gf_proc_dump_write("child_count", "%u", priv->child_count); -        gf_proc_dump_write("read_child_rr", "%u", priv->read_child_rr);          for (i = 0; i < priv->child_count; i++) {                  sprintf (key, "child_up[%d]", i);                  gf_proc_dump_write(key, "%d", priv->child_up[i]); @@ -3862,7 +3118,7 @@ afr_notify (xlator_t *this, int32_t event,          int             idx                 = -1;          int             ret                 = -1;          int             call_psh            = 0; -        int             up_child            = AFR_ALL_CHILDREN; +        int             up_child            = -1;          dict_t          *input              = NULL;          dict_t          *output             = NULL; @@ -3914,6 +3170,7 @@ afr_notify (xlator_t *this, int32_t event,                           */                          if (priv->child_up[idx] != 1) {                                  priv->up_count++; +				priv->event_generation++;                          }                          priv->child_up[idx] = 1; @@ -3953,6 +3210,7 @@ afr_notify (xlator_t *this, int32_t event,                           */                          if (priv->child_up[idx] == 1) {                                  priv->down_count++; +				priv->event_generation++;                          }                          priv->child_up[idx] = 0; @@ -4019,8 +3277,7 @@ afr_notify (xlator_t *this, int32_t 
event,                  LOCK (&priv->lock);                  { -                        up_children = afr_up_children_count (priv->child_up, -                                                             priv->child_count); +                        up_children = AFR_COUNT (priv->child_up, priv->child_count);                          for (i = 0; i < priv->child_count; i++) {                                  if (priv->last_event[i] == GF_EVENT_CHILD_UP) {                                          event = GF_EVENT_CHILD_UP; @@ -4040,39 +3297,23 @@ afr_notify (xlator_t *this, int32_t event,          ret = 0;          if (propagate)                  ret = default_notify (this, event, data); -        if (call_psh && priv->shd.iamshd) -                afr_proactive_self_heal ((void*) (long) up_child); +        if (call_psh && priv->shd.iamshd) { +                afr_selfheal_childup (this, up_child); +	}  out:          return ret;  } -int -afr_first_up_child (unsigned char *child_up, size_t child_count) -{ -        int         ret      = -1; -        int         i        = 0; - -        GF_ASSERT (child_up); - -        for (i = 0; i < child_count; i++) { -                if (child_up[i]) { -                        ret = i; -                        break; -                } -        } - -        return ret; -}  int  afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno)  { -        int     ret = -1; -          local->op_ret = -1;          local->op_errno = EUCLEAN; +	syncbarrier_init (&local->barrier); +          local->child_up = GF_CALLOC (priv->child_count,                                       sizeof (*local->child_up),                                       gf_afr_mt_char); @@ -4084,38 +3325,42 @@ afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno)          memcpy (local->child_up, priv->child_up,                  sizeof (*local->child_up) * priv->child_count); -        local->call_count = afr_up_children_count 
(local->child_up, -                                                   priv->child_count); +        local->call_count = AFR_COUNT (local->child_up, priv->child_count);          if (local->call_count == 0) {                  gf_log (THIS->name, GF_LOG_INFO, "no subvolumes up");                  if (op_errno)                          *op_errno = ENOTCONN;                  goto out;          } +	local->event_generation = priv->event_generation; -        local->child_errno = GF_CALLOC (priv->child_count, -                                        sizeof (*local->child_errno), -                                        gf_afr_mt_int32_t); -        if (!local->child_errno) { -                if (op_errno) -                        *op_errno = ENOMEM; -                goto out; -        } +	local->read_attempted = GF_CALLOC (priv->child_count, sizeof (char), +					   gf_afr_mt_char); +	if (!local->read_attempted) { +		if (op_errno) +			*op_errno = ENOMEM; +		goto out; +	} -        local->transaction.postop_piggybacked = GF_CALLOC (priv->child_count, -							   sizeof (int), -							   gf_afr_mt_int32_t); -        if (!local->transaction.postop_piggybacked) { -                if (op_errno) -                        *op_errno = ENOMEM; -                goto out; -        } +	local->readable = GF_CALLOC (priv->child_count, sizeof (char), +				     gf_afr_mt_char); +	if (!local->readable) { +		if (op_errno) +			*op_errno = ENOMEM; +		goto out; +	} -	local->append_write = _gf_false; +	local->replies = GF_CALLOC(priv->child_count, sizeof(*local->replies), +				   gf_afr_mt_reply_t); +	if (!local->replies) { +		if (op_errno) +			*op_errno = ENOMEM; +		goto out; +	} -        ret = 0; +	return 0;  out: -        return ret; +        return -1;  }  int @@ -4218,13 +3463,11 @@ afr_transaction_local_init (afr_local_t *local, xlator_t *this)          }          ret = -ENOMEM; -        child_up_count = afr_up_children_count (local->child_up, -                                                
priv->child_count); +        child_up_count = AFR_COUNT (local->child_up, priv->child_count);          if (priv->optimistic_change_log && child_up_count == priv->child_count)                  local->optimistic_change_log = 1; -        local->first_up_child = afr_first_up_child (local->child_up, -                                                    priv->child_count); +	local->pre_op_compat = priv->pre_op_compat;          local->transaction.eager_lock =                  GF_CALLOC (sizeof (*local->transaction.eager_lock), @@ -4234,26 +3477,29 @@ afr_transaction_local_init (afr_local_t *local, xlator_t *this)          if (!local->transaction.eager_lock)                  goto out; -        local->fresh_children = afr_children_create (priv->child_count); -        if (!local->fresh_children) -                goto out; -          local->transaction.pre_op = GF_CALLOC (sizeof (*local->transaction.pre_op),                                                 priv->child_count,                                                 gf_afr_mt_char);          if (!local->transaction.pre_op)                  goto out; +        local->transaction.fop_subvols = GF_CALLOC (sizeof (*local->transaction.fop_subvols), +						    priv->child_count, +						    gf_afr_mt_char); +        if (!local->transaction.fop_subvols) +                goto out; + +        local->transaction.failed_subvols = GF_CALLOC (sizeof (*local->transaction.failed_subvols), +						       priv->child_count, +						       gf_afr_mt_char); +        if (!local->transaction.failed_subvols) +                goto out; +          local->pending = afr_matrix_create (priv->child_count,                                              AFR_NUM_CHANGE_LOGS);          if (!local->pending)                  goto out; -        local->transaction.txn_changelog = afr_matrix_create (priv->child_count, -                                                           AFR_NUM_CHANGE_LOGS); -        if (!local->transaction.txn_changelog) -                
goto out; -  	INIT_LIST_HEAD (&local->transaction.eager_locked);          ret = 0; @@ -4261,86 +3507,6 @@ out:          return ret;  } -void -afr_reset_children (int32_t *fresh_children, int32_t child_count) -{ -        unsigned int i = 0; -        for (i = 0; i < child_count; i++) -                fresh_children[i] = -1; -} - -int32_t* -afr_children_create (int32_t child_count) -{ -        int32_t           *children = NULL; -        int               i               = 0; - -        GF_ASSERT (child_count > 0); - -        children = GF_CALLOC (child_count, sizeof (*children), -                              gf_afr_mt_int32_t); -        if (NULL == children) -                goto out; -        for (i = 0; i < child_count; i++) -                children[i] = -1; -out: -        return children; -} - -void -afr_children_add_child (int32_t *children, int32_t child, -                        int32_t child_count) -{ -        gf_boolean_t child_found = _gf_false; -        int          i               = 0; - -        for (i = 0; i < child_count; i++) { -                if (children[i] == -1) -                        break; -                if (children[i] == child) { -                        child_found = _gf_true; -                        break; -                } -        } - -        if (!child_found) { -                GF_ASSERT (i < child_count); -                children[i] = child; -        } -} - -void -afr_children_rm_child (int32_t *children, int32_t child, int32_t child_count) -{ -        int          i = 0; - -        GF_ASSERT ((child >= 0) && (child < child_count)); -        for (i = 0; i < child_count; i++) { -                if (children[i] == -1) -                        break; -                if (children[i] == child) { -                        if (i != (child_count - 1)) -                                memmove (children + i, children + i + 1, -                                         sizeof (*children)*(child_count - i - 1)); -                        
children[child_count - 1] = -1; -                        break; -                } -        } -} - -int -afr_get_children_count (int32_t *children, unsigned int child_count) -{ -        int count = 0; -        int i = 0; - -        for (i = 0; i < child_count; i++) { -                if (children[i] == -1) -                        break; -                count++; -        } -        return count; -}  void  afr_set_low_priority (call_frame_t *frame) @@ -4348,38 +3514,6 @@ afr_set_low_priority (call_frame_t *frame)          frame->root->pid = LOW_PRIO_PROC_PID;  } -int -afr_child_fd_ctx_set (xlator_t *this, fd_t *fd, int32_t child, -                      int flags) -{ -        int             ret = 0; -        uint64_t        ctx = 0; -        afr_fd_ctx_t    *fd_ctx      = NULL; - -        GF_ASSERT (fd && fd->inode); -        ret = afr_fd_ctx_set (this, fd); -        if (ret < 0) { -                gf_log (this->name, GF_LOG_ERROR, -                        "could not set fd ctx for fd=%p", fd); -                goto out; -        } - -        ret = fd_ctx_get (fd, this, &ctx); -        if (ret < 0) { -                gf_log (this->name, GF_LOG_ERROR, -                        "could not get fd ctx for fd=%p", fd); -                goto out; -        } - -        fd_ctx = (afr_fd_ctx_t *)(long) ctx; -        fd_ctx->opened_on[child] = AFR_FD_OPENED; -        if (!IA_ISDIR (fd->inode->ia_type)) { -                fd_ctx->flags            = flags; -        } -        ret = 0; -out: -        return ret; -}  gf_boolean_t  afr_have_quorum (char *logname, afr_private_t *priv) @@ -4426,33 +3560,6 @@ afr_priv_destroy (afr_private_t *priv)          if (!priv)                  goto out;          inode_unref (priv->root_inode); -        GF_FREE (priv->shd.pos); -        GF_FREE (priv->shd.pending); -        GF_FREE (priv->shd.inprogress); -//        for (i = 0; i < priv->child_count; i++) -//                if (priv->shd.timer && priv->shd.timer[i]) -//                        
gf_timer_call_cancel (this->ctx, priv->shd.timer[i]); -        GF_FREE (priv->shd.timer); - -        if (priv->shd.healed) -                eh_destroy (priv->shd.healed); - -        if (priv->shd.heal_failed) -                eh_destroy (priv->shd.heal_failed); - -        if (priv->shd.split_brain) -                eh_destroy (priv->shd.split_brain); - -        for (i = 0; i < priv->child_count; i++) -        { -                if (priv->shd.statistics[i]) -                        eh_destroy (priv->shd.statistics[i]); -        } - -        GF_FREE (priv->shd.statistics); - -        GF_FREE (priv->shd.crawl_events); -          GF_FREE (priv->last_event);          if (priv->pending_key) {                  for (i = 0; i < priv->child_count; i++) @@ -4462,8 +3569,7 @@ afr_priv_destroy (afr_private_t *priv)          GF_FREE (priv->children);          GF_FREE (priv->child_up);          LOCK_DESTROY (&priv->lock); -        LOCK_DESTROY (&priv->read_child_lock); -        pthread_mutex_destroy (&priv->mutex); +          GF_FREE (priv);  out:          return; @@ -4480,124 +3586,21 @@ xlator_subvolume_count (xlator_t *this)          return i;  } -inline gf_boolean_t -afr_is_errno_set (int *child_errno, int child) -{ -        return child_errno[child]; -} - -inline gf_boolean_t -afr_is_errno_unset (int *child_errno, int child) -{ -        return !afr_is_errno_set (child_errno, child); -} - -void -afr_prepare_new_entry_pending_matrix (int32_t **pending, -                                      gf_boolean_t (*is_pending) (int *, int), -                                      int *ctx, struct iatt *buf, -                                      unsigned int child_count) -{ -        int midx = 0; -        int idx  = 0; -        int i    = 0; - -        midx = afr_index_for_transaction_type (AFR_METADATA_TRANSACTION); -        if (IA_ISDIR (buf->ia_type)) -                idx = afr_index_for_transaction_type (AFR_ENTRY_TRANSACTION); -        else if (IA_ISREG (buf->ia_type)) -             
   idx = afr_index_for_transaction_type (AFR_DATA_TRANSACTION); -        else -                idx = -1; -        for (i = 0; i < child_count; i++) { -                if (is_pending (ctx, i)) { -                        pending[i][midx] = hton32 (1); -                        if (idx == -1) -                                continue; -                        pending[i][idx] = hton32 (1); -                } -        } -} - -gf_boolean_t -afr_is_fd_fixable (fd_t *fd) -{ -        if (!fd || !fd->inode) -                return _gf_false; -        else if (fd_is_anonymous (fd)) -                return _gf_false; -        else if (uuid_is_null (fd->inode->gfid)) -                return _gf_false; - -        return _gf_true; -}  void  afr_handle_open_fd_count (call_frame_t *frame, xlator_t *this)  {          afr_local_t     *local = NULL; -        inode_t         *inode = NULL; -        afr_inode_ctx_t *ctx   = NULL; +        afr_fd_ctx_t    *fd_ctx   = NULL;          local = frame->local; -        if (local->fd) -                inode = local->fd->inode; -        else -                inode = local->loc.inode; - -        if (!inode) -                return; - -        LOCK (&inode->lock); -        { -                ctx = __afr_inode_ctx_get (inode, this); -                ctx->open_fd_count = local->open_fd_count; -        } -        UNLOCK (&inode->lock); -} - -int -afr_initialise_statistics (xlator_t *this) -{ -        afr_private_t       *priv = NULL; -        int                 ret = -1; -        int                 i = 0; -        int                 child_count = 0; -        eh_t                *stats_per_brick = NULL; -        shd_crawl_event_t   ***shd_crawl_events = NULL; -        priv = this->private; - -        priv->shd.statistics = GF_CALLOC (sizeof(eh_t *), priv->child_count, -                                          gf_common_mt_eh_t); -        if (!priv->shd.statistics) { -                ret = -1; -                goto out; -        } -        
child_count = priv->child_count; -        for (i=0; i < child_count ; i++) { -                stats_per_brick = eh_new (AFR_STATISTICS_HISTORY_SIZE, -                                          _gf_false, -                                          _destroy_crawl_event_data); -                if (!stats_per_brick) { -                        ret = -1; -                        goto out; -                } -                priv->shd.statistics[i] = stats_per_brick; - -        } - -        shd_crawl_events = (shd_crawl_event_t***)(&priv->shd.crawl_events); -        *shd_crawl_events  = GF_CALLOC (sizeof(shd_crawl_event_t*), -                                        priv->child_count, -                                        gf_afr_mt_shd_crawl_event_t); +        if (!local->fd) +		return; -        if (!priv->shd.crawl_events) { -                ret = -1; -                goto out; -        } -        ret = 0; -out: -        return ret; +	fd_ctx = afr_fd_ctx_get (local->fd, this); +	if (!fd_ctx) +		return; +	fd_ctx->open_fd_count = local->open_fd_count;  } diff --git a/xlators/cluster/afr/src/afr-dir-read.c b/xlators/cluster/afr/src/afr-dir-read.c index 689dd84e646..fa1da3958df 100644 --- a/xlators/cluster/afr/src/afr-dir-read.c +++ b/xlators/cluster/afr/src/afr-dir-read.c @@ -37,177 +37,7 @@  #include "checksum.h"  #include "afr.h" -#include "afr-self-heal.h" -#include "afr-self-heal-common.h" - -int -afr_examine_dir_sh_unwind (call_frame_t *frame, xlator_t *this, int32_t op_ret, -                           int32_t op_errno, int32_t sh_failed) -{ -        afr_local_t *local  = NULL; - -        local = frame->local; - -        afr_set_opendir_done (this, local->fd->inode); - -        AFR_STACK_UNWIND (opendir, frame, local->op_ret, -                          local->op_errno, local->fd, NULL); - -        return 0; -} - - -gf_boolean_t -__checksums_differ (uint32_t *checksum, int child_count, -                    unsigned char *child_up) -{ -        int          ret           
 = _gf_false; -        int          i              = 0; -        uint32_t     cksum          = 0; -        gf_boolean_t activate_check = _gf_false; - -        for (i = 0; i < child_count; i++) { -                if (!child_up[i]) -                        continue; -                if (_gf_false == activate_check) { -                        cksum          = checksum[i]; -                        activate_check = _gf_true; -                        continue; -                } - -                if (cksum != checksum[i]) { -                        ret = _gf_true; -                        break; -                } - -                cksum = checksum[i]; -        } - -        return ret; -} - - -int32_t -afr_examine_dir_readdir_cbk (call_frame_t *frame, void *cookie, -                             xlator_t *this, int32_t op_ret, int32_t op_errno, -                             gf_dirent_t *entries, dict_t *xdata) -{ -        afr_private_t *   priv        = NULL; -        afr_local_t *     local       = NULL; -        afr_self_heal_t * sh          = NULL; -        gf_dirent_t *     entry       = NULL; -        gf_dirent_t *     tmp         = NULL; -        char              *reason     = NULL; -        int               child_index = 0; -        uint32_t          entry_cksum = 0; -        int               call_count  = 0; -        off_t             last_offset = 0; -        inode_t           *inode      = NULL; - -        priv  = this->private; -        local = frame->local; -        sh    = &local->self_heal; -        inode = local->fd->inode; - -        child_index = (long) cookie; - -        if (op_ret == -1) { -                gf_log (this->name, GF_LOG_INFO, -                        "%s: failed to do opendir on %s", -                        local->loc.path, priv->children[child_index]->name); -                local->op_ret = -1; -                local->op_ret = op_errno; -                goto out; -        } - -        if (op_ret == 0) { -                gf_log 
(this->name, GF_LOG_DEBUG, -                        "%s: no entries found in %s", -                        local->loc.path, priv->children[child_index]->name); -                goto out; -        } - -        list_for_each_entry_safe (entry, tmp, &entries->list, list) { -                entry_cksum = gf_rsync_weak_checksum ((unsigned char *)entry->d_name, -                                                      strlen (entry->d_name)); -                local->cont.opendir.checksum[child_index] ^= entry_cksum; -        } - -        list_for_each_entry (entry, &entries->list, list) { -                last_offset = entry->d_off; -        } - -        /* read more entries */ - -        STACK_WIND_COOKIE (frame, afr_examine_dir_readdir_cbk, -                           (void *) (long) child_index, -                           priv->children[child_index], -                           priv->children[child_index]->fops->readdir, -                           local->fd, 131072, last_offset, NULL); - -        return 0; - -out: -        call_count = afr_frame_return (frame); - -        if (call_count == 0) { -                if (__checksums_differ (local->cont.opendir.checksum, -                                        priv->child_count, -                                        local->child_up)) { - -                        sh->do_entry_self_heal  = _gf_true; -                        sh->forced_merge          = _gf_true; - -                        reason = "checksums of directory differ"; -                        afr_launch_self_heal (frame, this, inode, _gf_false, -                                              inode->ia_type, reason, NULL, -                                              afr_examine_dir_sh_unwind); -                } else { -                        afr_set_opendir_done (this, inode); - -                        AFR_STACK_UNWIND (opendir, frame, local->op_ret, -                                          local->op_errno, local->fd, NULL); -                } -        } - - 
       return 0; -} - - -int -afr_examine_dir (call_frame_t *frame, xlator_t *this) -{ -        afr_private_t * priv       = NULL; -        afr_local_t *   local      = NULL; -        int             i          = 0; -        int             call_count = 0; - -        local = frame->local; -        priv  = this->private; - -        local->cont.opendir.checksum = GF_CALLOC (priv->child_count, -                                                  sizeof (*local->cont.opendir.checksum), -                                                  gf_afr_mt_int32_t); - -        call_count = afr_up_children_count (local->child_up, priv->child_count); - -        local->call_count = call_count; - -        for (i = 0; i < priv->child_count; i++) { -                if (local->child_up[i]) { -                        STACK_WIND_COOKIE (frame, afr_examine_dir_readdir_cbk, -                                           (void *) (long) i, -                                           priv->children[i], -                                           priv->children[i]->fops->readdir, -                                           local->fd, 131072, 0, NULL); - -                        if (!--call_count) -                                break; -                } -        } - -        return 0; -} +#include "afr-transaction.h"  int32_t @@ -215,112 +45,66 @@ afr_opendir_cbk (call_frame_t *frame, void *cookie,                   xlator_t *this, int32_t op_ret, int32_t op_errno,                   fd_t *fd, dict_t *xdata)  { -        afr_private_t *priv              = NULL;          afr_local_t   *local             = NULL; -        int32_t        up_children_count = 0; -        int            ret               = -1;          int            call_count        = -1;          int32_t        child_index       = 0; +	afr_fd_ctx_t  *fd_ctx = NULL; -        priv  = this->private;          local = frame->local; +	fd_ctx = local->fd_ctx;          child_index = (long) cookie; -        up_children_count = 
afr_up_children_count (local->child_up, -                                                   priv->child_count); -          LOCK (&frame->lock);          { -                if (op_ret >= 0) { +                if (op_ret == -1) { +                        local->op_errno = op_errno; +			fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED; +                } else {                          local->op_ret = op_ret; -                        ret = afr_child_fd_ctx_set (this, fd, child_index, 0); -                        if (ret) { -                                local->op_ret = -1; -                                local->op_errno = -ret; -                                goto unlock; -                        } +			fd_ctx->opened_on[child_index] = AFR_FD_OPENED; +			if (!local->xdata_rsp && xdata) +				local->xdata_rsp = dict_ref (xdata);                  } - -                local->op_errno = op_errno;          } -unlock:          UNLOCK (&frame->lock);          call_count = afr_frame_return (frame); -        if (call_count == 0) { -                if (local->op_ret != 0) -                        goto out; - -                if (!afr_is_opendir_done (this, local->fd->inode) && -                    up_children_count > 1 && priv->entry_self_heal) { - -                        /* -                         * This is the first opendir on this inode. We need -                         * to check if the directory's entries are the same -                         * on all subvolumes. This is needed in addition -                         * to regular entry self-heal because the readdir -                         * call is sent only to the first subvolume, and -                         * thus files that exist only there will never be healed -                         * otherwise (assuming changelog shows no anomalies). 
-                         */ - -                        gf_log (this->name, GF_LOG_TRACE, -                                "reading contents of directory %s looking for mismatch", -                                local->loc.path); - -                        afr_examine_dir (frame, this); - -                } else { -                        /* do the unwind */ -                        goto out; -                } -        } - -        return 0; - -out: -        AFR_STACK_UNWIND (opendir, frame, local->op_ret, -                          local->op_errno, local->fd, NULL); - +        if (call_count == 0) +		AFR_STACK_UNWIND (opendir, frame, local->op_ret, +				  local->op_errno, local->fd, NULL);          return 0;  } -int32_t -afr_opendir (call_frame_t *frame, xlator_t *this, -             loc_t *loc, fd_t *fd) +int +afr_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd)  {          afr_private_t * priv        = NULL;          afr_local_t   * local       = NULL; -        int             child_count = 0;          int             i           = 0; -        int             ret         = -1;          int             call_count  = -1; -        int32_t         op_errno    = 0; - -        VALIDATE_OR_GOTO (frame, out); -        VALIDATE_OR_GOTO (this, out); -        VALIDATE_OR_GOTO (this->private, out); +        int32_t         op_errno    = ENOMEM; +	afr_fd_ctx_t *fd_ctx = NULL;          priv = this->private; -        child_count = priv->child_count; +        local = AFR_FRAME_INIT (frame, op_errno); +	if (!local) +		goto out; -        AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); -        local = frame->local; - -        ret = afr_local_init (local, priv, &op_errno); -        if (ret < 0) -                goto out; +	fd_ctx = afr_fd_ctx_get (fd, this); +	if (!fd_ctx) +		goto out;          loc_copy (&local->loc, loc);          local->fd    = fd_ref (fd); +	local->fd_ctx = fd_ctx;          call_count = local->call_count; -        for (i = 0; i < child_count; 
i++) { +        for (i = 0; i < priv->child_count; i++) {                  if (local->child_up[i]) {                          STACK_WIND_COOKIE (frame, afr_opendir_cbk,                                             (void*) (long) i, @@ -333,182 +117,280 @@ afr_opendir (call_frame_t *frame, xlator_t *this,                  }          } -        ret = 0; +	return 0;  out: -        if (ret < 0) -                AFR_STACK_UNWIND (opendir, frame, -1, op_errno, fd, NULL); - +	AFR_STACK_UNWIND (opendir, frame, -1, op_errno, fd, NULL);          return 0;  } -/** - * Common algorithm for directory read calls: - * - * - Try the fop on the first child that is up - * - if we have failed due to ENOTCONN: - *     try the next child - * - * Applicable to: readdir - */ +#define BACKEND_D_OFF_BITS 63 +#define PRESENT_D_OFF_BITS 63 +#define ONE 1ULL +#define MASK (~0ULL) +#define PRESENT_MASK (MASK >> (64 - PRESENT_D_OFF_BITS)) +#define BACKEND_MASK (MASK >> (64 - BACKEND_D_OFF_BITS)) -struct entry_name { -        char *name; -        struct list_head list; -}; +#define TOP_BIT (ONE << (PRESENT_D_OFF_BITS - 1)) +#define SHIFT_BITS (max (0, (BACKEND_D_OFF_BITS - PRESENT_D_OFF_BITS + 1))) -static void -afr_forget_entries (fd_t *fd) +static uint64_t +afr_bits_for (uint64_t num)  { -        struct entry_name *entry  = NULL; -        struct entry_name *tmp    = NULL; -        int                ret    = 0; -        uint64_t           ctx    = 0; -        afr_fd_ctx_t      *fd_ctx = NULL; - -        ret = fd_ctx_get (fd, THIS, &ctx); -        if (ret < 0) { -                gf_log (THIS->name, GF_LOG_INFO, -                        "could not get fd ctx for fd=%p", fd); -                return; -        } +	uint64_t bits = 0, ctrl = 1; -        fd_ctx = (afr_fd_ctx_t *)(long) ctx; +	while (ctrl < num) { +		ctrl *= 2; +		bits ++; +	} -        list_for_each_entry_safe (entry, tmp, &fd_ctx->entries, list) { -                GF_FREE (entry->name); -                list_del (&entry->list); -      
          GF_FREE (entry); -        } +	return bits;  } -static void -afr_readdir_filter_trash_dir (gf_dirent_t *entries, fd_t *fd) +int +afr_itransform (xlator_t *this, int subvol, uint64_t x, uint64_t *y_p)  { -        gf_dirent_t *   entry       = NULL; -        gf_dirent_t *   tmp         = NULL; +        afr_private_t *conf = NULL; +        int         cnt = 0; +        int         max = 0; +        uint64_t    y = 0; +        uint64_t    hi_mask = 0; +        uint64_t    off_mask = 0; +        int         max_bits = 0; + +        if (x == ((uint64_t) -1)) { +                y = (uint64_t) -1; +                goto out; +        } -        list_for_each_entry_safe (entry, tmp, &entries->list, list) { -                if (__is_root_gfid (fd->inode->gfid) && -                    !strcmp (entry->d_name, GF_REPLICATE_TRASH_DIR)) { -                        list_del_init (&entry->list); -                        GF_FREE (entry); -                } +        conf = this->private; +        if (!conf) +                goto out; + +        max = conf->child_count; +        cnt = subvol; + +	if (max == 1) { +		y = x; +		goto out; +	} + +        max_bits = afr_bits_for (max); + +        hi_mask = ~(PRESENT_MASK >> (max_bits + 1)); + +        if (x & hi_mask) { +                /* HUGE d_off */ +                off_mask = MASK << max_bits; +                y = TOP_BIT | ((x >> SHIFT_BITS) & off_mask) | cnt; +        } else { +                /* small d_off */ +                y = ((x * max) + cnt);          } + +out: +        if (y_p) +                *y_p = y; + +        return 0;  } -int32_t -afr_readdir_cbk (call_frame_t *frame, void *cookie, -                 xlator_t *this, int32_t op_ret, int32_t op_errno, -                 gf_dirent_t *entries, dict_t *xdata) + +int +afr_deitransform (xlator_t *this, uint64_t y, int *subvol_p, +                  uint64_t *x_p)  { -        afr_local_t     *local = NULL; +        afr_private_t *conf = NULL; +        int         cnt = 0; 
+        int         max = 0; +        uint64_t    x = 0; +        int         subvol = 0; +        int         max_bits = 0; +        uint64_t    off_mask = 0; +        uint64_t    host_mask = 0; + +        if (!this->private) +                return -1; + +        conf = this->private; +        max = conf->child_count; + +	if (max == 1) { +		x = y; +		cnt = 0; +		goto out; +	} + +        if (y & TOP_BIT) { +                /* HUGE d_off */ +                max_bits = afr_bits_for (max); +                off_mask = (MASK << max_bits); +                host_mask = ~(off_mask); + +                x = ((y & ~TOP_BIT) & off_mask) << SHIFT_BITS; + +                cnt = y & host_mask; +	} else { +                /* small d_off */ +                cnt = y % max; +                x = y / max; +        } -        if (op_ret == -1) -                goto out; +out: +        subvol = cnt; -        local = frame->local; -        afr_readdir_filter_trash_dir (entries, local->fd); +        if (subvol_p) +                *subvol_p = subvol; + +        if (x_p) +                *x_p = x; -out: -        AFR_STACK_UNWIND (readdir, frame, op_ret, op_errno, entries, NULL);          return 0;  } -int32_t -afr_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, -                  int32_t op_ret, int32_t op_errno, gf_dirent_t *entries, -                  dict_t *xdata) +static void +afr_readdir_transform_entries (gf_dirent_t *subvol_entries, int subvol, +			       gf_dirent_t *entries, fd_t *fd)  { -        afr_local_t     *local = NULL; +	afr_private_t *priv = NULL; +        gf_dirent_t *entry = NULL; +        gf_dirent_t *tmp = NULL; +	unsigned char *data_readable = NULL; +	unsigned char *metadata_readable = NULL; +	int gen = 0; -        if (op_ret == -1) -                goto out; +	priv = THIS->private; -        local = frame->local; -        afr_readdir_filter_trash_dir (entries, local->fd); +	data_readable = alloca0 (priv->child_count); +	metadata_readable = alloca0 
(priv->child_count); -out: -        AFR_STACK_UNWIND (readdirp, frame, op_ret, op_errno, entries, NULL); -        return 0; +        list_for_each_entry_safe (entry, tmp, &subvol_entries->list, list) { +                if (__is_root_gfid (fd->inode->gfid) && +                    !strcmp (entry->d_name, GF_REPLICATE_TRASH_DIR)) { +			continue; +                } + +		list_del_init (&entry->list); +		afr_itransform (THIS, subvol, entry->d_off, &entry->d_off); +		list_add_tail (&entry->list, &entries->list); + +		if (entry->inode) { +			gen = 0; +			afr_inode_read_subvol_get (entry->inode, THIS, +						   data_readable, +						   metadata_readable, &gen); + +			if (gen != priv->event_generation || +				!data_readable[subvol] || +				!metadata_readable[subvol]) { + +				inode_unref (entry->inode); +				entry->inode = NULL; +			} +		} +        }  } +  int32_t -afr_do_readdir (call_frame_t *frame, xlator_t *this, -                fd_t *fd, size_t size, off_t offset, int whichop, dict_t *dict) +afr_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +		 int32_t op_ret, int32_t op_errno, gf_dirent_t *subvol_entries, +		 dict_t *xdata)  { -        afr_private_t *priv      = NULL; -        xlator_t      **children = NULL; -        int           call_child = 0; -        afr_local_t   *local     = NULL; -        afr_fd_ctx_t  *fd_ctx    = NULL; -        int           ret        = -1; -        int32_t       op_errno   = 0; -        uint64_t      read_child = 0; - -        VALIDATE_OR_GOTO (frame, out); -        VALIDATE_OR_GOTO (this, out); -        VALIDATE_OR_GOTO (this->private, out); +        afr_local_t *local = NULL; +	gf_dirent_t  entries; -        priv     = this->private; -        children = priv->children; +	INIT_LIST_HEAD (&entries.list); -        AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);          local = frame->local; -        ret = afr_local_init (local, priv, &op_errno); -        if (ret < 0) -                goto out; +        if (op_ret < 0 && 
!local->cont.readdir.offset) { +		/* failover only if this was first readdir, detected +		   by offset == 0 */ +		local->op_ret = op_ret; +		local->op_errno = op_errno; -        local->fresh_children = afr_children_create (priv->child_count); -        if (!local->fresh_children) { -                op_errno = ENOMEM; -                goto out; -        } +		afr_read_txn_continue (frame, this, (long) cookie); +		return 0; +	} -        read_child = afr_inode_get_read_ctx (this, fd->inode, -                                             local->fresh_children); -        ret = afr_get_call_child (this, local->child_up, read_child, -                                  local->fresh_children, -                                  &call_child, -                                  &local->cont.readdir.last_index); -        if (ret < 0) { -                op_errno = -ret; -                goto out; -        } +	if (op_ret >= 0) +		afr_readdir_transform_entries (subvol_entries, (long) cookie, +					       &entries, local->fd); -        fd_ctx  = afr_fd_ctx_get (fd, this); -        if (!fd_ctx) { -                op_errno = EBADF; -                goto out; -        } +        AFR_STACK_UNWIND (readdir, frame, op_ret, op_errno, &entries, xdata); -        if ((offset == 0) || (fd_ctx->call_child == -1)) { -                fd_ctx->call_child = call_child; -        } else if ((priv->readdir_failover == _gf_false) && -                   (call_child != fd_ctx->call_child)) { -                op_errno = EBADF; -                goto out; -        } +        return 0; +} + + +int +afr_readdir_wind (call_frame_t *frame, xlator_t *this, int subvol) +{ +	afr_local_t *local = NULL; +	afr_private_t *priv = NULL; + +	priv = this->private; +	local = frame->local; -        local->fd                  = fd_ref (fd); -        local->cont.readdir.size   = size; -        local->cont.readdir.dict   = (dict)? 
dict_ref (dict) : NULL; +	if (subvol == -1) { +		AFR_STACK_UNWIND (readdir, frame, local->op_ret, +				  local->op_errno, 0, 0); +		return 0; +	} -        if (whichop == GF_FOP_READDIR) +        if (local->op == GF_FOP_READDIR)                  STACK_WIND_COOKIE (frame, afr_readdir_cbk, -                                   (void *) (long) call_child, -                                   children[call_child], -                                   children[call_child]->fops->readdir, fd, -                                   size, offset, dict); +                                   (void *) (long) subvol, +                                   priv->children[subvol], +                                   priv->children[subvol]->fops->readdir, +				   local->fd, local->cont.readdir.size, +                                   local->cont.readdir.offset, +				   local->xdata_req);          else -                STACK_WIND_COOKIE (frame, afr_readdirp_cbk, -                                   (void *) (long) call_child, -                                   children[call_child], -                                   children[call_child]->fops->readdirp, fd, -                                   size, offset, dict); +                STACK_WIND_COOKIE (frame, afr_readdir_cbk, +                                   (void *) (long) subvol, +                                   priv->children[subvol], +                                   priv->children[subvol]->fops->readdirp, +				   local->fd, local->cont.readdir.size, +				   local->cont.readdir.offset, +				   local->xdata_req); +	return 0; +} + + +int +afr_do_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, +		off_t offset, int whichop, dict_t *dict) +{ +        afr_local_t   *local     = NULL; +        int32_t       op_errno   = 0; +	int           subvol = -1; + +	local = AFR_FRAME_INIT (frame, op_errno); +	if (!local) +		goto out; + +	local->op = whichop; +        local->fd = fd_ref (fd); +        local->cont.readdir.size = 
size; +	local->cont.readdir.offset = offset; +        local->xdata_req = (dict)? dict_ref (dict) : NULL; + +	if (offset == 0) { +		/* First readdir has option of failing over and selecting +		   an appropriate read subvolume */ +		afr_read_txn (frame, this, fd->inode, afr_readdir_wind, +			      AFR_DATA_TRANSACTION); +	} else { +		/* But continued readdirs MUST stick to the same subvolume +		   without an option to failover */ +		afr_deitransform (this, offset, &subvol, +				  (uint64_t *)&local->cont.readdir.offset); +		afr_readdir_wind (frame, this, subvol); +	}          return 0;  out: @@ -521,7 +403,8 @@ int32_t  afr_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,               off_t offset, dict_t *xdata)  { -        afr_do_readdir (frame, this, fd, size, offset, GF_FOP_READDIR, xdata); +	afr_do_readdir (frame, this, fd, size, offset, GF_FOP_READDIR, xdata); +          return 0;  } @@ -531,6 +414,7 @@ afr_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,                off_t offset, dict_t *dict)  {          afr_do_readdir (frame, this, fd, size, offset, GF_FOP_READDIRP, dict); +          return 0;  } @@ -538,7 +422,6 @@ afr_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,  int32_t  afr_releasedir (xlator_t *this, fd_t *fd)  { -        afr_forget_entries (fd);          afr_cleanup_fd_ctx (this, fd);          return 0; diff --git a/xlators/cluster/afr/src/afr-dir-write.c b/xlators/cluster/afr/src/afr-dir-write.c index 1943b719bb5..465dde54f9c 100644 --- a/xlators/cluster/afr/src/afr-dir-write.c +++ b/xlators/cluster/afr/src/afr-dir-write.c @@ -34,10 +34,14 @@  #include "common-utils.h"  #include "compat-errno.h"  #include "compat.h" +#include "byte-order.h"  #include "afr.h"  #include "afr-transaction.h" +void +afr_mark_entry_pending_changelog (call_frame_t *frame, xlator_t *this); +  int  afr_build_parent_loc (loc_t *parent, loc_t *child, int32_t *op_errno)  { @@ -56,79 +60,214 @@ 
afr_build_parent_loc (loc_t *parent, loc_t *child, int32_t *op_errno)                          *op_errno = ENOMEM;                  goto out;          } -        parent->path = gf_strdup( dirname (child_path) ); -	if (!parent->path) { + +        parent->path = gf_strdup (dirname (child_path)); +        if (!parent->path) {                  if (op_errno)                          *op_errno = ENOMEM;                  goto out;          } -        parent->inode  = inode_ref (child->parent); -        uuid_copy (parent->gfid, child->pargfid); + +        parent->inode = inode_ref (child->parent); +	uuid_copy (parent->gfid, child->pargfid);          ret = 0;  out: -	GF_FREE(child_path); +        GF_FREE (child_path);          return ret;  } -void -__dir_entry_fop_common_cbk (call_frame_t *frame, int child_index, -                            xlator_t *this, int32_t op_ret, -                            int32_t op_errno, inode_t *inode, -                            struct iatt *buf, struct iatt *preparent, -                            struct iatt *postparent, struct iatt *prenewparent, -                            struct iatt *postnewparent) + +static void +__afr_dir_write_finalize (call_frame_t *frame, xlator_t *this)  { -        afr_local_t     *local          = NULL; +	afr_local_t *local = NULL; +	afr_private_t *priv = NULL; +	int inode_read_subvol = -1; +	int parent_read_subvol = -1; +	int parent2_read_subvol = -1; +	int i = 0; + +	local = frame->local; +	priv = this->private; + +	if (local->inode) { +		afr_replies_interpret (frame, this, local->inode); +		inode_read_subvol = afr_data_subvol_get (local->inode, this, +							 NULL, NULL); +	} +	if (local->parent) +		parent_read_subvol = afr_data_subvol_get (local->parent, this, +							  NULL, NULL); +	if (local->parent2) +		parent2_read_subvol = afr_data_subvol_get (local->parent2, this, +							   NULL, NULL); + +	local->op_ret = -1; +	local->op_errno = afr_final_errno (local, priv); + +	for (i = 0; i < 
priv->child_count; i++) { +		if (!local->replies[i].valid) +			continue; +		if (local->replies[i].op_ret < 0) { +			if (local->inode) +				afr_inode_read_subvol_reset (local->inode, +							     this); +			if (local->parent) +				afr_inode_read_subvol_reset (local->parent, +							     this); +			if (local->parent2) +				afr_inode_read_subvol_reset (local->parent2, +							     this); +			continue; +		} + +		if (local->op_ret == -1) { +			local->op_ret = local->replies[i].op_ret; +			local->op_errno = local->replies[i].op_errno; + +			local->cont.dir_fop.buf = +				local->replies[i].poststat; +			local->cont.dir_fop.preparent = +				local->replies[i].preparent; +			local->cont.dir_fop.postparent = +				local->replies[i].postparent; +			local->cont.dir_fop.prenewparent = +				local->replies[i].preparent2; +			local->cont.dir_fop.postnewparent = +				local->replies[i].postparent2; +			if (local->replies[i].xdata) +				local->xdata_rsp = +					dict_ref (local->replies[i].xdata); +			continue; +		} + +		if (i == inode_read_subvol) { +			local->cont.dir_fop.buf = +				local->replies[i].poststat; +			if (local->replies[i].xdata) { +				if (local->xdata_rsp) +					dict_unref (local->xdata_rsp); +				local->xdata_rsp = +					dict_ref (local->replies[i].xdata); +			} +		} + +		if (i == parent_read_subvol) { +			local->cont.dir_fop.preparent = +				local->replies[i].preparent; +			local->cont.dir_fop.postparent = +				local->replies[i].postparent; +		} + +		if (i == parent2_read_subvol) { +			local->cont.dir_fop.prenewparent = +				local->replies[i].preparent2; +			local->cont.dir_fop.postnewparent = +				local->replies[i].postparent2; +		} +	} +} + + +static void +__afr_dir_write_fill (call_frame_t *frame, xlator_t *this, int child_index, +		      int op_ret, int op_errno, struct iatt *poststat, +		      struct iatt *preparent, struct iatt *postparent, +		      struct iatt *preparent2, struct iatt *postparent2, +		      dict_t *xdata) +{ +        afr_local_t *local = NULL; 
+	afr_fd_ctx_t *fd_ctx = NULL;          local = frame->local; +	fd_ctx = local->fd_ctx; + +	local->replies[child_index].valid = 1; +	local->replies[child_index].op_ret = op_ret; +	local->replies[child_index].op_errno = op_errno; + +	if (op_ret >= 0) { +		if (poststat) +			local->replies[child_index].poststat = *poststat; +		if (preparent) +			local->replies[child_index].preparent = *preparent; +		if (postparent) +			local->replies[child_index].postparent = *postparent; +		if (preparent2) +			local->replies[child_index].preparent2 = *preparent2; +		if (postparent2) +			local->replies[child_index].postparent2 = *postparent2; +		if (xdata) +			local->replies[child_index].xdata = dict_ref (xdata); + +		if (fd_ctx) +			fd_ctx->opened_on[child_index] = AFR_FD_OPENED; +	} else { +		if (op_errno != ENOTEMPTY) +			afr_transaction_fop_failed (frame, this, child_index); +		if (fd_ctx) +			fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED; +	} + +        return; +} + + +static int +__afr_dir_write_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +		     int op_ret, int op_errno, struct iatt *buf, +		     struct iatt *preparent, struct iatt *postparent, +		     struct iatt *preparent2, struct iatt *postparent2, +                     dict_t *xdata) +{ +        afr_local_t *local = NULL; +        int child_index = (long) cookie; +        int call_count = -1; + +        local = frame->local; + +	LOCK (&frame->lock); +	{ +		__afr_dir_write_fill (frame, this, child_index, op_ret, +				      op_errno, buf, preparent, postparent, +				      preparent2, postparent2, xdata); +	} +	UNLOCK (&frame->lock); +        call_count = afr_frame_return (frame); + +        if (call_count == 0) { +		__afr_dir_write_finalize (frame, this); + +		if (afr_txn_nothing_failed (frame, this)) +			local->transaction.unwind (frame, this); -        if (afr_fop_failed (op_ret, op_errno)) -                afr_transaction_fop_failed (frame, this, child_index); - -        if (op_ret > -1) { -               
 local->op_ret = op_ret; - -                if ((local->success_count == 0) || -                    (child_index == local->read_child_index)) { -                        local->cont.dir_fop.preparent      = *preparent; -                        local->cont.dir_fop.postparent     = *postparent; -                        if (buf) -                                local->cont.dir_fop.buf            = *buf; -                        if (prenewparent) -                             local->cont.dir_fop.prenewparent  = *prenewparent; -                        if (postnewparent) -                             local->cont.dir_fop.postnewparent = *postnewparent; -                } - -                local->cont.dir_fop.inode = inode; - -                local->fresh_children[local->success_count] = child_index; -                local->success_count++; -                local->child_errno[child_index] = 0; -        } else { -                local->child_errno[child_index] = op_errno; +		afr_mark_entry_pending_changelog (frame, this); + +                local->transaction.resume (frame, this);          } -        local->op_errno = op_errno; +        return 0;  } +  int  afr_mark_new_entry_changelog_cbk (call_frame_t *frame, void *cookie, -                                  xlator_t *this, -                                  int32_t op_ret, int32_t op_errno, +				  xlator_t *this, int op_ret, int op_errno,                                    dict_t *xattr, dict_t *xdata)  { -        int     call_count = 0; +        int call_count = 0;          call_count = afr_frame_return (frame); -        if (call_count == 0) { + +        if (call_count == 0)                  AFR_STACK_DESTROY (frame); -        } +          return 0;  } +  void  afr_mark_new_entry_changelog (call_frame_t *frame, xlator_t *this)  { @@ -136,125 +275,109 @@ afr_mark_new_entry_changelog (call_frame_t *frame, xlator_t *this)          afr_local_t   *local      = NULL;          afr_local_t   *new_local  = NULL;          
afr_private_t *priv       = NULL; -        dict_t        **xattr     = NULL; +        dict_t        *xattr      = NULL;          int32_t       **changelog = NULL;          int           i           = 0; -        GF_UNUSED int op_errno    = 0; +	int           idx         = 0; +        int           op_errno    = ENOMEM; +	unsigned char *pending    = NULL; +	int           call_count   = 0;          local = frame->local;          priv = this->private;          new_frame = copy_frame (frame); -        if (!new_frame) { +        if (!new_frame)                  goto out; -        } -        AFR_LOCAL_ALLOC_OR_GOTO (new_frame->local, out); -        new_local = new_frame->local; +	new_local = AFR_FRAME_INIT (new_frame, op_errno); +	if (!new_local) +		goto out; +          changelog = afr_matrix_create (priv->child_count, AFR_NUM_CHANGE_LOGS);          if (!changelog)                  goto out; -        xattr = GF_CALLOC (priv->child_count, sizeof (*xattr), -                           gf_afr_mt_dict_t); -        if (!xattr) -                goto out; -        for (i = 0; i < priv->child_count; i++) { -                if (local->child_errno[i]) -                        continue; -                xattr[i] = dict_new (); -                if (!xattr[i]) -                        goto out; -        } +        xattr = dict_new (); +	if (!xattr) +		goto out; + +	idx = afr_index_for_transaction_type (AFR_DATA_TRANSACTION); -        afr_prepare_new_entry_pending_matrix (changelog, -                                              afr_is_errno_set, -                                              local->child_errno, -                                              &local->cont.dir_fop.buf, -                                              priv->child_count); +	pending = alloca0 (priv->child_count); + +	for (i = 0; i < priv->child_count; i++) { +		if (local->transaction.pre_op[i] && +		    !local->transaction.failed_subvols[i]) { +			call_count ++; +			continue; +		} + +		changelog[i][idx] = 
hton32(1); +		pending[i] = 1; +	}          new_local->pending = changelog;          uuid_copy (new_local->loc.gfid, local->cont.dir_fop.buf.ia_gfid); -        new_local->loc.inode = inode_ref (local->cont.dir_fop.inode); -        new_local->call_count = local->success_count; +        new_local->loc.inode = inode_ref (local->inode); + + +	afr_set_pending_dict (priv, xattr, changelog); + +        new_local->call_count = call_count;          for (i = 0; i < priv->child_count; i++) { -                if (local->child_errno[i]) +		if (pending[i])                          continue; -                afr_set_pending_dict (priv, xattr[i], changelog, i, LOCAL_LAST);                  STACK_WIND_COOKIE (new_frame, afr_mark_new_entry_changelog_cbk,                                     (void *) (long) i, priv->children[i],                                     priv->children[i]->fops->xattrop,                                     &new_local->loc, GF_XATTROP_ADD_ARRAY, -                                   xattr[i], NULL); +                                   xattr, NULL); +		if (!--call_count) +			break;          } +          new_frame = NULL;  out:          if (new_frame)                  AFR_STACK_DESTROY (new_frame); -        afr_xattr_array_destroy (xattr, priv->child_count); +	if (xattr) +		dict_unref (xattr);          return;  } -gf_boolean_t -afr_is_new_entry_changelog_needed (glusterfs_fop_t fop) -{ -        glusterfs_fop_t fops[]   = {GF_FOP_CREATE, GF_FOP_MKNOD, GF_FOP_NULL}; -        int             i        = 0; - -        for (i = 0; fops[i] != GF_FOP_NULL; i++) { -                if (fop == fops[i]) -                        return _gf_true; -        } -        return _gf_false; -}  void -afr_dir_fop_mark_entry_pending_changelog (call_frame_t *frame, xlator_t *this) +afr_mark_entry_pending_changelog (call_frame_t *frame, xlator_t *this)  { -        afr_local_t   *local      = NULL; -        afr_private_t *priv       = NULL; +        afr_local_t *local = NULL; +        
afr_private_t *priv = NULL; +	int pre_op_count = 0; +	int failed_count = 0;          local = frame->local;          priv  = this->private;          if (local->op_ret < 0) -                goto out; +		return; -        if (local->success_count == priv->child_count) -                goto out; +	if (local->op != GF_FOP_CREATE && local->op != GF_FOP_MKNOD) +		return; -        if (!afr_is_new_entry_changelog_needed (local->op)) -                goto out; +	pre_op_count = AFR_COUNT (local->transaction.pre_op, priv->child_count); +	failed_count = AFR_COUNT (local->transaction.failed_subvols, +				  priv->child_count); + +	if (pre_op_count == priv->child_count && !failed_count) +		return;          afr_mark_new_entry_changelog (frame, this); -out:          return;  } -void -afr_dir_fop_done (call_frame_t *frame, xlator_t *this) -{ -        afr_local_t     *local          = NULL; -        afr_private_t   *priv           = NULL; - -        local = frame->local; -        priv  = this->private; - -        if (local->cont.dir_fop.inode == NULL) -                goto done; -        afr_set_read_ctx_from_policy (this, local->cont.dir_fop.inode, -                                      local->fresh_children, -                                      local->read_child_index, -                                      priv->read_child, -                                      local->cont.dir_fop.buf.ia_gfid); -done: -        local->transaction.unwind (frame, this); -        afr_dir_fop_mark_entry_pending_changelog (frame, this); -        local->transaction.resume (frame, this); -}  /* {{{ create */ @@ -266,26 +389,16 @@ afr_create_unwind (call_frame_t *frame, xlator_t *this)          local = frame->local; -        LOCK (&frame->lock); -        { -                if (local->transaction.main_frame) { -                        main_frame = local->transaction.main_frame; -                } -                local->transaction.main_frame = NULL; -        } -        UNLOCK (&frame->lock); - -        if 
(main_frame) { -                AFR_STACK_UNWIND (create, main_frame, -                                  local->op_ret, local->op_errno, -                                  local->cont.create.fd, -                                  local->cont.dir_fop.inode, -                                  &local->cont.dir_fop.buf, -                                  &local->cont.dir_fop.preparent, -                                  &local->cont.dir_fop.postparent, -                                  local->xdata_rsp); -        } +	main_frame = afr_transaction_detach_fop_frame (frame); +        if (!main_frame) +		return 0; + +	AFR_STACK_UNWIND (create, main_frame, local->op_ret, local->op_errno, +			  local->cont.create.fd, local->inode, +			  &local->cont.dir_fop.buf, +			  &local->cont.dir_fop.preparent, +			  &local->cont.dir_fop.postparent, local->xdata_rsp);          return 0;  } @@ -297,175 +410,79 @@ afr_create_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                       struct iatt *preparent, struct iatt *postparent,                       dict_t *xdata)  { -        afr_local_t     *local = NULL; -        uint64_t        ctx = 0; -        afr_fd_ctx_t    *fd_ctx = NULL; -        int             ret = 0; -        int             call_count = -1; -        int             child_index = -1; - -        local = frame->local; - -        child_index = (long) cookie; - -        LOCK (&frame->lock); -        { -                if (op_ret > -1) { -                        ret = afr_fd_ctx_set (this, fd); -                        if (ret < 0) { -                                gf_log (this->name, GF_LOG_ERROR, -                                        "could not set ctx on fd=%p", fd); - -                                local->op_ret   = -1; -                                local->op_errno = -ret; -                                goto unlock; -                        } - -                        ret = fd_ctx_get (fd, this, &ctx); -                        if (ret < 
0) { -                                gf_log (this->name, GF_LOG_ERROR, -                                        "could not get fd ctx for fd=%p", fd); -                                local->op_ret   = -1; -                                local->op_errno = -ret; -                                goto unlock; -                        } - -                        fd_ctx = (afr_fd_ctx_t *)(long) ctx; - -                        fd_ctx->opened_on[child_index] = AFR_FD_OPENED; -                        fd_ctx->flags                  = local->cont.create.flags; - -                        if (local->success_count == 0) { -				if (xdata) -					local->xdata_rsp = dict_ref(xdata); -			} -                } -                __dir_entry_fop_common_cbk (frame, child_index, this, -                                            op_ret, op_errno, inode, buf, -                                            preparent, postparent, NULL, NULL); -        } - -unlock: -        UNLOCK (&frame->lock); - -        call_count = afr_frame_return (frame); - -        if (call_count == 0) -                afr_dir_fop_done (frame, this); - -        return 0; +        return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, buf, +				    preparent, postparent, NULL, NULL, xdata);  }  int -afr_create_wind (call_frame_t *frame, xlator_t *this) +afr_create_wind (call_frame_t *frame, xlator_t *this, int subvol)  {          afr_local_t *local = NULL;          afr_private_t *priv = NULL; -        int call_count = -1; -        int i = 0;          local = frame->local;          priv = this->private; -        call_count = afr_pre_op_done_children_count (local->transaction.pre_op, -                                                     priv->child_count); - -        if (call_count == 0) { -                local->transaction.resume (frame, this); -                return 0; -        } - -        local->call_count = call_count; - -        for (i = 0; i < priv->child_count; i++) { -                if 
(local->transaction.pre_op[i]) { -                        STACK_WIND_COOKIE (frame, afr_create_wind_cbk, -                                           (void *) (long) i, -                                           priv->children[i], -                                           priv->children[i]->fops->create, -                                           &local->loc, -                                           local->cont.create.flags, -                                           local->cont.create.mode, -                                           local->umask, -                                           local->cont.create.fd, -                                           local->xdata_req); -                        if (!--call_count) -                                break; -                } -        } - -        return 0; -} - - -int -afr_create_done (call_frame_t *frame, xlator_t *this) -{ -        afr_local_t * local = NULL; - -        local = frame->local; - -        local->transaction.unwind (frame, this); - -        AFR_STACK_DESTROY (frame); - +	STACK_WIND_COOKIE (frame, afr_create_wind_cbk, (void *) (long) subvol, +			   priv->children[subvol], +			   priv->children[subvol]->fops->create, +			   &local->loc, local->cont.create.flags, +			   local->cont.create.mode, local->umask, +			   local->cont.create.fd, local->xdata_req);          return 0;  }  int -afr_create (call_frame_t *frame, xlator_t *this, -            loc_t *loc, int32_t flags, mode_t mode, -            mode_t umask, fd_t *fd, dict_t *params) +afr_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, +	    mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata)  {          afr_private_t           *priv                   = NULL;          afr_local_t             *local                  = NULL;          afr_internal_lock_t     *int_lock               = NULL;          call_frame_t            *transaction_frame      = NULL;          int                     ret                     = -1; -      
  int                     op_errno                = 0; - -        VALIDATE_OR_GOTO (frame, out); -        VALIDATE_OR_GOTO (this, out); -        VALIDATE_OR_GOTO (this->private, out); +        int                     op_errno                = ENOMEM;          priv = this->private;          QUORUM_CHECK(create,out);          transaction_frame = copy_frame (frame); -        if (!transaction_frame) { -                op_errno = ENOMEM; +        if (!transaction_frame)                  goto out; -        } -        AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); -        local = transaction_frame->local; - -        ret = afr_local_init (local, priv, &op_errno); -        if (ret < 0) -                goto out; +	local = AFR_FRAME_INIT (transaction_frame, op_errno); +	if (!local) +		goto out;          loc_copy (&local->loc, loc); -        LOCK (&priv->read_child_lock); -        { -                local->read_child_index = (++priv->read_child_rr) -                        % (priv->child_count); -        } -        UNLOCK (&priv->read_child_lock); +	local->fd_ctx = afr_fd_ctx_get (fd, this); +	if (!local->fd_ctx) +		goto out; + +	local->inode = inode_ref (loc->inode); +	local->parent = inode_ref (loc->parent);          local->op                = GF_FOP_CREATE;          local->cont.create.flags = flags;          local->cont.create.mode  = mode;          local->cont.create.fd    = fd_ref (fd);          local->umask  = umask; -        if (params) -                local->xdata_req = dict_ref (params); -        local->transaction.fop    = afr_create_wind; -        local->transaction.done   = afr_create_done; +        if (xdata) +                local->xdata_req = dict_copy_with_ref (xdata, NULL); +	else +		local->xdata_req = dict_new (); + +	if (!local->xdata_req) +		goto out; + +        local->transaction.wind   = afr_create_wind; +        local->transaction.fop    = __afr_txn_write_fop; +        local->transaction.done   = __afr_txn_write_done;          
local->transaction.unwind = afr_create_unwind;          ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, @@ -492,15 +509,13 @@ afr_create (call_frame_t *frame, xlator_t *this,              goto out;          } -        ret = 0; +	return 0;  out: -        if (ret < 0) { -                if (transaction_frame) -                        AFR_STACK_DESTROY (transaction_frame); -                AFR_STACK_UNWIND (create, frame, -1, op_errno, -                                  NULL, NULL, NULL, NULL, NULL, NULL); -        } +	if (transaction_frame) +		AFR_STACK_DESTROY (transaction_frame); +	AFR_STACK_UNWIND (create, frame, -1, op_errno, NULL, NULL, NULL, NULL, +			  NULL, NULL);          return 0;  } @@ -516,25 +531,14 @@ afr_mknod_unwind (call_frame_t *frame, xlator_t *this)          local = frame->local; -        LOCK (&frame->lock); -        { -                if (local->transaction.main_frame) { -                        main_frame = local->transaction.main_frame; -                } -                local->transaction.main_frame = NULL; -        } -        UNLOCK (&frame->lock); - -        if (main_frame) { -                AFR_STACK_UNWIND (mknod, main_frame, -                                  local->op_ret, local->op_errno, -                                  local->cont.dir_fop.inode, -                                  &local->cont.dir_fop.buf, -                                  &local->cont.dir_fop.preparent, -                                  &local->cont.dir_fop.postparent, -                                  NULL); -        } +	main_frame = afr_transaction_detach_fop_frame (frame); +        if (!main_frame) +		return 0; +	AFR_STACK_UNWIND (mknod, main_frame, local->op_ret, local->op_errno, +			  local->inode, &local->cont.dir_fop.buf, +			  &local->cont.dir_fop.preparent, +			  &local->cont.dir_fop.postparent, local->xdata_rsp);          return 0;  } @@ -545,131 +549,72 @@ afr_mknod_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,        
              struct iatt *buf, struct iatt *preparent,                      struct iatt *postparent, dict_t *xdata)  { -        int             call_count      = -1; -        int             child_index     = -1; - -        child_index = (long) cookie; - -        LOCK (&frame->lock); -        { -                __dir_entry_fop_common_cbk (frame, child_index, this, -                                            op_ret, op_errno, inode, buf, -                                            preparent, postparent, NULL, NULL); -        } -        UNLOCK (&frame->lock); - -        call_count = afr_frame_return (frame); - -        if (call_count == 0) -                afr_dir_fop_done (frame, this); - -        return 0; +	return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, buf, +				    preparent, postparent, NULL, NULL, xdata);  } -int32_t -afr_mknod_wind (call_frame_t *frame, xlator_t *this) +int +afr_mknod_wind (call_frame_t *frame, xlator_t *this, int subvol)  {          afr_local_t *local = NULL;          afr_private_t *priv = NULL; -        int call_count = -1; -        int i = 0;          local = frame->local;          priv  = this->private; -        call_count = afr_pre_op_done_children_count (local->transaction.pre_op, -                                                     priv->child_count); - -        if (call_count == 0) { -                local->transaction.resume (frame, this); -                return 0; -        } - -        local->call_count = call_count; - -        for (i = 0; i < priv->child_count; i++) { -                if (local->transaction.pre_op[i]) { -                        STACK_WIND_COOKIE (frame, afr_mknod_wind_cbk, (void *) (long) i, -                                           priv->children[i], -                                           priv->children[i]->fops->mknod, -                                           &local->loc, local->cont.mknod.mode, -                                           local->cont.mknod.dev, -                 
                          local->umask, -                                           local->xdata_req); -                        if (!--call_count) -                                break; -                } -        } - -        return 0; -} - - -int -afr_mknod_done (call_frame_t *frame, xlator_t *this) -{ -        afr_local_t * local = NULL; - -        local = frame->local; - -        local->transaction.unwind (frame, this); -        AFR_STACK_DESTROY (frame); - +	STACK_WIND_COOKIE (frame, afr_mknod_wind_cbk, (void *) (long) subvol, +			   priv->children[subvol], +			   priv->children[subvol]->fops->mknod, +			   &local->loc, local->cont.mknod.mode, +			   local->cont.mknod.dev, local->umask, +			   local->xdata_req);          return 0;  } -  int  afr_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, -           dev_t dev, mode_t umask, dict_t *params) +           dev_t dev, mode_t umask, dict_t *xdata)  {          afr_private_t           *priv                   = NULL;          afr_local_t             *local                  = NULL;          afr_internal_lock_t     *int_lock               = NULL;          call_frame_t            *transaction_frame      = NULL;          int                     ret                     = -1; -        int                     op_errno                = 0; - -        VALIDATE_OR_GOTO (frame, out); -        VALIDATE_OR_GOTO (this, out); -        VALIDATE_OR_GOTO (this->private, out); +        int                     op_errno                = ENOMEM;          priv = this->private;          QUORUM_CHECK(mknod,out);          transaction_frame = copy_frame (frame); -        if (!transaction_frame) { -                op_errno = ENOMEM; +        if (!transaction_frame)                  goto out; -        } -        AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); -        local = transaction_frame->local; - -        ret = afr_local_init (local, priv, &op_errno); -        if (ret < 0) -                goto out; +	local 
= AFR_FRAME_INIT (transaction_frame, op_errno); +	if (!local) +		goto out;          loc_copy (&local->loc, loc); - -        LOCK (&priv->read_child_lock); -        { -                local->read_child_index = (++priv->read_child_rr) -                        % (priv->child_count); -        } -        UNLOCK (&priv->read_child_lock); +	local->inode = inode_ref (loc->inode); +	local->parent = inode_ref (loc->parent);          local->op               = GF_FOP_MKNOD;          local->cont.mknod.mode  = mode;          local->cont.mknod.dev   = dev;          local->umask = umask; -        if (params) -                local->xdata_req = dict_ref (params); -        local->transaction.fop    = afr_mknod_wind; -        local->transaction.done   = afr_mknod_done; +        if (xdata) +                local->xdata_req = dict_copy_with_ref (xdata, NULL); +	else +		local->xdata_req = dict_new (); + +	if (!local->xdata_req) +		goto out; + +        local->transaction.wind   = afr_mknod_wind; +        local->transaction.fop    = __afr_txn_write_fop; +        local->transaction.done   = __afr_txn_write_done;          local->transaction.unwind = afr_mknod_unwind;          ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, @@ -692,19 +637,17 @@ afr_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,          int_lock->lockee_count++;          ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);          if (ret < 0) { -            op_errno = -ret; -            goto out; +		op_errno = -ret; +		goto out;          } -        ret = 0; +	return 0;  out: -        if (ret < 0) { -                if (transaction_frame) -                        AFR_STACK_DESTROY (transaction_frame); -                AFR_STACK_UNWIND (mknod, frame, -1, op_errno, -                                  NULL, NULL, NULL, NULL, NULL); -        } +	if (transaction_frame) +		AFR_STACK_DESTROY (transaction_frame); +	AFR_STACK_UNWIND (mknod, frame, -1, op_errno, NULL, NULL, 
NULL, NULL, +			  NULL);          return 0;  } @@ -721,25 +664,14 @@ afr_mkdir_unwind (call_frame_t *frame, xlator_t *this)          local = frame->local; -        LOCK (&frame->lock); -        { -                if (local->transaction.main_frame) { -                        main_frame = local->transaction.main_frame; -                } -                local->transaction.main_frame = NULL; -        } -        UNLOCK (&frame->lock); - -        if (main_frame) { -                AFR_STACK_UNWIND (mkdir, main_frame, -                                  local->op_ret, local->op_errno, -                                  local->cont.dir_fop.inode, -                                  &local->cont.dir_fop.buf, -                                  &local->cont.dir_fop.preparent, -                                  &local->cont.dir_fop.postparent, -                                  NULL); -        } +	main_frame = afr_transaction_detach_fop_frame (frame); +        if (!main_frame) +		return 0; +	AFR_STACK_UNWIND (mkdir, main_frame, local->op_ret, local->op_errno, +			  local->inode, &local->cont.dir_fop.buf, +			  &local->cont.dir_fop.preparent, +			  &local->cont.dir_fop.postparent, local->xdata_rsp);          return 0;  } @@ -750,130 +682,71 @@ afr_mkdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                      struct iatt *buf, struct iatt *preparent,                      struct iatt *postparent, dict_t *xdata)  { -        int             call_count      = -1; -        int             child_index     = -1; - -        child_index = (long) cookie; - -        LOCK (&frame->lock); -        { -                __dir_entry_fop_common_cbk (frame, child_index, this, -                                            op_ret, op_errno, inode, buf, -                                            preparent, postparent, NULL, NULL); -        } -        UNLOCK (&frame->lock); - -        call_count = afr_frame_return (frame); - -        if (call_count == 0) -                
afr_dir_fop_done (frame, this); - -        return 0; +	return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, buf, +				    preparent, postparent, NULL, NULL, xdata);  }  int -afr_mkdir_wind (call_frame_t *frame, xlator_t *this) +afr_mkdir_wind (call_frame_t *frame, xlator_t *this, int subvol)  {          afr_local_t *local = NULL;          afr_private_t *priv = NULL; -        int call_count = -1; -        int i = 0;          local = frame->local;          priv  = this->private; -        call_count = afr_pre_op_done_children_count (local->transaction.pre_op, -                                                     priv->child_count); - -        if (call_count == 0) { -                local->transaction.resume (frame, this); -                return 0; -        } - -        local->call_count = call_count; - -        for (i = 0; i < priv->child_count; i++) { -                if (local->transaction.pre_op[i]) { -                        STACK_WIND_COOKIE (frame, afr_mkdir_wind_cbk, -                                           (void *) (long) i, -                                           priv->children[i], -                                           priv->children[i]->fops->mkdir, -                                           &local->loc, local->cont.mkdir.mode, -                                           local->umask, -                                           local->xdata_req); -                        if (!--call_count) -                                break; -                } -        } - +	STACK_WIND_COOKIE (frame, afr_mkdir_wind_cbk, (void *) (long) subvol, +			   priv->children[subvol], +			   priv->children[subvol]->fops->mkdir, &local->loc, +			   local->cont.mkdir.mode, local->umask, +			   local->xdata_req);          return 0;  }  int -afr_mkdir_done (call_frame_t *frame, xlator_t *this) -{ -        afr_local_t * local = NULL; - -        local = frame->local; - -        local->transaction.unwind (frame, this); - -        AFR_STACK_DESTROY (frame); - - 
       return 0; -} - -int -afr_mkdir (call_frame_t *frame, xlator_t *this, -           loc_t *loc, mode_t mode, mode_t umask, dict_t *params) +afr_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, +	   mode_t umask, dict_t *xdata)  {          afr_private_t           *priv                   = NULL;          afr_local_t             *local                  = NULL;          afr_internal_lock_t     *int_lock               = NULL;          call_frame_t            *transaction_frame      = NULL;          int                     ret                     = -1; -        int                     op_errno                = 0; - -        VALIDATE_OR_GOTO (frame, out); -        VALIDATE_OR_GOTO (this, out); -        VALIDATE_OR_GOTO (this->private, out); +        int                     op_errno                = ENOMEM;          priv = this->private;          QUORUM_CHECK(mkdir,out);          transaction_frame = copy_frame (frame); -        if (!transaction_frame) { -                op_errno = ENOMEM; +        if (!transaction_frame)                  goto out; -        } -        AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); -        local = transaction_frame->local; - -        ret = afr_local_init (local, priv, &op_errno); -        if (ret < 0) -                goto out; +	local = AFR_FRAME_INIT (transaction_frame, op_errno); +	if (!local) +		goto out;          loc_copy (&local->loc, loc); - -        LOCK (&priv->read_child_lock); -        { -                local->read_child_index = (++priv->read_child_rr) -                        % (priv->child_count); -        } -        UNLOCK (&priv->read_child_lock); +	local->inode = inode_ref (loc->inode); +	local->parent = inode_ref (loc->parent);          local->cont.mkdir.mode  = mode;          local->umask = umask; -        if (params) -                local->xdata_req = dict_ref (params); + +        if (xdata) +                local->xdata_req = dict_copy_with_ref (xdata, NULL); +	else +		local->xdata_req 
= dict_new (); + +	if (!local->xdata_req) +		goto out;          local->op = GF_FOP_MKDIR; -        local->transaction.fop    = afr_mkdir_wind; -        local->transaction.done   = afr_mkdir_done; +        local->transaction.wind   = afr_mkdir_wind; +        local->transaction.fop    = __afr_txn_write_fop; +        local->transaction.done   = __afr_txn_write_done;          local->transaction.unwind = afr_mkdir_unwind;          ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, @@ -896,20 +769,17 @@ afr_mkdir (call_frame_t *frame, xlator_t *this,          int_lock->lockee_count++;          ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);          if (ret < 0) { -            op_errno = -ret; -            goto out; +		op_errno = -ret; +		goto out;          } -        ret = 0; +	return 0;  out: -        if (ret < 0) { -                if (transaction_frame) -                        AFR_STACK_DESTROY (transaction_frame); - -                AFR_STACK_UNWIND (mkdir, frame, -1, op_errno, -                                  NULL, NULL, NULL, NULL, NULL); -        } +	if (transaction_frame) +		AFR_STACK_DESTROY (transaction_frame); +	AFR_STACK_UNWIND (mkdir, frame, -1, op_errno, NULL, NULL, NULL, NULL, +			  NULL);          return 0;  } @@ -926,25 +796,14 @@ afr_link_unwind (call_frame_t *frame, xlator_t *this)          local = frame->local; -        LOCK (&frame->lock); -        { -                if (local->transaction.main_frame) { -                        main_frame = local->transaction.main_frame; -                } -                local->transaction.main_frame = NULL; -        } -        UNLOCK (&frame->lock); - -        if (main_frame) { -                AFR_STACK_UNWIND (link, main_frame, -                                  local->op_ret, local->op_errno, -                                  local->cont.dir_fop.inode, -                                  &local->cont.dir_fop.buf, -                                  
&local->cont.dir_fop.preparent, -                                  &local->cont.dir_fop.postparent, -                                  NULL); -        } +	main_frame = afr_transaction_detach_fop_frame (frame); +        if (!main_frame) +		return 0; +	AFR_STACK_UNWIND (link, main_frame, local->op_ret, local->op_errno, +			  local->inode, &local->cont.dir_fop.buf, +			  &local->cont.dir_fop.preparent, +			  &local->cont.dir_fop.postparent, local->xdata_rsp);          return 0;  } @@ -955,127 +814,70 @@ afr_link_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                     struct iatt *buf, struct iatt *preparent,                     struct iatt *postparent, dict_t *xdata)  { -        int             call_count      = -1; -        int             child_index     = -1; - -        child_index = (long) cookie; - -        LOCK (&frame->lock); -        { -                __dir_entry_fop_common_cbk (frame, child_index, this, -                                            op_ret, op_errno, inode, buf, -                                            preparent, postparent, NULL, NULL); -        } -        UNLOCK (&frame->lock); - -        call_count = afr_frame_return (frame); - -        if (call_count == 0) -                afr_dir_fop_done (frame, this); - -        return 0; +        return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, buf, +				    preparent, postparent, NULL, NULL, xdata);  }  int -afr_link_wind (call_frame_t *frame, xlator_t *this) +afr_link_wind (call_frame_t *frame, xlator_t *this, int subvol)  {          afr_local_t *local = NULL;          afr_private_t *priv = NULL; -        int call_count = -1; -        int i = 0;          local = frame->local;          priv  = this->private; -        call_count = afr_pre_op_done_children_count (local->transaction.pre_op, -                                                     priv->child_count); - -        if (call_count == 0) { -                local->transaction.resume (frame, this); -    
            return 0; -        } - -        local->call_count = call_count; - -        for (i = 0; i < priv->child_count; i++) { -                if (local->transaction.pre_op[i]) { -                        STACK_WIND_COOKIE (frame, afr_link_wind_cbk, -                                           (void *) (long) i, -                                           priv->children[i], -                                           priv->children[i]->fops->link, -                                           &local->loc, -                                           &local->newloc, local->xdata_req); - -                        if (!--call_count) -                                break; -                } -        } - +	STACK_WIND_COOKIE (frame, afr_link_wind_cbk, (void *) (long) subvol, +			   priv->children[subvol], +			   priv->children[subvol]->fops->link, +			   &local->loc, &local->newloc, local->xdata_req);          return 0;  }  int -afr_link_done (call_frame_t *frame, xlator_t *this) -{ -        afr_local_t * local = frame->local; - -        local->transaction.unwind (frame, this); - -        AFR_STACK_DESTROY (frame); - -        return 0; -} - - -int -afr_link (call_frame_t *frame, xlator_t *this, -          loc_t *oldloc, loc_t *newloc, dict_t *xdata) +afr_link (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, +	  dict_t *xdata)  {          afr_private_t           *priv                   = NULL;          afr_local_t             *local                  = NULL;          afr_internal_lock_t     *int_lock               = NULL;          call_frame_t            *transaction_frame      = NULL;          int                     ret                     = -1; -        int                     op_errno                = 0; - -        VALIDATE_OR_GOTO (frame, out); -        VALIDATE_OR_GOTO (this, out); -        VALIDATE_OR_GOTO (this->private, out); +        int                     op_errno                = ENOMEM;          priv = this->private;          
QUORUM_CHECK(link,out);          transaction_frame = copy_frame (frame); -        if (!transaction_frame) { -                op_errno = ENOMEM; +        if (!transaction_frame)                  goto out; -        } -        AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); -        local = transaction_frame->local; - -        ret = afr_local_init (local, priv, &op_errno); -        if (ret < 0) -                goto out; +	local = AFR_FRAME_INIT (transaction_frame, op_errno); +	if (!local) +		goto out;          loc_copy (&local->loc,    oldloc);          loc_copy (&local->newloc, newloc); + +	local->inode = inode_ref (oldloc->inode); +	local->parent = inode_ref (newloc->parent); +          if (xdata) -                local->xdata_req = dict_ref (xdata); +                local->xdata_req = dict_copy_with_ref (xdata, NULL); +	else +		local->xdata_req = dict_new (); -        LOCK (&priv->read_child_lock); -        { -                local->read_child_index = (++priv->read_child_rr) -                        % (priv->child_count); -        } -        UNLOCK (&priv->read_child_lock); +	if (!local->xdata_req) +		goto out;          local->op = GF_FOP_LINK; -        local->transaction.fop    = afr_link_wind; -        local->transaction.done   = afr_link_done; + +        local->transaction.wind   = afr_link_wind; +        local->transaction.fop    = __afr_txn_write_fop; +        local->transaction.done   = __afr_txn_write_done;          local->transaction.unwind = afr_link_unwind;          ret = afr_build_parent_loc (&local->transaction.parent_loc, newloc, @@ -1098,18 +900,17 @@ afr_link (call_frame_t *frame, xlator_t *this,          int_lock->lockee_count++;          ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);          if (ret < 0) { -            op_errno = -ret; -            goto out; +		op_errno = -ret; +		goto out;          } -        ret = 0; + +	return 0;  out: -        if (ret < 0) { -                if (transaction_frame) -           
             AFR_STACK_DESTROY (transaction_frame); -                AFR_STACK_UNWIND (link, frame, -1, op_errno, -                                  NULL, NULL, NULL, NULL, NULL); -        } +	if (transaction_frame) +		AFR_STACK_DESTROY (transaction_frame); +	AFR_STACK_UNWIND (link, frame, -1, op_errno, NULL, NULL, NULL, NULL, +			  NULL);          return 0;  } @@ -1126,25 +927,14 @@ afr_symlink_unwind (call_frame_t *frame, xlator_t *this)          local = frame->local; -        LOCK (&frame->lock); -        { -                if (local->transaction.main_frame) { -                        main_frame = local->transaction.main_frame; -                } -                local->transaction.main_frame = NULL; -        } -        UNLOCK (&frame->lock); - -        if (main_frame) { -                AFR_STACK_UNWIND (symlink, main_frame, -                                  local->op_ret, local->op_errno, -                                  local->cont.dir_fop.inode, -                                  &local->cont.dir_fop.buf, -                                  &local->cont.dir_fop.preparent, -                                  &local->cont.dir_fop.postparent, -                                  NULL); -        } +	main_frame = afr_transaction_detach_fop_frame (frame); +        if (!main_frame) +		return 0; +	AFR_STACK_UNWIND (symlink, main_frame, local->op_ret, local->op_errno, +			  local->inode, &local->cont.dir_fop.buf, +			  &local->cont.dir_fop.preparent, +			  &local->cont.dir_fop.postparent, local->xdata_rsp);          return 0;  } @@ -1155,132 +945,71 @@ afr_symlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                        struct iatt *buf, struct iatt *preparent,                        struct iatt *postparent, dict_t *xdata)  { -        int             call_count      = -1; -        int             child_index     = -1; - -        child_index = (long) cookie; - -        LOCK (&frame->lock); -        { -                
__dir_entry_fop_common_cbk (frame, child_index, this, -                                            op_ret, op_errno, inode, buf, -                                            preparent, postparent, NULL, NULL); -        } -        UNLOCK (&frame->lock); - -        call_count = afr_frame_return (frame); - -        if (call_count == 0) -                afr_dir_fop_done (frame, this); - -        return 0; +        return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, buf, +				    preparent, postparent, NULL, NULL, xdata);  }  int -afr_symlink_wind (call_frame_t *frame, xlator_t *this) +afr_symlink_wind (call_frame_t *frame, xlator_t *this, int subvol)  {          afr_local_t *local = NULL;          afr_private_t *priv = NULL; -        int call_count = -1; -        int i = 0;          local = frame->local;          priv = this->private; -        call_count = afr_pre_op_done_children_count (local->transaction.pre_op, -                                                     priv->child_count); - -        if (call_count == 0) { -                local->transaction.resume (frame, this); -                return 0; -        } - -        local->call_count = call_count; - -        for (i = 0; i < priv->child_count; i++) { -                if (local->transaction.pre_op[i]) { -                        STACK_WIND_COOKIE (frame, afr_symlink_wind_cbk, -                                           (void *) (long) i, -                                           priv->children[i], -                                           priv->children[i]->fops->symlink, -                                           local->cont.symlink.linkpath, -                                           &local->loc, -                                           local->umask, -                                           local->xdata_req); - -                        if (!--call_count) -                                break; - -                } -        } - -        return 0; -} - - -int -afr_symlink_done 
(call_frame_t *frame, xlator_t *this) -{ -        afr_local_t * local = frame->local; - -        local->transaction.unwind (frame, this); - -        AFR_STACK_DESTROY (frame); - +	STACK_WIND_COOKIE (frame, afr_symlink_wind_cbk, (void *) (long) subvol, +			   priv->children[subvol], +			   priv->children[subvol]->fops->symlink, +			   local->cont.symlink.linkpath, &local->loc, +			   local->umask, local->xdata_req);          return 0;  }  int -afr_symlink (call_frame_t *frame, xlator_t *this, -             const char *linkpath, loc_t *loc, mode_t umask, dict_t *params) +afr_symlink (call_frame_t *frame, xlator_t *this, const char *linkpath, +	     loc_t *loc, mode_t umask, dict_t *xdata)  {          afr_private_t           *priv                   = NULL;          afr_local_t             *local                  = NULL;          afr_internal_lock_t     *int_lock               = NULL;          call_frame_t            *transaction_frame      = NULL;          int                     ret                     = -1; -        int                     op_errno                = 0; - -        VALIDATE_OR_GOTO (frame, out); -        VALIDATE_OR_GOTO (this, out); -        VALIDATE_OR_GOTO (this->private, out); +        int                     op_errno                = ENOMEM;          priv = this->private;          QUORUM_CHECK(symlink,out);          transaction_frame = copy_frame (frame); -        if (!transaction_frame) { -                op_errno = ENOMEM; +        if (!transaction_frame)                  goto out; -        } - -        AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); -        local = transaction_frame->local; -        ret = afr_local_init (local, priv, &op_errno); -        if (ret < 0) -                goto out; +	local = AFR_FRAME_INIT (transaction_frame, op_errno); +	if (!local) +		goto out;          loc_copy (&local->loc, loc); - -        LOCK (&priv->read_child_lock); -        { -                local->read_child_index = (++priv->read_child_rr) -    
                    % (priv->child_count); -        } -        UNLOCK (&priv->read_child_lock); +	local->inode = inode_ref (loc->inode); +	local->parent = inode_ref (loc->parent);          local->cont.symlink.linkpath = gf_strdup (linkpath);          local->umask = umask; -        if (params) -                local->xdata_req = dict_ref (params); + +        if (xdata) +                local->xdata_req = dict_copy_with_ref (xdata, NULL); +	else +		local->xdata_req = dict_new (); + +	if (!local->xdata_req) +		goto out;          local->op = GF_FOP_SYMLINK; -        local->transaction.fop    = afr_symlink_wind; -        local->transaction.done   = afr_symlink_done; +        local->transaction.wind   = afr_symlink_wind; +        local->transaction.fop    = __afr_txn_write_fop; +        local->transaction.done   = __afr_txn_write_done;          local->transaction.unwind = afr_symlink_unwind;          ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, @@ -1303,19 +1032,17 @@ afr_symlink (call_frame_t *frame, xlator_t *this,          int_lock->lockee_count++;          ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);          if (ret < 0) { -            op_errno = -ret; -            goto out; +		op_errno = -ret; +		goto out;          } -        ret = 0; +	return 0;  out: -        if (ret  < 0) { -                if (transaction_frame) -                        AFR_STACK_DESTROY (transaction_frame); -                AFR_STACK_UNWIND (symlink, frame, -1, op_errno, -                                  NULL, NULL, NULL, NULL, NULL); -        } +	if (transaction_frame) +		AFR_STACK_DESTROY (transaction_frame); +	AFR_STACK_UNWIND (symlink, frame, -1, op_errno, NULL, NULL, NULL, +			  NULL, NULL);          return 0;  } @@ -1331,26 +1058,16 @@ afr_rename_unwind (call_frame_t *frame, xlator_t *this)          local = frame->local; -        LOCK (&frame->lock); -        { -                if (local->transaction.main_frame) { -                        
main_frame = local->transaction.main_frame; -                } -                local->transaction.main_frame = NULL; -        } -        UNLOCK (&frame->lock); - -        if (main_frame) { -                AFR_STACK_UNWIND (rename, main_frame, -                                  local->op_ret, local->op_errno, -                                  &local->cont.dir_fop.buf, -                                  &local->cont.dir_fop.preparent, -                                  &local->cont.dir_fop.postparent, -                                  &local->cont.dir_fop.prenewparent, -                                  &local->cont.dir_fop.postnewparent, -                                  NULL); -        } +	main_frame = afr_transaction_detach_fop_frame (frame); +        if (!main_frame) +		return 0; +	AFR_STACK_UNWIND (rename, main_frame, local->op_ret, local->op_errno, +			  &local->cont.dir_fop.buf, +			  &local->cont.dir_fop.preparent, +			  &local->cont.dir_fop.postparent, +			  &local->cont.dir_fop.prenewparent, +			  &local->cont.dir_fop.postnewparent, local->xdata_rsp);          return 0;  } @@ -1362,131 +1079,72 @@ afr_rename_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                       struct iatt *prenewparent, struct iatt *postnewparent,                       dict_t *xdata)  { -        afr_local_t *   local = NULL; -        int call_count = -1; -        int child_index = -1; - -        local = frame->local; - -        child_index = (long) cookie; - -        LOCK (&frame->lock); -        { -                if (afr_fop_failed (op_ret, op_errno) && op_errno != ENOTEMPTY) -                        afr_transaction_fop_failed (frame, this, child_index); -                local->op_errno = op_errno; -                local->child_errno[child_index] = op_errno; - -                if (op_ret > -1) -                        __dir_entry_fop_common_cbk (frame, child_index, this, -                                                   op_ret, op_errno, NULL, buf, -   
                                                preoldparent, postoldparent, -                                                   prenewparent, postnewparent); - -        } -        UNLOCK (&frame->lock); - -        call_count = afr_frame_return (frame); - -        if (call_count == 0) -                afr_dir_fop_done (frame, this); - -        return 0; +        return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, buf, +				    preoldparent, postoldparent, prenewparent, +				    postnewparent, xdata);  } -int32_t -afr_rename_wind (call_frame_t *frame, xlator_t *this) +int +afr_rename_wind (call_frame_t *frame, xlator_t *this, int subvol)  {          afr_local_t *local = NULL;          afr_private_t *priv = NULL; -        int call_count = -1; -        int i = 0; -        local = frame->local; -        priv = this->private; - -        call_count = afr_pre_op_done_children_count (local->transaction.pre_op, -                                                     priv->child_count); - -        if (call_count == 0) { -                local->transaction.resume (frame, this); -                return 0; -        } - -        local->call_count = call_count; - -        for (i = 0; i < priv->child_count; i++) { -                if (local->transaction.pre_op[i]) { -                        STACK_WIND_COOKIE (frame, afr_rename_wind_cbk, -                                           (void *) (long) i, -                                           priv->children[i], -                                           priv->children[i]->fops->rename, -                                           &local->loc, -                                           &local->newloc, NULL); -                        if (!--call_count) -                                break; -                } -        } +	local = frame->local; +	priv = this->private; +	STACK_WIND_COOKIE (frame, afr_rename_wind_cbk, (void *) (long) subvol, +			   priv->children[subvol], +			   priv->children[subvol]->fops->rename, +			  
 &local->loc, &local->newloc, local->xdata_req);          return 0;  }  int -afr_rename_done (call_frame_t *frame, xlator_t *this) -{ -        afr_local_t * local = frame->local; - -        local->transaction.unwind (frame, this); - -        AFR_STACK_DESTROY (frame); - -        return 0; -} - - -int -afr_rename (call_frame_t *frame, xlator_t *this, -            loc_t *oldloc, loc_t *newloc, dict_t *xdata) +afr_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, +	    dict_t *xdata)  {          afr_private_t           *priv                   = NULL;          afr_local_t             *local                  = NULL;          afr_internal_lock_t     *int_lock               = NULL;          call_frame_t            *transaction_frame      = NULL;          int                     ret                     = -1; -        int                     op_errno                = 0; +        int                     op_errno                = ENOMEM;          int                     nlockee                 = 0; -        VALIDATE_OR_GOTO (frame, out); -        VALIDATE_OR_GOTO (this, out); -        VALIDATE_OR_GOTO (this->private, out); -          priv = this->private;          QUORUM_CHECK(rename,out);          transaction_frame = copy_frame (frame); -        if (!transaction_frame) { +        if (!transaction_frame)                  op_errno = ENOMEM; -                goto out; -        } -        AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); -        local = transaction_frame->local; - -        ret = afr_local_init (local, priv, &op_errno); -        if (ret < 0) -                goto out; +	local = AFR_FRAME_INIT (transaction_frame, op_errno); +	if (!local) +		goto out;          loc_copy (&local->loc,    oldloc);          loc_copy (&local->newloc, newloc); -        local->read_child_index = afr_inode_get_read_ctx (this, oldloc->inode, NULL); +	local->inode = inode_ref (oldloc->inode); +	local->parent = inode_ref (oldloc->parent); +	local->parent2 = 
inode_ref (newloc->parent); + +        if (xdata) +                local->xdata_req = dict_copy_with_ref (xdata, NULL); +	else +		local->xdata_req = dict_new (); + +	if (!local->xdata_req) +		goto out;          local->op = GF_FOP_RENAME; -        local->transaction.fop    = afr_rename_wind; -        local->transaction.done   = afr_rename_done; +        local->transaction.wind   = afr_rename_wind; +        local->transaction.fop    = __afr_txn_write_fop; +        local->transaction.done   = __afr_txn_write_done;          local->transaction.unwind = afr_rename_unwind;          ret = afr_build_parent_loc (&local->transaction.parent_loc, oldloc, @@ -1536,20 +1194,17 @@ afr_rename (call_frame_t *frame, xlator_t *this,          ret = afr_transaction (transaction_frame, this, AFR_ENTRY_RENAME_TRANSACTION);          if (ret < 0) { -            op_errno = -ret; -            goto out; +		op_errno = -ret; +		goto out;          } -        ret = 0; +	return 0;  out: -        if (ret < 0) { -                if (transaction_frame) -                        AFR_STACK_DESTROY (transaction_frame); - -                AFR_STACK_UNWIND (rename, frame, -1, op_errno, -                                  NULL, NULL, NULL, NULL, NULL, NULL); -        } +	if (transaction_frame) +		AFR_STACK_DESTROY (transaction_frame); +	AFR_STACK_UNWIND (rename, frame, -1, op_errno, NULL, NULL, NULL, NULL, +			  NULL, NULL);          return 0;  } @@ -1565,23 +1220,13 @@ afr_unlink_unwind (call_frame_t *frame, xlator_t *this)          local = frame->local; -        LOCK (&frame->lock); -        { -                if (local->transaction.main_frame) { -                        main_frame = local->transaction.main_frame; -                } -                local->transaction.main_frame = NULL; -        } -        UNLOCK (&frame->lock); - -        if (main_frame) { -                AFR_STACK_UNWIND (unlink, main_frame, -                                  local->op_ret, local->op_errno, -                              
    &local->cont.dir_fop.preparent, -                                  &local->cont.dir_fop.postparent, -                                  NULL); -        } +	main_frame = afr_transaction_detach_fop_frame (frame); +        if (!main_frame) +		return 0; +	AFR_STACK_UNWIND (unlink, main_frame, local->op_ret, local->op_errno, +			  &local->cont.dir_fop.preparent, +			  &local->cont.dir_fop.postparent, local->xdata_rsp);          return 0;  } @@ -1591,123 +1236,69 @@ afr_unlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                       int32_t op_ret, int32_t op_errno, struct iatt *preparent,                       struct iatt *postparent, dict_t *xdata)  { -        afr_local_t *   local = NULL; -        int call_count  = -1; -        int child_index = (long) cookie; - -        local = frame->local; - -        LOCK (&frame->lock); -        { -                if (child_index == local->read_child_index) { -                        local->read_child_returned = _gf_true; -                } -                __dir_entry_fop_common_cbk (frame, child_index, this, -                                            op_ret, op_errno, NULL, NULL, -                                            preparent, postparent, NULL, NULL); -        } -        UNLOCK (&frame->lock); - -        call_count = afr_frame_return (frame); -        if (call_count == 0) -                afr_dir_fop_done (frame, this); - -        return 0; +        return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, NULL, +				    preparent, postparent, NULL, NULL, xdata);  } -int32_t -afr_unlink_wind (call_frame_t *frame, xlator_t *this) +int +afr_unlink_wind (call_frame_t *frame, xlator_t *this, int subvol)  {          afr_local_t *local = NULL;          afr_private_t *priv = NULL; -        int call_count = -1; -        int i = 0;          local = frame->local;          priv  = this->private; -        call_count = afr_pre_op_done_children_count (local->transaction.pre_op, -                 
                                    priv->child_count); - -        if (call_count == 0) { -                local->transaction.resume (frame, this); -                return 0; -        } - -        local->call_count = call_count; - -        for (i = 0; i < priv->child_count; i++) { -                if (local->transaction.pre_op[i]) { -                        STACK_WIND_COOKIE (frame, afr_unlink_wind_cbk, -                                           (void *) (long) i, -                                           priv->children[i], -                                           priv->children[i]->fops->unlink, -                                           &local->loc, local->xflag, -                                           local->xdata_req); - -                        if (!--call_count) -                                break; -                } -        } - -        return 0; -} - - -int32_t -afr_unlink_done (call_frame_t *frame, xlator_t *this) -{ -        afr_local_t * local = frame->local; - -        local->transaction.unwind (frame, this); - -        AFR_STACK_DESTROY (frame); - +	STACK_WIND_COOKIE (frame, afr_unlink_wind_cbk, (void *) (long) subvol, +			   priv->children[subvol], +			   priv->children[subvol]->fops->unlink, +			   &local->loc, local->xflag, local->xdata_req);          return 0;  } -int32_t -afr_unlink (call_frame_t *frame, xlator_t *this, -            loc_t *loc, int xflag, dict_t *xdata) +int +afr_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, +	    dict_t *xdata)  {          afr_private_t           *priv                   = NULL;          afr_local_t             *local                  = NULL;          afr_internal_lock_t     *int_lock               = NULL;          call_frame_t            *transaction_frame      = NULL;          int                     ret                     = -1; -        int                     op_errno                = 0; - -        VALIDATE_OR_GOTO (frame, out); -        VALIDATE_OR_GOTO (this, out); -     
   VALIDATE_OR_GOTO (this->private, out); +        int                     op_errno                = ENOMEM;          priv = this->private;          QUORUM_CHECK(unlink,out);          transaction_frame = copy_frame (frame); -        if (!transaction_frame) { -                op_errno = ENOMEM; +        if (!transaction_frame)                  goto out; -        } - -        AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); -        local = transaction_frame->local; -        ret = afr_local_init (local, priv, &op_errno); -        if (ret < 0) -                goto out; +	local = AFR_FRAME_INIT (transaction_frame, op_errno); +	if (!local) +		goto out;          loc_copy (&local->loc, loc);          local->xflag = xflag; + +	local->inode = inode_ref (loc->inode); +	local->parent = inode_ref (loc->parent); +          if (xdata) -                local->xdata_req = dict_ref (xdata); +                local->xdata_req = dict_copy_with_ref (xdata, NULL); +	else +		local->xdata_req = dict_new (); + +	if (!local->xdata_req) +		goto out;          local->op = GF_FOP_UNLINK; -        local->transaction.fop    = afr_unlink_wind; -        local->transaction.done   = afr_unlink_done; +        local->transaction.wind   = afr_unlink_wind; +        local->transaction.fop    = __afr_txn_write_fop; +        local->transaction.done   = __afr_txn_write_done;          local->transaction.unwind = afr_unlink_unwind;          ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, @@ -1730,19 +1321,16 @@ afr_unlink (call_frame_t *frame, xlator_t *this,          int_lock->lockee_count++;          ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);          if (ret < 0) { -            op_errno = -ret; -            goto out; +		op_errno = -ret; +		goto out;          } -        ret = 0; +	return 0;  out: -        if (ret < 0) { -                if (transaction_frame) -                        AFR_STACK_DESTROY (transaction_frame); -                
AFR_STACK_UNWIND (unlink, frame, -1, op_errno, -                                  NULL, NULL, NULL); -        } +	if (transaction_frame) +		AFR_STACK_DESTROY (transaction_frame); +	AFR_STACK_UNWIND (unlink, frame, -1, op_errno, NULL, NULL, NULL);          return 0;  } @@ -1760,23 +1348,13 @@ afr_rmdir_unwind (call_frame_t *frame, xlator_t *this)          local = frame->local; -        LOCK (&frame->lock); -        { -                if (local->transaction.main_frame) { -                        main_frame = local->transaction.main_frame; -                } -                local->transaction.main_frame = NULL; -        } -        UNLOCK (&frame->lock); - -        if (main_frame) { -                AFR_STACK_UNWIND (rmdir, main_frame, -                                  local->op_ret, local->op_errno, -                                  &local->cont.dir_fop.preparent, -                                  &local->cont.dir_fop.postparent, -                                  NULL); -        } +	main_frame = afr_transaction_detach_fop_frame (frame); +	if (!main_frame) +		return 0; +	AFR_STACK_UNWIND (rmdir, main_frame, local->op_ret, local->op_errno, +			  &local->cont.dir_fop.preparent, +			  &local->cont.dir_fop.postparent, local->xdata_rsp);          return 0;  } @@ -1786,130 +1364,71 @@ afr_rmdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                      int32_t op_ret, int32_t op_errno, struct iatt *preparent,                      struct iatt *postparent, dict_t *xdata)  { -        afr_local_t *   local = NULL; -        int call_count  = -1; -        int child_index = (long) cookie; -        int read_child  = 0; - -        local = frame->local; - -        LOCK (&frame->lock); -        { -                if (child_index == read_child) { -                        local->read_child_returned = _gf_true; -                } -                if (afr_fop_failed (op_ret, op_errno) && (op_errno != ENOTEMPTY)) -                        
afr_transaction_fop_failed (frame, this, child_index); -                local->op_errno = op_errno; -                local->child_errno[child_index] = op_errno; -                if (op_ret > -1) -                        __dir_entry_fop_common_cbk (frame, child_index, this, -                                                   op_ret, op_errno, NULL, NULL, -                                                   preparent, postparent, NULL, -                                                   NULL); - -        } -        UNLOCK (&frame->lock); - -        call_count = afr_frame_return (frame); -        if (call_count == 0) -                afr_dir_fop_done (frame, this); - -        return 0; +        return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, NULL, +				    preparent, postparent, NULL, NULL, xdata);  }  int -afr_rmdir_wind (call_frame_t *frame, xlator_t *this) +afr_rmdir_wind (call_frame_t *frame, xlator_t *this, int subvol)  {          afr_local_t *local = NULL;          afr_private_t *priv = NULL; -        int call_count = -1; -        int i = 0;          local = frame->local;          priv  = this->private; -        call_count = afr_pre_op_done_children_count (local->transaction.pre_op, -                                                     priv->child_count); - -        if (call_count == 0) { -                local->transaction.resume (frame, this); -                return 0; -        } - -        local->call_count = call_count; - -        for (i = 0; i < priv->child_count; i++) { -                if (local->transaction.pre_op[i]) { -                        STACK_WIND_COOKIE (frame, afr_rmdir_wind_cbk, -                                           (void *) (long) i, -                                           priv->children[i], -                                           priv->children[i]->fops->rmdir, -                                           &local->loc, local->cont.rmdir.flags, -                                           NULL); - -               
         if (!--call_count) -                                break; -                } -        } - -        return 0; -} - - -int -afr_rmdir_done (call_frame_t *frame, xlator_t *this) -{ -        afr_local_t * local = frame->local; - -        local->transaction.unwind (frame, this); - -        AFR_STACK_DESTROY (frame); - +	STACK_WIND_COOKIE (frame, afr_rmdir_wind_cbk, (void *) (long) subvol, +			   priv->children[subvol], +			   priv->children[subvol]->fops->rmdir, +			   &local->loc, local->cont.rmdir.flags, local->xdata_req);          return 0;  }  int -afr_rmdir (call_frame_t *frame, xlator_t *this, -           loc_t *loc, int flags, dict_t *xdata) +afr_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, +	   dict_t *xdata)  {          afr_private_t           *priv                   = NULL;          afr_local_t             *local                  = NULL;          afr_internal_lock_t     *int_lock               = NULL;          call_frame_t            *transaction_frame      = NULL;          int                     ret                     = -1; -        int                     op_errno                = 0; +        int                     op_errno                = ENOMEM;          int                     nlockee                 = 0; -        VALIDATE_OR_GOTO (frame, out); -        VALIDATE_OR_GOTO (this, out); -        VALIDATE_OR_GOTO (this->private, out); -          priv = this->private;          QUORUM_CHECK(rmdir,out);          transaction_frame = copy_frame (frame); -        if (!transaction_frame) { -                op_errno = ENOMEM; +        if (!transaction_frame)                  goto out; -        } -        AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); -        local = transaction_frame->local; +	local = AFR_FRAME_INIT (transaction_frame, op_errno); +	if (!local) +		goto out; -        ret = afr_local_init (local, priv, &op_errno); -        if (ret < 0) -                goto out; -        local->cont.rmdir.flags = flags;      
    loc_copy (&local->loc, loc); +	local->inode = inode_ref (loc->inode); +	local->parent = inode_ref (loc->parent); + +        local->cont.rmdir.flags = flags; + +        if (xdata) +                local->xdata_req = dict_copy_with_ref (xdata, NULL); +	else +		local->xdata_req = dict_new (); + +	if (!local->xdata_req) +		goto out;          local->op = GF_FOP_RMDIR; -        local->transaction.fop    = afr_rmdir_wind; -        local->transaction.done   = afr_rmdir_done; +        local->transaction.wind   = afr_rmdir_wind; +        local->transaction.fop    = __afr_txn_write_fop; +        local->transaction.done   = __afr_txn_write_done;          local->transaction.unwind = afr_rmdir_unwind;          ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, @@ -1944,18 +1463,16 @@ afr_rmdir (call_frame_t *frame, xlator_t *this,          ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);          if (ret < 0) { -            op_errno = -ret; -            goto out; +		op_errno = -ret; +		goto out;          } -        ret = 0; +	return 0;  out: -        if (ret < 0) { -                if (transaction_frame) -                        AFR_STACK_DESTROY (transaction_frame); -                AFR_STACK_UNWIND (rmdir, frame, -1, op_errno, NULL, NULL, NULL); -        } +	if (transaction_frame) +		AFR_STACK_DESTROY (transaction_frame); +	AFR_STACK_UNWIND (rmdir, frame, -1, op_errno, NULL, NULL, NULL);          return 0;  } diff --git a/xlators/cluster/afr/src/afr-inode-read.c b/xlators/cluster/afr/src/afr-inode-read.c index 0cfebcb9d55..01e078c13e6 100644 --- a/xlators/cluster/afr/src/afr-inode-read.c +++ b/xlators/cluster/afr/src/afr-inode-read.c @@ -35,241 +35,153 @@  #include "compat-errno.h"  #include "compat.h" -/** - * Common algorithm for inode read calls: - * - * - Try the fop on the first child that is up - * - if we have failed due to ENOTCONN: - *     try the next child - * - * Applicable to: access, stat, fstat, readlink, getxattr - */ 
+#include "afr-transaction.h" +  /* {{{ access */ -int32_t -afr_access_cbk (call_frame_t *frame, void *cookie, -                xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) +int +afr_access_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +		int op_ret, int op_errno, dict_t *xdata)  { -        afr_private_t * priv            = NULL; -        afr_local_t *   local           = NULL; -        xlator_t **     children        = NULL; -        int             unwind          = 1; -        int32_t         *last_index     = NULL; -        int32_t         next_call_child = -1; -        int32_t         read_child      = -1; -        int32_t         *fresh_children  = NULL; - -        priv     = this->private; -        children = priv->children; +        afr_local_t     *local = NULL;          local = frame->local; -        read_child = (long) cookie; +	if (op_ret < 0) { +		local->op_ret = op_ret; +		local->op_errno = op_errno; -        if (op_ret == -1) { -                last_index = &local->cont.access.last_index; -                fresh_children = local->fresh_children; -                next_call_child = afr_next_call_child (fresh_children, -                                                       local->child_up, -                                                       priv->child_count, -                                                       last_index, read_child); -                if (next_call_child < 0) -                        goto out; +		afr_read_txn_continue (frame, this, (long) cookie); +		return 0; +	} -                unwind = 0; - -                STACK_WIND_COOKIE (frame, afr_access_cbk, -                                   (void *) (long) read_child, -                                   children[next_call_child], -                                   children[next_call_child]->fops->access, -                                   &local->loc, local->cont.access.mask, -                                   NULL); -        } - -out: -        if 
(unwind) { -                AFR_STACK_UNWIND (access, frame, op_ret, op_errno, xdata); -        } +	AFR_STACK_UNWIND (access, frame, op_ret, op_errno, xdata);          return 0;  } -int32_t -afr_access (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask, -            dict_t *xdata) +int +afr_access_wind (call_frame_t *frame, xlator_t *this, int subvol)  { -        afr_private_t   *priv      = NULL; -        xlator_t        **children = NULL; -        int             call_child = 0; -        afr_local_t     *local     = NULL; -        int32_t         op_errno   = 0; -        int32_t         read_child = -1; -        int             ret        = -1; - -        VALIDATE_OR_GOTO (frame, out); -        VALIDATE_OR_GOTO (this, out); -        VALIDATE_OR_GOTO (this->private, out); - -        priv     = this->private; -        VALIDATE_OR_GOTO (priv->children, out); - -        children = priv->children; - -        AFR_SBRAIN_CHECK_LOC (loc, out); - -        AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); -        local = frame->local; - -        ret = afr_local_init (local, priv, &op_errno); -        if (ret < 0) -                goto out; - -        local->fresh_children = afr_children_create (priv->child_count); -        if (!local->fresh_children) { -                op_errno = ENOMEM; -                goto out; -        } +	afr_private_t *priv = NULL; +	afr_local_t *local = NULL; + +	priv = this->private; +	local = frame->local; + +	if (subvol == -1) { +		AFR_STACK_UNWIND (access, frame, local->op_ret, +				  local->op_errno, 0); +		return 0; +	} + +        STACK_WIND_COOKIE (frame, afr_access_cbk, (void *) (long) subvol, +                           priv->children[subvol], +                           priv->children[subvol]->fops->access, +                           &local->loc, local->cont.access.mask, +			   local->xdata_req); +	return 0; +} +int +afr_access (call_frame_t *frame, xlator_t *this, loc_t *loc, +	    int mask, dict_t *xdata) +{ +        afr_local_t 
*local = NULL; +	int op_errno = 0; -        read_child = afr_inode_get_read_ctx (this, loc->inode, -                                             local->fresh_children); -        ret = afr_get_call_child (this, local->child_up, read_child, -                                     local->fresh_children, -                                     &call_child, -                                     &local->cont.access.last_index); -        if (ret < 0) { -                op_errno = -ret; -                goto out; -        } +        local = AFR_FRAME_INIT (frame, op_errno); +	if (!local) +		goto out; -        loc_copy (&local->loc, loc); -        local->cont.access.mask = mask; +	local->op = GF_FOP_ACCESS; +	loc_copy (&local->loc, loc); +	local->cont.access.mask = mask; +	if (xdata) +		local->xdata_req = dict_ref (xdata); -        STACK_WIND_COOKIE (frame, afr_access_cbk, -                           (void *) (long) call_child, -                           children[call_child], -                           children[call_child]->fops->access, -                           loc, mask, xdata); +	afr_read_txn (frame, this, loc->inode, afr_access_wind, +		      AFR_METADATA_TRANSACTION); -        ret = 0; +	return 0;  out: -        if (ret < 0) -                AFR_STACK_UNWIND (access, frame, -1, op_errno, NULL); +	AFR_STACK_UNWIND (access, frame, -1, op_errno, NULL); +          return 0;  } -  /* }}} */  /* {{{ stat */ -int32_t +int  afr_stat_cbk (call_frame_t *frame, void *cookie,                xlator_t *this, int32_t op_ret, int32_t op_errno,                struct iatt *buf, dict_t *xdata)  { -        afr_private_t * priv            = NULL; -        afr_local_t *   local           = NULL; -        xlator_t **     children        = NULL; -        int             unwind          = 1; -        int32_t         *last_index     = NULL; -        int32_t         next_call_child = -1; -        int32_t         read_child      = -1; -        int32_t         *fresh_children  = NULL; - -        
priv     = this->private; -        children = priv->children; - -        read_child = (long) cookie; +        afr_local_t     *local = NULL;          local = frame->local; -        if (op_ret == -1) { -                last_index = &local->cont.stat.last_index; -                fresh_children = local->fresh_children; -                next_call_child = afr_next_call_child (fresh_children, -                                                       local->child_up, -                                                       priv->child_count, -                                                       last_index, read_child); -                if (next_call_child < 0) -                        goto out; +	if (op_ret < 0) { +		local->op_ret = op_ret; +		local->op_errno = op_errno; -                unwind = 0; +		afr_read_txn_continue (frame, this, (long) cookie); +		return 0; +	} -                STACK_WIND_COOKIE (frame, afr_stat_cbk, -                                   (void *) (long) read_child, -                                   children[next_call_child], -                                   children[next_call_child]->fops->stat, -                                   &local->loc, NULL); -        } - -out: -        if (unwind) { -                AFR_STACK_UNWIND (stat, frame, op_ret, op_errno, buf, xdata); -        } +	AFR_STACK_UNWIND (stat, frame, op_ret, op_errno, buf, xdata);          return 0;  } -int32_t -afr_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +int +afr_stat_wind (call_frame_t *frame, xlator_t *this, int subvol)  { -        afr_private_t   *priv      = NULL; -        afr_local_t     *local     = NULL; -        xlator_t        **children = NULL; -        int             call_child = 0; -        int32_t         op_errno   = 0; -        int32_t         read_child = -1; -        int             ret        = -1; - -        VALIDATE_OR_GOTO (frame, out); -        VALIDATE_OR_GOTO (this, out); -        VALIDATE_OR_GOTO (this->private, out); - -     
   priv     = this->private; -        VALIDATE_OR_GOTO (priv->children, out); - -        children = priv->children; - -        AFR_SBRAIN_CHECK_LOC (loc, out); - -        AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); -        local = frame->local; +	afr_private_t *priv = NULL; +	afr_local_t *local = NULL; + +	priv = this->private; +	local = frame->local; + +	if (subvol == -1) { +		AFR_STACK_UNWIND (stat, frame, local->op_ret, local->op_errno, +				  0, 0); +		return 0; +	} + +        STACK_WIND_COOKIE (frame, afr_stat_cbk, (void *) (long) subvol, +                           priv->children[subvol], +                           priv->children[subvol]->fops->stat, +                           &local->loc, local->xdata_req); +	return 0; +} -        ret = afr_local_init (local, priv, &op_errno); -        if (ret < 0) -                goto out; +int +afr_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ +        afr_local_t *local = NULL; +	int op_errno = 0; -        local->fresh_children = afr_children_create (priv->child_count); -        if (!local->fresh_children) { -                op_errno = ENOMEM; -                goto out; -        } +        local = AFR_FRAME_INIT (frame, op_errno); +	if (!local) +		goto out; -        read_child = afr_inode_get_read_ctx (this, loc->inode, -                                             local->fresh_children); -        ret = afr_get_call_child (this, local->child_up, read_child, -                                     local->fresh_children, -                                     &call_child, -                                     &local->cont.stat.last_index); -        if (ret < 0) { -                op_errno = -ret; -                goto out; -        } -        loc_copy (&local->loc, loc); +	local->op = GF_FOP_STAT; +	loc_copy (&local->loc, loc); +	if (xdata) +		local->xdata_req = dict_ref (xdata); -        STACK_WIND_COOKIE (frame, afr_stat_cbk, (void *) (long) call_child, -                           
children[call_child], -                           children[call_child]->fops->stat, -                           loc, xdata); +	afr_read_txn (frame, this, loc->inode, afr_stat_wind, +		      AFR_DATA_TRANSACTION); -        ret = 0; +	return 0;  out: -        if (ret < 0) -                AFR_STACK_UNWIND (stat, frame, -1, op_errno, NULL, NULL); +	AFR_STACK_UNWIND (stat, frame, -1, op_errno, NULL, NULL);          return 0;  } @@ -279,52 +191,49 @@ out:  /* {{{ fstat */ -int32_t +int  afr_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                 int32_t op_ret, int32_t op_errno, struct iatt *buf,                 dict_t *xdata)  { -        afr_private_t   *priv           = NULL; -        afr_local_t     *local          = NULL; -        xlator_t        **children      = NULL; -        int             unwind          = 1; -        int32_t         *last_index     = NULL; -        int32_t         next_call_child = -1; -        int32_t         read_child      = -1; -        int32_t         *fresh_children  = NULL; - -        priv     = this->private; -        children = priv->children; +        afr_local_t     *local = NULL;          local = frame->local; -        read_child = (long) cookie; +	if (op_ret < 0) { +		local->op_ret = op_ret; +		local->op_errno = op_errno; -        if (op_ret == -1) { -                last_index = &local->cont.fstat.last_index; -                fresh_children = local->fresh_children; -                next_call_child = afr_next_call_child (fresh_children, -                                                       local->child_up, -                                                       priv->child_count, -                                                       last_index, read_child); -                if (next_call_child < 0) -                        goto out; +		afr_read_txn_continue (frame, this, (long) cookie); +		return 0; +	} -                unwind = 0; +	AFR_STACK_UNWIND (fstat, frame, op_ret, op_errno, buf, xdata); -        
        STACK_WIND_COOKIE (frame, afr_fstat_cbk, -                                   (void *) (long) read_child, -                                   children[next_call_child], -                                   children[next_call_child]->fops->fstat, -                                   local->fd, NULL); -        } +        return 0; +} -out: -        if (unwind) { -                AFR_STACK_UNWIND (fstat, frame, op_ret, op_errno, buf, xdata); -        } -        return 0; +int +afr_fstat_wind (call_frame_t *frame, xlator_t *this, int subvol) +{ +	afr_private_t *priv = NULL; +	afr_local_t *local = NULL; + +	priv = this->private; +	local = frame->local; + +	if (subvol == -1) { +		AFR_STACK_UNWIND (fstat, frame, local->op_ret, local->op_errno, +				  0, 0); +		return 0; +	} + +        STACK_WIND_COOKIE (frame, afr_fstat_cbk, (void *) (long) subvol, +                           priv->children[subvol], +                           priv->children[subvol]->fops->fstat, +                           local->fd, local->xdata_req); +	return 0;  } @@ -332,68 +241,26 @@ int32_t  afr_fstat (call_frame_t *frame, xlator_t *this,             fd_t *fd, dict_t *xdata)  { -        afr_private_t   *priv      = NULL; -        afr_local_t     *local     = NULL; -        xlator_t        **children = NULL; -        int             call_child = 0; -        int32_t         op_errno   = 0; -        int32_t         read_child = 0; -        int             ret        = -1; - -        VALIDATE_OR_GOTO (frame, out); -        VALIDATE_OR_GOTO (this, out); -        VALIDATE_OR_GOTO (fd, out); -        VALIDATE_OR_GOTO (this->private, out); - -        priv     = this->private; -        VALIDATE_OR_GOTO (priv->children, out); - -        children = priv->children; - -        VALIDATE_OR_GOTO (fd->inode, out); - -        AFR_SBRAIN_CHECK_FD (fd, out); - -        AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); -        local = frame->local; - -        ret = afr_local_init (local, priv, &op_errno); -        if 
(ret < 0) -                goto out; - -        local->fresh_children = afr_children_create (priv->child_count); -        if (!local->fresh_children) { -                op_errno = ENOMEM; -                goto out; -        } +        afr_local_t *local = NULL; +	int op_errno = 0; -        read_child = afr_inode_get_read_ctx (this, fd->inode, -                                             local->fresh_children); +        local = AFR_FRAME_INIT (frame, op_errno); +	if (!local) +		goto out; +	local->op = GF_FOP_FSTAT; +	local->fd = fd_ref (fd); +	if (xdata) +		local->xdata_req = dict_ref (xdata); +        afr_fix_open (fd, this); -        ret = afr_get_call_child (this, local->child_up, read_child, -                                     local->fresh_children, -                                     &call_child, -                                     &local->cont.fstat.last_index); -        if (ret < 0) { -                op_errno = -ret; -                goto out; -        } - -        local->fd = fd_ref (fd); - -        afr_open_fd_fix (fd, this); +	afr_read_txn (frame, this, fd->inode, afr_fstat_wind, +		      AFR_DATA_TRANSACTION); -        STACK_WIND_COOKIE (frame, afr_fstat_cbk, (void *) (long) call_child, -                           children[call_child], -                           children[call_child]->fops->fstat, -                           fd, xdata); - -        ret = 0; +	return 0;  out: -        if (ret < 0) -                AFR_STACK_UNWIND (fstat, frame, -1, op_errno, NULL, NULL); +	AFR_STACK_UNWIND (fstat, frame, -1, op_errno, NULL, NULL);          return 0;  } @@ -402,117 +269,77 @@ out:  /* {{{ readlink */ -int32_t +int  afr_readlink_cbk (call_frame_t *frame, void *cookie,                    xlator_t *this, int32_t op_ret, int32_t op_errno,                    const char *buf, struct iatt *sbuf, dict_t *xdata)  { -        afr_private_t * priv                  = NULL; -        afr_local_t *   local                 = NULL; -        xlator_t **     children   
           = NULL; -        int             unwind                = 1; -        int32_t         *last_index           = NULL; -        int32_t         next_call_child       = -1; -        int32_t         read_child            = -1; -        int32_t         *fresh_children        = NULL; +	afr_local_t *local = NULL; -        priv     = this->private; -        children = priv->children; +	local = frame->local; -        local = frame->local; +	if (op_ret < 0) { +		local->op_ret = -1; +		local->op_errno = op_errno; -        read_child = (long) cookie; +		afr_read_txn_continue (frame, this, (long) cookie); +		return 0; +	} -        if (op_ret == -1) { -                last_index = &local->cont.readlink.last_index; -                fresh_children = local->fresh_children; -                next_call_child = afr_next_call_child (fresh_children, -                                                       local->child_up, -                                                       priv->child_count, -                                                       last_index, read_child); -                if (next_call_child < 0) -                        goto out; - -                unwind = 0; -                STACK_WIND_COOKIE (frame, afr_readlink_cbk, -                                   (void *) (long) read_child, -                                   children[next_call_child], -                                   children[next_call_child]->fops->readlink, -                                   &local->loc, -                                   local->cont.readlink.size, NULL); -        } - -out: -        if (unwind) { -                AFR_STACK_UNWIND (readlink, frame, op_ret, op_errno, buf, sbuf, -                                  xdata); -        } +	AFR_STACK_UNWIND (readlink, frame, op_ret, op_errno, +			  buf, sbuf, xdata); +	return 0; +} -        return 0; +int +afr_readlink_wind (call_frame_t *frame, xlator_t *this, int subvol) +{ +	afr_local_t *local = NULL; +	afr_private_t *priv = NULL; + 
+	local = frame->local; +	priv = this->private; + +	if (subvol == -1) { +		AFR_STACK_UNWIND (readlink, frame, local->op_ret, +				  local->op_errno, 0, 0, 0); +		return 0; +	} + +	STACK_WIND_COOKIE (frame, afr_readlink_cbk, (void *) (long) subvol, +			   priv->children[subvol], +			   priv->children[subvol]->fops->readlink, +			   &local->loc, local->cont.readlink.size, +			   local->xdata_req); +	return 0;  } -int32_t +int  afr_readlink (call_frame_t *frame, xlator_t *this,                loc_t *loc, size_t size, dict_t *xdata)  { -        afr_private_t   *priv      = NULL; -        xlator_t        **children = NULL; -        int             call_child = 0; -        afr_local_t     *local     = NULL; +        afr_local_t   * local      = NULL;          int32_t         op_errno   = 0; -        int32_t         read_child = -1; -        int             ret        = -1; - -        VALIDATE_OR_GOTO (frame, out); -        VALIDATE_OR_GOTO (this, out); -        VALIDATE_OR_GOTO (this->private, out); - -        priv     = this->private; -        VALIDATE_OR_GOTO (priv->children, out); -        children = priv->children; - -        AFR_SBRAIN_CHECK_LOC (loc, out); - -        AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); -        local = frame->local; - -        ret = afr_local_init (local, priv, &op_errno); -        if (ret < 0) -                goto out; - -        local->fresh_children = afr_children_create (priv->child_count); -        if (!local->fresh_children) { -                op_errno = ENOMEM; -                goto out; -        } -        read_child = afr_inode_get_read_ctx (this, loc->inode, -                                             local->fresh_children); -        ret = afr_get_call_child (this, local->child_up, read_child, -                                     local->fresh_children, -                                     &call_child, -                                     &local->cont.readlink.last_index); -        if (ret < 0) { -                op_errno = 
-ret; -                goto out; -        } +	local = AFR_FRAME_INIT (frame, op_errno); +	if (!local) +		goto out; +	local->op = GF_FOP_READLINK;          loc_copy (&local->loc, loc); +        local->cont.readlink.size = size; +	if (xdata) +		local->xdata_req = dict_ref (xdata); -        local->cont.readlink.size       = size; - -        STACK_WIND_COOKIE (frame, afr_readlink_cbk, -                           (void *) (long) call_child, -                           children[call_child], -                           children[call_child]->fops->readlink, -                           loc, size, xdata); +	afr_read_txn (frame, this, loc->inode, afr_readlink_wind, +		      AFR_DATA_TRANSACTION); -        ret = 0; -out: -        if (ret < 0) -                AFR_STACK_UNWIND (readlink, frame, -1, op_errno, NULL, NULL, NULL);          return 0; +out: +	AFR_STACK_UNWIND(readlink, frame, -1, op_errno, 0, 0, 0); + +	return 0;  } @@ -550,7 +377,7 @@ __gather_xattr_keys (dict_t *dict, char *key, data_t *value,  void -__filter_xattrs (dict_t *dict) +afr_filter_xattrs (dict_t *dict)  {          struct list_head   keys = {0,};          struct _xattr_key *key  = NULL; @@ -571,59 +398,56 @@ __filter_xattrs (dict_t *dict)  } - -int32_t +int  afr_getxattr_cbk (call_frame_t *frame, void *cookie,                    xlator_t *this, int32_t op_ret, int32_t op_errno,                    dict_t *dict, dict_t *xdata)  { -        afr_private_t * priv            = NULL; -        afr_local_t *   local           = NULL; -        xlator_t **     children        = NULL; -        int             unwind          = 1; -        int32_t         *last_index     = NULL; -        int32_t         next_call_child = -1; -        int32_t         read_child      = -1; -        int32_t         *fresh_children  = NULL; - -        priv     = this->private; -        children = priv->children; +        afr_local_t *local = NULL;          local = frame->local; -        read_child = (long) cookie; - -        if (op_ret == 
-1) { -                last_index = &local->cont.getxattr.last_index; -                fresh_children = local->fresh_children; -                next_call_child = afr_next_call_child (fresh_children, -                                                       local->child_up, -                                                       priv->child_count, -                                                       last_index, read_child); -                if (next_call_child < 0) -                        goto out; +	if (op_ret < 0) { +		local->op_ret = op_ret; +		local->op_errno = op_errno; -                unwind = 0; -                STACK_WIND_COOKIE (frame, afr_getxattr_cbk, -                                   (void *) (long) read_child, -                                   children[next_call_child], -                                   children[next_call_child]->fops->getxattr, -                                   &local->loc, -                                   local->cont.getxattr.name, -                                   NULL); -        } +		afr_read_txn_continue (frame, this, (long) cookie); +		return 0; +	} -out: -        if (unwind) { -                if (op_ret >= 0 && dict) -                        __filter_xattrs (dict); +	if (dict) +		afr_filter_xattrs (dict); -                AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict, xdata); -        } +	AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict, xdata);          return 0;  } + +int +afr_getxattr_wind (call_frame_t *frame, xlator_t *this, int subvol) +{ +	afr_local_t *local = NULL; +	afr_private_t *priv = NULL; + +	local = frame->local; +	priv = this->private; + +	if (subvol == -1) { +                AFR_STACK_UNWIND (getxattr, frame, local->op_ret, +				  local->op_errno, NULL, NULL); +		return 0; +	} + +	STACK_WIND_COOKIE (frame, afr_getxattr_cbk, (void *) (long) subvol, +			   priv->children[subvol], +			   priv->children[subvol]->fops->getxattr, +			   &local->loc, local->cont.getxattr.name, +			  
 local->xdata_req); +	return 0; +} + +  int32_t  afr_getxattr_unwind (call_frame_t *frame, int op_ret, int op_errno,                       dict_t *dict, dict_t *xdata) @@ -659,7 +483,7 @@ afr_fgetxattr_clrlk_cbk (call_frame_t *frame, void *cookie,          {                  callcnt = --local->call_count;                  if (op_ret == -1) -                        local->child_errno[cky] = op_errno; +                        local->replies[cky].op_errno = op_errno;                  if (!local->dict)                          local->dict = dict_new (); @@ -710,12 +534,10 @@ unlock:          unwind:                  // Updating child_errno with more recent 'events' -                local->child_errno[cky] = op_errno; -                op_errno = afr_resultant_errno_get (NULL, local->child_errno, -                                                    priv->child_count); +                op_errno = afr_final_errno (local, priv); +                  AFR_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, xattr,                                    xdata); -                  if (xattr)                          dict_unref (xattr);          } @@ -749,7 +571,7 @@ afr_getxattr_clrlk_cbk (call_frame_t *frame, void *cookie,          {                  callcnt = --local->call_count;                  if (op_ret == -1) -                        local->child_errno[cky] = op_errno; +                        local->replies[cky].op_errno = op_errno;                  if (!local->dict)                          local->dict = dict_new (); @@ -800,9 +622,8 @@ unlock:          unwind:                  // Updating child_errno with more recent 'events' -                local->child_errno[cky] = op_errno; -                op_errno = afr_resultant_errno_get (NULL, local->child_errno, -                                                    priv->child_count); +                op_errno = afr_final_errno (local, priv); +                  AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, xattr, xdata);        
          if (xattr) @@ -1411,7 +1232,7 @@ afr_is_special_xattr (const char *name, fop_getxattr_cbk_t *cbk,          }          if (!strcmp (name, GF_XATTR_PATHINFO_KEY) || -                        !strcmp (name, GF_XATTR_USER_PATHINFO_KEY)) { +	    !strcmp (name, GF_XATTR_USER_PATHINFO_KEY)) {                  if (is_fgetxattr) {                          *cbk = afr_fgetxattr_pathinfo_cbk;                  } else { @@ -1442,18 +1263,16 @@ out:  }  static void -afr_getxattr_frm_all_children (xlator_t *this, call_frame_t *frame, -                               const char *name, loc_t *loc, -                               fop_getxattr_cbk_t cbk) +afr_getxattr_all_subvols (xlator_t *this, call_frame_t *frame, +			  const char *name, loc_t *loc, +			  fop_getxattr_cbk_t cbk)  {          afr_private_t   *priv           = NULL;          afr_local_t     *local          = NULL; -        xlator_t        **children      = NULL;          int             i               = 0;          int             call_count      = 0;          priv     = this->private; -        children = priv->children;          local = frame->local;          //local->call_count set in afr_local_init @@ -1465,8 +1284,8 @@ afr_getxattr_frm_all_children (xlator_t *this, call_frame_t *frame,          for (i = 0; i < priv->child_count; i++) {                  if (local->child_up[i]) {                          STACK_WIND_COOKIE (frame, cbk, -                                           (void *) (long) i, children[i], -                                           children[i]->fops->getxattr, +                                           (void *) (long) i, priv->children[i], +                                           priv->children[i]->fops->getxattr,                                             loc, name, NULL);                          if (!--call_count)                                  break; @@ -1481,41 +1300,41 @@ afr_getxattr (call_frame_t *frame, xlator_t *this,  {          afr_private_t           *priv         = 
NULL;          xlator_t                **children    = NULL; -        int                     call_child    = 0;          afr_local_t             *local        = NULL;          xlator_list_t           *trav         = NULL;          xlator_t                **sub_volumes = NULL;          int                     i             = 0;          int32_t                 op_errno      = 0; -        int32_t                 read_child    = -1;          int                     ret           = -1;          fop_getxattr_cbk_t      cbk           = NULL;          int                     afr_xtime_gauge[MCNT_MAX] = {0,}; -        VALIDATE_OR_GOTO (frame, out); -        VALIDATE_OR_GOTO (this, out); -        VALIDATE_OR_GOTO (this->private, out); + +	local = AFR_FRAME_INIT (frame, op_errno); +	if (!local) +		goto out;          priv     = this->private; -        VALIDATE_OR_GOTO (priv->children, out);          children = priv->children; -        AFR_SBRAIN_CHECK_LOC (loc, out); +        loc_copy (&local->loc, loc); -        AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); -        local = frame->local; +	local->op = GF_FOP_GETXATTR; -        ret = afr_local_init (local, priv, &op_errno); -        if (ret < 0) -                goto out; +	if (xdata) +		local->xdata_req = dict_ref (xdata); -        loc_copy (&local->loc, loc);          if (!name)                  goto no_name;          local->cont.getxattr.name = gf_strdup (name); +	if (!local->cont.getxattr.name) { +		op_errno = ENOMEM; +		goto out; +	} +          if (!strncmp (name, AFR_XATTR_PREFIX,                        strlen (AFR_XATTR_PREFIX))) {                  gf_log (this->name, GF_LOG_INFO, @@ -1559,8 +1378,7 @@ afr_getxattr (call_frame_t *frame, xlator_t *this,           * collect information from all childs           */          if (afr_is_special_xattr (name, &cbk, 0)) { -                afr_getxattr_frm_all_children (this, frame, name, -                                               loc, cbk); +                
afr_getxattr_all_subvols (this, frame, name, loc, cbk);                  return 0;          } @@ -1615,28 +1433,9 @@ afr_getxattr (call_frame_t *frame, xlator_t *this,          }  no_name: -        local->fresh_children = afr_children_create (priv->child_count); -        if (!local->fresh_children) { -                op_errno = ENOMEM; -                goto out; -        } -        read_child = afr_inode_get_read_ctx (this, loc->inode, -                                             local->fresh_children); -        ret = afr_get_call_child (this, local->child_up, read_child, -                                     local->fresh_children, -                                     &call_child, -                                     &local->cont.getxattr.last_index); -        if (ret < 0) { -                op_errno = -ret; -                goto out; -        } - -        STACK_WIND_COOKIE (frame, afr_getxattr_cbk, -                           (void *) (long) call_child, -                           children[call_child], -                           children[call_child]->fops->getxattr, -                           loc, name, xdata); +	afr_read_txn (frame, this, local->loc.inode, afr_getxattr_wind, +		      AFR_METADATA_TRANSACTION);          ret = 0;  out: @@ -1653,76 +1452,60 @@ afr_fgetxattr_cbk (call_frame_t *frame, void *cookie,                     xlator_t *this, int32_t op_ret, int32_t op_errno,                     dict_t *dict, dict_t *xdata)  { -        afr_private_t * priv            = NULL; -        afr_local_t *   local           = NULL; -        xlator_t **     children        = NULL; -        int             unwind          = 1; -        int32_t         *last_index     = NULL; -        int32_t         next_call_child = -1; -        int32_t         read_child      = -1; -        int32_t         *fresh_children  = NULL; - -        priv     = this->private; -        children = priv->children; - -        local = frame->local; +	afr_local_t *local = NULL; -        
read_child = (long) cookie; +	local = frame->local; -        if (op_ret == -1) { -                last_index = &local->cont.getxattr.last_index; -                fresh_children = local->fresh_children; -                next_call_child = afr_next_call_child (fresh_children, -                                                       local->child_up, -                                                       priv->child_count, -                                                       last_index, read_child); -                if (next_call_child < 0) -                        goto out; +	if (op_ret < 0) { +		local->op_ret = -1; +		local->op_errno = op_errno; -                unwind = 0; -                STACK_WIND_COOKIE (frame, afr_fgetxattr_cbk, -                                   (void *) (long) read_child, -                                   children[next_call_child], -                                   children[next_call_child]->fops->fgetxattr, -                                   local->fd, -                                   local->cont.getxattr.name, -                                   NULL); -        } +		afr_read_txn_continue (frame, this, (long) cookie); +		return 0; +	} -out: -        if (unwind) { -                if (op_ret >= 0 && dict) -                        __filter_xattrs (dict); +	if (dict) +		afr_filter_xattrs (dict); -                AFR_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, dict, -                                  xdata); -        } +	AFR_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, dict, xdata); -        return 0; +	return 0;  } -int32_t -afr_fgetxattr_unwind (call_frame_t *frame, -                      int op_ret, int op_errno, dict_t *dict, dict_t *xdata) - +int +afr_fgetxattr_wind (call_frame_t *frame, xlator_t *this, int subvol)  { -        AFR_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, dict, xdata); -        return 0; +	afr_local_t *local = NULL; +	afr_private_t *priv = NULL; + +	local = frame->local; +	priv = this->private; 
+ +	if (subvol == -1) { +		AFR_STACK_UNWIND (fgetxattr, frame, local->op_ret, +				  local->op_errno, NULL, NULL); +		return 0; +	} + +	STACK_WIND_COOKIE (frame, (void *) (long) subvol, afr_fgetxattr_cbk, +			   priv->children[subvol], +			   priv->children[subvol]->fops->fgetxattr, +			   local->fd, local->cont.getxattr.name, +			   local->xdata_req); +	return 0;  } +  static void -afr_fgetxattr_frm_all_children (xlator_t *this, call_frame_t *frame, -                                const char *name, fd_t *fd, -                                fop_fgetxattr_cbk_t cbk) +afr_fgetxattr_all_subvols (xlator_t *this, call_frame_t *frame, +			   fop_fgetxattr_cbk_t cbk)  {          afr_private_t   *priv           = NULL;          afr_local_t     *local          = NULL; -        xlator_t        **children      = NULL;          int             i               = 0;          int             call_count      = 0;          priv     = this->private; -        children = priv->children;          local = frame->local;          //local->call_count set in afr_local_init @@ -1735,9 +1518,10 @@ afr_fgetxattr_frm_all_children (xlator_t *this, call_frame_t *frame,                  if (local->child_up[i]) {                          STACK_WIND_COOKIE (frame, cbk,                                             (void *) (long) i, -                                           children[i], -                                           children[i]->fops->fgetxattr, -                                           fd, name, NULL); +                                           priv->children[i], +                                           priv->children[i]->fops->fgetxattr, +                                           local->fd, local->cont.getxattr.name, +					   NULL);                          if (!--call_count)                                  break;                  } @@ -1746,42 +1530,30 @@ afr_fgetxattr_frm_all_children (xlator_t *this, call_frame_t *frame,          return;  } -int32_t + +int  afr_fgetxattr 
(call_frame_t *frame, xlator_t *this,                 fd_t *fd, const char *name, dict_t *xdata)  { -        afr_private_t        *priv       = NULL; -        xlator_t            **children   = NULL; -        int                   call_child = 0;          afr_local_t          *local      = NULL; -        int32_t               op_ret     = -1;          int32_t               op_errno   = 0; -        int32_t               read_child = -1;          fop_fgetxattr_cbk_t   cbk        = NULL; -        VALIDATE_OR_GOTO (frame, out); -        VALIDATE_OR_GOTO (this, out); -        VALIDATE_OR_GOTO (this->private, out); - -        priv     = this->private; -        VALIDATE_OR_GOTO (priv->children, out); - -        children = priv->children; - -        AFR_SBRAIN_CHECK_FD (fd, out); - -        AFR_LOCAL_ALLOC_OR_GOTO (local, out); -        frame->local = local; - -        op_ret = afr_local_init (local, priv, &op_errno); -        if (op_ret < 0) { -                op_errno = -op_ret; -                goto out; -        } +        local = AFR_FRAME_INIT (frame, op_errno); +	if (!local) +		goto out; +	local->op = GF_FOP_FGETXATTR;          local->fd = fd_ref (fd); -        if (name) +        if (name) {                  local->cont.getxattr.name = gf_strdup (name); +		if (!local->cont.getxattr.name) { +			op_errno = ENOMEM; +			goto out; +		} +	} +	if (xdata) +		local->xdata_req = dict_ref (xdata);          /* pathinfo gets handled only in getxattr(), but we need to handle           * lockinfo. @@ -1789,42 +1561,19 @@ afr_fgetxattr (call_frame_t *frame, xlator_t *this,           * collect information from all children.           
*/          if (afr_is_special_xattr (name, &cbk, 1)) { -                afr_fgetxattr_frm_all_children (this, frame, name, -                                                fd, cbk); +                afr_fgetxattr_all_subvols (this, frame, cbk);                  return 0;          } +        afr_fix_open (fd, this); -        local->fresh_children = afr_children_create (priv->child_count); -        if (!local->fresh_children) { -                op_errno = ENOMEM; -                goto out; -        } - -        read_child = afr_inode_get_read_ctx (this, fd->inode, -                                             local->fresh_children); -        op_ret = afr_get_call_child (this, local->child_up, read_child, -                                     local->fresh_children, -                                     &call_child, -                                     &local->cont.getxattr.last_index); -        if (op_ret < 0) { -                op_errno = -op_ret; -                op_ret = -1; -                goto out; -        } - -        STACK_WIND_COOKIE (frame, afr_fgetxattr_cbk, -                           (void *) (long) call_child, -                           children[call_child], -                           children[call_child]->fops->fgetxattr, -                           fd, name, xdata); +	afr_read_txn (frame, this, fd->inode, afr_fgetxattr_wind, +		      AFR_METADATA_TRANSACTION); -        op_ret = 0; +	return 0;  out: -        if (op_ret == -1) { -                AFR_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, NULL, -                                  NULL); -        } +	AFR_STACK_UNWIND (fgetxattr, frame, -1, op_errno, NULL, NULL); +          return 0;  } @@ -1833,144 +1582,84 @@ out:  /* {{{ readv */ -/** - * read algorithm: - * - * if the user has specified a read subvolume, use it - * otherwise - - *   use the inode number to hash it to one of the subvolumes, and - *   read from there (to balance read load) - * - * if any of the above read's fail, try the 
children in sequence - * beginning at the beginning - */ - -int32_t +int  afr_readv_cbk (call_frame_t *frame, void *cookie,                 xlator_t *this, int32_t op_ret, int32_t op_errno,                 struct iovec *vector, int32_t count, struct iatt *buf,                 struct iobref *iobref, dict_t *xdata)  { -        afr_private_t * priv            = NULL; -        afr_local_t *   local           = NULL; -        xlator_t **     children        = NULL; -        int             unwind          = 1; -        int32_t         *last_index     = NULL; -        int32_t         next_call_child = -1; -        int32_t         *fresh_children  = NULL; -        int32_t         read_child      = -1; - -        VALIDATE_OR_GOTO (frame, out); -        VALIDATE_OR_GOTO (this, out); -        VALIDATE_OR_GOTO (this->private, out); - -        priv     = this->private; -        VALIDATE_OR_GOTO (priv->children, out); - -        children = priv->children; +	afr_local_t *local = NULL; -        local = frame->local; +	local = frame->local; -        read_child = (long) cookie; +	if (op_ret < 0) { +		local->op_ret = -1; +		local->op_errno = op_errno; -        if (op_ret == -1) { -                last_index = &local->cont.readv.last_index; -                fresh_children = local->fresh_children; -                next_call_child = afr_next_call_child (fresh_children, -                                                       local->child_up, -                                                       priv->child_count, -                                                       last_index, read_child); -                if (next_call_child < 0) -                        goto out; - -                unwind = 0; +		afr_read_txn_continue (frame, this, (long) cookie); +		return 0; +	} -                STACK_WIND_COOKIE (frame, afr_readv_cbk, -                                   (void *) (long) read_child, -                                   children[next_call_child], -                                  
 children[next_call_child]->fops->readv, -                                   local->fd, local->cont.readv.size, -                                   local->cont.readv.offset, -                                   local->cont.readv.flags, -                                   NULL); -        } +	AFR_STACK_UNWIND (readv, frame, op_ret, op_errno, +			  vector, count, buf, iobref, xdata); +	return 0; +} -out: -        if (unwind) { -                AFR_STACK_UNWIND (readv, frame, op_ret, op_errno, -                                  vector, count, buf, iobref, xdata); -        } -        return 0; +int +afr_readv_wind (call_frame_t *frame, xlator_t *this, int subvol) +{ +	afr_local_t *local = NULL; +	afr_private_t *priv = NULL; + +	local = frame->local; +	priv = this->private; + +	if (subvol == -1) { +		AFR_STACK_UNWIND (readv, frame, local->op_ret, local->op_errno, +				  0, 0, 0, 0, 0); +		return 0; +	} + +	STACK_WIND_COOKIE (frame, afr_readv_cbk, (void *) (long) subvol, +			   priv->children[subvol], +			   priv->children[subvol]->fops->readv, +			   local->fd, local->cont.readv.size, +			   local->cont.readv.offset, local->cont.readv.flags, +			   local->xdata_req); +	return 0;  } -int32_t -afr_readv (call_frame_t *frame, xlator_t *this, -           fd_t *fd, size_t size, off_t offset, uint32_t flags, dict_t *xdata) +int +afr_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, +	   off_t offset, uint32_t flags, dict_t *xdata)  { -        afr_private_t * priv       = NULL;          afr_local_t   * local      = NULL; -        xlator_t **     children   = NULL; -        int             call_child = 0;          int32_t         op_errno   = 0; -        int32_t         read_child = -1; -        int             ret        = -1; - -        VALIDATE_OR_GOTO (frame, out); -        VALIDATE_OR_GOTO (this, out); -        VALIDATE_OR_GOTO (this->private, out); -        VALIDATE_OR_GOTO (fd, out); - -        priv     = this->private; -        children = priv->children; 
-        AFR_SBRAIN_CHECK_FD (fd, out); +	local = AFR_FRAME_INIT (frame, op_errno); +	if (!local) +		goto out; -        AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); -        local = frame->local; - -        ret = afr_local_init (local, priv, &op_errno); -        if (ret < 0) -                goto out; - -        local->fresh_children = afr_children_create (priv->child_count); -        if (!local->fresh_children) { -                op_errno = ENOMEM; -                goto out; -        } - -        read_child = afr_inode_get_read_ctx (this, fd->inode, local->fresh_children); -        ret = afr_get_call_child (this, local->child_up, read_child, -                                     local->fresh_children, -                                     &call_child, -                                     &local->cont.readv.last_index); -        if (ret < 0) { -                op_errno = -ret; -                goto out; -        } - -        local->fd                    = fd_ref (fd); - -        local->cont.readv.size       = size; -        local->cont.readv.offset     = offset; -        local->cont.readv.flags      = flags; +	local->op = GF_FOP_READ; +        local->fd = fd_ref (fd); +        local->cont.readv.size = size; +        local->cont.readv.offset = offset; +        local->cont.readv.flags = flags; +	if (xdata) +		local->xdata_req = dict_ref (xdata); -        afr_open_fd_fix (fd, this); +        afr_fix_open (fd, this); -        STACK_WIND_COOKIE (frame, afr_readv_cbk, -                           (void *) (long) call_child, -                           children[call_child], -                           children[call_child]->fops->readv, -                           fd, size, offset, flags, xdata); +	afr_read_txn (frame, this, fd->inode, afr_readv_wind, +		      AFR_DATA_TRANSACTION); -        ret = 0; -out: -        if (ret < 0) { -                AFR_STACK_UNWIND (readv, frame, -1, op_errno, NULL, 0, NULL, -                                  NULL, NULL); -        }          
return 0; +out: +	AFR_STACK_UNWIND(readv, frame, -1, op_errno, 0, 0, 0, 0, 0); + +	return 0;  }  /* }}} */ diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c index d62847defa3..3dacfc8dd5d 100644 --- a/xlators/cluster/afr/src/afr-inode-write.c +++ b/xlators/cluster/afr/src/afr-inode-write.c @@ -37,46 +37,128 @@  #include "afr.h"  #include "afr-transaction.h" -#include "afr-self-heal-common.h" +//#include "afr-self-heal-common.h" -void -__inode_write_fop_cbk (call_frame_t *frame, int child_index, int read_child, -                       xlator_t *this, int32_t *op_ret, int32_t *op_errno, -                       struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) + +static void +__afr_inode_write_finalize (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t *local = NULL; +	afr_private_t *priv = NULL; +	int read_subvol = 0; +	int i = 0; + +	local = frame->local; +	priv = this->private; + +	if (local->inode) { +		if (local->transaction.type == AFR_METADATA_TRANSACTION) +			read_subvol = afr_metadata_subvol_get (local->inode, this, +							       NULL, NULL); +		else +			read_subvol = afr_data_subvol_get (local->inode, this, +							   NULL, NULL); +	} + +	local->op_ret = -1; +	local->op_errno = afr_final_errno (local, priv); + +	for (i = 0; i < priv->child_count; i++) { +		if (!local->replies[i].valid) +			continue; +		if (local->replies[i].op_ret < 0) { +			afr_inode_read_subvol_reset (local->inode, this); +			continue; +		} + +		/* Order of checks in the compound conditional +		   below is important. 
+ +		   - Highest precedence: largest op_ret +		   - Next precendence: if all op_rets are equal, read subvol +		   - Least precedence: any succeeded subvol +		*/ +		if ((local->op_ret < local->replies[i].op_ret) || +		    ((local->op_ret == local->replies[i].op_ret) && +		     (i == read_subvol))) { + +			local->op_ret = local->replies[i].op_ret; +			local->op_errno = local->replies[i].op_errno; + +			local->cont.inode_wfop.prebuf = +				local->replies[i].prestat; +			local->cont.inode_wfop.postbuf = +				local->replies[i].poststat; + +			if (local->replies[i].xdata) { +				if (local->xdata_rsp) +					dict_unref (local->xdata_rsp); +				local->xdata_rsp = +					dict_ref (local->replies[i].xdata); +			} +		} +	} +} + + +static void +__afr_inode_write_fill (call_frame_t *frame, xlator_t *this, int child_index, +			int op_ret, int op_errno, +			struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata)  { -        afr_local_t     *local = NULL; +        afr_local_t *local = NULL;          local = frame->local; -        if (afr_fop_failed (*op_ret, *op_errno)) { -                local->child_errno[child_index] = *op_errno; - -                switch (local->op) { -                case GF_FOP_TRUNCATE: -                case GF_FOP_FTRUNCATE: -                        if (*op_errno != EFBIG) -                                afr_transaction_fop_failed (frame, this, -                                                            child_index); -                break; -                default: -                        afr_transaction_fop_failed (frame, this, child_index); -                break; -                } -                local->op_errno = *op_errno; -                goto out; +	local->replies[child_index].valid = 1; +	local->replies[child_index].op_ret = op_ret; +	local->replies[child_index].op_errno = op_errno; + +	if (op_ret >= 0) { +		if (prebuf) +			local->replies[child_index].prestat = *prebuf; +		if (postbuf) +			local->replies[child_index].poststat = *postbuf; +		
if (xdata) +			local->replies[child_index].xdata = dict_ref (xdata); +	} else { +		afr_transaction_fop_failed (frame, this, child_index); +	} + +        return; +} + + +static int +__afr_inode_write_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                       int32_t op_ret, int32_t op_errno, struct iatt *prebuf, +                       struct iatt *postbuf, dict_t *xdata) +{ +        afr_local_t *local = NULL; +        int child_index = (long) cookie; +        int call_count = -1; + +        local = frame->local; + +        LOCK (&frame->lock); +        { +                __afr_inode_write_fill (frame, this, child_index, op_ret, +					op_errno, prebuf, postbuf, xdata);          } +        UNLOCK (&frame->lock); -        if ((local->success_count == 0) || (read_child == child_index)) { -                local->op_ret              = *op_ret; -                if (prebuf) -                        local->cont.inode_wfop.prebuf  = *prebuf; -                if (postbuf) -                        local->cont.inode_wfop.postbuf = *postbuf; +        call_count = afr_frame_return (frame); + +        if (call_count == 0) { +		__afr_inode_write_finalize (frame, this); + +		if (afr_txn_nothing_failed (frame, this)) +			local->transaction.unwind (frame, this); + +                local->transaction.resume (frame, this);          } -        local->success_count++; -out: -        return; +        return 0;  }  /* {{{ writev */ @@ -94,6 +176,8 @@ afr_writev_copy_outvars (call_frame_t *src_frame, call_frame_t *dst_frame)          dst_local->op_errno = src_local->op_errno;          dst_local->cont.inode_wfop.prebuf = src_local->cont.inode_wfop.prebuf;          dst_local->cont.inode_wfop.postbuf = src_local->cont.inode_wfop.postbuf; +	if (src_local->xdata_rsp) +		dst_local->xdata_rsp = dict_ref (src_local->xdata_rsp);  }  void @@ -106,26 +190,9 @@ afr_writev_unwind (call_frame_t *frame, xlator_t *this)                            local->op_ret, local->op_errno,          
                  &local->cont.inode_wfop.prebuf,                            &local->cont.inode_wfop.postbuf, -                          NULL); +                          local->xdata_rsp);  } -call_frame_t* -afr_transaction_detach_fop_frame (call_frame_t *frame) -{ -        afr_local_t *   local = NULL; -        call_frame_t   *fop_frame = NULL; - -        local = frame->local; - -        LOCK (&frame->lock); -        { -                fop_frame = local->transaction.main_frame; -                local->transaction.main_frame = NULL; -        } -        UNLOCK (&frame->lock); - -        return fop_frame; -}  int  afr_transaction_writev_unwind (call_frame_t *frame, xlator_t *this) @@ -173,82 +240,60 @@ afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                       struct iatt *postbuf, dict_t *xdata)  {          afr_local_t *   local = NULL; -        afr_private_t  *priv  = NULL;          call_frame_t    *fop_frame = NULL;          int child_index = (long) cookie;          int call_count  = -1; -        int read_child  = 0; -        int      ret = 0; +        int ret = 0;          uint32_t open_fd_count = 0;          uint32_t write_is_append = 0;          local = frame->local; -        priv  = this->private; - -        read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL);          LOCK (&frame->lock);          { -                if (child_index == read_child) { -                        local->read_child_returned = _gf_true; -                } - -                __inode_write_fop_cbk (frame, child_index, read_child, this, -                                       &op_ret, &op_errno, prebuf, postbuf, -                                       xdata); - -		local->replies[child_index].valid = 1; -		local->replies[child_index].op_ret = op_ret; -		local->replies[child_index].op_errno = op_errno; - - -		/* stage the best case return value for unwind */ -                if ((local->success_count == 0) || (op_ret > local->op_ret)) { -  
                      local->op_ret              = op_ret; -			local->op_errno		   = op_errno; -		} - -		if (op_ret != -1) { -                        if (xdata) { -                                ret = dict_get_uint32 (xdata, -                                                       GLUSTERFS_OPEN_FD_COUNT, -                                                       &open_fd_count); -                                if ((ret == 0) && -                                    (open_fd_count > local->open_fd_count)) { -                                        local->open_fd_count = open_fd_count; -                                        local->update_open_fd_count = _gf_true; -                                } - -				write_is_append = 0; -                                ret = dict_get_uint32 (xdata, -                                                       GLUSTERFS_WRITE_IS_APPEND, -                                                       &write_is_append); -                                if (ret || !write_is_append) -					local->append_write = _gf_false; -                        } - +                __afr_inode_write_fill (frame, this, child_index, op_ret, +					op_errno, prebuf, postbuf, xdata); +		if (op_ret == -1 || !xdata) +			goto unlock; + +		write_is_append = 0; +		ret = dict_get_uint32 (xdata, GLUSTERFS_WRITE_IS_APPEND, +				       &write_is_append); +		if (ret || !write_is_append) +			local->append_write = _gf_false; + +		ret = dict_get_uint32 (xdata, GLUSTERFS_OPEN_FD_COUNT, +				       &open_fd_count); +		if (ret == -1) +			goto unlock; +		if ((open_fd_count > local->open_fd_count)) { +			local->open_fd_count = open_fd_count; +			local->update_open_fd_count = _gf_true;  		}          } +unlock:          UNLOCK (&frame->lock);          call_count = afr_frame_return (frame);          if (call_count == 0) { - -                if (local->update_open_fd_count) -                        afr_handle_open_fd_count (frame, this); - -                if (!local->stable_write && 
!local->append_write) +		if (!local->stable_write && !local->append_write)  			/* An appended write removes the necessity to  			   fsync() the file. This is because self-heal  			   has the logic to check for larger file when  			   the xattrs are not reliably pointing at  			   a stale file.  			*/ -                        afr_fd_report_unstable_write (this, local->fd); +			afr_fd_report_unstable_write (this, local->fd); + +		__afr_inode_write_finalize (frame, this);                  afr_writev_handle_short_writes (frame, this); -                if (afr_any_fops_failed (local, priv)) { + +                if (local->update_open_fd_count) +                        afr_handle_open_fd_count (frame, this); + +                if (!afr_txn_nothing_failed (frame, this)) {                          //Don't unwind until post-op is complete                          local->transaction.resume (frame, this);                  } else { @@ -272,91 +317,23 @@ afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,          return 0;  } +  int -afr_writev_wind (call_frame_t *frame, xlator_t *this) +afr_writev_wind (call_frame_t *frame, xlator_t *this, int subvol)  {          afr_local_t *local = NULL;          afr_private_t *priv = NULL; -        int i = 0; -        int call_count = -1; -        dict_t *xdata = NULL; -        GF_UNUSED int     ret = 0;          local = frame->local;          priv = this->private; -        call_count = afr_pre_op_done_children_count (local->transaction.pre_op, -                                                     priv->child_count); - -        if (call_count == 0) { -                local->transaction.resume (frame, this); -                return 0; -        } - -        local->call_count = call_count; -	local->replies = GF_CALLOC(priv->child_count, sizeof(*local->replies), -				   gf_afr_mt_reply_t); -	if (!local->replies) { -		local->op_ret = -1; -		local->op_errno = ENOMEM; -		local->transaction.unwind(frame, this); -		
local->transaction.resume(frame, this); -		return 0; -	} - -        xdata = dict_new (); -        if (xdata) { -                ret = dict_set_uint32 (xdata, GLUSTERFS_OPEN_FD_COUNT, -                                       sizeof (uint32_t)); -		ret = dict_set_uint32 (xdata, GLUSTERFS_WRITE_IS_APPEND, -				       0); -		/* Set append_write to be true speculatively. If on any -		   server it turns not be true, we unset it in the -		   callback. -		*/ -		local->append_write = _gf_true; -        } - -        for (i = 0; i < priv->child_count; i++) { -                if (local->transaction.pre_op[i]) { -                        STACK_WIND_COOKIE (frame, afr_writev_wind_cbk, -                                           (void *) (long) i, -                                           priv->children[i], -                                           priv->children[i]->fops->writev, -                                           local->fd, -                                           local->cont.writev.vector, -                                           local->cont.writev.count, -                                           local->cont.writev.offset, -                                           local->cont.writev.flags, -                                           local->cont.writev.iobref, -                                           xdata); - -                        if (!--call_count) -                                break; -                } -        } - -        if (xdata) -                dict_unref (xdata); - -        return 0; -} - - -int -afr_writev_done (call_frame_t *frame, xlator_t *this) -{ -        afr_local_t *local = NULL; - -        local = frame->local; - -        iobref_unref (local->cont.writev.iobref); -        local->cont.writev.iobref = NULL; - -        local->transaction.unwind (frame, this); - -        AFR_STACK_DESTROY (frame); - +	STACK_WIND_COOKIE (frame, afr_writev_wind_cbk, (void *) (long) subvol, +			   priv->children[subvol], +			   
priv->children[subvol]->fops->writev, +			   local->fd, local->cont.writev.vector, +			   local->cont.writev.count, local->cont.writev.offset, +			   local->cont.writev.flags, local->cont.writev.iobref, +			   local->xdata_req);          return 0;  } @@ -366,29 +343,29 @@ afr_do_writev (call_frame_t *frame, xlator_t *this)  {          call_frame_t    *transaction_frame = NULL;          afr_local_t     *local             = NULL; -        int             op_ret   = -1; -        int             op_errno = 0; - -        local = frame->local; +        int             ret   = -1; +        int             op_errno = ENOMEM;          transaction_frame = copy_frame (frame); -        if (!transaction_frame) { -                op_errno = ENOMEM; +        if (!transaction_frame)                  goto out; -        } +        local = frame->local;          transaction_frame->local = local; -        AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); +	frame->local = NULL; -        local->op = GF_FOP_WRITE; +	if (!AFR_FRAME_INIT (frame, op_errno)) +		goto out; -        local->success_count      = 0; +        local->op = GF_FOP_WRITE; -        local->transaction.fop    = afr_writev_wind; -        local->transaction.done   = afr_writev_done; +        local->transaction.wind   = afr_writev_wind; +        local->transaction.fop    = __afr_txn_write_fop; +        local->transaction.done   = __afr_txn_write_done;          local->transaction.unwind = afr_transaction_writev_unwind;          local->transaction.main_frame = frame; +          if (local->fd->flags & O_APPEND) {                 /*                  * Backend vfs ignores the 'offset' for append mode fd so @@ -405,179 +382,86 @@ afr_do_writev (call_frame_t *frame, xlator_t *this)                                                           local->cont.writev.count);          } -        op_ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); -        if (op_ret < 0) { -            op_errno = -op_ret; +        ret = 
afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); +        if (ret < 0) { +            op_errno = -ret;              goto out;          } -        op_ret = 0; +	return 0;  out: -        if (op_ret < 0) { -                if (transaction_frame) -                        AFR_STACK_DESTROY (transaction_frame); -                AFR_STACK_UNWIND (writev, frame, op_ret, op_errno, NULL, NULL, NULL); -        } +	if (transaction_frame) +		AFR_STACK_DESTROY (transaction_frame); +	AFR_STACK_UNWIND (writev, frame, -1, op_errno, NULL, NULL, NULL);          return 0;  } -static void -afr_trigger_open_fd_self_heal (fd_t *fd, xlator_t *this) -{ -        call_frame_t    *frame   = NULL; -        afr_local_t     *local   = NULL; -        afr_self_heal_t *sh      = NULL; -        char            *reason  = NULL; -        int32_t         op_errno = 0; -        int             ret      = 0; - -        if (!fd || !fd->inode || uuid_is_null (fd->inode->gfid)) { -                gf_log_callingfn (this->name, GF_LOG_ERROR, "Invalid args: " -                                  "fd: %p, inode: %p", fd, -                                  fd ? 
fd->inode : NULL); -                goto out; -        } - -        frame = create_frame (this, this->ctx->pool); -        if (!frame) -                goto out; - -        AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); -        local = frame->local; -        ret = afr_local_init (local, this->private, &op_errno); -        if (ret < 0) -                goto out; - -        local->loc.inode = inode_ref (fd->inode); -        ret = loc_path (&local->loc, NULL); -        if (ret < 0) -                goto out; - -        sh    = &local->self_heal; -        sh->do_metadata_self_heal = _gf_true; -        if (fd->inode->ia_type == IA_IFREG) -                sh->do_data_self_heal = _gf_true; -        else if (fd->inode->ia_type == IA_IFDIR) -                sh->do_entry_self_heal = _gf_true; - -        reason = "subvolume came online"; -        afr_launch_self_heal (frame, this, fd->inode, _gf_true, -                              fd->inode->ia_type, reason, NULL, NULL); -        return; -out: -        AFR_STACK_DESTROY (frame); -} - -void -afr_open_fd_fix (fd_t *fd, xlator_t *this) -{ -        int           ret             = 0; -        int           i               = 0; -        afr_fd_ctx_t  *fd_ctx         = NULL; -        gf_boolean_t  need_self_heal  = _gf_false; -        int           *need_open      = NULL; -        size_t        need_open_count = 0; -        afr_private_t *priv           = NULL; - -        priv  = this->private; - -        if (!afr_is_fd_fixable (fd)) -                goto out; - -        fd_ctx = afr_fd_ctx_get (fd, this); -        if (!fd_ctx) -                goto out; - -        LOCK (&fd->lock); -        { -                if (fd_ctx->up_count < priv->up_count) { -                        need_self_heal = _gf_true; -                        fd_ctx->up_count   = priv->up_count; -                        fd_ctx->down_count = priv->down_count; -                } - -                need_open = alloca (priv->child_count * sizeof (*need_open)); -         
       for (i = 0; i < priv->child_count; i++) { -                        need_open[i] = 0; -                        if (fd_ctx->opened_on[i] != AFR_FD_NOT_OPENED) -                                continue; - -                        if (!priv->child_up[i]) -                                continue; - -                        fd_ctx->opened_on[i] = AFR_FD_OPENING; - -                        need_open[i] = 1; -                        need_open_count++; -                } -        } -        UNLOCK (&fd->lock); -        if (ret) -                goto out; - -        if (need_self_heal) -                afr_trigger_open_fd_self_heal (fd, this); - -        if (!need_open_count) -                goto out; - -        afr_fix_open (this, fd, need_open_count, need_open); -out: -        return; -}  int  afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,              struct iovec *vector, int32_t count, off_t offset,              uint32_t flags, struct iobref *iobref, dict_t *xdata)  { -        afr_private_t * priv  = NULL; -        afr_local_t   * local = NULL; -        int ret = -1; -        int op_errno = 0; - -        VALIDATE_OR_GOTO (frame, out); -        VALIDATE_OR_GOTO (this, out); -        VALIDATE_OR_GOTO (this->private, out); +        afr_private_t *priv = NULL; +        afr_local_t *local = NULL; +        int op_errno = ENOMEM;          priv = this->private; -        if (afr_is_split_brain (this, fd->inode)) { -                op_errno = EIO; -                goto out; -        } -          QUORUM_CHECK(writev,out); -        AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); -        local = frame->local; - -        ret = afr_local_init (local, priv, &op_errno); -        if (ret < 0) -                goto out; +	local = AFR_FRAME_INIT (frame, op_errno); +	if (!local) +		goto out; -        local->cont.writev.vector     = iov_dup (vector, count); +        local->cont.writev.vector = iov_dup (vector, count); +	if (!local->cont.writev.vector) +		goto out;          
local->cont.writev.count      = count;          local->cont.writev.offset     = offset;          local->cont.writev.flags      = flags;          local->cont.writev.iobref     = iobref_ref (iobref); -        local->fd                = fd_ref (fd); +	if (xdata) +		local->xdata_req = dict_copy_with_ref (xdata, NULL); +	else +		local->xdata_req = dict_new (); + +	if (!local->xdata_req) +		goto out; + +        local->fd = fd_ref (fd); +	local->inode = inode_ref (fd->inode); + +	if (dict_set_uint32 (local->xdata_req, GLUSTERFS_OPEN_FD_COUNT, 4)) { +		op_errno = ENOMEM; +		goto out; +	} + +	if (dict_set_uint32 (local->xdata_req, GLUSTERFS_WRITE_IS_APPEND, 4)) { +		op_errno = ENOMEM; +		goto out; +	} + +	/* Set append_write to be true speculatively. If on any +	   server it turns not be true, we unset it in the +	   callback. +	*/ +	local->append_write = _gf_true;  	/* detect here, but set it in writev_wind_cbk *after* the unstable  	   write is performed  	*/  	local->stable_write = !!((fd->flags|flags)&(O_SYNC|O_DSYNC)); -        afr_open_fd_fix (fd, this); +        afr_fix_open (fd, this);          afr_do_writev (frame, this); -        ret = 0; +	return 0;  out: -        if (ret < 0) -                AFR_STACK_UNWIND (writev, frame, -1, op_errno, NULL, NULL, NULL); +	AFR_STACK_UNWIND (writev, frame, -1, op_errno, NULL, NULL, NULL);          return 0;  } @@ -595,22 +479,13 @@ afr_truncate_unwind (call_frame_t *frame, xlator_t *this)          local = frame->local; -        LOCK (&frame->lock); -        { -                if (local->transaction.main_frame) -                        main_frame = local->transaction.main_frame; -                local->transaction.main_frame = NULL; -        } -        UNLOCK (&frame->lock); - -        if (main_frame) { -                AFR_STACK_UNWIND (truncate, main_frame, local->op_ret, -                                  local->op_errno, -                                  &local->cont.inode_wfop.prebuf, -                                  
&local->cont.inode_wfop.postbuf, -                                  NULL); -        } +	main_frame = afr_transaction_detach_fop_frame (frame); +	if (!main_frame) +		return 0; +	AFR_STACK_UNWIND (truncate, main_frame, local->op_ret, local->op_errno, +			  &local->cont.inode_wfop.prebuf, +			  &local->cont.inode_wfop.postbuf, local->xdata_rsp);          return 0;  } @@ -620,96 +495,32 @@ afr_truncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                         int32_t op_ret, int32_t op_errno, struct iatt *prebuf,                         struct iatt *postbuf, dict_t *xdata)  { -        afr_local_t *   local = NULL; -        int child_index = (long) cookie; -        int read_child  = 0; -        int call_count  = -1; - -        local = frame->local; - -        read_child = afr_inode_get_read_ctx (this, local->loc.inode, NULL); +	afr_local_t *local = NULL; -        LOCK (&frame->lock); -        { -                if (child_index == read_child) { -                        local->read_child_returned = _gf_true; -                } +	local = frame->local; -                if (op_ret != -1) { -			if (prebuf->ia_size != postbuf->ia_size) -				local->stable_write = _gf_false; -                } -                __inode_write_fop_cbk (frame, child_index, read_child, this, -                                       &op_ret, &op_errno, prebuf, postbuf, -                                       xdata); -        } -        UNLOCK (&frame->lock); +	if (op_ret == 0 && prebuf->ia_size != postbuf->ia_size) +		local->stable_write = _gf_false; -        call_count = afr_frame_return (frame); - -        if (call_count == 0) { -		if (local->stable_write && afr_txn_nothing_failed (frame, this)) -			local->transaction.unwind (frame, this); - -                local->transaction.resume (frame, this); -        } - -        return 0; +	return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, +				      prebuf, postbuf, xdata);  } -int32_t -afr_truncate_wind 
(call_frame_t *frame, xlator_t *this) +int +afr_truncate_wind (call_frame_t *frame, xlator_t *this, int subvol)  {          afr_local_t *local = NULL;          afr_private_t *priv = NULL; -        int call_count = -1; -        int i = 0;          local = frame->local;          priv = this->private; -        call_count = afr_pre_op_done_children_count (local->transaction.pre_op, -                                                     priv->child_count); - -        if (call_count == 0) { -                local->transaction.resume (frame, this); -                return 0; -        } - -        local->call_count = call_count; -	local->stable_write = _gf_true; - -        for (i = 0; i < priv->child_count; i++) { -                if (local->transaction.pre_op[i]) { -                        STACK_WIND_COOKIE (frame, afr_truncate_wind_cbk, -                                           (void *) (long) i, -                                           priv->children[i], -                                           priv->children[i]->fops->truncate, -                                           &local->loc, -                                           local->cont.truncate.offset, -                                           NULL); - -                        if (!--call_count) -                                break; -                } -        } - -        return 0; -} - - -int -afr_truncate_done (call_frame_t *frame, xlator_t *this) -{ -        afr_local_t *local = NULL; - -        local = frame->local; - -        local->transaction.unwind (frame, this); - -        AFR_STACK_DESTROY (frame); - +	STACK_WIND_COOKIE (frame, afr_truncate_wind_cbk, (void *) (long) subvol, +			   priv->children[subvol], +			   priv->children[subvol]->fops->truncate, +			   &local->loc, local->cont.truncate.offset, +			   local->xdata_req);          return 0;  } @@ -721,56 +532,60 @@ afr_truncate (call_frame_t *frame, xlator_t *this,          afr_private_t * priv  = NULL;          afr_local_t   * local = NULL;  
        call_frame_t   *transaction_frame = NULL; -        int ret = -1; -        int op_errno = 0; - -        VALIDATE_OR_GOTO (frame, out); -        VALIDATE_OR_GOTO (this, out); -        VALIDATE_OR_GOTO (this->private, out); +	int ret = -1; +        int op_errno = ENOMEM;          priv = this->private;          QUORUM_CHECK(truncate,out);          transaction_frame = copy_frame (frame); -        if (!transaction_frame) { -                op_errno = ENOMEM; +        if (!transaction_frame)                  goto out; -        } -        AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); -        local = transaction_frame->local; - -        ret = afr_local_init (local, priv, &op_errno); -        if (ret < 0) -                goto out; +	local = AFR_FRAME_INIT (transaction_frame, op_errno); +	if (!local) +		goto out;          local->cont.truncate.offset  = offset; +	if (xdata) +		local->xdata_req = dict_copy_with_ref (xdata, NULL); +	else +		local->xdata_req = dict_new (); + +	if (!local->xdata_req) +		goto out; -        local->transaction.fop    = afr_truncate_wind; -        local->transaction.done   = afr_truncate_done; +	local->transaction.wind   = afr_truncate_wind; +        local->transaction.fop    = __afr_txn_write_fop; +        local->transaction.done   = __afr_txn_write_done;          local->transaction.unwind = afr_truncate_unwind;          loc_copy (&local->loc, loc); +	local->inode = inode_ref (loc->inode); + +        local->op = GF_FOP_TRUNCATE;          local->transaction.main_frame = frame;          local->transaction.start   = offset;          local->transaction.len     = 0; +	/* Set it true speculatively, will get reset in afr_truncate_wind_cbk +	   if truncate was not a NOP */ +	local->stable_write = _gf_true; +          ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION);          if (ret < 0) { -            op_errno = -ret; -            goto out; +		op_errno = -ret; +		goto out;          } -        ret = 0; +	return 0;  
out: -        if (ret < 0) { -                if (transaction_frame) -                        AFR_STACK_DESTROY (transaction_frame); -                AFR_STACK_UNWIND (truncate, frame, -1, op_errno, NULL, NULL, NULL); -        } +	if (transaction_frame) +		AFR_STACK_DESTROY (transaction_frame); +	AFR_STACK_UNWIND (truncate, frame, -1, op_errno, NULL, NULL, NULL);          return 0;  } @@ -788,21 +603,13 @@ afr_ftruncate_unwind (call_frame_t *frame, xlator_t *this)          local = frame->local; -        LOCK (&frame->lock); -        { -                if (local->transaction.main_frame) -                        main_frame = local->transaction.main_frame; -                local->transaction.main_frame = NULL; -        } -        UNLOCK (&frame->lock); +	main_frame = afr_transaction_detach_fop_frame (frame); +	if (!main_frame) +		return 0; -        if (main_frame) { -                AFR_STACK_UNWIND (ftruncate, main_frame, local->op_ret, -                                  local->op_errno, -                                  &local->cont.inode_wfop.prebuf, -                                  &local->cont.inode_wfop.postbuf, -                                  NULL); -        } +	AFR_STACK_UNWIND (ftruncate, main_frame, local->op_ret, local->op_errno, +			  &local->cont.inode_wfop.prebuf, +			  &local->cont.inode_wfop.postbuf, local->xdata_rsp);          return 0;  } @@ -812,122 +619,75 @@ afr_ftruncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                          int32_t op_ret, int32_t op_errno, struct iatt *prebuf,                          struct iatt *postbuf, dict_t *xdata)  { -        afr_local_t *   local = NULL; -        int child_index = (long) cookie; -        int call_count  = -1; -        int read_child  = 0; - -        local = frame->local; - -        read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); - -        LOCK (&frame->lock); -        { -                if (child_index == read_child) { -                        
local->read_child_returned = _gf_true; -                } - -                if (op_ret != -1) { -			if (prebuf->ia_size != postbuf->ia_size) -				local->stable_write = _gf_false; -                } -                __inode_write_fop_cbk (frame, child_index, read_child, this, -                                       &op_ret, &op_errno, prebuf, postbuf, -                                       xdata); -        } -        UNLOCK (&frame->lock); +	afr_local_t *local = NULL; -        call_count = afr_frame_return (frame); +	local = frame->local; -        if (call_count == 0) { -		if (local->stable_write && afr_txn_nothing_failed (frame, this)) -			local->transaction.unwind (frame, this); +	if (op_ret == 0 && prebuf->ia_size != postbuf->ia_size) +		local->stable_write = _gf_false; -                local->transaction.resume (frame, this); -        } - -        return 0; +	return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, +				      prebuf, postbuf, xdata);  }  int -afr_ftruncate_wind (call_frame_t *frame, xlator_t *this) +afr_ftruncate_wind (call_frame_t *frame, xlator_t *this, int subvol)  {          afr_local_t *local = NULL;          afr_private_t *priv = NULL; -        int call_count = -1; -        int i = 0; - -        local = frame->local; -        priv = this->private; - -        call_count = afr_pre_op_done_children_count (local->transaction.pre_op, -                                                     priv->child_count); - -        if (call_count == 0) { -                local->transaction.resume (frame, this); -                return 0; -        } -        local->call_count = call_count; -	local->stable_write = _gf_true; - -        for (i = 0; i < priv->child_count; i++) { -                if (local->transaction.pre_op[i]) { -                        STACK_WIND_COOKIE (frame, afr_ftruncate_wind_cbk, -                                           (void *) (long) i, -                                           priv->children[i], -                      
                     priv->children[i]->fops->ftruncate, -                                           local->fd, -                                           local->cont.ftruncate.offset, -                                           NULL); - -                        if (!--call_count) -                                break; -                } -        } +	local = frame->local; +	priv = this->private; +	STACK_WIND_COOKIE (frame, afr_ftruncate_wind_cbk, (void *) (long) subvol, +			   priv->children[subvol], +			   priv->children[subvol]->fops->ftruncate, +			   local->fd, local->cont.ftruncate.offset, +			   local->xdata_req);          return 0;  }  int -afr_ftruncate_done (call_frame_t *frame, xlator_t *this) +afr_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, +	       dict_t *xdata)  { +        afr_private_t *priv = NULL;          afr_local_t *local = NULL; +	call_frame_t *transaction_frame = NULL; +	int ret = -1; +        int op_errno = ENOMEM; -        local = frame->local; - -        local->transaction.unwind (frame, this); +        priv = this->private; -        AFR_STACK_DESTROY (frame); +        QUORUM_CHECK(ftruncate,out); -        return 0; -} +	transaction_frame = copy_frame (frame); +	if (!frame) +		goto out; +        local = AFR_FRAME_INIT (transaction_frame, op_errno); +        if (!local) +		goto out; -int -afr_do_ftruncate (call_frame_t *frame, xlator_t *this) -{ -        call_frame_t * transaction_frame = NULL; -        afr_local_t *  local             = NULL; -        int op_ret   = -1; -        int op_errno = 0; +        local->cont.ftruncate.offset  = offset; +	if (xdata) +		local->xdata_req = dict_copy_with_ref (xdata, NULL); +	else +		local->xdata_req = dict_new (); -        local = frame->local; +	if (!local->xdata_req) +		goto out; -        transaction_frame = copy_frame (frame); -        if (!transaction_frame) { -                goto out; -        } - -        transaction_frame->local = local; -        frame->local = 
NULL; +        local->fd = fd_ref (fd); +	local->inode = inode_ref (fd->inode);          local->op = GF_FOP_FTRUNCATE; -        local->transaction.fop    = afr_ftruncate_wind; -        local->transaction.done   = afr_ftruncate_done; +	local->transaction.wind   = afr_ftruncate_wind; +        local->transaction.fop    = __afr_txn_write_fop; +        local->transaction.done   = __afr_txn_write_done;          local->transaction.unwind = afr_ftruncate_unwind;          local->transaction.main_frame = frame; @@ -935,69 +695,21 @@ afr_do_ftruncate (call_frame_t *frame, xlator_t *this)          local->transaction.start   = local->cont.ftruncate.offset;          local->transaction.len     = 0; -        op_ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); -        if (op_ret < 0) { -            op_errno = -op_ret; -            goto out; -        } +        afr_fix_open (fd, this); -        op_ret = 0; -out: -        if (op_ret < 0) { -                if (transaction_frame) -                        AFR_STACK_DESTROY (transaction_frame); -                AFR_STACK_UNWIND (ftruncate, frame, op_ret, op_errno, NULL, -                                  NULL, NULL); -        } - -        return 0; -} - - -int -afr_ftruncate (call_frame_t *frame, xlator_t *this, -               fd_t *fd, off_t offset, dict_t *xdata) -{ -        afr_private_t * priv  = NULL; -        afr_local_t   * local = NULL; -        call_frame_t   *transaction_frame = NULL; -        int ret = -1; -        int op_errno = 0; - -        VALIDATE_OR_GOTO (frame, out); -        VALIDATE_OR_GOTO (this, out); -        VALIDATE_OR_GOTO (this->private, out); - -        priv = this->private; +	/* Set it true speculatively, will get reset in afr_ftruncate_wind_cbk +	   if truncate was not a NOP */ +	local->stable_write = _gf_true; -        if (afr_is_split_brain (this, fd->inode)) { -                op_errno = EIO; -                goto out; +        ret = afr_transaction (transaction_frame, this, 
AFR_DATA_TRANSACTION); +        if (ret < 0) { +		op_errno = -ret; +		goto out;          } -        QUORUM_CHECK(ftruncate,out); - -        AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); -        local = frame->local; - -        ret = afr_local_init (local, priv, &op_errno); -        if (ret < 0) -                goto out; -        local->cont.ftruncate.offset  = offset; - -        local->fd = fd_ref (fd); - -        afr_open_fd_fix (fd, this); - -        afr_do_ftruncate (frame, this); - -        ret = 0; +	return 0;  out: -        if (ret < 0) { -                if (transaction_frame) -                        AFR_STACK_DESTROY (transaction_frame); -                AFR_STACK_UNWIND (ftruncate, frame, -1, op_errno, NULL, NULL, NULL); -        } +	AFR_STACK_UNWIND (ftruncate, frame, -1, op_errno, NULL, NULL, NULL);          return 0;  } @@ -1009,173 +721,92 @@ out:  int  afr_setattr_unwind (call_frame_t *frame, xlator_t *this)  { -        afr_local_t *   local = NULL; -        call_frame_t   *main_frame = NULL; +        afr_local_t *local = NULL; +        call_frame_t *main_frame = NULL;          local = frame->local; -        LOCK (&frame->lock); -        { -                if (local->transaction.main_frame) -                        main_frame = local->transaction.main_frame; -                local->transaction.main_frame = NULL; -        } -        UNLOCK (&frame->lock); - -        if (main_frame) { -                AFR_STACK_UNWIND (setattr, main_frame, local->op_ret, -                                  local->op_errno, -                                  &local->cont.inode_wfop.prebuf, -                                  &local->cont.inode_wfop.postbuf, -                                  NULL); -        } +	main_frame = afr_transaction_detach_fop_frame (frame); +	if (!main_frame) +		return 0; +	AFR_STACK_UNWIND (setattr, main_frame, local->op_ret, local->op_errno, +			  &local->cont.inode_wfop.prebuf, +			  &local->cont.inode_wfop.postbuf, +			  local->xdata_rsp);    
      return 0;  }  int  afr_setattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, -                      int32_t op_ret, int32_t op_errno, +                      int op_ret, int op_errno,                        struct iatt *preop, struct iatt *postop, dict_t *xdata)  { -        afr_local_t *   local = NULL; -        afr_private_t * priv  = NULL; -        int child_index = (long) cookie; -        int read_child  = 0; -        int call_count  = -1; -        int need_unwind = 0; - -        local = frame->local; -        priv  = this->private; - -        read_child = afr_inode_get_read_ctx (this, local->loc.inode, NULL); - -        LOCK (&frame->lock); -        { -                if (child_index == read_child) { -                        local->read_child_returned = _gf_true; -                } - -                __inode_write_fop_cbk (frame, child_index, read_child, this, -                                       &op_ret, &op_errno, preop, postop, -                                       xdata); - -                if ((local->success_count >= priv->wait_count) -                    && local->read_child_returned) { -                        need_unwind = 1; -                } -        } -        UNLOCK (&frame->lock); - -        if (need_unwind) -                local->transaction.unwind (frame, this); - -        call_count = afr_frame_return (frame); - -        if (call_count == 0) { -                local->transaction.resume (frame, this); -        } - -        return 0; +	return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, +				      preop, postop, xdata);  } -int32_t -afr_setattr_wind (call_frame_t *frame, xlator_t *this) +int +afr_setattr_wind (call_frame_t *frame, xlator_t *this, int subvol)  {          afr_local_t *local = NULL;          afr_private_t *priv = NULL; -        int call_count = -1; -        int i = 0;          local = frame->local;          priv = this->private; -        call_count = afr_pre_op_done_children_count 
(local->transaction.pre_op, -                                                     priv->child_count); - -        if (call_count == 0) { -                local->transaction.resume (frame, this); -                return 0; -        } - -        local->call_count = call_count; - -        for (i = 0; i < priv->child_count; i++) { -                if (local->transaction.pre_op[i]) { -                        STACK_WIND_COOKIE (frame, afr_setattr_wind_cbk, -                                           (void *) (long) i, -                                           priv->children[i], -                                           priv->children[i]->fops->setattr, -                                           &local->loc, -                                           &local->cont.setattr.in_buf, -                                           local->cont.setattr.valid, -                                           NULL); - -                        if (!--call_count) -                                break; -                } -        } - +	STACK_WIND_COOKIE (frame, afr_setattr_wind_cbk, (void *) (long) subvol, +			   priv->children[subvol], +			   priv->children[subvol]->fops->setattr, +			   &local->loc, &local->cont.setattr.in_buf, +			   local->cont.setattr.valid, local->xdata_req);          return 0;  }  int -afr_setattr_done (call_frame_t *frame, xlator_t *this) +afr_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *buf, +	     int32_t valid, dict_t *xdata)  { +        afr_private_t *priv = NULL;          afr_local_t *local = NULL; - -        local = frame->local; - -        local->transaction.unwind (frame, this); - -        AFR_STACK_DESTROY (frame); - -        return 0; -} - - -int -afr_setattr (call_frame_t *frame, xlator_t *this, -             loc_t *loc, struct iatt *buf, int32_t valid, dict_t *xdata) -{ -        afr_private_t * priv  = NULL; -        afr_local_t   * local = NULL; -        call_frame_t   *transaction_frame = NULL; +        call_frame_t 
*transaction_frame = NULL;          int ret = -1; -        int op_errno = 0; - -        VALIDATE_OR_GOTO (frame, out); -        VALIDATE_OR_GOTO (this, out); -        VALIDATE_OR_GOTO (this->private, out); +        int op_errno = ENOMEM;          priv = this->private;          QUORUM_CHECK(setattr,out);          transaction_frame = copy_frame (frame); -        if (!transaction_frame) { -                op_errno = ENOMEM; +        if (!transaction_frame)                  goto out; -        } -        AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); -        local = transaction_frame->local; - -        ret = afr_local_init (local, priv, &op_errno); -        if (ret < 0) -                goto out; +	local = AFR_FRAME_INIT (transaction_frame, op_errno); +	if (!local) +		goto out;          local->cont.setattr.in_buf = *buf;          local->cont.setattr.valid  = valid; +	if (xdata) +		local->xdata_req = dict_copy_with_ref (xdata, NULL); +	else +		local->xdata_req = dict_new (); -        local->transaction.fop    = afr_setattr_wind; -        local->transaction.done   = afr_setattr_done; +	if (!local->xdata_req) +		goto out; + +        local->transaction.wind   = afr_setattr_wind; +        local->transaction.fop    = __afr_txn_write_fop; +        local->transaction.done   = __afr_txn_write_done;          local->transaction.unwind = afr_setattr_unwind;          loc_copy (&local->loc, loc); +	local->inode = inode_ref (loc->inode); + +	local->op = GF_FOP_SETATTR;          local->transaction.main_frame = frame;          local->transaction.start   = LLONG_MAX - 1; @@ -1183,18 +814,16 @@ afr_setattr (call_frame_t *frame, xlator_t *this,          ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);          if (ret < 0) { -            op_errno = -ret; -            goto out; +		op_errno = -ret; +		goto out;          } -        ret = 0; +	return 0;  out: -        if (ret < 0) { -                if (transaction_frame) -                        
AFR_STACK_DESTROY (transaction_frame); -                AFR_STACK_UNWIND (setattr, frame, -1, op_errno, NULL, NULL, NULL); -        } +	if (transaction_frame) +		AFR_STACK_DESTROY (transaction_frame); +	AFR_STACK_UNWIND (setattr, frame, -1, op_errno, NULL, NULL, NULL);          return 0;  } @@ -1208,22 +837,13 @@ afr_fsetattr_unwind (call_frame_t *frame, xlator_t *this)          local = frame->local; -        LOCK (&frame->lock); -        { -                if (local->transaction.main_frame) -                        main_frame = local->transaction.main_frame; -                local->transaction.main_frame = NULL; -        } -        UNLOCK (&frame->lock); - -        if (main_frame) { -                AFR_STACK_UNWIND (fsetattr, main_frame, local->op_ret, -                                  local->op_errno, -                                  &local->cont.inode_wfop.prebuf, -                                  &local->cont.inode_wfop.postbuf, -                                  NULL); -        } +	main_frame = afr_transaction_detach_fop_frame (frame); +        if (!main_frame) +		return 0; +	AFR_STACK_UNWIND (fsetattr, main_frame, local->op_ret, local->op_errno, +			  &local->cont.inode_wfop.prebuf, +			  &local->cont.inode_wfop.postbuf, local->xdata_rsp);          return 0;  } @@ -1233,149 +853,72 @@ afr_fsetattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                         int32_t op_ret, int32_t op_errno,                         struct iatt *preop, struct iatt *postop, dict_t *xdata)  { -        afr_local_t *   local = NULL; -        afr_private_t * priv  = NULL; -        int child_index = (long) cookie; -        int read_child  = 0; -        int call_count  = -1; -        int need_unwind = 0; - -        local = frame->local; -        priv  = this->private; - -        read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); - -        LOCK (&frame->lock); -        { -                if (child_index == read_child) { -                   
     local->read_child_returned = _gf_true; -                } - -                __inode_write_fop_cbk (frame, child_index, read_child, this, -                                       &op_ret, &op_errno, preop, postop, -                                       xdata); - -                if ((local->success_count >= priv->wait_count) -                    && local->read_child_returned) { -                        need_unwind = 1; -                } -        } -        UNLOCK (&frame->lock); - -        if (need_unwind) -                local->transaction.unwind (frame, this); - -        call_count = afr_frame_return (frame); - -        if (call_count == 0) { -                local->transaction.resume (frame, this); -        } - -        return 0; +	return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, +				      preop, postop, xdata);  } -int32_t -afr_fsetattr_wind (call_frame_t *frame, xlator_t *this) +int +afr_fsetattr_wind (call_frame_t *frame, xlator_t *this, int subvol)  {          afr_local_t *local = NULL;          afr_private_t *priv = NULL; -        int call_count = -1; -        int i = 0;          local = frame->local;          priv = this->private; -        call_count = afr_pre_op_done_children_count (local->transaction.pre_op, -                                                     priv->child_count); - -        if (call_count == 0) { -                local->transaction.resume (frame, this); -                return 0; -        } - -        local->call_count = call_count; - -        for (i = 0; i < priv->child_count; i++) { -                if (local->transaction.pre_op[i]) { -                        STACK_WIND_COOKIE (frame, afr_fsetattr_wind_cbk, -                                           (void *) (long) i, -                                           priv->children[i], -                                           priv->children[i]->fops->fsetattr, -                                           local->fd, -                                           
&local->cont.fsetattr.in_buf, -                                           local->cont.fsetattr.valid, -                                           NULL); - -                        if (!--call_count) -                                break; -                } -        } - +	STACK_WIND_COOKIE (frame, afr_fsetattr_wind_cbk, (void *) (long) subvol, +			   priv->children[subvol], +			   priv->children[subvol]->fops->fsetattr, +			   local->fd, &local->cont.fsetattr.in_buf, +			   local->cont.fsetattr.valid, local->xdata_req);          return 0;  }  int -afr_fsetattr_done (call_frame_t *frame, xlator_t *this) -{ -        afr_local_t *local = NULL; - -        local = frame->local; - -        local->transaction.unwind (frame, this); - -        AFR_STACK_DESTROY (frame); - -        return 0; -} - -int  afr_fsetattr (call_frame_t *frame, xlator_t *this,                fd_t *fd, struct iatt *buf, int32_t valid, dict_t *xdata)  { -        afr_private_t * priv  = NULL; -        afr_local_t   * local = NULL; -        call_frame_t   *transaction_frame = NULL; +        afr_private_t *priv = NULL; +        afr_local_t *local = NULL; +        call_frame_t *transaction_frame = NULL;          int ret = -1; -        int op_errno = 0; - -        VALIDATE_OR_GOTO (frame, out); -        VALIDATE_OR_GOTO (this, out); -        VALIDATE_OR_GOTO (this->private, out); +        int op_errno = ENOMEM;          priv = this->private; -        if (afr_is_split_brain (this, fd->inode)) { -                op_errno = EIO; -                goto out; -        } -          QUORUM_CHECK(fsetattr,out);          transaction_frame = copy_frame (frame); -        if (!transaction_frame) { -                op_errno = ENOMEM; +        if (!transaction_frame)                  goto out; -        } -        AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); -        local = transaction_frame->local; - -        ret = afr_local_init (local, priv, &op_errno); -        if (ret < 0) -                goto out; +	
local = AFR_FRAME_INIT (transaction_frame, op_errno); +	if (!local) +		goto out;          local->cont.fsetattr.in_buf = *buf;          local->cont.fsetattr.valid  = valid; +	if (xdata) +		local->xdata_req = dict_copy_with_ref (xdata, NULL); +	else +		local->xdata_req = dict_new (); -        local->transaction.fop    = afr_fsetattr_wind; -        local->transaction.done   = afr_fsetattr_done; +	if (!local->xdata_req) +		goto out; + +        local->transaction.wind   = afr_fsetattr_wind; +        local->transaction.fop    = __afr_txn_write_fop; +        local->transaction.done   = __afr_txn_write_done;          local->transaction.unwind = afr_fsetattr_unwind;          local->fd                 = fd_ref (fd); +	local->inode = inode_ref (fd->inode); + +	local->op = GF_FOP_FSETATTR; -        afr_open_fd_fix (fd, this); +        afr_fix_open (fd, this);          local->transaction.main_frame = frame;          local->transaction.start   = LLONG_MAX - 1; @@ -1383,18 +926,16 @@ afr_fsetattr (call_frame_t *frame, xlator_t *this,          ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);          if (ret < 0) { -            op_errno = -ret; -            goto out; +		op_errno = -ret; +		goto out;          } -        ret = 0; +	return 0;  out: -        if (ret < 0) { -                if (transaction_frame) -                        AFR_STACK_DESTROY (transaction_frame); -                AFR_STACK_UNWIND (fsetattr, frame, -1, op_errno, NULL, NULL, NULL); -        } +	if (transaction_frame) +		AFR_STACK_DESTROY (transaction_frame); +	AFR_STACK_UNWIND (fsetattr, frame, -1, op_errno, NULL, NULL, NULL);          return 0;  } @@ -1410,19 +951,12 @@ afr_setxattr_unwind (call_frame_t *frame, xlator_t *this)          local = frame->local; -        LOCK (&frame->lock); -        { -                if (local->transaction.main_frame) -                        main_frame = local->transaction.main_frame; -                local->transaction.main_frame = NULL; -        } 
-        UNLOCK (&frame->lock); +	main_frame = afr_transaction_detach_fop_frame (frame); +	if (!main_frame) +		return 0; -        if (main_frame) { -                AFR_STACK_UNWIND (setxattr, main_frame, -                                  local->op_ret, local->op_errno, -                                  NULL); -        } +	AFR_STACK_UNWIND (setxattr, main_frame, local->op_ret, local->op_errno, +			  local->xdata_rsp);          return 0;  } @@ -1431,95 +965,32 @@ int  afr_setxattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                         int32_t op_ret, int32_t op_errno, dict_t *xdata)  { -        afr_local_t   *local      = NULL; -        afr_private_t *priv       = NULL; -        int           call_count  = -1; -        int           need_unwind = 0; -        int           child_index = (long) cookie; - -        local = frame->local; -        priv = this->private; - -        LOCK (&frame->lock); -        { -                __inode_write_fop_cbk (frame, child_index, -1, this, -                                       &op_ret, &op_errno, NULL, NULL, -                                       xdata); -                if (local->success_count == priv->child_count) { -                        need_unwind = 1; -                } -        } -        UNLOCK (&frame->lock); - -        if (need_unwind) -                local->transaction.unwind (frame, this); - -        call_count = afr_frame_return (frame); - -        if (call_count == 0) { -                local->transaction.resume (frame, this); -        } - -        return 0; +        return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, +				      NULL, NULL, xdata);  }  int -afr_setxattr_wind (call_frame_t *frame, xlator_t *this) +afr_setxattr_wind (call_frame_t *frame, xlator_t *this, int subvol)  {          afr_local_t      *local         = NULL;          afr_private_t    *priv          = NULL; -        int               call_count    = -1; -        int               i           
  = 0;          local = frame->local;          priv = this->private; -        call_count = afr_pre_op_done_children_count (local->transaction.pre_op, -                                                     priv->child_count); - -        if (call_count == 0) { -                local->transaction.resume (frame, this); -                return 0; -        } - -        local->call_count = call_count; - -        for (i = 0; i < priv->child_count; i++) { -                if (local->transaction.pre_op[i]) { -                        STACK_WIND_COOKIE (frame, afr_setxattr_wind_cbk, -                                           (void *) (long) i, -                                           priv->children[i], -                                           priv->children[i]->fops->setxattr, -                                           &local->loc, -                                           local->cont.setxattr.dict, -                                           local->cont.setxattr.flags, -                                           NULL); - -                        if (!--call_count) -                                break; -                } -        } - +	STACK_WIND_COOKIE (frame, afr_setxattr_wind_cbk, (void *) (long) subvol, +			   priv->children[subvol], +			   priv->children[subvol]->fops->setxattr, +			   &local->loc, local->cont.setxattr.dict, +			   local->cont.setxattr.flags, local->xdata_req);          return 0;  }  int -afr_setxattr_done (call_frame_t *frame, xlator_t *this) -{ -        afr_local_t   *local    = frame->local; - -        local->transaction.unwind (frame, this); - -        AFR_STACK_DESTROY (frame); - -        return 0; -} - -int -afr_setxattr (call_frame_t *frame, xlator_t *this, -              loc_t *loc, dict_t *dict, int32_t flags, dict_t *xdata) +afr_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, +	      int32_t flags, dict_t *xdata)  {          afr_private_t  *priv              = NULL;          afr_local_t    *local             = 
NULL; @@ -1527,59 +998,60 @@ afr_setxattr (call_frame_t *frame, xlator_t *this,          int             ret               = -1;          int             op_errno          = EINVAL; -        VALIDATE_OR_GOTO (this, out); -          GF_IF_INTERNAL_XATTR_GOTO ("trusted.afr.*", dict,                                     op_errno, out);          GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.afr.*", dict,                                     op_errno, out); -        VALIDATE_OR_GOTO (frame, out); -        VALIDATE_OR_GOTO (this->private, out); -          priv = this->private;          QUORUM_CHECK(setxattr,out); +          transaction_frame = copy_frame (frame); -        if (!transaction_frame) { -                op_errno = ENOMEM; +        if (!transaction_frame)                  goto out; -        } - -        AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); -        local = transaction_frame->local; -        ret = afr_local_init (local, priv, &op_errno); -        if (ret < 0) +	local = AFR_FRAME_INIT (transaction_frame, op_errno); +	if (!local)                  goto out;          local->cont.setxattr.dict  = dict_ref (dict);          local->cont.setxattr.flags = flags; +	if (xdata) +		local->xdata_req = dict_copy_with_ref (xdata, NULL); +	else +		local->xdata_req = dict_new (); -        local->transaction.fop    = afr_setxattr_wind; -        local->transaction.done   = afr_setxattr_done; +	if (!local->xdata_req) +		goto out; + +        local->transaction.wind   = afr_setxattr_wind; +        local->transaction.fop    = __afr_txn_write_fop; +        local->transaction.done   = __afr_txn_write_done;          local->transaction.unwind = afr_setxattr_unwind;          loc_copy (&local->loc, loc); +	local->inode = inode_ref (loc->inode);          local->transaction.main_frame = frame;          local->transaction.start   = LLONG_MAX - 1;          local->transaction.len     = 0; +	local->op = GF_FOP_SETXATTR; +          ret = afr_transaction (transaction_frame, this, 
AFR_METADATA_TRANSACTION);          if (ret < 0) { -            op_errno = -ret; -            goto out; +		op_errno = -ret; +		goto out;          } -        ret = 0; +	return 0;  out: -        if (ret < 0) { -                if (transaction_frame) -                        AFR_STACK_DESTROY (transaction_frame); -                AFR_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL); -        } +	if (transaction_frame) +		AFR_STACK_DESTROY (transaction_frame); + +	AFR_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL);          return 0;  } @@ -1595,19 +1067,12 @@ afr_fsetxattr_unwind (call_frame_t *frame, xlator_t *this)          local = frame->local; -        LOCK (&frame->lock); -        { -                if (local->transaction.main_frame) -                        main_frame = local->transaction.main_frame; -                local->transaction.main_frame = NULL; -        } -        UNLOCK (&frame->lock); +	main_frame = afr_transaction_detach_fop_frame (frame); +        if (!main_frame) +		return 0; -        if (main_frame) { -                AFR_STACK_UNWIND (fsetxattr, main_frame, -                                  local->op_ret, local->op_errno, -                                  NULL); -        } +	AFR_STACK_UNWIND (fsetxattr, main_frame, local->op_ret, local->op_errno, +			  local->xdata_rsp);          return 0;  } @@ -1616,94 +1081,30 @@ int  afr_fsetxattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                          int32_t op_ret, int32_t op_errno, dict_t *xdata)  { -        afr_local_t   *local      = NULL; -        afr_private_t *priv       = NULL; -        int           call_count  = -1; -        int           need_unwind = 0; -        int           child_index = (long) cookie; - -        local = frame->local; -        priv = this->private; - -        LOCK (&frame->lock); -        { - -                __inode_write_fop_cbk (frame, child_index, -1, this, -                                       &op_ret, &op_errno, NULL, NULL, -      
                                 xdata); -                if (local->success_count == priv->child_count) { -                        need_unwind = 1; -                } -        } -        UNLOCK (&frame->lock); - -        if (need_unwind) -                local->transaction.unwind (frame, this); - -        call_count = afr_frame_return (frame); - -        if (call_count == 0) { -                local->transaction.resume (frame, this); -        } - -        return 0; +        return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, +				      NULL, NULL, xdata);  }  int -afr_fsetxattr_wind (call_frame_t *frame, xlator_t *this) +afr_fsetxattr_wind (call_frame_t *frame, xlator_t *this, int subvol)  {          afr_local_t        *local       = NULL;          afr_private_t      *priv        = NULL; -        int                 call_count  = -1; -        int                 i           = 0;          local = frame->local;          priv = this->private; -        call_count = afr_pre_op_done_children_count (local->transaction.pre_op, -                                                     priv->child_count); - -        if (call_count == 0) { -                local->transaction.resume (frame, this); -                return 0; -        } - -        local->call_count = call_count; - -        for (i = 0; i < priv->child_count; i++) { -                if (local->transaction.pre_op[i]) { -                        STACK_WIND_COOKIE (frame, afr_fsetxattr_wind_cbk, -                                           (void *) (long) i, -                                           priv->children[i], -                                           priv->children[i]->fops->fsetxattr, -                                           local->fd, -                                           local->cont.fsetxattr.dict, -                                           local->cont.fsetxattr.flags, -                                           NULL); - -                        if (!--call_count) -               
                 break; -                } -        } - +	STACK_WIND_COOKIE (frame, afr_fsetxattr_wind_cbk, (void *) (long) subvol, +			   priv->children[subvol], +			   priv->children[subvol]->fops->fsetxattr, +			   local->fd, local->cont.fsetxattr.dict, +			   local->cont.fsetxattr.flags, local->xdata_req);          return 0;  }  int -afr_fsetxattr_done (call_frame_t *frame, xlator_t *this) -{ -        afr_local_t   *local   = frame->local; - -        local->transaction.unwind (frame, this); - -        AFR_STACK_DESTROY (frame); - -        return 0; -} - -int  afr_fsetxattr (call_frame_t *frame, xlator_t *this,                 fd_t *fd, dict_t *dict, int32_t flags, dict_t *xdata)  { @@ -1711,11 +1112,7 @@ afr_fsetxattr (call_frame_t *frame, xlator_t *this,          afr_local_t      *local             = NULL;          call_frame_t     *transaction_frame = NULL;          int               ret               = -1; -        int               op_errno          = EINVAL; - -        VALIDATE_OR_GOTO (frame, out); -        VALIDATE_OR_GOTO (this, out); -        VALIDATE_OR_GOTO (this->private, out); +        int               op_errno          = ENOMEM;          GF_IF_INTERNAL_XATTR_GOTO ("trusted.afr.*", dict,                                     op_errno, out); @@ -1725,36 +1122,36 @@ afr_fsetxattr (call_frame_t *frame, xlator_t *this,          priv = this->private; -        if (afr_is_split_brain (this, fd->inode)) { -                op_errno = EIO; -                goto out; -        } -          QUORUM_CHECK(fsetxattr,out); -        AFR_LOCAL_ALLOC_OR_GOTO (local, out); - -        ret = afr_local_init (local, priv, &op_errno); -        if (ret < 0) -                goto out; -          transaction_frame = copy_frame (frame); -        if (!transaction_frame) { +        if (!transaction_frame)                  goto out; -        } - -        transaction_frame->local = local; -        local->op_ret = -1; +	local = AFR_FRAME_INIT (transaction_frame, op_errno); +	if 
(!local) +		goto out;          local->cont.fsetxattr.dict  = dict_ref (dict);          local->cont.fsetxattr.flags = flags; -        local->transaction.fop    = afr_fsetxattr_wind; -        local->transaction.done   = afr_fsetxattr_done; +	if (xdata) +		local->xdata_req = dict_copy_with_ref (xdata, NULL); +	else +		local->xdata_req = dict_new (); + +	if (!local->xdata_req) +		goto out; + +        local->transaction.wind   = afr_fsetxattr_wind; +        local->transaction.fop    = __afr_txn_write_fop; +        local->transaction.done   = __afr_txn_write_done;          local->transaction.unwind = afr_fsetxattr_unwind;          local->fd                 = fd_ref (fd); +	local->inode = inode_ref (fd->inode); + +	local->op = GF_FOP_FSETXATTR;          local->transaction.main_frame = frame;          local->transaction.start  = LLONG_MAX - 1; @@ -1762,18 +1159,16 @@ afr_fsetxattr (call_frame_t *frame, xlator_t *this,          ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);          if (ret < 0) { -            op_errno = -ret; -            goto out; +		op_errno = -ret; +		goto out;          } -        ret = 0; +	return 0;  out: -        if (ret < 0) { -                if (transaction_frame) -                        AFR_STACK_DESTROY (transaction_frame); -                AFR_STACK_UNWIND (fsetxattr, frame, -1, op_errno, NULL); -        } +	if (transaction_frame) +		AFR_STACK_DESTROY (transaction_frame); +	AFR_STACK_UNWIND (fsetxattr, frame, -1, op_errno, NULL);          return 0;  } @@ -1791,19 +1186,12 @@ afr_removexattr_unwind (call_frame_t *frame, xlator_t *this)          local = frame->local; -        LOCK (&frame->lock); -        { -                if (local->transaction.main_frame) -                        main_frame = local->transaction.main_frame; -                local->transaction.main_frame = NULL; -        } -        UNLOCK (&frame->lock); +	main_frame = afr_transaction_detach_fop_frame (frame); +	if (!main_frame) +		return 0; -        
if (main_frame) { -                AFR_STACK_UNWIND (removexattr, main_frame, -                                  local->op_ret, local->op_errno, -                                  NULL); -        } +	AFR_STACK_UNWIND (removexattr, main_frame, local->op_ret, local->op_errno, +			  local->xdata_rsp);          return 0;  } @@ -1812,88 +1200,25 @@ int  afr_removexattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                            int32_t op_ret, int32_t op_errno, dict_t *xdata)  { -        afr_local_t   *local      = NULL; -        afr_private_t *priv       = NULL; -        int           call_count  = -1; -        int           need_unwind = 0; -        int           child_index = (long) cookie; - -        local = frame->local; -        priv = this->private; - -        LOCK (&frame->lock); -        { -                __inode_write_fop_cbk (frame, child_index, -1, this, -                                       &op_ret, &op_errno, NULL, NULL, -                                       xdata); -                if (local->success_count == priv->wait_count) { -                        need_unwind = 1; -                } -        } -        UNLOCK (&frame->lock); - -        if (need_unwind) -                local->transaction.unwind (frame, this); - -        call_count = afr_frame_return (frame); - -        if (call_count == 0) { -                local->transaction.resume (frame, this); -        } - -        return 0; +        return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, +				      NULL, NULL, xdata);  } -int32_t -afr_removexattr_wind (call_frame_t *frame, xlator_t *this) +int +afr_removexattr_wind (call_frame_t *frame, xlator_t *this, int subvol)  {          afr_local_t *local = NULL;          afr_private_t *priv = NULL; -        int call_count = -1; -        int i = 0; - -        local = frame->local; -        priv = this->private; - -        call_count = afr_pre_op_done_children_count (local->transaction.pre_op, -                
                                     priv->child_count); - -        if (call_count == 0) { -                local->transaction.resume (frame, this); -                return 0; -        } - -        local->call_count = call_count; -        for (i = 0; i < priv->child_count; i++) { -                if (local->transaction.pre_op[i]) { -                        STACK_WIND_COOKIE (frame, afr_removexattr_wind_cbk, -                                           (void *) (long) i, -                                           priv->children[i], -                                           priv->children[i]->fops->removexattr, -                                           &local->loc, -                                           local->cont.removexattr.name, -                                           NULL); - -                        if (!--call_count) -                                break; -                } -        } - -        return 0; -} - - -int -afr_removexattr_done (call_frame_t *frame, xlator_t *this) -{ -        afr_local_t * local = frame->local; - -        local->transaction.unwind (frame, this); - -        AFR_STACK_DESTROY (frame); +	local = frame->local; +	priv = this->private; +	STACK_WIND_COOKIE (frame, afr_removexattr_wind_cbk, (void *) (long) subvol, +			   priv->children[subvol], +			   priv->children[subvol]->fops->removexattr, +			   &local->loc, local->cont.removexattr.name, +			   local->xdata_req);          return 0;  } @@ -1906,9 +1231,7 @@ afr_removexattr (call_frame_t *frame, xlator_t *this,          afr_local_t     *local             = NULL;          call_frame_t    *transaction_frame = NULL;          int              ret               = -1; -        int              op_errno          = 0; - -        VALIDATE_OR_GOTO (this, out); +        int              op_errno          = ENOMEM;          GF_IF_NATIVE_XATTR_GOTO ("trusted.afr.*",                                   name, op_errno, out); @@ -1916,34 +1239,37 @@ afr_removexattr (call_frame_t *frame, 
xlator_t *this,          GF_IF_NATIVE_XATTR_GOTO ("trusted.glusterfs.afr.*",                                   name, op_errno, out); -        VALIDATE_OR_GOTO (frame, out); -        VALIDATE_OR_GOTO (this->private, out); -        VALIDATE_OR_GOTO (loc, out); -          priv = this->private;          QUORUM_CHECK(removexattr,out);          transaction_frame = copy_frame (frame); -        if (!transaction_frame) { -                op_errno = ENOMEM; +        if (!transaction_frame)                  goto out; -        } -        AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); -        local = transaction_frame->local; - -        ret = afr_local_init (local, priv, &op_errno); -        if (ret < 0) -                goto out; +	local = AFR_FRAME_INIT (transaction_frame, op_errno); +	if (!local) +		goto out;          local->cont.removexattr.name = gf_strdup (name); -        local->transaction.fop    = afr_removexattr_wind; -        local->transaction.done   = afr_removexattr_done; +	if (xdata) +		local->xdata_req = dict_copy_with_ref (xdata, NULL); +	else +		local->xdata_req = dict_new (); + +	if (!local->xdata_req) +		goto out; + +        local->transaction.wind   = afr_removexattr_wind; +        local->transaction.fop    = __afr_txn_write_fop; +        local->transaction.done   = __afr_txn_write_done;          local->transaction.unwind = afr_removexattr_unwind;          loc_copy (&local->loc, loc); +	local->inode = inode_ref (loc->inode); + +	local->op = GF_FOP_REMOVEXATTR;          local->transaction.main_frame = frame;          local->transaction.start   = LLONG_MAX - 1; @@ -1951,18 +1277,16 @@ afr_removexattr (call_frame_t *frame, xlator_t *this,          ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);          if (ret < 0) { -            op_errno = -ret; -            goto out; +		op_errno = -ret; +		goto out;          } -        ret = 0; +	return 0;  out: -        if (ret < 0) { -                if (transaction_frame) -            
            AFR_STACK_DESTROY (transaction_frame); -                AFR_STACK_UNWIND (removexattr, frame, -1, op_errno, NULL); -        } +	if (transaction_frame) +		AFR_STACK_DESTROY (transaction_frame); +	AFR_STACK_UNWIND (removexattr, frame, -1, op_errno, NULL);          return 0;  } @@ -1975,19 +1299,12 @@ afr_fremovexattr_unwind (call_frame_t *frame, xlator_t *this)          local = frame->local; -        LOCK (&frame->lock); -        { -                if (local->transaction.main_frame) -                        main_frame = local->transaction.main_frame; -                local->transaction.main_frame = NULL; -        } -        UNLOCK (&frame->lock); +	main_frame = afr_transaction_detach_fop_frame (frame); +        if (!main_frame) +		return 0; -        if (main_frame) { -                AFR_STACK_UNWIND (fremovexattr, main_frame, -                                  local->op_ret, local->op_errno, -                                  NULL); -        } +	AFR_STACK_UNWIND (fremovexattr, main_frame, local->op_ret, local->op_errno, +			  local->xdata_rsp);          return 0;  } @@ -1996,105 +1313,38 @@ int  afr_fremovexattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                            int32_t op_ret, int32_t op_errno, dict_t *xdata)  { -        afr_local_t *   local       = NULL; -        afr_private_t * priv        = NULL; -        int             call_count  = -1; -        int             need_unwind = 0; -        int             child_index = (long) cookie; - -        local = frame->local; -        priv = this->private; - -        LOCK (&frame->lock); -        { -                __inode_write_fop_cbk (frame, child_index, -1, this, -                                       &op_ret, &op_errno, NULL, NULL, -                                       xdata); - -                if (local->success_count == priv->wait_count) { -                        need_unwind = 1; -                } -        } -        UNLOCK (&frame->lock); - -        if 
(need_unwind) -                local->transaction.unwind (frame, this); - -        call_count = afr_frame_return (frame); - -        if (call_count == 0) { -                local->transaction.resume (frame, this); -        } - -        return 0; +	return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, +				      NULL, NULL, xdata);  } -int32_t -afr_fremovexattr_wind (call_frame_t *frame, xlator_t *this) +int +afr_fremovexattr_wind (call_frame_t *frame, xlator_t *this, int subvol)  {          afr_local_t *local = NULL;          afr_private_t *priv = NULL; -        int call_count = -1; -        int i = 0;          local = frame->local;          priv = this->private; -        call_count = afr_pre_op_done_children_count (local->transaction.pre_op, -                                                     priv->child_count); - -        if (call_count == 0) { -                local->transaction.resume (frame, this); -                return 0; -        } - -        local->call_count = call_count; - -        for (i = 0; i < priv->child_count; i++) { -                if (local->transaction.pre_op[i]) { -                        STACK_WIND_COOKIE (frame, afr_fremovexattr_wind_cbk, -                                           (void *) (long) i, -                                           priv->children[i], -                                           priv->children[i]->fops->fremovexattr, -                                           local->fd, -                                           local->cont.removexattr.name, -                                           NULL); - -                        if (!--call_count) -                                break; -                } -        } - +	STACK_WIND_COOKIE (frame, afr_fremovexattr_wind_cbk, (void *) (long) subvol, +			   priv->children[subvol], +			   priv->children[subvol]->fops->fremovexattr, +			   local->fd, local->cont.removexattr.name, +			   local->xdata_req);          return 0;  }  int -afr_fremovexattr_done 
(call_frame_t *frame, xlator_t *this) +afr_fremovexattr (call_frame_t *frame, xlator_t *this, fd_t *fd, +		  const char *name, dict_t *xdata)  { -        afr_local_t * local = frame->local; - -        local->transaction.unwind (frame, this); - -        AFR_STACK_DESTROY (frame); - -        return 0; -} - - -int -afr_fremovexattr (call_frame_t *frame, xlator_t *this, -                  fd_t *fd, const char *name, dict_t *xdata) -{ -        afr_private_t * priv  = NULL; -        afr_local_t   * local = NULL; -        call_frame_t   *transaction_frame = NULL; +	afr_private_t *priv = NULL; +        afr_local_t *local = NULL; +        call_frame_t *transaction_frame = NULL;          int ret = -1; -        int op_ret   = -1; -        int op_errno = 0; - -        VALIDATE_OR_GOTO (this, out); +        int op_errno = ENOMEM;          GF_IF_NATIVE_XATTR_GOTO ("trusted.afr.*",                                   name, op_errno, out); @@ -2102,64 +1352,59 @@ afr_fremovexattr (call_frame_t *frame, xlator_t *this,          GF_IF_NATIVE_XATTR_GOTO ("trusted.glusterfs.afr.*",                                   name, op_errno, out); -        VALIDATE_OR_GOTO (frame, out); -        VALIDATE_OR_GOTO (this->private, out); - -        priv = this->private; -        if (afr_is_split_brain (this, fd->inode)) { -                op_errno = EIO; -                goto out; -        } +	priv = this->private;          QUORUM_CHECK(fremovexattr, out);          transaction_frame = copy_frame (frame); -        if (!transaction_frame) { +        if (!transaction_frame)                  goto out; -        } -        AFR_LOCAL_ALLOC_OR_GOTO (local, out); - -        ret = afr_local_init (local, priv, &op_errno); -        if (ret < 0) { -                op_errno = -ret; +	local = AFR_FRAME_INIT (transaction_frame, op_errno); +	if (!local)                  goto out; -        } - -        transaction_frame->local = local; - -        local->op_ret = -1;          local->cont.removexattr.name = gf_strdup 
(name); +	if (xdata) +		local->xdata_req = dict_copy_with_ref (xdata, NULL); +	else +		local->xdata_req = dict_new (); -        local->transaction.fop    = afr_fremovexattr_wind; -        local->transaction.done   = afr_fremovexattr_done; +	if (!local->xdata_req) +		goto out; + +        local->transaction.wind   = afr_fremovexattr_wind; +        local->transaction.fop    = __afr_txn_write_fop; +        local->transaction.done   = __afr_txn_write_done;          local->transaction.unwind = afr_fremovexattr_unwind;          local->fd = fd_ref (fd); +	local->inode = inode_ref (fd->inode); + +	local->op = GF_FOP_FREMOVEXATTR;          local->transaction.main_frame = frame;          local->transaction.start   = LLONG_MAX - 1;          local->transaction.len     = 0; -        op_ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); -        if (op_ret < 0) { -            op_errno = -op_ret; -            goto out; +        ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); +        if (ret < 0) { +		op_errno = -ret; +		goto out;          } -        op_ret = 0; +	return 0;  out: -        if (op_ret < 0) { -                if (transaction_frame) -                        AFR_STACK_DESTROY (transaction_frame); -                AFR_STACK_UNWIND (fremovexattr, frame, op_ret, op_errno, NULL); -        } +	if (transaction_frame) +		AFR_STACK_DESTROY (transaction_frame); + +	AFR_STACK_UNWIND (fremovexattr, frame, -1, op_errno, NULL);          return 0;  } -static int + +int  afr_fallocate_unwind (call_frame_t *frame, xlator_t *this)  {          afr_local_t *   local = NULL; @@ -2167,147 +1412,88 @@ afr_fallocate_unwind (call_frame_t *frame, xlator_t *this)          local = frame->local; -        LOCK (&frame->lock); -        { -                if (local->transaction.main_frame) -                        main_frame = local->transaction.main_frame; -                local->transaction.main_frame = NULL; -        } -        UNLOCK 
(&frame->lock); +	main_frame = afr_transaction_detach_fop_frame (frame); +	if (!main_frame) +		return 0; -        if (main_frame) { -                AFR_STACK_UNWIND (fallocate, main_frame, local->op_ret, -                                  local->op_errno, -                                  &local->cont.inode_wfop.prebuf, -                                  &local->cont.inode_wfop.postbuf, -                                  NULL); -        } +	AFR_STACK_UNWIND (fallocate, main_frame, local->op_ret, local->op_errno, +			  &local->cont.inode_wfop.prebuf, +			  &local->cont.inode_wfop.postbuf, local->xdata_rsp);          return 0;  } -static int + +int  afr_fallocate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                          int32_t op_ret, int32_t op_errno, struct iatt *prebuf,                          struct iatt *postbuf, dict_t *xdata)  { -        afr_local_t *   local = NULL; -        afr_private_t * priv  = NULL; -        int child_index = (long) cookie; -        int call_count  = -1; -        int need_unwind = 0; -        int read_child  = 0; - -        local = frame->local; -        priv  = this->private; - -        read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); - -        LOCK (&frame->lock); -        { -                if (child_index == read_child) { -                        local->read_child_returned = _gf_true; -                } - -                __inode_write_fop_cbk (frame, child_index, read_child, this, -                                       &op_ret, &op_errno, prebuf, postbuf, -                                       xdata); - -                if ((local->success_count >= priv->wait_count) -                    && local->read_child_returned) { -                        need_unwind = 1; -                } -        } -        UNLOCK (&frame->lock); - -        if (need_unwind) -                local->transaction.unwind (frame, this); - -        call_count = afr_frame_return (frame); - -        if (call_count 
== 0) { -                local->transaction.resume (frame, this); -        } - -        return 0; +        return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, +				      prebuf, postbuf, xdata);  } -static int -afr_fallocate_wind (call_frame_t *frame, xlator_t *this) + +int +afr_fallocate_wind (call_frame_t *frame, xlator_t *this, int subvol)  {          afr_local_t *local = NULL;          afr_private_t *priv = NULL; -        int call_count = -1; -        int i = 0;          local = frame->local;          priv = this->private; -        call_count = afr_pre_op_done_children_count (local->transaction.pre_op, -                                                     priv->child_count); - -        if (call_count == 0) { -                local->transaction.resume (frame, this); -                return 0; -        } - -        local->call_count = call_count; - -        for (i = 0; i < priv->child_count; i++) { -                if (local->transaction.pre_op[i]) { -                        STACK_WIND_COOKIE (frame, afr_fallocate_wind_cbk, -                                           (void *) (long) i, -                                           priv->children[i], -                                           priv->children[i]->fops->fallocate, -                                           local->fd, -                                           local->cont.fallocate.mode, -                                           local->cont.fallocate.offset, -                                           local->cont.fallocate.len, -                                           NULL); - -                        if (!--call_count) -                                break; -                } -        } - +	STACK_WIND_COOKIE (frame, afr_fallocate_wind_cbk, (void *) (long) subvol, +			   priv->children[subvol], +			   priv->children[subvol]->fops->fallocate, +			   local->fd, local->cont.fallocate.mode, +			   local->cont.fallocate.offset, +			   local->cont.fallocate.len, local->xdata_req);    
      return 0;  } -static int -afr_fallocate_done (call_frame_t *frame, xlator_t *this) + +int +afr_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, +               off_t offset, size_t len, dict_t *xdata)  { +	afr_private_t *priv = NULL; +        call_frame_t *transaction_frame = NULL;          afr_local_t *local = NULL; +        int ret = -1; +        int op_errno = ENOMEM; -        local = frame->local; +	priv = this->private; -        local->transaction.unwind (frame, this); +        QUORUM_CHECK(fallocate,out); -        AFR_STACK_DESTROY (frame); +        transaction_frame = copy_frame (frame); +        if (!transaction_frame) +                goto out; -        return 0; -} +	local = AFR_FRAME_INIT (transaction_frame, op_errno); +	if (!local) +		goto out; -static int -afr_do_fallocate (call_frame_t *frame, xlator_t *this) -{ -        call_frame_t * transaction_frame = NULL; -        afr_local_t *  local             = NULL; -        int op_ret   = -1; -        int op_errno = 0; +        local->cont.fallocate.mode = mode; +        local->cont.fallocate.offset  = offset; +        local->cont.fallocate.len = len; -        local = frame->local; +        local->fd = fd_ref (fd); +	local->inode = inode_ref (fd->inode); -        transaction_frame = copy_frame (frame); -        if (!transaction_frame) { -                goto out; -        } +	if (xdata) +		local->xdata_req = dict_copy_with_ref (xdata, NULL); +	else +		local->xdata_req = dict_new (); -        transaction_frame->local = local; -        frame->local = NULL; +	if (!local->xdata_req) +		goto out;          local->op = GF_FOP_FALLOCATE; -        local->transaction.fop    = afr_fallocate_wind; -        local->transaction.done   = afr_fallocate_done; +        local->transaction.wind   = afr_fallocate_wind; +        local->transaction.fop    = __afr_txn_write_fop; +        local->transaction.done   = __afr_txn_write_done;          local->transaction.unwind = afr_fallocate_unwind;         
 local->transaction.main_frame = frame; @@ -2315,80 +1501,29 @@ afr_do_fallocate (call_frame_t *frame, xlator_t *this)          local->transaction.start   = local->cont.fallocate.offset;          local->transaction.len     = 0; -        /* fallocate can modify the file size */ -        op_ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); -        if (op_ret < 0) { -            op_errno = -op_ret; -            goto out; -        } - -        op_ret = 0; -out: -        if (op_ret < 0) { -                if (transaction_frame) -                        AFR_STACK_DESTROY (transaction_frame); -                AFR_STACK_UNWIND (fallocate, frame, op_ret, op_errno, NULL, -                                  NULL, NULL); -        } - -        return 0; -} - -int -afr_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, -               off_t offset, size_t len, dict_t *xdata) -{ -        afr_private_t * priv  = NULL; -        afr_local_t   * local = NULL; -        call_frame_t   *transaction_frame = NULL; -        int ret = -1; -        int op_errno = 0; - -        VALIDATE_OR_GOTO (frame, out); -        VALIDATE_OR_GOTO (this, out); -        VALIDATE_OR_GOTO (this->private, out); +        afr_fix_open (fd, this); -        priv = this->private; - -        if (afr_is_split_brain (this, fd->inode)) { -                op_errno = EIO; -                goto out; +        ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); +        if (ret < 0) { +		op_errno = -ret; +		goto out;          } -        QUORUM_CHECK(fallocate,out); - -        AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); -        local = frame->local; - -        ret = afr_local_init (local, priv, &op_errno); -        if (ret < 0) -                goto out; -        local->cont.fallocate.mode = mode; -        local->cont.fallocate.offset  = offset; -        local->cont.fallocate.len = len; - -        local->fd = fd_ref (fd); - -        afr_open_fd_fix (fd, this); - 
-        afr_do_fallocate (frame, this); - -        ret = 0; +	return 0;  out: -        if (ret < 0) { -                if (transaction_frame) -                        AFR_STACK_DESTROY (transaction_frame); -                AFR_STACK_UNWIND (fallocate, frame, -1, op_errno, NULL, NULL, NULL); -        } +	if (transaction_frame) +		AFR_STACK_DESTROY (transaction_frame); +	AFR_STACK_UNWIND (fallocate, frame, -1, op_errno, NULL, NULL, NULL);          return 0;  } +  /* }}} */  /* {{{ discard */ -static int +int  afr_discard_unwind (call_frame_t *frame, xlator_t *this)  {          afr_local_t *   local = NULL; @@ -2396,146 +1531,86 @@ afr_discard_unwind (call_frame_t *frame, xlator_t *this)          local = frame->local; -        LOCK (&frame->lock); -        { -                if (local->transaction.main_frame) -                        main_frame = local->transaction.main_frame; -                local->transaction.main_frame = NULL; -        } -        UNLOCK (&frame->lock); +	main_frame = afr_transaction_detach_fop_frame (frame); +        if (!main_frame) +		return 0; -        if (main_frame) { -                AFR_STACK_UNWIND (discard, main_frame, local->op_ret, -                                  local->op_errno, -                                  &local->cont.inode_wfop.prebuf, -                                  &local->cont.inode_wfop.postbuf, -                                  NULL); -        } +	AFR_STACK_UNWIND (discard, main_frame, local->op_ret, local->op_errno, +			  &local->cont.inode_wfop.prebuf, +			  &local->cont.inode_wfop.postbuf, local->xdata_rsp);          return 0;  } -static int + +int  afr_discard_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                        int32_t op_ret, int32_t op_errno, struct iatt *prebuf,                        struct iatt *postbuf, dict_t *xdata)  { -        afr_local_t *   local = NULL; -        afr_private_t * priv  = NULL; -        int child_index = (long) cookie; -        int call_count  = -1; -   
     int need_unwind = 0; -        int read_child  = 0; - -        local = frame->local; -        priv  = this->private; - -        read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); - -        LOCK (&frame->lock); -        { -                if (child_index == read_child) { -                        local->read_child_returned = _gf_true; -                } - -                __inode_write_fop_cbk (frame, child_index, read_child, this, -                                       &op_ret, &op_errno, prebuf, postbuf, -                                       xdata); - -                if ((local->success_count >= priv->wait_count) -                    && local->read_child_returned) { -                        need_unwind = 1; -                } -        } -        UNLOCK (&frame->lock); - -        if (need_unwind) -                local->transaction.unwind (frame, this); - -        call_count = afr_frame_return (frame); - -        if (call_count == 0) { -                local->transaction.resume (frame, this); -        } - -        return 0; +	return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, +				      prebuf, postbuf, xdata);  } -static int -afr_discard_wind (call_frame_t *frame, xlator_t *this) + +int +afr_discard_wind (call_frame_t *frame, xlator_t *this, int subvol)  {          afr_local_t *local = NULL;          afr_private_t *priv = NULL; -        int call_count = -1; -        int i = 0;          local = frame->local;          priv = this->private; -        call_count = afr_pre_op_done_children_count (local->transaction.pre_op, -                                                     priv->child_count); - -        if (call_count == 0) { -                local->transaction.resume (frame, this); -                return 0; -        } - -        local->call_count = call_count; - -        for (i = 0; i < priv->child_count; i++) { -                if (local->transaction.pre_op[i]) { -                        STACK_WIND_COOKIE (frame, 
afr_discard_wind_cbk, -                                           (void *) (long) i, -                                           priv->children[i], -                                           priv->children[i]->fops->discard, -                                           local->fd, -                                           local->cont.discard.offset, -                                           local->cont.discard.len, -                                           NULL); - -                        if (!--call_count) -                                break; -                } -        } - +	STACK_WIND_COOKIE (frame, afr_discard_wind_cbk, (void *) (long) subvol, +			   priv->children[subvol], +			   priv->children[subvol]->fops->discard, +			   local->fd, local->cont.discard.offset, +			   local->cont.discard.len, local->xdata_req);          return 0;  } -static int -afr_discard_done (call_frame_t *frame, xlator_t *this) + +int +afr_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, +             size_t len, dict_t *xdata)  { +        afr_private_t *priv  = NULL;          afr_local_t *local = NULL; +        call_frame_t *transaction_frame = NULL; +        int ret = -1; +        int op_errno = ENOMEM; -        local = frame->local; +        priv = this->private; -        local->transaction.unwind (frame, this); +        QUORUM_CHECK(discard, out); -        AFR_STACK_DESTROY (frame); +	transaction_frame = copy_frame (frame); +	if (!transaction_frame) +		goto out; -        return 0; -} +	local = AFR_FRAME_INIT (frame, op_errno); +	if (!local) +		goto out; -static int -afr_do_discard (call_frame_t *frame, xlator_t *this) -{ -        call_frame_t * transaction_frame = NULL; -        afr_local_t *  local             = NULL; -        int op_ret   = -1; -        int op_errno = 0; +        local->cont.discard.offset  = offset; +        local->cont.discard.len = len; -        local = frame->local; +        local->fd = fd_ref (fd); +	local->inode = inode_ref 
(fd->inode); -        transaction_frame = copy_frame (frame); -        if (!transaction_frame) { -                goto out; -        } +	if (xdata) +		local->xdata_req = dict_copy_with_ref (xdata, NULL); +	else +		local->xdata_req = dict_new (); -        transaction_frame->local = local; -        frame->local = NULL; +	if (!local->xdata_req) +		goto out;          local->op = GF_FOP_DISCARD; -        local->transaction.fop    = afr_discard_wind; -        local->transaction.done   = afr_discard_done; +        local->transaction.wind   = afr_discard_wind; +        local->transaction.fop    = __afr_txn_write_fop; +        local->transaction.done   = __afr_txn_write_done;          local->transaction.unwind = afr_discard_unwind;          local->transaction.main_frame = frame; @@ -2543,316 +1618,134 @@ afr_do_discard (call_frame_t *frame, xlator_t *this)          local->transaction.start   = local->cont.discard.offset;          local->transaction.len     = 0; -        op_ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); -        if (op_ret < 0) { -            op_errno = -op_ret; -            goto out; -        } +        afr_fix_open (fd, this); -        op_ret = 0; -out: -        if (op_ret < 0) { -                if (transaction_frame) -                        AFR_STACK_DESTROY (transaction_frame); -                AFR_STACK_UNWIND (discard, frame, op_ret, op_errno, NULL, -                                  NULL, NULL); -        } - -        return 0; -} - -int -afr_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, -             size_t len, dict_t *xdata) -{ -        afr_private_t * priv  = NULL; -        afr_local_t   * local = NULL; -        call_frame_t   *transaction_frame = NULL; -        int ret = -1; -        int op_errno = 0; - -        VALIDATE_OR_GOTO (frame, out); -        VALIDATE_OR_GOTO (this, out); -        VALIDATE_OR_GOTO (this->private, out); - -        priv = this->private; - -        if (afr_is_split_brain 
(this, fd->inode)) { -                op_errno = EIO; -                goto out; +        ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); +        if (ret < 0) { +		op_errno = -ret; +		goto out;          } -        QUORUM_CHECK(discard, out); - -        AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); -        local = frame->local; - -        ret = afr_local_init (local, priv, &op_errno); -        if (ret < 0) -                goto out; -        local->cont.discard.offset  = offset; -        local->cont.discard.len = len; - -        local->fd = fd_ref (fd); - -        afr_open_fd_fix (fd, this); - -        afr_do_discard(frame, this); - -        ret = 0; +	return 0;  out: -        if (ret < 0) { -                if (transaction_frame) -                        AFR_STACK_DESTROY (transaction_frame); -                AFR_STACK_UNWIND (discard, frame, -1, op_errno, NULL, NULL, NULL); -        } +	if (transaction_frame) +		AFR_STACK_DESTROY (transaction_frame); +	AFR_STACK_UNWIND (discard, frame, -1, op_errno, NULL, NULL, NULL);          return 0;  }  /* {{{ zerofill */ -static int +int  afr_zerofill_unwind (call_frame_t *frame, xlator_t *this)  { -        afr_local_t     *local            = NULL; -        call_frame_t    *main_frame       = NULL; +        afr_local_t *   local = NULL; +        call_frame_t   *main_frame = NULL;          local = frame->local; -        LOCK (&frame->lock); -        { -                if (local->transaction.main_frame) { -                        main_frame = local->transaction.main_frame; -                } -                local->transaction.main_frame = NULL; -        } -        UNLOCK (&frame->lock); +	main_frame = afr_transaction_detach_fop_frame (frame); +        if (!main_frame) +		return 0; -        if (main_frame) { -                AFR_STACK_UNWIND (zerofill, main_frame, local->op_ret, -                                  local->op_errno, -                                  &local->cont.zerofill.prebuf, -          
                        &local->cont.zerofill.postbuf, -                                  NULL); -        } +	AFR_STACK_UNWIND (discard, main_frame, local->op_ret, local->op_errno, +			  &local->cont.inode_wfop.prebuf, +			  &local->cont.inode_wfop.postbuf, local->xdata_rsp);          return 0;  } -static int -afr_zerofill_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, -                     int32_t op_ret, int32_t op_errno, struct iatt *prebuf, -                     struct iatt *postbuf, dict_t *xdata) -{ -        afr_local_t       *local             = NULL; -        afr_private_t     *priv              = NULL; -        int                child_index       = (long) cookie; -        int                call_count        = -1; -        int                need_unwind       = 0; -        int                read_child        = 0; - -        local = frame->local; -        priv  = this->private; - -        read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); - -        LOCK (&frame->lock); -        { -                if (child_index == read_child) { -                        local->read_child_returned = _gf_true; -                } - -                if (afr_fop_failed (op_ret, op_errno)) { -                        afr_transaction_fop_failed (frame, this, child_index); -                } -                if (op_ret != -1) { -                        if (local->success_count == 0) { -                                local->op_ret = op_ret; -                                local->cont.zerofill.prebuf  = *prebuf; -                                local->cont.zerofill.postbuf = *postbuf; -                        } - -                        if (child_index == read_child) { -                                local->cont.zerofill.prebuf  = *prebuf; -                                local->cont.zerofill.postbuf = *postbuf; -                        } - -                        local->success_count++; - -                        if ((local->success_count >= 
priv->wait_count) -                            && local->read_child_returned) { -                                need_unwind = 1; -                        } -                } -                local->op_errno = op_errno; -        } -        UNLOCK (&frame->lock); - -        if (need_unwind) { -                local->transaction.unwind (frame, this); -        } -        call_count = afr_frame_return (frame); - -        if (call_count == 0) { -                local->transaction.resume (frame, this); -        } - -        return 0; +int +afr_zerofill_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                      int32_t op_ret, int32_t op_errno, struct iatt *prebuf, +                      struct iatt *postbuf, dict_t *xdata) +{ +	return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, +				      prebuf, postbuf, xdata);  } -static int -afr_zerofill_wind (call_frame_t *frame, xlator_t *this) + +int +afr_zerofill_wind (call_frame_t *frame, xlator_t *this, int subvol)  { -        afr_local_t    *local         = NULL; -        afr_private_t  *priv          = NULL; -        int             call_count    = -1; -        int             i             = 0; +        afr_local_t *local = NULL; +        afr_private_t *priv = NULL;          local = frame->local;          priv = this->private; -        call_count = afr_pre_op_done_children_count (local->transaction.pre_op, -                                                     priv->child_count); - -        if (call_count == 0) { -                local->transaction.resume (frame, this); -                return 0; -        } - -        local->call_count = call_count; - -        for (i = 0; i < priv->child_count; i++) { -                if (local->transaction.pre_op[i]) { -                        STACK_WIND_COOKIE (frame, afr_zerofill_wind_cbk, -                                           (void *) (long) i, -                                           priv->children[i], -                                 
          priv->children[i]->fops->zerofill, -                                           local->fd, -                                           local->cont.zerofill.offset, -                                           local->cont.zerofill.len, -                                           NULL); - -                        if (!--call_count) -                                break; -                } -        } - +	STACK_WIND_COOKIE (frame, afr_zerofill_wind_cbk, (void *) (long) subvol, +			   priv->children[subvol], +			   priv->children[subvol]->fops->zerofill, +			   local->fd, local->cont.zerofill.offset, +			   local->cont.zerofill.len, local->xdata_req);          return 0;  } -static int -afr_zerofill_done (call_frame_t *frame, xlator_t *this) +int +afr_zerofill (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, +             size_t len, dict_t *xdata)  { +        afr_private_t *priv  = NULL;          afr_local_t *local = NULL; +        call_frame_t *transaction_frame = NULL; +        int ret = -1; +        int op_errno = ENOMEM; -        local = frame->local; +        priv = this->private; -        local->transaction.unwind (frame, this); +        QUORUM_CHECK(discard, out); -        AFR_STACK_DESTROY (frame); +	transaction_frame = copy_frame (frame); +	if (!transaction_frame) +		goto out; -        return 0; -} +	local = AFR_FRAME_INIT (frame, op_errno); +	if (!local) +		goto out; -static int -afr_do_zerofill(call_frame_t *frame, xlator_t *this) -{ -        call_frame_t  *transaction_frame = NULL; -        afr_local_t   *local             = NULL; -        int            op_ret            = -1; -        int            op_errno          = 0; +        local->cont.zerofill.offset  = offset; +        local->cont.zerofill.len = len; -        local = frame->local; +        local->fd = fd_ref (fd); +	local->inode = inode_ref (fd->inode); -        transaction_frame = copy_frame (frame); -        if (!transaction_frame) { -                goto out; -        } +	
if (xdata) +		local->xdata_req = dict_copy_with_ref (xdata, NULL); +	else +		local->xdata_req = dict_new (); -        transaction_frame->local = local; -        frame->local = NULL; +	if (!local->xdata_req) +		goto out;          local->op = GF_FOP_ZEROFILL; -        local->transaction.fop    = afr_zerofill_wind; -        local->transaction.done   = afr_zerofill_done; +        local->transaction.wind   = afr_zerofill_wind; +        local->transaction.fop    = __afr_txn_write_fop; +        local->transaction.done   = __afr_txn_write_done;          local->transaction.unwind = afr_zerofill_unwind;          local->transaction.main_frame = frame; -        local->transaction.start   = local->cont.zerofill.offset; -        local->transaction.len     = 0; - -        op_ret = afr_transaction (transaction_frame, this, -                                  AFR_DATA_TRANSACTION); -        if (op_ret < 0) { -                op_errno = -op_ret; -                goto out; -        } - -        op_ret = 0; -out: -        if (op_ret < 0) { -                if (transaction_frame) { -                        AFR_STACK_DESTROY (transaction_frame); -                } -                AFR_STACK_UNWIND (zerofill, frame, op_ret, op_errno, NULL, -                                  NULL, NULL); -        } - -        return 0; -} - -int -afr_zerofill (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, -              off_t len, dict_t *xdata) -{ -        afr_private_t   *priv               = NULL; -        afr_local_t     *local              = NULL; -        call_frame_t    *transaction_frame  = NULL; -        int              ret                = -1; -        int              op_errno           = 0; - -        VALIDATE_OR_GOTO (frame, out); -        VALIDATE_OR_GOTO (this, out); -        VALIDATE_OR_GOTO (this->private, out); - -        priv = this->private; - -        if (afr_is_split_brain (this, fd->inode)) { -                op_errno = EIO; -                goto out; -        } -     
   QUORUM_CHECK(zerofill, out); +        local->transaction.start   = local->cont.discard.offset; +        local->transaction.len     = len; -        AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); -        local = frame->local; +        afr_fix_open (fd, this); -        ret = afr_local_init (local, priv, &op_errno); +        ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION);          if (ret < 0) { -                goto out; +		op_errno = -ret; +		goto out;          } -        local->cont.zerofill.offset  = offset; -        local->cont.zerofill.len = len; - -        local->fd = fd_ref (fd); - -        afr_open_fd_fix (fd, this); -        afr_do_zerofill(frame, this); - -        ret = 0; +	return 0;  out: -        if (ret < 0) { -                if (transaction_frame) { -                        AFR_STACK_DESTROY (transaction_frame); -                } -                AFR_STACK_UNWIND (zerofill, frame, -1, op_errno, NULL, -                                  NULL, NULL); -        } +	if (transaction_frame) +		AFR_STACK_DESTROY (transaction_frame); +	AFR_STACK_UNWIND (zerofill, frame, -1, op_errno, NULL, NULL, NULL);          return 0;  } diff --git a/xlators/cluster/afr/src/afr-lk-common.c b/xlators/cluster/afr/src/afr-lk-common.c index 060d78f3505..a2a758f35af 100644 --- a/xlators/cluster/afr/src/afr-lk-common.c +++ b/xlators/cluster/afr/src/afr-lk-common.c @@ -580,22 +580,6 @@ initialize_inodelk_variables (call_frame_t *frame, xlator_t *this)          return 0;  } -loc_t * -lower_path (loc_t *l1, const char *b1, loc_t *l2, const char *b2) -{ -        int ret = 0; - -        ret = uuid_compare (l1->inode->gfid, l2->inode->gfid); - -        if (ret == 0) -                ret = strcmp (b1, b2); - -        if (ret <= 0) -                return l1; -        else -                return l2; -} -  int  afr_lockee_locked_nodes_count (afr_internal_lock_t *int_lock)  { @@ -1213,8 +1197,7 @@ afr_blocking_lock (call_frame_t *frame, xlator_t *this)          
case AFR_ENTRY_RENAME_TRANSACTION:          case AFR_ENTRY_TRANSACTION: -                up_count = afr_up_children_count (local->child_up, -                                                  priv->child_count); +                up_count = AFR_COUNT (local->child_up, priv->child_count);                  int_lock->lk_call_count = int_lock->lk_expected_count                                          = (int_lock->lockee_count *                                             up_count); @@ -1648,496 +1631,6 @@ afr_unlock (call_frame_t *frame, xlator_t *this)  }  int -afr_mark_locked_nodes (xlator_t *this, fd_t *fd, -                       unsigned char *locked_nodes) -{ -        afr_private_t *priv  = NULL; -        afr_fd_ctx_t  *fdctx = NULL; -        uint64_t       tmp   = 0; -        int            ret   = 0; - -        priv = this->private; - -        ret = afr_fd_ctx_set (this, fd); -        if (ret) -                goto out; - -        ret = fd_ctx_get (fd, this, &tmp); -        if (ret) { -                gf_log (this->name, GF_LOG_INFO, -                        "failed to get the fd ctx"); -                goto out; -        } -        fdctx = (afr_fd_ctx_t *) (long) tmp; - -        GF_ASSERT (fdctx->locked_on); - -        memcpy (fdctx->locked_on, locked_nodes, -                priv->child_count); - -out: -        return ret; -} - -static int -__is_fd_saved (xlator_t *this, fd_t *fd) -{ -        afr_locked_fd_t *locked_fd = NULL; -        afr_private_t   *priv      = NULL; -        int              found     = 0; - -        priv = this->private; - -        list_for_each_entry (locked_fd, &priv->saved_fds, list) { -                if (locked_fd->fd == fd) { -                        found = 1; -                        break; -                } -        } - -        return found; -} - -static int -__afr_save_locked_fd (xlator_t *this, fd_t *fd) -{ -        afr_private_t   *priv      = NULL; -        afr_locked_fd_t *locked_fd = NULL; -        int              ret     
  = 0; - -        priv = this->private; - -        locked_fd = GF_CALLOC (1, sizeof (*locked_fd), -                               gf_afr_mt_locked_fd); -        if (!locked_fd) { -                ret = -1; -                goto out; -        } - -        locked_fd->fd = fd; -        INIT_LIST_HEAD (&locked_fd->list); - -        list_add_tail (&locked_fd->list, &priv->saved_fds); - -out: -        return ret; -} - -int -afr_save_locked_fd (xlator_t *this, fd_t *fd) -{ -        afr_private_t   *priv      = NULL; -        int              ret       = 0; - -        priv = this->private; - -        pthread_mutex_lock (&priv->mutex); -        { -                if (__is_fd_saved (this, fd)) { -                        gf_log (this->name, GF_LOG_DEBUG, -                                "fd=%p already saved", fd); -                        goto unlock; -                } - -                ret = __afr_save_locked_fd (this, fd); -                if (ret) { -                        gf_log (this->name, GF_LOG_INFO, -                                "fd=%p could not be saved", fd); -                        goto unlock; -                } -        } -unlock: -        pthread_mutex_unlock (&priv->mutex); - -        return ret; -} - -static int -afr_lock_recovery_cleanup (call_frame_t *frame, xlator_t *this) -{ -        afr_local_t     *local     = NULL; -        afr_locked_fd_t *locked_fd = NULL; - -        local = frame->local; - -        locked_fd = local->locked_fd; - -        STACK_DESTROY (frame->root); -        afr_local_cleanup (local, this); - -        afr_save_locked_fd (this, locked_fd->fd); - -        return 0; - -} - -static int -afr_get_source_lock_recovery (xlator_t *this, fd_t *fd) -{ -        afr_fd_ctx_t  *fdctx        = NULL; -        afr_private_t *priv         = NULL; -        uint64_t      tmp           = 0; -        int           i             = 0; -        int           source_child  = -1; -        int           ret           = 0; - -        priv = 
this->private; - -        ret = fd_ctx_get (fd, this, &tmp); -        if (ret) -                goto out; - -        fdctx = (afr_fd_ctx_t *) (long) tmp; - -        for (i = 0; i < priv->child_count; i++) { -                if (fdctx->locked_on[i]) { -                        gf_log (this->name, GF_LOG_DEBUG, -                                "Found lock recovery source=%d", i); -                        source_child = i; -                        break; -                } -        } - -out: -        return source_child; - -} - -int32_t -afr_get_locks_fd_cbk (call_frame_t *frame, void *cookie, xlator_t *this, -                      int32_t op_ret, int32_t op_errno, struct gf_flock *lock, -                      dict_t *xdata); -int32_t -afr_recover_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, -                      int32_t op_ret, int32_t op_errno, struct gf_flock *lock, -                      dict_t *xdata) -{ -        afr_local_t   *local = NULL; -        afr_private_t *priv  = NULL; -        int32_t source_child = 0; -        struct gf_flock flock   = {0,}; - -        local = frame->local; -        priv  = this->private; - -        if (op_ret) { -                gf_log (this->name, GF_LOG_INFO, -                        "lock recovery failed"); -                goto cleanup; -        } - -        source_child = local->source_child; - -        memcpy (&flock, lock, sizeof (*lock)); - -        STACK_WIND_COOKIE (frame, afr_get_locks_fd_cbk, -                           (void *) (long) source_child, -                           priv->children[source_child], -                           priv->children[source_child]->fops->lk, -                           local->fd, F_GETLK_FD, &flock, NULL); - -        return 0; - -cleanup: -        afr_lock_recovery_cleanup (frame, this); -        return 0; -} - -int -afr_recover_lock (call_frame_t *frame, xlator_t *this, -                  struct gf_flock *flock) -{ -        afr_local_t   *local             = NULL; -        
afr_private_t *priv              = NULL; -        int32_t      lock_recovery_child = 0; - -        priv  = this->private; -        local = frame->local; - -        lock_recovery_child = local->lock_recovery_child; - -        frame->root->lk_owner = flock->l_owner; - -        STACK_WIND_COOKIE (frame, afr_recover_lock_cbk, -                           (void *) (long) lock_recovery_child, -                           priv->children[lock_recovery_child], -                           priv->children[lock_recovery_child]->fops->lk, -                           local->fd, F_SETLK, flock, NULL); - -        return 0; -} - -static int -is_afr_lock_eol (struct gf_flock *lock) -{ -        int ret = 0; - -        if ((lock->l_type == GF_LK_EOL)) -                ret = 1; - -        return ret; -} - -int32_t -afr_get_locks_fd_cbk (call_frame_t *frame, void *cookie, xlator_t *this, -                      int32_t op_ret, int32_t op_errno, struct gf_flock *lock, -                      dict_t *xdata) -{ -        if (op_ret) { -                gf_log (this->name, GF_LOG_INFO, -                        "Failed to get locks on fd"); -                goto cleanup; -        } - -        gf_log (this->name, GF_LOG_DEBUG, -                "Got a lock on fd"); - -        if (is_afr_lock_eol (lock)) { -                gf_log (this->name, GF_LOG_INFO, -                        "Reached EOL on locks on fd"); -                goto cleanup; -        } - -        afr_recover_lock (frame, this, lock); - -        return 0; - -cleanup: -        afr_lock_recovery_cleanup (frame, this); - -        return 0; -} - -static int -afr_lock_recovery (call_frame_t *frame, xlator_t *this) -{ -        afr_local_t   *local        = NULL; -        afr_private_t *priv         = NULL; -        fd_t          *fd           = NULL; -        int            ret          = 0; -        int32_t        source_child = 0; -        struct gf_flock   flock        = {0,}; - -        priv  = this->private; -        local = 
frame->local; - -        fd = local->fd; - -        source_child = afr_get_source_lock_recovery (this, fd); -        if (source_child < 0) { -                gf_log (this->name, GF_LOG_ERROR, -                        "Could not recover locks due to lock " -                        "split brain"); -                ret = -1; -                goto out; -        } - -        local->source_child = source_child; - -        /* the flock can be zero filled as we're querying incrementally -           the locks held on the fd. -        */ -        STACK_WIND_COOKIE (frame, afr_get_locks_fd_cbk, -                           (void *) (long) source_child, -                           priv->children[source_child], -                           priv->children[source_child]->fops->lk, -                           local->fd, F_GETLK_FD, &flock, NULL); - -out: -        return ret; -} - - -static int -afr_mark_fd_opened (xlator_t *this, fd_t *fd, int32_t child_index) -{ -        afr_fd_ctx_t *fdctx = NULL; -        uint64_t      tmp   = 0; -        int           ret   = 0; - -        ret = fd_ctx_get (fd, this, &tmp); -        if (ret) -                goto out; - -        fdctx = (afr_fd_ctx_t *) (long) tmp; - -        fdctx->opened_on[child_index] = AFR_FD_OPENED; - -out: -        return ret; -} - -int32_t -afr_lock_recovery_preopen_cbk (call_frame_t *frame, void *cookie, xlator_t *this, -                               int32_t op_ret, int32_t op_errno, fd_t *fd, -                               dict_t *xdata) -{ -        int32_t child_index = (long )cookie; -        int ret = 0; - -        if (op_ret) { -                gf_log (this->name, GF_LOG_INFO, -                        "Reopen during lock-recovery failed"); -                goto cleanup; -        } - -        gf_log (this->name, GF_LOG_DEBUG, -                "Open succeeded => proceed to recover locks"); - -        ret = afr_lock_recovery (frame, this); -        if (ret) { -                gf_log (this->name, GF_LOG_INFO, -       
                 "Lock recovery failed"); -                goto cleanup; -        } - -        ret = afr_mark_fd_opened (this, fd, child_index); -        if (ret) { -                gf_log (this->name, GF_LOG_INFO, -                        "Marking fd open failed"); -                goto cleanup; -        } - -        return 0; - -cleanup: -        afr_lock_recovery_cleanup (frame, this); -        return 0; -} - -static int -afr_lock_recovery_preopen (call_frame_t *frame, xlator_t *this) -{ -        afr_private_t *priv  = NULL; -        afr_local_t   *local = NULL; -        uint64_t       tmp   = 0; -        afr_fd_ctx_t  *fdctx = NULL; -        loc_t          loc   = {0,}; -        int32_t        child_index = 0; -        int            ret   = 0; - -        priv  = this->private; -        local = frame->local; - -        GF_ASSERT (local && local->fd); - -        ret = fd_ctx_get (local->fd, this, &tmp); -        if (ret) -                gf_log (this->name, GF_LOG_WARNING, -                        "%s: failed to get the context of fd", -                        uuid_utoa (local->fd->inode->gfid)); -        fdctx = (afr_fd_ctx_t *) (long) tmp; -        /* TODO: instead we should return from the function */ -        GF_ASSERT (fdctx); - -        child_index = local->lock_recovery_child; - -        inode_path (local->fd->inode, NULL, (char **)&loc.path); -        loc.name   = strrchr (loc.path, '/'); -        loc.inode  = inode_ref (local->fd->inode); -        loc.parent = inode_parent (local->fd->inode, 0, NULL); - - -        STACK_WIND_COOKIE (frame, afr_lock_recovery_preopen_cbk, -                           (void *)(long) child_index, -                           priv->children[child_index], -                           priv->children[child_index]->fops->open, -                           &loc, fdctx->flags, local->fd, NULL); - -        return 0; -} - -static int -is_fd_opened (fd_t *fd, int32_t child_index) -{ -        afr_fd_ctx_t *fdctx = NULL; -        uint64_t  
    tmp = 0; -        int           ret = 0; - -        ret = fd_ctx_get (fd, THIS, &tmp); -        if (ret) -                goto out; - -        fdctx = (afr_fd_ctx_t *) (long) tmp; - -        if (fdctx->opened_on[child_index] == AFR_FD_OPENED) -                ret = 1; - -out: -        return ret; -} - -int -afr_attempt_lock_recovery (xlator_t *this, int32_t child_index) -{ -        call_frame_t    *frame      = NULL; -        afr_private_t   *priv       = NULL; -        afr_local_t     *local      = NULL; -        afr_locked_fd_t *locked_fd  = NULL; -        afr_locked_fd_t  *tmp       = NULL; -        int              ret        = -1; -        struct list_head locks_list = {0,}; -        int32_t          op_errno   = 0; - - -        priv = this->private; - -        if (list_empty (&priv->saved_fds)) -                goto out; - -        frame = create_frame (this, this->ctx->pool); -        if (!frame) { -                ret = -1; -                goto out; -        } - -        AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); -        local = frame->local; -        ret = afr_local_init (local, priv, &op_errno); -        if (ret < 0) { -                ret = -1; -                goto out; -        } - -        frame->local = local; - -        INIT_LIST_HEAD (&locks_list); - -        pthread_mutex_lock (&priv->mutex); -        { -                list_splice_init (&priv->saved_fds, &locks_list); -        } -        pthread_mutex_unlock (&priv->mutex); - -        list_for_each_entry_safe (locked_fd, tmp, -                                  &locks_list, list) { - -                list_del_init (&locked_fd->list); - -                local->fd                  = fd_ref (locked_fd->fd); -                local->lock_recovery_child = child_index; -                local->locked_fd           = locked_fd; - -                if (!is_fd_opened (locked_fd->fd, child_index)) { -                        gf_log (this->name, GF_LOG_DEBUG, -                                "attempting 
open before lock " -                                "recovery"); -                        afr_lock_recovery_preopen (frame, this); -                } else { -                        gf_log (this->name, GF_LOG_DEBUG, -                                "attempting lock recovery " -                                "without a preopen"); -                        afr_lock_recovery (frame, this); -                } -        } - -out: -        if ((ret < 0) && frame) -                AFR_STACK_DESTROY (frame); -        return ret; -} - -int  afr_lk_transfer_datalock (call_frame_t *dst, call_frame_t *src, char *dom,                            unsigned int child_count)  { diff --git a/xlators/cluster/afr/src/afr-mem-types.h b/xlators/cluster/afr/src/afr-mem-types.h index 73594f26526..05df90cc0ee 100644 --- a/xlators/cluster/afr/src/afr-mem-types.h +++ b/xlators/cluster/afr/src/afr-mem-types.h @@ -41,10 +41,8 @@ enum gf_afr_mem_types_ {          gf_afr_mt_shd_event_t,          gf_afr_mt_time_t,          gf_afr_mt_pos_data_t, -        gf_afr_mt_reply_t, -        gf_afr_mt_stats_t, -        gf_afr_mt_shd_crawl_event_t, -        gf_afr_mt_uint64_t, +	gf_afr_mt_reply_t, +	gf_afr_mt_subvol_healer_t,          gf_afr_mt_end  };  #endif diff --git a/xlators/cluster/afr/src/afr-open.c b/xlators/cluster/afr/src/afr-open.c index 643a5d692df..f86aa7fd80d 100644 --- a/xlators/cluster/afr/src/afr-open.c +++ b/xlators/cluster/afr/src/afr-open.c @@ -43,85 +43,29 @@  #include "afr-dir-read.h"  #include "afr-dir-write.h"  #include "afr-transaction.h" -#include "afr-self-heal.h" -#include "afr-self-heal-common.h" -int -afr_stale_child_up (afr_local_t *local, xlator_t *this) -{ -        int             i = 0; -        afr_private_t   *priv = NULL; -        int             up = -1; - -        priv = this->private; - -        if (!local->fresh_children) -                local->fresh_children = afr_children_create (priv->child_count); -        if (!local->fresh_children) -                goto out; - - 
       afr_inode_get_read_ctx (this, local->fd->inode, local->fresh_children); -        if (priv->child_count == afr_get_children_count (local->fresh_children, -                                                         priv->child_count)) -                goto out; -        for (i = 0; i < priv->child_count; i++) { -                if (!local->child_up[i]) -                        continue; -                if (afr_is_child_present (local->fresh_children, -                                          priv->child_count, i)) -                        continue; -                up = i; -                break; -        } -out: -        return up; -} - -void -afr_perform_data_self_heal (call_frame_t *frame, xlator_t *this) +gf_boolean_t +afr_is_fd_fixable (fd_t *fd)  { -        afr_local_t     *local = NULL; -        afr_self_heal_t *sh = NULL; -        inode_t         *inode = NULL; -        int             st_child = -1; -        char            reason[64] = {0}; - -        local = frame->local; -        sh = &local->self_heal; -        inode = local->fd->inode; - -        if (!IA_ISREG (inode->ia_type)) -                goto out; - -        st_child = afr_stale_child_up (local, this); -        if (st_child < 0) -                goto out; - -        sh->do_data_self_heal          = _gf_true; -        sh->do_metadata_self_heal      = _gf_true; -        sh->do_gfid_self_heal          = _gf_true; -        sh->do_missing_entry_self_heal = _gf_true; - -        snprintf (reason, sizeof (reason), "stale subvolume %d detected", -                  st_child); -        afr_launch_self_heal (frame, this, inode, _gf_true, inode->ia_type, -                              reason, NULL, NULL); -out: -        return; +        if (!fd || !fd->inode) +                return _gf_false; +        else if (fd_is_anonymous (fd)) +                return _gf_false; +        else if (uuid_is_null (fd->inode->gfid)) +                return _gf_false; + +        return _gf_true;  } +  int  
afr_open_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                          int32_t op_ret, int32_t op_errno, struct iatt *prebuf,                          struct iatt *postbuf, dict_t *xdata)  {          afr_local_t * local = frame->local; -        afr_private_t *priv = NULL; -        priv = this->private; -        if (afr_open_only_data_self_heal (priv->data_self_heal)) -                afr_perform_data_self_heal (frame, this);          AFR_STACK_UNWIND (open, frame, local->op_ret, local->op_errno,                            local->fd, xdata);          return 0; @@ -134,49 +78,38 @@ afr_open_cbk (call_frame_t *frame, void *cookie,                fd_t *fd, dict_t *xdata)  {          afr_local_t *  local       = NULL; -        int            ret         = 0;          int            call_count  = -1;          int            child_index = (long) cookie; -        afr_private_t *priv        = NULL; +	afr_fd_ctx_t  *fd_ctx = NULL; -        priv = this->private;          local = frame->local; +	fd_ctx = local->fd_ctx;          LOCK (&frame->lock);          {                  if (op_ret == -1) {                          local->op_errno = op_errno; -                } - -                if (op_ret >= 0) { +			fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED; +                } else {                          local->op_ret = op_ret; -                        local->success_count++; - -                        ret = afr_child_fd_ctx_set (this, fd, child_index, -                                                    local->cont.open.flags); -                        if (ret) { -                                local->op_ret = -1; -                                local->op_errno = -ret; -                                goto unlock; -                        } +			fd_ctx->opened_on[child_index] = AFR_FD_OPENED; +			if (!local->xdata_rsp && xdata) +				local->xdata_rsp = dict_ref (xdata);                  }          } -unlock:          UNLOCK (&frame->lock);     
     call_count = afr_frame_return (frame);          if (call_count == 0) { -                if ((local->cont.open.flags & O_TRUNC) -                    && (local->op_ret >= 0)) { +                if ((fd_ctx->flags & O_TRUNC) && (local->op_ret >= 0)) {                          STACK_WIND (frame, afr_open_ftruncate_cbk,                                      this, this->fops->ftruncate,                                      fd, 0, NULL);                  } else { -                        if (afr_open_only_data_self_heal (priv->data_self_heal)) -                                afr_perform_data_self_heal (frame, this);                          AFR_STACK_UNWIND (open, frame, local->op_ret, -                                          local->op_errno, local->fd, xdata); +                                          local->op_errno, local->fd, +					  local->xdata_rsp);                  }          } @@ -190,16 +123,11 @@ afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,          afr_private_t * priv       = NULL;          afr_local_t *   local      = NULL;          int             i          = 0; -        int             ret        = -1;          int32_t         call_count = 0;          int32_t         op_errno   = 0; -        int32_t         wind_flags = flags & (~O_TRUNC); -        //We can't let truncation to happen outside transaction. +	afr_fd_ctx_t   *fd_ctx = NULL; -        VALIDATE_OR_GOTO (frame, out); -        VALIDATE_OR_GOTO (this, out); -        VALIDATE_OR_GOTO (this->private, out); -        VALIDATE_OR_GOTO (loc, out); +        //We can't let truncation to happen outside transaction.          
priv = this->private; @@ -207,44 +135,38 @@ afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,                  QUORUM_CHECK(open,out);          } -        if (afr_is_split_brain (this, loc->inode)) { -                /* self-heal failed */ -                gf_log (this->name, GF_LOG_WARNING, -                        "failed to open as split brain seen, returning EIO"); -                op_errno = EIO; -                goto out; -        } - -        AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); -        local = frame->local; +	local = AFR_FRAME_INIT (frame, op_errno); +	if (!local) +		goto out; -        ret = afr_local_init (local, priv, &op_errno); -        if (ret < 0) -                goto out; +	fd_ctx = afr_fd_ctx_get (fd, this); +	if (!fd_ctx) { +		op_errno = ENOMEM; +		goto out; +	} -        call_count   = local->call_count; -        loc_copy (&local->loc, loc); +        local->fd = fd_ref (fd); +	local->fd_ctx = fd_ctx; +	fd_ctx->flags = flags; -        local->cont.open.flags   = flags; +        call_count = local->call_count; -        local->fd = fd_ref (fd); +        local->cont.open.flags = flags;          for (i = 0; i < priv->child_count; i++) {                  if (local->child_up[i]) {                          STACK_WIND_COOKIE (frame, afr_open_cbk, (void *) (long) i,                                             priv->children[i],                                             priv->children[i]->fops->open, -                                           loc, wind_flags, fd, xdata); - +                                           loc, (flags & ~O_TRUNC), fd, xdata);                          if (!--call_count)                                  break;                  }          } -        ret = 0; +	return 0;  out: -        if (ret < 0) -                AFR_STACK_UNWIND (open, frame, -1, op_errno, fd, xdata); +	AFR_STACK_UNWIND (open, frame, -1, op_errno, fd, NULL);          return 0;  } @@ -273,12 +195,7 @@ afr_openfd_fix_open_cbk 
(call_frame_t *frame, void *cookie, xlator_t *this,                          priv->children[child_index]->name);          } -        fd_ctx = afr_fd_ctx_get (local->fd, this); -        if (!fd_ctx) { -                gf_log (this->name, GF_LOG_WARNING, -                        "failed to get fd context, %p", local->fd); -                goto out; -        } +        fd_ctx = local->fd_ctx;          LOCK (&local->fd->lock);          { @@ -289,7 +206,7 @@ afr_openfd_fix_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                  }          }          UNLOCK (&local->fd->lock); -out: +          call_count = afr_frame_return (frame);          if (call_count == 0)                  AFR_STACK_DESTROY (frame); @@ -297,8 +214,42 @@ out:          return 0;  } + +static int +afr_fd_ctx_need_open (fd_t *fd, xlator_t *this, unsigned char *need_open) +{ +	afr_fd_ctx_t *fd_ctx = NULL; +	afr_private_t *priv = NULL; +	int i = 0; +	int count = 0; + +	priv = this->private; + +	fd_ctx = afr_fd_ctx_get (fd, this); +	if (!fd_ctx) +		return 0; + +	LOCK (&fd->lock); +	{ +		for (i = 0; i < priv->child_count; i++) { +			if (fd_ctx->opened_on[i] == AFR_FD_NOT_OPENED && +			    priv->child_up[i]) { +				fd_ctx->opened_on[i] = AFR_FD_OPENING; +				need_open[i] = 1; +				count++; +			} else { +				need_open[i] = 0; +			} +		} +	} +	UNLOCK (&fd->lock); + +	return count; +} + +  void -afr_fix_open (xlator_t *this, fd_t *fd, size_t need_open_count, int *need_open) +afr_fix_open (fd_t *fd, xlator_t *this)  {          afr_private_t *priv    = NULL;          int           i        = 0; @@ -307,29 +258,31 @@ afr_fix_open (xlator_t *this, fd_t *fd, size_t need_open_count, int *need_open)          int           ret      = -1;          int32_t       op_errno = 0;          afr_fd_ctx_t  *fd_ctx  = NULL; +	unsigned char *need_open = NULL; +	int call_count = 0;          priv  = this->private; -        if (!afr_is_fd_fixable (fd) || !need_open || !need_open_count) +        if 
(!afr_is_fd_fixable (fd))                  goto out;          fd_ctx = afr_fd_ctx_get (fd, this); -        if (!fd_ctx) { -                ret = -1; +        if (!fd_ctx)                  goto out; -        } + +	need_open = alloca0 (priv->child_count); + +	call_count = afr_fd_ctx_need_open (fd, this, need_open); +	if (!call_count) +		goto out;          frame = create_frame (this, this->ctx->pool); -        if (!frame) { -                ret = -1; +        if (!frame)                  goto out; -        } -        AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); -        local = frame->local; -        ret = afr_local_init (local, priv, &op_errno); -        if (ret < 0) -                goto out; +	local = AFR_FRAME_INIT (frame, op_errno); +	if (!local) +		goto out;          local->loc.inode = inode_ref (fd->inode);          ret = loc_path (&local->loc, NULL); @@ -337,10 +290,12 @@ afr_fix_open (xlator_t *this, fd_t *fd, size_t need_open_count, int *need_open)                  goto out;          local->fd = fd_ref (fd); -        local->call_count = need_open_count; +	local->fd_ctx = fd_ctx; + +        local->call_count = call_count; -        gf_log (this->name, GF_LOG_DEBUG, "need open count: %zd", -                need_open_count); +        gf_log (this->name, GF_LOG_DEBUG, "need open count: %d", +                call_count);          for (i = 0; i < priv->child_count; i++) {                  if (!need_open[i]) @@ -371,12 +326,12 @@ afr_fix_open (xlator_t *this, fd_t *fd, size_t need_open_count, int *need_open)                                             local->fd, NULL);                  } +		if (!--call_count) +			break;          } -        op_errno = 0; -        ret = 0; + +	return;  out: -        if (op_errno) -                ret = -1; //For handling ALLOC_OR_GOTO -        if (ret && frame) +        if (frame)                  AFR_STACK_DESTROY (frame);  } diff --git a/xlators/cluster/afr/src/afr-read-txn.c b/xlators/cluster/afr/src/afr-read-txn.c new file mode 
100644 index 00000000000..186f68c3359 --- /dev/null +++ b/xlators/cluster/afr/src/afr-read-txn.c @@ -0,0 +1,239 @@ +/* +  Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> +  This file is part of GlusterFS. + +  This file is licensed to you under your choice of the GNU Lesser +  General Public License, version 3 or any later version (LGPLv3 or +  later), or the GNU General Public License, version 2 (GPLv2), in all +  cases as published by the Free Software Foundation. +*/ + +#include "afr.h" +#include "afr-transaction.h" + +int +afr_read_txn_next_subvol (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t *local = NULL; +	afr_private_t *priv = NULL; +	int i = 0; +	int subvol = -1; + +	local = frame->local; +	priv = this->private; + + +	for (i = 0; i < priv->child_count; i++) { +		if (!local->readable[i]) { +			/* don't even bother trying here. +			   just mark as attempted and move on. */ +			local->read_attempted[i] = 1; +			continue; +		} + +		if (!local->read_attempted[i]) { +			subvol = i; +			break; +		} +	} + +	/* If no more subvols were available for reading, we leave +	   @subvol as -1, which is an indication we have run out of +	   readable subvols. */ +	if (subvol != -1) +		local->read_attempted[subvol] = 1; +	local->readfn (frame, this, subvol); + +	return 0; +} + + +int +afr_read_txn_refresh_done (call_frame_t *frame, xlator_t *this, int err) +{ +	afr_local_t *local = NULL; +	int read_subvol = 0; +	int event_generation = 0; +	inode_t *inode = NULL; +	int ret = -1; + +	local = frame->local; +	inode = local->inode; + +	if (err) { +		local->op_errno = -err; +		local->op_ret = -1; +		read_subvol = -1; +		goto readfn; +	} + +	ret = afr_inode_read_subvol_type_get (inode, this, local->readable, +					      &event_generation, +					      local->transaction.type); + +	if (ret == -1 || !event_generation) { +		/* Even after refresh, we don't have a good +		   read subvolume. 
Time to bail */ +		local->op_ret = -1; +		local->op_errno = EIO; +		read_subvol = -1; +		goto readfn; +	} + +	read_subvol = afr_read_subvol_select_by_policy (inode, this, +							local->readable); + +	if (read_subvol == -1) { +		local->op_ret = -1; +		local->op_errno = EIO; +		goto readfn; +	} + +	if (local->read_attempted[read_subvol]) { +		afr_read_txn_next_subvol (frame, this); +		return 0; +	} + +	local->read_attempted[read_subvol] = 1; +readfn: +	local->readfn (frame, this, read_subvol); + +	return 0; +} + + +int +afr_read_txn_continue (call_frame_t *frame, xlator_t *this, int subvol) +{ +	afr_local_t *local = NULL; + +	local = frame->local; + +	if (!local->refreshed) { +		local->refreshed = _gf_true; +		afr_inode_refresh (frame, this, local->inode, +				   afr_read_txn_refresh_done); +	} else { +		afr_read_txn_next_subvol (frame, this); +	} + +	return 0; +} + + +/* afr_read_txn_wipe: + +   clean internal variables in @local in order to make +   it possible to call afr_read_txn() multiple times from +   the same frame +*/ + +void +afr_read_txn_wipe (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t *local = NULL; +	afr_private_t *priv = NULL; +	int i = 0; + +	local = frame->local; +	priv = this->private; + +	local->readfn = NULL; + +	if (local->inode) +		inode_unref (local->inode); + +	for (i = 0; i < priv->child_count; i++) { +		local->read_attempted[i] = 0; +		local->readable[i] = 0; +	} +} + + +/* +  afr_read_txn: + +  This is the read transaction function. The way it works: + +  - Determine read-subvolume from inode ctx. + +  - If read-subvolume's generation was stale, refresh ctx once by +    calling afr_inode_refresh() + +    Else make an attempt to read on read-subvolume. + +  - If attempted read on read-subvolume fails, refresh ctx once +    by calling afr_inode_refresh() + +  - After ctx refresh, query read-subvolume freshly and attempt +    read once. + +  - If read fails, try every other readable[] subvolume before +    finally giving up. 
readable[] elements are set by afr_inode_refresh() +    based on dirty and pending flags. + +  - If file is in split brain in the backend, generation will be +    kept 0 by afr_inode_refresh() and readable[] will be set 0 for +    all elements. Therefore reads always fail. +*/ + +int +afr_read_txn (call_frame_t *frame, xlator_t *this, inode_t *inode, +	      afr_read_txn_wind_t readfn, afr_transaction_type type) +{ +	afr_local_t *local = NULL; +	afr_private_t *priv = NULL; +	int read_subvol = -1; +	int event_generation = 0; +	int ret = -1; + +	priv = this->private; +	local = frame->local; + +	afr_read_txn_wipe (frame, this); + +	local->readfn = readfn; +	local->inode = inode_ref (inode); + +	local->transaction.type = type; +	ret = afr_inode_read_subvol_type_get (inode, this, local->readable, +					      &event_generation, type); +	if (ret == -1) +		/* very first transaction on this inode */ +		goto refresh; + +	if (local->event_generation != event_generation) +		/* servers have disconnected / reconnected, and possibly +		   rebooted, very likely changing the state of freshness +		   of copies */ +		goto refresh; + +	read_subvol = afr_read_subvol_select_by_policy (inode, this, +							local->readable); + +	if (read_subvol < 0 || read_subvol > priv->child_count) { +		gf_log (this->name, GF_LOG_WARNING, "Unreadable subvolume %d " +			"found with event generation %d", read_subvol, +			event_generation); +		goto refresh; +	} + +	if (!local->child_up[read_subvol]) { +		/* should never happen, just in case */ +		gf_log (this->name, GF_LOG_WARNING, "subvolume %d is the " +			"read subvolume in this generation, but is not up", +			read_subvol); +		goto refresh; +	} + +	local->read_attempted[read_subvol] = 1; + +	local->readfn (frame, this, read_subvol); + +	return 0; + +refresh: +	afr_inode_refresh (frame, this, inode, afr_read_txn_refresh_done); + +	return 0; +} diff --git a/xlators/cluster/afr/src/afr-self-heal-algorithm.c 
b/xlators/cluster/afr/src/afr-self-heal-algorithm.c deleted file mode 100644 index 83846f152d2..00000000000 --- a/xlators/cluster/afr/src/afr-self-heal-algorithm.c +++ /dev/null @@ -1,837 +0,0 @@ -/* -  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> -  This file is part of GlusterFS. - -  This file is licensed to you under your choice of the GNU Lesser -  General Public License, version 3 or any later version (LGPLv3 or -  later), or the GNU General Public License, version 2 (GPLv2), in all -  cases as published by the Free Software Foundation. -*/ - - -#include <openssl/md5.h> -#include "glusterfs.h" -#include "afr.h" -#include "xlator.h" -#include "dict.h" -#include "xlator.h" -#include "hashfn.h" -#include "logging.h" -#include "stack.h" -#include "list.h" -#include "call-stub.h" -#include "defaults.h" -#include "common-utils.h" -#include "compat-errno.h" -#include "compat.h" -#include "byte-order.h" - -#include "afr-transaction.h" -#include "afr-self-heal.h" -#include "afr-self-heal-common.h" -#include "afr-self-heal-algorithm.h" - -/* -  This file contains the various self-heal algorithms -*/ - -static int -sh_loop_driver (call_frame_t *sh_frame, xlator_t *this, -                gf_boolean_t is_first_call, call_frame_t *old_loop_frame); -static int -sh_loop_return (call_frame_t *sh_frame, xlator_t *this, call_frame_t *loop_frame, -                int32_t op_ret, int32_t op_errno); -static int -sh_destroy_frame (call_frame_t *frame, xlator_t *this) -{ -        if (!frame) -                goto out; - -        AFR_STACK_DESTROY (frame); -out: -        return 0; -} - -static void -sh_private_cleanup (call_frame_t *frame, xlator_t *this) -{ -        afr_local_t             *local   = NULL; -        afr_self_heal_t         *sh      = NULL; -        afr_sh_algo_private_t   *sh_priv = NULL; - -        local = frame->local; -        sh    = &local->self_heal; - -        sh_priv = sh->private; -        GF_FREE (sh_priv); -} - -static int 
-sh_number_of_writes_needed (unsigned char *write_needed, int child_count) -{ -        int writes = 0; -        int i      = 0; - -        for (i = 0; i < child_count; i++) { -                if (write_needed[i]) -                        writes++; -        } - -        return writes; -} - - -static int -sh_loop_driver_done (call_frame_t *sh_frame, xlator_t *this, -                     call_frame_t *last_loop_frame) -{ -        afr_local_t             *local        = NULL; -        afr_self_heal_t         *sh           = NULL; -        afr_sh_algo_private_t   *sh_priv      = NULL; -        int32_t                 total_blocks = 0; -        int32_t                 diff_blocks  = 0; - -        local        = sh_frame->local; -        sh           = &local->self_heal; -        sh_priv      = sh->private; -        if (sh_priv) { -                total_blocks = sh_priv->total_blocks; -                diff_blocks  = sh_priv->diff_blocks; -        } - -        sh_private_cleanup (sh_frame, this); -        if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { -                GF_ASSERT (!last_loop_frame); -                //loop_finish should have happened and the old_loop should be NULL -                gf_log (this->name, GF_LOG_DEBUG, -                        "self-heal aborting on %s", -                        local->loc.path); - -                local->self_heal.algo_abort_cbk (sh_frame, this); -        } else { -                GF_ASSERT (last_loop_frame); -                if (diff_blocks == total_blocks) { -                        gf_log (this->name, GF_LOG_DEBUG, "full self-heal " -                                "completed on %s",local->loc.path); -                } else { -                        gf_log (this->name, GF_LOG_DEBUG, -                                "diff self-heal on %s: completed. 
" -                                "(%d blocks of %d were different (%.2f%%))", -                                local->loc.path, diff_blocks, total_blocks, -                                ((diff_blocks * 1.0)/total_blocks) * 100); -                } - -                sh->old_loop_frame = last_loop_frame; -                local->self_heal.algo_completion_cbk (sh_frame, this); -        } - -        return 0; -} - -int -sh_loop_finish (call_frame_t *loop_frame, xlator_t *this) -{ -        afr_local_t             *loop_local = NULL; -        afr_self_heal_t         *loop_sh = NULL; - -        if (!loop_frame) -                goto out; - -        loop_local = loop_frame->local; -        if (loop_local) { -                loop_sh = &loop_local->self_heal; -        } - -        if (loop_sh && loop_sh->data_lock_held) { -                afr_sh_data_unlock (loop_frame, this, this->name, -                                    sh_destroy_frame); -        } else { -                sh_destroy_frame (loop_frame, this); -        } -out: -        return 0; -} - -static int -sh_loop_lock_success (call_frame_t *loop_frame, xlator_t *this) -{ -        afr_local_t                 *loop_local = NULL; -        afr_self_heal_t             *loop_sh    = NULL; - -        loop_local = loop_frame->local; -        loop_sh = &loop_local->self_heal; - -        sh_loop_finish (loop_sh->old_loop_frame, this); -        loop_sh->old_loop_frame = NULL; - -        gf_log (this->name, GF_LOG_DEBUG, "Acquired lock for range %"PRIu64 -                " %"PRIu64, loop_sh->offset, loop_sh->block_size); -        loop_sh->data_lock_held = _gf_true; -        loop_sh->sh_data_algo_start (loop_frame, this); -        return 0; -} - -static int -sh_loop_lock_failure (call_frame_t *loop_frame, xlator_t *this) -{ -        call_frame_t                *sh_frame = NULL; -        afr_local_t                 *loop_local = NULL; -        afr_self_heal_t             *loop_sh    = NULL; - -        loop_local = 
loop_frame->local; -        loop_sh = &loop_local->self_heal; -        sh_frame = loop_sh->sh_frame; - -        gf_log (this->name, GF_LOG_ERROR, "failed lock for range %"PRIu64 -                " %"PRIu64, loop_sh->offset, loop_sh->block_size); -        sh_loop_finish (loop_sh->old_loop_frame, this); -        loop_sh->old_loop_frame = NULL; -        sh_loop_return (sh_frame, this, loop_frame, -1, ENOTCONN); -        return 0; -} - -static int -sh_loop_frame_create (call_frame_t *sh_frame, xlator_t *this, -                      call_frame_t *old_loop_frame, call_frame_t **loop_frame) -{ -        call_frame_t                *new_loop_frame = NULL; -        afr_local_t                 *local          = NULL; -        afr_self_heal_t             *sh             = NULL; -        afr_local_t                 *new_loop_local = NULL; -        afr_self_heal_t             *new_loop_sh    = NULL; -        afr_private_t               *priv           = NULL; - -        GF_ASSERT (sh_frame); -        GF_ASSERT (loop_frame); - -        *loop_frame = NULL; -        local   = sh_frame->local; -        sh      = &local->self_heal; -        priv    = this->private; - -        new_loop_frame = copy_frame (sh_frame); -        if (!new_loop_frame) -                goto out; -        //We want the frame to have same lk_owner as sh_frame -        //so that locks translator allows conflicting locks -        new_loop_local = afr_self_heal_local_init (local, this); -        if (!new_loop_local) -                goto out; -        new_loop_frame->local = new_loop_local; - -        new_loop_sh = &new_loop_local->self_heal; -        new_loop_sh->sources = memdup (sh->sources, -                                       priv->child_count * sizeof (*sh->sources)); -        if (!new_loop_sh->sources) -                goto out; -        new_loop_sh->write_needed = GF_CALLOC (priv->child_count, -                                               sizeof (*new_loop_sh->write_needed), -                         
                      gf_afr_mt_char); -        if (!new_loop_sh->write_needed) -                goto out; -        new_loop_sh->checksum = GF_CALLOC (priv->child_count, MD5_DIGEST_LENGTH, -                                           gf_afr_mt_uint8_t); -        if (!new_loop_sh->checksum) -                goto out; -        new_loop_sh->inode      = inode_ref (sh->inode); -        new_loop_sh->sh_data_algo_start = sh->sh_data_algo_start; -        new_loop_sh->source = sh->source; -        new_loop_sh->active_sinks = sh->active_sinks; -        new_loop_sh->healing_fd = fd_ref (sh->healing_fd); -        new_loop_sh->file_has_holes = sh->file_has_holes; -        new_loop_sh->old_loop_frame = old_loop_frame; -        new_loop_sh->sh_frame = sh_frame; -        *loop_frame = new_loop_frame; -        return 0; -out: -        sh_destroy_frame (new_loop_frame, this); -        return -ENOMEM; -} - -static int -sh_loop_start (call_frame_t *sh_frame, xlator_t *this, off_t offset, -               call_frame_t *old_loop_frame) -{ -        call_frame_t                *new_loop_frame = NULL; -        afr_local_t                 *local          = NULL; -        afr_self_heal_t             *sh             = NULL; -        afr_local_t                 *new_loop_local = NULL; -        afr_self_heal_t             *new_loop_sh    = NULL; -        int                         ret             = 0; - -        GF_ASSERT (sh_frame); - -        local   = sh_frame->local; -        sh      = &local->self_heal; - -        ret = sh_loop_frame_create (sh_frame, this, old_loop_frame, -                                    &new_loop_frame); -        if (ret) -                goto out; -        new_loop_local = new_loop_frame->local; -        new_loop_sh = &new_loop_local->self_heal; -        new_loop_sh->offset = offset; -        new_loop_sh->block_size = sh->block_size; -        afr_sh_data_lock (new_loop_frame, this, offset, new_loop_sh->block_size, -                          _gf_true, this->name, 
sh_loop_lock_success, sh_loop_lock_failure); -        return 0; -out: -        afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); -        if (old_loop_frame) -                sh_loop_finish (old_loop_frame, this); -        sh_loop_return (sh_frame, this, new_loop_frame, -1, ENOMEM); -        return 0; -} - -static int -sh_loop_driver (call_frame_t *sh_frame, xlator_t *this, -                gf_boolean_t is_first_call, call_frame_t *old_loop_frame) -{ -        afr_local_t *               local          = NULL; -        afr_self_heal_t *           sh             = NULL; -        afr_sh_algo_private_t       *sh_priv        = NULL; -        gf_boolean_t                is_driver_done = _gf_false; -        blksize_t                   block_size     = 0; -        int                         loop           = 0; -        off_t                       offset         = 0; -        afr_private_t               *priv          = NULL; - -        priv    = this->private; -        local   = sh_frame->local; -        sh      = &local->self_heal; -        sh_priv = sh->private; - -        LOCK (&sh_priv->lock); -        { -                if (!is_first_call) -                        sh_priv->loops_running--; -                offset = sh_priv->offset; -                block_size = sh->block_size; -                while ((!sh->eof_reached) && -                       (!is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) && -                     (sh_priv->loops_running < priv->data_self_heal_window_size) -                       && (sh_priv->offset < sh->file_size)) { - -                        loop++; -                        sh_priv->offset += block_size; -                        sh_priv->loops_running++; - -                        if (!is_first_call) -                                break; -                } -                if (0 == sh_priv->loops_running) { -                        is_driver_done = _gf_true; -                } -        } -        UNLOCK (&sh_priv->lock); - -        if (0 
== loop) { -                //loop finish does unlock, but the erasing of the pending -                //xattrs needs to happen before that so do not finish the loop -                if (is_driver_done && -                    !is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) -                        goto driver_done; -                if (old_loop_frame) { -                        sh_loop_finish (old_loop_frame, this); -                        old_loop_frame = NULL; -                } -        } - -        //If we have more loops to form we should finish previous loop after -        //the next loop lock -        while (loop--) { -                if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { -                        // op failed in other loop, stop spawning more loops -                        if (old_loop_frame) { -                                sh_loop_finish (old_loop_frame, this); -                                old_loop_frame = NULL; -                        } -                        sh_loop_driver (sh_frame, this, _gf_false, NULL); -                } else { -                        gf_log (this->name, GF_LOG_TRACE, "spawning a loop " -                                "for offset %"PRId64, offset); - -                        sh_loop_start (sh_frame, this, offset, old_loop_frame); -                        old_loop_frame = NULL; -                        offset += block_size; -                } -        } - -driver_done: -        if (is_driver_done) { -                sh_loop_driver_done (sh_frame, this, old_loop_frame); -        } -        return 0; -} - -static int -sh_loop_return (call_frame_t *sh_frame, xlator_t *this, call_frame_t *loop_frame, -                int32_t op_ret, int32_t op_errno) -{ -        afr_local_t *               loop_local = NULL; -        afr_self_heal_t *           loop_sh    = NULL; -        afr_local_t *               sh_local = NULL; -        afr_self_heal_t            *sh       = NULL; - -        sh_local = sh_frame->local; -     
   sh       = &sh_local->self_heal; - -        if (loop_frame) { -                loop_local = loop_frame->local; -                if (loop_local) -                        loop_sh    = &loop_local->self_heal; -                if (loop_sh) -                        gf_log (this->name, GF_LOG_TRACE, "loop for offset " -                                "%"PRId64" returned", loop_sh->offset); -        } - -        if (op_ret == -1) { -                afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); -                afr_sh_set_error (sh, op_errno); -                if (loop_frame) { -                        sh_loop_finish (loop_frame, this); -                        loop_frame = NULL; -                } -        } - -        sh_loop_driver (sh_frame, this, _gf_false, loop_frame); - -        return 0; -} - -static int -sh_loop_write_cbk (call_frame_t *loop_frame, void *cookie, xlator_t *this, -                   int32_t op_ret, int32_t op_errno, struct iatt *buf, -                   struct iatt *postbuf, dict_t *xdata) -{ -        afr_private_t *             priv        = NULL; -        afr_local_t *               loop_local    = NULL; -        afr_self_heal_t *           loop_sh       = NULL; -        call_frame_t               *sh_frame    = NULL; -        afr_local_t *               sh_local    = NULL; -        afr_self_heal_t            *sh          = NULL; -        int                         call_count  = 0; -        int                         child_index = 0; - -        priv     = this->private; -        loop_local = loop_frame->local; -        loop_sh    = &loop_local->self_heal; - -        sh_frame = loop_sh->sh_frame; -        sh_local = sh_frame->local; -        sh       = &sh_local->self_heal; - -        child_index =  (long) cookie; - -        gf_log (this->name, GF_LOG_TRACE, -                "wrote %d bytes of data from %s to child %d, offset %"PRId64"", -                op_ret, sh_local->loc.path, child_index, loop_sh->offset); - -        if (op_ret == 
-1) { -                gf_log (this->name, GF_LOG_ERROR, -                        "write to %s failed on subvolume %s (%s)", -                        sh_local->loc.path, -                        priv->children[child_index]->name, -                        strerror (op_errno)); - -                afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); -                afr_sh_set_error (loop_sh, op_errno); -        } else if (op_ret < loop_local->cont.writev.vector->iov_len) { -                gf_log (this->name, GF_LOG_ERROR, -                        "incomplete write to %s on subvolume %s " -                        "(expected %lu, returned %d)", sh_local->loc.path, -                        priv->children[child_index]->name, -                        loop_local->cont.writev.vector->iov_len, op_ret); -                afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); -        } - -        call_count = afr_frame_return (loop_frame); - -        if (call_count == 0) { -		iobref_unref(loop_local->cont.writev.iobref); - -                sh_loop_return (sh_frame, this, loop_frame, -                                loop_sh->op_ret, loop_sh->op_errno); -        } - -        return 0; -} - -static void -sh_prune_writes_needed (call_frame_t *sh_frame, call_frame_t *loop_frame, -                        afr_private_t *priv) -{ -        afr_local_t     *sh_local     = NULL; -        afr_self_heal_t *sh           = NULL; -        afr_local_t     *loop_local   = NULL; -        afr_self_heal_t *loop_sh      = NULL; -        int             i             = 0; - -        sh_local   = sh_frame->local; -        sh         = &sh_local->self_heal; - -        if (!strcmp (sh->algo->name, "diff")) -                return; - -        loop_local = loop_frame->local; -        loop_sh    = &loop_local->self_heal; - -        /* full self-heal guarantees there exists atleast 1 file with size 0 -         * That means for other files we can preserve holes that come after -         * its size before 
'trim' -         */ -        for (i = 0; i < priv->child_count; i++) { -                if (loop_sh->write_needed[i] && -                    ((loop_sh->offset + 1) > sh->buf[i].ia_size)) -                        loop_sh->write_needed[i] = 0; -        } -} - -static int -sh_loop_read_cbk (call_frame_t *loop_frame, void *cookie, -                  xlator_t *this, int32_t op_ret, int32_t op_errno, -                  struct iovec *vector, int32_t count, struct iatt *buf, -                  struct iobref *iobref, dict_t *xdata) -{ -        afr_private_t *               priv       = NULL; -        afr_local_t *                 loop_local   = NULL; -        afr_self_heal_t *             loop_sh      = NULL; -        call_frame_t                 *sh_frame   = NULL; -        int                           i          = 0; -        int                           call_count = 0; -        afr_local_t *                 sh_local   = NULL; -        afr_self_heal_t *             sh      = NULL; - -        priv       = this->private; -        loop_local = loop_frame->local; -        loop_sh    = &loop_local->self_heal; - -        sh_frame = loop_sh->sh_frame; -        sh_local = sh_frame->local; -        sh       = &sh_local->self_heal; - -        gf_log (this->name, GF_LOG_TRACE, -                "read %d bytes of data from %s, offset %"PRId64"", -                op_ret, loop_local->loc.path, loop_sh->offset); - -        if (op_ret <= 0) { -                if (op_ret < 0) { -                        afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); -                        gf_log (this->name, GF_LOG_ERROR, "read failed on %d " -                                "for %s reason :%s", sh->source, -                                sh_local->loc.path, strerror (errno)); -                } else { -                        sh->eof_reached = _gf_true; -                        gf_log (this->name, GF_LOG_DEBUG, "Eof reached for %s", -                                sh_local->loc.path); -         
       } -                sh_loop_return (sh_frame, this, loop_frame, op_ret, op_errno); -                goto out; -        } - -        if (loop_sh->file_has_holes && iov_0filled (vector, count) == 0) -                sh_prune_writes_needed (sh_frame, loop_frame, priv); - -        call_count = sh_number_of_writes_needed (loop_sh->write_needed, -                                                 priv->child_count); -        if (call_count == 0) { -                sh_loop_return (sh_frame, this, loop_frame, 0, 0); -                goto out; -        } - -        loop_local->call_count = call_count; - -	/* -	 * We only really need the request size at the moment, but the buffer -	 * is required if we want to issue a retry in the event of a short write. -	 * Therefore, we duplicate the vector and ref the iobref here... -	 */ -	loop_local->cont.writev.vector = iov_dup(vector, count); -	loop_local->cont.writev.iobref = iobref_ref(iobref); - -        for (i = 0; i < priv->child_count; i++) { -                if (!loop_sh->write_needed[i]) -                        continue; -                STACK_WIND_COOKIE (loop_frame, sh_loop_write_cbk, -                                   (void *) (long) i, -                                   priv->children[i], -                                   priv->children[i]->fops->writev, -                                   loop_sh->healing_fd, vector, count, -                                   loop_sh->offset, 0, iobref, NULL); - -                if (!--call_count) -                        break; -        } - -out: -        return 0; -} - - -static int -sh_loop_read (call_frame_t *loop_frame, xlator_t *this) -{ -        afr_private_t           *priv       = NULL; -        afr_local_t             *loop_local   = NULL; -        afr_self_heal_t         *loop_sh      = NULL; - -        priv     = this->private; -        loop_local = loop_frame->local; -        loop_sh    = &loop_local->self_heal; - -        STACK_WIND_COOKIE (loop_frame, 
sh_loop_read_cbk, -                           (void *) (long) loop_sh->source, -                           priv->children[loop_sh->source], -                           priv->children[loop_sh->source]->fops->readv, -                           loop_sh->healing_fd, loop_sh->block_size, -                           loop_sh->offset, 0, NULL); - -        return 0; -} - - -static int -sh_diff_checksum_cbk (call_frame_t *loop_frame, void *cookie, xlator_t *this, -                      int32_t op_ret, int32_t op_errno, -                      uint32_t weak_checksum, uint8_t *strong_checksum, -                      dict_t *xdata) -{ -        afr_private_t                 *priv         = NULL; -        afr_local_t                   *loop_local   = NULL; -        afr_self_heal_t               *loop_sh      = NULL; -        call_frame_t                  *sh_frame     = NULL; -        afr_local_t                   *sh_local     = NULL; -        afr_self_heal_t               *sh           = NULL; -        afr_sh_algo_private_t         *sh_priv      = NULL; -        int                           child_index  = 0; -        int                           call_count   = 0; -        int                           i            = 0; -        int                           write_needed = 0; - -        priv  = this->private; - -        loop_local = loop_frame->local; -        loop_sh    = &loop_local->self_heal; - -        sh_frame = loop_sh->sh_frame; -        sh_local = sh_frame->local; -        sh       = &sh_local->self_heal; - -        sh_priv = sh->private; - -        child_index = (long) cookie; - -        if (op_ret < 0) { -                gf_log (this->name, GF_LOG_ERROR, -                        "checksum on %s failed on subvolume %s (%s)", -                        sh_local->loc.path, priv->children[child_index]->name, -                        strerror (op_errno)); -                afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); -        } else { -                memcpy 
(loop_sh->checksum + child_index * MD5_DIGEST_LENGTH, -                        strong_checksum, MD5_DIGEST_LENGTH); -        } - -        call_count = afr_frame_return (loop_frame); - -        if (call_count == 0) { -                for (i = 0; i < priv->child_count; i++) { -                        if (sh->sources[i] || !sh_local->child_up[i]) -                                continue; - -                        if (memcmp (loop_sh->checksum + (i * MD5_DIGEST_LENGTH), -                                    loop_sh->checksum + (sh->source * MD5_DIGEST_LENGTH), -                                    MD5_DIGEST_LENGTH)) { -                                /* -                                  Checksums differ, so this block -                                  must be written to this sink -                                */ - -                                gf_log (this->name, GF_LOG_DEBUG, -                                        "checksum on subvolume %s at offset %" -                                        PRId64" differs from that on source", -                                        priv->children[i]->name, loop_sh->offset); - -                                write_needed = loop_sh->write_needed[i] = 1; -                        } -                } - -                LOCK (&sh_priv->lock); -                { -                        sh_priv->total_blocks++; -                        if (write_needed) -                                sh_priv->diff_blocks++; -                } -                UNLOCK (&sh_priv->lock); - -                if (write_needed && -                    !is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { -                        sh_loop_read (loop_frame, this); -                } else { -                        sh_loop_return (sh_frame, this, loop_frame, -                                        op_ret, op_errno); -                } -        } - -        return 0; -} - -static int -sh_diff_checksum (call_frame_t *loop_frame, xlator_t *this) -{ -        
afr_private_t           *priv         = NULL; -        afr_local_t             *loop_local   = NULL; -        afr_self_heal_t         *loop_sh      = NULL; -        int                     call_count    = 0; -        int                     i             = 0; - -        priv         = this->private; -        loop_local   = loop_frame->local; -        loop_sh      = &loop_local->self_heal; - -        call_count = loop_sh->active_sinks + 1;  /* sinks and source */ - -        loop_local->call_count = call_count; - -        STACK_WIND_COOKIE (loop_frame, sh_diff_checksum_cbk, -                           (void *) (long) loop_sh->source, -                           priv->children[loop_sh->source], -                           priv->children[loop_sh->source]->fops->rchecksum, -                           loop_sh->healing_fd, -                           loop_sh->offset, loop_sh->block_size, NULL); - -        for (i = 0; i < priv->child_count; i++) { -                if (loop_sh->sources[i] || !loop_local->child_up[i]) -                        continue; - -                STACK_WIND_COOKIE (loop_frame, sh_diff_checksum_cbk, -                                   (void *) (long) i, -                                   priv->children[i], -                                   priv->children[i]->fops->rchecksum, -                                   loop_sh->healing_fd, -                                   loop_sh->offset, loop_sh->block_size, NULL); - -                if (!--call_count) -                        break; -        } - -        return 0; -} - -static int -sh_full_read_write_to_sinks (call_frame_t *loop_frame, xlator_t *this) -{ -        afr_private_t           *priv         = NULL; -        afr_local_t             *loop_local   = NULL; -        afr_self_heal_t         *loop_sh      = NULL; -        int                     i             = 0; - -        priv         = this->private; -        loop_local   = loop_frame->local; -        loop_sh      = &loop_local->self_heal; - -   
     for (i = 0; i < priv->child_count; i++) { -                if (loop_sh->sources[i] || !loop_local->child_up[i]) -                        continue; -                loop_sh->write_needed[i] = 1; -        } -        sh_loop_read (loop_frame, this); -        return 0; -} - -afr_sh_algo_private_t* -afr_sh_priv_init () -{ -        afr_sh_algo_private_t   *sh_priv = NULL; - -        sh_priv = GF_CALLOC (1, sizeof (*sh_priv), -                             gf_afr_mt_afr_private_t); -        if (!sh_priv) -                goto out; - -        LOCK_INIT (&sh_priv->lock); -out: -        return sh_priv; -} - -int -afr_sh_transfer_lock (call_frame_t *dst, call_frame_t *src, char *dom, -                      unsigned int child_count) -{ -        afr_local_t             *dst_local   = NULL; -        afr_self_heal_t         *dst_sh      = NULL; -        afr_local_t             *src_local   = NULL; -        afr_self_heal_t         *src_sh      = NULL; -        int                     ret          = -1; - -        dst_local = dst->local; -        dst_sh = &dst_local->self_heal; -        src_local = src->local; -        src_sh = &src_local->self_heal; -        GF_ASSERT (src_sh->data_lock_held); -        GF_ASSERT (!dst_sh->data_lock_held); -        ret = afr_lk_transfer_datalock (dst, src, dom, child_count); -        if (ret) -                return ret; -        src_sh->data_lock_held = _gf_false; -        dst_sh->data_lock_held = _gf_true; -        return 0; -} - -int -afr_sh_start_loops (call_frame_t *sh_frame, xlator_t *this, -                    afr_sh_algo_fn sh_data_algo_start) -{ -        call_frame_t            *first_loop_frame = NULL; -        afr_local_t             *local   = NULL; -        afr_self_heal_t         *sh      = NULL; -        int                     ret      = 0; -        afr_private_t           *priv    = NULL; - -        local = sh_frame->local; -        sh    = &local->self_heal; -        priv  = this->private; - -        sh->sh_data_algo_start = 
sh_data_algo_start; -        local->call_count = 0; -        ret = sh_loop_frame_create (sh_frame, this, NULL, &first_loop_frame); -        if (ret) -                goto out; -        ret = afr_sh_transfer_lock (first_loop_frame, sh_frame, this->name, -                                    priv->child_count); -        if (ret) -                goto out; -        sh->private = afr_sh_priv_init (); -        if (!sh->private) { -                ret = -1; -                goto out; -        } -        sh_loop_driver (sh_frame, this, _gf_true, first_loop_frame); -        ret = 0; -out: -        if (ret) { -                afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); -                sh_loop_driver_done (sh_frame, this, NULL); -        } -        return 0; -} - -int -afr_sh_algo_diff (call_frame_t *sh_frame, xlator_t *this) -{ -        afr_sh_start_loops (sh_frame, this, sh_diff_checksum); -        return 0; -} - -int -afr_sh_algo_full (call_frame_t *sh_frame, xlator_t *this) -{ -        afr_sh_start_loops (sh_frame, this, sh_full_read_write_to_sinks); -        return 0; -} - -struct afr_sh_algorithm afr_self_heal_algorithms[] = { -        {.name = "full",  .fn = afr_sh_algo_full}, -        {.name = "diff",  .fn = afr_sh_algo_diff}, -        {0, 0}, -}; diff --git a/xlators/cluster/afr/src/afr-self-heal-algorithm.h b/xlators/cluster/afr/src/afr-self-heal-algorithm.h deleted file mode 100644 index 6b20789b1bb..00000000000 --- a/xlators/cluster/afr/src/afr-self-heal-algorithm.h +++ /dev/null @@ -1,32 +0,0 @@ -/* -  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> -  This file is part of GlusterFS. - -  This file is licensed to you under your choice of the GNU Lesser -  General Public License, version 3 or any later version (LGPLv3 or -  later), or the GNU General Public License, version 2 (GPLv2), in all -  cases as published by the Free Software Foundation. 
-*/ - -#ifndef __AFR_SELF_HEAL_ALGORITHM_H__ -#define __AFR_SELF_HEAL_ALGORITHM_H__ - -typedef int (*afr_sh_algo_fn) (call_frame_t *frame, -                               xlator_t *this); - -struct afr_sh_algorithm { -        const char *name; -        afr_sh_algo_fn fn; -}; - -extern struct afr_sh_algorithm afr_self_heal_algorithms[3]; -typedef struct { -        gf_lock_t lock; -        unsigned int loops_running; -        off_t offset; - -        int32_t total_blocks; -        int32_t diff_blocks; -} afr_sh_algo_private_t; - -#endif /* __AFR_SELF_HEAL_ALGORITHM_H__ */ diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c index ef92b420551..4dac8311340 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.c +++ b/xlators/cluster/afr/src/afr-self-heal-common.c @@ -1,5 +1,5 @@  /* -  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> +  Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>    This file is part of GlusterFS.    This file is licensed to you under your choice of the GNU Lesser @@ -8,2805 +8,1002 @@    cases as published by the Free Software Foundation.  
*/ -#include "glusterfs.h" -#include "xlator.h" -#include "byte-order.h" + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif  #include "afr.h" -#include "afr-transaction.h" -#include "afr-self-heal-common.h"  #include "afr-self-heal.h" -#include "pump.h" - -#define ADD_FMT_STRING(msg, off, sh_str, status, print_log)                 \ -        do {                                                                \ -                if (AFR_SELF_HEAL_NOT_ATTEMPTED != status) {                \ -                        off += snprintf (msg + off, sizeof (msg) - off,     \ -                                         " "sh_str" self heal %s,",         \ -                                         get_sh_completion_status (status));\ -                        print_log = 1;                                      \ -                }                                                           \ -        } while (0) - -#define ADD_FMT_STRING_SYNC(msg, off, sh_str, status, print_log)            \ -        do {                                                                \ -                if (AFR_SELF_HEAL_SYNC_BEGIN == status ||                   \ -                    AFR_SELF_HEAL_FAILED == status)  {                      \ -                        off += snprintf (msg + off, sizeof (msg) - off,     \ -                                         " "sh_str" self heal %s,",         \ -                                         get_sh_completion_status (status));\ -                        print_log = 1;                                      \ -                }                                                           \ -        } while (0) +#include "byte-order.h" -void -afr_sh_reset (call_frame_t *frame, xlator_t *this) +int +afr_selfheal_post_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +			  int op_ret, int op_errno, dict_t *xattr, dict_t *xdata)  { -        afr_local_t     *local = NULL; -        afr_self_heal_t *sh = NULL; -        afr_private_t   *priv 
= NULL; - -        local = frame->local; -        sh = &local->self_heal; -        priv = this->private; - -        memset (sh->child_errno, 0, -                sizeof (*sh->child_errno) * priv->child_count); -        memset (sh->buf, 0, sizeof (*sh->buf) * priv->child_count); -        memset (sh->parentbufs, 0, -                sizeof (*sh->parentbufs) * priv->child_count); -        memset (sh->success, 0, sizeof (*sh->success) * priv->child_count); -        memset (sh->locked_nodes, 0, -                sizeof (*sh->locked_nodes) * priv->child_count); -        sh->active_sinks = 0; - -        afr_reset_xattr (sh->xattr, priv->child_count); -} +	afr_local_t *local = NULL; -//Intersection[child]=1 if child is part of intersection -void -afr_children_intersection_get (int32_t *set1, int32_t *set2, -                               int *intersection, unsigned int child_count) -{ -        int                      i = 0; - -        memset (intersection, 0, sizeof (*intersection) * child_count); -        for (i = 0; i < child_count; i++) { -                intersection[i] = afr_is_child_present (set1, child_count, i) -                                     && afr_is_child_present (set2, child_count, -                                                              i); -        } +	local = frame->local; + +	syncbarrier_wake (&local->barrier); + +	return 0;  } -/** - * select_source - select a source and return it - */  int -afr_sh_select_source (int sources[], int child_count) +afr_selfheal_post_op (call_frame_t *frame, xlator_t *this, inode_t *inode, +		      int subvol, dict_t *xattr)  { -        int i = 0; -        for (i = 0; i < child_count; i++) -                if (sources[i]) -                        return i; +	afr_private_t *priv = NULL; +	afr_local_t *local = NULL; +	loc_t loc = {0, }; -        return -1; -} +	priv = this->private; +	local = frame->local; -void -afr_sh_mark_source_sinks (call_frame_t *frame, xlator_t *this) -{ -        int              i = 0; -        
afr_local_t     *local      = NULL; -        afr_self_heal_t *sh = NULL; -        afr_private_t   *priv = NULL; -        int              active_sinks = 0; - -        local = frame->local; -        sh = &local->self_heal; -        priv = this->private; - -        for (i = 0; i < priv->child_count; i++) { -                if (sh->sources[i] == 0 && local->child_up[i] == 1) { -                        active_sinks++; -                        sh->success[i] = 1; -                } else if (sh->sources[i] == 1 && local->child_up[i] == 1) { -                        sh->success[i] = 1; -                } -        } -        sh->active_sinks = active_sinks; -} +	loc.inode = inode_ref (inode); +	uuid_copy (loc.gfid, inode->gfid); -int -afr_sh_source_count (int sources[], int child_count) -{ -        int i = 0; -        int nsource = 0; +	STACK_WIND (frame, afr_selfheal_post_op_cbk, priv->children[subvol], +		    priv->children[subvol]->fops->xattrop, &loc, +		    GF_XATTROP_ADD_ARRAY, xattr, NULL); -        for (i = 0; i < child_count; i++) -                if (sources[i]) -                        nsource++; -        return nsource; -} +	syncbarrier_wait (&local->barrier, 1); -void -afr_sh_set_error (afr_self_heal_t *sh, int32_t op_errno) -{ -        sh->op_ret = -1; -	sh->op_errno = afr_most_important_error(sh->op_errno, op_errno, -						_gf_false); +	return 0;  } -void -afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this) -{ -        afr_private_t *  priv = this->private; -        char            *buf  = NULL; -        char            *ptr  = NULL; -        int              i    = 0; -        int              j    = 0; - -        /* 10 digits per entry + 1 space + '[' and ']' */ -        buf = GF_MALLOC (priv->child_count * 11 + 8, gf_afr_mt_char); - -        for (i = 0; i < priv->child_count; i++) { -                ptr = buf; -                ptr += sprintf (ptr, "[ "); -                for (j = 0; j < priv->child_count; j++) { -                       
 ptr += sprintf (ptr, "%d ", pending_matrix[i][j]); -                } -                sprintf (ptr, "]"); -                gf_log (this->name, GF_LOG_DEBUG, "pending_matrix: %s", buf); -        } - -        GF_FREE (buf); -} -char* -afr_get_pending_matrix_str (int32_t *pending_matrix[], xlator_t *this) +dict_t * +afr_selfheal_output_xattr (xlator_t *this, afr_transaction_type type, +			   int *output_dirty, int **output_matrix, int subvol)  { -        afr_private_t *  priv = this->private; -        char            *buf  = NULL; -        char            *ptr  = NULL; -        int              i    = 0; -        int              j    = 0; -        int             child_count = priv->child_count; -        char            *matrix_begin = "[ [ "; -        char            *matrix_end = "] ]"; -        char            *seperator = "] [ "; -        int             pending_entry_strlen = 12; //Including space after entry -        int             matrix_begin_strlen = 0; -        int             matrix_end_strlen = 0; -        int             seperator_strlen = 0; -        int             string_length = 0; -        char            *msg = "- Pending matrix:  "; - -        /* -         *  for a list of lists of [ [ a b ] [ c d ] ] -         * */ - -        matrix_begin_strlen = strlen (matrix_begin); -        matrix_end_strlen = strlen (matrix_end); -        seperator_strlen = strlen (seperator); -        string_length = matrix_begin_strlen + matrix_end_strlen -                        + (child_count -1) * seperator_strlen -                        + (child_count * child_count * pending_entry_strlen); - -        buf = GF_CALLOC (1, 1 + strlen (msg) + string_length , gf_afr_mt_char); -        if (!buf) -                goto out; - -        ptr = buf; -        ptr += sprintf (ptr, "%s", msg); -        ptr += sprintf (ptr, "%s", matrix_begin); -        for (i = 0; i < priv->child_count; i++) { -                for (j = 0; j < priv->child_count; j++) { -                        
ptr += sprintf (ptr, "%d ", pending_matrix[i][j]); -                } -                if (i < priv->child_count -1) -                        ptr += sprintf (ptr, "%s", seperator); -        } - -        ptr += sprintf (ptr, "%s", matrix_end); +	dict_t *xattr = NULL; +	afr_private_t *priv = NULL; +	int j = 0; +	int idx = 0; +	int ret = 0; +	int *raw = 0; -out: -        return buf; -} +	priv = this->private; +	idx = afr_index_for_transaction_type (type); -void -afr_sh_print_split_brain_log (int32_t *pending_matrix[], xlator_t *this, -                              const char *loc) -{ -        char *buf      = NULL; -        char *free_ptr = NULL; +	xattr = dict_new (); +	if (!xattr) +		return NULL; -        buf = afr_get_pending_matrix_str (pending_matrix, this); -        if (buf) -                free_ptr = buf; -        else -                buf = ""; +	if (output_dirty[subvol]) { +		/* clear dirty */ +		raw = GF_CALLOC (sizeof(int), AFR_NUM_CHANGE_LOGS, gf_afr_mt_int32_t); +		if (!raw) +			goto err; +		raw[idx] = hton32 (output_dirty[subvol]); +		ret = dict_set_bin (xattr, AFR_DIRTY, raw, +				    sizeof(int) * AFR_NUM_CHANGE_LOGS); +		if (ret) +			goto err; +	} -        gf_log (this->name, GF_LOG_ERROR, "Unable to self-heal contents of '%s'" -                " (possible split-brain). 
Please delete the file from all but " -                "the preferred subvolume.%s", loc, buf); -        GF_FREE (free_ptr); -        return; -} +	/* clear/set pending */ +	for (j = 0; j < priv->child_count; j++) { +		if (!output_matrix[subvol][j]) +			continue; +		raw = GF_CALLOC (sizeof(int), AFR_NUM_CHANGE_LOGS, +				 gf_afr_mt_int32_t); +		if (!raw) +			goto err; -void -afr_init_pending_matrix (int32_t **pending_matrix, size_t child_count) -{ -        int             i   = 0; -        int             j   = 0; +		raw[idx] = hton32 (output_matrix[subvol][j]); -        GF_ASSERT (pending_matrix); +		ret = dict_set_bin (xattr, priv->pending_key[j], +				    raw, sizeof(int) * AFR_NUM_CHANGE_LOGS); +		if (ret) +			goto err; +	} -        for (i = 0; i < child_count; i++) { -                for (j = 0; j < child_count; j++) { -                        pending_matrix[i][j] = 0; -                } -        } +	return xattr; +err: +	if (xattr) +		dict_unref (xattr); +	return NULL;  } -void -afr_mark_ignorant_subvols_as_pending (int32_t **pending_matrix, -                                      unsigned char *ignorant_subvols, -                                      size_t  child_count) -{ -        int            i                = 0; -        int            j                = 0; - -        GF_ASSERT (pending_matrix); -        GF_ASSERT (ignorant_subvols); - -        for (i = 0; i < child_count; i++) { -                if (ignorant_subvols[i]) { -                        for (j = 0; j < child_count; j++) { -                                if (!ignorant_subvols[j]) -                                        pending_matrix[j][i] += 1; -                        } -                } -        } -}  int -afr_build_pending_matrix (char **pending_key, int32_t **pending_matrix, -                          unsigned char *ignorant_subvols, -                          dict_t *xattr[], afr_transaction_type type, -                          size_t child_count) -{ -        /* Indexable by result 
of afr_index_for_transaction_type(): 0 -- 2. */ -        int32_t        pending[3]       = {0,}; -        void          *pending_raw      = NULL; -        int            ret              = -1; -        int            i                = 0; -        int            j                = 0; -        int            k                = 0; - -        afr_init_pending_matrix (pending_matrix, child_count); - -        for (i = 0; i < child_count; i++) { -                pending_raw = NULL; - -                for (j = 0; j < child_count; j++) { -                        ret = dict_get_ptr (xattr[i], pending_key[j], -                                            &pending_raw); - -                        if (ret != 0) { -                                /* -                                 * There is no xattr present. This means this -                                 * subvolume should be considered an 'ignorant' -                                 * subvolume. -                                 */ - -                                if (ignorant_subvols) -                                        ignorant_subvols[i] = 1; -                                continue; -                        } - -                        memcpy (pending, pending_raw, sizeof(pending)); -                        k = afr_index_for_transaction_type (type); - -                        pending_matrix[i][j] = ntoh32 (pending[k]); -                } -        } - -        return ret; -} +afr_selfheal_undo_pending (call_frame_t *frame, xlator_t *this, inode_t *inode, +			   unsigned char *sources, unsigned char *sinks, +			   unsigned char *healed_sinks, afr_transaction_type type, +			   struct afr_reply *replies, unsigned char *locked_on) +{ +	afr_private_t *priv = NULL; +	int i = 0; +	int j = 0; +	unsigned char *pending = NULL; +	int *input_dirty = NULL; +	int **input_matrix = NULL; +	int *output_dirty = NULL; +	int **output_matrix = NULL; +	dict_t *xattr = NULL; + +	priv = this->private; + +	pending = alloca0 
(priv->child_count); + +	input_dirty = alloca0 (priv->child_count * sizeof (int)); +	input_matrix = ALLOC_MATRIX (priv->child_count, int); +	output_dirty = alloca0 (priv->child_count * sizeof (int)); +	output_matrix = ALLOC_MATRIX (priv->child_count, int); + +	afr_selfheal_extract_xattr (this, replies, type, input_dirty, +				    input_matrix); + +	for (i = 0; i < priv->child_count; i++) +		if (sinks[i] && !healed_sinks[i]) +			pending[i] = 1; + +	for (i = 0; i < priv->child_count; i++) { +		for (j = 0; j < priv->child_count; j++) { +			if (pending[j]) +				output_matrix[i][j] = 1; +			else +				output_matrix[i][j] = -input_matrix[i][j]; +		} +	} -typedef enum { -        AFR_NODE_INVALID, -        AFR_NODE_INNOCENT, -        AFR_NODE_FOOL, -        AFR_NODE_WISE, -} afr_node_type; +	for (i = 0; i < priv->child_count; i++) { +		if (!pending[i]) +			output_dirty[i] = -input_dirty[i]; +	} -typedef struct { -        afr_node_type type; -        int           wisdom; -} afr_node_character; +	for (i = 0; i < priv->child_count; i++) { +		if (!locked_on[i]) +			/* perform post-op only on subvols we had locked +			   and inspected on. 
+			*/ +			continue; +		xattr = afr_selfheal_output_xattr (this, type, output_dirty, +						   output_matrix, i); +		if (!xattr) { +			gf_log (this->name, GF_LOG_ERROR, +				"unable to allocate xdata for subvol %d", i); +			continue; +		} -static int -afr_sh_is_innocent (int32_t *array, int child_count) -{ -        int i   = 0; -        int ret = 1;   /* innocent until proven guilty */ +		afr_selfheal_post_op (frame, this, inode, i, xattr); -        for (i = 0; i < child_count; i++) { -                if (array[i]) { -                        ret = 0; -                        break; -                } -        } +		dict_unref (xattr); +	} -        return ret; +	return 0;  } -static int -afr_sh_is_fool (int32_t *array, int i, int child_count) -{ -        return array[i];   /* fool if accuses itself */ +void +afr_replies_copy (struct afr_reply *dst, struct afr_reply *src, int count) +{ +	int i = 0; +	dict_t *xdata = NULL; + +	if (dst == src) +		return; + +	for (i = 0; i < count; i++) { +		dst[i].valid = src[i].valid; +		dst[i].op_ret = src[i].op_ret; +		dst[i].op_errno = src[i].op_errno; +		dst[i].prestat = src[i].prestat; +		dst[i].poststat = src[i].poststat; +		dst[i].preparent = src[i].preparent; +		dst[i].postparent = src[i].postparent; +		dst[i].preparent2 = src[i].preparent2; +		dst[i].postparent2 = src[i].postparent2; +		if (src[i].xdata) +			xdata = dict_ref (src[i].xdata); +		else +			xdata = NULL; +		if (dst[i].xdata) +			dict_unref (dst[i].xdata); +		dst[i].xdata = xdata; +		memcpy (dst[i].checksum, src[i].checksum, +			MD5_DIGEST_LENGTH); +	}  } -static int -afr_sh_is_wise (int32_t *array, int i, int child_count) +int +afr_selfheal_fill_dirty (xlator_t *this, int *dirty, int subvol, +			 int idx, dict_t *xdata)  { -        return !array[i];  /* wise if does not accuse itself */ -} +	void *pending_raw = NULL; +	int pending[3] = {0, }; +	if (dict_get_ptr (xdata, AFR_DIRTY, &pending_raw)) +		return -1; -static int -afr_sh_all_nodes_innocent 
(afr_node_character *characters, -                           int child_count) -{ -        int i   = 0; -        int ret = 1; +	if (!pending_raw) +		return -1; + +	memcpy (pending, pending_raw, sizeof(pending)); -        for (i = 0; i < child_count; i++) { -                if (characters[i].type != AFR_NODE_INNOCENT) { -                        ret = 0; -                        break; -                } -        } +	dirty[subvol] = ntoh32 (pending[idx]); -        return ret; +	return 0;  } -static int -afr_sh_wise_nodes_exist (afr_node_character *characters, int child_count) +int +afr_selfheal_fill_matrix (xlator_t *this, int **matrix, int subvol, +			  int idx, dict_t *xdata)  { -        int i   = 0; -        int ret = 0; +	int i = 0; +	void *pending_raw = NULL; +	int pending[3] = {0, }; +	afr_private_t *priv = NULL; -        for (i = 0; i < child_count; i++) { -                if (characters[i].type == AFR_NODE_WISE) { -                        ret = 1; -                        break; -                } -        } +	priv = this->private; -        return ret; -} +	for (i = 0; i < priv->child_count; i++) { +		if (dict_get_ptr (xdata, priv->pending_key[i], &pending_raw)) +			continue; +		if (!pending_raw) +			continue; -/* - * The 'wisdom' of a wise node is 0 if any other wise node accuses it. - * It is 1 if no other wise node accuses it. - * Only wise nodes with wisdom 1 are sources. - * - * If no nodes with wisdom 1 exist, a split-brain has occurred. 
- */ +		memcpy (pending, pending_raw, sizeof(pending)); -static void -afr_sh_compute_wisdom (int32_t *pending_matrix[], -                       afr_node_character characters[], int child_count) -{ -        int i = 0; -        int j = 0; - -        for (i = 0; i < child_count; i++) { -                if (characters[i].type == AFR_NODE_WISE) { -                        characters[i].wisdom = 1; - -                        for (j = 0; j < child_count; j++) { -                                if ((characters[j].type == AFR_NODE_WISE) -                                    && pending_matrix[j][i]) { - -                                        characters[i].wisdom = 0; -                                } -                        } -                } -        } +		matrix[subvol][i] = ntoh32 (pending[idx]); +	} + +	return 0;  } -static int -afr_sh_wise_nodes_conflict (afr_node_character *characters, -                            int child_count) +int +afr_selfheal_extract_xattr (xlator_t *this, struct afr_reply *replies, +			    afr_transaction_type type, int *dirty, int **matrix)  { -        int i   = 0; -        int ret = 1; +	afr_private_t *priv = NULL; +	int i = 0; +	dict_t *xdata = NULL; +	int idx = -1; + +	idx = afr_index_for_transaction_type (type); -        for (i = 0; i < child_count; i++) { -                if ((characters[i].type == AFR_NODE_WISE) -                    && characters[i].wisdom == 1) { +	priv = this->private; -                        /* There is atleast one bona-fide wise node */ -                        ret = 0; -                        break; -                } -        } +	for (i = 0; i < priv->child_count; i++) { +		if (!replies[i].xdata) +			continue; + +		xdata = replies[i].xdata; -        return ret; +		afr_selfheal_fill_dirty (this, dirty, i, idx, xdata); +		afr_selfheal_fill_matrix (this, matrix, i, idx, xdata); +	} + +	return 0;  } -static int -afr_sh_mark_wisest_as_sources (int sources[], -                               afr_node_character 
*characters, -                               int child_count) -{ -        int nsources = 0; -        int i        = 0; -        for (i = 0; i < child_count; i++) { -                if (characters[i].wisdom == 1) { -                        sources[i] = 1; -                        nsources++; -                } -        } +/* + * This function determines if a self-heal is required for a given inode, + * and if needed, in what direction. + * + * locked_on[] is the array representing servers which have been locked and + * from which xattrs have been fetched for analysis. + * + * The output of the function is by filling the arrays sources[] and sinks[]. + * + * sources[i] is set if i'th server is an eligible source for a selfheal. + * + * sinks[i] is set if i'th server needs to be healed. + * + * if sources[0..N] are all set, there is no need for a selfheal. + * + * if sinks[0..N] are all set, the inode is in split brain. + * + */ -        return nsources; -} +int +afr_selfheal_find_direction (call_frame_t *frame, xlator_t *this, +			     struct afr_reply *replies, +			     afr_transaction_type type, unsigned char *locked_on, +			     unsigned char *sources, unsigned char *sinks) +{ +	afr_private_t *priv = NULL; +	int i = 0; +	int j = 0; +	int *dirty = NULL; +	int **matrix = NULL; +	char *accused = NULL; + +	priv = this->private; + +	dirty = alloca0 (priv->child_count * sizeof (int)); +	accused = alloca0 (priv->child_count); +	matrix = ALLOC_MATRIX(priv->child_count, int); + +	/* First construct the pending matrix for further analysis */ +	afr_selfheal_extract_xattr (this, replies, type, dirty, matrix); + +	/* Next short list all accused to exclude them from being sources */ +	for (i = 0; i < priv->child_count; i++) { +		for (j = 0; j < priv->child_count; j++) { +			if (matrix[i][j]) +				accused[j] = 1; +		} +	} -static void -afr_compute_witness_of_fools (int32_t *witnesses, int32_t **pending_matrix, -                              afr_node_character *characters, -      
                        int32_t child_count) -{ -        int i       = 0; -        int j       = 0; -        int witness = 0; - -        GF_ASSERT (witnesses); -        GF_ASSERT (pending_matrix); -        GF_ASSERT (characters); -        GF_ASSERT (child_count > 0); - -        for (i = 0; i < child_count; i++) { -                if (characters[i].type != AFR_NODE_FOOL) -                        continue; - -                witness = 0; -                for (j = 0; j < child_count; j++) { -                        if (i == j) -                                continue; -                        witness += pending_matrix[i][j]; -                } -                witnesses[i] = witness; -        } -} +	/* Short list all non-accused as sources */ +	memset (sources, 0, priv->child_count); +	for (i = 0; i < priv->child_count; i++) { +		if (!accused[i] && locked_on[i]) +			sources[i] = 1; +	} -static int32_t -afr_find_biggest_witness_among_fools (int32_t *witnesses, -                                      afr_node_character *characters, -                                      int32_t child_count) -{ -        int i               = 0; -        int biggest_witness = -1; -        int biggest_witness_idx = -1; -        int biggest_witness_cnt = -1; - -        GF_ASSERT (witnesses); -        GF_ASSERT (characters); -        GF_ASSERT (child_count > 0); - -        for (i = 0; i < child_count; i++) { -                if (characters[i].type != AFR_NODE_FOOL) -                        continue; - -                if (biggest_witness < witnesses[i]) { -                        biggest_witness = witnesses[i]; -			biggest_witness_idx = i; -			biggest_witness_cnt = 1; +	/* Everyone accused by sources are sinks */ +	memset (sinks, 0, priv->child_count); +	for (i = 0; i < priv->child_count; i++) { +		if (!sources[i])  			continue; +		for (j = 0; j < priv->child_count; j++) { +			if (matrix[i][j]) +				sinks[j] = 1;  		} +	} -		if (biggest_witness == witnesses[i]) -			biggest_witness_cnt++; -   
     } +	/* If any source has 'dirty' bit, pick first +	   'dirty' source and make everybody else sinks */ +	for (i = 0; i < priv->child_count; i++) { +		if (sources[i] && dirty[i]) { +			for (j = 0; j < priv->child_count; j++) { +				if (j != i) { +					sources[j] = 0; +					sinks[j] = 1; +				} +			} +			break; +		} +	} -	if (biggest_witness_cnt != 1) -		return -1; +	/* If no sources, all locked nodes are sinks - split brain */ +	if (AFR_COUNT (sources, priv->child_count) == 0) { +		for (i = 0; i < priv->child_count; i++) { +			if (locked_on[i]) +				sinks[i] = 1; +		} +	} -        return biggest_witness_idx; +	return 0;  } +  int -afr_mark_fool_as_source_by_witness (int32_t *sources, int32_t *witnesses, -                                    afr_node_character *characters, -                                    int32_t child_count, int32_t witness) +afr_selfheal_discover_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +			   int op_ret, int op_errno, inode_t *inode, +			   struct iatt *buf, dict_t *xdata, struct iatt *parbuf)  { -        int i        = 0; -        int nsources = 0; - -        GF_ASSERT (sources); -        GF_ASSERT (witnesses); -        GF_ASSERT (characters); -        GF_ASSERT (child_count > 0); - -        for (i = 0; i < child_count; i++) { -                if (characters[i].type != AFR_NODE_FOOL) -                        continue; - -                if (witness == witnesses[i]) { -                        sources[i] = 1; -                        nsources++; -                } -        } -        return nsources; -} +	afr_local_t *local = NULL; +	int i = -1; +	local = frame->local; +	i = (long) cookie; + +	local->replies[i].valid = 1; +	local->replies[i].op_ret = op_ret; +	local->replies[i].op_errno = op_errno; +	if (buf) +		local->replies[i].poststat = *buf; +	if (parbuf) +		local->replies[i].postparent = *parbuf; +	if (xdata) +		local->replies[i].xdata = dict_ref (xdata); + +	syncbarrier_wake (&local->barrier); -int 
-afr_mark_fool_as_source_by_idx (int32_t *sources, int child_count, int idx) -{ -	if (idx >= 0 && idx < child_count) { -		sources[idx] = 1; -		return 1; -	}  	return 0;  } -static int -afr_find_largest_file_size (struct iatt *bufs, int32_t *success_children, -			    int child_count) +inode_t * +afr_selfheal_unlocked_lookup_on (call_frame_t *frame, inode_t *parent, +				 const char *name, struct afr_reply *replies, +				 unsigned char *lookup_on)  { -	int idx = -1; -	int i = -1; -	int child = -1; -	uint64_t max_size = 0; -        uint64_t min_size = 0; -        int      num_children = 0; +	loc_t loc = {0, }; +	dict_t *xattr_req = NULL; +	afr_local_t *local = NULL; +	afr_private_t *priv = NULL; +	inode_t *inode = NULL; -	for (i = 0; i < child_count; i++) { -		if (success_children[i] == -1) -			break; +	local = frame->local; +	priv = frame->this->private; -		child = success_children[i]; -		if (bufs[child].ia_size > max_size) { -			max_size = bufs[child].ia_size; -			idx = child; -		} - -                if ((num_children == 0) || (bufs[child].ia_size < min_size)) { -                        min_size = bufs[child].ia_size; -                } +	xattr_req = dict_new (); +	if (!xattr_req) +		return NULL; -                num_children++; +	if (afr_xattr_req_prepare (frame->this, xattr_req) != 0) { +		dict_destroy (xattr_req); +		return NULL;  	} -        /* If sizes are same for all of them, finding sources will have to -         * happen with pending changelog. 
So return -1 -         */ -        if ((num_children > 1) && (min_size == max_size)) -                return -1; -	return idx; -} +	inode = inode_new (parent->table); +	if (!inode) { +		dict_destroy (xattr_req); +		return NULL; +	} +	loc.parent = inode_ref (parent); +	uuid_copy (loc.pargfid, parent->gfid); +	loc.name = name; +	loc.inode = inode_ref (inode); -static int -afr_find_newest_file (struct iatt *bufs, int32_t *success_children, -		      int child_count) -{ -	int idx = -1; -	int i = -1; -	int child = -1; -	uint64_t max_ctime = 0; +	AFR_ONLIST (lookup_on, frame, afr_selfheal_discover_cbk, lookup, &loc, +		    xattr_req); -	for (i = 0; i < child_count; i++) { -		if (success_children[i] == -1) -			break; +	afr_replies_copy (replies, local->replies, priv->child_count); -		child = success_children[i]; -		if (bufs[child].ia_ctime > max_ctime) { -			max_ctime = bufs[child].ia_ctime; -			idx = child; -		} -	} +	loc_wipe (&loc); +	dict_unref (xattr_req); -	return idx; +	return inode;  } -static int -afr_mark_biggest_of_fools_as_source (int32_t *sources, int32_t **pending_matrix, -                                     afr_node_character *characters, -				     int32_t *success_children, -                                     int child_count, struct iatt *bufs) -{ -        int32_t       biggest_witness = 0; -        int           nsources        = 0; -        int32_t       *witnesses      = NULL; - -        GF_ASSERT (child_count > 0); - -	biggest_witness = afr_find_largest_file_size (bufs, success_children, -						      child_count); -	if (biggest_witness != -1) -		goto found; - -        witnesses = GF_CALLOC (child_count, sizeof (*witnesses), -                               gf_afr_mt_int32_t); -        if (NULL == witnesses) { -                nsources = -1; -                goto out; -        } - -        afr_compute_witness_of_fools (witnesses, pending_matrix, characters, -                                      child_count); -        biggest_witness = 
afr_find_biggest_witness_among_fools (witnesses, -                                                                characters, -                                                                child_count); -	if (biggest_witness != -1) -		goto found; - -	biggest_witness = afr_find_newest_file (bufs, success_children, -						child_count); - -found: -	nsources = afr_mark_fool_as_source_by_idx (sources, child_count, -						   biggest_witness); -out: -        GF_FREE (witnesses); -        return nsources; -} -  int -afr_mark_child_as_source_by_uid (int32_t *sources, struct iatt *bufs, -                                 int32_t *success_children, -                                 unsigned int child_count, uint32_t uid) +afr_selfheal_unlocked_discover_on (call_frame_t *frame, inode_t *inode, +				   uuid_t gfid, struct afr_reply *replies, +				   unsigned char *discover_on)  { -        int     i        = 0; -        int     nsources = 0; -        int     child    = 0; - -        for (i = 0; i < child_count; i++) { -                if (-1 == success_children[i]) -                        break; - -                child = success_children[i]; -                if (uid == bufs[child].ia_uid) { -                        sources[child] = 1; -                        nsources++; -                } -        } -        return nsources; -} +	loc_t loc = {0, }; +	dict_t *xattr_req = NULL; +	afr_local_t *local = NULL; +	afr_private_t *priv = NULL; -int -afr_get_child_with_lowest_uid (struct iatt *bufs, int32_t *success_children, -                               unsigned int child_count) -{ -        int     i        = 0; -        int     smallest = -1; -        int     child    = 0; - -        for (i = 0; i < child_count; i++) { -                if (-1 == success_children[i]) -                        break; -                child = success_children[i]; -                if ((smallest == -1) || -                    (bufs[child].ia_uid < bufs[smallest].ia_uid)) { -                        
smallest = child; -                } -        } -        return smallest; -} +	local = frame->local; +	priv = frame->this->private; -static int -afr_sh_mark_lowest_uid_as_source (struct iatt *bufs, int32_t *success_children, -                                  int child_count, int32_t *sources) -{ -        int   nsources              = 0; -        int   smallest              = 0; - -        smallest = afr_get_child_with_lowest_uid (bufs, success_children, -                                                  child_count); -        if (smallest < 0) { -                nsources = -1; -                goto out; -        } -        nsources = afr_mark_child_as_source_by_uid (sources, bufs, -                                                    success_children, child_count, -                                                    bufs[smallest].ia_uid); -out: -        return nsources; -} +	xattr_req = dict_new (); +	if (!xattr_req) +		return -ENOMEM; -int -afr_get_no_xattr_dir_read_child (xlator_t *this, int32_t *success_children, -                                 struct iatt *bufs) -{ -        afr_private_t *priv = NULL; -        int            i = 0; -        int            child = -1; -        int            read_child = -1; - -        priv = this->private; -        for (i = 0; i < priv->child_count; i++) { -                child = success_children[i]; -                if (child < 0) -                        break; -                if (read_child < 0) -                        read_child = child; -                else if (bufs[read_child].ia_size < bufs[child].ia_size) -                        read_child = child; -        } -        return read_child; -} +	if (afr_xattr_req_prepare (frame->this, xattr_req) != 0) { +		dict_destroy (xattr_req); +		return -ENOMEM; +	} -int -afr_sh_mark_zero_size_file_as_sink (struct iatt *bufs, int32_t *success_children, -                                    int child_count, int32_t *sources) -{ -        int             nsources = 0; -        int  
           i = 0; -        int             child = 0; -        gf_boolean_t    sink_exists = _gf_false; -        gf_boolean_t    source_exists = _gf_false; -        int             source = -1; - -        for (i = 0; i < child_count; i++) { -                child = success_children[i]; -                if (child < 0) -                        break; -                if (!bufs[child].ia_size) { -                        sink_exists = _gf_true; -                        continue; -                } -                if (!source_exists) { -                        source_exists = _gf_true; -                        source = child; -                        continue; -                } -                if (bufs[source].ia_size != bufs[child].ia_size) { -                        nsources = -1; -                        goto out; -                } -        } -        if (!source_exists && !sink_exists) { -                nsources = -1; -                goto out; -        } - -        if (!source_exists || !sink_exists) -                goto out; - -        for (i = 0; i < child_count; i++) { -                child = success_children[i]; -                if (child < 0) -                        break; -                if (bufs[child].ia_size) { -                        sources[child] = 1; -                        nsources++; -                } -        } -out: -        return nsources; -} +	loc.inode = inode_ref (inode); +	uuid_copy (loc.gfid, gfid); -char * -afr_get_character_str (afr_node_type type) -{ -        char *character = NULL; - -        switch (type) { -        case AFR_NODE_INNOCENT: -                character = "innocent"; -                break; -        case AFR_NODE_FOOL: -                character = "fool"; -                break; -        case AFR_NODE_WISE: -                character = "wise"; -                break; -        default: -                character = "invalid"; -                break; -        } -        return character; -} +	AFR_ONLIST 
(discover_on, frame, afr_selfheal_discover_cbk, lookup, &loc, +		    xattr_req); -afr_node_type -afr_find_child_character_type (int32_t *pending_row, int32_t child, -                               unsigned int child_count) -{ -        afr_node_type type = AFR_NODE_INVALID; +	afr_replies_copy (replies, local->replies, priv->child_count); -        GF_ASSERT ((child >= 0) && (child < child_count)); +	loc_wipe (&loc); +	dict_unref (xattr_req); -        if (afr_sh_is_innocent (pending_row, child_count)) -                type = AFR_NODE_INNOCENT; -        else if (afr_sh_is_fool (pending_row, child, child_count)) -                type = AFR_NODE_FOOL; -        else if (afr_sh_is_wise (pending_row, child, child_count)) -                type = AFR_NODE_WISE; -        return type; +	return 0;  }  int -afr_build_sources (xlator_t *this, dict_t **xattr, struct iatt *bufs, -                   int32_t **pending_matrix, int32_t *sources, -                   int32_t *success_children, afr_transaction_type type, -                   int32_t *subvol_status, gf_boolean_t ignore_ignorant) +afr_selfheal_unlocked_discover (call_frame_t *frame, inode_t *inode, +				uuid_t gfid, struct afr_reply *replies)  { -        afr_private_t           *priv = NULL; -        afr_self_heal_type      sh_type    = AFR_SELF_HEAL_INVALID; -        int                     nsources   = -1; -        unsigned char           *ignorant_subvols = NULL; -        unsigned int            child_count = 0; - -        priv = this->private; -        child_count = priv->child_count; - -        if (afr_get_children_count (success_children, priv->child_count) == 0) -                goto out; - -        if (!ignore_ignorant) { -                ignorant_subvols = GF_CALLOC (sizeof (*ignorant_subvols), -                                              child_count, gf_afr_mt_char); -                if (NULL == ignorant_subvols) -                        goto out; -        } - -        afr_build_pending_matrix (priv->pending_key, 
pending_matrix, -                                  ignorant_subvols, xattr, type, -                                  priv->child_count); - -        if (!ignore_ignorant) -                afr_mark_ignorant_subvols_as_pending (pending_matrix, -                                                      ignorant_subvols, -                                                      priv->child_count); -        sh_type = afr_self_heal_type_for_transaction (type); -        if (AFR_SELF_HEAL_INVALID == sh_type) -                goto out; - -        afr_sh_print_pending_matrix (pending_matrix, this); - -        nsources = afr_mark_sources (this, sources, pending_matrix, bufs, -                                     sh_type, success_children, subvol_status); -out: -        GF_FREE (ignorant_subvols); -        return nsources; -} +	afr_private_t *priv = NULL; -void -afr_find_character_types (afr_node_character *characters, -                          int32_t **pending_matrix, int32_t *success_children, -                          unsigned int child_count) -{ -        afr_node_type type  = AFR_NODE_INVALID; -        int           child = 0; -        int           i     = 0; - -        for (i = 0; i < child_count; i++) { -                child = success_children[i]; -                if (child == -1) -                        break; -                type = afr_find_child_character_type (pending_matrix[child], -                                                      child, child_count); -                characters[child].type = type; -        } -} +	priv = frame->this->private; -void -afr_mark_success_children_sources (int32_t *sources, int32_t *success_children, -                                   unsigned int child_count) -{ -        int i = 0; -        for (i = 0; i < child_count; i++) { -                if (success_children[i] == -1) -                        break; -                sources[success_children[i]] = 1; -        } +	return afr_selfheal_unlocked_discover_on (frame, inode, gfid, 
replies, +						  priv->child_up);  } -/** - * mark_sources: Mark all 'source' nodes and return number of source - * nodes found - * - * A node (a row in the pending matrix) belongs to one of - * three categories: - * - * M is the pending matrix. - * - * 'innocent' - M[i] is all zeroes - * 'fool'     - M[i] has i'th element = 1 (self-reference) - * 'wise'     - M[i] has i'th element = 0, others are 1 or 0. - * - * All 'innocent' nodes are sinks. If all nodes are innocent, no self-heal is - * needed. - * - * A 'wise' node can be a source. If two 'wise' nodes conflict, it is - * a split-brain. If one wise node refers to the other but the other doesn't - * refer back, the referrer is a source. - * - * All fools are sinks, unless there are no 'wise' nodes. In that case, - * one of the fools is made a source. - */ +  int -afr_mark_sources (xlator_t *this, int32_t *sources, int32_t **pending_matrix, -                  struct iatt *bufs, afr_self_heal_type type, -                  int32_t *success_children, int32_t *subvol_status) +afr_selfheal_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +		       int op_ret, int op_errno, dict_t *xdata)  { -        /* stores the 'characters' (innocent, fool, wise) of the nodes */ -        afr_node_character *characters =  NULL; -        int                nsources    = -1; -        unsigned int       child_count = 0; -        afr_private_t      *priv       = NULL; - -        priv = this->private; -        child_count = priv->child_count; -        characters = GF_CALLOC (sizeof (afr_node_character), -                                child_count, gf_afr_mt_afr_node_character); -        if (!characters) -                goto out; - -        this = THIS; - -        /* start clean */ -        memset (sources, 0, sizeof (*sources) * child_count); -        nsources = 0; -        afr_find_character_types (characters, pending_matrix, success_children, -                                  child_count); -        if 
(afr_sh_all_nodes_innocent (characters, child_count)) { -                switch (type) { -                case AFR_SELF_HEAL_METADATA: -                        nsources = afr_sh_mark_lowest_uid_as_source (bufs, -                                                             success_children, -                                                             child_count, -                                                             sources); -                        break; -                case AFR_SELF_HEAL_DATA: -                        nsources = afr_sh_mark_zero_size_file_as_sink (bufs, -                                                             success_children, -                                                             child_count, -                                                             sources); -                        if ((nsources < 0) && subvol_status) -                                *subvol_status |= SPLIT_BRAIN; -                        break; -                default: -                        break; -                } -                goto out; -        } - -        if (afr_sh_wise_nodes_exist (characters, child_count)) { -                afr_sh_compute_wisdom (pending_matrix, characters, child_count); - -                if (afr_sh_wise_nodes_conflict (characters, child_count)) { -                        if (subvol_status) -                                *subvol_status |= SPLIT_BRAIN; -                        nsources = -1; -                } else { -                        nsources = afr_sh_mark_wisest_as_sources (sources, -                                                                  characters, -                                                                  child_count); -                } -        } else { -                if (subvol_status) -                        *subvol_status |= ALL_FOOLS; -                nsources = afr_mark_biggest_of_fools_as_source (sources, -                                                                
pending_matrix, -                                                                characters, -								success_children, -                                                                child_count, bufs); -        } +	afr_local_t *local = NULL; +	int i = 0; -out: -        if (nsources == 0) -                afr_mark_success_children_sources (sources, success_children, -                                                   child_count); -        GF_FREE (characters); +	local = frame->local; +	i = (long) cookie; -        gf_log (this->name, GF_LOG_DEBUG, "Number of sources: %d", nsources); -        return nsources; -} +	local->replies[i].valid = 1; +	local->replies[i].op_ret = op_ret; +	local->replies[i].op_errno = op_errno; -void -afr_sh_pending_to_delta (afr_private_t *priv, dict_t **xattr, -                         int32_t *delta_matrix[], unsigned char success[], -                         int child_count, afr_transaction_type type) -{ -        int     tgt     = 0; -        int     src     = 0; -        int     value   = 0; - -        afr_build_pending_matrix (priv->pending_key, delta_matrix, NULL, -                                  xattr, type, priv->child_count); - -        /* -         * The algorithm here has two parts.  First, for each subvol indexed -         * as tgt, we try to figure out what count everyone should have for it. -         * If the self-heal succeeded, that's easy; the value is zero. -         * Otherwise, the value is the maximum of the succeeding nodes' counts. -         * Once we know the value, we loop through (possibly for a second time) -         * setting each count to the difference so that when we're done all -         * succeeding nodes will have the same count for tgt. -         */ -        for (tgt = 0; tgt < priv->child_count; ++tgt) { -                value = 0; -                if (!success[tgt]) { -                        /* Find the maximum. 
*/ -                        for (src = 0; src < priv->child_count; ++src) { -                                if (!success[src]) { -                                        continue; -                                } -                                if (delta_matrix[src][tgt] > value) { -                                        value = delta_matrix[src][tgt]; -                                } -                        } -                } -                /* Force everyone who succeeded to the chosen value. */ -                for (src = 0; src < priv->child_count; ++src) { -                        if (success[src]) { -                                delta_matrix[src][tgt] = value -                                                       - delta_matrix[src][tgt]; -                        } -                        else { -                                delta_matrix[src][tgt] = 0; -                        } -                } -        } +	syncbarrier_wake (&local->barrier); + +	return 0;  }  int -afr_sh_delta_to_xattr (xlator_t *this, -                       int32_t *delta_matrix[], dict_t *xattr[], -                       int child_count, afr_transaction_type type) -{ -        int              i       = 0; -        int              j       = 0; -        int              k       = 0; -        int              ret     = 0; -        int32_t         *pending = NULL; -        int32_t         *local_pending = NULL; -        afr_private_t   *priv = NULL; - -        priv = this->private; -        for (i = 0; i < child_count; i++) { -                if (!xattr[i]) -                        continue; - -                local_pending = NULL; -                for (j = 0; j < child_count; j++) { -                        pending = GF_CALLOC (sizeof (int32_t), 3, -                                             gf_afr_mt_int32_t); - -                        if (!pending) { -                                gf_log (this->name, GF_LOG_ERROR, -                                        "failed 
to allocate pending entry " -                                        "for %s[%d] on %s", -                                        priv->pending_key[j], type, -                                        priv->children[i]->name); -                                continue; -                        } -                        /* 3 = data+metadata+entry */ - -                        k = afr_index_for_transaction_type (type); - -                        pending[k] = hton32 (delta_matrix[i][j]); - -                        if (j == i) { -                                local_pending = pending; -                                continue; -                        } -                        ret = dict_set_bin (xattr[i], priv->pending_key[j], -                                            pending, -                                        AFR_NUM_CHANGE_LOGS * sizeof (int32_t)); -                        if (ret < 0) { -                                gf_log (this->name, GF_LOG_WARNING, -                                        "Unable to set dict value."); -                                GF_FREE (pending); -                        } -                } -                if (local_pending) { -                        ret = dict_set_bin (xattr[i], priv->pending_key[i], -                                            local_pending, -                                        AFR_NUM_CHANGE_LOGS * sizeof (int32_t)); -                        if (ret < 0) { -                                gf_log (this->name, GF_LOG_WARNING, -                                        "Unable to set dict value."); -                                GF_FREE (local_pending); -                        } -                } -        } -        return 0; +afr_selfheal_locked_fill (call_frame_t *frame, xlator_t *this, +			  unsigned char *locked_on) +{ +	int i = 0; +	afr_private_t *priv = NULL; +	afr_local_t *local = NULL; +	int count = 0; + +	local = frame->local; +	priv = this->private; + +	for (i = 0; i < priv->child_count; 
i++) { +		if (local->replies[i].valid && local->replies[i].op_ret == 0) { +			locked_on[i] = 1; +			count++; +		} else { +			locked_on[i] = 0; +		} +	} + +	return count;  }  int -afr_sh_missing_entries_done (call_frame_t *frame, xlator_t *this) +afr_selfheal_tryinodelk (call_frame_t *frame, xlator_t *this, inode_t *inode, +			 char *dom, off_t off, size_t size, +			 unsigned char *locked_on)  { -        afr_local_t     *local = NULL; -        afr_self_heal_t *sh = NULL; - -        local = frame->local; -        sh = &local->self_heal; - -        afr_sh_reset (frame, this); - -        if (local->unhealable) { -                gf_log (this->name, GF_LOG_DEBUG, -                        "split brain found, aborting selfheal of %s", -                        local->loc.path); -        } - -        if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { -                sh->completion_cbk (frame, this); -        } else { -                gf_log (this->name, GF_LOG_TRACE, -                        "proceeding to metadata check on %s", -                        local->loc.path); -                afr_self_heal_metadata (frame, this); -        } - -        return 0; -} +	loc_t loc = {0,}; +	struct gf_flock flock = {0, }; +	loc.inode = inode_ref (inode); +	uuid_copy (loc.gfid, inode->gfid); -static int -afr_sh_missing_entries_finish (call_frame_t *frame, xlator_t *this) -{ -        afr_internal_lock_t *int_lock = NULL; -        afr_local_t         *local    = NULL; +	flock.l_type = F_WRLCK; +	flock.l_start = off; +	flock.l_len = size; -        local = frame->local; -        int_lock = &local->internal_lock; +	AFR_ONALL (frame, afr_selfheal_lock_cbk, inodelk, dom, +		    &loc, F_SETLK, &flock, NULL); -        int_lock->lock_cbk = afr_sh_missing_entries_done; -        afr_unlock (frame, this); +	loc_wipe (&loc); -        return 0; +	return afr_selfheal_locked_fill (frame, this, locked_on);  } +  int -afr_sh_common_create (afr_self_heal_t *sh, unsigned int child_count) -{ -        int  
   ret = -ENOMEM; -        sh->buf = GF_CALLOC (child_count, sizeof (*sh->buf), -                             gf_afr_mt_iatt); -        if (!sh->buf) -                goto out; -        sh->parentbufs = GF_CALLOC (child_count, sizeof (*sh->parentbufs), -                                    gf_afr_mt_iatt); -        if (!sh->parentbufs) -                goto out; -        sh->child_errno = GF_CALLOC (child_count, sizeof (*sh->child_errno), -                                     gf_afr_mt_int); -        if (!sh->child_errno) -                goto out; -        sh->success_children = afr_children_create (child_count); -        if (!sh->success_children) -                goto out; -        sh->fresh_children = afr_children_create (child_count); -        if (!sh->fresh_children) -                goto out; -        sh->xattr = GF_CALLOC (child_count, sizeof (*sh->xattr), -                               gf_afr_mt_dict_t); -        if (!sh->xattr) -                goto out; -        ret = 0; -out: -        return ret; -} +afr_selfheal_inodelk (call_frame_t *frame, xlator_t *this, inode_t *inode, +		      char *dom, off_t off, size_t size, +		      unsigned char *locked_on) +{ +	loc_t loc = {0,}; +	struct gf_flock flock = {0, }; +	afr_local_t *local = NULL; +	int i = 0; +	afr_private_t *priv = NULL; + +	priv = this->private; +	local = frame->local; + +	loc.inode = inode_ref (inode); +	uuid_copy (loc.gfid, inode->gfid); + +	flock.l_type = F_WRLCK; +	flock.l_start = off; +	flock.l_len = size; + +	AFR_ONALL (frame, afr_selfheal_lock_cbk, inodelk, dom, +		    &loc, F_SETLK, &flock, NULL); + +	for (i = 0; i < priv->child_count; i++) { +		if (local->replies[i].op_ret == -1 && +		    local->replies[i].op_errno == EAGAIN) { +			afr_selfheal_locked_fill (frame, this, locked_on); +			afr_selfheal_uninodelk (frame, this, inode, dom, off, +						size, locked_on); + +			AFR_SEQ (frame, afr_selfheal_lock_cbk, inodelk, dom, +				 &loc, F_SETLKW, &flock, NULL); +			break; +		} +	} -void 
-afr_sh_common_lookup_resp_handler (call_frame_t *frame, void *cookie, -                                   xlator_t *this, -                                   int32_t op_ret, int32_t op_errno, -                                   inode_t *inode, struct iatt *buf, -                                   dict_t *xattr, struct iatt *postparent, -                                   loc_t *loc) -{ -        int              child_index = 0; -        afr_local_t     *local = NULL; -        afr_private_t   *priv = NULL; -        afr_self_heal_t *sh = NULL; - -        local = frame->local; -        priv = this->private; -        sh   = &local->self_heal; -        child_index = (long) cookie; - -        LOCK (&frame->lock); -        { -                if (op_ret == 0) { -                        sh->buf[child_index] = *buf; -                        sh->parentbufs[child_index] = *postparent; -                        sh->success_children[sh->success_count] = child_index; -                        sh->success_count++; -                        sh->xattr[child_index] = dict_ref (xattr); -                } else { -                        gf_log (this->name, GF_LOG_DEBUG, "path %s on subvolume" -                                " %s => -1 (%s)", loc->path, -                                priv->children[child_index]->name, -                                strerror (op_errno)); -                        local->self_heal.child_errno[child_index] = op_errno; -                } -        } -        UNLOCK (&frame->lock); -        return; -} +	loc_wipe (&loc); -gf_boolean_t -afr_valid_ia_type (ia_type_t ia_type) -{ -        switch (ia_type) { -        case IA_IFSOCK: -        case IA_IFREG: -        case IA_IFBLK: -        case IA_IFCHR: -        case IA_IFIFO: -        case IA_IFLNK: -        case IA_IFDIR: -                return _gf_true; -        default: -                return _gf_false; -        } -        return _gf_false; +	return afr_selfheal_locked_fill (frame, this, locked_on);  } +  
int -afr_impunge_frame_create (call_frame_t *frame, xlator_t *this, -                          int active_source, call_frame_t **impunge_frame) +afr_selfheal_uninodelk (call_frame_t *frame, xlator_t *this, inode_t *inode, +			char *dom, off_t off, size_t size, +			const unsigned char *locked_on)  { -        afr_local_t     *local         = NULL; -        afr_local_t     *impunge_local = NULL; -        afr_self_heal_t *impunge_sh    = NULL; -        int32_t         op_errno       = 0; -        afr_private_t   *priv          = NULL; -        int             ret            = 0; -        call_frame_t    *new_frame     = NULL; - -        op_errno = ENOMEM; -        priv = this->private; -        new_frame = copy_frame (frame); -        if (!new_frame) { -                goto out; -        } - -        AFR_LOCAL_ALLOC_OR_GOTO (impunge_local, out); - -        local = frame->local; -        new_frame->local = impunge_local; -        impunge_sh = &impunge_local->self_heal; -        impunge_sh->sh_frame = frame; -        impunge_sh->active_source = active_source; -        impunge_local->child_up  = memdup (local->child_up, -                                           sizeof (*local->child_up) * -                                           priv->child_count); -        if (!impunge_local->child_up) -                goto out; - -        impunge_local->pending = afr_matrix_create (priv->child_count, -                                                    AFR_NUM_CHANGE_LOGS); -        if (!impunge_local->pending) -                goto out; - -        ret = afr_sh_common_create (impunge_sh, priv->child_count); -        if (ret) { -                op_errno = -ret; -                goto out; -        } -        op_errno = 0; -        *impunge_frame = new_frame; -out: -        if (op_errno && new_frame) -                AFR_STACK_DESTROY (new_frame); -        return -op_errno; -} +	loc_t loc = {0,}; +	struct gf_flock flock = {0, }; -void -afr_sh_missing_entry_call_impunge_recreate 
(call_frame_t *frame, xlator_t *this, -                                            struct iatt *buf, -                                            struct iatt *postparent, -                                            afr_impunge_done_cbk_t impunge_done) -{ -        call_frame_t    *impunge_frame = NULL; -        afr_local_t     *local = NULL; -        afr_local_t     *impunge_local = NULL; -        afr_self_heal_t *sh = NULL; -        afr_self_heal_t *impunge_sh = NULL; -        int             ret = 0; -        unsigned int    enoent_count = 0; -        afr_private_t   *priv = NULL; -        int             i = 0; -        int32_t         op_errno = 0; - -        local = frame->local; -        sh    = &local->self_heal; -        priv  = this->private; - -        enoent_count = afr_errno_count (NULL, sh->child_errno, -                                        priv->child_count, ENOENT); -        if (!enoent_count) { -                gf_log (this->name, GF_LOG_INFO, -                        "no missing files - %s. 
proceeding to metadata check", -                        local->loc.path); -                goto out; -        } -        sh->impunge_done = impunge_done; -        ret = afr_impunge_frame_create (frame, this, sh->source, &impunge_frame); -        if (ret) -                goto out; -        impunge_local = impunge_frame->local; -        impunge_sh    = &impunge_local->self_heal; -        loc_copy (&impunge_local->loc, &local->loc); -        ret = afr_build_parent_loc (&impunge_sh->parent_loc, -                                    &impunge_local->loc, &op_errno); -        if (ret) { -                ret = -op_errno; -                goto out; -        } -        impunge_local->call_count = enoent_count; -        impunge_sh->entrybuf = sh->buf[sh->source]; -        impunge_sh->parentbuf = sh->parentbufs[sh->source]; -        for (i = 0; i < priv->child_count; i++) { -                if (!impunge_local->child_up[i]) { -                        impunge_sh->child_errno[i] = ENOTCONN; -                        continue; -                } -                if (sh->child_errno[i] != ENOENT) { -                        impunge_sh->child_errno[i] = EEXIST; -                        continue; -                } -        } -        for (i = 0; i < priv->child_count; i++) { -                if (sh->child_errno[i] != ENOENT) -                        continue; -                afr_sh_entry_impunge_create (impunge_frame, this, i); -                enoent_count--; -        } -        GF_ASSERT (!enoent_count); -        return; -out: -        if (ret) { -                gf_log (this->name, GF_LOG_ERROR, "impunge of %s failed, " -                        "reason: %s", local->loc.path, strerror (-ret)); -                afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); -        } -        afr_sh_missing_entries_finish (frame, this); -} -int -afr_sh_create_entry_cbk (call_frame_t *frame, xlator_t *this, -                         int32_t op_ret, int32_t op_errno) -{ -        afr_local_t     
*local = NULL; -        afr_self_heal_t *sh = NULL; - -        local = frame->local; -        sh = &local->self_heal; -        if (op_ret < 0) -                afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); -        afr_sh_missing_entries_finish (frame, this); -        return 0; -} +	loc.inode = inode_ref (inode); +	uuid_copy (loc.gfid, inode->gfid); -static int -sh_missing_entries_create (call_frame_t *frame, xlator_t *this) -{ -        afr_local_t     *local = NULL; -        afr_self_heal_t *sh = NULL; -        int              type = 0; -        struct iatt     *buf = NULL; -        struct iatt     *postparent = NULL; - -        local = frame->local; -        sh = &local->self_heal; - -        buf = &sh->buf[sh->source]; -        postparent = &sh->parentbufs[sh->source]; - -        type = buf->ia_type; -        if (!afr_valid_ia_type (type)) { -                gf_log (this->name, GF_LOG_ERROR, -                        "%s: unknown file type: 0%o", local->loc.path, type); -                afr_set_local_for_unhealable (local); -                afr_sh_missing_entries_finish (frame, this); -                goto out; -        } - -        afr_sh_missing_entry_call_impunge_recreate (frame, this, -                                                    buf, postparent, -                                                    afr_sh_create_entry_cbk); -out: -        return 0; -} +	flock.l_type = F_UNLCK; +	flock.l_start = off; +	flock.l_len = size; -void -afr_sh_missing_entries_lookup_done (call_frame_t *frame, xlator_t *this, -                                    int32_t op_ret, int32_t op_errno) -{ -        afr_local_t     *local = NULL; -        afr_self_heal_t *sh = NULL; -        afr_private_t   *priv = NULL; -        ia_type_t       ia_type = IA_INVAL; -        int32_t         nsources = 0; -        loc_t           *loc = NULL; -        int32_t         subvol_status = 0; -        afr_transaction_type txn_type = AFR_DATA_TRANSACTION; -        gf_boolean_t    
split_brain = _gf_false; -        int             read_child = -1; - -        local = frame->local; -        sh = &local->self_heal; -        priv = this->private; -        loc = &local->loc; - -        if (op_ret < 0) { -                if (op_errno == EIO) { -                        afr_set_local_for_unhealable (local); -                } -                // EIO can happen if finding the fresh parent dir failed -                goto out; -        } - -        //now No chance for the ia_type to conflict -        ia_type = sh->buf[sh->success_children[0]].ia_type; -        txn_type = afr_transaction_type_get (ia_type); -        nsources = afr_build_sources (this, sh->xattr, sh->buf, -                                      sh->pending_matrix, sh->sources, -                                      sh->success_children, txn_type, -                                      &subvol_status, _gf_false); -        if (nsources < 0) { -                gf_log (this->name, GF_LOG_INFO, "No sources for dir of %s," -                        " in missing entry self-heal, continuing with the rest" -                        " of the self-heals", local->loc.path); -                if (subvol_status & SPLIT_BRAIN) { -                        split_brain = _gf_true; -                        switch (txn_type) { -                        case AFR_DATA_TRANSACTION: -                                nsources = 1; -                                sh->sources[sh->success_children[0]] = 1; -                                break; -                        case AFR_ENTRY_TRANSACTION: -                                read_child = afr_get_no_xattr_dir_read_child -                                                          (this, -                                                           sh->success_children, -                                                           sh->buf); -                                sh->sources[read_child] = 1; -                                nsources = 1; -                          
      break; -                        default: -                                op_errno = EIO; -                                goto out; -                        } -                } else { -                        op_errno = EIO; -                        goto out; -                } -        } - -        afr_get_fresh_children (sh->success_children, sh->sources, -                                sh->fresh_children, priv->child_count); -        sh->source = sh->fresh_children[0]; -        if (sh->source == -1) { -                gf_log (this->name, GF_LOG_DEBUG, "No active sources found."); -                op_errno = EIO; -                goto out; -        } - -        if (sh->gfid_sh_success_cbk) -                sh->gfid_sh_success_cbk (frame, this); -        sh->type = sh->buf[sh->source].ia_type; -        if (uuid_is_null (loc->inode->gfid)) -                uuid_copy (loc->gfid, sh->buf[sh->source].ia_gfid); -        if (split_brain) { -                afr_sh_missing_entries_finish (frame, this); -        } else { -                sh_missing_entries_create (frame, this); -        } -        return; -out: -        afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); -        afr_sh_set_error (sh, op_errno); -        afr_sh_missing_entries_finish (frame, this); -        return; -} +	AFR_ONLIST (locked_on, frame, afr_selfheal_lock_cbk, inodelk, +		    dom, &loc, F_SETLK, &flock, NULL); -static int -afr_sh_common_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, -                          int32_t op_ret, int32_t op_errno, inode_t *inode, -                          struct iatt *buf, dict_t *xattr, -                          struct iatt *postparent) -{ -        int                     call_count = 0; -        afr_local_t             *local = NULL; -        afr_self_heal_t         *sh    = NULL; -        afr_private_t           *priv  = NULL; - -        local = frame->local; -        sh    = &local->self_heal; -        priv  = this->private; - -       
 afr_sh_common_lookup_resp_handler (frame, cookie, this, op_ret, -                                           op_errno, inode, buf, xattr, -                                           postparent, &sh->lookup_loc); -        call_count = afr_frame_return (frame); - -        if (call_count) -                goto out; -        op_ret = -1; -        if (!sh->success_count) { -                op_errno = afr_resultant_errno_get (NULL, sh->child_errno, -                                                    priv->child_count); -                gf_log (this->name, GF_LOG_ERROR, "Failed to lookup %s, " -                        "reason %s", sh->lookup_loc.path, -                        strerror (op_errno)); -                goto done; -        } - -        if ((sh->lookup_flags & AFR_LOOKUP_FAIL_CONFLICTS) && -            (afr_conflicting_iattrs (sh->buf, sh->success_children, -                                     priv->child_count, -                                     sh->lookup_loc.path, this->name))) { -                op_errno = EIO; -                gf_log (this->name, GF_LOG_ERROR, "Conflicting entries " -                        "for %s", sh->lookup_loc.path); -                goto done; -        } - -        if ((sh->lookup_flags & AFR_LOOKUP_FAIL_MISSING_GFIDS) && -            (afr_gfid_missing_count (this->name, sh->success_children, -                                     sh->buf, priv->child_count, -                                     sh->lookup_loc.path))) { -                op_errno = ENODATA; -                gf_log (this->name, GF_LOG_ERROR, "Missing Gfids " -                        "for %s", sh->lookup_loc.path); -                goto done; -        } -        op_ret = 0; - -done: -        sh->lookup_done (frame, this, op_ret, op_errno); -out: -        return 0; +	loc_wipe (&loc); + +	return 0;  } +  int -afr_sh_remove_entry_cbk (call_frame_t *frame, xlator_t *this, int child, -                         int32_t op_ret, int32_t op_errno) +afr_selfheal_tryentrylk 
(call_frame_t *frame, xlator_t *this, inode_t *inode, +			 char *dom, const char *name, unsigned char *locked_on)  { -        int             call_count = 0; -        afr_local_t     *local = NULL; -        afr_self_heal_t *sh = NULL; - -        local = frame->local; -        sh = &local->self_heal; - -        GF_ASSERT (sh->post_remove_call); -        if ((op_ret == -1) && (op_errno != ENOENT)) { -                gf_log (this->name, GF_LOG_ERROR, -                        "purge entry %s failed, on child %d reason, %s", -                        local->loc.path, child, strerror (op_errno)); -                LOCK (&frame->lock); -                { -                        afr_sh_set_error (sh, EIO); -                        afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); -                } -                UNLOCK (&frame->lock); -        } -        call_count = afr_frame_return (frame); -        if (call_count == 0) -                sh->post_remove_call (frame, this); -        return 0; -} +	loc_t loc = {0,}; -void -afr_sh_call_entry_expunge_remove (call_frame_t *frame, xlator_t *this, -                                  int child_index, struct iatt *buf, -                                  struct iatt *parentbuf, -                                  afr_expunge_done_cbk_t expunge_done) -{ -        call_frame_t    *expunge_frame = NULL; -        afr_local_t     *local = NULL; -        afr_local_t     *expunge_local = NULL; -        afr_self_heal_t *sh = NULL; -        afr_self_heal_t *expunge_sh = NULL; -        int32_t         op_errno = 0; -        int             ret = 0; - -        expunge_frame = copy_frame (frame); -        if (!expunge_frame) { -                goto out; -        } - -        AFR_LOCAL_ALLOC_OR_GOTO (expunge_local, out); - -        local = frame->local; -        sh = &local->self_heal; -        expunge_frame->local = expunge_local; -        expunge_sh = &expunge_local->self_heal; -        expunge_sh->sh_frame = frame; -        loc_copy 
(&expunge_local->loc, &local->loc); -        ret = afr_build_parent_loc (&expunge_sh->parent_loc, -                                    &expunge_local->loc, &op_errno); -        if (ret) { -                ret = -op_errno; -                goto out; -        } -        sh->expunge_done = expunge_done; -        afr_sh_entry_expunge_remove (expunge_frame, this, child_index, buf, -                                     parentbuf); -        return; -out: -        gf_log (this->name, GF_LOG_ERROR, "Expunge of %s failed, reason: %s", -                local->loc.path, strerror (op_errno)); -        expunge_done (frame, this, child_index, -1, op_errno); -} +	loc.inode = inode_ref (inode); +	uuid_copy (loc.gfid, inode->gfid); -void -afr_sh_remove_stale_lookup_info (afr_self_heal_t *sh, int32_t *success_children, -                                 int32_t *fresh_children, -                                 unsigned int child_count) -{ -        int     i = 0; - -        for (i = 0; i < child_count; i++) { -                if (afr_is_child_present (success_children, child_count, i) && -                    !afr_is_child_present (fresh_children, child_count, i)) { -                        sh->child_errno[i] = ENOENT; -                        GF_ASSERT (sh->xattr[i]); -                        dict_unref (sh->xattr[i]); -                        sh->xattr[i] = NULL; -                } -        } -} +	AFR_ONALL (frame, afr_selfheal_lock_cbk, entrylk, dom, +		   &loc, name, ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, NULL); -int -afr_sh_purge_stale_entries_done (call_frame_t *frame, xlator_t *this) -{ -        afr_local_t         *local    = NULL; -        afr_self_heal_t     *sh       = NULL; -        afr_private_t       *priv     = NULL; - -        local    = frame->local; -        sh       = &local->self_heal; -        priv     = this->private; - -        if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { -                afr_sh_missing_entries_finish (frame, this); -        } else { -            
    if (afr_gfid_missing_count (this->name, sh->fresh_children, -                                            sh->buf, priv->child_count, -                                            local->loc.path)) { -                        afr_sh_common_lookup (frame, this, &local->loc, -                                              afr_sh_missing_entries_lookup_done, -                                              sh->sh_gfid_req, -                                              AFR_LOOKUP_FAIL_CONFLICTS| -                                              AFR_LOOKUP_FAIL_MISSING_GFIDS, -                                              NULL); -                } else { -                        //No need to set gfid so goto missing entries lookup done -                        //Behave as if you have done the lookup -                        afr_sh_remove_stale_lookup_info (sh, -                                                         sh->success_children, -                                                         sh->fresh_children, -                                                         priv->child_count); -                        afr_children_copy (sh->success_children, -                                           sh->fresh_children, -                                           priv->child_count); -                        afr_sh_missing_entries_lookup_done (frame, this, 0, 0); -                } -        } -        return 0; +	loc_wipe (&loc); + +	return afr_selfheal_locked_fill (frame, this, locked_on);  } -gf_boolean_t -afr_sh_purge_entry_condition (afr_local_t *local, afr_private_t *priv, -                              int child) + +int +afr_selfheal_entrylk (call_frame_t *frame, xlator_t *this, inode_t *inode, +		      char *dom, const char *name, unsigned char *locked_on)  { -        afr_self_heal_t *sh = NULL; +	loc_t loc = {0,}; +	afr_local_t *local = NULL; +	int i = 0; +	afr_private_t *priv = NULL; -        sh = &local->self_heal; +	priv = this->private; +	local = frame->local; -   
     if (local->child_up[child] && -            (!afr_is_child_present (sh->fresh_parent_dirs, priv->child_count, -                                    child)) -            && (sh->child_errno[child] != ENOENT)) -                return _gf_true; +	loc.inode = inode_ref (inode); +	uuid_copy (loc.gfid, inode->gfid); -        return _gf_false; -} +	AFR_ONALL (frame, afr_selfheal_lock_cbk, entrylk, dom, &loc, +		   name, ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, NULL); -gf_boolean_t -afr_sh_purge_stale_entry_condition (afr_local_t *local, afr_private_t *priv, -                                    int child) -{ -        afr_self_heal_t *sh = NULL; +	for (i = 0; i < priv->child_count; i++) { +		if (local->replies[i].op_ret == -1 && +		    local->replies[i].op_errno == EAGAIN) { +			afr_selfheal_locked_fill (frame, this, locked_on); +			afr_selfheal_unentrylk (frame, this, inode, dom, name, +						locked_on); -        sh = &local->self_heal; +			AFR_SEQ (frame, afr_selfheal_lock_cbk, entrylk, dom, +				 &loc, name, ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL); +			break; +		} +	} -        if (local->child_up[child] && -            (!afr_is_child_present (sh->fresh_children, priv->child_count, -                                    child)) -             && (sh->child_errno[child] != ENOENT)) -                return _gf_true; +	loc_wipe (&loc); -        return _gf_false; +	return afr_selfheal_locked_fill (frame, this, locked_on);  } -void -afr_sh_purge_entry_common (call_frame_t *frame, xlator_t *this, -                           gf_boolean_t purge_condition (afr_local_t *local, -                                                         afr_private_t *priv, -                                                         int child)) -{ -        afr_local_t     *local = NULL; -        afr_private_t   *priv = NULL; -        afr_self_heal_t *sh = NULL; -        int             i = 0; -        int             call_count = 0; - -        local = frame->local; -        sh = &local->self_heal; -        priv = 
this->private; - -        for (i = 0; i < priv->child_count; i++) { -                if (purge_condition (local, priv, i)) -                        call_count++; -        } - -        if (call_count == 0) { -                sh->post_remove_call (frame, this); -                goto out; -        } - -        local->call_count = call_count; -        for (i = 0; i < priv->child_count; i++) { -                if (!purge_condition (local, priv, i)) -                        continue; -                gf_log (this->name, GF_LOG_INFO, "purging the stale entry %s " -                        "on %s", local->loc.path, priv->children[i]->name); -                afr_sh_call_entry_expunge_remove (frame, this, -                                                  (long) i, &sh->buf[i], -                                                  &sh->parentbufs[i], -                                                  afr_sh_remove_entry_cbk); -        } -out: -        return; -} -void -afr_sh_purge_entry (call_frame_t *frame, xlator_t *this) +int +afr_selfheal_unentrylk (call_frame_t *frame, xlator_t *this, inode_t *inode, +			char *dom, const char *name, unsigned char *locked_on)  { -        afr_local_t     *local = NULL; -        afr_self_heal_t *sh = NULL; +	loc_t loc = {0,}; + +	loc.inode = inode_ref (inode); +	uuid_copy (loc.gfid, inode->gfid); + +	AFR_ONLIST (locked_on, frame, afr_selfheal_lock_cbk, entrylk, +		    dom, &loc, name, ENTRYLK_UNLOCK, ENTRYLK_WRLCK, NULL); -        local = frame->local; -        sh = &local->self_heal; -        sh->post_remove_call = afr_sh_missing_entries_finish; +	loc_wipe (&loc); -        afr_sh_purge_entry_common (frame, this, afr_sh_purge_entry_condition); +	return 0;  } -void -afr_sh_purge_stale_entry (call_frame_t *frame, xlator_t *this) + +gf_boolean_t +afr_is_pending_set (xlator_t *this, dict_t *xdata, int type)  { -        afr_local_t     *local = NULL; -        afr_self_heal_t *sh = NULL; -        afr_private_t   *priv = NULL; -        int           
  i = 0; +	int idx = -1; +	afr_private_t *priv = NULL; +	void *pending_raw = NULL; +	int *pending_int = NULL; +	int i = 0; -        local = frame->local; -        sh = &local->self_heal; -        priv = this->private; +	priv = this->private; +	idx = afr_index_for_transaction_type (type); -        sh->post_remove_call = afr_sh_purge_stale_entries_done; +	if (dict_get_ptr (xdata, AFR_DIRTY, &pending_raw) == 0) { +		if (pending_raw) { +			pending_int = pending_raw; -        for (i = 0; i < priv->child_count; i++) { -                if (afr_is_child_present (sh->fresh_children, -                                          priv->child_count, i)) -                        continue; +			if (ntoh32 (pending_int[idx])) +				return _gf_true; +		} +	} -                if ((!local->child_up[i]) || sh->child_errno[i] != 0) -                        continue; +	for (i = 0; i < priv->child_count; i++) { +		if (dict_get_ptr (xdata, priv->pending_key[i], +				  &pending_raw)) +			continue; +		if (!pending_raw) +			continue; +		pending_int = pending_raw; -                GF_ASSERT (!uuid_is_null (sh->entrybuf.ia_gfid) || -                           uuid_is_null (sh->buf[i].ia_gfid)); +		if (ntoh32 (pending_int[idx])) +			return _gf_true; +	} -                if ((sh->entrybuf.ia_type != sh->buf[i].ia_type) || -                    (uuid_compare (sh->buf[i].ia_gfid, -                                   sh->entrybuf.ia_gfid))) -                        continue; +	return _gf_false; +} -                afr_children_add_child (sh->fresh_children, i, -                                        priv->child_count); -        } -        afr_sh_purge_entry_common (frame, this, -                                   afr_sh_purge_stale_entry_condition); +gf_boolean_t +afr_is_data_set (xlator_t *this, dict_t *xdata) +{ +	return afr_is_pending_set (this, xdata, AFR_DATA_TRANSACTION);  } -void -afr_sh_save_child_iatts_from_policy (int32_t *children, struct iatt *bufs, -                                     
struct iatt *save, -                                     unsigned int child_count) +gf_boolean_t +afr_is_metadata_set (xlator_t *this, dict_t *xdata)  { -        int             i = 0; -        int             child = 0; -        gf_boolean_t    saved = _gf_false; - -        GF_ASSERT (save); -        //if iatt buf with gfid exists sets it -        for (i = 0; i < child_count; i++) { -                child = children[i]; -                if (child == -1) -                        break; -                *save = bufs[child]; -                saved = _gf_true; -                if (!uuid_is_null (save->ia_gfid)) -                        break; -        } -        GF_ASSERT (saved); +	return afr_is_pending_set (this, xdata, AFR_METADATA_TRANSACTION);  } -void -afr_get_children_of_fresh_parent_dirs (afr_self_heal_t *sh, -                                       unsigned int child_count) +gf_boolean_t +afr_is_entry_set (xlator_t *this, dict_t *xdata)  { -        afr_children_intersection_get (sh->success_children, -                                       sh->fresh_parent_dirs, -                                       sh->sources, child_count); -        afr_get_fresh_children (sh->success_children, sh->sources, -                                sh->fresh_children, child_count); -        memset (sh->sources, 0, sizeof (*sh->sources) * child_count); +	return afr_is_pending_set (this, xdata, AFR_ENTRY_TRANSACTION);  } +  void -afr_sh_children_lookup_done (call_frame_t *frame, xlator_t *this, -                             int32_t op_ret, int32_t op_errno) +afr_inode_link (inode_t *inode, struct iatt *iatt)  { -        afr_local_t      *local = NULL; -        afr_self_heal_t  *sh = NULL; -        afr_private_t    *priv = NULL; -        int32_t          fresh_child_enoents = 0; -        int32_t          fresh_parent_count = 0; - -        local = frame->local; -        sh = &local->self_heal; -        priv = this->private; - -        if (op_ret < 0) -                goto fail; -       
 afr_get_children_of_fresh_parent_dirs (sh, priv->child_count); -        fresh_parent_count = afr_get_children_count (sh->fresh_parent_dirs, -                                                     priv->child_count); -        //we need the enoent count of the subvols present in fresh_parent_dirs -        fresh_child_enoents = afr_errno_count (sh->fresh_parent_dirs, -                                               sh->child_errno, -                                               priv->child_count, ENOENT); -        if (fresh_child_enoents == fresh_parent_count) { -                afr_sh_set_error (sh, ENOENT); -                afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); -                afr_sh_purge_entry (frame, this); -        } else if (!afr_conflicting_iattrs (sh->buf, sh->fresh_children, -                                            priv->child_count, local->loc.path, -                                            this->name)) { -                afr_sh_save_child_iatts_from_policy (sh->fresh_children, -                                                     sh->buf, &sh->entrybuf, -                                                     priv->child_count); -                afr_update_gfid_from_iatts (sh->sh_gfid_req, sh->buf, -                                            sh->fresh_children, -                                            priv->child_count); -                afr_sh_purge_stale_entry (frame, this); -        } else { -                op_errno = EIO; -                afr_set_local_for_unhealable (local); -                goto fail; -        } - -        return; - -fail: -        afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); -        afr_sh_set_error (sh, op_errno); -        afr_sh_missing_entries_finish (frame, this); -        return; -} +	inode_t *linked_inode = NULL; -static void -afr_sh_find_fresh_parents (call_frame_t *frame, xlator_t *this, -                           int32_t op_ret, int32_t op_errno) -{ -        afr_self_heal_t *sh  = NULL; -  
      afr_private_t   *priv = NULL; -        afr_local_t     *local = NULL; -        int             enoent_count = 0; -        int             nsources = 0; -        int             source  = -1; -        int32_t         subvol_status = 0; - -        local = frame->local; -        sh = &local->self_heal; -        priv = this->private; - -        if (op_ret < 0) -                goto out; -        enoent_count = afr_errno_count (NULL, sh->child_errno, -                                        priv->child_count, ENOENT); -        if (enoent_count > 0) { -                gf_log (this->name, GF_LOG_INFO, "Parent dir missing for %s," -                        " in missing entry self-heal, aborting missing-entry " -                        "self-heal", -                        local->loc.path); -                afr_sh_missing_entries_finish (frame, this); -                return; -        } - -        nsources = afr_build_sources (this, sh->xattr, sh->buf, -                                      sh->pending_matrix, sh->sources, -                                      sh->success_children, -                                      AFR_ENTRY_TRANSACTION, &subvol_status, -                                      _gf_true); -        if ((subvol_status & ALL_FOOLS) || -            (subvol_status & SPLIT_BRAIN)) { -                gf_log (this->name, GF_LOG_INFO, "%s: Performing conservative " -                        "merge", sh->parent_loc.path); -                afr_mark_success_children_sources (sh->sources, -                                                   sh->success_children, -                                                   priv->child_count); -        } else if (nsources < 0) { -                gf_log (this->name, GF_LOG_ERROR, "No sources for dir " -                        "of %s, in missing entry self-heal, aborting " -                        "self-heal", local->loc.path); -                op_errno = EIO; -                goto out; -        } - -        source = 
afr_sh_select_source (sh->sources, priv->child_count); -        if (source == -1) { -                GF_ASSERT (0); -                gf_log (this->name, GF_LOG_DEBUG, "No active sources found."); -                op_errno = EIO; -                goto out; -        } -        afr_get_fresh_children (sh->success_children, sh->sources, -                                sh->fresh_parent_dirs, priv->child_count); -        afr_sh_common_lookup (frame, this, &local->loc, -                              afr_sh_children_lookup_done, NULL, 0, -                              NULL); -        return; +	linked_inode = inode_link (inode, NULL, NULL, iatt); -out: -        afr_sh_set_error (sh, op_errno); -        afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); -	afr_sh_missing_entries_finish (frame, this); -        return; -} +	uuid_copy (inode->gfid, iatt->ia_gfid); +	inode->ia_type = iatt->ia_type; -void -afr_sh_common_reset (afr_self_heal_t *sh, unsigned int child_count) -{ -        int             i = 0; - -        for (i = 0; i < child_count; i++) { -                memset (&sh->buf[i], 0, sizeof (sh->buf[i])); -                memset (&sh->parentbufs[i], 0, sizeof (sh->parentbufs[i])); -                sh->child_errno[i] = 0; -        } -        memset (&sh->parentbuf, 0, sizeof (sh->parentbuf)); -        sh->success_count = 0; -        afr_reset_children (sh->success_children, child_count); -        afr_reset_children (sh->fresh_children, child_count); -        afr_reset_xattr (sh->xattr, child_count); -        loc_wipe (&sh->lookup_loc); +	if (linked_inode) { +		inode_lookup (linked_inode); +		inode_unref (linked_inode); +	}  } -/* afr self-heal state will be lost if this call is made - * please check the afr_sh_common_reset that is called in this function + +/* + * This function inspects the looked up replies (in an unlocked manner) + * and decides whether a locked verification and possible healing is + * required or not. 
It updates the three booleans for each type + * of healing. If the boolean flag gets set to FALSE, then we are sure + * no healing is required. If the boolean flag gets set to TRUE then + * we have to proceed with locked reinspection.   */ +  int -afr_sh_common_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, -                      afr_lookup_done_cbk_t lookup_done , uuid_t gfid, -                      int32_t flags, dict_t *xdata) +afr_selfheal_unlocked_inspect (call_frame_t *frame, xlator_t *this, +			       inode_t *inode, uuid_t gfid, +			       gf_boolean_t *data_selfheal, +			       gf_boolean_t *metadata_selfheal, +			       gf_boolean_t *entry_selfheal)  { -        afr_local_t    *local = NULL; -        int             i = 0; -        int             call_count = 0; -        afr_private_t  *priv = NULL; -        dict_t         *xattr_req = NULL; -        afr_self_heal_t *sh = NULL; - -        local = frame->local; -        priv  = this->private; -        sh    = &local->self_heal; - -        call_count = afr_up_children_count (local->child_up, priv->child_count); - -        local->call_count = call_count; - -        xattr_req = dict_new(); - -        if (xattr_req) { -                afr_xattr_req_prepare (this, xattr_req, loc->path); -                if (gfid) { -                        gf_log (this->name, GF_LOG_DEBUG, -                                "looking up %s with gfid: %s", -                                loc->path, uuid_utoa (gfid)); -                        GF_ASSERT (!uuid_is_null (gfid)); -                        afr_set_dict_gfid (xattr_req, gfid); -                } -        } - -        afr_sh_common_reset (sh, priv->child_count); -        sh->lookup_done = lookup_done; -        loc_copy (&sh->lookup_loc, loc); -        sh->lookup_flags = flags; -        for (i = 0; i < priv->child_count; i++) { -                if (local->child_up[i]) { -                        gf_log (this->name, GF_LOG_DEBUG, -                                
"looking up %s on subvolume %s", -                                loc->path, priv->children[i]->name); - -                        STACK_WIND_COOKIE (frame, -                                           afr_sh_common_lookup_cbk, -                                           (void *) (long) i, -                                           priv->children[i], -                                           priv->children[i]->fops->lookup, -                                           loc, xattr_req); - -                        if (!--call_count) -                                break; -                } -        } - -        if (xattr_req) -                dict_unref (xattr_req); - -        return 0; -} +	afr_private_t *priv = NULL; +	int i = 0; +	int valid_cnt = 0; +	struct iatt first = {0, }; +	struct afr_reply *replies = NULL; +	int ret = -1; +	priv = this->private; +	replies = alloca0 (sizeof (*replies) * priv->child_count); -int -afr_sh_post_nb_entrylk_missing_entry_sh_cbk (call_frame_t *frame, -                                             xlator_t *this) -{ -        afr_internal_lock_t *int_lock = NULL; -        afr_local_t         *local    = NULL; -        afr_self_heal_t     *sh       = NULL; - -        local    = frame->local; -        int_lock = &local->internal_lock; -        sh       = &local->self_heal; - -        if (int_lock->lock_op_ret < 0) { -                gf_log (this->name, GF_LOG_INFO, -                        "Non blocking entrylks failed."); -                afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); -                afr_sh_missing_entries_done (frame, this); -        } else { - -                gf_log (this->name, GF_LOG_DEBUG, -                        "Non blocking entrylks done. 
Proceeding to FOP"); -                afr_sh_common_lookup (frame, this, &sh->parent_loc, -                                      afr_sh_find_fresh_parents, -                                      NULL, AFR_LOOKUP_FAIL_CONFLICTS, -                                      NULL); -        } - -        return 0; -} +	ret = afr_selfheal_unlocked_discover (frame, inode, gfid, replies); +	if (ret) +		return ret; -int -afr_sh_entrylk (call_frame_t *frame, xlator_t *this, loc_t *loc, -                char *base_name, afr_lock_cbk_t lock_cbk) -{ -        afr_internal_lock_t *int_lock = NULL; -        afr_local_t         *local    = NULL; -        afr_private_t       *priv     = NULL; +	for (i = 0; i < priv->child_count; i++) { +		if (!replies[i].valid) +			continue; +		if (replies[i].op_ret == -1) +			continue; -        priv     = this->private; -        local    = frame->local; -        int_lock = &local->internal_lock; +		if (afr_is_data_set (this, replies[i].xdata)) +			*data_selfheal = _gf_true; -        int_lock->transaction_lk_type = AFR_SELFHEAL_LK; -        int_lock->selfheal_lk_type    = AFR_ENTRY_SELF_HEAL_LK; +		if (afr_is_metadata_set (this, replies[i].xdata)) +			*metadata_selfheal = _gf_true; -        afr_set_lock_number (frame, this); +		if (afr_is_entry_set (this, replies[i].xdata)) +			*entry_selfheal = _gf_true; -        int_lock->lk_basename = base_name; -        int_lock->lk_loc      = loc; -        int_lock->lock_cbk    = lock_cbk; -        int_lock->domain      = this->name; +		valid_cnt ++; +		if (valid_cnt == 1) { +			first = replies[i].poststat; +			continue; +		} -        int_lock->lockee_count = 0; -        afr_init_entry_lockee (&int_lock->lockee[0], local, loc, -                               base_name, priv->child_count); -        int_lock->lockee_count++; -        afr_nonblocking_entrylk (frame, this); +		if (!IA_EQUAL (first, replies[i].poststat, type)) { +			gf_log (this->name, GF_LOG_ERROR, +				"TYPE mismatch %d vs %d on %s for gfid:%s", +				
(int) first.ia_type, +				(int) replies[i].poststat.ia_type, +				priv->children[i]->name, +				uuid_utoa (replies[i].poststat.ia_gfid)); +			return -EIO; +		} -        return 0; -} +		if (!IA_EQUAL (first, replies[i].poststat, uid)) { +			gf_log (this->name, GF_LOG_DEBUG, +				"UID mismatch %d vs %d on %s for gfid:%s", +				(int) first.ia_uid, +				(int) replies[i].poststat.ia_uid, +				priv->children[i]->name, +				uuid_utoa (replies[i].poststat.ia_gfid)); -static int -afr_self_heal_parent_entrylk (call_frame_t *frame, xlator_t *this, -                              afr_lock_cbk_t lock_cbk) -{ -        afr_local_t         *local    = NULL; -        afr_self_heal_t     *sh       = NULL; -        afr_internal_lock_t *int_lock = NULL; -        int                 ret       = -1; -        int32_t             op_errno  = 0; - -        local    = frame->local; -        sh       = &local->self_heal; - -        gf_log (this->name, GF_LOG_TRACE, -                "attempting to recreate missing entries for path=%s", -                local->loc.path); - -        ret = afr_build_parent_loc (&sh->parent_loc, &local->loc, &op_errno); -        if (ret) -                goto out; - -        afr_sh_entrylk (frame, this, &sh->parent_loc, NULL, -                        lock_cbk); -        return 0; -out: -        int_lock = &local->internal_lock; -        int_lock->lock_op_ret = -1; -        lock_cbk (frame, this); -        return 0; -} +			*metadata_selfheal = _gf_true; +		} -static int -afr_self_heal_missing_entries (call_frame_t *frame, xlator_t *this) -{ -        afr_local_t     *local = NULL; -        afr_self_heal_t *sh = NULL; +		if (!IA_EQUAL (first, replies[i].poststat, gid)) { +			gf_log (this->name, GF_LOG_DEBUG, +				"GID mismatch %d vs %d on %s for gfid:%s", +				(int) first.ia_uid, +				(int) replies[i].poststat.ia_uid, +				priv->children[i]->name, +				uuid_utoa (replies[i].poststat.ia_gfid)); -        local = frame->local; -        sh = &local->self_heal; +			
*metadata_selfheal = _gf_true; +		} -        sh->sh_type_in_action  = AFR_SELF_HEAL_GFID_OR_MISSING_ENTRY; +		if (!IA_EQUAL (first, replies[i].poststat, prot)) { +			gf_log (this->name, GF_LOG_DEBUG, +				"MODE mismatch %d vs %d on %s for gfid:%s", +				(int) st_mode_from_ia (first.ia_prot, 0), +				(int) st_mode_from_ia (replies[i].poststat.ia_prot, 0), +				priv->children[i]->name, +				uuid_utoa (replies[i].poststat.ia_gfid)); -        afr_set_self_heal_status (sh, AFR_SELF_HEAL_STARTED); +			*metadata_selfheal = _gf_true; +		} -        afr_self_heal_parent_entrylk (frame, this, -                                      afr_sh_post_nb_entrylk_missing_entry_sh_cbk); -        return 0; -} +		if (IA_ISREG(first.ia_type) && +		    !IA_EQUAL (first, replies[i].poststat, size)) { +			gf_log (this->name, GF_LOG_DEBUG, +				"SIZE mismatch %lld vs %lld on %s for gfid:%s", +				(long long) first.ia_size, +				(long long) replies[i].poststat.ia_size, +				priv->children[i]->name, +				uuid_utoa (replies[i].poststat.ia_gfid)); -afr_local_t* -afr_self_heal_local_init (afr_local_t *l, xlator_t *this) -{ -        afr_private_t   *priv  = NULL; -        afr_local_t     *lc    = NULL; -        afr_self_heal_t *sh    = NULL; -        afr_self_heal_t *shc   = NULL; -        int             ret    = 0; - -        priv = this->private; - -        sh = &l->self_heal; - -        lc = mem_get0 (this->local_pool); -        if (!lc) -                goto out; - -        shc = &lc->self_heal; - -        shc->unwind = sh->unwind; -        shc->gfid_sh_success_cbk = sh->gfid_sh_success_cbk; -        shc->do_missing_entry_self_heal = sh->do_missing_entry_self_heal; -        shc->do_gfid_self_heal = sh->do_gfid_self_heal; -        shc->do_data_self_heal = sh->do_data_self_heal; -        shc->do_metadata_self_heal = sh->do_metadata_self_heal; -        shc->do_entry_self_heal = sh->do_entry_self_heal; -        shc->force_confirm_spb = sh->force_confirm_spb; -        shc->forced_merge = 
sh->forced_merge; -        shc->background = sh->background; -        shc->type = sh->type; -        shc->data_sh_info = ""; -        shc->metadata_sh_info =  ""; - -        uuid_copy (shc->sh_gfid_req, sh->sh_gfid_req); -        if (l->loc.path) { -                ret = loc_copy (&lc->loc, &l->loc); -                if (ret < 0) -                        goto out; -        } - -        lc->child_up  = memdup (l->child_up, -                                sizeof (*lc->child_up) * priv->child_count); -        if (!lc->child_up) { -                ret = -1; -                goto out; -        } - -        if (l->xattr_req) -                lc->xattr_req = dict_ref (l->xattr_req); - -        if (l->cont.lookup.inode) -                lc->cont.lookup.inode = inode_ref (l->cont.lookup.inode); -        if (l->cont.lookup.xattr) -                lc->cont.lookup.xattr = dict_ref (l->cont.lookup.xattr); - -        lc->internal_lock.locked_nodes = -                             GF_CALLOC (sizeof (*l->internal_lock.locked_nodes), -                                        priv->child_count, gf_afr_mt_char); -        if (!lc->internal_lock.locked_nodes) { -                ret = -1; -                goto out; -        } - -        ret = afr_inodelk_init (&lc->internal_lock.inodelk[0], -                                this->name, priv->child_count); -        if (ret) -                goto out; +			*data_selfheal = _gf_true; +		} +	} -out: -        if (ret) { -                afr_local_cleanup (lc, this); -                lc = NULL; -        } -        return lc; -} +	if (valid_cnt > 0) +		afr_inode_link (inode, &first); -int -afr_self_heal_completion_cbk (call_frame_t *bgsh_frame, xlator_t *this) -{ -        afr_private_t *   priv  = NULL; -        afr_local_t *     local = NULL; -        afr_self_heal_t * sh    = NULL; -        afr_local_t *     orig_frame_local = NULL; -        afr_self_heal_t * orig_frame_sh = NULL; -        char              sh_type_str[256] = {0,}; -        
gf_loglevel_t     loglevel = 0; - -        priv  = this->private; -        local = bgsh_frame->local; -        sh    = &local->self_heal; - -        if (local->unhealable) { -                afr_set_split_brain (this, sh->inode, SPB, SPB); -        } - -        afr_self_heal_type_str_get (sh, sh_type_str, -                                    sizeof(sh_type_str)); -        if (is_self_heal_failed (sh, AFR_CHECK_ALL) && !priv->shd.iamshd) { -                loglevel = GF_LOG_ERROR; -        } else if (!is_self_heal_failed (sh, AFR_CHECK_ALL)) { -                loglevel = GF_LOG_INFO; -        } else { -                loglevel = GF_LOG_DEBUG; -        } - -        afr_log_self_heal_completion_status (local, loglevel); - -        FRAME_SU_UNDO (bgsh_frame, afr_local_t); - -        if (!sh->unwound && sh->unwind) { -                orig_frame_local = sh->orig_frame->local; -                orig_frame_sh = &orig_frame_local->self_heal; -                orig_frame_sh->actual_sh_started = _gf_true; -                sh->unwind (sh->orig_frame, this, sh->op_ret, sh->op_errno, -                            is_self_heal_failed (sh, AFR_CHECK_ALL)); -        } - -        if (sh->background) { -                LOCK (&priv->lock); -                { -                        priv->background_self_heals_started--; -                } -                UNLOCK (&priv->lock); -        } - -        AFR_STACK_DESTROY (bgsh_frame); - -        return 0; -} - -int -afr_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode) -{ -        afr_local_t     *local = NULL; -        afr_self_heal_t *sh = NULL; -        afr_private_t   *priv = NULL; -        int32_t          op_errno = 0; -        int              ret = 0; -        afr_self_heal_t *orig_sh = NULL; -        call_frame_t    *sh_frame = NULL; -        afr_local_t     *sh_local = NULL; -        loc_t           *loc   = NULL; - -        local = frame->local; -        orig_sh = &local->self_heal; -        priv  = this->private; - 
-        GF_ASSERT (local->loc.path); - -        gf_log (this->name, GF_LOG_TRACE, -                "performing self heal on %s (metadata=%d data=%d entry=%d)", -                local->loc.path, -                local->self_heal.do_metadata_self_heal, -                local->self_heal.do_data_self_heal, -                local->self_heal.do_entry_self_heal); - -        op_errno        = ENOMEM; -        sh_frame        = copy_frame (frame); -        if (!sh_frame) -                goto out; -        afr_set_lk_owner (sh_frame, this, sh_frame->root); -        afr_set_low_priority (sh_frame); - -        sh_local        = afr_self_heal_local_init (local, this); -        if (!sh_local) -                goto out; -        sh_frame->local = sh_local; -        sh              = &sh_local->self_heal; - -        sh->inode       = inode_ref (inode); -        sh->orig_frame  = frame; - -        sh->completion_cbk = afr_self_heal_completion_cbk; - -        sh->success = GF_CALLOC (priv->child_count, sizeof (*sh->success), -                                 gf_afr_mt_char); -        if (!sh->success) -                goto out; -        sh->sources = GF_CALLOC (sizeof (*sh->sources), priv->child_count, -                                 gf_afr_mt_int); -        if (!sh->sources) -                goto out; -        sh->locked_nodes = GF_CALLOC (sizeof (*sh->locked_nodes), -                                      priv->child_count, -                                      gf_afr_mt_int); -        if (!sh->locked_nodes) -                goto out; - -        sh->pending_matrix = afr_matrix_create (priv->child_count, -                                                priv->child_count); -        if (!sh->pending_matrix) -                goto out; - -        sh->delta_matrix = afr_matrix_create (priv->child_count, -                                              priv->child_count); -        if (!sh->delta_matrix) -                goto out; - -        sh->fresh_parent_dirs = afr_children_create 
(priv->child_count); -        if (!sh->fresh_parent_dirs) -                goto out; -        ret = afr_sh_common_create (sh, priv->child_count); -        if (ret) { -                op_errno = -ret; -                goto out; -        } - -        if (local->self_heal.background) { -                LOCK (&priv->lock); -                { -                        if (priv->background_self_heals_started -                            < priv->background_self_heal_count) { -                                priv->background_self_heals_started++; - - -                        } else { -                                local->self_heal.background = _gf_false; -                                sh->background = _gf_false; -                        } -                } -                UNLOCK (&priv->lock); -        } - -        if (!local->loc.parent) { -                sh->do_missing_entry_self_heal = _gf_false; -                sh->do_gfid_self_heal = _gf_false; -        } - -        sh->sh_type_in_action = AFR_SELF_HEAL_INVALID; - -        FRAME_SU_DO (sh_frame, afr_local_t); -        if (sh->do_missing_entry_self_heal || sh->do_gfid_self_heal) { -                afr_self_heal_missing_entries (sh_frame, this); -        } else { -                loc = &sh_local->loc; -                if (uuid_is_null (loc->inode->gfid) && uuid_is_null (loc->gfid)) { -                        if (!uuid_is_null (inode->gfid)) -                                GF_ASSERT (!uuid_compare (inode->gfid, -                                           sh->sh_gfid_req)); -                        uuid_copy (loc->gfid, sh->sh_gfid_req); -                } -                gf_log (this->name, GF_LOG_TRACE, -                        "proceeding to metadata check on %s", -                        local->loc.path); - -                afr_sh_missing_entries_done (sh_frame, this); -        } -        op_errno = 0; +	if (valid_cnt < 2) +		return -ENOTCONN; -out: -        if (op_errno) { -                orig_sh->unwind 
(frame, this, -1, op_errno, 1); -                if (sh_frame) -                        AFR_STACK_DESTROY (sh_frame); -        } -        return 0; +	return 0;  } -void -afr_self_heal_type_str_get (afr_self_heal_t *self_heal_p, char *str, -                            size_t size) + +inode_t * +afr_inode_find (xlator_t *this, uuid_t gfid)  { -        GF_ASSERT (str && (size > strlen (" missing-entry gfid " -                                          "meta-data data entry"))); +	inode_table_t *table = NULL; +	inode_t *inode = NULL; -        if (self_heal_p->do_metadata_self_heal) { -                snprintf (str, size, " meta-data"); -        } +	table = this->itable; +	if (!table) +		return NULL; -        if (self_heal_p->do_data_self_heal) { -                snprintf (str + strlen(str), size - strlen(str), " data"); -        } +	inode = inode_find (table, gfid); +	if (inode) +		return inode; -        if (self_heal_p->do_entry_self_heal) { -                snprintf (str + strlen(str), size - strlen(str), " entry"); -        } +	inode = inode_new (table); +	if (!inode) +		return NULL; -        if (self_heal_p->do_missing_entry_self_heal) { -                snprintf (str + strlen(str), size - strlen(str), -                         " missing-entry"); -        } +	uuid_copy (inode->gfid, gfid); -        if (self_heal_p->do_gfid_self_heal) { -                snprintf (str + strlen(str), size - strlen(str), " gfid"); -        } +	return inode;  } -afr_self_heal_type -afr_self_heal_type_for_transaction (afr_transaction_type type) -{ -        afr_self_heal_type sh_type = AFR_SELF_HEAL_INVALID; - -        switch (type) { -        case AFR_DATA_TRANSACTION: -                sh_type = AFR_SELF_HEAL_DATA; -                break; -        case AFR_METADATA_TRANSACTION: -                sh_type = AFR_SELF_HEAL_METADATA; -                break; -        case AFR_ENTRY_TRANSACTION: -                sh_type = AFR_SELF_HEAL_ENTRY; -                break; -        case 
AFR_ENTRY_RENAME_TRANSACTION: -                GF_ASSERT (0); -                break; -        } -        return sh_type; -} -int -afr_build_child_loc (xlator_t *this, loc_t *child, loc_t *parent, char *name) +call_frame_t * +afr_frame_create (xlator_t *this)  { -        int   ret = -1; -        uuid_t pargfid = {0}; - -        if (!child) -                goto out; - -        if (!uuid_is_null (parent->inode->gfid)) -                uuid_copy (pargfid, parent->inode->gfid); -        else if (!uuid_is_null (parent->gfid)) -                uuid_copy (pargfid, parent->gfid); - -        if (uuid_is_null (pargfid)) -                goto out; - -        if (strcmp (parent->path, "/") == 0) -                ret = gf_asprintf ((char **)&child->path, "/%s", name); -        else -                ret = gf_asprintf ((char **)&child->path, "%s/%s", parent->path, -                                   name); +	call_frame_t *frame = NULL; +	afr_local_t *local = NULL; +	int op_errno = 0; +	pid_t pid = -1; -        if (-1 == ret) { -                gf_log (this->name, GF_LOG_ERROR, -                        "asprintf failed while setting child path"); -        } +	frame = create_frame (this, this->ctx->pool); +	if (!frame) +		return NULL; -        child->name = strrchr (child->path, '/'); -        if (child->name) -                child->name++; +	local = AFR_FRAME_INIT (frame, op_errno); +	if (!local) { +		STACK_DESTROY (frame->root); +		return NULL; +	} -        child->parent = inode_ref (parent->inode); -        child->inode = inode_new (parent->inode->table); -        uuid_copy (child->pargfid, pargfid); +	syncopctx_setfspid (&pid); -        if (!child->inode) { -                ret = -1; -                goto out; -        } +	frame->root->pid = pid; -        ret = 0; -out: -        if ((ret == -1) && child) -                loc_wipe (child); +	afr_set_lk_owner (frame, this, frame->root); -        return ret; +	return frame;  } -int -afr_sh_erase_pending (call_frame_t *frame, 
xlator_t *this, -                      afr_transaction_type type, afr_fxattrop_cbk_t cbk, -                      int (*finish)(call_frame_t *frame, xlator_t *this)) -{ -        afr_local_t     *local = NULL; -        afr_self_heal_t *sh = NULL; -        afr_private_t   *priv = NULL; -        int              call_count = 0; -        int              i = 0; -        dict_t          **erase_xattr = NULL; -        int             ret = -1; - -        local = frame->local; -        sh = &local->self_heal; -        priv = this->private; - -        afr_sh_pending_to_delta (priv, sh->xattr, sh->delta_matrix, -                                 sh->success, priv->child_count, type); - -        erase_xattr = GF_CALLOC (sizeof (*erase_xattr), priv->child_count, -                                 gf_afr_mt_dict_t); -        if (!erase_xattr) -                goto out; - -        for (i = 0; i < priv->child_count; i++) { -                if (sh->xattr[i]) { -                        call_count++; -                        erase_xattr[i] = dict_new (); -                        if (!erase_xattr[i]) -                                goto out; -                } -        } - -        afr_sh_delta_to_xattr (this, sh->delta_matrix, erase_xattr, -                               priv->child_count, type); - -        gf_log (this->name, GF_LOG_DEBUG, "Delta matrix for: %s", -                lkowner_utoa (&frame->root->lk_owner)); -        afr_sh_print_pending_matrix (sh->delta_matrix, this); -        local->call_count = call_count; -        if (call_count == 0) { -                ret = 0; -                finish (frame, this); -                goto out; -        } - -        for (i = 0; i < priv->child_count; i++) { -                if (!erase_xattr[i]) -                        continue; - -                if (sh->healing_fd) {//true for ENTRY, reg file DATA transaction -                        STACK_WIND_COOKIE (frame, cbk, (void *) (long) i, -                                           
priv->children[i], -                                           priv->children[i]->fops->fxattrop, -                                           sh->healing_fd, -                                           GF_XATTROP_ADD_ARRAY, erase_xattr[i], -                                           NULL); -                } else { -                        STACK_WIND_COOKIE (frame, cbk, (void *) (long) i, -                                           priv->children[i], -                                           priv->children[i]->fops->xattrop, -                                           &local->loc, -                                           GF_XATTROP_ADD_ARRAY, erase_xattr[i], -                                           NULL); -                } -        } - -        ret = 0; -out: -        if (erase_xattr) { -                for (i = 0; i < priv->child_count; i++) { -                        if (erase_xattr[i]) { -                                dict_unref (erase_xattr[i]); -                        } -                } -        } - -        GF_FREE (erase_xattr); - -        if (ret < 0) { -                afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); -                finish (frame, this); -        } - -        return 0; -} -void -afr_set_self_heal_status(afr_self_heal_t *sh, afr_self_heal_status status) -{ -        xlator_t                *this = NULL; -        afr_sh_status_for_all_type *sh_status = &(sh->afr_all_sh_status); -        afr_self_heal_type  sh_type_in_action = sh->sh_type_in_action; -        this = THIS; - -        if (!sh) { -                gf_log_callingfn (this->name, GF_LOG_ERROR, "Null self heal" -                                  "Structure"); -                goto out; -        } - -        switch (sh_type_in_action) { -                case AFR_SELF_HEAL_GFID_OR_MISSING_ENTRY: -                       sh_status->gfid_or_missing_entry_self_heal = status; -                        break; -                case AFR_SELF_HEAL_METADATA: -                     
   sh_status->metadata_self_heal = status; -                        break; -                case AFR_SELF_HEAL_DATA: -                        sh_status->data_self_heal = status; -                        break; -                case AFR_SELF_HEAL_ENTRY: -                        sh_status->entry_self_heal = status; -                        break; -                case AFR_SELF_HEAL_INVALID: -                        gf_log_callingfn (this->name, GF_LOG_ERROR, "Invalid" -                                          "self heal type in action"); -                        break; -        } -out: -        return; -} +/* + * This is the entry point for healing a given GFID + */ -void -afr_set_local_for_unhealable (afr_local_t *local) +int +afr_selfheal (xlator_t *this, uuid_t gfid)  { -        afr_self_heal_t  *sh = NULL; - -        sh = &local->self_heal; +	inode_t *inode = NULL; +	call_frame_t *frame = NULL; +	int ret = -1; +	gf_boolean_t data_selfheal = _gf_false; +	gf_boolean_t metadata_selfheal = _gf_false; +	gf_boolean_t entry_selfheal = _gf_false; -        local->unhealable = 1; -        afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); -} +	inode = afr_inode_find (this, gfid); +	if (!inode) +		goto out; -int -is_self_heal_failed (afr_self_heal_t *sh, afr_sh_fail_check_type type) -{ -        afr_sh_status_for_all_type      sh_status = sh->afr_all_sh_status; -        afr_self_heal_type   sh_type_in_action =  AFR_SELF_HEAL_INVALID; -        afr_self_heal_status    status = AFR_SELF_HEAL_FAILED; -        xlator_t                *this = NULL; -        int                     sh_failed = 0; - -        this = THIS; - -        if (!sh) { -                gf_log_callingfn (this->name, GF_LOG_ERROR, "Null self heal " -                                  "structure"); -                sh_failed = 1; -                goto out; -        } - -        if (type == AFR_CHECK_ALL) { -                if ((sh_status.gfid_or_missing_entry_self_heal == AFR_SELF_HEAL_FAILED) -                
    || (sh_status.metadata_self_heal == AFR_SELF_HEAL_FAILED) -                    || (sh_status.data_self_heal == AFR_SELF_HEAL_FAILED) -                    || (sh_status.entry_self_heal == AFR_SELF_HEAL_FAILED)) -                sh_failed = 1; -        } else if (type == AFR_CHECK_SPECIFIC) { -                sh_type_in_action = sh->sh_type_in_action; -                switch (sh_type_in_action) { -                        case AFR_SELF_HEAL_GFID_OR_MISSING_ENTRY: -                             status = sh_status.gfid_or_missing_entry_self_heal; -                                break; -                        case AFR_SELF_HEAL_METADATA: -                                status = sh_status.metadata_self_heal; -                                break; -                        case AFR_SELF_HEAL_ENTRY: -                                status = sh_status.entry_self_heal; -                                break; -                        case AFR_SELF_HEAL_DATA: -                                status = sh_status.data_self_heal; -                                break; -                        case AFR_SELF_HEAL_INVALID: -                                status = AFR_SELF_HEAL_NOT_ATTEMPTED; -                                break; -                } -                if (status == AFR_SELF_HEAL_FAILED) -                        sh_failed = 1; - -        } +	frame = afr_frame_create (this); +	if (!frame) +		goto out; -out: -        return sh_failed; -} +	ret = afr_selfheal_unlocked_inspect (frame, this, inode, gfid, +					     &data_selfheal, +					     &metadata_selfheal, +					     &entry_selfheal); +	if (ret) +		goto out; -char * -get_sh_completion_status (afr_self_heal_status status) -{ +	if (data_selfheal) +		afr_selfheal_data (frame, this, inode); -        char *not_attempted       = " is not attempted"; -        char *failed              = " failed"; -        char *started             = " is started"; -        char *sync_begin          = " is successfully completed"; -     
   char *result              = " has unknown status"; - -        switch (status) -        { -                case AFR_SELF_HEAL_NOT_ATTEMPTED: -                        result = not_attempted; -                        break; -                case AFR_SELF_HEAL_FAILED: -                        result = failed; -                        break; -                case AFR_SELF_HEAL_STARTED: -                        result = started; -                        break; -                case AFR_SELF_HEAL_SYNC_BEGIN: -                        result = sync_begin; -                        break; -        } - -        return result; +	if (metadata_selfheal) +		afr_selfheal_metadata (frame, this, inode); -} +	if (entry_selfheal) +		afr_selfheal_entry (frame, this, inode); -void -afr_log_self_heal_completion_status (afr_local_t *local, gf_loglevel_t loglvl) -{ +	inode_forget (inode, 1); +out: +	if (inode) +		inode_unref (inode); +	if (frame) +		AFR_STACK_DESTROY (frame); -        char sh_log[4096]              = {0}; -        afr_self_heal_t *sh            = &local->self_heal; -        afr_sh_status_for_all_type   all_status = sh->afr_all_sh_status; -        xlator_t      *this            = NULL; -        size_t        off              = 0; -        int           data_sh          = 0; -        int           metadata_sh      = 0; -        int           print_log        = 0; - -        this = THIS; - -        ADD_FMT_STRING (sh_log, off, "gfid or missing entry", -                        all_status.gfid_or_missing_entry_self_heal, print_log); -        ADD_FMT_STRING_SYNC (sh_log, off, "metadata", -                             all_status.metadata_self_heal, print_log); -        if (sh->background) { -                ADD_FMT_STRING_SYNC (sh_log, off, "backgroung data", -                                all_status.data_self_heal, print_log); -        } else { -                ADD_FMT_STRING_SYNC (sh_log, off, "foreground data", -                                all_status.data_self_heal, 
print_log); -        } -        ADD_FMT_STRING_SYNC (sh_log, off, "entry", all_status.entry_self_heal, -                             print_log); - -        if (AFR_SELF_HEAL_SYNC_BEGIN == all_status.data_self_heal && -	    strcmp (sh->data_sh_info, "") && sh->data_sh_info ) -                data_sh = 1; -        if (AFR_SELF_HEAL_SYNC_BEGIN == all_status.metadata_self_heal && -	    strcmp (sh->metadata_sh_info, "") && sh->metadata_sh_info) -                metadata_sh = 1; - -        if (!print_log) -                return; - -        gf_log (this->name, loglvl, "%s %s %s on %s", sh_log, -                ((data_sh == 1) ? sh->data_sh_info : ""), -                ((metadata_sh == 1) ? sh->metadata_sh_info : ""), -                local->loc.path); +	return ret;  } diff --git a/xlators/cluster/afr/src/afr-self-heal-common.h b/xlators/cluster/afr/src/afr-self-heal-common.h deleted file mode 100644 index 4732647760d..00000000000 --- a/xlators/cluster/afr/src/afr-self-heal-common.h +++ /dev/null @@ -1,144 +0,0 @@ -/* -  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> -  This file is part of GlusterFS. - -  This file is licensed to you under your choice of the GNU Lesser -  General Public License, version 3 or any later version (LGPLv3 or -  later), or the GNU General Public License, version 2 (GPLv2), in all -  cases as published by the Free Software Foundation. 
-*/ - -#ifndef __AFR_SELF_HEAL_COMMON_H__ -#define __AFR_SELF_HEAL_COMMON_H__ - -#define FILE_HAS_HOLES(buf) (((buf)->ia_size) > ((buf)->ia_blocks * 512)) -#define AFR_SH_MIN_PARTICIPANTS 2 - -typedef enum { -        AFR_LOOKUP_FAIL_CONFLICTS = 1, -        AFR_LOOKUP_FAIL_MISSING_GFIDS = 2, -} afr_lookup_flags_t; - -int -afr_sh_select_source (int sources[], int child_count); - -int -afr_sh_source_count (int sources[], int child_count); - -void -afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this); - -void -afr_sh_print_split_brain_log (int32_t *pending_matrix[], xlator_t *this, -                              const char *loc); - -int -afr_build_pending_matrix (char **pending_key, int32_t **pending_matrix, -                          unsigned char *ignorant_subvols, -                          dict_t *xattr[], afr_transaction_type type, -                          size_t child_count); - -void -afr_sh_pending_to_delta (afr_private_t *priv, dict_t **xattr, -                         int32_t *delta_matrix[], unsigned char success[], -                         int child_count, afr_transaction_type type); - -int -afr_mark_sources (xlator_t *this, int32_t *sources, int32_t **pending_matrix, -                  struct iatt *bufs, afr_self_heal_type type, -                  int32_t *success_children, int32_t *subvol_status); - -int -afr_sh_delta_to_xattr (xlator_t *this, -                       int32_t *delta_matrix[], dict_t *xattr[], -		       int child_count, afr_transaction_type type); - -void -afr_self_heal_type_str_get (afr_self_heal_t *self_heal_p, char *str, -                            size_t size); - -afr_self_heal_type -afr_self_heal_type_for_transaction (afr_transaction_type type); - -int -afr_build_sources (xlator_t *this, dict_t **xattr, struct iatt *bufs, -                   int32_t **pending_matrix, int32_t *sources, -                   int32_t *success_children, afr_transaction_type type, -                   int32_t *subvol_status, gf_boolean_t 
ignore_ignorant); -void -afr_sh_common_reset (afr_self_heal_t *sh, unsigned int child_count); - -void -afr_sh_common_lookup_resp_handler (call_frame_t *frame, void *cookie, -                                   xlator_t *this, -                                   int32_t op_ret, int32_t op_errno, -                                   inode_t *inode, struct iatt *buf, -                                   dict_t *xattr, struct iatt *postparent, -                                   loc_t *loc); - -int -afr_sh_common_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, -                      afr_lookup_done_cbk_t lookup_cbk, uuid_t uuid, -                      int32_t flags, dict_t *xdata); -int -afr_sh_entry_expunge_remove (call_frame_t *expunge_frame, xlator_t *this, -                             int active_src, struct iatt *buf, -                             struct iatt *parentbuf); -int -afr_sh_entrylk (call_frame_t *frame, xlator_t *this, loc_t *loc, -                char *base_name, afr_lock_cbk_t lock_cbk); -int -afr_sh_entry_impunge_create (call_frame_t *impunge_frame, xlator_t *this, -                             int child_index); -int -afr_sh_data_unlock (call_frame_t *frame, xlator_t *this, char *dom, -                    afr_lock_cbk_t lock_cbk); -afr_local_t * -afr_self_heal_local_init (afr_local_t *l, xlator_t *this); -int -afr_sh_data_lock (call_frame_t *frame, xlator_t *this, -                  off_t start, off_t len, gf_boolean_t block, char *dom, -                  afr_lock_cbk_t success_handler, -                  afr_lock_cbk_t failure_handler); -void -afr_sh_set_error (afr_self_heal_t *sh, int32_t op_errno); -void -afr_sh_mark_source_sinks (call_frame_t *frame, xlator_t *this); -typedef int -(*afr_fxattrop_cbk_t) (call_frame_t *frame, void *cookie, -                       xlator_t *this, int32_t op_ret, int32_t op_errno, -                       dict_t *xattr, dict_t *xdata); -int -afr_build_child_loc (xlator_t *this, loc_t *child, loc_t *parent, char 
*name); -int -afr_impunge_frame_create (call_frame_t *frame, xlator_t *this, -                          int active_source, call_frame_t **impunge_frame); -void -afr_sh_reset (call_frame_t *frame, xlator_t *this); - -void -afr_children_intersection_get (int32_t *set1, int32_t *set2, -                               int *intersection, unsigned int child_count); -int -afr_get_no_xattr_dir_read_child (xlator_t *this, int32_t *success_children, -                                 struct iatt *bufs); -int -afr_sh_erase_pending (call_frame_t *frame, xlator_t *this, -                      afr_transaction_type type, afr_fxattrop_cbk_t cbk, -                      int (*finish)(call_frame_t *frame, xlator_t *this)); - -void -afr_set_local_for_unhealable (afr_local_t *local); - -int -is_self_heal_failed (afr_self_heal_t *sh, afr_sh_fail_check_type type); - -void -afr_set_self_heal_status (afr_self_heal_t *sh, afr_self_heal_status status); - -void -afr_log_self_heal_completion_status (afr_local_t *local, gf_loglevel_t  logl); - -char* -afr_get_pending_matrix_str (int32_t *pending_matrix[], xlator_t *this); -#endif /* __AFR_SELF_HEAL_COMMON_H__ */ diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c index 9de26ee569c..c0385153ff5 100644 --- a/xlators/cluster/afr/src/afr-self-heal-data.c +++ b/xlators/cluster/afr/src/afr-self-heal-data.c @@ -1,5 +1,5 @@  /* -  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> +  Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>    This file is part of GlusterFS.    This file is licensed to you under your choice of the GNU Lesser @@ -8,1747 +8,609 @@    cases as published by the Free Software Foundation.  
*/ -#include <libgen.h> -#include <unistd.h> -#include <fnmatch.h> -#include <sys/time.h> -#include <stdlib.h> -#include <signal.h>  #ifndef _CONFIG_H  #define _CONFIG_H  #include "config.h"  #endif -#include "glusterfs.h"  #include "afr.h" -#include "dict.h" -#include "xlator.h" -#include "hashfn.h" -#include "logging.h" -#include "stack.h" -#include "list.h" -#include "call-stub.h" -#include "defaults.h" -#include "common-utils.h" -#include "compat-errno.h" -#include "compat.h" -#include "byte-order.h" - -#include "afr-transaction.h"  #include "afr-self-heal.h" -#include "afr-self-heal-common.h" -#include "afr-self-heal-algorithm.h" - -int -afr_sh_data_fail (call_frame_t *frame, xlator_t *this); - -static inline gf_boolean_t -afr_sh_data_proceed (unsigned int success_count) -{ -        return (success_count >= AFR_SH_MIN_PARTICIPANTS); -} - -extern int -sh_loop_finish (call_frame_t *loop_frame, xlator_t *this); +#include "byte-order.h" -int -afr_post_sh_big_lock_success (call_frame_t *frame, xlator_t *this); +enum { +	AFR_SELFHEAL_DATA_FULL = 0, +	AFR_SELFHEAL_DATA_DIFF, +}; -int -afr_post_sh_big_lock_failure (call_frame_t *frame, xlator_t *this); -int -afr_sh_data_finish (call_frame_t *frame, xlator_t *this); - -int -afr_sh_data_done (call_frame_t *frame, xlator_t *this) +#define HAS_HOLES(i) ((i->ia_blocks * 512) < (i->ia_size)) +static int +__checksum_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +		int op_ret, int op_errno, uint32_t weak, uint8_t *strong, +		dict_t *xdata)  { -        afr_local_t     *local = NULL; -        afr_self_heal_t *sh = NULL; +	afr_local_t *local = NULL; +	int i = (long) cookie; -        local = frame->local; -        sh = &local->self_heal; +	local = frame->local; -        sh->completion_cbk (frame, this); +	local->replies[i].valid = 1; +	local->replies[i].op_ret = op_ret; +	local->replies[i].op_errno = op_errno; +	if (strong) +		memcpy (local->replies[i].checksum, strong, MD5_DIGEST_LENGTH); -        return 0; +	
syncbarrier_wake (&local->barrier); +	return 0;  } -int -afr_sh_data_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, -                       int32_t op_ret, int32_t op_errno, dict_t *xdata) -{ -        afr_local_t   *local       = NULL; -        afr_private_t *priv        = NULL; -        int            call_count  = 0; -        int            child_index = (long) cookie; - -        local = frame->local; -        priv = this->private; - -        LOCK (&frame->lock); -        { -                if (op_ret == -1) { -                        gf_log (this->name, GF_LOG_ERROR, -                                "flush failed on %s on subvolume %s: %s", -                                local->loc.path, priv->children[child_index]->name, -                                strerror (op_errno)); -                } -        } -        UNLOCK (&frame->lock); - -        call_count = afr_frame_return (frame); - -        if (call_count == 0) { -                afr_sh_data_done (frame, this); -        } - -        return 0; -} - -int -afr_sh_data_close (call_frame_t *frame, xlator_t *this) -{ -        afr_local_t     *local      = NULL; -        afr_private_t   *priv       = NULL; -        afr_self_heal_t *sh         = NULL; -        int              i          = 0; -        int              call_count = 0; - -        local = frame->local; -        sh    = &local->self_heal; -        priv  = this->private; - -        if (!sh->healing_fd) { -                //This happens when file is non-reg -                afr_sh_data_done (frame, this); -                return 0; -        } -        call_count        = afr_set_elem_count_get (sh->success, -                                                    priv->child_count); -        local->call_count = call_count; - -        if (call_count == 0) { -                afr_sh_data_done (frame, this); -                return 0; -        } - -        for (i = 0; i < priv->child_count; i++) { -                if (!sh->success[i]) -         
               continue; -                gf_log (this->name, GF_LOG_DEBUG, -                        "closing fd of %s on %s", -                        local->loc.path, priv->children[i]->name); - -                STACK_WIND_COOKIE (frame, afr_sh_data_flush_cbk, -                                   (void *) (long) i, -                                   priv->children[i], -                                   priv->children[i]->fops->flush, -                                   sh->healing_fd, NULL); - -                if (!--call_count) -                        break; -        } - -        return 0; -} - -int -afr_sh_dom_unlock (call_frame_t *frame, xlator_t *this) -{ -        afr_local_t     *local = NULL; -        afr_self_heal_t *sh    = NULL; -        afr_private_t   *priv  = NULL; - -        local = frame->local; -        sh    = &local->self_heal; -        priv  = this->private; - -        if (sh->sh_dom_lock_held) -                afr_sh_data_unlock (frame, this, priv->sh_domain, -                                    afr_sh_data_close); -        else -                afr_sh_data_close (frame, this); -        return 0; -} - -int -afr_sh_data_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, -                         int32_t op_ret, int32_t op_errno, struct iatt *statpre, -                         struct iatt *statpost, dict_t *xdata) +static int +attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +	  int op_ret, int op_errno, struct iatt *pre, struct iatt *post, +	  dict_t *xdata)  { +	int i = (long) cookie; +	afr_local_t *local = NULL; -        afr_local_t   *local       = NULL; -        afr_private_t *priv        = NULL; -        int            call_count  = 0; -        int            child_index = (long) cookie; +	local = frame->local; -        local = frame->local; -        priv = this->private; +	local->replies[i].valid = 1; +	local->replies[i].op_ret = op_ret; +	local->replies[i].op_errno = op_errno; +	if (pre) +		
local->replies[i].prestat = *pre; +	if (post) +		local->replies[i].poststat = *post; +	if (xdata) +		local->replies[i].xdata = dict_ref (xdata); -        LOCK (&frame->lock); -        { -                if (op_ret == -1) { -                        gf_log (this->name, GF_LOG_INFO, -                                "setattr failed on %s on subvolume %s: %s", -                                local->loc.path, priv->children[child_index]->name, -                                strerror (op_errno)); -                } -        } -        UNLOCK (&frame->lock); +	syncbarrier_wake (&local->barrier); -        call_count = afr_frame_return (frame); - -        if (call_count == 0) { -                afr_sh_data_finish (frame, this); -        } - -        return 0; +	return 0;  } -int -afr_sh_data_setattr (call_frame_t *frame, xlator_t *this, struct iatt* stbuf) -{ -        afr_local_t     *local      = NULL; -        afr_private_t   *priv       = NULL; -        afr_self_heal_t *sh         = NULL; -        int              i          = 0; -        int              call_count = 0; -        int32_t          valid      = 0; - -        local = frame->local; -        sh    = &local->self_heal; -        priv  = this->private; - -        valid = (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME); - -        call_count        = afr_set_elem_count_get (sh->success, -                                                    priv->child_count); -        local->call_count = call_count; - -        if (call_count == 0) { -                GF_ASSERT (0); -                afr_sh_data_finish (frame, this); -                return 0; -        } - -        for (i = 0; i < priv->child_count; i++) { -                if (!sh->success[i]) -                        continue; - -                STACK_WIND_COOKIE (frame, afr_sh_data_setattr_cbk, -                                   (void *) (long) i, -                                   priv->children[i], -                                   priv->children[i]->fops->setattr, 
-                                   &local->loc, stbuf, valid, NULL); - -                if (!--call_count) -                        break; -        } - -        return 0; -} -int -afr_sh_data_setattr_fstat_cbk (call_frame_t *frame, void *cookie, -                               xlator_t *this, int32_t op_ret, int32_t op_errno, -                               struct iatt *buf, dict_t *xdata) +static gf_boolean_t +__afr_selfheal_data_checksums_match (call_frame_t *frame, xlator_t *this, +				     fd_t *fd, int source, +				     unsigned char *healed_sinks, +				     off_t offset, size_t size)  { -        afr_local_t     *local = NULL; -        afr_self_heal_t *sh = NULL; -        int child_index = (long) cookie; - -        local = frame->local; -        sh = &local->self_heal; - -        GF_ASSERT (sh->source == child_index); -        if (op_ret != -1) { -                sh->buf[child_index] = *buf; -                afr_sh_data_setattr (frame, this, buf); -        } else { -                gf_log (this->name, GF_LOG_ERROR, "%s: Failed to set " -                        "time-stamps after self-heal", local->loc.path); -                afr_sh_data_fail (frame, this); -        } - -        return 0; -} +	afr_private_t *priv = NULL; +	afr_local_t *local = NULL; +	unsigned char *wind_subvols = NULL; +	int i = 0; -/* - * If there are any writes after the self-heal is triggered then the - * stbuf stored in local->self_heal.buf[] will be invalid so we do one more - * stat on the source and then set the [am]times - */ -int -afr_sh_set_timestamps (call_frame_t *frame, xlator_t *this) -{ -        afr_local_t     *local      = NULL; -        afr_private_t   *priv       = NULL; -        afr_self_heal_t *sh         = NULL; - -        local = frame->local; -        sh    = &local->self_heal; -        priv  = this->private; - -        STACK_WIND_COOKIE (frame, afr_sh_data_setattr_fstat_cbk, -                           (void *) (long) sh->source, -                           
priv->children[sh->source], -                           priv->children[sh->source]->fops->fstat, -                           sh->healing_fd, NULL); -        return 0; -} - -//Fun fact, lock_cbk is being used for both lock & unlock -int -afr_sh_data_unlock (call_frame_t *frame, xlator_t *this, char *dom, -                    afr_lock_cbk_t lock_cbk) -{ -        afr_local_t         *local    = NULL; -        afr_internal_lock_t *int_lock = NULL; -        afr_self_heal_t     *sh       = NULL; -        afr_private_t       *priv     = NULL; -        int                 ret       = 0; - -        local    = frame->local; -        int_lock = &local->internal_lock; -        sh       = &local->self_heal; -        priv     = this->private; - -        if (strcmp (dom, this->name) == 0) { -                sh->data_lock_held = _gf_false; -        } else if (strcmp (dom, priv->sh_domain) == 0) { -                sh->sh_dom_lock_held = _gf_false; -        } else { -                ret = -1; -                goto out; -        } -        int_lock->lock_cbk = lock_cbk; -        int_lock->domain = dom; -        afr_unlock (frame, this); - -out: -        if (ret) { -                int_lock->lock_op_ret = -1; -                int_lock->lock_cbk (frame, this); -        } -        return 0; -} - -int -afr_sh_data_finish (call_frame_t *frame, xlator_t *this) -{ -        afr_local_t     *local = NULL; -        afr_self_heal_t *sh    = NULL; - -        local = frame->local; -        sh = &local->self_heal; - -        gf_log (this->name, GF_LOG_DEBUG, -                "finishing data selfheal of %s", local->loc.path); - -        if (sh->data_lock_held) -                afr_sh_data_unlock (frame, this, this->name, afr_sh_dom_unlock); -        else -                afr_sh_dom_unlock (frame, this); - -        return 0; -} - -int -afr_sh_data_fail (call_frame_t *frame, xlator_t *this) -{ -        afr_local_t   *local = NULL; -        afr_self_heal_t *sh = NULL; +	priv = this->private; +	local = 
frame->local; -        local = frame->local; -        sh = &local->self_heal; +	wind_subvols = alloca0 (priv->child_count); +	for (i = 0; i < priv->child_count; i++) { +		if (i == source || healed_sinks[i]) +			wind_subvols[i] = 1; +	} -        gf_log (this->name, GF_LOG_DEBUG, -                "finishing failed data selfheal of %s", local->loc.path); +	AFR_ONLIST (wind_subvols, frame, __checksum_cbk, rchecksum, fd, +		    offset, size, NULL); -        afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); -        afr_sh_data_finish (frame, this); -        return 0; -} +	if (!local->replies[source].valid || local->replies[source].op_ret != 0) +		return _gf_false; -int -afr_sh_data_erase_pending_cbk (call_frame_t *frame, void *cookie, -                               xlator_t *this, int32_t op_ret, -                               int32_t op_errno, dict_t *xattr, dict_t *xdata) -{ -        int             call_count = 0; -        afr_local_t     *local = NULL; -        afr_self_heal_t *sh = NULL; -        afr_private_t   *priv = NULL; -        int32_t         child_index = (long) cookie; - -        priv  = this->private; -        local = frame->local; -        sh    = &local->self_heal; -        if (op_ret < 0) { -                gf_log (this->name, GF_LOG_ERROR, "Erasing of pending change " -                        "log failed on %s for subvol %s, reason: %s", -                        local->loc.path, priv->children[child_index]->name, -                        strerror (op_errno)); -                afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); -        } - -        call_count = afr_frame_return (frame); - -        if (call_count == 0) { -                if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { -                        if (sh->old_loop_frame) -                                sh_loop_finish (sh->old_loop_frame, this); -                        sh->old_loop_frame = NULL; -                        afr_sh_data_fail (frame, this); -                        
goto out; -                } -                if (!IA_ISREG (sh->type)) { -                        afr_sh_data_finish (frame, this); -                        goto out; -                } -                GF_ASSERT (sh->old_loop_frame); -                afr_sh_data_lock (frame, this, 0, 0, _gf_true, this->name, -                                  afr_post_sh_big_lock_success, -                                  afr_post_sh_big_lock_failure); -        } -out: -        return 0; -} +	for (i = 0; i < priv->child_count; i++) { +		if (i == source) +			continue; +		if (memcmp (local->replies[source].checksum, +			    local->replies[i].checksum, +			    MD5_DIGEST_LENGTH)) +			return _gf_false; +	} -int -afr_sh_data_erase_pending (call_frame_t *frame, xlator_t *this) -{ -        afr_sh_erase_pending (frame, this, AFR_DATA_TRANSACTION, -                              afr_sh_data_erase_pending_cbk, -                              afr_sh_data_finish); -        return 0; +	return _gf_true;  } -int -afr_sh_data_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, -                       int op_ret, int op_errno, struct iatt *pre, -                       struct iatt *post, dict_t *xdata) -{ -        afr_local_t     *local      = NULL; -        afr_private_t   *priv       = NULL; -        afr_self_heal_t *sh         = NULL; -        int             call_count  = 0; -        int             child_index = (long) cookie; - -        local = frame->local; -        priv = this->private; -        sh   = &local->self_heal; - -        if (op_ret < 0) { -                gf_log (this->name, GF_LOG_ERROR, "%s: Failed to fsync on " -                        "%s - %s", local->loc.path, -                        priv->children[child_index]->name, strerror (op_errno)); -                LOCK (&frame->lock); -                { -                        afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); -                } -                UNLOCK (&frame->lock); -                if 
(sh->old_loop_frame) -                        sh_loop_finish (sh->old_loop_frame, this); -                sh->old_loop_frame = NULL; -        } - -        call_count = afr_frame_return (frame); -        if (call_count == 0) { -                if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) -                        afr_sh_data_fail (frame, this); -                else -                        afr_sh_data_erase_pending (frame, this); -        } -        return 0; -} -/* - * Before erasing xattrs, make sure the data is written to disk - */ -int -afr_sh_data_fsync (call_frame_t *frame, xlator_t *this) -{ -        afr_local_t     *local     = NULL; -        afr_private_t   *priv      = NULL; -        afr_self_heal_t *sh        = NULL; -        int             i          = 0; -        int             call_count = 0; - -        local = frame->local; -        priv = this->private; -        sh   = &local->self_heal; - -        call_count        = sh->active_sinks; -        if (call_count == 0) { -                afr_sh_data_erase_pending (frame, this); -                return 0; -        } - -        local->call_count = call_count; -        for (i = 0; i < priv->child_count; i++) { -                if (!sh->success[i] || sh->sources[i]) -                        continue; - -                STACK_WIND_COOKIE (frame, afr_sh_data_fsync_cbk, -                                   (void *) (long) i, priv->children[i], -                                   priv->children[i]->fops->fsync, -                                   sh->healing_fd, 1, NULL); -        } - -        return 0; -} - -static struct afr_sh_algorithm * -sh_algo_from_name (xlator_t *this, char *name) +static int +__afr_selfheal_data_read_write (call_frame_t *frame, xlator_t *this, fd_t *fd, +				int source, unsigned char *healed_sinks, +				off_t offset, size_t size, +				struct afr_reply *replies)  { -        int i = 0; +	struct iovec *iovec = NULL; +	int count = 0; +	struct iobref *iobref = NULL; +	int ret = 0; +	
int i = 0; +	afr_private_t *priv = NULL; -        if (name == NULL) -                goto out; +	priv = this->private; -        while (afr_self_heal_algorithms[i].name) { -                if (!strcmp (name, afr_self_heal_algorithms[i].name)) { -                        return &afr_self_heal_algorithms[i]; -                } +	ret = syncop_readv (priv->children[source], fd, size, offset, 0, +			    &iovec, &count, &iobref); +	if (ret <= 0) +		return ret; -                i++; -        } +	for (i = 0; i < priv->child_count; i++) { +		if (!healed_sinks[i]) +			continue; + +		/* +		 * TODO: Use fiemap() and discard() to heal holes +		 * in the future. +		 * +		 * For now, +		 * +		 * - if the source had any holes at all, +		 * AND +		 * - if we are writing past the original file size +		 *   of the sink +		 * AND +		 * - is NOT the last block of the source file. if +		 *   the block contains EOF, it has to be written +		 *   in order to set the file size even if the +		 *   last block is 0-filled. +		 * AND +		 * - if the read buffer is filled with only 0's +		 * +		 * then, skip writing to this source. We don't depend +		 * on the write to happen to update the size as we +		 * have performed an ftruncate() upfront anyways. +		 */ +#define is_last_block(o,b,s) ((s >= o) && (s <= (o + b))) +		if (HAS_HOLES ((&replies[source].poststat)) && +		    offset > replies[i].poststat.ia_size && +		    !is_last_block (offset, size, +				    replies[source].poststat.ia_size) && +		    (iov_0filled (iovec, count) == 0)) +			continue; + +		ret = syncop_writev (priv->children[i], fd, iovec, count, +				     offset, iobref, 0); +		if (ret != iov_length (iovec, count)) { +			/* write() failed on this sink. unset the corresponding +			   member in sinks[] (which is healed_sinks[] in the +			   caller) so that this server does NOT get considered +			   as successfully healed. 
+			*/ +			healed_sinks[i] = 0; +		} +	} +	if (iobref) +		iobref_unref (iobref); -out: -        return NULL; +	return ret;  }  static int -sh_zero_byte_files_exist (afr_local_t *local, int child_count) -{ -        int             i = 0; -        int             ret = 0; -        afr_self_heal_t *sh = NULL; - -        sh = &local->self_heal; -        for (i = 0; i < child_count; i++) { -                if (!local->child_up[i] || sh->child_errno[i]) -                        continue; -                if (sh->buf[i].ia_size == 0) { -                        ret = 1; -                        break; -                } -        } - -        return ret; -} - - -struct afr_sh_algorithm * -afr_sh_data_pick_algo (call_frame_t *frame, xlator_t *this) -{ -        afr_private_t *           priv  = NULL; -        struct afr_sh_algorithm * algo  = NULL; -        afr_local_t *             local = NULL; -        afr_self_heal_t *         sh    = NULL; - -        priv  = this->private; -        local = frame->local; -        sh    = &local->self_heal; -        algo  = sh_algo_from_name (this, priv->data_self_heal_algorithm); - -        if (algo == NULL) { -                /* option not set, so fall back on heuristics */ - -                if (sh_zero_byte_files_exist (local, priv->child_count) -                    || (sh->file_size <= (priv->data_self_heal_window_size * -                                          this->ctx->page_size))) { - -                        /* -                         * If the file does not exist on one of the subvolumes, -                         * or a zero-byte file exists (created by entry self-heal) -                         * the entire content has to be copied anyway, so there -                         * is no benefit from using the "diff" algorithm. 
-                         * -                         * If the file size is about the same as page size, -                         * the entire file can be read and written with a few -                         * (pipelined) STACK_WINDs, which will be faster -                         * than "diff" which has to read checksums and then -                         * read and write. -                         */ - -                        algo = sh_algo_from_name (this, "full"); - -                } else { -                        algo = sh_algo_from_name (this, "diff"); -                } -        } - -        return algo; -} - - -int -afr_sh_data_sync_prepare (call_frame_t *frame, xlator_t *this) -{ -        afr_local_t     *local = NULL; -        afr_self_heal_t *sh = NULL; -        struct afr_sh_algorithm *sh_algo = NULL; - -        local = frame->local; -        sh = &local->self_heal; - -        sh->algo_completion_cbk = afr_sh_data_fsync; -        sh->algo_abort_cbk      = afr_sh_data_fail; - -        sh_algo = afr_sh_data_pick_algo (frame, this); +afr_selfheal_data_block (call_frame_t *frame, xlator_t *this, fd_t *fd, +			 int source, unsigned char *healed_sinks, off_t offset, +			 size_t size, int type, struct afr_reply *replies) +{ +	int ret = -1; +	int sink_count = 0; +	afr_private_t *priv = NULL; +	unsigned char *data_lock = NULL; + +	priv = this->private; +	sink_count = AFR_COUNT (healed_sinks, priv->child_count); +	data_lock = alloca0 (priv->child_count); + +	ret = afr_selfheal_inodelk (frame, this, fd->inode, this->name, +				    offset, size, data_lock); +	{ +		if (ret < sink_count) { +			ret = -ENOTCONN; +			goto unlock; +		} -        sh->algo = sh_algo; -        sh_algo->fn (frame, this); +		if (type == AFR_SELFHEAL_DATA_DIFF && +		    __afr_selfheal_data_checksums_match (frame, this, fd, source, +							 healed_sinks, offset, size)) { +			ret = 0; +			goto unlock; +		} -        return 0; +		ret = __afr_selfheal_data_read_write (frame, this, fd, source, +	
					      healed_sinks, offset, size, +						      replies); +	} +unlock: +	afr_selfheal_uninodelk (frame, this, fd->inode, this->name, +				offset, size, data_lock); +	return ret;  } -int -afr_sh_data_trim_cbk (call_frame_t *frame, void *cookie, xlator_t *this, -                      int32_t op_ret, int32_t op_errno, struct iatt *prebuf, -                      struct iatt *postbuf, dict_t *xdata) -{ -        int              call_count = 0; -        int              child_index = 0; -        afr_private_t    *priv = NULL; -        afr_local_t      *local  = NULL; -        afr_self_heal_t  *sh = NULL; - -        priv  = this->private; -        local = frame->local; -        sh    = &local->self_heal; - -        child_index = (long) cookie; - -        LOCK (&frame->lock); -        { -                if (op_ret == -1) { -                        gf_log (this->name, GF_LOG_ERROR, -                                "ftruncate of %s on subvolume %s failed (%s)", -                                local->loc.path, -                                priv->children[child_index]->name, -                                strerror (op_errno)); -                        afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); -                } else { -                        gf_log (this->name, GF_LOG_DEBUG, -                                "ftruncate of %s on subvolume %s completed", -                                local->loc.path, -                                priv->children[child_index]->name); -                } -        } -        UNLOCK (&frame->lock); - -        call_count = afr_frame_return (frame); - -        if (call_count == 0) { -                if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) -                        afr_sh_data_fail (frame, this); -                else -                        afr_sh_data_sync_prepare (frame, this); -        } - -        return 0; -} -int -afr_sh_data_trim_sinks (call_frame_t *frame, xlator_t *this) +static int +afr_selfheal_data_fsync 
(call_frame_t *frame, xlator_t *this, fd_t *fd, +			 unsigned char *healed_sinks)  { -        afr_private_t * priv = NULL; -        afr_local_t * local  = NULL; -        afr_self_heal_t *sh  = NULL; -        int             *sources = NULL; -        int              call_count = 0; -        int              i = 0; - - -        priv = this->private; -        local = frame->local; -        sh = &local->self_heal; - -        sources = sh->sources; -        call_count = sh->active_sinks; - -        local->call_count = call_count; +	afr_local_t *local = NULL; +	afr_private_t *priv = NULL; +	int i = 0; -        for (i = 0; i < priv->child_count; i++) { -                if (sources[i] || !local->child_up[i]) -                        continue; +	local = frame->local; +	priv = this->private; -                STACK_WIND_COOKIE (frame, afr_sh_data_trim_cbk, -                                   (void *) (long) i, -                                   priv->children[i], -                                   priv->children[i]->fops->ftruncate, -                                   sh->healing_fd, sh->file_size, -                                   NULL); +	AFR_ONLIST (healed_sinks, frame, attr_cbk, fsync, fd, 0, NULL); -                if (!--call_count) -                        break; -        } - -        return 0; +	for (i = 0; i < priv->child_count; i++) +		if (healed_sinks[i] && local->replies[i].op_ret != 0) +			/* fsync() failed. Do NOT consider this server +			   as successfully healed. Mark it so. 
+			*/ +			healed_sinks[i] = 0; +	return 0;  } -int -afr_sh_inode_set_read_ctx (afr_self_heal_t *sh, xlator_t *this) -{ -        afr_private_t   *priv = NULL; -        int             ret = 0; -        int             i = 0; - -        priv = this->private; -        sh->source = afr_sh_select_source (sh->sources, priv->child_count); -        if (sh->source < 0) { -                ret = -1; -                goto out; -        } - -        /* detect changes not visible through pending flags -- JIC */ -        for (i = 0; i < priv->child_count; i++) { -                if (i == sh->source || sh->child_errno[i]) -                        continue; - -                if (SIZE_DIFFERS (&sh->buf[i], &sh->buf[sh->source])) -                        sh->sources[i] = 0; -        } - -        afr_reset_children (sh->fresh_children, priv->child_count); -        afr_get_fresh_children (sh->success_children, sh->sources, -                                sh->fresh_children, priv->child_count); -        afr_inode_set_read_ctx (this, sh->inode, sh->source, -                                sh->fresh_children); -out: -        return ret; -} -char* -afr_get_sizes_str (afr_local_t *local, struct iatt *bufs, xlator_t *this) -{ -        afr_private_t *priv = NULL; -        int           i     = 0; -        char          num[1024] = {0}; -        size_t        len = 0; -        char          *sizes_str = NULL; -        size_t        off = 0; -        char          *fmt_str = "%llu bytes on %s, "; -        char          *child_down =  " %s,"; -        char          *child_unknown = " %s,"; -        int           down_child_present = 0; -        int           down_count = 0; -        int           unknown_count = 0; -        int           unknown_child_present = 0; -        char          *down_subvol_1 = " down subvolume is "; -        char          *unknown_subvol_1 = " unknown subvolume is "; -        char          *down_subvol_2 = " down subvolumes are "; -        char          
*unknown_subvol_2 = " unknown subvolumes are "; - -        priv = this->private; - -        for (i = 0; i < priv->child_count; i++) { -                if (local->child_up[i] == 1) { -                        len += snprintf (num, sizeof (num), fmt_str, -                                         (unsigned long long) bufs[i].ia_size, -                                         priv->children[i]->name); -                } else if (local->child_up[i] == 0) { -                        len += snprintf (num, sizeof (num), child_down, -                                         priv->children[i]->name); -                        if (!down_child_present) -                                down_child_present = 1; -                        down_count ++; -                } else if (local->child_up[i] == -1) { -                        len += snprintf (num, sizeof (num), child_unknown, -                                         priv->children[i]->name); -                        if (!unknown_child_present) -                                unknown_child_present = 1; -                        unknown_count++; -                } - -        } - -        if (down_child_present) { -                if (down_count > 1) -                        len += snprintf (num, sizeof (num), "%s", -                                         down_subvol_2); -                else -                        len += snprintf (num, sizeof (num), "%s", -                                        down_subvol_1); -        } -        if (unknown_child_present) { -                if (unknown_count > 1) -                        len += snprintf (num, sizeof (num), "%s", -                                         unknown_subvol_2); -                else -                        len += snprintf (num, sizeof (num), "%s", -                                         unknown_subvol_1); -        } - -        len++;//for '\0' - -        sizes_str = GF_CALLOC (len, sizeof (char), gf_common_mt_char); - -        if (!sizes_str) -                
return NULL; - -        for (i = 0; i < priv->child_count; i++) { -                if (local->child_up[i] == 1) { -                        off += snprintf (sizes_str + off, len - off, fmt_str, -                                         (unsigned long long) bufs[i].ia_size, -                                         priv->children[i]->name); -                } -        } - -        if (down_child_present) { -                if (down_count > 1) { -                        off += snprintf (sizes_str + off, len - off, "%s", -                                         down_subvol_2); -                } else { -                        off += snprintf (sizes_str + off, len - off, "%s", -                                         down_subvol_1); -                } -        } - -        for (i = 0; i < priv->child_count; i++) { -                if (local->child_up[i] == 0) { -                        off += snprintf (sizes_str + off, len - off, child_down, -                                         priv->children[i]->name); -                } -        } - -        if (unknown_child_present) { -                if (unknown_count > 1) { -                        off += snprintf (sizes_str + off, len - off, "%s", -                                        unknown_subvol_2); -                } else { -                        off += snprintf (sizes_str + off, len - off, "%s", -                                         unknown_subvol_1); -                } -        } - -        for (i = 0; i < priv->child_count; i++) { -                if (local->child_up[i] == -1) { -                        off += snprintf (sizes_str + off, len - off, -                                         child_unknown, -                                         priv->children[i]->name); - -                } -        } - -        return sizes_str; -} - -char* -afr_get_sinks_str (xlator_t *this, afr_local_t *local, afr_self_heal_t *sh) +static int +afr_selfheal_data_restore_time (call_frame_t *frame, xlator_t *this, +				
inode_t *inode, int source, +				unsigned char *healed_sinks, +				struct afr_reply *replies)  { -        afr_private_t   *priv = NULL; -        int             i = 0; -        char            num[1024] = {0}; -        size_t          len = 0; -        char            *sinks_str = NULL; -        char            *temp_str = " to sinks "; -        char            *str_format = " %s,"; -        char            off = 0; - -        priv = this->private; - -        len += snprintf (num, sizeof (num), "%s", temp_str); -        for (i = 0; i < priv->child_count; i++) { -                if ((sh->sources[i] == 0) && (local->child_up[i] == 1)) { -                        len += snprintf (num, sizeof (num), str_format, -                                         priv->children[i]->name); -                } -        } +	loc_t loc = {0, }; -        len ++; +	loc.inode = inode_ref (inode); +	uuid_copy (loc.gfid, inode->gfid); -        sinks_str = GF_CALLOC (len, sizeof (char), gf_common_mt_char); +	AFR_ONLIST (healed_sinks, frame, attr_cbk, setattr, &loc, +		    &replies[source].poststat, +		    (GF_SET_ATTR_ATIME|GF_SET_ATTR_MTIME), NULL); -        if (!sinks_str) -                return NULL; - -        off += snprintf (sinks_str + off, len - off, "%s", temp_str); - -        for (i = 0; i < priv->child_count; i++) { -                if ((sh->sources[i] == 0) && (local->child_up[i] == 1)) { -                        off += snprintf (sinks_str + off, len - off, -                                         str_format, -                                         priv->children[i]->name); -                } -        } - -        return sinks_str; +	loc_wipe (&loc); +	return 0;  } +static int +afr_selfheal_data_do (call_frame_t *frame, xlator_t *this, fd_t *fd, +		      int source, unsigned char *healed_sinks, +		      struct afr_reply *replies) +{ +	afr_private_t *priv = NULL; +	int i = 0; +	off_t off = 0; +	size_t block = 128 * 1024; +	int type = AFR_SELFHEAL_DATA_FULL; +	int ret = -1; +	
call_frame_t *iter_frame = NULL; +	char *sinks_str = NULL; +	char *p = NULL; + +	priv = this->private; + +	sinks_str = alloca0 (priv->child_count * 8); +	p = sinks_str; +	for (i = 0; i < priv->child_count; i++) { +		if (!healed_sinks[i]) +			continue; +		p += sprintf (p, "%d ", i); +	} -void -afr_set_data_sh_info_str (afr_local_t *local, afr_self_heal_t *sh, xlator_t *this) -{ -        char            *pending_matrix_str = NULL; -        char            *sizes_str = NULL; -        char            *sinks_str = NULL; -        afr_private_t   *priv = NULL; - -        priv = this->private; - -        pending_matrix_str = afr_get_pending_matrix_str (sh->pending_matrix, -                                                         this); -        if (!pending_matrix_str) -                pending_matrix_str = ""; - -        sizes_str = afr_get_sizes_str (local, sh->buf, this); -        if (!sizes_str) -                sizes_str = ""; +	gf_log (this->name, GF_LOG_INFO, "performing data selfheal on %s. 
" +		"source=%d sinks=%s", +		uuid_utoa (fd->inode->gfid), source, sinks_str); -        sinks_str = afr_get_sinks_str (this, local, sh); -        if (!sinks_str) -                sinks_str = ""; +	for (i = 0; i < priv->child_count; i++) { +		if (!healed_sinks[i] && i != source) +			continue; +		if (replies[i].poststat.ia_size) { +			type = AFR_SELFHEAL_DATA_DIFF; +			break; +		} +	} -        gf_asprintf (&sh->data_sh_info, " data self heal from %s %s with " -                     "%s data %s", priv->children[sh->source]->name, sinks_str, -                     sizes_str, pending_matrix_str); +	iter_frame = afr_copy_frame (frame); +	if (!iter_frame) +		return -ENOMEM; -        if (pending_matrix_str && strcmp (pending_matrix_str, "")) -                GF_FREE (pending_matrix_str); +	for (off = 0; off < replies[source].poststat.ia_size; off += block) { +		ret = afr_selfheal_data_block (iter_frame, this, fd, source, +					       healed_sinks, off, block, type, +					       replies); +		if (ret < 0) +			goto out; -        if (sizes_str && strcmp (sizes_str, "")) -                GF_FREE (sizes_str); -} +		AFR_STACK_RESET (iter_frame); +	} -void -afr_sh_data_fix (call_frame_t *frame, xlator_t *this) -{ -        int              source = 0; -        afr_local_t     *local      = NULL; -        afr_self_heal_t *sh = NULL; -        afr_private_t   *priv = NULL; - -        local = frame->local; -        sh = &local->self_heal; -        priv = this->private; - -        source     = sh->source; -        sh->block_size = this->ctx->page_size; -        sh->file_size  = sh->buf[source].ia_size; - -        if (FILE_HAS_HOLES (&sh->buf[source])) -                sh->file_has_holes = 1; - -        if (sh->background && sh->unwind && !sh->unwound) { -                sh->unwind (sh->orig_frame, this, sh->op_ret, sh->op_errno, -                            is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)); -                sh->unwound = _gf_true; -        } - -        
afr_sh_mark_source_sinks (frame, this); -        if (sh->active_sinks == 0) { -                gf_log (this->name, GF_LOG_INFO, -                        "no active sinks for performing self-heal on file %s", -                        local->loc.path); -                afr_sh_data_finish (frame, this); -                return; -        } - -        gf_log (this->name, GF_LOG_DEBUG, -                "self-healing file %s from subvolume %s to %d other", -                local->loc.path, priv->children[sh->source]->name, -                sh->active_sinks); - -        sh->actual_sh_started = _gf_true; -        afr_set_self_heal_status (sh, AFR_SELF_HEAL_SYNC_BEGIN); -        afr_sh_data_trim_sinks (frame, this); -} +	afr_selfheal_data_restore_time (frame, this, fd->inode, source, +					healed_sinks, replies); -int -afr_sh_data_fxattrop_fstat_done (call_frame_t *frame, xlator_t *this) -{ -        afr_local_t     *local      = NULL; -        afr_self_heal_t *sh = NULL; -        afr_private_t   *priv = NULL; -        int              nsources = 0; -        int              ret = 0; -        int             *old_sources = NULL; -        int             tstamp_source = 0; -        int             i = 0; - -        local = frame->local; -        sh = &local->self_heal; -        priv = this->private; - -        gf_log (this->name, GF_LOG_DEBUG, "Pending matrix for: %s", -                lkowner_utoa (&frame->root->lk_owner)); -        if (sh->sync_done) { -                //store sources before sync so that mtime can be set using the -                //iatt buf from one of them. 
-                old_sources = alloca (priv->child_count*sizeof (*old_sources)); -                memcpy (old_sources, sh->sources, -                        priv->child_count * sizeof (*old_sources)); -        } - -        nsources = afr_build_sources (this, sh->xattr, sh->buf, sh->pending_matrix, -                                      sh->sources, sh->success_children, -                                      AFR_DATA_TRANSACTION, NULL, _gf_true); -        if ((nsources == -1) -            && (priv->favorite_child != -1) -            && (sh->child_errno[priv->favorite_child] == 0)) { - -                gf_log (this->name, GF_LOG_DEBUG, -                        "Picking favorite child %s as authentic source to " -                        "resolve conflicting data of %s", -                        priv->children[priv->favorite_child]->name, -                        local->loc.path); - -                sh->sources[priv->favorite_child] = 1; - -                nsources = afr_sh_source_count (sh->sources, -                                                priv->child_count); -        } - -        if (nsources == -1) { -                afr_sh_print_split_brain_log (sh->pending_matrix, this, -                                              local->loc.path); -                afr_set_split_brain (this, sh->inode, DONT_KNOW, SPB); - -                afr_sh_data_fail (frame, this); -                return 0; -        } - -        afr_set_split_brain (this, sh->inode, DONT_KNOW, NO_SPB); - -        ret = afr_sh_inode_set_read_ctx (sh, this); -        if (ret) { -                gf_log (this->name, GF_LOG_DEBUG, -                        "No active sources found."); - -                afr_sh_data_fail (frame, this); -                return 0; -        } - -        if (sh->sync_done) { -                /* Perform setattr from one of the old_sources if possible -                 * Because only they have the correct mtime, the new sources -                 * (i.e. 
old sinks) have mtime from last writev in sync. -                 */ -                tstamp_source = sh->source; -                for (i = 0; i < priv->child_count; i++) { -                        if (old_sources[i] && sh->sources[i]) -                                tstamp_source = i; -                } -                afr_sh_data_setattr (frame, this, &sh->buf[tstamp_source]); -        } else { -                afr_set_data_sh_info_str (local, sh, this); -                if (nsources == 0) { -                        gf_log (this->name, GF_LOG_DEBUG, -                                "No self-heal needed for %s", -                                local->loc.path); - -                        afr_sh_data_finish (frame, this); -                        return 0; -                } - -                if (sh->do_data_self_heal && -                    afr_data_self_heal_enabled (priv->data_self_heal)) -                        afr_sh_data_fix (frame, this); -                else -                        afr_sh_data_finish (frame, this); -        } -        return 0; -} +	ret = afr_selfheal_data_fsync (frame, this, fd, healed_sinks); -int -afr_lookup_select_read_child_by_txn_type (xlator_t *this, afr_local_t *local, -                                          dict_t **xattr, -                                          afr_transaction_type txn_type, -                                          uuid_t gfid) -{ -        afr_private_t            *priv      = NULL; -        int                      read_child = -1; -        int32_t                  **pending_matrix = NULL; -        int32_t                  *sources         = NULL; -        int32_t                  *success_children   = NULL; -        struct iatt              *bufs            = NULL; -        int32_t                  nsources         = 0; -        int32_t                  prev_read_child  = -1; -        int32_t                  config_read_child = -1; -        int32_t                  subvol_status = 0; - -        
priv = this->private; -        bufs = local->cont.lookup.bufs; -        success_children = local->cont.lookup.success_children; - -        pending_matrix = local->cont.lookup.pending_matrix; -        sources = local->cont.lookup.sources; -        memset (sources, 0, sizeof (*sources) * priv->child_count); - -        nsources = afr_build_sources (this, xattr, bufs, pending_matrix, -                                      sources, success_children, txn_type, -                                      &subvol_status, _gf_false); -        if (subvol_status & SPLIT_BRAIN) { -                gf_log (this->name, GF_LOG_DEBUG, "%s: Possible split-brain", -                        local->loc.path); -                switch (txn_type) { -                case AFR_DATA_TRANSACTION: -                        local->cont.lookup.possible_spb = _gf_true; -                        nsources = 1; -                        sources[success_children[0]] = 1; -                        break; -                case AFR_ENTRY_TRANSACTION: -                        read_child = afr_get_no_xattr_dir_read_child (this, -                                                             success_children, -                                                             bufs); -                        sources[read_child] = 1; -                        nsources = 1; -                        break; -                default: -                        break; -                } -        } -        if (nsources < 0) -                goto out; - -        prev_read_child = local->read_child_index; -        config_read_child = priv->read_child; -        read_child = afr_select_read_child_from_policy (success_children, -                                                        priv->child_count, -                                                        prev_read_child, -                                                        config_read_child, -                                                        sources, -                         
                               priv->hash_mode, gfid);  out: -        gf_log (this->name, GF_LOG_DEBUG, "returning read_child: %d", -                read_child); -        return read_child; +	if (iter_frame) +		AFR_STACK_DESTROY (iter_frame); +	return ret;  } -int -afr_sh_data_fstat_cbk (call_frame_t *frame, void *cookie, -                       xlator_t *this, int32_t op_ret, int32_t op_errno, -                       struct iatt *buf, dict_t *xdata) + +static int +__afr_selfheal_truncate_sinks (call_frame_t *frame, xlator_t *this, +			       fd_t *fd, unsigned char *healed_sinks, +			       struct afr_reply *replies, uint64_t size)  { -        afr_private_t   *priv  = NULL; -        afr_local_t     *local = NULL; -        afr_self_heal_t *sh = NULL; -        int call_count  = -1; -        int child_index = (long) cookie; - -        local = frame->local; -        sh = &local->self_heal; -        priv = this->private; - -        LOCK (&frame->lock); -        { -                if (op_ret != -1) { -                        gf_log (this->name, GF_LOG_TRACE, -                                "fstat of %s on %s succeeded", -                                local->loc.path, -                                priv->children[child_index]->name); - -                        sh->buf[child_index] = *buf; -                        sh->success_children[sh->success_count] = child_index; -                        sh->success_count++; -                } else { -                        gf_log (this->name, GF_LOG_ERROR, "%s: fstat failed " -                                "on %s, reason %s", local->loc.path, -                                priv->children[child_index]->name, -                                strerror (op_errno)); -                        sh->child_errno[child_index] = op_errno; -                } -        } -        UNLOCK (&frame->lock); - -        call_count = afr_frame_return (frame); - -        if (call_count == 0) { -                /* Previous versions of glusterfs 
might have set -                 * the pending data xattrs which need to be erased -                 */ -                if (!afr_sh_data_proceed (sh->success_count)) { -                        gf_log (this->name, GF_LOG_ERROR, "inspecting metadata " -                                "succeeded on < %d children, aborting " -                                "self-heal for %s", AFR_SH_MIN_PARTICIPANTS, -                                local->loc.path); -                        afr_sh_data_fail (frame, this); -                        goto out; -                } -                afr_sh_data_fxattrop_fstat_done (frame, this); -        } -out: -        return 0; -} +	afr_local_t *local = NULL; +	afr_private_t *priv = NULL; +	unsigned char *larger_sinks = 0; +	int i = 0; +	local = frame->local; +	priv = this->private; -int -afr_sh_data_fstat (call_frame_t *frame, xlator_t *this) -{ -        afr_self_heal_t *sh    = NULL; -        afr_local_t     *local = NULL; -        afr_private_t   *priv  = NULL; -        int             call_count = 0; -        int             i = 0; -        int             child = 0; -        int32_t         *fstat_children = NULL; - -        priv  = this->private; -        local = frame->local; -        sh    = &local->self_heal; - -        fstat_children = memdup (sh->success_children, -                                 sizeof (*fstat_children) * priv->child_count); -        if (!fstat_children) { -                afr_sh_data_fail (frame, this); -                goto out; -        } -        call_count = sh->success_count; -        local->call_count = call_count; - -        memset (sh->buf, 0, sizeof (*sh->buf) * priv->child_count); -        afr_reset_children (sh->success_children, priv->child_count); -        sh->success_count = 0; -        for (i = 0; i < priv->child_count; i++) { -                child = fstat_children[i]; -                if (child == -1) -                        break; -                STACK_WIND_COOKIE (frame, 
afr_sh_data_fstat_cbk, -                                   (void *) (long) child, -                                   priv->children[child], -                                   priv->children[child]->fops->fstat, -                                   sh->healing_fd, NULL); -                --call_count; -        } -        GF_ASSERT (!call_count); -out: -        GF_FREE (fstat_children); -        return 0; -} +	larger_sinks = alloca0 (priv->child_count); +	for (i = 0; i < priv->child_count; i++) { +		if (healed_sinks[i] && replies[i].poststat.ia_size > size) +			larger_sinks[i] = 1; +	} -void -afr_sh_common_fxattrop_resp_handler (call_frame_t *frame, void *cookie, -                                     xlator_t *this, int32_t op_ret, -                                     int32_t op_errno, dict_t *xattr) -{ -        afr_private_t   *priv  = NULL; -        afr_local_t     *local = NULL; -        afr_self_heal_t *sh = NULL; -        int child_index = (long) cookie; - -        local = frame->local; -        sh = &local->self_heal; -        priv = this->private; - -        LOCK (&frame->lock); -        { -                if (op_ret != -1) { -                        gf_log (this->name, GF_LOG_TRACE, -                                "fxattrop of %s on %s succeeded", -                                local->loc.path, -                                priv->children[child_index]->name); - -                        sh->xattr[child_index] = dict_ref (xattr); -                        sh->success_children[sh->success_count] = child_index; -                        sh->success_count++; -                } else { -                        gf_log (this->name, GF_LOG_ERROR, "fxattrop of %s " -                                "failed on %s, reason %s", local->loc.path, -                                priv->children[child_index]->name, -                                strerror (op_errno)); -                        sh->child_errno[child_index] = op_errno; -                } -        } -        
UNLOCK (&frame->lock); -} +	AFR_ONLIST (larger_sinks, frame, attr_cbk, ftruncate, fd, size, NULL); -int -afr_sh_data_fxattrop_cbk (call_frame_t *frame, void *cookie, -                          xlator_t *this, int32_t op_ret, int32_t op_errno, -                          dict_t *xattr, dict_t *xdata) -{ -        int             call_count  = -1; -        afr_local_t     *local  = NULL; -        afr_self_heal_t *sh     = NULL; - -        local = frame->local; -        sh    = &local->self_heal; - -        afr_sh_common_fxattrop_resp_handler (frame, cookie, this, op_ret, -                                             op_errno, xattr); - -        call_count = afr_frame_return (frame); -        if (call_count == 0) { -                if (!afr_sh_data_proceed (sh->success_count)) { -                        gf_log (this->name, GF_LOG_ERROR, "%s, inspecting " -                                "change log succeeded on < %d children", -                                local->loc.path, AFR_SH_MIN_PARTICIPANTS); -                        afr_sh_data_fail (frame, this); -                        goto out; -                } -                afr_sh_data_fstat (frame, this); -        } -out: -        return 0; +	for (i = 0; i < priv->child_count; i++) +		if (healed_sinks[i] && local->replies[i].op_ret == -1) +			/* truncate() failed. Do NOT consider this server +			   as successfully healed. Mark it so. 
+			*/ +			healed_sinks[i] = 0; +	return 0;  } - -int -afr_sh_data_fxattrop (call_frame_t *frame, xlator_t *this) -{ -        afr_self_heal_t *sh    = NULL; -        afr_local_t     *local = NULL; -        afr_private_t   *priv  = NULL; -        dict_t          **xattr_req; -        int32_t         *zero_pending = NULL; -        int call_count = 0; -        int i = 0; -        int ret = 0; -	int j; - -        priv  = this->private; -        local = frame->local; -        sh    = &local->self_heal; - -        call_count = afr_up_children_count (local->child_up, -                                            priv->child_count); - -        local->call_count = call_count; - -	xattr_req = GF_CALLOC(priv->child_count, sizeof(struct dict_t *), -			      gf_afr_mt_dict_t); -	if (!xattr_req) -		goto out; +/* + * If by chance there are multiple sources with differing sizes, select + * the largest file as the source. + * + * This can only happen if data was directly modified in the backend. + */ +static int +__afr_selfheal_data_finalize_source (xlator_t *this, unsigned char *sources, +				     unsigned char *sinks, +				     unsigned char *healed_sinks, +				     unsigned char *locked_on, +				     struct afr_reply *replies) +{ +	int i = 0; +	afr_private_t *priv = NULL; +	uint64_t size = 0; +	int source = -1; +	int locked_count = 0; +	int sources_count = 0; +	int healed_sinks_count = 0; + +	priv = this->private; + +	locked_count = AFR_COUNT (locked_on, priv->child_count); +	sources_count = AFR_COUNT (sources, priv->child_count); +	healed_sinks_count = AFR_COUNT (healed_sinks, priv->child_count); + +	if (locked_count == healed_sinks_count || !sources_count) { +		/* split brain */ +		return -EIO; +	}  	for (i = 0; i < priv->child_count; i++) { -		xattr_req[i] = dict_new(); -		if (!xattr_req[i]) { -			ret = -1; -			goto out; +		if (!sources[i]) +			continue; +		if (size <= replies[i].poststat.ia_size) { +			size = replies[i].poststat.ia_size; +			source = i;  		}  	}  	for (i = 0; 
i < priv->child_count; i++) { -		for (j = 0; j < priv->child_count; j++) { -			zero_pending = GF_CALLOC (3, sizeof (*zero_pending), -						  gf_afr_mt_int32_t); -			if (!zero_pending) { -				ret = -1; -				goto out; -			} -			ret = dict_set_dynptr (xattr_req[i], priv->pending_key[j], -					       zero_pending, -					       3 * sizeof (*zero_pending)); -			if (ret < 0) { -				gf_log (this->name, GF_LOG_WARNING, -					"Unable to set dict value"); -				goto out; -			} else { -				zero_pending = NULL; -			} +		if (!sources[i]) +			continue; +		if (replies[i].poststat.ia_size < size) { +			sources[i] = 0; +			sinks[i] = 1;  		}  	} -        afr_reset_xattr (sh->xattr, priv->child_count); -        afr_reset_children (sh->success_children, priv->child_count); -        memset (sh->child_errno, 0, -                sizeof (*sh->child_errno) * priv->child_count); -        sh->success_count = 0; -        for (i = 0; i < priv->child_count; i++) { -                if (local->child_up[i]) { -                        STACK_WIND_COOKIE (frame, afr_sh_data_fxattrop_cbk, -                                           (void *) (long) i, -                                           priv->children[i], -                                           priv->children[i]->fops->fxattrop, -                                           sh->healing_fd, GF_XATTROP_ADD_ARRAY, -                                           xattr_req[i], NULL); - -                        if (!--call_count) -                                break; -                } -        } - -out: -	if (xattr_req) { -		for (i = 0; i < priv->child_count; i++) -			if (xattr_req[i]) -				dict_unref(xattr_req[i]); -		GF_FREE(xattr_req); -	} - -        if (ret) { -                GF_FREE (zero_pending); -                afr_sh_data_fail (frame, this); -        } - -        return 0; +	return source;  } -int -afr_sh_data_big_lock_success (call_frame_t *frame, xlator_t *this) +/* + * __afr_selfheal_data_prepare: + * + * This function inspects the 
on-disk xattrs and determines which subvols + * are sources and sinks. + * + * The return value is the index of the subvolume to be used as the source + * for self-healing, or -1 if no healing is necessary/split brain. + */ +static int +__afr_selfheal_data_prepare (call_frame_t *frame, xlator_t *this, fd_t *fd, +			     unsigned char *locked_on, unsigned char *sources, +			     unsigned char *sinks, unsigned char *healed_sinks, +			     struct afr_reply *replies)  { -        afr_local_t   *local = NULL; -        afr_self_heal_t *sh = NULL; +	int ret = -1; +	int source = -1; +	afr_private_t *priv = NULL; +	int i = 0; -        local = frame->local; -        sh = &local->self_heal; +	priv = this->private; -        sh->data_lock_held = _gf_true; -        afr_sh_data_fxattrop (frame, this); -        return 0; -} +	ret = afr_selfheal_unlocked_discover (frame, fd->inode, fd->inode->gfid, +					      replies); +	if (ret) +		return ret; -int -afr_sh_dom_lock_success (call_frame_t *frame, xlator_t *this) -{ -        afr_local_t   *local = NULL; -        afr_self_heal_t *sh = NULL; +	ret = afr_selfheal_find_direction (frame, this, replies, +					   AFR_DATA_TRANSACTION, +					   locked_on, sources, sinks); +	if (ret) +		return ret; -        local = frame->local; -        sh = &local->self_heal; +	source = __afr_selfheal_data_finalize_source (this, sources, sinks, +						      healed_sinks, locked_on, +						      replies); +	if (source < 0) +		return -EIO; -        sh->sh_dom_lock_held = _gf_true; -        afr_sh_data_lock (frame, this, 0, 0, _gf_true, this->name, -                          afr_sh_data_big_lock_success, -                          afr_sh_data_fail); -        return 0; -} +	for (i = 0; i < priv->child_count; i++) +		/* Initialize the healed_sinks[] array optimistically to +		   the intersection of to-be-healed (i.e sinks[]) and +		   the list of servers which are up (i.e locked_on[]). 
-int -afr_sh_data_post_blocking_inodelk_cbk (call_frame_t *frame, xlator_t *this) -{ -        afr_internal_lock_t *int_lock = NULL; -        afr_local_t         *local    = NULL; -        afr_self_heal_t     *sh       = NULL; +		   As we encounter failures in the healing process, we +		   will unmark the respective servers in the healed_sinks[] +		   array. +		*/ +		healed_sinks[i] = sinks[i] && locked_on[i]; -        local    = frame->local; -        int_lock = &local->internal_lock; -        sh       = &local->self_heal; +	return source; +} -        if (int_lock->lock_op_ret < 0) { -                gf_log (this->name, GF_LOG_ERROR, "Blocking data inodelks " -                        "failed for %s. by %s", -                        local->loc.path, lkowner_utoa (&frame->root->lk_owner)); -                sh->data_lock_failure_handler (frame, this); -        } else { +static int +__afr_selfheal_data (call_frame_t *frame, xlator_t *this, fd_t *fd, +		     unsigned char *locked_on) +{ +	afr_private_t *priv = NULL; +	int ret = -1; +	unsigned char *sources = NULL; +	unsigned char *sinks = NULL; +	unsigned char *data_lock = NULL; +	unsigned char *healed_sinks = NULL; +	struct afr_reply *locked_replies = NULL; +	int source = -1; +	gf_boolean_t compat = _gf_false; +	unsigned char *compat_lock = NULL; + +	priv = this->private; + +	sources = alloca0 (priv->child_count); +	sinks = alloca0 (priv->child_count); +	healed_sinks = alloca0 (priv->child_count); +	data_lock = alloca0 (priv->child_count); +	compat_lock = alloca0 (priv->child_count); + +	locked_replies = alloca0 (sizeof (*locked_replies) * priv->child_count); + +	ret = afr_selfheal_inodelk (frame, this, fd->inode, this->name, 0, 0, +				    data_lock); +	{ +		if (ret < 2) { +			ret = -ENOTCONN; +			goto unlock; +		} -                gf_log (this->name, GF_LOG_DEBUG, "Blocking data inodelks " -                        "done for %s by %s. 
Proceding to self-heal", -                        local->loc.path, lkowner_utoa (&frame->root->lk_owner)); +		ret = __afr_selfheal_data_prepare (frame, this, fd, data_lock, +						   sources, sinks, healed_sinks, +						   locked_replies); +		if (ret < 0) +			goto unlock; + +		source = ret; + +		ret = __afr_selfheal_truncate_sinks (frame, this, fd, healed_sinks, +						     locked_replies, +						     locked_replies[source].poststat.ia_size); +		if (ret < 0) +			goto unlock; + +		ret = 0; + +		/* Locking from (LLONG_MAX - 2) to (LLONG_MAX - 1) is for +		   compatibility with older self-heal clients which do not +		   hold a lock in the @priv->sh_domain domain to guard +		   against concurrent ongoing self-heals +		*/ +		afr_selfheal_inodelk (frame, this, fd->inode, this->name, +				      LLONG_MAX - 2, 1, compat_lock); +		compat = _gf_true; +	} +unlock: +	afr_selfheal_uninodelk (frame, this, fd->inode, this->name, 0, 0, +				data_lock); +	if (ret < 0) +		goto out; -                sh->data_lock_success_handler (frame, this); -        } +	ret = afr_selfheal_data_do (frame, this, fd, source, healed_sinks, +				    locked_replies); +	if (ret) +		goto out; -        return 0; +	ret = afr_selfheal_undo_pending (frame, this, fd->inode, sources, sinks, +					 healed_sinks, AFR_DATA_TRANSACTION, +					 locked_replies, data_lock); +out: +	if (compat) +		afr_selfheal_uninodelk (frame, this, fd->inode, this->name, +					LLONG_MAX - 2, 1, compat_lock); +	return ret;  } -int -afr_sh_data_post_nonblocking_inodelk_cbk (call_frame_t *frame, xlator_t *this) -{ -        afr_internal_lock_t *int_lock = NULL; -        afr_local_t         *local    = NULL; -        afr_self_heal_t     *sh       = NULL; - -        local    = frame->local; -        int_lock = &local->internal_lock; -        sh       = &local->self_heal; - -        if (int_lock->lock_op_ret < 0) { -                gf_log (this->name, GF_LOG_DEBUG, "Non Blocking data inodelks " -                        "failed for %s. 
by %s", -                        local->loc.path, lkowner_utoa (&frame->root->lk_owner)); - -		if (!sh->data_lock_block) { -			sh->data_lock_failure_handler(frame, this); -		} else { -			int_lock->lock_cbk = -				afr_sh_data_post_blocking_inodelk_cbk; -			afr_blocking_lock (frame, this); -		} -        } else { - -                gf_log (this->name, GF_LOG_DEBUG, "Non Blocking data inodelks " -                        "done for %s by %s. Proceeding to self-heal", -                        local->loc.path, lkowner_utoa (&frame->root->lk_owner)); -                sh->data_lock_success_handler (frame, this); -        } - -        return 0; -} -int -afr_sh_data_lock_rec (call_frame_t *frame, xlator_t *this, char *dom, -                      off_t start, off_t len) +static fd_t * +afr_selfheal_data_open (xlator_t *this, inode_t *inode)  { -        afr_internal_lock_t *int_lock = NULL; -        afr_inodelk_t       *inodelk  = NULL; -        afr_local_t         *local    = NULL; - -        local    = frame->local; -        int_lock = &local->internal_lock; - -        int_lock->transaction_lk_type = AFR_SELFHEAL_LK; -        int_lock->selfheal_lk_type    = AFR_DATA_SELF_HEAL_LK; +	loc_t loc = {0,}; +	int ret = 0; +	fd_t *fd = NULL; -        afr_set_lock_number (frame, this); +	fd = fd_create (inode, 0); +	if (!fd) +		return NULL; -        int_lock->lock_cbk         = afr_sh_data_post_nonblocking_inodelk_cbk; +	loc.inode = inode_ref (inode); +	uuid_copy (loc.gfid, inode->gfid); -        int_lock->domain = dom; -        inodelk = afr_get_inodelk (int_lock, int_lock->domain); -        inodelk->flock.l_start = start; -        inodelk->flock.l_len   = len; -        inodelk->flock.l_type  = F_WRLCK; - -        afr_nonblocking_inodelk (frame, this); - -        return 0; -} - -int -afr_post_sh_big_lock_success (call_frame_t *frame, xlator_t *this) -{ -        afr_local_t     *local     = NULL; -        afr_self_heal_t *sh        = NULL; - -        local = frame->local; -        sh = 
&local->self_heal; - -        GF_ASSERT (sh->old_loop_frame); -        sh_loop_finish (sh->old_loop_frame, this); -        sh->old_loop_frame = NULL; -        sh->data_lock_held = _gf_true; -        sh->sync_done = _gf_true; -        afr_sh_data_fxattrop (frame, this); -        return 0; -} - -int -afr_post_sh_big_lock_failure (call_frame_t *frame, xlator_t *this) -{ -        afr_local_t     *local     = NULL; -        afr_self_heal_t *sh        = NULL; +	ret = syncop_open (this, &loc, O_RDWR|O_LARGEFILE, fd); +	if (ret) { +		fd_unref (fd); +		fd = NULL; +	} else { +		fd_bind (fd); +	} -        local = frame->local; -        sh = &local->self_heal; +	loc_wipe (&loc); -        GF_ASSERT (sh->old_loop_frame); -        sh_loop_finish (sh->old_loop_frame, this); -        sh->old_loop_frame = NULL; -        afr_sh_set_timestamps (frame, this); -        return 0; +	return fd;  }  int -afr_sh_data_lock (call_frame_t *frame, xlator_t *this, -                  off_t start, off_t len, gf_boolean_t block, -                  char *dom, afr_lock_cbk_t success_handler, -                  afr_lock_cbk_t failure_handler) +afr_selfheal_data (call_frame_t *frame, xlator_t *this, inode_t *inode)  { -        afr_local_t *   local = NULL; -        afr_self_heal_t * sh  = NULL; - -        local = frame->local; -        sh    = &local->self_heal; - -        sh->data_lock_success_handler = success_handler; -        sh->data_lock_failure_handler = failure_handler; -	sh->data_lock_block = block; -        return afr_sh_data_lock_rec (frame, this, dom, start, len); -} +	afr_private_t *priv = NULL; +	unsigned char *locked_on = NULL; +	int ret = 0; +	fd_t *fd = NULL; -int -afr_sh_data_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, -                      int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) -{ -        afr_local_t     *local = NULL; -        afr_self_heal_t *sh = NULL; -        afr_private_t   *priv = NULL; -        int              call_count = 0; -        
int              child_index = 0; - -        local = frame->local; -        sh = &local->self_heal; -        priv = this->private; - -        child_index = (long) cookie; - -        /* TODO: some of the open's might fail. -           In that case, modify cleanup fn to send flush on those -           fd's which are already open */ - -        LOCK (&frame->lock); -        { -                if (op_ret == -1) { -                        gf_log (this->name, GF_LOG_ERROR, -                                "open of %s failed on child %s (%s)", -                                local->loc.path, -                                priv->children[child_index]->name, -                                strerror (op_errno)); -                        afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); -                } else { -                        gf_log (this->name, GF_LOG_TRACE, -                                "open of %s succeeded on child %s", -                                local->loc.path, -                                priv->children[child_index]->name); -                } -        } -        UNLOCK (&frame->lock); - -        call_count = afr_frame_return (frame); - -        if (call_count == 0) { -                if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { -                        afr_sh_data_fail (frame, this); -                        return 0; -                } - -                gf_log (this->name, GF_LOG_TRACE, -                        "fd for %s opened, commencing sync", -                        local->loc.path); - -                afr_sh_data_lock (frame, this, 0, 0, _gf_true, priv->sh_domain, -                                  afr_sh_dom_lock_success, afr_sh_data_fail); -        } - -        return 0; -} +	priv = this->private; +	fd = afr_selfheal_data_open (this, inode); +	if (!fd) +		return -EIO; -int -afr_sh_data_open (call_frame_t *frame, xlator_t *this) -{ -        int i = 0; -        int call_count = 0; -        fd_t *fd = NULL; -        
afr_local_t *   local = NULL; -        afr_private_t * priv  = NULL; -        afr_self_heal_t *sh = NULL; - -        local = frame->local; -        sh = &local->self_heal; -        priv = this->private; - -        call_count = afr_up_children_count (local->child_up, priv->child_count); -        local->call_count = call_count; - -        fd = fd_create (local->loc.inode, frame->root->pid); -        sh->healing_fd = fd; - -        /* open sinks */ -        for (i = 0; i < priv->child_count; i++) { -                if(!local->child_up[i]) -                        continue; - -                STACK_WIND_COOKIE (frame, afr_sh_data_open_cbk, -                                   (void *) (long) i, -                                   priv->children[i], -                                   priv->children[i]->fops->open, -                                   &local->loc, -                                   O_RDWR|O_LARGEFILE, fd, NULL); - -                if (!--call_count) -                        break; -        } - -        return 0; -} +	locked_on = alloca0 (priv->child_count); -void -afr_sh_non_reg_fix (call_frame_t *frame, xlator_t *this, -                    int32_t op_ret, int32_t op_errno) -{ -        afr_private_t   *priv = NULL; -        afr_self_heal_t *sh = NULL; -        afr_local_t     *local = NULL; -        int             i = 0; - -        if (op_ret < 0) { -                afr_sh_data_fail (frame, this); -                return; -        } - -        local = frame->local; -        sh = &local->self_heal; -        priv = this->private; - -        for (i = 0; i < priv->child_count ; i++) { -                if (1 == local->child_up[i]) -                        sh->success[i] = 1; -        } - -        afr_sh_erase_pending (frame, this, AFR_DATA_TRANSACTION, -                              afr_sh_data_erase_pending_cbk, -                              afr_sh_data_finish); -} +	ret = afr_selfheal_tryinodelk (frame, this, inode, priv->sh_domain, 0, 0, +				       
locked_on); +	{ +		if (ret < 2) { +			/* Either less than two subvols available, or another +			   selfheal (from another server) is in progress. Skip +			   for now in any case there isn't anything to do. +			*/ +			ret = -ENOTCONN; +			goto unlock; +		} -int -afr_sh_non_reg_lock_success (call_frame_t *frame, xlator_t *this) -{ -        afr_local_t   *local = NULL; -        afr_self_heal_t *sh = NULL; - -        local = frame->local; -        sh = &local->self_heal; -        sh->data_lock_held = _gf_true; -        afr_sh_common_lookup (frame, this, &local->loc, -                              afr_sh_non_reg_fix, NULL, -                              AFR_LOOKUP_FAIL_CONFLICTS | -                              AFR_LOOKUP_FAIL_MISSING_GFIDS, -                              NULL); -        return 0; -} +		ret = __afr_selfheal_data (frame, this, fd, locked_on); +	} +unlock: +	afr_selfheal_uninodelk (frame, this, inode, priv->sh_domain, 0, 0, locked_on); -gf_boolean_t -afr_can_start_data_self_heal (afr_self_heal_t *sh, afr_private_t *priv) -{ -        if (sh->force_confirm_spb) -                return _gf_true; -        if (sh->do_data_self_heal && -            afr_data_self_heal_enabled (priv->data_self_heal)) -                return _gf_true; -        return _gf_false; -} +	if (fd) +		fd_unref (fd); -int -afr_self_heal_data (call_frame_t *frame, xlator_t *this) -{ -        afr_local_t     *local = NULL; -        afr_self_heal_t *sh    = NULL; -        afr_private_t   *priv  = this->private; -        int             ret    = -1; - -        local = frame->local; -        sh = &local->self_heal; - -        sh->sh_type_in_action = AFR_SELF_HEAL_DATA; - -        if (afr_can_start_data_self_heal (sh, priv)) { -                afr_set_self_heal_status (sh, AFR_SELF_HEAL_STARTED); -                ret = afr_inodelk_init (&local->internal_lock.inodelk[1], -                                        priv->sh_domain, priv->child_count); -                if (ret < 0) { -                 
       afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); -                        afr_sh_data_done (frame, this); -                        return 0; -                } - -                if (IA_ISREG (sh->type)) { -                        afr_sh_data_open (frame, this); -                } else { -                        afr_sh_data_lock (frame, this, 0, 0, _gf_true, -                                          this->name, -                                          afr_sh_non_reg_lock_success, -                                          afr_sh_data_fail); -                } -        } else { -                gf_log (this->name, GF_LOG_TRACE, -                        "not doing data self heal on %s", -                        local->loc.path); -                afr_sh_data_done (frame, this); -        } - -        return 0; +	return ret;  } diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c index 00f1a9cb91b..9605d69f417 100644 --- a/xlators/cluster/afr/src/afr-self-heal-entry.c +++ b/xlators/cluster/afr/src/afr-self-heal-entry.c @@ -1,5 +1,5 @@  /* -  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> +  Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>    This file is part of GlusterFS.    This file is licensed to you under your choice of the GNU Lesser @@ -8,2399 +8,624 @@    cases as published by the Free Software Foundation.  
*/ -#include <libgen.h> -#include <unistd.h> -#include <fnmatch.h> -#include <sys/time.h> -#include <stdlib.h> -#include <signal.h>  #ifndef _CONFIG_H  #define _CONFIG_H  #include "config.h"  #endif -#include "glusterfs.h" -#include "inode.h"  #include "afr.h" -#include "dict.h" -#include "xlator.h" -#include "hashfn.h" -#include "logging.h" -#include "stack.h" -#include "list.h" -#include "call-stub.h" -#include "defaults.h" -#include "common-utils.h" -#include "compat-errno.h" -#include "compat.h" +#include "afr-self-heal.h"  #include "byte-order.h" -  #include "afr-transaction.h" -#include "afr-self-heal.h" -#include "afr-self-heal-common.h" - -#define AFR_INIT_SH_FRAME_VALS(_frame, _local, _sh, _sh_frame, _sh_local, _sh_sh)\ -        do {\ -                _local = _frame->local;\ -                _sh = &_local->self_heal;\ -                _sh_frame = _sh->sh_frame;\ -                _sh_local = _sh_frame->local;\ -                _sh_sh    = &_sh_local->self_heal;\ -        } while (0); - -int -afr_sh_entry_impunge_create_file (call_frame_t *impunge_frame, xlator_t *this, -                                  int child_index); -int -afr_sh_entry_done (call_frame_t *frame, xlator_t *this) -{ -        afr_local_t     *local = NULL; -        afr_self_heal_t *sh = NULL; - -        local = frame->local; -        sh = &local->self_heal; - -        sh->completion_cbk (frame, this); - -        return 0; -} - - -int -afr_sh_entry_unlock (call_frame_t *frame, xlator_t *this) -{ -        afr_local_t         *local    = NULL; -        afr_internal_lock_t *int_lock = NULL; - -        local    = frame->local; -        int_lock = &local->internal_lock; - -        int_lock->lock_cbk = afr_sh_entry_done; -        afr_unlock (frame, this); - -        return 0; -} - - -int -afr_sh_entry_finish (call_frame_t *frame, xlator_t *this) -{ -        afr_local_t   *local = NULL; - -        local = frame->local; - -        gf_log (this->name, GF_LOG_TRACE, -                "finishing entry 
selfheal of %s", local->loc.path); - -        afr_sh_entry_unlock (frame, this); - -        return 0; -} - - -int -afr_sh_entry_erase_pending_cbk (call_frame_t *frame, void *cookie, -                                xlator_t *this, int32_t op_ret, -                                int32_t op_errno, dict_t *xattr, dict_t *xdata) -{ -        long                 i          = 0; -        int                  call_count = 0; -        afr_local_t         *local      = NULL; -        afr_self_heal_t     *sh         = NULL; -        afr_local_t         *orig_local = NULL; -        call_frame_t        *orig_frame = NULL; -        afr_private_t       *priv       = NULL; -        int32_t             read_child  = -1; - -        local = frame->local; -        priv  = this->private; -        sh = &local->self_heal; -        i = (long)cookie; - - -        afr_children_add_child (sh->fresh_children, i, priv->child_count); -        if (op_ret == -1) { -                gf_log (this->name, GF_LOG_INFO, -                        "%s: failed to erase pending xattrs on %s (%s)", -                        local->loc.path, priv->children[i]->name, -                        strerror (op_errno)); -        } - -        call_count = afr_frame_return (frame); - -        if (call_count == 0) { -                if (sh->source == -1) { -                        //this happens if the forced merge option is set -                        read_child = sh->fresh_children[0]; -                } else { -                        read_child = sh->source; -                } -                afr_inode_set_read_ctx (this, sh->inode, read_child, -                                        sh->fresh_children); -                orig_frame = sh->orig_frame; -                orig_local = orig_frame->local; - -                if (sh->source != -1) { -                        orig_local->cont.lookup.buf.ia_nlink = sh->buf[sh->source].ia_nlink; -                } - -                afr_sh_entry_finish (frame, this); -        
} - -        return 0; -} - - -int -afr_sh_entry_erase_pending (call_frame_t *frame, xlator_t *this) -{ -        afr_local_t     *local = NULL; -        afr_self_heal_t *sh = NULL; - -        local = frame->local; -        sh = &local->self_heal; - -        if (sh->entries_skipped) { -                afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); -                goto out; -        } -        afr_sh_erase_pending (frame, this, AFR_ENTRY_TRANSACTION, -                              afr_sh_entry_erase_pending_cbk, -                              afr_sh_entry_finish); -        return 0; -out: -        afr_sh_entry_finish (frame, this); -        return 0; -} - - - -static int -next_active_source (call_frame_t *frame, xlator_t *this, -                    int current_active_source) -{ -        afr_private_t   *priv = NULL; -        afr_local_t     *local  = NULL; -        afr_self_heal_t *sh  = NULL; -        int              source = -1; -        int              next_active_source = -1; -        int              i = 0; - -        priv = this->private; -        local = frame->local; -        sh = &local->self_heal; - -        source = sh->source; - -        if (source != -1) { -                if (current_active_source != source) -                        next_active_source = source; -                goto out; -        } - -        /* -          the next active sink becomes the source for the -          'conservative decision' of merging all entries -        */ - -        for (i = 0; i < priv->child_count; i++) { -                if ((sh->sources[i] == 0) -                    && (local->child_up[i] == 1) -                    && (i > current_active_source)) { - -                        next_active_source = i; -                        break; -                } -        } -out: -        return next_active_source; -} -  static int -next_active_sink (call_frame_t *frame, xlator_t *this, -                  int current_active_sink) -{ -        afr_private_t   *priv = NULL; 
-        afr_local_t     *local  = NULL; -        afr_self_heal_t *sh  = NULL; -        int              next_active_sink = -1; -        int              i = 0; - -        priv = this->private; -        local = frame->local; -        sh = &local->self_heal; - -        /* -          the next active sink becomes the source for the -          'conservative decision' of merging all entries -        */ - -        for (i = 0; i < priv->child_count; i++) { -                if ((sh->sources[i] == 0) -                    && (local->child_up[i] == 1) -                    && (i > current_active_sink)) { - -                        next_active_sink = i; -                        break; -                } -        } - -        return next_active_sink; -} - -int -afr_sh_entry_impunge_all (call_frame_t *frame, xlator_t *this); - -int -afr_sh_entry_impunge_subvol (call_frame_t *frame, xlator_t *this); +afr_selfheal_entry_delete (call_frame_t *frame, xlator_t *this, inode_t *dir, +			   const char *name, inode_t *inode, int child, +			   struct afr_reply *replies) +{ +	afr_private_t *priv = NULL; +	xlator_t *subvol = NULL; +	int ret = 0; +	loc_t loc = {0, }; +	char g[64]; + +	priv = this->private; + +	subvol = priv->children[child]; + +	loc.parent = inode_ref (dir); +	uuid_copy (loc.pargfid, dir->gfid); +	loc.name = name; +	loc.inode = inode_ref (inode); + +	if (replies[child].valid && replies[child].op_ret == 0) { +		switch (replies[child].poststat.ia_type) { +		case IA_IFDIR: +			gf_log (this->name, GF_LOG_WARNING, +				"expunging dir %s/%s (%s) on %s", +				uuid_utoa (dir->gfid), name, +				uuid_utoa_r (replies[child].poststat.ia_gfid, g), +				subvol->name); +			ret = syncop_rmdir (subvol, &loc, 1); +			break; +		default: +			gf_log (this->name, GF_LOG_WARNING, +				"expunging file %s/%s (%s) on %s", +				uuid_utoa (dir->gfid), name, +				uuid_utoa_r (replies[child].poststat.ia_gfid, g), +				subvol->name); +			ret = syncop_unlink (subvol, &loc); +			break; +		} +	} + +	loc_wipe 
(&loc); + +	return ret; +} + + +int +afr_selfheal_recreate_entry (call_frame_t *frame, xlator_t *this, int dst, +			     int source, inode_t *dir, const char *name, +			     inode_t *inode, struct afr_reply *replies) +{ +	int ret = 0; +	loc_t loc = {0,}; +	loc_t srcloc = {0,}; +	afr_private_t *priv = NULL; +	dict_t *xdata = NULL; +	struct iatt *iatt = NULL; +	char *linkname = NULL; +	mode_t mode = 0; +	struct iatt newent = {0,}; + +	priv = this->private; + +	xdata = dict_new(); +	if (!xdata) +		return -ENOMEM; + +	loc.parent = inode_ref (dir); +	uuid_copy (loc.pargfid, dir->gfid); +	loc.name = name; +	loc.inode = inode_ref (inode); + +	ret = afr_selfheal_entry_delete (frame, this, dir, name, inode, dst, +					 replies); +	if (ret) +		goto out; + +	ret = dict_set_static_bin (xdata, "gfid-req", +				   replies[source].poststat.ia_gfid, 16); +	if (ret) +		goto out; + +	iatt = &replies[source].poststat; + +	srcloc.inode = inode_ref (inode); +	uuid_copy (srcloc.gfid, iatt->ia_gfid); + +	mode = st_mode_from_ia (iatt->ia_prot, iatt->ia_type); + +	switch (iatt->ia_type) { +	case IA_IFDIR: +		ret = syncop_mkdir (priv->children[dst], &loc, mode, xdata, 0); +		break; +	case IA_IFLNK: +		ret = syncop_lookup (priv->children[dst], &srcloc, 0, 0, 0, 0); +		if (ret == 0) { +			ret = syncop_link (priv->children[dst], &srcloc, &loc); +		} else { +			ret = syncop_readlink (priv->children[source], &srcloc, +					       &linkname, 4096); +			if (ret <= 0) +				goto out; +			ret = syncop_symlink (priv->children[dst], &loc, linkname, +					      xdata, NULL); +		} +		break; +	default: +		ret = dict_set_int32 (xdata, GLUSTERFS_INTERNAL_FOP_KEY, 1); +		if (ret) +			goto out; +		ret = syncop_mknod (priv->children[dst], &loc, mode, +				    iatt->ia_rdev, xdata, &newent); +		if (ret == 0 && iatt->ia_size && !newent.ia_size) { +			/* New entry created. 
Mark @dst pending on all sources */ +			ret = 1; +		} +		break; +	} -int -afr_sh_entry_expunge_all (call_frame_t *frame, xlator_t *this); - -int -afr_sh_entry_expunge_subvol (call_frame_t *frame, xlator_t *this, -                             int active_src); - -int -afr_sh_entry_expunge_entry_done (call_frame_t *frame, xlator_t *this, -                                 int active_src, int32_t op_ret, -                                 int32_t op_errno) -{ -        int              call_count = 0; - -        call_count = afr_frame_return (frame); - -        if (call_count == 0) -                afr_sh_entry_expunge_subvol (frame, this, active_src); - -        return 0; -} - -int -afr_sh_entry_expunge_parent_setattr_cbk (call_frame_t *expunge_frame, -                                         void *cookie, xlator_t *this, -                                         int32_t op_ret, int32_t op_errno, -                                         struct iatt *preop, struct iatt *postop, -                                         dict_t *xdata) -{ -        afr_private_t   *priv          = NULL; -        afr_local_t     *expunge_local = NULL; -        afr_self_heal_t *expunge_sh    = NULL; -        call_frame_t    *frame         = NULL; -        int              active_src    = (long) cookie; -        afr_self_heal_t *sh            = NULL; -        afr_local_t     *local         = NULL; - -        priv          = this->private; -        expunge_local = expunge_frame->local; -        expunge_sh    = &expunge_local->self_heal; -        frame         = expunge_sh->sh_frame; -        local         = frame->local; -        sh            = &local->self_heal; - -        if (op_ret != 0) { -                gf_log (this->name, GF_LOG_ERROR, -                        "setattr on parent directory of %s on subvolume %s failed: %s", -                        expunge_local->loc.path, -                        priv->children[active_src]->name, strerror (op_errno)); -        } - -        
AFR_STACK_DESTROY (expunge_frame); -        sh->expunge_done (frame, this, active_src, op_ret, op_errno); - -        return 0; -} - - -int -afr_sh_entry_expunge_remove_cbk (call_frame_t *expunge_frame, void *cookie, -                                 xlator_t *this, -                                 int32_t op_ret, int32_t op_errno, -                                 struct iatt *preparent, -                                 struct iatt *postparent, dict_t *xdata) -{ -        afr_private_t   *priv = NULL; -        afr_local_t     *expunge_local = NULL; -        afr_self_heal_t *expunge_sh = NULL; -        int              active_src = 0; -        int32_t          valid = 0; - -        priv = this->private; -        expunge_local = expunge_frame->local; -        expunge_sh = &expunge_local->self_heal; - -        active_src = (long) cookie; - -        if (op_ret == 0) { -                gf_log (this->name, GF_LOG_DEBUG, -                        "removed %s on %s", -                        expunge_local->loc.path, -                        priv->children[active_src]->name); -        } else { -                gf_log (this->name, GF_LOG_INFO, -                        "removing %s on %s failed (%s)", -                        expunge_local->loc.path, -                        priv->children[active_src]->name, -                        strerror (op_errno)); -        } - -        valid = GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME; - -        STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_parent_setattr_cbk, -                           (void *) (long) active_src, -                           priv->children[active_src], -                           priv->children[active_src]->fops->setattr, -                           &expunge_sh->parent_loc, -                           &expunge_sh->parentbuf, -                           valid, NULL); - -        return 0; -} - - -int -afr_sh_entry_expunge_unlink (call_frame_t *expunge_frame, xlator_t *this, -                             int 
active_src) -{ -        afr_private_t   *priv = NULL; -        afr_local_t     *expunge_local = NULL; - -        priv          = this->private; -        expunge_local = expunge_frame->local; - -        gf_log (this->name, GF_LOG_TRACE, -                "expunging file %s on %s", -                expunge_local->loc.path, priv->children[active_src]->name); - -        STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_remove_cbk, -                           (void *) (long) active_src, -                           priv->children[active_src], -                           priv->children[active_src]->fops->unlink, -                           &expunge_local->loc, 0, NULL); - -        return 0; -} - - - -int -afr_sh_entry_expunge_rmdir (call_frame_t *expunge_frame, xlator_t *this, -                            int active_src) -{ -        afr_private_t   *priv = NULL; -        afr_local_t     *expunge_local = NULL; - -        priv          = this->private; -        expunge_local = expunge_frame->local; - -        gf_log (this->name, GF_LOG_DEBUG, -                "expunging directory %s on %s", -                expunge_local->loc.path, priv->children[active_src]->name); - -        STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_remove_cbk, -                           (void *) (long) active_src, -                           priv->children[active_src], -                           priv->children[active_src]->fops->rmdir, -                           &expunge_local->loc, 1, NULL); - -        return 0; -} - - -int -afr_sh_entry_expunge_remove (call_frame_t *expunge_frame, xlator_t *this, -                             int active_src, struct iatt *buf, -                             struct iatt *parentbuf) -{ -        afr_private_t   *priv = NULL; -        afr_local_t     *expunge_local = NULL; -        afr_self_heal_t *expunge_sh = NULL; -        call_frame_t    *frame = NULL; -        int              type = 0; -        afr_self_heal_t *sh            = NULL; -        
afr_local_t     *local         = NULL; -        loc_t           *loc           = NULL; - -        priv = this->private; -        expunge_local = expunge_frame->local; -        expunge_sh = &expunge_local->self_heal; -        frame = expunge_sh->sh_frame; -        local         = frame->local; -        sh            = &local->self_heal; -        loc           = &expunge_local->loc; - -        type = buf->ia_type; -        if (loc->parent && uuid_is_null (loc->parent->gfid)) -                uuid_copy (loc->pargfid, parentbuf->ia_gfid); - -        switch (type) { -        case IA_IFSOCK: -        case IA_IFREG: -        case IA_IFBLK: -        case IA_IFCHR: -        case IA_IFIFO: -        case IA_IFLNK: -                afr_sh_entry_expunge_unlink (expunge_frame, this, active_src); -                break; -        case IA_IFDIR: -                afr_sh_entry_expunge_rmdir (expunge_frame, this, active_src); -                break; -        default: -                gf_log (this->name, GF_LOG_ERROR, -                        "%s has unknown file type on %s: 0%o", -                        expunge_local->loc.path, -                        priv->children[active_src]->name, type); -                goto out; -                break; -        } - -        return 0; -out: -        AFR_STACK_DESTROY (expunge_frame); -        sh->expunge_done (frame, this, active_src, -1, EINVAL); - -        return 0; -} - - -int -afr_sh_entry_expunge_lookup_cbk (call_frame_t *expunge_frame, void *cookie, -                                 xlator_t *this, -                                 int32_t op_ret, int32_t op_errno, -                                 inode_t *inode, struct iatt *buf, dict_t *x, -                                 struct iatt *postparent) -{ -        afr_private_t   *priv = NULL; -        afr_local_t     *expunge_local = NULL; -        afr_self_heal_t *expunge_sh = NULL; -        call_frame_t    *frame = NULL; -        int              active_src = 0; -        afr_self_heal_t 
*sh            = NULL; -        afr_local_t     *local         = NULL; - -        priv = this->private; -        expunge_local = expunge_frame->local; -        expunge_sh = &expunge_local->self_heal; -        frame = expunge_sh->sh_frame; -        active_src = (long) cookie; -        local         = frame->local; -        sh            = &local->self_heal; - -        if (op_ret == -1) { -                gf_log (this->name, GF_LOG_ERROR, -                        "lookup of %s on %s failed (%s)", -                        expunge_local->loc.path, -                        priv->children[active_src]->name, -                        strerror (op_errno)); -                goto out; -        } - -        afr_sh_entry_expunge_remove (expunge_frame, this, active_src, buf, -                                     postparent); - -        return 0;  out: -        AFR_STACK_DESTROY (expunge_frame); -        sh->expunge_done (frame, this, active_src, op_ret, op_errno); - -        return 0; +	if (xdata) +		dict_unref (xdata); +	loc_wipe (&loc); +	loc_wipe (&srcloc); +	return ret;  } -int -afr_sh_entry_expunge_purge (call_frame_t *expunge_frame, xlator_t *this, -                            int active_src) -{ -        afr_private_t   *priv = NULL; -        afr_local_t     *expunge_local = NULL; - -        priv = this->private; -        expunge_local = expunge_frame->local; - -        gf_log (this->name, GF_LOG_TRACE, -                "looking up %s on %s", -                expunge_local->loc.path, priv->children[active_src]->name); - -        STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_lookup_cbk, -                           (void *) (long) active_src, -                           priv->children[active_src], -                           priv->children[active_src]->fops->lookup, -                           &expunge_local->loc, NULL); - -        return 0; -} - -int -afr_sh_entry_expunge_entry_cbk (call_frame_t *expunge_frame, void *cookie, -                                
xlator_t *this, -                                int32_t op_ret, int32_t op_errno, -                                inode_t *inode, struct iatt *buf, dict_t *x, -                                struct iatt *postparent) -{ -        afr_private_t   *priv = NULL; -        afr_local_t     *expunge_local = NULL; -        afr_self_heal_t *expunge_sh = NULL; -        int              source = 0; -        call_frame_t    *frame = NULL; -        int              active_src = 0; -        int              need_expunge = 0; -        afr_self_heal_t *sh            = NULL; -        afr_local_t     *local         = NULL; - -        priv = this->private; -        expunge_local = expunge_frame->local; -        expunge_sh = &expunge_local->self_heal; -        frame = expunge_sh->sh_frame; -        active_src = expunge_sh->active_source; -        source = (long) cookie; -        local         = frame->local; -        sh            = &local->self_heal; - -        if (op_ret == -1 && op_errno == ENOENT) -                need_expunge = 1; -        else if (op_ret == -1) -                goto out; - -        if (!uuid_is_null (expunge_sh->entrybuf.ia_gfid) && -            !uuid_is_null (buf->ia_gfid) && -            (uuid_compare (expunge_sh->entrybuf.ia_gfid, buf->ia_gfid) != 0)) { -                char uuidbuf1[64]; -                char uuidbuf2[64]; -                gf_log (this->name, GF_LOG_DEBUG, -                        "entry %s found on %s with mismatching gfid (%s/%s)", -                        expunge_local->loc.path, -                        priv->children[source]->name, -                        uuid_utoa_r (expunge_sh->entrybuf.ia_gfid, uuidbuf1), -                        uuid_utoa_r (buf->ia_gfid, uuidbuf2)); -                need_expunge = 1; -        } - -        if (need_expunge) { -                gf_log (this->name, GF_LOG_INFO, -                        "Entry %s is missing on %s and deleting from " -                        "replica's other bricks", -                  
      expunge_local->loc.path, -                        priv->children[source]->name); - -                if (postparent) -                        expunge_sh->parentbuf = *postparent; - -                afr_sh_entry_expunge_purge (expunge_frame, this, active_src); - -                return 0; -        } - -out: -        if (op_ret == 0) { -                gf_log (this->name, GF_LOG_TRACE, -                        "%s exists under %s", -                        expunge_local->loc.path, -                        priv->children[source]->name); -        } else { -                gf_log (this->name, GF_LOG_INFO, -                        "looking up %s under %s failed (%s)", -                        expunge_local->loc.path, -                        priv->children[source]->name, -                        strerror (op_errno)); -        } - -        AFR_STACK_DESTROY (expunge_frame); -        sh->expunge_done (frame, this, active_src, op_ret, op_errno); - -        return 0; -} - -static gf_boolean_t -can_skip_entry_self_heal (char *name, loc_t *parent_loc) -{ -        if (strcmp (name, ".") == 0) { -                return _gf_true; -        } else if (strcmp (name, "..") == 0) { -                return _gf_true; -        } else if (loc_is_root (parent_loc) && -                   (strcmp (name, GF_REPLICATE_TRASH_DIR) == 0)) { -                return _gf_true; -        } -        return _gf_false; -} - -int -afr_sh_entry_expunge_entry (call_frame_t *frame, xlator_t *this, -                            gf_dirent_t *entry) -{ -        afr_private_t   *priv = NULL; -        afr_local_t     *local  = NULL; -        afr_self_heal_t *sh  = NULL; -        int              ret = -1; -        call_frame_t    *expunge_frame = NULL; -        afr_local_t     *expunge_local = NULL; -        afr_self_heal_t *expunge_sh = NULL; -        int              active_src = 0; -        int              source = 0; -        int              op_errno = 0; -        char            *name = NULL; -        
int             op_ret = -1; - -        priv = this->private; -        local = frame->local; -        sh = &local->self_heal; - -        active_src = sh->active_source; -        source = sh->source; -        sh->expunge_done = afr_sh_entry_expunge_entry_done; - -        name = entry->d_name; -        if (can_skip_entry_self_heal (name, &local->loc)) { -                op_ret = 0; -                goto out; -        } - -        gf_log (this->name, GF_LOG_TRACE, -                "inspecting existence of %s under %s", -                name, local->loc.path); - -        expunge_frame = copy_frame (frame); -        if (!expunge_frame) { -                op_errno = ENOMEM; -                goto out; -        } - -        AFR_LOCAL_ALLOC_OR_GOTO (expunge_local, out); - -        expunge_frame->local = expunge_local; -        expunge_sh = &expunge_local->self_heal; -        expunge_sh->sh_frame = frame; -        expunge_sh->active_source = active_src; -        expunge_sh->entrybuf = entry->d_stat; -        loc_copy (&expunge_sh->parent_loc, &local->loc); - -        ret = afr_build_child_loc (this, &expunge_local->loc, &local->loc, -                                   name); -        if (ret != 0) { -                op_errno = EINVAL; -                goto out; -        } - -        gf_log (this->name, GF_LOG_TRACE, -                "looking up %s on %s", expunge_local->loc.path, -                priv->children[source]->name); - -        STACK_WIND_COOKIE (expunge_frame, -                           afr_sh_entry_expunge_entry_cbk, -                           (void *) (long) source, -                           priv->children[source], -                           priv->children[source]->fops->lookup, -                           &expunge_local->loc, NULL); - -        ret = 0; -out: -        if (ret == -1) -                sh->expunge_done (frame, this, active_src, op_ret, op_errno); - -        return 0; -} - - -int -afr_sh_entry_expunge_readdir_cbk (call_frame_t *frame, void 
*cookie, -                                  xlator_t *this, -                                  int32_t op_ret, int32_t op_errno, -                                  gf_dirent_t *entries, dict_t *xdata) -{ -        afr_private_t   *priv = NULL; -        afr_local_t     *local  = NULL; -        afr_self_heal_t *sh  = NULL; -        gf_dirent_t     *entry = NULL; -        off_t            last_offset = 0; -        int              active_src = 0; -        int              entry_count = 0; - -        priv = this->private; -        local = frame->local; -        sh = &local->self_heal; - -        active_src = sh->active_source; - -        if (op_ret <= 0) { -                if (op_ret < 0) { -                        gf_log (this->name, GF_LOG_INFO, -                                "readdir of %s on subvolume %s failed (%s)", -                                local->loc.path, -                                priv->children[active_src]->name, -                                strerror (op_errno)); -                } else { -                        gf_log (this->name, GF_LOG_TRACE, -                                "readdir of %s on subvolume %s complete", -                                local->loc.path, -                                priv->children[active_src]->name); -                } - -                afr_sh_entry_expunge_all (frame, this); -                return 0; -        } - -        list_for_each_entry (entry, &entries->list, list) { -                last_offset = entry->d_off; -                entry_count++; -        } - -        gf_log (this->name, GF_LOG_TRACE, -                "readdir'ed %d entries from %s", -                entry_count, priv->children[active_src]->name); - -        sh->offset = last_offset; -        local->call_count = entry_count; - -        list_for_each_entry (entry, &entries->list, list) { -                afr_sh_entry_expunge_entry (frame, this, entry); -        } - -        return 0; -} - -int -afr_sh_entry_expunge_subvol 
(call_frame_t *frame, xlator_t *this, -                             int active_src) -{ -        afr_private_t   *priv = NULL; -        afr_local_t     *local  = NULL; -        afr_self_heal_t *sh  = NULL; - -        priv = this->private; -        local = frame->local; -        sh = &local->self_heal; - -        STACK_WIND (frame, afr_sh_entry_expunge_readdir_cbk, -                    priv->children[active_src], -                    priv->children[active_src]->fops->readdirp, -                    sh->healing_fd, sh->block_size, sh->offset, NULL); - -        return 0; -} - - -int -afr_sh_entry_expunge_all (call_frame_t *frame, xlator_t *this) +static int +afr_selfheal_newentry_mark (call_frame_t *frame, xlator_t *this, inode_t *inode, +			    int source, struct afr_reply *replies, +			    unsigned char *sources, unsigned char *newentry)  { -        afr_private_t   *priv = NULL; -        afr_local_t     *local  = NULL; -        afr_self_heal_t *sh  = NULL; -        int              active_src = -1; - -        priv = this->private; -        local = frame->local; -        sh = &local->self_heal; +	int ret = 0; +	int i = 0; +	afr_private_t *priv = NULL; +	dict_t *xattr = NULL; +	int **changelog = NULL; +	int idx = 0; -        sh->offset = 0; +	priv = this->private; -        if (sh->source == -1) { -                gf_log (this->name, GF_LOG_DEBUG, -                        "no active sources for %s to expunge entries", -                        local->loc.path); -                goto out; -        } +	idx = afr_index_for_transaction_type (AFR_DATA_TRANSACTION); -        active_src = next_active_sink (frame, this, sh->active_source); -        sh->active_source = active_src; +	uuid_copy (inode->gfid, replies[source].poststat.ia_gfid); -        if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { -                goto out; -        } +	changelog = afr_matrix_create (priv->child_count, AFR_NUM_CHANGE_LOGS); -        if (active_src == -1) { -                /* completed creating 
missing files on all subvolumes */ -                goto out; -        } +	xattr = dict_new(); +	if (!xattr) +		return -ENOMEM; -        gf_log (this->name, GF_LOG_TRACE, -                "expunging entries of %s on %s to other sinks", -                local->loc.path, priv->children[active_src]->name); +	for (i = 0; i < priv->child_count; i++) { +		if (!newentry[i]) +			continue; +		changelog[i][idx] = hton32(1); +	} -        afr_sh_entry_expunge_subvol (frame, this, active_src); +	afr_set_pending_dict (priv, xattr, changelog); -        return 0; -out: -        afr_sh_entry_impunge_all (frame, this); -        return 0; +	for (i = 0; i < priv->child_count; i++) { +		if (!sources[i]) +			continue; +		afr_selfheal_post_op (frame, this, inode, i, xattr); +	} +	dict_unref (xattr); +	return ret;  } -int -afr_sh_entry_impunge_entry_done (call_frame_t *frame, xlator_t *this, -                                 int32_t op_ret, int32_t op_errno) -{ -        int              call_count = 0; -        afr_local_t     *local = NULL; -        afr_self_heal_t *sh = NULL; - -        local = frame->local; -        sh = &local->self_heal; -        if (op_ret < 0) -                sh->entries_skipped = _gf_true; -        call_count = afr_frame_return (frame); -        if (call_count == 0) -                afr_sh_entry_impunge_subvol (frame, this); - -        return 0; +static int +__afr_selfheal_heal_dirent (call_frame_t *frame, xlator_t *this, fd_t *fd, +			    char *name, inode_t *inode, int source, +			    unsigned char *sources, unsigned char *healed_sinks, +			    unsigned char *locked_on, struct afr_reply *replies) +{ +	int ret = 0; +	afr_private_t *priv = NULL; +	int i = 0; +	unsigned char *newentry = NULL; + +	priv = this->private; +	newentry = alloca0 (priv->child_count); + +	if (!replies[source].valid) +		return -EIO; + +	for (i = 0; i < priv->child_count; i++) { +		if (!healed_sinks[i]) +			continue; +		if (replies[source].op_ret == -1 && +		    replies[source].op_errno == 
ENOENT) { +			ret = afr_selfheal_entry_delete (frame, this, fd->inode, +							 name, inode, i, replies); +		} else { +			if (!uuid_compare (replies[i].poststat.ia_gfid, +					   replies[source].poststat.ia_gfid)) +				continue; + +			ret = afr_selfheal_recreate_entry (frame, this, i, source, +							   fd->inode, name, inode, +							   replies); +			if (ret > 0) { +				newentry[i] = 1; +				ret = 0; +			} +		} +		if (ret < 0) +			break; +	} + +	if (AFR_COUNT (newentry, priv->child_count)) +		afr_selfheal_newentry_mark (frame, this, inode, source, replies, +					    sources, newentry); +	return ret;  } -void -afr_sh_entry_call_impunge_done (call_frame_t *impunge_frame, xlator_t *this, -                                int32_t op_ret, int32_t op_errno) -{ -        afr_local_t     *impunge_local = NULL; -        afr_local_t     *local = NULL; -        afr_self_heal_t *sh = NULL; -        afr_self_heal_t *impunge_sh = NULL; -        call_frame_t    *frame = NULL; -        AFR_INIT_SH_FRAME_VALS (impunge_frame, impunge_local, impunge_sh, -                                frame, local, sh); - -        AFR_STACK_DESTROY (impunge_frame); -        sh->impunge_done (frame, this, op_ret, op_errno); -} - -int -afr_sh_entry_impunge_setattr_cbk (call_frame_t *impunge_frame, void *cookie, -                                  xlator_t *this, -                                  int32_t op_ret, int32_t op_errno, -                                  struct iatt *preop, struct iatt *postop, -                                  dict_t *xdata) +static int +__afr_selfheal_merge_dirent (call_frame_t *frame, xlator_t *this, fd_t *fd, +			     char *name, inode_t *inode, unsigned char *sources, +			     unsigned char *healed_sinks, unsigned char *locked_on, +			     struct afr_reply *replies)  { -        int              call_count = 0; -        afr_private_t   *priv = NULL; -        afr_local_t     *impunge_local = NULL; -        int              child_index = 0; - -        priv = this->private; 
-        impunge_local = impunge_frame->local; -        child_index = (long) cookie; - -        if (op_ret == 0) { -                gf_log (this->name, GF_LOG_DEBUG, -                        "setattr done for %s on %s", -                        impunge_local->loc.path, -                        priv->children[child_index]->name); -        } else { -                gf_log (this->name, GF_LOG_INFO, -                        "setattr (%s) on %s failed (%s)", -                        impunge_local->loc.path, -                        priv->children[child_index]->name, -                        strerror (op_errno)); -        } - -        call_count = afr_frame_return (impunge_frame); -        if (call_count == 0) { -                afr_sh_entry_call_impunge_done (impunge_frame, this, -                                                0, op_errno); -        } - -        return 0; -} +	int ret = 0; +	afr_private_t *priv = NULL; +	int i = 0; +	int source = -1; -int -afr_sh_entry_impunge_parent_setattr_cbk (call_frame_t *setattr_frame, -                                         void *cookie, xlator_t *this, -                                         int32_t op_ret, int32_t op_errno, -                                         struct iatt *preop, struct iatt *postop, -                                         dict_t *xdata) -{ -        int             call_count = 0; -        afr_local_t     *setattr_local = NULL; - -        setattr_local = setattr_frame->local; -        if (op_ret != 0) { -                gf_log (this->name, GF_LOG_INFO, -                        "setattr on parent directory (%s) failed: %s", -                        setattr_local->loc.path, strerror (op_errno)); -        } - -        call_count = afr_frame_return (setattr_frame); -        if (call_count == 0) -                AFR_STACK_DESTROY (setattr_frame); -        return 0; -} +	priv = this->private; -int -afr_sh_entry_impunge_setattr (call_frame_t *impunge_frame, xlator_t *this) -{ -        afr_private_t   *priv 
= NULL; -        afr_local_t     *impunge_local = NULL; -        afr_local_t     *setattr_local = NULL; -        afr_self_heal_t *impunge_sh = NULL; -        call_frame_t    *setattr_frame = NULL; -        int32_t          valid = 0; -        int32_t          op_errno = 0; -        int              child_index = 0; -        int              call_count = 0; -        int              i = 0; - -        priv          = this->private; -        impunge_local = impunge_frame->local; -        impunge_sh    = &impunge_local->self_heal; - -        gf_log (this->name, GF_LOG_DEBUG, -                "setting ownership of %s on %s to %d/%d", -                impunge_local->loc.path, -                priv->children[child_index]->name, -                impunge_sh->entrybuf.ia_uid, -                impunge_sh->entrybuf.ia_gid); - -        setattr_frame = copy_frame (impunge_frame); -        if (!setattr_frame) { -                op_errno = ENOMEM; -                goto out; -        } -        AFR_LOCAL_ALLOC_OR_GOTO (setattr_frame->local, out); -        setattr_local = setattr_frame->local; -        call_count = afr_errno_count (NULL, impunge_sh->child_errno, -                                      priv->child_count, 0); -        loc_copy (&setattr_local->loc, &impunge_sh->parent_loc); -        impunge_local->call_count = call_count; -        setattr_local->call_count = call_count; -        for (i = 0; i < priv->child_count; i++) { -                if (impunge_sh->child_errno[i]) -                        continue; -                valid         = GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME; -                STACK_WIND_COOKIE (setattr_frame, -                                   afr_sh_entry_impunge_parent_setattr_cbk, -                                   (void *) (long) i, priv->children[i], -                                   priv->children[i]->fops->setattr, -                                   &setattr_local->loc, -                                   &impunge_sh->parentbuf, valid, NULL); 
- -                valid = GF_SET_ATTR_UID   | GF_SET_ATTR_GID | -                        GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME; -                STACK_WIND_COOKIE (impunge_frame, -                                   afr_sh_entry_impunge_setattr_cbk, -                                   (void *) (long) i, priv->children[i], -                                   priv->children[i]->fops->setattr, -                                   &impunge_local->loc, -                                   &impunge_sh->entrybuf, valid, NULL); -                call_count--; -        } -        GF_ASSERT (!call_count); -        return 0; -out: -        if (setattr_frame) -                AFR_STACK_DESTROY (setattr_frame); -        afr_sh_entry_call_impunge_done (impunge_frame, this, 0, op_errno); -        return 0; -} +	for (i = 0; i < priv->child_count; i++) { +		if (replies[i].valid && replies[i].op_ret == 0) { +			source = i; +			break; +		} +	} -int -afr_sh_entry_impunge_xattrop_cbk (call_frame_t *impunge_frame, void *cookie, -                                  xlator_t *this, -                                  int32_t op_ret, int32_t op_errno, -                                  dict_t *xattr, dict_t *xdata) -{ -        afr_private_t   *priv = NULL; -        afr_local_t     *impunge_local = NULL; -        int              child_index = 0; -        int              call_count = -1; - -        priv          = this->private; -        impunge_local = impunge_frame->local; - -        child_index = (long) cookie; - -        if (op_ret == -1) { -                gf_log (this->name, GF_LOG_INFO, -                        "%s: failed to perform xattrop on %s (%s)", -                        impunge_local->loc.path, -                        priv->children[child_index]->name, strerror (op_errno)); - -                        LOCK (&impunge_frame->lock); -                        { -                                impunge_local->op_ret = -1; -                                impunge_local->op_errno = 
op_errno; -                        } -                        UNLOCK (&impunge_frame->lock); -        } - -        call_count = afr_frame_return (impunge_frame); - -        if (call_count == 0) { -                if (impunge_local->op_ret == 0) { -                        afr_sh_entry_impunge_setattr (impunge_frame, this); -                } else { -                        afr_sh_entry_call_impunge_done (impunge_frame, this, -                                                -1, impunge_local->op_errno); -                } -        } -        return 0; -} +	if (source == -1) { +		/* entry got deleted in the mean time? */ +		return 0; +	} -int -afr_sh_entry_impunge_perform_xattrop (call_frame_t *impunge_frame, -                                      xlator_t *this) -{ -        int              active_src       = 0; -        dict_t          *xattr            = NULL; -        afr_private_t   *priv             = NULL; -        afr_local_t     *impunge_local    = NULL; -        afr_self_heal_t *impunge_sh       = NULL; -        int32_t         op_errno          = 0; -        int32_t         call_count        = 0; -        int32_t         i                 = 0; - - -        priv = this->private; -        impunge_local = impunge_frame->local; -        impunge_sh = &impunge_local->self_heal; -        active_src = impunge_sh->active_source; -        impunge_local->op_ret = 0; - -        afr_prepare_new_entry_pending_matrix (impunge_local->pending, -                                              afr_is_errno_unset, -                                              impunge_sh->child_errno, -                                              &impunge_sh->entrybuf, -                                              priv->child_count); -        xattr = dict_new (); -        if (!xattr) { -                op_errno = ENOMEM; -                goto out; -        } - -        afr_set_pending_dict (priv, xattr, impunge_local->pending, active_src, -                              LOCAL_LAST); - -        
for (i = 0; i < priv->child_count; i++) { -                if ((impunge_sh->child_errno[i] == EEXIST) && -                    (impunge_local->child_up[i] == 1)) - -                        call_count++; -        } - -        impunge_local->call_count  = call_count; - -        for (i = 0; i < priv->child_count; i++) { - -                if ((impunge_sh->child_errno[i] == EEXIST) -                    && (impunge_local->child_up[i] == 1)) { - - -                        STACK_WIND_COOKIE (impunge_frame, -                                           afr_sh_entry_impunge_xattrop_cbk, -                                           (void *) (long) i, -                                           priv->children[i], -                                           priv->children[i]->fops->xattrop, -                                           &impunge_local->loc, -                                           GF_XATTROP_ADD_ARRAY, xattr, NULL); -                        if (!--call_count) -                                break; -                } -        } - -        if (xattr) -                dict_unref (xattr); -        return 0; -out: -        afr_sh_entry_call_impunge_done (impunge_frame, this, -                                        -1, op_errno); -        return 0; -} +	for (i = 0; i < priv->child_count; i++) { +		if (i == source || !healed_sinks[i]) +			continue; -int -afr_sh_entry_impunge_newfile_cbk (call_frame_t *impunge_frame, void *cookie, -                                  xlator_t *this, -                                  int32_t op_ret, int32_t op_errno, -                                  inode_t *inode, struct iatt *stbuf, -                                  struct iatt *preparent, -                                  struct iatt *postparent, dict_t *xdata) -{ -        int              call_count       = 0; -        afr_private_t   *priv             = NULL; -        afr_local_t     *impunge_local    = NULL; -        afr_self_heal_t *impunge_sh       = NULL; -        int         
     child_index      = 0; - -        priv = this->private; -        impunge_local = impunge_frame->local; -        impunge_sh = &impunge_local->self_heal; - -        child_index = (long) cookie; - -        if (op_ret == -1) { -                impunge_sh->child_errno[child_index] = op_errno; -                gf_log (this->name, GF_LOG_ERROR, -                        "creation of %s on %s failed (%s)", -                        impunge_local->loc.path, -                        priv->children[child_index]->name, -                        strerror (op_errno)); -        } else { -                impunge_sh->child_errno[child_index] = 0; -        } - -        call_count = afr_frame_return (impunge_frame); -        if (call_count == 0) { -                if (!afr_errno_count (NULL, impunge_sh->child_errno, -                                      priv->child_count, 0)) { -                        // new_file creation failed every where -                        afr_sh_entry_call_impunge_done (impunge_frame, this, -                                                        -1, op_errno); -                        goto out; -                } -                afr_sh_entry_impunge_perform_xattrop (impunge_frame, this); -        } -out: -        return 0; -} +		if (replies[i].op_errno != ENOENT) +			continue; -int -afr_sh_entry_impunge_hardlink_cbk (call_frame_t *impunge_frame, void *cookie, -                                   xlator_t *this, int32_t op_ret, -                                   int32_t op_errno, inode_t *inode, -                                   struct iatt *buf, struct iatt *preparent, -                                   struct iatt *postparent, dict_t *xdata) -{ -        int              call_count        = 0; -        afr_local_t     *impunge_local = NULL; -        afr_self_heal_t *impunge_sh  = NULL; - -        impunge_local = impunge_frame->local; -        impunge_sh = &impunge_local->self_heal; - -        if (IA_IFLNK == impunge_sh->entrybuf.ia_type) { -         
       //For symlinks impunge is attempted un-conditionally -                //So the file can already exist. -                if ((op_ret < 0) && (op_errno == EEXIST)) -                        op_ret = 0; -        } - -        call_count = afr_frame_return (impunge_frame); -        if (call_count == 0) -                afr_sh_entry_call_impunge_done (impunge_frame, this, -                                                op_ret, op_errno); - -        return 0; -} +		ret = afr_selfheal_recreate_entry (frame, this, i, source, +						   fd->inode, name, inode, +						   replies); +	} -int -afr_sh_entry_impunge_hardlink (call_frame_t *impunge_frame, xlator_t *this, -                               int child_index) -{ -        afr_private_t   *priv          = NULL; -        afr_local_t     *impunge_local = NULL; -        afr_self_heal_t *impunge_sh  = NULL; -        loc_t           *loc           = NULL; -        struct iatt     *buf           = NULL; -        loc_t            oldloc        = {0}; - -        priv = this->private; -        impunge_local = impunge_frame->local; -        impunge_sh = &impunge_local->self_heal; -        loc = &impunge_local->loc; -        buf = &impunge_sh->entrybuf; - -        oldloc.inode = inode_ref (loc->inode); -        uuid_copy (oldloc.gfid, buf->ia_gfid); -        gf_log (this->name, GF_LOG_DEBUG, "linking missing file %s on %s", -                loc->path, priv->children[child_index]->name); - -        STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_hardlink_cbk, -                           (void *) (long) child_index, -                           priv->children[child_index], -                           priv->children[child_index]->fops->link, -                           &oldloc, loc, NULL); -        loc_wipe (&oldloc); - -        return 0; +	return ret;  } -int -afr_sh_nameless_lookup_cbk (call_frame_t *impunge_frame, void *cookie, -                            xlator_t *this, -                            int32_t op_ret, 
int32_t op_errno, inode_t *inode, -                            struct iatt *buf, dict_t *xattr, -                            struct iatt *postparent) -{ -        if (op_ret < 0) { -                 afr_sh_entry_impunge_create_file (impunge_frame, this, -                                                   (long)cookie); -        } else { -                afr_sh_entry_impunge_hardlink (impunge_frame, this, -                                               (long)cookie); -        } -        return 0; -} -int -afr_sh_entry_impunge_check_hardlink (call_frame_t *impunge_frame, -                                     xlator_t *this, -                                     int child_index, struct iatt *stbuf) +static int +__afr_selfheal_entry_dirent (call_frame_t *frame, xlator_t *this, fd_t *fd, +			     char *name, inode_t *inode, int source, +			     unsigned char *sources, unsigned char *healed_sinks, +			     unsigned char *locked_on, struct afr_reply *replies)  { -        afr_private_t   *priv          = NULL; -        call_frame_t    *frame             = NULL; -        afr_local_t     *impunge_local     = NULL; -        afr_local_t     *local             = NULL; -        afr_self_heal_t *impunge_sh        = NULL; -        afr_self_heal_t *sh                = NULL; -        loc_t           *loc           = NULL; -        dict_t          *xattr_req     = NULL; -        loc_t            oldloc        = {0}; -        int              ret           = -1; - -        priv = this->private; -        AFR_INIT_SH_FRAME_VALS (impunge_frame, impunge_local, impunge_sh, -                                frame, local, sh); -        loc = &impunge_local->loc; - -        xattr_req = dict_new (); -        if (!xattr_req) -                goto out; -        oldloc.inode = inode_ref (loc->inode); -        uuid_copy (oldloc.gfid, stbuf->ia_gfid); - -        STACK_WIND_COOKIE (impunge_frame, afr_sh_nameless_lookup_cbk, -                           (void *) (long) child_index, -                     
      priv->children[child_index], -                           priv->children[child_index]->fops->lookup, -                           &oldloc, xattr_req); -        ret = 0; -out: -        if (xattr_req) -                dict_unref (xattr_req); -        loc_wipe (&oldloc); -        if (ret) -                sh->impunge_done (frame, this, -1, ENOMEM); -        return 0; -} +	int ret = -1; -int -afr_sh_entry_impunge_mknod (call_frame_t *impunge_frame, xlator_t *this, -                            int child_index, struct iatt *stbuf) -{ -        afr_private_t *priv          = NULL; -        afr_local_t   *impunge_local = NULL; -        dict_t        *dict          = NULL; -        int            ret           = 0; - -        priv = this->private; -        impunge_local = impunge_frame->local; - -        gf_log (this->name, GF_LOG_DEBUG, -                "creating missing file %s on %s", -                impunge_local->loc.path, -                priv->children[child_index]->name); - -        dict = dict_new (); -        if (!dict) -                gf_log (this->name, GF_LOG_ERROR, "Out of memory"); - -        GF_ASSERT (!uuid_is_null (stbuf->ia_gfid)); -        ret = afr_set_dict_gfid (dict, stbuf->ia_gfid); -        if (ret) -                gf_log (this->name, GF_LOG_INFO, "%s: gfid set failed", -                        impunge_local->loc.path); - -        /* -         * Reason for adding GLUSTERFS_INTERNAL_FOP_KEY : -         * -         * Problem: -         * While a brick is down in a replica pair, lets say the user creates -         * one file(file-A) and a hard link to that file(h-file-A). After the -         * brick comes back up, entry self-heal is attempted on parent dir of -         * these two files. As part of readdir in self-heal it reads both the -         * entries file-A and h-file-A for both of them it does name less lookup -         * to check if there are any hardlinks already present in the -         * destination brick. 
It finds that there are no hard links already -         * present for files file-A, h-file-A. Self-heal does mknods for both -         * file-A and h-file-A. This leads to file-A and h-file-A not being -         * hardlinks anymore. -         * -         * Fix: (More like shrinking of race-window, the race itself is still -         * present in posix-mknod). -         * If mknod comes with the presence of GLUSTERFS_INTERNAL_FOP_KEY then -         * posix_mknod checks if there are already any gfid-links and does -         * link() instead of mknod. There still can be a race where two -         * posix_mknods same gfid see that -         * gfid-link file is not present and proceeds with mknods and result in -         * two different files with same gfid. -         */ -        ret = dict_set_str (dict, GLUSTERFS_INTERNAL_FOP_KEY, "yes"); -        if (ret) -                gf_log (this->name, GF_LOG_INFO, "%s: %s set failed", -                        impunge_local->loc.path, GLUSTERFS_INTERNAL_FOP_KEY); - -        STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk, -                           (void *) (long) child_index, -                           priv->children[child_index], -                           priv->children[child_index]->fops->mknod, -                           &impunge_local->loc, -                           st_mode_from_ia (stbuf->ia_prot, stbuf->ia_type), -                           makedev (ia_major (stbuf->ia_rdev), -                                    ia_minor (stbuf->ia_rdev)), 0, dict); - -        if (dict) -                dict_unref (dict); - -        return 0; +	if (source < 0) +		ret = __afr_selfheal_merge_dirent (frame, this, fd, name, inode, +						   sources, healed_sinks, +						   locked_on, replies); +	else +		ret = __afr_selfheal_heal_dirent (frame, this, fd, name, inode, +						  source, sources, healed_sinks, +						  locked_on, replies); +	return ret;  } - -int -afr_sh_entry_impunge_mkdir (call_frame_t *impunge_frame, 
xlator_t *this, -                            int child_index, struct iatt *stbuf) -{ -        afr_private_t   *priv = NULL; -        afr_local_t     *impunge_local = NULL; -        dict_t          *dict = NULL; - -        int ret = 0; - -        priv = this->private; -        impunge_local = impunge_frame->local; - -        dict = dict_new (); -        if (!dict) { -                gf_log (this->name, GF_LOG_ERROR, -                        "Out of memory"); -                return 0; -        } - -        GF_ASSERT (!uuid_is_null (stbuf->ia_gfid)); -        ret = afr_set_dict_gfid (dict, stbuf->ia_gfid); -        if (ret) -                gf_log (this->name, GF_LOG_INFO, "%s: gfid set failed", -                        impunge_local->loc.path); - -        gf_log (this->name, GF_LOG_DEBUG, -                "creating missing directory %s on %s", -                impunge_local->loc.path, -                priv->children[child_index]->name); - -        STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk, -                           (void *) (long) child_index, -                           priv->children[child_index], -                           priv->children[child_index]->fops->mkdir, -                           &impunge_local->loc, -                           st_mode_from_ia (stbuf->ia_prot, stbuf->ia_type), -                           0, dict); - -        if (dict) -                dict_unref (dict); - -        return 0; +static int +afr_selfheal_entry_dirent (call_frame_t *frame, xlator_t *this, fd_t *fd, +			   int source, unsigned char *sources, +			   unsigned char *healed_sinks, char *name) +{ +	afr_private_t *priv = NULL; +	int ret = 0; +	unsigned char *locked_on = NULL; +	struct afr_reply *replies = NULL; +	inode_t *inode = NULL; + +	priv = this->private; + +	locked_on = alloca0 (priv->child_count); + +	replies = alloca0 (priv->child_count * sizeof(*replies)); + +	ret = afr_selfheal_entrylk (frame, this, fd->inode, this->name, +				    name, 
locked_on); +	{ +		if (ret < 2) { +			ret = -ENOTCONN; +			goto unlock; +		} + +		inode = afr_selfheal_unlocked_lookup_on (frame, fd->inode, name, +							 replies, locked_on); +		if (!inode) { +			ret = -ENOMEM; +			goto unlock; +		} + +		ret = __afr_selfheal_entry_dirent (frame, this, fd, name, inode, +						   source, sources, healed_sinks, +						   locked_on, replies); +	} +unlock: +	afr_selfheal_unentrylk (frame, this, fd->inode, this->name, name, +				locked_on); +	if (inode) +		inode_unref (inode); +	return ret;  } -int -afr_sh_entry_impunge_symlink (call_frame_t *impunge_frame, xlator_t *this, -                              int child_index, const char *linkname) +static int +afr_selfheal_entry_do_subvol (call_frame_t *frame, xlator_t *this, fd_t *fd, +			      int child, int source, unsigned char *sources, +			      unsigned char *healed_sinks)  { -        afr_private_t   *priv          = NULL; -        afr_local_t     *impunge_local = NULL; -        dict_t          *dict          = NULL; -        struct iatt     *buf           = NULL; -        int              ret           = 0; - -        priv = this->private; -        impunge_local = impunge_frame->local; - -        buf = &impunge_local->cont.dir_fop.buf; - -        dict = dict_new (); -        if (!dict) { -                afr_sh_entry_call_impunge_done (impunge_frame, this, -                                                -1, ENOMEM); -                goto out; -        } - -        GF_ASSERT (!uuid_is_null (buf->ia_gfid)); -        ret = afr_set_dict_gfid (dict, buf->ia_gfid); -        if (ret) -                gf_log (this->name, GF_LOG_INFO, -                        "%s: dict set gfid failed", -                        impunge_local->loc.path); - -        gf_log (this->name, GF_LOG_DEBUG, -                "creating missing symlink %s -> %s on %s", -                impunge_local->loc.path, linkname, -                priv->children[child_index]->name); - -        STACK_WIND_COOKIE (impunge_frame, 
afr_sh_entry_impunge_newfile_cbk, -                           (void *) (long) child_index, -                           priv->children[child_index], -                           priv->children[child_index]->fops->symlink, -                           linkname, &impunge_local->loc, 0, dict); - -        if (dict) -                dict_unref (dict); -out: -        return 0; -} +	int ret = 0; +	gf_dirent_t entries; +	gf_dirent_t *entry = NULL; +	off_t offset = 0; +	call_frame_t *iter_frame = NULL; +	xlator_t *subvol = NULL; +	afr_private_t *priv = NULL; +	priv = this->private; +	subvol = priv->children[child]; -int -afr_sh_entry_impunge_symlink_unlink_cbk (call_frame_t *impunge_frame, -                                         void *cookie, xlator_t *this, -                                         int32_t op_ret, int32_t op_errno, -                                         struct iatt *preparent, -                                         struct iatt *postparent, dict_t *xdata) -{ -        afr_private_t   *priv = NULL; -        afr_local_t     *impunge_local = NULL; -        afr_self_heal_t *impunge_sh = NULL; -        int              child_index = -1; -        int              call_count = -1; - -        priv          = this->private; -        impunge_local = impunge_frame->local; -        impunge_sh    = &impunge_local->self_heal; - -        child_index = (long) cookie; - -        if (op_ret == -1) { -                gf_log (this->name, GF_LOG_INFO, -                        "unlink of %s on %s failed (%s)", -                        impunge_local->loc.path, -                        priv->children[child_index]->name, -                        strerror (op_errno)); -                goto out; -        } - -        afr_sh_entry_impunge_symlink (impunge_frame, this, child_index, -                                      impunge_sh->linkname); - -        return 0; -out: -        LOCK (&impunge_frame->lock); -        { -                call_count = --impunge_local->call_count; -      
  } -        UNLOCK (&impunge_frame->lock); +	INIT_LIST_HEAD (&entries.list); -        if (call_count == 0) -                afr_sh_entry_call_impunge_done (impunge_frame, this, -                                                op_ret, op_errno); +	iter_frame = afr_copy_frame (frame); +	if (!iter_frame) +		return -ENOMEM; -        return 0; -} +	while ((ret = syncop_readdir (subvol, fd, 131072, offset, &entries))) { +		if (ret > 0) +			ret = 0; +		list_for_each_entry (entry, &entries.list, list) { +			offset = entry->d_off; +			if (!strcmp (entry->d_name, ".") || +			    !strcmp (entry->d_name, "..")) +				continue; -int -afr_sh_entry_impunge_symlink_unlink (call_frame_t *impunge_frame, xlator_t *this, -                                     int child_index) -{ -        afr_private_t   *priv          = NULL; -        afr_local_t     *impunge_local = NULL; +			if (__is_root_gfid (fd->inode->gfid) && +			    !strcmp (entry->d_name, GF_REPLICATE_TRASH_DIR)) +				continue; -        priv          = this->private; -        impunge_local = impunge_frame->local; +			ret = afr_selfheal_entry_dirent (iter_frame, this, fd, +							 source, sources, +							 healed_sinks, +							 entry->d_name); +			AFR_STACK_RESET (iter_frame); -        gf_log (this->name, GF_LOG_DEBUG, -                "unlinking symlink %s with wrong target on %s", -                impunge_local->loc.path, -                priv->children[child_index]->name); +			if (ret) +				break; +		} -        STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_symlink_unlink_cbk, -                           (void *) (long) child_index, -                           priv->children[child_index], -                           priv->children[child_index]->fops->unlink, -                           &impunge_local->loc, 0, NULL); +		gf_dirent_free (&entries); +		if (ret) +			break; +	} -        return 0; +	AFR_STACK_DESTROY (iter_frame); +	return ret;  } - -int -afr_sh_entry_impunge_readlink_sink_cbk (call_frame_t *impunge_frame, 
void *cookie, -                                        xlator_t *this, -                                        int32_t op_ret, int32_t op_errno, -                                        const char *linkname, struct iatt *sbuf, dict_t *xdata) +static int +afr_selfheal_entry_do (call_frame_t *frame, xlator_t *this, fd_t *fd, +		       int source, unsigned char *sources, +		       unsigned char *healed_sinks, +		       struct afr_reply *locked_replies)  { -        afr_private_t   *priv = NULL; -        afr_local_t     *impunge_local = NULL; -        afr_self_heal_t *impunge_sh = NULL; -        int              child_index = -1; -        int              call_count = -1; -        int              active_src = -1; - -        priv          = this->private; -        impunge_local = impunge_frame->local; -        impunge_sh    = &impunge_local->self_heal; -        active_src    = impunge_sh->active_source; - -        child_index = (long) cookie; - -        if ((op_ret == -1) && (!afr_inode_missing(op_errno))) { -                gf_log (this->name, GF_LOG_INFO, -                        "readlink of %s on %s failed (%s)", -                        impunge_local->loc.path, -                        priv->children[active_src]->name, -                        strerror (op_errno)); -                goto out; -        } - -        /* symlink doesn't exist on the sink */ - -        if ((op_ret == -1) && (afr_inode_missing(op_errno))) { -                afr_sh_entry_impunge_symlink (impunge_frame, this, -                                              child_index, impunge_sh->linkname); -                return 0; -        } - - -        /* symlink exists on the sink, so check if targets match */ - -        if (strcmp (linkname, impunge_sh->linkname) == 0) { -                /* targets match, nothing to do */ - -                goto out; -        } else { -                /* -                 * Hah! Sneaky wolf in sheep's clothing! 
-                 */ -                afr_sh_entry_impunge_symlink_unlink (impunge_frame, this, -                                                     child_index); -                return 0; -        } +	int i = 0; +	afr_private_t *priv = NULL; +	int ret = 0; -out: -        LOCK (&impunge_frame->lock); -        { -                call_count = --impunge_local->call_count; -        } -        UNLOCK (&impunge_frame->lock); +	priv = this->private; -        if (call_count == 0) -                afr_sh_entry_call_impunge_done (impunge_frame, this, -                                                op_ret, op_errno); +	gf_log (this->name, GF_LOG_INFO, "performing entry selfheal on %s", +		uuid_utoa (fd->inode->gfid)); -        return 0; +	for (i = 0; i < priv->child_count; i++) { +		if (i != source && !healed_sinks[i]) +			continue; +		ret = afr_selfheal_entry_do_subvol (frame, this, fd, i, source, +						    sources, healed_sinks); +		if (ret) +			break; +	} +	return ret;  } -int -afr_sh_entry_impunge_readlink_sink (call_frame_t *impunge_frame, xlator_t *this, -                                    int child_index) +static int +__afr_selfheal_entry_finalize_source (xlator_t *this, unsigned char *sources, +				      unsigned char *sinks, +				      unsigned char *locked_on, +				      struct afr_reply *replies)  { -        afr_private_t   *priv = NULL; -        afr_local_t     *impunge_local = NULL; +	int i = 0; +	afr_private_t *priv = NULL; +	int source = -1; +	int locked_count = 0; +	int sources_count = 0; +	int sinks_count = 0; -        priv = this->private; -        impunge_local = impunge_frame->local; +	priv = this->private; -        gf_log (this->name, GF_LOG_DEBUG, -                "checking symlink target of %s on %s", -                impunge_local->loc.path, priv->children[child_index]->name); +	locked_count = AFR_COUNT (locked_on, priv->child_count); +	sources_count = AFR_COUNT (sources, priv->child_count); +	sinks_count = AFR_COUNT (sinks, priv->child_count); -  
      STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_readlink_sink_cbk, -                           (void *) (long) child_index, -                           priv->children[child_index], -                           priv->children[child_index]->fops->readlink, -                           &impunge_local->loc, 4096, NULL); +	if (locked_count == sinks_count || !sources_count) { +		return -1; +	} -        return 0; -} - - -int -afr_sh_entry_impunge_readlink_cbk (call_frame_t *impunge_frame, void *cookie, -                                   xlator_t *this, -                                   int32_t op_ret, int32_t op_errno, -                                   const char *linkname, struct iatt *sbuf, dict_t *xdata) -{ -        afr_private_t   *priv = NULL; -        afr_local_t     *impunge_local = NULL; -        afr_self_heal_t *impunge_sh = NULL; -        int              child_index = -1; -        int              call_count = -1; -        int              active_src = -1; - -        priv = this->private; -        impunge_local = impunge_frame->local; -        impunge_sh = &impunge_local->self_heal; -        active_src = impunge_sh->active_source; - -        child_index = (long) cookie; - -        if (op_ret == -1) { -                gf_log (this->name, GF_LOG_INFO, -                        "readlink of %s on %s failed (%s)", -                        impunge_local->loc.path, -                        priv->children[active_src]->name, -                        strerror (op_errno)); -                goto out; -        } - -        impunge_sh->linkname = gf_strdup (linkname); -        afr_sh_entry_impunge_readlink_sink (impunge_frame, this, child_index); - -        return 0; +	for (i = 0; i < priv->child_count; i++) { +		if (sources[i]) { +			source = i; +			break; +		} +	} -out: -        LOCK (&impunge_frame->lock); -        { -                call_count = --impunge_local->call_count; -        } -        UNLOCK (&impunge_frame->lock); - -        if (call_count == 0) 
-                afr_sh_entry_call_impunge_done (impunge_frame, this, -                                                op_ret, op_errno); - -        return 0; +	return source;  } -int -afr_sh_entry_impunge_readlink (call_frame_t *impunge_frame, xlator_t *this, -                               int child_index, struct iatt *stbuf) +static int +__afr_selfheal_entry_prepare (call_frame_t *frame, xlator_t *this, fd_t *fd, +			      unsigned char *locked_on, unsigned char *sources, +			      unsigned char *sinks, unsigned char *healed_sinks, +			      struct afr_reply *replies, int *source_p)  { -        afr_private_t   *priv = NULL; -        afr_local_t     *impunge_local = NULL; -        afr_self_heal_t *impunge_sh = NULL; -        int              active_src = -1; - -        priv = this->private; -        impunge_local = impunge_frame->local; -        impunge_sh = &impunge_local->self_heal; -        active_src = impunge_sh->active_source; -        impunge_local->cont.dir_fop.buf = *stbuf; - -        STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_readlink_cbk, -                           (void *) (long) child_index, -                           priv->children[active_src], -                           priv->children[active_src]->fops->readlink, -                           &impunge_local->loc, 4096, NULL); - -        return 0; -} +	int ret = -1; +	int source = -1; +	afr_private_t *priv = NULL; +	int i = 0; -int -afr_sh_entry_impunge_create (call_frame_t *impunge_frame, xlator_t *this, -                             int child_index) -{ -        call_frame_t    *frame             = NULL; -        afr_local_t     *impunge_local     = NULL; -        afr_local_t     *local             = NULL; -        afr_self_heal_t *impunge_sh        = NULL; -        afr_self_heal_t *sh                = NULL; -        afr_private_t   *priv = NULL; -        ia_type_t       type = IA_INVAL; -        int             active_src = 0; -        struct iatt     *buf = NULL; - -        
AFR_INIT_SH_FRAME_VALS (impunge_frame, impunge_local, impunge_sh, -                                frame, local, sh); -        active_src = impunge_sh->active_source; -        afr_update_loc_gfids (&impunge_local->loc, &impunge_sh->entrybuf, -                              &impunge_sh->parentbuf); - -        buf = &impunge_sh->entrybuf; -        type = buf->ia_type; - -        switch (type) { -        case IA_IFSOCK: -        case IA_IFREG: -        case IA_IFBLK: -        case IA_IFCHR: -        case IA_IFIFO: -        case IA_IFLNK: -                afr_sh_entry_impunge_check_hardlink (impunge_frame, this, -                                                     child_index, buf); -                break; -        case IA_IFDIR: -                afr_sh_entry_impunge_mkdir (impunge_frame, this, -                                            child_index, buf); -                break; -        default: -                gf_log (this->name, GF_LOG_ERROR, -                        "%s has unknown file type on %s: 0%o", -                        impunge_local->loc.path, -                        priv->children[active_src]->name, type); -                sh->impunge_done (frame, this, -1, EINVAL); -                break; -        } - -        return 0; -} +	priv = this->private; -int -afr_sh_entry_impunge_create_file (call_frame_t *impunge_frame, xlator_t *this, -                                  int child_index) -{ -        call_frame_t    *frame             = NULL; -        afr_local_t     *impunge_local     = NULL; -        afr_local_t     *local             = NULL; -        afr_self_heal_t *impunge_sh        = NULL; -        afr_self_heal_t *sh                = NULL; -        afr_private_t   *priv = NULL; -        ia_type_t       type = IA_INVAL; -        int             active_src = 0; -        struct iatt     *buf = NULL; - -        AFR_INIT_SH_FRAME_VALS (impunge_frame, impunge_local, impunge_sh, -                                frame, local, sh); -        active_src = 
impunge_sh->active_source; -        buf = &impunge_sh->entrybuf; -        type = buf->ia_type; - -        switch (type) { -        case IA_IFSOCK: -        case IA_IFREG: -        case IA_IFBLK: -        case IA_IFCHR: -        case IA_IFIFO: -                afr_sh_entry_impunge_mknod (impunge_frame, this, -                                            child_index, buf); -                break; -        case IA_IFLNK: -                afr_sh_entry_impunge_readlink (impunge_frame, this, -                                               child_index, buf); -                break; -        default: -                gf_log (this->name, GF_LOG_ERROR, -                        "%s has unknown file type on %s: 0%o", -                        impunge_local->loc.path, -                        priv->children[active_src]->name, type); -                sh->impunge_done (frame, this, -1, EINVAL); -                break; -        } - -        return 0; -} +	ret = afr_selfheal_unlocked_discover (frame, fd->inode, fd->inode->gfid, +					      replies); +	if (ret) +		return ret; -gf_boolean_t -afr_sh_need_recreate (afr_self_heal_t *impunge_sh, unsigned int child, -                      unsigned int child_count) -{ -        gf_boolean_t    recreate = _gf_false; +	ret = afr_selfheal_find_direction (frame, this, replies, +					   AFR_ENTRY_TRANSACTION, +					   locked_on, sources, sinks); +	if (ret) +		return ret; -        GF_ASSERT (impunge_sh->child_errno); +	source = __afr_selfheal_entry_finalize_source (this, sources, sinks, +						       locked_on, replies); +	if (source < 0) { +		/* If source is < 0 (typically split-brain), we perform a +		   conservative merge of entries rather than erroring out */ +	} +	*source_p = source; -        if (child == impunge_sh->active_source) -                goto out; +	for (i = 0; i < priv->child_count; i++) +		/* Initialize the healed_sinks[] array optimistically to +		   the intersection of to-be-healed (i.e sinks[]) and +		   the list of servers 
which are up (i.e locked_on[]). -        if (IA_IFLNK == impunge_sh->entrybuf.ia_type) { -                recreate = _gf_true; -                goto out; -        } +		   As we encounter failures in the healing process, we +		   will unmark the respective servers in the healed_sinks[] +		   array. +		*/ +		healed_sinks[i] = sinks[i] && locked_on[i]; -        if (impunge_sh->child_errno[child] == ENOENT) -                recreate = _gf_true; -out: -        return recreate; +	return ret;  } -unsigned int -afr_sh_recreate_count (afr_self_heal_t *impunge_sh, int *sources, -                       unsigned int child_count) -{ -        int             count = 0; -        int             i = 0; - -        for (i = 0; i < child_count; i++) { -                if (afr_sh_need_recreate (impunge_sh, i, child_count)) -                        count++; -        } - -        return count; -} -int -afr_sh_entry_call_impunge_recreate (call_frame_t *impunge_frame, -                                    xlator_t *this) -{ -        afr_private_t   *priv = NULL; -        afr_local_t     *impunge_local = NULL; -        afr_self_heal_t *impunge_sh = NULL; -        call_frame_t    *frame = NULL; -        afr_local_t     *local = NULL; -        afr_self_heal_t *sh = NULL; -        unsigned int     recreate_count = 0; -        int              i = 0; -        int              active_src = 0; - -        priv          = this->private; -        AFR_INIT_SH_FRAME_VALS (impunge_frame, impunge_local, impunge_sh, -                                frame, local, sh); -        active_src = impunge_sh->active_source; -        impunge_sh->entrybuf = impunge_sh->buf[active_src]; -        impunge_sh->parentbuf = impunge_sh->parentbufs[active_src]; -        recreate_count = afr_sh_recreate_count (impunge_sh, sh->sources, -                                                priv->child_count); -        if (!recreate_count) { -                afr_sh_entry_call_impunge_done (impunge_frame, this, 0, 0); -              
  goto out; -        } -        impunge_local->call_count = recreate_count; -        for (i = 0; i < priv->child_count; i++) { -                if (!impunge_local->child_up[i]) { -                        impunge_sh->child_errno[i] = ENOTCONN; -                        continue; -                } -                if (!afr_sh_need_recreate (impunge_sh, i, priv->child_count)) { -                        impunge_sh->child_errno[i] = EEXIST; -                        continue; -                } -        } -        for (i = 0; i < priv->child_count; i++) { -                if (!afr_sh_need_recreate (impunge_sh, i, priv->child_count)) -                        continue; -                (void)afr_sh_entry_impunge_create (impunge_frame, this, i); -                recreate_count--; -        } -        GF_ASSERT (!recreate_count); +static int +__afr_selfheal_entry (call_frame_t *frame, xlator_t *this, fd_t *fd, +		      unsigned char *locked_on) +{ +	afr_private_t *priv = NULL; +	int ret = -1; +	unsigned char *sources = NULL; +	unsigned char *sinks = NULL; +	unsigned char *data_lock = NULL; +	unsigned char *healed_sinks = NULL; +	struct afr_reply *locked_replies = NULL; +	int source = -1; + +	priv = this->private; + +	sources = alloca0 (priv->child_count); +	sinks = alloca0 (priv->child_count); +	healed_sinks = alloca0 (priv->child_count); +	data_lock = alloca0 (priv->child_count); + +	locked_replies = alloca0 (sizeof (*locked_replies) * priv->child_count); + +	ret = afr_selfheal_entrylk (frame, this, fd->inode, this->name, NULL, +				    data_lock); +	{ +		if (ret < 2) { +			ret = -ENOTCONN; +			goto unlock; +		} + +		ret = __afr_selfheal_entry_prepare (frame, this, fd, data_lock, +						    sources, sinks, healed_sinks, +						    locked_replies, &source); +	} +unlock: +	afr_selfheal_unentrylk (frame, this, fd->inode, this->name, NULL, +				data_lock); +	if (ret < 0) +		goto out; + +	ret = afr_selfheal_entry_do (frame, this, fd, source, sources, +				     healed_sinks, 
locked_replies); +	if (ret) +		goto out; + +	ret = afr_selfheal_undo_pending (frame, this, fd->inode, sources, sinks, +					 healed_sinks, AFR_ENTRY_TRANSACTION, +					 locked_replies, data_lock);  out: -        return 0; +	return ret;  } -void -afr_sh_entry_common_lookup_done (call_frame_t *impunge_frame, xlator_t *this, -                                 int32_t op_ret, int32_t op_errno) -{ -        afr_private_t   *priv = NULL; -        afr_local_t     *impunge_local = NULL; -        afr_self_heal_t *impunge_sh = NULL; -        call_frame_t    *frame = NULL; -        afr_local_t     *local = NULL; -        afr_self_heal_t *sh = NULL; -        unsigned int     gfid_miss_count = 0; -        unsigned int     children_up_count = 0; -        uuid_t           gfid = {0}; -        int              active_src = 0; - -        priv          = this->private; -        AFR_INIT_SH_FRAME_VALS (impunge_frame, impunge_local, impunge_sh, -                                frame, local, sh); -        active_src    = impunge_sh->active_source; - -        if (op_ret < 0) -                goto done; -        if (impunge_sh->child_errno[active_src]) { -                op_ret = -1; -                op_errno = impunge_sh->child_errno[active_src]; -                goto done; -        } - -        gfid_miss_count = afr_gfid_missing_count (this->name, -                                                  impunge_sh->success_children, -                                                  impunge_sh->buf, priv->child_count, -                                                  impunge_local->loc.path); -        children_up_count = afr_up_children_count (impunge_local->child_up, -                                                   priv->child_count); -        if ((gfid_miss_count == children_up_count) && -            (children_up_count < priv->child_count)) { -                op_ret = -1; -                op_errno = ENODATA; -                gf_log (this->name, GF_LOG_ERROR, "Not all children are up, " - 
                       "gfid should not be assigned in this state for %s", -                        impunge_local->loc.path); -                goto done; -        } - -        if (gfid_miss_count) { -                afr_update_gfid_from_iatts (gfid, impunge_sh->buf, -                                            impunge_sh->success_children, -                                            priv->child_count); -                if (uuid_is_null (gfid)) { -                        sh->entries_skipped = _gf_true; -                        gf_log (this->name, GF_LOG_INFO, "%s: Skipping entry " -                                "self-heal because of gfid absence", -                                impunge_local->loc.path); -                        goto done; -                } -                afr_sh_common_lookup (impunge_frame, this, &impunge_local->loc, -                                      afr_sh_entry_common_lookup_done, gfid, -                                      AFR_LOOKUP_FAIL_CONFLICTS | -                                      AFR_LOOKUP_FAIL_MISSING_GFIDS, -                                      NULL); -        } else { -                afr_sh_entry_call_impunge_recreate (impunge_frame, this); -        } -        return; -done: -        afr_sh_entry_call_impunge_done (impunge_frame, this, -                                        op_ret, op_errno); -        return; -} -int -afr_sh_entry_impunge_entry (call_frame_t *frame, xlator_t *this, -                            gf_dirent_t *entry) +static fd_t * +afr_selfheal_data_opendir (xlator_t *this, inode_t *inode)  { -        afr_local_t     *local  = NULL; -        afr_self_heal_t *sh  = NULL; -        afr_self_heal_t *impunge_sh  = NULL; -        int              ret = -1; -        call_frame_t    *impunge_frame = NULL; -        afr_local_t     *impunge_local = NULL; -        int              active_src = 0; -        int              op_errno = 0; -        int              op_ret = -1; - -        local = frame->local; -      
  sh = &local->self_heal; - -        active_src = sh->active_source; -        sh->impunge_done = afr_sh_entry_impunge_entry_done; - -        if (can_skip_entry_self_heal (entry->d_name, &local->loc)) { -                op_ret = 0; -                goto out; -        } - -        gf_log (this->name, GF_LOG_TRACE, -                "inspecting existence of %s under %s", -                entry->d_name, local->loc.path); - -        ret = afr_impunge_frame_create (frame, this, active_src, -                                        &impunge_frame); -        if (ret) { -                op_errno = -ret; -                goto out; -        } - -        impunge_local = impunge_frame->local; -        impunge_sh = &impunge_local->self_heal; -        ret = afr_build_child_loc (this, &impunge_local->loc, &local->loc, -                                   entry->d_name); -        loc_copy (&impunge_sh->parent_loc, &local->loc); -        if (ret != 0) { -                op_errno = ENOMEM; -                goto out; -        } - -        afr_sh_common_lookup (impunge_frame, this, &impunge_local->loc, -                              afr_sh_entry_common_lookup_done, NULL, -                              AFR_LOOKUP_FAIL_CONFLICTS, NULL); - -        op_ret = 0; -out: -        if (ret) { -                if (impunge_frame) -                        AFR_STACK_DESTROY (impunge_frame); -                sh->impunge_done (frame, this, op_ret, op_errno); -        } +	loc_t loc = {0,}; +	int ret = 0; +	fd_t *fd = NULL; -        return 0; -} +	fd = fd_create (inode, 0); +	if (!fd) +		return NULL; +	loc.inode = inode_ref (inode); +	uuid_copy (loc.gfid, inode->gfid); -int -afr_sh_entry_impunge_readdir_cbk (call_frame_t *frame, void *cookie, -                                  xlator_t *this, -                                  int32_t op_ret, int32_t op_errno, -                                  gf_dirent_t *entries, dict_t *xdata) -{ -        afr_private_t   *priv = NULL; -        afr_local_t     *local  = 
NULL; -        afr_self_heal_t *sh  = NULL; -        gf_dirent_t     *entry = NULL; -        off_t            last_offset = 0; -        int              active_src = 0; -        int              entry_count = 0; - -        priv = this->private; -        local = frame->local; -        sh = &local->self_heal; - -        active_src = sh->active_source; - -        if (op_ret <= 0) { -                if (op_ret < 0) { -                        gf_log (this->name, GF_LOG_INFO, -                                "readdir of %s on subvolume %s failed (%s)", -                                local->loc.path, -                                priv->children[active_src]->name, -                                strerror (op_errno)); -                        afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); -                } else { -                        gf_log (this->name, GF_LOG_TRACE, -                                "readdir of %s on subvolume %s complete", -                                local->loc.path, -                                priv->children[active_src]->name); -                } - -                afr_sh_entry_impunge_all (frame, this); -                return 0; -        } - -        list_for_each_entry (entry, &entries->list, list) { -                last_offset = entry->d_off; -                entry_count++; -        } - -        gf_log (this->name, GF_LOG_DEBUG, -                "readdir'ed %d entries from %s", -                entry_count, priv->children[active_src]->name); - -        sh->offset = last_offset; -        local->call_count = entry_count; - -        list_for_each_entry (entry, &entries->list, list) { -                afr_sh_entry_impunge_entry (frame, this, entry); -        } - -        return 0; -} +	ret = syncop_opendir (this, &loc, fd); +	if (ret) { +		fd_unref (fd); +		fd = NULL; +	} else { +		fd_bind (fd); +	} +	loc_wipe (&loc); -int -afr_sh_entry_impunge_subvol (call_frame_t *frame, xlator_t *this) -{ -        afr_private_t   *priv = 
NULL; -        afr_local_t     *local  = NULL; -        afr_self_heal_t *sh  = NULL; -        int32_t         active_src = 0; - -        priv = this->private; -        local = frame->local; -        sh = &local->self_heal; -        active_src = sh->active_source; -        gf_log (this->name, GF_LOG_DEBUG, "%s: readdir from offset %zd", -                local->loc.path, sh->offset); - -        STACK_WIND (frame, afr_sh_entry_impunge_readdir_cbk, -                    priv->children[active_src], -                    priv->children[active_src]->fops->readdirp, -                    sh->healing_fd, sh->block_size, sh->offset, NULL); - -        return 0; +	return fd;  } -int -afr_sh_entry_impunge_all (call_frame_t *frame, xlator_t *this) -{ -        afr_private_t   *priv = NULL; -        afr_local_t     *local  = NULL; -        afr_self_heal_t *sh  = NULL; -        int              active_src = -1; - -        priv = this->private; -        local = frame->local; -        sh = &local->self_heal; - -        sh->offset = 0; - -        active_src = next_active_source (frame, this, sh->active_source); -        sh->active_source = active_src; - -        if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { -                afr_sh_entry_finish (frame, this); -                return 0; -        } - -        if (active_src == -1) { -                /* completed creating missing files on all subvolumes */ -                afr_sh_entry_erase_pending (frame, this); -                return 0; -        } - -        gf_log (this->name, GF_LOG_TRACE, -                "impunging entries of %s on %s to other sinks", -                local->loc.path, priv->children[active_src]->name); - -        afr_sh_entry_impunge_subvol (frame, this); - -        return 0; -} -  int -afr_sh_entry_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, -                          int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) +afr_selfheal_entry (call_frame_t *frame, xlator_t *this, inode_t 
*inode)  { -        afr_local_t     *local = NULL; -        afr_self_heal_t *sh = NULL; -        afr_private_t   *priv = NULL; -        int              call_count = 0; -        int              child_index = 0; - -        local = frame->local; -        sh = &local->self_heal; -        priv = this->private; - -        child_index = (long) cookie; - -        /* TODO: some of the open's might fail. -           In that case, modify cleanup fn to send flush on those -           fd's which are already open */ - -        LOCK (&frame->lock); -        { -                if (op_ret == -1) { -                        gf_log (this->name, GF_LOG_ERROR, -                                "opendir of %s failed on child %s (%s)", -                                local->loc.path, -                                priv->children[child_index]->name, -                                strerror (op_errno)); -                        afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); -                } -        } -        UNLOCK (&frame->lock); - -        call_count = afr_frame_return (frame); - -        if (call_count == 0) { -                if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { -                        afr_sh_entry_finish (frame, this); -                        return 0; -                } -                gf_log (this->name, GF_LOG_TRACE, -                        "fd for %s opened, commencing sync", -                        local->loc.path); - -                sh->active_source = -1; -                afr_sh_entry_expunge_all (frame, this); -        } - -        return 0; -} - - -int -afr_sh_entry_open (call_frame_t *frame, xlator_t *this) -{ -        int i = 0; -        int call_count = 0; - -        int source = -1; -        int *sources = NULL; +	afr_private_t *priv = NULL; +	unsigned char *locked_on = NULL; +	fd_t *fd = NULL; +	int ret = 0; -        fd_t *fd = NULL; +	priv = this->private; -        afr_local_t *   local = NULL; -        afr_private_t * priv  = NULL; -  
      afr_self_heal_t *sh = NULL; +	fd = afr_selfheal_data_opendir (this, inode); +	if (!fd) +		return -EIO; -        local = frame->local; -        sh = &local->self_heal; -        priv = this->private; +	locked_on = alloca0 (priv->child_count); -        source  = local->self_heal.source; -        sources = local->self_heal.sources; +	ret = afr_selfheal_tryentrylk (frame, this, inode, priv->sh_domain, NULL, +				       locked_on); +	{ +		if (ret < 2) { +			/* Either less than two subvols available, or another +			   selfheal (from another server) is in progress. Skip +			   for now in any case there isn't anything to do. +			*/ +			ret = -ENOTCONN; +			goto unlock; +		} -        sh->block_size = priv->sh_readdir_size; -        sh->offset = 0; +		ret = __afr_selfheal_entry (frame, this, fd, locked_on); +	} +unlock: +	afr_selfheal_unentrylk (frame, this, inode, priv->sh_domain, NULL, locked_on); -        call_count = sh->active_sinks; -        if (source != -1) -                call_count++; +	if (fd) +		fd_unref (fd); -        local->call_count = call_count; - -        fd = fd_create (local->loc.inode, frame->root->pid); -        sh->healing_fd = fd; - -        if (source != -1) { -                gf_log (this->name, GF_LOG_TRACE, -                        "opening directory %s on subvolume %s (source)", -                        local->loc.path, priv->children[source]->name); - -                /* open source */ -                STACK_WIND_COOKIE (frame, afr_sh_entry_opendir_cbk, -                                   (void *) (long) source, -                                   priv->children[source], -                                   priv->children[source]->fops->opendir, -                                   &local->loc, fd, NULL); -                call_count--; -        } - -        /* open sinks */ -        for (i = 0; i < priv->child_count; i++) { -                if (sources[i] || !local->child_up[i]) -                        continue; - -                gf_log 
(this->name, GF_LOG_TRACE, -                        "opening directory %s on subvolume %s (sink)", -                        local->loc.path, priv->children[i]->name); - -                STACK_WIND_COOKIE (frame, afr_sh_entry_opendir_cbk, -                                   (void *) (long) i, -                                   priv->children[i], -                                   priv->children[i]->fops->opendir, -                                   &local->loc, fd, NULL); - -                if (!--call_count) -                        break; -        } - -        return 0; -} - - -int -afr_sh_entry_sync_prepare (call_frame_t *frame, xlator_t *this) -{ -        afr_local_t     *local = NULL; -        afr_self_heal_t *sh = NULL; -        afr_private_t   *priv = NULL; -        int              source = 0; - -        local = frame->local; -        sh = &local->self_heal; -        priv = this->private; - -        source = sh->source; - -        afr_sh_mark_source_sinks (frame, this); -        if (source != -1) -                sh->success[source] = 1; - -        if (sh->active_sinks == 0) { -                gf_log (this->name, GF_LOG_TRACE, -                        "no active sinks for self-heal on dir %s", -                        local->loc.path); -                afr_sh_entry_finish (frame, this); -                return 0; -        } -        if (source == -1 && sh->active_sinks < 2) { -                gf_log (this->name, GF_LOG_TRACE, -                        "cannot sync with 0 sources and 1 sink on dir %s", -                        local->loc.path); -                afr_sh_entry_finish (frame, this); -                return 0; -        } - -        if (source != -1) -                gf_log (this->name, GF_LOG_DEBUG, -                        "self-healing directory %s from subvolume %s to " -                        "%d other", -                        local->loc.path, priv->children[source]->name, -                        sh->active_sinks); -        else -         
       gf_log (this->name, GF_LOG_DEBUG, -                        "no active sources for %s found. " -                        "merging all entries as a conservative decision", -                        local->loc.path); - -        sh->actual_sh_started = _gf_true; -        afr_set_self_heal_status (sh, AFR_SELF_HEAL_SYNC_BEGIN); -        afr_sh_entry_open (frame, this); - -        return 0; -} - - -void -afr_sh_entry_fix (call_frame_t *frame, xlator_t *this, -                  int32_t op_ret, int32_t op_errno) -{ -        afr_local_t     *local = NULL; -        afr_self_heal_t *sh = NULL; -        afr_private_t   *priv = NULL; -        int              source = 0; -        int              nsources = 0; -        int32_t          subvol_status = 0; - -        local = frame->local; -        sh = &local->self_heal; -        priv = this->private; - -        if (op_ret < 0) { -                afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); -                afr_sh_set_error (sh, op_errno); -                afr_sh_entry_finish (frame, this); -                goto out; -        } - -        if (sh->forced_merge) { -                sh->source = -1; -                goto heal; -        } - -        nsources = afr_build_sources (this, sh->xattr, sh->buf, -                                      sh->pending_matrix, sh->sources, -                                      sh->success_children, -                                      AFR_ENTRY_TRANSACTION, &subvol_status, -                                      _gf_true); -        if ((subvol_status & ALL_FOOLS) || -            (subvol_status & SPLIT_BRAIN)) { -                gf_log (this->name, GF_LOG_INFO, "%s: Performing conservative " -                        "merge", local->loc.path); -                source = -1; -                memset (sh->sources, 0, -                        sizeof (*sh->sources) * priv->child_count); -        } else if (nsources == 0) { -                gf_log (this->name, GF_LOG_TRACE, -                   
     "No self-heal needed for %s", -                        local->loc.path); - -                afr_sh_entry_finish (frame, this); -                return; -        } else { -                source = afr_sh_select_source (sh->sources, priv->child_count); -        } - -        sh->source = source; - -        afr_reset_children (sh->fresh_children, priv->child_count); -        afr_get_fresh_children (sh->success_children, sh->sources, -                                sh->fresh_children, priv->child_count); -        if (sh->source >= 0) -                afr_inode_set_read_ctx (this, sh->inode, sh->source, -                                        sh->fresh_children); - -heal: -        afr_sh_entry_sync_prepare (frame, this); -out: -        return; -} - -int -afr_sh_post_nonblocking_entry_cbk (call_frame_t *frame, xlator_t *this) -{ -        afr_internal_lock_t *int_lock = NULL; -        afr_local_t         *local    = NULL; -        afr_self_heal_t     *sh       = NULL; - -        local    = frame->local; -        int_lock = &local->internal_lock; -        sh       = &local->self_heal; - -        if (int_lock->lock_op_ret < 0) { -                gf_log (this->name, GF_LOG_ERROR, "Non Blocking entrylks " -                        "failed for %s.", local->loc.path); -                afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); -                afr_sh_entry_done (frame, this); -        } else { - -                gf_log (this->name, GF_LOG_DEBUG, "Non Blocking entrylks done " -                        "for %s. 
Proceeding to FOP", local->loc.path); -                afr_sh_common_lookup (frame, this, &local->loc, -                                      afr_sh_entry_fix, NULL, -                                      AFR_LOOKUP_FAIL_CONFLICTS | -                                      AFR_LOOKUP_FAIL_MISSING_GFIDS, -                                      NULL); -        } - -        return 0; -} - -int -afr_self_heal_entry (call_frame_t *frame, xlator_t *this) -{ -        afr_local_t     *local = NULL; -        afr_private_t   *priv = NULL; -        afr_self_heal_t *sh = NULL; - -        priv = this->private; -        local = frame->local; -        sh = &local->self_heal; - -        sh->sh_type_in_action = AFR_SELF_HEAL_ENTRY; - -        if (local->self_heal.do_entry_self_heal && priv->entry_self_heal) { -                afr_set_self_heal_status (sh, AFR_SELF_HEAL_STARTED); -                afr_sh_entrylk (frame, this, &local->loc, NULL, -                                afr_sh_post_nonblocking_entry_cbk); -        } else { -                gf_log (this->name, GF_LOG_TRACE, -                        "proceeding to completion on %s", -                        local->loc.path); -                afr_sh_entry_done (frame, this); -        } - -        return 0; +	return ret;  } diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c index fd5da6cfd33..b31a33237f5 100644 --- a/xlators/cluster/afr/src/afr-self-heal-metadata.c +++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c @@ -1,5 +1,5 @@  /* -  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> +  Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>    This file is part of GlusterFS.    This file is licensed to you under your choice of the GNU Lesser @@ -8,763 +8,274 @@    cases as published by the Free Software Foundation.  
*/ -#include <libgen.h> -#include <unistd.h> -#include <fnmatch.h> -#include <sys/time.h> -#include <stdlib.h> -#include <signal.h>  #ifndef _CONFIG_H  #define _CONFIG_H  #include "config.h"  #endif -#include "glusterfs.h"  #include "afr.h" -#include "dict.h" -#include "xlator.h" -#include "hashfn.h" -#include "logging.h" -#include "stack.h" -#include "list.h" -#include "call-stub.h" -#include "defaults.h" -#include "common-utils.h" -#include "compat-errno.h" -#include "compat.h" -#include "byte-order.h" - -#include "afr-transaction.h"  #include "afr-self-heal.h" -#include "afr-self-heal-common.h" - - -int -afr_sh_metadata_done (call_frame_t *frame, xlator_t *this) -{ -        afr_local_t     *local = NULL; -        afr_self_heal_t *sh = NULL; - -        local = frame->local; -        sh = &local->self_heal; - -        afr_sh_reset (frame, this); -        if (IA_ISDIR (sh->type)) { -                gf_log (this->name, GF_LOG_DEBUG, -                        "proceeding to entry check on %s", -                        local->loc.path); -                afr_self_heal_entry (frame, this); -        } else { -                gf_log (this->name, GF_LOG_DEBUG, -                        "proceeding to data check on %s", -                        local->loc.path); -                afr_self_heal_data (frame, this); -        } - -        return 0; -} - -int -afr_sh_inode_unlock (call_frame_t *frame, xlator_t *this) -{ -        afr_internal_lock_t *int_lock = NULL; -        afr_local_t         *local    = NULL; - -        local    = frame->local; -        int_lock = &local->internal_lock; - -        int_lock->lock_cbk = afr_sh_metadata_done; -        afr_unlock (frame, this); - -        return 0; -} - -int -afr_sh_metadata_finish (call_frame_t *frame, xlator_t *this) -{ -        afr_sh_inode_unlock (frame, this); - -        return 0; -} - -int -afr_sh_metadata_fail (call_frame_t *frame, xlator_t *this) -{ -        afr_local_t         *local    = NULL; -        afr_self_heal_t     
*sh       = NULL; - -        local    = frame->local; -        sh       = &local->self_heal; - -        afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); -        afr_sh_metadata_finish (frame, this); -        return 0; -} - -int -afr_sh_metadata_erase_pending_cbk (call_frame_t *frame, void *cookie, -                                   xlator_t *this, int32_t op_ret, -                                   int32_t op_errno, dict_t *xattr, dict_t *xdata) -{ -        afr_local_t     *local     = NULL; -        int             call_count = 0; -        long            i          = 0; -        afr_self_heal_t *sh        = NULL; -        afr_private_t   *priv      = NULL; - -        local = frame->local; -        priv  = this->private; -        sh = &local->self_heal; -        i = (long)cookie; - -        if ((!IA_ISREG (sh->buf[sh->source].ia_type)) && -            (!IA_ISDIR (sh->buf[sh->source].ia_type))) { -                afr_children_add_child (sh->fresh_children, i, -                                        priv->child_count); -        } -        call_count = afr_frame_return (frame); - -        if (call_count == 0) { -                if ((!IA_ISREG (sh->buf[sh->source].ia_type)) && -                    (!IA_ISDIR (sh->buf[sh->source].ia_type))) { -                        afr_inode_set_read_ctx (this, sh->inode, sh->source, -                                                sh->fresh_children); -                } -                afr_sh_metadata_finish (frame, this); -        } - -        return 0; -} - -int -afr_sh_metadata_erase_pending (call_frame_t *frame, xlator_t *this) -{ -         afr_sh_erase_pending (frame, this, AFR_METADATA_TRANSACTION, -                               afr_sh_metadata_erase_pending_cbk, -                               afr_sh_metadata_finish); -         return 0; -} - - -int -afr_sh_metadata_sync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, -                          int32_t op_ret, int32_t op_errno, dict_t *xdata) -{ -        
afr_local_t     *local = NULL; -        afr_self_heal_t *sh = NULL; -        afr_private_t   *priv = NULL; -        int              call_count = 0; -        int              child_index = 0; - - -        local = frame->local; -        sh = &local->self_heal; -        priv = this->private; - -        child_index = (long) cookie; - -        LOCK (&frame->lock); -        { -                if (op_ret == -1) { -                        gf_log (this->name, GF_LOG_INFO, -                                "setting attributes failed for %s on %s (%s)", -                                local->loc.path, -                                priv->children[child_index]->name, -                                strerror (op_errno)); - -                        sh->success[child_index] = 0; -                } -        } -        UNLOCK (&frame->lock); - -        call_count = afr_frame_return (frame); - -        if (call_count == 0) { -                if (local->xattr_req) { -                        dict_unref (local->xattr_req); -                        local->xattr_req = NULL; -                } -                afr_sh_metadata_erase_pending (frame, this); -        } - -        return 0; -} - - -int -afr_sh_metadata_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, -                             int32_t op_ret, int32_t op_errno, -                             struct iatt *preop, struct iatt *postop, dict_t *xdata) -{ -        afr_sh_metadata_sync_cbk (frame, cookie, this, op_ret, op_errno, xdata); - -        return 0; -} - - -int -afr_sh_metadata_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, -                           int32_t op_ret, int32_t op_errno, dict_t *xdata) -{ -        afr_sh_metadata_sync_cbk (frame, cookie, this, op_ret, op_errno, xdata); - -        return 0; -} - -int -afr_sh_removexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, -                        int32_t op_ret, int32_t op_errno, -                        dict_t *xdata) -{ -   
     int            i     = 0; -        afr_private_t *priv  = NULL; -        afr_local_t   *local = NULL; - -        priv = this->private; -        local = frame->local; - -        if (op_ret < 0) { -                afr_sh_metadata_sync_cbk (frame, cookie, -                                          this, -1, op_errno, xdata); -                goto out; -        } - -        i = (long) cookie; - -        STACK_WIND_COOKIE (frame, afr_sh_metadata_xattr_cbk, -                           (void *) (long) i, -                           priv->children[i], -                           priv->children[i]->fops->setxattr, -                           &local->loc, local->xattr_req, 0, NULL); - - out: -        return 0; -} - -inline void -afr_prune_special_keys (dict_t *xattr_dict) -{ -        dict_del (xattr_dict, GF_SELINUX_XATTR_KEY); -} - -inline void -afr_prune_pending_keys (dict_t *xattr_dict, afr_private_t *priv) -{ -        int i = 0; - -        for (; i < priv->child_count; i++) { -                dict_del (xattr_dict, priv->pending_key[i]); -        } -} - -int -afr_sh_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, -                     int32_t op_ret, int32_t op_errno, dict_t *xattr, -                     dict_t *xdata) -{ -        int            i     = 0; -        afr_private_t *priv  = NULL; -        afr_local_t   *local = NULL; - -        priv = this->private; -        local = frame->local; - -        if (op_ret < 0) { -                afr_sh_metadata_sync_cbk (frame, cookie, -                                          this, -1, op_errno, xdata); -                goto out; -        } - -        afr_prune_pending_keys (xattr, priv); - -        afr_prune_special_keys (xattr); - -        i = (long) cookie; +#include "byte-order.h" -        /* send removexattr in bulk via xdata */ -        STACK_WIND_COOKIE (frame, afr_sh_removexattr_cbk, -                           cookie, -                           priv->children[i], -                           
priv->children[i]->fops->removexattr, -                           &local->loc, "", xattr); - out: -        return 0; -} +#define AFR_HEAL_ATTR (GF_SET_ATTR_UID|GF_SET_ATTR_GID|GF_SET_ATTR_MODE)  int -afr_sh_metadata_sync (call_frame_t *frame, xlator_t *this, dict_t *xattr) +afr_selfheal_metadata_do (call_frame_t *frame, xlator_t *this, inode_t *inode, +			  int source, unsigned char *healed_sinks, +			  struct afr_reply *locked_replies)  { -        afr_local_t     *local = NULL; -        afr_self_heal_t *sh = NULL; -        afr_private_t   *priv = NULL; -        int              source = 0; -        int              active_sinks = 0; -        int              call_count = 0; -        int              i = 0; - -        struct iatt      stbuf = {0,}; -        int32_t          valid = 0; - -        local = frame->local; -        sh = &local->self_heal; -        priv = this->private; - -        source = sh->source; -        active_sinks = sh->active_sinks; - -        /* -         * 2 calls per sink - setattr, setxattr -         */ -        if (xattr) { -                call_count = active_sinks * 2; -                local->xattr_req = dict_ref (xattr); -        } else -                call_count = active_sinks; - -        local->call_count = call_count; - -        stbuf.ia_atime = sh->buf[source].ia_atime; -        stbuf.ia_atime_nsec = sh->buf[source].ia_atime_nsec; -        stbuf.ia_mtime = sh->buf[source].ia_mtime; -        stbuf.ia_mtime_nsec = sh->buf[source].ia_mtime_nsec; - -        stbuf.ia_uid = sh->buf[source].ia_uid; -        stbuf.ia_gid = sh->buf[source].ia_gid; - -        stbuf.ia_type = sh->buf[source].ia_type; -        stbuf.ia_prot = sh->buf[source].ia_prot; - -        valid = GF_SET_ATTR_MODE  | -                GF_SET_ATTR_UID   | GF_SET_ATTR_GID | -                GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME; - -        for (i = 0; i < priv->child_count; i++) { -                if (call_count == 0) { -                        break; -                } -     
           if (sh->sources[i] || !local->child_up[i]) -                        continue; - -                gf_log (this->name, GF_LOG_DEBUG, -                        "self-healing metadata of %s from %s to %s", -                        local->loc.path, priv->children[source]->name, -                        priv->children[i]->name); - -                STACK_WIND_COOKIE (frame, afr_sh_metadata_setattr_cbk, -                                   (void *) (long) i, -                                   priv->children[i], -                                   priv->children[i]->fops->setattr, -                                   &local->loc, &stbuf, valid, NULL); - -                call_count--; - -                if (!xattr) -                        continue; - -                STACK_WIND_COOKIE (frame, afr_sh_getxattr_cbk, -                                   (void *) (long) i, -                                   priv->children[i], -                                   priv->children[i]->fops->getxattr, -                                   &local->loc, NULL, NULL); -                call_count--; -        } - -        return 0; +	int ret = -1; +	loc_t loc = {0,}; +	dict_t *xattr = NULL; +	dict_t *old_xattr = NULL; +	afr_private_t *priv = NULL; +	int i = 0; + +	priv = this->private; + +	loc.inode = inode_ref (inode); +	uuid_copy (loc.gfid, inode->gfid); + +	gf_log (this->name, GF_LOG_INFO, "performing metadata selfheal on %s", +		uuid_utoa (inode->gfid)); + +	ret = syncop_getxattr (priv->children[source], &loc, &xattr, NULL); +	if (ret < 0) { +		loc_wipe (&loc); +		return -EIO; +	} + +	afr_filter_xattrs (xattr); +	dict_del (xattr, GF_SELINUX_XATTR_KEY); + +	for (i = 0; i < priv->child_count; i++) { +		if (!healed_sinks[i]) +			continue; + +		ret = syncop_setattr (priv->children[i], &loc, +				      &locked_replies[source].poststat, +				      AFR_HEAL_ATTR, NULL, NULL); +		if (ret) +			healed_sinks[i] = 0; + +		old_xattr = NULL; +		ret = syncop_getxattr (priv->children[i], &loc, 
&old_xattr, 0); +		if (old_xattr) { +			dict_del (old_xattr, GF_SELINUX_XATTR_KEY); +			afr_filter_xattrs (old_xattr); +			ret = syncop_removexattr (priv->children[i], &loc, "", +						  old_xattr); +		} + +		ret = syncop_setxattr (priv->children[i], &loc, xattr, 0); +		if (ret) +			healed_sinks[i] = 0; +	} + +	loc_wipe (&loc); +	if (xattr) +		dict_unref (xattr); + +	return 0;  } -int -afr_sh_metadata_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, -                              int32_t op_ret, int32_t op_errno, dict_t *xattr, -                              dict_t *xdata) +/* + * Look for mismatching uid/gid or mode even if xattrs don't say so, and + * pick one arbitrarily as winner. + */ + +static int +__afr_selfheal_metadata_finalize_source (xlator_t *this, unsigned char *sources, +					 unsigned char *sinks, +					 unsigned char *locked_on, +					 struct afr_reply *replies)  { -        afr_local_t     *local = NULL; -        afr_self_heal_t *sh = NULL; -        afr_private_t   *priv = NULL; -        int              source = 0; - -        local = frame->local; -        sh = &local->self_heal; -        priv = this->private; - -        source = sh->source; - -        if (op_ret == -1) { -                gf_log (this->name, GF_LOG_DEBUG, -                        "getxattr of %s failed on subvolume %s (%s). 
proceeding without xattr", -                        local->loc.path, priv->children[source]->name, -                        strerror (op_errno)); - -                afr_sh_metadata_sync (frame, this, NULL); -        } else { -                afr_prune_pending_keys (xattr, priv); -                afr_sh_metadata_sync (frame, this, xattr); -        } - -        return 0; +	int i = 0; +	afr_private_t *priv = NULL; +	struct iatt first = {0, }; +	int source = -1; +	int locked_count = 0; +	int sources_count = 0; +	int sinks_count = 0; + +	priv = this->private; + +	locked_count = AFR_COUNT (locked_on, priv->child_count); +	sources_count = AFR_COUNT (sources, priv->child_count); +	sinks_count = AFR_COUNT (sinks, priv->child_count); + +	if (locked_count == sinks_count || !sources_count) { +		if (!priv->metadata_splitbrain_forced_heal) { +			return -EIO; +		} +		/* Metadata split brain, select one subvol +		   arbitrarily */ +		for (i = 0; i < priv->child_count; i++) { +			if (locked_on[i] && sinks[i]) { +				sources[i] = 1; +				sinks[i] = 0; +				break; +			} +		} +	} + +	for (i = 0; i < priv->child_count; i++) { +		if (!sources[i]) +			continue; +		if (source == -1) { +			source = i; +			first = replies[i].poststat; +		} +	} + +	for (i = 0; i < priv->child_count; i++) { +		if (!sources[i]) +			continue; +		if (!IA_EQUAL (first, replies[i].poststat, type) || +		    !IA_EQUAL (first, replies[i].poststat, uid) || +		    !IA_EQUAL (first, replies[i].poststat, gid) || +		    !IA_EQUAL (first, replies[i].poststat, prot)) { +			sources[i] = 0; +			sinks[i] = 1; +		} +	} + +	return source;  } -static void -afr_set_metadata_sh_info_str (afr_local_t *local, afr_self_heal_t *sh, -                              xlator_t *this) -{ -        afr_private_t    *priv = NULL; -        int              i = 0; -        char             num[1024] = {0}; -        size_t           len = 0; -        char             *string = NULL; -        size_t           off = 0; -        char             
*source_child =  " from source %s to"; -        char             *format = " %s, "; -        char             *string_msg = " metadata self heal"; -        char             *pending_matrix_str = NULL; -        int              down_child_present = 0; -        int              unknown_child_present = 0; -        char             *down_subvol_1 = " down subvolume is "; -        char             *unknown_subvol_1 = " unknown subvolume is"; -        char             *down_subvol_2 = " down subvolumes are "; -        char             *unknown_subvol_2 = " unknown subvolumes are "; -        int              down_count = 0; -        int              unknown_count = 0; - -        priv = this->private; - -        pending_matrix_str = afr_get_pending_matrix_str (sh->pending_matrix, -                                                         this); - -        if (!pending_matrix_str) -                pending_matrix_str = ""; - -        len += snprintf (num, sizeof (num), "%s", string_msg); - -        for (i = 0; i < priv->child_count; i++) { -                if ((sh->source == i) && (local->child_up[i] == 1)) { -                        len += snprintf (num, sizeof (num), source_child, -                                         priv->children[i]->name); -                } else if ((local->child_up[i] == 1) && (sh->sources[i] == 0)) { -                        len += snprintf (num, sizeof (num), format, -                                         priv->children[i]->name); -                } else if (local->child_up[i] == 0) { -                        len += snprintf (num, sizeof (num), format, -                                         priv->children[i]->name); -                        if (!down_child_present) -                                down_child_present = 1; -                        down_count++; -                } else if (local->child_up[i] == -1) { -                        len += snprintf (num, sizeof (num), format, -                                         
priv->children[i]->name); -                        if (!unknown_child_present) -                                unknown_child_present = 1; -                        unknown_count++; -                } -        } - -        if (down_child_present) { -                if (down_count > 1) { -                        len += snprintf (num, sizeof (num), "%s", -                                         down_subvol_2); -                } else { -                        len += snprintf (num, sizeof (num), "%s", -                                         down_subvol_1); -                } -        } -        if (unknown_child_present) { -                if (unknown_count > 1) { -                        len += snprintf (num, sizeof (num), "%s", -                                         unknown_subvol_2); -                } else { -                        len += snprintf (num, sizeof (num), "%s", -                                         unknown_subvol_1); -                } -        } - -        len ++; - -        string = GF_CALLOC (len, sizeof (char), gf_common_mt_char); -        if (!string) -                return; - -        off += snprintf (string + off, len - off, "%s", string_msg); -        for (i=0; i < priv->child_count; i++) { -                if ((sh->source == i) && (local->child_up[i] == 1)) -                        off += snprintf (string + off, len - off, source_child, -                                         priv->children[i]->name); -        } - -        for (i = 0; i < priv->child_count; i++) { -                if ((local->child_up[i] == 1)&& (sh->sources[i] == 0)) -                        off += snprintf (string + off, len - off, format, -                                         priv->children[i]->name); -        } - -        if (down_child_present) { -                if (down_count > 1) { -                        off += snprintf (string + off, len - off, "%s", -                                         down_subvol_2); -                } else { -               
         off += snprintf (string + off, len - off, "%s", -                                         down_subvol_1); -                } -        } - -        for (i = 0; i < priv->child_count; i++) { -                if (local->child_up[i] == 0) -                        off += snprintf (string + off, len - off, format, -                                         priv->children[i]->name); -        } - -        if (unknown_child_present) { -                if (unknown_count > 1) { -                        off += snprintf (string + off, len - off, "%s", -                                 unknown_subvol_2); -                } else { -                        off += snprintf (string + off, len - off, "%s", -                                         unknown_subvol_1); -                } -        } - -        for (i = 0; i < priv->child_count; i++) { -                if (local->child_up[i] == -1) -                        off += snprintf (string + off, len - off, format, -                                         priv->children[i]->name); -        } - -        gf_asprintf (&sh->metadata_sh_info, "%s metadata %s,", string, -                     pending_matrix_str); - -        if (pending_matrix_str && strcmp (pending_matrix_str, "")) -                GF_FREE (pending_matrix_str); - -        if (string && strcmp (string, "")) -                GF_FREE (string); -} -int -afr_sh_metadata_sync_prepare (call_frame_t *frame, xlator_t *this) +static int +__afr_selfheal_metadata_prepare (call_frame_t *frame, xlator_t *this, inode_t *inode, +				 unsigned char *locked_on, unsigned char *sources, +				 unsigned char *sinks, unsigned char *healed_sinks, +				 struct afr_reply *replies)  { -        afr_local_t     *local = NULL; -        afr_self_heal_t *sh = NULL; -        afr_private_t   *priv = NULL; -        int              source = 0; - -        local = frame->local; -        sh = &local->self_heal; -        priv = this->private; - -        source = sh->source; - -        
afr_sh_mark_source_sinks (frame, this); -        if (sh->active_sinks == 0) { -                gf_log (this->name, GF_LOG_DEBUG, -                        "no active sinks for performing self-heal on file %s", -                        local->loc.path); -                afr_sh_metadata_finish (frame, this); -                return 0; -        } - -        gf_log (this->name, GF_LOG_TRACE, -                "syncing metadata of %s from subvolume %s to %d active sinks", -                local->loc.path, priv->children[source]->name, -                sh->active_sinks); - -        sh->actual_sh_started = _gf_true; -        afr_set_self_heal_status (sh, AFR_SELF_HEAL_SYNC_BEGIN); -        afr_set_metadata_sh_info_str (local, sh, this); -        STACK_WIND (frame, afr_sh_metadata_getxattr_cbk, -                    priv->children[source], -                    priv->children[source]->fops->getxattr, -                    &local->loc, NULL, NULL); - -        return 0; +	int ret = -1; +	int source = -1; +	afr_private_t *priv = NULL; +	int i = 0; + +	priv = this->private; + +	ret = afr_selfheal_unlocked_discover (frame, inode, inode->gfid, +					      replies); +	if (ret) +		return ret; + +	ret = afr_selfheal_find_direction (frame, this, replies, +					   AFR_METADATA_TRANSACTION, +					   locked_on, sources, sinks); +	if (ret) +		return ret; + +	source = __afr_selfheal_metadata_finalize_source (this, sources, sinks, +							  locked_on, replies); +	if (source < 0) +		return -EIO; + +	for (i = 0; i < priv->child_count; i++) +		/* Initialize the healed_sinks[] array optimistically to +		   the intersection of to-be-healed (i.e sinks[]) and +		   the list of servers which are up (i.e locked_on[]). + +		   As we encounter failures in the healing process, we +		   will unmark the respective servers in the healed_sinks[] +		   array. 
+		*/ +		healed_sinks[i] = sinks[i] && locked_on[i]; + +	return source;  } -void -afr_sh_metadata_fix (call_frame_t *frame, xlator_t *this, -                     int32_t op_ret, int32_t op_errno) +static int +__afr_selfheal_metadata (call_frame_t *frame, xlator_t *this, inode_t *inode, +			 unsigned char *locked_on)  { -        afr_local_t     *local = NULL; -        afr_self_heal_t *sh = NULL; -        afr_private_t   *priv = NULL; -        int              nsources = 0; -        int              source = 0; -        int              i = 0; - -        local = frame->local; -        sh = &local->self_heal; -        priv = this->private; - -        if (op_ret < 0) { -                afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); -                afr_sh_set_error (sh, op_errno); -                afr_sh_metadata_finish (frame, this); -                goto out; -        } -        nsources = afr_build_sources (this, sh->xattr, sh->buf, -                                      sh->pending_matrix, sh->sources, -                                      sh->success_children, -                                      AFR_METADATA_TRANSACTION, NULL, _gf_false); -        if ((nsources == -1) -            && (priv->favorite_child != -1) -            && (sh->child_errno[priv->favorite_child] == 0)) { - -                gf_log (this->name, GF_LOG_WARNING, -                        "Picking favorite child %s as authentic source to resolve conflicting metadata of %s", -                        priv->children[priv->favorite_child]->name, -                        local->loc.path); - -                sh->sources[priv->favorite_child] = 1; - -                nsources = afr_sh_source_count (sh->sources, -                                                priv->child_count); -        } - -        if (nsources == -1) { -                afr_sh_print_split_brain_log (sh->pending_matrix, this, -                                              local->loc.path); -                afr_set_split_brain 
(this, sh->inode, SPB, DONT_KNOW); -                afr_sh_metadata_fail (frame, this); -                goto out; -        } - -        afr_set_split_brain (this, sh->inode, NO_SPB, DONT_KNOW); -        if (nsources == 0) { -                gf_log (this->name, GF_LOG_TRACE, -                        "No self-heal needed for %s", -                        local->loc.path); - -                afr_sh_metadata_finish (frame, this); -                goto out; -        } - -        source = afr_sh_select_source (sh->sources, priv->child_count); - -        if (source == -1) { -                gf_log (this->name, GF_LOG_DEBUG, -                        "No active sources found."); - -                afr_sh_metadata_finish (frame, this); -                goto out; -        } - -        sh->source = source; - -        /* detect changes not visible through pending flags -- JIC */ -        for (i = 0; i < priv->child_count; i++) { -                if (i == source || sh->child_errno[i]) -                        continue; - -                if (PERMISSION_DIFFERS (&sh->buf[i], &sh->buf[source])) -                        sh->sources[i] = 0; - -                if (OWNERSHIP_DIFFERS (&sh->buf[i], &sh->buf[source])) -                        sh->sources[i] = 0; -        } - -        if ((!IA_ISREG (sh->buf[source].ia_type)) && -            (!IA_ISDIR (sh->buf[source].ia_type))) { -                afr_reset_children (sh->fresh_children, priv->child_count); -                afr_get_fresh_children (sh->success_children, sh->sources, -                                        sh->fresh_children, priv->child_count); -                afr_inode_set_read_ctx (this, sh->inode, sh->source, -                                        sh->fresh_children); -        } - -        if (sh->do_metadata_self_heal && priv->metadata_self_heal) -                afr_sh_metadata_sync_prepare (frame, this); -        else -                afr_sh_metadata_finish (frame, this); +	afr_private_t *priv = NULL; +	int ret 
= -1; +	unsigned char *sources = NULL; +	unsigned char *sinks = NULL; +	unsigned char *data_lock = NULL; +	unsigned char *healed_sinks = NULL; +	struct afr_reply *locked_replies = NULL; +	int source = -1; + +	priv = this->private; + +	sources = alloca0 (priv->child_count); +	sinks = alloca0 (priv->child_count); +	healed_sinks = alloca0 (priv->child_count); +	data_lock = alloca0 (priv->child_count); + +	locked_replies = alloca0 (sizeof (*locked_replies) * priv->child_count); + +	ret = afr_selfheal_inodelk (frame, this, inode, this->name, +				    LLONG_MAX - 1, 0, data_lock); +	{ +		if (ret < 2) { +			ret = -ENOTCONN; +			goto unlock; +		} + +		ret = __afr_selfheal_metadata_prepare (frame, this, inode, data_lock, +						       sources, sinks, healed_sinks, +						       locked_replies); +		if (ret < 0) +			goto unlock; + +		source = ret; +		ret = 0; +	} +unlock: +	afr_selfheal_uninodelk (frame, this, inode, this->name, +				LLONG_MAX -1, 0, data_lock); +	if (ret < 0) +		goto out; + +	ret = afr_selfheal_metadata_do (frame, this, inode, source, healed_sinks, +					locked_replies); +	if (ret) +		goto out; + +	ret = afr_selfheal_undo_pending (frame, this, inode, sources, sinks, +					 healed_sinks, AFR_METADATA_TRANSACTION, +					 locked_replies, data_lock);  out: -        return; -} - -int -afr_sh_metadata_post_nonblocking_inodelk_cbk (call_frame_t *frame, -                                              xlator_t *this) -{ -        afr_internal_lock_t *int_lock = NULL; -        afr_local_t         *local    = NULL; - -        local    = frame->local; -        int_lock = &local->internal_lock; - -        if (int_lock->lock_op_ret < 0) { -                gf_log (this->name, GF_LOG_DEBUG, "Non Blocking metadata " -                        "inodelks failed for %s.", local->loc.path); -                gf_log (this->name, GF_LOG_DEBUG, "Metadata self-heal " -                        "failed for %s.", local->loc.path); -                afr_sh_metadata_done (frame, this); -        
} else { - -                gf_log (this->name, GF_LOG_DEBUG, "Non Blocking metadata " -                        "inodelks done for %s. Proceeding to FOP", -                        local->loc.path); -                afr_sh_common_lookup (frame, this, &local->loc, -                                      afr_sh_metadata_fix, NULL, -                                      AFR_LOOKUP_FAIL_CONFLICTS | -                                      AFR_LOOKUP_FAIL_MISSING_GFIDS, -                                      NULL); -        } - -        return 0; +	return ret;  } -int -afr_sh_metadata_lock (call_frame_t *frame, xlator_t *this) -{ -        afr_internal_lock_t *int_lock = NULL; -        afr_inodelk_t       *inodelk  = NULL; -        afr_local_t         *local    = NULL; - -        local    = frame->local; -        int_lock = &local->internal_lock; - -        int_lock->domain           = this->name; -        inodelk = afr_get_inodelk (int_lock, int_lock->domain); -        int_lock->transaction_lk_type = AFR_SELFHEAL_LK; -        int_lock->selfheal_lk_type    = AFR_METADATA_SELF_HEAL_LK; - -        afr_set_lock_number (frame, this); - -        inodelk->flock.l_start = LLONG_MAX - 1; -        inodelk->flock.l_len   = 0; -        inodelk->flock.l_type  = F_WRLCK; -        int_lock->lock_cbk         = afr_sh_metadata_post_nonblocking_inodelk_cbk; - -        afr_nonblocking_inodelk (frame, this); - -        return 0; -} - -gf_boolean_t -afr_can_start_metadata_self_heal (afr_self_heal_t *sh, afr_private_t *priv) -{ -        if (sh->force_confirm_spb) -                return _gf_true; -        if (sh->do_metadata_self_heal && priv->metadata_self_heal) -                return _gf_true; -        return _gf_false; -}  int -afr_self_heal_metadata (call_frame_t *frame, xlator_t *this) +afr_selfheal_metadata (call_frame_t *frame, xlator_t *this, inode_t *inode)  { -        afr_local_t   *local = NULL; -        afr_private_t *priv = this->private; -        afr_self_heal_t *sh = 
&local->self_heal; - -        local = frame->local; -        sh = &local->self_heal; -        sh->sh_type_in_action = AFR_SELF_HEAL_METADATA; - -        if (afr_can_start_metadata_self_heal (sh, priv)) { -                afr_set_self_heal_status (sh, AFR_SELF_HEAL_STARTED); -                afr_sh_metadata_lock (frame, this); -        } else { -                afr_sh_metadata_done (frame, this); -        } - -        return 0; +	afr_private_t *priv = NULL; +	unsigned char *locked_on = NULL; +	int ret = 0; + +	priv = this->private; + +	locked_on = alloca0 (priv->child_count); + +	ret = afr_selfheal_tryinodelk (frame, this, inode, priv->sh_domain, 0, 0, +				       locked_on); +	{ +		if (ret < 2) { +			/* Either less than two subvols available, or another +			   selfheal (from another server) is in progress. Skip +			   for now in any case there isn't anything to do. +			*/ +			ret = -ENOTCONN; +			goto unlock; +		} + +		ret = __afr_selfheal_metadata (frame, this, inode, locked_on); +	} +unlock: +	afr_selfheal_uninodelk (frame, this, inode, priv->sh_domain, 0, 0, locked_on); + +	return ret;  } diff --git a/xlators/cluster/afr/src/afr-self-heal-name.c b/xlators/cluster/afr/src/afr-self-heal-name.c new file mode 100644 index 00000000000..ce80b8da393 --- /dev/null +++ b/xlators/cluster/afr/src/afr-self-heal-name.c @@ -0,0 +1,457 @@ +/* +  Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> +  This file is part of GlusterFS. + +  This file is licensed to you under your choice of the GNU Lesser +  General Public License, version 3 or any later version (LGPLv3 or +  later), or the GNU General Public License, version 2 (GPLv2), in all +  cases as published by the Free Software Foundation. 
+*/ + + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "afr.h" +#include "afr-self-heal.h" + + +int +__afr_selfheal_assign_gfid (call_frame_t *frame, xlator_t *this, inode_t *parent, +			    uuid_t pargfid, const char *bname, inode_t *inode, +			    struct afr_reply *replies, int gfid_idx) +{ +	int i = 0; +	afr_private_t *priv = NULL; +	dict_t *xdata = NULL; +	int ret = 0; +	loc_t loc = {0, }; + +	priv = this->private; + +	uuid_copy (parent->gfid, pargfid); + +	xdata = dict_new (); +	if (!xdata) { +		return -ENOMEM; +	} + +	ret = dict_set_static_bin (xdata, "gfid-req", +				   replies[gfid_idx].poststat.ia_gfid, 16); +	if (ret) { +		dict_destroy (xdata); +		return -ENOMEM; +	} + +	loc.parent = inode_ref (parent); +	loc.inode = inode_ref (inode); +	uuid_copy (loc.pargfid, pargfid); +	loc.name = bname; + +	for (i = 0; i < priv->child_count; i++) { +		if (replies[i].op_ret == 0 || replies[i].op_errno != ENODATA) +			continue; + +		ret = syncop_lookup (priv->children[i], &loc, xdata, 0, 0, 0); +	} + +	loc_wipe (&loc); +	dict_unref (xdata); + +	return ret; +} + + +int +__afr_selfheal_name_impunge (call_frame_t *frame, xlator_t *this, inode_t *parent, +			     uuid_t pargfid, const char *bname, inode_t *inode, +			     struct afr_reply *replies, int gfid_idx) +{ +	int i = 0; +	afr_private_t *priv = NULL; +	int ret = 0; + +	priv = this->private; + +	uuid_copy (parent->gfid, pargfid); + +	for (i = 0; i < priv->child_count; i++) { +		if (!replies[i].valid) +			continue; + +		if (uuid_compare (replies[i].poststat.ia_gfid, +				  replies[gfid_idx].poststat.ia_gfid) == 0) +			continue; + +		ret |= afr_selfheal_recreate_entry (frame, this, i, gfid_idx, +						    parent, bname, inode, replies); +	} + +	return ret; +} + + +int +__afr_selfheal_name_expunge (call_frame_t *frame, xlator_t *this, inode_t *parent, +			     uuid_t pargfid, const char *bname, inode_t *inode, +			     struct afr_reply *replies) +{ +	loc_t loc = {0, }; +	int i = 0; +	
afr_private_t *priv = NULL; +	char g[64]; +	int ret = 0; + +	priv = this->private; + +	loc.parent = inode_ref (parent); +	uuid_copy (loc.pargfid, pargfid); +	loc.name = bname; +	loc.inode = inode_ref (inode); + +	for (i = 0; i < priv->child_count; i++) { +		if (!replies[i].valid) +			continue; + +		if (replies[i].op_ret) +			continue; + +		switch (replies[i].poststat.ia_type) { +		case IA_IFDIR: +			gf_log (this->name, GF_LOG_WARNING, +				"expunging dir %s/%s (%s) on %s", +				uuid_utoa (pargfid), bname, +				uuid_utoa_r (replies[i].poststat.ia_gfid, g), +				priv->children[i]->name); +			ret |= syncop_rmdir (priv->children[i], &loc, 1); +			break; +		default: +			gf_log (this->name, GF_LOG_WARNING, +				"expunging file %s/%s (%s) on %s", +				uuid_utoa (pargfid), bname, +				uuid_utoa_r (replies[i].poststat.ia_gfid, g), +				priv->children[i]->name); +			ret |= syncop_unlink (priv->children[i], &loc); +			break; +		} +	} + +	loc_wipe (&loc); + +	return ret; + +} + + +int +__afr_selfheal_name_do (call_frame_t *frame, xlator_t *this, inode_t *parent, +			uuid_t pargfid, const char *bname, inode_t *inode, +			unsigned char *sources, unsigned char *sinks, +			unsigned char *healed_sinks, int source, +			unsigned char *locked_on, struct afr_reply *replies) +{ +	int i = 0; +	afr_private_t *priv = NULL; +	uuid_t gfid = {0, }; +	int gfid_idx = -1; +	gf_boolean_t source_is_empty = _gf_true; +	gf_boolean_t need_heal = _gf_false; +	int first_idx = -1; +	char g1[64],g2[64]; + +	priv = this->private; + +	for (i = 0; i < priv->child_count; i++) { +		if (!replies[i].valid) +			continue; + +		if (first_idx == -1) { +			first_idx = i; +			continue; +		} + +		if (replies[i].op_ret != replies[first_idx].op_ret) +			need_heal = _gf_true; + +		if (uuid_compare (replies[i].poststat.ia_gfid, +				  replies[first_idx].poststat.ia_gfid)) +			need_heal = _gf_true; +	} + +	if (!need_heal) +		return 0; + +	for (i = 0; i < priv->child_count; i++) { +		if (!replies[i].valid) +			continue; + +	
	if (!replies[i].op_ret && (source == -1 || sources[i])) { +			source_is_empty = _gf_false; +			break; +		} +	} + +	if (source_is_empty) { +		return __afr_selfheal_name_expunge (frame, this, parent, pargfid, +						    bname, inode, replies); +	} + +	for (i = 0; i < priv->child_count; i++) { +		if (!replies[i].valid) +			continue; + +		if (uuid_is_null (replies[i].poststat.ia_gfid)) +			continue; + +		if (uuid_is_null (gfid)) { +			uuid_copy (gfid, replies[i].poststat.ia_gfid); +			gfid_idx = i; +			continue; +		} + +		if (sources[i] || source == -1) { +			if (gfid_idx != -1 && +			    (sources[gfid_idx] || source == -1) && +			    uuid_compare (gfid, replies[i].poststat.ia_gfid)) { +				gf_log (this->name, GF_LOG_WARNING, +					"GFID mismatch for <gfid:%s>/%s " +					"%s on %s and %s on %s", +					uuid_utoa (pargfid), bname, +					uuid_utoa_r (replies[i].poststat.ia_gfid, g1), +					priv->children[i]->name, +					uuid_utoa_r (replies[gfid_idx].poststat.ia_gfid, g2), +					priv->children[gfid_idx]->name); +				return -1; +			} + +			uuid_copy (gfid, replies[i].poststat.ia_gfid); +			gfid_idx = i; +			continue; +		} +	} + +	if (gfid_idx == -1) +		return -1; + +	__afr_selfheal_assign_gfid (frame, this, parent, pargfid, bname, inode, +				    replies, gfid_idx); + +	return __afr_selfheal_name_impunge (frame, this, parent, pargfid, +					    bname, inode, replies, gfid_idx); +} + + +int +__afr_selfheal_name_finalize_source (xlator_t *this, unsigned char *sources, +				     unsigned char *sinks, unsigned char *locked_on, +				     struct afr_reply *replies) +{ +	int i = 0; +	afr_private_t *priv = NULL; +	int source = -1; +	int locked_count = 0; +	int sources_count = 0; +	int sinks_count = 0; + +	priv = this->private; + +	locked_count = AFR_COUNT (locked_on, priv->child_count); +	sources_count = AFR_COUNT (sources, priv->child_count); +	sinks_count = AFR_COUNT (sinks, priv->child_count); + +	if (locked_count == sinks_count || !sources_count) { +		return -1; +	} + +	for 
(i = 0; i < priv->child_count; i++) { +		if (sources[i]) { +			source = i; +			break; +		} +	} + +	return source; +} + + +int +__afr_selfheal_name_prepare (call_frame_t *frame, xlator_t *this, inode_t *parent, +			     uuid_t pargfid, unsigned char *locked_on, +			     unsigned char *sources, unsigned char *sinks, +			     unsigned char *healed_sinks, struct afr_reply *replies, +			     int *source_p) +{ +	int ret = -1; +	int source = -1; +	afr_private_t *priv = NULL; +	int i = 0; + +	priv = this->private; + +	ret = afr_selfheal_unlocked_discover (frame, parent, pargfid, replies); +	if (ret) +		return ret; + +	ret = afr_selfheal_find_direction (frame, this, replies, +					   AFR_ENTRY_TRANSACTION, +					   locked_on, sources, sinks); +	if (ret) +		return ret; + +	source = __afr_selfheal_name_finalize_source (this, sources, sinks, +						      locked_on, replies); +	if (source < 0) { +		/* If source is < 0 (typically split-brain), we perform a +		   conservative merge of entries rather than erroring out */ +	} +	*source_p = source; + +	for (i = 0; i < priv->child_count; i++) +		/* Initialize the healed_sinks[] array optimistically to +		   the intersection of to-be-healed (i.e sinks[]) and +		   the list of servers which are up (i.e locked_on[]). + +		   As we encounter failures in the healing process, we +		   will unmark the respective servers in the healed_sinks[] +		   array. 
+		*/ +		healed_sinks[i] = sinks[i] && locked_on[i]; + +	return ret; +} + + +int +afr_selfheal_name_do (call_frame_t *frame, xlator_t *this, inode_t *parent, +		      uuid_t pargfid, const char *bname) +{ +	afr_private_t *priv = NULL; +	unsigned char *sources = NULL; +	unsigned char *sinks = NULL; +	unsigned char *healed_sinks = NULL; +	unsigned char *locked_on = NULL; +	int source = -1; +	struct afr_reply *replies = NULL; +	int ret = -1; +	inode_t *inode = NULL; + +	priv = this->private; + +	locked_on = alloca0 (priv->child_count); +	sources = alloca0 (priv->child_count); +	sinks = alloca0 (priv->child_count); +	healed_sinks = alloca0 (priv->child_count); + +	replies = alloca0 (priv->child_count * sizeof(*replies)); + +	ret = afr_selfheal_entrylk (frame, this, parent, this->name, bname, +				    locked_on); +	{ +		if (ret < 2) { +			ret = -ENOTCONN; +			goto unlock; +		} + +		ret = __afr_selfheal_name_prepare (frame, this, parent, pargfid, +						   locked_on, sources, sinks, +						   healed_sinks, replies, +						   &source); +		if (ret) +			goto unlock; + +		inode = afr_selfheal_unlocked_lookup_on (frame, parent, bname, +							 replies, locked_on); +		if (!inode) { +			ret = -ENOMEM; +			goto unlock; +		} + +		ret = __afr_selfheal_name_do (frame, this, parent, pargfid, bname, +					      inode, sources, sinks, healed_sinks, +					      source, locked_on, replies); +	} +unlock: +	afr_selfheal_unentrylk (frame, this, parent, this->name, bname, +				locked_on); +	if (inode) +		inode_unref (inode); + +	return ret; +} + + +int +afr_selfheal_name_unlocked_inspect (call_frame_t *frame, xlator_t *this, +				    inode_t *parent, uuid_t pargfid, +				    const char *bname, gf_boolean_t *need_heal) +{ +	afr_private_t *priv = NULL; +	int i = 0; +	struct afr_reply *replies = NULL; +	inode_t *inode = NULL; +	int first_idx = -1; + +	priv = this->private; + +	replies = alloca0 (sizeof (*replies) * priv->child_count); + +	inode = afr_selfheal_unlocked_lookup_on (frame, parent, 
bname, +						 replies, priv->child_up); +	if (!inode) +		return -ENOMEM; + +	for (i = 0; i < priv->child_count; i++) { +		if (!replies[i].valid) +			continue; + +		if (first_idx == -1) { +			first_idx = i; +			continue; +		} + +		if (replies[i].op_ret != replies[first_idx].op_ret) +			*need_heal = _gf_true; + +		if (uuid_compare (replies[i].poststat.ia_gfid, +				  replies[first_idx].poststat.ia_gfid)) +			*need_heal = _gf_true; +	} + +	if (inode) +		inode_unref (inode); +	return 0; +} + +int +afr_selfheal_name (xlator_t *this, uuid_t pargfid, const char *bname) +{ +	inode_t *parent = NULL; +	call_frame_t *frame = NULL; +	int ret = -1; +	gf_boolean_t need_heal = _gf_false; + +	parent = afr_inode_find (this, pargfid); +	if (!parent) +		goto out; + +	frame = afr_frame_create (this); +	if (!frame) +		goto out; + +	ret = afr_selfheal_name_unlocked_inspect (frame, this, parent, pargfid, +						  bname, &need_heal); +	if (ret) +		goto out; + +	if (need_heal) +		afr_selfheal_name_do (frame, this, parent, pargfid, bname); +out: +	if (parent) +		inode_unref (parent); +	if (frame) +		AFR_STACK_DESTROY (frame); + +	return ret; +} diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h index 7c9bc81119c..a1b972ac35d 100644 --- a/xlators/cluster/afr/src/afr-self-heal.h +++ b/xlators/cluster/afr/src/afr-self-heal.h @@ -1,5 +1,5 @@  /* -  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> +  Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>    This file is part of GlusterFS.    This file is licensed to you under your choice of the GNU Lesser @@ -8,36 +8,160 @@    cases as published by the Free Software Foundation.  
*/ -#ifndef __AFR_SELF_HEAL_H__ -#define __AFR_SELF_HEAL_H__ -#include <sys/stat.h> +#ifndef _AFR_SELFHEAL_H +#define _AFR_SELFHEAL_H -#define FILETYPE_DIFFERS(buf1,buf2) ((buf1)->ia_type != (buf2)->ia_type) -#define PERMISSION_DIFFERS(buf1,buf2) (st_mode_from_ia ((buf1)->ia_prot, (buf1)->ia_type) != st_mode_from_ia ((buf2)->ia_prot, (buf2)->ia_type)) -#define OWNERSHIP_DIFFERS(buf1,buf2) (((buf1)->ia_uid != (buf2)->ia_uid) || ((buf1)->ia_gid != (buf2)->ia_gid)) -#define SIZE_DIFFERS(buf1,buf2) ((buf1)->ia_size != (buf2)->ia_size) -#define SIZE_GREATER(buf1,buf2) ((buf1)->ia_size > (buf2)->ia_size) +/* Perform fop on all UP subvolumes and wait for all callbacks to return */ + +#define AFR_ONALL(frame, rfn, fop, args ...) do {			\ +	afr_local_t *__local = frame->local;				\ +	afr_private_t *__priv = frame->this->private;			\ +	int __i = 0, __count = 0;					\ +									\ +	afr_replies_wipe (__local, __priv);				\ +									\ +	for (__i = 0; __i < __priv->child_count; __i++) {		\ +		if (!__priv->child_up[__i]) continue;			\ +		STACK_WIND_COOKIE (frame, rfn, (void *)(long) __i,	\ +				   __priv->children[__i],		\ +				   __priv->children[__i]->fops->fop, args); \ +		__count++;						\ +	}								\ +	syncbarrier_wait (&__local->barrier, __count);			\ +	} while (0) + + +/* Perform fop on all subvolumes represented by list[] array and wait +   for all callbacks to return */ + +#define AFR_ONLIST(list, frame, rfn, fop, args ...) 
do {		\ +	afr_local_t *__local = frame->local;				\ +	afr_private_t *__priv = frame->this->private;			\ +	int __i = 0, __count = 0;					\ +									\ +	afr_replies_wipe (__local, __priv);				\ +									\ +	for (__i = 0; __i < __priv->child_count; __i++) {		\ +		if (!list[__i]) continue;				\ +		STACK_WIND_COOKIE (frame, rfn, (void *)(long) __i,	\ +				   __priv->children[__i],		\ +				   __priv->children[__i]->fops->fop, args); \ +		__count++;						\ +	}								\ +	syncbarrier_wait (&__local->barrier, __count);			\ +	} while (0) + + +#define AFR_SEQ(frame, rfn, fop, args ...) do {				\ +	afr_local_t *__local = frame->local;				\ +	afr_private_t *__priv = frame->this->private;			\ +	int __i = 0;							\ +									\ +	afr_replies_wipe (__local, __priv);				\ +									\ +	for (__i = 0; __i < __priv->child_count; __i++) {		\ +		if (!__priv->child_up[__i]) continue;			\ +		STACK_WIND_COOKIE (frame, rfn, (void *)(long) __i,	\ +				   __priv->children[__i],		\ +				   __priv->children[__i]->fops->fop, args); \ +		syncbarrier_wait (&__local->barrier, 1);		\ +	}								\ +	} while (0) + + +#define ALLOC_MATRIX(n, type) ({type **__ptr = NULL; \ +	int __i; \ +	__ptr = alloca0 (n * sizeof(type *)); \ +	for (__i = 0; __i < n; __i++) __ptr[__i] = alloca0 (n * sizeof(type)); \ +	__ptr;}) + + +#define IA_EQUAL(f,s,field) (memcmp (&(f.ia_##field), &(s.ia_##field), sizeof (s.ia_##field)) == 0) + + +int +afr_selfheal (xlator_t *this, uuid_t gfid); + +int +afr_selfheal_name (xlator_t *this, uuid_t gfid, const char *name); + +int +afr_selfheal_data (call_frame_t *frame, xlator_t *this, inode_t *inode); + +int +afr_selfheal_metadata (call_frame_t *frame, xlator_t *this, inode_t *inode);  int -afr_self_heal_entry (call_frame_t *frame, xlator_t *this); +afr_selfheal_entry (call_frame_t *frame, xlator_t *this, inode_t *inode); + + +int +afr_selfheal_inodelk (call_frame_t *frame, xlator_t *this, inode_t *inode, +		      char *dom, off_t off, size_t size, +		      unsigned char 
*locked_on); + +int +afr_selfheal_tryinodelk (call_frame_t *frame, xlator_t *this, inode_t *inode, +			 char *dom, off_t off, size_t size, +			 unsigned char *locked_on);  int -afr_self_heal_data (call_frame_t *frame, xlator_t *this); +afr_selfheal_uninodelk (call_frame_t *frame, xlator_t *this, inode_t *inode, +			char *dom, off_t off, size_t size, +			const unsigned char *locked_on);  int -afr_self_heal_metadata (call_frame_t *frame, xlator_t *this); +afr_selfheal_entrylk (call_frame_t *frame, xlator_t *this, inode_t *inode, +		      char *dom, const char *name, unsigned char *locked_on);  int -afr_self_heal_get_source (xlator_t *this, afr_local_t *local, dict_t **xattr); +afr_selfheal_tryentrylk (call_frame_t *frame, xlator_t *this, inode_t *inode, +			 char *dom, const char *name, unsigned char *locked_on); + +int +afr_selfheal_unentrylk (call_frame_t *frame, xlator_t *this, inode_t *inode, +			char *dom, const char *name, unsigned char *locked_on); + +int +afr_selfheal_unlocked_discover (call_frame_t *frame, inode_t *inode, +				uuid_t gfid, struct afr_reply *replies); + +inode_t * +afr_selfheal_unlocked_lookup_on (call_frame_t *frame, inode_t *parent, +				 const char *name, struct afr_reply *replies, +				 unsigned char *lookup_on);  int -afr_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode); +afr_selfheal_find_direction (call_frame_t *frame, xlator_t *this, +			     struct afr_reply *replies, +			     afr_transaction_type type, unsigned char *locked_on, +			     unsigned char *sources, unsigned char *sinks);  int -afr_lookup_select_read_child_by_txn_type (xlator_t *this, afr_local_t *local, -                                          dict_t **xattr, -                                          afr_transaction_type txn_type, -                                          uuid_t gfid); -#endif /* __AFR_SELF_HEAL_H__ */ +afr_selfheal_extract_xattr (xlator_t *this, struct afr_reply *replies, +			    afr_transaction_type type, int *dirty, int **matrix); + 
+int +afr_selfheal_undo_pending (call_frame_t *frame, xlator_t *this, inode_t *inode, +			   unsigned char *sources, unsigned char *sinks, +			   unsigned char *healed_sinks, afr_transaction_type type, +			   struct afr_reply *replies, unsigned char *locked_on); + +int +afr_selfheal_recreate_entry (call_frame_t *frame, xlator_t *this, int dst, +			     int source, inode_t *dir, const char *name, +			     inode_t *inode, struct afr_reply *replies); + +int +afr_selfheal_post_op (call_frame_t *frame, xlator_t *this, inode_t *inode, +		      int subvol, dict_t *xattr); + +call_frame_t * +afr_frame_create (xlator_t *this); + +inode_t * +afr_inode_find (xlator_t *this, uuid_t gfid); + +#endif /* !_AFR_SELFHEAL_H */ diff --git a/xlators/cluster/afr/src/afr-self-heald.c b/xlators/cluster/afr/src/afr-self-heald.c index 9e5c1b3e79f..4bfe909bcb9 100644 --- a/xlators/cluster/afr/src/afr-self-heald.c +++ b/xlators/cluster/afr/src/afr-self-heald.c @@ -1,5 +1,5 @@  /* -  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> +  Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>    This file is part of GlusterFS.    This file is licensed to you under your choice of the GNU Lesser @@ -8,1828 +8,1249 @@    cases as published by the Free Software Foundation.  
*/ +  #ifndef _CONFIG_H  #define _CONFIG_H  #include "config.h"  #endif +  #include "afr.h" -#include "syncop.h" +#include "afr-self-heal.h"  #include "afr-self-heald.h" -#include "afr-self-heal-common.h"  #include "protocol-common.h" -#include "event-history.h" - -typedef enum { -        STOP_CRAWL_ON_SINGLE_SUBVOL = 1, -        STOP_INDEX_CRAWL_ON_PENDING_FULL_CRAWL = 2 -} afr_crawl_flags_t; - -typedef enum { -        HEAL = 1, -        INFO, -        STATISTICS_TO_BE_HEALED, -} shd_crawl_op; - -typedef struct shd_dump { -        dict_t   *dict; -        xlator_t *this; -        int      child; -} shd_dump_t; - -typedef struct shd_event_ { -        int     child; -        char    *path; -} shd_event_t; - -typedef struct shd_pos_ { -        int     child; -        xlator_t *this; -        afr_child_pos_t pos; -} shd_pos_t; - -typedef int -(*afr_crawl_done_cbk_t)  (int ret, call_frame_t *sync_frame, void *crawl_data); -void -afr_start_crawl (xlator_t *this, int idx, afr_crawl_type_t crawl, -                 process_entry_cbk_t process_entry, void *op_data, -                 gf_boolean_t exclusive, int crawl_flags, -                 afr_crawl_done_cbk_t crawl_done); +#define SHD_INODE_LRU_LIMIT          2048 +#define AFR_EH_HEALED_LIMIT          1024 +#define AFR_EH_HEAL_FAIL_LIMIT       1024 +#define AFR_EH_SPLIT_BRAIN_LIMIT     1024 +#define AFR_STATISTICS_HISTORY_SIZE    50 -static int -_crawl_directory (fd_t *fd, loc_t *loc, afr_crawl_data_t *crawl_data); -/* For calling straight through (e.g. already in a synctask). */ -int -afr_find_child_position (xlator_t *this, int child, afr_child_pos_t *pos); +#define ASSERT_LOCAL(this, healer)				\ +	if (!afr_shd_is_subvol_local(this, healer->subvol)) {	\ +		healer->local = _gf_false;			\ +		if (safe_break (healer)) {			\ +			break;					\ +		} else {					\ +			continue;				\ +		}						\ +	} else {						\ +		healer->local = _gf_true;			\ +	} -/* For deferring through a new synctask. 
*/ -int -afr_syncop_find_child_position (void *data); -static int -_loc_assign_gfid_path (loc_t *loc) -{ -        int  ret = -1; -        char gfid_path[64] = {0}; - -        if (loc->inode && !uuid_is_null (loc->inode->gfid)) { -                ret = inode_path (loc->inode, NULL, (char**)&loc->path); -        } else if (!uuid_is_null (loc->gfid)) { -                snprintf (gfid_path, sizeof (gfid_path), "<gfid:%s>", -                          uuid_utoa (loc->gfid)); -                loc->path = gf_strdup (gfid_path); -                if (loc->path) -                        ret = 0; -        } -        return ret; -} +#define NTH_INDEX_HEALER(this, n) &((((afr_private_t *)this->private))->shd.index_healers[n]) +#define NTH_FULL_HEALER(this, n) &((((afr_private_t *)this->private))->shd.full_healers[n]) -void -_destroy_crawl_event_data (void *data) -{ -        shd_crawl_event_t        *crawl_event = NULL; +int afr_shd_gfid_to_path (xlator_t *this, xlator_t *subvol, uuid_t gfid, char **path_p); -        if (!data) -                goto out; +char * +afr_subvol_name (xlator_t *this, int subvol) +{ +	afr_private_t *priv = NULL; -        crawl_event = (shd_crawl_event_t *)data; -        GF_FREE (crawl_event->start_time_str); -        GF_FREE (crawl_event->end_time_str); +	priv = this->private; +	if (subvol < 0 || subvol > priv->child_count) +		return NULL; -out: -        return; +	return priv->children[subvol]->name;  } +  void -_destroy_shd_event_data (void *data) +afr_destroy_crawl_event_data (void *data)  { -        shd_event_t             *event = NULL; -        if (!data) -                goto out; -        event = (shd_event_t*)data; -        GF_FREE (event->path); -out:          return;  } + +  void -shd_cleanup_event (void *event) +afr_destroy_shd_event_data (void *data)  { -        shd_event_t *shd_event = event; +	shd_event_t *shd_event = data; + +	if (!shd_event) +		return; +	GF_FREE (shd_event->path); -        if (!shd_event) -                goto out; -    
    GF_FREE (shd_event->path); -        GF_FREE (shd_event); -out:          return;  } -int -afr_get_local_child (afr_self_heald_t *shd, unsigned int child_count) -{ -        int i = 0; -        int ret = -1; -        for (i = 0; i < child_count; i++) { -                if (shd->pos[i] == AFR_POS_LOCAL) { -                        ret = i; -                        break; -                } -        } -        return ret; -} -static int -_build_index_loc (xlator_t *this, loc_t *loc, char *name, loc_t *parent) +gf_boolean_t +afr_shd_is_subvol_local (xlator_t *this, int subvol)  { -        int             ret = 0; +	char *pathinfo = NULL; +	afr_private_t *priv = NULL; +	dict_t *xattr = NULL; +	int ret = 0; +	gf_boolean_t is_local = _gf_false; +	loc_t loc = {0, }; -        uuid_copy (loc->pargfid, parent->inode->gfid); -        loc->path = ""; -        loc->name = name; -        loc->parent = inode_ref (parent->inode); -        if (!loc->parent) { -                loc->path = NULL; -                loc_wipe (loc); -                ret = -1; -        } -        return ret; -} +	priv = this->private; -int -_add_crawl_stats_to_dict (xlator_t *this, dict_t *output, int child, -                          shd_crawl_event_t *shd_event, struct timeval *tv) -{ -        int             ret = 0; -        uint64_t        count = 0; -        char            key[256] = {0}; -        int             xl_id = 0; -        uint64_t        healed_count = 0; -        uint64_t        split_brain_count = 0; -        uint64_t        heal_failed_count = 0; -        char            *start_time_str = NULL; -        char            *end_time_str = NULL; -        char            *crawl_type = NULL; -        int             progress = -1; +	loc.inode = this->itable->root; +	uuid_copy (loc.gfid, loc.inode->gfid); -        healed_count = shd_event->healed_count; -        split_brain_count = shd_event->split_brain_count; -        heal_failed_count = shd_event->heal_failed_count; -        start_time_str 
= shd_event->start_time_str; -        end_time_str = shd_event->end_time_str; -        crawl_type = shd_event->crawl_type; +	ret = syncop_getxattr (priv->children[subvol], &loc, &xattr, +			       GF_XATTR_PATHINFO_KEY); +	if (ret) +		return _gf_false; +	if (!xattr) +		return _gf_false; -        if (!start_time_str) { -                ret = -1; -                goto out; -        } +	ret = dict_get_str (xattr, GF_XATTR_PATHINFO_KEY, &pathinfo); +	if (ret) +		return _gf_false; +	afr_local_pathinfo (pathinfo, &is_local); -        ret = dict_get_int32 (output, this->name, &xl_id); -        if (ret) { -                gf_log (this->name, GF_LOG_ERROR, "xl does not have id"); -                goto out; -        } +	gf_log (this->name, GF_LOG_DEBUG, "subvol %s is %slocal", +		priv->children[subvol]->name, is_local? "" : "not "); -        snprintf (key, sizeof (key), "statistics-%d-%d-count", xl_id, child); -        ret = dict_get_uint64 (output, key, &count); +	return is_local; +} -        snprintf (key, sizeof (key), "statistics_healed_cnt-%d-%d-%"PRIu64, -                  xl_id, child, count); -        ret = dict_set_uint64(output, key, healed_count); -        if (ret) { -                gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_" -                        "healed_count to outout"); -                goto out; -         } -        snprintf (key, sizeof (key), "statistics_sb_cnt-%d-%d-%"PRIu64, -                  xl_id, child, count); -        ret = dict_set_uint64 (output, key, split_brain_count); -         if (ret) { -                gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_" -                        "split_brain_count to outout"); -                goto out; -        } -        snprintf (key, sizeof (key), "statistics_crawl_type-%d-%d-%"PRIu64, -                  xl_id, child, count); -        ret = dict_set_dynstr (output, key, gf_strdup (crawl_type)); -        if (ret) { -                gf_log (this->name, GF_LOG_ERROR, "Could 
not add statistics_" -                        "crawl_type to output"); -                goto out; -        } -        snprintf (key, sizeof (key), "statistics_heal_failed_cnt-%d-%d-%"PRIu64, -                  xl_id, child, count); -        ret = dict_set_uint64 (output, key, heal_failed_count); -         if (ret) { -                gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_" -                        "healed_failed_count to outout"); -                goto out; -        } -        snprintf (key, sizeof (key), "statistics_strt_time-%d-%d-%"PRIu64, -                  xl_id, child, count); -        ret = dict_set_dynstr (output, key, gf_strdup(start_time_str)); -         if (ret) { -                gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_" -                        "crawl_start_time to outout"); -                goto out; -        } +int +__afr_shd_healer_wait (struct subvol_healer *healer) +{ +	afr_private_t *priv = NULL; +	struct timespec wait_till = {0, }; +	int ret = 0; -        snprintf (key, sizeof (key), "statistics_end_time-%d-%d-%"PRIu64, -                  xl_id, child, count); +	priv = healer->this->private; -        if (!end_time_str) -                end_time_str = "Could not determine the end time"; -        ret = dict_set_dynstr (output, key, gf_strdup(end_time_str)); -         if (ret) { -                gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_" -                        "crawl_end_time to outout"); -                goto out; -        } -        snprintf (key, sizeof (key), "statistics_inprogress-%d-%d-%"PRIu64, -                  xl_id, child, count); +disabled_loop: +	wait_till.tv_sec = time (NULL) + 60; -        if (shd_event->crawl_inprogress == _gf_true) -                progress = 1; -        else -                progress = 0; +	while (!healer->rerun) { +		ret = pthread_cond_timedwait (&healer->cond, +					      &healer->mutex, +					      &wait_till); +		if (ret == ETIMEDOUT) +			break; +	} 
-        ret = dict_set_int32 (output, key, progress); -         if (ret) { -                gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_" -                        "inprogress to outout"); -                goto out; -        } +	ret = healer->rerun; +	healer->rerun = 0; -         snprintf (key, sizeof (key), "statistics-%d-%d-count",xl_id, child); -         ret = dict_set_uint64 (output, key, count + 1); -         if (ret) { -                gf_log (this->name, GF_LOG_ERROR, "Could not increment the " -                        "counter."); -                goto out; -         } -out: -        return ret; +	if (!priv->shd.enabled) +		goto disabled_loop; + +	return ret;  } +  int -_add_path_to_dict (xlator_t *this, dict_t *output, int child, char *path, -                   struct timeval *tv, gf_boolean_t dyn) +afr_shd_healer_wait (struct subvol_healer *healer)  { -        //subkey not used for now -        int             ret = -1; -        uint64_t        count = 0; -        char            key[256] = {0}; -        int             xl_id = 0; +	int ret = 0; -        ret = dict_get_int32 (output, this->name, &xl_id); -        if (ret) { -                gf_log (this->name, GF_LOG_ERROR, "xl does not have id"); -                goto out; -        } +	pthread_mutex_lock (&healer->mutex); +	{ +		ret = __afr_shd_healer_wait (healer); +	} +	pthread_mutex_unlock (&healer->mutex); -        snprintf (key, sizeof (key), "%d-%d-count", xl_id, child); -        ret = dict_get_uint64 (output, key, &count); +	return ret; +} -        snprintf (key, sizeof (key), "%d-%d-%"PRIu64, xl_id, child, count); -        if (dyn) -                ret = dict_set_dynstr (output, key, path); -        else -                ret = dict_set_str (output, key, path); -        if (ret) { -                gf_log (this->name, GF_LOG_ERROR, "%s: Could not add to output", -                        path); -                goto out; -        } -        if (!tv) -                goto inc_count; - 
       snprintf (key, sizeof (key), "%d-%d-%"PRIu64"-time", xl_id, -                  child, count); -        ret = dict_set_uint32 (output, key, tv->tv_sec); -        if (ret) { -                gf_log (this->name, GF_LOG_ERROR, "%s: Could not set time", -                        path); -                goto out; -        } +gf_boolean_t +safe_break (struct subvol_healer *healer) +{ +	gf_boolean_t ret = _gf_false; -inc_count: -        snprintf (key, sizeof (key), "%d-%d-count", xl_id, child); -        ret = dict_set_uint64 (output, key, count + 1); -        if (ret) { -                gf_log (this->name, GF_LOG_ERROR, "Could not increment count"); -                goto out; -        } -        ret = 0; -out: -        return ret; -} +	pthread_mutex_lock (&healer->mutex); +	{ +		if (healer->rerun) +			goto unlock; -int -_get_path_from_gfid_loc (xlator_t *this, xlator_t *readdir_xl, loc_t *child, -                         char **fpath, gf_boolean_t *missing) -{ -        dict_t          *xattr = NULL; -        char            *path = NULL; -        int             ret = -1; +		healer->running = _gf_false; +		ret = _gf_true; +	} +unlock: +	pthread_mutex_unlock (&healer->mutex); -        ret = syncop_getxattr (readdir_xl, child, &xattr, GFID_TO_PATH_KEY); -        if (ret < 0) { -                if ((-ret == ENOENT || -ret == ESTALE) && missing) -                        *missing = _gf_true; -                ret = -1; -                goto out; -        } -        ret = dict_get_str (xattr, GFID_TO_PATH_KEY, &path); -        if (ret) { -                gf_log (this->name, GF_LOG_ERROR, "Failed to get path for " -                        "gfid %s", uuid_utoa (child->gfid)); -                goto out; -        } -        path = gf_strdup (path); -        if (!path) { -                ret = -1; -                goto out; -        } -        ret = 0; -out: -        if (!ret) -                *fpath = path; -        if (xattr) -                dict_unref (xattr); -        
return ret; +	return ret;  } -int -_add_event_to_dict (circular_buffer_t *cb, void *data) -{ -        int               ret = 0; -        shd_dump_t        *dump_data = NULL; -        shd_event_t       *shd_event = NULL; -        dump_data = data; -        shd_event = cb->data; -        if (shd_event->child != dump_data->child) -                goto out; -        ret = _add_path_to_dict (dump_data->this, dump_data->dict, -                                 dump_data->child, shd_event->path, &cb->tv, -                                 _gf_false); +inode_t * +afr_shd_inode_find (xlator_t *this, xlator_t *subvol, uuid_t gfid) +{ +	inode_t *inode = NULL; +	int ret = 0; +	loc_t loc = {0, }; +	struct iatt iatt = {0, }; + +	inode = inode_find (this->itable, gfid); +	if (inode) +		goto out; + +	loc.inode = inode_new (this->itable); +	if (!loc.inode) +		goto out; +	uuid_copy (loc.gfid, gfid); + +	ret = syncop_lookup (subvol, &loc, NULL, &iatt, NULL, NULL); +	if (ret < 0) +		goto out; + +	inode = inode_link (loc.inode, NULL, NULL, &iatt); +	if (inode) +		inode_lookup (inode);  out: -        return ret; +	loc_wipe (&loc); +	return inode;  } -int -_add_crawl_event_statistics_to_dict (circular_buffer_t *cb, void *data) -{ -        int               ret = 0; -        shd_dump_t        *dump_data = NULL; -        shd_crawl_event_t *shd_event = NULL; - -        dump_data = data; -        shd_event = cb->data; -        ret = _add_crawl_stats_to_dict (dump_data->this, dump_data->dict, -                                        dump_data->child, shd_event, &cb->tv); -        return ret; -} -int -_add_eh_to_dict (xlator_t *this, eh_t *eh, dict_t *dict, int child) +fd_t * +afr_shd_index_opendir (xlator_t *this, int child)  { -        shd_dump_t dump_data = {0}; - -        dump_data.this = this; -        dump_data.dict = dict; -        dump_data.child = child; -        eh_dump (eh, &dump_data, _add_event_to_dict); -        return 0; +	fd_t *fd = NULL; +	afr_private_t *priv = NULL; +	xlator_t 
*subvol = NULL; +	loc_t rootloc = {0, }; +	inode_t *inode = NULL; +	int ret = 0; +	dict_t *xattr = NULL; +	void *index_gfid = NULL; + +	priv = this->private; +	subvol = priv->children[child]; + +	rootloc.inode = inode_ref (this->itable->root); +	uuid_copy (rootloc.gfid, rootloc.inode->gfid); + +	ret = syncop_getxattr (subvol, &rootloc, &xattr, +			       GF_XATTROP_INDEX_GFID); +	if (ret || !xattr) { +		errno = -ret; +		goto out; +	} + +	ret = dict_get_ptr (xattr, GF_XATTROP_INDEX_GFID, &index_gfid); +	if (ret) +		goto out; + +	gf_log (this->name, GF_LOG_DEBUG, "index-dir gfid for %s: %s", +		subvol->name, uuid_utoa (index_gfid)); + +	inode = afr_shd_inode_find (this, subvol, index_gfid); +	if (!inode) +		goto out; +	fd = fd_anonymous (inode); +out: +	loc_wipe (&rootloc); +	if (xattr) +		dict_unref (xattr); +	return fd;  }  int -_add_statistics_to_dict (xlator_t *this, dict_t *dict, int child) +afr_shd_index_purge (xlator_t *subvol, inode_t *inode, char *name)  { -        shd_dump_t              dump_data = {0}; -        afr_private_t           *priv  = NULL; -        afr_self_heald_t        *shd = NULL; +	loc_t loc = {0, }; +	int ret = 0; -        priv = this->private; -        shd = &priv->shd; +	loc.parent = inode_ref (inode); +	loc.name = name; -        dump_data.this = this; -        dump_data.dict = dict; -        dump_data.child = child; -        eh_dump (shd->statistics[child], &dump_data, -                 _add_crawl_event_statistics_to_dict); -        return 0; +	ret = syncop_unlink (subvol, &loc); +	loc_wipe (&loc); +	return ret;  } -void -_remove_stale_index (xlator_t *this, xlator_t *readdir_xl, -                     loc_t *parent, char *fname) -{ -        int              ret = 0; -        loc_t            index_loc = {0}; - -        ret = _build_index_loc (this, &index_loc, fname, parent); -        if (ret) -                goto out; -        gf_log (this->name, GF_LOG_DEBUG, "Removing stale index " -                "for %s on %s", index_loc.name, 
readdir_xl->name); -        ret = syncop_unlink (readdir_xl, &index_loc); -        if((ret < 0) && (-ret != ENOENT)) { -                gf_log(this->name, GF_LOG_ERROR, "%s: Failed to remove index " -                       "on %s - %s",index_loc.name, readdir_xl->name, -                       strerror (-ret)); -        } -        index_loc.path = NULL; -        loc_wipe (&index_loc); -out: -        return; -}  int -_count_hard_links_under_base_indices_dir (xlator_t *this, -                                           afr_crawl_data_t *crawl_data, -                                           gf_dirent_t *entry, loc_t *childloc, -                                           loc_t *parentloc, struct iatt *iattr) +afr_shd_selfheal_name (struct subvol_healer *healer, int child, uuid_t parent, +		       const char *bname)  { -        xlator_t                *readdir_xl = crawl_data->readdir_xl; -        struct iatt             parent = {0}; -        int                     ret = 0; -        dict_t                  *output = NULL; -        int                     xl_id =  0; -        char                    key[256] = {0}; -        int                     child  = -1; -        uint64_t                     hardlinks = 0; - -        output = crawl_data->op_data; -        child = crawl_data->child; - -        ret = syncop_lookup (readdir_xl, childloc, NULL, iattr, NULL, &parent); -        if (ret) { -                ret = -1; -                goto out; -        } - -        ret = dict_get_int32 (output, this->name, &xl_id); -        if (ret) -                goto out; +	int ret = -1; -        snprintf (key, sizeof (key), "%d-%d-hardlinks", xl_id, child); -        ret =  dict_get_uint64 (output, key, &hardlinks); +	ret = afr_selfheal_name (THIS, parent, bname); -        /*Removing the count of base_entry under indices/base_indicies and -         * entry under indices/xattrop */ -        hardlinks = hardlinks + iattr->ia_nlink - 2; -        ret = dict_set_uint64 (output, key, 
hardlinks); -        if (ret) -                goto out; - -out: -        return ret; +	return ret;  }  int -_add_summary_to_dict (xlator_t *this, afr_crawl_data_t *crawl_data, -                      gf_dirent_t *entry, -                      loc_t *childloc, loc_t *parentloc, struct iatt *iattr) +afr_shd_selfheal (struct subvol_healer *healer, int child, uuid_t gfid)  { -        dict_t          *output = NULL; -        xlator_t        *readdir_xl = NULL; -        int             ret = -1; -        char            *path = NULL; -        gf_boolean_t    missing = _gf_false; -        char            gfid_str[64] = {0}; +	int ret = 0; +	eh_t *eh = NULL; +	afr_private_t *priv = NULL; +	afr_self_heald_t *shd = NULL; +	shd_event_t *shd_event = NULL; +	char *path = NULL; +	xlator_t *subvol = NULL; +	xlator_t *this = NULL; +	crawl_event_t *crawl_event = NULL; + +	this = healer->this; +	priv = this->private; +	shd = &priv->shd; +	crawl_event = &healer->crawl_event; + +	subvol = priv->children[child]; + +	ret = afr_selfheal (this, gfid); + +	if (ret == -EIO) { +		eh = shd->split_brain; +		crawl_event->split_brain_count++; +	} else if (ret < 0) { +		eh = shd->heal_failed; +		crawl_event->heal_failed_count++; +	} else if (ret == 0) { +		eh = shd->healed; +		crawl_event->healed_count++; +	} + +	afr_shd_gfid_to_path (this, subvol, gfid, &path); +	if (!path) +		return ret; + +	if (eh) { +		shd_event = GF_CALLOC (1, sizeof(*shd_event), +				       gf_afr_mt_shd_event_t); +		if (!shd_event) { +			GF_FREE (path); +			return ret; +		} + +		shd_event->child = child; +		shd_event->path = path; + +		if (eh_save_history (eh, shd_event) < 0) { +			GF_FREE (shd_event); +			GF_FREE (path); +		} +	} +	return ret; +} -        if (uuid_is_null (childloc->gfid)) -                goto out; -        output = crawl_data->op_data; -        readdir_xl = crawl_data->readdir_xl; - -        ret = _get_path_from_gfid_loc (this, readdir_xl, childloc, &path, -                                       
&missing); -        if (ret == 0) { -                ret = _add_path_to_dict (this, output, crawl_data->child, path, -                                         NULL, _gf_true); -        } else if (missing) { -                _remove_stale_index (this, readdir_xl, parentloc, -                                     uuid_utoa_r (childloc->gfid, gfid_str)); -        } +void +afr_shd_sweep_prepare (struct subvol_healer *healer) +{ +	crawl_event_t *event = NULL; -out: -        if (ret && path) -                GF_FREE (path); -        return ret; +	event = &healer->crawl_event; + +	event->healed_count = 0; +	event->split_brain_count = 0; +	event->heal_failed_count = 0; + +	time (&event->start_time); +	event->end_time = 0;  } +  void -_crawl_post_sh_action (xlator_t *this, loc_t *parent, loc_t *child, -                       int32_t op_ret, int32_t op_errno, dict_t *xattr_rsp, -                       afr_crawl_data_t *crawl_data) +afr_shd_sweep_done (struct subvol_healer *healer)  { -        int                ret = 0; -        afr_private_t      *priv = NULL; -        afr_self_heald_t   *shd = NULL; -        eh_t               *eh = NULL; -        char               *path = NULL; -        char               gfid_str[64] = {0}; -        shd_event_t        *event = NULL; -        int32_t            sh_failed = 0; -        gf_boolean_t       split_brain = 0; -        int32_t            actual_sh_done = 0; -        shd_crawl_event_t  **shd_crawl_event = NULL; - -        priv = this->private; -        shd  = &priv->shd; -        if (crawl_data->crawl == INDEX) { -                if ((op_ret < 0) && (op_errno == ENOENT)) { -                        _remove_stale_index (this, crawl_data->readdir_xl, -                                             parent, uuid_utoa_r (child->gfid, -                                                                  gfid_str)); -                        goto out; -                } -                ret = _get_path_from_gfid_loc (this, 
crawl_data->readdir_xl, -                                               child, &path, NULL); -                if (ret) -                        goto out; -        } else { -                path = gf_strdup (child->path); -                if (!path) { -                        ret = -1; -                        goto out; -                } -        } +	crawl_event_t *event = NULL; +	crawl_event_t *history = NULL; +	afr_self_heald_t *shd = NULL; -        if (xattr_rsp) { -                ret = dict_get_int32 (xattr_rsp, "sh-failed", &sh_failed); -                ret = dict_get_int32 (xattr_rsp, "actual-sh-done", &actual_sh_done); -        } +	event = &healer->crawl_event; +	shd = &(((afr_private_t *)healer->this->private)->shd); -        shd_crawl_event = (shd_crawl_event_t**)(shd->crawl_events); - -        split_brain = afr_is_split_brain (this, child->inode); -        if ((op_ret < 0 && op_errno == EIO) || split_brain) { -                eh = shd->split_brain; -                shd_crawl_event[crawl_data->child]->split_brain_count += 1; -        } else if ((op_ret < 0) || sh_failed) { -                eh = shd->heal_failed; -                shd_crawl_event[crawl_data->child]->heal_failed_count += 1; -        } else if (actual_sh_done == 1) { -                eh = shd->healed; -                shd_crawl_event[crawl_data->child]->healed_count += 1; -        } -        ret = -1; +	time (&event->end_time); +	history = memdup (event, sizeof (*event)); +	event->start_time = 0; -        if (eh != NULL) { -                event = GF_CALLOC (1, sizeof (*event), gf_afr_mt_shd_event_t); -                if (!event) -                        goto out; -                event->child = crawl_data->child; -                event->path = path; +	if (!history) +		return; -                ret = eh_save_history (eh, event); -                if (ret < 0) { -                        gf_log (this->name, GF_LOG_ERROR, "%s:Failed to save " -                                "to event history, 
(%d, %s)", path, op_ret, -                                strerror (op_errno)); +	if (eh_save_history (shd->statistics[healer->subvol], history) < 0) +		GF_FREE (history); +} -                        goto out; -                } -        } else { -                gf_log (this->name, GF_LOG_DEBUG, "%s:Self heal already done ", -                        path); -        } -        ret = 0; -out: -        if (ret && path) -                GF_FREE (path); -        return; +int +afr_shd_index_sweep (struct subvol_healer *healer) +{ +	xlator_t *this = NULL; +	int child = -1; +	fd_t *fd = NULL; +	xlator_t *subvol = NULL; +	afr_private_t *priv = NULL; +	off_t offset = 0; +	gf_dirent_t entries; +	gf_dirent_t *entry = NULL; +	uuid_t gfid; +	int ret = 0; +	int count = 0; + +	this = healer->this; +	child = healer->subvol; +	priv = this->private; +	subvol = priv->children[child]; + +	fd = afr_shd_index_opendir (this, child); +	if (!fd) { +		gf_log (this->name, GF_LOG_WARNING, +			"unable to opendir index-dir on %s", subvol->name); +		return -errno; +	} + +	INIT_LIST_HEAD (&entries.list); + +	while ((ret = syncop_readdir (subvol, fd, 131072, offset, &entries))) { +		if (ret > 0) +			ret = 0; +		list_for_each_entry (entry, &entries.list, list) { +			offset = entry->d_off; + +			if (!priv->shd.enabled) { +				ret = -EBUSY; +				break; +			} + +			if (!strcmp (entry->d_name, ".") || +			    !strcmp (entry->d_name, "..")) +				continue; + +			gf_log (this->name, GF_LOG_DEBUG, "got entry: %s", +				entry->d_name); + +			ret = uuid_parse (entry->d_name, gfid); +			if (ret) +				continue; + +			ret = afr_shd_selfheal (healer, child, gfid); +			if (ret == 0) +				count++; + +			if (ret == -ENOENT || ret == -ESTALE) { +				afr_shd_index_purge (subvol, fd->inode, +						     entry->d_name); +				ret = 0; +			} +		} + +		gf_dirent_free (&entries); +		if (ret) +			break; +	} + +	if (fd) +		fd_unref (fd); +	if (!ret) +		ret = count; +	return ret;  } +  int -_link_inode_update_loc (xlator_t 
*this, loc_t *loc, struct iatt *iattr) +afr_shd_full_sweep (struct subvol_healer *healer, inode_t *inode)  { -        inode_t       *link_inode = NULL; -        int           ret = -1; +	fd_t *fd = NULL; +	xlator_t *this = NULL; +	xlator_t *subvol = NULL; +	afr_private_t *priv = NULL; +	off_t offset = 0; +	gf_dirent_t entries; +	gf_dirent_t *entry = NULL; +	int ret = 0; + +	this = healer->this; +	priv = this->private; +	subvol = priv->children[healer->subvol]; + +	fd = fd_anonymous (inode); +	if (!fd) +		return -errno; + +	INIT_LIST_HEAD (&entries.list); + +	while ((ret = syncop_readdirp (subvol, fd, 131072, offset, 0, &entries))) { +		if (ret < 0) +			break; + +		ret = gf_link_inodes_from_dirent (this, fd->inode, &entries); +		if (ret) +			break; + +		list_for_each_entry (entry, &entries.list, list) { +			offset = entry->d_off; + +			if (!priv->shd.enabled) { +				ret = -EBUSY; +				break; +			} + +			if (!strcmp (entry->d_name, ".") || +			    !strcmp (entry->d_name, "..")) +				continue; + +			afr_shd_selfheal_name (healer, healer->subvol, +					       inode->gfid, entry->d_name); + +			afr_shd_selfheal (healer, healer->subvol, +					  entry->d_stat.ia_gfid); + +			if (entry->d_stat.ia_type == IA_IFDIR) { +				ret = afr_shd_full_sweep (healer, entry->inode); +				if (ret) +					break; +			} +		} + +		gf_dirent_free (&entries); +		if (ret) +			break; +	} + +	if (fd) +		fd_unref (fd); +	return ret; +} -        link_inode = inode_link (loc->inode, NULL, NULL, iattr); -        if (link_inode == NULL) { -                gf_log (this->name, GF_LOG_ERROR, "inode link failed " -                        "on the inode (%s)", uuid_utoa (iattr->ia_gfid)); -                goto out; -        } -        inode_unref (loc->inode); -        loc->inode = link_inode; -        ret = 0; -out: -        return ret; + +void * +afr_shd_index_healer (void *data) +{ +	struct subvol_healer *healer = NULL; +	xlator_t *this = NULL; +	int ret = 0; + +	healer = data; +	THIS = this = 
healer->this; + +	for (;;) { +		afr_shd_healer_wait (healer); + +		ASSERT_LOCAL(this, healer); + +		do { +			gf_log (this->name, GF_LOG_DEBUG, +				"starting index sweep on subvol %s", +				afr_subvol_name (this, healer->subvol)); + +			afr_shd_sweep_prepare (healer); + +			ret = afr_shd_index_sweep (healer); + +			afr_shd_sweep_done (healer); +			/* +			  As long as at least one gfid was +			  healed, keep retrying. We may have +			  just healed a directory and thereby +			  created entries for other gfids which +			  could not be healed thus far. +			*/ + +			gf_log (this->name, GF_LOG_DEBUG, +				"finished index sweep on subvol %s", +				afr_subvol_name (this, healer->subvol)); +			/* +			  Give a pause before retrying to avoid a busy loop +			  in case the only entry in index is because of +			  an ongoing I/O. +			*/ +			sleep (1); +		} while (ret > 0); +	} + +	return NULL;  } -int -_self_heal_entry (xlator_t *this, afr_crawl_data_t *crawl_data, gf_dirent_t *entry, -                  loc_t *child, loc_t *parent, struct iatt *iattr) + +void * +afr_shd_full_healer (void *data)  { -        struct iatt      parentbuf = {0}; -        int              ret = 0; -        dict_t           *xattr_rsp = NULL; -        dict_t           *xattr_req = NULL; +	struct subvol_healer *healer = NULL; +	xlator_t *this = NULL; +	int run = 0; -        xattr_req = dict_new (); -        if (!xattr_req) { -                errno = ENOMEM; -                ret = -1; -                goto out; -        } +	healer = data; +	THIS = this = healer->this; -        ret = dict_set_int32 (xattr_req, "attempt-self-heal", 1); +	for (;;) { +		pthread_mutex_lock (&healer->mutex); +		{ +			run = __afr_shd_healer_wait (healer); +			if (!run) +				healer->running = _gf_false; +		} +		pthread_mutex_unlock (&healer->mutex); -        gf_log (this->name, GF_LOG_DEBUG, "lookup %s", child->path); +		if (!run) +			break; -        ret = syncop_lookup (this, child, xattr_req, -                             iattr, 
&xattr_rsp, &parentbuf); -        _crawl_post_sh_action (this, parent, child, ret, -ret, xattr_rsp, -                               crawl_data); -        if (ret < 0) -                ret = -1; -        if (xattr_rsp) -                dict_unref (xattr_rsp); -        if (ret == 0) -                ret = _link_inode_update_loc (this, child, iattr); +		ASSERT_LOCAL(this, healer); -out: -        if (xattr_req) -                dict_unref(xattr_req); -        return ret; -} +		gf_log (this->name, GF_LOG_INFO, +			"starting full sweep on subvol %s", +			afr_subvol_name (this, healer->subvol)); -static int -afr_crawl_done  (int ret, call_frame_t *sync_frame, void *data) -{ -        GF_FREE (data); -        STACK_DESTROY (sync_frame->root); -        return 0; -} +		afr_shd_sweep_prepare (healer); -int -_get_heal_op_flags (shd_crawl_op op, afr_crawl_type_t crawl) -{ -        int crawl_flags = 0; +		afr_shd_full_sweep (healer, this->itable->root); -        if (HEAL == op) { -                crawl_flags |= STOP_CRAWL_ON_SINGLE_SUBVOL; +		afr_shd_sweep_done (healer); -                if (crawl == INDEX) -                        crawl_flags |= STOP_INDEX_CRAWL_ON_PENDING_FULL_CRAWL; -        } +		gf_log (this->name, GF_LOG_INFO, +			"finished full sweep on subvol %s", +			afr_subvol_name (this, healer->subvol)); +	} -        return crawl_flags; +	return NULL;  } -void -_do_self_heal_on_subvol (xlator_t *this, int child, afr_crawl_type_t crawl) -{ -        afr_start_crawl (this, child, crawl, _self_heal_entry, -                         NULL, _gf_true, _get_heal_op_flags (HEAL, crawl), -                         afr_crawl_done); -} -gf_boolean_t -_crawl_proceed (xlator_t *this, int child, int crawl_flags, char **reason) +int +afr_shd_healer_init (xlator_t *this, struct subvol_healer *healer)  { -        afr_private_t           *priv = NULL; -        afr_self_heald_t        *shd = NULL; -        gf_boolean_t            proceed = _gf_false; -        char                    *msg = 
NULL; - -        priv = this->private; -        shd  = &priv->shd; -        if (!shd->enabled) { -                msg = "Self-heal daemon is not enabled"; -                gf_log (this->name, GF_LOG_DEBUG, "%s", msg); -                goto out; -        } +	int ret = 0; -        if (!priv->child_up[child]) { -                gf_log (this->name, GF_LOG_DEBUG, "Stopping crawl for %s , " -                        "subvol went down", priv->children[child]->name); -                msg = "Brick is Not connected"; -                goto out; -        } - -        if (crawl_flags & STOP_CRAWL_ON_SINGLE_SUBVOL) { -                if (afr_up_children_count (priv->child_up, -                                           priv->child_count) < 2) { -                        gf_log (this->name, GF_LOG_DEBUG, "Stopping crawl as " -                                "< 2 children are up"); -                        msg = "< 2 bricks in replica are running"; -                        goto out; -                } -        } +	ret = pthread_mutex_init (&healer->mutex, NULL); +	if (ret) +		goto out; -        if (crawl_flags & STOP_INDEX_CRAWL_ON_PENDING_FULL_CRAWL) { -                if (shd->pending[child] == FULL) { -                        gf_log (this->name, GF_LOG_INFO, "Stopping index " -                                "self-heal as Full self-heal is pending on %s", -                                priv->children[child]->name); -                        msg = "Full crawl is pending"; -                        goto out; -                } -        } +	ret = pthread_cond_init (&healer->cond, NULL); +	if (ret) +		goto out; -        proceed = _gf_true; +	healer->this = this; +	healer->running = _gf_false; +	healer->rerun = _gf_false; +	healer->local = _gf_false;  out: -        if (reason) -                *reason = msg; -        return proceed; +	return ret;  } -int -_do_crawl_op_on_local_subvols (xlator_t *this, afr_crawl_type_t crawl, -                               shd_crawl_op op, dict_t 
*output) -{ -        afr_private_t       *priv = NULL; -        char                *status = NULL; -        char                *subkey = NULL; -        char                key[256] = {0}; -        shd_pos_t           pos_data = {0}; -        int                 op_ret = -1; -        int                 xl_id = -1; -        int                 i = 0; -        int                 ret = 0; -        int                 crawl_flags = 0; - -        priv = this->private; -        crawl_flags = _get_heal_op_flags (op, crawl); - -        if (output) { -                ret = dict_get_int32 (output, this->name, &xl_id); -                if (ret) { -                        gf_log (this->name, GF_LOG_ERROR, "Invalid input, " -                                "translator-id is not available"); -                        goto out; -                } -        } -        pos_data.this = this; -        subkey = "status"; -        for (i = 0; i < priv->child_count; i++) { -                if (_crawl_proceed (this, i, crawl_flags, &status)) { -                        pos_data.child = i; -                        /* -                         * We're already in a synctask in this case, so we -                         * don't need to defer through a second (and in fact -                         * that can cause deadlock).  Just call straight -                         * through instead. 
-                         */ -                        ret = afr_find_child_position(pos_data.this, -                                                      pos_data.child, -                                                      &pos_data.pos); -                        if (ret) { -                                status = "Not able to find brick location"; -                        } else if (pos_data.pos == AFR_POS_REMOTE) { -                                status = "brick is remote"; -                        } else { -                                op_ret = 0; -                                if (op == HEAL) { -                                        status = "Started self-heal"; -                                        _do_self_heal_on_subvol (this, i, -                                                                 crawl); -                                } else if (output && (op == INFO)) { -                                        status = ""; -                                        afr_start_crawl (this, i, INDEX, -                                                         _add_summary_to_dict, -                                                         output, _gf_false, 0, -                                                         NULL); -                                } else if (output && -                                           (op == STATISTICS_TO_BE_HEALED)) { -                                            status = ""; -                                            afr_start_crawl (this, i, -                                                             INDEX_TO_BE_HEALED, -                                       _count_hard_links_under_base_indices_dir, -                                                             output, _gf_false, -                                                             0, NULL); -                                } -                        } -                        if (output) { -                                snprintf (key, sizeof (key), 
"%d-%d-%s", xl_id, -                                          i, subkey); -                                ret = dict_set_str (output, key, status); -                        } -                        if (!op_ret && (crawl == FULL)) -                                break; -                } -                if (output) { -                        snprintf (key, sizeof (key), "%d-%d-%s", xl_id, i, -                                  subkey); -                        ret = dict_set_str (output, key, status); -                } -        } -out: -        return op_ret; -}  int -_do_self_heal_on_local_subvols (xlator_t *this, afr_crawl_type_t crawl, -                                dict_t *output) +afr_shd_healer_spawn (xlator_t *this, struct subvol_healer *healer, +		      void *(threadfn)(void *))  { -        return _do_crawl_op_on_local_subvols (this, crawl, HEAL, output); +	int ret = 0; + +	pthread_mutex_lock (&healer->mutex); +	{ +		if (healer->running) { +			pthread_cond_signal (&healer->cond); +		} else { +			ret = gf_thread_create (&healer->thread, NULL, +						threadfn, healer); +			if (ret) +				goto unlock; +			healer->running = 1; +		} + +		healer->rerun = 1; +	} +unlock: +	pthread_mutex_unlock (&healer->mutex); + +	return ret;  } +  int -_get_index_summary_on_local_subvols (xlator_t *this, dict_t *output) +afr_shd_full_healer_spawn (xlator_t *this, int subvol)  { -        return _do_crawl_op_on_local_subvols (this, INDEX, INFO, output); +	return afr_shd_healer_spawn (this, NTH_FULL_HEALER (this, subvol), +				     afr_shd_full_healer);  } -void -afr_fill_completed_crawl_statistics_to_dict (xlator_t *this, dict_t *dict) -{ -        afr_private_t           *priv  = NULL; -        afr_self_heald_t        *shd = NULL; -        int                     i = 0; -        priv = this->private; -        shd= &priv->shd; -        for (i = 0; i < priv->child_count; i++) { -                if (shd->pos[i] != AFR_POS_LOCAL) -                        continue; -               
 _add_statistics_to_dict (this, dict, i); -        } - -        return ; -} -static void -reset_crawl_event (shd_crawl_event_t *crawl_event) +int +afr_shd_index_healer_spawn (xlator_t *this, int subvol)  { -    crawl_event->healed_count = 0; -    crawl_event->split_brain_count = 0; -    crawl_event->heal_failed_count = 0; -    GF_FREE (crawl_event->start_time_str); -    crawl_event->start_time_str = NULL; -    crawl_event->end_time_str = NULL; -    crawl_event->crawl_type = NULL; -    crawl_event->crawl_inprogress = _gf_false; -    return; +	return afr_shd_healer_spawn (this, NTH_INDEX_HEALER (this, subvol), +				     afr_shd_index_healer);  } -static void -afr_copy_crawl_event_struct (shd_crawl_event_t *src, shd_crawl_event_t *dst) -{ -        dst->healed_count = src->healed_count; -        dst->split_brain_count = src->split_brain_count; -        dst->heal_failed_count = src->heal_failed_count; -        dst->start_time_str = gf_strdup (src->start_time_str); -        dst->end_time_str = "Crawl is already in progress"; -        dst->crawl_type = src->crawl_type; -        dst->crawl_inprogress = _gf_true; -        return; -} -static int -afr_fill_crawl_statistics_of_running_crawl(xlator_t *this, dict_t *dict) +int +afr_shd_dict_add_crawl_event (xlator_t *this, dict_t *output, +			      crawl_event_t *crawl_event)  { -        shd_crawl_event_t       *evnt = NULL; -        int                     ret = 0; -        afr_private_t           *priv = NULL; -        afr_self_heald_t        *shd = NULL; -        int                     i = 0; -        priv = this->private; -        shd = &priv->shd; - -        evnt = GF_CALLOC (1, sizeof (shd_crawl_event_t), -                          gf_afr_mt_shd_crawl_event_t); -        if (!evnt) { -                ret = -1; -                goto out; -        } -        LOCK (&priv->lock); -        { -                for (i = 0; i < priv->child_count; i++) { -                        if (shd->pos[i] != AFR_POS_LOCAL) -                    
            continue; - -                        reset_crawl_event (evnt); - -                        if (!shd->crawl_events[i]) { -                                continue; -                        } - -                        afr_copy_crawl_event_struct (shd->crawl_events[i], -                                                     evnt); -                        _add_crawl_stats_to_dict (this, dict, i, evnt, NULL); +        int             ret = 0; +        uint64_t        count = 0; +        char            key[256] = {0}; +        int             xl_id = 0; +        uint64_t        healed_count = 0; +        uint64_t        split_brain_count = 0; +        uint64_t        heal_failed_count = 0; +        char            *start_time_str = 0; +        char            *end_time_str = NULL; +        char            *crawl_type = NULL; +        int             progress = -1; +	int             child = -1; -                } -        } -        UNLOCK (&priv->lock); -        reset_crawl_event (evnt); -        GF_FREE (evnt); +	child = crawl_event->child; +        healed_count = crawl_event->healed_count; +        split_brain_count = crawl_event->split_brain_count; +        heal_failed_count = crawl_event->heal_failed_count; +        crawl_type = crawl_event->crawl_type; -out: -        return ret; -} +	if (!crawl_event->start_time) +		goto out; -static int -_add_local_subvols_crawl_statistics_to_dict (xlator_t *this, dict_t *dict) -{ -        int ret = 0; -        afr_fill_completed_crawl_statistics_to_dict (this, dict); -        ret = afr_fill_crawl_statistics_of_running_crawl (this, dict); -        return ret; -} -int -_add_local_subvols_eh_to_dict (xlator_t *this, eh_t *eh, dict_t *dict) -{ -        afr_private_t           *priv = NULL; -        afr_self_heald_t        *shd = NULL; -        int                     i = 0; +        start_time_str = gf_strdup (ctime (&crawl_event->start_time)); -        priv = this->private; -        shd = &priv->shd; +	if 
(crawl_event->end_time) +		end_time_str = gf_strdup (ctime (&crawl_event->end_time)); -        for (i = 0; i < priv->child_count; i++) { -                if (shd->pos[i] != AFR_POS_LOCAL) -                        continue; -                _add_eh_to_dict (this, eh, dict, i); +        ret = dict_get_int32 (output, this->name, &xl_id); +        if (ret) { +                gf_log (this->name, GF_LOG_ERROR, "xl does not have id"); +                goto out;          } -        return 0; -} -int -afr_xl_op (xlator_t *this, dict_t *input, dict_t *output) -{ -        gf_xl_afr_op_t   op = GF_AFR_OP_INVALID; -        int              ret = 0; -        afr_private_t    *priv = NULL; -        afr_self_heald_t *shd = NULL; -        int              xl_id = 0; +        snprintf (key, sizeof (key), "statistics-%d-%d-count", xl_id, child); +        ret = dict_get_uint64 (output, key, &count); -        priv = this->private; -        shd = &priv->shd; -        ret = dict_get_int32 (input, "xl-op", (int32_t*)&op); -        if (ret) -                goto out; -        ret = dict_get_int32 (input, this->name, &xl_id); -        if (ret) +        snprintf (key, sizeof (key), "statistics_healed_cnt-%d-%d-%"PRIu64, +                  xl_id, child, count); +        ret = dict_set_uint64(output, key, healed_count); +        if (ret) { +                gf_log (this->name, GF_LOG_ERROR, +			"Could not add statistics_healed_count to outout");                  goto out; -        ret = dict_set_int32 (output, this->name, xl_id); -        if (ret) +	} + +        snprintf (key, sizeof (key), "statistics_sb_cnt-%d-%d-%"PRIu64, +                  xl_id, child, count); +        ret = dict_set_uint64 (output, key, split_brain_count); +	if (ret) { +                gf_log (this->name, GF_LOG_ERROR, +			"Could not add statistics_split_brain_count to outout");                  goto out; -        switch (op) { -        case GF_AFR_OP_HEAL_INDEX: -                ret = _do_self_heal_on_local_subvols 
(this, INDEX, output); -                break; -        case GF_AFR_OP_HEAL_FULL: -                ret = _do_self_heal_on_local_subvols (this, FULL, output); -                break; -        case GF_AFR_OP_INDEX_SUMMARY: -                (void)_get_index_summary_on_local_subvols (this, output); -                ret = 0; -                break; -        case GF_AFR_OP_HEALED_FILES: -                ret = _add_local_subvols_eh_to_dict (this, shd->healed, output); -                break; -        case GF_AFR_OP_HEAL_FAILED_FILES: -                ret = _add_local_subvols_eh_to_dict (this, shd->heal_failed, -                                                   output); -                break; -        case GF_AFR_OP_SPLIT_BRAIN_FILES: -                ret = _add_local_subvols_eh_to_dict (this, shd->split_brain, -                                                   output); -                break; -        case GF_AFR_OP_STATISTICS: -                ret = _add_local_subvols_crawl_statistics_to_dict (this, output); -                break; -        case GF_AFR_OP_STATISTICS_HEAL_COUNT: -        case GF_AFR_OP_STATISTICS_HEAL_COUNT_PER_REPLICA: -                ret = _do_crawl_op_on_local_subvols (this, INDEX_TO_BE_HEALED, -                                                     STATISTICS_TO_BE_HEALED, -                                                     output); -                break; -        default: -                gf_log (this->name, GF_LOG_ERROR, "Unknown set op %d", op); -                break;          } -out: -        dict_del (output, this->name); -        return ret; -} -void -afr_poll_self_heal (void *data) -{ -        afr_private_t    *priv = NULL; -        afr_self_heald_t *shd = NULL; -        struct timespec  timeout = {0}; -        xlator_t         *this = NULL; -        long             child = (long)data; -        gf_timer_t       *old_timer = NULL; -        gf_timer_t       *new_timer = NULL; -        shd_pos_t        pos_data = {0}; -        int           
   ret = 0; - -        this = THIS; -        priv = this->private; -        shd = &priv->shd; - -        if (shd->pos[child] == AFR_POS_UNKNOWN) { -                pos_data.this = this; -                pos_data.child = child; -                ret = synctask_new (this->ctx->env, -                                    afr_syncop_find_child_position, -                                    NULL, NULL, &pos_data); -                if (!ret) -                        shd->pos[child] = pos_data.pos; -        } -        if (shd->enabled && (shd->pos[child] == AFR_POS_LOCAL)) -                _do_self_heal_on_subvol (this, child, INDEX); -        timeout.tv_sec = shd->timeout; -        timeout.tv_nsec = 0; -        //notify and previous timer should be synchronized. -        LOCK (&priv->lock); -        { -                old_timer = shd->timer[child]; -                if (shd->pos[child] == AFR_POS_REMOTE) -                        goto unlock; -                shd->timer[child] = gf_timer_call_after (this->ctx, timeout, -                                                         afr_poll_self_heal, -                                                         data); -                new_timer = shd->timer[child]; -        } -unlock: -        UNLOCK (&priv->lock); - -        if (old_timer) -                gf_timer_call_cancel (this->ctx, old_timer); -        if (!new_timer && (shd->pos[child] != AFR_POS_REMOTE)) { -                gf_log (this->name, GF_LOG_WARNING, -                        "Could not create self-heal polling timer for %s", -                        priv->children[child]->name); +        snprintf (key, sizeof (key), "statistics_crawl_type-%d-%d-%"PRIu64, +                  xl_id, child, count); +        ret = dict_set_str (output, key, crawl_type); +        if (ret) { +                gf_log (this->name, GF_LOG_ERROR, +			"Could not add statistics_crawl_type to output"); +                goto out;          } -        return; -} - -static int -afr_handle_child_up  
(int ret, call_frame_t *sync_frame, void *data) -{ -        afr_self_heald_t *shd = NULL; -        shd_pos_t        *pos_data = data; -        afr_private_t    *priv = NULL; -        if (ret) +        snprintf (key, sizeof (key), "statistics_heal_failed_cnt-%d-%d-%"PRIu64, +                  xl_id, child, count); +        ret = dict_set_uint64 (output, key, heal_failed_count); +	if (ret) { +                gf_log (this->name, GF_LOG_ERROR, +			"Could not add statistics_healed_failed_count to outout");                  goto out; +        } -        priv = pos_data->this->private; -        shd = &priv->shd; -        shd->pos[pos_data->child] = pos_data->pos; -        if (pos_data->pos != AFR_POS_REMOTE) -                afr_poll_self_heal ((void*)(long)pos_data->child); -        _do_self_heal_on_local_subvols (THIS, INDEX, NULL); -out: -        GF_FREE (data); -        return 0; -} - -void -afr_proactive_self_heal (void *data) -{ -        xlator_t         *this = NULL; -        long             child = (long)data; -        shd_pos_t        *pos_data = NULL; -        int              ret = 0; +        snprintf (key, sizeof (key), "statistics_strt_time-%d-%d-%"PRIu64, +                  xl_id, child, count); +        ret = dict_set_dynstr (output, key, start_time_str); +	if (ret) { +                gf_log (this->name, GF_LOG_ERROR, +			"Could not add statistics_crawl_start_time to outout"); +                goto out; +        } else { +		start_time_str = NULL; +	} -        this = THIS; +	if (!end_time_str) +                progress = 1; +        else +                progress = 0; -        //Position of brick could have changed and it could be local now. 
-        //Compute the position again -        pos_data = GF_CALLOC (1, sizeof (*pos_data), gf_afr_mt_pos_data_t); -        if (!pos_data) -                goto out; -        pos_data->this = this; -        pos_data->child = child; -        ret = synctask_new (this->ctx->env, afr_syncop_find_child_position, -                            afr_handle_child_up, NULL, pos_data); -        if (ret) +        snprintf (key, sizeof (key), "statistics_end_time-%d-%d-%"PRIu64, +                  xl_id, child, count); +        if (!end_time_str) +                end_time_str = gf_strdup ("Could not determine the end time"); +        ret = dict_set_dynstr (output, key, end_time_str); +	if (ret) { +                gf_log (this->name, GF_LOG_ERROR, +			"Could not add statistics_crawl_end_time to outout");                  goto out; -out: -        return; -} +        } else { +		end_time_str = NULL; +	} -static int -get_pathinfo_host (char *pathinfo, char *hostname, size_t size) -{ -        char    *start = NULL; -        char    *end = NULL; -        int     ret  = -1; -        int     i    = 0; +        snprintf (key, sizeof (key), "statistics_inprogress-%d-%d-%"PRIu64, +                  xl_id, child, count); -        if (!pathinfo) +        ret = dict_set_int32 (output, key, progress); +	if (ret) { +                gf_log (this->name, GF_LOG_ERROR, +			"Could not add statistics_inprogress to outout");                  goto out; +        } -        start = strchr (pathinfo, ':'); -        if (!start) +	snprintf (key, sizeof (key), "statistics-%d-%d-count", xl_id, child); +	ret = dict_set_uint64 (output, key, count + 1); +	if (ret) { +                gf_log (this->name, GF_LOG_ERROR, +			"Could not increment the counter.");                  goto out; -        end = strrchr (pathinfo, ':'); -        if (start == end) -                goto out; - -        memset (hostname, 0, size); -        i = 0; -        while (++start != end) -                hostname[i++] = *start; -        ret 
= 0; +	}  out: +	GF_FREE (start_time_str); +	GF_FREE (end_time_str);          return ret;  } +  int -afr_local_pathinfo (char *pathinfo, gf_boolean_t *local) +afr_shd_dict_add_path (xlator_t *this, dict_t *output, int child, char *path, +		       struct timeval *tv)  { -        int             ret   = 0; -        char            pathinfohost[1024] = {0}; -        char            localhost[1024] = {0}; -        xlator_t        *this = THIS; +        int             ret = -1; +        uint64_t        count = 0; +        char            key[256] = {0}; +        int             xl_id = 0; -        *local = _gf_false; -        ret = get_pathinfo_host (pathinfo, pathinfohost, sizeof (pathinfohost)); +        ret = dict_get_int32 (output, this->name, &xl_id);          if (ret) { -                gf_log (this->name, GF_LOG_ERROR, "Invalid pathinfo: %s", -                        pathinfo); +                gf_log (this->name, GF_LOG_ERROR, "xl does not have id");                  goto out;          } -        ret = gethostname (localhost, sizeof (localhost)); +        snprintf (key, sizeof (key), "%d-%d-count", xl_id, child); +        ret = dict_get_uint64 (output, key, &count); + +        snprintf (key, sizeof (key), "%d-%d-%"PRIu64, xl_id, child, count); +	ret = dict_set_dynstr (output, key, path); +          if (ret) { -                gf_log (this->name, GF_LOG_ERROR, "gethostname() failed, " -                        "reason: %s", strerror (errno)); +                gf_log (this->name, GF_LOG_ERROR, "%s: Could not add to output", +                        path);                  goto out;          } -        if (!strcmp (localhost, pathinfohost)) -                *local = _gf_true; -out: -        return ret; -} +	if (tv) { +		snprintf (key, sizeof (key), "%d-%d-%"PRIu64"-time", xl_id, +			  child, count); +		ret = dict_set_uint32 (output, key, tv->tv_sec); +		if (ret) { +			gf_log (this->name, GF_LOG_ERROR, "%s: Could not set time", +				path); +			goto out; +		} +	} 
-int -afr_crawl_build_start_loc (xlator_t *this, afr_crawl_data_t *crawl_data, -                           loc_t *dirloc) -{ -        afr_private_t *priv = NULL; -        dict_t        *xattr = NULL; -        void          *index_gfid = NULL; -        void          *base_indices_holder_vgfid = NULL; -        loc_t         rootloc = {0}; -        struct iatt   iattr = {0}; -        struct iatt   parent = {0}; -        int           ret = 0; -        xlator_t      *readdir_xl = crawl_data->readdir_xl; - -        priv = this->private; -        if (crawl_data->crawl == FULL) { -                afr_build_root_loc (this, dirloc); -        } else if (crawl_data->crawl == INDEX) { -                afr_build_root_loc (this, &rootloc); -                ret = syncop_getxattr (readdir_xl, &rootloc, &xattr, -                                       GF_XATTROP_INDEX_GFID); -                if (ret < 0) { -                        ret = -1; -                        goto out; -                } -                ret = dict_get_ptr (xattr, GF_XATTROP_INDEX_GFID, &index_gfid); -                if (ret < 0) { -                        gf_log (this->name, GF_LOG_ERROR, "failed to get index " -                                "dir gfid on %s", readdir_xl->name); -                        goto out; -                } -                if (!index_gfid) { -                        gf_log (this->name, GF_LOG_ERROR, "index gfid empty " -                                "on %s", readdir_xl->name); -                        ret = -1; -                        goto out; -                } -                uuid_copy (dirloc->gfid, index_gfid); -                dirloc->path = ""; -                dirloc->inode = inode_new (priv->root_inode->table); -                ret = syncop_lookup (readdir_xl, dirloc, NULL, -                                     &iattr, NULL, &parent); -                if (ret < 0) { -                        if (-ret != ENOENT) { -                                gf_log (this->name, 
GF_LOG_ERROR, "lookup " -                                        "failed on index dir on %s - (%s)", -                                        readdir_xl->name, strerror (-ret)); -                        } -                        ret = -1; -                        goto out; -                } -                ret = _link_inode_update_loc (this, dirloc, &iattr); -                if (ret) -                        goto out; -        } else if (crawl_data->crawl == INDEX_TO_BE_HEALED) { -                afr_build_root_loc (this, &rootloc); -                ret = syncop_getxattr (readdir_xl, &rootloc, &xattr, -                                       GF_BASE_INDICES_HOLDER_GFID); -                if (ret < 0) { -                        ret = -1; -                        goto out; -                } -                ret = dict_get_ptr (xattr, GF_BASE_INDICES_HOLDER_GFID, -                                    &base_indices_holder_vgfid); -                if (ret < 0) { -                        gf_log (this->name, GF_LOG_ERROR, "index gfid empty " -                                "on %s", readdir_xl->name); -                        ret = -1; -                        goto out; -                } -                if (!base_indices_holder_vgfid) { -                        gf_log (this->name, GF_LOG_ERROR, "Base indices holder" -                                "virtual gfid is null on %s", readdir_xl->name); -                        ret = -1; -                        goto out; -                } -                uuid_copy (dirloc->gfid,  base_indices_holder_vgfid); -                dirloc->path = ""; -                dirloc->inode = inode_new (priv->root_inode->table); -                ret = syncop_lookup (readdir_xl, dirloc, NULL, &iattr, NULL, -                                     &parent); -                if (ret < 0) { -                        if (-ret != ENOENT) { -                                gf_log (this->name, GF_LOG_ERROR, "lookup " -                                  
      "failed for base_indices_holder dir" -                                        " on %s - (%s)", readdir_xl->name, -                                        strerror (-ret)); - -                        } else { -                                gf_log (this->name, GF_LOG_ERROR, "base_indices" -                                        "_holder is not yet created."); -                        } -                        ret = -1; -                        goto out; -                } -                ret = _link_inode_update_loc (this, dirloc, &iattr); -                if (ret) -                        goto out; +        snprintf (key, sizeof (key), "%d-%d-count", xl_id, child); + +        ret = dict_set_uint64 (output, key, count + 1); +        if (ret) { +                gf_log (this->name, GF_LOG_ERROR, "Could not increment count"); +                goto out;          } +          ret = 0;  out: -        if (xattr) -                dict_unref (xattr); -        loc_wipe (&rootloc);          return ret;  } +  int -afr_crawl_opendir (xlator_t *this, afr_crawl_data_t *crawl_data, fd_t **dirfd, -                   loc_t *dirloc) +afr_shd_gfid_to_path (xlator_t *this, xlator_t *subvol, uuid_t gfid, char **path_p)  { -        fd_t          *fd   = NULL; -        int           ret = 0; - -        if (crawl_data->crawl == FULL) { -                fd = fd_create (dirloc->inode, crawl_data->pid); -                if (!fd) { -                        gf_log (this->name, GF_LOG_ERROR, -                                "Failed to create fd for %s", dirloc->path); -                        ret = -1; -                        goto out; -                } - -                ret = syncop_opendir (crawl_data->readdir_xl, dirloc, fd); -                if (ret < 0) { -                        gf_log (this->name, GF_LOG_ERROR, -                                "opendir failed on %s", dirloc->path); -                        ret = -1; -                        goto out; -                } -       
 } else { -                fd = fd_anonymous (dirloc->inode); -        } -        ret = 0; -out: -        if (!ret) -                *dirfd = fd; -        return ret; +	loc_t loc = {0,}; +	char *path = NULL; +	dict_t *xattr = NULL; +	int ret = 0; + +	uuid_copy (loc.gfid, gfid); +	loc.inode = inode_new (this->itable); + +	ret = syncop_getxattr (subvol, &loc, &xattr, GFID_TO_PATH_KEY); +	loc_wipe (&loc); +	if (ret) +		return ret; + +	ret = dict_get_str (xattr, GFID_TO_PATH_KEY, &path); +	if (ret || !path) +		return -EINVAL; + +	*path_p = gf_strdup (path); +	if (!*path_p) +		return -ENOMEM; +	return 0;  } -xlator_t* -afr_crawl_readdir_xl_get (xlator_t *this, afr_crawl_data_t *crawl_data) -{ -        afr_private_t *priv = this->private; - -        if (crawl_data->crawl == FULL) { -                return this; -        } else { -                return priv->children[crawl_data->child]; -        } -        return NULL; -}  int -afr_crawl_build_child_loc (xlator_t *this, loc_t *child, loc_t *parent, -                           gf_dirent_t *entry, afr_crawl_data_t *crawl_data) +afr_shd_gather_index_entries (xlator_t *this, int child, dict_t *output)  { -        int           ret = -1; -        afr_private_t *priv = NULL; - -        priv = this->private; -        if (crawl_data->crawl == FULL) { -                ret = afr_build_child_loc (this, child, parent, entry->d_name); -        } else if (crawl_data->crawl == INDEX_TO_BE_HEALED) { -                ret = _build_index_loc (this, child, entry->d_name, parent); -                if (ret) -                        goto out; -                child->inode = inode_new (priv->root_inode->table); -                if (!child->inode) { -                        ret = -1; -                        goto out; -                } -                child->path = NULL; -        } else { -                child->inode = inode_new (priv->root_inode->table); -                if (!child->inode) -                        goto out; -                
uuid_parse (entry->d_name, child->gfid); -                ret = _loc_assign_gfid_path (child); -        } -out: -        return ret; +	fd_t *fd = NULL; +	xlator_t *subvol = NULL; +	afr_private_t *priv = NULL; +	off_t offset = 0; +	gf_dirent_t entries; +	gf_dirent_t *entry = NULL; +	uuid_t gfid; +	int ret = 0; +	int count = 0; +	char *path = NULL; + +	priv = this->private; +	subvol = priv->children[child]; + +	fd = afr_shd_index_opendir (this, child); +	if (!fd) { +		gf_log (this->name, GF_LOG_WARNING, +			"unable to opendir index-dir on %s", subvol->name); +		return -errno; +	} + +	INIT_LIST_HEAD (&entries.list); + +	while ((ret = syncop_readdir (subvol, fd, 131072, offset, &entries))) { +		if (ret > 0) +			ret = 0; +		list_for_each_entry (entry, &entries.list, list) { +			offset = entry->d_off; + +			if (!strcmp (entry->d_name, ".") || +			    !strcmp (entry->d_name, "..")) +				continue; + +			gf_log (this->name, GF_LOG_DEBUG, "got entry: %s", +				entry->d_name); + +			ret = uuid_parse (entry->d_name, gfid); +			if (ret) +				continue; + +			path = NULL; +			ret = afr_shd_gfid_to_path (this, subvol, gfid, &path); + +			if (ret == -ENOENT || ret == -ESTALE) { +				afr_shd_index_purge (subvol, fd->inode, +						     entry->d_name); +				ret = 0; +				continue; +			} + +			ret = afr_shd_dict_add_path (this, output, child, path, +						     NULL); +		} + +		gf_dirent_free (&entries); +		if (ret) +			break; +	} + +	if (fd) +		fd_unref (fd); +	if (!ret) +		ret = count; +	return ret;  } -static int -_process_entries (xlator_t *this, loc_t *parentloc, gf_dirent_t *entries, -                  off_t *offset, afr_crawl_data_t *crawl_data) -{ -        gf_dirent_t      *entry = NULL; -        gf_dirent_t      *tmp = NULL; -        int              ret = 0; -        loc_t            entry_loc = {0}; -        fd_t             *fd = NULL; -        struct iatt      iattr = {0}; - -        list_for_each_entry_safe (entry, tmp, &entries->list, list) { -                if 
(!_crawl_proceed (this, crawl_data->child, -                                     crawl_data->crawl_flags, NULL)) { -                        ret = -1; -                        goto out; -                } -                *offset = entry->d_off; -                if (IS_ENTRY_CWD (entry->d_name) || -                    IS_ENTRY_PARENT (entry->d_name)) -                        continue; -                if ((crawl_data->crawl == FULL) && -                     uuid_is_null (entry->d_stat.ia_gfid)) { -                        gf_log (this->name, GF_LOG_WARNING, "%s/%s: No " -                                "gfid present skipping", -                                parentloc->path, entry->d_name); -                        continue; -                } - -                loc_wipe (&entry_loc); -                ret = afr_crawl_build_child_loc (this, &entry_loc, parentloc, -                                                 entry, crawl_data); -                if (ret) -                        goto out; -                ret = crawl_data->process_entry (this, crawl_data, entry, -                                                 &entry_loc, parentloc, &iattr); - -                if (crawl_data->crawl == INDEX_TO_BE_HEALED && ret) { -                       goto out; -                } else if (ret) { -                        continue; -                } - -                if ((crawl_data->crawl == INDEX) || -                    (crawl_data->crawl == INDEX_TO_BE_HEALED)) -                        continue; - -                if (!IA_ISDIR (iattr.ia_type)) -                        continue; -                fd = NULL; -                ret = afr_crawl_opendir (this, crawl_data, &fd, &entry_loc); -                if (ret) -                        continue; -                ret = _crawl_directory (fd, &entry_loc, crawl_data); -                if (fd) -                        fd_unref (fd); -        } -        ret = 0; -out: -        if ((crawl_data->crawl == INDEX_TO_BE_HEALED)  && ret) { 
-                gf_log (this->name, GF_LOG_ERROR,"Failed to get the hardlink " -                        "count"); -        } -        loc_wipe (&entry_loc); -        return ret; +int +afr_add_shd_event (circular_buffer_t *cb, void *data) +{ +	dict_t *output = NULL; +	xlator_t *this = THIS; +	afr_private_t *priv = NULL; +	afr_self_heald_t *shd = NULL; +	shd_event_t *shd_event = NULL; +	char *path = NULL; + +	output = data; +	priv = this->private; +	shd = &priv->shd; +	shd_event = cb->data; + +	if (!shd->index_healers[shd_event->child].local) +		return 0; + +	path = gf_strdup (shd_event->path); +	if (!path) +		return -ENOMEM; + +	afr_shd_dict_add_path (this, output, shd_event->child, path, +			       &cb->tv); +	return 0;  } -static int -_crawl_directory (fd_t *fd, loc_t *loc, afr_crawl_data_t *crawl_data) +int +afr_add_crawl_event (circular_buffer_t *cb, void *data)  { -        xlator_t        *this = NULL; -        off_t           offset   = 0; -        gf_dirent_t     entries; -        int             ret = 0; -        gf_boolean_t    free_entries = _gf_false; -        xlator_t        *readdir_xl = crawl_data->readdir_xl; +	dict_t *output = NULL; +	xlator_t *this = THIS; +	afr_private_t *priv = NULL; +	afr_self_heald_t *shd = NULL; +	crawl_event_t *crawl_event = NULL; -        INIT_LIST_HEAD (&entries.list); -        this = THIS; +	output = data; +	priv = this->private; +	shd = &priv->shd; +	crawl_event = cb->data; -        GF_ASSERT (loc->inode); +	if (!shd->index_healers[crawl_event->child].local) +		return 0; -        if (crawl_data->crawl == FULL) -                gf_log (this->name, GF_LOG_DEBUG, "crawling %s", loc->path); -        else -                gf_log (this->name, GF_LOG_DEBUG, "crawling INDEX %s", -                        uuid_utoa (loc->gfid)); - -        while (1) { -                if (crawl_data->crawl == FULL) -                        ret = syncop_readdirp (readdir_xl, fd, 131072, offset, -                                               NULL, 
&entries); -                else -                        ret = syncop_readdir (readdir_xl, fd, 131072, offset, -                                              &entries); -                if (ret < 0) { -                        ret = -1; -                        break; -                } else if (ret == 0) { -                        break; -                } - -                ret = 0; -                free_entries = _gf_true; - -                if (!_crawl_proceed (this, crawl_data->child, -                                     crawl_data->crawl_flags, NULL)) { -                        ret = -1; -                        goto out; -                } -                if (list_empty (&entries.list)) -                        goto out; +	afr_shd_dict_add_crawl_event (this, output, crawl_event); -                ret = _process_entries (this, loc, &entries, &offset, -                                        crawl_data); -                if ((ret < 0) && (crawl_data->crawl == INDEX_TO_BE_HEALED)) { -                        goto out; -                } -                gf_dirent_free (&entries); -                free_entries = _gf_false; -        } -        ret = 0; -out: -        if (free_entries) -                gf_dirent_free (&entries); -        return ret; +	return 0;  } -static char* -position_str_get (afr_child_pos_t pos) -{ -        switch (pos) { -        case AFR_POS_UNKNOWN: -                return "unknown"; -        case AFR_POS_LOCAL: -                return "local"; -        case AFR_POS_REMOTE: -                return "remote"; -        } -        return NULL; -}  int -afr_find_child_position (xlator_t *this, int child, afr_child_pos_t *pos) +afr_selfheal_daemon_init (xlator_t *this)  { -        afr_private_t    *priv = NULL; -        afr_self_heald_t *shd  = NULL; -        dict_t           *xattr_rsp = NULL; -        loc_t            loc = {0}; -        int              ret = 0; -        char             *node_uuid = NULL; - -        priv = this->private; -  
      shd  = &priv->shd; - -        afr_build_root_loc (this, &loc); - -        ret = syncop_getxattr (priv->children[child], &loc, &xattr_rsp, -                               GF_XATTR_NODE_UUID_KEY); -        if (ret < 0) { -                gf_log (this->name, GF_LOG_ERROR, "getxattr failed on %s - " -                        "(%s)", priv->children[child]->name, strerror (-ret)); -                ret = -1; +	afr_private_t *priv = NULL; +	afr_self_heald_t *shd = NULL; +	int ret = -1; +	int i = 0; + +	priv = this->private; +	shd = &priv->shd; + +	this->itable = inode_table_new (SHD_INODE_LRU_LIMIT, this); +	if (!this->itable) +		goto out; + +	shd->index_healers = GF_CALLOC (sizeof(*shd->index_healers), +					priv->child_count, +					gf_afr_mt_subvol_healer_t); +	if (!shd->index_healers) +		goto out; + +	for (i = 0; i < priv->child_count; i++) { +		shd->index_healers[i].subvol = i; +		ret = afr_shd_healer_init (this, &shd->index_healers[i]); +		if (ret) +			goto out; +	} + +	shd->full_healers = GF_CALLOC (sizeof(*shd->full_healers), +				       priv->child_count, +				       gf_afr_mt_subvol_healer_t); +	if (!shd->full_healers) +		goto out; +	for (i = 0; i < priv->child_count; i++) { +		shd->full_healers[i].subvol = i; +		ret = afr_shd_healer_init (this, &shd->full_healers[i]); +		if (ret) +			goto out; +	} + +	shd->healed = eh_new (AFR_EH_HEALED_LIMIT, _gf_false, +			      afr_destroy_shd_event_data); +        if (!shd->healed) +		goto out; + +	shd->heal_failed = eh_new (AFR_EH_HEAL_FAIL_LIMIT, _gf_false, +				   afr_destroy_shd_event_data); +	if (!shd->heal_failed) +		goto out; + +	shd->split_brain = eh_new (AFR_EH_SPLIT_BRAIN_LIMIT, _gf_false, +				   afr_destroy_shd_event_data); +	if (!shd->split_brain) +		goto out; + +        shd->statistics = GF_CALLOC (sizeof(eh_t *), priv->child_count, +				     gf_common_mt_eh_t); +        if (!shd->statistics)                  goto out; -        } -        ret = dict_get_str (xattr_rsp, GF_XATTR_NODE_UUID_KEY, &node_uuid); - 
       if (ret) { -                gf_log (this->name, GF_LOG_ERROR, "node-uuid key not found on " -                        "child %s", priv->children[child]->name); -                goto out; +        for (i = 0; i < priv->child_count ; i++) { +                shd->statistics[i] = eh_new (AFR_STATISTICS_HISTORY_SIZE, +					     _gf_false, +					     afr_destroy_crawl_event_data); +                if (!shd->statistics[i]) +                        goto out; +		shd->full_healers[i].crawl_event.child = i; +		shd->full_healers[i].crawl_event.crawl_type = "FULL"; +		shd->index_healers[i].crawl_event.child = i; +		shd->index_healers[i].crawl_event.crawl_type = "INDEX";          } -        if (!strcmp (node_uuid, shd->node_uuid)) -                *pos = AFR_POS_LOCAL; -        else -                *pos = AFR_POS_REMOTE; - -        gf_log (this->name, GF_LOG_DEBUG, "child %s is %s", -                priv->children[child]->name, position_str_get (*pos)); +	ret = 0;  out: -        if (ret) -                *pos = AFR_POS_UNKNOWN; -        loc_wipe (&loc); -        return ret; +	return ret;  } +  int -afr_syncop_find_child_position (void *data) +afr_selfheal_childup (xlator_t *this, int subvol)  { -        shd_pos_t *pos_data = data; -        int       ret = 0; +	afr_shd_index_healer_spawn (this, subvol); -        ret = afr_find_child_position (pos_data->this, pos_data->child, -                                       &pos_data->pos); -        return ret; +	return 0;  } -static int -afr_dir_crawl (void *data) -{ -        xlator_t            *this = NULL; -        int                 ret = -1; -        xlator_t            *readdir_xl = NULL; -        fd_t                *fd = NULL; -        loc_t               dirloc = {0}; -        afr_crawl_data_t    *crawl_data = data; - -        this = THIS; - -        if (!_crawl_proceed (this, crawl_data->child, crawl_data->crawl_flags, -                             NULL)) -                goto out; - -        readdir_xl = 
afr_crawl_readdir_xl_get (this, crawl_data); -        if (!readdir_xl) -                goto out; -        crawl_data->readdir_xl = readdir_xl; -        ret = afr_crawl_build_start_loc (this, crawl_data, &dirloc); -        if (ret) -                goto out; - -        ret = afr_crawl_opendir (this, crawl_data, &fd, &dirloc); -        if (ret) { -                if (crawl_data->crawl == INDEX_TO_BE_HEALED) { -                        gf_log (this->name, GF_LOG_ERROR, "Failed to open base_" -                                "indices_holder"); -                } -                goto out; -        } - -        ret = _crawl_directory (fd, &dirloc, crawl_data); -        if (ret) -                gf_log (this->name, GF_LOG_ERROR, "Crawl failed on %s", -                        readdir_xl->name); -        else -                gf_log (this->name, GF_LOG_DEBUG, "Crawl completed " -                        "on %s", readdir_xl->name); -        if (crawl_data->crawl == INDEX) -                dirloc.path = NULL; -out: -        if (fd) -                fd_unref (fd); -        if ((crawl_data->crawl == INDEX) || -            (crawl_data->crawl == INDEX_TO_BE_HEALED )) -                dirloc.path = NULL; -        loc_wipe (&dirloc); -        return ret; -} - -char * -get_crawl_type_in_string (afr_crawl_type_t crawl) +int64_t +afr_shd_get_index_count (xlator_t *this, int i)  { -        char    *index = "INDEX"; -        char    *full  = "FULL"; -        char    *crawl_type = NULL; - -        if (crawl == INDEX){ -                crawl_type = index; -        } else if (crawl == FULL) { -                crawl_type = full; -        } - -        return  crawl_type; -} - -static int -afr_allocate_crawl_event (xlator_t *this, int child, afr_crawl_type_t crawl) -{ -        afr_private_t           *priv = NULL; -        afr_self_heald_t        *shd = NULL; -        int                     ret = 0; -        shd_crawl_event_t       *crawl_event = NULL; -        time_t                  
get_time = 0; - -        priv = this->private; -        shd = &priv->shd; - -        crawl_event = GF_CALLOC (sizeof (shd_crawl_event_t), 1, -                                 gf_afr_mt_shd_crawl_event_t); -        if (!crawl_event) { -                ret = -1; -                goto out; -        } - -        get_time =  time(NULL); -        if (get_time == ((time_t)-1)) { -                 ret = -1; -                goto out; -        } - -        crawl_event->start_time_str = gf_strdup (ctime(&get_time)); - -        crawl_event->crawl_type = get_crawl_type_in_string (crawl); -        if (!crawl_event->crawl_type) { -                ret = -1; -                goto out; -        } -        LOCK (&priv->lock); -        { -                shd->crawl_events[child] = crawl_event; -        } -        UNLOCK (&priv->lock); -        ret = 0; -out: -        return ret; - +	afr_private_t *priv = NULL; +	xlator_t *subvol = NULL; +	uint64_t count = 0; +	loc_t rootloc = {0, }; +	dict_t *xattr = NULL; +	int ret = -1; + +	priv = this->private; +	subvol = priv->children[i]; + +	rootloc.inode = inode_ref (this->itable->root); +	uuid_copy (rootloc.gfid, rootloc.inode->gfid); + +	ret = syncop_getxattr (subvol, &rootloc, &xattr, +			       GF_XATTROP_INDEX_COUNT); +	loc_wipe (&rootloc); + +	if (ret < 0) +		return -1; + +	ret = dict_get_uint64 (xattr, GF_XATTROP_INDEX_COUNT, &count); +	if (ret) +		return -1; +	return count;  } -static int -afr_put_crawl_event_in_eh (xlator_t *this, int child) -{ -        afr_private_t           *priv = NULL; -        afr_self_heald_t        *shd = NULL; -        int                     ret = 0; -        time_t                  get_time = 0; -        shd_crawl_event_t       **crawl_event = NULL; - -        priv = this->private; -        shd = &priv->shd; - -        get_time = time(NULL); -        if (get_time == ((time_t)-1)) { -                ret = -1; -                goto out; -        } -        crawl_event = (shd_crawl_event_t**)shd->crawl_events; 
-        LOCK (&priv->lock); -        { -                crawl_event[child]->end_time_str = gf_strdup (ctime(&get_time)); -                ret = eh_save_history (shd->statistics[child], -                                       crawl_event[child]); -                crawl_event[child] = NULL; -        } -        UNLOCK (&priv->lock); -out: -        return ret; -} -static int -afr_dir_exclusive_crawl (void *data) +int +afr_xl_op (xlator_t *this, dict_t *input, dict_t *output)  { -        afr_private_t    *priv = NULL; -        afr_self_heald_t *shd = NULL; -        gf_boolean_t     crawl = _gf_false; +        gf_xl_afr_op_t   op = GF_AFR_OP_INVALID;          int              ret = 0; -        int              child = -1; -        xlator_t         *this = NULL; -        afr_crawl_data_t *crawl_data = data; - -        this = THIS; -        priv = this->private; -        shd = &priv->shd; -        child = crawl_data->child; - -        LOCK (&priv->lock); -        { -                if (shd->inprogress[child]) { -                        if (shd->pending[child] != FULL) -                                shd->pending[child] = crawl_data->crawl; -                } else { -                        shd->inprogress[child] = _gf_true; -                        crawl = _gf_true; -                } -        } -        UNLOCK (&priv->lock); - -        if (!crawl) { -                gf_log (this->name, GF_LOG_INFO, "Another crawl is in progress " -                        "for %s while attempting %s heal on %s", -                        priv->children[child]->name, -                        get_crawl_type_in_string (crawl_data->crawl), -                        priv->children[child]->name); -                goto out; -        } - -        do { -                ret = afr_allocate_crawl_event (this, child, crawl_data->crawl); -                if (ret) -                        goto out; -                afr_dir_crawl (data); - -                ret = afr_put_crawl_event_in_eh (this, child); -  
              if (ret < 0) -                        goto out; - -                LOCK (&priv->lock); -                { -                        if (shd->pending[child] != NONE) { -                                crawl_data->crawl = shd->pending[child]; -                                shd->pending[child] = NONE; -                        } else { -                                shd->inprogress[child] = _gf_false; -                                crawl = _gf_false; -                        } -                } -                UNLOCK (&priv->lock); -        } while (crawl); -out: -        return ret; -} +        int              xl_id = 0; +	afr_private_t   *priv = NULL; +	afr_self_heald_t *shd = NULL; +	struct subvol_healer *healer = NULL; +	int i = 0; +	char key[64]; +	int op_ret = 0; +	int64_t cnt = 0; -void -afr_start_crawl (xlator_t *this, int idx, afr_crawl_type_t crawl, -                 process_entry_cbk_t process_entry, void *op_data, -                 gf_boolean_t exclusive, int crawl_flags, -                 afr_crawl_done_cbk_t crawl_done) -{ -        afr_private_t              *priv = NULL; -        call_frame_t               *frame = NULL; -        afr_crawl_data_t           *crawl_data = NULL; -        int                        ret = 0; -        int (*crawler) (void*) = NULL; +	priv = this->private; +	shd = &priv->shd; -        priv = this->private; +	for (i = 0; i < priv->child_count; i++) +		if (priv->child_up[i] == -1) +			goto out; -        frame = create_frame (this, this->ctx->pool); -        if (!frame) +        ret = dict_get_int32 (input, "xl-op", (int32_t*)&op); +        if (ret)                  goto out; - -        afr_set_lk_owner (frame, this, frame->root); -        afr_set_low_priority (frame); -        crawl_data = GF_CALLOC (1, sizeof (*crawl_data), -                                gf_afr_mt_crawl_data_t); -        if (!crawl_data) +        ret = dict_get_int32 (input, this->name, &xl_id); +        if (ret)                  goto 
out; -        crawl_data->process_entry = process_entry; -        crawl_data->child = idx; -        crawl_data->pid = frame->root->pid; -        crawl_data->crawl = crawl; -        crawl_data->op_data = op_data; -        crawl_data->crawl_flags = crawl_flags; -        gf_log (this->name, GF_LOG_DEBUG, "starting crawl %d for %s", -                crawl_data->crawl, priv->children[idx]->name); - -        if (exclusive) -                crawler = afr_dir_exclusive_crawl; -        else -                crawler = afr_dir_crawl; -        ret = synctask_new (this->ctx->env, crawler, -                            crawl_done, frame, crawl_data); +        ret = dict_set_int32 (output, this->name, xl_id);          if (ret) -                gf_log (this->name, GF_LOG_ERROR, "afr crawl failed for child" -                        " %d with ret %d", idx, ret); -out: -        return; -} - -void -afr_build_root_loc (xlator_t *this, loc_t *loc) -{ -        afr_private_t   *priv = NULL; - -        priv = this->private; -        loc->path = gf_strdup ("/"); -        loc->name = ""; -        loc->inode = inode_ref (priv->root_inode); -        uuid_copy (loc->gfid, loc->inode->gfid); -} - -int -afr_set_root_gfid (dict_t *dict) -{ -        uuid_t gfid; -        int ret = 0; - -        memset (gfid, 0, 16); -        gfid[15] = 1; - -        ret = afr_set_dict_gfid (dict, gfid); +                goto out; +        switch (op) { +        case GF_AFR_OP_HEAL_INDEX: +		op_ret = -1; + +		for (i = 0; i < priv->child_count; i++) { +			healer = &shd->index_healers[i]; +			snprintf (key, 64, "%d-%d-status", xl_id, i); + +			if (!priv->child_up[i]) { +				ret = dict_set_str (output, key, +						    "Brick is not connected"); +			} else if (AFR_COUNT (priv->child_up, +					      priv->child_count) < 2) { +				ret = dict_set_str (output, key, +						    "< 2 bricks in replica are up"); +			} else if (!afr_shd_is_subvol_local (this, healer->subvol)) { +				ret = dict_set_str (output, key, +						    
"Brick is remote"); +			} else { +				ret = dict_set_str (output, key, +						    "Started self-heal"); +				afr_shd_index_healer_spawn (this, i); +				op_ret = 0; +			} +		} +                break; +        case GF_AFR_OP_HEAL_FULL: +		op_ret = -1; + +		for (i = 0; i < priv->child_count; i++) { +			healer = &shd->full_healers[i]; +			snprintf (key, 64, "%d-%d-status", xl_id, i); + +			if (!priv->child_up[i]) { +				ret = dict_set_str (output, key, +						    "Brick is not connected"); +			} else if (AFR_COUNT (priv->child_up, +					      priv->child_count) < 2) { +				ret = dict_set_str (output, key, +						    "< 2 bricks in replica are up"); +			} else if (!afr_shd_is_subvol_local (this, healer->subvol)) { +				ret = dict_set_str (output, key, +						    "Brick is remote"); +			} else { +				ret = dict_set_str (output, key, +						    "Started self-heal"); +				afr_shd_full_healer_spawn (this, i); +				op_ret = 0; +			} +		} +                break; +        case GF_AFR_OP_INDEX_SUMMARY: +		for (i = 0; i < priv->child_count; i++) +			if (shd->index_healers[i].local) +				afr_shd_gather_index_entries (this, i, output); +                break; +        case GF_AFR_OP_HEALED_FILES: +		eh_dump (shd->healed, output, afr_add_shd_event); +                break; +        case GF_AFR_OP_HEAL_FAILED_FILES: +		eh_dump (shd->heal_failed, output, afr_add_shd_event); +                break; +        case GF_AFR_OP_SPLIT_BRAIN_FILES: +		eh_dump (shd->split_brain, output, afr_add_shd_event); +                break; +        case GF_AFR_OP_STATISTICS: +		for (i = 0; i < priv->child_count; i++) { +			eh_dump (shd->statistics[i], output, +				 afr_add_crawl_event); +			afr_shd_dict_add_crawl_event (this, output, +						      &shd->index_healers[i].crawl_event); +			afr_shd_dict_add_crawl_event (this, output, +						      &shd->full_healers[i].crawl_event); +		} +                break; +        case GF_AFR_OP_STATISTICS_HEAL_COUNT: +        case 
GF_AFR_OP_STATISTICS_HEAL_COUNT_PER_REPLICA: +		op_ret = -1; + +		for (i = 0; i < priv->child_count; i++) { +			if (!priv->child_up[i]) { +				snprintf (key, 64, "%d-%d-status", xl_id, i); +				ret = dict_set_str (output, key, +						    "Brick is not connected"); +			} else { +				snprintf (key, 64, "%d-%d-hardlinks", xl_id, i); +				cnt = afr_shd_get_index_count (this, i); +				if (cnt >= 0) { +					ret = dict_set_uint64 (output, key, cnt); +				} +				op_ret = 0; +			} +		} + +//                ret = _do_crawl_op_on_local_subvols (this, INDEX_TO_BE_HEALED, +//                                                     STATISTICS_TO_BE_HEALED, +//                                                     output); +                break; -        return ret; +        default: +                gf_log (this->name, GF_LOG_ERROR, "Unknown set op %d", op); +                break; +        } +out: +        dict_del (output, this->name); +        return op_ret;  } diff --git a/xlators/cluster/afr/src/afr-self-heald.h b/xlators/cluster/afr/src/afr-self-heald.h index e0c083754e0..10e229ee7c2 100644 --- a/xlators/cluster/afr/src/afr-self-heald.h +++ b/xlators/cluster/afr/src/afr-self-heald.h @@ -1,5 +1,5 @@  /* -  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> +  Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>    This file is part of GlusterFS.    This file is licensed to you under your choice of the GNU Lesser @@ -8,58 +8,65 @@    cases as published by the Free Software Foundation.  
*/ -#ifndef __AFR_SELF_HEALD_H__ -#define __AFR_SELF_HEALD_H__ -#include "xlator.h" - -#define IS_ROOT_PATH(path) (!strcmp (path, "/")) -#define IS_ENTRY_CWD(entry) (!strcmp (entry, ".")) -#define IS_ENTRY_PARENT(entry) (!strcmp (entry, "..")) -#define AFR_ALL_CHILDREN -1 - -typedef struct afr_crawl_data_ { -        int                 child; -        pid_t               pid; -        afr_crawl_type_t    crawl; -        xlator_t            *readdir_xl; -        void                *op_data; -        int                 crawl_flags; -        int (*process_entry) (xlator_t *this, struct afr_crawl_data_ *crawl_data, -                              gf_dirent_t *entry, loc_t *child, loc_t *parent, -                              struct iatt *iattr); -} afr_crawl_data_t; - -typedef struct crawl_event_stats_ { -        uint64_t healed_count; + +#ifndef _AFR_SELF_HEALD_H +#define _AFR_SELF_HEALD_H + +#include <pthread.h> + + +typedef struct { +	int child; +	char *path; +} shd_event_t; + +typedef struct { +	int      child; +	uint64_t healed_count;          uint64_t split_brain_count;          uint64_t heal_failed_count; -        char     *start_time_str; -        char     *end_time_str; + +	/* If start_time is 0, it means crawler is not in progress +	   and stats are not valid */ +	time_t   start_time; +	/* If start_time is NOT 0 and end_time is 0, it means +	   cralwer is in progress */ +        time_t   end_time;          char     *crawl_type; -        gf_boolean_t crawl_inprogress; -} shd_crawl_event_t; +} crawl_event_t; -void _destroy_crawl_event_data (void *data); -void _destroy_shd_event_data (void *data); +struct subvol_healer { +	xlator_t        *this; +	int              subvol; +	gf_boolean_t     local; +	gf_boolean_t     running; +	gf_boolean_t     rerun; +	crawl_event_t    crawl_event; +	pthread_mutex_t  mutex; +	pthread_cond_t   cond; +	pthread_t        thread; +}; -typedef int (*process_entry_cbk_t) (xlator_t *this, afr_crawl_data_t *crawl_data, -                 
             gf_dirent_t *entry, loc_t *child, loc_t *parent, -                              struct iatt *iattr); +typedef struct { +	gf_boolean_t            iamshd; +	gf_boolean_t            enabled; +	struct subvol_healer   *index_healers; +	struct subvol_healer   *full_healers; -void afr_build_root_loc (xlator_t *this, loc_t *loc); +	eh_t                    *healed; +        eh_t                    *heal_failed; +        eh_t                    *split_brain; +        eh_t                    **statistics; +} afr_self_heald_t; -int afr_set_root_gfid (dict_t *dict); -void -afr_proactive_self_heal (void *data); +int +afr_selfheal_childup (xlator_t *this, int subvol);  int -afr_xl_op (xlator_t *this, dict_t *input, dict_t *output); +afr_selfheal_daemon_init (xlator_t *this); -/* - * In addition to its self-heal use, this is used to find a local default - * read_child. - */  int -afr_local_pathinfo (char *pathinfo, gf_boolean_t *local); -#endif /* __AFR_SELF_HEALD_H__ */ +afr_xl_op (xlator_t *this, dict_t *input, dict_t *output); + +#endif /* !_AFR_SELF_HEALD_H */ diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c index 20306e46924..f974fdb596b 100644 --- a/xlators/cluster/afr/src/afr-transaction.c +++ b/xlators/cluster/afr/src/afr-transaction.c @@ -18,188 +18,130 @@  #include <signal.h> +gf_boolean_t +afr_changelog_pre_op_uninherit (call_frame_t *frame, xlator_t *this); + +gf_boolean_t +afr_changelog_pre_op_update (call_frame_t *frame, xlator_t *this); + +int +afr_changelog_do (call_frame_t *frame, xlator_t *this, dict_t *xattr, +		  afr_changelog_resume_t changelog_resume); -#define LOCKED_NO       0x0        /* no lock held */ -#define LOCKED_YES      0x1        /* for DATA, METADATA, ENTRY and higher_path -                                      of RENAME */ -#define LOCKED_LOWER    0x2        /* for lower_path of RENAME */ -afr_fd_ctx_t * -__afr_fd_ctx_get (fd_t *fd, xlator_t *this) +int +__afr_txn_write_fop 
(call_frame_t *frame, xlator_t *this)  { -        uint64_t       ctx = 0; -        int            ret = 0; -        afr_fd_ctx_t  *fd_ctx = NULL; -        int            i = 0; +        afr_local_t *local = NULL;          afr_private_t *priv = NULL; +        int call_count = -1; +        int i = 0; +        local = frame->local;          priv = this->private; -        ret = __fd_ctx_get (fd, this, &ctx); - -        if (ret < 0 && fd_is_anonymous (fd)) { -                ret = __afr_fd_ctx_set (this, fd); -                if (ret < 0) -                        goto out; - -                ret = __fd_ctx_get (fd, this, &ctx); -                if (ret < 0) -                        goto out; +        call_count = AFR_COUNT (local->transaction.pre_op, priv->child_count); -                fd_ctx = (afr_fd_ctx_t *)(long) ctx; -                for (i = 0; i < priv->child_count; i++) -                        fd_ctx->opened_on[i] = AFR_FD_OPENED; +        if (call_count == 0) { +                local->transaction.resume (frame, this); +                return 0;          } -        fd_ctx = (afr_fd_ctx_t *)(long) ctx; -out: -        return fd_ctx; -} - +        local->call_count = call_count; -afr_fd_ctx_t * -afr_fd_ctx_get (fd_t *fd, xlator_t *this) -{ -        afr_fd_ctx_t  *fd_ctx = NULL; +        for (i = 0; i < priv->child_count; i++) { +                if (local->transaction.pre_op[i]) { +			local->transaction.wind (frame, this, i); -        LOCK(&fd->lock); -        { -                fd_ctx = __afr_fd_ctx_get (fd, this); +                        if (!--call_count) +                                break; +                }          } -        UNLOCK(&fd->lock); -        return fd_ctx; +        return 0;  } -static void -afr_save_lk_owner (call_frame_t *frame) +int +__afr_txn_write_done (call_frame_t *frame, xlator_t *this)  { -        afr_local_t * local = NULL; +        afr_local_t *local = NULL;          local = frame->local; -        local->saved_lk_owner = 
frame->root->lk_owner; +        local->transaction.unwind (frame, this); + +        AFR_STACK_DESTROY (frame); + +        return 0;  } -static void -afr_restore_lk_owner (call_frame_t *frame) +call_frame_t* +afr_transaction_detach_fop_frame (call_frame_t *frame)  { -        afr_local_t * local = NULL; +        afr_local_t *   local = NULL; +        call_frame_t   *fop_frame = NULL;          local = frame->local; -        frame->root->lk_owner = local->saved_lk_owner; -} - -static void -__mark_all_pending (int32_t *pending[], int child_count, -                    afr_transaction_type type) -{ -        int i = 0; -        int j = 0; - -        for (i = 0; i < child_count; i++) { -                j = afr_index_for_transaction_type (type); -                pending[i][j] = hton32 (1); +        LOCK (&frame->lock); +        { +                fop_frame = local->transaction.main_frame; +                local->transaction.main_frame = NULL;          } +        UNLOCK (&frame->lock); + +        return fop_frame;  }  static void -__mark_child_dead (int32_t *pending[], int child_count, int child, -                   afr_transaction_type type) +afr_save_lk_owner (call_frame_t *frame)  { -        int j = 0; +        afr_local_t * local = NULL; -        j = afr_index_for_transaction_type (type); +        local = frame->local; -        pending[child][j] = 0; +        local->saved_lk_owner = frame->root->lk_owner;  }  static void -__mark_pre_op_done_on_fd (call_frame_t *frame, xlator_t *this, int child_index) +afr_restore_lk_owner (call_frame_t *frame)  { -        afr_local_t   *local = NULL; -        afr_fd_ctx_t  *fd_ctx = NULL; +        afr_local_t * local = NULL;          local = frame->local; -        if (!local->fd) -                return; - -        fd_ctx = afr_fd_ctx_get (local->fd, this); - -        if (!fd_ctx) -                goto out; - -        LOCK (&local->fd->lock); -        { -                if (local->transaction.type == AFR_DATA_TRANSACTION) -                
        fd_ctx->pre_op_done[child_index]++; -        } -        UNLOCK (&local->fd->lock); -out: -        return; -} - -static void -__mark_non_participant_children (int32_t *pending[], int child_count, -                                 unsigned char *participants, -                                 afr_transaction_type type) -{ -        int i = 0; -        int j = 0; - -        j = afr_index_for_transaction_type (type); -        for (i = 0; i < child_count; i++) { -                if (!participants[i]) -                        pending[i][j] = 0; -        } +        frame->root->lk_owner = local->saved_lk_owner;  } -  void -__mark_all_success (int32_t *pending[], int child_count, -                    afr_transaction_type type) +__mark_all_success (call_frame_t *frame, xlator_t *this)  { -        int i; -        int j; - -        for (i = 0; i < child_count; i++) { -                j = afr_index_for_transaction_type (type); -                pending[i][j] = hton32 (-1); -        } -} +	afr_private_t *priv = NULL; +	afr_local_t *local = NULL; +	int i; -void -_set_all_child_errno (int *child_errno, unsigned int child_count) -{ -        int     i = 0; +	local = frame->local; +	priv = this->private; -        for (i = 0; i < child_count; i++) -                if (child_errno[i] == 0) -                        child_errno[i] = ENOTCONN; +	for (i = 0; i < priv->child_count; i++) { +		local->transaction.failed_subvols[i] = 0; +	}  } -void + +int  afr_transaction_perform_fop (call_frame_t *frame, xlator_t *this)  {          afr_local_t     *local = NULL; -        afr_private_t   *priv = NULL;          fd_t            *fd   = NULL;          local = frame->local; -        priv  = this->private;          fd    = local->fd; -        __mark_all_success (local->pending, priv->child_count, -                            local->transaction.type); - -        _set_all_child_errno (local->child_errno, priv->child_count); -          /*  Perform fops with the lk-owner from top xlator.         
  *  Eg: lk-owner of posix-lk and flush should be same,           *  flush cant clear the  posix-lks without that lk-owner. @@ -208,6 +150,10 @@ afr_transaction_perform_fop (call_frame_t *frame, xlator_t *this)          frame->root->lk_owner =                  local->transaction.main_frame->root->lk_owner; +	if (local->pre_op_compat) +		/* old mode, pre-op was done as afr_changelog_do() +		   just now, before OP */ +		afr_changelog_pre_op_update (frame, this);          /* The wake up needs to happen independent of             what type of fop arrives here. If it was @@ -220,6 +166,8 @@ afr_transaction_perform_fop (call_frame_t *frame, xlator_t *this)          if (fd)                  afr_delayed_changelog_wake_up (this, fd);          local->transaction.fop (frame, this); + +	return 0;  } @@ -285,39 +233,28 @@ __fop_changelog_needed (call_frame_t *frame, xlator_t *this)          return op_ret;  } +  int -afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending, -                      int child, afr_xattrop_type_t op) +afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int **pending)  {          int i = 0;          int ret = 0; +	int pending_zero[AFR_NUM_CHANGE_LOGS] = {0, }; -        if (op == LOCAL_FIRST) { -                ret = dict_set_static_bin (xattr, priv->pending_key[child], -                                           pending[child], -                                   AFR_NUM_CHANGE_LOGS * sizeof (int32_t)); -                if (ret) -                        goto out; -        }          for (i = 0; i < priv->child_count; i++) { -                if (i == child) -                        continue; +		if (!memcmp (pending_zero, pending[i], sizeof (pending_zero))) +			/* don't set xattrs for non-pending servers */ +			continue; +                  ret = dict_set_static_bin (xattr, priv->pending_key[i], -                                           pending[i], -                                   AFR_NUM_CHANGE_LOGS * sizeof (int32_t)); +		
			   pending[i], +					   AFR_NUM_CHANGE_LOGS * sizeof (int));                  /* 3 = data+metadata+entry */ -                if (ret < 0) -                        goto out; -        } -        if (op == LOCAL_LAST) { -                ret = dict_set_static_bin (xattr, priv->pending_key[child], -                                           pending[child], -                                   AFR_NUM_CHANGE_LOGS * sizeof (int32_t));                  if (ret) -                        goto out; +                        break;          } -out: +          return ret;  } @@ -346,102 +283,34 @@ afr_lock_server_count (afr_private_t *priv, afr_transaction_type type)  /* {{{ pending */ -int32_t -afr_changelog_post_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this, -                           int32_t op_ret, int32_t op_errno, dict_t *xattr, -                           dict_t *xdata) + +int +afr_changelog_post_op_done (call_frame_t *frame, xlator_t *this)  { +	afr_local_t *local = NULL; +	afr_private_t *priv = NULL;          afr_internal_lock_t *int_lock = NULL; -        afr_private_t       *priv     = NULL; -        afr_local_t         *local    = NULL; -        int                  call_count = -1; -        priv     = this->private; -        local    = frame->local; +	local = frame->local; +	priv = this->private;          int_lock = &local->internal_lock; -        LOCK (&frame->lock); -        { -                call_count = --local->call_count; -        } -        UNLOCK (&frame->lock); - -        if (call_count == 0) { -                if (local->transaction.resume_stub) { -			call_resume (local->transaction.resume_stub); -                        local->transaction.resume_stub = NULL; -                } +	if (local->transaction.resume_stub) { +		call_resume (local->transaction.resume_stub); +		local->transaction.resume_stub = NULL; +	} -                if (afr_lock_server_count (priv, local->transaction.type) == 0) { -                        local->transaction.done 
(frame, this); -                } else { -                        int_lock->lock_cbk = local->transaction.done; -                        afr_unlock (frame, this); -                } -        } +	if (afr_lock_server_count (priv, local->transaction.type) == 0) { +		local->transaction.done (frame, this); +	} else { +		int_lock->lock_cbk = local->transaction.done; +		afr_unlock (frame, this); +	} -        return 0; +	return 0;  } -void -afr_transaction_rm_stale_children (call_frame_t *frame, xlator_t *this, -                                   inode_t *inode, afr_transaction_type type) -{ -        int             i = -1; -        int             count = 0; -        int             read_child = -1; -        afr_private_t   *priv = NULL; -        afr_local_t     *local = NULL; -        int             **pending = NULL; -        int             idx = 0; -        int32_t         *stale_children = NULL; -        int32_t         *fresh_children = NULL; -        gf_boolean_t    rm_stale_children = _gf_false; - -        idx = afr_index_for_transaction_type (type); - -        priv = this->private; -        local = frame->local; -        pending = local->pending; - -        if (local->op_ret < 0) -                goto out; -        fresh_children = local->fresh_children; -        read_child = afr_inode_get_read_ctx (this, inode, fresh_children); -        if (read_child < 0) { -                gf_log (this->name, GF_LOG_DEBUG, "Possible split-brain " -                        "for %s", uuid_utoa (inode->gfid)); -                goto out; -        } - -        for (i = 0; i < priv->child_count; i++) { -                if (!afr_is_child_present (fresh_children, -                                           priv->child_count, i)) -                        continue; -                if (pending[i][idx]) -                        continue; -                /* child is down or op failed on it */ -                if (!stale_children) -                        stale_children = 
afr_children_create (priv->child_count); -                if (!stale_children) -                        goto out; - -                rm_stale_children = _gf_true; -                stale_children[count++] = i; -                gf_log (this->name, GF_LOG_DEBUG, "Removing stale child " -                        "%d for %s", i, uuid_utoa (inode->gfid)); -        } - -        if (!rm_stale_children) -                goto out; - -        afr_inode_rm_stale_children (this, inode, stale_children); -out: -        GF_FREE (stale_children); -        return; -} -  afr_inodelk_t*  afr_get_inodelk (afr_internal_lock_t *int_lock, char *dom)  { @@ -478,423 +347,468 @@ afr_locked_nodes_get (afr_transaction_type type, afr_internal_lock_t *int_lock)          return locked_nodes;  } +  int -afr_changelog_pre_op_call_count (afr_transaction_type type, -                                 afr_internal_lock_t *int_lock, -                                 unsigned int child_count) +afr_changelog_call_count (afr_transaction_type type, +			  unsigned char *pre_op_subvols, +			  unsigned int child_count)  { -        int           call_count = 0; -        unsigned char *locked_nodes = NULL; +        int call_count = 0; -        locked_nodes = afr_locked_nodes_get (type, int_lock); -        GF_ASSERT (locked_nodes); +	call_count = AFR_COUNT(pre_op_subvols, child_count); -        call_count = afr_locked_children_count (locked_nodes, child_count);          if (type == AFR_ENTRY_RENAME_TRANSACTION)                  call_count *= 2;          return call_count;  } -int -afr_changelog_post_op_call_count (afr_transaction_type type, -                                  unsigned char *pre_op, -                                  unsigned int child_count) -{ -        int           call_count = 0; -        call_count = afr_pre_op_done_children_count (pre_op, child_count); -        if (type == AFR_ENTRY_RENAME_TRANSACTION) -                call_count *= 2; +gf_boolean_t +afr_txn_nothing_failed (call_frame_t *frame, 
xlator_t *this) +{ +        afr_private_t *priv = NULL; +        afr_local_t *local = NULL; +        int i = 0; -        return call_count; -} +        local = frame->local; +	priv = this->private; -void -afr_compute_txn_changelog (afr_local_t *local, afr_private_t *priv) -{ -        int             i = 0; -        int             index = 0; -        int32_t         postop = 0; -        int32_t         preop = 1; -        int32_t         **txn_changelog = NULL; - -        txn_changelog = local->transaction.txn_changelog; -        index = afr_index_for_transaction_type (local->transaction.type);          for (i = 0; i < priv->child_count; i++) { -                postop = ntoh32 (local->pending[i][index]); -                txn_changelog[i][index] = hton32 (postop + preop); +                if (local->transaction.failed_subvols[i]) +                        return _gf_false;          } -} -afr_xattrop_type_t -afr_get_postop_xattrop_type (int32_t **pending, int optimized, int child, -                             afr_transaction_type type) -{ -        int                     index = 0; -        afr_xattrop_type_t      op = LOCAL_LAST; - -        index = afr_index_for_transaction_type (type); -        if (optimized && !pending[child][index]) -                op = LOCAL_FIRST; -        return op; +        return _gf_true;  } +  void -afr_set_postop_dict (afr_local_t *local, xlator_t *this, dict_t *xattr, -                     int optimized, int child) +afr_handle_symmetric_errors (call_frame_t *frame, xlator_t *this)  { -        int32_t                 **txn_changelog = NULL; -        int32_t                 **changelog = NULL; -        afr_private_t           *priv = NULL; -        int                     ret = 0; -        afr_xattrop_type_t      op = LOCAL_LAST; +	afr_local_t *local = NULL; +	afr_private_t *priv = NULL; +	int op_errno = 0; +	int i_errno = 0; +	gf_boolean_t matching_errors = _gf_true; +	int i = 0; -        priv = this->private; -        txn_changelog = 
local->transaction.txn_changelog; -        op = afr_get_postop_xattrop_type (local->pending, optimized, child, -                                          local->transaction.type); -        if (optimized) -                changelog = txn_changelog; -        else -                changelog = local->pending; -        ret = afr_set_pending_dict (priv, xattr, changelog, child, op); -        if (ret < 0) -                gf_log (this->name, GF_LOG_INFO, -                        "failed to set pending entry"); +	priv = this->private; +	local = frame->local; + +	for (i = 0; i < priv->child_count; i++) { +		if (!local->replies[i].valid) +			continue; +		if (local->replies[i].op_ret != -1) { +			/* Operation succeeded on at least on subvol, +			   so it is not a failed-everywhere situation. +			*/ +			matching_errors = _gf_false; +			break; +		} +		i_errno = local->replies[i].op_errno; + +		if (i_errno == ENOTCONN) { +			/* ENOTCONN is not a symmetric error. We do not +			   know if the operation was performed on the +			   backend or not. 
+			*/ +			matching_errors = _gf_false; +			break; +		} + +		if (!op_errno) { +			op_errno = i_errno; +		} else if (op_errno != i_errno) { +			/* Mismatching op_errno's */ +			matching_errors = _gf_false; +			break; +		} +	} + +	if (matching_errors) +		__mark_all_success (frame, this);  } -gf_boolean_t -afr_txn_nothing_failed (call_frame_t *frame, xlator_t *this) +int +afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this)  { -        afr_private_t *priv = NULL; -        afr_local_t *local = NULL; -        int index = -1; -        int i = 0; +        afr_private_t * priv = this->private; +        int i          = 0; +	int ret = 0; +	int idx = 0; +        afr_local_t *  local = NULL; +        dict_t        *xattr = NULL; +        int            nothing_failed = 1; +	gf_boolean_t   need_undirty = _gf_false;          local = frame->local; -        priv = this->private; +	idx = afr_index_for_transaction_type (local->transaction.type); -        index = afr_index_for_transaction_type (local->transaction.type); +        nothing_failed = afr_txn_nothing_failed (frame, this); -        for (i = 0; i < priv->child_count; i++) { -                if (local->pending[i][index] == 0) -                        return _gf_false; -        } +	if (afr_changelog_pre_op_uninherit (frame, this)) +		need_undirty = _gf_false; +	else +		need_undirty = _gf_true; -        return _gf_true; -} +	if (nothing_failed && !need_undirty) { +		afr_changelog_post_op_done (frame, this); +                goto out; +	} -static void -afr_dir_fop_handle_all_fop_failures (call_frame_t *frame) -{ -        xlator_t        *this = NULL; -        afr_local_t     *local = NULL; -        afr_private_t   *priv = NULL; +	xattr = dict_new (); +	if (!xattr) { +		local->op_ret = -1; +		local->op_errno = ENOMEM; +		afr_changelog_post_op_done (frame, this); +		goto out; +	} -        this = frame->this; -        local = frame->local; -        priv = this->private; +	if (need_undirty) { +		local->dirty[idx] = 
hton32(-1); -        if ((local->transaction.type != AFR_ENTRY_TRANSACTION) && -            (local->transaction.type != AFR_ENTRY_RENAME_TRANSACTION)) -                return; +		ret = dict_set_static_bin (xattr, AFR_DIRTY, local->dirty, +					   sizeof(int) * AFR_NUM_CHANGE_LOGS); +		if (ret) { +			local->op_ret = -1; +			local->op_errno = ENOMEM; +			afr_changelog_post_op_done (frame, this); +			goto out; +		} -        if (local->op_ret >= 0) -                goto out; +	} + +	if (!nothing_failed) { +		for (i = 0; i < priv->child_count; i++) { +			if (local->transaction.failed_subvols[i]) +				local->pending[i][idx] = hton32(1); +		} +		ret = afr_set_pending_dict (priv, xattr, local->pending); +		if (ret < 0) { +			local->op_ret = -1; +			local->op_errno = ENOMEM; +			afr_changelog_post_op_done (frame, this); +			goto out; +		} + +	} -        __mark_all_success (local->pending, priv->child_count, -                            local->transaction.type); +	afr_changelog_do (frame, this, xattr, afr_changelog_post_op_done);  out: -        return; +	if (xattr) +                dict_unref (xattr); + +        return 0;  } -static void -afr_data_handle_quota_errors (call_frame_t *frame, xlator_t *this) + +gf_boolean_t +afr_changelog_pre_op_uninherit (call_frame_t *frame, xlator_t *this)  { -        int     i = 0; -        afr_private_t *priv = NULL; -        afr_local_t   *local = NULL; -        gf_boolean_t  all_quota_failures = _gf_false; +	afr_local_t *local = NULL; +	afr_private_t *priv = NULL; +	fd_t *fd = NULL; +	int i = 0; +	gf_boolean_t ret = _gf_false; +	afr_fd_ctx_t *fd_ctx = NULL; +	int type = 0; -        local = frame->local; -        priv  = this->private; -        if (local->transaction.type != AFR_DATA_TRANSACTION) -                return; -        /* -         * Idea is to not leave the file in FOOL-FOOL scenario in case on -         * all the bricks data transaction failed with EDQUOT to avoid -         * increasing un-necessary load of self-heals in the 
system. -         */ -        all_quota_failures = _gf_true; -        for (i = 0; i < priv->child_count; i++) { -                if (local->transaction.pre_op[i] && -                    (local->child_errno[i] != EDQUOT)) { -                        all_quota_failures = _gf_false; -                        break; -                } -        } -        if (all_quota_failures) -                __mark_all_success (local->pending, priv->child_count, -                                    local->transaction.type); +	local = frame->local; +	priv = this->private; +	fd = local->fd; + +	type = afr_index_for_transaction_type (local->transaction.type); +	if (type != AFR_DATA_TRANSACTION) +		return !local->transaction.dirtied; + +	if (!fd) +		return !local->transaction.dirtied; + +	fd_ctx = afr_fd_ctx_get (fd, this); +	if (!fd_ctx) +		return _gf_false; + +	if (local->transaction.no_uninherit) +		return _gf_false; + +	/* This function must be idempotent. So check if we +	   were called before and return the same answer again. 
+ +	   It is important to keep this function idempotent for +	   the call in afr_changelog_post_op_safe() to not have +	   side effects on the call from afr_changelog_post_op_now() +	*/ +	if (local->transaction.uninherit_done) +		return local->transaction.uninherit_value; + +	LOCK(&fd->lock); +	{ +		for (i = 0; i < priv->child_count; i++) { +			if (local->transaction.pre_op[i] != +			    fd_ctx->pre_op_done[type][i]) { +				ret = !local->transaction.dirtied; +				goto unlock; +			} +		} + +		if (fd_ctx->inherited[type]) { +			ret = _gf_true; +			fd_ctx->inherited[type]--; +		} else if (fd_ctx->on_disk[type]) { +			ret = _gf_false; +			fd_ctx->on_disk[type]--; +		} else { +			/* ASSERT */ +			ret = _gf_false; +		} + +		if (!fd_ctx->inherited[type] && !fd_ctx->on_disk[type]) { +			for (i = 0; i < priv->child_count; i++) +				fd_ctx->pre_op_done[type][i] = 0; +		} +	} +unlock: +	UNLOCK(&fd->lock); + +	local->transaction.uninherit_done = _gf_true; +	local->transaction.uninherit_value = ret; + +	return ret;  } -int -afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this) + +gf_boolean_t +afr_changelog_pre_op_inherit (call_frame_t *frame, xlator_t *this)  { -        afr_private_t * priv = this->private; -        afr_internal_lock_t *int_lock = NULL; -        int i          = 0; -        int call_count = 0; +	afr_local_t *local = NULL; +	afr_private_t *priv = NULL; +	fd_t *fd = NULL; +	int i = 0; +	gf_boolean_t ret = _gf_false; +	afr_fd_ctx_t *fd_ctx = NULL; +	int type = 0; -        afr_local_t *  local = NULL; -        afr_fd_ctx_t  *fdctx = NULL; -        dict_t        **xattr = NULL; -        int            piggyback = 0; -        int            nothing_failed = 1; +	local = frame->local; +	priv = this->private; +	fd = local->fd; -        local    = frame->local; -        int_lock = &local->internal_lock; +	if (local->transaction.type != AFR_DATA_TRANSACTION) +		return _gf_false; -        __mark_non_participant_children (local->pending, priv->child_count, -     
                                    local->transaction.pre_op, -                                         local->transaction.type); +	type = afr_index_for_transaction_type (local->transaction.type); -        afr_data_handle_quota_errors (frame, this); -        afr_dir_fop_handle_all_fop_failures (frame); +	if (!fd) +		return _gf_false; -        if (local->fd) -                afr_transaction_rm_stale_children (frame, this, -                                                   local->fd->inode, -                                                   local->transaction.type); +	fd_ctx = afr_fd_ctx_get (fd, this); +	if (!fd_ctx) +		return _gf_false; -        xattr = alloca (priv->child_count * sizeof (*xattr)); -        memset (xattr, 0, (priv->child_count * sizeof (*xattr))); -        for (i = 0; i < priv->child_count; i++) { -                xattr[i] = dict_new (); -        } +	LOCK(&fd->lock); +	{ +		if (!fd_ctx->on_disk[type]) { +			/* nothing to inherit yet */ +			ret = _gf_false; +			goto unlock; +		} -        call_count = afr_changelog_post_op_call_count (local->transaction.type, -                                                       local->transaction.pre_op, -                                                       priv->child_count); -        local->call_count = call_count; +		for (i = 0; i < priv->child_count; i++) { +			if (local->transaction.pre_op[i] != +			    fd_ctx->pre_op_done[type][i]) { +				/* either inherit exactly, or don't */ +				ret = _gf_false; +				goto unlock; +			} +		} -        if (local->fd) -                fdctx = afr_fd_ctx_get (local->fd, this); +		fd_ctx->inherited[type]++; -        if (call_count == 0) { -                /* no child is up */ -                int_lock->lock_cbk = local->transaction.done; -                afr_unlock (frame, this); -                goto out; -        } +		ret = _gf_true; -        nothing_failed = afr_txn_nothing_failed (frame, this); +		local->transaction.inherited = _gf_true; +	} +unlock: +	
UNLOCK(&fd->lock); -        afr_compute_txn_changelog (local , priv); +	return ret; +} -        for (i = 0; i < priv->child_count; i++) { -                if (!local->transaction.pre_op[i]) -                        continue; -                if (local->transaction.type != AFR_DATA_TRANSACTION) -                        afr_set_postop_dict (local, this, xattr[i], -                                             local->optimistic_change_log, i); -                switch (local->transaction.type) { -                case AFR_DATA_TRANSACTION: -                { -                        if (!fdctx) { -                                afr_set_postop_dict (local, this, xattr[i], -                                                     0, i); -                                STACK_WIND (frame, afr_changelog_post_op_cbk, -                                            priv->children[i], -                                            priv->children[i]->fops->xattrop, -                                            &local->loc, -                                            GF_XATTROP_ADD_ARRAY, xattr[i], -                                            NULL); -                                break; -                        } +gf_boolean_t +afr_changelog_pre_op_update (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t *local = NULL; +	afr_private_t *priv = NULL; +	fd_t *fd = NULL; +	afr_fd_ctx_t *fd_ctx = NULL; +	int i = 0; +	gf_boolean_t ret = _gf_false; +	int type = 0; -                        /* local->transaction.postop_piggybacked[] was -                           precomputed in is_piggyback_postop() when called from -                           afr_changelog_post_op_safe() -                        */ +	local = frame->local; +	priv = this->private; +	fd = local->fd; -                        piggyback = 0; -                        if (local->transaction.postop_piggybacked[i]) -                                piggyback = 1; +	if (!fd) +		return _gf_false; -                        
afr_set_postop_dict (local, this, xattr[i], -                                             piggyback, i); +	fd_ctx = afr_fd_ctx_get (fd, this); +	if (!fd_ctx) +		return _gf_false; -                        if (nothing_failed && piggyback) { -                                afr_changelog_post_op_cbk (frame, (void *)(long)i, -                                                           this, 1, 0, xattr[i], NULL); -                        } else { -                                STACK_WIND_COOKIE (frame, -                                                   afr_changelog_post_op_cbk, -                                                   (void *) (long) i, -                                                   priv->children[i], -                                                   priv->children[i]->fops->fxattrop, -                                                   local->fd, -                                                   GF_XATTROP_ADD_ARRAY, xattr[i], -                                                   NULL); -                        } -                } -                break; -                case AFR_METADATA_TRANSACTION: -                { -                        if (nothing_failed && local->optimistic_change_log) { -                                afr_changelog_post_op_cbk (frame, (void *)(long)i, -                                                           this, 1, 0, xattr[i], -                                                           NULL); -                                break; -                        } +	if (local->transaction.inherited) +		/* was already inherited in afr_changelog_pre_op */ +		return _gf_false; -                        if (local->fd) -                                STACK_WIND (frame, afr_changelog_post_op_cbk, -                                            priv->children[i], -                                            priv->children[i]->fops->fxattrop, -                                            local->fd, -                                 
           GF_XATTROP_ADD_ARRAY, xattr[i], -                                            NULL); -                        else -                                STACK_WIND (frame, afr_changelog_post_op_cbk, -                                            priv->children[i], -                                            priv->children[i]->fops->xattrop, -                                            &local->loc, -                                            GF_XATTROP_ADD_ARRAY, xattr[i], -                                            NULL); -                } -                break; +	if (!local->transaction.dirtied) +		return _gf_false; -                case AFR_ENTRY_RENAME_TRANSACTION: -                { -                        if (nothing_failed && local->optimistic_change_log) { -                                afr_changelog_post_op_cbk (frame, (void *)(long)i, -                                                           this, 1, 0, xattr[i], -                                                           NULL); -                        } else { -                                STACK_WIND_COOKIE (frame, afr_changelog_post_op_cbk, -                                                   (void *) (long) i, -                                                   priv->children[i], -                                                   priv->children[i]->fops->xattrop, -                                                   &local->transaction.new_parent_loc, -                                                   GF_XATTROP_ADD_ARRAY, xattr[i], -                                                   NULL); -                        } -                        call_count--; -                } +        if (!afr_txn_nothing_failed (frame, this)) +		return _gf_false; -                /* -                  set it again because previous stack_wind -                  might have already returned (think of case -                  where subvolume is posix) and would have -                  used the dict as 
placeholder for return -                  value -                */ +	type = afr_index_for_transaction_type (local->transaction.type); -                afr_set_postop_dict (local, this, xattr[i], -                                     local->optimistic_change_log, i); +	ret = _gf_false; -                /* fall through */ +	LOCK(&fd->lock); +	{ +		if (!fd_ctx->on_disk[type]) { +			for (i = 0; i < priv->child_count; i++) +				fd_ctx->pre_op_done[type][i] = +					local->transaction.pre_op[i]; +		} else { +			for (i = 0; i < priv->child_count; i++) +				if (fd_ctx->pre_op_done[type][i] != +				    local->transaction.pre_op[i]) { +					local->transaction.no_uninherit = 1; +					goto unlock; +				} +		} +		fd_ctx->on_disk[type]++; + +		ret = _gf_true; +	} +unlock: +	UNLOCK(&fd->lock); -                case AFR_ENTRY_TRANSACTION: -                { -                        if (nothing_failed && local->optimistic_change_log) { -                                afr_changelog_post_op_cbk (frame, (void *)(long)i, -                                                           this, 1, 0, xattr[i], -                                                           NULL); -                                break; -                        } +	return ret; +} -                        if (local->fd) -                                STACK_WIND (frame, afr_changelog_post_op_cbk, -                                            priv->children[i], -                                            priv->children[i]->fops->fxattrop, -                                            local->fd, -                                            GF_XATTROP_ADD_ARRAY, xattr[i], -                                            NULL); -                        else -                                STACK_WIND (frame, afr_changelog_post_op_cbk, -                                            priv->children[i], -                                            priv->children[i]->fops->xattrop, -                                            
&local->transaction.parent_loc, -                                            GF_XATTROP_ADD_ARRAY, xattr[i], -                                            NULL); -                } -                break; -                } -                if (!--call_count) -                        break; -        } +int +afr_changelog_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +		   int op_ret, int op_errno, dict_t *xattr, dict_t *xdata) +{ +        afr_local_t *local = NULL; +        int call_count = -1; -out: -        for (i = 0; i < priv->child_count; i++) { -                dict_unref (xattr[i]); -        } +        local = frame->local; + +	if (op_ret == -1) +		afr_transaction_fop_failed (frame, this, (long) cookie); + +	call_count = afr_frame_return (frame); + +        if (call_count == 0) +		local->transaction.changelog_resume (frame, this);          return 0;  } -int32_t -afr_changelog_pre_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this, -                          int32_t op_ret, int32_t op_errno, dict_t *xattr, -                          dict_t *xdata) +int +afr_changelog_do (call_frame_t *frame, xlator_t *this, dict_t *xattr, +		  afr_changelog_resume_t changelog_resume)  { -        afr_local_t *   local = NULL; -        afr_private_t * priv  = this->private; -        int call_count  = -1; -        int child_index = (long) cookie; +	afr_local_t *local = NULL; +	afr_private_t *priv = NULL; +	int i = 0; +	int call_count = 0; -        local = frame->local; +	local = frame->local; +	priv = this->private; -        LOCK (&frame->lock); -        { -                switch (op_ret) { -                case 0: -                        __mark_pre_op_done_on_fd (frame, this, child_index); -                        //fallthrough we need to mark the pre_op -                case 1: -                        local->transaction.pre_op[child_index] = 1; -                        /* special op_ret for piggyback */ -                        break; -                case 
-1: -                        if (op_errno == ENOTSUP) { -                                gf_log (this->name, GF_LOG_ERROR, -                                        "xattrop not supported by %s", -                                        priv->children[child_index]->name); -                                local->op_ret = -1; - -                        } else if (!child_went_down (op_ret, op_errno)) { -                                gf_log (this->name, GF_LOG_ERROR, -                                        "xattrop failed on child %s: %s", -                                        priv->children[child_index]->name, -                                        strerror (op_errno)); +        call_count = afr_changelog_call_count (local->transaction.type, +					       local->transaction.pre_op, +					       priv->child_count); + +	if (call_count == 0) { +		changelog_resume (frame, this); +		return 0; +	} + +	local->call_count = call_count; + +	local->transaction.changelog_resume = changelog_resume; + +        for (i = 0; i < priv->child_count; i++) { +                if (!local->transaction.pre_op[i]) +                        continue; + +                switch (local->transaction.type) { +                case AFR_DATA_TRANSACTION: +                case AFR_METADATA_TRANSACTION: +                        if (!local->fd) { +                                STACK_WIND_COOKIE (frame, afr_changelog_cbk, +						   (void *) (long) i, +						   priv->children[i], +						   priv->children[i]->fops->xattrop, +						   &local->loc, +						   GF_XATTROP_ADD_ARRAY, xattr, +						   NULL); +                        } else { +                                STACK_WIND_COOKIE (frame, afr_changelog_cbk, +						   (void *) (long) i, +						   priv->children[i], +						   priv->children[i]->fops->fxattrop, +						   local->fd, +						   GF_XATTROP_ADD_ARRAY, xattr, +						   NULL);                          } -                        local->op_errno = op_errno; -                        break; -   
             } +			break; +                case AFR_ENTRY_RENAME_TRANSACTION: -                call_count = --local->call_count; -        } -        UNLOCK (&frame->lock); +			STACK_WIND_COOKIE (frame, afr_changelog_cbk, +					   (void *) (long) i, +					   priv->children[i], +					   priv->children[i]->fops->xattrop, +					   &local->transaction.new_parent_loc, +					   GF_XATTROP_ADD_ARRAY, xattr, +					   NULL); +                        call_count--; -        if (call_count == 0) { -                if ((local->op_ret == -1) && -                    (local->op_errno == ENOTSUP)) { -                        local->transaction.resume (frame, this); -                } else { -                        afr_transaction_perform_fop (frame, this); -                } +                /* fall through */ + +                case AFR_ENTRY_TRANSACTION: +                        if (local->fd) +                                STACK_WIND_COOKIE (frame, afr_changelog_cbk, +						   (void *) (long) i, +						   priv->children[i], +						   priv->children[i]->fops->fxattrop, +						   local->fd, +						   GF_XATTROP_ADD_ARRAY, xattr, +						   NULL); +                        else +                                STACK_WIND_COOKIE (frame, afr_changelog_cbk, +						   (void *) (long) i, +						   priv->children[i], +						   priv->children[i]->fops->xattrop, +						   &local->transaction.parent_loc, +						   GF_XATTROP_ADD_ARRAY, xattr, +						   NULL); +			break; +		} + +                if (!--call_count) +                        break;          } -        return 0; +	return 0;  } +  int  afr_changelog_pre_op (call_frame_t *frame, xlator_t *this)  { @@ -902,206 +816,122 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this)          int i = 0;          int ret = 0;          int call_count = 0; -        dict_t **xattr = NULL; -        afr_fd_ctx_t *fdctx = NULL; +	int op_errno = 0;          afr_local_t *local = NULL; -        int          piggyback = 0;          
afr_internal_lock_t *int_lock = NULL;          unsigned char       *locked_nodes = NULL; +	unsigned char       *pending_subvols = NULL; +	int idx = -1; +	gf_boolean_t pre_nop = _gf_true; +	dict_t *xdata_req = NULL;          local = frame->local;          int_lock = &local->internal_lock; - -        xattr = alloca (priv->child_count * sizeof (*xattr)); -        memset (xattr, 0, (priv->child_count * sizeof (*xattr))); - -        for (i = 0; i < priv->child_count; i++) { -                xattr[i] = dict_new (); -        } - -        call_count = afr_changelog_pre_op_call_count (local->transaction.type, -                                                      int_lock, -                                                      priv->child_count); -        if (call_count == 0) { -                local->internal_lock.lock_cbk = -                        local->transaction.done; -                afr_unlock (frame, this); -                goto out; -        } - -        local->call_count = call_count; - -        __mark_all_pending (local->pending, priv->child_count, -                            local->transaction.type); - -        if (local->fd) -                fdctx = afr_fd_ctx_get (local->fd, this); +	idx = afr_index_for_transaction_type (local->transaction.type);          locked_nodes = afr_locked_nodes_get (local->transaction.type, int_lock); -        for (i = 0; i < priv->child_count; i++) { -                if (!locked_nodes[i]) -                        continue; -                ret = afr_set_pending_dict (priv, xattr[i], local->pending, -                                            i, LOCAL_FIRST); - -                if (ret < 0) -                        gf_log (this->name, GF_LOG_INFO, -                                "failed to set pending entry"); +	pending_subvols = alloca0 (priv->child_count); -                switch (local->transaction.type) { -                case AFR_DATA_TRANSACTION: -                { -                        if (!fdctx) { -                    
            STACK_WIND_COOKIE (frame, -                                                   afr_changelog_pre_op_cbk, -                                                   (void *) (long) i, -                                                   priv->children[i], -                                                   priv->children[i]->fops->xattrop, -                                                   &(local->loc), -                                                   GF_XATTROP_ADD_ARRAY, xattr[i], -                                                   NULL); -                                break; -                        } +	for (i = 0; i < priv->child_count; i++) { +		if (locked_nodes[i]) { +			local->transaction.pre_op[i] = 1; +			call_count++; +		} else { +			pending_subvols[i] = 1; +		} +	} -                        LOCK (&local->fd->lock); -                        { -                                piggyback = 0; -                                if (fdctx->pre_op_done[i]) { -                                        fdctx->pre_op_piggyback[i]++; -                                        piggyback = 1; -                                        fdctx->hit++; -                                } else { -                                        fdctx->miss++; -                                } -                        } -                        UNLOCK (&local->fd->lock); +	/* TBD: quorum check w/ call_count */ -                        afr_set_delayed_post_op (frame, this); +        if (call_count == 0) { +		op_errno = ENOTCONN; +		goto err; +	} -                        if (piggyback) -                                afr_changelog_pre_op_cbk (frame, (void *)(long)i, -                                                          this, 1, 0, xattr[i], -                                                          NULL); -                        else -                                STACK_WIND_COOKIE (frame, -                                                   afr_changelog_pre_op_cbk, -        
                                           (void *) (long) i, -                                                   priv->children[i], -                                                   priv->children[i]->fops->fxattrop, -                                                   local->fd, -                                                   GF_XATTROP_ADD_ARRAY, xattr[i], -                                                   NULL); -                } -                break; -                case AFR_METADATA_TRANSACTION: -                { -                        if (local->optimistic_change_log) { -                                afr_changelog_pre_op_cbk (frame, (void *)(long)i, -                                                          this, 1, 0, xattr[i], -                                                          NULL); -                                break; -                        } +	xdata_req = dict_new(); +	if (!xdata_req) { +		op_errno = ENOMEM; +		goto err; +	} -                        if (local->fd) -                                STACK_WIND_COOKIE (frame, -                                                   afr_changelog_pre_op_cbk, -                                                   (void *) (long) i, -                                                   priv->children[i], -                                                   priv->children[i]->fops->fxattrop, -                                                   local->fd, -                                                   GF_XATTROP_ADD_ARRAY, xattr[i], -                                                   NULL); -                        else -                                STACK_WIND_COOKIE (frame, -                                                   afr_changelog_pre_op_cbk, -                                                   (void *) (long) i, -                                                   priv->children[i], -                                                   priv->children[i]->fops->xattrop, -             
                                      &(local->loc), -                                                   GF_XATTROP_ADD_ARRAY, xattr[i], -                                                   NULL); -                } -                break; +	pre_nop = _gf_true; + +	if (afr_changelog_pre_op_inherit (frame, this)) +		goto next; + +	if (call_count < priv->child_count) { +		/* For subvols we are not performing operation on, +		   mark them as pending up-front along with the FOP +		   so that we can safely defer unmarking dirty until +		   later. +		*/ +		for (i = 0; i < priv->child_count; i++) { +			if (pending_subvols[i]) +				local->pending[i][idx] = hton32(1); +		} +		ret = afr_set_pending_dict (priv, xdata_req, +					    local->pending); +		if (ret < 0) { +			op_errno = ENOMEM; +			goto err; +		} +		pre_nop = _gf_false; +	} -                case AFR_ENTRY_RENAME_TRANSACTION: -                { -                        if (local->optimistic_change_log) { -                                afr_changelog_pre_op_cbk (frame, (void *)(long)i, -                                                          this, 1, 0, xattr[i], -                                                          NULL); -                        } else { -                                STACK_WIND_COOKIE (frame, -                                                   afr_changelog_pre_op_cbk, -                                                   (void *) (long) i, -                                                   priv->children[i], -                                                   priv->children[i]->fops->xattrop, -                                                   &local->transaction.new_parent_loc, -                                                   GF_XATTROP_ADD_ARRAY, xattr[i], -                                                   NULL); -                        } +	if (call_count > 1 && +	    (local->transaction.type == AFR_DATA_TRANSACTION || +	     !local->optimistic_change_log)) { + +		/* If we are 
performing change on only one subvol, no +		   need to mark dirty, because we are setting the pending +		   counts already anyways +		*/ +		local->dirty[idx] = hton32(1); + +		ret = dict_set_static_bin (xdata_req, AFR_DIRTY, local->dirty, +					   sizeof(int) * AFR_NUM_CHANGE_LOGS); +		if (ret) { +			op_errno = ENOMEM; +			goto err; +		} + +		pre_nop = _gf_false; +		local->transaction.dirtied = 1; +	} -                        call_count--; -                } +	if (pre_nop) +		goto next; +	if (!local->pre_op_compat) { +		dict_copy (xdata_req, local->xdata_req); +		goto next; +	} -                /* -                  set it again because previous stack_wind -                  might have already returned (think of case -                  where subvolume is posix) and would have -                  used the dict as placeholder for return -                  value -                */ +	afr_changelog_do (frame, this, xdata_req, afr_transaction_perform_fop); -                ret = afr_set_pending_dict (priv, xattr[i], local->pending, -                                            i, LOCAL_FIRST); +	if (xdata_req) +		dict_unref (xdata_req); -                if (ret < 0) -                        gf_log (this->name, GF_LOG_INFO, -                                "failed to set pending entry"); +	return 0; +next: +	afr_transaction_perform_fop (frame, this); -                /* fall through */ +	if (xdata_req) +		dict_unref (xdata_req); -                case AFR_ENTRY_TRANSACTION: -                { -                        if (local->optimistic_change_log) { -                                afr_changelog_pre_op_cbk (frame, (void *)(long)i, -                                                          this, 1, 0, xattr[i], -                                                          NULL); -                                break; -                        } +        return 0; +err: +	local->internal_lock.lock_cbk = local->transaction.done; +	local->op_ret = -1; +	local->op_errno = 
op_errno; -                        if (local->fd) -                                STACK_WIND_COOKIE (frame, -                                                   afr_changelog_pre_op_cbk, -                                                   (void *) (long) i, -                                                   priv->children[i], -                                                   priv->children[i]->fops->fxattrop, -                                                   local->fd, -                                                   GF_XATTROP_ADD_ARRAY, xattr[i], -                                                   NULL); -                        else -                                STACK_WIND_COOKIE (frame, -                                                   afr_changelog_pre_op_cbk, -                                                   (void *) (long) i, -                                                   priv->children[i], -                                                   priv->children[i]->fops->xattrop, -                                                   &local->transaction.parent_loc, -                                                   GF_XATTROP_ADD_ARRAY, xattr[i], -                                                   NULL); -                } -                break; -                } +	afr_unlock (frame, this); -                if (!--call_count) -                        break; -        } -out: -        for (i = 0; i < priv->child_count; i++) { -                dict_unref (xattr[i]); -        } +	if (xdata_req) +		dict_unref (xdata_req); -        return 0; +	return 0;  } @@ -1365,15 +1195,15 @@ afr_set_delayed_post_op (call_frame_t *frame, xlator_t *this)  }  gf_boolean_t -afr_are_multiple_fds_opened (inode_t *inode, xlator_t *this) +afr_are_multiple_fds_opened (fd_t *fd, xlator_t *this)  { -        afr_inode_ctx_t *ictx = NULL; +        afr_fd_ctx_t *fd_ctx = NULL; -        if (!inode) { +        if (!fd) {                  /* If false is returned, it may keep on 
taking eager-lock                   * which may lead to starvation, so return true to avoid that.                   */ -                gf_log_callingfn (this->name, GF_LOG_ERROR, "Invalid inode"); +                gf_log_callingfn (this->name, GF_LOG_ERROR, "Invalid fd");                  return _gf_true;          }          /* Lets say mount1 has eager-lock(full-lock) and after the eager-lock @@ -1383,32 +1213,22 @@ afr_are_multiple_fds_opened (inode_t *inode, xlator_t *this)           * if open-fd-count is > 1           */ -        ictx = afr_inode_ctx_get (inode, this); -        if (!ictx) +        fd_ctx = afr_fd_ctx_get (fd, this); +        if (!fd_ctx)                  return _gf_true; -        if (ictx->open_fd_count > 1) +        if (fd_ctx->open_fd_count > 1)                  return _gf_true;          return _gf_false;  } -gf_boolean_t -afr_any_fops_failed (afr_local_t *local, afr_private_t *priv) -{ -        if (local->success_count != priv->child_count) -                return _gf_true; -        return _gf_false; -}  gf_boolean_t  is_afr_delayed_changelog_post_op_needed (call_frame_t *frame, xlator_t *this)  {          afr_local_t      *local = NULL;          gf_boolean_t      res = _gf_false; -        afr_private_t    *priv  = NULL; - -        priv  = this->private;          local = frame->local;          if (!local) @@ -1418,10 +1238,10 @@ is_afr_delayed_changelog_post_op_needed (call_frame_t *frame, xlator_t *this)                  goto out;          //Mark pending changelog ASAP -        if (afr_any_fops_failed (local, priv)) +        if (!afr_txn_nothing_failed (frame, this))                  goto out; -        if (local->fd && afr_are_multiple_fds_opened (local->fd->inode, this)) +        if (local->fd && afr_are_multiple_fds_opened (local->fd, this))                  goto out;          res = _gf_true; @@ -1445,58 +1265,6 @@ afr_delayed_changelog_wake_up_cbk (void *data)  } -/* -  Check if the frame is destined to get optimized away -  with 
changelog piggybacking -*/ -static gf_boolean_t -is_piggyback_post_op (call_frame_t *frame, fd_t *fd) -{ -        afr_fd_ctx_t *fdctx = NULL; -        afr_local_t *local = NULL; -        gf_boolean_t piggyback = _gf_true; -        afr_private_t *priv = NULL; -        int i = 0; - -        priv = frame->this->private; -        local = frame->local; -        fdctx = afr_fd_ctx_get (fd, frame->this); - -        LOCK(&fd->lock); -        { -                piggyback = _gf_true; - -                for (i = 0; i < priv->child_count; i++) { -                        if (!local->transaction.pre_op[i]) -                                continue; -                        if (fdctx->pre_op_piggyback[i]) { -                                fdctx->pre_op_piggyback[i]--; -                                local->transaction.postop_piggybacked[i] = 1; -                        } else { -                                /* For at least _one_ subvolume we cannot -                                   piggyback on the changelog, and have to -                                   perform a hard POST-OP and therefore fsync -                                   if necesssary -                                */ -                                piggyback = _gf_false; -                                GF_ASSERT (fdctx->pre_op_done[i]); -                                fdctx->pre_op_done[i]--; -                        } -                } -        } -        UNLOCK(&fd->lock); - -        if (!afr_txn_nothing_failed (frame, frame->this)) { -                /* something failed in this transaction, -                   we will be performing a hard post-op -                */ -                return _gf_false; -        } - -        return piggyback; -} - -  /* SET operation */  int  afr_fd_report_unstable_write (xlator_t *this, fd_t *fd) @@ -1521,7 +1289,7 @@ afr_fd_has_witnessed_unstable_write (xlator_t *this, fd_t *fd)          afr_fd_ctx_t *fdctx = NULL;          gf_boolean_t witness = _gf_false; -	fdctx = 
afr_fd_ctx_get (fd, this); +        fdctx = afr_fd_ctx_get (fd, this);          if (!fdctx)                  return _gf_true; @@ -1551,10 +1319,10 @@ afr_changelog_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,          priv = this->private;          local = frame->local; -        if (afr_fop_failed (op_ret, op_errno)) { +        if (op_ret != 0) {                  /* Failure of fsync() is as good as failure of previous                     write(). So treat it like one. -                */ +		*/                  gf_log (this->name, GF_LOG_WARNING,                          "fsync(%s) failed on subvolume %s. Transaction was %s",                          uuid_utoa (local->fd->inode->gfid), @@ -1562,14 +1330,14 @@ afr_changelog_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                          gf_fop_list[local->op]);                  afr_transaction_fop_failed (frame, this, child_index); -        } +	} -        call_count = afr_frame_return (frame); +	call_count = afr_frame_return (frame); -        if (call_count == 0) -                afr_changelog_post_op_now (frame, this); +	if (call_count == 0) +		afr_changelog_post_op_now (frame, this); -        return 0; +	return 0;  } @@ -1580,14 +1348,13 @@ afr_changelog_fsync (call_frame_t *frame, xlator_t *this)          int i = 0;          int call_count = 0;          afr_private_t *priv = NULL; - 	dict_t *xdata = NULL; - 	GF_UNUSED int ret = -1; +        dict_t *xdata = NULL; +        GF_UNUSED int ret = -1;          local = frame->local;          priv = this->private; -        call_count = afr_pre_op_done_children_count (local->transaction.pre_op, -                                                     priv->child_count); +        call_count = AFR_COUNT (local->transaction.pre_op, priv->child_count);          if (!call_count) {                  /* will go straight to unlock */ @@ -1597,30 +1364,30 @@ afr_changelog_fsync (call_frame_t *frame, xlator_t *this)          local->call_count = 
call_count; -	xdata = dict_new(); -	if (xdata) -		ret = dict_set_int32 (xdata, "batch-fsync", 1); +        xdata = dict_new(); +        if (xdata) +                ret = dict_set_int32 (xdata, "batch-fsync", 1);          for (i = 0; i < priv->child_count; i++) {                  if (!local->transaction.pre_op[i])                          continue;                  STACK_WIND_COOKIE (frame, afr_changelog_fsync_cbk, -                                (void *) (long) i, priv->children[i], -                                priv->children[i]->fops->fsync, local->fd, -                                1, xdata); +				   (void *) (long) i, priv->children[i], +				   priv->children[i]->fops->fsync, local->fd, +				   1, xdata);                  if (!--call_count)                          break;          } -	if (xdata) -		dict_unref (xdata); +        if (xdata) +                dict_unref (xdata);          return 0;  } -        int +int  afr_changelog_post_op_safe (call_frame_t *frame, xlator_t *this)  {  	afr_local_t    *local = NULL; @@ -1634,7 +1401,8 @@ afr_changelog_post_op_safe (call_frame_t *frame, xlator_t *this)                  return 0;          } -        if (is_piggyback_post_op (frame, local->fd)) { +        if (afr_changelog_pre_op_uninherit (frame, this) && +	    afr_txn_nothing_failed (frame, this)) {                  /* just detected that this post-op is about to                     be optimized away as a new write() has                     already piggybacked on this frame's changelog. 
@@ -1733,7 +1501,7 @@ out:  	if (prev_frame) {  		local = prev_frame->local;  		local->transaction.resume_stub = stub; -		afr_changelog_post_op_safe (prev_frame, this); +		afr_changelog_post_op_now (prev_frame, this);  	} else if (stub) {  		call_resume (stub);  	} @@ -1779,13 +1547,9 @@ afr_delayed_changelog_wake_up (xlator_t *this, fd_t *fd)          int  afr_transaction_resume (call_frame_t *frame, xlator_t *this)  { -        afr_internal_lock_t *int_lock = NULL;          afr_local_t         *local    = NULL; -        afr_private_t       *priv     = NULL;          local    = frame->local; -        int_lock = &local->internal_lock; -        priv     = this->private;          if (local->transaction.eager_lock_on) {                  /* We don't need to retain "local" in the @@ -1800,15 +1564,17 @@ afr_transaction_resume (call_frame_t *frame, xlator_t *this)          afr_restore_lk_owner (frame); +        afr_handle_symmetric_errors (frame, this); + +	if (!local->pre_op_compat) +		/* new mode, pre-op was done along +		   with OP */ +		afr_changelog_pre_op_update (frame, this); +          if (__fop_changelog_needed (frame, this)) {                  afr_changelog_post_op (frame, this);          } else { -                if (afr_lock_server_count (priv, local->transaction.type) == 0) { -                        local->transaction.done (frame, this); -                } else { -                        int_lock->lock_cbk = local->transaction.done; -                        afr_unlock (frame, this); -                } +		afr_changelog_post_op_done (frame, this);          }          return 0; @@ -1824,13 +1590,10 @@ afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this,                              int child_index)  {          afr_local_t *   local = NULL; -        afr_private_t * priv  = NULL;          local = frame->local; -        priv  = this->private; -        __mark_child_dead (local->pending, priv->child_count, -                        child_index, 
local->transaction.type); +	local->transaction.failed_subvols[child_index] = 1;  } @@ -1878,7 +1641,7 @@ afr_transaction_eager_lock_init (afr_local_t *local, xlator_t *this)          if (!fdctx)                  return; -        if (afr_are_multiple_fds_opened (local->fd->inode, this)) +        if (afr_are_multiple_fds_opened (local->fd, this))                  return;          /*           * Once full file lock is acquired in eager-lock phase, overlapping diff --git a/xlators/cluster/afr/src/afr-transaction.h b/xlators/cluster/afr/src/afr-transaction.h index fa626fd0d6e..77cc8eed019 100644 --- a/xlators/cluster/afr/src/afr-transaction.h +++ b/xlators/cluster/afr/src/afr-transaction.h @@ -11,10 +11,7 @@  #ifndef __TRANSACTION_H__  #define __TRANSACTION_H__ -typedef enum { -        LOCAL_FIRST = 1, -        LOCAL_LAST = 2 -} afr_xattrop_type_t; +#include "afr.h"  void  afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this, @@ -29,11 +26,9 @@ afr_get_inodelk (afr_internal_lock_t *int_lock, char *dom);  int32_t  afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type); -afr_fd_ctx_t * -afr_fd_ctx_get (fd_t *fd, xlator_t *this);  int -afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending, -                      int child, afr_xattrop_type_t op); +afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending); +  void  afr_set_delayed_post_op (call_frame_t *frame, xlator_t *this); @@ -41,11 +36,18 @@ void  afr_delayed_changelog_wake_up (xlator_t *this, fd_t *fd);  void -__mark_all_success (int32_t *pending[], int child_count, -                    afr_transaction_type type); -gf_boolean_t -afr_any_fops_failed (afr_local_t *local, afr_private_t *priv); +__mark_all_success (call_frame_t *frame, xlator_t *this);  gf_boolean_t  afr_txn_nothing_failed (call_frame_t *frame, xlator_t *this); + +int afr_read_txn (call_frame_t *frame, xlator_t *this, inode_t *inode, +		  afr_read_txn_wind_t readfn, 
afr_transaction_type type); + +int afr_read_txn_continue (call_frame_t *frame, xlator_t *this, int subvol); + +int __afr_txn_write_fop (call_frame_t *frame, xlator_t *this); +int __afr_txn_write_done (call_frame_t *frame, xlator_t *this); +call_frame_t *afr_transaction_detach_fop_frame (call_frame_t *frame); +  #endif /* __TRANSACTION_H__ */ diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c index c264538073e..5e12910b7c6 100644 --- a/xlators/cluster/afr/src/afr.c +++ b/xlators/cluster/afr/src/afr.c @@ -21,11 +21,6 @@  #endif  #include "afr-common.c" -#define SHD_INODE_LRU_LIMIT          2048 -#define AFR_EH_HEALED_LIMIT          1024 -#define AFR_EH_HEAL_FAIL_LIMIT       1024 -#define AFR_EH_SPLIT_BRAIN_LIMIT     1024 -  struct volume_options options[];  int32_t @@ -114,6 +109,14 @@ reconfigure (xlator_t *this, dict_t *options)          priv = this->private; +	GF_OPTION_RECONF ("afr-dirty-xattr", +			  priv->afr_dirty, options, str, +			  out); + +	GF_OPTION_RECONF ("metadata-splitbrain-forced-heal", +			  priv->metadata_splitbrain_forced_heal, options, bool, +			  out); +          GF_OPTION_RECONF ("background-self-heal-count",                            priv->background_self_heal_count, options, uint32,                            out); @@ -127,9 +130,6 @@ reconfigure (xlator_t *this, dict_t *options)          GF_OPTION_RECONF ("entry-self-heal", priv->entry_self_heal, options,                            bool, out); -        GF_OPTION_RECONF ("strict-readdir", priv->strict_readdir, options, bool, -                          out); -          GF_OPTION_RECONF ("data-self-heal-window-size",                            priv->data_self_heal_window_size, options,                            uint32, out); @@ -146,8 +146,6 @@ reconfigure (xlator_t *this, dict_t *options)          GF_OPTION_RECONF ("data-self-heal-algorithm",                            priv->data_self_heal_algorithm, options, str, out); -        GF_OPTION_RECONF ("self-heal-daemon", 
priv->shd.enabled, options, bool, out); -          GF_OPTION_RECONF ("read-subvolume", read_subvol, options, xlator, out);          GF_OPTION_RECONF ("read-hash-mode", priv->hash_mode, @@ -175,13 +173,13 @@ reconfigure (xlator_t *this, dict_t *options)                  priv->read_child = index;          } +        GF_OPTION_RECONF ("pre-op-compat", priv->pre_op_compat, options, bool, out); +          GF_OPTION_RECONF ("eager-lock", priv->eager_lock, options, bool, out);          GF_OPTION_RECONF ("quorum-type", qtype, options, str, out);          GF_OPTION_RECONF ("quorum-count", priv->quorum_count, options,                            uint32, out);          fix_quorum_options(this,priv,qtype); -        GF_OPTION_RECONF ("heal-timeout", priv->shd.timeout, options, -                          int32, out);  	GF_OPTION_RECONF ("post-op-delay-secs", priv->post_op_delay_secs, options,  			  uint32, out); @@ -189,10 +187,15 @@ reconfigure (xlator_t *this, dict_t *options)          GF_OPTION_RECONF (AFR_SH_READDIR_SIZE_KEY, priv->sh_readdir_size,                            options, size, out);          /* Reset this so we re-discover in case the topology changed.  
*/ -        GF_OPTION_RECONF ("readdir-failover", priv->readdir_failover, options, -                          bool, out);          GF_OPTION_RECONF ("ensure-durability", priv->ensure_durability, options,                            bool, out); + +	GF_OPTION_RECONF ("self-heal-daemon", priv->shd.enabled, options, +			  bool, out); + +	GF_OPTION_RECONF ("iam-self-heal-daemon", priv->shd.iamshd, options, +			  bool, out); +          priv->did_discovery = _gf_false;          ret = 0; @@ -244,10 +247,6 @@ init (xlator_t *this)          priv = this->private;          LOCK_INIT (&priv->lock); -        LOCK_INIT (&priv->read_child_lock); -        //lock recovery is not done in afr -        pthread_mutex_init (&priv->mutex, NULL); -        INIT_LIST_HEAD (&priv->saved_fds);          child_count = xlator_subvolume_count (this); @@ -255,6 +254,11 @@ init (xlator_t *this)          priv->read_child = -1; +	GF_OPTION_INIT ("afr-dirty-xattr", priv->afr_dirty, str, out); + +	GF_OPTION_INIT ("metadata-splitbrain-forced-heal", +			priv->metadata_splitbrain_forced_heal, bool, out); +          GF_OPTION_INIT ("read-subvolume", read_subvol, xlator, out);          if (read_subvol) {                  priv->read_child = xlator_subvolume_index (this, read_subvol); @@ -308,10 +312,6 @@ init (xlator_t *this)          GF_OPTION_INIT ("entry-self-heal", priv->entry_self_heal, bool, out); -        GF_OPTION_INIT ("self-heal-daemon", priv->shd.enabled, bool, out); - -        GF_OPTION_INIT ("iam-self-heal-daemon", priv->shd.iamshd, bool, out); -          GF_OPTION_INIT ("data-change-log", priv->data_change_log, bool, out);          GF_OPTION_INIT ("metadata-change-log", priv->metadata_change_log, bool, @@ -326,7 +326,7 @@ init (xlator_t *this)          GF_OPTION_INIT ("entrylk-trace", priv->entrylk_trace, bool, out); -        GF_OPTION_INIT ("strict-readdir", priv->strict_readdir, bool, out); +        GF_OPTION_INIT ("pre-op-compat", priv->pre_op_compat, bool, out);          GF_OPTION_INIT 
("eager-lock", priv->eager_lock, bool, out);          GF_OPTION_INIT ("quorum-type", qtype, str, out); @@ -336,10 +336,13 @@ init (xlator_t *this)          fix_quorum_options(this,priv,qtype);  	GF_OPTION_INIT ("post-op-delay-secs", priv->post_op_delay_secs, uint32, out); -        GF_OPTION_INIT ("readdir-failover", priv->readdir_failover, bool, out);          GF_OPTION_INIT ("ensure-durability", priv->ensure_durability, bool,                          out); +	GF_OPTION_INIT ("self-heal-daemon", priv->shd.enabled, bool, out); + +	GF_OPTION_INIT ("iam-self-heal-daemon", priv->shd.iamshd, bool, out); +          priv->wait_count = 1;          priv->child_up = GF_CALLOC (sizeof (unsigned char), child_count, @@ -402,6 +405,12 @@ init (xlator_t *this)                  goto out;          } +	ret = afr_selfheal_daemon_init (this); +	if (ret) { +		ret = -ENOMEM; +		goto out; +	} +          /* keep more local here as we may need them for self-heal etc */          this->local_pool = mem_pool_new (afr_local_t, 512);          if (!this->local_pool) { @@ -411,58 +420,8 @@ init (xlator_t *this)                  goto out;          } -        priv->first_lookup = 1;          priv->root_inode = NULL; -        if (!priv->shd.iamshd) { -                ret = 0; -                goto out; -        } - -        ret = -ENOMEM; -        priv->shd.pos = GF_CALLOC (sizeof (*priv->shd.pos), child_count, -                                   gf_afr_mt_brick_pos_t); -        if (!priv->shd.pos) -                goto out; - -        priv->shd.pending = GF_CALLOC (sizeof (*priv->shd.pending), child_count, -                                       gf_afr_mt_int32_t); -        if (!priv->shd.pending) -                goto out; - -        priv->shd.inprogress = GF_CALLOC (sizeof (*priv->shd.inprogress), -                                          child_count, gf_afr_mt_shd_bool_t); -        if (!priv->shd.inprogress) -                goto out; -        priv->shd.timer = GF_CALLOC (sizeof 
(*priv->shd.timer), child_count, -                                     gf_afr_mt_shd_timer_t); -        if (!priv->shd.timer) -                goto out; - -        priv->shd.healed = eh_new (AFR_EH_HEALED_LIMIT, _gf_false, -                                   _destroy_shd_event_data); -        if (!priv->shd.healed) -                goto out; - -        priv->shd.heal_failed = eh_new (AFR_EH_HEAL_FAIL_LIMIT, _gf_false, -                                        _destroy_shd_event_data); -        if (!priv->shd.heal_failed) -                goto out; - -        priv->shd.split_brain = eh_new (AFR_EH_SPLIT_BRAIN_LIMIT, _gf_false, -                                        _destroy_shd_event_data); -        if (!priv->shd.split_brain) -                goto out; - -        this->itable = inode_table_new (SHD_INODE_LRU_LIMIT, this); -        if (!this->itable) -                goto out; -        priv->root_inode = inode_ref (this->itable->root); -        GF_OPTION_INIT ("node-uuid", priv->shd.node_uuid, str, out); -        GF_OPTION_INIT ("heal-timeout", priv->shd.timeout, int32, out); -        ret = afr_initialise_statistics (this); -        if (ret) -                goto out;          ret = 0;  out:          return ret; @@ -572,11 +531,11 @@ struct volume_options options[] = {            .type = GF_OPTION_TYPE_INT,            .min = 0,            .max = 2, -          .default_value = "0", +          .default_value = "1",            .description = "inode-read fops happen only on one of the bricks in "                           "replicate. 
AFR will prefer the one computed using "                           "the method specified using this option" -                         "0 = first responder, " +                         "0 = first up server, "                           "1 = hash by GFID of file (all clients use "                                                      "same subvolume), "                           "2 = hash by GFID of file and client PID", @@ -585,7 +544,7 @@ struct volume_options options[] = {            .type = GF_OPTION_TYPE_BOOL,            .default_value = "true",            .description = "Choose a local subvolume (i.e. Brick) to read from" -                         " if read-subvolume is not explicitly set.", +	                 " if read-subvolume is not explicitly set.",          },          { .key  = {"favorite-child"},            .type = GF_OPTION_TYPE_XLATOR, @@ -675,10 +634,6 @@ struct volume_options options[] = {                           "pre fop changelog operations in afr transaction "                           "if this option is enabled."          
}, -        { .key  = {"strict-readdir"}, -          .type = GF_OPTION_TYPE_BOOL, -          .default_value = "off", -        },          { .key = {"inodelk-trace"},            .type = GF_OPTION_TYPE_BOOL,            .default_value = "off", @@ -689,6 +644,12 @@ struct volume_options options[] = {            .default_value = "off",            .description = "Enabling this option logs entry lock/unlocks"          }, +	{ .key = {"pre-op-compat"}, +	  .type = GF_OPTION_TYPE_BOOL, +	  .default_value = "on", +	  .description = "Use separate pre-op xattrop() FOP rather than " +	                 "overloading xdata of the OP" +	},          { .key = {"eager-lock"},            .type = GF_OPTION_TYPE_BOOL,            .default_value = "on", @@ -753,14 +714,6 @@ struct volume_options options[] = {                           "self-heal-daemon so that it can crawl only on "                           "local index directories.",          }, -        { .key  = {"heal-timeout"}, -          .type = GF_OPTION_TYPE_INT, -          .min  = 60, -          .max  = INT_MAX, -          .default_value = "600", -          .description = "time interval for checking the need to self-heal " -                         "in self-heal-daemon" -        },          { .key  = {"post-op-delay-secs"},            .type = GF_OPTION_TYPE_INT,            .min  = 0, @@ -777,11 +730,6 @@ struct volume_options options[] = {            .max = 131072,            .default_value = "1KB",          }, -        { .key = {"readdir-failover"}, -          .type = GF_OPTION_TYPE_BOOL, -          .description = "readdir(p) will not failover if this option is off", -          .default_value = "on", -        },          { .key = {"ensure-durability"},            .type = GF_OPTION_TYPE_BOOL,            .description = "Afr performs fsyncs for transactions if this " @@ -789,5 +737,13 @@ struct volume_options options[] = {                           "written to the disk",            .default_value = "on",          }, +	{ .key = 
{"afr-dirty-xattr"}, +	  .type = GF_OPTION_TYPE_STR, +	  .default_value = AFR_DIRTY_DEFAULT, +	}, +	{ .key = {"metadata-splitbrain-forced-heal"}, +	  .type = GF_OPTION_TYPE_BOOL, +	  .default_value = "off", +	},          { .key  = {NULL} },  }; diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index 9196a1f271d..2e1b78d1c9f 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -20,112 +20,42 @@  #include "call-stub.h"  #include "compat-errno.h"  #include "afr-mem-types.h" -#include "afr-self-heal-algorithm.h"  #include "libxlator.h"  #include "timer.h" +#include "syncop.h" + +#include "afr-self-heald.h"  #define AFR_XATTR_PREFIX "trusted.afr"  #define AFR_PATHINFO_HEADER "REPLICATE:"  #define AFR_SH_READDIR_SIZE_KEY "self-heal-readdir-size"  #define AFR_SH_DATA_DOMAIN_FMT "%s:self-heal" +#define AFR_DIRTY_DEFAULT AFR_XATTR_PREFIX ".dirty" +#define AFR_DIRTY (((afr_private_t *) (THIS->private))->afr_dirty)  #define AFR_LOCKEE_COUNT_MAX    3  #define AFR_DOM_COUNT_MAX    3 - -#define afr_inode_missing(op_errno) (op_errno == ENOENT || op_errno == ESTALE) - -struct _pump_private; - -typedef int (*afr_expunge_done_cbk_t) (call_frame_t *frame, xlator_t *this, -                                       int child, int32_t op_error, -                                       int32_t op_errno); - -typedef int (*afr_impunge_done_cbk_t) (call_frame_t *frame, xlator_t *this, -                                       int32_t op_error, int32_t op_errno); -typedef int (*afr_post_remove_call_t) (call_frame_t *frame, xlator_t *this); +#define AFR_NUM_CHANGE_LOGS            3 /*data + metadata + entry*/  typedef int (*afr_lock_cbk_t) (call_frame_t *frame, xlator_t *this); -typedef void (*afr_lookup_done_cbk_t) (call_frame_t *frame, xlator_t *this, -                                      int32_t op_ret, int32_t op_errno); -typedef enum { -        AFR_POS_UNKNOWN, -        AFR_POS_LOCAL, -        AFR_POS_REMOTE -} afr_child_pos_t; +typedef 
int (*afr_read_txn_wind_t) (call_frame_t *frame, xlator_t *this, int subvol); -typedef enum { -        SPLIT_BRAIN = 1, -        ALL_FOOLS = 2 -} afr_subvol_status_t; +typedef int (*afr_inode_refresh_cbk_t) (call_frame_t *frame, xlator_t *this, int err); -typedef enum { -        AFR_INODE_SET_READ_CTX = 1, -        AFR_INODE_RM_STALE_CHILDREN, -        AFR_INODE_SET_OPENDIR_DONE, -        AFR_INODE_GET_READ_CTX, -        AFR_INODE_GET_OPENDIR_DONE, -} afr_inode_op_t; - -typedef struct afr_inode_params_ { -        afr_inode_op_t op; -        union { -                gf_boolean_t value; -                struct { -                        int32_t read_child; -                        int32_t *children; -                } read_ctx; -        } u; -} afr_inode_params_t; - -typedef enum afr_spb_state { -        DONT_KNOW, -        SPB, -        NO_SPB -} afr_spb_state_t; - -typedef struct afr_inode_ctx_ { -        uint64_t masks; -        int32_t  *fresh_children;//increasing order of latency -        afr_spb_state_t mdata_spb; -        afr_spb_state_t data_spb; -        uint32_t        open_fd_count; -} afr_inode_ctx_t; +typedef int (*afr_changelog_resume_t) (call_frame_t *frame, xlator_t *this); -typedef enum { -        NONE, -        INDEX, -        INDEX_TO_BE_HEALED, -        FULL, -} afr_crawl_type_t; - -typedef struct afr_self_heald_ { -        gf_boolean_t            enabled; -        gf_boolean_t            iamshd; -        afr_crawl_type_t        *pending; -        gf_boolean_t            *inprogress; -        afr_child_pos_t         *pos; -        gf_timer_t              **timer; -        eh_t                    *healed; -        eh_t                    *heal_failed; -        eh_t                    *split_brain; -        eh_t                    **statistics; -        void                    **crawl_events; -        char                    *node_uuid; -        int                     timeout; -} afr_self_heald_t; +#define alloca0(size) ({void *__ptr; __ptr = 
alloca(size); memset(__ptr, 0, size); __ptr;}) +#define AFR_COUNT(array,max) ({int __i; int __res = 0; for (__i = 0; __i < max; __i++) if (array[__i]) __res++; __res;}) +#define AFR_INTERSECT(dst,src1,src2,max) ({int __i; for (__i = 0; __i < max; __i++) dst[__i] = src1[__i] && src2[__i];})  typedef struct _afr_private {          gf_lock_t lock;               /* to guard access to child_count, etc */          unsigned int child_count;     /* total number of children   */ -        unsigned int read_child_rr;   /* round-robin index of the read_child */ -        gf_lock_t read_child_lock;    /* lock to protect above */ -          xlator_t **children; -        int first_lookup;          inode_t *root_inode;          unsigned char *child_up; @@ -146,6 +76,7 @@ typedef struct _afr_private {          gf_boolean_t metadata_change_log;   /* on/off */          gf_boolean_t entry_change_log;      /* on/off */ +	gf_boolean_t metadata_splitbrain_forced_heal; /* on/off */          int read_child;               /* read-subvolume */          unsigned int hash_mode;       /* for when read_child is not set */          int favorite_child;  /* subvolume to be preferred in resolving @@ -154,178 +85,45 @@ typedef struct _afr_private {          gf_boolean_t inodelk_trace;          gf_boolean_t entrylk_trace; -        gf_boolean_t strict_readdir; -          unsigned int wait_count;      /* # of servers to wait for success */          uint64_t up_count;      /* number of CHILD_UPs we have seen */          uint64_t down_count;    /* number of CHILD_DOWNs we have seen */ -        struct _pump_private *pump_private; /* Set if we are loaded as pump */ -        int                   use_afr_in_pump; - -        pthread_mutex_t  mutex; -        struct list_head saved_fds;   /* list of fds on which locks have succeeded */          gf_boolean_t      optimistic_change_log;          gf_boolean_t      eager_lock; +        gf_boolean_t      pre_op_compat;      /* on/off */  	uint32_t          
post_op_delay_secs;          unsigned int      quorum_count;          char                   vol_uuid[UUID_SIZE + 1];          int32_t                *last_event; -        afr_self_heald_t       shd; + +	/* @event_generation: Keeps count of number of events received which can +	   potentially impact consistency decisions. The events are CHILD_UP +	   and CHILD_DOWN, when we have to recalculate the freshness/staleness +	   of copies to detect if changes had happened while the other server +	   was down. CHILD_DOWN and CHILD_UP can also be received on network +	   disconnect/reconnects and not necessarily server going down/up. +	   Recalculating freshness/staleness on network events is equally +	   important as we might have had a network split brain. +	*/ +	uint32_t               event_generation; +          gf_boolean_t           choose_local;          gf_boolean_t           did_discovery; -        gf_boolean_t           readdir_failover;          uint64_t               sh_readdir_size;          gf_boolean_t           ensure_durability;          char                   *sh_domain; -} afr_private_t; - -typedef enum { -        AFR_SELF_HEAL_NOT_ATTEMPTED, -        AFR_SELF_HEAL_STARTED, -        AFR_SELF_HEAL_FAILED, -        AFR_SELF_HEAL_SYNC_BEGIN, -} afr_self_heal_status; - -typedef struct { -        afr_self_heal_status gfid_or_missing_entry_self_heal; -        afr_self_heal_status metadata_self_heal; -        afr_self_heal_status data_self_heal; -        afr_self_heal_status entry_self_heal; -} afr_sh_status_for_all_type; - -typedef enum { -        AFR_SELF_HEAL_ENTRY, -        AFR_SELF_HEAL_METADATA, -        AFR_SELF_HEAL_DATA, -        AFR_SELF_HEAL_GFID_OR_MISSING_ENTRY, -        AFR_SELF_HEAL_INVALID = -1, -} afr_self_heal_type; - -typedef enum { -        AFR_CHECK_ALL, -        AFR_CHECK_SPECIFIC, -} afr_sh_fail_check_type; - -struct afr_self_heal_ { -        /* External interface: These are variables (some optional) that -           are set by whoever has 
triggered self-heal */ - -        gf_boolean_t do_data_self_heal; -        gf_boolean_t do_metadata_self_heal; -        gf_boolean_t do_entry_self_heal; -        gf_boolean_t do_gfid_self_heal; -        gf_boolean_t do_missing_entry_self_heal; -        gf_boolean_t force_confirm_spb; /* Check for split-brains even when -                                           self-heal is turned off */ - -        gf_boolean_t forced_merge;        /* Is this a self-heal triggered to -                                             forcibly merge the directories? */ - -        gf_boolean_t background;          /* do self-heal in background -                                             if possible */ -        ia_type_t type;                   /* st_mode of the entry we're doing -                                             self-heal on */ -        inode_t   *inode;                 /* inode on which the self-heal is -                                             performed on */ -        uuid_t  sh_gfid_req;                 /* gfid self-heal needs to be done -                                             with this gfid if it is not null */ - -        /* Function to call to unwind. If self-heal is being done in the -           background, this function will be called as soon as possible. 
*/ - -        int (*unwind) (call_frame_t *frame, xlator_t *this, int32_t op_ret, -                       int32_t op_errno, int32_t sh_failed); - -        /* End of external interface members */ - - -        /* array of stat's, one for each child */ -        struct iatt *buf; -        struct iatt *parentbufs; -        struct iatt parentbuf; -        struct iatt entrybuf; - -        afr_expunge_done_cbk_t expunge_done; -        afr_impunge_done_cbk_t impunge_done; - -        /* array of xattr's, one for each child */ -        dict_t **xattr; - -        /* array containing if the lookups succeeded in the order of response -         */ -        int32_t *success_children; -        int     success_count; -        /* array containing the fresh children found in the self-heal process */ -        int32_t *fresh_children; -        /* array containing the fresh children found in the parent lookup */ -        int32_t *fresh_parent_dirs; -        /* array of errno's, one for each child */ -        int *child_errno; -        /*loc used for lookup*/ -        loc_t lookup_loc; -        int32_t lookup_flags; -        afr_lookup_done_cbk_t lookup_done; - -        int32_t **pending_matrix; -        int32_t **delta_matrix; +	char                   *afr_dirty; -        int32_t op_ret; -        int32_t op_errno; +	afr_self_heald_t       shd; -        int *sources; -        int source; -        int active_source; -        int active_sinks; -        unsigned char *success; -        unsigned char *locked_nodes; -        int lock_count; - -        const char *linkname; -        gf_boolean_t entries_skipped; - -        gf_boolean_t actual_sh_started; -        gf_boolean_t sync_done; -        gf_boolean_t data_lock_held; -        gf_boolean_t sh_dom_lock_held; -        gf_boolean_t eof_reached; -        fd_t  *healing_fd; -        int   file_has_holes; -        blksize_t block_size; -        off_t file_size; -        off_t offset; -        unsigned char *write_needed; -        uint8_t 
*checksum; -        afr_post_remove_call_t post_remove_call; - -        char    *data_sh_info; -        char    *metadata_sh_info; - -        loc_t parent_loc; -        call_frame_t *orig_frame; -        call_frame_t *old_loop_frame; -        gf_boolean_t unwound; - -        afr_sh_algo_private_t *private; -        afr_sh_status_for_all_type  afr_all_sh_status; -        afr_self_heal_type       sh_type_in_action; - -        struct afr_sh_algorithm  *algo; -        afr_lock_cbk_t data_lock_success_handler; -        afr_lock_cbk_t data_lock_failure_handler; -	gf_boolean_t data_lock_block; -        int (*completion_cbk) (call_frame_t *frame, xlator_t *this); -        int (*sh_data_algo_start) (call_frame_t *frame, xlator_t *this); -        int (*algo_completion_cbk) (call_frame_t *frame, xlator_t *this); -        int (*algo_abort_cbk) (call_frame_t *frame, xlator_t *this); -        void (*gfid_sh_success_cbk) (call_frame_t *sh_frame, xlator_t *this); - -        call_frame_t *sh_frame; -}; +	/* pump dependencies */ +	void                   *pump_private; +	gf_boolean_t           use_afr_in_pump; +} afr_private_t; -typedef struct afr_self_heal_ afr_self_heal_t;  typedef enum {          AFR_DATA_TRANSACTION,          /* truncate, write, ... 
*/ @@ -438,32 +236,72 @@ typedef struct {          char *domain; /* Domain on which inode/entry lock/unlock in progress.*/  } afr_internal_lock_t; -typedef struct _afr_locked_fd { -        fd_t  *fd; -        struct list_head list; -} afr_locked_fd_t; -  struct afr_reply {  	int	valid;  	int32_t	op_ret;  	int32_t	op_errno; +	dict_t *xdata; +	struct iatt poststat; +	struct iatt postparent; +	struct iatt prestat; +	struct iatt preparent; +	struct iatt preparent2; +	struct iatt postparent2; +	uint8_t checksum[MD5_DIGEST_LENGTH];  }; +typedef enum { +        AFR_FD_NOT_OPENED, +        AFR_FD_OPENED, +        AFR_FD_OPENING +} afr_fd_open_status_t; + +typedef struct { +        unsigned int *pre_op_done[AFR_NUM_CHANGE_LOGS]; +	int inherited[AFR_NUM_CHANGE_LOGS]; +	int on_disk[AFR_NUM_CHANGE_LOGS]; +        afr_fd_open_status_t *opened_on; /* which subvolumes the fd is open on */ + +        unsigned int *lock_piggyback; +        unsigned int *lock_acquired; + +        int flags; + +	/* used for delayed-post-op optimization */ +	pthread_mutex_t    delay_lock; +	gf_timer_t        *delay_timer; +	call_frame_t      *delay_frame; + +	/* set if any write on this fd was a non stable write +	   (i.e, without O_SYNC or O_DSYNC) +	*/ +	gf_boolean_t      witnessed_unstable_write; + +	/* @open_fd_count: +	   Number of open FDs queried from the server, as queried through +	   xdata in FOPs. Currently, used to decide if eager-locking must be +	   temporarily disabled. 
+	*/ +        uint32_t        open_fd_count; + + +	/* list of frames currently in progress */ +	struct list_head  eager_locked; +} afr_fd_ctx_t; + +  typedef struct _afr_local { -        int     uid; -        int     gid; +	glusterfs_fop_t  op;          unsigned int call_count; -        unsigned int success_count; -        unsigned int enoent_count; -        uint32_t     open_fd_count; -        gf_boolean_t update_open_fd_count; +	/* @event_generation: copy of priv->event_generation taken at the +	   time of starting the transaction. The copy is made so that we +	   have a stable value through the various phases of the transaction. +	*/ +	unsigned int event_generation; -        unsigned int unhealable; - -        unsigned int read_child_index; -        unsigned char read_child_returned; -        unsigned int first_up_child; +        uint32_t     open_fd_count; +        gf_boolean_t update_open_fd_count;  	gf_lkowner_t  saved_lk_owner; @@ -472,54 +310,111 @@ typedef struct _afr_local {          int32_t **pending; +	int dirty[AFR_NUM_CHANGE_LOGS]; +          loc_t loc;          loc_t newloc;          fd_t *fd; +	afr_fd_ctx_t *fd_ctx; -        glusterfs_fop_t fop; - +	/* @child_up: copy of priv->child_up taken at the time of transaction +	   start. The copy is taken so that we have a stable child_up array +	   through the phases of the transaction as priv->child_up[i] can keep +	   changing through time. +	*/          unsigned char *child_up; -        int32_t       *fresh_children; //in the order of response -        int32_t *child_errno; +	/* @read_attempted: +	   array of flags representing subvolumes where read operations of +	   the read transaction have already been attempted. The array is +	   first pre-filled with down subvolumes, and as reads are performed +	   on other subvolumes, those are set as well. This way if the read +	   operation fails we do not retry on that subvolume again. 
+	*/ +	unsigned char *read_attempted; + +	/* @readfn: -        dict_t  *xattr_req; +	   pointer to function which will perform the read operation on a given +	   subvolume. Used in read transactions. +	*/ -        int32_t  inodelk_count; -        int32_t  entrylk_count; +	afr_read_txn_wind_t readfn; -        afr_internal_lock_t internal_lock; +	/* @refreshed: -        afr_locked_fd_t *locked_fd; -        int32_t          source_child; -        int32_t          lock_recovery_child; +	   the inode was "refreshed" (i.e, pending xattrs from all subvols +	   freshly inspected and inode ctx updated accordingly) as part of +	   this transaction already. +	*/ +	gf_boolean_t refreshed; + +	/* @inode: + +	   the inode on which the read txn is performed on. ref'ed and copied +	   from either fd->inode or loc.inode +	*/ + +	inode_t *inode; + +	/* @parent[2]: + +	   parent inode[s] on which directory transactions are performed. +	*/ + +	inode_t *parent; +	inode_t *parent2; + +	/* @readable: + +	   array of flags representing servers from which a read can be +	   performed. This is the output of afr_inode_refresh() +	*/ +	unsigned char *readable; + +	afr_inode_refresh_cbk_t refreshfn; + +	/* @refreshinode: + +	   Inode currently getting refreshed. +	*/ +	inode_t *refreshinode; + +	/* +	  @pre_op_compat: + +	  compatibility mode of pre-op. send a separate pre-op and +	  op operations as part of transaction, rather than combining +	*/ + +	gf_boolean_t pre_op_compat; + +        dict_t  *xattr_req; + +        afr_internal_lock_t internal_lock;          dict_t  *dict; +          int      optimistic_change_log;  	gf_boolean_t      delayed_post_op; -  	/* Is the current writev() going to perform a stable write?  	   i.e, is fd->flags or @flags writev param have O_SYNC or  	   O_DSYNC?  	*/ -        gf_boolean_t      stable_write; - -        /* This write appended to the file. Nnot necessarily O_APPEND, -           just means the offset of write was at the end of file. 
-        */ -        gf_boolean_t      append_write; - -        int attempt_self_heal; -        int foreground_self_heal; +	gf_boolean_t      stable_write; +	/* This write appended to the file. Nnot necessarily O_APPEND, +	   just means the offset of write was at the end of file. +	*/ +	gf_boolean_t      append_write; -        /* This struct contains the arguments for the "continuation" -           (scheme-like) of fops +        /* +          This struct contains the arguments for the "continuation" +          (scheme-like) of fops          */ -        int   op;          struct {                  struct {                          unsigned char buf_set; @@ -527,24 +422,6 @@ typedef struct _afr_local {                  } statfs;                  struct { -                        uint32_t parent_entrylk; -                        uuid_t  gfid_req; -                        inode_t *inode; -                        struct iatt buf; -                        struct iatt postparent; -                        dict_t **xattrs; -                        dict_t *xattr; -                        struct iatt *postparents; -                        struct iatt *bufs; -                        int32_t read_child; -                        int32_t *sources; -                        int32_t *success_children; -                        int32_t **pending_matrix; -                        gf_boolean_t fresh_lookup; -                        gf_boolean_t possible_spb; -                } lookup; - -                struct {                          int32_t flags;                  } open; @@ -737,22 +614,67 @@ typedef struct _afr_local {                  afr_transaction_type type; -		/* pre-compute the post piggyback status before -		   entering POST-OP phase -		*/ -		int              *postop_piggybacked; -  		/* stub to resume on destruction  		   of the transaction frame */  		call_stub_t      *resume_stub;  		struct list_head  eager_locked; -                int32_t         
**txn_changelog;//changelog after pre+post ops                  unsigned char   *pre_op; +		/* @fop_subvols: subvolumes on which FOP will be attempted */ +                unsigned char   *fop_subvols; + +		/* @failed_subvols: subvolumes on which FOP failed. Always +		   a subset of @fop_subvols */ +                unsigned char   *failed_subvols; + +		/* @dirtied: flag which indicates whether we set dirty flag +		   in the OP. Typically true when we are performing operation +		   on more than one subvol and optimistic changelog is disabled + +		   A 'true' value set in @dirtied flag means an 'undirtying' +		   has to be done in POST-OP phase. +		*/ +		gf_boolean_t  dirtied; + +		/* @inherited: flag which indicates that the dirty flags +		   of the previous transaction were inherited +		*/ +		gf_boolean_t  inherited; + +		/* +		  @no_uninherit: flag which indicates that a pre_op_uninherit() +		  must _not_ be attempted (and returned as failure) always. This +		  flag is set when a hard pre-op is performed, but not accounted +		  for it in fd_ctx->on_disk[]. Such transactions are "isolated" +		  from the pre-op piggybacking entirely and therefore uninherit +		  must not be attempted. +		*/ +		gf_boolean_t no_uninherit; + +		/* @uninherit_done: +		   @uninherit_value: + +		   The above pair variables make pre_op_uninherit() idempotent. +		   Both are FALSE initially. The first call to pre_op_uninherit +		   sets @uninherit_done to TRUE and the return value to +		   @uninherit_value. Further calls will check for @uninherit_done +		   to be TRUE and if so will simply return @uninherit_value. 
+		*/ +		gf_boolean_t uninherit_done; +		gf_boolean_t uninherit_value; + +		/* @changelog_resume: function to be called after changlogging +		   (either pre-op or post-op) is done +		*/ + +		afr_changelog_resume_t changelog_resume; +                  call_frame_t *main_frame; +                int (*wind) (call_frame_t *frame, xlator_t *this, int subvol); +                  int (*fop) (call_frame_t *frame, xlator_t *this);                  int (*done) (call_frame_t *frame, xlator_t *this); @@ -764,7 +686,7 @@ typedef struct _afr_local {                  /* post-op hook */          } transaction; -        afr_self_heal_t self_heal; +	syncbarrier_t barrier;          struct marker_str     marker; @@ -778,75 +700,58 @@ typedef struct _afr_local {  	struct afr_reply *replies;  } afr_local_t; -typedef enum { -        AFR_FD_NOT_OPENED, -        AFR_FD_OPENED, -        AFR_FD_OPENING -} afr_fd_open_status_t; - -typedef struct { -        unsigned int *pre_op_done; -        afr_fd_open_status_t *opened_on; /* which subvolumes the fd is open on */ -        unsigned int *pre_op_piggyback; - -        unsigned int *lock_piggyback; -        unsigned int *lock_acquired; - -        int flags; -        uint64_t up_count;   /* number of CHILD_UPs this fd has seen */ -        uint64_t down_count; /* number of CHILD_DOWNs this fd has seen */ - -        int32_t last_tried; - -        int  hit, miss; -        gf_boolean_t failed_over; -        struct list_head entries; /* needed for readdir failover */ - -        unsigned char *locked_on; /* which subvolumes locks have been successful */ - -	/* used for delayed-post-op optimization */ -	pthread_mutex_t    delay_lock; -	gf_timer_t        *delay_timer; -	call_frame_t      *delay_frame; -        int               call_child; - -	/* set if any write on this fd was a non stable write -	   (i.e, without O_SYNC or O_DSYNC) -	*/ -	gf_boolean_t      witnessed_unstable_write; - -	/* list of frames currently in progress */ -	struct list_head  
eager_locked; -} afr_fd_ctx_t; - - -/* try alloc and if it fails, goto label */ -#define AFR_LOCAL_ALLOC_OR_GOTO(var, label) do {                    \ -                var = mem_get0 (THIS->local_pool);                  \ -                if (!var) {                                         \ -                        gf_log (this->name, GF_LOG_ERROR,           \ -                                "out of memory :(");                \ -                        op_errno = ENOMEM;                          \ -                        goto label;                                 \ -                }                                                   \ -        } while (0); -  /* did a call fail due to a child failing? */  #define child_went_down(op_ret, op_errno) (((op_ret) < 0) &&            \                                             ((op_errno == ENOTCONN) ||   \                                              (op_errno == EBADFD))) -#define afr_fop_failed(op_ret, op_errno) ((op_ret) == -1) +int +afr_inode_read_subvol_get (inode_t *inode, xlator_t *this, +			   unsigned char *data_subvols, +			   unsigned char *metadata_subvols, +			   int *event_generation); +int +__afr_inode_read_subvol_get (inode_t *inode, xlator_t *this, +			     unsigned char *data_subvols, +			     unsigned char *metadata_subvols, +			     int *event_generation); -/* have we tried all children? 
*/ -#define all_tried(i, count)  ((i) == (count) - 1) +int +__afr_inode_read_subvol_set (inode_t *inode, xlator_t *this, +			     unsigned char *data_subvols, +			     unsigned char *metadata_subvol, +			     int event_generation); +int +afr_inode_read_subvol_set (inode_t *inode, xlator_t *this, +			   unsigned char *data_subvols, +			   unsigned char *metadata_subvols, +			   int event_generation); -int32_t -afr_set_dict_gfid (dict_t *dict, uuid_t gfid); +int +afr_inode_read_subvol_reset (inode_t *inode, xlator_t *this);  int -pump_command_reply (call_frame_t *frame, xlator_t *this); +afr_read_subvol_select_by_policy (inode_t *inode, xlator_t *this, +				  unsigned char *readable); + +int +afr_inode_read_subvol_type_get (inode_t *inode, xlator_t *this, +				unsigned char *readable, int *event_p, +				int type); +int +afr_read_subvol_get (inode_t *inode, xlator_t *this, int *subvol_p, +		     int *event_p, afr_transaction_type type); + +#define afr_data_subvol_get(i, t, s, e) \ +	afr_read_subvol_get(i, t, s, e, AFR_DATA_TRANSACTION) + +#define afr_metadata_subvol_get(i, t, s, e) \ +	afr_read_subvol_get(i, t, s, e, AFR_METADATA_TRANSACTION) + +int +afr_inode_refresh (call_frame_t *frame, xlator_t *this, inode_t *inode, +		   afr_inode_refresh_cbk_t cbk);  int32_t  afr_notify (xlator_t *this, int32_t event, void *data, void *data2); @@ -862,9 +767,6 @@ int  afr_attempt_lock_recovery (xlator_t *this, int32_t child_index);  int -afr_save_locked_fd (xlator_t *this, fd_t *fd); - -int  afr_mark_locked_nodes (xlator_t *this, fd_t *fd,                         unsigned char *locked_nodes); @@ -874,10 +776,6 @@ afr_set_lk_owner (call_frame_t *frame, xlator_t *this, void *lk_owner);  int  afr_set_lock_number (call_frame_t *frame, xlator_t *this); - -loc_t * -lower_path (loc_t *l1, const char *b1, loc_t *l2, const char *b2); -  int32_t  afr_unlock (call_frame_t *frame, xlator_t *this); @@ -897,42 +795,26 @@ int  afr_lk_transfer_datalock (call_frame_t *dst, call_frame_t *src, 
char *dom,                            unsigned int child_count); -int pump_start (call_frame_t *frame, xlator_t *this); -  int  __afr_fd_ctx_set (xlator_t *this, fd_t *fd);  int  afr_fd_ctx_set (xlator_t *this, fd_t *fd); -int32_t -afr_inode_get_read_ctx (xlator_t *this, inode_t *inode, int32_t *fresh_children); - -void -afr_inode_set_read_ctx (xlator_t *this, inode_t *inode, int32_t read_child, -                        int32_t *fresh_children); +afr_fd_ctx_t * +afr_fd_ctx_get (fd_t *fd, xlator_t *this);  int  afr_build_parent_loc (loc_t *parent, loc_t *child, int32_t *op_errno); -unsigned int -afr_up_children_count (unsigned char *child_up, unsigned int child_count); - -unsigned int -afr_locked_children_count (unsigned char *children, unsigned int child_count); - -unsigned int -afr_pre_op_done_children_count (unsigned char *pre_op, -                                unsigned int child_count); +int +afr_locked_nodes_count (unsigned char *locked_nodes, int child_count); -gf_boolean_t -afr_is_fresh_lookup (loc_t *loc, xlator_t *this); +int +afr_replies_interpret (call_frame_t *frame, xlator_t *this, inode_t *inode);  void -afr_update_loc_gfids (loc_t *loc, struct iatt *buf, struct iatt *postparent); - -int -afr_locked_nodes_count (unsigned char *locked_nodes, int child_count); +afr_replies_wipe (afr_local_t *local, afr_private_t *priv);  void  afr_local_cleanup (afr_local_t *local, xlator_t *this); @@ -940,32 +822,16 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this);  int  afr_frame_return (call_frame_t *frame); -gf_boolean_t -afr_is_split_brain (xlator_t *this, inode_t *inode); - -void -afr_set_split_brain (xlator_t *this, inode_t *inode, afr_spb_state_t mdata_spb, -                     afr_spb_state_t data_spb); -  int  afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,            fd_t *fd, dict_t *xdata);  void -afr_set_opendir_done (xlator_t *this, inode_t *inode); - -gf_boolean_t -afr_is_opendir_done (xlator_t *this, inode_t *inode); 
- -void  afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this);  int  afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd); -int -afr_launch_openfd_self_heal (call_frame_t *frame, xlator_t *this, fd_t *fd); -  #define AFR_STACK_UNWIND(fop, frame, params ...)                \          do {                                                    \                  afr_local_t *__local = NULL;                    \ @@ -996,7 +862,16 @@ afr_launch_openfd_self_heal (call_frame_t *frame, xlator_t *this, fd_t *fd);                  }                                               \          } while (0); -#define AFR_NUM_CHANGE_LOGS            3 /*data + metadata + entry*/ +#define AFR_FRAME_INIT(frame, op_errno)				       \ +	({frame->local = mem_get0 (THIS->local_pool);		       \ +	if (afr_local_init (frame->local, THIS->private, &op_errno)) { \ +		afr_local_cleanup (frame->local, THIS);		       \ +		mem_put (frame->local);				       \ +		frame->local = NULL; };				       \ +	frame->local;}) + +#define AFR_STACK_RESET(frame) do { int opr; STACK_RESET (frame->root); AFR_FRAME_INIT(frame, opr);} while (0) +  /* allocate and return a string that is the basename of argument */  static inline char *  AFR_BASENAME (const char *str) @@ -1009,6 +884,9 @@ AFR_BASENAME (const char *str)          return __basename_str;  } +call_frame_t * +afr_copy_frame (call_frame_t *base); +  int  afr_transaction_local_init (afr_local_t *local, xlator_t *this); @@ -1016,9 +894,6 @@ int32_t  afr_marker_getxattr (call_frame_t *frame, xlator_t *this,                       loc_t *loc, const char *name,afr_local_t *local, afr_private_t *priv ); -int32_t * -afr_children_create (int32_t child_count); -  int  afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno); @@ -1027,101 +902,20 @@ afr_internal_lock_init (afr_internal_lock_t *lk, size_t child_count,                          transaction_lk_type_t lk_type);  int -afr_first_up_child (unsigned char *child_up, size_t child_count); 
+afr_higher_errno (int32_t old_errno, int32_t new_errno);  int -afr_select_read_child_from_policy (int32_t *fresh_children, int32_t child_count, -                                   int32_t prev_read_child, -                                   int32_t config_read_child, int32_t *sources, -                                   unsigned int hmode, uuid_t gfid); +afr_final_errno (afr_local_t *local, afr_private_t *priv); -void -afr_set_read_ctx_from_policy (xlator_t *this, inode_t *inode, -                              int32_t *fresh_children, int32_t prev_read_child, -                              int32_t config_read_child, uuid_t gfid); - -int32_t -afr_get_call_child (xlator_t *this, unsigned char *child_up, int32_t read_child, -                    int32_t *fresh_children, -                    int32_t *call_child, int32_t *last_index); - -int32_t -afr_next_call_child (int32_t *fresh_children, unsigned char *child_up, -                     size_t child_count, int32_t *last_index, -                     int32_t read_child); -void -afr_get_fresh_children (int32_t *success_children, int32_t *sources, -                        int32_t *children, unsigned int child_count); -void -afr_children_add_child (int32_t *children, int32_t child, -                              int32_t child_count); -void -afr_children_rm_child (int32_t *children, int32_t child, -                             int32_t child_count); -void -afr_reset_children (int32_t *children, int32_t child_count); -int32_t -afr_most_important_error(int32_t old_errno, int32_t new_errno, -			 gf_boolean_t eio);  int -afr_errno_count (int32_t *children, int *child_errno, -                 unsigned int child_count, int32_t op_errno); -int -afr_get_children_count (int32_t *children, unsigned int child_count); -gf_boolean_t -afr_is_child_present (int32_t *success_children, int32_t child_count, -                      int32_t child); -void -afr_update_gfid_from_iatts (uuid_t uuid, struct iatt *bufs, -                            
int32_t *success_children, -                            unsigned int child_count); -void -afr_reset_xattr (dict_t **xattr, unsigned int child_count); -gf_boolean_t -afr_conflicting_iattrs (struct iatt *bufs, int32_t *success_children, -                        unsigned int child_count, const char *path, -                        const char *xlator_name); -unsigned int -afr_gfid_missing_count (const char *xlator_name, int32_t *children, -                        struct iatt *bufs, unsigned int child_count, -                        const char *path); -void -afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req, const char *path); -void -afr_children_copy (int32_t *dst, int32_t *src, unsigned int child_count); -afr_transaction_type -afr_transaction_type_get (ia_type_t ia_type); -int32_t -afr_resultant_errno_get (int32_t *children, -                         int *child_errno, unsigned int child_count); -void -afr_inode_rm_stale_children (xlator_t *this, inode_t *inode, -                             int32_t *stale_children); -void -afr_launch_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode, -                      gf_boolean_t background, ia_type_t ia_type, char *reason, -                      void (*gfid_sh_success_cbk) (call_frame_t *sh_frame, -                                                   xlator_t *this), -                      int (*unwind) (call_frame_t *frame, xlator_t *this, -                                     int32_t op_ret, int32_t op_errno, -                                     int32_t sh_failed)); -void -afr_fix_open (xlator_t *this, fd_t *fd, size_t need_open_count, int *need_open); +afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req);  void -afr_open_fd_fix (fd_t *fd, xlator_t *this); -int -afr_set_elem_count_get (unsigned char *elems, int child_count); +afr_fix_open (fd_t *fd, xlator_t *this);  afr_fd_ctx_t *  afr_fd_ctx_get (fd_t *fd, xlator_t *this); -gf_boolean_t -afr_open_only_data_self_heal (char *data_self_heal); - 
-gf_boolean_t -afr_data_self_heal_enabled (char *data_self_heal); -  void  afr_set_low_priority (call_frame_t *frame);  int @@ -1137,22 +931,9 @@ afr_matrix_cleanup (int32_t **pending, unsigned int m);  int32_t**  afr_matrix_create (unsigned int m, unsigned int n); -gf_boolean_t -afr_is_errno_set (int *child_errno, int child); - -gf_boolean_t -afr_is_errno_unset (int *child_errno, int child); - -gf_boolean_t -afr_is_fd_fixable (fd_t *fd); -  void -afr_prepare_new_entry_pending_matrix (int32_t **pending, -                                      gf_boolean_t (*is_pending) (int *, int), -                                      int *ctx, struct iatt *buf, -                                      unsigned int child_count); -void -afr_xattr_array_destroy (dict_t **xattr, unsigned int child_count); +afr_filter_xattrs (dict_t *xattr); +  /*   * Special value indicating we should use the "auto" quorum method instead of   * a fixed value (including zero to turn off quorum enforcement). @@ -1172,28 +953,6 @@ afr_xattr_array_destroy (dict_t **xattr, unsigned int child_count);          }                                                                \  } while (0); - -#define AFR_SBRAIN_MSG "Failed on %s as split-brain is seen. Returning EIO." 
- -#define AFR_SBRAIN_CHECK_FD(fd, label) do {                              \ -        if (fd->inode && afr_is_split_brain (this, fd->inode)) {        \ -                op_errno = EIO;                                         \ -                gf_log (this->name, GF_LOG_WARNING,                     \ -                        AFR_SBRAIN_MSG ,uuid_utoa (fd->inode->gfid));   \ -                goto label;                                             \ -        }                                                               \ -} while (0) - -#define AFR_SBRAIN_CHECK_LOC(loc, label) do {                           \ -        if (loc->inode && afr_is_split_brain (this, loc->inode)) {      \ -                op_errno = EIO;                                         \ -                loc_path (loc, NULL);                                   \ -                gf_log (this->name, GF_LOG_WARNING,                     \ -                        AFR_SBRAIN_MSG , loc->path);                    \ -                goto label;                                             \ -        }                                                               \ -} while (0) -  int  afr_fd_report_unstable_write (xlator_t *this, fd_t *fd); @@ -1209,7 +968,7 @@ afr_inodelk_init (afr_inodelk_t *lk, char *dom, size_t child_count);  void  afr_handle_open_fd_count (call_frame_t *frame, xlator_t *this); -afr_inode_ctx_t* -afr_inode_ctx_get (inode_t *inode, xlator_t *this); +int +afr_local_pathinfo (char *pathinfo, gf_boolean_t *is_local);  #endif /* __AFR_H__ */ diff --git a/xlators/cluster/afr/src/pump.c b/xlators/cluster/afr/src/pump.c index 987696e5583..eed5099563b 100644 --- a/xlators/cluster/afr/src/pump.c +++ b/xlators/cluster/afr/src/pump.c @@ -21,6 +21,120 @@  #include "afr-common.c"  #include "defaults.c"  #include "glusterfs.h" +#include "pump.h" + + +static int +afr_set_dict_gfid (dict_t *dict, uuid_t gfid) +{ +        int ret       = 0; +        uuid_t *pgfid = NULL; + +        GF_ASSERT (gfid); + 
+        pgfid = GF_CALLOC (1, sizeof (uuid_t), gf_common_mt_char); +        if (!pgfid) { +                ret = -1; +                goto out; +        } + +        uuid_copy (*pgfid, gfid); + +        ret = dict_set_dynptr (dict, "gfid-req", pgfid, sizeof (uuid_t)); +        if (ret) +                gf_log (THIS->name, GF_LOG_ERROR, "gfid set failed"); + +out: +        if (ret && pgfid) +                GF_FREE (pgfid); +	return ret; +} + +static int +afr_set_root_gfid (dict_t *dict) +{ +        uuid_t gfid; +        int ret = 0; + +        memset (gfid, 0, 16); +        gfid[15] = 1; + +        ret = afr_set_dict_gfid (dict, gfid); + +        return ret; +} + +static int +afr_build_child_loc (xlator_t *this, loc_t *child, loc_t *parent, char *name) +{ +        int   ret = -1; +        uuid_t pargfid = {0}; + +        if (!child) +                goto out; + +        if (!uuid_is_null (parent->inode->gfid)) +                uuid_copy (pargfid, parent->inode->gfid); +        else if (!uuid_is_null (parent->gfid)) +                uuid_copy (pargfid, parent->gfid); + +        if (uuid_is_null (pargfid)) +                goto out; + +        if (strcmp (parent->path, "/") == 0) +                ret = gf_asprintf ((char **)&child->path, "/%s", name); +        else +                ret = gf_asprintf ((char **)&child->path, "%s/%s", parent->path, +                                   name); + +        if (-1 == ret) { +                gf_log (this->name, GF_LOG_ERROR, +                        "asprintf failed while setting child path"); +        } + +        child->name = strrchr (child->path, '/'); +        if (child->name) +                child->name++; + +        child->parent = inode_ref (parent->inode); +        child->inode = inode_new (parent->inode->table); +        uuid_copy (child->pargfid, pargfid); + +        if (!child->inode) { +                ret = -1; +                goto out; +        } + +        ret = 0; +out: +        if ((ret == -1) && child) +  
              loc_wipe (child); + +        return ret; +} + +static void +afr_build_root_loc (xlator_t *this, loc_t *loc) +{ +        afr_private_t   *priv = NULL; + +        priv = this->private; +        loc->path = gf_strdup ("/"); +        loc->name = ""; +        loc->inode = inode_ref (priv->root_inode); +        uuid_copy (loc->gfid, loc->inode->gfid); +} + +static void +afr_update_loc_gfids (loc_t *loc, struct iatt *buf, struct iatt *postparent) +{ +        GF_ASSERT (loc); +        GF_ASSERT (buf); + +        uuid_copy (loc->gfid, buf->ia_gfid); +        if (postparent) +                uuid_copy (loc->pargfid, postparent->ia_gfid); +}  static uint64_t pump_pid = 0;  static inline void @@ -387,54 +501,68 @@ gf_pump_traverse_directory (loc_t *loc)                          if (ret)                                  goto out; -                        if (!IS_ENTRY_CWD (entry->d_name) && -                            !IS_ENTRY_PARENT (entry->d_name)) { - -                                    is_directory_empty = _gf_false; -                                    gf_log (this->name, GF_LOG_DEBUG, -                                            "lookup %s => %"PRId64, -                                            entry_loc.path, -                                            iatt.ia_ino); - -                                    ret = syncop_lookup (this, &entry_loc, NULL, -                                                         &iatt, &xattr_rsp, &parent); - -                                    if (ret) { -                                            gf_log (this->name, GF_LOG_ERROR, -                                                    "%s: lookup failed", -                                                    entry_loc.path); -                                            continue; -                                    } -                                    pump_fill_loc_info (&entry_loc, &iatt, -                                                       &parent); - -                
                    pump_update_resume_state (this, entry_loc.path); - -                                    pump_save_path (this, entry_loc.path); -                                    pump_save_file_stats (this, entry_loc.path); - -                                    ret = pump_check_and_update_status (this); -                                    if (ret < 0) { -                                            gf_log (this->name, GF_LOG_DEBUG, -                                                    "Pump beginning to exit out"); -                                            goto out; -                                    } - -                                    if (IA_ISDIR (iatt.ia_type)) { -                                            if (is_pump_traversal_allowed (this, entry_loc.path)) { -                                                    gf_log (this->name, GF_LOG_TRACE, -                                                            "entering dir=%s", -                                                            entry->d_name); -                                                    gf_pump_traverse_directory (&entry_loc); -                                            } -                                    } +			if ((strcmp (entry->d_name, ".") == 0) || +			    (strcmp (entry->d_name, "..") == 0)) +				continue; + +			is_directory_empty = _gf_false; +			gf_log (this->name, GF_LOG_DEBUG, +				"lookup %s => %"PRId64, +				entry_loc.path, +				iatt.ia_ino); + +			ret = syncop_lookup (this, &entry_loc, NULL, &iatt, +					     &xattr_rsp, &parent); + +			if (ret) { +				gf_log (this->name, GF_LOG_ERROR, +					"%s: lookup failed", entry_loc.path); +				continue; +			} + +			ret = afr_selfheal_name (this, loc->gfid, entry->d_name); +			if (ret) { +				gf_log (this->name, GF_LOG_ERROR, +					"%s: name self-heal failed (%s/%s)", +					entry_loc.path, uuid_utoa (loc->gfid), +					entry->d_name); +				continue; +			} + +			ret = afr_selfheal (this, iatt.ia_gfid); +			if (ret) { +				
gf_log (this->name, GF_LOG_ERROR, +					"%s: self-heal failed (%s)", +					entry_loc.path, uuid_utoa (iatt.ia_gfid)); +				continue; +			} + +			pump_fill_loc_info (&entry_loc, &iatt, &parent); + +			pump_update_resume_state (this, entry_loc.path); + +			pump_save_path (this, entry_loc.path); +			pump_save_file_stats (this, entry_loc.path); + +			ret = pump_check_and_update_status (this); +			if (ret < 0) { +				gf_log (this->name, GF_LOG_DEBUG, +					"Pump beginning to exit out"); +				goto out; +			} + +			if (IA_ISDIR (iatt.ia_type)) { +				if (is_pump_traversal_allowed (this, entry_loc.path)) { +					gf_log (this->name, GF_LOG_TRACE, +						"entering dir=%s", entry->d_name); +					gf_pump_traverse_directory (&entry_loc); +				}                          }                  }                  gf_dirent_free (&entries);                  free_entries = _gf_false; -                gf_log (this->name, GF_LOG_TRACE, -                        "offset incremented to %d", +                gf_log (this->name, GF_LOG_TRACE, "offset incremented to %d",                          (int32_t ) offset);          } @@ -443,7 +571,7 @@ gf_pump_traverse_directory (loc_t *loc)          if (ret < 0)                  gf_log (this->name, GF_LOG_DEBUG, "closing the fd failed"); -        if (is_directory_empty && IS_ROOT_PATH (loc->path)) { +        if (is_directory_empty && (strcmp (loc->path, "/") == 0)) {                 pump_change_state (this, PUMP_STATE_RUNNING);                 gf_log (this->name, GF_LOG_INFO, "Empty source brick. 
"                                  "Nothing to be done."); @@ -1277,128 +1405,16 @@ out:  } -struct _xattr_key { -        char *key; -        struct list_head list; -}; - -static int -__gather_xattr_keys (dict_t *dict, char *key, data_t *value, -                     void *data) -{ -        struct list_head *  list  = data; -        struct _xattr_key * xkey  = NULL; - -        if (!strncmp (key, AFR_XATTR_PREFIX, -                      strlen (AFR_XATTR_PREFIX))) { - -                xkey = GF_CALLOC (1, sizeof (*xkey), gf_afr_mt_xattr_key); -                if (!xkey) -                        return -1; - -                xkey->key = key; -                INIT_LIST_HEAD (&xkey->list); - -                list_add_tail (&xkey->list, list); -        } -        return 0; -} - -static void -__filter_xattrs (dict_t *dict) -{ -        struct list_head keys; - -        struct _xattr_key *key; -        struct _xattr_key *tmp; - -        INIT_LIST_HEAD (&keys); - -        dict_foreach (dict, __gather_xattr_keys, -                      (void *) &keys); - -        list_for_each_entry_safe (key, tmp, &keys, list) { -                dict_del (dict, key->key); - -                list_del_init (&key->list); - -                GF_FREE (key); -        } -} - -int32_t -pump_getxattr_cbk (call_frame_t *frame, void *cookie, -		  xlator_t *this, int32_t op_ret, int32_t op_errno, -		  dict_t *dict, dict_t *xdata) -{ -	afr_private_t   *priv           = NULL; -	afr_local_t     *local          = NULL; -	xlator_t        **children      = NULL; -	int             unwind          = 1; -        int32_t         *last_index     = NULL; -        int32_t         next_call_child = -1; -        int32_t         read_child      = -1; -        int32_t         *fresh_children = NULL; - - -	priv     = this->private; -	children = priv->children; - -	local = frame->local; - -        read_child = (long) cookie; - -	if (op_ret == -1) { -		last_index = &local->cont.getxattr.last_index; -                
fresh_children = local->fresh_children; -                next_call_child = afr_next_call_child (fresh_children, -                                                       local->child_up, -                                                       priv->child_count, -                                                       last_index, read_child); -                if (next_call_child < 0) -                        goto out; - -		unwind = 0; -		STACK_WIND_COOKIE (frame, pump_getxattr_cbk, -				   (void *) (long) read_child, -				   children[next_call_child], -				   children[next_call_child]->fops->getxattr, -				   &local->loc, -				   local->cont.getxattr.name, NULL); -	} - -out: -	if (unwind) { -                if (op_ret >= 0 && dict) -                        __filter_xattrs (dict); - -		AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict, NULL); -	} - -	return 0; -} - -int32_t -pump_getxattr (call_frame_t *frame, xlator_t *this, -	      loc_t *loc, const char *name, dict_t *xdata) +int +pump_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, +	       const char *name, dict_t *xdata)  { -	afr_private_t *   priv       = NULL; -	xlator_t **       children   = NULL; -	int               call_child = 0; -	afr_local_t       *local     = NULL; -	int32_t           ret     = -1; -	int32_t           op_errno   = 0; -        uint64_t          read_child = 0; - - -	VALIDATE_OR_GOTO (frame, out); -	VALIDATE_OR_GOTO (this, out); -	VALIDATE_OR_GOTO (this->private, out); +	afr_private_t *priv = NULL; +	int op_errno = 0; +	int ret = 0; -	priv     = this->private; -	VALIDATE_OR_GOTO (priv->children, out); +	priv = this->private; -	children = priv->children;          if (!priv->use_afr_in_pump) {                  STACK_WIND (frame, default_getxattr_cbk,                              FIRST_CHILD (this), @@ -1407,14 +1423,6 @@ pump_getxattr (call_frame_t *frame, xlator_t *this,                  return 0;          } - -	AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); -	local = 
frame->local; - -        ret = afr_local_init (local, priv, &op_errno); -        if (ret < 0) -                goto out; -          if (name) {                  if (!strncmp (name, AFR_XATTR_PREFIX,                                strlen (AFR_XATTR_PREFIX))) { @@ -1432,32 +1440,7 @@ pump_getxattr (call_frame_t *frame, xlator_t *this,                  }          } -        local->fresh_children = GF_CALLOC (priv->child_count, -                                          sizeof (*local->fresh_children), -                                          gf_afr_mt_int32_t); -        if (!local->fresh_children) { -                ret = -1; -                op_errno = ENOMEM; -                goto out; -        } - -        read_child = afr_inode_get_read_ctx (this, loc->inode, local->fresh_children); -        ret = afr_get_call_child (this, local->child_up, read_child, -                                     local->fresh_children, -                                     &call_child, -                                     &local->cont.getxattr.last_index); -        if (ret < 0) { -                op_errno = -ret; -                goto out; -        } -	loc_copy (&local->loc, loc); -	if (name) -	  local->cont.getxattr.name       = gf_strdup (name); - -	STACK_WIND_COOKIE (frame, pump_getxattr_cbk, -			   (void *) (long) call_child, -			   children[call_child], children[call_child]->fops->getxattr, -			   loc, name, xdata); +	afr_getxattr (frame, this, loc, name, xdata);  	ret = 0;  out: @@ -1466,134 +1449,6 @@ out:  	return 0;  } -static int -afr_setxattr_unwind (call_frame_t *frame, xlator_t *this) -{ -	afr_local_t *   local = NULL; -	call_frame_t   *main_frame = NULL; - -	local = frame->local; - -	LOCK (&frame->lock); -	{ -		if (local->transaction.main_frame) -			main_frame = local->transaction.main_frame; -		local->transaction.main_frame = NULL; -	} -	UNLOCK (&frame->lock); - -	if (main_frame) { -		AFR_STACK_UNWIND (setxattr, main_frame, -                                  
local->op_ret, local->op_errno, NULL); -	} -	return 0; -} - -static int -afr_setxattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, -		       int32_t op_ret, int32_t op_errno, dict_t *xdata) -{ -	afr_local_t *   local = NULL; -	afr_private_t * priv  = NULL; - -	int call_count  = -1; -	int need_unwind = 0; - -	local = frame->local; -	priv = this->private; - -	LOCK (&frame->lock); -	{ -		if (op_ret != -1) { -			if (local->success_count == 0) { -				local->op_ret = op_ret; -			} -			local->success_count++; - -			if (local->success_count == priv->child_count) { -				need_unwind = 1; -			} -		} - -		local->op_errno = op_errno; -	} -	UNLOCK (&frame->lock); - -	if (need_unwind) -		local->transaction.unwind (frame, this); - -	call_count = afr_frame_return (frame); - -	if (call_count == 0) { -		local->transaction.resume (frame, this); -	} - -	return 0; -} - -static int -afr_setxattr_wind (call_frame_t *frame, xlator_t *this) -{ -	afr_local_t *local = NULL; -	afr_private_t *priv = NULL; - -	int call_count = -1; -	int i = 0; - -	local = frame->local; -	priv = this->private; - -	call_count = afr_up_children_count (local->child_up, priv->child_count); - -	if (call_count == 0) { -		local->transaction.resume (frame, this); -		return 0; -	} - -	local->call_count = call_count; - -	for (i = 0; i < priv->child_count; i++) { -		if (local->child_up[i]) { -			STACK_WIND_COOKIE (frame, afr_setxattr_wind_cbk, -					   (void *) (long) i, -					   priv->children[i], -					   priv->children[i]->fops->setxattr, -					   &local->loc, -					   local->cont.setxattr.dict, -					   local->cont.setxattr.flags, NULL); - -			if (!--call_count) -				break; -		} -	} - -	return 0; -} - - -static int -afr_setxattr_done (call_frame_t *frame, xlator_t *this) -{ -	afr_local_t * local = frame->local; - -	local->transaction.unwind (frame, this); - -	AFR_STACK_DESTROY (frame); - -	return 0; -} - -int32_t -pump_setxattr_cbk (call_frame_t *frame, -		      void *cookie, -		      xlator_t *this, -		 
     int32_t op_ret, -		      int32_t op_errno, dict_t *xdata) -{ -	AFR_STACK_UNWIND (setxattr, frame, op_ret, op_errno, xdata); -	return 0; -} -  int  pump_command_reply (call_frame_t *frame, xlator_t *this)  { @@ -1617,51 +1472,56 @@ pump_command_reply (call_frame_t *frame, xlator_t *this)  }  int -pump_parse_command (call_frame_t *frame, xlator_t *this, -                    afr_local_t *local, dict_t *dict) +pump_parse_command (call_frame_t *frame, xlator_t *this, dict_t *dict, +		    int *op_errno_p)  { - +	afr_local_t *local = NULL;          int ret = -1; +	int op_errno = 0;          if (pump_command_start (this, dict)) { -                frame->local = local; +                local = AFR_FRAME_INIT (frame, op_errno); +		if (!local) +			goto out;                  local->dict = dict_ref (dict);                  ret = pump_execute_start (frame, this);          } else if (pump_command_pause (this, dict)) { -                frame->local = local; +		local = AFR_FRAME_INIT (frame, op_errno); +		if (!local) +			goto out;                  local->dict = dict_ref (dict);                  ret = pump_execute_pause (frame, this);          } else if (pump_command_abort (this, dict)) { -                frame->local = local; +		local = AFR_FRAME_INIT (frame, op_errno); +		if (!local) +			goto out;                  local->dict = dict_ref (dict);                  ret = pump_execute_abort (frame, this);          } else if (pump_command_commit (this, dict)) { -                frame->local = local; +		local = AFR_FRAME_INIT (frame, op_errno); +		if (!local) +			goto out;                  local->dict = dict_ref (dict);                  ret = pump_execute_commit (frame, this);          } +out: +	if (op_errno_p) +		*op_errno_p = op_errno;          return ret;  }  int -pump_setxattr (call_frame_t *frame, xlator_t *this, -               loc_t *loc, dict_t *dict, int32_t flags, dict_t *xdata) +pump_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, +	       int32_t 
flags, dict_t *xdata)  { -	afr_private_t * priv  = NULL; -	afr_local_t   * local = NULL; -	call_frame_t   *transaction_frame = NULL; +	afr_private_t *priv = NULL;  	int ret = -1;  	int op_errno = 0; -	VALIDATE_OR_GOTO (frame, out); -	VALIDATE_OR_GOTO (this, out); -	VALIDATE_OR_GOTO (this->private, out); - -        GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.pump*", dict, -                                   op_errno, out); +        GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.pump*", dict, op_errno, out);  	priv = this->private;          if (!priv->use_afr_in_pump) { @@ -1672,57 +1532,15 @@ pump_setxattr (call_frame_t *frame, xlator_t *this,                  return 0;          } - -	AFR_LOCAL_ALLOC_OR_GOTO (local, out); - -	ret = afr_local_init (local, priv, &op_errno); -	if (ret < 0) { -                afr_local_cleanup (local, this); -                mem_put (local); -		goto out; -        } - -        ret = pump_parse_command (frame, this, -                                  local, dict); -        if (ret >= 0) { -                ret = 0; +        ret = pump_parse_command (frame, this, dict, &op_errno); +        if (ret >= 0)                  goto out; -        } - -	transaction_frame = copy_frame (frame); -	if (!transaction_frame) { -		gf_log (this->name, GF_LOG_ERROR, -			"Out of memory."); -                op_errno = ENOMEM; -                ret = -1; -                afr_local_cleanup (local, this); -		goto out; -	} - -	transaction_frame->local = local; - -	local->op_ret = -1; - -	local->cont.setxattr.dict  = dict_ref (dict); -	local->cont.setxattr.flags = flags; - -	local->transaction.fop    = afr_setxattr_wind; -	local->transaction.done   = afr_setxattr_done; -	local->transaction.unwind = afr_setxattr_unwind; - -	loc_copy (&local->loc, loc); - -	local->transaction.main_frame = frame; -	local->transaction.start   = LLONG_MAX - 1; -	local->transaction.len     = 0; -	afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); +	afr_setxattr (frame, 
this, loc, dict, flags, xdata);  	ret = 0;  out:  	if (ret < 0) { -		if (transaction_frame) -			AFR_STACK_DESTROY (transaction_frame);  		AFR_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL);  	} @@ -2416,10 +2234,6 @@ init (xlator_t *this)                  goto out;          LOCK_INIT (&priv->lock); -        LOCK_INIT (&priv->read_child_lock); -        //lock recovery is not done in afr -        pthread_mutex_init (&priv->mutex, NULL); -        INIT_LIST_HEAD (&priv->saved_fds);          child_count = xlator_subvolume_count (this);          if (child_count != 2) { @@ -2453,8 +2267,6 @@ init (xlator_t *this)             and the sink.          */ -	priv->strict_readdir = _gf_false; -	priv->wait_count = 1;  	priv->child_up = GF_CALLOC (sizeof (unsigned char), child_count,                                   gf_afr_mt_char);  	if (!priv->child_up) { @@ -2508,7 +2320,6 @@ init (xlator_t *this)                  goto out;          } -        priv->first_lookup = 1;          priv->root_inode = NULL;          priv->last_event = GF_CALLOC (child_count, sizeof (*priv->last_event), @@ -2579,7 +2390,6 @@ out:                  GF_FREE (priv->pending_key);                  GF_FREE (priv->last_event);                  LOCK_DESTROY (&priv->lock); -                LOCK_DESTROY (&priv->read_child_lock);                  GF_FREE (priv);          } diff --git a/xlators/cluster/afr/src/pump.h b/xlators/cluster/afr/src/pump.h index bc4c31a78a5..9d0b6db6a5e 100644 --- a/xlators/cluster/afr/src/pump.h +++ b/xlators/cluster/afr/src/pump.h @@ -75,4 +75,7 @@ pump_command_status (xlator_t *this, dict_t *dict);  int  pump_execute_status (call_frame_t *frame, xlator_t *this); +int +pump_command_reply (call_frame_t *frame, xlator_t *this); +  #endif /* __PUMP_H__ */ diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c index 3055f4615cf..3868fc38fd5 100644 --- a/xlators/cluster/dht/src/dht-common.c +++ b/xlators/cluster/dht/src/dht-common.c @@ -3120,7 +3120,7 
@@ dht_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,                  /* making sure we set the inode ctx right with layout,                     currently possible only for non-directories, so for                     directories don't set entry inodes */ -                if (!IA_ISDIR(entry->d_stat.ia_type)) { +                if (!IA_ISDIR(entry->d_stat.ia_type) && orig_entry->inode) {                          ret = dht_layout_preset (this, prev->this,                                                   orig_entry->inode);                          if (ret) diff --git a/xlators/cluster/stripe/src/stripe.c b/xlators/cluster/stripe/src/stripe.c index 32d53e8e6e2..79e80b51381 100644 --- a/xlators/cluster/stripe/src/stripe.c +++ b/xlators/cluster/stripe/src/stripe.c @@ -4886,7 +4886,7 @@ unlock:                  if (!local_entry)                          break; -                if (!IA_ISREG (local_entry->d_stat.ia_type)) { +                if (!IA_ISREG (local_entry->d_stat.ia_type) || !local_entry->inode) {                          LOCK (&frame->lock);                          {                                  local->wind_count--;  | 
