From 6d3739292b7b51d2ddbab75b5f884fb38925b943 Mon Sep 17 00:00:00 2001 From: Anand Avati Date: Thu, 16 Jan 2014 16:14:36 -0800 Subject: cluster/afr: refactor - Remove client side self-healing completely (opendir, openfd, lookup) - Re-work readdir-failover to work reliably in case of NFS - Remove unused/dead lock recovery code - Consistently use xdata in both calls and callbacks in all FOPs - Per-inode event generation, used to force inode ctx refresh - Implement dirty flag support (in place of pending counts) - Eliminate inode ctx structure, use read subvol bits + event_generation - Implement inode ctx refreshing based on event generation - Provide backward compatibility in transactions - remove unused variables and functions - make code more consistent in style and pattern - regularize and clean up inode-write transaction code - regularize and clean up dir-write transaction code - regularize and clean up common FOPs - reorganize transaction framework code - skip setting xattrs in pending dict if nothing is pending - re-write self-healing code using syncops - re-write simpler self-heal-daemon Change-Id: I1e4080c9796c8a2815c2dab4be3073f389d614a8 BUG: 1021686 Signed-off-by: Anand Avati Reviewed-on: http://review.gluster.org/6010 Tested-by: Gluster Build System Reviewed-by: Vijay Bellur --- xlators/cluster/afr/src/afr-common.c | 4351 +++++++++++++--------------------- 1 file changed, 1677 insertions(+), 2674 deletions(-) (limited to 'xlators/cluster/afr/src/afr-common.c') diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index 224d3054626..2bab0f8533d 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -45,787 +45,797 @@ #include "afr-dir-write.h" #include "afr-transaction.h" #include "afr-self-heal.h" -#include "afr-self-heal-common.h" #include "afr-self-heald.h" -#include "pump.h" -#define AFR_ICTX_OPENDIR_DONE_MASK 0x0000000100000000ULL -#define AFR_ICTX_READ_CHILD_MASK 0x00000000FFFFFFFFULL -#define AFR_STATISTICS_HISTORY_SIZE 50 -int -afr_lookup_done_success_action (call_frame_t *frame, xlator_t *this, - gf_boolean_t fail_conflict); -void -afr_children_copy (int32_t *dst, int32_t *src, unsigned int child_count) -{ - int i = 0; - - for (i = 0; i < child_count; i++) - dst[i] = src[i]; -} -void -afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req, const char *path) +call_frame_t * +afr_copy_frame (call_frame_t *base) { - int i = 0; - afr_private_t *priv = NULL; - int ret = 0; + afr_local_t *local = NULL; + call_frame_t *frame = NULL; + int op_errno = 0; - priv = this->private; + frame = copy_frame (base); + if (!frame) + return NULL; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) { + AFR_STACK_DESTROY (frame); + return NULL; + } - for (i = 0; i < priv->child_count; i++) { - ret = dict_set_uint64 (xattr_req, priv->pending_key[i], - 3 * sizeof(int32_t)); - if (ret < 0) - gf_log (this->name, GF_LOG_WARNING, - "%s: Unable to set dict value for %s", - path, priv->pending_key[i]); - /* 3 = data+metadata+entry */ - } - ret = dict_set_int32 (xattr_req, GF_GFIDLESS_LOOKUP, 1); - if (ret) { - gf_log (this->name, GF_LOG_DEBUG, "%s: failed to set gfidless " - "lookup", path); - } + return frame; } +/* + * INODE CTX 64-bit VALUE FORMAT FOR SMALL (<= 16) SUBVOL COUNTS: + * + * |<---------- 64bit ------------>| + * 63 32 31 16 15 0 + * | EVENT_GEN | DATA | METADATA | + * + * + * METADATA (bit-0 .. bit-15): bitmap representing subvolumes from which + * metadata can be attempted to be read. + * + * bit-0 => priv->subvolumes[0] + * bit-1 => priv->subvolumes[1] + * ... etc. till bit-15 + * + * DATA (bit-16 .. bit-31): bitmap representing subvolumes from which data + * can be attempted to be read. + * + * bit-16 => priv->subvolumes[0] + * bit-17 => priv->subvolumes[1] + * ... etc. till bit-31 + * + * EVENT_GEN (bit-32 .. bit-63): event generation (i.e priv->event_generation) + * when DATA and METADATA was last updated. + * + * If EVENT_GEN is < priv->event_generation, + * or is 0, it means afr_inode_refresh() needs + * to be called to recalculate the bitmaps. + */ + int -afr_lookup_xattr_req_prepare (afr_local_t *local, xlator_t *this, - dict_t *xattr_req, loc_t *loc, void **gfid_req) +__afr_inode_read_subvol_get_small (inode_t *inode, xlator_t *this, + unsigned char *data, unsigned char *metadata, + int *event_p) { - int ret = -ENOMEM; + afr_private_t *priv = NULL; + int ret = -1; + uint16_t datamap = 0; + uint16_t metadatamap = 0; + uint32_t event = 0; + uint64_t val = 0; + int i = 0; - GF_ASSERT (gfid_req); + priv = this->private; - *gfid_req = NULL; - local->xattr_req = dict_new (); - if (!local->xattr_req) - goto out; - if (xattr_req) - dict_copy (xattr_req, local->xattr_req); + ret = __inode_ctx_get (inode, this, &val); + if (ret < 0) + return ret; - afr_xattr_req_prepare (this, local->xattr_req, loc->path); - ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_INODELK_COUNT, 0); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "%s: Unable to set dict value for %s", - loc->path, GLUSTERFS_INODELK_COUNT); - } - ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_ENTRYLK_COUNT, 0); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "%s: Unable to set dict value for %s", - loc->path, GLUSTERFS_ENTRYLK_COUNT); - } + metadatamap = (val & 0x000000000000ffff); + datamap = (val & 0x00000000ffff0000) >> 16; + event = (val & 0xffffffff00000000) >> 32; - ret = dict_set_uint32 (local->xattr_req, GLUSTERFS_PARENT_ENTRYLK, 0); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "%s: Unable to set dict value for %s", - loc->path, GLUSTERFS_PARENT_ENTRYLK); - } + for (i = 0; i < priv->child_count; i++) { + if (metadata) + metadata[i] = (metadatamap >> i) & 1; + if (data) + data[i] = (datamap >> i) & 1; + } - ret = dict_get_ptr (local->xattr_req, "gfid-req", gfid_req); - if (ret) { - gf_log (this->name, GF_LOG_DEBUG, - "%s: failed to get the gfid from dict", loc->path); - *gfid_req = NULL; - } else { - if (loc->parent != NULL) - dict_del (local->xattr_req, "gfid-req"); - } - ret = 0; -out: - return ret; + if (event_p) + *event_p = event; + return ret; } -void -afr_lookup_save_gfid (uuid_t dst, void* new, const loc_t *loc) -{ - inode_t *inode = NULL; - - inode = loc->inode; - if (inode && !uuid_is_null (inode->gfid)) - uuid_copy (dst, inode->gfid); - else if (!uuid_is_null (loc->gfid)) - uuid_copy (dst, loc->gfid); - else if (new && !uuid_is_null (new)) - uuid_copy (dst, new); -} int -afr_errno_count (int32_t *children, int *child_errno, - unsigned int child_count, int32_t op_errno) -{ - int i = 0; - int errno_count = 0; - int child = 0; +__afr_inode_read_subvol_set_small (inode_t *inode, xlator_t *this, + unsigned char *data, unsigned char *metadata, + int event) +{ + afr_private_t *priv = NULL; + uint16_t datamap = 0; + uint16_t metadatamap = 0; + uint64_t val = 0; + int i = 0; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (data[i]) + datamap |= (1 << i); + if (metadata[i]) + metadatamap |= (1 << i); + } - for (i = 0; i < child_count; i++) { - if (children) { - child = children[i]; - if (child == -1) - break; - } else { - child = i; - } - if (child_errno[child] == op_errno) - errno_count++; - } - return errno_count; -} + val = ((uint64_t) metadatamap) | + (((uint64_t) datamap) << 16) | + (((uint64_t) event) << 32); -int32_t -afr_set_dict_gfid (dict_t *dict, uuid_t gfid) -{ - int ret = 0; - uuid_t *pgfid = NULL; + return __inode_ctx_set (inode, this, &val); +} - GF_ASSERT (gfid); - pgfid = GF_CALLOC (1, sizeof (uuid_t), gf_common_mt_char); - if (!pgfid) { - ret = -1; - goto out; - } +int +__afr_inode_read_subvol_reset_small (inode_t *inode, xlator_t *this) +{ + int ret = -1; + uint16_t datamap = 0; + uint16_t metadatamap = 0; + uint32_t event = 0; + uint64_t val = 0; - uuid_copy (*pgfid, gfid); + ret = __inode_ctx_get (inode, this, &val); + (void) ret; - ret = dict_set_dynptr (dict, "gfid-req", pgfid, sizeof (uuid_t)); - if (ret) - gf_log (THIS->name, GF_LOG_ERROR, "gfid set failed"); + metadatamap = (val & 0x000000000000ffff) >> 0; + datamap = (val & 0x00000000ffff0000) >> 16; + event = 0; -out: - if (ret && pgfid) - GF_FREE (pgfid); + val = ((uint64_t) metadatamap) | + (((uint64_t) datamap) << 16) | + (((uint64_t) event) << 32); - return ret; + return __inode_ctx_set (inode, this, &val); } -void -afr_inode_ctx_destroy (afr_inode_ctx_t *ctx) -{ - if (!ctx) - return; - GF_FREE (ctx->fresh_children); - GF_FREE (ctx); -} -afr_inode_ctx_t* -__afr_inode_ctx_get (inode_t *inode, xlator_t *this) +int +__afr_inode_read_subvol_get (inode_t *inode, xlator_t *this, + unsigned char *data, unsigned char *metadata, + int *event_p) { - int ret = 0; - uint64_t ctx_addr = 0; - afr_inode_ctx_t *ctx = NULL; - afr_private_t *priv = NULL; + afr_private_t *priv = NULL; + int ret = -1; - priv = this->private; - ret = __inode_ctx_get (inode, this, &ctx_addr); - if (ret < 0) - ctx_addr = 0; - if (ctx_addr != 0) { - ctx = (afr_inode_ctx_t*) (long) ctx_addr; - goto out; - } - ctx = GF_CALLOC (1, sizeof (*ctx), - gf_afr_mt_inode_ctx_t); - if (!ctx) - goto fail; - ctx->fresh_children = GF_CALLOC (priv->child_count, - sizeof (*ctx->fresh_children), - gf_afr_mt_int32_t); - if (!ctx->fresh_children) - goto fail; - ret = __inode_ctx_put (inode, this, (uint64_t)ctx); - if (ret) { - gf_log_callingfn (this->name, GF_LOG_ERROR, "failed to " - "set the inode ctx (%s)", - uuid_utoa (inode->gfid)); - goto fail; - } + priv = this->private; -out: - return ctx; + if (priv->child_count <= 16) + ret = __afr_inode_read_subvol_get_small (inode, this, data, + metadata, event_p); + else + /* TBD: allocate structure with array and read from it */ + ret = -1; -fail: - afr_inode_ctx_destroy (ctx); - return NULL; + return ret; } -afr_inode_ctx_t* -afr_inode_ctx_get (inode_t *inode, xlator_t *this) + +int +__afr_inode_read_subvol_set (inode_t *inode, xlator_t *this, unsigned char *data, + unsigned char *metadata, int event) { - afr_inode_ctx_t *ctx = NULL; + afr_private_t *priv = NULL; + int ret = -1; - LOCK (&inode->lock); - { - ctx = __afr_inode_ctx_get (inode, this); - } - UNLOCK (&inode->lock); - return ctx; + priv = this->private; + + if (priv->child_count <= 16) + ret = __afr_inode_read_subvol_set_small (inode, this, data, + metadata, event); + else + ret = -1; + + return ret; } -void -afr_inode_get_ctx_params (xlator_t *this, inode_t *inode, - afr_inode_params_t *params) + +int +__afr_inode_read_subvol_reset (inode_t *inode, xlator_t *this) { - GF_ASSERT (inode); - GF_ASSERT (params); + afr_private_t *priv = NULL; + int ret = -1; - afr_inode_ctx_t *ctx = NULL; - afr_private_t *priv = NULL; - int i = 0; - int32_t read_child = -1; - int32_t *fresh_children = NULL; + priv = this->private; - priv = this->private; - LOCK (&inode->lock); - { - ctx = __afr_inode_ctx_get (inode, this); - if (!ctx) - goto unlock; - switch (params->op) { - case AFR_INODE_GET_READ_CTX: - fresh_children = params->u.read_ctx.children; - read_child = (int32_t)(ctx->masks & - AFR_ICTX_READ_CHILD_MASK); - params->u.read_ctx.read_child = read_child; - if (!fresh_children) - goto unlock; - for (i = 0; i < priv->child_count; i++) - fresh_children[i] = ctx->fresh_children[i]; - break; - case AFR_INODE_GET_OPENDIR_DONE: - params->u.value = _gf_false; - if (ctx->masks & AFR_ICTX_OPENDIR_DONE_MASK) - params->u.value = _gf_true; - break; - default: - GF_ASSERT (0); - break; - } - } -unlock: - UNLOCK (&inode->lock); + if (priv->child_count <= 16) + ret = __afr_inode_read_subvol_reset_small (inode, this); + else + ret = -1; + + return ret; } -gf_boolean_t -afr_is_split_brain (xlator_t *this, inode_t *inode) + +int +afr_inode_read_subvol_get (inode_t *inode, xlator_t *this, unsigned char *data, + unsigned char *metadata, int *event_p) { - afr_inode_ctx_t *ctx = NULL; - gf_boolean_t spb = _gf_false; + int ret = -1; - ctx = afr_inode_ctx_get (inode, this); - if (!ctx) - goto out; - if ((ctx->mdata_spb == SPB) || (ctx->data_spb == SPB)) - spb = _gf_true; -out: - return spb; + LOCK(&inode->lock); + { + ret = __afr_inode_read_subvol_get (inode, this, data, + metadata, event_p); + } + UNLOCK(&inode->lock); + + return ret; } -gf_boolean_t -afr_is_opendir_done (xlator_t *this, inode_t *inode) + +int +afr_inode_read_subvol_set (inode_t *inode, xlator_t *this, unsigned char *data, + unsigned char *metadata, int event) { - afr_inode_params_t params = {0}; + int ret = -1; + + LOCK(&inode->lock); + { + ret = __afr_inode_read_subvol_set (inode, this, data, metadata, + event); + } + UNLOCK(&inode->lock); - params.op = AFR_INODE_GET_OPENDIR_DONE; - afr_inode_get_ctx_params (this, inode, ¶ms); - return params.u.value; + return ret; } -int32_t -afr_inode_get_read_ctx (xlator_t *this, inode_t *inode, int32_t *fresh_children) + +int +afr_inode_read_subvol_reset (inode_t *inode, xlator_t *this) { - afr_inode_params_t params = {0}; + int ret = -1; - params.op = AFR_INODE_GET_READ_CTX; - params.u.read_ctx.children = fresh_children; - afr_inode_get_ctx_params (this, inode, ¶ms); - return params.u.read_ctx.read_child; + LOCK(&inode->lock); + { + ret = __afr_inode_read_subvol_reset (inode, this); + } + UNLOCK(&inode->lock); + + return ret; } -void -afr_inode_ctx_set_read_child (afr_inode_ctx_t *ctx, int32_t read_child) -{ - uint64_t remaining_mask = 0; - uint64_t mask = 0; - remaining_mask = (~AFR_ICTX_READ_CHILD_MASK & ctx->masks); - mask = (AFR_ICTX_READ_CHILD_MASK & read_child); - ctx->masks = remaining_mask | mask; -} +int +afr_accused_fill (xlator_t *this, dict_t *xdata, unsigned char *accused, + afr_transaction_type type) +{ + afr_private_t *priv = NULL; + int i = 0; + int idx = afr_index_for_transaction_type (type); + void *pending_raw = NULL; + int pending[3]; + int ret = 0; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + ret = dict_get_ptr (xdata, priv->pending_key[i], + &pending_raw); + if (ret) /* no pending flags */ + continue; + memcpy (pending, pending_raw, sizeof(pending)); + + if (ntoh32 (pending[idx])) + accused[i] = 1; + } -void -afr_inode_ctx_set_read_ctx (afr_inode_ctx_t *ctx, int32_t read_child, - int32_t *fresh_children, int32_t child_count) -{ - int i = 0; - - afr_inode_ctx_set_read_child (ctx, read_child); - for (i = 0; i < child_count; i++) { - if (fresh_children) - ctx->fresh_children[i] = fresh_children[i]; - else - ctx->fresh_children[i] = -1; - } + return 0; } -void -afr_inode_ctx_rm_stale_children (afr_inode_ctx_t *ctx, int32_t *stale_children, - int32_t child_count) + +int +afr_accuse_smallfiles (xlator_t *this, struct afr_reply *replies, + unsigned char *data_accused) { - int i = 0; - int32_t read_child = -1; + int i = 0; + afr_private_t *priv = NULL; + uint64_t maxsize = 0; - GF_ASSERT (stale_children); - for (i = 0; i < child_count; i++) { - if (stale_children[i] == -1) - break; - afr_children_rm_child (ctx->fresh_children, - stale_children[i], child_count); - } - read_child = (int32_t)(ctx->masks & AFR_ICTX_READ_CHILD_MASK); - if (!afr_is_child_present (ctx->fresh_children, child_count, - read_child)) - afr_inode_ctx_set_read_child (ctx, ctx->fresh_children[0]); -} + priv = this->private; -void -afr_inode_ctx_set_opendir_done (afr_inode_ctx_t *ctx) -{ - uint64_t remaining_mask = 0; - uint64_t mask = 0; + for (i = 0; i < priv->child_count; i++) { + if (data_accused[i]) + continue; + if (replies[i].poststat.ia_size > maxsize) + maxsize = replies[i].poststat.ia_size; + } + + for (i = 0; i < priv->child_count; i++) { + if (data_accused[i]) + continue; + if (replies[i].poststat.ia_size < maxsize) + data_accused[i] = 1; + } - remaining_mask = (~AFR_ICTX_OPENDIR_DONE_MASK & ctx->masks); - mask = (0xFFFFFFFFFFFFFFFFULL & AFR_ICTX_OPENDIR_DONE_MASK); - ctx->masks = remaining_mask | mask; + return 0; } -void -afr_inode_set_ctx_params (xlator_t *this, inode_t *inode, - afr_inode_params_t *params) -{ - GF_ASSERT (inode); - GF_ASSERT (params); - afr_inode_ctx_t *ctx = NULL; - afr_private_t *priv = NULL; - int32_t read_child = -1; - int32_t *fresh_children = NULL; - int32_t *stale_children = NULL; +int +afr_replies_interpret (call_frame_t *frame, xlator_t *this, inode_t *inode) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + struct afr_reply *replies = NULL; + int event_generation = 0; + int i = 0; + unsigned char *data_accused = NULL; + unsigned char *metadata_accused = NULL; + unsigned char *data_readable = NULL; + unsigned char *metadata_readable = NULL; + int ret = 0; - priv = this->private; - LOCK (&inode->lock); - { - ctx = __afr_inode_ctx_get (inode, this); - if (!ctx) - goto unlock; - switch (params->op) { - case AFR_INODE_SET_READ_CTX: - read_child = params->u.read_ctx.read_child; - fresh_children = params->u.read_ctx.children; - afr_inode_ctx_set_read_ctx (ctx, read_child, - fresh_children, - priv->child_count); - break; - case AFR_INODE_RM_STALE_CHILDREN: - stale_children = params->u.read_ctx.children; - afr_inode_ctx_rm_stale_children (ctx, - stale_children, - priv->child_count); - break; - case AFR_INODE_SET_OPENDIR_DONE: - afr_inode_ctx_set_opendir_done (ctx); - break; - default: - GF_ASSERT (0); - break; - } - } -unlock: - UNLOCK (&inode->lock); -} + local = frame->local; + priv = this->private; + replies = local->replies; + event_generation = local->event_generation; + + data_accused = alloca0 (priv->child_count); + data_readable = alloca0 (priv->child_count); + metadata_accused = alloca0 (priv->child_count); + metadata_readable = alloca0 (priv->child_count); + + for (i = 0; i < priv->child_count; i++) { + data_readable[i] = 1; + metadata_readable[i] = 1; + } -void -afr_set_split_brain (xlator_t *this, inode_t *inode, afr_spb_state_t mdata_spb, - afr_spb_state_t data_spb) -{ - afr_inode_ctx_t *ctx = NULL; + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) { + data_readable[i] = 0; + metadata_readable[i] = 0; + continue; + } - ctx = afr_inode_ctx_get (inode, this); - if (mdata_spb != DONT_KNOW) - ctx->mdata_spb = mdata_spb; - if (data_spb != DONT_KNOW) - ctx->data_spb = data_spb; -} + if (replies[i].op_ret == -1) { + data_readable[i] = 0; + metadata_readable[i] = 0; + continue; + } -void -afr_set_opendir_done (xlator_t *this, inode_t *inode) -{ - afr_inode_params_t params = {0}; + afr_accused_fill (this, replies[i].xdata, data_accused, + (inode->ia_type == IA_IFDIR) ? + AFR_ENTRY_TRANSACTION : AFR_DATA_TRANSACTION); - params.op = AFR_INODE_SET_OPENDIR_DONE; - afr_inode_set_ctx_params (this, inode, ¶ms); -} + afr_accused_fill (this, replies[i].xdata, + metadata_accused, AFR_METADATA_TRANSACTION); -void -afr_inode_set_read_ctx (xlator_t *this, inode_t *inode, int32_t read_child, - int32_t *fresh_children) -{ - afr_inode_params_t params = {0}; - afr_private_t *priv = NULL; + } - priv = this->private; - GF_ASSERT (read_child >= 0); - GF_ASSERT (fresh_children); - GF_ASSERT (afr_is_child_present (fresh_children, priv->child_count, - read_child)); - - params.op = AFR_INODE_SET_READ_CTX; - params.u.read_ctx.read_child = read_child; - params.u.read_ctx.children = fresh_children; - afr_inode_set_ctx_params (this, inode, ¶ms); + if (inode->ia_type != IA_IFDIR) + afr_accuse_smallfiles (this, replies, data_accused); + + for (i = 0; i < priv->child_count; i++) { + if (data_accused[i]) { + data_readable[i] = 0; + ret = 1; + } + if (metadata_accused[i]) { + metadata_readable[i] = 0; + ret = 1; + } + } + + afr_inode_read_subvol_set (inode, this, data_readable, + metadata_readable, event_generation); + return ret; } -void -afr_inode_rm_stale_children (xlator_t *this, inode_t *inode, - int32_t *stale_children) -{ - afr_inode_params_t params = {0}; - GF_ASSERT (stale_children); - params.op = AFR_INODE_RM_STALE_CHILDREN; - params.u.read_ctx.children = stale_children; - afr_inode_set_ctx_params (this, inode, ¶ms); +int +afr_refresh_selfheal_done (int ret, call_frame_t *heal, void *opaque) +{ + if (heal) + STACK_DESTROY (heal->root); + return 0; } -gf_boolean_t -afr_is_source_child (int32_t *sources, int32_t child_count, int32_t child) +int +afr_inode_refresh_err (call_frame_t *frame, xlator_t *this) { - gf_boolean_t source_xattrs = _gf_false; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; + int err = 0; - GF_ASSERT (child < child_count); + local = frame->local; + priv = this->private; - if ((child >= 0) && (child < child_count) && - sources[child]) { - source_xattrs = _gf_true; - } - return source_xattrs; + for (i = 0; i < priv->child_count; i++) { + if (local->replies[i].valid && !local->replies[i].op_ret) { + err = 0; + goto ret; + } + } + + err = afr_final_errno (local, priv); +ret: + return -err; } -gf_boolean_t -afr_is_child_present (int32_t *success_children, int32_t child_count, - int32_t child) + +int +afr_refresh_selfheal_wrap (void *opaque) { - gf_boolean_t success_child = _gf_false; - int i = 0; + call_frame_t *frame = opaque; + afr_local_t *local = NULL; + xlator_t *this = NULL; + int err = 0; - GF_ASSERT (child < child_count); + local = frame->local; + this = frame->this; - for (i = 0; i < child_count; i++) { - if (success_children[i] == -1) - break; - if (child == success_children[i]) { - success_child = _gf_true; - break; - } - } - return success_child; -} + afr_selfheal (frame->this, local->refreshinode->gfid); -gf_boolean_t -afr_is_read_child (int32_t *success_children, int32_t *sources, - int32_t child_count, int32_t child) -{ - gf_boolean_t success_child = _gf_false; - gf_boolean_t source = _gf_false; + afr_selfheal_unlocked_discover (frame, local->refreshinode, + local->refreshinode->gfid, + local->replies); - if (child < 0) { - return _gf_false; - } + afr_replies_interpret (frame, this, local->refreshinode); - GF_ASSERT (success_children); - GF_ASSERT (child_count > 0); + err = afr_inode_refresh_err (frame, this); - success_child = afr_is_child_present (success_children, child_count, - child); - if (!success_child) - goto out; - if (NULL == sources) { - source = _gf_true; - goto out; - } - source = afr_is_source_child (sources, child_count, child); -out: - return (success_child && source); + afr_replies_wipe (local, this->private); + + local->refreshfn (frame, this, err); + + return 0; } -int32_t -afr_hash_child (int32_t *success_children, int32_t child_count, - unsigned int hmode, uuid_t gfid) + +gf_boolean_t +afr_selfheal_enabled (xlator_t *this) { - uuid_t gfid_copy = {0,}; - pid_t pid; + afr_private_t *priv = NULL; + gf_boolean_t data = _gf_false; - if (!hmode) { - return -1; - } + priv = this->private; - if (gfid) { - uuid_copy(gfid_copy,gfid); - } - if (hmode > 1) { - /* - * Why getpid? Because it's one of the cheapest calls - * available - faster than gethostname etc. - and returns a - * constant-length value that's sure to be shorter than a UUID. - * It's still very unlikely to be the same across clients, so - * it still provides good mixing. We're not trying for - * perfection here. All we need is a low probability that - * multiple clients won't converge on the same subvolume. - */ - pid = getpid(); - memcpy (gfid_copy, &pid, sizeof(pid)); - } + gf_string2boolean (priv->data_self_heal, &data); - return SuperFastHash((char *)gfid_copy, - sizeof(gfid_copy)) % child_count; + return data || priv->metadata_self_heal || priv->entry_self_heal; } -/* If sources is NULL the xattrs are assumed to be of source for all - * success_children. - */ + + int -afr_select_read_child_from_policy (int32_t *success_children, - int32_t child_count, int32_t prev_read_child, - int32_t config_read_child, int32_t *sources, - unsigned int hmode, uuid_t gfid) +afr_inode_refresh_done (call_frame_t *frame, xlator_t *this) { - int32_t read_child = -1; - int i = 0; + call_frame_t *heal = NULL; + afr_local_t *local = NULL; + int ret = 0; + int err = 0; - GF_ASSERT (success_children); + local = frame->local; - read_child = config_read_child; - if (afr_is_read_child (success_children, sources, child_count, - read_child)) - goto out; + ret = afr_replies_interpret (frame, this, local->refreshinode); - read_child = prev_read_child; - if (afr_is_read_child (success_children, sources, child_count, - read_child)) - goto out; + err = afr_inode_refresh_err (frame, this); - read_child = afr_hash_child (success_children, child_count, - hmode, gfid); - if (afr_is_read_child (success_children, sources, child_count, - read_child)) { - goto out; - } + afr_replies_wipe (local, this->private); - for (i = 0; i < child_count; i++) { - read_child = success_children[i]; - if (read_child < 0) - break; - if (afr_is_read_child (success_children, sources, child_count, - read_child)) - goto out; - } - read_child = -1; + if (ret && afr_selfheal_enabled (this)) { + heal = copy_frame (frame); + if (heal) + heal->root->pid = -1; + ret = synctask_new (this->ctx->env, afr_refresh_selfheal_wrap, + afr_refresh_selfheal_done, heal, frame); + if (ret) + goto refresh_done; + } else { + refresh_done: + local->refreshfn (frame, this, err); + } -out: - return read_child; + return 0; } -/* This function should be used when all the success_children are sources - */ -void -afr_set_read_ctx_from_policy (xlator_t *this, inode_t *inode, - int32_t *fresh_children, int32_t prev_read_child, - int32_t config_read_child, uuid_t gfid) + +int +afr_inode_refresh_subvol_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, + struct iatt *buf, dict_t *xdata, struct iatt *par) { - int read_child = -1; - afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int call_child = (long) cookie; + int call_count = 0; - priv = this->private; - read_child = afr_select_read_child_from_policy (fresh_children, - priv->child_count, - prev_read_child, - config_read_child, - NULL, - priv->hash_mode, gfid); - if (read_child >= 0) - afr_inode_set_read_ctx (this, inode, read_child, - fresh_children); -} + local = frame->local; -/* afr_next_call_child () - * This is a common function used by all the read-type fops - * This function should not be called with the inode's read_children array. - * The fop's handler should make a copy of the inode's read_children, - * preferred read_child into the local vars, because while this function is - * in execution there is a chance for inode's read_ctx to change. - */ -int32_t -afr_next_call_child (int32_t *fresh_children, unsigned char *child_up, - size_t child_count, int32_t *last_index, - int32_t read_child) + local->replies[call_child].valid = 1; + local->replies[call_child].op_ret = op_ret; + local->replies[call_child].op_errno = op_errno; + if (op_ret != -1) { + local->replies[call_child].poststat = *buf; + local->replies[call_child].postparent = *par; + local->replies[call_child].xdata = dict_ref (xdata); + } + + call_count = afr_frame_return (frame); + + if (call_count == 0) + afr_inode_refresh_done (frame, this); + + return 0; +} + + +int +afr_inode_refresh_subvol (call_frame_t *frame, xlator_t *this, int i, + inode_t *inode, dict_t *xdata) { - int next_index = 0; - int32_t next_call_child = -1; + loc_t loc = {0, }; + afr_private_t *priv = NULL; - GF_ASSERT (last_index); + priv = this->private; - next_index = *last_index; -retry: - next_index++; - if ((next_index >= child_count) || - (fresh_children[next_index] == -1)) - goto out; - if ((fresh_children[next_index] == read_child) || - (!child_up[fresh_children[next_index]])) - goto retry; - *last_index = next_index; - next_call_child = fresh_children[next_index]; -out: - return next_call_child; + loc.inode = inode; + uuid_copy (loc.gfid, inode->gfid); + + STACK_WIND_COOKIE (frame, afr_inode_refresh_subvol_cbk, + (void *) (long) i, priv->children[i], + priv->children[i]->fops->lookup, &loc, xdata); + return 0; } - /* This function should not be called with the inode's read_children array. - * The fop's handler should make a copy of the inode's read_children, - * preferred read_child into the local vars, because while this function is - * in execution there is a chance for inode's read_ctx to change. - */ -int32_t -afr_get_call_child (xlator_t *this, unsigned char *child_up, int32_t read_child, - int32_t *fresh_children, - int32_t *call_child, int32_t *last_index) + +int +afr_inode_refresh_do (call_frame_t *frame, xlator_t *this) { - int ret = 0; - afr_private_t *priv = NULL; - int i = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + int i = 0; + dict_t *xdata = NULL; - GF_ASSERT (child_up); - GF_ASSERT (call_child); - GF_ASSERT (last_index); - GF_ASSERT (fresh_children); + priv = this->private; + local = frame->local; - if (read_child < 0) { - ret = -EIO; - goto out; - } - priv = this->private; - *call_child = -1; - *last_index = -1; + afr_replies_wipe (local, priv); - if (child_up[read_child]) { - *call_child = read_child; - } else { - for (i = 0; i < priv->child_count; i++) { - if (fresh_children[i] == -1) - break; - if (child_up[fresh_children[i]]) { - *call_child = fresh_children[i]; - ret = 0; - break; - } - } + xdata = dict_new (); + if (!xdata) { + afr_inode_refresh_done (frame, this); + return 0; + } - if (*call_child == -1) { - ret = -ENOTCONN; - goto out; - } + if (afr_xattr_req_prepare (this, xdata) != 0) { + dict_unref (xdata); + afr_inode_refresh_done (frame, this); + return 0; + } + + local->call_count = AFR_COUNT (local->child_up, priv->child_count); + + call_count = local->call_count; + for (i = 0; i < priv->child_count; i++) { + if (!local->child_up[i]) + continue; + + afr_inode_refresh_subvol (frame, this, i, local->refreshinode, + xdata); + + if (!--call_count) + break; + } + + dict_unref (xdata); + + return 0; +} + + +int +afr_inode_refresh (call_frame_t *frame, xlator_t *this, inode_t *inode, + afr_inode_refresh_cbk_t refreshfn) +{ + afr_local_t *local = NULL; + + local = frame->local; + + local->refreshfn = refreshfn; - *last_index = i; + if (local->refreshinode) { + inode_unref (local->refreshinode); + local->refreshinode = NULL; + } + + local->refreshinode = inode_ref (inode); + + afr_inode_refresh_do (frame, this); + + return 0; +} + + +int +afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req) +{ + int i = 0; + afr_private_t *priv = NULL; + int ret = 0; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + ret = dict_set_uint64 (xattr_req, priv->pending_key[i], + AFR_NUM_CHANGE_LOGS * sizeof(int)); + if (ret < 0) + gf_log (this->name, GF_LOG_WARNING, + "Unable to set dict value for %s", + priv->pending_key[i]); + /* 3 = data+metadata+entry */ } -out: - gf_log (this->name, GF_LOG_DEBUG, "Returning %d, call_child: %d, " - "last_index: %d", ret, *call_child, *last_index); - return ret; + ret = dict_set_uint64 (xattr_req, AFR_DIRTY, + AFR_NUM_CHANGE_LOGS * sizeof(int)); + if (ret) { + gf_log (this->name, GF_LOG_DEBUG, "failed to set dirty " + "query flag"); + } + + return ret; } -void -afr_reset_xattr (dict_t **xattr, unsigned int child_count) +int +afr_lookup_xattr_req_prepare (afr_local_t *local, xlator_t *this, + dict_t *xattr_req, loc_t *loc) { - unsigned int i = 0; + int ret = -ENOMEM; - if (!xattr) + local->xattr_req = dict_new (); + if (!local->xattr_req) goto out; - for (i = 0; i < child_count; i++) { - if (xattr[i]) { - dict_unref (xattr[i]); - xattr[i] = NULL; - } + if (xattr_req) + dict_copy (xattr_req, local->xattr_req); + + ret = afr_xattr_req_prepare (this, local->xattr_req); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "%s: Unable to prepare xattr_req", loc->path); + } + + ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_INODELK_COUNT, 0); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "%s: Unable to set dict value for %s", + loc->path, GLUSTERFS_INODELK_COUNT); + } + ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_ENTRYLK_COUNT, 0); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "%s: Unable to set dict value for %s", + loc->path, GLUSTERFS_ENTRYLK_COUNT); } + + ret = dict_set_uint32 (local->xattr_req, GLUSTERFS_PARENT_ENTRYLK, 0); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "%s: Unable to set dict value for %s", + loc->path, GLUSTERFS_PARENT_ENTRYLK); + } + + ret = 0; out: - return; + return ret; } -void -afr_xattr_array_destroy (dict_t **xattr, unsigned int child_count) -{ - afr_reset_xattr (xattr, child_count); - GF_FREE (xattr); -} -void -afr_local_sh_cleanup (afr_local_t *local, xlator_t *this) +int +afr_hash_child (inode_t *inode, int32_t child_count, int hashmode) { - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; + uuid_t gfid_copy = {0,}; + pid_t pid; - sh = &local->self_heal; - priv = this->private; + if (!hashmode) { + return -1; + } + + if (inode) { + uuid_copy (gfid_copy, inode->gfid); + } + + if (hashmode > 1) { + /* + * Why getpid? Because it's one of the cheapest calls + * available - faster than gethostname etc. - and returns a + * constant-length value that's sure to be shorter than a UUID. + * It's still very unlikely to be the same across clients, so + * it still provides good mixing. We're not trying for + * perfection here. All we need is a low probability that + * multiple clients won't converge on the same subvolume. + */ + pid = getpid(); + memcpy (gfid_copy, &pid, sizeof(pid)); + } + + return SuperFastHash((char *)gfid_copy, + sizeof(gfid_copy)) % child_count; +} - if (sh->data_sh_info && strcmp (sh->data_sh_info, "")) - GF_FREE (sh->data_sh_info); - if (sh->metadata_sh_info && strcmp (sh->metadata_sh_info, "")) - GF_FREE (sh->metadata_sh_info); +int +afr_read_subvol_select_by_policy (inode_t *inode, xlator_t *this, + unsigned char *readable) +{ + afr_private_t *priv = NULL; + int read_subvol = -1; + int i = 0; - GF_FREE (sh->buf); + priv = this->private; - GF_FREE (sh->parentbufs); + /* first preference - explicitly specified or local subvolume */ + if (priv->read_child >= 0 && readable[priv->read_child]) + return priv->read_child; - if (sh->inode) - inode_unref (sh->inode); + /* second preference - use hashed mode */ + read_subvol = afr_hash_child (inode, priv->child_count, + priv->hash_mode); + if (read_subvol >= 0 && readable[read_subvol]) + return read_subvol; - afr_xattr_array_destroy (sh->xattr, priv->child_count); + for (i = 0; i < priv->child_count; i++) { + if (readable[i]) + return i; + } - GF_FREE (sh->child_errno); + /* no readable subvolumes, either split brain or all subvols down */ - afr_matrix_cleanup (sh->pending_matrix, priv->child_count); - afr_matrix_cleanup (sh->delta_matrix, priv->child_count); + return -1; +} - GF_FREE (sh->sources); - GF_FREE (sh->success); +int +afr_inode_read_subvol_type_get (inode_t *inode, xlator_t *this, + unsigned char *readable, int *event_p, + int type) +{ + int ret = -1; - GF_FREE (sh->locked_nodes); + if (type == AFR_METADATA_TRANSACTION) + ret = afr_inode_read_subvol_get (inode, this, 0, readable, + event_p); + else + ret = afr_inode_read_subvol_get (inode, this, readable, 0, + event_p); + return ret; +} - if (sh->healing_fd) { - fd_unref (sh->healing_fd); - sh->healing_fd = NULL; - } - GF_FREE ((char *)sh->linkname); +int +afr_read_subvol_get (inode_t *inode, xlator_t *this, int *subvol_p, + int *event_p, afr_transaction_type type) +{ + afr_private_t *priv = NULL; + unsigned char *data_readable = NULL; + unsigned char *metadata_readable = NULL; + unsigned char *readable = NULL; + unsigned char *intersection = NULL; + int subvol = -1; + int event = 0; - GF_FREE (sh->success_children); + priv = this->private; - GF_FREE (sh->fresh_children); + readable = alloca0 (priv->child_count); + data_readable = alloca0 (priv->child_count); + metadata_readable = alloca0 (priv->child_count); + intersection = alloca0 (priv->child_count); - GF_FREE (sh->fresh_parent_dirs); + afr_inode_read_subvol_type_get (inode, this, readable, &event, type); - loc_wipe (&sh->parent_loc); - loc_wipe (&sh->lookup_loc); + afr_inode_read_subvol_get (inode, this, data_readable, metadata_readable, + &event); - GF_FREE (sh->checksum); + AFR_INTERSECT (intersection, data_readable, metadata_readable, + priv->child_count); - GF_FREE (sh->write_needed); - if (sh->healing_fd) - fd_unref (sh->healing_fd); + if (AFR_COUNT (intersection, priv->child_count) > 0) + subvol = afr_read_subvol_select_by_policy (inode, this, + intersection); + else + subvol = afr_read_subvol_select_by_policy (inode, this, + readable); + if (subvol_p) + *subvol_p = subvol; + if (event_p) + *event_p = event; + return subvol; } @@ -838,8 +848,6 @@ afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this) priv = this->private; afr_matrix_cleanup (local->pending, priv->child_count); - afr_matrix_cleanup (local->transaction.txn_changelog, - priv->child_count); GF_FREE (local->internal_lock.locked_nodes); @@ -860,7 +868,25 @@ afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this) loc_wipe (&local->transaction.parent_loc); loc_wipe (&local->transaction.new_parent_loc); - GF_FREE (local->transaction.postop_piggybacked); +} + + +void +afr_replies_wipe (afr_local_t *local, afr_private_t *priv) +{ + int i; + + if (!local->replies) + return; + + for (i = 0; i < priv->child_count; i++) { + if (local->replies[i].xdata) { + dict_unref (local->replies[i].xdata); + local->replies[i].xdata = NULL; + } + } + + memset (local->replies, 0, sizeof(*local->replies) * priv->child_count); } @@ -872,7 +898,7 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this) if (!local) return; - afr_local_sh_cleanup (local, this); + syncbarrier_destroy (&local->barrier); afr_local_transaction_cleanup (local, this); @@ -890,40 +916,26 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this) if (local->dict) dict_unref (local->dict); + afr_replies_wipe (local, priv); GF_FREE(local->replies); GF_FREE (local->child_up); - GF_FREE (local->child_errno); - - GF_FREE (local->fresh_children); - - { /* lookup */ - if (local->cont.lookup.xattrs) { - afr_reset_xattr (local->cont.lookup.xattrs, - priv->child_count); - GF_FREE (local->cont.lookup.xattrs); - local->cont.lookup.xattrs = NULL; - } - - if (local->cont.lookup.xattr) { - dict_unref (local->cont.lookup.xattr); - } + GF_FREE (local->read_attempted); - if (local->cont.lookup.inode) { - inode_unref (local->cont.lookup.inode); - } + GF_FREE (local->readable); - GF_FREE (local->cont.lookup.postparents); + if (local->inode) + inode_unref (local->inode); - GF_FREE (local->cont.lookup.bufs); + if (local->parent) + inode_unref (local->parent); - GF_FREE (local->cont.lookup.success_children); + if (local->parent2) + inode_unref (local->parent2); - GF_FREE (local->cont.lookup.sources); - afr_matrix_cleanup (local->cont.lookup.pending_matrix, - priv->child_count); - } + if (local->refreshinode) + inode_unref (local->refreshinode); { /* getxattr */ GF_FREE (local->cont.getxattr.name); @@ -980,1167 +992,260 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this) if (local->cont.fxattrop.xattr) dict_unref (local->cont.fxattrop.xattr); } - { /* symlink */ - GF_FREE (local->cont.symlink.linkpath); - } - - { /* opendir */ - GF_FREE (local->cont.opendir.checksum); - } - - { /* readdirp */ - if (local->cont.readdir.dict) - dict_unref (local->cont.readdir.dict); - } - - if (local->xdata_req) - dict_unref (local->xdata_req); - - if (local->xdata_rsp) - dict_unref (local->xdata_rsp); -} - - -int -afr_frame_return (call_frame_t *frame) -{ - afr_local_t *local = NULL; - int call_count = 0; - - local = frame->local; - - LOCK (&frame->lock); - { - call_count = --local->call_count; - } - UNLOCK (&frame->lock); - - return call_count; -} - -int -afr_set_elem_count_get (unsigned char *elems, int child_count) -{ - int i = 0; - int ret = 0; - - for (i = 0; i < child_count; i++) - if (elems[i]) - ret++; - return ret; -} - -/** - * up_children_count - return the number of children that are up - */ - -unsigned int -afr_up_children_count (unsigned char *child_up, unsigned int child_count) -{ - return afr_set_elem_count_get (child_up, child_count); -} - -unsigned int -afr_locked_children_count (unsigned char *children, unsigned int child_count) -{ - return afr_set_elem_count_get (children, child_count); -} - -unsigned int -afr_pre_op_done_children_count (unsigned char *pre_op, - unsigned int child_count) -{ - return afr_set_elem_count_get (pre_op, child_count); -} - -gf_boolean_t -afr_is_fresh_lookup (loc_t *loc, xlator_t *this) -{ - uint64_t ctx = 0; - int32_t ret = 0; - - GF_ASSERT (loc); - GF_ASSERT (this); - GF_ASSERT (loc->inode); - - ret = inode_ctx_get (loc->inode, this, &ctx); - if (0 == ret) - return _gf_false; - return _gf_true; -} - -void -afr_update_loc_gfids (loc_t *loc, struct iatt *buf, struct iatt *postparent) -{ - GF_ASSERT (loc); - GF_ASSERT (buf); - - uuid_copy (loc->gfid, buf->ia_gfid); - if (postparent) - uuid_copy (loc->pargfid, postparent->ia_gfid); -} - -/* - * Quota size xattrs are not maintained by afr. There is a - * possibility that they differ even when both the directory changelog xattrs - * suggest everything is fine. So if there is at least one 'source' check among - * the sources which has the maximum quota size. Otherwise check among all the - * available ones for maximum quota size. This way if there is a source and - * stale copies it always votes for the 'source'. - * */ - -static void -afr_handle_quota_size (afr_local_t *local, xlator_t *this, - dict_t *rsp_dict) -{ - int32_t *sources = NULL; - dict_t *xattr = NULL; - data_t *max_data = NULL; - int64_t max_quota_size = -1; - data_t *data = NULL; - int64_t *size = NULL; - int64_t quota_size = -1; - afr_private_t *priv = NULL; - int i = 0; - int ret = -1; - gf_boolean_t source_present = _gf_false; - - priv = this->private; - sources = local->cont.lookup.sources; - - if (rsp_dict == NULL) { - gf_log_callingfn (this->name, GF_LOG_ERROR, "%s: Invalid " - "response dictionary", local->loc.path); - return; - } - - for (i = 0; i < priv->child_count; i++) { - if (sources[i]) { - source_present = _gf_true; - break; - } - } - - for (i = 0; i < priv->child_count; i++) { - /* - * If there is at least one source lets check - * for maximum quota sizes among sources, otherwise take the - * maximum of the ones present to be on the safer side. - */ - if (source_present && !sources[i]) - continue; - - xattr = local->cont.lookup.xattrs[i]; - if (!xattr) - continue; - - data = dict_get (xattr, QUOTA_SIZE_KEY); - if (!data) - continue; - - size = (int64_t*)data->data; - quota_size = ntoh64(*size); - gf_log (this->name, GF_LOG_DEBUG, "%s: %d, size: %"PRId64, - local->loc.path, i, quota_size); - if (quota_size > max_quota_size) { - if (max_data) - data_unref (max_data); - - max_quota_size = quota_size; - max_data = data_ref (data); - } - } - - if (max_data) { - ret = dict_set (rsp_dict, QUOTA_SIZE_KEY, max_data); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "%s: Failed to set " - "quota size", local->loc.path); - } - - data_unref (max_data); - } -} - -int -afr_lookup_build_response_params (afr_local_t *local, xlator_t *this) -{ - struct iatt *buf = NULL; - struct iatt *postparent = NULL; - dict_t **xattr = NULL; - int32_t *success_children = NULL; - int32_t *sources = NULL; - afr_private_t *priv = NULL; - int32_t read_child = -1; - int ret = 0; - int i = 0; - - GF_ASSERT (local); - - buf = &local->cont.lookup.buf; - postparent = &local->cont.lookup.postparent; - xattr = &local->cont.lookup.xattr; - priv = this->private; - - read_child = afr_inode_get_read_ctx (this, local->cont.lookup.inode, - local->fresh_children); - if (read_child < 0) { - ret = -1; - goto out; - } - success_children = local->cont.lookup.success_children; - sources = local->cont.lookup.sources; - memset (sources, 0, sizeof (*sources) * priv->child_count); - afr_children_intersection_get (local->fresh_children, success_children, - sources, priv->child_count); - if (!sources[read_child]) { - read_child = -1; - for (i = 0; i < priv->child_count; i++) { - if (sources[i]) { - read_child = i; - break; - } - } - } - if (read_child < 0) { - ret = -1; - goto out; - } - - gf_log (this->name, GF_LOG_DEBUG, "Building lookup response from %d", - read_child); - if (!*xattr) - *xattr = dict_ref (local->cont.lookup.xattrs[read_child]); - - *buf = local->cont.lookup.bufs[read_child]; - *postparent = local->cont.lookup.postparents[read_child]; - - if (dict_get (local->xattr_req, QUOTA_SIZE_KEY)) - afr_handle_quota_size (local, this, *xattr); - - if (IA_INVAL == local->cont.lookup.inode->ia_type) { - /* fix for RT #602 */ - local->cont.lookup.inode->ia_type = buf->ia_type; - } -out: - return ret; -} - -static void -afr_lookup_update_lk_counts (afr_local_t *local, xlator_t *this, - int child_index, dict_t *xattr) -{ - uint32_t inodelk_count = 0; - uint32_t entrylk_count = 0; - int ret = -1; - uint32_t parent_entrylk = 0; - - GF_ASSERT (local); - GF_ASSERT (this); - GF_ASSERT (xattr); - GF_ASSERT (child_index >= 0); - - ret = dict_get_uint32 (xattr, GLUSTERFS_INODELK_COUNT, - &inodelk_count); - if (ret == 0) - local->inodelk_count += inodelk_count; - - ret = dict_get_uint32 (xattr, GLUSTERFS_ENTRYLK_COUNT, - &entrylk_count); - if (ret == 0) - local->entrylk_count += entrylk_count; - ret = dict_get_uint32 (xattr, GLUSTERFS_PARENT_ENTRYLK, - &parent_entrylk); - if (!ret) - local->cont.lookup.parent_entrylk += parent_entrylk; -} - -/* - * It's important to maintain a commutative property on do_*_self_heal and - * found*; once set, they must not be cleared by a subsequent iteration or - * call, so that they represent a logical OR of all iterations and calls - * regardless of child/key order. That allows the caller to call us multiple - * times without having to use a separate variable as a "reduce" accumulator. - */ -static void -afr_lookup_set_self_heal_params_by_xattr (afr_local_t *local, xlator_t *this, - dict_t *xattr) -{ - afr_private_t *priv = NULL; - int i = 0; - int ret = -1; - void *pending_raw = NULL; - int32_t *pending = NULL; - - GF_ASSERT (local); - GF_ASSERT (this); - GF_ASSERT (xattr); - - priv = this->private; - - for (i = 0; i < priv->child_count; i++) { - ret = dict_get_ptr (xattr, priv->pending_key[i], - &pending_raw); - if (ret != 0) { - continue; - } - pending = pending_raw; - - if (pending[AFR_METADATA_TRANSACTION]) { - gf_log(this->name, GF_LOG_DEBUG, - "metadata self-heal is pending for %s.", - local->loc.path); - local->self_heal.do_metadata_self_heal = _gf_true; - } - - if (pending[AFR_ENTRY_TRANSACTION]) { - gf_log(this->name, GF_LOG_DEBUG, - "entry self-heal is pending for %s.", - local->loc.path); - local->self_heal.do_entry_self_heal = _gf_true; - } - - if (pending[AFR_DATA_TRANSACTION]) { - gf_log(this->name, GF_LOG_DEBUG, - "data self-heal is pending for %s.", - local->loc.path); - local->self_heal.do_data_self_heal = _gf_true; - } - } -} - -void -afr_lookup_check_set_metadata_split_brain (afr_local_t *local, xlator_t *this) -{ - int32_t *sources = NULL; - afr_private_t *priv = NULL; - int32_t subvol_status = 0; - int32_t *success_children = NULL; - dict_t **xattrs = NULL; - struct iatt *bufs = NULL; - int32_t **pending_matrix = NULL; - - priv = this->private; - - sources = GF_CALLOC (priv->child_count, sizeof (*sources), - gf_afr_mt_int32_t); - if (NULL == sources) - goto out; - success_children = local->cont.lookup.success_children; - xattrs = local->cont.lookup.xattrs; - bufs = local->cont.lookup.bufs; - pending_matrix = local->cont.lookup.pending_matrix; - afr_build_sources (this, xattrs, bufs, pending_matrix, - sources, success_children, AFR_METADATA_TRANSACTION, - &subvol_status, _gf_false); - if (subvol_status & SPLIT_BRAIN) - local->cont.lookup.possible_spb = _gf_true; -out: - GF_FREE (sources); -} - -static void -afr_detect_self_heal_by_iatt (afr_local_t *local, xlator_t *this, - struct iatt *buf, struct iatt *lookup_buf) -{ - if (PERMISSION_DIFFERS (buf, lookup_buf)) { - /* mismatching permissions */ - gf_log (this->name, GF_LOG_DEBUG, - "permissions differ for %s ", local->loc.path); - local->self_heal.do_metadata_self_heal = _gf_true; - } - - if (OWNERSHIP_DIFFERS (buf, lookup_buf)) { - /* mismatching permissions */ - local->self_heal.do_metadata_self_heal = _gf_true; - gf_log (this->name, GF_LOG_DEBUG, - "ownership differs for %s ", local->loc.path); - } - - if (SIZE_DIFFERS (buf, lookup_buf) - && IA_ISREG (buf->ia_type)) { - gf_log (this->name, GF_LOG_DEBUG, - "size differs for %s ", local->loc.path); - local->self_heal.do_data_self_heal = _gf_true; - } - - if (uuid_compare (buf->ia_gfid, lookup_buf->ia_gfid)) { - /* mismatching gfid */ - gf_log (this->name, GF_LOG_DEBUG, - "%s: gfid different on subvolume", local->loc.path); - } -} - -static void -afr_detect_self_heal_by_split_brain_status (afr_local_t *local, xlator_t *this) -{ - gf_boolean_t split_brain = _gf_false; - afr_self_heal_t *sh = NULL; - - sh = &local->self_heal; - - split_brain = afr_is_split_brain (this, local->cont.lookup.inode); - split_brain = split_brain || local->cont.lookup.possible_spb; - if ((local->success_count > 0) && split_brain && - IA_ISREG (local->cont.lookup.inode->ia_type)) { - sh->force_confirm_spb = _gf_true; - gf_log (this->name, GF_LOG_DEBUG, - "split brain detected during lookup of %s.", - local->loc.path); - } -} - -static void -afr_detect_self_heal_by_lookup_status (afr_local_t *local, xlator_t *this) -{ - GF_ASSERT (local); - GF_ASSERT (this); - - if ((local->success_count > 0) && (local->enoent_count > 0)) { - local->self_heal.do_metadata_self_heal = _gf_true; - local->self_heal.do_data_self_heal = _gf_true; - local->self_heal.do_entry_self_heal = _gf_true; - local->self_heal.do_gfid_self_heal = _gf_true; - local->self_heal.do_missing_entry_self_heal = _gf_true; - gf_log(this->name, GF_LOG_DEBUG, - "entries are missing in lookup of %s.", - local->loc.path); - } - - return; -} - -gf_boolean_t -afr_can_self_heal_proceed (afr_self_heal_t *sh, afr_private_t *priv) -{ - GF_ASSERT (sh); - GF_ASSERT (priv); - - if (sh->force_confirm_spb) - return _gf_true; - return (sh->do_gfid_self_heal - || sh->do_missing_entry_self_heal - || (afr_data_self_heal_enabled (priv->data_self_heal) && - sh->do_data_self_heal) - || (priv->metadata_self_heal && sh->do_metadata_self_heal) - || (priv->entry_self_heal && sh->do_entry_self_heal)); -} - -afr_transaction_type -afr_transaction_type_get (ia_type_t ia_type) -{ - afr_transaction_type type = AFR_METADATA_TRANSACTION; - - GF_ASSERT (ia_type != IA_INVAL); - - if (IA_ISDIR (ia_type)) { - type = AFR_ENTRY_TRANSACTION; - } else if (IA_ISREG (ia_type)) { - type = AFR_DATA_TRANSACTION; - } - return type; -} - -int -afr_lookup_select_read_child (afr_local_t *local, xlator_t *this, - int32_t *read_child) -{ - ia_type_t ia_type = IA_INVAL; - int32_t source = -1; - int ret = -1; - dict_t **xattrs = NULL; - int32_t *success_children = NULL; - afr_transaction_type type = AFR_METADATA_TRANSACTION; - uuid_t *gfid = NULL; - - GF_ASSERT (local); - GF_ASSERT (this); - GF_ASSERT (local->success_count > 0); - - success_children = local->cont.lookup.success_children; - /*We can take the success_children[0] only because we already - *handle the conflicting children other wise, we could select the - *read_child based on wrong file type - */ - ia_type = local->cont.lookup.bufs[success_children[0]].ia_type; - type = afr_transaction_type_get (ia_type); - xattrs = local->cont.lookup.xattrs; - gfid = &local->cont.lookup.buf.ia_gfid; - source = afr_lookup_select_read_child_by_txn_type (this, local, xattrs, - type, *gfid); - if (source < 0) { - gf_log (this->name, GF_LOG_DEBUG, "failed to select source " - "for %s", local->loc.path); - goto out; - } - - gf_log (this->name, GF_LOG_DEBUG, "Source selected as %d for %s", - source, local->loc.path); - *read_child = source; - ret = 0; -out: - return ret; -} - -static inline gf_boolean_t -afr_is_transaction_running (afr_local_t *local) -{ - GF_ASSERT (local->fop == GF_FOP_LOOKUP); - return ((local->inodelk_count > 0) || (local->entrylk_count > 0)); -} - -void -afr_launch_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode, - gf_boolean_t background, ia_type_t ia_type, char *reason, - void (*gfid_sh_success_cbk) (call_frame_t *sh_frame, - xlator_t *this), - int (*unwind) (call_frame_t *frame, xlator_t *this, - int32_t op_ret, int32_t op_errno, - int32_t sh_failed)) -{ - afr_local_t *local = NULL; - char sh_type_str[256] = {0,}; - char *bg = ""; - - GF_ASSERT (frame); - GF_ASSERT (this); - GF_ASSERT (inode); - GF_ASSERT (ia_type != IA_INVAL); - - local = frame->local; - local->self_heal.background = background; - local->self_heal.type = ia_type; - local->self_heal.unwind = unwind; - local->self_heal.gfid_sh_success_cbk = gfid_sh_success_cbk; - - afr_self_heal_type_str_get (&local->self_heal, - sh_type_str, - sizeof (sh_type_str)); - - if (background) - bg = "background"; - gf_log (this->name, GF_LOG_DEBUG, - "%s %s self-heal triggered. path: %s, reason: %s", bg, - sh_type_str, local->loc.path, reason); - - afr_self_heal (frame, this, inode); -} - -unsigned int -afr_gfid_missing_count (const char *xlator_name, int32_t *success_children, - struct iatt *bufs, unsigned int child_count, - const char *path) -{ - unsigned int gfid_miss_count = 0; - int i = 0; - struct iatt *child1 = NULL; - - for (i = 0; i < child_count; i++) { - if (success_children[i] == -1) - break; - child1 = &bufs[success_children[i]]; - if (uuid_is_null (child1->ia_gfid)) { - gf_log (xlator_name, GF_LOG_DEBUG, "%s: gfid is null" - " on subvolume %d", path, success_children[i]); - gfid_miss_count++; - } - } - - return gfid_miss_count; -} - -static int -afr_lookup_gfid_missing_count (afr_local_t *local, xlator_t *this) -{ - int32_t *success_children = NULL; - afr_private_t *priv = NULL; - struct iatt *bufs = NULL; - int miss_count = 0; - - priv = this->private; - bufs = local->cont.lookup.bufs; - success_children = local->cont.lookup.success_children; - - miss_count = afr_gfid_missing_count (this->name, success_children, - bufs, priv->child_count, - local->loc.path); - return miss_count; -} - -gf_boolean_t -afr_conflicting_iattrs (struct iatt *bufs, int32_t *success_children, - unsigned int child_count, const char *path, - const char *xlator_name) -{ - gf_boolean_t conflicting = _gf_false; - int i = 0; - struct iatt *child1 = NULL; - struct iatt *child2 = NULL; - uuid_t *gfid = NULL; - - for (i = 0; i < child_count; i++) { - if (success_children[i] == -1) - break; - child1 = &bufs[success_children[i]]; - if ((!gfid) && (!uuid_is_null (child1->ia_gfid))) - gfid = &child1->ia_gfid; - - if (i == 0) - continue; - - child2 = &bufs[success_children[i-1]]; - if (FILETYPE_DIFFERS (child1, child2)) { - gf_log (xlator_name, GF_LOG_DEBUG, "%s: filetype " - "differs on subvolumes (%d, %d)", path, - success_children[i-1], success_children[i]); - conflicting = _gf_true; - goto out; - } - if (!gfid || uuid_is_null (child1->ia_gfid)) - continue; - if (uuid_compare (*gfid, child1->ia_gfid)) { - gf_log (xlator_name, GF_LOG_DEBUG, "%s: gfid differs" - " on subvolume %d", path, success_children[i]); - conflicting = _gf_true; - goto out; - } - } -out: - return conflicting; -} - -/* afr_update_gfid_from_iatts: This function should be called only if the - * iatts are not conflicting. - */ -void -afr_update_gfid_from_iatts (uuid_t uuid, struct iatt *bufs, - int32_t *success_children, unsigned int child_count) -{ - uuid_t *gfid = NULL; - int i = 0; - int child = 0; - - for (i = 0; i < child_count; i++) { - child = success_children[i]; - if (child == -1) - break; - if ((!gfid) && (!uuid_is_null (bufs[child].ia_gfid))) { - gfid = &bufs[child].ia_gfid; - } else if (gfid && (!uuid_is_null (bufs[child].ia_gfid))) { - if (uuid_compare (*gfid, bufs[child].ia_gfid)) { - GF_ASSERT (0); - goto out; - } - } - } - if (gfid && (!uuid_is_null (*gfid))) - uuid_copy (uuid, *gfid); -out: - return; -} - -static gf_boolean_t -afr_lookup_conflicting_entries (afr_local_t *local, xlator_t *this) -{ - afr_private_t *priv = NULL; - gf_boolean_t conflict = _gf_false; - - priv = this->private; - conflict = afr_conflicting_iattrs (local->cont.lookup.bufs, - local->cont.lookup.success_children, - priv->child_count, local->loc.path, - this->name); - return conflict; -} - -gf_boolean_t -afr_open_only_data_self_heal (char *data_self_heal) -{ - return !strcmp (data_self_heal, "open"); -} - -gf_boolean_t -afr_data_self_heal_enabled (char *data_self_heal) -{ - gf_boolean_t enabled = _gf_false; - - if (gf_string2boolean (data_self_heal, &enabled) == -1) { - enabled = !strcmp (data_self_heal, "open"); - GF_ASSERT (enabled); - } - - return enabled; -} - -static void -afr_lookup_set_self_heal_params (afr_local_t *local, xlator_t *this) -{ - int i = 0; - struct iatt *bufs = NULL; - dict_t **xattr = NULL; - afr_private_t *priv = NULL; - int32_t child1 = -1; - int32_t child2 = -1; - afr_self_heal_t *sh = NULL; - - priv = this->private; - sh = &local->self_heal; - - afr_detect_self_heal_by_lookup_status (local, this); - - if (afr_lookup_gfid_missing_count (local, this)) - local->self_heal.do_gfid_self_heal = _gf_true; - - if (_gf_true == afr_lookup_conflicting_entries (local, this)) - local->self_heal.do_missing_entry_self_heal = _gf_true; - else - afr_update_gfid_from_iatts (local->self_heal.sh_gfid_req, - local->cont.lookup.bufs, - local->cont.lookup.success_children, - priv->child_count); - - bufs = local->cont.lookup.bufs; - for (i = 1; i < local->success_count; i++) { - child1 = local->cont.lookup.success_children[i-1]; - child2 = local->cont.lookup.success_children[i]; - afr_detect_self_heal_by_iatt (local, this, - &bufs[child1], &bufs[child2]); - } - - xattr = local->cont.lookup.xattrs; - for (i = 0; i < local->success_count; i++) { - child1 = local->cont.lookup.success_children[i]; - afr_lookup_set_self_heal_params_by_xattr (local, this, - xattr[child1]); - } - if (afr_open_only_data_self_heal (priv->data_self_heal)) - sh->do_data_self_heal = _gf_false; - if (sh->do_metadata_self_heal) - afr_lookup_check_set_metadata_split_brain (local, this); - afr_detect_self_heal_by_split_brain_status (local, this); -} - -int -afr_self_heal_lookup_unwind (call_frame_t *frame, xlator_t *this, - int32_t op_ret, int32_t op_errno, - int32_t sh_failed) -{ - afr_local_t *local = NULL; - int ret = -1; - dict_t *xattr = NULL; - - local = frame->local; - - if (op_ret == -1) { - local->op_ret = -1; - local->op_errno = afr_most_important_error(local->op_errno, - op_errno, _gf_true); - - goto out; - } else { - local->op_ret = 0; - } - - afr_lookup_done_success_action (frame, this, _gf_true); - xattr = local->cont.lookup.xattr; - if (xattr) { - ret = dict_set_int32 (xattr, "sh-failed", sh_failed); - if (ret) - gf_log (this->name, GF_LOG_ERROR, "%s: Failed to set " - "sh-failed to %d", local->loc.path, sh_failed); - - if (local->self_heal.actual_sh_started == _gf_true && - sh_failed == 0) { - ret = dict_set_int32 (xattr, "actual-sh-done", 1); - if (ret) - gf_log(this->name, GF_LOG_ERROR, "%s: Failed to" - " set actual-sh-done to %d", - local->loc.path, - local->self_heal.actual_sh_started); - } - } -out: - AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, - local->cont.lookup.inode, &local->cont.lookup.buf, - local->cont.lookup.xattr, - &local->cont.lookup.postparent); - - return 0; -} - -//TODO: At the moment only lookup needs this, so not doing any checks, in the -// future we will have to do fop specific operations -void -afr_post_gfid_sh_success (call_frame_t *sh_frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_local_t *sh_local = NULL; - afr_private_t *priv = NULL; - afr_self_heal_t *sh = NULL; - int i = 0; - struct iatt *lookup_bufs = NULL; - struct iatt *lookup_parentbufs = NULL; - - sh_local = sh_frame->local; - sh = &sh_local->self_heal; - local = sh->orig_frame->local; - lookup_bufs = local->cont.lookup.bufs; - lookup_parentbufs = local->cont.lookup.postparents; - priv = this->private; - - memcpy (lookup_bufs, sh->buf, priv->child_count * sizeof (*sh->buf)); - memcpy (lookup_parentbufs, sh->parentbufs, - priv->child_count * sizeof (*sh->parentbufs)); - - afr_reset_xattr (local->cont.lookup.xattrs, priv->child_count); - if (local->cont.lookup.xattr) { - dict_unref (local->cont.lookup.xattr); - local->cont.lookup.xattr = NULL; - } - - for (i = 0; i < priv->child_count; i++) { - if (sh->xattr[i]) - local->cont.lookup.xattrs[i] = dict_ref (sh->xattr[i]); - } - - afr_reset_children (local->cont.lookup.success_children, - priv->child_count); - afr_children_copy (local->cont.lookup.success_children, - sh->fresh_children, priv->child_count); -} - -static void -afr_lookup_perform_self_heal (call_frame_t *frame, xlator_t *this, - gf_boolean_t *sh_launched) -{ - unsigned int up_count = 0; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - char *reason = NULL; - - GF_ASSERT (sh_launched); - *sh_launched = _gf_false; - priv = this->private; - local = frame->local; - - up_count = afr_up_children_count (local->child_up, priv->child_count); - if (up_count == 1) { - gf_log (this->name, GF_LOG_DEBUG, - "Only 1 child up - do not attempt to detect self heal"); - goto out; - } - - afr_lookup_set_self_heal_params (local, this); - if (afr_can_self_heal_proceed (&local->self_heal, priv)) { - if (afr_is_transaction_running (local) && - /*Forcefully call afr_launch_self_heal (which will go on to - fail) for SB files.This prevents stale data being served - due to race in afr_is_transaction_running() when - multiple clients access the same SB file*/ - !local->cont.lookup.possible_spb && - (!local->attempt_self_heal)) - goto out; - - reason = "lookup detected pending operations"; - afr_launch_self_heal (frame, this, local->cont.lookup.inode, - !local->foreground_self_heal, - local->cont.lookup.buf.ia_type, - reason, afr_post_gfid_sh_success, - afr_self_heal_lookup_unwind); - *sh_launched = _gf_true; - } -out: - return; -} - -void -afr_get_fresh_children (int32_t *success_children, int32_t *sources, - int32_t *fresh_children, unsigned int child_count) -{ - unsigned int i = 0; - unsigned int j = 0; - - GF_ASSERT (success_children); - GF_ASSERT (sources); - GF_ASSERT (fresh_children); - - afr_reset_children (fresh_children, child_count); - for (i = 0; i < child_count; i++) { - if (success_children[i] == -1) - break; - if (afr_is_read_child (success_children, sources, child_count, - success_children[i])) { - fresh_children[j] = success_children[i]; - j++; - } - } -} - -static int -afr_lookup_set_read_ctx (afr_local_t *local, xlator_t *this, int32_t read_child) -{ - afr_private_t *priv = NULL; - - GF_ASSERT (read_child >= 0); - - priv = this->private; - afr_get_fresh_children (local->cont.lookup.success_children, - local->cont.lookup.sources, - local->fresh_children, priv->child_count); - afr_inode_set_read_ctx (this, local->cont.lookup.inode, read_child, - local->fresh_children); - - return 0; -} - -int -afr_lookup_done_success_action (call_frame_t *frame, xlator_t *this, - gf_boolean_t fail_conflict) -{ - int32_t read_child = -1; - int32_t ret = -1; - afr_local_t *local = NULL; - gf_boolean_t fresh_lookup = _gf_false; - - local = frame->local; - fresh_lookup = local->cont.lookup.fresh_lookup; - - if (local->loc.parent == NULL) - fail_conflict = _gf_true; - - if (afr_lookup_conflicting_entries (local, this)) { - if (fail_conflict == _gf_false) - ret = 0; - goto out; - } + { /* symlink */ + GF_FREE (local->cont.symlink.linkpath); + } - ret = afr_lookup_select_read_child (local, this, &read_child); - if (!afr_is_transaction_running (local) || fresh_lookup) { - if (read_child < 0) - goto out; + { /* opendir */ + GF_FREE (local->cont.opendir.checksum); + } - ret = afr_lookup_set_read_ctx (local, this, read_child); - if (ret) - goto out; + { /* readdirp */ + if (local->cont.readdir.dict) + dict_unref (local->cont.readdir.dict); } - ret = afr_lookup_build_response_params (local, this); - if (ret) - goto out; - afr_update_loc_gfids (&local->loc, - &local->cont.lookup.buf, - &local->cont.lookup.postparent); + if (local->xdata_req) + dict_unref (local->xdata_req); - ret = 0; -out: - if (ret) { - local->op_ret = -1; - local->op_errno = EIO; - } - return ret; + if (local->xdata_rsp) + dict_unref (local->xdata_rsp); } + int -afr_lookup_get_latest_subvol (afr_local_t *local, xlator_t *this) +afr_frame_return (call_frame_t *frame) { - afr_private_t *priv = NULL; - int32_t *success_children = NULL; - struct iatt *bufs = NULL; - int i = 0; - int child = 0; - int lsubvol = -1; - - priv = this->private; - success_children = local->cont.lookup.success_children; - bufs = local->cont.lookup.bufs; - for (i = 0; i < priv->child_count; i++) { - child = success_children[i]; - if (child == -1) - break; - if (uuid_is_null (bufs[child].ia_gfid)) - continue; - if (lsubvol < 0) { - lsubvol = child; - } else if (bufs[lsubvol].ia_ctime < bufs[child].ia_ctime) { - lsubvol = child; - } else if ((bufs[lsubvol].ia_ctime == bufs[child].ia_ctime) && - (bufs[lsubvol].ia_ctime_nsec < bufs[child].ia_ctime_nsec)) { - lsubvol = child; - } - } - return lsubvol; -} + afr_local_t *local = NULL; + int call_count = 0; -void -afr_lookup_mark_other_entries_stale (afr_local_t *local, xlator_t *this, - int subvol) -{ - afr_private_t *priv = NULL; - int32_t *success_children = NULL; - struct iatt *bufs = NULL; - int i = 0; - int child = 0; + local = frame->local; - priv = this->private; - success_children = local->cont.lookup.success_children; - bufs = local->cont.lookup.bufs; - memcpy (local->fresh_children, success_children, - sizeof (*success_children) * priv->child_count); - for (i = 0; i < priv->child_count; i++) { - child = local->fresh_children[i]; - if (child == -1) - break; - if (child == subvol) - continue; - if (uuid_is_null (bufs[child].ia_gfid) && - (bufs[child].ia_type == bufs[subvol].ia_type)) - continue; - afr_children_rm_child (success_children, child, - priv->child_count); - local->success_count--; + LOCK (&frame->lock); + { + call_count = --local->call_count; } - afr_reset_children (local->fresh_children, priv->child_count); + UNLOCK (&frame->lock); + + return call_count; } -void -afr_succeed_lookup_on_latest_iatt (afr_local_t *local, xlator_t *this) -{ - int lsubvol = 0; - if (!afr_lookup_conflicting_entries (local, this)) - goto out; +gf_boolean_t +afr_is_entry_possibly_under_txn (afr_local_t *local, xlator_t *this) +{ + int i = 0; + int tmp = 0; + afr_private_t *priv = NULL; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].xdata) + continue; + if (dict_get_int32 (local->replies[i].xdata, + GLUSTERFS_PARENT_ENTRYLK, + &tmp) == 0) + if (tmp) + return _gf_true; + } - lsubvol = afr_lookup_get_latest_subvol (local, this); - if (lsubvol < 0) - goto out; - afr_lookup_mark_other_entries_stale (local, this, lsubvol); -out: - return; + return _gf_false; } -gf_boolean_t -afr_is_entry_possibly_under_creation (afr_local_t *local, xlator_t *this) + +/* + * Quota size xattrs are not maintained by afr. There is a + * possibility that they differ even when both the directory changelog xattrs + * suggest everything is fine. So if there is at least one 'source' check among + * the sources which has the maximum quota size. Otherwise check among all the + * available ones for maximum quota size. This way if there is a source and + * stale copies it always votes for the 'source'. + * */ + +static void +afr_handle_quota_size (call_frame_t *frame, xlator_t *this) { - /* - * We need to perform this test in lookup done and treat on going - * create/DELETE as ENOENT. - * Reason: - Multiple clients A, B and C are attempting 'mkdir -p /mnt/a/b/c' - - 1 Client A is in the middle of mkdir(/a). It has acquired lock. - It has performed mkdir(/a) on one subvol, and second one is still - in progress - 2 Client B performs a lookup, sees directory /a on one, - ENOENT on the other, succeeds lookup. - 3 Client B performs lookup on /a/b on both subvols, both return ENOENT - (one subvol because /a/b does not exist, another because /a - itself does not exist) - 4 Client B proceeds to mkdir /a/b. It obtains entrylk on inode=/a with - basename=b on one subvol, but fails on other subvol as /a is yet to - be created by Client A. - 5 Client A finishes mkdir of /a on other subvol - 6 Client C also attempts to create /a/b, lookup returns ENOENT on - both subvols. - 7 Client C tries to obtain entrylk on on inode=/a with basename=b, - obtains on one subvol (where B had failed), and waits for B to unlock - on other subvol. - 8 Client B finishes mkdir() on one subvol with GFID-1 and completes - transaction and unlocks - 9 Client C gets the lock on the second subvol, At this stage second - subvol already has /a/b created from Client B, but Client C does not - check that in the middle of mkdir transaction - 10 Client C attempts mkdir /a/b on both subvols. It succeeds on - ONLY ONE (where Client B could not get lock because of - missing parent /a dir) with GFID-2, and gets EEXIST from ONE subvol. - This way we have /a/b in GFID mismatch. One subvol got GFID-1 because - Client B performed transaction on only one subvol (because entrylk() - could not be obtained on second subvol because of missing parent dir -- - caused by premature/speculative succeeding of lookup() on /a when locks - are detected). Other subvol gets GFID-2 from Client C because while - it was waiting for entrylk() on both subvols, Client B was in the - middle of creating mkdir() on only one subvol, and Client C does not - "expect" this when it is between lock() and pre-op()/op() phase of the - transaction. - */ - if (local->cont.lookup.parent_entrylk && local->enoent_count) - return _gf_true; + unsigned char *readable = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + struct afr_reply *replies = NULL; + int i = 0; + uint64_t size = 0; + uint64_t max_size = 0; + int readable_cnt = 0; - return _gf_false; + local = frame->local; + priv = this->private; + replies = local->replies; + + readable = alloca0 (priv->child_count); + + afr_inode_read_subvol_get (local->inode, this, readable, 0, 0); + + readable_cnt = AFR_COUNT (readable, priv->child_count); + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid || replies[i].op_ret == -1) + continue; + if (readable_cnt && !readable[i]) + continue; + if (!replies[i].xdata) + continue; + if (dict_get_uint64 (replies[i].xdata, QUOTA_SIZE_KEY, &size)) + continue; + if (size > max_size) + max_size = size; + } + + if (!max_size) + return; + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid || replies[i].op_ret == -1) + continue; + if (readable_cnt && !readable[i]) + continue; + if (!replies[i].xdata) + continue; + if (dict_set_uint64 (replies[i].xdata, QUOTA_SIZE_KEY, max_size)) + continue; + } } static void afr_lookup_done (call_frame_t *frame, xlator_t *this) { - int unwind = 1; afr_private_t *priv = NULL; afr_local_t *local = NULL; - int ret = -1; - gf_boolean_t sh_launched = _gf_false; - gf_boolean_t fail_conflict = _gf_false; - int gfid_miss_count = 0; - int enotconn_count = 0; - int up_children_count = 0; + int i = -1; + int op_errno = 0; + int read_subvol = 0; + unsigned char *readable = NULL; + int event = 0; + struct afr_reply *replies = NULL; + uuid_t read_gfid = {0, }; + gf_boolean_t locked_entry = _gf_false; + gf_boolean_t can_interpret = _gf_true; priv = this->private; local = frame->local; + replies = local->replies; + + locked_entry = afr_is_entry_possibly_under_txn (local, this); - if (afr_is_entry_possibly_under_creation (local, this)) { + readable = alloca0 (priv->child_count); + + afr_inode_read_subvol_get (local->loc.parent, this, readable, + NULL, &event); + + /* First, check if we have an ESTALE from somewhere, + If so, propagate that so that a revalidate can be + issued + */ + op_errno = afr_final_errno (frame->local, this->private); + local->op_errno = op_errno; + if (op_errno == ESTALE) { + local->op_errno = op_errno; local->op_ret = -1; - local->op_errno = ENOENT; - goto unwind; + goto unwind; } - if (local->op_ret < 0) - goto unwind; + read_subvol = -1; + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) + continue; + + if (locked_entry && replies[i].op_ret == -1 && + replies[i].op_errno == ENOENT) { + /* Second, check entry is still + "underway" in creation */ + local->op_ret = -1; + local->op_errno = ENOENT; + read_subvol = i; + goto unwind; + } - if (local->cont.lookup.parent_entrylk && local->success_count > 1) - afr_succeed_lookup_on_latest_iatt (local, this); - - gfid_miss_count = afr_lookup_gfid_missing_count (local, this); - up_children_count = afr_up_children_count (local->child_up, - priv->child_count); - enotconn_count = priv->child_count - up_children_count; - if ((gfid_miss_count == local->success_count) && - (enotconn_count > 0)) { - local->op_ret = -1; - local->op_errno = EIO; - gf_log (this->name, GF_LOG_ERROR, "Failing lookup for %s, " - "LOOKUP on a file without gfid is not allowed when " - "some of the children are down", local->loc.path); - goto unwind; - } + if (replies[i].op_ret == -1) + continue; - if ((gfid_miss_count == local->success_count) && - uuid_is_null (local->cont.lookup.gfid_req)) { - local->op_ret = -1; - local->op_errno = ENODATA; - gf_log (this->name, GF_LOG_ERROR, "%s: No gfid present", - local->loc.path); - goto unwind; - } + if (read_subvol == -1 || !readable[read_subvol]) { + read_subvol = i; + uuid_copy (read_gfid, replies[i].poststat.ia_gfid); + local->op_ret = 0; + } + } - if (gfid_miss_count && uuid_is_null (local->cont.lookup.gfid_req)) - fail_conflict = _gf_true; - ret = afr_lookup_done_success_action (frame, this, fail_conflict); - if (ret) - goto unwind; - uuid_copy (local->self_heal.sh_gfid_req, local->cont.lookup.gfid_req); + if (read_subvol == -1) + goto unwind; + /* We now have a read_subvol, which is readable[] (if there + were any). Next we look for GFID mismatches. We don't + consider a GFID mismatch as an error if read_subvol is + readable[] but the mismatching GFID subvol is not. + */ + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid || replies[i].op_ret == -1) { + if (priv->child_up[i]) + can_interpret = _gf_false; + continue; + } - afr_lookup_perform_self_heal (frame, this, &sh_launched); - if (sh_launched) { - unwind = 0; - goto unwind; - } + if (!uuid_compare (replies[i].poststat.ia_gfid, + read_gfid)) + continue; - unwind: - if (unwind) { - AFR_STACK_UNWIND (lookup, frame, local->op_ret, - local->op_errno, local->cont.lookup.inode, - &local->cont.lookup.buf, - local->cont.lookup.xattr, - &local->cont.lookup.postparent); - } + can_interpret = _gf_false; + + if (locked_entry) + continue; + + /* Now GFIDs mismatch. It's OK as long as this subvol + is not readable[] but read_subvol is */ + if (readable[read_subvol] && !readable[i]) + continue; + + /* LOG ERROR */ + local->op_ret = -1; + local->op_errno = EIO; + goto unwind; + } + + /* Forth, for the finalized GFID, pick the best subvolume + to return stats from. + */ + if (can_interpret) { + /* It is safe to call afr_replies_interpret() because we have + a response from all the UP subvolumes and all of them resolved + to the same GFID + */ + if (afr_replies_interpret (frame, this, local->inode)) { + read_subvol = afr_data_subvol_get (local->inode, this, + 0, 0); + afr_inode_read_subvol_reset (local->inode, this); + goto cant_interpret; + } else { + read_subvol = afr_data_subvol_get (local->inode, this, + 0, 0); + } + } else { + cant_interpret: + if (read_subvol == -1) + dict_del (replies[0].xdata, GF_CONTENT_KEY); + else + dict_del (replies[read_subvol].xdata, GF_CONTENT_KEY); + } + + afr_handle_quota_size (frame, this); + +unwind: + if (read_subvol == -1) + read_subvol = 0; + + AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, + local->inode, &local->replies[read_subvol].poststat, + local->replies[read_subvol].xdata, + &local->replies[read_subvol].postparent); } /* @@ -2148,104 +1253,102 @@ afr_lookup_done (call_frame_t *frame, xlator_t *this) * others in that they must be given higher priority while * returning to the user. * - * The hierarchy is ESTALE > EIO > ENOENT > others + * The hierarchy is ESTALE > ENOENT > others */ -int32_t -afr_most_important_error(int32_t old_errno, int32_t new_errno, - gf_boolean_t eio) + +int +afr_higher_errno (int32_t old_errno, int32_t new_errno) { + if (old_errno == ENODATA || new_errno == ENODATA) + return ENODATA; if (old_errno == ESTALE || new_errno == ESTALE) return ESTALE; - if (eio && (old_errno == EIO || new_errno == EIO)) - return EIO; if (old_errno == ENOENT || new_errno == ENOENT) return ENOENT; return new_errno; } -int32_t -afr_resultant_errno_get (int32_t *children, - int *child_errno, unsigned int child_count) -{ - int i = 0; - int32_t op_errno = 0; - int child = 0; - for (i = 0; i < child_count; i++) { - if (children) { - child = children[i]; - if (child == -1) - break; - } else { - child = i; - } - op_errno = afr_most_important_error(op_errno, - child_errno[child], - _gf_false); - } - return op_errno; +int +afr_final_errno (afr_local_t *local, afr_private_t *priv) +{ + int i = 0; + int op_errno = 0; + int tmp_errno = 0; + + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid) + continue; + if (local->replies[i].op_ret == 0) + continue; + tmp_errno = local->replies[i].op_errno; + op_errno = afr_higher_errno (op_errno, tmp_errno); + } + + return op_errno; } -static void -afr_lookup_handle_error (afr_local_t *local, int32_t op_ret, int32_t op_errno) +static int +get_pathinfo_host (char *pathinfo, char *hostname, size_t size) { - GF_ASSERT (local); - if (op_errno == ENOENT) - local->enoent_count++; + char *start = NULL; + char *end = NULL; + int ret = -1; + int i = 0; + + if (!pathinfo) + goto out; - local->op_errno = afr_most_important_error(local->op_errno, op_errno, - _gf_false); + start = strchr (pathinfo, ':'); + if (!start) + goto out; + end = strrchr (pathinfo, ':'); + if (start == end) + goto out; - if (local->op_errno == ESTALE) { - local->op_ret = -1; - } + memset (hostname, 0, size); + i = 0; + while (++start != end) + hostname[i++] = *start; + ret = 0; +out: + return ret; } -static void -afr_set_root_inode_on_first_lookup (afr_local_t *local, xlator_t *this, - inode_t *inode) +int +afr_local_pathinfo (char *pathinfo, gf_boolean_t *local) { - afr_private_t *priv = NULL; - GF_ASSERT (inode); + int ret = 0; + char pathinfohost[1024] = {0}; + char localhost[1024] = {0}; + xlator_t *this = THIS; - if (!__is_root_gfid (inode->gfid)) - goto out; - if (!afr_is_fresh_lookup (&local->loc, this)) + *local = _gf_false; + ret = get_pathinfo_host (pathinfo, pathinfohost, sizeof (pathinfohost)); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Invalid pathinfo: %s", + pathinfo); goto out; - priv = this->private; - if ((priv->first_lookup)) { - gf_log (this->name, GF_LOG_INFO, "added root inode"); - priv->root_inode = inode_ref (inode); - priv->first_lookup = 0; } -out: - return; -} -static void -afr_lookup_cache_args (afr_local_t *local, int child_index, dict_t *xattr, - struct iatt *buf, struct iatt *postparent) -{ - GF_ASSERT (child_index >= 0); - local->cont.lookup.xattrs[child_index] = dict_ref (xattr); - local->cont.lookup.postparents[child_index] = *postparent; - local->cont.lookup.bufs[child_index] = *buf; -} + ret = gethostname (localhost, sizeof (localhost)); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "gethostname() failed, " + "reason: %s", strerror (errno)); + goto out; + } -static void -afr_lookup_handle_first_success (afr_local_t *local, xlator_t *this, - inode_t *inode, struct iatt *buf) -{ - local->cont.lookup.inode = inode_ref (inode); - local->cont.lookup.buf = *buf; - afr_set_root_inode_on_first_lookup (local, this, inode); + if (!strcmp (localhost, pathinfohost)) + *local = _gf_true; +out: + return ret; } static int32_t -afr_discovery_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *dict, - dict_t *xdata) +afr_local_discovery_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata) { int ret = 0; char *pathinfo = NULL; @@ -2257,6 +1360,9 @@ afr_discovery_cbk (call_frame_t *frame, void *cookie, xlator_t *this, goto out; } + priv = this->private; + child_index = (int32_t)(long)cookie; + ret = dict_get_str (dict, GF_XATTR_PATHINFO_KEY, &pathinfo); if (ret != 0) { goto out; @@ -2267,7 +1373,6 @@ afr_discovery_cbk (call_frame_t *frame, void *cookie, xlator_t *this, goto out; } - priv = this->private; /* * Note that one local subvolume will override another here. The only * way to avoid that would be to retain extra information about whether @@ -2275,13 +1380,11 @@ afr_discovery_cbk (call_frame_t *frame, void *cookie, xlator_t *this, * the slowest local subvolume is far preferable to a remote one. */ if (is_local) { - child_index = (int32_t)(long)cookie; gf_log (this->name, GF_LOG_INFO, "selecting local read_child %s", priv->children[child_index]->name); priv->read_child = child_index; } - out: STACK_DESTROY(frame->root); return 0; @@ -2300,234 +1403,357 @@ afr_attempt_local_discovery (xlator_t *this, int32_t child_index) } tmploc.gfid[sizeof(tmploc.gfid)-1] = 1; - STACK_WIND_COOKIE (newframe, afr_discovery_cbk, + STACK_WIND_COOKIE (newframe, afr_local_discovery_cbk, (void *)(long)child_index, priv->children[child_index], priv->children[child_index]->fops->getxattr, &tmploc, GF_XATTR_PATHINFO_KEY, NULL); } -static void -afr_lookup_handle_success (afr_local_t *local, xlator_t *this, int32_t child_index, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, dict_t *xattr, - struct iatt *postparent) + +int +afr_lookup_selfheal_wrap (void *opaque) { - afr_private_t *priv = this->private; + call_frame_t *frame = opaque; + afr_local_t *local = NULL; + xlator_t *this = NULL; + inode_t *inode = NULL; - if (local->success_count == 0) { - if (local->op_errno != ESTALE) { - local->op_ret = op_ret; - local->op_errno = 0; - } - afr_lookup_handle_first_success (local, this, inode, buf); - } - afr_lookup_update_lk_counts (local, this, - child_index, xattr); + local = frame->local; + this = frame->this; + + afr_selfheal_name (frame->this, local->loc.pargfid, local->loc.name); + + afr_replies_wipe (local, this->private); + + inode = afr_selfheal_unlocked_lookup_on (frame, local->loc.parent, + local->loc.name, local->replies, + local->child_up); + if (inode) + inode_unref (inode); + afr_lookup_done (frame, this); + + return 0; +} + + +int +afr_lookup_entry_heal (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + call_frame_t *heal = NULL; + int i = 0, first = -1; + gf_boolean_t need_heal = _gf_false; + struct afr_reply *replies = NULL; + int ret = 0; + + local = frame->local; + replies = local->replies; + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) + continue; + + if (first == -1) { + first = i; + continue; + } + + if (replies[i].op_ret != replies[first].op_ret) { + need_heal = _gf_true; + break; + } + + if (uuid_compare (replies[i].poststat.ia_gfid, + replies[first].poststat.ia_gfid)) { + need_heal = _gf_true; + break; + } + } + + if (need_heal) { + heal = copy_frame (frame); + if (heal) + heal->root->pid = -1; + ret = synctask_new (this->ctx->env, afr_lookup_selfheal_wrap, + afr_refresh_selfheal_done, heal, frame); + if (ret) + goto lookup_done; + } else { + lookup_done: + afr_lookup_done (frame, this); + } + + return ret; +} + + +int +afr_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, struct iatt *buf, + dict_t *xdata, struct iatt *postparent) +{ + afr_local_t * local = NULL; + int call_count = -1; + int child_index = -1; + + child_index = (long) cookie; + + local = frame->local; - afr_lookup_cache_args (local, child_index, xattr, - buf, postparent); + local->replies[child_index].valid = 1; + local->replies[child_index].op_ret = op_ret; + local->replies[child_index].op_errno = op_errno; + if (op_ret != -1) { + local->replies[child_index].poststat = *buf; + local->replies[child_index].postparent = *postparent; + if (xdata) + local->replies[child_index].xdata = dict_ref (xdata); + } - if (local->do_discovery && (priv->read_child == (-1))) { - afr_attempt_local_discovery(this,child_index); + call_count = afr_frame_return (frame); + if (call_count == 0) { + afr_lookup_entry_heal (frame, this); } - local->cont.lookup.success_children[local->success_count] = child_index; - local->success_count++; + return 0; +} + + + +static void +afr_discover_done (call_frame_t *frame, xlator_t *this) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int i = -1; + int op_errno = 0; + int read_subvol = 0; + + priv = this->private; + local = frame->local; + + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid) + continue; + if (local->replies[i].op_ret == 0) + local->op_ret = 0; + } + + op_errno = afr_final_errno (frame->local, this->private); + + if (local->op_ret < 0) { + local->op_errno = op_errno; + local->op_ret = -1; + goto unwind; + } + + afr_replies_interpret (frame, this, local->inode); + + read_subvol = afr_data_subvol_get (local->inode, this, 0, 0); + if (read_subvol == -1) { + gf_log (this->name, GF_LOG_WARNING, "no read subvols for %s", + local->loc.path); + + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid || + local->replies[i].op_ret == -1) + continue; + read_subvol = i; + break; + } + } + +unwind: + if (read_subvol == -1) + read_subvol = 0; + + AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, + local->inode, &local->replies[read_subvol].poststat, + local->replies[read_subvol].xdata, + &local->replies[read_subvol].postparent); } + int -afr_lookup_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *buf, dict_t *xattr, - struct iatt *postparent) +afr_discover_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, struct iatt *buf, + dict_t *xdata, struct iatt *postparent) { afr_local_t * local = NULL; int call_count = -1; int child_index = -1; - child_index = (long) cookie; + child_index = (long) cookie; - LOCK (&frame->lock); - { - local = frame->local; + local = frame->local; + + local->replies[child_index].valid = 1; + local->replies[child_index].op_ret = op_ret; + local->replies[child_index].op_errno = op_errno; + if (op_ret != -1) { + local->replies[child_index].poststat = *buf; + local->replies[child_index].postparent = *postparent; + if (xdata) + local->replies[child_index].xdata = dict_ref (xdata); + } + + if (local->do_discovery && (op_ret == 0)) + afr_attempt_local_discovery (this, child_index); + + call_count = afr_frame_return (frame); + if (call_count == 0) { + afr_discover_done (frame, this); + } + + return 0; +} + + +int +afr_discover_do (call_frame_t *frame, xlator_t *this, int err) +{ + int ret = 0; + int i = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + + local = frame->local; + priv = this->private; + + if (err) { + local->op_errno = -err; + ret = -1; + goto out; + } + + call_count = local->call_count = AFR_COUNT (local->child_up, + priv->child_count); + + ret = afr_lookup_xattr_req_prepare (local, this, local->xattr_req, + &local->loc); + if (ret) { + local->op_errno = -ret; + ret = -1; + goto out; + } - if (op_ret == -1) { - afr_lookup_handle_error (local, op_ret, op_errno); - goto unlock; + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_discover_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->lookup, + &local->loc, local->xattr_req); + if (!--call_count) + break; } - afr_lookup_handle_success (local, this, child_index, op_ret, - op_errno, inode, buf, xattr, - postparent); - - } -unlock: - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - if (call_count == 0) { - afr_lookup_done (frame, this); } - return 0; + return 0; +out: + AFR_STACK_UNWIND (lookup, frame, -1, local->op_errno, 0, 0, 0, 0); + return 0; } + int -afr_lookup_cont_init (afr_local_t *local, unsigned int child_count) +afr_discover (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req) { - int ret = -ENOMEM; - struct iatt *iatts = NULL; - int32_t *success_children = NULL; - int32_t *sources = NULL; - int32_t **pending_matrix = NULL; - - GF_ASSERT (local); - local->cont.lookup.xattrs = GF_CALLOC (child_count, - sizeof (*local->cont.lookup.xattr), - gf_afr_mt_dict_t); - if (NULL == local->cont.lookup.xattrs) - goto out; - - iatts = GF_CALLOC (child_count, sizeof (*iatts), gf_afr_mt_iatt); - if (NULL == iatts) - goto out; - local->cont.lookup.postparents = iatts; - - iatts = GF_CALLOC (child_count, sizeof (*iatts), gf_afr_mt_iatt); - if (NULL == iatts) - goto out; - local->cont.lookup.bufs = iatts; + int op_errno = ENOMEM; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int event = 0; - success_children = afr_children_create (child_count); - if (NULL == success_children) - goto out; - local->cont.lookup.success_children = success_children; + priv = this->private; - local->fresh_children = afr_children_create (child_count); - if (NULL == local->fresh_children) - goto out; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; - sources = GF_CALLOC (sizeof (*sources), child_count, gf_afr_mt_int32_t); - if (NULL == sources) + if (!local->call_count) { + op_errno = ENOTCONN; goto out; - local->cont.lookup.sources = sources; + } - pending_matrix = afr_matrix_create (child_count, child_count); - if (NULL == pending_matrix) - goto out; - local->cont.lookup.pending_matrix = pending_matrix; + if (__is_root_gfid (loc->inode->gfid)) { + if (!this->itable) + this->itable = loc->inode->table; + if (!priv->root_inode) + priv->root_inode = inode_ref (loc->inode); - ret = 0; -out: - return ret; -} + if (priv->choose_local && !priv->did_discovery) { + /* Logic to detect which subvolumes of AFR are + local, in order to prefer them for reads + */ + local->do_discovery = _gf_true; + priv->did_discovery = _gf_true; + } + } -int -afr_lookup (call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *xattr_req) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - void *gfid_req = NULL; - int ret = -1; - int i = 0; - int call_count = 0; - uint64_t ctx = 0; - int32_t op_errno = 0; - priv = this->private; - - AFR_LOCAL_ALLOC_OR_GOTO (local, out); + local->op = GF_FOP_LOOKUP; - local->op_ret = -1; + loc_copy (&local->loc, loc); - frame->local = local; - local->fop = GF_FOP_LOOKUP; + local->inode = inode_ref (loc->inode); - loc_copy (&local->loc, loc); - ret = loc_path (&local->loc, NULL); - if (ret < 0) { - op_errno = EINVAL; - goto out; - } + if (xattr_req) + /* If xattr_req was null, afr_lookup_xattr_req_prepare() will + allocate one for us */ + local->xattr_req = dict_ref (xattr_req); - if (local->loc.path && - (strcmp (local->loc.path, "/" GF_REPLICATE_TRASH_DIR) == 0)) { - op_errno = EPERM; - ret = -1; - goto out; - } + if (uuid_is_null (loc->inode->gfid)) { + afr_discover_do (frame, this, 0); + return 0; + } - ret = inode_ctx_get (local->loc.inode, this, &ctx); - if (ret == 0) { - /* lookup is a revalidate */ + afr_read_subvol_get (loc->inode, this, NULL, &event, + AFR_DATA_TRANSACTION); - local->read_child_index = afr_inode_get_read_ctx (this, - local->loc.inode, - NULL); - } else { - LOCK (&priv->read_child_lock); - { - if (priv->hash_mode) { - local->read_child_index = -1; - } - else { - local->read_child_index = - (++priv->read_child_rr) % - (priv->child_count); - } - } - UNLOCK (&priv->read_child_lock); - local->cont.lookup.fresh_lookup = _gf_true; - } + if (event != local->event_generation) + afr_inode_refresh (frame, this, loc->inode, afr_discover_do); + else + afr_discover_do (frame, this, 0); - local->child_up = memdup (priv->child_up, - sizeof (*local->child_up) * priv->child_count); - if (NULL == local->child_up) { - op_errno = ENOMEM; - goto out; - } + return 0; +out: + AFR_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL); + return 0; +} - ret = afr_lookup_cont_init (local, priv->child_count); - if (ret < 0) { - op_errno = -ret; - goto out; - } - local->call_count = afr_up_children_count (local->child_up, - priv->child_count); - call_count = local->call_count; - if (local->call_count == 0) { - ret = -1; - op_errno = ENOTCONN; - goto out; - } +int +afr_lookup_do (call_frame_t *frame, xlator_t *this, int err) +{ + int ret = 0; + int i = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = 0; - /* By default assume ENOTCONN. On success it will be set to 0. */ - local->op_errno = ENOTCONN; + local = frame->local; + priv = this->private; - ret = dict_get_int32 (xattr_req, "attempt-self-heal", - &local->attempt_self_heal); - dict_del (xattr_req, "attempt-self-heal"); + if (err < 0) { + local->op_errno = -err; + ret = -1; + goto out; + } - ret = dict_get_int32 (xattr_req, "foreground-self-heal", - &local->foreground_self_heal); - dict_del (xattr_req, "foreground-self-heal"); + call_count = local->call_count = AFR_COUNT (local->child_up, + priv->child_count); - ret = afr_lookup_xattr_req_prepare (local, this, xattr_req, &local->loc, - &gfid_req); + ret = afr_lookup_xattr_req_prepare (local, this, local->xattr_req, + &local->loc); if (ret) { local->op_errno = -ret; + ret = -1; goto out; } - afr_lookup_save_gfid (local->cont.lookup.gfid_req, gfid_req, - &local->loc); - local->fop = GF_FOP_LOOKUP; - if (priv->choose_local && !priv->did_discovery) { - if (gfid_req && __is_root_gfid(gfid_req)) { - local->do_discovery = _gf_true; - priv->did_discovery = _gf_true; - } - } + for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { STACK_WIND_COOKIE (frame, afr_lookup_cbk, @@ -2539,12 +1765,98 @@ afr_lookup (call_frame_t *frame, xlator_t *this, break; } } + return 0; +out: + AFR_STACK_UNWIND (lookup, frame, -1, local->op_errno, 0, 0, 0, 0); + return 0; +} - ret = 0; +/* + * afr_lookup() + * + * The goal here is to figure out what the element getting looked up is. + * i.e what is the GFID, inode type and a conservative estimate of the + * inode attributes are. + * + * As we lookup, operations may be underway on the entry name and the + * inode. In lookup() we are primarily concerned only with the entry + * operations. If the entry is getting unlinked or renamed, we detect + * what operation is underway by querying for on-going transactions and + * pending self-healing on the entry through xdata. + * + * If the entry is a file/dir, it may need self-heal and/or in a + * split-brain condition. Lookup is not the place to worry about these + * conditions. Outcast marking will naturally handle them in the read + * paths. + * + * Here is a brief goal of what we are trying to achieve: + * + * - LOOKUP on all subvolumes concurrently, querying on-going transaction + * and pending self-heal info from the servers. + * + * - If all servers reply the same inode type and GFID, the overall call + * MUST be a success. + * + * - If inode types or GFIDs mismatch, and there IS either an on-going + * transaction or pending self-heal, inspect what the nature of the + * transaction or pending heal is, and select the appropriate subvolume's + * reply as the winner. + * + * - If inode types or GFIDs mismatch, and there are no on-going transactions + * or pending self-heal on the entry name on any of the servers, fail the + * lookup with EIO. Something has gone wrong beyond reasonable action. + */ + +int +afr_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req) +{ + afr_local_t *local = NULL; + int32_t op_errno = 0; + int event = 0; + + if (!loc->parent) { + afr_discover (frame, this, loc, xattr_req); + return 0; + } + + if (__is_root_gfid (loc->parent->gfid)) { + if (!strcmp (loc->name, GF_REPLICATE_TRASH_DIR)) { + op_errno = EPERM; + goto out; + } + } + + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; + + if (!local->call_count) { + op_errno = ENOTCONN; + goto out; + } + + local->op = GF_FOP_LOOKUP; + + loc_copy (&local->loc, loc); + + local->inode = inode_ref (loc->inode); + + if (xattr_req) + /* If xattr_req was null, afr_lookup_xattr_req_prepare() will + allocate one for us */ + local->xattr_req = dict_ref (xattr_req); + + afr_read_subvol_get (loc->parent, this, NULL, &event, + AFR_DATA_TRANSACTION); + + if (event != local->event_generation) + afr_inode_refresh (frame, this, loc->parent, afr_lookup_do); + else + afr_lookup_do (frame, this, 0); + + return 0; out: - if (ret) - AFR_STACK_UNWIND (lookup, frame, -1, op_errno, - NULL, NULL, NULL, NULL); + AFR_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL); return 0; } @@ -2552,6 +1864,46 @@ out: /* {{{ open */ +afr_fd_ctx_t * +__afr_fd_ctx_get (fd_t *fd, xlator_t *this) +{ + uint64_t ctx = 0; + int ret = 0; + afr_fd_ctx_t *fd_ctx = NULL; + + ret = __fd_ctx_get (fd, this, &ctx); + + if (ret < 0) { + ret = __afr_fd_ctx_set (this, fd); + if (ret < 0) + goto out; + + ret = __fd_ctx_get (fd, this, &ctx); + if (ret < 0) + goto out; + } + + fd_ctx = (afr_fd_ctx_t *)(long) ctx; +out: + return fd_ctx; +} + + +afr_fd_ctx_t * +afr_fd_ctx_get (fd_t *fd, xlator_t *this) +{ + afr_fd_ctx_t *fd_ctx = NULL; + + LOCK(&fd->lock); + { + fd_ctx = __afr_fd_ctx_get (fd, this); + } + UNLOCK(&fd->lock); + + return fd_ctx; +} + + int __afr_fd_ctx_set (xlator_t *this, fd_t *fd) { @@ -2559,6 +1911,7 @@ __afr_fd_ctx_set (xlator_t *this, fd_t *fd) int ret = -1; uint64_t ctx = 0; afr_fd_ctx_t * fd_ctx = NULL; + int i = 0; VALIDATE_OR_GOTO (this->private, out); VALIDATE_OR_GOTO (fd, out); @@ -2577,21 +1930,15 @@ __afr_fd_ctx_set (xlator_t *this, fd_t *fd) goto out; } - fd_ctx->pre_op_done = GF_CALLOC (sizeof (*fd_ctx->pre_op_done), - priv->child_count, - gf_afr_mt_char); - if (!fd_ctx->pre_op_done) { - ret = -ENOMEM; - goto out; - } - - fd_ctx->pre_op_piggyback = GF_CALLOC (sizeof (*fd_ctx->pre_op_piggyback), - priv->child_count, - gf_afr_mt_char); - if (!fd_ctx->pre_op_piggyback) { - ret = -ENOMEM; - goto out; - } + for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) { + fd_ctx->pre_op_done[i] = GF_CALLOC (sizeof (*fd_ctx->pre_op_done[i]), + priv->child_count, + gf_afr_mt_int32_t); + if (!fd_ctx->pre_op_done[i]) { + ret = -ENOMEM; + goto out; + } + } fd_ctx->opened_on = GF_CALLOC (sizeof (*fd_ctx->opened_on), priv->child_count, @@ -2601,6 +1948,13 @@ __afr_fd_ctx_set (xlator_t *this, fd_t *fd) goto out; } + for (i = 0; i < priv->child_count; i++) { + if (fd_is_anonymous (fd)) + fd_ctx->opened_on[i] = AFR_FD_OPENED; + else + fd_ctx->opened_on[i] = AFR_FD_NOT_OPENED; + } + fd_ctx->lock_piggyback = GF_CALLOC (sizeof (*fd_ctx->lock_piggyback), priv->child_count, gf_afr_mt_char); @@ -2617,20 +1971,7 @@ __afr_fd_ctx_set (xlator_t *this, fd_t *fd) goto out; } - fd_ctx->up_count = priv->up_count; - fd_ctx->down_count = priv->down_count; - - fd_ctx->locked_on = GF_CALLOC (sizeof (*fd_ctx->locked_on), - priv->child_count, - gf_afr_mt_char); - if (!fd_ctx->locked_on) { - ret = -ENOMEM; - goto out; - } - pthread_mutex_init (&fd_ctx->delay_lock, NULL); - INIT_LIST_HEAD (&fd_ctx->entries); - fd_ctx->call_child = -1; INIT_LIST_HEAD (&fd_ctx->eager_locked); @@ -2660,32 +2001,31 @@ afr_fd_ctx_set (xlator_t *this, fd_t *fd) /* {{{ flush */ int -afr_flush_cbk(call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) +afr_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_local_t * local = NULL; - int call_count = -1; + afr_local_t *local = NULL; + int call_count = -1; local = frame->local; LOCK (&frame->lock); { if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - } - local->success_count++; - } - - local->op_errno = op_errno; + local->op_ret = op_ret; + if (!local->xdata_rsp && xdata) + local->xdata_rsp = dict_ref (xdata); + } else { + local->op_errno = op_errno; + } } UNLOCK (&frame->lock); call_count = afr_frame_return (frame); if (call_count == 0) - AFR_STACK_UNWIND(flush, frame, local->op_ret, - local->op_errno, NULL); + AFR_STACK_UNWIND (flush, frame, local->op_ret, + local->op_errno, local->xdata_rsp); return 0; } @@ -2708,7 +2048,7 @@ afr_flush_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) (void *) (long) i, priv->children[i], priv->children[i]->fops->flush, - local->fd, NULL); + local->fd, xdata); if (!--call_count) break; @@ -2721,40 +2061,30 @@ afr_flush_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) int afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { - afr_private_t *priv = NULL; afr_local_t *local = NULL; call_stub_t *stub = NULL; - int ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; + int op_errno = ENOMEM; - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; - ret = afr_local_init(local, priv, &op_errno); - if (ret < 0) + if (!local->call_count) { + op_errno = ENOTCONN; goto out; + } local->fd = fd_ref(fd); + stub = fop_flush_stub (frame, afr_flush_wrapper, fd, xdata); - if (!stub) { - ret = -1; - op_errno = ENOMEM; + if (!stub) goto out; - } afr_delayed_changelog_wake_resume (this, fd, stub); - ret = 0; + return 0; out: - if (ret < 0) - AFR_STACK_UNWIND(flush, frame, -1, op_errno, NULL); - + AFR_STACK_UNWIND (flush, frame, -1, op_errno, NULL); return 0; } @@ -2767,6 +2097,7 @@ afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd) uint64_t ctx = 0; afr_fd_ctx_t *fd_ctx = NULL; int ret = 0; + int i = 0; ret = fd_ctx_get (fd, this, &ctx); if (ret < 0) @@ -2775,13 +2106,11 @@ afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd) fd_ctx = (afr_fd_ctx_t *)(long) ctx; if (fd_ctx) { - GF_FREE (fd_ctx->pre_op_done); + for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) + GF_FREE (fd_ctx->pre_op_done[i]); GF_FREE (fd_ctx->opened_on); - GF_FREE (fd_ctx->locked_on); - - GF_FREE (fd_ctx->pre_op_piggyback); GF_FREE (fd_ctx->lock_piggyback); GF_FREE (fd_ctx->lock_acquired); @@ -2799,24 +2128,8 @@ out: int afr_release (xlator_t *this, fd_t *fd) { - afr_locked_fd_t *locked_fd = NULL; - afr_locked_fd_t *tmp = NULL; - afr_private_t *priv = NULL; - - priv = this->private; - afr_cleanup_fd_ctx (this, fd); - list_for_each_entry_safe (locked_fd, tmp, &priv->saved_fds, - list) { - - if (locked_fd->fd == fd) { - list_del_init (&locked_fd->list); - GF_FREE (locked_fd); - } - - } - return 0; } @@ -2841,36 +2154,38 @@ afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, afr_local_t *local = NULL; int call_count = -1; int child_index = (long) cookie; - int read_child = 0; + int read_subvol = 0; call_stub_t *stub = NULL; local = frame->local; - read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); + read_subvol = afr_data_subvol_get (local->inode, this, 0, 0); LOCK (&frame->lock); { - if (child_index == read_child) { - local->read_child_returned = _gf_true; - } - if (op_ret == 0) { - local->op_ret = 0; + if (local->op_ret == -1) { + local->op_ret = 0; - if (local->success_count == 0) { local->cont.inode_wfop.prebuf = *prebuf; local->cont.inode_wfop.postbuf = *postbuf; + + if (xdata) + local->xdata_rsp = dict_ref (xdata); } - if (child_index == read_child) { + if (child_index == read_subvol) { local->cont.inode_wfop.prebuf = *prebuf; local->cont.inode_wfop.postbuf = *postbuf; + if (xdata) { + if (local->xdata_rsp) + dict_unref (local->xdata_rsp); + local->xdata_rsp = dict_ref (xdata); + } } - - local->success_count++; - } - - local->op_errno = op_errno; + } else { + local->op_errno = op_errno; + } } UNLOCK (&frame->lock); @@ -2890,7 +2205,7 @@ afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->op_ret, local->op_errno, &local->cont.inode_wfop.prebuf, &local->cont.inode_wfop.postbuf, - xdata); + local->xdata_rsp); if (!stub) { AFR_STACK_UNWIND (fsync, frame, -1, ENOMEM, 0, 0, 0); return 0; @@ -2910,37 +2225,35 @@ afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int -afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, - int32_t datasync, dict_t *xdata) +afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, + dict_t *xdata) { - afr_private_t *priv = NULL; + afr_private_t *priv = NULL; afr_local_t *local = NULL; - int ret = -1; int i = 0; int32_t call_count = 0; - int32_t op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; + int32_t op_errno = ENOMEM; - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; + priv = this->private; - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; call_count = local->call_count; + if (!call_count) { + op_errno = ENOTCONN; + goto out; + } - local->fd = fd_ref (fd); + local->fd = fd_ref (fd); if (afr_fd_has_witnessed_unstable_write (this, fd)) { /* don't care. we only wanted to CLEAR the bit */ } + local->inode = inode_ref (fd->inode); + for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { STACK_WIND_COOKIE (frame, afr_fsync_cbk, @@ -2953,10 +2266,10 @@ afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, } } - ret = 0; + return 0; out: - if (ret < 0) - AFR_STACK_UNWIND (fsync, frame, -1, op_errno, NULL, NULL, NULL); + AFR_STACK_UNWIND (fsync, frame, -1, op_errno, NULL, NULL, NULL); + return 0; } @@ -2964,10 +2277,9 @@ out: /* {{{ fsync */ -int32_t -afr_fsyncdir_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *xdata) +int +afr_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { afr_local_t *local = NULL; int call_count = -1; @@ -2976,10 +2288,13 @@ afr_fsyncdir_cbk (call_frame_t *frame, void *cookie, LOCK (&frame->lock); { - if (op_ret == 0) + if (op_ret == 0) { local->op_ret = 0; - - local->op_errno = op_errno; + if (!local->xdata_rsp && xdata) + local->xdata_rsp = dict_ref (xdata); + } else { + local->op_errno = op_errno; + } } UNLOCK (&frame->lock); @@ -2987,37 +2302,33 @@ afr_fsyncdir_cbk (call_frame_t *frame, void *cookie, if (call_count == 0) AFR_STACK_UNWIND (fsyncdir, frame, local->op_ret, - local->op_errno, xdata); + local->op_errno, local->xdata_rsp); return 0; } -int32_t -afr_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, - int32_t datasync, dict_t *xdata) +int +afr_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, + dict_t *xdata) { - afr_private_t *priv = NULL; + afr_private_t *priv = NULL; afr_local_t *local = NULL; - int ret = -1; int i = 0; int32_t call_count = 0; - int32_t op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; + int32_t op_errno = ENOMEM; - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; + priv = this->private; - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; call_count = local->call_count; + if (!call_count) { + op_errno = ENOTCONN; + goto out; + } for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { @@ -3030,10 +2341,10 @@ afr_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, } } - ret = 0; + return 0; out: - if (ret < 0) - AFR_STACK_UNWIND (fsyncdir, frame, -1, op_errno, NULL); + AFR_STACK_UNWIND (fsyncdir, frame, -1, op_errno, NULL); + return 0; } @@ -3056,6 +2367,10 @@ afr_xattrop_cbk (call_frame_t *frame, void *cookie, if (op_ret == 0) { if (!local->cont.xattrop.xattr) local->cont.xattrop.xattr = dict_ref (xattr); + + if (!local->xdata_rsp && xdata) + local->xdata_rsp = dict_ref (xdata); + local->op_ret = 0; } @@ -3067,7 +2382,7 @@ afr_xattrop_cbk (call_frame_t *frame, void *cookie, if (call_count == 0) AFR_STACK_UNWIND (xattrop, frame, local->op_ret, local->op_errno, - local->cont.xattrop.xattr, xdata); + local->cont.xattrop.xattr, local->xdata_rsp); return 0; } @@ -3079,25 +2394,21 @@ afr_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, { afr_private_t *priv = NULL; afr_local_t *local = NULL; - int ret = -1; int i = 0; int32_t call_count = 0; - int32_t op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + int32_t op_errno = ENOMEM; priv = this->private; - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; call_count = local->call_count; + if (!call_count) { + op_errno = ENOTCONN; + goto out; + } for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { @@ -3110,10 +2421,10 @@ afr_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, } } - ret = 0; + return 0; out: - if (ret < 0) - AFR_STACK_UNWIND (xattrop, frame, -1, op_errno, NULL, NULL); + AFR_STACK_UNWIND (xattrop, frame, -1, op_errno, NULL, NULL); + return 0; } @@ -3138,6 +2449,8 @@ afr_fxattrop_cbk (call_frame_t *frame, void *cookie, if (!local->cont.fxattrop.xattr) local->cont.fxattrop.xattr = dict_ref (xattr); + if (!local->xdata_rsp && xdata) + local->xdata_rsp = dict_ref (xdata); local->op_ret = 0; } @@ -3149,7 +2462,7 @@ afr_fxattrop_cbk (call_frame_t *frame, void *cookie, if (call_count == 0) AFR_STACK_UNWIND (fxattrop, frame, local->op_ret, local->op_errno, - local->cont.fxattrop.xattr, xdata); + local->cont.fxattrop.xattr, local->xdata_rsp); return 0; } @@ -3161,25 +2474,21 @@ afr_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd, { afr_private_t *priv = NULL; afr_local_t *local = NULL; - int ret = -1; int i = 0; int32_t call_count = 0; int32_t op_errno = 0; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - priv = this->private; - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) goto out; call_count = local->call_count; + if (!call_count) { + op_errno = ENOTCONN; + goto out; + } for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { @@ -3192,10 +2501,10 @@ afr_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd, } } - ret = 0; + return 0; out: - if (ret < 0) - AFR_STACK_UNWIND (fxattrop, frame, -1, op_errno, NULL, NULL); + AFR_STACK_UNWIND (fxattrop, frame, -1, op_errno, NULL, NULL); + return 0; } @@ -3203,8 +2512,8 @@ out: int32_t -afr_inodelk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) +afr_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { afr_local_t *local = NULL; @@ -3238,25 +2547,21 @@ afr_inodelk (call_frame_t *frame, xlator_t *this, { afr_private_t *priv = NULL; afr_local_t *local = NULL; - int ret = -1; int i = 0; int32_t call_count = 0; - int32_t op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + int32_t op_errno = ENOMEM; priv = this->private; - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) goto out; call_count = local->call_count; + if (!call_count) { + op_errno = ENOMEM; + goto out; + } for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { @@ -3270,18 +2575,17 @@ afr_inodelk (call_frame_t *frame, xlator_t *this, } } - ret = 0; + return 0; out: - if (ret < 0) - AFR_STACK_UNWIND (inodelk, frame, -1, op_errno, NULL); + AFR_STACK_UNWIND (inodelk, frame, -1, op_errno, NULL); + return 0; } int32_t -afr_finodelk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *xdata) +afr_finodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { afr_local_t *local = NULL; @@ -3309,31 +2613,26 @@ afr_finodelk_cbk (call_frame_t *frame, void *cookie, int32_t -afr_finodelk (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, int32_t cmd, struct gf_flock *flock, - dict_t *xdata) +afr_finodelk (call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, + int32_t cmd, struct gf_flock *flock, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; - int ret = -1; int i = 0; int32_t call_count = 0; - int32_t op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + int32_t op_errno = ENOMEM; priv = this->private; - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; call_count = local->call_count; + if (!call_count) { + op_errno = ENOTCONN; + goto out; + } for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { @@ -3347,10 +2646,10 @@ afr_finodelk (call_frame_t *frame, xlator_t *this, } } - ret = 0; + return 0; out: - if (ret < 0) - AFR_STACK_UNWIND (finodelk, frame, -1, op_errno, NULL); + AFR_STACK_UNWIND (finodelk, frame, -1, op_errno, NULL); + return 0; } @@ -3383,33 +2682,28 @@ afr_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, } -int32_t -afr_entrylk (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, - const char *basename, entrylk_cmd cmd, entrylk_type type, - dict_t *xdata) +int +afr_entrylk (call_frame_t *frame, xlator_t *this, const char *volume, + loc_t *loc, const char *basename, entrylk_cmd cmd, + entrylk_type type, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; - int ret = -1; int i = 0; int32_t call_count = 0; int32_t op_errno = 0; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - priv = this->private; - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; call_count = local->call_count; + if (!call_count) { + op_errno = ENOTCONN; + goto out; + } for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { @@ -3423,18 +2717,18 @@ afr_entrylk (call_frame_t *frame, xlator_t *this, } } - ret = 0; + return 0; out: - if (ret < 0) - AFR_STACK_UNWIND (entrylk, frame, -1, op_errno, NULL); + AFR_STACK_UNWIND (entrylk, frame, -1, op_errno, NULL); + return 0; } -int32_t -afr_fentrylk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) +int +afr_fentrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { afr_local_t *local = NULL; @@ -3461,33 +2755,28 @@ afr_fentrylk_cbk (call_frame_t *frame, void *cookie, } -int32_t -afr_fentrylk (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, - const char *basename, entrylk_cmd cmd, - entrylk_type type, dict_t *xdata) +int +afr_fentrylk (call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, + const char *basename, entrylk_cmd cmd, entrylk_type type, + dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; - int ret = -1; int i = 0; int32_t call_count = 0; - int32_t op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + int32_t op_errno = ENOMEM; priv = this->private; - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; call_count = local->call_count; + if (!call_count) { + op_errno = ENOTCONN; + goto out; + } for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { @@ -3501,82 +2790,85 @@ afr_fentrylk (call_frame_t *frame, xlator_t *this, } } - ret = 0; + return 0; out: - if (ret < 0) - AFR_STACK_UNWIND (fentrylk, frame, -1, op_errno, NULL); + AFR_STACK_UNWIND (fentrylk, frame, -1, op_errno, NULL); + return 0; } -int32_t -afr_statfs_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - struct statvfs *statvfs, dict_t *xdata) + +int +afr_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct statvfs *statvfs, dict_t *xdata) { afr_local_t *local = NULL; int call_count = 0; + struct statvfs *buf = NULL; LOCK (&frame->lock); { local = frame->local; - if (op_ret == 0) { - local->op_ret = op_ret; - - if (local->cont.statfs.buf_set) { - if (statvfs->f_bavail < local->cont.statfs.buf.f_bavail) - local->cont.statfs.buf = *statvfs; - } else { - local->cont.statfs.buf = *statvfs; - local->cont.statfs.buf_set = 1; - } - } - - if (op_ret == -1) + if (op_ret != 0) { local->op_errno = op_errno; + goto unlock; + } + local->op_ret = op_ret; + + buf = &local->cont.statfs.buf; + if (local->cont.statfs.buf_set) { + if (statvfs->f_bavail < buf->f_bavail) { + *buf = *statvfs; + if (xdata) { + if (local->xdata_rsp) + dict_unref (local->xdata_rsp); + local->xdata_rsp = dict_ref (xdata); + } + } + } else { + *buf = *statvfs; + local->cont.statfs.buf_set = 1; + if (xdata) + local->xdata_rsp = dict_ref (xdata); + } } +unlock: UNLOCK (&frame->lock); call_count = afr_frame_return (frame); if (call_count == 0) AFR_STACK_UNWIND (statfs, frame, local->op_ret, local->op_errno, - &local->cont.statfs.buf, xdata); + &local->cont.statfs.buf, local->xdata_rsp); return 0; } -int32_t -afr_statfs (call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *xdata) +int +afr_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { - afr_private_t * priv = NULL; - int child_count = 0; afr_local_t * local = NULL; + afr_private_t *priv = NULL; int i = 0; - int ret = -1; int call_count = 0; - int32_t op_errno = 0; - - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - VALIDATE_OR_GOTO (loc, out); + int32_t op_errno = ENOMEM; - priv = this->private; - child_count = priv->child_count; - - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; + priv = this->private; - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; call_count = local->call_count; + if (!call_count) { + op_errno = ENOTCONN; + goto out; + } - for (i = 0; i < child_count; i++) { + for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { STACK_WIND (frame, afr_statfs_cbk, priv->children[i], @@ -3587,10 +2879,10 @@ afr_statfs (call_frame_t *frame, xlator_t *this, } } - ret = 0; + return 0; out: - if (ret < 0) - AFR_STACK_UNWIND (statfs, frame, -1, op_errno, NULL, NULL); + AFR_STACK_UNWIND (statfs, frame, -1, op_errno, NULL, NULL); + return 0; } @@ -3699,21 +2991,6 @@ afr_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, AFR_STACK_UNWIND (lk, frame, -1, ENOTCONN, &local->cont.lk.ret_flock, NULL); } else { - /* locking has succeeded on all nodes that are up */ - - /* temporarily - ret = afr_mark_locked_nodes (this, local->fd, - local->cont.lk.locked_nodes); - if (ret) - gf_log (this->name, GF_LOG_DEBUG, - "Could not save locked nodes info in fdctx"); - - ret = afr_save_locked_fd (this, local->fd); - if (ret) - gf_log (this->name, GF_LOG_DEBUG, - "Could not save locked fd"); - - */ AFR_STACK_UNWIND (lk, frame, local->op_ret, local->op_errno, &local->cont.lk.ret_flock, NULL); } @@ -3729,20 +3006,12 @@ afr_lk (call_frame_t *frame, xlator_t *this, afr_private_t *priv = NULL; afr_local_t *local = NULL; int i = 0; - int32_t op_errno = 0; - int ret = -1; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + int32_t op_errno = ENOMEM; priv = this->private; - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) goto out; local->cont.lk.locked_nodes = GF_CALLOC (priv->child_count, @@ -3764,28 +3033,16 @@ afr_lk (call_frame_t *frame, xlator_t *this, priv->children[i]->fops->lk, fd, cmd, flock, xdata); - ret = 0; + return 0; out: - if (ret < 0) - AFR_STACK_UNWIND (lk, frame, -1, op_errno, NULL, NULL); + AFR_STACK_UNWIND (lk, frame, -1, op_errno, NULL, NULL); + return 0; } int afr_forget (xlator_t *this, inode_t *inode) { - uint64_t ctx_addr = 0; - afr_inode_ctx_t *ctx = NULL; - - inode_ctx_get (inode, this, &ctx_addr); - - if (!ctx_addr) - goto out; - - ctx = (afr_inode_ctx_t *)(long)ctx_addr; - GF_FREE (ctx->fresh_children); - GF_FREE (ctx); -out: return 0; } @@ -3805,7 +3062,6 @@ afr_priv_dump (xlator_t *this) snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.%s", this->type, this->name); gf_proc_dump_add_section(key_prefix); gf_proc_dump_write("child_count", "%u", priv->child_count); - gf_proc_dump_write("read_child_rr", "%u", priv->read_child_rr); for (i = 0; i < priv->child_count; i++) { sprintf (key, "child_up[%d]", i); gf_proc_dump_write(key, "%d", priv->child_up[i]); @@ -3862,7 +3118,7 @@ afr_notify (xlator_t *this, int32_t event, int idx = -1; int ret = -1; int call_psh = 0; - int up_child = AFR_ALL_CHILDREN; + int up_child = -1; dict_t *input = NULL; dict_t *output = NULL; @@ -3914,6 +3170,7 @@ afr_notify (xlator_t *this, int32_t event, */ if (priv->child_up[idx] != 1) { priv->up_count++; + priv->event_generation++; } priv->child_up[idx] = 1; @@ -3953,6 +3210,7 @@ afr_notify (xlator_t *this, int32_t event, */ if (priv->child_up[idx] == 1) { priv->down_count++; + priv->event_generation++; } priv->child_up[idx] = 0; @@ -4019,8 +3277,7 @@ afr_notify (xlator_t *this, int32_t event, LOCK (&priv->lock); { - up_children = afr_up_children_count (priv->child_up, - priv->child_count); + up_children = AFR_COUNT (priv->child_up, priv->child_count); for (i = 0; i < priv->child_count; i++) { if (priv->last_event[i] == GF_EVENT_CHILD_UP) { event = GF_EVENT_CHILD_UP; @@ -4040,39 +3297,23 @@ afr_notify (xlator_t *this, int32_t event, ret = 0; if (propagate) ret = default_notify (this, event, data); - if (call_psh && priv->shd.iamshd) - afr_proactive_self_heal ((void*) (long) up_child); + if (call_psh && priv->shd.iamshd) { + afr_selfheal_childup (this, up_child); + } out: return ret; } -int -afr_first_up_child (unsigned char *child_up, size_t child_count) -{ - int ret = -1; - int i = 0; - - GF_ASSERT (child_up); - - for (i = 0; i < child_count; i++) { - if (child_up[i]) { - ret = i; - break; - } - } - - return ret; -} int afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno) { - int ret = -1; - local->op_ret = -1; local->op_errno = EUCLEAN; + syncbarrier_init (&local->barrier); + local->child_up = GF_CALLOC (priv->child_count, sizeof (*local->child_up), gf_afr_mt_char); @@ -4084,38 +3325,42 @@ afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno) memcpy (local->child_up, priv->child_up, sizeof (*local->child_up) * priv->child_count); - local->call_count = afr_up_children_count (local->child_up, - priv->child_count); + local->call_count = AFR_COUNT (local->child_up, priv->child_count); if (local->call_count == 0) { gf_log (THIS->name, GF_LOG_INFO, "no subvolumes up"); if (op_errno) *op_errno = ENOTCONN; goto out; } + local->event_generation = priv->event_generation; - local->child_errno = GF_CALLOC (priv->child_count, - sizeof (*local->child_errno), - gf_afr_mt_int32_t); - if (!local->child_errno) { - if (op_errno) - *op_errno = ENOMEM; - goto out; - } + local->read_attempted = GF_CALLOC (priv->child_count, sizeof (char), + gf_afr_mt_char); + if (!local->read_attempted) { + if (op_errno) + *op_errno = ENOMEM; + goto out; + } - local->transaction.postop_piggybacked = GF_CALLOC (priv->child_count, - sizeof (int), - gf_afr_mt_int32_t); - if (!local->transaction.postop_piggybacked) { - if (op_errno) - *op_errno = ENOMEM; - goto out; - } + local->readable = GF_CALLOC (priv->child_count, sizeof (char), + gf_afr_mt_char); + if (!local->readable) { + if (op_errno) + *op_errno = ENOMEM; + goto out; + } - local->append_write = _gf_false; + local->replies = GF_CALLOC(priv->child_count, sizeof(*local->replies), + gf_afr_mt_reply_t); + if (!local->replies) { + if (op_errno) + *op_errno = ENOMEM; + goto out; + } - ret = 0; + return 0; out: - return ret; + return -1; } int @@ -4218,13 +3463,11 @@ afr_transaction_local_init (afr_local_t *local, xlator_t *this) } ret = -ENOMEM; - child_up_count = afr_up_children_count (local->child_up, - priv->child_count); + child_up_count = AFR_COUNT (local->child_up, priv->child_count); if (priv->optimistic_change_log && child_up_count == priv->child_count) local->optimistic_change_log = 1; - local->first_up_child = afr_first_up_child (local->child_up, - priv->child_count); + local->pre_op_compat = priv->pre_op_compat; local->transaction.eager_lock = GF_CALLOC (sizeof (*local->transaction.eager_lock), @@ -4234,26 +3477,29 @@ afr_transaction_local_init (afr_local_t *local, xlator_t *this) if (!local->transaction.eager_lock) goto out; - local->fresh_children = afr_children_create (priv->child_count); - if (!local->fresh_children) - goto out; - local->transaction.pre_op = GF_CALLOC (sizeof (*local->transaction.pre_op), priv->child_count, gf_afr_mt_char); if (!local->transaction.pre_op) goto out; + local->transaction.fop_subvols = GF_CALLOC (sizeof (*local->transaction.fop_subvols), + priv->child_count, + gf_afr_mt_char); + if (!local->transaction.fop_subvols) + goto out; + + local->transaction.failed_subvols = GF_CALLOC (sizeof (*local->transaction.failed_subvols), + priv->child_count, + gf_afr_mt_char); + if (!local->transaction.failed_subvols) + goto out; + local->pending = afr_matrix_create (priv->child_count, AFR_NUM_CHANGE_LOGS); if (!local->pending) goto out; - local->transaction.txn_changelog = afr_matrix_create (priv->child_count, - AFR_NUM_CHANGE_LOGS); - if (!local->transaction.txn_changelog) - goto out; - INIT_LIST_HEAD (&local->transaction.eager_locked); ret = 0; @@ -4261,86 +3507,6 @@ out: return ret; } -void -afr_reset_children (int32_t *fresh_children, int32_t child_count) -{ - unsigned int i = 0; - for (i = 0; i < child_count; i++) - fresh_children[i] = -1; -} - -int32_t* -afr_children_create (int32_t child_count) -{ - int32_t *children = NULL; - int i = 0; - - GF_ASSERT (child_count > 0); - - children = GF_CALLOC (child_count, sizeof (*children), - gf_afr_mt_int32_t); - if (NULL == children) - goto out; - for (i = 0; i < child_count; i++) - children[i] = -1; -out: - return children; -} - -void -afr_children_add_child (int32_t *children, int32_t child, - int32_t child_count) -{ - gf_boolean_t child_found = _gf_false; - int i = 0; - - for (i = 0; i < child_count; i++) { - if (children[i] == -1) - break; - if (children[i] == child) { - child_found = _gf_true; - break; - } - } - - if (!child_found) { - GF_ASSERT (i < child_count); - children[i] = child; - } -} - -void -afr_children_rm_child (int32_t *children, int32_t child, int32_t child_count) -{ - int i = 0; - - GF_ASSERT ((child >= 0) && (child < child_count)); - for (i = 0; i < child_count; i++) { - if (children[i] == -1) - break; - if (children[i] == child) { - if (i != (child_count - 1)) - memmove (children + i, children + i + 1, - sizeof (*children)*(child_count - i - 1)); - children[child_count - 1] = -1; - break; - } - } -} - -int -afr_get_children_count (int32_t *children, unsigned int child_count) -{ - int count = 0; - int i = 0; - - for (i = 0; i < child_count; i++) { - if (children[i] == -1) - break; - count++; - } - return count; -} void afr_set_low_priority (call_frame_t *frame) @@ -4348,38 +3514,6 @@ afr_set_low_priority (call_frame_t *frame) frame->root->pid = LOW_PRIO_PROC_PID; } -int -afr_child_fd_ctx_set (xlator_t *this, fd_t *fd, int32_t child, - int flags) -{ - int ret = 0; - uint64_t ctx = 0; - afr_fd_ctx_t *fd_ctx = NULL; - - GF_ASSERT (fd && fd->inode); - ret = afr_fd_ctx_set (this, fd); - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, - "could not set fd ctx for fd=%p", fd); - goto out; - } - - ret = fd_ctx_get (fd, this, &ctx); - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, - "could not get fd ctx for fd=%p", fd); - goto out; - } - - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - fd_ctx->opened_on[child] = AFR_FD_OPENED; - if (!IA_ISDIR (fd->inode->ia_type)) { - fd_ctx->flags = flags; - } - ret = 0; -out: - return ret; -} gf_boolean_t afr_have_quorum (char *logname, afr_private_t *priv) @@ -4426,33 +3560,6 @@ afr_priv_destroy (afr_private_t *priv) if (!priv) goto out; inode_unref (priv->root_inode); - GF_FREE (priv->shd.pos); - GF_FREE (priv->shd.pending); - GF_FREE (priv->shd.inprogress); -// for (i = 0; i < priv->child_count; i++) -// if (priv->shd.timer && priv->shd.timer[i]) -// gf_timer_call_cancel (this->ctx, priv->shd.timer[i]); - GF_FREE (priv->shd.timer); - - if (priv->shd.healed) - eh_destroy (priv->shd.healed); - - if (priv->shd.heal_failed) - eh_destroy (priv->shd.heal_failed); - - if (priv->shd.split_brain) - eh_destroy (priv->shd.split_brain); - - for (i = 0; i < priv->child_count; i++) - { - if (priv->shd.statistics[i]) - eh_destroy (priv->shd.statistics[i]); - } - - GF_FREE (priv->shd.statistics); - - GF_FREE (priv->shd.crawl_events); - GF_FREE (priv->last_event); if (priv->pending_key) { for (i = 0; i < priv->child_count; i++) @@ -4462,8 +3569,7 @@ afr_priv_destroy (afr_private_t *priv) GF_FREE (priv->children); GF_FREE (priv->child_up); LOCK_DESTROY (&priv->lock); - LOCK_DESTROY (&priv->read_child_lock); - pthread_mutex_destroy (&priv->mutex); + GF_FREE (priv); out: return; @@ -4480,124 +3586,21 @@ xlator_subvolume_count (xlator_t *this) return i; } -inline gf_boolean_t -afr_is_errno_set (int *child_errno, int child) -{ - return child_errno[child]; -} - -inline gf_boolean_t -afr_is_errno_unset (int *child_errno, int child) -{ - return !afr_is_errno_set (child_errno, child); -} - -void -afr_prepare_new_entry_pending_matrix (int32_t **pending, - gf_boolean_t (*is_pending) (int *, int), - int *ctx, struct iatt *buf, - unsigned int child_count) -{ - int midx = 0; - int idx = 0; - int i = 0; - - midx = afr_index_for_transaction_type (AFR_METADATA_TRANSACTION); - if (IA_ISDIR (buf->ia_type)) - idx = afr_index_for_transaction_type (AFR_ENTRY_TRANSACTION); - else if (IA_ISREG (buf->ia_type)) - idx = afr_index_for_transaction_type (AFR_DATA_TRANSACTION); - else - idx = -1; - for (i = 0; i < child_count; i++) { - if (is_pending (ctx, i)) { - pending[i][midx] = hton32 (1); - if (idx == -1) - continue; - pending[i][idx] = hton32 (1); - } - } -} - -gf_boolean_t -afr_is_fd_fixable (fd_t *fd) -{ - if (!fd || !fd->inode) - return _gf_false; - else if (fd_is_anonymous (fd)) - return _gf_false; - else if (uuid_is_null (fd->inode->gfid)) - return _gf_false; - - return _gf_true; -} void afr_handle_open_fd_count (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; - inode_t *inode = NULL; - afr_inode_ctx_t *ctx = NULL; + afr_fd_ctx_t *fd_ctx = NULL; local = frame->local; - if (local->fd) - inode = local->fd->inode; - else - inode = local->loc.inode; - - if (!inode) - return; - - LOCK (&inode->lock); - { - ctx = __afr_inode_ctx_get (inode, this); - ctx->open_fd_count = local->open_fd_count; - } - UNLOCK (&inode->lock); -} - -int -afr_initialise_statistics (xlator_t *this) -{ - afr_private_t *priv = NULL; - int ret = -1; - int i = 0; - int child_count = 0; - eh_t *stats_per_brick = NULL; - shd_crawl_event_t ***shd_crawl_events = NULL; - priv = this->private; - - priv->shd.statistics = GF_CALLOC (sizeof(eh_t *), priv->child_count, - gf_common_mt_eh_t); - if (!priv->shd.statistics) { - ret = -1; - goto out; - } - child_count = priv->child_count; - for (i=0; i < child_count ; i++) { - stats_per_brick = eh_new (AFR_STATISTICS_HISTORY_SIZE, - _gf_false, - _destroy_crawl_event_data); - if (!stats_per_brick) { - ret = -1; - goto out; - } - priv->shd.statistics[i] = stats_per_brick; - - } - - shd_crawl_events = (shd_crawl_event_t***)(&priv->shd.crawl_events); - *shd_crawl_events = GF_CALLOC (sizeof(shd_crawl_event_t*), - priv->child_count, - gf_afr_mt_shd_crawl_event_t); + if (!local->fd) + return; - if (!priv->shd.crawl_events) { - ret = -1; - goto out; - } - ret = 0; -out: - return ret; + fd_ctx = afr_fd_ctx_get (local->fd, this); + if (!fd_ctx) + return; + fd_ctx->open_fd_count = local->open_fd_count; } -- cgit