Diffstat (limited to 'xlators/cluster/afr/src/afr-common.c')
-rw-r--r--   xlators/cluster/afr/src/afr-common.c | 1075
1 file changed, 694 insertions(+), 381 deletions(-)
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index 09b0195cd..af01f2ef2 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -49,10 +49,9 @@ #include "afr-self-heald.h" #include "pump.h" -#define AFR_ICTX_OPENDIR_DONE_MASK 0x0000000200000000ULL -#define AFR_ICTX_SPLIT_BRAIN_MASK 0x0000000100000000ULL +#define AFR_ICTX_OPENDIR_DONE_MASK 0x0000000100000000ULL #define AFR_ICTX_READ_CHILD_MASK 0x00000000FFFFFFFFULL - +#define AFR_STATISTICS_HISTORY_SIZE 50 int afr_lookup_done_success_action (call_frame_t *frame, xlator_t *this, gf_boolean_t fail_conflict); @@ -203,60 +202,86 @@ out: return ret; } -afr_inode_ctx_t* -afr_inode_ctx_get_from_addr (uint64_t addr, int32_t child_count) +void +afr_inode_ctx_destroy (afr_inode_ctx_t *ctx) { - int ret = -1; - afr_inode_ctx_t *ctx = NULL; - size_t size = 0; + if (!ctx) + return; + GF_FREE (ctx->fresh_children); + GF_FREE (ctx); +} - GF_ASSERT (child_count > 0); +afr_inode_ctx_t* +__afr_inode_ctx_get (inode_t *inode, xlator_t *this) +{ + int ret = 0; + uint64_t ctx_addr = 0; + afr_inode_ctx_t *ctx = NULL; + afr_private_t *priv = NULL; - if (!addr) { - ctx = GF_CALLOC (1, sizeof (*ctx), - gf_afr_mt_inode_ctx_t); - if (!ctx) - goto out; - size = sizeof (*ctx->fresh_children); - ctx->fresh_children = GF_CALLOC (child_count, size, - gf_afr_mt_int32_t); - if (!ctx->fresh_children) - goto out; - } else { - ctx = (afr_inode_ctx_t*) (long) addr; + priv = this->private; + ret = __inode_ctx_get (inode, this, &ctx_addr); + if (ret < 0) + ctx_addr = 0; + if (ctx_addr != 0) { + ctx = (afr_inode_ctx_t*) (long) ctx_addr; + goto out; } - ret = 0; + ctx = GF_CALLOC (1, sizeof (*ctx), + gf_afr_mt_inode_ctx_t); + if (!ctx) + goto fail; + ctx->fresh_children = GF_CALLOC (priv->child_count, + sizeof (*ctx->fresh_children), + gf_afr_mt_int32_t); + if (!ctx->fresh_children) + goto fail; + ret = __inode_ctx_put (inode, this, (uint64_t)ctx); + if (ret) { + gf_log_callingfn (this->name, GF_LOG_ERROR, "failed to " + "set the inode ctx (%s)", + uuid_utoa (inode->gfid)); + goto fail; + } + out: - if (ret && ctx) { - if (ctx->fresh_children) - GF_FREE (ctx->fresh_children); - GF_FREE (ctx); - ctx = NULL; + return ctx; + +fail: + afr_inode_ctx_destroy (ctx); + return NULL; +} + +afr_inode_ctx_t* +afr_inode_ctx_get (inode_t *inode, xlator_t *this) +{ + afr_inode_ctx_t *ctx = NULL; + + LOCK (&inode->lock); + { + ctx = __afr_inode_ctx_get (inode, this); } + UNLOCK (&inode->lock); return ctx; } void -afr_inode_get_ctx (xlator_t *this, inode_t *inode, afr_inode_params_t *params) +afr_inode_get_ctx_params (xlator_t *this, inode_t *inode, + afr_inode_params_t *params) { GF_ASSERT (inode); GF_ASSERT (params); - int ret = 0; afr_inode_ctx_t *ctx = NULL; afr_private_t *priv = NULL; int i = 0; - uint64_t ctx_addr = 0; int32_t read_child = -1; int32_t *fresh_children = NULL; priv = this->private; LOCK (&inode->lock); { - ret = __inode_ctx_get (inode, this, &ctx_addr); - if (ret < 0) - goto unlock; - ctx = afr_inode_ctx_get_from_addr (ctx_addr, priv->child_count); + ctx = __afr_inode_ctx_get (inode, this); if (!ctx) goto unlock; switch (params->op) { @@ -275,12 +300,6 @@ afr_inode_get_ctx (xlator_t *this, inode_t *inode, afr_inode_params_t *params) if (ctx->masks & AFR_ICTX_OPENDIR_DONE_MASK) params->u.value = _gf_true; break; - case AFR_INODE_GET_SPLIT_BRAIN: - params->u.value = _gf_false; - if (ctx->masks & AFR_ICTX_SPLIT_BRAIN_MASK) - params->u.value = _gf_true; - ; - break; default: GF_ASSERT (0); break; @@ 
-293,11 +312,16 @@ unlock: gf_boolean_t afr_is_split_brain (xlator_t *this, inode_t *inode) { - afr_inode_params_t params = {0}; + afr_inode_ctx_t *ctx = NULL; + gf_boolean_t spb = _gf_false; - params.op = AFR_INODE_GET_SPLIT_BRAIN; - afr_inode_get_ctx (this, inode, ¶ms); - return params.u.value; + ctx = afr_inode_ctx_get (inode, this); + if (!ctx) + goto out; + if ((ctx->mdata_spb == SPB) || (ctx->data_spb == SPB)) + spb = _gf_true; +out: + return spb; } gf_boolean_t @@ -306,11 +330,10 @@ afr_is_opendir_done (xlator_t *this, inode_t *inode) afr_inode_params_t params = {0}; params.op = AFR_INODE_GET_OPENDIR_DONE; - afr_inode_get_ctx (this, inode, ¶ms); + afr_inode_get_ctx_params (this, inode, ¶ms); return params.u.value; } - int32_t afr_inode_get_read_ctx (xlator_t *this, inode_t *inode, int32_t *fresh_children) { @@ -318,7 +341,7 @@ afr_inode_get_read_ctx (xlator_t *this, inode_t *inode, int32_t *fresh_children) params.op = AFR_INODE_GET_READ_CTX; params.u.read_ctx.children = fresh_children; - afr_inode_get_ctx (this, inode, ¶ms); + afr_inode_get_ctx_params (this, inode, ¶ms); return params.u.read_ctx.read_child; } @@ -380,31 +403,14 @@ afr_inode_ctx_set_opendir_done (afr_inode_ctx_t *ctx) } void -afr_inode_ctx_set_splitbrain (afr_inode_ctx_t *ctx, gf_boolean_t set) -{ - uint64_t remaining_mask = 0; - uint64_t mask = 0; - - if (set) { - remaining_mask = (~AFR_ICTX_SPLIT_BRAIN_MASK & ctx->masks); - mask = (0xFFFFFFFFFFFFFFFFULL & AFR_ICTX_SPLIT_BRAIN_MASK); - ctx->masks = remaining_mask | mask; - } else { - ctx->masks = (~AFR_ICTX_SPLIT_BRAIN_MASK & ctx->masks); - } -} - -void -afr_inode_set_ctx (xlator_t *this, inode_t *inode, afr_inode_params_t *params) +afr_inode_set_ctx_params (xlator_t *this, inode_t *inode, + afr_inode_params_t *params) { GF_ASSERT (inode); GF_ASSERT (params); - int ret = 0; afr_inode_ctx_t *ctx = NULL; afr_private_t *priv = NULL; - uint64_t ctx_addr = 0; - gf_boolean_t set = _gf_false; int32_t read_child = -1; int32_t *fresh_children = NULL; int32_t *stale_children = NULL; @@ -412,10 +418,7 @@ afr_inode_set_ctx (xlator_t *this, inode_t *inode, afr_inode_params_t *params) priv = this->private; LOCK (&inode->lock); { - ret = __inode_ctx_get (inode, this, &ctx_addr); - if (ret < 0) - ctx_addr = 0; - ctx = afr_inode_ctx_get_from_addr (ctx_addr, priv->child_count); + ctx = __afr_inode_ctx_get (inode, this); if (!ctx) goto unlock; switch (params->op) { @@ -435,33 +438,26 @@ afr_inode_set_ctx (xlator_t *this, inode_t *inode, afr_inode_params_t *params) case AFR_INODE_SET_OPENDIR_DONE: afr_inode_ctx_set_opendir_done (ctx); break; - case AFR_INODE_SET_SPLIT_BRAIN: - set = params->u.value; - afr_inode_ctx_set_splitbrain (ctx, set); - break; default: GF_ASSERT (0); break; } - ret = __inode_ctx_put (inode, this, (uint64_t)ctx); - if (ret) { - gf_log_callingfn (this->name, GF_LOG_ERROR, "failed to " - "set the inode ctx (%s)", - uuid_utoa (inode->gfid)); - } } unlock: UNLOCK (&inode->lock); } void -afr_set_split_brain (xlator_t *this, inode_t *inode, gf_boolean_t set) +afr_set_split_brain (xlator_t *this, inode_t *inode, afr_spb_state_t mdata_spb, + afr_spb_state_t data_spb) { - afr_inode_params_t params = {0}; + afr_inode_ctx_t *ctx = NULL; - params.op = AFR_INODE_SET_SPLIT_BRAIN; - params.u.value = set; - afr_inode_set_ctx (this, inode, ¶ms); + ctx = afr_inode_ctx_get (inode, this); + if (mdata_spb != DONT_KNOW) + ctx->mdata_spb = mdata_spb; + if (data_spb != DONT_KNOW) + ctx->data_spb = data_spb; } void @@ -470,7 +466,7 @@ afr_set_opendir_done (xlator_t *this, inode_t *inode) 
afr_inode_params_t params = {0}; params.op = AFR_INODE_SET_OPENDIR_DONE; - afr_inode_set_ctx (this, inode, ¶ms); + afr_inode_set_ctx_params (this, inode, ¶ms); } void @@ -489,7 +485,7 @@ afr_inode_set_read_ctx (xlator_t *this, inode_t *inode, int32_t read_child, params.op = AFR_INODE_SET_READ_CTX; params.u.read_ctx.read_child = read_child; params.u.read_ctx.children = fresh_children; - afr_inode_set_ctx (this, inode, ¶ms); + afr_inode_set_ctx_params (this, inode, ¶ms); } void @@ -502,7 +498,7 @@ afr_inode_rm_stale_children (xlator_t *this, inode_t *inode, params.op = AFR_INODE_RM_STALE_CHILDREN; params.u.read_ctx.children = stale_children; - afr_inode_set_ctx (this, inode, ¶ms); + afr_inode_set_ctx_params (this, inode, ¶ms); } gf_boolean_t @@ -571,6 +567,7 @@ afr_hash_child (int32_t *success_children, int32_t child_count, unsigned int hmode, uuid_t gfid) { uuid_t gfid_copy = {0,}; + pid_t pid; if (!hmode) { return -1; @@ -589,7 +586,8 @@ afr_hash_child (int32_t *success_children, int32_t child_count, * perfection here. All we need is a low probability that * multiple clients won't converge on the same subvolume. */ - *((pid_t *)gfid_copy) = getpid(); + pid = getpid(); + memcpy (gfid_copy, &pid, sizeof(pid)); } return SuperFastHash((char *)gfid_copy, @@ -610,12 +608,12 @@ afr_select_read_child_from_policy (int32_t *success_children, GF_ASSERT (success_children); - read_child = prev_read_child; + read_child = config_read_child; if (afr_is_read_child (success_children, sources, child_count, read_child)) goto out; - read_child = config_read_child; + read_child = prev_read_child; if (afr_is_read_child (success_children, sources, child_count, read_child)) goto out; @@ -766,6 +764,13 @@ out: } void +afr_xattr_array_destroy (dict_t **xattr, unsigned int child_count) +{ + afr_reset_xattr (xattr, child_count); + GF_FREE (xattr); +} + +void afr_local_sh_cleanup (afr_local_t *local, xlator_t *this) { afr_self_heal_t *sh = NULL; @@ -774,60 +779,51 @@ afr_local_sh_cleanup (afr_local_t *local, xlator_t *this) sh = &local->self_heal; priv = this->private; - if (sh->buf) - GF_FREE (sh->buf); + if (sh->data_sh_info && strcmp (sh->data_sh_info, "")) + GF_FREE (sh->data_sh_info); - if (sh->parentbufs) - GF_FREE (sh->parentbufs); + if (sh->metadata_sh_info && strcmp (sh->metadata_sh_info, "")) + GF_FREE (sh->metadata_sh_info); + + GF_FREE (sh->buf); + + GF_FREE (sh->parentbufs); if (sh->inode) inode_unref (sh->inode); - if (sh->xattr) { - afr_reset_xattr (sh->xattr, priv->child_count); - GF_FREE (sh->xattr); - } + afr_xattr_array_destroy (sh->xattr, priv->child_count); - if (sh->child_errno) - GF_FREE (sh->child_errno); + GF_FREE (sh->child_errno); afr_matrix_cleanup (sh->pending_matrix, priv->child_count); afr_matrix_cleanup (sh->delta_matrix, priv->child_count); - if (sh->sources) - GF_FREE (sh->sources); + GF_FREE (sh->sources); - if (sh->success) - GF_FREE (sh->success); + GF_FREE (sh->success); - if (sh->locked_nodes) - GF_FREE (sh->locked_nodes); + GF_FREE (sh->locked_nodes); if (sh->healing_fd) { fd_unref (sh->healing_fd); sh->healing_fd = NULL; } - if (sh->linkname) - GF_FREE ((char *)sh->linkname); + GF_FREE ((char *)sh->linkname); - if (sh->success_children) - GF_FREE (sh->success_children); + GF_FREE (sh->success_children); - if (sh->fresh_children) - GF_FREE (sh->fresh_children); + GF_FREE (sh->fresh_children); - if (sh->fresh_parent_dirs) - GF_FREE (sh->fresh_parent_dirs); + GF_FREE (sh->fresh_parent_dirs); loc_wipe (&sh->parent_loc); loc_wipe (&sh->lookup_loc); - if (sh->checksum) - GF_FREE 
(sh->checksum); + GF_FREE (sh->checksum); - if (sh->write_needed) - GF_FREE (sh->write_needed); + GF_FREE (sh->write_needed); if (sh->healing_fd) fd_unref (sh->healing_fd); } @@ -836,7 +832,8 @@ afr_local_sh_cleanup (afr_local_t *local, xlator_t *this) void afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this) { - afr_private_t * priv = NULL; + afr_private_t *priv = NULL; + int i = 0; priv = this->private; @@ -844,18 +841,15 @@ afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this) afr_matrix_cleanup (local->transaction.txn_changelog, priv->child_count); - if (local->internal_lock.locked_nodes) - GF_FREE (local->internal_lock.locked_nodes); - - if (local->internal_lock.inode_locked_nodes) - GF_FREE (local->internal_lock.inode_locked_nodes); + GF_FREE (local->internal_lock.locked_nodes); - if (local->internal_lock.entry_locked_nodes) - GF_FREE (local->internal_lock.entry_locked_nodes); + for (i = 0; local->internal_lock.inodelk[i].domain; i++) { + GF_FREE (local->internal_lock.inodelk[i].locked_nodes); + } - if (local->internal_lock.lower_locked_nodes) - GF_FREE (local->internal_lock.lower_locked_nodes); + GF_FREE (local->internal_lock.lower_locked_nodes); + afr_entry_lockee_cleanup (&local->internal_lock); GF_FREE (local->transaction.pre_op); GF_FREE (local->transaction.eager_lock); @@ -865,6 +859,8 @@ afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this) loc_wipe (&local->transaction.parent_loc); loc_wipe (&local->transaction.new_parent_loc); + + GF_FREE (local->transaction.postop_piggybacked); } @@ -894,17 +890,13 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this) if (local->dict) dict_unref (local->dict); - if (local->child_up) - GF_FREE (local->child_up); + GF_FREE(local->replies); - if (local->child_errno) - GF_FREE (local->child_errno); + GF_FREE (local->child_up); - if (local->fresh_children) - GF_FREE (local->fresh_children); + GF_FREE (local->child_errno); - if (local->fd_open_on) - GF_FREE (local->fd_open_on); + GF_FREE (local->fresh_children); { /* lookup */ if (local->cont.lookup.xattrs) { @@ -922,27 +914,23 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this) inode_unref (local->cont.lookup.inode); } - if (local->cont.lookup.postparents) - GF_FREE (local->cont.lookup.postparents); + GF_FREE (local->cont.lookup.postparents); - if (local->cont.lookup.bufs) - GF_FREE (local->cont.lookup.bufs); + GF_FREE (local->cont.lookup.bufs); - if (local->cont.lookup.success_children) - GF_FREE (local->cont.lookup.success_children); + GF_FREE (local->cont.lookup.success_children); - if (local->cont.lookup.sources) - GF_FREE (local->cont.lookup.sources); + GF_FREE (local->cont.lookup.sources); + afr_matrix_cleanup (local->cont.lookup.pending_matrix, + priv->child_count); } { /* getxattr */ - if (local->cont.getxattr.name) - GF_FREE (local->cont.getxattr.name); + GF_FREE (local->cont.getxattr.name); } { /* lk */ - if (local->cont.lk.locked_nodes) - GF_FREE (local->cont.lk.locked_nodes); + GF_FREE (local->cont.lk.locked_nodes); } { /* create */ @@ -997,8 +985,7 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this) } { /* opendir */ - if (local->cont.opendir.checksum) - GF_FREE (local->cont.opendir.checksum); + GF_FREE (local->cont.opendir.checksum); } { /* readdirp */ @@ -1093,6 +1080,88 @@ afr_update_loc_gfids (loc_t *loc, struct iatt *buf, struct iatt *postparent) uuid_copy (loc->pargfid, postparent->ia_gfid); } +/* + * Quota size xattrs are not maintained by afr. 
There is a + * possibility that they differ even when both the directory changelog xattrs + * suggest everything is fine. So if there is at least one 'source' check among + * the sources which has the maximum quota size. Otherwise check among all the + * available ones for maximum quota size. This way if there is a source and + * stale copies it always votes for the 'source'. + * */ + +static void +afr_handle_quota_size (afr_local_t *local, xlator_t *this, + dict_t *rsp_dict) +{ + int32_t *sources = NULL; + dict_t *xattr = NULL; + data_t *max_data = NULL; + int64_t max_quota_size = -1; + data_t *data = NULL; + int64_t *size = NULL; + int64_t quota_size = -1; + afr_private_t *priv = NULL; + int i = 0; + int ret = -1; + gf_boolean_t source_present = _gf_false; + + priv = this->private; + sources = local->cont.lookup.sources; + + if (rsp_dict == NULL) { + gf_log_callingfn (this->name, GF_LOG_ERROR, "%s: Invalid " + "response dictionary", local->loc.path); + return; + } + + for (i = 0; i < priv->child_count; i++) { + if (sources[i]) { + source_present = _gf_true; + break; + } + } + + for (i = 0; i < priv->child_count; i++) { + /* + * If there is at least one source lets check + * for maximum quota sizes among sources, otherwise take the + * maximum of the ones present to be on the safer side. + */ + if (source_present && !sources[i]) + continue; + + xattr = local->cont.lookup.xattrs[i]; + if (!xattr) + continue; + + data = dict_get (xattr, QUOTA_SIZE_KEY); + if (!data) + continue; + + size = (int64_t*)data->data; + quota_size = ntoh64(*size); + gf_log (this->name, GF_LOG_DEBUG, "%s: %d, size: %"PRId64, + local->loc.path, i, quota_size); + if (quota_size > max_quota_size) { + if (max_data) + data_unref (max_data); + + max_quota_size = quota_size; + max_data = data_ref (data); + } + } + + if (max_data) { + ret = dict_set (rsp_dict, QUOTA_SIZE_KEY, max_data); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "%s: Failed to set " + "quota size", local->loc.path); + } + + data_unref (max_data); + } +} + int afr_lookup_build_response_params (afr_local_t *local, xlator_t *this) { @@ -1137,13 +1206,18 @@ afr_lookup_build_response_params (afr_local_t *local, xlator_t *this) ret = -1; goto out; } + gf_log (this->name, GF_LOG_DEBUG, "Building lookup response from %d", read_child); if (!*xattr) *xattr = dict_ref (local->cont.lookup.xattrs[read_child]); + *buf = local->cont.lookup.bufs[read_child]; *postparent = local->cont.lookup.postparents[read_child]; + if (dict_get (local->xattr_req, QUOTA_SIZE_KEY)) + afr_handle_quota_size (local, this, *xattr); + if (IA_INVAL == local->cont.lookup.inode->ia_type) { /* fix for RT #602 */ local->cont.lookup.inode->ia_type = buf->ia_type; @@ -1181,41 +1255,97 @@ afr_lookup_update_lk_counts (afr_local_t *local, xlator_t *this, local->cont.lookup.parent_entrylk += parent_entrylk; } +/* + * It's important to maintain a commutative property on do_*_self_heal and + * found*; once set, they must not be cleared by a subsequent iteration or + * call, so that they represent a logical OR of all iterations and calls + * regardless of child/key order. That allows the caller to call us multiple + * times without having to use a separate variable as a "reduce" accumulator. 
+ */ static void afr_lookup_set_self_heal_params_by_xattr (afr_local_t *local, xlator_t *this, dict_t *xattr) { + afr_private_t *priv = NULL; + int i = 0; + int ret = -1; + void *pending_raw = NULL; + int32_t *pending = NULL; + GF_ASSERT (local); GF_ASSERT (this); GF_ASSERT (xattr); - if (afr_sh_has_metadata_pending (xattr, this)) { - local->self_heal.do_metadata_self_heal = _gf_true; - gf_log(this->name, GF_LOG_DEBUG, - "metadata self-heal is pending for %s.", - local->loc.path); - } + priv = this->private; - if (afr_sh_has_entry_pending (xattr, this)) { - local->self_heal.do_entry_self_heal = _gf_true; - gf_log(this->name, GF_LOG_DEBUG, - "entry self-heal is pending for %s.", local->loc.path); - } + for (i = 0; i < priv->child_count; i++) { + ret = dict_get_ptr (xattr, priv->pending_key[i], + &pending_raw); + if (ret != 0) { + continue; + } + pending = pending_raw; - if (afr_sh_has_data_pending (xattr, this)) { - local->self_heal.do_data_self_heal = _gf_true; - gf_log(this->name, GF_LOG_DEBUG, - "data self-heal is pending for %s.", local->loc.path); + if (pending[AFR_METADATA_TRANSACTION]) { + gf_log(this->name, GF_LOG_DEBUG, + "metadata self-heal is pending for %s.", + local->loc.path); + local->self_heal.do_metadata_self_heal = _gf_true; + } + + if (pending[AFR_ENTRY_TRANSACTION]) { + gf_log(this->name, GF_LOG_DEBUG, + "entry self-heal is pending for %s.", + local->loc.path); + local->self_heal.do_entry_self_heal = _gf_true; + } + + if (pending[AFR_DATA_TRANSACTION]) { + gf_log(this->name, GF_LOG_DEBUG, + "data self-heal is pending for %s.", + local->loc.path); + local->self_heal.do_data_self_heal = _gf_true; + } } } +void +afr_lookup_check_set_metadata_split_brain (afr_local_t *local, xlator_t *this) +{ + int32_t *sources = NULL; + afr_private_t *priv = NULL; + int32_t subvol_status = 0; + int32_t *success_children = NULL; + dict_t **xattrs = NULL; + struct iatt *bufs = NULL; + int32_t **pending_matrix = NULL; + + priv = this->private; + + sources = GF_CALLOC (priv->child_count, sizeof (*sources), + gf_afr_mt_int32_t); + if (NULL == sources) + goto out; + success_children = local->cont.lookup.success_children; + xattrs = local->cont.lookup.xattrs; + bufs = local->cont.lookup.bufs; + pending_matrix = local->cont.lookup.pending_matrix; + afr_build_sources (this, xattrs, bufs, pending_matrix, + sources, success_children, AFR_METADATA_TRANSACTION, + &subvol_status, _gf_false); + if (subvol_status & SPLIT_BRAIN) + local->cont.lookup.possible_spb = _gf_true; +out: + GF_FREE (sources); +} + static void afr_detect_self_heal_by_iatt (afr_local_t *local, xlator_t *this, struct iatt *buf, struct iatt *lookup_buf) { if (PERMISSION_DIFFERS (buf, lookup_buf)) { /* mismatching permissions */ - gf_log (this->name, GF_LOG_INFO, + gf_log (this->name, GF_LOG_DEBUG, "permissions differ for %s ", local->loc.path); local->self_heal.do_metadata_self_heal = _gf_true; } @@ -1223,27 +1353,45 @@ afr_detect_self_heal_by_iatt (afr_local_t *local, xlator_t *this, if (OWNERSHIP_DIFFERS (buf, lookup_buf)) { /* mismatching permissions */ local->self_heal.do_metadata_self_heal = _gf_true; - gf_log (this->name, GF_LOG_INFO, + gf_log (this->name, GF_LOG_DEBUG, "ownership differs for %s ", local->loc.path); } if (SIZE_DIFFERS (buf, lookup_buf) && IA_ISREG (buf->ia_type)) { - gf_log (this->name, GF_LOG_INFO, + gf_log (this->name, GF_LOG_DEBUG, "size differs for %s ", local->loc.path); local->self_heal.do_data_self_heal = _gf_true; } if (uuid_compare (buf->ia_gfid, lookup_buf->ia_gfid)) { /* mismatching gfid */ - gf_log 
(this->name, GF_LOG_WARNING, + gf_log (this->name, GF_LOG_DEBUG, "%s: gfid different on subvolume", local->loc.path); } } static void -afr_detect_self_heal_by_lookup_status (afr_local_t *local, xlator_t *this, - gf_boolean_t split_brain) +afr_detect_self_heal_by_split_brain_status (afr_local_t *local, xlator_t *this) +{ + gf_boolean_t split_brain = _gf_false; + afr_self_heal_t *sh = NULL; + + sh = &local->self_heal; + + split_brain = afr_is_split_brain (this, local->cont.lookup.inode); + split_brain = split_brain || local->cont.lookup.possible_spb; + if ((local->success_count > 0) && split_brain && + IA_ISREG (local->cont.lookup.inode->ia_type)) { + sh->force_confirm_spb = _gf_true; + gf_log (this->name, GF_LOG_DEBUG, + "split brain detected during lookup of %s.", + local->loc.path); + } +} + +static void +afr_detect_self_heal_by_lookup_status (afr_local_t *local, xlator_t *this) { GF_ASSERT (local); GF_ASSERT (this); @@ -1254,23 +1402,11 @@ afr_detect_self_heal_by_lookup_status (afr_local_t *local, xlator_t *this, local->self_heal.do_entry_self_heal = _gf_true; local->self_heal.do_gfid_self_heal = _gf_true; local->self_heal.do_missing_entry_self_heal = _gf_true; - gf_log(this->name, GF_LOG_INFO, + gf_log(this->name, GF_LOG_DEBUG, "entries are missing in lookup of %s.", local->loc.path); - //If all self-heals are needed no need to check for other rules - goto out; - } - - if ((local->success_count > 0) && split_brain && - IA_ISREG (local->cont.lookup.inode->ia_type)) { - local->self_heal.do_data_self_heal = _gf_true; - local->self_heal.do_gfid_self_heal = _gf_true; - gf_log (this->name, GF_LOG_WARNING, - "split brain detected during lookup of %s.", - local->loc.path); } -out: return; } @@ -1280,6 +1416,8 @@ afr_can_self_heal_proceed (afr_self_heal_t *sh, afr_private_t *priv) GF_ASSERT (sh); GF_ASSERT (priv); + if (sh->force_confirm_spb) + return _gf_true; return (sh->do_gfid_self_heal || sh->do_missing_entry_self_heal || (afr_data_self_heal_enabled (priv->data_self_heal) && @@ -1381,7 +1519,7 @@ afr_launch_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode, if (background) bg = "background"; - gf_log (this->name, GF_LOG_INFO, + gf_log (this->name, GF_LOG_DEBUG, "%s %s self-heal triggered. 
path: %s, reason: %s", bg, sh_type_str, local->loc.path, reason); @@ -1452,7 +1590,7 @@ afr_conflicting_iattrs (struct iatt *bufs, int32_t *success_children, child2 = &bufs[success_children[i-1]]; if (FILETYPE_DIFFERS (child1, child2)) { - gf_log (xlator_name, GF_LOG_WARNING, "%s: filetype " + gf_log (xlator_name, GF_LOG_DEBUG, "%s: filetype " "differs on subvolumes (%d, %d)", path, success_children[i-1], success_children[i]); conflicting = _gf_true; @@ -1461,7 +1599,7 @@ afr_conflicting_iattrs (struct iatt *bufs, int32_t *success_children, if (!gfid || uuid_is_null (child1->ia_gfid)) continue; if (uuid_compare (*gfid, child1->ia_gfid)) { - gf_log (xlator_name, GF_LOG_WARNING, "%s: gfid differs" + gf_log (xlator_name, GF_LOG_DEBUG, "%s: gfid differs" " on subvolume %d", path, success_children[i]); conflicting = _gf_true; goto out; @@ -1544,13 +1682,11 @@ afr_lookup_set_self_heal_params (afr_local_t *local, xlator_t *this) int32_t child1 = -1; int32_t child2 = -1; afr_self_heal_t *sh = NULL; - gf_boolean_t split_brain = _gf_false; priv = this->private; sh = &local->self_heal; - split_brain = afr_is_split_brain (this, local->cont.lookup.inode); - afr_detect_self_heal_by_lookup_status (local, this, split_brain); + afr_detect_self_heal_by_lookup_status (local, this); if (afr_lookup_gfid_missing_count (local, this)) local->self_heal.do_gfid_self_heal = _gf_true; @@ -1577,9 +1713,11 @@ afr_lookup_set_self_heal_params (afr_local_t *local, xlator_t *this) afr_lookup_set_self_heal_params_by_xattr (local, this, xattr[child1]); } - if (afr_open_only_data_self_heal (priv->data_self_heal) - && !split_brain) + if (afr_open_only_data_self_heal (priv->data_self_heal)) sh->do_data_self_heal = _gf_false; + if (sh->do_metadata_self_heal) + afr_lookup_check_set_metadata_split_brain (local, this); + afr_detect_self_heal_by_split_brain_status (local, this); } int @@ -1595,8 +1733,8 @@ afr_self_heal_lookup_unwind (call_frame_t *frame, xlator_t *this, if (op_ret == -1) { local->op_ret = -1; - if (afr_error_more_important (local->op_errno, op_errno)) - local->op_errno = op_errno; + local->op_errno = afr_most_important_error(local->op_errno, + op_errno, _gf_true); goto out; } else { @@ -1610,6 +1748,16 @@ afr_self_heal_lookup_unwind (call_frame_t *frame, xlator_t *this, if (ret) gf_log (this->name, GF_LOG_ERROR, "%s: Failed to set " "sh-failed to %d", local->loc.path, sh_failed); + + if (local->self_heal.actual_sh_started == _gf_true && + sh_failed == 0) { + ret = dict_set_int32 (xattr, "actual-sh-done", 1); + if (ret) + gf_log(this->name, GF_LOG_ERROR, "%s: Failed to" + " set actual-sh-done to %d", + local->loc.path, + local->self_heal.actual_sh_started); + } } out: AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, @@ -1684,7 +1832,8 @@ afr_lookup_perform_self_heal (call_frame_t *frame, xlator_t *this, afr_lookup_set_self_heal_params (local, this); if (afr_can_self_heal_proceed (&local->self_heal, priv)) { - if (afr_is_transaction_running (local)) + if (afr_is_transaction_running (local) && + (!local->allow_sh_for_running_transaction)) goto out; reason = "lookup detected pending operations"; @@ -1863,6 +2012,57 @@ out: return; } +gf_boolean_t +afr_is_entry_possibly_under_creation (afr_local_t *local, xlator_t *this) +{ + /* + * We need to perform this test in lookup done and treat on going + * create/DELETE as ENOENT. + * Reason: + Multiple clients A, B and C are attempting 'mkdir -p /mnt/a/b/c' + + 1 Client A is in the middle of mkdir(/a). It has acquired lock. 
+ It has performed mkdir(/a) on one subvol, and second one is still + in progress + 2 Client B performs a lookup, sees directory /a on one, + ENOENT on the other, succeeds lookup. + 3 Client B performs lookup on /a/b on both subvols, both return ENOENT + (one subvol because /a/b does not exist, another because /a + itself does not exist) + 4 Client B proceeds to mkdir /a/b. It obtains entrylk on inode=/a with + basename=b on one subvol, but fails on other subvol as /a is yet to + be created by Client A. + 5 Client A finishes mkdir of /a on other subvol + 6 Client C also attempts to create /a/b, lookup returns ENOENT on + both subvols. + 7 Client C tries to obtain entrylk on on inode=/a with basename=b, + obtains on one subvol (where B had failed), and waits for B to unlock + on other subvol. + 8 Client B finishes mkdir() on one subvol with GFID-1 and completes + transaction and unlocks + 9 Client C gets the lock on the second subvol, At this stage second + subvol already has /a/b created from Client B, but Client C does not + check that in the middle of mkdir transaction + 10 Client C attempts mkdir /a/b on both subvols. It succeeds on + ONLY ONE (where Client B could not get lock because of + missing parent /a dir) with GFID-2, and gets EEXIST from ONE subvol. + This way we have /a/b in GFID mismatch. One subvol got GFID-1 because + Client B performed transaction on only one subvol (because entrylk() + could not be obtained on second subvol because of missing parent dir -- + caused by premature/speculative succeeding of lookup() on /a when locks + are detected). Other subvol gets GFID-2 from Client C because while + it was waiting for entrylk() on both subvols, Client B was in the + middle of creating mkdir() on only one subvol, and Client C does not + "expect" this when it is between lock() and pre-op()/op() phase of the + transaction. + */ + if (local->cont.lookup.parent_entrylk && local->enoent_count) + return _gf_true; + + return _gf_false; +} + + static void afr_lookup_done (call_frame_t *frame, xlator_t *this) { @@ -1879,6 +2079,12 @@ afr_lookup_done (call_frame_t *frame, xlator_t *this) priv = this->private; local = frame->local; + if (afr_is_entry_possibly_under_creation (local, this)) { + local->op_ret = -1; + local->op_errno = ENOENT; + goto unwind; + } + if (local->op_ret < 0) goto unwind; @@ -1936,25 +2142,20 @@ afr_lookup_done (call_frame_t *frame, xlator_t *this) * others in that they must be given higher priority while * returning to the user. 
* - * The hierarchy is ESTALE > ENOENT > others - * + * The hierarchy is ESTALE > EIO > ENOENT > others */ - -gf_boolean_t -afr_error_more_important (int32_t old_errno, int32_t new_errno) +int32_t +afr_most_important_error(int32_t old_errno, int32_t new_errno, + gf_boolean_t eio) { - gf_boolean_t ret = _gf_true; - - /* Nothing should ever overwrite ESTALE */ - if (old_errno == ESTALE) - ret = _gf_false; + if (old_errno == ESTALE || new_errno == ESTALE) + return ESTALE; + if (eio && (old_errno == EIO || new_errno == EIO)) + return EIO; + if (old_errno == ENOENT || new_errno == ENOENT) + return ENOENT; - /* Nothing should overwrite ENOENT, except ESTALE/EIO*/ - else if ((old_errno == ENOENT) && (new_errno != ESTALE) - && (new_errno != EIO)) - ret = _gf_false; - - return ret; + return new_errno; } int32_t @@ -1973,8 +2174,9 @@ afr_resultant_errno_get (int32_t *children, } else { child = i; } - if (afr_error_more_important (op_errno, child_errno[child])) - op_errno = child_errno[child]; + op_errno = afr_most_important_error(op_errno, + child_errno[child], + _gf_false); } return op_errno; } @@ -1986,8 +2188,8 @@ afr_lookup_handle_error (afr_local_t *local, int32_t op_ret, int32_t op_errno) if (op_errno == ENOENT) local->enoent_count++; - if (afr_error_more_important (local->op_errno, op_errno)) - local->op_errno = op_errno; + local->op_errno = afr_most_important_error(local->op_errno, op_errno, + _gf_false); if (local->op_errno == ESTALE) { local->op_ret = -1; @@ -2171,6 +2373,7 @@ afr_lookup_cont_init (afr_local_t *local, unsigned int child_count) struct iatt *iatts = NULL; int32_t *success_children = NULL; int32_t *sources = NULL; + int32_t **pending_matrix = NULL; GF_ASSERT (local); local->cont.lookup.xattrs = GF_CALLOC (child_count, @@ -2203,6 +2406,11 @@ afr_lookup_cont_init (afr_local_t *local, unsigned int child_count) goto out; local->cont.lookup.sources = sources; + pending_matrix = afr_matrix_create (child_count, child_count); + if (NULL == pending_matrix) + goto out; + local->cont.lookup.pending_matrix = pending_matrix; + ret = 0; out: return ret; @@ -2220,7 +2428,7 @@ afr_lookup (call_frame_t *frame, xlator_t *this, int call_count = 0; uint64_t ctx = 0; int32_t op_errno = 0; - + int allow_sh = 0; priv = this->private; AFR_LOCAL_ALLOC_OR_GOTO (local, out); @@ -2237,6 +2445,13 @@ afr_lookup (call_frame_t *frame, xlator_t *this, goto out; } + if (local->loc.path && + (strcmp (local->loc.path, "/" GF_REPLICATE_TRASH_DIR) == 0)) { + op_errno = EPERM; + ret = -1; + goto out; + } + ret = inode_ctx_get (local->loc.inode, this, &ctx); if (ret == 0) { /* lookup is a revalidate */ @@ -2285,6 +2500,11 @@ afr_lookup (call_frame_t *frame, xlator_t *this, /* By default assume ENOTCONN. On success it will be set to 0. 
*/ local->op_errno = ENOTCONN; + ret = dict_get_int32 (xattr_req, "allow-sh-for-running-transaction", + &allow_sh); + dict_del (xattr_req, "allow-sh-for-running-transaction"); + local->allow_sh_for_running_transaction = allow_sh; + ret = afr_lookup_xattr_req_prepare (local, this, xattr_req, &local->loc, &gfid_req); if (ret) { @@ -2400,8 +2620,11 @@ __afr_fd_ctx_set (xlator_t *this, fd_t *fd) goto out; } - INIT_LIST_HEAD (&fd_ctx->paused_calls); + pthread_mutex_init (&fd_ctx->delay_lock, NULL); INIT_LIST_HEAD (&fd_ctx->entries); + fd_ctx->call_child = -1; + + INIT_LIST_HEAD (&fd_ctx->eager_locked); ret = __fd_ctx_set (fd, this, (uint64_t)(long) fd_ctx); if (ret) @@ -2429,135 +2652,70 @@ afr_fd_ctx_set (xlator_t *this, fd_t *fd) /* {{{ flush */ int -afr_flush_unwind (call_frame_t *frame, xlator_t *this) +afr_flush_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { afr_local_t * local = NULL; - call_frame_t *main_frame = NULL; - - local = frame->local; - - LOCK (&frame->lock); - { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - AFR_STACK_UNWIND (flush, main_frame, - local->op_ret, local->op_errno, - NULL); - } - - return 0; -} - - -int -afr_flush_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) -{ - afr_local_t * local = NULL; - afr_private_t * priv = NULL; int call_count = -1; - int child_index = (long) cookie; - int need_unwind = 0; local = frame->local; - priv = this->private; LOCK (&frame->lock); { - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - if (op_ret != -1) { if (local->success_count == 0) { local->op_ret = op_ret; } local->success_count++; - - if (local->success_count == priv->wait_count) { - need_unwind = 1; - } } local->op_errno = op_errno; } UNLOCK (&frame->lock); - if (need_unwind) - afr_flush_unwind (frame, this); - - call_count = afr_frame_return (frame); + call_count = afr_frame_return (frame); - if (call_count == 0) { - local->transaction.resume (frame, this); - } + if (call_count == 0) + AFR_STACK_UNWIND(flush, frame, local->op_ret, + local->op_errno, NULL); return 0; } - -int -afr_flush_wind (call_frame_t *frame, xlator_t *this) +static int +afr_flush_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int i = 0; - int call_count = -1; + int i = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; - local = frame->local; priv = this->private; - - call_count = afr_up_children_count (local->child_up, priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; + local = frame->local; + call_count = local->call_count; for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_flush_wind_cbk, + STACK_WIND_COOKIE (frame, afr_flush_cbk, (void *) (long) i, priv->children[i], priv->children[i]->fops->flush, local->fd, NULL); - if (!--call_count) break; + } } return 0; } - -int -afr_flush_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - - local = frame->local; - - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); - - return 0; -} - - int afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t 
*xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; - call_frame_t *transaction_frame = NULL; + call_stub_t *stub = NULL; int ret = -1; int op_errno = 0; @@ -2567,47 +2725,27 @@ afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) priv = this->private; - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - op_errno = ENOMEM; - goto out; - } - - AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); - local = transaction_frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; - - local->op = GF_FOP_FLUSH; - - local->transaction.fop = afr_flush_wind; - local->transaction.done = afr_flush_done; - local->transaction.unwind = afr_flush_unwind; + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; - local->fd = fd_ref (fd); + ret = afr_local_init(local, priv, &op_errno); + if (ret < 0) + goto out; - local->transaction.main_frame = frame; - local->transaction.start = 0; - local->transaction.len = 0; - - ret = afr_open_fd_fix (transaction_frame, this, _gf_false); - if (ret) { - op_errno = -ret; + local->fd = fd_ref(fd); + stub = fop_flush_stub (frame, afr_flush_wrapper, fd, xdata); + if (!stub) { + ret = -1; + op_errno = ENOMEM; goto out; } - afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + afr_delayed_changelog_wake_resume (this, fd, stub); + ret = 0; - ret = 0; out: - if (ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - - AFR_STACK_UNWIND (flush, frame, -1, op_errno, NULL); - } + if (ret < 0) + AFR_STACK_UNWIND(flush, frame, -1, op_errno, NULL); return 0; } @@ -2621,8 +2759,6 @@ afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd) uint64_t ctx = 0; afr_fd_ctx_t *fd_ctx = NULL; int ret = 0; - afr_fd_paused_call_t *paused_call = NULL; - afr_fd_paused_call_t *tmp = NULL; ret = fd_ctx_get (fd, this, &ctx); if (ret < 0) @@ -2631,28 +2767,18 @@ afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd) fd_ctx = (afr_fd_ctx_t *)(long) ctx; if (fd_ctx) { - if (fd_ctx->pre_op_done) - GF_FREE (fd_ctx->pre_op_done); + GF_FREE (fd_ctx->pre_op_done); - if (fd_ctx->opened_on) - GF_FREE (fd_ctx->opened_on); + GF_FREE (fd_ctx->opened_on); - if (fd_ctx->locked_on) - GF_FREE (fd_ctx->locked_on); + GF_FREE (fd_ctx->locked_on); - if (fd_ctx->pre_op_piggyback) - GF_FREE (fd_ctx->pre_op_piggyback); - list_for_each_entry_safe (paused_call, tmp, &fd_ctx->paused_calls, - call_list) { - list_del_init (&paused_call->call_list); - GF_FREE (paused_call); - } + GF_FREE (fd_ctx->pre_op_piggyback); + GF_FREE (fd_ctx->lock_piggyback); - if (fd_ctx->lock_piggyback) - GF_FREE (fd_ctx->lock_piggyback); + GF_FREE (fd_ctx->lock_acquired); - if (fd_ctx->lock_acquired) - GF_FREE (fd_ctx->lock_acquired); + pthread_mutex_destroy (&fd_ctx->delay_lock); GF_FREE (fd_ctx); } @@ -2690,6 +2816,16 @@ afr_release (xlator_t *this, fd_t *fd) /* {{{ fsync */ int +afr_fsync_unwind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + AFR_STACK_UNWIND (fsync, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + return 0; +} + +int afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) @@ -2698,6 +2834,7 @@ afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int call_count = -1; int child_index = (long) cookie; int read_child = 0; + call_stub_t *stub = NULL; local = frame->local; @@ -2713,13 +2850,13 @@ 
afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->op_ret = 0; if (local->success_count == 0) { - local->cont.fsync.prebuf = *prebuf; - local->cont.fsync.postbuf = *postbuf; + local->cont.inode_wfop.prebuf = *prebuf; + local->cont.inode_wfop.postbuf = *postbuf; } if (child_index == read_child) { - local->cont.fsync.prebuf = *prebuf; - local->cont.fsync.postbuf = *postbuf; + local->cont.inode_wfop.prebuf = *prebuf; + local->cont.inode_wfop.postbuf = *postbuf; } local->success_count++; @@ -2732,10 +2869,32 @@ afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, call_count = afr_frame_return (frame); if (call_count == 0) { - AFR_STACK_UNWIND (fsync, frame, local->op_ret, local->op_errno, - &local->cont.fsync.prebuf, - &local->cont.fsync.postbuf, - NULL); + /* Make a stub out of the frame, and register it + with the waking up post-op. When the call-stub resumes, + we are guaranteed that there was no post-op pending + (i.e changelogs were unset in the server). This is an + essential "guarantee", that fsync() returns only after + completely finishing EVERYTHING, including the delayed + post-op. This guarantee is expected by FUSE graph switching + for example. + */ + stub = fop_fsync_cbk_stub (frame, afr_fsync_unwind_cbk, + local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, + xdata); + if (!stub) { + AFR_STACK_UNWIND (fsync, frame, -1, ENOMEM, 0, 0, 0); + return 0; + } + + /* If no new unstable writes happened between the + time we cleared the unstable write witness flag in afr_fsync + and now, calling afr_delayed_changelog_wake_up() should + wake up and skip over the fsync phase and go straight to + afr_changelog_post_op_now() + */ + afr_delayed_changelog_wake_resume (this, local->fd, stub); } return 0; @@ -2770,6 +2929,10 @@ afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, local->fd = fd_ref (fd); + if (afr_fd_has_witnessed_unstable_write (this, fd)) { + /* don't care. 
we only wanted to CLEAR the bit */ + } + for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { STACK_WIND_COOKIE (frame, afr_fsync_cbk, @@ -3612,8 +3775,7 @@ afr_forget (xlator_t *this, inode_t *inode) goto out; ctx = (afr_inode_ctx_t *)(long)ctx_addr; - if (ctx->fresh_children) - GF_FREE (ctx->fresh_children); + GF_FREE (ctx->fresh_children); GF_FREE (ctx); out: return 0; @@ -3928,6 +4090,17 @@ afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno) goto out; } + local->transaction.postop_piggybacked = GF_CALLOC (priv->child_count, + sizeof (int), + gf_afr_mt_int32_t); + if (!local->transaction.postop_piggybacked) { + if (op_errno) + *op_errno = ENOMEM; + goto out; + } + + local->append_write = _gf_false; + ret = 0; out: return ret; @@ -3939,16 +4112,6 @@ afr_internal_lock_init (afr_internal_lock_t *lk, size_t child_count, { int ret = -ENOMEM; - lk->inode_locked_nodes = GF_CALLOC (sizeof (*lk->inode_locked_nodes), - child_count, gf_afr_mt_char); - if (NULL == lk->inode_locked_nodes) - goto out; - - lk->entry_locked_nodes = GF_CALLOC (sizeof (*lk->entry_locked_nodes), - child_count, gf_afr_mt_char); - if (NULL == lk->entry_locked_nodes) - goto out; - lk->locked_nodes = GF_CALLOC (sizeof (*lk->locked_nodes), child_count, gf_afr_mt_char); if (NULL == lk->locked_nodes) @@ -4007,6 +4170,21 @@ out: } int +afr_inodelk_init (afr_inodelk_t *lk, char *dom, size_t child_count) +{ + int ret = -ENOMEM; + + lk->domain = dom; + lk->locked_nodes = GF_CALLOC (sizeof (*lk->locked_nodes), + child_count, gf_afr_mt_char); + if (NULL == lk->locked_nodes) + goto out; + ret = 0; +out: + return ret; +} + +int afr_transaction_local_init (afr_local_t *local, xlator_t *this) { int child_up_count = 0; @@ -4019,6 +4197,14 @@ afr_transaction_local_init (afr_local_t *local, xlator_t *this) if (ret < 0) goto out; + if ((local->transaction.type == AFR_DATA_TRANSACTION) || + (local->transaction.type == AFR_METADATA_TRANSACTION)) { + ret = afr_inodelk_init (&local->internal_lock.inodelk[0], + this->name, priv->child_count); + if (ret < 0) + goto out; + } + ret = -ENOMEM; child_up_count = afr_up_children_count (local->child_up, priv->child_count); @@ -4040,14 +4226,6 @@ afr_transaction_local_init (afr_local_t *local, xlator_t *this) if (!local->fresh_children) goto out; - if (local->fd) { - local->fd_open_on = GF_CALLOC (sizeof (*local->fd_open_on), - priv->child_count, - gf_afr_mt_char); - if (!local->fd_open_on) - goto out; - } - local->transaction.pre_op = GF_CALLOC (sizeof (*local->transaction.pre_op), priv->child_count, gf_afr_mt_char); @@ -4063,6 +4241,9 @@ afr_transaction_local_init (afr_local_t *local, xlator_t *this) AFR_NUM_CHANGE_LOGS); if (!local->transaction.txn_changelog) goto out; + + INIT_LIST_HEAD (&local->transaction.eager_locked); + ret = 0; out: return ret; @@ -4250,6 +4431,16 @@ afr_priv_destroy (afr_private_t *priv) if (priv->shd.split_brain) eh_destroy (priv->shd.split_brain); + for (i = 0; i < priv->child_count; i++) + { + if (priv->shd.statistics[i]) + eh_destroy (priv->shd.statistics[i]); + } + + GF_FREE (priv->shd.statistics); + + GF_FREE (priv->shd.crawl_events); + GF_FREE (priv->last_event); if (priv->pending_key) { for (i = 0; i < priv->child_count; i++) @@ -4276,3 +4467,125 @@ xlator_subvolume_count (xlator_t *this) i++; return i; } + +inline gf_boolean_t +afr_is_errno_set (int *child_errno, int child) +{ + return child_errno[child]; +} + +inline gf_boolean_t +afr_is_errno_unset (int *child_errno, int child) +{ + return !afr_is_errno_set (child_errno, child); 
+} + +void +afr_prepare_new_entry_pending_matrix (int32_t **pending, + gf_boolean_t (*is_pending) (int *, int), + int *ctx, struct iatt *buf, + unsigned int child_count) +{ + int midx = 0; + int idx = 0; + int i = 0; + + midx = afr_index_for_transaction_type (AFR_METADATA_TRANSACTION); + if (IA_ISDIR (buf->ia_type)) + idx = afr_index_for_transaction_type (AFR_ENTRY_TRANSACTION); + else if (IA_ISREG (buf->ia_type)) + idx = afr_index_for_transaction_type (AFR_DATA_TRANSACTION); + else + idx = -1; + for (i = 0; i < child_count; i++) { + if (is_pending (ctx, i)) { + pending[i][midx] = hton32 (1); + if (idx == -1) + continue; + pending[i][idx] = hton32 (1); + } + } +} + +gf_boolean_t +afr_is_fd_fixable (fd_t *fd) +{ + if (!fd || !fd->inode) + return _gf_false; + else if (fd_is_anonymous (fd)) + return _gf_false; + else if (uuid_is_null (fd->inode->gfid)) + return _gf_false; + + return _gf_true; +} + +void +afr_handle_open_fd_count (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + inode_t *inode = NULL; + afr_inode_ctx_t *ctx = NULL; + + local = frame->local; + + if (local->fd) + inode = local->fd->inode; + else + inode = local->loc.inode; + + if (!inode) + return; + + LOCK (&inode->lock); + { + ctx = __afr_inode_ctx_get (inode, this); + ctx->open_fd_count = local->open_fd_count; + } + UNLOCK (&inode->lock); +} + +int +afr_initialise_statistics (xlator_t *this) +{ + afr_private_t *priv = NULL; + int ret = -1; + int i = 0; + int child_count = 0; + eh_t *stats_per_brick = NULL; + shd_crawl_event_t ***shd_crawl_events = NULL; + priv = this->private; + + priv->shd.statistics = GF_CALLOC (sizeof(eh_t *), priv->child_count, + gf_common_mt_eh_t); + if (!priv->shd.statistics) { + ret = -1; + goto out; + } + child_count = priv->child_count; + for (i=0; i < child_count ; i++) { + stats_per_brick = eh_new (AFR_STATISTICS_HISTORY_SIZE, + _gf_false, + _destroy_crawl_event_data); + if (!stats_per_brick) { + ret = -1; + goto out; + } + priv->shd.statistics[i] = stats_per_brick; + + } + + shd_crawl_events = (shd_crawl_event_t***)(&priv->shd.crawl_events); + *shd_crawl_events = GF_CALLOC (sizeof(shd_crawl_event_t*), + priv->child_count, + gf_afr_mt_shd_crawl_event_t); + + if (!priv->shd.crawl_events) { + ret = -1; + goto out; + } + ret = 0; +out: + return ret; + +} |
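Note on the error-precedence change above: the patch replaces the boolean afr_error_more_important() with afr_most_important_error(), which encodes the hierarchy ESTALE > EIO > ENOENT > others, with EIO participating only when the caller opts in (as the self-heal lookup unwind path does). The following is a minimal standalone sketch of that rule, not code from the patch; the fold over per-child errnos and the starting value of 0 are assumptions made for illustration, loosely mirroring how afr_resultant_errno_get() accumulates a result.

/*
 * Standalone sketch (not part of the patch): the precedence rule that
 * afr_most_important_error() encodes -- ESTALE > EIO > ENOENT > other --
 * where EIO only wins when the caller passes eio == true.
 */
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

static int
most_important_error (int old_errno, int new_errno, bool eio)
{
        if (old_errno == ESTALE || new_errno == ESTALE)
                return ESTALE;
        if (eio && (old_errno == EIO || new_errno == EIO))
                return EIO;
        if (old_errno == ENOENT || new_errno == ENOENT)
                return ENOENT;
        return new_errno;
}

int
main (void)
{
        /* Hypothetical per-child errnos; folding them left to right can
           only raise the severity, never lower it, so the order in which
           replies arrive does not change the errno returned to the user. */
        int    child_errno[] = { ENOTCONN, ENOENT, ESTALE };
        int    op_errno      = 0;
        size_t i             = 0;

        for (i = 0; i < sizeof (child_errno) / sizeof (child_errno[0]); i++)
                op_errno = most_important_error (op_errno, child_errno[i],
                                                 false);

        printf ("resultant errno: %d (ESTALE == %d)\n", op_errno, ESTALE);
        return 0;
}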
