diff options
Diffstat (limited to 'xlators/cluster/afr/src/afr-transaction.c')
| -rw-r--r-- | xlators/cluster/afr/src/afr-transaction.c | 2401 |
1 files changed, 1330 insertions, 1071 deletions
diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c index 06e8931f2..205ff759e 100644 --- a/xlators/cluster/afr/src/afr-transaction.c +++ b/xlators/cluster/afr/src/afr-transaction.c @@ -1,1407 +1,1579 @@ /* - Copyright (c) 2007-2009 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
*/ #include "dict.h" #include "byte-order.h" #include "common-utils.h" +#include "timer.h" #include "afr.h" #include "afr-transaction.h" #include <signal.h> +gf_boolean_t +afr_changelog_pre_op_uninherit (call_frame_t *frame, xlator_t *this); -#define LOCKED_NO 0x0 /* no lock held */ -#define LOCKED_YES 0x1 /* for DATA, METADATA, ENTRY and higher_path - of RENAME */ -#define LOCKED_LOWER 0x2 /* for lower_path of RENAME */ - +gf_boolean_t +afr_changelog_pre_op_update (call_frame_t *frame, xlator_t *this); -static void -afr_pid_save (call_frame_t *frame) -{ - afr_local_t * local = NULL; - - local = frame->local; - - local->saved_pid = frame->root->pid; -} +int +afr_changelog_do (call_frame_t *frame, xlator_t *this, dict_t *xattr, + afr_changelog_resume_t changelog_resume); -static void -afr_pid_restore (call_frame_t *frame) +int +__afr_txn_write_fop (call_frame_t *frame, xlator_t *this) { - afr_local_t * local = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; + int i = 0; local = frame->local; + priv = this->private; - frame->root->pid = local->saved_pid; -} - - -static void -__mark_all_pending (int32_t *pending[], int child_count, - afr_transaction_type type) -{ - int i; - int j; + call_count = AFR_COUNT (local->transaction.pre_op, priv->child_count); - for (i = 0; i < child_count; i++) { - j = afr_index_for_transaction_type (type); - pending[i][j] = hton32 (1); + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; } -} + local->call_count = call_count; -static void -__mark_child_dead (int32_t *pending[], int child_count, int child, - afr_transaction_type type) -{ - int j; + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i]) { + local->transaction.wind (frame, this, i); - j = afr_index_for_transaction_type (type); - - pending[child][j] = 0; + if (!--call_count) + break; + } + } + + return 0; } -static void -__mark_fop_failed_on_fd (fd_t *fd, xlator_t *this, - int child_index) 
+int +__afr_txn_write_done (call_frame_t *frame, xlator_t *this) { - uint64_t ctx; - afr_fd_ctx_t * fd_ctx = NULL; - - int ret = 0; + afr_local_t *local = NULL; - ret = fd_ctx_get (fd, this, &ctx); + local = frame->local; - if (ret < 0) - goto out; + local->transaction.unwind (frame, this); - fd_ctx = (afr_fd_ctx_t *)(long) ctx; + AFR_STACK_DESTROY (frame); - fd_ctx->child_failed[child_index] = 1; -out: - return; + return 0; } -static void -__mark_failed_children (int32_t *pending[], int child_count, - xlator_t *this, fd_t *fd, afr_transaction_type type) +call_frame_t* +afr_transaction_detach_fop_frame (call_frame_t *frame) { - uint64_t ctx; - afr_fd_ctx_t * fd_ctx = NULL; - - int ret = 0; - int i = 0; - int j = 0; + afr_local_t * local = NULL; + call_frame_t *fop_frame = NULL; - ret = fd_ctx_get (fd, this, &ctx); - - if (ret < 0) - goto out; - - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - - for (i = 0; i < child_count; i++) { - j = afr_index_for_transaction_type (type); + local = frame->local; - if (fd_ctx->child_failed[i]) - pending[i][j] = 0; + LOCK (&frame->lock); + { + fop_frame = local->transaction.main_frame; + local->transaction.main_frame = NULL; } - -out: - return; + UNLOCK (&frame->lock); + + return fop_frame; } static void -__mark_pre_op_done_on_fd (call_frame_t *frame, xlator_t *this, int child_index) +afr_save_lk_owner (call_frame_t *frame) { - afr_local_t *local = NULL; - - uint64_t ctx; - afr_fd_ctx_t * fd_ctx = NULL; - int ret = 0; + afr_local_t * local = NULL; local = frame->local; - ret = fd_ctx_get (local->fd, this, &ctx); + local->saved_lk_owner = frame->root->lk_owner; +} - if (ret < 0) - goto out; - fd_ctx = (afr_fd_ctx_t *)(long) ctx; +static void +afr_restore_lk_owner (call_frame_t *frame) +{ + afr_local_t * local = NULL; - if ((local->op == GF_FOP_WRITE) - || (local->op == GF_FOP_FTRUNCATE)) { - fd_ctx->pre_op_done[child_index] = 1; - } + local = frame->local; -out: - return; + frame->root->lk_owner = local->saved_lk_owner; } - -static void 
-__mark_down_children (int32_t *pending[], int child_count, - unsigned char *child_up, afr_transaction_type type) +void +__mark_all_success (call_frame_t *frame, xlator_t *this) { + afr_private_t *priv = NULL; + afr_local_t *local = NULL; int i; - int j; - for (i = 0; i < child_count; i++) { - j = afr_index_for_transaction_type (type); + local = frame->local; + priv = this->private; - if (!child_up[i]) - pending[i][j] = 0; - } + for (i = 0; i < priv->child_count; i++) { + local->transaction.failed_subvols[i] = 0; + } } -static void -__mark_all_success (int32_t *pending[], int child_count, - afr_transaction_type type) +int +afr_transaction_perform_fop (call_frame_t *frame, xlator_t *this) { - int i; - int j; + afr_local_t *local = NULL; + fd_t *fd = NULL; - for (i = 0; i < child_count; i++) { - j = afr_index_for_transaction_type (type); - pending[i][j] = hton32 (-1); - } + local = frame->local; + fd = local->fd; + + /* Perform fops with the lk-owner from top xlator. + * Eg: lk-owner of posix-lk and flush should be same, + * flush cant clear the posix-lks without that lk-owner. + */ + afr_save_lk_owner (frame); + frame->root->lk_owner = + local->transaction.main_frame->root->lk_owner; + + if (local->pre_op_compat) + /* old mode, pre-op was done as afr_changelog_do() + just now, before OP */ + afr_changelog_pre_op_update (frame, this); + + /* The wake up needs to happen independent of + what type of fop arrives here. If it was + a write, then it has already inherited the + lock and changelog. If it was not a write, + then the presumption of the optimization (of + optimizing for successive write operations) + fails. 
+ */ + if (fd) + afr_delayed_changelog_wake_up (this, fd); + local->transaction.fop (frame, this); + + return 0; } static int -__is_first_write_on_fd (xlator_t *this, fd_t *fd) +__changelog_enabled (afr_private_t *priv, afr_transaction_type type) { - int op_ret = 0; - int _ret = -1; - int i = 0; - - uint64_t ctx; - afr_fd_ctx_t * fd_ctx = NULL; + int ret = 0; - afr_private_t *priv = NULL; + switch (type) { + case AFR_DATA_TRANSACTION: + if (priv->data_change_log) + ret = 1; - priv = this->private; + break; - LOCK (&fd->lock); - { - _ret = __fd_ctx_get (fd, this, &ctx); - - if (_ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "could not get fd ctx on fd=%p", - fd); - goto out; - } + case AFR_METADATA_TRANSACTION: + if (priv->metadata_change_log) + ret = 1; - fd_ctx = (afr_fd_ctx_t *)(long) ctx; + break; - op_ret = 1; - for (i = 0; i < priv->child_count; i++) { - if (fd_ctx->pre_op_done[i] == 0) - continue; + case AFR_ENTRY_TRANSACTION: + case AFR_ENTRY_RENAME_TRANSACTION: + if (priv->entry_change_log) + ret = 1; - op_ret = 0; - } + break; } -out: - UNLOCK (&fd->lock); - return op_ret; + return ret; } static int -__if_fd_pre_op_done (xlator_t *this, fd_t *fd, int child_index) +__fop_changelog_needed (call_frame_t *frame, xlator_t *this) { + afr_private_t * priv = NULL; + afr_local_t * local = NULL; int op_ret = 0; - int _ret = -1; + afr_transaction_type type = -1; - uint64_t ctx; - afr_fd_ctx_t * fd_ctx = NULL; + priv = this->private; + local = frame->local; + type = local->transaction.type; - LOCK (&fd->lock); - { - _ret = __fd_ctx_get (fd, this, &ctx); + if (__changelog_enabled (priv, type)) { + switch (local->op) { - if (_ret < 0) { - goto out; - } + case GF_FOP_WRITE: + case GF_FOP_FTRUNCATE: + op_ret = 1; + break; - fd_ctx = (afr_fd_ctx_t *)(long) ctx; + case GF_FOP_FLUSH: + op_ret = 0; + break; - if (fd_ctx->pre_op_done[child_index]) { + default: op_ret = 1; } - fd_ctx->pre_op_done[child_index] = 0; } -out: - UNLOCK (&fd->lock); return op_ret; } -static int 
-afr_pre_op_done_count (xlator_t *this, fd_t *fd, unsigned char *child_up) +int +afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int **pending) { int i = 0; - int count = 0; + int ret = 0; + int pending_zero[AFR_NUM_CHANGE_LOGS] = {0, }; - int _ret = 0; - uint64_t ctx; - afr_fd_ctx_t * fd_ctx = NULL; + for (i = 0; i < priv->child_count; i++) { + if (!memcmp (pending_zero, pending[i], sizeof (pending_zero))) + /* don't set xattrs for non-pending servers */ + continue; - afr_private_t *priv = NULL; + ret = dict_set_static_bin (xattr, priv->pending_key[i], + pending[i], + AFR_NUM_CHANGE_LOGS * sizeof (int)); + /* 3 = data+metadata+entry */ - priv = this->private; + if (ret) + break; + } - LOCK (&fd->lock); - { - _ret = __fd_ctx_get (fd, this, &ctx); + return ret; +} - if (_ret < 0) { - goto out; - } +int +afr_lock_server_count (afr_private_t *priv, afr_transaction_type type) +{ + int ret = 0; - fd_ctx = (afr_fd_ctx_t *)(long) ctx; + switch (type) { + case AFR_DATA_TRANSACTION: + ret = priv->child_count; + break; - for (i = 0; i < priv->child_count; i++) { - if (fd_ctx->pre_op_done[i] && child_up[i]) { - count++; - } - } + case AFR_METADATA_TRANSACTION: + ret = priv->child_count; + break; + + case AFR_ENTRY_TRANSACTION: + case AFR_ENTRY_RENAME_TRANSACTION: + ret = priv->child_count; + break; } -out: - UNLOCK (&fd->lock); - return count; + return ret; } +/* {{{ pending */ -static int -__changelog_enabled (afr_private_t *priv, afr_transaction_type type) + +int +afr_changelog_post_op_done (call_frame_t *frame, xlator_t *this) { - int ret = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_internal_lock_t *int_lock = NULL; - switch (type) { - case AFR_DATA_TRANSACTION: - if (priv->data_change_log) - ret = 1; - - break; + local = frame->local; + priv = this->private; + int_lock = &local->internal_lock; - case AFR_METADATA_TRANSACTION: - if (priv->metadata_change_log) - ret = 1; + if (local->transaction.resume_stub) { + call_resume 
(local->transaction.resume_stub); + local->transaction.resume_stub = NULL; + } - break; + if (afr_lock_server_count (priv, local->transaction.type) == 0) { + local->transaction.done (frame, this); + } else { + int_lock->lock_cbk = local->transaction.done; + afr_unlock (frame, this); + } - case AFR_ENTRY_TRANSACTION: - case AFR_ENTRY_RENAME_TRANSACTION: - if (priv->entry_change_log) - ret = 1; + return 0; +} - break; - - case AFR_FLUSH_TRANSACTION: - ret = 1; - } - return ret; +afr_inodelk_t* +afr_get_inodelk (afr_internal_lock_t *int_lock, char *dom) +{ + afr_inodelk_t *inodelk = NULL; + int i = 0; + + for (i = 0; int_lock->inodelk[i].domain; i++) { + inodelk = &int_lock->inodelk[i]; + if (strcmp (dom, inodelk->domain) == 0) + return inodelk; + } + return NULL; } +unsigned char* +afr_locked_nodes_get (afr_transaction_type type, afr_internal_lock_t *int_lock) +{ + unsigned char *locked_nodes = NULL; + afr_inodelk_t *inodelk = NULL; + switch (type) { + case AFR_DATA_TRANSACTION: + case AFR_METADATA_TRANSACTION: + inodelk = afr_get_inodelk (int_lock, int_lock->domain); + locked_nodes = inodelk->locked_nodes; + break; + + case AFR_ENTRY_TRANSACTION: + case AFR_ENTRY_RENAME_TRANSACTION: + /*Because same set of subvols participate in all lockee + * entities*/ + locked_nodes = int_lock->lockee[0].locked_nodes; + break; + } + return locked_nodes; +} -static int -__changelog_needed_pre_op (call_frame_t *frame, xlator_t *this) + +int +afr_changelog_call_count (afr_transaction_type type, + unsigned char *pre_op_subvols, + unsigned int child_count) +{ + int call_count = 0; + + call_count = AFR_COUNT(pre_op_subvols, child_count); + + if (type == AFR_ENTRY_RENAME_TRANSACTION) + call_count *= 2; + + return call_count; +} + + +gf_boolean_t +afr_txn_nothing_failed (call_frame_t *frame, xlator_t *this) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - fd_t * fd = NULL; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int i = 0; + + local = frame->local; + 
priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.failed_subvols[i]) + return _gf_false; + } + + return _gf_true; +} - int op_ret = 0; - priv = this->private; +void +afr_handle_symmetric_errors (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int op_errno = 0; + int i_errno = 0; + gf_boolean_t matching_errors = _gf_true; + int i = 0; + + priv = this->private; local = frame->local; - - if (__changelog_enabled (priv, local->transaction.type)) { - switch (local->op) { - - case GF_FOP_WRITE: - case GF_FOP_FTRUNCATE: - /* - if it's a data transaction, we write the changelog - only on the first write on an fd - */ - - fd = local->fd; - if (!fd || __is_first_write_on_fd (this, fd)) - op_ret = 1; + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid) + continue; + if (local->replies[i].op_ret != -1) { + /* Operation succeeded on at least on subvol, + so it is not a failed-everywhere situation. + */ + matching_errors = _gf_false; break; + } + i_errno = local->replies[i].op_errno; - case GF_FOP_FLUSH: - /* only do post-op on flush() */ - - op_ret = 0; + if (i_errno == ENOTCONN) { + /* ENOTCONN is not a symmetric error. We do not + know if the operation was performed on the + backend or not. 
+ */ + matching_errors = _gf_false; break; + } - default: - op_ret = 1; + if (!op_errno) { + op_errno = i_errno; + } else if (op_errno != i_errno) { + /* Mismatching op_errno's */ + matching_errors = _gf_false; + break; } } - return op_ret; + if (matching_errors) + __mark_all_success (frame, this); } -static int -__changelog_needed_post_op (call_frame_t *frame, xlator_t *this) +int +afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; + afr_private_t * priv = this->private; + int i = 0; + int ret = 0; + int idx = 0; + afr_local_t * local = NULL; + dict_t *xattr = NULL; + int nothing_failed = 1; + gf_boolean_t need_undirty = _gf_false; - int op_ret = 0; - afr_transaction_type type = -1; + local = frame->local; + idx = afr_index_for_transaction_type (local->transaction.type); - priv = this->private; - local = frame->local; - type = local->transaction.type; + nothing_failed = afr_txn_nothing_failed (frame, this); - if (__changelog_enabled (priv, type)) { - switch (local->op) { + if (afr_changelog_pre_op_uninherit (frame, this)) + need_undirty = _gf_false; + else + need_undirty = _gf_true; - case GF_FOP_WRITE: - case GF_FOP_FTRUNCATE: - op_ret = 0; - break; + if (nothing_failed && !need_undirty) { + afr_changelog_post_op_done (frame, this); + goto out; + } - case GF_FOP_FLUSH: - op_ret = 1; - break; + xattr = dict_new (); + if (!xattr) { + local->op_ret = -1; + local->op_errno = ENOMEM; + afr_changelog_post_op_done (frame, this); + goto out; + } - default: - op_ret = 1; - } - } + if (need_undirty) { + local->dirty[idx] = hton32(-1); - return op_ret; -} + ret = dict_set_static_bin (xattr, AFR_DIRTY, local->dirty, + sizeof(int) * AFR_NUM_CHANGE_LOGS); + if (ret) { + local->op_ret = -1; + local->op_errno = ENOMEM; + afr_changelog_post_op_done (frame, this); + goto out; + } + } -static int -afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending) -{ - int i; - int ret = 0; + if 
(!nothing_failed) { + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.failed_subvols[i]) + local->pending[i][idx] = hton32(1); + } + ret = afr_set_pending_dict (priv, xattr, local->pending); + if (ret < 0) { + local->op_ret = -1; + local->op_errno = ENOMEM; + afr_changelog_post_op_done (frame, this); + goto out; + } - for (i = 0; i < priv->child_count; i++) { - ret = dict_set_static_bin (xattr, priv->pending_key[i], - pending[i], 3 * sizeof (int32_t)); - /* 3 = data+metadata+entry */ - - if (ret < 0) - goto out; - } + } + afr_changelog_do (frame, this, xattr, afr_changelog_post_op_done); out: - return ret; + if (xattr) + dict_unref (xattr); + + return 0; } -int -afr_lock_server_count (afr_private_t *priv, afr_transaction_type type) +gf_boolean_t +afr_changelog_pre_op_uninherit (call_frame_t *frame, xlator_t *this) { - int ret = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + fd_t *fd = NULL; + int i = 0; + gf_boolean_t ret = _gf_false; + afr_fd_ctx_t *fd_ctx = NULL; + int type = 0; + + local = frame->local; + priv = this->private; + fd = local->fd; + + type = afr_index_for_transaction_type (local->transaction.type); + if (type != AFR_DATA_TRANSACTION) + return !local->transaction.dirtied; + + if (!fd) + return !local->transaction.dirtied; - switch (type) { - case AFR_FLUSH_TRANSACTION: - case AFR_DATA_TRANSACTION: - ret = priv->data_lock_server_count; - break; + fd_ctx = afr_fd_ctx_get (fd, this); + if (!fd_ctx) + return _gf_false; - case AFR_METADATA_TRANSACTION: - ret = priv->metadata_lock_server_count; - break; + if (local->transaction.no_uninherit) + return _gf_false; - case AFR_ENTRY_TRANSACTION: - case AFR_ENTRY_RENAME_TRANSACTION: - ret = priv->entry_lock_server_count; - break; + /* This function must be idempotent. So check if we + were called before and return the same answer again. 
+ + It is important to keep this function idempotent for + the call in afr_changelog_post_op_safe() to not have + side effects on the call from afr_changelog_post_op_now() + */ + if (local->transaction.uninherit_done) + return local->transaction.uninherit_value; + + LOCK(&fd->lock); + { + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i] != + fd_ctx->pre_op_done[type][i]) { + ret = !local->transaction.dirtied; + goto unlock; + } + } + + if (fd_ctx->inherited[type]) { + ret = _gf_true; + fd_ctx->inherited[type]--; + } else if (fd_ctx->on_disk[type]) { + ret = _gf_false; + fd_ctx->on_disk[type]--; + } else { + /* ASSERT */ + ret = _gf_false; + } + + if (!fd_ctx->inherited[type] && !fd_ctx->on_disk[type]) { + for (i = 0; i < priv->child_count; i++) + fd_ctx->pre_op_done[type][i] = 0; + } } +unlock: + UNLOCK(&fd->lock); + + local->transaction.uninherit_done = _gf_true; + local->transaction.uninherit_value = ret; return ret; } -/* {{{ unlock */ - -static int -afr_transaction_locked_nodes_count (afr_local_t *local, int child_count) +gf_boolean_t +afr_changelog_pre_op_inherit (call_frame_t *frame, xlator_t *this) { - int i; - int call_count = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + fd_t *fd = NULL; + int i = 0; + gf_boolean_t ret = _gf_false; + afr_fd_ctx_t *fd_ctx = NULL; + int type = 0; - for (i = 0; i < child_count; i++) { - if (local->transaction.locked_nodes[i] & LOCKED_YES) - call_count++; + local = frame->local; + priv = this->private; + fd = local->fd; - if ((local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) - && (local->transaction.locked_nodes[i] & LOCKED_LOWER)) { - call_count++; - } - } + if (local->transaction.type != AFR_DATA_TRANSACTION) + return _gf_false; - return call_count; -} + type = afr_index_for_transaction_type (local->transaction.type); + if (!fd) + return _gf_false; -static loc_t * -lower_path (loc_t *l1, const char *b1, loc_t *l2, const char *b2) -{ - int ret = 0; + fd_ctx = 
afr_fd_ctx_get (fd, this); + if (!fd_ctx) + return _gf_false; - ret = strcmp (l1->path, l2->path); - - if (ret == 0) - ret = strcmp (b1, b2); + LOCK(&fd->lock); + { + if (!fd_ctx->on_disk[type]) { + /* nothing to inherit yet */ + ret = _gf_false; + goto unlock; + } - if (ret <= 0) - return l1; - else - return l2; + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i] != + fd_ctx->pre_op_done[type][i]) { + /* either inherit exactly, or don't */ + ret = _gf_false; + goto unlock; + } + } + + fd_ctx->inherited[type]++; + + ret = _gf_true; + + local->transaction.inherited = _gf_true; + } +unlock: + UNLOCK(&fd->lock); + + return ret; } -int32_t -afr_unlock_common_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) +gf_boolean_t +afr_changelog_pre_op_update (call_frame_t *frame, xlator_t *this) { - afr_local_t *local; - int call_count = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + fd_t *fd = NULL; + afr_fd_ctx_t *fd_ctx = NULL; + int i = 0; + gf_boolean_t ret = _gf_false; + int type = 0; local = frame->local; + priv = this->private; + fd = local->fd; + + if (!fd) + return _gf_false; + + fd_ctx = afr_fd_ctx_get (fd, this); + if (!fd_ctx) + return _gf_false; - LOCK (&frame->lock); + if (local->transaction.inherited) + /* was already inherited in afr_changelog_pre_op */ + return _gf_false; + + if (!local->transaction.dirtied) + return _gf_false; + + if (!afr_txn_nothing_failed (frame, this)) + return _gf_false; + + type = afr_index_for_transaction_type (local->transaction.type); + + ret = _gf_false; + + LOCK(&fd->lock); { - call_count = --local->call_count; - } - UNLOCK (&frame->lock); + if (!fd_ctx->on_disk[type]) { + for (i = 0; i < priv->child_count; i++) + fd_ctx->pre_op_done[type][i] = + local->transaction.pre_op[i]; + } else { + for (i = 0; i < priv->child_count; i++) + if (fd_ctx->pre_op_done[type][i] != + local->transaction.pre_op[i]) { + local->transaction.no_uninherit = 1; + goto 
unlock; + } + } + fd_ctx->on_disk[type]++; - if (call_count == 0) { - local->transaction.done (frame, this); + ret = _gf_true; } - - return 0; +unlock: + UNLOCK(&fd->lock); + + return ret; } int -afr_unlock (call_frame_t *frame, xlator_t *this) +afr_changelog_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xattr, dict_t *xdata) { - struct flock flock; + afr_local_t *local = NULL; + int call_count = -1; - int i = 0; - int call_count = 0; + local = frame->local; - afr_local_t *local = NULL; - afr_private_t * priv = this->private; + if (op_ret == -1) + afr_transaction_fop_failed (frame, this, (long) cookie); - loc_t * lower = NULL; - loc_t * higher = NULL; + call_count = afr_frame_return (frame); - const char *lower_name = NULL; - const char *higher_name = NULL; + if (call_count == 0) + local->transaction.changelog_resume (frame, this); - local = frame->local; + return 0; +} - /* - pid has been restored to saved_pid in the fop, - so set it back to frame->root - */ - frame->root->pid = (long) frame->root; +int +afr_changelog_do (call_frame_t *frame, xlator_t *this, dict_t *xattr, + afr_changelog_resume_t changelog_resume) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; + int call_count = 0; + + local = frame->local; + priv = this->private; - call_count = afr_transaction_locked_nodes_count (local, - priv->child_count); + call_count = afr_changelog_call_count (local->transaction.type, + local->transaction.pre_op, + priv->child_count); if (call_count == 0) { - local->transaction.done (frame, this); + changelog_resume (frame, this); return 0; } - local->call_count = call_count; + local->call_count = call_count; + + local->transaction.changelog_resume = changelog_resume; - for (i = 0; i < priv->child_count; i++) { - flock.l_start = local->transaction.start; - flock.l_len = local->transaction.len; - flock.l_type = F_UNLCK; + for (i = 0; i < priv->child_count; i++) { + if (!local->transaction.pre_op[i]) + 
continue; switch (local->transaction.type) { case AFR_DATA_TRANSACTION: case AFR_METADATA_TRANSACTION: - case AFR_FLUSH_TRANSACTION: - - if (local->transaction.locked_nodes[i] & LOCKED_YES) { - if (local->fd) { - STACK_WIND (frame, afr_unlock_common_cbk, - priv->children[i], - priv->children[i]->fops->finodelk, - this->name, local->fd, - F_SETLK, &flock); - } else { - STACK_WIND (frame, afr_unlock_common_cbk, - priv->children[i], - priv->children[i]->fops->inodelk, - this->name, &local->loc, - F_SETLK, &flock); - } - - call_count--; + if (!local->fd) { + STACK_WIND_COOKIE (frame, afr_changelog_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->xattrop, + &local->loc, + GF_XATTROP_ADD_ARRAY, xattr, + NULL); + } else { + STACK_WIND_COOKIE (frame, afr_changelog_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->fxattrop, + local->fd, + GF_XATTROP_ADD_ARRAY, xattr, + NULL); } - - break; - + break; case AFR_ENTRY_RENAME_TRANSACTION: - lower = lower_path (&local->transaction.parent_loc, - local->transaction.basename, - &local->transaction.new_parent_loc, - local->transaction.new_basename); - - lower_name = (lower == &local->transaction.parent_loc ? - local->transaction.basename : - local->transaction.new_basename); - - higher = (lower == &local->transaction.parent_loc ? - &local->transaction.new_parent_loc : - &local->transaction.parent_loc); - - higher_name = (higher == &local->transaction.parent_loc ? 
- local->transaction.basename : - local->transaction.new_basename); - - if (local->transaction.locked_nodes[i] & LOCKED_LOWER) { - STACK_WIND (frame, afr_unlock_common_cbk, - priv->children[i], - priv->children[i]->fops->entrylk, - this->name, - lower, lower_name, - ENTRYLK_UNLOCK, ENTRYLK_WRLCK); - - call_count--; - } - if (local->transaction.locked_nodes[i] & LOCKED_YES) { - STACK_WIND (frame, afr_unlock_common_cbk, - priv->children[i], - priv->children[i]->fops->entrylk, - this->name, - higher, higher_name, - ENTRYLK_UNLOCK, ENTRYLK_WRLCK); + STACK_WIND_COOKIE (frame, afr_changelog_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->xattrop, + &local->transaction.new_parent_loc, + GF_XATTROP_ADD_ARRAY, xattr, + NULL); + call_count--; - call_count--; - } - - break; + /* fall through */ case AFR_ENTRY_TRANSACTION: - if (local->transaction.locked_nodes[i] & LOCKED_YES) { - if (local->fd) { - STACK_WIND (frame, afr_unlock_common_cbk, - priv->children[i], - priv->children[i]->fops->fentrylk, - this->name, local->fd, - local->transaction.basename, - ENTRYLK_UNLOCK, ENTRYLK_WRLCK); - } else { - STACK_WIND (frame, afr_unlock_common_cbk, - priv->children[i], - priv->children[i]->fops->entrylk, - this->name, - &local->transaction.parent_loc, - local->transaction.basename, - ENTRYLK_UNLOCK, ENTRYLK_WRLCK); - - } - - call_count--; - } - - break; - } + if (local->fd) + STACK_WIND_COOKIE (frame, afr_changelog_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->fxattrop, + local->fd, + GF_XATTROP_ADD_ARRAY, xattr, + NULL); + else + STACK_WIND_COOKIE (frame, afr_changelog_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->xattrop, + &local->transaction.parent_loc, + GF_XATTROP_ADD_ARRAY, xattr, + NULL); + break; + } - if (!call_count) + if (!--call_count) break; - } + } return 0; } -/* }}} */ +int +afr_changelog_pre_op (call_frame_t *frame, xlator_t *this) +{ + afr_private_t * priv = this->private; + int i = 0; 
+ int ret = 0; + int call_count = 0; + int op_errno = 0; + afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + unsigned char *locked_nodes = NULL; + unsigned char *pending_subvols = NULL; + int idx = -1; + gf_boolean_t pre_nop = _gf_true; + dict_t *xdata_req = NULL; + + local = frame->local; + int_lock = &local->internal_lock; + idx = afr_index_for_transaction_type (local->transaction.type); -/* {{{ pending */ + locked_nodes = afr_locked_nodes_get (local->transaction.type, int_lock); -int32_t -afr_changelog_post_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xattr) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; + pending_subvols = alloca0 (priv->child_count); - int call_count = -1; + for (i = 0; i < priv->child_count; i++) { + if (locked_nodes[i]) { + local->transaction.pre_op[i] = 1; + call_count++; + } else { + pending_subvols[i] = 1; + } + } - int (*post_post_op) (call_frame_t *, xlator_t *); + /* TBD: quorum check w/ call_count */ - priv = this->private; - local = frame->local; + if (call_count == 0) { + op_errno = ENOTCONN; + goto err; + } - LOCK (&frame->lock); - { - call_count = --local->call_count; + xdata_req = dict_new(); + if (!xdata_req) { + op_errno = ENOMEM; + goto err; } - UNLOCK (&frame->lock); - if (call_count == 0) { - if (local->transaction.post_post_op) { - post_post_op = local->transaction.post_post_op; + pre_nop = _gf_true; - if (afr_lock_server_count (priv, local->transaction.type) == 0) { - local->transaction.post_post_op = local->transaction.done; - } else { - local->transaction.post_post_op = afr_unlock; - } + if (afr_changelog_pre_op_inherit (frame, this)) + goto next; - post_post_op (frame, this); - } else { - if (afr_lock_server_count (priv, local->transaction.type) == 0) { - local->transaction.done (frame, this); - } else { - afr_unlock (frame, this); - } - } + if (call_count < priv->child_count) { + /* For subvols we are not performing operation 
on, + mark them as pending up-front along with the FOP + so that we can safely defer unmarking dirty until + later. + */ + for (i = 0; i < priv->child_count; i++) { + if (pending_subvols[i]) + local->pending[i][idx] = hton32(1); + } + ret = afr_set_pending_dict (priv, xdata_req, + local->pending); + if (ret < 0) { + op_errno = ENOMEM; + goto err; + } + pre_nop = _gf_false; + } + + if (call_count > 1 && + (local->transaction.type == AFR_DATA_TRANSACTION || + !local->optimistic_change_log)) { + + /* If we are performing change on only one subvol, no + need to mark dirty, because we are setting the pending + counts already anyways + */ + local->dirty[idx] = hton32(1); + + ret = dict_set_static_bin (xdata_req, AFR_DIRTY, local->dirty, + sizeof(int) * AFR_NUM_CHANGE_LOGS); + if (ret) { + op_errno = ENOMEM; + goto err; + } + + pre_nop = _gf_false; + local->transaction.dirtied = 1; + } + + if (pre_nop) + goto next; + + if (!local->pre_op_compat) { + dict_copy (xdata_req, local->xdata_req); + goto next; } - return 0; + afr_changelog_do (frame, this, xdata_req, afr_transaction_perform_fop); + + if (xdata_req) + dict_unref (xdata_req); + + return 0; +next: + afr_transaction_perform_fop (frame, this); + + if (xdata_req) + dict_unref (xdata_req); + + return 0; +err: + local->internal_lock.lock_cbk = local->transaction.done; + local->op_ret = -1; + local->op_errno = op_errno; + + afr_unlock (frame, this); + + if (xdata_req) + dict_unref (xdata_req); + + return 0; } -int -afr_changelog_post_op (call_frame_t *frame, xlator_t *this) +int +afr_post_blocking_inodelk_cbk (call_frame_t *frame, xlator_t *this) { - afr_private_t * priv = this->private; + afr_internal_lock_t *int_lock = NULL; + afr_local_t *local = NULL; - int ret = 0; - int i = 0; - int call_count = 0; - - afr_local_t * local = NULL; - dict_t **xattr = NULL; + local = frame->local; + int_lock = &local->internal_lock; - local = frame->local; + if (int_lock->lock_op_ret < 0) { + gf_log (this->name, GF_LOG_INFO, + 
"Blocking inodelks failed."); + local->transaction.done (frame, this); + } else { - __mark_down_children (local->pending, priv->child_count, - local->child_up, local->transaction.type); - - if (local->op == GF_FOP_FLUSH) { - __mark_failed_children (local->pending, priv->child_count, - this, local->fd, - local->transaction.type); + gf_log (this->name, GF_LOG_DEBUG, + "Blocking inodelks done. Proceeding to FOP"); + afr_internal_lock_finish (frame, this); } - xattr = alloca (priv->child_count * sizeof (*xattr)); - memset (xattr, 0, (priv->child_count * sizeof (*xattr))); - for (i = 0; i < priv->child_count; i++) { - xattr[i] = get_new_dict (); - dict_ref (xattr[i]); - } + return 0; +} + - if (local->op == GF_FOP_FLUSH) { - call_count = afr_pre_op_done_count (this, local->fd, local->child_up); +int +afr_post_nonblocking_inodelk_cbk (call_frame_t *frame, xlator_t *this) +{ + afr_internal_lock_t *int_lock = NULL; + afr_local_t *local = NULL; + + local = frame->local; + int_lock = &local->internal_lock; + + /* Initiate blocking locks if non-blocking has failed */ + if (int_lock->lock_op_ret < 0) { + gf_log (this->name, GF_LOG_DEBUG, + "Non blocking inodelks failed. Proceeding to blocking"); + int_lock->lock_cbk = afr_post_blocking_inodelk_cbk; + afr_blocking_lock (frame, this); } else { - call_count = afr_up_children_count (priv->child_count, local->child_up); - if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) { - call_count *= 2; - } + gf_log (this->name, GF_LOG_DEBUG, + "Non blocking inodelks done. 
Proceeding to FOP"); + afr_internal_lock_finish (frame, this); } - local->call_count = call_count; + return 0; +} - if (call_count == 0) { - /* no child is up */ - for (i = 0; i < priv->child_count; i++) { - dict_unref (xattr[i]); - } - - afr_unlock (frame, this); - return 0; - } - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - ret = afr_set_pending_dict (priv, xattr[i], - local->pending); - - if (ret < 0) - gf_log (this->name, GF_LOG_DEBUG, - "failed to set pending entry"); - - - switch (local->transaction.type) { - case AFR_DATA_TRANSACTION: - case AFR_METADATA_TRANSACTION: - { - if (local->fd) - STACK_WIND (frame, afr_changelog_post_op_cbk, - priv->children[i], - priv->children[i]->fops->fxattrop, - local->fd, - GF_XATTROP_ADD_ARRAY, xattr[i]); - else - STACK_WIND (frame, afr_changelog_post_op_cbk, - priv->children[i], - priv->children[i]->fops->xattrop, - &local->loc, - GF_XATTROP_ADD_ARRAY, xattr[i]); - call_count--; - } - break; +int +afr_post_blocking_entrylk_cbk (call_frame_t *frame, xlator_t *this) +{ + afr_internal_lock_t *int_lock = NULL; + afr_local_t *local = NULL; - case AFR_FLUSH_TRANSACTION: - { - if (__if_fd_pre_op_done (this, local->fd, i)) { - STACK_WIND (frame, afr_changelog_post_op_cbk, - priv->children[i], - priv->children[i]->fops->fxattrop, - local->fd, - GF_XATTROP_ADD_ARRAY, xattr[i]); - call_count--; - } - } - break; + local = frame->local; + int_lock = &local->internal_lock; - case AFR_ENTRY_RENAME_TRANSACTION: - { - STACK_WIND_COOKIE (frame, afr_changelog_post_op_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->xattrop, - &local->transaction.new_parent_loc, - GF_XATTROP_ADD_ARRAY, xattr[i]); - - call_count--; - } + if (int_lock->lock_op_ret < 0) { + gf_log (this->name, GF_LOG_INFO, + "Blocking entrylks failed."); + local->transaction.done (frame, this); + } else { - /* - set it again because previous stack_wind - might have already returned (think of case - where subvolume is posix) and 
would have - used the dict as placeholder for return - value - */ - - ret = afr_set_pending_dict (priv, xattr[i], - local->pending); - - if (ret < 0) - gf_log (this->name, GF_LOG_DEBUG, - "failed to set pending entry"); - - /* fall through */ - - case AFR_ENTRY_TRANSACTION: - { - if (local->fd) - STACK_WIND (frame, afr_changelog_post_op_cbk, - priv->children[i], - priv->children[i]->fops->fxattrop, - local->fd, - GF_XATTROP_ADD_ARRAY, xattr[i]); - else - STACK_WIND (frame, afr_changelog_post_op_cbk, - priv->children[i], - priv->children[i]->fops->xattrop, - &local->transaction.parent_loc, - GF_XATTROP_ADD_ARRAY, xattr[i]); - call_count--; - } - break; - } + gf_log (this->name, GF_LOG_DEBUG, + "Blocking entrylks done. Proceeding to FOP"); + afr_internal_lock_finish (frame, this); + } - if (!call_count) - break; - } - } + return 0; +} - for (i = 0; i < priv->child_count; i++) { - dict_unref (xattr[i]); + +int +afr_post_nonblocking_entrylk_cbk (call_frame_t *frame, xlator_t *this) +{ + afr_internal_lock_t *int_lock = NULL; + afr_local_t *local = NULL; + + local = frame->local; + int_lock = &local->internal_lock; + + /* Initiate blocking locks if non-blocking has failed */ + if (int_lock->lock_op_ret < 0) { + gf_log (this->name, GF_LOG_DEBUG, + "Non blocking entrylks failed. Proceeding to blocking"); + int_lock->lock_cbk = afr_post_blocking_entrylk_cbk; + afr_blocking_lock (frame, this); + } else { + + gf_log (this->name, GF_LOG_DEBUG, + "Non blocking entrylks done. 
Proceeding to FOP"); + afr_internal_lock_finish (frame, this); } - return 0; + return 0; } -int32_t -afr_changelog_pre_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xattr) +int +afr_post_blocking_rename_cbk (call_frame_t *frame, xlator_t *this) { - afr_local_t * local = NULL; - afr_private_t * priv = this->private; - loc_t * loc = NULL; + afr_internal_lock_t *int_lock = NULL; + afr_local_t *local = NULL; - int call_count = -1; - int child_index = (long) cookie; + local = frame->local; + int_lock = &local->internal_lock; - local = frame->local; - loc = &local->loc; + if (int_lock->lock_op_ret < 0) { + gf_log (this->name, GF_LOG_INFO, + "Blocking entrylks failed."); + local->transaction.done (frame, this); + } else { - LOCK (&frame->lock); - { - if (op_ret == 0) { - __mark_pre_op_done_on_fd (frame, this, child_index); - } + gf_log (this->name, GF_LOG_DEBUG, + "Blocking entrylks done. Proceeding to FOP"); + afr_internal_lock_finish (frame, this); + } + return 0; +} - if (op_ret == -1) { - local->child_up[child_index] = 0; - - if (op_errno == ENOTSUP) { - gf_log (this->name, GF_LOG_ERROR, - "xattrop not supported by %s", - priv->children[child_index]->name); - local->op_ret = -1; - - } else if (!child_went_down (op_ret, op_errno)) { - gf_log (this->name, GF_LOG_ERROR, - "xattrop failed on child %s: %s", - priv->children[child_index]->name, - strerror (op_errno)); - } - local->op_errno = op_errno; - } - call_count = --local->call_count; - } - UNLOCK (&frame->lock); +int +afr_post_lower_unlock_cbk (call_frame_t *frame, xlator_t *this) +{ + afr_internal_lock_t *int_lock = NULL; + afr_local_t *local = NULL; - if (call_count == 0) { - if ((local->op_ret == -1) && - (local->op_errno == ENOTSUP)) { - local->transaction.resume (frame, this); - } else { - __mark_all_success (local->pending, priv->child_count, - local->transaction.type); + local = frame->local; + int_lock = &local->internal_lock; - afr_pid_restore (frame); 
+ GF_ASSERT (!int_lock->higher_locked); - local->transaction.fop (frame, this); - } - } + int_lock->lock_cbk = afr_post_blocking_rename_cbk; + afr_blocking_lock (frame, this); - return 0; + return 0; } -int -afr_changelog_pre_op (call_frame_t *frame, xlator_t *this) +int +afr_set_transaction_flock (afr_local_t *local) { - afr_private_t * priv = this->private; + afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; - int i = 0; - int ret = 0; - int call_count = 0; - dict_t **xattr = NULL; + int_lock = &local->internal_lock; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); - afr_local_t *local = NULL; + inodelk->flock.l_len = local->transaction.len; + inodelk->flock.l_start = local->transaction.start; + inodelk->flock.l_type = F_WRLCK; - local = frame->local; - - xattr = alloca (priv->child_count * sizeof (*xattr)); - memset (xattr, 0, (priv->child_count * sizeof (*xattr))); + return 0; +} - for (i = 0; i < priv->child_count; i++) { - xattr[i] = get_new_dict (); - dict_ref (xattr[i]); - } +int +afr_lock_rec (call_frame_t *frame, xlator_t *this) +{ + afr_internal_lock_t *int_lock = NULL; + afr_local_t *local = NULL; - call_count = afr_up_children_count (priv->child_count, - local->child_up); + local = frame->local; + int_lock = &local->internal_lock; - if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) { - call_count *= 2; - } + int_lock->transaction_lk_type = AFR_TRANSACTION_LK; + int_lock->domain = this->name; - if (call_count == 0) { - /* no child is up */ - for (i = 0; i < priv->child_count; i++) { - dict_unref (xattr[i]); - } - - afr_unlock (frame, this); - return 0; - } + switch (local->transaction.type) { + case AFR_DATA_TRANSACTION: + case AFR_METADATA_TRANSACTION: + afr_set_transaction_flock (local); - local->call_count = call_count; + int_lock->lock_cbk = afr_post_nonblocking_inodelk_cbk; - __mark_all_pending (local->pending, priv->child_count, - local->transaction.type); + afr_nonblocking_inodelk (frame, this); + break; - 
for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - ret = afr_set_pending_dict (priv, xattr[i], - local->pending); - - if (ret < 0) - gf_log (this->name, GF_LOG_DEBUG, - "failed to set pending entry"); - - - switch (local->transaction.type) { - case AFR_DATA_TRANSACTION: - case AFR_METADATA_TRANSACTION: - case AFR_FLUSH_TRANSACTION: - { - if (local->fd) - STACK_WIND_COOKIE (frame, - afr_changelog_pre_op_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fxattrop, - local->fd, - GF_XATTROP_ADD_ARRAY, xattr[i]); - else - STACK_WIND_COOKIE (frame, - afr_changelog_pre_op_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->xattrop, - &(local->loc), - GF_XATTROP_ADD_ARRAY, xattr[i]); - } - break; - - case AFR_ENTRY_RENAME_TRANSACTION: - { - STACK_WIND_COOKIE (frame, - afr_changelog_pre_op_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->xattrop, - &local->transaction.new_parent_loc, - GF_XATTROP_ADD_ARRAY, xattr[i]); + case AFR_ENTRY_RENAME_TRANSACTION: - call_count--; - } + int_lock->lock_cbk = afr_post_nonblocking_entrylk_cbk; + afr_nonblocking_entrylk (frame, this); + break; + case AFR_ENTRY_TRANSACTION: + int_lock->lk_basename = local->transaction.basename; + if (&local->transaction.parent_loc) + int_lock->lk_loc = &local->transaction.parent_loc; + else + GF_ASSERT (local->fd); - /* - set it again because previous stack_wind - might have already returned (think of case - where subvolume is posix) and would have - used the dict as placeholder for return - value - */ + int_lock->lock_cbk = afr_post_nonblocking_entrylk_cbk; + afr_nonblocking_entrylk (frame, this); + break; + } - ret = afr_set_pending_dict (priv, xattr[i], - local->pending); - - if (ret < 0) - gf_log (this->name, GF_LOG_DEBUG, - "failed to set pending entry"); - - /* fall through */ - - case AFR_ENTRY_TRANSACTION: - { - if (local->fd) - STACK_WIND_COOKIE (frame, - afr_changelog_pre_op_cbk, - (void *) (long) i, - 
priv->children[i], - priv->children[i]->fops->fxattrop, - local->fd, - GF_XATTROP_ADD_ARRAY, xattr[i]); - else - STACK_WIND_COOKIE (frame, - afr_changelog_pre_op_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->xattrop, - &local->transaction.parent_loc, - GF_XATTROP_ADD_ARRAY, xattr[i]); - } + return 0; +} - break; - } - if (!--call_count) - break; - } - } - - for (i = 0; i < priv->child_count; i++) { - dict_unref (xattr[i]); - } - - return 0; +int +afr_lock (call_frame_t *frame, xlator_t *this) +{ + afr_set_lock_number (frame, this); + + return afr_lock_rec (frame, this); } + /* }}} */ -/* {{{ lock */ +int +afr_internal_lock_finish (call_frame_t *frame, xlator_t *this) +{ + if (__fop_changelog_needed (frame, this)) { + afr_changelog_pre_op (frame, this); + } else { + afr_transaction_perform_fop (frame, this); + } + + return 0; +} -static -int afr_lock_rec (call_frame_t *frame, xlator_t *this, int child_index); -int32_t -afr_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) +void +afr_set_delayed_post_op (call_frame_t *frame, xlator_t *this) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - int done = 0; - int child_index = (long) cookie; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - local = frame->local; - priv = this->private; + /* call this function from any of the related optimizations + which benefit from delaying post op are enabled, namely: - LOCK (&frame->lock); - { - if (op_ret == -1) { - if (op_errno == ENOSYS) { - /* return ENOTSUP */ - gf_log (this->name, GF_LOG_ERROR, - "subvolume does not support locking. 
" - "please load features/posix-locks xlator on server"); - local->op_ret = op_ret; - done = 1; - } + - changelog piggybacking + - eager locking + */ - local->child_up[child_index] = 0; - local->op_errno = op_errno; - } - } - UNLOCK (&frame->lock); - - if ((op_ret == -1) && - (op_errno == ENOSYS)) { - afr_unlock (frame, this); - } else { - if (op_ret == 0) { - local->transaction.locked_nodes[child_index] - |= LOCKED_YES; - local->transaction.lock_count++; - } - afr_lock_rec (frame, this, child_index + 1); - } + priv = this->private; + if (!priv) + return; - return 0; -} + if (!priv->post_op_delay_secs) + return; + local = frame->local; + /* validate local before any of its members are read */ + if (!local) + return; + + if (!local->transaction.eager_lock_on) + return; + + if (!local->fd) + return; + + if (local->op == GF_FOP_WRITE) + local->delayed_post_op = _gf_true; +} -int32_t -afr_lock_lower_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) +gf_boolean_t +afr_are_multiple_fds_opened (fd_t *fd, xlator_t *this) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; + afr_fd_ctx_t *fd_ctx = NULL; + + if (!fd) { + /* If false is returned, it may keep on taking eager-lock + * which may lead to starvation, so return true to avoid that. + */ + gf_log_callingfn (this->name, GF_LOG_ERROR, "Invalid fd"); + return _gf_true; + } + /* Lets say mount1 has eager-lock(full-lock) and after the eager-lock + * is taken mount2 opened the same file, it won't be able to + * perform any data operations until mount1 releases eager-lock. 
+ * To avoid such scenario do not enable eager-lock for this transaction + * if open-fd-count is > 1 + */ - int child_index = (long) cookie; + fd_ctx = afr_fd_ctx_get (fd, this); + if (!fd_ctx) + return _gf_true; - loc_t * lower = NULL; - loc_t * higher = NULL; + if (fd_ctx->open_fd_count > 1) + return _gf_true; - const char *lower_name = NULL; - const char *higher_name = NULL; + return _gf_false; +} - priv = this->private; - local = frame->local; - LOCK (&frame->lock); - { - if (op_ret == -1) { - if (op_errno == ENOSYS) { - /* return ENOTSUP */ +gf_boolean_t +is_afr_delayed_changelog_post_op_needed (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + gf_boolean_t res = _gf_false; - gf_log (this->name, GF_LOG_ERROR, - "subvolume does not support locking. " - "please load features/posix-locks xlator on server"); + local = frame->local; + if (!local) + goto out; - local->op_ret = op_ret; - } + if (!local->delayed_post_op) + goto out; - local->child_up[child_index] = 0; - local->op_errno = op_errno; - } - } - UNLOCK (&frame->lock); + //Mark pending changelog ASAP + if (!afr_txn_nothing_failed (frame, this)) + goto out; - if (op_ret != 0) { - afr_unlock (frame, this); + if (local->fd && afr_are_multiple_fds_opened (local->fd, this)) goto out; - } else { - local->transaction.locked_nodes[child_index] |= LOCKED_LOWER; - } - /* The lower path has been locked. Now lock the higher path */ + res = _gf_true; +out: + return res; +} + + +void +afr_delayed_changelog_post_op (xlator_t *this, call_frame_t *frame, fd_t *fd, + call_stub_t *stub); + +void +afr_delayed_changelog_wake_up_cbk (void *data) +{ + fd_t *fd = NULL; - lower = lower_path (&local->transaction.parent_loc, - local->transaction.basename, - &local->transaction.new_parent_loc, - local->transaction.new_basename); + fd = data; - lower_name = (lower == &local->transaction.parent_loc ? 
- local->transaction.basename : - local->transaction.new_basename); + afr_delayed_changelog_wake_up (THIS, fd); +} - higher = (lower == &local->transaction.parent_loc ? - &local->transaction.new_parent_loc : - &local->transaction.parent_loc); - higher_name = (higher == &local->transaction.parent_loc ? - local->transaction.basename : - local->transaction.new_basename); +/* SET operation */ +int +afr_fd_report_unstable_write (xlator_t *this, fd_t *fd) +{ + afr_fd_ctx_t *fdctx = NULL; - STACK_WIND_COOKIE (frame, afr_lock_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->entrylk, - this->name, higher, higher_name, - ENTRYLK_LOCK, ENTRYLK_WRLCK); + fdctx = afr_fd_ctx_get (fd, this); + /* no fd ctx: nothing to mark; the TEST operation below treats a missing ctx as unstable */ + if (!fdctx) + return -1; + + LOCK(&fd->lock); + { + fdctx->witnessed_unstable_write = _gf_true; + } + UNLOCK(&fd->lock); -out: return 0; } - -static -int afr_lock_rec (call_frame_t *frame, xlator_t *this, int child_index) +/* TEST and CLEAR operation */ +gf_boolean_t +afr_fd_has_witnessed_unstable_write (xlator_t *this, fd_t *fd) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; + afr_fd_ctx_t *fdctx = NULL; + gf_boolean_t witness = _gf_false; - uint64_t ctx; - afr_fd_ctx_t *fd_ctx; + fdctx = afr_fd_ctx_get (fd, this); + if (!fdctx) + return _gf_true; - struct flock flock; + LOCK(&fd->lock); + { + if (fdctx->witnessed_unstable_write) { + witness = _gf_true; + fdctx->witnessed_unstable_write = _gf_false; + } + } + UNLOCK (&fd->lock); - int ret = 0; + return witness; +} - loc_t * lower = NULL; - loc_t * higher = NULL; - const char *lower_name = NULL; - const char *higher_name = NULL; +int +afr_changelog_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *pre, + struct iatt *post, dict_t *xdata) +{ + afr_private_t *priv = NULL; + int child_index = (long) cookie; + int call_count = -1; + afr_local_t *local = NULL; - local = frame->local; - priv = this->private; + priv = this->private; + local = 
frame->local; - flock.l_start = local->transaction.start; - flock.l_len = local->transaction.len; - flock.l_type = F_WRLCK; + if (op_ret != 0) { + /* Failure of fsync() is as good as failure of previous + write(). So treat it like one. + */ + gf_log (this->name, GF_LOG_WARNING, + "fsync(%s) failed on subvolume %s. Transaction was %s", + uuid_utoa (local->fd->inode->gfid), + priv->children[child_index]->name, + gf_fop_list[local->op]); + + afr_transaction_fop_failed (frame, this, child_index); + } - if (local->fd) { - ret = fd_ctx_get (local->fd, this, &ctx); + call_count = afr_frame_return (frame); - if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "unable to get fd ctx for fd=%p", - local->fd); + if (call_count == 0) + afr_changelog_post_op_now (frame, this); - local->op_ret = -1; - local->op_errno = EINVAL; + return 0; +} - afr_unlock (frame, this); - return 0; - } +int +afr_changelog_fsync (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + int i = 0; + int call_count = 0; + afr_private_t *priv = NULL; + dict_t *xdata = NULL; + GF_UNUSED int ret = -1; - fd_ctx = (afr_fd_ctx_t *)(long) ctx; + local = frame->local; + priv = this->private; - /* skip over children that or down - or don't have the fd open */ + call_count = AFR_COUNT (local->transaction.pre_op, priv->child_count); - while ((child_index < priv->child_count) - && (!local->child_up[child_index] - || !fd_ctx->opened_on[child_index])) + if (!call_count) { + /* will go straight to unlock */ + afr_changelog_post_op_now (frame, this); + return 0; + } - child_index++; - } else { - /* skip over children that are down */ - while ((child_index < priv->child_count) - && !local->child_up[child_index]) - child_index++; + local->call_count = call_count; + + xdata = dict_new(); + if (xdata) + ret = dict_set_int32 (xdata, "batch-fsync", 1); + + for (i = 0; i < priv->child_count; i++) { + if (!local->transaction.pre_op[i]) + continue; + + STACK_WIND_COOKIE (frame, afr_changelog_fsync_cbk, + (void 
*) (long) i, priv->children[i], + priv->children[i]->fops->fsync, local->fd, + 1, xdata); + if (!--call_count) + break; } - if ((child_index == priv->child_count) && - local->transaction.lock_count == 0) { + if (xdata) + dict_unref (xdata); - gf_log (this->name, GF_LOG_DEBUG, - "unable to lock on even one child"); + return 0; +} - local->op_ret = -1; - local->op_errno = EAGAIN; - afr_unlock (frame, this); - - return 0; +int +afr_changelog_post_op_safe (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - } + local = frame->local; + priv = this->private; - if ((child_index == priv->child_count) - || (local->transaction.lock_count == - afr_lock_server_count (priv, local->transaction.type))) { + if (!local->fd || local->transaction.type != AFR_DATA_TRANSACTION) { + afr_changelog_post_op_now (frame, this); + return 0; + } - /* we're done locking */ + if (afr_changelog_pre_op_uninherit (frame, this) && + afr_txn_nothing_failed (frame, this)) { + /* just detected that this post-op is about to + be optimized away as a new write() has + already piggybacked on this frame's changelog. + */ + afr_changelog_post_op_now (frame, this); + return 0; + } - if (__changelog_needed_pre_op (frame, this)) { - afr_changelog_pre_op (frame, this); - } else { - __mark_all_success (local->pending, priv->child_count, - local->transaction.type); + /* Calling afr_changelog_post_op_now() now will result in + issuing ->[f]xattrop(). + + Performing a hard POST-OP (->[f]xattrop() FOP) is a more + responsible operation that what it might appear on the surface. + + The changelog of a file (in the xattr of the file on the server) + stores information (pending count) about the state of the file + on the OTHER server. This changelog is blindly trusted, and must + therefore be updated in such a way it remains trustworthy. 
This + implies that decrementing the pending count (essentially "clearing + the dirty flag") must be done STRICTLY after we are sure that the + operation on the other server has reached stable storage. + + While the backend filesystem on that server will eventually flush + it to stable storage, we (being in userspace) have no mechanism + to get notified when the write became "stable". + + This means we need to take matters into our own hands and issue an + fsync() EVEN IF THE APPLICATION WAS PERFORMING UNSTABLE WRITES, + and get an acknowledgement for it. And we need to wait for the + fsync() acknowledgement before initiating the hard POST-OP. + + However if the FD itself was opened in O_SYNC or O_DSYNC then + we are already guaranteed that the writes were made stable as + part of the FOP itself. The same holds true for NFS stable + writes which happen on an anonymous FD with O_DSYNC or O_SYNC + flag set in the writev() @flags param. For all other write types, + mark a flag in the fdctx whenever an unstable write is witnessed. + */ + + if (!afr_fd_has_witnessed_unstable_write (this, local->fd)) { + afr_changelog_post_op_now (frame, this); + return 0; + }
+ */ + if (priv->ensure_durability) { + /* Time to fsync() */ + afr_changelog_fsync (frame, this); + } else { + afr_changelog_post_op_now (frame, this); + } - local->transaction.fop (frame, this); - } + return 0; +} - return 0; - } - switch (local->transaction.type) { - case AFR_DATA_TRANSACTION: - case AFR_METADATA_TRANSACTION: - case AFR_FLUSH_TRANSACTION: - - if (local->fd) { - STACK_WIND_COOKIE (frame, afr_lock_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->finodelk, - this->name, local->fd, - F_SETLKW, &flock); - - } else { - STACK_WIND_COOKIE (frame, afr_lock_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->inodelk, - this->name, &local->loc, - F_SETLKW, &flock); - } - - break; - - case AFR_ENTRY_RENAME_TRANSACTION: +void +afr_delayed_changelog_post_op (xlator_t *this, call_frame_t *frame, fd_t *fd, + call_stub_t *stub) +{ + afr_fd_ctx_t *fd_ctx = NULL; + call_frame_t *prev_frame = NULL; + struct timespec delta = {0, }; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + + priv = this->private; + + fd_ctx = afr_fd_ctx_get (fd, this); + if (!fd_ctx) + goto out; + + delta.tv_sec = priv->post_op_delay_secs; + delta.tv_nsec = 0; + + pthread_mutex_lock (&fd_ctx->delay_lock); { - lower = lower_path (&local->transaction.parent_loc, - local->transaction.basename, - &local->transaction.new_parent_loc, - local->transaction.new_basename); - - lower_name = (lower == &local->transaction.parent_loc ? - local->transaction.basename : - local->transaction.new_basename); - - higher = (lower == &local->transaction.parent_loc ? - &local->transaction.new_parent_loc : - &local->transaction.parent_loc); - - higher_name = (higher == &local->transaction.parent_loc ? 
- local->transaction.basename : - local->transaction.new_basename); - - STACK_WIND_COOKIE (frame, afr_lock_lower_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->entrylk, - this->name, lower, lower_name, - ENTRYLK_LOCK, ENTRYLK_WRLCK); - - break; + prev_frame = fd_ctx->delay_frame; + fd_ctx->delay_frame = NULL; + if (fd_ctx->delay_timer) + gf_timer_call_cancel (this->ctx, fd_ctx->delay_timer); + fd_ctx->delay_timer = NULL; + if (!frame) + goto unlock; + fd_ctx->delay_timer = gf_timer_call_after (this->ctx, delta, + afr_delayed_changelog_wake_up_cbk, + fd); + fd_ctx->delay_frame = frame; } - - case AFR_ENTRY_TRANSACTION: - if (local->fd) { - STACK_WIND_COOKIE (frame, afr_lock_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->fentrylk, - this->name, local->fd, - local->transaction.basename, - ENTRYLK_LOCK, ENTRYLK_WRLCK); - } else { - STACK_WIND_COOKIE (frame, afr_lock_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->entrylk, - this->name, - &local->transaction.parent_loc, - local->transaction.basename, - ENTRYLK_LOCK, ENTRYLK_WRLCK); - } +unlock: + pthread_mutex_unlock (&fd_ctx->delay_lock); - break; +out: + if (prev_frame) { + local = prev_frame->local; + local->transaction.resume_stub = stub; + afr_changelog_post_op_now (prev_frame, this); + } else if (stub) { + call_resume (stub); } - - return 0; } -int32_t afr_lock (call_frame_t *frame, xlator_t *this) +void +afr_changelog_post_op (call_frame_t *frame, xlator_t *this) { - afr_pid_save (frame); + afr_local_t *local = NULL; - frame->root->pid = (long) frame->root; + local = frame->local; - return afr_lock_rec (frame, this, 0); + if (is_afr_delayed_changelog_post_op_needed (frame, this)) + afr_delayed_changelog_post_op (this, frame, local->fd, NULL); + else + afr_changelog_post_op_safe (frame, this); } -/* }}} */ -int32_t +/* Wake up the sleeping/delayed 
post-op, and also register + a stub to have it resumed after this transaction + completely finishes. + + The @stub gets saved in @local and gets resumed in + afr_local_cleanup() + */ +void +afr_delayed_changelog_wake_resume (xlator_t *this, fd_t *fd, call_stub_t *stub) +{ + afr_delayed_changelog_post_op (this, NULL, fd, stub); +} + + +void +afr_delayed_changelog_wake_up (xlator_t *this, fd_t *fd) +{ + afr_delayed_changelog_post_op (this, NULL, fd, NULL); +} + + +int afr_transaction_resume (call_frame_t *frame, xlator_t *this) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; + afr_local_t *local = NULL; - local = frame->local; - priv = this->private; + local = frame->local; - if (__changelog_needed_post_op (frame, this)) { - afr_changelog_post_op (frame, this); - } else { - if (afr_lock_server_count (priv, local->transaction.type) == 0) { - local->transaction.done (frame, this); - } else { - afr_unlock (frame, this); - } - } + if (local->transaction.eager_lock_on) { + /* We don't need to retain "local" in the + fd list anymore, writes to all subvols + are finished by now */ + afr_remove_eager_lock_stub (local); + } - return 0; + afr_restore_lk_owner (frame); + + afr_handle_symmetric_errors (frame, this); + + if (!local->pre_op_compat) + /* new mode, pre-op was done along + with OP */ + afr_changelog_pre_op_update (frame, this); + + if (__fop_changelog_needed (frame, this)) { + afr_changelog_post_op (frame, this); + } else { + afr_changelog_post_op_done (frame, this); + } + + return 0; } @@ -1410,54 +1582,141 @@ afr_transaction_resume (call_frame_t *frame, xlator_t *this) */ void -afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this, int child_index) +afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this, + int child_index) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; + afr_local_t * local = NULL; - local = frame->local; - priv = this->private; + local = frame->local; - switch (local->op) { - case GF_FOP_WRITE: - 
__mark_fop_failed_on_fd (local->fd, this, child_index); - break; - default: - __mark_child_dead (local->pending, priv->child_count, - child_index, local->transaction.type); - break; + local->transaction.failed_subvols[child_index] = 1; +} + + + + static gf_boolean_t +afr_locals_overlap (afr_local_t *local1, afr_local_t *local2) +{ + uint64_t start1 = local1->transaction.start; + uint64_t start2 = local2->transaction.start; + uint64_t end1 = 0; + uint64_t end2 = 0; + + if (local1->transaction.len) + end1 = start1 + local1->transaction.len - 1; + else + end1 = ULLONG_MAX; + + if (local2->transaction.len) + end2 = start2 + local2->transaction.len - 1; + else + end2 = ULLONG_MAX; + + return ((end1 >= start2) && (end2 >= start1)); +} + +void +afr_transaction_eager_lock_init (afr_local_t *local, xlator_t *this) +{ + afr_private_t *priv = NULL; + afr_fd_ctx_t *fdctx = NULL; + afr_local_t *each = NULL; + + priv = this->private; + + if (!local->fd) + return; + + if (local->transaction.type != AFR_DATA_TRANSACTION) + return; + + if (!priv->eager_lock) + return; + + fdctx = afr_fd_ctx_get (local->fd, this); + if (!fdctx) + return; + + if (afr_are_multiple_fds_opened (local->fd, this)) + return; + /* + * Once full file lock is acquired in eager-lock phase, overlapping + * writes do not compete for inode-locks, instead are transferred to the + * next writes. Because of this overlapping writes are not ordered. + * This can cause inconsistencies in replication. + * Example: + * Two overlapping writes w1, w2 are sent in parallel on same fd + * in two threads t1, t2. + * Both threads can execute afr_writev_wind in the following manner. + * t1 winds w1 on brick-0 + * t2 winds w2 on brick-0 + * t2 winds w2 on brick-1 + * t1 winds w1 on brick-1 + * + * This check makes sure the locks are not transferred for + * overlapping writes. 
+ */ + LOCK (&local->fd->lock); + { + list_for_each_entry (each, &fdctx->eager_locked, + transaction.eager_locked) { + if (afr_locals_overlap (each, local)) { + local->transaction.eager_lock_on = _gf_false; + goto unlock; + } + } + + local->transaction.eager_lock_on = _gf_true; + list_add_tail (&local->transaction.eager_locked, + &fdctx->eager_locked); } +unlock: + UNLOCK (&local->fd->lock); } -int32_t +int afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + fd_t *fd = NULL; + int ret = -1; - local = frame->local; - priv = this->private; + local = frame->local; + priv = this->private; - afr_transaction_local_init (local, priv); + local->transaction.resume = afr_transaction_resume; + local->transaction.type = type; - local->transaction.resume = afr_transaction_resume; - local->transaction.type = type; + ret = afr_transaction_local_init (local, this); + if (ret < 0) + goto out; - if (afr_lock_server_count (priv, local->transaction.type) == 0) { - if (__changelog_needed_pre_op (frame, this)) { - afr_changelog_pre_op (frame, this); - } else { - __mark_all_success (local->pending, priv->child_count, - local->transaction.type); + afr_transaction_eager_lock_init (local, this); - afr_pid_restore (frame); + if (local->fd && local->transaction.eager_lock_on) + afr_set_lk_owner (frame, this, local->fd); + else + afr_set_lk_owner (frame, this, frame->root); - local->transaction.fop (frame, this); - } - } else { - afr_lock (frame, this); - } + if (!local->transaction.eager_lock_on && local->loc.inode) { + fd = fd_lookup (local->loc.inode, frame->root->pid); + if (fd == NULL) + fd = fd_lookup_anonymous (local->loc.inode); - return 0; + if (fd) { + afr_delayed_changelog_wake_up (this, fd); + fd_unref (fd); + } + } + + if (afr_lock_server_count (priv, local->transaction.type) == 0) { + afr_internal_lock_finish (frame, this); + } 
else { + afr_lock (frame, this); + } + ret = 0; +out: + return ret; } |
