From 1af420c700fbc49b65cf7faceb3270e81cd991ce Mon Sep 17 00:00:00 2001 From: Pranith Kumar K Date: Sat, 20 Aug 2011 15:48:27 +0530 Subject: cluster/afr: Perform self-heal without locking the whole file Change-Id: I206571c77f2d7b3c9f9d7bb82a936366fd99ce5c BUG: 3182 Reviewed-on: http://review.gluster.com/141 Tested-by: Gluster Build System Reviewed-by: Vijay Bellur --- xlators/cluster/afr/src/afr-common.c | 98 +- xlators/cluster/afr/src/afr-dir-read.c | 13 +- xlators/cluster/afr/src/afr-dir-write.c | 58 +- xlators/cluster/afr/src/afr-inode-read.c | 15 +- xlators/cluster/afr/src/afr-inode-write.c | 302 +++++- xlators/cluster/afr/src/afr-lk-common.c | 122 ++- xlators/cluster/afr/src/afr-mem-types.h | 1 + xlators/cluster/afr/src/afr-open.c | 499 ++------- xlators/cluster/afr/src/afr-self-heal-algorithm.c | 1114 +++++++-------------- xlators/cluster/afr/src/afr-self-heal-algorithm.h | 20 +- xlators/cluster/afr/src/afr-self-heal-common.c | 84 +- xlators/cluster/afr/src/afr-self-heal-common.h | 29 +- xlators/cluster/afr/src/afr-self-heal-data.c | 709 ++++++++----- xlators/cluster/afr/src/afr-self-heal-entry.c | 44 +- xlators/cluster/afr/src/afr-self-heal-metadata.c | 66 +- xlators/cluster/afr/src/afr-transaction.c | 55 +- xlators/cluster/afr/src/afr-transaction.h | 2 + xlators/cluster/afr/src/afr.h | 73 +- xlators/cluster/afr/src/pump.c | 2 +- xlators/features/locks/src/inodelk.c | 3 +- 20 files changed, 1582 insertions(+), 1727 deletions(-) diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index 94335bd0298..19c5a83d2da 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -706,7 +706,7 @@ afr_local_sh_cleanup (afr_local_t *local, xlator_t *this) if (sh->locked_nodes) GF_FREE (sh->locked_nodes); - if (sh->healing_fd && !sh->healing_fd_opened) { + if (sh->healing_fd) { fd_unref (sh->healing_fd); sh->healing_fd = NULL; } @@ -724,6 +724,14 @@ afr_local_sh_cleanup (afr_local_t *local, xlator_t *this) GF_FREE (sh->fresh_parent_dirs); loc_wipe (&sh->parent_loc); + + if (sh->checksum) + GF_FREE (sh->checksum); + + if (sh->write_needed) + GF_FREE (sh->write_needed); + if (sh->healing_fd) + fd_unref (sh->healing_fd); } @@ -795,6 +803,9 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this) if (local->fresh_children) GF_FREE (local->fresh_children); + if (local->fd_open_on) + GF_FREE (local->fd_open_on); + { /* lookup */ if (local->cont.lookup.xattrs) { afr_reset_xattr (local->cont.lookup.xattrs, @@ -897,23 +908,34 @@ afr_frame_return (call_frame_t *frame) return call_count; } - -/** - * up_children_count - return the number of children that are up - */ - int -afr_up_children_count (int child_count, unsigned char *child_up) +afr_set_elem_count_get (unsigned char *elems, int child_count) { int i = 0; int ret = 0; for (i = 0; i < child_count; i++) - if (child_up[i]) + if (elems[i]) ret++; return ret; } +/** + * up_children_count - return the number of children that are up + */ + +unsigned int +afr_up_children_count (unsigned char *child_up, unsigned int child_count) +{ + return afr_set_elem_count_get (child_up, child_count); +} + +unsigned int +afr_locked_children_count (unsigned char *children, unsigned int child_count) +{ + return afr_set_elem_count_get (children, child_count); +} + gf_boolean_t afr_is_fresh_lookup (loc_t *loc, xlator_t *this) { @@ -1172,7 +1194,7 @@ afr_is_transaction_running (afr_local_t *local) return ((local->inodelk_count > 0) || (local->entrylk_count > 0)); } -static void +void afr_launch_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode, gf_boolean_t is_background, ia_type_t ia_type, void (*gfid_sh_success_cbk) (call_frame_t *sh_frame, @@ -1186,6 +1208,7 @@ afr_launch_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode, GF_ASSERT (frame); GF_ASSERT (this); GF_ASSERT (inode); + GF_ASSERT (ia_type != IA_INVAL); local = frame->local; local->self_heal.background = is_background; @@ -1444,7 +1467,7 @@ static void afr_lookup_perform_self_heal_if_needed (call_frame_t *frame, xlator_t *this, gf_boolean_t *sh_launched) { - size_t up_count = 0; + unsigned int up_count = 0; afr_private_t *priv = NULL; afr_local_t *local = NULL; @@ -1453,7 +1476,7 @@ afr_lookup_perform_self_heal_if_needed (call_frame_t *frame, xlator_t *this, priv = this->private; local = frame->local; - up_count = afr_up_children_count (priv->child_count, local->child_up); + up_count = afr_up_children_count (local->child_up, priv->child_count); if (up_count == 1) { gf_log (this->name, GF_LOG_DEBUG, "Only 1 child up - do not attempt to detect self heal"); @@ -1591,8 +1614,8 @@ afr_lookup_done (call_frame_t *frame, xlator_t *this) if (local->op_ret < 0) goto unwind; gfid_miss_count = afr_lookup_gfid_missing_count (local, this); - up_children_count = afr_up_children_count (priv->child_count, - local->child_up); + up_children_count = afr_up_children_count (local->child_up, + priv->child_count); enotconn_count = priv->child_count - up_children_count; if ((gfid_miss_count == local->success_count) && (enotconn_count > 0)) { @@ -1871,7 +1894,8 @@ afr_lookup (call_frame_t *frame, xlator_t *this, if (loc->parent) local->cont.lookup.parent_ino = loc->parent->ino; - local->child_up = memdup (priv->child_up, priv->child_count); + local->child_up = memdup (priv->child_up, + sizeof (*local->child_up) * priv->child_count); if (NULL == local->child_up) { op_errno = ENOMEM; goto out; @@ -1883,8 +1907,8 @@ afr_lookup (call_frame_t *frame, xlator_t *this, goto out; } - local->call_count = afr_up_children_count (priv->child_count, - local->child_up); + local->call_count = afr_up_children_count (local->child_up, + priv->child_count); call_count = local->call_count; if (local->call_count == 0) { @@ -1994,7 +2018,7 @@ afr_fd_ctx_set (xlator_t *this, fd_t *fd) fd_ctx->opened_on = GF_CALLOC (sizeof (*fd_ctx->opened_on), priv->child_count, - gf_afr_mt_char); + gf_afr_mt_int32_t); if (!fd_ctx->opened_on) { ret = -ENOMEM; goto unlock; @@ -2011,12 +2035,13 @@ afr_fd_ctx_set (xlator_t *this, fd_t *fd) goto unlock; } + INIT_LIST_HEAD (&fd_ctx->paused_calls); + INIT_LIST_HEAD (&fd_ctx->entries); + ret = __fd_ctx_set (fd, this, (uint64_t)(long) fd_ctx); if (ret) gf_log (this->name, GF_LOG_DEBUG, "failed to set fd ctx (%p)", fd); - - INIT_LIST_HEAD (&fd_ctx->entries); } unlock: UNLOCK (&fd->lock); @@ -2108,7 +2133,7 @@ afr_flush_wind (call_frame_t *frame, xlator_t *this) local = frame->local; priv = this->private; - call_count = afr_up_children_count (priv->child_count, local->child_up); + call_count = afr_up_children_count (local->child_up, priv->child_count); if (call_count == 0) { local->transaction.resume (frame, this); @@ -2174,7 +2199,7 @@ afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd) goto out; } - call_count = afr_up_children_count (priv->child_count, local->child_up); + call_count = afr_up_children_count (local->child_up, priv->child_count); transaction_frame = copy_frame (frame); if (!transaction_frame) { @@ -2196,6 +2221,12 @@ afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd) local->transaction.start = 0; local->transaction.len = 0; + ret = afr_open_fd_fix (transaction_frame, this, _gf_false); + if (ret) { + op_ret = -1; + op_errno = -ret; + goto out; + } afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); @@ -3474,8 +3505,8 @@ AFR_LOCAL_INIT (afr_local_t *local, afr_private_t *priv) { local->op_ret = -1; local->op_errno = EUCLEAN; - local->call_count = afr_up_children_count (priv->child_count, - priv->child_up); + local->call_count = afr_up_children_count (priv->child_up, + priv->child_count); if (local->call_count == 0) { gf_log (THIS->name, GF_LOG_INFO, "no subvolumes up"); return -ENOTCONN; @@ -3531,19 +3562,22 @@ out: } int -afr_transaction_local_init (afr_local_t *local, afr_private_t *priv) +afr_transaction_local_init (afr_local_t *local, xlator_t *this) { - int i; - int child_up_count = 0; - int ret = -ENOMEM; + int i = 0; + int child_up_count = 0; + int ret = -ENOMEM; + afr_private_t *priv = NULL; + priv = this->private; ret = afr_internal_lock_init (&local->internal_lock, priv->child_count, AFR_TRANSACTION_LK); if (ret < 0) goto out; ret = -ENOMEM; - child_up_count = afr_up_children_count (priv->child_count, local->child_up); + child_up_count = afr_up_children_count (local->child_up, + priv->child_count); if (priv->optimistic_change_log && child_up_count == priv->child_count) local->optimistic_change_log = 1; @@ -3567,6 +3601,14 @@ afr_transaction_local_init (afr_local_t *local, afr_private_t *priv) if (!local->fresh_children) goto out; + if (local->fd) { + local->fd_open_on = GF_CALLOC (sizeof (*local->fd_open_on), + priv->child_count, + gf_afr_mt_int32_t); + if (!local->fd_open_on) + goto out; + } + for (i = 0; i < priv->child_count; i++) { local->pending[i] = GF_CALLOC (sizeof (*local->pending[i]), 3, /* data + metadata + entry */ diff --git a/xlators/cluster/afr/src/afr-dir-read.c b/xlators/cluster/afr/src/afr-dir-read.c index 645da2a6c57..ec3639ff73b 100644 --- a/xlators/cluster/afr/src/afr-dir-read.c +++ b/xlators/cluster/afr/src/afr-dir-read.c @@ -164,9 +164,6 @@ out: sh->need_entry_self_heal = _gf_true; sh->forced_merge = _gf_true; - sh->type = local->fd->inode->ia_type; - sh->background = _gf_false; - sh->unwind = afr_examine_dir_sh_unwind; afr_self_heal_type_str_get(&local->self_heal, sh_type_str, @@ -177,7 +174,9 @@ out: " forced merge option set", sh_type_str, local->loc.path); - afr_self_heal (frame, this, local->fd->inode); + afr_launch_self_heal (frame, this, local->fd->inode, + _gf_false, local->fd->inode->ia_type, + NULL, afr_examine_dir_sh_unwind); } else { afr_set_opendir_done (this, local->fd->inode); @@ -205,7 +204,7 @@ afr_examine_dir (call_frame_t *frame, xlator_t *this) sizeof (*local->cont.opendir.checksum), gf_afr_mt_int32_t); - call_count = afr_up_children_count (priv->child_count, local->child_up); + call_count = afr_up_children_count (local->child_up, priv->child_count); local->call_count = call_count; @@ -240,8 +239,8 @@ afr_opendir_cbk (call_frame_t *frame, void *cookie, priv = this->private; local = frame->local; - up_children_count = afr_up_children_count (priv->child_count, - local->child_up); + up_children_count = afr_up_children_count (local->child_up, + priv->child_count); LOCK (&frame->lock); { diff --git a/xlators/cluster/afr/src/afr-dir-write.c b/xlators/cluster/afr/src/afr-dir-write.c index 21287f8b8c2..1c25d160689 100644 --- a/xlators/cluster/afr/src/afr-dir-write.c +++ b/xlators/cluster/afr/src/afr-dir-write.c @@ -166,7 +166,7 @@ afr_create_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, fd_ctx = (afr_fd_ctx_t *)(long) ctx; - fd_ctx->opened_on[child_index] = 1; + fd_ctx->opened_on[child_index] = AFR_FD_OPENED; fd_ctx->flags = local->cont.create.flags; if (local->success_count == 0) @@ -212,13 +212,16 @@ afr_create_wind (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; afr_private_t *priv = NULL; + afr_internal_lock_t *int_lock = NULL; int call_count = -1; int i = 0; local = frame->local; priv = this->private; + int_lock = &local->internal_lock; - call_count = afr_up_children_count (priv->child_count, local->child_up); + call_count = afr_locked_children_count (int_lock->entry_locked_nodes, + priv->child_count); if (call_count == 0) { local->transaction.resume (frame, this); @@ -228,7 +231,7 @@ afr_create_wind (call_frame_t *frame, xlator_t *this) local->call_count = call_count; for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { + if (local->child_up[i] && int_lock->entry_locked_nodes[i]) { STACK_WIND_COOKIE (frame, afr_create_wind_cbk, (void *) (long) i, priv->children[i], @@ -442,13 +445,16 @@ afr_mknod_wind (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; afr_private_t *priv = NULL; + afr_internal_lock_t *int_lock = NULL; int call_count = -1; int i = 0; local = frame->local; priv = this->private; + int_lock = &local->internal_lock; - call_count = afr_up_children_count (priv->child_count, local->child_up); + call_count = afr_locked_children_count (int_lock->entry_locked_nodes, + priv->child_count); if (call_count == 0) { local->transaction.resume (frame, this); @@ -458,7 +464,7 @@ afr_mknod_wind (call_frame_t *frame, xlator_t *this) local->call_count = call_count; for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { + if (local->child_up[i] && int_lock->entry_locked_nodes[i]) { STACK_WIND_COOKIE (frame, afr_mknod_wind_cbk, (void *) (long) i, priv->children[i], priv->children[i]->fops->mknod, @@ -667,13 +673,16 @@ afr_mkdir_wind (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; afr_private_t *priv = NULL; + afr_internal_lock_t *int_lock = NULL; int call_count = -1; int i = 0; local = frame->local; priv = this->private; + int_lock = &local->internal_lock; - call_count = afr_up_children_count (priv->child_count, local->child_up); + call_count = afr_locked_children_count (int_lock->entry_locked_nodes, + priv->child_count); if (call_count == 0) { local->transaction.resume (frame, this); @@ -683,7 +692,7 @@ afr_mkdir_wind (call_frame_t *frame, xlator_t *this) local->call_count = call_count; for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { + if (local->child_up[i] && int_lock->entry_locked_nodes[i]) { STACK_WIND_COOKIE (frame, afr_mkdir_wind_cbk, (void *) (long) i, priv->children[i], @@ -894,13 +903,16 @@ afr_link_wind (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; afr_private_t *priv = NULL; + afr_internal_lock_t *int_lock = NULL; int call_count = -1; int i = 0; local = frame->local; priv = this->private; + int_lock = &local->internal_lock; - call_count = afr_up_children_count (priv->child_count, local->child_up); + call_count = afr_locked_children_count (int_lock->entry_locked_nodes, + priv->child_count); if (call_count == 0) { local->transaction.resume (frame, this); @@ -910,7 +922,7 @@ afr_link_wind (call_frame_t *frame, xlator_t *this) local->call_count = call_count; for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { + if (local->child_up[i] && int_lock->entry_locked_nodes[i]) { STACK_WIND_COOKIE (frame, afr_link_wind_cbk, (void *) (long) i, priv->children[i], priv->children[i]->fops->link, @@ -1117,13 +1129,16 @@ afr_symlink_wind (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; afr_private_t *priv = NULL; + afr_internal_lock_t *int_lock = NULL; int call_count = -1; int i = 0; local = frame->local; priv = this->private; + int_lock = &local->internal_lock; - call_count = afr_up_children_count (priv->child_count, local->child_up); + call_count = afr_locked_children_count (int_lock->entry_locked_nodes, + priv->child_count); if (call_count == 0) { local->transaction.resume (frame, this); @@ -1133,7 +1148,7 @@ afr_symlink_wind (call_frame_t *frame, xlator_t *this) local->call_count = call_count; for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { + if (local->child_up[i] && int_lock->entry_locked_nodes[i]) { STACK_WIND_COOKIE (frame, afr_symlink_wind_cbk, (void *) (long) i, priv->children[i], @@ -1338,13 +1353,16 @@ afr_rename_wind (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; afr_private_t *priv = NULL; + afr_internal_lock_t *int_lock = NULL; int call_count = -1; int i = 0; local = frame->local; priv = this->private; + int_lock = &local->internal_lock; - call_count = afr_up_children_count (priv->child_count, local->child_up); + call_count = afr_locked_children_count (int_lock->entry_locked_nodes, + priv->child_count); if (call_count == 0) { local->transaction.resume (frame, this); @@ -1354,7 +1372,7 @@ afr_rename_wind (call_frame_t *frame, xlator_t *this) local->call_count = call_count; for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { + if (local->child_up[i] && int_lock->entry_locked_nodes[i]) { STACK_WIND_COOKIE (frame, afr_rename_wind_cbk, (void *) (long) i, priv->children[i], @@ -1543,13 +1561,16 @@ afr_unlink_wind (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; afr_private_t *priv = NULL; + afr_internal_lock_t *int_lock = NULL; int call_count = -1; int i = 0; local = frame->local; priv = this->private; + int_lock = &local->internal_lock; - call_count = afr_up_children_count (priv->child_count, local->child_up); + call_count = afr_locked_children_count (int_lock->entry_locked_nodes, + priv->child_count); if (call_count == 0) { local->transaction.resume (frame, this); @@ -1559,7 +1580,7 @@ afr_unlink_wind (call_frame_t *frame, xlator_t *this) local->call_count = call_count; for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { + if (local->child_up[i] && int_lock->entry_locked_nodes[i]) { STACK_WIND_COOKIE (frame, afr_unlink_wind_cbk, (void *) (long) i, priv->children[i], @@ -1741,13 +1762,16 @@ afr_rmdir_wind (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; afr_private_t *priv = NULL; + afr_internal_lock_t *int_lock = NULL; int call_count = -1; int i = 0; local = frame->local; priv = this->private; + int_lock = &local->internal_lock; - call_count = afr_up_children_count (priv->child_count, local->child_up); + call_count = afr_locked_children_count (int_lock->entry_locked_nodes, + priv->child_count); if (call_count == 0) { local->transaction.resume (frame, this); @@ -1757,7 +1781,7 @@ afr_rmdir_wind (call_frame_t *frame, xlator_t *this) local->call_count = call_count; for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { + if (local->child_up[i] && int_lock->entry_locked_nodes[i]) { STACK_WIND_COOKIE (frame, afr_rmdir_wind_cbk, (void *) (long) i, priv->children[i], diff --git a/xlators/cluster/afr/src/afr-inode-read.c b/xlators/cluster/afr/src/afr-inode-read.c index f8157482758..1258afe09fc 100644 --- a/xlators/cluster/afr/src/afr-inode-read.c +++ b/xlators/cluster/afr/src/afr-inode-read.c @@ -44,9 +44,6 @@ #include "compat-errno.h" #include "compat.h" -#include "afr.h" - - /** * Common algorithm for inode read calls: * @@ -399,6 +396,12 @@ afr_fstat (call_frame_t *frame, xlator_t *this, local->cont.fstat.ino = fd->inode->ino; local->fd = fd_ref (fd); + op_ret = afr_open_fd_fix (frame, this, _gf_false); + if (op_ret) { + op_errno = -op_ret; + op_ret = -1; + goto out; + } STACK_WIND_COOKIE (frame, afr_fstat_cbk, (void *) (long) call_child, children[call_child], children[call_child]->fops->fstat, @@ -1036,6 +1039,12 @@ afr_readv (call_frame_t *frame, xlator_t *this, local->cont.readv.size = size; local->cont.readv.offset = offset; + op_ret = afr_open_fd_fix (frame, this, _gf_false); + if (op_ret) { + op_errno = -op_ret; + op_ret = -1; + goto out; + } STACK_WIND_COOKIE (frame, afr_readv_cbk, (void *) (long) call_child, children[call_child], diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c index c292b7493f6..faf3db40041 100644 --- a/xlators/cluster/afr/src/afr-inode-write.c +++ b/xlators/cluster/afr/src/afr-inode-write.c @@ -46,6 +46,7 @@ #include "afr.h" #include "afr-transaction.h" +#include "afr-self-heal-common.h" /* {{{ writev */ @@ -125,19 +126,21 @@ afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, return 0; } - int afr_writev_wind (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; afr_private_t *priv = NULL; + afr_internal_lock_t *int_lock = NULL; int i = 0; int call_count = -1; local = frame->local; priv = this->private; + int_lock = &local->internal_lock; - call_count = afr_up_children_count (priv->child_count, local->child_up); + call_count = afr_locked_children_count (int_lock->inode_locked_nodes, + priv->child_count); if (call_count == 0) { local->transaction.resume (frame, this); @@ -147,7 +150,7 @@ afr_writev_wind (call_frame_t *frame, xlator_t *this) local->call_count = call_count; for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { + if (local->child_up[i] && int_lock->inode_locked_nodes[i]) { STACK_WIND_COOKIE (frame, afr_writev_wind_cbk, (void *) (long) i, priv->children[i], @@ -188,10 +191,10 @@ afr_writev_done (call_frame_t *frame, xlator_t *this) int afr_do_writev (call_frame_t *frame, xlator_t *this) { - call_frame_t * transaction_frame = NULL; - afr_local_t * local = NULL; - int op_ret = -1; - int op_errno = 0; + call_frame_t *transaction_frame = NULL; + afr_local_t *local = NULL; + int op_ret = -1; + int op_errno = 0; local = frame->local; @@ -235,6 +238,202 @@ out: return 0; } +static int +afr_prepare_loc (call_frame_t *frame, fd_t *fd) +{ + afr_local_t *local = NULL; + char *name = NULL; + char *path = NULL; + int ret = 0; + + if ((!fd) || (!fd->inode)) + return -1; + + local = frame->local; + ret = inode_path (fd->inode, NULL, (char **)&path); + if (ret <= 0) { + gf_log (frame->this->name, GF_LOG_DEBUG, + "Unable to get path for gfid: %s", + uuid_utoa (fd->inode->gfid)); + return -1; + } + + if (local->loc.path) { + if (strcmp (path, local->loc.path)) + gf_log (frame->this->name, GF_LOG_DEBUG, + "overwriting old loc->path %s with %s", + local->loc.path, path); + GF_FREE ((char *)local->loc.path); + } + local->loc.path = path; + + name = strrchr (local->loc.path, '/'); + if (name) + name++; + local->loc.name = name; + + if (local->loc.inode) { + inode_unref (local->loc.inode); + } + local->loc.inode = inode_ref (fd->inode); + + if (local->loc.parent) { + inode_unref (local->loc.parent); + } + + local->loc.parent = inode_parent (local->loc.inode, 0, NULL); + + return 0; +} + +afr_fd_paused_call_t* +afr_paused_call_create (call_frame_t *frame) +{ + afr_local_t *local = NULL; + afr_fd_paused_call_t *paused_call = NULL; + + local = frame->local; + GF_ASSERT (local->fop_call_continue); + + paused_call = GF_CALLOC (1, sizeof (*paused_call), + gf_afr_fd_paused_call_t); + if (paused_call) { + INIT_LIST_HEAD (&paused_call->call_list); + paused_call->frame = frame; + } + + return paused_call; +} + +static int +afr_pause_fd_fop (call_frame_t *frame, xlator_t *this, afr_fd_ctx_t *fd_ctx) +{ + afr_fd_paused_call_t *paused_call = NULL; + int ret = 0; + + paused_call = afr_paused_call_create (frame); + if (paused_call) + list_add (&paused_call->call_list, &fd_ctx->paused_calls); + else + ret = -ENOMEM; + + return ret; +} + +static void +afr_trigger_open_fd_self_heal (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + char sh_type_str[256] = {0}; + + local = frame->local; + sh = &local->self_heal; + + sh->need_missing_entry_self_heal = _gf_true; + sh->need_gfid_self_heal = _gf_true; + sh->need_data_self_heal = _gf_true; + afr_self_heal_type_str_get(&local->self_heal, sh_type_str, + sizeof(sh_type_str)); + gf_log (this->name, GF_LOG_INFO, "%s self-heal triggered. " + "path: %s, reason: Replicate up down flush, data lock " + "is held", sh_type_str, local->loc.path); + + afr_launch_self_heal (frame, this, local->fd->inode, _gf_true, + local->fd->inode->ia_type, NULL, NULL); +} + +int +afr_open_fd_fix (call_frame_t *frame, xlator_t *this, gf_boolean_t pause_fop) +{ + int ret = 0; + int i = 0; + afr_fd_ctx_t *fd_ctx = NULL; + gf_boolean_t need_self_heal = _gf_false; + int *need_open = NULL; + int need_open_count = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + gf_boolean_t fop_continue = _gf_true; + gf_boolean_t queue_fop = _gf_false; + + local = frame->local; + priv = this->private; + + GF_ASSERT (local->fd); + if (pause_fop) + GF_ASSERT (local->fop_call_continue); + + ret = afr_prepare_loc (frame, local->fd); + if (ret < 0) { + //File does not exist we cant open it. + ret = 0; + goto out; + } + + fd_ctx = afr_fd_ctx_get (local->fd, this); + if (!fd_ctx) { + ret = -EINVAL; + goto unlock; + } + + LOCK (&local->fd->lock); + { + if (fd_ctx->up_count < priv->up_count) { + need_self_heal = _gf_true; + fd_ctx->up_count = priv->up_count; + fd_ctx->down_count = priv->down_count; + } + for (i = 0; i < priv->child_count; i++) { + if ((fd_ctx->opened_on[i] == AFR_FD_NOT_OPENED) && + local->child_up[i]) { + fd_ctx->opened_on[i] = AFR_FD_OPENING; + if (!need_open) + need_open = GF_CALLOC (priv->child_count, + sizeof (*need_open), + gf_afr_mt_int32_t); + need_open[i] = 1; + need_open_count++; + } else if (pause_fop && local->child_up[i] && + (fd_ctx->opened_on[i] == AFR_FD_OPENING)) { + queue_fop = _gf_true; + } + } + + if (queue_fop) { + GF_ASSERT (pause_fop); + gf_log (this->name, GF_LOG_INFO, "Pause fd %p", + local->fd); + ret = afr_pause_fd_fop (frame, this, fd_ctx); + if (ret) + goto unlock; + } + } +unlock: + UNLOCK (&local->fd->lock); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to fix fd for %s", + local->loc.path); + fop_continue = _gf_false; + goto out; + } + + if (need_self_heal) + afr_trigger_open_fd_self_heal (frame, this); + + if (!need_open_count) + goto out; + + gf_log (this->name, GF_LOG_INFO, "Opening fd %p", local->fd); + afr_fix_open (frame, this, fd_ctx, need_open_count, need_open); + fop_continue = _gf_false; +out: + if (need_open) + GF_FREE (need_open); + if (fop_continue && local->fop_call_continue) + local->fop_call_continue (frame, this); + return ret; +} int afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, @@ -246,8 +445,6 @@ afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, int ret = -1; int op_ret = -1; int op_errno = 0; - uint64_t ctx = 0; - afr_fd_ctx_t *fd_ctx = NULL; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); @@ -272,21 +469,14 @@ afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, local->cont.writev.iobref = iobref_ref (iobref); local->fd = fd_ref (fd); + local->fop_call_continue = afr_do_writev; - ret = fd_ctx_get (fd, this, &ctx); - if (ret < 0) { + ret = afr_open_fd_fix (frame, this, _gf_true); + if (ret) { + op_errno = -ret; goto out; } - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - - if (fd_ctx->up_count < priv->up_count) { - local->openfd_flush_cbk = afr_do_writev; - afr_openfd_flush (frame, this, fd); - } else { - afr_do_writev (frame, this); - } - op_ret = 0; out: if (op_ret == -1) { @@ -395,13 +585,16 @@ afr_truncate_wind (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; afr_private_t *priv = NULL; + afr_internal_lock_t *int_lock = NULL; int call_count = -1; int i = 0; local = frame->local; priv = this->private; + int_lock = &local->internal_lock; - call_count = afr_up_children_count (priv->child_count, local->child_up); + call_count = afr_locked_children_count (int_lock->inode_locked_nodes, + priv->child_count); if (call_count == 0) { local->transaction.resume (frame, this); @@ -411,7 +604,7 @@ afr_truncate_wind (call_frame_t *frame, xlator_t *this) local->call_count = call_count; for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { + if (local->child_up[i] && int_lock->inode_locked_nodes[i]) { STACK_WIND_COOKIE (frame, afr_truncate_wind_cbk, (void *) (long) i, priv->children[i], @@ -602,13 +795,16 @@ afr_ftruncate_wind (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; afr_private_t *priv = NULL; + afr_internal_lock_t *int_lock = NULL; int call_count = -1; int i = 0; local = frame->local; priv = this->private; + int_lock = &local->internal_lock; - call_count = afr_up_children_count (priv->child_count, local->child_up); + call_count = afr_locked_children_count (int_lock->inode_locked_nodes, + priv->child_count); if (call_count == 0) { local->transaction.resume (frame, this); @@ -618,7 +814,7 @@ afr_ftruncate_wind (call_frame_t *frame, xlator_t *this) local->call_count = call_count; for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { + if (local->child_up[i] && int_lock->inode_locked_nodes[i]) { STACK_WIND_COOKIE (frame, afr_ftruncate_wind_cbk, (void *) (long) i, priv->children[i], @@ -702,8 +898,6 @@ afr_ftruncate (call_frame_t *frame, xlator_t *this, int ret = -1; int op_ret = -1; int op_errno = 0; - uint64_t ctx = 0; - afr_fd_ctx_t *fd_ctx = NULL; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); @@ -725,21 +919,14 @@ afr_ftruncate (call_frame_t *frame, xlator_t *this, local->cont.ftruncate.ino = fd->inode->ino; local->fd = fd_ref (fd); + local->fop_call_continue = afr_do_ftruncate; - ret = fd_ctx_get (fd, this, &ctx); - if (ret < 0) { + ret = afr_open_fd_fix (frame, this, _gf_true); + if (ret) { + op_errno = -ret; goto out; } - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - - if (fd_ctx->up_count < priv->up_count) { - local->openfd_flush_cbk = afr_do_ftruncate; - afr_openfd_flush (frame, this, fd); - } else { - afr_do_ftruncate (frame, this); - } - op_ret = 0; out: if (op_ret == -1) { @@ -849,13 +1036,16 @@ afr_setattr_wind (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; afr_private_t *priv = NULL; + afr_internal_lock_t *int_lock = NULL; int call_count = -1; int i = 0; local = frame->local; priv = this->private; + int_lock = &local->internal_lock; - call_count = afr_up_children_count (priv->child_count, local->child_up); + call_count = afr_locked_children_count (int_lock->inode_locked_nodes, + priv->child_count); if (call_count == 0) { local->transaction.resume (frame, this); @@ -865,7 +1055,7 @@ afr_setattr_wind (call_frame_t *frame, xlator_t *this) local->call_count = call_count; for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { + if (local->child_up[i] && int_lock->inode_locked_nodes[i]) { STACK_WIND_COOKIE (frame, afr_setattr_wind_cbk, (void *) (long) i, priv->children[i], @@ -1056,13 +1246,16 @@ afr_fsetattr_wind (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; afr_private_t *priv = NULL; + afr_internal_lock_t *int_lock = NULL; int call_count = -1; int i = 0; local = frame->local; priv = this->private; + int_lock = &local->internal_lock; - call_count = afr_up_children_count (priv->child_count, local->child_up); + call_count = afr_locked_children_count (int_lock->inode_locked_nodes, + priv->child_count); if (call_count == 0) { local->transaction.resume (frame, this); @@ -1072,7 +1265,7 @@ afr_fsetattr_wind (call_frame_t *frame, xlator_t *this) local->call_count = call_count; for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { + if (local->child_up[i] && int_lock->inode_locked_nodes[i]) { STACK_WIND_COOKIE (frame, afr_fsetattr_wind_cbk, (void *) (long) i, priv->children[i], @@ -1104,7 +1297,6 @@ afr_fsetattr_done (call_frame_t *frame, xlator_t *this) return 0; } - int afr_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *buf, int32_t valid) @@ -1124,10 +1316,12 @@ afr_fsetattr (call_frame_t *frame, xlator_t *this, transaction_frame = copy_frame (frame); if (!transaction_frame) { + op_errno = ENOMEM; goto out; } ALLOC_OR_GOTO (local, afr_local_t, out); + transaction_frame->local = local; ret = AFR_LOCAL_INIT (local, priv); if (ret < 0) { @@ -1135,12 +1329,9 @@ afr_fsetattr (call_frame_t *frame, xlator_t *this, goto out; } - transaction_frame->local = local; - local->op_ret = -1; local->cont.fsetattr.ino = fd->inode->ino; - local->cont.fsetattr.in_buf = *buf; local->cont.fsetattr.valid = valid; @@ -1150,6 +1341,13 @@ afr_fsetattr (call_frame_t *frame, xlator_t *this, local->fd = fd_ref (fd); + op_ret = afr_open_fd_fix (frame, this, _gf_false); + if (ret) { + op_errno = -op_ret; + op_ret = -1; + goto out; + } + local->transaction.main_frame = frame; local->transaction.start = LLONG_MAX - 1; local->transaction.len = 0; @@ -1242,13 +1440,16 @@ afr_setxattr_wind (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; afr_private_t *priv = NULL; + afr_internal_lock_t *int_lock = NULL; int call_count = -1; int i = 0; local = frame->local; priv = this->private; + int_lock = &local->internal_lock; - call_count = afr_up_children_count (priv->child_count, local->child_up); + call_count = afr_locked_children_count (int_lock->inode_locked_nodes, + priv->child_count); if (call_count == 0) { local->transaction.resume (frame, this); @@ -1258,7 +1459,7 @@ afr_setxattr_wind (call_frame_t *frame, xlator_t *this) local->call_count = call_count; for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { + if (local->child_up[i] && int_lock->inode_locked_nodes[i]) { STACK_WIND_COOKIE (frame, afr_setxattr_wind_cbk, (void *) (long) i, priv->children[i], @@ -1424,13 +1625,16 @@ afr_removexattr_wind (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; afr_private_t *priv = NULL; + afr_internal_lock_t *int_lock = NULL; int call_count = -1; int i = 0; local = frame->local; priv = this->private; + int_lock = &local->internal_lock; - call_count = afr_up_children_count (priv->child_count, local->child_up); + call_count = afr_locked_children_count (int_lock->inode_locked_nodes, + priv->child_count); if (call_count == 0) { local->transaction.resume (frame, this); @@ -1440,7 +1644,7 @@ afr_removexattr_wind (call_frame_t *frame, xlator_t *this) local->call_count = call_count; for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { + if (local->child_up[i] && int_lock->inode_locked_nodes[i]) { STACK_WIND_COOKIE (frame, afr_removexattr_wind_cbk, (void *) (long) i, priv->children[i], diff --git a/xlators/cluster/afr/src/afr-lk-common.c b/xlators/cluster/afr/src/afr-lk-common.c index 17651add96d..2168ee26f69 100644 --- a/xlators/cluster/afr/src/afr-lk-common.c +++ b/xlators/cluster/afr/src/afr-lk-common.c @@ -88,8 +88,7 @@ is_afr_lock_selfheal (afr_local_t *local) } int32_t -internal_lock_count (call_frame_t *frame, xlator_t *this, - afr_fd_ctx_t *fd_ctx) +internal_lock_count (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; afr_private_t *priv = NULL; @@ -99,10 +98,9 @@ internal_lock_count (call_frame_t *frame, xlator_t *this, local = frame->local; priv = this->private; - if (fd_ctx) { - GF_ASSERT (local->fd); + if (local->fd) { for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i] && fd_ctx->opened_on[i]) + if (local->child_up[i] && local->fd_open_on[i]) ++call_count; } } else { @@ -552,20 +550,24 @@ static int32_t afr_unlock_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno) { - afr_local_t *local = NULL; + afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + int32_t child_index = (long)cookie; local = frame->local; + int_lock = &local->internal_lock; afr_trace_inodelk_out (frame, AFR_INODELK_TRANSACTION, AFR_UNLOCK_OP, NULL, op_ret, - op_errno, (long) cookie); + op_errno, child_index); if (op_ret < 0 && op_errno != ENOTCONN && op_errno != EBADFD) { gf_log (this->name, GF_LOG_ERROR, - "%s: unlock failed %s", - local->loc.path, strerror (op_errno)); + "%s: unlock failed on %d, reason: %s", + local->loc.path, child_index, strerror (op_errno)); } + int_lock->inode_locked_nodes[child_index] &= LOCKED_NO; afr_unlock_common_cbk (frame, cookie, this, op_ret, op_errno); return 0; @@ -590,6 +592,9 @@ afr_unlock_inodelk (call_frame_t *frame, xlator_t *this) flock.l_len = int_lock->lk_flock.l_len; flock.l_type = F_UNLCK; + gf_log (this->name, GF_LOG_DEBUG, "attempting data unlock range %"PRIu64 + " %"PRIu64" by %"PRIu64, flock.l_start, flock.l_len, + frame->root->lk_owner); call_count = afr_locked_nodes_count (int_lock->inode_locked_nodes, priv->child_count); @@ -646,9 +651,22 @@ static int32_t afr_unlock_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno) { + afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + int32_t child_index = (long)cookie; + + local = frame->local; + int_lock = &local->internal_lock; + afr_trace_entrylk_out (frame, AFR_ENTRYLK_TRANSACTION, AFR_UNLOCK_OP, NULL, op_ret, - op_errno, (long) cookie); + op_errno, child_index); + + if (op_ret < 0 && op_errno != ENOTCONN && op_errno != EBADFD) { + gf_log (this->name, GF_LOG_ERROR, + "%s: unlock failed on %d, reason: %s", + local->loc.path, child_index, strerror (op_errno)); + } afr_unlock_common_cbk (frame, cookie, this, op_ret, op_errno); @@ -747,8 +765,7 @@ afr_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, afr_unlock (frame, this); } else { if (op_ret == 0) { - int_lock->locked_nodes[child_index] - |= LOCKED_YES; + int_lock->locked_nodes[child_index] |= LOCKED_YES; int_lock->lock_count++; } afr_lock_blocking (frame, this, child_index + 1); @@ -940,8 +957,8 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index) or don't have the fd open */ while ((child_index < priv->child_count) - && (!local->child_up[child_index] - || !fd_ctx->opened_on[child_index])) + && (!local->child_up[child_index] || + !local->fd_open_on[child_index])) child_index++; } else { @@ -969,9 +986,7 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index) } if ((child_index == priv->child_count) - || (int_lock->lock_count == - afr_up_children_count (priv->child_count, - local->child_up))) { + || (int_lock->lock_count == int_lock->lk_expected_count)) { /* we're done locking */ @@ -1081,8 +1096,6 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index) } return 0; - - } int32_t @@ -1091,6 +1104,7 @@ afr_blocking_lock (call_frame_t *frame, xlator_t *this) afr_internal_lock_t *int_lock = NULL; afr_local_t *local = NULL; afr_private_t *priv = NULL; + int up_count = 0; priv = this->private; local = frame->local; @@ -1103,6 +1117,10 @@ afr_blocking_lock (call_frame_t *frame, xlator_t *this) break; case AFR_ENTRY_RENAME_TRANSACTION: + up_count = afr_up_children_count (local->child_up, + priv->child_count); + int_lock->lk_expected_count = 2 * up_count; + //fallthrough case AFR_ENTRY_TRANSACTION: initialize_entrylk_variables (frame, this); break; @@ -1151,8 +1169,7 @@ afr_nonblocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->op_errno = op_errno; } } else if (op_ret == 0) { - int_lock->entry_locked_nodes[child_index] - |= LOCKED_YES; + int_lock->entry_locked_nodes[child_index] |= LOCKED_YES; int_lock->entrylk_lock_count++; } @@ -1161,7 +1178,7 @@ afr_nonblocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, "Last locking reply received"); /* all locks successfull. Proceed to call FOP */ if (int_lock->entrylk_lock_count == - afr_up_children_count (priv->child_count, local->child_up)) { + int_lock->lk_expected_count) { gf_log (this->name, GF_LOG_TRACE, "All servers locked. Calling the cbk"); int_lock->lock_op_ret = 0; @@ -1181,6 +1198,20 @@ afr_nonblocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, return 0; } +void +afr_mark_fd_open_on (afr_local_t *local, afr_fd_ctx_t *fd_ctx, + size_t child_count) +{ + int i = 0; + + GF_ASSERT (local->fd_open_on); + + memset (local->fd_open_on, 0, sizeof (*local->fd_open_on)*child_count); + for (i = 0; i < child_count; i++) + if (fd_ctx->opened_on[i] == AFR_FD_OPENED) + local->fd_open_on[i] = 1; +} + int afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this) { @@ -1192,8 +1223,6 @@ afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this) loc_t *loc = NULL; int32_t call_count = 0; int i = 0; - uint64_t ctx = 0; - int ret = 0; local = frame->local; int_lock = &local->internal_lock; @@ -1206,9 +1235,8 @@ afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this) loc = int_lock->lk_loc; if (local->fd) { - ret = fd_ctx_get (local->fd, this, &ctx); - - if (ret < 0) { + fd_ctx = afr_fd_ctx_get (local->fd, this); + if (!fd_ctx) { gf_log (this->name, GF_LOG_INFO, "unable to get fd ctx for fd=%p", local->fd); @@ -1221,10 +1249,10 @@ afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this) return -1; } - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - - call_count = internal_lock_count (frame, this, fd_ctx); + afr_mark_fd_open_on (local, fd_ctx, priv->child_count); + call_count = internal_lock_count (frame, this); int_lock->lk_call_count = call_count; + int_lock->lk_expected_count = call_count; if (!call_count) { gf_log (this->name, GF_LOG_INFO, @@ -1236,7 +1264,7 @@ afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this) /* Send non-blocking entrylk calls only on up children and where the fd has been opened */ for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i] && fd_ctx->opened_on[i]) { + if (local->child_up[i] && local->fd_open_on[i]) { afr_trace_entrylk_in (frame, AFR_ENTRYLK_NB_TRANSACTION, AFR_LOCK_OP, basename, i); @@ -1252,8 +1280,9 @@ afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this) } else { GF_ASSERT (loc); - call_count = internal_lock_count (frame, this, NULL); + call_count = internal_lock_count (frame, this); int_lock->lk_call_count = call_count; + int_lock->lk_expected_count = call_count; for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { @@ -1314,8 +1343,7 @@ afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->op_errno = op_errno; } } else if (op_ret == 0) { - int_lock->inode_locked_nodes[child_index] - |= LOCKED_YES; + int_lock->inode_locked_nodes[child_index] |= LOCKED_YES; int_lock->inodelk_lock_count++; } @@ -1324,7 +1352,7 @@ afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, "Last inode locking reply received"); /* all locks successfull. Proceed to call FOP */ if (int_lock->inodelk_lock_count == - afr_up_children_count (priv->child_count, local->child_up)) { + int_lock->lk_expected_count) { gf_log (this->name, GF_LOG_TRACE, "All servers locked. Calling the cbk"); int_lock->lock_op_ret = 0; @@ -1352,7 +1380,6 @@ afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this) afr_private_t *priv = NULL; afr_fd_ctx_t *fd_ctx = NULL; int32_t call_count = 0; - uint64_t ctx = 0; int i = 0; int ret = 0; struct gf_flock flock = {0,}; @@ -1365,12 +1392,14 @@ afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this) flock.l_len = int_lock->lk_flock.l_len; flock.l_type = int_lock->lk_flock.l_type; + gf_log (this->name, GF_LOG_DEBUG, "attempting data lock range %"PRIu64 + " %"PRIu64" by %"PRIu64, flock.l_start, flock.l_len, + frame->root->lk_owner); initialize_inodelk_variables (frame, this); if (local->fd) { - ret = fd_ctx_get (local->fd, this, &ctx); - - if (ret < 0) { + fd_ctx = afr_fd_ctx_get (local->fd, this); + if (!fd_ctx) { gf_log (this->name, GF_LOG_INFO, "unable to get fd ctx for fd=%p", local->fd); @@ -1384,10 +1413,10 @@ afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this) goto out; } - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - - call_count = internal_lock_count (frame, this, fd_ctx); + afr_mark_fd_open_on (local, fd_ctx, priv->child_count); + call_count = internal_lock_count (frame, this); int_lock->lk_call_count = call_count; + int_lock->lk_expected_count = call_count; if (!call_count) { gf_log (this->name, GF_LOG_INFO, @@ -1399,7 +1428,7 @@ afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this) /* Send non-blocking inodelk calls only on up children and where the fd has been opened */ for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i] && fd_ctx->opened_on[i]) { + if (local->child_up[i] && local->fd_open_on[i]) { afr_trace_inodelk_in (frame, AFR_INODELK_NB_TRANSACTION, AFR_LOCK_OP, &flock, F_SETLK, i); @@ -1417,8 +1446,9 @@ afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this) } } else { - call_count = internal_lock_count (frame, this, NULL); + call_count = internal_lock_count (frame, this); int_lock->lk_call_count = call_count; + int_lock->lk_expected_count = call_count; for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { @@ -1989,7 +2019,7 @@ afr_mark_fd_opened (xlator_t *this, fd_t *fd, int32_t child_index) fdctx = (afr_fd_ctx_t *) (long) tmp; - fdctx->opened_on[child_index] = 1; + fdctx->opened_on[child_index] = AFR_FD_OPENED; out: return ret; @@ -2083,7 +2113,7 @@ is_fd_opened (fd_t *fd, int32_t child_index) fdctx = (afr_fd_ctx_t *) (long) tmp; - if (fdctx->opened_on[child_index]) + if (fdctx->opened_on[child_index] == AFR_FD_OPENED) ret = 1; out: diff --git a/xlators/cluster/afr/src/afr-mem-types.h b/xlators/cluster/afr/src/afr-mem-types.h index 98e86574031..d5a988708b8 100644 --- a/xlators/cluster/afr/src/afr-mem-types.h +++ b/xlators/cluster/afr/src/afr-mem-types.h @@ -43,6 +43,7 @@ enum gf_afr_mem_types_ { gf_afr_mt_pump_priv, gf_afr_mt_locked_fd, gf_afr_mt_inode_ctx_t, + gf_afr_fd_paused_call_t, gf_afr_mt_end }; #endif diff --git a/xlators/cluster/afr/src/afr-open.c b/xlators/cluster/afr/src/afr-open.c index 306f5a85af0..02d8f3ded3b 100644 --- a/xlators/cluster/afr/src/afr-open.c +++ b/xlators/cluster/afr/src/afr-open.c @@ -116,7 +116,7 @@ afr_open_cbk (call_frame_t *frame, void *cookie, fd_ctx = (afr_fd_ctx_t *)(long) ctx; - fd_ctx->opened_on[child_index] = 1; + fd_ctx->opened_on[child_index] = AFR_FD_OPENED; fd_ctx->flags = local->cont.open.flags; fd_ctx->wbflags = local->cont.open.wbflags; } @@ -141,7 +141,6 @@ unlock: return 0; } - int afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, fd_t *fd, int32_t wbflags) @@ -180,7 +179,6 @@ afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, frame->local = local; call_count = local->call_count; - loc_copy (&local->loc, loc); local->cont.open.flags = flags; @@ -209,446 +207,165 @@ out: return 0; } - -int -afr_openfd_sh_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd) +//NOTE: this function should be called with holding the lock on +//fd to which fd_ctx belongs +void +afr_get_resumable_calls (xlator_t *this, afr_fd_ctx_t *fd_ctx, + struct list_head *list) { - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - afr_fd_ctx_t *fd_ctx = NULL; - uint64_t ctx = 0; - int ret = 0; - int call_count = 0; - int child_index = (long) cookie; - - priv = this->private; - local = frame->local; - int_lock = &local->internal_lock; - - LOCK (&frame->lock); - { - if (op_ret >= 0) { - ret = fd_ctx_get (fd, this, &ctx); + afr_fd_paused_call_t *paused_call = NULL; + afr_fd_paused_call_t *tmp = NULL; + afr_local_t *call_local = NULL; + afr_private_t *priv = NULL; + int i = 0; + gf_boolean_t call = _gf_false; - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "failed to get fd context, %p", fd); - goto out; - } - - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - - fd_ctx->opened_on[child_index] = 1; - - gf_log (this->name, GF_LOG_TRACE, - "fd for %s opened successfully on subvolume %s", - local->loc.path, priv->children[child_index]->name); + priv = this->private; + list_for_each_entry_safe (paused_call, tmp, &fd_ctx->paused_calls, + call_list) { + call = _gf_true; + call_local = paused_call->frame->local; + for (i = 0; i < priv->child_count; i++) { + if (call_local->child_up[i] && + (fd_ctx->opened_on[i] == AFR_FD_OPENING)) + call = _gf_false; } - } -out: - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - if (call_count == 0) { - int_lock->lock_cbk = local->transaction.done; - local->transaction.resume (frame, this); + if (call) { + list_del_init (&paused_call->call_list); + list_add (&paused_call->call_list, list); + } } - - return 0; } - -static int -__unopened_count (int child_count, unsigned int *opened_on, unsigned char *child_up) +void +afr_resume_calls (xlator_t *this, struct list_head *list) { - int i = 0; - int count = 0; - - for (i = 0; i < child_count; i++) { - if (!opened_on[i] && child_up[i]) - count++; + afr_fd_paused_call_t *paused_call = NULL; + afr_fd_paused_call_t *tmp = NULL; + afr_local_t *call_local = NULL; + + list_for_each_entry_safe (paused_call, tmp, list, call_list) { + list_del_init (&paused_call->call_list); + call_local = paused_call->frame->local; + call_local->fop_call_continue (paused_call->frame, this); + GF_FREE (paused_call); } - - return count; } - int -afr_openfd_sh_unwind (call_frame_t *frame, xlator_t *this, int32_t op_ret, - int32_t op_errno) +afr_openfd_fix_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - uint64_t ctx = 0; - afr_fd_ctx_t *fd_ctx = NULL; - int abandon = 0; - int ret = 0; - int i = 0; - int call_count = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_fd_ctx_t *fd_ctx = NULL; + int call_count = 0; + int child_index = (long) cookie; + struct list_head paused_calls = {0}; - priv = this->private; - local = frame->local; + priv = this->private; + local = frame->local; - /* - * Some subvolumes might have come up on which we never - * opened this fd in the first place. Re-open fd's on those - * subvolumes now. - */ + call_count = afr_frame_return (frame); - ret = fd_ctx_get (local->fd, this, &ctx); - if (ret < 0) { + //Note: No frame locking needed for this block of code + fd_ctx = afr_fd_ctx_get (local->fd, this); + if (!fd_ctx) { gf_log (this->name, GF_LOG_WARNING, - "failed to get fd context %p (%s)", - local->fd, local->loc.path); - abandon = 1; + "failed to get fd context, %p", local->fd); goto out; } - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - LOCK (&local->fd->lock); { - call_count = __unopened_count (priv->child_count, - fd_ctx->opened_on, - local->child_up); - for (i = 0; i < priv->child_count; i++) { - fd_ctx->pre_op_done[i] = 0; - fd_ctx->pre_op_piggyback[i] = 0; + if (op_ret >= 0) { + fd_ctx->opened_on[child_index] = AFR_FD_OPENED; + gf_log (this->name, GF_LOG_INFO, "fd for %s opened " + "successfully on subvolume %s", local->loc.path, + priv->children[child_index]->name); + } else { + //Change open status from OPENING to NOT OPENED. + fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED; } - } - UNLOCK (&local->fd->lock); - - if (call_count == 0) { - gf_log (this->name, GF_LOG_WARNING, - "fd not open on any subvolume %p (%s)", - local->fd, local->loc.path); - abandon = 1; - goto out; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (!fd_ctx->opened_on[i] && local->child_up[i]) { - gf_log (this->name, GF_LOG_TRACE, - "opening fd for %s on subvolume %s", - local->loc.path, priv->children[i]->name); - - STACK_WIND_COOKIE (frame, afr_openfd_sh_open_cbk, - (void *)(long) i, - priv->children[i], - priv->children[i]->fops->open, - &local->loc, fd_ctx->flags, local->fd, - fd_ctx->wbflags); - - if (!--call_count) - break; + if (call_count == 0) { + INIT_LIST_HEAD (&paused_calls); + afr_get_resumable_calls (this, fd_ctx, &paused_calls); } } - + UNLOCK (&local->fd->lock); out: - if (abandon) - local->transaction.resume (frame, this); - - return 0; -} - - -static int -afr_prepare_loc (call_frame_t *frame, fd_t *fd) -{ - afr_local_t *local = NULL; - char *name = NULL; - char *path = NULL; - int ret = 0; - - if ((!fd) || (!fd->inode)) - return -1; - - local = frame->local; - ret = inode_path (fd->inode, NULL, (char **)&path); - if (ret <= 0) { - gf_log (frame->this->name, GF_LOG_DEBUG, - "Unable to get path for gfid: %s", - uuid_utoa (fd->inode->gfid)); - return -1; - } - - if (local->loc.path) { - if (strcmp (path, local->loc.path)) - gf_log (frame->this->name, GF_LOG_DEBUG, - "overwriting old loc->path %s with %s", - local->loc.path, path); - GF_FREE ((char *)local->loc.path); - } - local->loc.path = path; - - name = strrchr (local->loc.path, '/'); - if (name) - name++; - local->loc.name = name; - - if (local->loc.inode) { - inode_unref (local->loc.inode); - } - local->loc.inode = inode_ref (fd->inode); - - if (local->loc.parent) { - inode_unref (local->loc.parent); + if (call_count == 0) { + afr_resume_calls (this, &paused_calls); + if (local->fop_call_continue) + local->fop_call_continue (frame, this); + else + AFR_STACK_DESTROY (frame); } - local->loc.parent = inode_parent (local->loc.inode, 0, NULL); - return 0; } - int -afr_openfd_sh (call_frame_t *frame, xlator_t *this) +afr_fix_open (call_frame_t *frame, xlator_t *this, afr_fd_ctx_t *fd_ctx, + int need_open_count, int *need_open) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - char sh_type_str[256] = {0,}; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; + call_frame_t *open_frame = NULL; + afr_local_t *open_local = NULL; + int ret = -1; + int32_t op_errno = 0; + + GF_ASSERT (fd_ctx); + GF_ASSERT (need_open_count > 0); + GF_ASSERT (need_open); local = frame->local; - sh = &local->self_heal; - - GF_ASSERT (local->loc.path); - /* forcibly trigger missing-entries self-heal */ - - sh->need_missing_entry_self_heal = _gf_true; - sh->need_gfid_self_heal = _gf_true; - sh->data_lock_held = _gf_true; - sh->need_data_self_heal = _gf_true; - sh->type = local->fd->inode->ia_type; - sh->background = _gf_false; - sh->unwind = afr_openfd_sh_unwind; - - afr_self_heal_type_str_get(&local->self_heal, - sh_type_str, - sizeof(sh_type_str)); - gf_log (this->name, GF_LOG_INFO, "%s self-heal triggered. " - "path: %s, reason: Replicate up down flush, data lock is held", - sh_type_str, local->loc.path); - - afr_self_heal (frame, this, local->fd->inode); - - return 0; -} - - -int -afr_openfd_flush_done (call_frame_t *frame, xlator_t *this) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - - uint64_t ctx; - afr_fd_ctx_t * fd_ctx = NULL; - - int _ret = -1; - priv = this->private; - local = frame->local; - - LOCK (&local->fd->lock); - { - _ret = __fd_ctx_get (local->fd, this, &ctx); - if (_ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "failed to get fd context %p (%s)", - local->fd, local->loc.path); + if (!local->fop_call_continue) { + open_frame = copy_frame (frame); + if (!open_frame) { + ret = -ENOMEM; goto out; } - - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - - fd_ctx->down_count = priv->down_count; - fd_ctx->up_count = priv->up_count; - } -out: - UNLOCK (&local->fd->lock); - - afr_local_transaction_cleanup (local, this); - - gf_log (this->name, GF_LOG_TRACE, - "The up/down flush is over"); - - fd_unref (local->fd); - local->openfd_flush_cbk (frame, this); - - return 0; -} - - - -int -afr_openfd_xaction (call_frame_t *frame, xlator_t *this, fd_t *fd) -{ - afr_local_t * local = NULL; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - local = frame->local; - - local->op = GF_FOP_FLUSH; - - local->transaction.fop = afr_openfd_sh; - local->transaction.done = afr_openfd_flush_done; - - local->transaction.start = 0; - local->transaction.len = 0; - - gf_log (this->name, GF_LOG_TRACE, - "doing up/down flush on fd=%p", fd); - - afr_transaction (frame, this, AFR_DATA_TRANSACTION); - -out: - return 0; -} - - - -int -afr_openfd_xaction_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int ret = 0; - uint64_t ctx = 0; - afr_fd_ctx_t *fd_ctx = NULL; - int call_count = 0; - int child_index = (long) cookie; - - priv = this->private; - local = frame->local; - int_lock = &local->internal_lock; - - LOCK (&frame->lock); - { - if (op_ret >= 0) { - ret = fd_ctx_get (fd, this, &ctx); - - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "failed to get fd context %p (%s)", - fd, local->loc.path); - goto out; - } - - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - - fd_ctx->opened_on[child_index] = 1; - - gf_log (this->name, GF_LOG_TRACE, - "fd for %s opened successfully on subvolume %s", - local->loc.path, priv->children[child_index]->name); + ALLOC_OR_GOTO (open_local, afr_local_t, out); + ret = AFR_LOCAL_INIT (open_local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; } + loc_copy (&open_local->loc, &local->loc); + open_local->fd = fd_ref (local->fd); + } else { + ret = 0; + open_frame = frame; + open_local = local; } -out: - UNLOCK (&frame->lock); - call_count = afr_frame_return (frame); - - if (call_count == 0) { - afr_openfd_xaction (frame, this, local->fd); - } - - return 0; -} - - -int -afr_openfd_flush (call_frame_t *frame, xlator_t *this, fd_t *fd) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - uint64_t ctx = 0; - afr_fd_ctx_t *fd_ctx = NULL; - int no_open = 0; - int ret = 0; - int i = 0; - int call_count = 0; - - priv = this->private; - local = frame->local; - - /* - * If the file is already deleted while the fd is open, no need to - * perform the openfd flush, call the flush_cbk and get out. - */ - ret = afr_prepare_loc (frame, fd); - if (ret < 0) { - local->openfd_flush_cbk (frame, this); - goto out; - } + open_local->call_count = need_open_count; - /* - * Some subvolumes might have come up on which we never - * opened this fd in the first place. Re-open fd's on those - * subvolumes now. - */ - - local->fd = fd_ref (fd); - - ret = fd_ctx_get (fd, this, &ctx); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "failed to get fd context %p (%s)", - fd, local->loc.path); - no_open = 1; - goto out; - } - - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - - LOCK (&local->fd->lock); - { - call_count = __unopened_count (priv->child_count, - fd_ctx->opened_on, - local->child_up); - } - UNLOCK (&local->fd->lock); - - if (call_count == 0) { - gf_log (this->name, GF_LOG_WARNING, - "fd not open on any subvolume %p (%s)", - fd, local->loc.path); - no_open = 1; - goto out; - } - - local->call_count = call_count; + gf_log (this->name, GF_LOG_DEBUG, "need open count: %d", + need_open_count); for (i = 0; i < priv->child_count; i++) { - if (!fd_ctx->opened_on[i] && local->child_up[i]) { - gf_log (this->name, GF_LOG_TRACE, + if (need_open[i]) { + gf_log (this->name, GF_LOG_DEBUG, "opening fd for %s on subvolume %s", local->loc.path, priv->children[i]->name); - STACK_WIND_COOKIE (frame, afr_openfd_xaction_open_cbk, + STACK_WIND_COOKIE (open_frame, afr_openfd_fix_open_cbk, (void *)(long) i, priv->children[i], priv->children[i]->fops->open, - &local->loc, fd_ctx->flags, fd, - fd_ctx->wbflags); + &open_local->loc, fd_ctx->flags, + open_local->fd, fd_ctx->wbflags); - if (!--call_count) - break; } } - out: - if (no_open) - afr_openfd_xaction (frame, this, fd); - - return 0; + if (ret && open_frame) + AFR_STACK_DESTROY (open_frame); + return ret; } diff --git a/xlators/cluster/afr/src/afr-self-heal-algorithm.c b/xlators/cluster/afr/src/afr-self-heal-algorithm.c index 04b388fe052..1c7cdf41819 100644 --- a/xlators/cluster/afr/src/afr-self-heal-algorithm.c +++ b/xlators/cluster/afr/src/afr-self-heal-algorithm.c @@ -44,279 +44,249 @@ This file contains the various self-heal algorithms */ +static int +sh_loop_driver (call_frame_t *sh_frame, xlator_t *this, + gf_boolean_t is_first_call, call_frame_t *old_loop_frame); +static int +sh_loop_return (call_frame_t *sh_frame, xlator_t *this, call_frame_t *loop_frame, + int32_t op_ret, int32_t op_errno); +static int +sh_destroy_frame (call_frame_t *frame, xlator_t *this) +{ + if (!frame) + goto out; -/* - The "full" algorithm. Copies the entire file from - source to sinks. -*/ - + AFR_STACK_DESTROY (frame); +out: + return 0; +} static void -sh_full_private_cleanup (call_frame_t *frame, xlator_t *this) +sh_private_cleanup (call_frame_t *frame, xlator_t *this) { - afr_local_t * local = NULL; - afr_self_heal_t * sh = NULL; - afr_sh_algo_full_private_t *sh_priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_sh_algo_private_t *sh_priv = NULL; local = frame->local; sh = &local->self_heal; sh_priv = sh->private; - if (sh_priv) GF_FREE (sh_priv); } - static int -sh_full_loop_driver (call_frame_t *frame, xlator_t *this, gf_boolean_t is_first_call); +sh_number_of_writes_needed (unsigned char *write_needed, int child_count) +{ + int writes = 0; + int i = 0; + + for (i = 0; i < child_count; i++) { + if (write_needed[i]) + writes++; + } + + return writes; +} + static int -sh_full_loop_driver_done (call_frame_t *frame, xlator_t *this) +sh_loop_driver_done (call_frame_t *frame, xlator_t *this, + call_frame_t *last_loop_frame) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - afr_self_heal_t *sh = NULL; - afr_sh_algo_full_private_t *sh_priv = NULL; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_sh_algo_private_t *sh_priv = NULL; + int32_t total_blocks = 0; + int32_t diff_blocks = 0; - priv = this->private; - local = frame->local; - sh = &local->self_heal; - sh_priv = sh->private; + priv = this->private; + local = frame->local; + sh = &local->self_heal; + sh_priv = sh->private; + total_blocks = sh_priv->total_blocks; + diff_blocks = sh_priv->diff_blocks; - sh_full_private_cleanup (frame, this); + sh_private_cleanup (frame, this); if (sh->op_failed) { + GF_ASSERT (!last_loop_frame); + //loop_finish should have happened and the old_loop should be NULL gf_log (this->name, GF_LOG_INFO, - "full self-heal aborting on %s", + "self-heal aborting on %s", local->loc.path); local->self_heal.algo_abort_cbk (frame, this); } else { - gf_log (this->name, GF_LOG_INFO, - "full self-heal completed on %s", - local->loc.path); + GF_ASSERT (last_loop_frame); + if (diff_blocks == total_blocks) { + gf_log (this->name, GF_LOG_INFO, "full self-heal " + "completed on %s",local->loc.path); + } else { + gf_log (this->name, GF_LOG_INFO, + "diff self-heal on %s: completed. " + "(%d blocks of %d were different (%.2f%%))", + local->loc.path, diff_blocks, total_blocks, + ((diff_blocks * 1.0)/total_blocks) * 100); + } + sh->old_loop_frame = last_loop_frame; local->self_heal.algo_completion_cbk (frame, this); } + return 0; } -static int -sh_full_loop_return (call_frame_t *rw_frame, xlator_t *this, off_t offset) +int +sh_loop_finish (call_frame_t *loop_frame, xlator_t *this) { - afr_local_t * rw_local = NULL; - afr_self_heal_t * rw_sh = NULL; - call_frame_t *sh_frame = NULL; - afr_local_t * sh_local = NULL; - afr_self_heal_t *sh = NULL; - afr_sh_algo_full_private_t *sh_priv = NULL; - - rw_local = rw_frame->local; - rw_sh = &rw_local->self_heal; + afr_local_t *loop_local = NULL; + afr_self_heal_t *loop_sh = NULL; - sh_frame = rw_sh->sh_frame; - sh_local = sh_frame->local; - sh = &sh_local->self_heal; - sh_priv = sh->private; - - AFR_STACK_DESTROY (rw_frame); + if (!loop_frame) + goto out; - sh_full_loop_driver (sh_frame, this, _gf_false); + loop_local = loop_frame->local; + if (loop_local) { + loop_sh = &loop_local->self_heal; + } + if (loop_sh && loop_sh->loop_completion_cbk) { + if (loop_sh->data_lock_held) { + afr_sh_data_unlock (loop_frame, this, + loop_sh->loop_completion_cbk); + } else { + loop_sh->loop_completion_cbk (loop_frame, this); + } + } else { + //default loop_completion_cbk destroys the loop_frame + if (loop_sh && !loop_sh->loop_completion_cbk) + GF_ASSERT (!loop_sh->data_lock_held); + sh_destroy_frame (loop_frame, this); + } +out: return 0; } - static int -sh_full_write_cbk (call_frame_t *rw_frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf) +sh_loop_lock_success (call_frame_t *loop_frame, xlator_t *this) { - afr_private_t * priv = NULL; - afr_local_t * rw_local = NULL; - afr_self_heal_t *rw_sh = NULL; - call_frame_t *sh_frame = NULL; - afr_local_t * sh_local = NULL; - afr_self_heal_t *sh = NULL; - int child_index = (long) cookie; - int call_count = 0; - - priv = this->private; - - rw_local = rw_frame->local; - rw_sh = &rw_local->self_heal; - - sh_frame = rw_sh->sh_frame; - sh_local = sh_frame->local; - sh = &sh_local->self_heal; - - gf_log (this->name, GF_LOG_TRACE, - "wrote %d bytes of data from %s to child %d, offset %"PRId64"", - op_ret, sh_local->loc.path, child_index, - rw_sh->offset - op_ret); - - LOCK (&sh_frame->lock); - { - if (op_ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "write to %s failed on subvolume %s (%s)", - sh_local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); + afr_local_t *loop_local = NULL; + afr_self_heal_t *loop_sh = NULL; - sh->op_failed = 1; - } - } - UNLOCK (&sh_frame->lock); + loop_local = loop_frame->local; + loop_sh = &loop_local->self_heal; - call_count = afr_frame_return (rw_frame); - - if (call_count == 0) { - sh_full_loop_return (rw_frame, this, rw_sh->offset - op_ret); - } + sh_loop_finish (loop_sh->old_loop_frame, this); + loop_sh->old_loop_frame = NULL; + gf_log (this->name, GF_LOG_DEBUG, "Aquired lock for range %"PRIu64 + " %"PRIu64, loop_sh->offset, loop_sh->block_size); + loop_sh->data_lock_held = _gf_true; + loop_sh->sh_data_algo_start (loop_frame, this); return 0; } - static int -sh_full_read_cbk (call_frame_t *rw_frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - struct iovec *vector, int32_t count, struct iatt *buf, - struct iobref *iobref) +sh_loop_lock_failure (call_frame_t *loop_frame, xlator_t *this) { - afr_private_t * priv = NULL; - afr_local_t * rw_local = NULL; - afr_self_heal_t *rw_sh = NULL; - call_frame_t *sh_frame = NULL; - afr_local_t * sh_local = NULL; - afr_self_heal_t *sh = NULL; - int i = 0; - int call_count = 0; - off_t offset = (long) cookie; - - priv = this->private; - rw_local = rw_frame->local; - rw_sh = &rw_local->self_heal; - - sh_frame = rw_sh->sh_frame; - sh_local = sh_frame->local; - sh = &sh_local->self_heal; - - call_count = sh->active_sinks; - - rw_local->call_count = call_count; - - gf_log (this->name, GF_LOG_TRACE, - "read %d bytes of data from %s, offset %"PRId64"", - op_ret, sh_local->loc.path, offset); - - if (op_ret <= 0) { - gf_log (this->name, GF_LOG_ERROR, - "read from %s failed on subvolume %s (%s)", - sh_local->loc.path, - priv->children[sh->source]->name, - strerror (op_errno)); - sh->op_failed = 1; - sh_full_loop_return (rw_frame, this, offset); - return 0; - } - - rw_sh->offset += op_ret; - - if (sh->file_has_holes) { - if (iov_0filled (vector, count) == 0) { - /* the iter function depends on the - sh->offset already being updated - above - */ - gf_log (this->name, GF_LOG_DEBUG, - "block has all 0 filled"); - sh_full_loop_return (rw_frame, this, offset); - goto out; - } - } - - for (i = 0; i < priv->child_count; i++) { - if (sh->sources[i] || !sh_local->child_up[i]) - continue; - - /* this is a sink, so write to it */ - - STACK_WIND_COOKIE (rw_frame, sh_full_write_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->writev, - sh->healing_fd, vector, count, offset, - iobref); - - if (!--call_count) - break; - } - -out: + call_frame_t *sh_frame = NULL; + afr_local_t *loop_local = NULL; + afr_self_heal_t *loop_sh = NULL; + + loop_local = loop_frame->local; + loop_sh = &loop_local->self_heal; + sh_frame = loop_sh->sh_frame; + + gf_log (this->name, GF_LOG_ERROR, "failed lock for range %"PRIu64 + " %"PRIu64, loop_sh->offset, loop_sh->block_size); + sh_loop_finish (loop_sh->old_loop_frame, this); + loop_sh->old_loop_frame = NULL; + sh_loop_return (sh_frame, this, loop_frame, -1, ENOTCONN); return 0; } - static int -sh_full_read_write (call_frame_t *frame, xlator_t *this, off_t offset) +sh_loop_start (call_frame_t *sh_frame, xlator_t *this, off_t offset, + call_frame_t *old_loop_frame) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - afr_local_t * rw_local = NULL; - afr_self_heal_t *rw_sh = NULL; - afr_self_heal_t *sh = NULL; - call_frame_t *rw_frame = NULL; - int32_t op_errno = 0; + call_frame_t *new_loop_frame = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_local_t *new_loop_local = NULL; + afr_self_heal_t *new_loop_sh = NULL; + afr_private_t *priv = NULL; - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - rw_frame = copy_frame (frame); - if (!rw_frame) - goto out; + GF_ASSERT (sh_frame); - ALLOC_OR_GOTO (rw_local, afr_local_t, out); - - rw_frame->local = rw_local; - rw_sh = &rw_local->self_heal; + local = sh_frame->local; + sh = &local->self_heal; + priv = this->private; - rw_sh->offset = offset; - rw_sh->sh_frame = frame; + new_loop_frame = copy_frame (sh_frame); + if (!new_loop_frame) + goto out; + //We want the frame to have same lk_oner as sh_frame + new_loop_local = afr_local_copy (local, this); + if (!new_loop_local) + goto out; + new_loop_frame->local = new_loop_local; - STACK_WIND_COOKIE (rw_frame, sh_full_read_cbk, - (void *) (long) offset, - priv->children[sh->source], - priv->children[sh->source]->fops->readv, - sh->healing_fd, sh->block_size, - offset); + new_loop_sh = &new_loop_local->self_heal; + new_loop_sh->sources = memdup (sh->sources, + priv->child_count * sizeof (*sh->sources)); + if (!new_loop_sh->sources) + goto out; + new_loop_sh->write_needed = GF_CALLOC (priv->child_count, + sizeof (*new_loop_sh->write_needed), + gf_afr_mt_char); + if (!new_loop_sh->write_needed) + goto out; + new_loop_sh->checksum = GF_CALLOC (priv->child_count, MD5_DIGEST_LEN, + gf_afr_mt_uint8_t); + if (!new_loop_sh->checksum) + goto out; + new_loop_sh->offset = offset; + new_loop_sh->block_size = sh->block_size; + new_loop_sh->old_loop_frame = old_loop_frame; + new_loop_sh->sh_frame = sh_frame; + new_loop_sh->inode = inode_ref (sh->inode); + new_loop_sh->sh_data_algo_start = sh->sh_data_algo_start; + new_loop_sh->source = sh->source; + new_loop_sh->active_sinks = sh->active_sinks; + new_loop_sh->healing_fd = fd_ref (sh->healing_fd); + new_loop_sh->file_has_holes = sh->file_has_holes; + new_loop_sh->loop_completion_cbk = sh_destroy_frame; + afr_sh_data_lock (new_loop_frame, this, offset, new_loop_sh->block_size, + sh_loop_lock_success, sh_loop_lock_failure); return 0; - out: sh->op_failed = 1; - - sh_full_loop_driver (frame, this, _gf_false); - + if (new_loop_frame) { + new_loop_frame->local = new_loop_local; + } + if (old_loop_frame) + sh_loop_finish (old_loop_frame, this); + sh_loop_return (sh_frame, this, new_loop_frame, -1, ENOMEM); return 0; } - static int -sh_full_loop_driver (call_frame_t *frame, xlator_t *this, gf_boolean_t is_first_call) +sh_loop_driver (call_frame_t *sh_frame, xlator_t *this, + gf_boolean_t is_first_call, call_frame_t *old_loop_frame) { afr_private_t * priv = NULL; afr_local_t * local = NULL; - afr_self_heal_t *sh = NULL; - afr_sh_algo_full_private_t *sh_priv = NULL; + afr_self_heal_t * sh = NULL; + afr_sh_algo_private_t *sh_priv = NULL; gf_boolean_t is_driver_done = _gf_false; blksize_t block_size = 0; - off_t offset = 0; int loop = 0; + off_t offset = 0; priv = this->private; - local = frame->local; + local = sh_frame->local; sh = &local->self_heal; sh_priv = sh->private; @@ -324,23 +294,18 @@ sh_full_loop_driver (call_frame_t *frame, xlator_t *this, gf_boolean_t is_first_ { if (_gf_false == is_first_call) sh_priv->loops_running--; - offset = sh_priv->offset; - block_size = sh->block_size; - while ((sh->op_failed == 0) && + offset = sh_priv->offset; + block_size = sh->block_size; + while ((!sh->eof_reached) && (0 == sh->op_failed) && (sh_priv->loops_running < priv->data_self_heal_window_size) && (sh_priv->offset < sh->file_size)) { loop++; - gf_log (this->name, GF_LOG_TRACE, - "spawning a loop for offset %"PRId64, - sh_priv->offset); - - sh_priv->offset += sh->block_size; + sh_priv->offset += block_size; sh_priv->loops_running++; if (_gf_false == is_first_call) break; - } if (0 == sh_priv->loops_running) { is_driver_done = _gf_true; @@ -348,274 +313,130 @@ sh_full_loop_driver (call_frame_t *frame, xlator_t *this, gf_boolean_t is_first_ } UNLOCK (&sh_priv->lock); + if (0 == loop) { + //loop finish does unlock, but the erasing of the pending + //xattrs needs to happen before that so do not finish the loop + if (is_driver_done && !sh->op_failed) + goto driver_done; + if (old_loop_frame) { + sh_loop_finish (old_loop_frame, this); + old_loop_frame = NULL; + } + } + + //If we have more loops to form we should finish previous loop after + //the next loop lock while (loop--) { if (sh->op_failed) { // op failed in other loop, stop spawning more loops - sh_full_loop_driver (frame, this, _gf_false); + if (old_loop_frame) { + sh_loop_finish (old_loop_frame, this); + old_loop_frame = NULL; + } + sh_loop_driver (sh_frame, this, _gf_false, NULL); } else { - sh_full_read_write (frame, this, offset); + gf_log (this->name, GF_LOG_TRACE, "spawning a loop " + "for offset %"PRId64, offset); + + sh_loop_start (sh_frame, this, offset, old_loop_frame); + old_loop_frame = NULL; offset += block_size; } } +driver_done: if (is_driver_done) { - sh_full_loop_driver_done (frame, this); + sh_loop_driver_done (sh_frame, this, old_loop_frame); } - - return 0; -} - - -int -afr_sh_algo_full (call_frame_t *frame, xlator_t *this) -{ - afr_local_t * local = NULL; - afr_self_heal_t * sh = NULL; - afr_sh_algo_full_private_t *sh_priv = NULL; - - local = frame->local; - sh = &local->self_heal; - - sh_priv = GF_CALLOC (1, sizeof (*sh_priv), - gf_afr_mt_afr_private_t); - if (!sh_priv) - goto out; - - LOCK_INIT (&sh_priv->lock); - - sh->private = sh_priv; - - local->call_count = 0; - - sh_full_loop_driver (frame, this, _gf_true); -out: return 0; } - -/* - * The "diff" algorithm. Copies only those blocks whose checksums - * don't match with those of source. - */ - - -static void -sh_diff_private_cleanup (call_frame_t *frame, xlator_t *this) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - afr_self_heal_t * sh = NULL; - afr_sh_algo_diff_private_t *sh_priv = NULL; - int i = 0; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - sh_priv = sh->private; - - for (i = 0; i < priv->data_self_heal_window_size; i++) { - if (sh_priv->loops[i]) { - if (sh_priv->loops[i]->write_needed) - GF_FREE (sh_priv->loops[i]->write_needed); - - if (sh_priv->loops[i]->checksum) - GF_FREE (sh_priv->loops[i]->checksum); - - GF_FREE (sh_priv->loops[i]); - } - } - - if (sh_priv) { - if (sh_priv->loops) - GF_FREE (sh_priv->loops); - - GF_FREE (sh_priv); - } - - -} - - -static uint32_t -__make_cookie (int loop_index, int child_index) -{ - uint32_t ret = ((loop_index << 16) | child_index); - return ret; -} - - -static int -__loop_index (uint32_t cookie) -{ - return ((cookie & 0xFFFF0000) >> 16); -} - - -static int -__child_index (uint32_t cookie) -{ - return (cookie & 0x0000FFFF); -} - - -static void -sh_diff_loop_state_reset (struct sh_diff_loop_state *loop_state, int child_count) -{ - loop_state->active = _gf_false; -// loop_state->offset = 0; - - memset (loop_state->write_needed, - 0, sizeof (*loop_state->write_needed) * child_count); - - memset (loop_state->checksum, - 0, MD5_DIGEST_LEN * child_count); -} - - static int -sh_diff_number_of_writes_needed (unsigned char *write_needed, int child_count) -{ - int writes = 0; - int i = 0; - - for (i = 0; i < child_count; i++) { - if (write_needed[i]) - writes++; - } - - return writes; -} - - -static int -sh_diff_loop_driver_done (call_frame_t *frame, xlator_t *this) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - afr_self_heal_t * sh = NULL; - afr_sh_algo_diff_private_t *sh_priv = NULL; - int32_t total_blocks = 0; - int32_t diff_blocks = 0; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - sh_priv = sh->private; - total_blocks = sh_priv->total_blocks; - diff_blocks = sh_priv->diff_blocks; - - sh_diff_private_cleanup (frame, this); - if (sh->op_failed) { - gf_log (this->name, GF_LOG_INFO, - "diff self-heal aborting on %s", - local->loc.path); - - local->self_heal.algo_abort_cbk (frame, this); - } else { - gf_log (this->name, GF_LOG_INFO, - "diff self-heal on %s: completed. " - "(%d blocks of %d were different (%.2f%%))", - local->loc.path, diff_blocks, total_blocks, - ((diff_blocks * 1.0)/total_blocks) * 100); - - local->self_heal.algo_completion_cbk (frame, this); - } - - return 0; -} - -static int -sh_diff_loop_driver (call_frame_t *frame, xlator_t *this, - gf_boolean_t is_first_call, - struct sh_diff_loop_state *loop_state); - -static int -sh_diff_loop_return (call_frame_t *rw_frame, xlator_t *this, - struct sh_diff_loop_state *loop_state) +sh_loop_return (call_frame_t *sh_frame, xlator_t *this, call_frame_t *loop_frame, + int32_t op_ret, int32_t op_errno) { afr_private_t * priv = NULL; - afr_local_t * rw_local = NULL; - afr_self_heal_t * rw_sh = NULL; - call_frame_t *sh_frame = NULL; + afr_local_t * loop_local = NULL; + afr_self_heal_t * loop_sh = NULL; afr_local_t * sh_local = NULL; afr_self_heal_t *sh = NULL; - afr_sh_algo_diff_private_t *sh_priv = NULL; + afr_sh_algo_private_t *sh_priv = NULL; priv = this->private; - rw_local = rw_frame->local; - rw_sh = &rw_local->self_heal; - - sh_frame = rw_sh->sh_frame; sh_local = sh_frame->local; sh = &sh_local->self_heal; sh_priv = sh->private; - gf_log (this->name, GF_LOG_TRACE, - "loop for offset %"PRId64" returned", loop_state->offset); + if (loop_frame) { + loop_local = loop_frame->local; + if (loop_local) + loop_sh = &loop_local->self_heal; + if (loop_sh) + gf_log (this->name, GF_LOG_TRACE, "loop for offset " + "%"PRId64" returned", loop_sh->offset); + } - AFR_STACK_DESTROY (rw_frame); + if (op_ret == -1) { + sh->op_failed = 1; + afr_sh_set_error (sh, op_errno); + if (loop_frame) { + sh_loop_finish (loop_frame, this); + loop_frame = NULL; + } + } - sh_diff_loop_driver (sh_frame, this, _gf_false, loop_state); + sh_loop_driver (sh_frame, this, _gf_false, loop_frame); return 0; } - static int -sh_diff_write_cbk (call_frame_t *rw_frame, void *cookie, xlator_t *this, +sh_loop_write_cbk (call_frame_t *loop_frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *buf, struct iatt *postbuf) { afr_private_t * priv = NULL; - afr_local_t * rw_local = NULL; - afr_self_heal_t * rw_sh = NULL; + afr_local_t * loop_local = NULL; + afr_self_heal_t * loop_sh = NULL; call_frame_t *sh_frame = NULL; afr_local_t * sh_local = NULL; afr_self_heal_t *sh = NULL; - afr_sh_algo_diff_private_t *sh_priv = NULL; - struct sh_diff_loop_state *loop_state = NULL; + afr_sh_algo_private_t *sh_priv = NULL; int call_count = 0; int child_index = 0; - int loop_index = 0; priv = this->private; - rw_local = rw_frame->local; - rw_sh = &rw_local->self_heal; + loop_local = loop_frame->local; + loop_sh = &loop_local->self_heal; - sh_frame = rw_sh->sh_frame; + sh_frame = loop_sh->sh_frame; sh_local = sh_frame->local; sh = &sh_local->self_heal; sh_priv = sh->private; - child_index = __child_index ((uint32_t) (long) cookie); - loop_index = __loop_index ((uint32_t) (long) cookie); - loop_state = sh_priv->loops[loop_index]; + child_index = (long) cookie; gf_log (this->name, GF_LOG_TRACE, "wrote %d bytes of data from %s to child %d, offset %"PRId64"", - op_ret, sh_local->loc.path, child_index, - loop_state->offset); + op_ret, sh_local->loc.path, child_index, loop_sh->offset); - LOCK (&sh_frame->lock); - { - if (op_ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "write to %s failed on subvolume %s (%s)", - sh_local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "write to %s failed on subvolume %s (%s)", + sh_local->loc.path, + priv->children[child_index]->name, + strerror (op_errno)); - sh->op_failed = 1; - } + sh->op_failed = 1; + afr_sh_set_error (loop_sh, op_errno); } - UNLOCK (&sh_frame->lock); - call_count = afr_frame_return (rw_frame); + call_count = afr_frame_return (loop_frame); if (call_count == 0) { - sh_diff_loop_return (rw_frame, this, loop_state); + sh_loop_return (sh_frame, this, loop_frame, + loop_sh->op_ret, loop_sh->op_errno); } return 0; @@ -623,74 +444,71 @@ sh_diff_write_cbk (call_frame_t *rw_frame, void *cookie, xlator_t *this, static int -sh_diff_read_cbk (call_frame_t *rw_frame, void *cookie, +sh_loop_read_cbk (call_frame_t *loop_frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iovec *vector, int32_t count, struct iatt *buf, struct iobref *iobref) { afr_private_t * priv = NULL; - afr_local_t * rw_local = NULL; - afr_self_heal_t * rw_sh = NULL; - afr_sh_algo_diff_private_t * sh_priv = NULL; + afr_local_t * loop_local = NULL; + afr_self_heal_t * loop_sh = NULL; call_frame_t *sh_frame = NULL; - afr_local_t * sh_local = NULL; - afr_self_heal_t *sh = NULL; - int loop_index = 0; - struct sh_diff_loop_state *loop_state = NULL; - uint32_t wcookie = 0; int i = 0; int call_count = 0; + afr_local_t * sh_local = NULL; + afr_self_heal_t * sh = NULL; - priv = this->private; - rw_local = rw_frame->local; - rw_sh = &rw_local->self_heal; + priv = this->private; + loop_local = loop_frame->local; + loop_sh = &loop_local->self_heal; - sh_frame = rw_sh->sh_frame; + sh_frame = loop_sh->sh_frame; sh_local = sh_frame->local; sh = &sh_local->self_heal; - sh_priv = sh->private; - - loop_index = __loop_index ((uint32_t) (long) cookie); - loop_state = sh_priv->loops[loop_index]; - - call_count = sh_diff_number_of_writes_needed (loop_state->write_needed, - priv->child_count); - - rw_local->call_count = call_count; gf_log (this->name, GF_LOG_TRACE, "read %d bytes of data from %s, offset %"PRId64"", - op_ret, sh_local->loc.path, loop_state->offset); - - if ((op_ret <= 0) || - (call_count == 0)) { - sh_diff_loop_return (rw_frame, this, loop_state); + op_ret, loop_local->loc.path, loop_sh->offset); - return 0; + if (op_ret <= 0) { + if (op_ret < 0) { + sh->op_failed = 1; + gf_log (this->name, GF_LOG_ERROR, "read failed on %d " + "for %s reason :%s", sh->source, + sh_local->loc.path, strerror (errno)); + } else { + sh->eof_reached = _gf_true; + gf_log (this->name, GF_LOG_DEBUG, "Eof reached for %s", + sh_local->loc.path); + } + sh_loop_return (sh_frame, this, loop_frame, op_ret, op_errno); + goto out; } - if (sh->file_has_holes) { - if (iov_0filled (vector, count) == 0) { + if (loop_sh->file_has_holes && iov_0filled (vector, count) == 0) { gf_log (this->name, GF_LOG_DEBUG, "0 filled block"); - sh_diff_loop_return (rw_frame, this, loop_state); + sh_loop_return (sh_frame, this, loop_frame, + op_ret, op_errno); goto out; - } } - for (i = 0; i < priv->child_count; i++) { - if (loop_state->write_needed[i]) { - wcookie = __make_cookie (loop_index, i); + call_count = sh_number_of_writes_needed (loop_sh->write_needed, + priv->child_count); + GF_ASSERT (call_count > 0); + loop_local->call_count = call_count; - STACK_WIND_COOKIE (rw_frame, sh_diff_write_cbk, - (void *) (long) wcookie, - priv->children[i], - priv->children[i]->fops->writev, - sh->healing_fd, vector, count, - loop_state->offset, iobref); + for (i = 0; i < priv->child_count; i++) { + if (!loop_sh->write_needed[i]) + continue; + STACK_WIND_COOKIE (loop_frame, sh_loop_write_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->writev, + loop_sh->healing_fd, vector, count, + loop_sh->offset, iobref); - if (!--call_count) - break; - } + if (!--call_count) + break; } out: @@ -699,100 +517,77 @@ out: static int -sh_diff_read (call_frame_t *rw_frame, xlator_t *this, - int loop_index) +sh_loop_read (call_frame_t *loop_frame, xlator_t *this) { - afr_private_t * priv = NULL; - afr_local_t * rw_local = NULL; - afr_self_heal_t * rw_sh = NULL; - afr_sh_algo_diff_private_t * sh_priv = NULL; - struct sh_diff_loop_state *loop_state = NULL; - call_frame_t *sh_frame = NULL; - afr_local_t * sh_local = NULL; - afr_self_heal_t *sh = NULL; - uint32_t cookie = 0; + afr_private_t *priv = NULL; + afr_local_t *loop_local = NULL; + afr_self_heal_t *loop_sh = NULL; priv = this->private; - rw_local = rw_frame->local; - rw_sh = &rw_local->self_heal; - - sh_frame = rw_sh->sh_frame; - sh_local = sh_frame->local; - sh = &sh_local->self_heal; - sh_priv = sh->private; - - loop_state = sh_priv->loops[loop_index]; + loop_local = loop_frame->local; + loop_sh = &loop_local->self_heal; - cookie = __make_cookie (loop_index, sh->source); - - STACK_WIND_COOKIE (rw_frame, sh_diff_read_cbk, - (void *) (long) cookie, - priv->children[sh->source], - priv->children[sh->source]->fops->readv, - sh->healing_fd, sh_priv->block_size, - loop_state->offset); + STACK_WIND_COOKIE (loop_frame, sh_loop_read_cbk, + (void *) (long) loop_sh->source, + priv->children[loop_sh->source], + priv->children[loop_sh->source]->fops->readv, + loop_sh->healing_fd, loop_sh->block_size, + loop_sh->offset); return 0; } static int -sh_diff_checksum_cbk (call_frame_t *rw_frame, void *cookie, xlator_t *this, +sh_diff_checksum_cbk (call_frame_t *loop_frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, uint32_t weak_checksum, uint8_t *strong_checksum) { - afr_private_t * priv = NULL; - afr_local_t * rw_local = NULL; - afr_self_heal_t *rw_sh = NULL; - call_frame_t *sh_frame = NULL; - afr_local_t * sh_local = NULL; - afr_self_heal_t *sh = NULL; - afr_sh_algo_diff_private_t * sh_priv = NULL; - int loop_index = 0; + afr_private_t *priv = NULL; + afr_local_t *loop_local = NULL; + afr_self_heal_t *loop_sh = NULL; + call_frame_t *sh_frame = NULL; + afr_local_t *sh_local = NULL; + afr_self_heal_t *sh = NULL; + afr_sh_algo_private_t *sh_priv = NULL; int child_index = 0; - struct sh_diff_loop_state *loop_state = NULL; int call_count = 0; int i = 0; int write_needed = 0; priv = this->private; - rw_local = rw_frame->local; - rw_sh = &rw_local->self_heal; + loop_local = loop_frame->local; + loop_sh = &loop_local->self_heal; - sh_frame = rw_sh->sh_frame; + sh_frame = loop_sh->sh_frame; sh_local = sh_frame->local; sh = &sh_local->self_heal; sh_priv = sh->private; - child_index = __child_index ((uint32_t) (long) cookie); - loop_index = __loop_index ((uint32_t) (long) cookie); - - loop_state = sh_priv->loops[loop_index]; + child_index = (long) cookie; if (op_ret < 0) { gf_log (this->name, GF_LOG_ERROR, "checksum on %s failed on subvolume %s (%s)", sh_local->loc.path, priv->children[child_index]->name, strerror (op_errno)); - sh->op_failed = 1; } else { - memcpy (loop_state->checksum + child_index * MD5_DIGEST_LEN, - strong_checksum, - MD5_DIGEST_LEN); + memcpy (loop_sh->checksum + child_index * MD5_DIGEST_LEN, + strong_checksum, MD5_DIGEST_LEN); } - call_count = afr_frame_return (rw_frame); + call_count = afr_frame_return (loop_frame); if (call_count == 0) { for (i = 0; i < priv->child_count; i++) { if (sh->sources[i] || !sh_local->child_up[i]) continue; - if (memcmp (loop_state->checksum + (i * MD5_DIGEST_LEN), - loop_state->checksum + (sh->source * MD5_DIGEST_LEN), + if (memcmp (loop_sh->checksum + (i * MD5_DIGEST_LEN), + loop_sh->checksum + (sh->source * MD5_DIGEST_LEN), MD5_DIGEST_LEN)) { /* Checksums differ, so this block @@ -802,9 +597,9 @@ sh_diff_checksum_cbk (call_frame_t *rw_frame, void *cookie, xlator_t *this, gf_log (this->name, GF_LOG_DEBUG, "checksum on subvolume %s at offset %" PRId64" differs from that on source", - priv->children[i]->name, loop_state->offset); + priv->children[i]->name, loop_sh->offset); - write_needed = loop_state->write_needed[i] = 1; + write_needed = loop_sh->write_needed[i] = 1; } } @@ -817,271 +612,130 @@ sh_diff_checksum_cbk (call_frame_t *rw_frame, void *cookie, xlator_t *this, UNLOCK (&sh_priv->lock); if (write_needed && !sh->op_failed) { - sh_diff_read (rw_frame, this, loop_index); + sh_loop_read (loop_frame, this); } else { - sh->offset += sh_priv->block_size; - - sh_diff_loop_return (rw_frame, this, loop_state); + sh_loop_return (sh_frame, this, loop_frame, + op_ret, op_errno); } } return 0; } - -static int -sh_diff_find_unused_loop (afr_sh_algo_diff_private_t *sh_priv, int max) -{ - int i = 0; - - LOCK (&sh_priv->lock); - { - for (i = 0; i < max; i++) { - if (sh_priv->loops[i]->active == _gf_false) { - sh_priv->loops[i]->active = _gf_true; - break; - } - } - } - UNLOCK (&sh_priv->lock); - - if (i == max) { - gf_log ("[sh-diff]", GF_LOG_ERROR, - "no free loops found! This shouldn't happen. Please" - " report this to gluster-devel@nongnu.org"); - } - - return i; -} - - static int -sh_diff_checksum (call_frame_t *frame, xlator_t *this, off_t offset) +sh_diff_checksum (call_frame_t *loop_frame, xlator_t *this) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - afr_local_t * rw_local = NULL; - afr_self_heal_t * sh = NULL; - afr_self_heal_t * rw_sh = NULL; - afr_sh_algo_diff_private_t * sh_priv = NULL; - call_frame_t *rw_frame = NULL; - uint32_t cookie = 0; - int loop_index = 0; - struct sh_diff_loop_state *loop_state = NULL; - int32_t op_errno = 0; - int call_count = 0; - int i = 0; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - sh_priv = sh->private; - - rw_frame = copy_frame (frame); - if (!rw_frame) - goto out; - - ALLOC_OR_GOTO (rw_local, afr_local_t, out); + afr_private_t *priv = NULL; + afr_local_t *loop_local = NULL; + afr_self_heal_t *loop_sh = NULL; + afr_sh_algo_private_t *loop_sh_priv = NULL; + int call_count = 0; + int i = 0; - rw_frame->local = rw_local; - rw_sh = &rw_local->self_heal; - - rw_sh->offset = sh->offset; - rw_sh->sh_frame = frame; - - call_count = sh->active_sinks + 1; /* sinks and source */ - - rw_local->call_count = call_count; - - loop_index = sh_diff_find_unused_loop (sh_priv, priv->data_self_heal_window_size); + priv = this->private; + loop_local = loop_frame->local; + loop_sh = &loop_local->self_heal; - loop_state = sh_priv->loops[loop_index]; - loop_state->offset = offset; + loop_sh_priv = loop_sh->private; - /* we need to send both the loop index and child index, - so squeeze them both into a 32-bit number */ + call_count = loop_sh->active_sinks + 1; /* sinks and source */ - cookie = __make_cookie (loop_index, sh->source); + loop_local->call_count = call_count; - STACK_WIND_COOKIE (rw_frame, sh_diff_checksum_cbk, - (void *) (long) cookie, - priv->children[sh->source], - priv->children[sh->source]->fops->rchecksum, - sh->healing_fd, - offset, sh_priv->block_size); + STACK_WIND_COOKIE (loop_frame, sh_diff_checksum_cbk, + (void *) (long) loop_sh->source, + priv->children[loop_sh->source], + priv->children[loop_sh->source]->fops->rchecksum, + loop_sh->healing_fd, + loop_sh->offset, loop_sh->block_size); for (i = 0; i < priv->child_count; i++) { - if (sh->sources[i] || !local->child_up[i]) + if (loop_sh->sources[i] || !loop_local->child_up[i]) continue; - cookie = __make_cookie (loop_index, i); - - STACK_WIND_COOKIE (rw_frame, sh_diff_checksum_cbk, - (void *) (long) cookie, + STACK_WIND_COOKIE (loop_frame, sh_diff_checksum_cbk, + (void *) (long) i, priv->children[i], priv->children[i]->fops->rchecksum, - sh->healing_fd, - offset, sh_priv->block_size); + loop_sh->healing_fd, + loop_sh->offset, loop_sh->block_size); if (!--call_count) break; } - return 0; - -out: - sh->op_failed = 1; - - sh_diff_loop_driver (frame, this, _gf_false, loop_state); - return 0; } - static int -sh_diff_loop_driver (call_frame_t *frame, xlator_t *this, - gf_boolean_t is_first_call, - struct sh_diff_loop_state *loop_state) +sh_full_read_write_to_sinks (call_frame_t *loop_frame, xlator_t *this) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - afr_self_heal_t * sh = NULL; - afr_sh_algo_diff_private_t *sh_priv = NULL; - gf_boolean_t is_driver_done = _gf_false; - blksize_t block_size = 0; - int loop = 0; - off_t offset = 0; - char sh_type_str[256] = {0,}; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - sh_priv = sh->private; - - afr_self_heal_type_str_get(sh, sh_type_str, sizeof(sh_type_str)); - - LOCK (&sh_priv->lock); - { - if (loop_state) - sh_diff_loop_state_reset (loop_state, priv->child_count); - if (_gf_false == is_first_call) - sh_priv->loops_running--; - offset = sh_priv->offset; - block_size = sh_priv->block_size; - while ((0 == sh->op_failed) && - (sh_priv->loops_running < priv->data_self_heal_window_size) - && (sh_priv->offset < sh->file_size)) { - - loop++; - gf_log (this->name, GF_LOG_TRACE, - "spawning a loop for offset %"PRId64, - sh_priv->offset); - - sh_priv->offset += sh_priv->block_size; - sh_priv->loops_running++; - - if (_gf_false == is_first_call) - break; + afr_private_t *priv = NULL; + afr_local_t *loop_local = NULL; + afr_self_heal_t *loop_sh = NULL; + int i = 0; - } - if (0 == sh_priv->loops_running) { - is_driver_done = _gf_true; - } - } - UNLOCK (&sh_priv->lock); - - while (loop--) { - if (sh->op_failed) { - // op failed in other loop, stop spawning more loops - sh_diff_loop_driver (frame, this, _gf_false, NULL); - } else { - sh_diff_checksum (frame, this, offset); - offset += block_size; - } - } + priv = this->private; + loop_local = loop_frame->local; + loop_sh = &loop_local->self_heal; - if (is_driver_done) { - sh_diff_loop_driver_done (frame, this); + for (i = 0; i < priv->child_count; i++) { + if (loop_sh->sources[i] || !loop_local->child_up[i]) + continue; + loop_sh->write_needed[i] = 1; } + sh_loop_read (loop_frame, this); return 0; } +static int +sh_do_nothing (call_frame_t *frame, xlator_t *this) +{ + return 0; +} int -afr_sh_algo_diff (call_frame_t *frame, xlator_t *this) +afr_sh_start_loops (call_frame_t *sh_frame, xlator_t *this, + afr_sh_algo_fn sh_data_algo_start) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - afr_self_heal_t * sh = NULL; - afr_sh_algo_diff_private_t *sh_priv = NULL; - int i = 0; + afr_local_t *sh_local = NULL; + afr_self_heal_t *sh = NULL; + afr_sh_algo_private_t *sh_priv = NULL; - priv = this->private; - local = frame->local; - sh = &local->self_heal; + sh_local = sh_frame->local; + sh = &sh_local->self_heal; sh_priv = GF_CALLOC (1, sizeof (*sh_priv), gf_afr_mt_afr_private_t); if (!sh_priv) - goto err; - - sh_priv->block_size = this->ctx->page_size; - - sh->private = sh_priv; + goto out; LOCK_INIT (&sh_priv->lock); - local->call_count = 0; - - sh_priv->loops = GF_CALLOC (priv->data_self_heal_window_size, - sizeof (*sh_priv->loops), - gf_afr_mt_sh_diff_loop_state); - if (!sh_priv->loops) - goto err; - - for (i = 0; i < priv->data_self_heal_window_size; i++) { - sh_priv->loops[i] = GF_CALLOC (1, sizeof (*sh_priv->loops[i]), - gf_afr_mt_sh_diff_loop_state); - if (!sh_priv->loops[i]) - goto err; - - sh_priv->loops[i]->checksum = GF_CALLOC (priv->child_count, - MD5_DIGEST_LEN, gf_afr_mt_uint8_t); - if (!sh_priv->loops[i]->checksum) - goto err; - - sh_priv->loops[i]->write_needed = GF_CALLOC (priv->child_count, - sizeof (*sh_priv->loops[i]->write_needed), - gf_afr_mt_char); - if (!sh_priv->loops[i]->write_needed) - goto err; - - } + sh->private = sh_priv; + sh->sh_data_algo_start = sh_data_algo_start; - sh_diff_loop_driver (frame, this, _gf_true, NULL); + sh_local->call_count = 0; + sh->loop_completion_cbk = sh_do_nothing; + sh_loop_driver (sh_frame, this, _gf_true, sh_frame); +out: return 0; -err: - if (sh_priv) { - if (sh_priv->loops) { - for (i = 0; i < priv->data_self_heal_window_size; i++) { - if (sh_priv->loops[i]->write_needed) - GF_FREE (sh_priv->loops[i]->write_needed); - if (sh_priv->loops[i]->checksum) - GF_FREE (sh_priv->loops[i]->checksum); - if (sh_priv->loops[i]) - GF_FREE (sh_priv->loops[i]); - } - - GF_FREE (sh_priv->loops); - } +} - GF_FREE (sh_priv); - } +int +afr_sh_algo_diff (call_frame_t *sh_frame, xlator_t *this) +{ + afr_sh_start_loops (sh_frame, this, sh_diff_checksum); return 0; } +int +afr_sh_algo_full (call_frame_t *sh_frame, xlator_t *this) +{ + afr_sh_start_loops (sh_frame, this, sh_full_read_write_to_sinks); + return 0; +} struct afr_sh_algorithm afr_self_heal_algorithms[] = { {.name = "full", .fn = afr_sh_algo_full}, diff --git a/xlators/cluster/afr/src/afr-self-heal-algorithm.h b/xlators/cluster/afr/src/afr-self-heal-algorithm.h index 2790dbc6a52..04d8e8a6ce7 100644 --- a/xlators/cluster/afr/src/afr-self-heal-algorithm.h +++ b/xlators/cluster/afr/src/afr-self-heal-algorithm.h @@ -30,31 +30,13 @@ struct afr_sh_algorithm { }; extern struct afr_sh_algorithm afr_self_heal_algorithms[3]; - -typedef struct { - gf_lock_t lock; - unsigned int loops_running; - off_t offset; -} afr_sh_algo_full_private_t; - -struct sh_diff_loop_state { - off_t offset; - unsigned char *write_needed; - uint8_t *checksum; - gf_boolean_t active; -}; - typedef struct { - size_t block_size; - gf_lock_t lock; unsigned int loops_running; off_t offset; int32_t total_blocks; int32_t diff_blocks; - - struct sh_diff_loop_state **loops; -} afr_sh_algo_diff_private_t; +} afr_sh_algo_private_t; #endif /* __AFR_SELF_HEAL_ALGORITHM_H__ */ diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c index f66bdff8446..0846184c29f 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.c +++ b/xlators/cluster/afr/src/afr-self-heal-common.c @@ -57,6 +57,29 @@ afr_sh_select_source (int sources[], int child_count) return -1; } +void +afr_sh_mark_source_sinks (call_frame_t *frame, xlator_t *this) +{ + int i = 0; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int active_sinks = 0; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (sh->sources[i] == 0 && local->child_up[i] == 1) { + active_sinks++; + sh->success[i] = 1; + } else if (sh->sources[i] == 1 && local->child_up[i] == 1) { + sh->success[i] = 1; + } + } + sh->active_sinks = active_sinks; +} /** * sink_count - return number of sinks in sources array @@ -112,7 +135,7 @@ afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this) ptr += sprintf (ptr, "%d ", pending_matrix[i][j]); } sprintf (ptr, "]"); - gf_log (this->name, GF_LOG_TRACE, + gf_log (this->name, GF_LOG_DEBUG, "pending_matrix: %s", buf); } @@ -718,7 +741,7 @@ out: void afr_sh_pending_to_delta (afr_private_t *priv, dict_t **xattr, - int32_t *delta_matrix[], int success[], + int32_t *delta_matrix[], unsigned char success[], int child_count, afr_transaction_type type) { /* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. */ @@ -970,12 +993,13 @@ afr_sh_missing_entries_finish (call_frame_t *frame, xlator_t *this) return 0; } -static void +void afr_sh_common_lookup_resp_handler (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, inode_t *inode, struct iatt *buf, - dict_t *xattr, struct iatt *postparent) + dict_t *xattr, struct iatt *postparent, + loc_t *loc) { int child_index = 0; afr_local_t *local = NULL; @@ -991,15 +1015,13 @@ afr_sh_common_lookup_resp_handler (call_frame_t *frame, void *cookie, { if (op_ret == 0) { sh->buf[child_index] = *buf; - sh->parentbuf = *postparent; sh->parentbufs[child_index] = *postparent; sh->success_children[sh->success_count] = child_index; sh->success_count++; sh->xattr[child_index] = dict_ref (xattr); } else { - gf_log (this->name, GF_LOG_ERROR, - "path %s on subvolume %s => -1 (%s)", - local->loc.path, + gf_log (this->name, GF_LOG_ERROR, "path %s on subvolume" + " %s => -1 (%s)", loc->path, priv->children[child_index]->name, strerror (op_errno)); local->self_heal.child_errno[child_index] = op_errno; @@ -1201,6 +1223,7 @@ afr_sh_missing_entries_lookup_done (call_frame_t *frame, xlator_t *this) if (sh->gfid_sh_success_cbk) sh->gfid_sh_success_cbk (frame, this); + sh->type = sh->buf[sh->source].ia_type; sh_missing_entries_create (frame, this); return; out: @@ -1227,7 +1250,7 @@ afr_sh_missing_entries_lookup_cbk (call_frame_t *frame, void *cookie, afr_sh_common_lookup_resp_handler (frame, cookie, this, op_ret, op_errno, inode, buf, xattr, - postparent); + postparent, &local->loc); call_count = afr_frame_return (frame); if (call_count == 0) @@ -1417,6 +1440,8 @@ afr_sh_purge_entry_common (call_frame_t *frame, xlator_t *this, for (i = 0; i < priv->child_count; i++) { if (!purge_condition (local, priv, i)) continue; + gf_log (this->name, GF_LOG_INFO, "purging the stale entry %s " + "on %d", local->loc.path, i); afr_sh_call_entry_expunge_remove (frame, this, (long) i, &sh->buf[i], afr_sh_remove_entry_cbk); @@ -1536,6 +1561,8 @@ afr_sh_children_lookup_done (call_frame_t *frame, xlator_t *this) sh->child_errno, priv->child_count, ENOENT); if (fresh_child_enoents == fresh_parent_count) { + gf_log (this->name, GF_LOG_INFO, "Deleting stale file %s", + local->loc.path); afr_sh_set_error (sh, ENOENT); sh->op_failed = 1; afr_sh_purge_entry (frame, this); @@ -1570,10 +1597,13 @@ afr_sh_children_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, struct iatt *postparent) { int call_count = 0; + afr_local_t *local = NULL; + + local = frame->local; afr_sh_common_lookup_resp_handler (frame, cookie, this, op_ret, op_errno, inode, buf, xattr, - postparent); + postparent, &local->loc); call_count = afr_frame_return (frame); if (call_count == 0) @@ -1669,10 +1699,15 @@ afr_sh_conflicting_entry_lookup_cbk (call_frame_t *frame, void *cookie, dict_t *xattr, struct iatt *postparent) { int call_count = 0; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + + local = frame->local; + sh = &local->self_heal; afr_sh_common_lookup_resp_handler (frame, cookie, this, op_ret, op_errno, inode, buf, xattr, - postparent); + postparent, &sh->parent_loc); call_count = afr_frame_return (frame); if (call_count == 0) @@ -1716,8 +1751,7 @@ afr_sh_common_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, priv = this->private; sh = &local->self_heal; - call_count = afr_up_children_count (priv->child_count, - local->child_up); + call_count = afr_up_children_count (local->child_up, priv->child_count); local->call_count = call_count; @@ -1728,7 +1762,7 @@ afr_sh_common_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, if (set_gfid) { gf_log (this->name, GF_LOG_DEBUG, "looking up %s with gfid: %s", - local->loc.path, uuid_utoa (sh->sh_gfid_req)); + loc->path, uuid_utoa (sh->sh_gfid_req)); GF_ASSERT (!uuid_is_null (sh->sh_gfid_req)); afr_set_dict_gfid (xattr_req, sh->sh_gfid_req); } @@ -1739,7 +1773,7 @@ afr_sh_common_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, if (local->child_up[i]) { gf_log (this->name, GF_LOG_DEBUG, "looking up %s on subvolume %s", - local->loc.path, priv->children[i]->name); + loc->path, priv->children[i]->name); STACK_WIND_COOKIE (frame, lookup_cbk, @@ -1906,12 +1940,7 @@ afr_local_t *afr_local_copy (afr_local_t *l, xlator_t *this) shc->need_metadata_self_heal = sh->need_metadata_self_heal; shc->need_entry_self_heal = sh->need_entry_self_heal; shc->forced_merge = sh->forced_merge; - shc->healing_fd_opened = sh->healing_fd_opened; shc->data_lock_held = sh->data_lock_held; - if (sh->healing_fd && !sh->healing_fd_opened) - shc->healing_fd = fd_ref (sh->healing_fd); - else - shc->healing_fd = sh->healing_fd; shc->background = sh->background; shc->type = sh->type; @@ -1919,7 +1948,8 @@ afr_local_t *afr_local_copy (afr_local_t *l, xlator_t *this) if (l->loc.path) loc_copy (&lc->loc, &l->loc); - lc->child_up = memdup (l->child_up, priv->child_count); + lc->child_up = memdup (l->child_up, + sizeof (*lc->child_up) * priv->child_count); if (l->xattr_req) lc->xattr_req = dict_ref (l->xattr_req); @@ -1930,7 +1960,7 @@ afr_local_t *afr_local_copy (afr_local_t *l, xlator_t *this) if (l->internal_lock.inode_locked_nodes) lc->internal_lock.inode_locked_nodes = memdup (l->internal_lock.inode_locked_nodes, - priv->child_count); + sizeof (*lc->internal_lock.inode_locked_nodes) * priv->child_count); else lc->internal_lock.inode_locked_nodes = GF_CALLOC (sizeof (*l->internal_lock.inode_locked_nodes), @@ -1939,7 +1969,7 @@ afr_local_t *afr_local_copy (afr_local_t *l, xlator_t *this) if (l->internal_lock.entry_locked_nodes) lc->internal_lock.entry_locked_nodes = memdup (l->internal_lock.entry_locked_nodes, - priv->child_count); + sizeof (*lc->internal_lock.entry_locked_nodes) * priv->child_count); else lc->internal_lock.entry_locked_nodes = GF_CALLOC (sizeof (*l->internal_lock.entry_locked_nodes), @@ -1948,7 +1978,7 @@ afr_local_t *afr_local_copy (afr_local_t *l, xlator_t *this) if (l->internal_lock.locked_nodes) lc->internal_lock.locked_nodes = memdup (l->internal_lock.locked_nodes, - priv->child_count); + sizeof (*lc->internal_lock.locked_nodes) * priv->child_count); else lc->internal_lock.locked_nodes = GF_CALLOC (sizeof (*l->internal_lock.locked_nodes), @@ -1994,7 +2024,7 @@ afr_self_heal_completion_cbk (call_frame_t *bgsh_frame, xlator_t *this) FRAME_SU_UNDO (bgsh_frame, afr_local_t); - if (!sh->unwound) { + if (!sh->unwound && sh->unwind) { sh->unwind (sh->orig_frame, this, sh->op_ret, sh->op_errno); } @@ -2068,8 +2098,8 @@ afr_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode) gf_afr_mt_iatt); sh->child_errno = GF_CALLOC (priv->child_count, sizeof (int), gf_afr_mt_int); - sh->success = GF_CALLOC (priv->child_count, sizeof (int), - gf_afr_mt_int); + sh->success = GF_CALLOC (priv->child_count, sizeof (*sh->success), + gf_afr_mt_char); sh->xattr = GF_CALLOC (priv->child_count, sizeof (dict_t *), gf_afr_mt_dict_t); sh->sources = GF_CALLOC (sizeof (*sh->sources), priv->child_count, diff --git a/xlators/cluster/afr/src/afr-self-heal-common.h b/xlators/cluster/afr/src/afr-self-heal-common.h index 043ebea2da6..3df5f0a0aa6 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.h +++ b/xlators/cluster/afr/src/afr-self-heal-common.h @@ -53,7 +53,7 @@ afr_build_pending_matrix (char **pending_key, int32_t **pending_matrix, void afr_sh_pending_to_delta (afr_private_t *priv, dict_t **xattr, - int32_t *delta_matrix[], int success[], + int32_t *delta_matrix[], unsigned char success[], int child_count, afr_transaction_type type); int @@ -82,6 +82,15 @@ afr_build_sources (xlator_t *xlator, dict_t **xattr, struct iatt *bufs, int32_t *success_children, afr_transaction_type type); void afr_sh_common_reset (afr_self_heal_t *sh, unsigned int child_count); + +void +afr_sh_common_lookup_resp_handler (call_frame_t *frame, void *cookie, + xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t *inode, struct iatt *buf, + dict_t *xattr, struct iatt *postparent, + loc_t *loc); + int afr_sh_common_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, afr_lookup_cbk_t lookup_cbk, gf_boolean_t set_gfid); @@ -95,4 +104,22 @@ int afr_sh_entry_impunge_create (call_frame_t *impunge_frame, xlator_t *this, int child_index, struct iatt *buf, struct iatt *postparent); +int +afr_sh_data_unlock (call_frame_t *frame, xlator_t *this, + afr_lock_cbk_t lock_cbk); +afr_local_t * +afr_local_copy (afr_local_t *l, xlator_t *this); +int +afr_sh_data_lock (call_frame_t *frame, xlator_t *this, + off_t start, off_t len, + afr_lock_cbk_t success_handler, + afr_lock_cbk_t failure_handler); +void +afr_sh_set_error (afr_self_heal_t *sh, int32_t op_errno); +void +afr_sh_mark_source_sinks (call_frame_t *frame, xlator_t *this); +typedef int +(*afr_fxattrop_cbk_t) (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *xattr); #endif /* __AFR_SELF_HEAL_COMMON_H__ */ diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c index dcaad9c8b47..5db2d94f5cb 100644 --- a/xlators/cluster/afr/src/afr-self-heal-data.c +++ b/xlators/cluster/afr/src/afr-self-heal-data.c @@ -50,6 +50,18 @@ #include "afr-self-heal-algorithm.h" +extern int +sh_loop_finish (call_frame_t *loop_frame, xlator_t *this); + +int +afr_post_sh_big_lock_success (call_frame_t *frame, xlator_t *this); + +int +afr_post_sh_big_lock_failure (call_frame_t *frame, xlator_t *this); + +int +afr_sh_data_finish (call_frame_t *frame, xlator_t *this); + int afr_sh_data_done (call_frame_t *frame, xlator_t *this) { @@ -61,20 +73,6 @@ afr_sh_data_done (call_frame_t *frame, xlator_t *this) sh = &local->self_heal; priv = this->private; - /* - TODO: cleanup sh->* - */ - - if (sh->healing_fd && !sh->healing_fd_opened) { - /* unref only if we created the fd ourselves */ - - fd_unref (sh->healing_fd); - sh->healing_fd = NULL; - } - - /* for (i = 0; i < priv->child_count; i++) */ - /* sh->locked_nodes[i] = 0; */ - sh->completion_cbk (frame, this); return 0; @@ -97,7 +95,7 @@ afr_sh_data_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, { if (op_ret == -1) { gf_log (this->name, GF_LOG_INFO, - "flush or setattr failed on %s on subvolume %s: %s", + "flush failed on %s on subvolume %s: %s", local->loc.path, priv->children[child_index]->name, strerror (op_errno)); } @@ -113,18 +111,6 @@ afr_sh_data_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, return 0; } - -int -afr_sh_data_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *statpre, - struct iatt *statpost) -{ - afr_sh_data_flush_cbk (frame, cookie, this, op_ret, op_errno); - - return 0; -} - - int afr_sh_data_close (call_frame_t *frame, xlator_t *this) { @@ -134,8 +120,6 @@ afr_sh_data_close (call_frame_t *frame, xlator_t *this) int i = 0; int call_count = 0; int source = 0; - int32_t valid = 0; - struct iatt stbuf = {0,}; local = frame->local; sh = &local->self_heal; @@ -143,30 +127,11 @@ afr_sh_data_close (call_frame_t *frame, xlator_t *this) source = sh->source; - valid |= (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME); - - stbuf.ia_atime = sh->buf[source].ia_atime; - stbuf.ia_atime_nsec = sh->buf[source].ia_atime_nsec; - stbuf.ia_mtime = sh->buf[source].ia_mtime; - stbuf.ia_mtime_nsec = sh->buf[source].ia_mtime_nsec; - - if (sh->healing_fd_opened) { - /* not our job to close the fd */ - - afr_sh_data_done (frame, this); - return 0; - } - - if (!sh->healing_fd) { - afr_sh_data_done (frame, this); - return 0; - } - - call_count = (sh->active_sinks + 1) * 2; + call_count = (sh->active_sinks + 1); local->call_count = call_count; /* closed source */ - gf_log (this->name, GF_LOG_TRACE, + gf_log (this->name, GF_LOG_DEBUG, "closing fd of %s on %s", local->loc.path, priv->children[sh->source]->name); @@ -177,14 +142,6 @@ afr_sh_data_close (call_frame_t *frame, xlator_t *this) sh->healing_fd); call_count--; - STACK_WIND_COOKIE (frame, afr_sh_data_setattr_cbk, - (void *) (long) sh->source, - priv->children[sh->source], - priv->children[sh->source]->fops->setattr, - &local->loc, &stbuf, valid); - - call_count--; - if (call_count == 0) return 0; @@ -192,7 +149,7 @@ afr_sh_data_close (call_frame_t *frame, xlator_t *this) if (sh->sources[i] || !local->child_up[i]) continue; - gf_log (this->name, GF_LOG_TRACE, + gf_log (this->name, GF_LOG_DEBUG, "closing fd of %s on %s", local->loc.path, priv->children[i]->name); @@ -202,14 +159,6 @@ afr_sh_data_close (call_frame_t *frame, xlator_t *this) priv->children[i]->fops->flush, sh->healing_fd); - call_count--; - - STACK_WIND_COOKIE (frame, afr_sh_data_setattr_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->setattr, - &local->loc, &stbuf, valid); - if (!--call_count) break; } @@ -217,28 +166,27 @@ afr_sh_data_close (call_frame_t *frame, xlator_t *this) return 0; } - int -afr_sh_data_unlck_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) +afr_sh_data_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *statpre, + struct iatt *statpost) { - afr_local_t * local = NULL; - int call_count = 0; - int child_index = (long) cookie; + + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + int child_index = (long) cookie; local = frame->local; + priv = this->private; LOCK (&frame->lock); { if (op_ret == -1) { gf_log (this->name, GF_LOG_INFO, - "locking inode of %s on child %d failed: %s", - local->loc.path, child_index, + "setattr failed on %s on subvolume %s: %s", + local->loc.path, priv->children[child_index]->name, strerror (op_errno)); - } else { - gf_log (this->name, GF_LOG_TRACE, - "inode of %s on child %d locked", - local->loc.path, child_index); } } UNLOCK (&frame->lock); @@ -246,15 +194,114 @@ afr_sh_data_unlck_cbk (call_frame_t *frame, void *cookie, xlator_t *this, call_count = afr_frame_return (frame); if (call_count == 0) { - afr_sh_data_close (frame, this); + afr_sh_data_finish (frame, this); } return 0; } +int +afr_sh_data_setattr (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_self_heal_t *sh = NULL; + int i = 0; + int call_count = 0; + int source = 0; + int32_t valid = 0; + struct iatt stbuf = {0,}; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + source = sh->source; + + valid |= (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME); + + stbuf.ia_atime = sh->buf[source].ia_atime; + stbuf.ia_atime_nsec = sh->buf[source].ia_atime_nsec; + stbuf.ia_mtime = sh->buf[source].ia_mtime; + stbuf.ia_mtime_nsec = sh->buf[source].ia_mtime_nsec; + + call_count = afr_set_elem_count_get (sh->success, + priv->child_count); + local->call_count = call_count; + + if (call_count == 0) { + GF_ASSERT (0); + afr_sh_data_finish (frame, this); + return 0; + } + + for (i = 0; i < priv->child_count; i++) { + if (!sh->success[i]) + continue; + + STACK_WIND_COOKIE (frame, afr_sh_data_setattr_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->setattr, + &local->loc, &stbuf, valid); + + if (!--call_count) + break; + } + + return 0; +} int -afr_sh_data_unlock (call_frame_t *frame, xlator_t *this) +afr_sh_data_setattr_fstat_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + struct iatt *buf) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + int child_index = (long) cookie; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + GF_ASSERT (sh->source == child_index); + if (op_ret != -1) + sh->buf[child_index] = *buf; + afr_sh_data_setattr (frame, this); + + return 0; +} + +/* + * If there are any writes after the self-heal is triggered then the + * stbuf stored in local->self_heal.buf[] will be invalid so we do one more + * stat on the source and then set the [am]times + */ +int +afr_sh_set_timestamps (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_self_heal_t *sh = NULL; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + STACK_WIND_COOKIE (frame, afr_sh_data_setattr_fstat_cbk, + (void *) (long) sh->source, + priv->children[sh->source], + priv->children[sh->source]->fops->fstat, + sh->healing_fd); + return 0; +} + +//Fun fact, lock_cbk is being used for both lock & unlock +int +afr_sh_data_unlock (call_frame_t *frame, xlator_t *this, + afr_lock_cbk_t lock_cbk) { afr_local_t *local = NULL; afr_internal_lock_t *int_lock = NULL; @@ -264,15 +311,15 @@ afr_sh_data_unlock (call_frame_t *frame, xlator_t *this) int_lock = &local->internal_lock; sh = &local->self_heal; - GF_ASSERT (!sh->data_lock_held); + GF_ASSERT (sh->data_lock_held); - int_lock->lock_cbk = afr_sh_data_close; + sh->data_lock_held = _gf_false; + int_lock->lock_cbk = lock_cbk; afr_unlock (frame, this); return 0; } - int afr_sh_data_finish (call_frame_t *frame, xlator_t *this) { @@ -285,44 +332,52 @@ afr_sh_data_finish (call_frame_t *frame, xlator_t *this) gf_log (this->name, GF_LOG_DEBUG, "finishing data selfheal of %s", local->loc.path); - if (!sh->data_lock_held) - afr_sh_data_unlock (frame, this); + if (sh->data_lock_held) + afr_sh_data_unlock (frame, this, afr_sh_data_close); else afr_sh_data_close (frame, this); return 0; } +int +afr_sh_data_fail (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + + local = frame->local; + sh = &local->self_heal; + + gf_log (this->name, GF_LOG_DEBUG, + "finishing failed data selfheal of %s", local->loc.path); + + sh->op_failed = 1; + if (sh->data_lock_held) + afr_sh_data_unlock (frame, this, afr_sh_data_close); + else + afr_sh_data_close (frame, this); + return 0; +} int afr_sh_data_erase_pending_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xattr) { - afr_local_t *local = NULL; int call_count = 0; - long i = 0; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - - local = frame->local; - priv = this->private; - sh = &local->self_heal; - i = (long)cookie; - afr_children_add_child (sh->fresh_children, i, priv->child_count); call_count = afr_frame_return (frame); if (call_count == 0) { - afr_inode_set_read_ctx (this, sh->inode, sh->source, - sh->fresh_children); - afr_sh_data_finish (frame, this); + afr_sh_data_lock (frame, this, 0, 0, + afr_post_sh_big_lock_success, + afr_post_sh_big_lock_failure); } return 0; } - int afr_sh_data_erase_pending (call_frame_t *frame, xlator_t *this) { @@ -339,6 +394,9 @@ afr_sh_data_erase_pending (call_frame_t *frame, xlator_t *this) afr_sh_pending_to_delta (priv, sh->xattr, sh->delta_matrix, sh->success, priv->child_count, AFR_DATA_TRANSACTION); + gf_log (this->name, GF_LOG_DEBUG, "Delta matrix for: %"PRIu64, + frame->root->lk_owner); + afr_sh_print_pending_matrix (sh->delta_matrix, this); erase_xattr = GF_CALLOC (sizeof (*erase_xattr), priv->child_count, gf_afr_mt_dict_t); @@ -355,12 +413,13 @@ afr_sh_data_erase_pending (call_frame_t *frame, xlator_t *this) afr_sh_delta_to_xattr (priv, sh->delta_matrix, erase_xattr, priv->child_count, AFR_DATA_TRANSACTION); + GF_ASSERT (call_count); local->call_count = call_count; for (i = 0; i < priv->child_count; i++) { if (!erase_xattr[i]) continue; - gf_log (this->name, GF_LOG_TRACE, + gf_log (this->name, GF_LOG_DEBUG, "erasing pending flags from %s on %s", local->loc.path, priv->children[i]->name); @@ -385,85 +444,6 @@ afr_sh_data_erase_pending (call_frame_t *frame, xlator_t *this) } -int -afr_sh_data_trim_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - int call_count = 0; - int child_index = 0; - - priv = this->private; - local = frame->local; - - child_index = (long) cookie; - - LOCK (&frame->lock); - { - if (op_ret == -1) - gf_log (this->name, GF_LOG_INFO, - "ftruncate of %s on subvolume %s failed (%s)", - local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - else - gf_log (this->name, GF_LOG_TRACE, - "ftruncate of %s on subvolume %s completed", - local->loc.path, - priv->children[child_index]->name); - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - afr_sh_data_erase_pending (frame, this); - } - - return 0; -} - - -int -afr_sh_data_trim_sinks (call_frame_t *frame, xlator_t *this) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - afr_self_heal_t *sh = NULL; - int *sources = NULL; - int call_count = 0; - int i = 0; - - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - sources = sh->sources; - call_count = sh->active_sinks; - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (sources[i] || !local->child_up[i]) - continue; - - STACK_WIND_COOKIE (frame, afr_sh_data_trim_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->ftruncate, - sh->healing_fd, sh->file_size); - - if (!--call_count) - break; - } - - return 0; -} - - static struct afr_sh_algorithm * sh_algo_from_name (xlator_t *this, char *name) { @@ -549,64 +529,138 @@ afr_sh_data_sync_prepare (call_frame_t *frame, xlator_t *this) afr_local_t *local = NULL; afr_self_heal_t *sh = NULL; afr_private_t *priv = NULL; - int active_sinks = 0; - int source = 0; - int i = 0; struct afr_sh_algorithm *sh_algo = NULL; local = frame->local; sh = &local->self_heal; priv = this->private; - source = sh->source; + sh->algo_completion_cbk = afr_sh_data_erase_pending; + sh->algo_abort_cbk = afr_sh_data_fail; - for (i = 0; i < priv->child_count; i++) { - if (sh->sources[i] == 0 && local->child_up[i] == 1) { - active_sinks++; - sh->success[i] = 1; - } - } - sh->success[source] = 1; + sh_algo = afr_sh_data_pick_algo (frame, this); - if (active_sinks == 0) { - gf_log (this->name, GF_LOG_INFO, - "no active sinks for performing self-heal on file %s", - local->loc.path); - afr_sh_data_finish (frame, this); - return 0; + sh_algo->fn (frame, this); + + return 0; +} + +int +afr_sh_data_trim_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + int call_count = 0; + int child_index = 0; + + priv = this->private; + local = frame->local; + + child_index = (long) cookie; + + LOCK (&frame->lock); + { + if (op_ret == -1) + gf_log (this->name, GF_LOG_INFO, + "ftruncate of %s on subvolume %s failed (%s)", + local->loc.path, + priv->children[child_index]->name, + strerror (op_errno)); + else + gf_log (this->name, GF_LOG_DEBUG, + "ftruncate of %s on subvolume %s completed", + local->loc.path, + priv->children[child_index]->name); } - sh->active_sinks = active_sinks; + UNLOCK (&frame->lock); - gf_log (this->name, GF_LOG_DEBUG, - "self-healing file %s from subvolume %s to %d other", - local->loc.path, priv->children[source]->name, active_sinks); + call_count = afr_frame_return (frame); - sh->algo_completion_cbk = afr_sh_data_trim_sinks; - sh->algo_abort_cbk = afr_sh_data_finish; + if (call_count == 0) + afr_sh_data_sync_prepare (frame, this); - sh_algo = afr_sh_data_pick_algo (frame, this); + return 0; +} - sh_algo->fn (frame, this); + +int +afr_sh_data_trim_sinks (call_frame_t *frame, xlator_t *this) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + afr_self_heal_t *sh = NULL; + int *sources = NULL; + int call_count = 0; + int i = 0; + + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + sources = sh->sources; + call_count = sh->active_sinks; + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (sources[i] || !local->child_up[i]) + continue; + + STACK_WIND_COOKIE (frame, afr_sh_data_trim_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->ftruncate, + sh->healing_fd, sh->file_size); + + if (!--call_count) + break; + } return 0; } +int +afr_sh_inode_set_read_ctx (afr_self_heal_t *sh, xlator_t *this) +{ + afr_private_t *priv = NULL; + int ret = 0; + + priv = this->private; + sh->source = afr_sh_select_source (sh->sources, priv->child_count); + if (sh->source < 0) { + ret = -1; + goto out; + } + + afr_reset_children (sh->fresh_children, priv->child_count); + afr_get_fresh_children (sh->success_children, sh->sources, + sh->fresh_children, priv->child_count); + afr_inode_set_read_ctx (this, sh->inode, sh->source, + sh->fresh_children); +out: + return ret; +} int afr_sh_data_fix (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; - afr_local_t * orig_local = NULL; afr_self_heal_t *sh = NULL; afr_private_t *priv = NULL; int nsources = 0; int source = 0; int i = 0; + int ret = 0; local = frame->local; sh = &local->self_heal; priv = this->private; + gf_log (this->name, GF_LOG_DEBUG, "Pending matrix for: %"PRIu64, + frame->root->lk_owner); nsources = afr_build_sources (this, sh->xattr, sh->buf, sh->pending_matrix, sh->sources, sh->success_children, AFR_DATA_TRANSACTION); @@ -643,30 +697,26 @@ afr_sh_data_fix (call_frame_t *frame, xlator_t *this) local->govinda_gOvinda = 1; - afr_sh_data_finish (frame, this); + afr_sh_data_fail (frame, this); return 0; } - source = afr_sh_select_source (sh->sources, priv->child_count); - - if (source == -1) { + ret = afr_sh_inode_set_read_ctx (sh, this); + if (ret) { gf_log (this->name, GF_LOG_DEBUG, "No active sources found."); - afr_sh_data_finish (frame, this); + afr_sh_data_fail (frame, this); return 0; } - sh->source = source; - sh->block_size = 65536; /* TODO: make it configurable or use macro */ + source = sh->source; + sh->block_size = this->ctx->page_size; sh->file_size = sh->buf[source].ia_size; if (FILE_HAS_HOLES (&sh->buf[source])) sh->file_has_holes = 1; - orig_local = sh->orig_frame->local; - orig_local->cont.lookup.buf.ia_size = sh->buf[source].ia_size; - /* detect changes not visible through pending flags -- JIC */ for (i = 0; i < priv->child_count; i++) { if (i == source || sh->child_errno[i]) @@ -676,27 +726,25 @@ afr_sh_data_fix (call_frame_t *frame, xlator_t *this) sh->sources[i] = 0; } - afr_reset_children (sh->fresh_children, priv->child_count); - afr_get_fresh_children (sh->success_children, sh->sources, - sh->fresh_children, priv->child_count); - afr_inode_set_read_ctx (this, sh->inode, sh->source, - sh->fresh_children); - - /* - quick-read might have read the file, so send xattr from - the source subvolume (http://bugs.gluster.com/cgi-bin/bugzilla3/show_bug.cgi?id=815) - */ - - dict_unref (orig_local->cont.lookup.xattr); - if (orig_local->cont.lookup.xattrs) - orig_local->cont.lookup.xattr = dict_ref (orig_local->cont.lookup.xattrs[sh->source]); - - if (sh->background) { + if (sh->background && sh->unwind) { sh->unwind (sh->orig_frame, this, sh->op_ret, sh->op_errno); sh->unwound = _gf_true; } - afr_sh_data_sync_prepare (frame, this); + afr_sh_mark_source_sinks (frame, this); + if (sh->active_sinks == 0) { + gf_log (this->name, GF_LOG_INFO, + "no active sinks for performing self-heal on file %s", + local->loc.path); + afr_sh_data_finish (frame, this); + return 0; + } + + gf_log (this->name, GF_LOG_DEBUG, + "self-healing file %s from subvolume %s to %d other", + local->loc.path, priv->children[sh->source]->name, + sh->active_sinks); + afr_sh_data_trim_sinks (frame, this); return 0; } @@ -855,8 +903,8 @@ afr_sh_data_fstat (call_frame_t *frame, xlator_t *this) local = frame->local; sh = &local->self_heal; - call_count = afr_up_children_count (priv->child_count, - local->child_up); + call_count = afr_up_children_count (local->child_up, + priv->child_count); local->call_count = call_count; @@ -878,16 +926,14 @@ afr_sh_data_fstat (call_frame_t *frame, xlator_t *this) return 0; } - -int -afr_sh_data_fxattrop_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *xattr) +void +afr_sh_common_fxattrop_resp_handler (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xattr) { afr_private_t *priv = NULL; afr_local_t *local = NULL; afr_self_heal_t *sh = NULL; - int call_count = -1; int child_index = (long) cookie; local = frame->local; @@ -903,12 +949,55 @@ afr_sh_data_fxattrop_cbk (call_frame_t *frame, void *cookie, priv->children[child_index]->name); sh->xattr[child_index] = dict_ref (xattr); + sh->success_children[sh->success_count] = child_index; + sh->success_count++; } } UNLOCK (&frame->lock); +} + +int +afr_post_sh_data_fxattrop_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *xattr) +{ + int call_count = -1; + int ret = 0; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + + afr_sh_common_fxattrop_resp_handler (frame, cookie, this, op_ret, + op_errno, xattr); + local = frame->local; + sh = &local->self_heal; call_count = afr_frame_return (frame); + if (call_count == 0) { + (void) afr_build_sources (this, sh->xattr, NULL, + sh->pending_matrix, + sh->sources, sh->success_children, + AFR_DATA_TRANSACTION); + ret = afr_sh_inode_set_read_ctx (sh, this); + if (ret) + afr_sh_data_fail (frame, this); + else + afr_sh_set_timestamps (frame, this); + } + return 0; +} + +int +afr_sh_data_fxattrop_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *xattr) +{ + int call_count = -1; + + afr_sh_common_fxattrop_resp_handler (frame, cookie, this, op_ret, + op_errno, xattr); + + call_count = afr_frame_return (frame); if (call_count == 0) { afr_sh_data_fstat (frame, this); } @@ -918,7 +1007,8 @@ afr_sh_data_fxattrop_cbk (call_frame_t *frame, void *cookie, int -afr_sh_data_fxattrop (call_frame_t *frame, xlator_t *this) +afr_sh_data_fxattrop (call_frame_t *frame, xlator_t *this, + afr_fxattrop_cbk_t fxattrop_cbk) { afr_self_heal_t *sh = NULL; afr_local_t *local = NULL; @@ -933,8 +1023,8 @@ afr_sh_data_fxattrop (call_frame_t *frame, xlator_t *this) local = frame->local; sh = &local->self_heal; - call_count = afr_up_children_count (priv->child_count, - local->child_up); + call_count = afr_up_children_count (local->child_up, + priv->child_count); local->call_count = call_count; @@ -963,9 +1053,12 @@ afr_sh_data_fxattrop (call_frame_t *frame, xlator_t *this) } } + afr_reset_xattr (sh->xattr, priv->child_count); + afr_reset_children (sh->success_children, priv->child_count); + sh->success_count = 0; for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_sh_data_fxattrop_cbk, + STACK_WIND_COOKIE (frame, fxattrop_cbk, (void *) (long) i, priv->children[i], priv->children[i]->fops->fxattrop, @@ -992,7 +1085,45 @@ out: } int -afr_sh_data_lock_rec (call_frame_t *frame, xlator_t *this); +afr_sh_data_big_lock_success (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + + local = frame->local; + sh = &local->self_heal; + + sh->data_lock_held = _gf_true; + afr_sh_data_fxattrop (frame, this, afr_sh_data_fxattrop_cbk); + return 0; +} + +int +afr_sh_data_post_blocking_inodelk_cbk (call_frame_t *frame, xlator_t *this) +{ + afr_internal_lock_t *int_lock = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + + local = frame->local; + int_lock = &local->internal_lock; + sh = &local->self_heal; + + if (int_lock->lock_op_ret < 0) { + gf_log (this->name, GF_LOG_ERROR, "Blocking data inodelks " + "failed for %s. by %"PRIu64, + local->loc.path, frame->root->lk_owner); + sh->data_lock_failure_handler (frame, this); + } else { + + gf_log (this->name, GF_LOG_DEBUG, "Blocking data inodelks " + "done for %s by %"PRIu64". Proceding to self-heal", + local->loc.path, frame->root->lk_owner); + sh->data_lock_success_handler (frame, this); + } + + return 0; +} int afr_sh_data_post_nonblocking_inodelk_cbk (call_frame_t *frame, xlator_t *this) @@ -1006,22 +1137,24 @@ afr_sh_data_post_nonblocking_inodelk_cbk (call_frame_t *frame, xlator_t *this) sh = &local->self_heal; if (int_lock->lock_op_ret < 0) { - gf_log (this->name, GF_LOG_ERROR, "Non Blocking data inodelks " - "failed for %s.", local->loc.path); - sh->op_failed = 1; - afr_sh_data_done (frame, this); + gf_log (this->name, GF_LOG_DEBUG, "Non Blocking data inodelks " + "failed for %s. by %"PRIu64, + local->loc.path, frame->root->lk_owner); + int_lock->lock_cbk = afr_sh_data_post_blocking_inodelk_cbk; + afr_blocking_lock (frame, this); } else { gf_log (this->name, GF_LOG_DEBUG, "Non Blocking data inodelks " - "done for %s. Proceeding to FOP", local->loc.path); - afr_sh_data_fxattrop (frame, this); + "done for %s by %"PRIu64". Proceeding to self-heal", + local->loc.path, frame->root->lk_owner); + sh->data_lock_success_handler (frame, this); } return 0; } int -afr_sh_data_lock_rec (call_frame_t *frame, xlator_t *this) +afr_sh_data_lock_rec (call_frame_t *frame, xlator_t *this, off_t start, off_t len) { afr_internal_lock_t *int_lock = NULL; afr_local_t *local = NULL; @@ -1036,8 +1169,8 @@ afr_sh_data_lock_rec (call_frame_t *frame, xlator_t *this) afr_set_lock_number (frame, this); - int_lock->lk_flock.l_start = 0; - int_lock->lk_flock.l_len = 0; + int_lock->lk_flock.l_start = start; + int_lock->lk_flock.l_len = len; int_lock->lk_flock.l_type = F_WRLCK; int_lock->lock_cbk = afr_sh_data_post_nonblocking_inodelk_cbk; @@ -1046,9 +1179,45 @@ afr_sh_data_lock_rec (call_frame_t *frame, xlator_t *this) return 0; } +int +afr_post_sh_big_lock_success (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + + local = frame->local; + sh = &local->self_heal; + + GF_ASSERT (sh->old_loop_frame); + sh_loop_finish (sh->old_loop_frame, this); + sh->old_loop_frame = NULL; + sh->data_lock_held = _gf_true; + afr_sh_data_fxattrop (frame, this, afr_post_sh_data_fxattrop_cbk); + return 0; +} int -afr_sh_data_lock (call_frame_t *frame, xlator_t *this) +afr_post_sh_big_lock_failure (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + + local = frame->local; + sh = &local->self_heal; + + GF_ASSERT (sh->old_loop_frame); + sh_loop_finish (sh->old_loop_frame, this); + sh->old_loop_frame = NULL; + afr_sh_set_timestamps (frame, this); + return 0; +} + + +int +afr_sh_data_lock (call_frame_t *frame, xlator_t *this, + off_t start, off_t len, + afr_lock_cbk_t success_handler, + afr_lock_cbk_t failure_handler) { afr_local_t * local = NULL; afr_private_t * priv = NULL; @@ -1059,18 +1228,11 @@ afr_sh_data_lock (call_frame_t *frame, xlator_t *this) sh = &local->self_heal; priv = this->private; - if (sh->data_lock_held) { - /* caller has held the lock already, - so skip locking */ - - afr_sh_data_fxattrop (frame, this); - return 0; - } - - return afr_sh_data_lock_rec (frame, this); + sh->data_lock_success_handler = success_handler; + sh->data_lock_failure_handler = failure_handler; + return afr_sh_data_lock_rec (frame, this, start, len); } - int afr_sh_data_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, fd_t *fd) @@ -1113,7 +1275,7 @@ afr_sh_data_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (call_count == 0) { if (sh->op_failed) { - afr_sh_data_finish (frame, this); + afr_sh_data_fail (frame, this); return 0; } @@ -1121,7 +1283,9 @@ afr_sh_data_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, "fd for %s opened, commencing sync", local->loc.path); - afr_sh_data_lock (frame, this); + afr_sh_data_lock (frame, this, 0, 0, + afr_sh_data_big_lock_success, + afr_sh_data_fail); } return 0; @@ -1142,14 +1306,7 @@ afr_sh_data_open (call_frame_t *frame, xlator_t *this) sh = &local->self_heal; priv = this->private; - if (sh->healing_fd_opened) { - /* caller has opened the fd for us already, so skip open */ - - afr_sh_data_lock (frame, this); - return 0; - } - - call_count = afr_up_children_count (priv->child_count, local->child_up); + call_count = afr_up_children_count (local->child_up, priv->child_count); local->call_count = call_count; fd = fd_create (local->loc.inode, frame->root->pid); diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c index 9e80cb3d5a5..ddca2619db8 100644 --- a/xlators/cluster/afr/src/afr-self-heal-entry.c +++ b/xlators/cluster/afr/src/afr-self-heal-entry.c @@ -60,18 +60,10 @@ afr_sh_entry_done (call_frame_t *frame, xlator_t *this) sh = &local->self_heal; priv = this->private; - /* - TODO: cleanup sh->* - */ - if (sh->healing_fd) fd_unref (sh->healing_fd); sh->healing_fd = NULL; - /* for (i = 0; i < priv->child_count; i++) { */ - /* sh->locked_nodes[i] = 0; */ - /* } */ - sh->completion_cbk (frame, this); return 0; @@ -2192,9 +2184,7 @@ afr_sh_entry_sync_prepare (call_frame_t *frame, xlator_t *this) afr_local_t *local = NULL; afr_self_heal_t *sh = NULL; afr_private_t *priv = NULL; - int active_sinks = 0; int source = 0; - int i = 0; local = frame->local; sh = &local->self_heal; @@ -2202,37 +2192,31 @@ afr_sh_entry_sync_prepare (call_frame_t *frame, xlator_t *this) source = sh->source; - for (i = 0; i < priv->child_count; i++) { - if (sh->sources[i] == 0 && local->child_up[i] == 1) { - active_sinks++; - sh->success[i] = 1; - } - } + afr_sh_mark_source_sinks (frame, this); if (source != -1) sh->success[source] = 1; - if (active_sinks == 0) { + if (sh->active_sinks == 0) { gf_log (this->name, GF_LOG_TRACE, "no active sinks for self-heal on dir %s", local->loc.path); afr_sh_entry_finish (frame, this); return 0; } - if (source == -1 && active_sinks < 2) { + if (source == -1 && sh->active_sinks < 2) { gf_log (this->name, GF_LOG_TRACE, "cannot sync with 0 sources and 1 sink on dir %s", local->loc.path); afr_sh_entry_finish (frame, this); return 0; } - sh->active_sinks = active_sinks; if (source != -1) gf_log (this->name, GF_LOG_DEBUG, "self-healing directory %s from subvolume %s to " "%d other", local->loc.path, priv->children[source]->name, - active_sinks); + sh->active_sinks); else gf_log (this->name, GF_LOG_DEBUG, "no active sources for %s found. " @@ -2302,25 +2286,13 @@ afr_sh_entry_lookup_cbk (call_frame_t *frame, void *cookie, inode_t *inode, struct iatt *buf, dict_t *xattr, struct iatt *postparent) { + int call_count = 0; afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - int call_count = -1; - int child_index = (long) cookie; local = frame->local; - sh = &local->self_heal; - - LOCK (&frame->lock); - { - if (op_ret != -1) { - sh->xattr[child_index] = dict_ref (xattr); - sh->buf[child_index] = *buf; - sh->success_children[sh->success_count] = child_index; - sh->success_count++; - } - } - UNLOCK (&frame->lock); + afr_sh_common_lookup_resp_handler (frame, cookie, this, op_ret, + op_errno, inode, buf, xattr, + postparent, &local->loc); call_count = afr_frame_return (frame); diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c index 5445132ab8c..fb92cd999ed 100644 --- a/xlators/cluster/afr/src/afr-self-heal-metadata.c +++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c @@ -55,7 +55,6 @@ afr_sh_metadata_done (call_frame_t *frame, xlator_t *this) afr_local_t *local = NULL; afr_self_heal_t *sh = NULL; afr_private_t *priv = NULL; - int i = 0; local = frame->local; sh = &local->self_heal; @@ -63,18 +62,9 @@ afr_sh_metadata_done (call_frame_t *frame, xlator_t *this) // memset (sh->child_errno, 0, sizeof (int) * priv->child_count); memset (sh->buf, 0, sizeof (struct iatt) * priv->child_count); - memset (sh->success, 0, sizeof (int) * priv->child_count); - -/* for (i = 0; i < priv->child_count; i++) { */ -/* sh->locked_nodes[i] = 1; */ -/* } */ - - for (i = 0; i < priv->child_count; i++) { - if (sh->xattr[i]) - dict_unref (sh->xattr[i]); - sh->xattr[i] = NULL; - } + memset (sh->success, 0, sizeof (*sh->success) * priv->child_count); + afr_reset_xattr (sh->xattr, priv->child_count); if (local->govinda_gOvinda) { gf_log (this->name, GF_LOG_INFO, "split-brain detected, aborting selfheal of %s", @@ -438,9 +428,7 @@ afr_sh_metadata_sync_prepare (call_frame_t *frame, xlator_t *this) afr_local_t *local = NULL; afr_self_heal_t *sh = NULL; afr_private_t *priv = NULL; - int active_sinks = 0; int source = 0; - int i = 0; local = frame->local; sh = &local->self_heal; @@ -448,26 +436,19 @@ afr_sh_metadata_sync_prepare (call_frame_t *frame, xlator_t *this) source = sh->source; - for (i = 0; i < priv->child_count; i++) { - if (sh->sources[i] == 0 && local->child_up[i] == 1) { - active_sinks++; - sh->success[i] = 1; - } - } - sh->success[source] = 1; - - if (active_sinks == 0) { + afr_sh_mark_source_sinks (frame, this); + if (sh->active_sinks == 0) { gf_log (this->name, GF_LOG_DEBUG, "no active sinks for performing self-heal on file %s", local->loc.path); afr_sh_metadata_finish (frame, this); return 0; } - sh->active_sinks = active_sinks; gf_log (this->name, GF_LOG_TRACE, "syncing metadata of %s from subvolume %s to %d active sinks", - local->loc.path, priv->children[source]->name, active_sinks); + local->loc.path, priv->children[source]->name, + sh->active_sinks); STACK_WIND (frame, afr_sh_metadata_getxattr_cbk, priv->children[source], @@ -558,8 +539,7 @@ afr_sh_metadata_fix (call_frame_t *frame, xlator_t *this) if ((!IA_ISREG (sh->buf[source].ia_type)) && (!IA_ISDIR (sh->buf[source].ia_type))) { - afr_reset_children (sh->fresh_children, - priv->child_count); + afr_reset_children (sh->fresh_children, priv->child_count); afr_get_fresh_children (sh->success_children, sh->sources, sh->fresh_children, priv->child_count); afr_inode_set_read_ctx (this, sh->inode, sh->source, @@ -579,43 +559,17 @@ afr_sh_metadata_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, struct iatt *postparent) { afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; int call_count = 0; int child_index = 0; local = frame->local; - sh = &local->self_heal; - priv = this->private; child_index = (long) cookie; - LOCK (&frame->lock); - { - if (op_ret == 0) { - gf_log (this->name, GF_LOG_TRACE, - "path %s on subvolume %s is of mode 0%o", - local->loc.path, - priv->children[child_index]->name, - buf->ia_type); - - sh->buf[child_index] = *buf; - if (xattr) - sh->xattr[child_index] = dict_ref (xattr); - sh->success_children[sh->success_count] = child_index; - sh->success_count++; - } else { - gf_log (this->name, GF_LOG_INFO, - "path %s on subvolume %s => -1 (%s)", - local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - - sh->child_errno[child_index] = op_errno; - } - } - UNLOCK (&frame->lock); + afr_sh_common_lookup_resp_handler (frame, cookie, this, op_ret, + op_errno, inode, buf, xattr, + postparent, &local->loc); call_count = afr_frame_return (frame); diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c index fc030433b69..6b0e9a7cfe5 100644 --- a/xlators/cluster/afr/src/afr-transaction.c +++ b/xlators/cluster/afr/src/afr-transaction.c @@ -322,7 +322,7 @@ afr_set_piggyback_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending, memcpy (arr, pending[i], pending_xattr_size); - arr[index]++; + arr[index] = hton32 (ntoh32(arr[index]) + 1); ret = dict_set_bin (xattr, priv->pending_key[i], arr, pending_xattr_size); @@ -468,6 +468,32 @@ out: return; } +int +afr_fxattrop_call_count (afr_transaction_type type, afr_internal_lock_t *int_lock, + unsigned int child_count) +{ + int call_count = 0; + + switch (type) { + case AFR_DATA_TRANSACTION: + case AFR_METADATA_TRANSACTION: + call_count = afr_locked_children_count (int_lock->inode_locked_nodes, + child_count); + break; + + case AFR_ENTRY_TRANSACTION: + case AFR_ENTRY_RENAME_TRANSACTION: + call_count = afr_locked_children_count (int_lock->entry_locked_nodes, + child_count); + break; + } + + if (type == AFR_ENTRY_RENAME_TRANSACTION) { + call_count *= 2; + } + return call_count; +} + int afr_changelog_post_op (call_frame_t *frame, xlator_t *this) @@ -503,12 +529,8 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this) dict_ref (xattr[i]); } - call_count = afr_up_children_count (priv->child_count, local->child_up); - - if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) { - call_count *= 2; - } - + call_count = afr_fxattrop_call_count (local->transaction.type, int_lock, + priv->child_count); local->call_count = call_count; if (local->fd) @@ -546,6 +568,8 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this) for (i = 0; i < priv->child_count; i++) { if (!local->child_up[i]) continue; + if (local->fd && !local->fd_open_on[i]) + continue; ret = afr_set_pending_dict (priv, xattr[i], local->pending); @@ -750,7 +774,6 @@ afr_changelog_pre_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this, return 0; } - int afr_changelog_pre_op (call_frame_t *frame, xlator_t *this) { @@ -762,8 +785,10 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this) afr_fd_ctx_t *fdctx = NULL; afr_local_t *local = NULL; int piggyback = 0; + afr_internal_lock_t *int_lock = NULL; local = frame->local; + int_lock = &local->internal_lock; xattr = alloca (priv->child_count * sizeof (*xattr)); memset (xattr, 0, (priv->child_count * sizeof (*xattr))); @@ -773,13 +798,8 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this) dict_ref (xattr[i]); } - call_count = afr_up_children_count (priv->child_count, - local->child_up); - - if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) { - call_count *= 2; - } - + call_count = afr_fxattrop_call_count (local->transaction.type, int_lock, + priv->child_count); if (call_count == 0) { /* no child is up */ for (i = 0; i < priv->child_count; i++) { @@ -803,6 +823,9 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this) for (i = 0; i < priv->child_count; i++) { if (!local->child_up[i]) continue; + if (local->fd && !local->fd_open_on[i]) + continue; + ret = afr_set_pending_dict (priv, xattr[i], local->pending); @@ -1246,7 +1269,7 @@ afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type) local = frame->local; priv = this->private; - afr_transaction_local_init (local, priv); + afr_transaction_local_init (local, this); local->transaction.resume = afr_transaction_resume; local->transaction.type = type; diff --git a/xlators/cluster/afr/src/afr-transaction.h b/xlators/cluster/afr/src/afr-transaction.h index 4b4428cc59e..10f274feccc 100644 --- a/xlators/cluster/afr/src/afr-transaction.h +++ b/xlators/cluster/afr/src/afr-transaction.h @@ -30,4 +30,6 @@ afr_lock_server_count (afr_private_t *priv, afr_transaction_type type); int32_t afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type); +afr_fd_ctx_t * +afr_fd_ctx_get (fd_t *fd, xlator_t *this); #endif /* __TRANSACTION_H__ */ diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index 236a24a6057..a392dbefa97 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -29,6 +29,7 @@ #include "call-stub.h" #include "compat-errno.h" #include "afr-mem-types.h" +#include "afr-self-heal-algorithm.h" #include "libxlator.h" @@ -147,17 +148,8 @@ typedef struct { gf_boolean_t forced_merge; /* Is this a self-heal triggered to forcibly merge the directories? */ - gf_boolean_t healing_fd_opened; /* true if caller has already - opened fd */ - - gf_boolean_t data_lock_held; /* true if caller has already - acquired 0-0 lock */ - - fd_t *healing_fd; /* set if callers has opened fd */ - gf_boolean_t background; /* do self-heal in background if possible */ - ia_type_t type; /* st_mode of the entry we're doing self-heal on */ inode_t *inode; /* inode on which the self-heal is @@ -208,7 +200,7 @@ typedef struct { int source; int active_source; int active_sinks; - int *success; + unsigned char *success; unsigned char *locked_nodes; int lock_count; @@ -217,24 +209,31 @@ typedef struct { int op_failed; + gf_boolean_t data_lock_held; + gf_boolean_t eof_reached; + fd_t *healing_fd; int file_has_holes; blksize_t block_size; off_t file_size; off_t offset; + unsigned char *write_needed; + uint8_t *checksum; afr_post_remove_call_t post_remove_call; loc_t parent_loc; call_frame_t *orig_frame; + call_frame_t *old_loop_frame; gf_boolean_t unwound; - /* private data for the particular self-heal algorithm */ - void *private; - - int (*flush_self_heal_cbk) (call_frame_t *frame, xlator_t *this); + afr_sh_algo_private_t *private; + afr_lock_cbk_t data_lock_success_handler; + afr_lock_cbk_t data_lock_failure_handler; int (*completion_cbk) (call_frame_t *frame, xlator_t *this); + int (*sh_data_algo_start) (call_frame_t *frame, xlator_t *this); int (*algo_completion_cbk) (call_frame_t *frame, xlator_t *this); + afr_lock_cbk_t loop_completion_cbk; int (*algo_abort_cbk) (call_frame_t *frame, xlator_t *this); void (*gfid_sh_success_cbk) (call_frame_t *sh_frame, xlator_t *this); @@ -327,12 +326,11 @@ typedef struct { uint64_t lock_number; int32_t lk_call_count; + int32_t lk_expected_count; int32_t lock_op_ret; int32_t lock_op_errno; - - int (*lock_cbk) (call_frame_t *, xlator_t *); - + afr_lock_cbk_t lock_cbk; } afr_internal_lock_t; typedef struct _afr_locked_fd { @@ -365,6 +363,7 @@ typedef struct _afr_local { loc_t newloc; fd_t *fd; + int32_t *fd_open_on; glusterfs_fop_t fop; @@ -387,7 +386,7 @@ typedef struct _afr_local { dict_t *dict; int optimistic_change_log; - int (*openfd_flush_cbk) (call_frame_t *frame, xlator_t *this); + int (*fop_call_continue) (call_frame_t *frame, xlator_t *this); /* This struct contains the arguments for the "continuation" @@ -685,10 +684,20 @@ typedef struct _afr_local { struct marker_str marker; } afr_local_t; +typedef enum { + AFR_FD_NOT_OPENED, + AFR_FD_OPENED, + AFR_FD_OPENING +} afr_fd_open_status_t; + +typedef struct { + struct list_head call_list; + call_frame_t *frame; +} afr_fd_paused_call_t; typedef struct { unsigned int *pre_op_done; - unsigned int *opened_on; /* which subvolumes the fd is open on */ + afr_fd_open_status_t *opened_on; /* which subvolumes the fd is open on */ unsigned int *pre_op_piggyback; int flags; @@ -703,6 +712,7 @@ typedef struct { struct list_head entries; /* needed for readdir failover */ unsigned char *locked_on; /* which subvolumes locks have been successful */ + struct list_head paused_calls; /* queued calls while fix_open happens */ } afr_fd_ctx_t; @@ -790,8 +800,11 @@ afr_inode_set_read_ctx (xlator_t *this, inode_t *inode, int32_t read_child, void afr_build_parent_loc (loc_t *parent, loc_t *child); -int -afr_up_children_count (int child_count, unsigned char *child_up); +unsigned int +afr_up_children_count (unsigned char *child_up, unsigned int child_count); + +unsigned int +afr_locked_children_count (unsigned char *children, unsigned int child_count); gf_boolean_t afr_is_fresh_lookup (loc_t *loc, xlator_t *this); @@ -831,7 +844,7 @@ int afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd); int -afr_openfd_flush (call_frame_t *frame, xlator_t *this, fd_t *fd); +afr_launch_openfd_self_heal (call_frame_t *frame, xlator_t *this, fd_t *fd); #define AFR_STACK_UNWIND(fop, frame, params ...) \ do { \ @@ -872,7 +885,7 @@ AFR_BASENAME (const char *str) } int -afr_transaction_local_init (afr_local_t *local, afr_private_t *priv); +afr_transaction_local_init (afr_local_t *local, xlator_t *this); int32_t afr_marker_getxattr (call_frame_t *frame, xlator_t *this, @@ -957,4 +970,18 @@ afr_resultant_errno_get (int32_t *children, void afr_inode_rm_stale_children (xlator_t *this, inode_t *inode, int32_t read_child, int32_t *stale_children); +void +afr_launch_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode, + gf_boolean_t is_background, ia_type_t ia_type, + void (*gfid_sh_success_cbk) (call_frame_t *sh_frame, + xlator_t *this), + int (*unwind) (call_frame_t *frame, xlator_t *this, + int32_t op_ret, int32_t op_errno)); +int +afr_fix_open (call_frame_t *frame, xlator_t *this, afr_fd_ctx_t *fd_ctx, + int need_open_count, int *need_open); +int +afr_open_fd_fix (call_frame_t *frame, xlator_t *this, gf_boolean_t pause_fop); +int +afr_set_elem_count_get (unsigned char *elems, int child_count); #endif /* __AFR_H__ */ diff --git a/xlators/cluster/afr/src/pump.c b/xlators/cluster/afr/src/pump.c index 769e03b14ea..d27ecd21526 100644 --- a/xlators/cluster/afr/src/pump.c +++ b/xlators/cluster/afr/src/pump.c @@ -1654,7 +1654,7 @@ afr_setxattr_wind (call_frame_t *frame, xlator_t *this) local = frame->local; priv = this->private; - call_count = afr_up_children_count (priv->child_count, local->child_up); + call_count = afr_up_children_count (local->child_up, priv->child_count); if (call_count == 0) { local->transaction.resume (frame, this); diff --git a/xlators/features/locks/src/inodelk.c b/xlators/features/locks/src/inodelk.c index 71775439010..1fd6a7e0685 100644 --- a/xlators/features/locks/src/inodelk.c +++ b/xlators/features/locks/src/inodelk.c @@ -143,7 +143,8 @@ __inodelk_grantable (pl_dom_list_t *dom, pl_inode_lock_t *lock) if (list_empty (&dom->inodelk_list)) goto out; list_for_each_entry (l, &dom->inodelk_list, list){ - if (inodelk_conflict (lock, l)) { + if (inodelk_conflict (lock, l) && + !same_inodelk_owner (lock, l)) { ret = l; goto out; } -- cgit