From aee339605337916aaa1e38a0e9ed2422f0f0dcfb Mon Sep 17 00:00:00 2001 From: "Anand V. Avati" Date: Wed, 29 Sep 2010 00:28:07 +0000 Subject: replicate: replace first-write-to-flush optimization use a changelog piggybacking optimization instead of first-write-to-flush optimization and do other cleanups (removal of post-post-op hook etc.) Signed-off-by: Anand V. Avati Signed-off-by: Anand V. Avati Signed-off-by: Vijay Bellur BUG: 1235 (Bug for all pump/migrate commits) URL: http://bugs.gluster.com/cgi-bin/bugzilla3/show_bug.cgi?id=1235 --- xlators/cluster/afr/src/afr-common.c | 142 ++---- xlators/cluster/afr/src/afr-inode-write.c | 17 +- xlators/cluster/afr/src/afr-lk-common.c | 4 - xlators/cluster/afr/src/afr-open.c | 61 +-- xlators/cluster/afr/src/afr-transaction.c | 768 ++++++++++++++---------------- xlators/cluster/afr/src/afr.h | 24 +- 6 files changed, 422 insertions(+), 594 deletions(-) (limited to 'xlators') diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index 13eb57e3ec5..b5f060a8730 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -1188,25 +1188,22 @@ afr_fd_ctx_set (xlator_t *this, fd_t *fd) goto unlock; } - fd_ctx->opened_on = GF_CALLOC (sizeof (*fd_ctx->opened_on), - priv->child_count, - gf_afr_mt_char); - if (!fd_ctx->opened_on) { + fd_ctx->pre_op_piggyback = GF_CALLOC (sizeof (*fd_ctx->pre_op_piggyback), + priv->child_count, + gf_afr_mt_char); + if (!fd_ctx->pre_op_piggyback) { gf_log (this->name, GF_LOG_ERROR, "Out of memory"); ret = -ENOMEM; goto unlock; } - fd_ctx->child_failed = GF_CALLOC ( - sizeof (*fd_ctx->child_failed), - priv->child_count, - gf_afr_mt_char); - - if (!fd_ctx->child_failed) { + fd_ctx->opened_on = GF_CALLOC (sizeof (*fd_ctx->opened_on), + priv->child_count, + gf_afr_mt_char); + if (!fd_ctx->opened_on) { gf_log (this->name, GF_LOG_ERROR, "Out of memory"); - ret = -ENOMEM; goto unlock; } @@ -1351,73 +1348,6 @@ afr_flush_done (call_frame_t *frame, xlator_t *this) } -int -afr_plain_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) - -{ - afr_local_t *local = NULL; - - int call_count = -1; - - local = frame->local; - - LOCK (&frame->lock); - { - if (op_ret == 0) - local->op_ret = 0; - - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) - AFR_STACK_UNWIND (flush, frame, local->op_ret, local->op_errno); - - return 0; -} - - -static int -__no_pre_op_done (xlator_t *this, fd_t *fd) -{ - int i = 0; - int op_ret = 1; - - int _ret = 0; - uint64_t ctx; - afr_fd_ctx_t * fd_ctx = NULL; - - afr_private_t *priv = NULL; - - priv = this->private; - - LOCK (&fd->lock); - { - _ret = __fd_ctx_get (fd, this, &ctx); - - if (_ret < 0) { - goto out; - } - - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - - for (i = 0; i < priv->child_count; i++) { - if (fd_ctx->pre_op_done[i]) { - op_ret = 0; - break; - } - } - } -out: - UNLOCK (&fd->lock); - - return op_ret; -} - - int afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd) { @@ -1431,7 +1361,6 @@ afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd) int op_ret = -1; int op_errno = 0; - int i = 0; int call_count = 0; VALIDATE_OR_GOTO (frame, out); @@ -1450,45 +1379,30 @@ afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd) call_count = afr_up_children_count (priv->child_count, local->child_up); - if (__no_pre_op_done (this, fd)) { - frame->local = local; + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "Out of memory."); + goto out; + } - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_plain_flush_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->flush, - fd); - if (!--call_count) - break; - } - } - } else { - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - goto out; - } + transaction_frame->local = local; - transaction_frame->local = local; + local->op = GF_FOP_FLUSH; - local->op = GF_FOP_FLUSH; + local->transaction.fop = afr_flush_wind; + local->transaction.done = afr_flush_done; + local->transaction.unwind = afr_flush_unwind; - local->transaction.fop = afr_flush_wind; - local->transaction.done = afr_flush_done; - local->transaction.unwind = afr_flush_unwind; + local->fd = fd_ref (fd); - local->fd = fd_ref (fd); + local->transaction.main_frame = frame; + local->transaction.start = 0; + local->transaction.len = 0; - local->transaction.main_frame = frame; - local->transaction.start = 0; - local->transaction.len = 0; + afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); - afr_transaction (transaction_frame, this, AFR_FLUSH_TRANSACTION); - } op_ret = 0; out: @@ -1519,10 +1433,10 @@ afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd) fd_ctx = (afr_fd_ctx_t *)(long) ctx; - if (fd_ctx) { - if (fd_ctx->child_failed) - GF_FREE (fd_ctx->child_failed); + gf_log (this->name, GF_LOG_TRACE, + "hits=%d, miss=%d", fd_ctx->hit, fd_ctx->miss); + if (fd_ctx) { if (fd_ctx->pre_op_done) GF_FREE (fd_ctx->pre_op_done); diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c index 587e4840008..5a2c40cc467 100644 --- a/xlators/cluster/afr/src/afr-inode-write.c +++ b/xlators/cluster/afr/src/afr-inode-write.c @@ -291,14 +291,9 @@ afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, fd_ctx = (afr_fd_ctx_t *)(long) ctx; - if (fd_ctx->down_count < priv->down_count) { - local->up_down_flush_cbk = afr_do_writev; - afr_up_down_flush (frame, this, fd, AFR_CHILD_DOWN_FLUSH); - - } else if (fd_ctx->up_count < priv->up_count) { - local->up_down_flush_cbk = afr_do_writev; - afr_up_down_flush (frame, this, fd, AFR_CHILD_UP_FLUSH); - + if (fd_ctx->up_count < priv->up_count) { + local->openfd_flush_cbk = afr_do_writev; + afr_openfd_flush (frame, this, fd); } else { afr_do_writev (frame, this); } @@ -769,9 +764,9 @@ afr_ftruncate (call_frame_t *frame, xlator_t *this, fd_ctx = (afr_fd_ctx_t *)(long) ctx; - if (fd_ctx->down_count < priv->down_count) { - local->up_down_flush_cbk = afr_do_ftruncate; - afr_up_down_flush (frame, this, fd, AFR_CHILD_DOWN_FLUSH); + if (fd_ctx->up_count < priv->up_count) { + local->openfd_flush_cbk = afr_do_ftruncate; + afr_openfd_flush (frame, this, fd); } else { afr_do_ftruncate (frame, this); } diff --git a/xlators/cluster/afr/src/afr-lk-common.c b/xlators/cluster/afr/src/afr-lk-common.c index 749c6bf9a59..ee53d1d7bfb 100644 --- a/xlators/cluster/afr/src/afr-lk-common.c +++ b/xlators/cluster/afr/src/afr-lk-common.c @@ -433,7 +433,6 @@ is_afr_lock_transaction (afr_local_t *local) switch (local->transaction.type) { case AFR_DATA_TRANSACTION: case AFR_METADATA_TRANSACTION: - case AFR_FLUSH_TRANSACTION: ret = 1; break; @@ -878,7 +877,6 @@ afr_copy_locked_nodes (call_frame_t *frame, xlator_t *this) switch (local->transaction.type) { case AFR_DATA_TRANSACTION: case AFR_METADATA_TRANSACTION: - case AFR_FLUSH_TRANSACTION: memcpy (int_lock->inode_locked_nodes, int_lock->locked_nodes, priv->child_count); @@ -998,7 +996,6 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index) switch (local->transaction.type) { case AFR_DATA_TRANSACTION: case AFR_METADATA_TRANSACTION: - case AFR_FLUSH_TRANSACTION: if (local->fd) { afr_trace_inodelk_in (frame, AFR_INODELK_TRANSACTION, @@ -1110,7 +1107,6 @@ afr_blocking_lock (call_frame_t *frame, xlator_t *this) switch (local->transaction.type) { case AFR_DATA_TRANSACTION: case AFR_METADATA_TRANSACTION: - case AFR_FLUSH_TRANSACTION: initialize_inodelk_variables (frame, this); break; diff --git a/xlators/cluster/afr/src/afr-open.c b/xlators/cluster/afr/src/afr-open.c index 69bbb56b8d0..d943213b277 100644 --- a/xlators/cluster/afr/src/afr-open.c +++ b/xlators/cluster/afr/src/afr-open.c @@ -216,9 +216,8 @@ out: int -afr_up_down_flush_open_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - fd_t *fd) +afr_openfd_sh_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd) { afr_internal_lock_t *int_lock = NULL; afr_local_t *local = NULL; @@ -261,7 +260,7 @@ out: if (call_count == 0) { int_lock->lock_cbk = local->transaction.done; - local->transaction.post_post_op (frame, this); + local->transaction.resume (frame, this); } return 0; @@ -269,7 +268,7 @@ out: static int -__unopened_count (int child_count, unsigned char *opened_on, unsigned char *child_up) +__unopened_count (int child_count, unsigned int *opened_on, unsigned char *child_up) { int i; int count = 0; @@ -284,7 +283,7 @@ __unopened_count (int child_count, unsigned char *opened_on, unsigned char *chil int -afr_up_down_flush_sh_unwind (call_frame_t *frame, xlator_t *this) +afr_openfd_sh_unwind (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; afr_private_t *priv = NULL; @@ -331,7 +330,7 @@ afr_up_down_flush_sh_unwind (call_frame_t *frame, xlator_t *this) "opening fd for %s on subvolume %s", local->loc.path, priv->children[i]->name); - STACK_WIND_COOKIE (frame, afr_up_down_flush_open_cbk, + STACK_WIND_COOKIE (frame, afr_openfd_sh_open_cbk, (void *)(long) i, priv->children[i], priv->children[i]->fops->open, @@ -345,14 +344,14 @@ afr_up_down_flush_sh_unwind (call_frame_t *frame, xlator_t *this) out: if (abandon) - local->transaction.post_post_op (frame, this); + local->transaction.resume (frame, this); return 0; } int -afr_up_down_flush_post_post_op (call_frame_t *frame, xlator_t *this) +afr_openfd_sh (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; afr_self_heal_t *sh = NULL; @@ -375,7 +374,7 @@ afr_up_down_flush_post_post_op (call_frame_t *frame, xlator_t *this) sh->need_data_self_heal = _gf_true; sh->type = local->fd->inode->ia_type; sh->background = _gf_false; - sh->unwind = afr_up_down_flush_sh_unwind; + sh->unwind = afr_openfd_sh_unwind; afr_self_heal_type_str_get(&local->self_heal, sh_type_str, @@ -391,19 +390,7 @@ afr_up_down_flush_post_post_op (call_frame_t *frame, xlator_t *this) int -afr_up_down_flush_wind (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - - local = frame->local; - - local->transaction.resume (frame, this); - return 0; -} - - -int -afr_up_down_flush_done (call_frame_t *frame, xlator_t *this) +afr_openfd_flush_done (call_frame_t *frame, xlator_t *this) { afr_private_t *priv = NULL; afr_local_t *local = NULL; @@ -412,7 +399,6 @@ afr_up_down_flush_done (call_frame_t *frame, xlator_t *this) afr_fd_ctx_t * fd_ctx = NULL; int _ret = -1; - int i = 0; priv = this->private; local = frame->local; @@ -429,11 +415,6 @@ afr_up_down_flush_done (call_frame_t *frame, xlator_t *this) fd_ctx->down_count = priv->down_count; fd_ctx->up_count = priv->up_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) - fd_ctx->pre_op_done[i] = 0; - } } out: UNLOCK (&local->fd->lock); @@ -444,15 +425,14 @@ out: "The up/down flush is over"); fd_unref (local->fd); - local->up_down_flush_cbk (frame, this); + local->openfd_flush_cbk (frame, this); return 0; } int -afr_up_down_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, - afr_flush_type type) +afr_openfd_flush (call_frame_t *frame, xlator_t *this, fd_t *fd) { afr_local_t * local = NULL; @@ -466,18 +446,8 @@ afr_up_down_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, local->fd = fd_ref (fd); - local->transaction.fop = afr_up_down_flush_wind; - local->transaction.done = afr_up_down_flush_done; - - switch (type) { - case AFR_CHILD_UP_FLUSH: - local->transaction.post_post_op = afr_up_down_flush_post_post_op; - break; - - case AFR_CHILD_DOWN_FLUSH: - local->transaction.post_post_op = NULL; - break; - } + local->transaction.fop = afr_openfd_sh; + local->transaction.done = afr_openfd_flush_done; local->transaction.start = 0; local->transaction.len = 0; @@ -486,8 +456,9 @@ afr_up_down_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, "doing up/down flush on fd=%p", fd); - afr_transaction (frame, this, AFR_FLUSH_TRANSACTION); + afr_transaction (frame, this, AFR_DATA_TRANSACTION); out: return 0; } + diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c index b4615326747..e31f0c1df31 100644 --- a/xlators/cluster/afr/src/afr-transaction.c +++ b/xlators/cluster/afr/src/afr-transaction.c @@ -33,6 +33,25 @@ #define LOCKED_LOWER 0x2 /* for lower_path of RENAME */ +afr_fd_ctx_t * +afr_fd_ctx_get (fd_t *fd, xlator_t *this) +{ + uint64_t ctx = 0; + afr_fd_ctx_t *fd_ctx = NULL; + int ret = 0; + + ret = fd_ctx_get (fd, this, &ctx); + + if (ret < 0) + goto out; + + fd_ctx = (afr_fd_ctx_t *)(long) ctx; + +out: + return fd_ctx; +} + + static void afr_pid_save (call_frame_t *frame) { @@ -82,80 +101,54 @@ __mark_child_dead (int32_t *pending[], int child_count, int child, static void -__mark_fop_failed_on_fd (fd_t *fd, xlator_t *this, - int child_index) +__mark_pre_op_done_on_fd (call_frame_t *frame, xlator_t *this, int child_index) { - uint64_t ctx; - afr_fd_ctx_t * fd_ctx = NULL; - - int ret = 0; - - ret = fd_ctx_get (fd, this, &ctx); - - if (ret < 0) - goto out; - - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - - fd_ctx->child_failed[child_index] = 1; -out: - return; -} - + afr_local_t *local = NULL; + afr_fd_ctx_t *fd_ctx = NULL; -static void -__mark_failed_children (int32_t *pending[], int child_count, - xlator_t *this, fd_t *fd, afr_transaction_type type) -{ - uint64_t ctx; - afr_fd_ctx_t * fd_ctx = NULL; + local = frame->local; - int ret = 0; - int i = 0; - int j = 0; + if (!local->fd) + return; - ret = fd_ctx_get (fd, this, &ctx); + fd_ctx = afr_fd_ctx_get (local->fd, this); - if (ret < 0) + if (!fd_ctx) goto out; - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - - for (i = 0; i < child_count; i++) { - j = afr_index_for_transaction_type (type); - - if (fd_ctx->child_failed[i]) - pending[i][j] = 0; + LOCK (&local->fd->lock); + { + if (local->transaction.type == AFR_DATA_TRANSACTION) + fd_ctx->pre_op_done[child_index]++; } - + UNLOCK (&local->fd->lock); out: return; } static void -__mark_pre_op_done_on_fd (call_frame_t *frame, xlator_t *this, int child_index) +__mark_pre_op_undone_on_fd (call_frame_t *frame, xlator_t *this, int child_index) { - afr_local_t *local = NULL; - - uint64_t ctx; - afr_fd_ctx_t * fd_ctx = NULL; - int ret = 0; + afr_local_t *local = NULL; + afr_fd_ctx_t *fd_ctx = NULL; local = frame->local; - ret = fd_ctx_get (local->fd, this, &ctx); + if (!local->fd) + return; - if (ret < 0) - goto out; + fd_ctx = afr_fd_ctx_get (local->fd, this); - fd_ctx = (afr_fd_ctx_t *)(long) ctx; + if (!fd_ctx) + goto out; - if ((local->op == GF_FOP_WRITE) - || (local->op == GF_FOP_FTRUNCATE)) { - fd_ctx->pre_op_done[child_index] = 1; + LOCK (&local->fd->lock); + { + if (local->transaction.type == AFR_DATA_TRANSACTION) + fd_ctx->pre_op_done[child_index]--; } - + UNLOCK (&local->fd->lock); out: return; } @@ -191,116 +184,6 @@ __mark_all_success (int32_t *pending[], int child_count, } -static int -__is_first_write_on_fd (xlator_t *this, fd_t *fd) -{ - int op_ret = 0; - int _ret = -1; - int i = 0; - - uint64_t ctx; - afr_fd_ctx_t * fd_ctx = NULL; - - afr_private_t *priv = NULL; - - priv = this->private; - - LOCK (&fd->lock); - { - _ret = __fd_ctx_get (fd, this, &ctx); - - if (_ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "could not get fd ctx on fd=%p", - fd); - goto out; - } - - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - - op_ret = 1; - for (i = 0; i < priv->child_count; i++) { - if (fd_ctx->pre_op_done[i] == 0) - continue; - - op_ret = 0; - } - } -out: - UNLOCK (&fd->lock); - - return op_ret; -} - - -static int -__if_fd_pre_op_done (xlator_t *this, fd_t *fd, int child_index) -{ - int op_ret = 0; - int _ret = -1; - - uint64_t ctx; - afr_fd_ctx_t * fd_ctx = NULL; - - LOCK (&fd->lock); - { - _ret = __fd_ctx_get (fd, this, &ctx); - - if (_ret < 0) { - goto out; - } - - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - - if (fd_ctx->pre_op_done[child_index]) { - op_ret = 1; - } - fd_ctx->pre_op_done[child_index] = 0; - } -out: - UNLOCK (&fd->lock); - - return op_ret; -} - - -static int -afr_pre_op_done_count (xlator_t *this, fd_t *fd, unsigned char *child_up) -{ - int i = 0; - int count = 0; - - int _ret = 0; - uint64_t ctx; - afr_fd_ctx_t * fd_ctx = NULL; - - afr_private_t *priv = NULL; - - priv = this->private; - - LOCK (&fd->lock); - { - _ret = __fd_ctx_get (fd, this, &ctx); - - if (_ret < 0) { - goto out; - } - - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - - for (i = 0; i < priv->child_count; i++) { - if (fd_ctx->pre_op_done[i] && child_up[i]) { - count++; - } - } - } -out: - UNLOCK (&fd->lock); - - return count; -} - - static int __changelog_enabled (afr_private_t *priv, afr_transaction_type type) { @@ -325,9 +208,6 @@ __changelog_enabled (afr_private_t *priv, afr_transaction_type type) ret = 1; break; - - case AFR_FLUSH_TRANSACTION: - ret = 1; } return ret; @@ -339,7 +219,6 @@ __changelog_needed_pre_op (call_frame_t *frame, xlator_t *this) { afr_private_t * priv = NULL; afr_local_t * local = NULL; - fd_t * fd = NULL; int op_ret = 0; @@ -351,20 +230,10 @@ __changelog_needed_pre_op (call_frame_t *frame, xlator_t *this) case GF_FOP_WRITE: case GF_FOP_FTRUNCATE: - /* - if it's a data transaction, we write the changelog - only on the first write on an fd - */ - - fd = local->fd; - if (!fd || __is_first_write_on_fd (this, fd)) - op_ret = 1; - + op_ret = 1; break; case GF_FOP_FLUSH: - /* only do post-op on flush() */ - op_ret = 0; break; @@ -395,11 +264,11 @@ __changelog_needed_post_op (call_frame_t *frame, xlator_t *this) case GF_FOP_WRITE: case GF_FOP_FTRUNCATE: - op_ret = 0; + op_ret = 1; break; case GF_FOP_FLUSH: - op_ret = 1; + op_ret = 0; break; default: @@ -431,13 +300,48 @@ out: } +static int +afr_set_piggyback_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending, + afr_transaction_type type) +{ + int i; + int ret = 0; + int *arr = NULL; + int index = 0; + + index = afr_index_for_transaction_type (type); + + for (i = 0; i < priv->child_count; i++) { + arr = GF_CALLOC (3 * sizeof (int32_t), priv->child_count, + gf_afr_mt_char); + if (!arr) { + ret = -1; + goto out; + } + + memcpy (arr, pending[i], 3 * sizeof (int32_t)); + + arr[index]++; + + ret = dict_set_bin (xattr, priv->pending_key[i], + arr, 3 * sizeof (int32_t)); + /* 3 = data+metadata+entry */ + + if (ret < 0) + goto out; + } + +out: + return ret; +} + + int afr_lock_server_count (afr_private_t *priv, afr_transaction_type type) { int ret = 0; switch (type) { - case AFR_FLUSH_TRANSACTION: case AFR_DATA_TRANSACTION: ret = priv->child_count; break; @@ -464,15 +368,23 @@ afr_changelog_post_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this, afr_internal_lock_t *int_lock = NULL; afr_private_t *priv = NULL; afr_local_t *local = NULL; + int child_index = 0; int call_count = -1; - int (*post_post_op) (call_frame_t *, xlator_t *); - priv = this->private; local = frame->local; int_lock = &local->internal_lock; + child_index = (long) cookie; + + if (op_ret == 1) { + } + + if (op_ret == 0) { + __mark_pre_op_undone_on_fd (frame, this, child_index); + } + LOCK (&frame->lock); { call_count = --local->call_count; @@ -480,24 +392,11 @@ afr_changelog_post_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this, UNLOCK (&frame->lock); if (call_count == 0) { - if (local->transaction.post_post_op) { - post_post_op = local->transaction.post_post_op; - - if (afr_lock_server_count (priv, local->transaction.type) == 0) { - local->transaction.post_post_op = local->transaction.done; - } else { - int_lock->lock_cbk = local->transaction.done; - local->transaction.post_post_op = afr_unlock; - } - - post_post_op (frame, this); + if (afr_lock_server_count (priv, local->transaction.type) == 0) { + local->transaction.done (frame, this); } else { - if (afr_lock_server_count (priv, local->transaction.type) == 0) { - local->transaction.done (frame, this); - } else { - int_lock->lock_cbk = local->transaction.done; - afr_unlock (frame, this); - } + int_lock->lock_cbk = local->transaction.done; + afr_unlock (frame, this); } } @@ -515,7 +414,11 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this) int call_count = 0; afr_local_t * local = NULL; + afr_fd_ctx_t *fdctx = NULL; dict_t **xattr = NULL; + int piggyback = 0; + int index = 0; + int nothing_failed = 1; local = frame->local; int_lock = &local->internal_lock; @@ -523,12 +426,6 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this) __mark_down_children (local->pending, priv->child_count, local->child_up, local->transaction.type); - if (local->op == GF_FOP_FLUSH) { - __mark_failed_children (local->pending, priv->child_count, - this, local->fd, - local->transaction.type); - } - xattr = alloca (priv->child_count * sizeof (*xattr)); memset (xattr, 0, (priv->child_count * sizeof (*xattr))); for (i = 0; i < priv->child_count; i++) { @@ -536,18 +433,17 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this) dict_ref (xattr[i]); } - if (local->op == GF_FOP_FLUSH) { - call_count = afr_pre_op_done_count (this, local->fd, local->child_up); - } else { - call_count = afr_up_children_count (priv->child_count, local->child_up); + call_count = afr_up_children_count (priv->child_count, local->child_up); - if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) { - call_count *= 2; - } + if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) { + call_count *= 2; } local->call_count = call_count; + if (local->fd) + fdctx = afr_fd_ctx_get (local->fd, this); + if (call_count == 0) { /* no child is up */ for (i = 0; i < priv->child_count; i++) { @@ -559,100 +455,134 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this) return 0; } + /* check if something has failed, to handle piggybacking */ + nothing_failed = 1; + index = afr_index_for_transaction_type (local->transaction.type); + for (i = 0; i < priv->child_count; i++) { + if (local->pending[i][index] == 0) { + nothing_failed = 0; + break; + } + } + for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - ret = afr_set_pending_dict (priv, xattr[i], - local->pending); - - if (ret < 0) - gf_log (this->name, GF_LOG_DEBUG, - "failed to set pending entry"); - - - switch (local->transaction.type) { - case AFR_DATA_TRANSACTION: - case AFR_METADATA_TRANSACTION: - { - if (local->fd) - STACK_WIND (frame, afr_changelog_post_op_cbk, - priv->children[i], - priv->children[i]->fops->fxattrop, - local->fd, - GF_XATTROP_ADD_ARRAY, xattr[i]); - else - STACK_WIND (frame, afr_changelog_post_op_cbk, - priv->children[i], - priv->children[i]->fops->xattrop, - &local->loc, - GF_XATTROP_ADD_ARRAY, xattr[i]); - call_count--; - } - break; + if (!local->child_up[i]) + continue; - case AFR_FLUSH_TRANSACTION: - { - if (__if_fd_pre_op_done (this, local->fd, i)) { - STACK_WIND (frame, afr_changelog_post_op_cbk, - priv->children[i], - priv->children[i]->fops->fxattrop, - local->fd, - GF_XATTROP_ADD_ARRAY, xattr[i]); - call_count--; + ret = afr_set_pending_dict (priv, xattr[i], + local->pending); + + if (ret < 0) + gf_log (this->name, GF_LOG_DEBUG, + "failed to set pending entry"); + + + switch (local->transaction.type) { + case AFR_DATA_TRANSACTION: + { + if (!fdctx) { + STACK_WIND (frame, afr_changelog_post_op_cbk, + priv->children[i], + priv->children[i]->fops->xattrop, + &local->loc, + GF_XATTROP_ADD_ARRAY, xattr[i]); + break; + } + + LOCK (&local->fd->lock); + { + piggyback = 0; + if (fdctx->pre_op_piggyback[i]) { + fdctx->pre_op_piggyback[i]--; + piggyback = 1; } - } - break; + } + UNLOCK (&local->fd->lock); - case AFR_ENTRY_RENAME_TRANSACTION: - { - STACK_WIND_COOKIE (frame, afr_changelog_post_op_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->xattrop, - &local->transaction.new_parent_loc, - GF_XATTROP_ADD_ARRAY, xattr[i]); + if (piggyback && !nothing_failed) + ret = afr_set_piggyback_dict (priv, xattr[i], + local->pending, + local->transaction.type); - call_count--; - } + if (nothing_failed && piggyback) { + afr_changelog_post_op_cbk (frame, (void *)(long)i, + this, 1, 0, xattr[i]); + } else { + STACK_WIND_COOKIE (frame, + afr_changelog_post_op_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->fxattrop, + local->fd, + GF_XATTROP_ADD_ARRAY, xattr[i]); + } + } + break; + case AFR_METADATA_TRANSACTION: + { + if (local->fd) + STACK_WIND (frame, afr_changelog_post_op_cbk, + priv->children[i], + priv->children[i]->fops->fxattrop, + local->fd, + GF_XATTROP_ADD_ARRAY, xattr[i]); + else + STACK_WIND (frame, afr_changelog_post_op_cbk, + priv->children[i], + priv->children[i]->fops->xattrop, + &local->loc, + GF_XATTROP_ADD_ARRAY, xattr[i]); + } + break; - /* - set it again because previous stack_wind - might have already returned (think of case - where subvolume is posix) and would have - used the dict as placeholder for return - value - */ - - ret = afr_set_pending_dict (priv, xattr[i], - local->pending); - - if (ret < 0) - gf_log (this->name, GF_LOG_DEBUG, - "failed to set pending entry"); - - /* fall through */ - - case AFR_ENTRY_TRANSACTION: - { - if (local->fd) - STACK_WIND (frame, afr_changelog_post_op_cbk, - priv->children[i], - priv->children[i]->fops->fxattrop, - local->fd, - GF_XATTROP_ADD_ARRAY, xattr[i]); - else - STACK_WIND (frame, afr_changelog_post_op_cbk, - priv->children[i], - priv->children[i]->fops->xattrop, - &local->transaction.parent_loc, - GF_XATTROP_ADD_ARRAY, xattr[i]); - call_count--; - } - break; - } + case AFR_ENTRY_RENAME_TRANSACTION: + { + STACK_WIND_COOKIE (frame, afr_changelog_post_op_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->xattrop, + &local->transaction.new_parent_loc, + GF_XATTROP_ADD_ARRAY, xattr[i]); + call_count--; + } - if (!call_count) - break; - } + /* + set it again because previous stack_wind + might have already returned (think of case + where subvolume is posix) and would have + used the dict as placeholder for return + value + */ + + ret = afr_set_pending_dict (priv, xattr[i], + local->pending); + + if (ret < 0) + gf_log (this->name, GF_LOG_DEBUG, + "failed to set pending entry"); + + /* fall through */ + + case AFR_ENTRY_TRANSACTION: + { + if (local->fd) + STACK_WIND (frame, afr_changelog_post_op_cbk, + priv->children[i], + priv->children[i]->fops->fxattrop, + local->fd, + GF_XATTROP_ADD_ARRAY, xattr[i]); + else + STACK_WIND (frame, afr_changelog_post_op_cbk, + priv->children[i], + priv->children[i]->fops->xattrop, + &local->transaction.parent_loc, + GF_XATTROP_ADD_ARRAY, xattr[i]); + } + break; + } + + if (!--call_count) + break; } for (i = 0; i < priv->child_count; i++) { @@ -679,6 +609,10 @@ afr_changelog_pre_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this, LOCK (&frame->lock); { + if (op_ret == 1) { + /* special op_ret for piggyback */ + } + if (op_ret == 0) { __mark_pre_op_done_on_fd (frame, this, child_index); } @@ -731,8 +665,10 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this) int ret = 0; int call_count = 0; dict_t **xattr = NULL; + afr_fd_ctx_t *fdctx = NULL; afr_local_t *local = NULL; + int piggyback = 0; local = frame->local; @@ -768,97 +704,136 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this) __mark_all_pending (local->pending, priv->child_count, local->transaction.type); + if (local->fd) + fdctx = afr_fd_ctx_get (local->fd, this); + for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - ret = afr_set_pending_dict (priv, xattr[i], - local->pending); - - if (ret < 0) - gf_log (this->name, GF_LOG_DEBUG, - "failed to set pending entry"); - - - switch (local->transaction.type) { - case AFR_DATA_TRANSACTION: - case AFR_METADATA_TRANSACTION: - case AFR_FLUSH_TRANSACTION: - { - if (local->fd) - STACK_WIND_COOKIE (frame, - afr_changelog_pre_op_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fxattrop, - local->fd, - GF_XATTROP_ADD_ARRAY, xattr[i]); - else - STACK_WIND_COOKIE (frame, - afr_changelog_pre_op_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->xattrop, - &(local->loc), - GF_XATTROP_ADD_ARRAY, xattr[i]); - } - break; + if (!local->child_up[i]) + continue; + ret = afr_set_pending_dict (priv, xattr[i], + local->pending); - case AFR_ENTRY_RENAME_TRANSACTION: - { - STACK_WIND_COOKIE (frame, - afr_changelog_pre_op_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->xattrop, - &local->transaction.new_parent_loc, - GF_XATTROP_ADD_ARRAY, xattr[i]); - - call_count--; - } + if (ret < 0) + gf_log (this->name, GF_LOG_DEBUG, + "failed to set pending entry"); + + + switch (local->transaction.type) { + case AFR_DATA_TRANSACTION: + { + if (!fdctx) { + STACK_WIND_COOKIE (frame, + afr_changelog_pre_op_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->xattrop, + &(local->loc), + GF_XATTROP_ADD_ARRAY, xattr[i]); + break; + } + + LOCK (&local->fd->lock); + { + piggyback = 0; + if (fdctx->pre_op_done[i]) { + fdctx->pre_op_piggyback[i]++; + piggyback = 1; + fdctx->hit++; + } else { + fdctx->miss++; + } + } + UNLOCK (&local->fd->lock); + + if (piggyback) + afr_changelog_pre_op_cbk (frame, (void *)(long)i, + this, 1, 0, xattr[i]); + else + STACK_WIND_COOKIE (frame, + afr_changelog_pre_op_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->fxattrop, + local->fd, + GF_XATTROP_ADD_ARRAY, xattr[i]); + } + break; + case AFR_METADATA_TRANSACTION: + { + if (local->fd) + STACK_WIND_COOKIE (frame, + afr_changelog_pre_op_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->fxattrop, + local->fd, + GF_XATTROP_ADD_ARRAY, xattr[i]); + else + STACK_WIND_COOKIE (frame, + afr_changelog_pre_op_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->xattrop, + &(local->loc), + GF_XATTROP_ADD_ARRAY, xattr[i]); + } + break; + case AFR_ENTRY_RENAME_TRANSACTION: + { + STACK_WIND_COOKIE (frame, + afr_changelog_pre_op_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->xattrop, + &local->transaction.new_parent_loc, + GF_XATTROP_ADD_ARRAY, xattr[i]); + + call_count--; + } - /* - set it again because previous stack_wind - might have already returned (think of case - where subvolume is posix) and would have - used the dict as placeholder for return - value - */ - - ret = afr_set_pending_dict (priv, xattr[i], - local->pending); - - if (ret < 0) - gf_log (this->name, GF_LOG_DEBUG, - "failed to set pending entry"); - - /* fall through */ - - case AFR_ENTRY_TRANSACTION: - { - if (local->fd) - STACK_WIND_COOKIE (frame, - afr_changelog_pre_op_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fxattrop, - local->fd, - GF_XATTROP_ADD_ARRAY, xattr[i]); - else - STACK_WIND_COOKIE (frame, - afr_changelog_pre_op_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->xattrop, - &local->transaction.parent_loc, - GF_XATTROP_ADD_ARRAY, xattr[i]); - } - break; - } + /* + set it again because previous stack_wind + might have already returned (think of case + where subvolume is posix) and would have + used the dict as placeholder for return + value + */ - if (!--call_count) - break; - } + ret = afr_set_pending_dict (priv, xattr[i], + local->pending); + + if (ret < 0) + gf_log (this->name, GF_LOG_DEBUG, + "failed to set pending entry"); + + /* fall through */ + + case AFR_ENTRY_TRANSACTION: + { + if (local->fd) + STACK_WIND_COOKIE (frame, + afr_changelog_pre_op_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->fxattrop, + local->fd, + GF_XATTROP_ADD_ARRAY, xattr[i]); + else + STACK_WIND_COOKIE (frame, + afr_changelog_pre_op_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->xattrop, + &local->transaction.parent_loc, + GF_XATTROP_ADD_ARRAY, xattr[i]); + } + break; + } + + if (!--call_count) + break; } for (i = 0; i < priv->child_count; i++) { @@ -1038,7 +1013,6 @@ afr_lock_rec (call_frame_t *frame, xlator_t *this) switch (local->transaction.type) { case AFR_DATA_TRANSACTION: case AFR_METADATA_TRANSACTION: - case AFR_FLUSH_TRANSACTION: afr_set_transaction_flock (local); int_lock->lock_cbk = afr_post_nonblocking_inodelk_cbk; @@ -1098,7 +1072,7 @@ afr_internal_lock_finish (call_frame_t *frame, xlator_t *this) afr_changelog_pre_op (frame, this); } else { __mark_all_success (local->pending, priv->child_count, - local->transaction.type); + local->transaction.type); afr_pid_restore (frame); @@ -1148,15 +1122,8 @@ afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this, int child_index local = frame->local; priv = this->private; - switch (local->op) { - case GF_FOP_WRITE: - __mark_fop_failed_on_fd (local->fd, this, child_index); - break; - default: - __mark_child_dead (local->pending, priv->child_count, - child_index, local->transaction.type); - break; - } + __mark_child_dead (local->pending, priv->child_count, + child_index, local->transaction.type); } @@ -1175,16 +1142,7 @@ afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type) local->transaction.type = type; if (afr_lock_server_count (priv, local->transaction.type) == 0) { - if (__changelog_needed_pre_op (frame, this)) { - afr_changelog_pre_op (frame, this); - } else { - __mark_all_success (local->pending, priv->child_count, - local->transaction.type); - - afr_pid_restore (frame); - - local->transaction.fop (frame, this); - } + afr_internal_lock_finish (frame, this); } else { afr_lock (frame, this); } diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index db762c11ec5..749264a8d65 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -174,7 +174,6 @@ typedef enum { AFR_METADATA_TRANSACTION, /* chmod, chown, ... */ AFR_ENTRY_TRANSACTION, /* create, rmdir, ... */ AFR_ENTRY_RENAME_TRANSACTION, /* rename */ - AFR_FLUSH_TRANSACTION, /* flush */ } afr_transaction_type; typedef enum { @@ -217,7 +216,6 @@ afr_index_for_transaction_type (afr_transaction_type type) switch (type) { case AFR_DATA_TRANSACTION: - case AFR_FLUSH_TRANSACTION: return 0; case AFR_METADATA_TRANSACTION: @@ -232,11 +230,6 @@ afr_index_for_transaction_type (afr_transaction_type type) } -typedef enum { - AFR_CHILD_UP_FLUSH, - AFR_CHILD_DOWN_FLUSH, -} afr_flush_type; - typedef struct { loc_t *lk_loc; struct flock lk_flock; @@ -309,7 +302,7 @@ typedef struct _afr_local { dict_t *dict; - int (*up_down_flush_cbk) (call_frame_t *, xlator_t *); + int (*openfd_flush_cbk) (call_frame_t *frame, xlator_t *this); /* This struct contains the arguments for the "continuation" @@ -606,7 +599,6 @@ typedef struct _afr_local { int (*unwind) (call_frame_t *frame, xlator_t *this); /* post-op hook */ - int (*post_post_op) (call_frame_t *frame, xlator_t *this); } transaction; afr_self_heal_t self_heal; @@ -614,15 +606,17 @@ typedef struct _afr_local { typedef struct { - unsigned char *pre_op_done; - unsigned char *opened_on; /* which subvolumes the fd is open on */ - unsigned char *child_failed; + unsigned int *pre_op_done; + unsigned int *opened_on; /* which subvolumes the fd is open on */ + unsigned int *pre_op_piggyback; int flags; int32_t wbflags; uint64_t up_count; /* number of CHILD_UPs this fd has seen */ uint64_t down_count; /* number of CHILD_DOWNs this fd has seen */ int32_t last_tried; + + int hit, miss; gf_boolean_t failed_over; struct list_head entries; /* needed for readdir failover */ } afr_fd_ctx_t; @@ -729,9 +723,6 @@ int afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, fd_t *fd, int32_t wbflags); -int -afr_up_down_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, afr_flush_type type); - void afr_set_opendir_done (xlator_t *this, inode_t *inode); @@ -744,6 +735,9 @@ afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this); int afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd); +int +afr_openfd_flush (call_frame_t *frame, xlator_t *this, fd_t *fd); + #define AFR_STACK_UNWIND(fop, frame, params ...) \ do { \ afr_local_t *__local = NULL; \ -- cgit