From 74612a456ad1602f8038fae79fee654eb427602a Mon Sep 17 00:00:00 2001 From: Vikas Gorur Date: Tue, 24 Nov 2009 08:45:09 +0000 Subject: cluster/afr: Do self-heal on reopened fds. This patch brings in partial support for self-heal of open fds. The precondition is that the fd should have been opened successfully during the initial open() (or create()), and we assume that protocol/client has successfully reopened the fd when the subvolume comes back up. It works by doing an "up/down flush" (a dummy flush transaction to do post-op wherever necessary) and then triggering data self-heal on the file in the post-post-op hook of the dummy flush transaction. This ensures that any writes that come in during self-heal will wait until self-heal completes. The up/down flush is also done when a subvolume goes down, so that post-op is done on all subvolumes where pre-op was done. Signed-off-by: Vikas Gorur Signed-off-by: Anand V. Avati BUG: 170 (Auto-heal fails on files that are open()-ed/mmap()-ed) URL: http://bugs.gluster.com/cgi-bin/bugzilla3/show_bug.cgi?id=170 --- xlators/cluster/afr/src/Makefile.am | 2 +- xlators/cluster/afr/src/afr-dir-read.c | 3 +- xlators/cluster/afr/src/afr-inode-write.c | 179 ++++++++++--- xlators/cluster/afr/src/afr-open.c | 356 +++++++++++++++++++++++++ xlators/cluster/afr/src/afr-self-heal-common.c | 49 ++-- xlators/cluster/afr/src/afr-self-heal-data.c | 2 +- xlators/cluster/afr/src/afr-self-heal.h | 3 +- xlators/cluster/afr/src/afr-transaction.c | 130 +++++++-- xlators/cluster/afr/src/afr.c | 174 ++---------- xlators/cluster/afr/src/afr.h | 30 ++- 10 files changed, 699 insertions(+), 229 deletions(-) create mode 100644 xlators/cluster/afr/src/afr-open.c (limited to 'xlators') diff --git a/xlators/cluster/afr/src/Makefile.am b/xlators/cluster/afr/src/Makefile.am index df284d12c..1a8ddadb7 100644 --- a/xlators/cluster/afr/src/Makefile.am +++ b/xlators/cluster/afr/src/Makefile.am @@ -3,7 +3,7 @@ xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster afr_la_LDFLAGS = -module -avoidversion -afr_la_SOURCES = afr.c afr-dir-read.c afr-dir-write.c afr-inode-read.c afr-inode-write.c afr-transaction.c afr-self-heal-data.c afr-self-heal-common.c afr-self-heal-metadata.c afr-self-heal-entry.c afr-self-heal-algorithm.c +afr_la_SOURCES = afr.c afr-dir-read.c afr-dir-write.c afr-inode-read.c afr-inode-write.c afr-open.c afr-transaction.c afr-self-heal-data.c afr-self-heal-common.c afr-self-heal-metadata.c afr-self-heal-entry.c afr-self-heal-algorithm.c afr_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la noinst_HEADERS = afr.h afr-transaction.h afr-inode-write.h afr-inode-read.h afr-dir-read.h afr-dir-write.h afr-self-heal.h afr-self-heal-common.h afr-self-heal-algorithm.h diff --git a/xlators/cluster/afr/src/afr-dir-read.c b/xlators/cluster/afr/src/afr-dir-read.c index ed052589c..ee80963b7 100644 --- a/xlators/cluster/afr/src/afr-dir-read.c +++ b/xlators/cluster/afr/src/afr-dir-read.c @@ -165,7 +165,8 @@ out: local->loc.path); afr_self_heal (frame, this, - afr_examine_dir_completion_cbk); + afr_examine_dir_completion_cbk, + _gf_true); } else { afr_set_opendir_done (this, local->fd->inode); diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c index 7dcc06708..5f35aa26f 100644 --- a/xlators/cluster/afr/src/afr-inode-write.c +++ b/xlators/cluster/afr/src/afr-inode-write.c @@ -204,6 +204,58 @@ afr_writev_done (call_frame_t *frame, xlator_t *this) } +int +afr_do_writev (call_frame_t *frame, xlator_t *this) +{ + call_frame_t * transaction_frame = NULL; + afr_local_t * local = NULL; + + int op_ret = -1; + int op_errno = 0; + + local = frame->local; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + "Out of memory."); + op_errno = ENOMEM; + goto out; + } + + transaction_frame->local = local; + frame->local = NULL; + + local->op = GF_FOP_WRITE; + + local->transaction.fop = afr_writev_wind; + local->transaction.done = afr_writev_done; + local->transaction.unwind = afr_writev_unwind; + + local->transaction.main_frame = frame; + if (local->fd->flags & O_APPEND) { + local->transaction.start = 0; + local->transaction.len = 0; + } else { + local->transaction.start = local->cont.writev.offset; + local->transaction.len = iov_length (local->cont.writev.vector, + local->cont.writev.count); + } + + afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (writev, frame, op_ret, op_errno, NULL, NULL); + } + + return 0; +} + + int afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, int32_t count, off_t offset, @@ -211,26 +263,21 @@ afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, { afr_private_t * priv = NULL; afr_local_t * local = NULL; - call_frame_t *transaction_frame = NULL; int ret = -1; int op_ret = -1; int op_errno = 0; + uint64_t ctx; + afr_fd_ctx_t *fd_ctx = NULL; + VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); VALIDATE_OR_GOTO (this->private, out); priv = this->private; - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - goto out; - } - ALLOC_OR_GOTO (local, afr_local_t, out); ret = AFR_LOCAL_INIT (local, priv); @@ -239,37 +286,38 @@ afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, goto out; } - transaction_frame->local = local; + frame->local = local; - local->op = GF_FOP_WRITE; local->cont.writev.vector = iov_dup (vector, count); local->cont.writev.count = count; local->cont.writev.offset = offset; local->cont.writev.ino = fd->inode->ino; local->cont.writev.iobref = iobref_ref (iobref); - local->transaction.fop = afr_writev_wind; - local->transaction.done = afr_writev_done; - local->transaction.unwind = afr_writev_unwind; - local->fd = fd_ref (fd); - local->transaction.main_frame = frame; - if (fd->flags & O_APPEND) { - local->transaction.start = 0; - local->transaction.len = 0; - } else { - local->transaction.start = offset; - local->transaction.len = iov_length (vector, count); - } + ret = fd_ctx_get (fd, this, &ctx); + if (ret < 0) { + goto out; + } - afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + fd_ctx = (afr_fd_ctx_t *)(long) ctx; + + if (fd_ctx->down_count < priv->down_count) { + local->up_down_flush_cbk = afr_do_writev; + afr_up_down_flush (frame, this, fd, AFR_CHILD_DOWN_FLUSH); + + } else if (fd_ctx->up_count < priv->up_count) { + local->up_down_flush_cbk = afr_do_writev; + afr_up_down_flush (frame, this, fd, AFR_CHILD_UP_FLUSH); + + } else { + afr_do_writev (frame, this); + } op_ret = 0; out: if (op_ret == -1) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); AFR_STACK_UNWIND (writev, frame, op_ret, op_errno, NULL, NULL); } @@ -647,6 +695,52 @@ afr_ftruncate_done (call_frame_t *frame, xlator_t *this) } +int +afr_do_ftruncate (call_frame_t *frame, xlator_t *this) +{ + call_frame_t * transaction_frame = NULL; + afr_local_t * local = NULL; + + int op_ret = -1; + int op_errno = 0; + + local = frame->local; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + "Out of memory."); + goto out; + } + + transaction_frame->local = local; + frame->local = NULL; + + local->op = GF_FOP_FTRUNCATE; + + local->transaction.fop = afr_ftruncate_wind; + local->transaction.done = afr_ftruncate_done; + local->transaction.unwind = afr_ftruncate_unwind; + + local->transaction.main_frame = frame; + + local->transaction.start = 0; + local->transaction.len = local->cont.ftruncate.offset; + + afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (ftruncate, frame, op_ret, op_errno, NULL, NULL); + } + + return 0; +} + + int afr_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset) @@ -660,19 +754,15 @@ afr_ftruncate (call_frame_t *frame, xlator_t *this, int op_ret = -1; int op_errno = 0; + uint64_t ctx; + afr_fd_ctx_t *fd_ctx = NULL; + VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); VALIDATE_OR_GOTO (this->private, out); priv = this->private; - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - goto out; - } - ALLOC_OR_GOTO (local, afr_local_t, out); ret = AFR_LOCAL_INIT (local, priv); @@ -681,25 +771,26 @@ afr_ftruncate (call_frame_t *frame, xlator_t *this, goto out; } - transaction_frame->local = local; - - local->op = GF_FOP_FTRUNCATE; - local->op_ret = -1; + frame->local = local; local->cont.ftruncate.offset = offset; local->cont.ftruncate.ino = fd->inode->ino; - local->transaction.fop = afr_ftruncate_wind; - local->transaction.done = afr_ftruncate_done; - local->transaction.unwind = afr_ftruncate_unwind; - local->fd = fd_ref (fd); - local->transaction.main_frame = frame; - local->transaction.start = 0; - local->transaction.len = offset; + ret = fd_ctx_get (fd, this, &ctx); + if (ret < 0) { + goto out; + } - afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + fd_ctx = (afr_fd_ctx_t *)(long) ctx; + + if (fd_ctx->down_count < priv->down_count) { + local->up_down_flush_cbk = afr_do_ftruncate; + afr_up_down_flush (frame, this, fd, AFR_CHILD_DOWN_FLUSH); + } else { + afr_do_ftruncate (frame, this); + } op_ret = 0; out: diff --git a/xlators/cluster/afr/src/afr-open.c b/xlators/cluster/afr/src/afr-open.c new file mode 100644 index 000000000..945f5cddf --- /dev/null +++ b/xlators/cluster/afr/src/afr-open.c @@ -0,0 +1,356 @@ +/* + Copyright (c) 2007-2009 Gluster, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . +*/ + +#include +#include +#include +#include +#include +#include + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "afr.h" +#include "dict.h" +#include "xlator.h" +#include "hashfn.h" +#include "logging.h" +#include "stack.h" +#include "list.h" +#include "call-stub.h" +#include "defaults.h" +#include "common-utils.h" +#include "compat-errno.h" +#include "compat.h" +#include "byte-order.h" +#include "statedump.h" + +#include "fd.h" + +#include "afr-inode-read.h" +#include "afr-inode-write.h" +#include "afr-dir-read.h" +#include "afr-dir-write.h" +#include "afr-transaction.h" + +#include "afr-self-heal.h" + + +int +afr_open_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *prebuf, + struct stat *postbuf) +{ + afr_local_t * local = frame->local; + int ret = 0; + + ret = afr_fd_ctx_set (this, local->fd); + + if (ret < 0) { + local->op_ret = -1; + local->op_errno = -ret; + } + + AFR_STACK_UNWIND (open, frame, local->op_ret, local->op_errno, + local->fd); + return 0; +} + + +int +afr_open_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + fd_t *fd) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int ret = 0; + + int call_count = -1; + + priv = this->private; + local = frame->local; + + LOCK (&frame->lock); + { + if (op_ret == -1) { + local->op_errno = op_errno; + } + + if (op_ret >= 0) { + local->op_ret = op_ret; + local->success_count++; + } + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + if ((local->cont.open.flags & O_TRUNC) + && (local->op_ret >= 0)) { + STACK_WIND (frame, afr_open_ftruncate_cbk, + this, this->fops->ftruncate, + fd, 0); + } else { + ret = afr_fd_ctx_set (this, fd); + + if (ret < 0) { + gf_log (this->name, GF_LOG_DEBUG, + "could not set fd ctx for fd=%p", + fd); + + local->op_ret = -1; + local->op_errno = -ret; + } + + AFR_STACK_UNWIND (open, frame, local->op_ret, + local->op_errno, local->fd); + } + } + + return 0; +} + + +int +afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + fd_t *fd, int32_t wbflags) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + + int i = 0; + int ret = -1; + + int32_t call_count = 0; + int32_t op_ret = -1; + int32_t op_errno = 0; + int32_t wind_flags = flags & (~O_TRUNC); + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + VALIDATE_OR_GOTO (loc, out); + + priv = this->private; + + if (afr_is_split_brain (this, loc->inode)) { + /* self-heal failed */ + op_errno = EIO; + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + frame->local = local; + call_count = local->call_count; + + local->cont.open.flags = flags; + local->fd = fd_ref (fd); + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_open_cbk, (void *) (long) i, + priv->children[i], + priv->children[i]->fops->open, + loc, wind_flags, fd, wbflags); + + if (!--call_count) + break; + } + } + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (open, frame, op_ret, op_errno, fd); + } + + return 0; +} + + +int +afr_up_down_flush_sh_completion_cbk (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + + local = frame->local; + + local->transaction.post_post_op (frame, this); + + return 0; +} + + +int +afr_up_down_flush_post_post_op (call_frame_t *frame, xlator_t *this) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + sh->calling_fop = GF_FOP_FLUSH; + +// sh->healing_fd = local->fd; + +// sh->healing_fd_opened = _gf_true; + + local->cont.lookup.inode = local->fd->inode; + + inode_path (local->fd->inode, NULL, (char **)&local->loc.path); + local->loc.name = strrchr (local->loc.path, '/'); + local->loc.inode = inode_ref (local->fd->inode); + local->loc.parent = inode_parent (local->fd->inode, 0, NULL); + + sh->data_lock_held = _gf_true; + + local->need_data_self_heal = _gf_true; + local->cont.lookup.buf.st_mode = local->fd->inode->st_mode; + local->child_count = afr_up_children_count (priv->child_count, + local->child_up); + + sh->flush_self_heal_cbk = afr_up_down_flush_sh_completion_cbk; + + afr_self_heal (frame, this, afr_up_down_flush_sh_completion_cbk, + _gf_false); + + return 0; +} + + +int +afr_up_down_flush_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; + + local->transaction.resume (frame, this); + return 0; +} + + +int +afr_up_down_flush_done (call_frame_t *frame, xlator_t *this) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + + uint64_t ctx; + afr_fd_ctx_t * fd_ctx = NULL; + + int _ret = -1; + int i = 0; + + priv = this->private; + local = frame->local; + + LOCK (&local->fd->lock); + { + _ret = __fd_ctx_get (local->fd, this, &ctx); + + if (_ret < 0) { + goto out; + } + + fd_ctx = (afr_fd_ctx_t *)(long) ctx; + + fd_ctx->down_count = priv->down_count; + fd_ctx->up_count = priv->up_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) + fd_ctx->pre_op_done[i] = 0; + } + } +out: + UNLOCK (&local->fd->lock); + + local->up_down_flush_cbk (frame, this); + + return 0; +} + + +int +afr_up_down_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, + afr_flush_type type) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + + int op_ret = -1; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + local = frame->local; + + local->op = GF_FOP_FLUSH; + + local->fd = fd_ref (local->fd); + + local->transaction.fop = afr_up_down_flush_wind; + local->transaction.done = afr_up_down_flush_done; + + switch (type) { + case AFR_CHILD_UP_FLUSH: + local->transaction.post_post_op = afr_up_down_flush_post_post_op; + break; + + case AFR_CHILD_DOWN_FLUSH: + local->transaction.post_post_op = NULL; + break; + } + + local->transaction.start = 0; + local->transaction.len = 0; + + gf_log (this->name, GF_LOG_TRACE, + "doing up/down flush on fd=%p", + fd); + + afr_transaction (frame, this, AFR_FLUSH_TRANSACTION); + + op_ret = 0; +out: + return 0; +} diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c index a60331d51..0b4f22b01 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.c +++ b/xlators/cluster/afr/src/afr-self-heal-common.c @@ -1379,7 +1379,8 @@ afr_local_t *afr_local_copy (afr_local_t *l, xlator_t *this) memcpy (lc, l, sizeof (afr_local_t)); - loc_copy (&lc->loc, &l->loc); + if (l->loc.path) + loc_copy (&lc->loc, &l->loc); lc->child_up = memdup (l->child_up, priv->child_count); if (l->xattr_req) @@ -1412,12 +1413,16 @@ afr_bgsh_completion_cbk (call_frame_t *bgsh_frame, xlator_t *this) "background self-heal completed"); if (!sh->unwound) { - AFR_STACK_UNWIND (lookup, sh->orig_frame, - local->op_ret, local->op_errno, - local->cont.lookup.inode, - &local->cont.lookup.buf, - local->cont.lookup.xattr, - &local->cont.lookup.postparent); + if (sh->calling_fop == GF_FOP_LOOKUP) { + AFR_STACK_UNWIND (lookup, sh->orig_frame, + local->op_ret, local->op_errno, + local->cont.lookup.inode, + &local->cont.lookup.buf, + local->cont.lookup.xattr, + &local->cont.lookup.postparent); + } else { + sh->flush_self_heal_cbk (sh->orig_frame, this); + } } LOCK (&priv->lock); @@ -1450,12 +1455,16 @@ afr_bgsh_unwind (call_frame_t *bgsh_frame, xlator_t *this) sh->unwound = _gf_true; - AFR_STACK_UNWIND (lookup, sh->orig_frame, - local->op_ret, local->op_errno, - local->cont.lookup.inode, - &local->cont.lookup.buf, - local->cont.lookup.xattr, - &local->cont.lookup.postparent); + if (sh->calling_fop == GF_FOP_LOOKUP) { + AFR_STACK_UNWIND (lookup, sh->orig_frame, + local->op_ret, local->op_errno, + local->cont.lookup.inode, + &local->cont.lookup.buf, + local->cont.lookup.xattr, + &local->cont.lookup.postparent); + } else { + sh->flush_self_heal_cbk (sh->orig_frame, this); + } return 0; } @@ -1463,7 +1472,8 @@ afr_bgsh_unwind (call_frame_t *bgsh_frame, xlator_t *this) int afr_self_heal (call_frame_t *frame, xlator_t *this, - int (*completion_cbk) (call_frame_t *, xlator_t *)) + int (*completion_cbk) (call_frame_t *, xlator_t *), + int bgsh) { afr_local_t *local = NULL; afr_self_heal_t *sh = NULL; @@ -1498,13 +1508,18 @@ afr_self_heal (call_frame_t *frame, xlator_t *this, sh_frame->local = sh_local; sh = &sh_local->self_heal; - sh->background = _gf_true; sh->orig_frame = frame; - if (completion_cbk == NULL) - sh->completion_cbk = afr_bgsh_completion_cbk; + if (bgsh) + sh->background = _gf_true; else + sh->background = _gf_false; + + if (completion_cbk == NULL) { + sh->completion_cbk = afr_bgsh_completion_cbk; + } else { sh->completion_cbk = completion_cbk; + } sh->unwind = afr_bgsh_unwind; diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c index 46d074831..e8384ec30 100644 --- a/xlators/cluster/afr/src/afr-self-heal-data.c +++ b/xlators/cluster/afr/src/afr-self-heal-data.c @@ -80,7 +80,7 @@ afr_sh_data_done (call_frame_t *frame, xlator_t *this) "self heal of %s completed", local->loc.path); - sh->completion_cbk (frame, this); + sh->completion_cbk (sh->orig_frame, this); return 0; } diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h index 7c4dd99b7..84a1380b7 100644 --- a/xlators/cluster/afr/src/afr-self-heal.h +++ b/xlators/cluster/afr/src/afr-self-heal.h @@ -47,6 +47,7 @@ afr_self_heal_metadata (call_frame_t *frame, xlator_t *this); int afr_self_heal (call_frame_t *frame, xlator_t *this, - int (*completion_cbk) (call_frame_t *, xlator_t *)); + int (*completion_cbk) (call_frame_t *, xlator_t *), + int bgsh); #endif /* __AFR_SELF_HEAL_H__ */ diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c index f7604dbf6..94f0972a1 100644 --- a/xlators/cluster/afr/src/afr-transaction.c +++ b/xlators/cluster/afr/src/afr-transaction.c @@ -133,6 +133,34 @@ out: } +static void +__mark_pre_op_done_on_fd (call_frame_t *frame, xlator_t *this, int child_index) +{ + afr_local_t *local = NULL; + + uint64_t ctx; + afr_fd_ctx_t * fd_ctx = NULL; + int ret = 0; + + local = frame->local; + + ret = fd_ctx_get (local->fd, this, &ctx); + + if (ret < 0) + goto out; + + fd_ctx = (afr_fd_ctx_t *)(long) ctx; + + if ((local->op == GF_FOP_WRITE) + || (local->op == GF_FOP_FTRUNCATE)) { + fd_ctx->pre_op_done[child_index] = 1; + } + +out: + return; +} + + static void __mark_down_children (int32_t *pending[], int child_count, unsigned char *child_up, afr_transaction_type type) @@ -168,10 +196,15 @@ __is_first_write_on_fd (xlator_t *this, fd_t *fd) { int op_ret = 0; int _ret = -1; + int i = 0; uint64_t ctx; afr_fd_ctx_t * fd_ctx = NULL; + afr_private_t *priv = NULL; + + priv = this->private; + LOCK (&fd->lock); { _ret = __fd_ctx_get (fd, this, &ctx); @@ -185,9 +218,12 @@ __is_first_write_on_fd (xlator_t *this, fd_t *fd) fd_ctx = (afr_fd_ctx_t *)(long) ctx; - if (fd_ctx->pre_op_done == 0) { - fd_ctx->pre_op_done = 1; - op_ret = 1; + op_ret = 1; + for (i = 0; i < priv->child_count; i++) { + if (fd_ctx->pre_op_done[i] == 0) + continue; + + op_ret = 0; } } out: @@ -198,7 +234,7 @@ out: static int -__if_fd_pre_op_done (xlator_t *this, fd_t *fd) +__if_fd_pre_op_done (xlator_t *this, fd_t *fd, int child_index) { int op_ret = 0; int _ret = -1; @@ -216,8 +252,7 @@ __if_fd_pre_op_done (xlator_t *this, fd_t *fd) fd_ctx = (afr_fd_ctx_t *)(long) ctx; - if (fd_ctx->pre_op_done) { - fd_ctx->pre_op_done = 0; + if (fd_ctx->pre_op_done[child_index]) { op_ret = 1; } } @@ -228,6 +263,43 @@ out: } +static int +afr_pre_op_done_count (xlator_t *this, fd_t *fd, unsigned char *child_up) +{ + int i = 0; + int count = 0; + + int _ret = 0; + uint64_t ctx; + afr_fd_ctx_t * fd_ctx = NULL; + + afr_private_t *priv = NULL; + + priv = this->private; + + LOCK (&fd->lock); + { + _ret = __fd_ctx_get (fd, this, &ctx); + + if (_ret < 0) { + goto out; + } + + fd_ctx = (afr_fd_ctx_t *)(long) ctx; + + for (i = 0; i < priv->child_count; i++) { + if (fd_ctx->pre_op_done[i] && child_up[i]) { + count++; + } + } + } +out: + UNLOCK (&fd->lock); + + return count; +} + + static int __changelog_enabled (afr_private_t *priv, afr_transaction_type type) { @@ -326,7 +398,7 @@ __changelog_needed_post_op (call_frame_t *frame, xlator_t *this) break; case GF_FOP_FLUSH: - op_ret = __if_fd_pre_op_done (this, local->fd); + op_ret = 1; break; default: @@ -665,11 +737,15 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this) dict_ref (xattr[i]); } - call_count = afr_up_children_count (priv->child_count, local->child_up); + if (local->op == GF_FOP_FLUSH) { + call_count = afr_pre_op_done_count (this, local->fd, local->child_up); + } else { + call_count = afr_up_children_count (priv->child_count, local->child_up); - if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) { - call_count *= 2; - } + if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) { + call_count *= 2; + } + } local->call_count = call_count; @@ -696,20 +772,33 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this) switch (local->transaction.type) { case AFR_DATA_TRANSACTION: case AFR_METADATA_TRANSACTION: - case AFR_FLUSH_TRANSACTION: { if (local->fd) STACK_WIND (frame, afr_changelog_post_op_cbk, - priv->children[i], + priv->children[i], priv->children[i]->fops->fxattrop, - local->fd, + local->fd, GF_XATTROP_ADD_ARRAY, xattr[i]); - else + else STACK_WIND (frame, afr_changelog_post_op_cbk, - priv->children[i], + priv->children[i], priv->children[i]->fops->xattrop, - &local->loc, + &local->loc, GF_XATTROP_ADD_ARRAY, xattr[i]); + call_count--; + } + break; + + case AFR_FLUSH_TRANSACTION: + { + if (__if_fd_pre_op_done (this, local->fd, i)) { + STACK_WIND (frame, afr_changelog_post_op_cbk, + priv->children[i], + priv->children[i]->fops->fxattrop, + local->fd, + GF_XATTROP_ADD_ARRAY, xattr[i]); + call_count--; + } } break; @@ -756,11 +845,12 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this) priv->children[i]->fops->xattrop, &local->transaction.parent_loc, GF_XATTROP_ADD_ARRAY, xattr[i]); + call_count--; } break; } - if (!--call_count) + if (!call_count) break; } } @@ -789,6 +879,10 @@ afr_changelog_pre_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this, LOCK (&frame->lock); { + if (op_ret == 0) { + __mark_pre_op_done_on_fd (frame, this, child_index); + } + if (op_ret == -1) { local->child_up[child_index] = 0; diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c index d6c1d8bcf..467b8112d 100644 --- a/xlators/cluster/afr/src/afr.c +++ b/xlators/cluster/afr/src/afr.c @@ -277,7 +277,7 @@ afr_local_sh_cleanup (afr_local_t *local, xlator_t *this) if (sh->locked_nodes) FREE (sh->locked_nodes); - if (sh->healing_fd) { + if (sh->healing_fd && !sh->healing_fd_opened) { fd_unref (sh->healing_fd); sh->healing_fd = NULL; } @@ -694,7 +694,10 @@ unlock: lookup_buf->st_mode; } - afr_self_heal (frame, this, NULL); + local->self_heal.calling_fop = GF_FOP_LOOKUP; + + afr_self_heal (frame, this, NULL, _gf_true); + } else { AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, @@ -845,6 +848,15 @@ afr_fd_ctx_set (xlator_t *this, fd_t *fd) goto unlock; } + fd_ctx->pre_op_done = CALLOC (sizeof (*fd_ctx->pre_op_done), + priv->child_count); + if (!fd_ctx->pre_op_done) { + gf_log (this->name, GF_LOG_ERROR, + "Out of memory"); + op_ret = -ENOMEM; + goto unlock; + } + fd_ctx->child_failed = CALLOC (sizeof (*fd_ctx->child_failed), priv->child_count); @@ -856,6 +868,9 @@ afr_fd_ctx_set (xlator_t *this, fd_t *fd) goto unlock; } + fd_ctx->up_count = priv->up_count; + fd_ctx->down_count = priv->down_count; + ret = __fd_ctx_set (fd, this, (uint64_t)(long) fd_ctx); if (ret < 0) { op_ret = ret; @@ -867,149 +882,6 @@ out: return ret; } - -int -afr_open_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct stat *prebuf, - struct stat *postbuf) -{ - afr_local_t * local = frame->local; - int ret = 0; - - ret = afr_fd_ctx_set (this, local->fd); - - if (ret < 0) { - local->op_ret = -1; - local->op_errno = -ret; - } - - AFR_STACK_UNWIND (open, frame, local->op_ret, local->op_errno, - local->fd); - return 0; -} - - -int -afr_open_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - fd_t *fd) -{ - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - - int ret = 0; - - int call_count = -1; - - priv = this->private; - local = frame->local; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - local->op_errno = op_errno; - } - - if (op_ret >= 0) { - local->op_ret = op_ret; - } - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - if ((local->cont.open.flags & O_TRUNC) - && (local->op_ret >= 0)) { - STACK_WIND (frame, afr_open_ftruncate_cbk, - this, this->fops->ftruncate, - fd, 0); - } else { - ret = afr_fd_ctx_set (this, fd); - - if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "could not set fd ctx for fd=%p", - fd); - - local->op_ret = -1; - local->op_errno = -ret; - } - - AFR_STACK_UNWIND (open, frame, local->op_ret, - local->op_errno, local->fd); - } - } - - return 0; -} - - -int -afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, - fd_t *fd, int32_t wbflags) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - - int i = 0; - int ret = -1; - - int32_t call_count = 0; - int32_t op_ret = -1; - int32_t op_errno = 0; - int32_t wind_flags = flags & (~O_TRUNC); - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - VALIDATE_OR_GOTO (loc, out); - - priv = this->private; - - if (afr_is_split_brain (this, loc->inode)) { - /* self-heal failed */ - op_errno = EIO; - goto out; - } - - ALLOC_OR_GOTO (local, afr_local_t, out); - - ret = AFR_LOCAL_INIT (local, priv); - if (ret < 0) { - op_errno = -ret; - goto out; - } - - frame->local = local; - call_count = local->call_count; - - local->cont.open.flags = flags; - local->fd = fd_ref (fd); - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_open_cbk, (void *) (long) i, - priv->children[i], - priv->children[i]->fops->open, - loc, wind_flags, fd, wbflags); - - if (!--call_count) - break; - } - } - - op_ret = 0; -out: - if (op_ret == -1) { - AFR_STACK_UNWIND (open, frame, op_ret, op_errno, fd); - } - - return 0; -} - -/* }}} */ - /* {{{ flush */ int @@ -2385,6 +2257,12 @@ notify (xlator_t *this, int32_t event, child_up[i] = 1; + LOCK (&priv->lock); + { + priv->up_count++; + } + UNLOCK (&priv->lock); + /* if all the children were down, and one child came up, send notify to parent @@ -2408,6 +2286,12 @@ notify (xlator_t *this, int32_t event, i = find_child_index (this, data); child_up[i] = 0; + + LOCK (&priv->lock); + { + priv->down_count++; + } + UNLOCK (&priv->lock); /* if all children are down, and this was the last to go down, diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index 23e75e612..56f7a069d 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -69,6 +69,9 @@ typedef struct _afr_private { unsigned int entry_lock_server_count; unsigned int wait_count; /* # of servers to wait for success */ + + uint64_t up_count; /* number of CHILD_UPs we have seen */ + uint64_t down_count; /* number of CHILD_DOWNs we have seen */ } afr_private_t; typedef struct { @@ -76,6 +79,8 @@ typedef struct { directories? */ gf_boolean_t forced_merge; + glusterfs_fop_t calling_fop; + /* array of stat's, one for each child */ struct stat *buf; struct stat parentbuf; @@ -124,6 +129,8 @@ typedef struct { gf_boolean_t data_lock_held; /* set by caller: true if caller has already acquired 0-0 lock */ + int (*flush_self_heal_cbk) (call_frame_t *frame, xlator_t *this); + int (*completion_cbk) (call_frame_t *frame, xlator_t *this); int (*algo_completion_cbk) (call_frame_t *frame, xlator_t *this); int (*algo_abort_cbk) (call_frame_t *frame, xlator_t *this); @@ -168,6 +175,12 @@ afr_index_for_transaction_type (afr_transaction_type type) } +typedef enum { + AFR_CHILD_UP_FLUSH, + AFR_CHILD_DOWN_FLUSH, +} afr_flush_type; + + typedef struct _afr_local { unsigned int call_count; unsigned int success_count; @@ -203,9 +216,12 @@ typedef struct _afr_local { dict_t *xattr_req; int open_fd_count; + int32_t inodelk_count; int32_t entrylk_count; + int (*up_down_flush_cbk) (call_frame_t *, xlator_t *); + /* This struct contains the arguments for the "continuation" (scheme-like) of fops @@ -503,8 +519,10 @@ typedef struct _afr_local { typedef struct { - unsigned char pre_op_done; + unsigned char *pre_op_done; unsigned char *child_failed; + uint64_t up_count; /* number of CHILD_UPs this fd has seen */ + uint64_t down_count; /* number of CHILD_DOWNs this fd has seen */ } afr_fd_ctx_t; @@ -560,9 +578,19 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this); int afr_frame_return (call_frame_t *frame); +uint64_t +afr_is_split_brain (xlator_t *this, inode_t *inode); + void afr_set_split_brain (xlator_t *this, inode_t *inode); +int +afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + fd_t *fd, int32_t wbflags); + +int +afr_up_down_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, afr_flush_type type); + void afr_set_opendir_done (xlator_t *this, inode_t *inode); -- cgit