diff options
Diffstat (limited to 'xlators/cluster/afr/src')
19 files changed, 14016 insertions, 0 deletions
diff --git a/xlators/cluster/afr/src/Makefile.am b/xlators/cluster/afr/src/Makefile.am new file mode 100644 index 00000000000..1bde9e5bad7 --- /dev/null +++ b/xlators/cluster/afr/src/Makefile.am @@ -0,0 +1,20 @@
# Builds the AFR (replicate) translator as a loadable libtool module and
# installs it under the versioned xlator/cluster directory.
xlator_LTLIBRARIES = afr.la
xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster

# libtool's link-mode flag is spelled "-avoid-version"; the previous
# "-avoidversion" is not a libtool option.
afr_la_LDFLAGS = -module -avoid-version

afr_la_SOURCES = afr.c afr-dir-read.c afr-dir-write.c afr-inode-read.c afr-inode-write.c afr-transaction.c afr-self-heal-data.c afr-self-heal-common.c afr-self-heal-metadata.c afr-self-heal-entry.c
afr_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la

noinst_HEADERS = afr.h afr-transaction.h afr-inode-write.h afr-inode-read.h afr-dir-read.h afr-dir-write.h afr-self-heal.h afr-self-heal-common.h

AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \
	    -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS)

CLEANFILES =

# The module is also reachable under the name "replicate": the alias is a
# symlink created after install and removed again on uninstall.
uninstall-local:
	rm -f $(DESTDIR)$(xlatordir)/replicate.so

install-data-hook:
	ln -sf afr.so $(DESTDIR)$(xlatordir)/replicate.so
\ No newline at end of file diff --git a/xlators/cluster/afr/src/afr-dir-read.c b/xlators/cluster/afr/src/afr-dir-read.c new file mode 100644 index 00000000000..0c65ca8528d --- /dev/null +++ b/xlators/cluster/afr/src/afr-dir-read.c @@ -0,0 +1,345 @@ +/* +   Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> +   This file is part of GlusterFS. + +   GlusterFS is free software; you can redistribute it and/or modify +   it under the terms of the GNU General Public License as published +   by the Free Software Foundation; either version 3 of the License, +   or (at your option) any later version. + +   GlusterFS is distributed in the hope that it will be useful, but +   WITHOUT ANY WARRANTY; without even the implied warranty of +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU +   General Public License for more details. + +   You should have received a copy of the GNU General Public License +   along with this program.  If not, see +   <http://www.gnu.org/licenses/>. 
+*/ + + +#include <libgen.h> +#include <unistd.h> +#include <fnmatch.h> +#include <sys/time.h> +#include <stdlib.h> +#include <signal.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "dict.h" +#include "xlator.h" +#include "hashfn.h" +#include "logging.h" +#include "stack.h" +#include "list.h" +#include "call-stub.h" +#include "defaults.h" +#include "common-utils.h" +#include "compat-errno.h" +#include "compat.h" + +#include "afr.h" + + +int32_t +afr_opendir_cbk (call_frame_t *frame, void *cookie, +		 xlator_t *this, int32_t op_ret, int32_t op_errno, +		 fd_t *fd) +{ +	afr_local_t * local  = NULL; + +	int call_count = -1; + +	LOCK (&frame->lock); +	{ +		local = frame->local; + +		if (op_ret == 0) +			local->op_ret = 0; + +		local->op_errno = op_errno; +	} +	UNLOCK (&frame->lock); + +	call_count = afr_frame_return (frame); + +	if (call_count == 0) { +		AFR_STACK_UNWIND (frame, local->op_ret, +				  local->op_errno, local->fd); +	} + +	return 0; +} + + +int32_t  +afr_opendir (call_frame_t *frame, xlator_t *this, +	     loc_t *loc, fd_t *fd) +{ +	afr_private_t * priv        = NULL; +	afr_local_t   * local       = NULL; + +	int             child_count = 0; +	int             i           = 0; + +	int ret = -1; +	int call_count = -1; + +	int32_t         op_ret   = -1; +	int32_t         op_errno = 0; + +	VALIDATE_OR_GOTO (frame, out); +	VALIDATE_OR_GOTO (this, out); +	VALIDATE_OR_GOTO (this->private, out); + +	priv = this->private; + +	child_count = priv->child_count; + +	ALLOC_OR_GOTO (local, afr_local_t, out); +	ret = AFR_LOCAL_INIT (local, priv); +	if (ret < 0) { +		op_errno = -ret; +		goto out; +	} + +	frame->local = local; +	local->fd    = fd_ref (fd); + +	call_count = local->call_count; +	 +	for (i = 0; i < child_count; i++) { +		if (local->child_up[i]) { +			STACK_WIND (frame, afr_opendir_cbk,  +				    priv->children[i], +				    priv->children[i]->fops->opendir, +				    loc, fd); + +			if (!--call_count) 
+				break; +		} +	} + +	op_ret = 0; +out: +	if (op_ret == -1) { +		AFR_STACK_UNWIND (frame, op_ret, op_errno, fd); +	} + +	return 0; +} + + +/** + * Common algorithm for directory read calls: + *  + * - Try the fop on the first child that is up + * - if we have failed due to ENOTCONN: + *     try the next child + * + * Applicable to: readdir + */ + +int32_t +afr_readdir_cbk (call_frame_t *frame, void *cookie, +		 xlator_t *this, int32_t op_ret, int32_t op_errno, +		 gf_dirent_t *buf) +{ +	afr_private_t * priv     = NULL; +	afr_local_t *   local    = NULL; +	xlator_t **     children = NULL; + +	int unwind     = 1; +	int last_tried = -1; +	int this_try = -1; + +	priv     = this->private; +	children = priv->children; + +	local = frame->local; + +	if (op_ret == -1) { +		last_tried = local->cont.readdir.last_tried; + +		if (all_tried (last_tried, priv->child_count)) { +			goto out; +		} + +		this_try = ++local->cont.readdir.last_tried; +		unwind = 0; + +		STACK_WIND (frame, afr_readdir_cbk, +			    children[this_try], +			    children[this_try]->fops->readdir, +			    local->fd, local->cont.readdir.size, +			    local->cont.readdir.offset); +	} + +out: +	if (unwind) { +		AFR_STACK_UNWIND (frame, op_ret, op_errno, buf); +	} + +	return 0; +} + + +int32_t +afr_readdir (call_frame_t *frame, xlator_t *this, +	     fd_t *fd, size_t size, off_t offset) +{ +	afr_private_t * priv       = NULL; +	xlator_t **     children   = NULL; +	int             call_child = 0; +	afr_local_t     *local     = NULL; + +	int ret = -1; + +	int32_t op_ret   = -1; +	int32_t op_errno = 0; + +	VALIDATE_OR_GOTO (frame, out); +	VALIDATE_OR_GOTO (this, out); +	VALIDATE_OR_GOTO (this->private, out); + +	priv     = this->private; +	children = priv->children; + +	ALLOC_OR_GOTO (local, afr_local_t, out); +	ret = AFR_LOCAL_INIT (local, priv); +	if (ret < 0) { +		op_errno = -ret; +		goto out; +	} +						 +	frame->local = local; + +	call_child = afr_first_up_child (priv); +	if (call_child == -1) { +		op_errno 
= ENOTCONN; +		gf_log (this->name, GF_LOG_ERROR, +			"no child is up :("); +		goto out; +	} + +	local->cont.readdir.last_tried = call_child; + +	local->fd                  = fd_ref (fd); +	local->cont.readdir.size   = size; +	local->cont.readdir.offset = offset; + +	STACK_WIND (frame, afr_readdir_cbk, +		    children[call_child], children[call_child]->fops->readdir, +		    fd, size, offset); + +	op_ret = 0; +out: +	if (op_ret == -1) { +		AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); +	} +	return 0; +} + + +int32_t +afr_getdents_cbk (call_frame_t *frame, void *cookie, +		  xlator_t *this, int32_t op_ret, int32_t op_errno, +		  dir_entry_t *entry, int32_t count) +{ +	afr_private_t * priv     = NULL; +	afr_local_t *   local    = NULL; +	xlator_t **     children = NULL; + +	int unwind     = 1; +	int last_tried = -1; +	int this_try = -1; + +	priv     = this->private; +	children = priv->children; + +	local = frame->local; + +	if (op_ret == -1) { +		last_tried = local->cont.getdents.last_tried; + +		if (all_tried (last_tried, priv->child_count)) { +			goto out; +		} + +		this_try = ++local->cont.getdents.last_tried; +		unwind = 0; + +		STACK_WIND (frame, afr_getdents_cbk, +			    children[this_try], +			    children[this_try]->fops->getdents, +			    local->fd, local->cont.getdents.size, +			    local->cont.getdents.offset, local->cont.getdents.flag); +	} + +out: +	if (unwind) { +		AFR_STACK_UNWIND (frame, op_ret, op_errno, entry, count); +	} + +	return 0; +} + + +int32_t +afr_getdents (call_frame_t *frame, xlator_t *this, +	      fd_t *fd, size_t size, off_t offset, int32_t flag) +{ +	afr_private_t * priv       = NULL; +	xlator_t **     children   = NULL; +	int             call_child = 0; +	afr_local_t     *local     = NULL; + +	int32_t op_ret   = -1; +	int32_t op_errno = 0; + +	VALIDATE_OR_GOTO (frame, out); +	VALIDATE_OR_GOTO (this, out); +	VALIDATE_OR_GOTO (this->private, out); + +	priv     = this->private; +	children = priv->children; + +	ALLOC_OR_GOTO (local, 
afr_local_t, out); + +	call_child = afr_first_up_child (priv); +	if (call_child == -1) { +		op_errno = ENOTCONN; +		gf_log (this->name, GF_LOG_ERROR, +			"no child is up :("); +		goto out; +	} + +	local->cont.getdents.last_tried = call_child; + +	local->fd                   = fd_ref (fd); + +	local->cont.getdents.size   = size; +	local->cont.getdents.offset = offset; +	local->cont.getdents.flag   = flag; +	 +	frame->local = local; + +	STACK_WIND (frame, afr_getdents_cbk, +		    children[call_child], children[call_child]->fops->getdents, +		    fd, size, offset, flag); + +	op_ret = 0; +out: +	if (op_ret == -1) { +		AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); +	} + +	return 0; +} + + diff --git a/xlators/cluster/afr/src/afr-dir-read.h b/xlators/cluster/afr/src/afr-dir-read.h new file mode 100644 index 00000000000..172ec3c90c4 --- /dev/null +++ b/xlators/cluster/afr/src/afr-dir-read.h @@ -0,0 +1,47 @@ +/* +   Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> +   This file is part of GlusterFS. + +   GlusterFS is free software; you can redistribute it and/or modify +   it under the terms of the GNU General Public License as published +   by the Free Software Foundation; either version 3 of the License, +   or (at your option) any later version. + +   GlusterFS is distributed in the hope that it will be useful, but +   WITHOUT ANY WARRANTY; without even the implied warranty of +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU +   General Public License for more details. + +   You should have received a copy of the GNU General Public License +   along with this program.  If not, see +   <http://www.gnu.org/licenses/>. 
+*/ + +#ifndef __DIR_READ_H__ +#define __DIR_READ_H__ + + +int32_t +afr_opendir (call_frame_t *frame, xlator_t *this, +	     loc_t *loc, fd_t *fd); + +int32_t +afr_closedir (call_frame_t *frame, xlator_t *this, +	      fd_t *fd); + +int32_t +afr_readdir (call_frame_t *frame, xlator_t *this, +	     fd_t *fd, size_t size, off_t offset); + + +int32_t +afr_getdents (call_frame_t *frame, xlator_t *this, +	      fd_t *fd, size_t size, off_t offset, int32_t flag); + + +int32_t +afr_checksum (call_frame_t *frame, xlator_t *this, +	      loc_t *loc, int32_t flags); + + +#endif /* __DIR_READ_H__ */ diff --git a/xlators/cluster/afr/src/afr-dir-write.c b/xlators/cluster/afr/src/afr-dir-write.c new file mode 100644 index 00000000000..87a6e09b5be --- /dev/null +++ b/xlators/cluster/afr/src/afr-dir-write.c @@ -0,0 +1,1786 @@ +/* +  Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> +  This file is part of GlusterFS. + +  GlusterFS is free software; you can redistribute it and/or modify +  it under the terms of the GNU General Public License as published +  by the Free Software Foundation; either version 3 of the License, +  or (at your option) any later version. + +  GlusterFS is distributed in the hope that it will be useful, but +  WITHOUT ANY WARRANTY; without even the implied warranty of +  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU +  General Public License for more details. + +  You should have received a copy of the GNU General Public License +  along with this program.  If not, see +  <http://www.gnu.org/licenses/>. 
+*/ + + +#include <libgen.h> +#include <unistd.h> +#include <fnmatch.h> +#include <sys/time.h> +#include <stdlib.h> +#include <signal.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "afr.h" +#include "dict.h" +#include "xlator.h" +#include "hashfn.h" +#include "logging.h" +#include "stack.h" +#include "list.h" +#include "call-stub.h" +#include "defaults.h" +#include "common-utils.h" +#include "compat-errno.h" +#include "compat.h" + +#include "afr.h" +#include "afr-transaction.h" + + +void +afr_build_parent_loc (loc_t *parent, loc_t *child) +{ +	char *tmp = NULL; + +	if (!child->parent) { +		loc_copy (parent, child); +		return; +	} + +	tmp = strdup (child->path); +	parent->path   = strdup (dirname (tmp)); +	FREE (tmp); + +        parent->name   = strrchr (parent->path, '/'); +	if (parent->name) +		parent->name++; + +	parent->inode  = inode_ref (child->parent); +	parent->parent = inode_parent (parent->inode, 0, NULL); +	parent->ino    = parent->inode->ino; +} + + +/* {{{ create */ + +int +afr_create_unwind (call_frame_t *frame, xlator_t *this) +{ +	call_frame_t *main_frame = NULL; +	afr_local_t  *local = NULL; + +	local = frame->local; + +	LOCK (&frame->lock); +	{ +		if (local->transaction.main_frame) { +			main_frame = local->transaction.main_frame; +		} +		local->transaction.main_frame = NULL; +	} +	UNLOCK (&frame->lock); + +	if (main_frame) +		AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, +				  local->cont.create.fd, +				  local->cont.create.inode, +				  &local->cont.create.buf); +	return 0; +} + + +int +afr_create_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,  +		     int32_t op_ret, int32_t op_errno,  +		     fd_t *fd, inode_t *inode, struct stat *buf) +{ +	afr_local_t *   local = NULL; +	afr_private_t * priv  = NULL; + +	int call_count = -1; +	int child_index = -1; + +	local = frame->local; +	priv = this->private; + +	child_index = (long) cookie; + +	LOCK 
(&frame->lock); +	{ +		if (child_went_down (op_ret, op_errno)) +			afr_transaction_child_died (frame, this, child_index); + +		if (op_ret != -1) { +			local->op_ret = op_ret; + +			if ((local->success_count == 0) +			    || (child_index == priv->read_child)) { +				local->cont.create.buf        = *buf; +				local->cont.create.buf.st_ino =  +					afr_itransform (buf->st_ino, +							priv->child_count, +							child_index); +			} +			local->cont.create.inode = inode; + +			local->success_count++; +		} + +		local->op_errno = op_errno; +	} +	UNLOCK (&frame->lock); + +	call_count = afr_frame_return (frame); + +	if (call_count == 0) { +		local->transaction.unwind (frame, this); + +		local->transaction.resume (frame, this); +	} +	 +	return 0; +} + + +int +afr_create_wind (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t *local = NULL; +	afr_private_t *priv = NULL; + +	int call_count = -1; +	int i = 0; + +	local = frame->local; +	priv = this->private; + +	call_count = afr_up_children_count (priv->child_count, local->child_up); + +	if (call_count == 0) { +		local->transaction.resume (frame, this); +		return 0; +	} + +	local->call_count = call_count; + +	for (i = 0; i < priv->child_count; i++) {				 +		if (local->child_up[i]) { +			STACK_WIND_COOKIE (frame, afr_create_wind_cbk, +					   (void *) (long) i, +					   priv->children[i],  +					   priv->children[i]->fops->create, +					   &local->loc,  +					   local->cont.create.flags,  +					   local->cont.create.mode,  +					   local->cont.create.fd); +			if (!--call_count) +				break; +		} +	} +	 +	return 0; +} + + +int +afr_create_done (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t * local = NULL; + +	local = frame->local; + +	local->transaction.unwind (frame, this); + +	AFR_STACK_DESTROY (frame); + +	return 0; +} + + +int +afr_create (call_frame_t *frame, xlator_t *this, +	    loc_t *loc, int32_t flags, mode_t mode, fd_t *fd) +{ +	afr_private_t * priv  = NULL; +	afr_local_t   * local = NULL; +	call_frame_t  * 
transaction_frame = NULL; + +	int ret = -1; + +	int op_ret   = -1; +	int op_errno = 0; + +	VALIDATE_OR_GOTO (frame, out); +	VALIDATE_OR_GOTO (this, out); +	VALIDATE_OR_GOTO (this->private, out); + +	priv = this->private; + +	transaction_frame = copy_frame (frame); +	if (!transaction_frame) { +		gf_log (this->name, GF_LOG_ERROR, +			"out of memory :("); +		goto out; +	} + +	ALLOC_OR_GOTO (local, afr_local_t, out); + +	ret = AFR_LOCAL_INIT (local, priv); +	if (ret < 0) { +		op_errno = -ret; +		goto out; +	} + +	transaction_frame->local = local; + +	loc_copy (&local->loc, loc); + +	local->cont.create.flags = flags; +	local->cont.create.mode  = mode; +	local->cont.create.fd    = fd_ref (fd); + +	local->transaction.fop    = afr_create_wind; +	local->transaction.done   = afr_create_done; +	local->transaction.unwind = afr_create_unwind; + +	afr_build_parent_loc (&local->transaction.parent_loc, loc); + +	local->transaction.main_frame = frame; +	local->transaction.basename = AFR_BASENAME (loc->path); +	local->transaction.pending  = AFR_ENTRY_PENDING; + +	afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + +	op_ret = 0; +out: +	if (op_ret == -1) { +		if (transaction_frame) +			AFR_STACK_DESTROY (transaction_frame); +		AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); +	} + +	return 0; +} + +/* }}} */ + +/* {{{ mknod */ + +int +afr_mknod_unwind (call_frame_t *frame, xlator_t *this) +{ +	call_frame_t *main_frame = NULL; +	afr_local_t  *local = NULL; + +	local = frame->local; + +	LOCK (&frame->lock); +	{ +		if (local->transaction.main_frame) { +			main_frame = local->transaction.main_frame; +		} +		local->transaction.main_frame = NULL; +	} +	UNLOCK (&frame->lock); + +	if (main_frame) +		AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, +				  local->cont.mknod.inode, +				  &local->cont.mknod.buf); +	return 0; +} + + +int +afr_mknod_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,  +		    int32_t op_ret, int32_t op_errno,  +		    inode_t 
*inode, struct stat *buf) +{ +	afr_local_t *   local = NULL; +	afr_private_t * priv  = NULL; + +	int call_count = -1; +	int child_index = -1; + +	local = frame->local; +	priv = this->private; + +	child_index = (long) cookie; + +	LOCK (&frame->lock); +	{ +		if (child_went_down (op_ret, op_errno)) +			afr_transaction_child_died (frame, this, child_index); +		 +		if (op_ret != -1) { +			local->op_ret = op_ret; + +			if ((local->success_count == 0) +			    || (child_index == priv->read_child)) {	 +				local->cont.mknod.buf   = *buf; +				local->cont.mknod.buf.st_ino =  +					afr_itransform (buf->st_ino, +							priv->child_count, +							child_index); +			} +			local->cont.mknod.inode = inode; + +			local->success_count++; +		} + +		local->op_errno = op_errno; +	} +	UNLOCK (&frame->lock); + +	call_count = afr_frame_return (frame); + +	if (call_count == 0) { +		local->transaction.unwind (frame, this); + +		local->transaction.resume (frame, this); +	} +	 +	return 0; +} + + +int32_t +afr_mknod_wind (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t *local = NULL; +	afr_private_t *priv = NULL; + +	int call_count = -1; +	int i = 0; + +	local = frame->local; +	priv  = this->private; + +	call_count = afr_up_children_count (priv->child_count, local->child_up); + +	if (call_count == 0) { +		local->transaction.resume (frame, this); +		return 0; +	} + +	local->call_count = call_count; + +	for (i = 0; i < priv->child_count; i++) {				 +		if (local->child_up[i]) { +			STACK_WIND_COOKIE (frame, afr_mknod_wind_cbk, (void *) (long) i, +					   priv->children[i],  +					   priv->children[i]->fops->mknod, +					   &local->loc, local->cont.mknod.mode, +					   local->cont.mknod.dev); +			if (!--call_count) +				break; +		} +	} +	 +	return 0; +} + + +int +afr_mknod_done (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t * local = NULL; + +	local = frame->local; + +	local->transaction.unwind (frame, this); +	AFR_STACK_DESTROY (frame); + +	return 0; +} + + +int +afr_mknod 
(call_frame_t *frame, xlator_t *this, +	   loc_t *loc, mode_t mode, dev_t dev) +{ +	afr_private_t * priv  = NULL; +	afr_local_t   * local = NULL; +	call_frame_t  * transaction_frame = NULL; + +	int ret = -1; + +	int op_ret   = -1; +	int op_errno = 0; + +	VALIDATE_OR_GOTO (frame, out); +	VALIDATE_OR_GOTO (this, out); +	VALIDATE_OR_GOTO (this->private, out); + +	priv = this->private; + +	transaction_frame = copy_frame (frame); +	if (!transaction_frame) { +		gf_log (this->name, GF_LOG_ERROR, +			"out of memory :("); +		goto out; +	} + +	ALLOC_OR_GOTO (local, afr_local_t, out); + +	ret = AFR_LOCAL_INIT (local, priv); +	if (ret < 0) { +		op_errno = -ret; +		goto out; +	} + +	transaction_frame->local = local; + +	loc_copy (&local->loc, loc); + +	local->cont.mknod.mode  = mode; +	local->cont.mknod.dev   = dev; + +	local->transaction.fop    = afr_mknod_wind; +	local->transaction.done   = afr_mknod_done; +	local->transaction.unwind = afr_mknod_unwind; + +	afr_build_parent_loc (&local->transaction.parent_loc, loc); + +	local->transaction.main_frame = frame; +	local->transaction.basename = AFR_BASENAME (loc->path); +	local->transaction.pending  = AFR_ENTRY_PENDING; + +	afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + +	op_ret = 0; +out: +	if (op_ret == -1) { +		if (transaction_frame) +			AFR_STACK_DESTROY (transaction_frame); +		AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); +	} + +	return 0; +} + +/* }}} */ + +/* {{{ mkdir */ + + +int +afr_mkdir_unwind (call_frame_t *frame, xlator_t *this) +{ +	call_frame_t *main_frame = NULL; +	afr_local_t  *local = NULL; + +	local = frame->local; + +	LOCK (&frame->lock); +	{ +		if (local->transaction.main_frame) { +			main_frame = local->transaction.main_frame; +		} +		local->transaction.main_frame = NULL; +	} +	UNLOCK (&frame->lock); + +	if (main_frame) +		AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, +				  local->cont.mkdir.inode, +				  &local->cont.mkdir.buf); +	return 0; +} + + +int 
+afr_mkdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,  +		    int32_t op_ret, int32_t op_errno,  +		    inode_t *inode, struct stat *buf) +{ +	afr_local_t *   local = NULL; +	afr_private_t * priv  = NULL; + +	int call_count = -1; +	int child_index = -1; + +	local = frame->local; +	priv = this->private; + +	child_index = (long) cookie; + +	LOCK (&frame->lock); +	{ +		if (child_went_down (op_ret, op_errno)) +			afr_transaction_child_died (frame, this, child_index); + +		if (op_ret != -1) { +			local->op_ret           = op_ret; + +			if ((local->success_count == 0) +			    || (child_index == priv->read_child)) { +				local->cont.mkdir.buf   = *buf; +				local->cont.mkdir.buf.st_ino =  +					afr_itransform (buf->st_ino, priv->child_count, +							child_index); +			} +			local->cont.mkdir.inode = inode; + +			local->success_count++; +		} + +		local->op_errno         = op_errno; +	} +	UNLOCK (&frame->lock); + +	call_count = afr_frame_return (frame); + +	if (call_count == 0) { +		local->transaction.unwind (frame, this); + +		local->transaction.resume (frame, this); +	} +	 +	return 0; +} + + +int +afr_mkdir_wind (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t *local = NULL; +	afr_private_t *priv = NULL; +	 +	int call_count = -1; +	int i = 0; + +	local = frame->local; +	priv  = this->private; + +	call_count = afr_up_children_count (priv->child_count, local->child_up); + +	if (call_count == 0) { +		local->transaction.resume (frame, this); +		return 0; +	} + +	local->call_count = call_count; + +	for (i = 0; i < priv->child_count; i++) {				 +		if (local->child_up[i]) { +			STACK_WIND_COOKIE (frame, afr_mkdir_wind_cbk, +					   (void *) (long) i,	 +					   priv->children[i],  +					   priv->children[i]->fops->mkdir, +					   &local->loc, local->cont.mkdir.mode); +			if (!--call_count) +				break; +		} +	} +	 +	return 0; +} + + +int +afr_mkdir_done (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t * local = NULL; + +	local = frame->local; + +	
local->transaction.unwind (frame, this); + +	AFR_STACK_DESTROY (frame); + +	return 0; +} + + +int +afr_mkdir (call_frame_t *frame, xlator_t *this, +	   loc_t *loc, mode_t mode) +{ +	afr_private_t * priv  = NULL; +	afr_local_t   * local = NULL; +	call_frame_t  * transaction_frame = NULL; + +	int ret = -1; + +	int op_ret   = -1; +	int op_errno = 0; + +	VALIDATE_OR_GOTO (frame, out); +	VALIDATE_OR_GOTO (this, out); +	VALIDATE_OR_GOTO (this->private, out); + +	priv = this->private; + +	transaction_frame = copy_frame (frame); +	if (!transaction_frame) { +		gf_log (this->name, GF_LOG_ERROR, +			"out of memory :("); +		goto out; +	} + +	ALLOC_OR_GOTO (local, afr_local_t, out); + +	ret = AFR_LOCAL_INIT (local, priv); +	if (ret < 0) { +		op_errno = -ret; +		goto out; +	} + +	transaction_frame->local = local; + +	loc_copy (&local->loc, loc); + +	local->cont.mkdir.mode  = mode; + +	local->transaction.fop    = afr_mkdir_wind; +	local->transaction.done   = afr_mkdir_done; +	local->transaction.unwind = afr_mkdir_unwind; + +	afr_build_parent_loc (&local->transaction.parent_loc, loc); + +	local->transaction.main_frame = frame; +	local->transaction.basename = AFR_BASENAME (loc->path); +	local->transaction.pending  = AFR_ENTRY_PENDING; + +	afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + +	op_ret = 0; +out: +	if (op_ret == -1) { +		if (transaction_frame) +			AFR_STACK_DESTROY (transaction_frame); + +		AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); +	} + +	return 0; +} + +/* }}} */ + +/* {{{ link */ + + +int +afr_link_unwind (call_frame_t *frame, xlator_t *this) +{ +	call_frame_t *main_frame = NULL; +	afr_local_t  *local = NULL; + +	local = frame->local; + +	LOCK (&frame->lock); +	{ +		if (local->transaction.main_frame) { +			main_frame = local->transaction.main_frame; +		} +		local->transaction.main_frame = NULL; +	} +	UNLOCK (&frame->lock); + +	if (main_frame) { +		local->cont.link.buf.st_ino = local->cont.link.ino; + +		AFR_STACK_UNWIND (main_frame, 
local->op_ret, local->op_errno,  +				  local->cont.link.inode, +				  &local->cont.link.buf); +	} + +	return 0; +} + + +int +afr_link_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,  +		   int32_t op_ret, int32_t op_errno, inode_t *inode, +		   struct stat *buf) +{ +	afr_local_t *   local = NULL; +	afr_private_t * priv  = NULL; + +	int call_count = -1; +	int child_index = -1; + +	local = frame->local; +	priv = this->private; + +	child_index = (long) cookie; + +	LOCK (&frame->lock); +	{ +		if (child_went_down (op_ret, op_errno)) +			afr_transaction_child_died (frame, this, child_index); + +		if (op_ret != -1) { +			local->op_ret   = op_ret; + +			if ((local->success_count == 0) +			    || (child_index == priv->read_child)) { +				local->cont.link.buf        = *buf; +				local->cont.link.buf.st_ino =  +					afr_itransform (buf->st_ino, priv->child_count, +							child_index); +			} +			local->cont.link.inode    = inode; + +			local->success_count++; +		} + +		local->op_errno = op_errno;		 +	} +	UNLOCK (&frame->lock); + +	call_count = afr_frame_return (frame); + +	if (call_count == 0) { +		local->transaction.unwind (frame, this); + +		local->transaction.resume (frame, this); +	} +	 +	return 0; +} + + +int +afr_link_wind (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t *local = NULL; +	afr_private_t *priv = NULL; + +	int call_count = -1; +	int i = 0; + +	local = frame->local; +	priv  = this->private; + +	call_count = afr_up_children_count (priv->child_count, local->child_up); + +	if (call_count == 0) { +		local->transaction.resume (frame, this); +		return 0; +	} + +	local->call_count = call_count; + +	for (i = 0; i < priv->child_count; i++) {				 +		if (local->child_up[i]) { +			STACK_WIND_COOKIE (frame, afr_link_wind_cbk, (void *) (long) i, +					   priv->children[i],  +					   priv->children[i]->fops->link, +					   &local->loc, +					   &local->newloc); +			 +			if (!--call_count) +				break; +		} +	} +	 +	return 0; +} + + +int +afr_link_done 
(call_frame_t *frame, xlator_t *this) +{ +	afr_local_t * local = frame->local; + +	local->transaction.unwind (frame, this); + +	AFR_STACK_DESTROY (frame); + +	return 0; +} + + +int +afr_link (call_frame_t *frame, xlator_t *this, +	  loc_t *oldloc, loc_t *newloc) +{ +	afr_private_t * priv  = NULL; +	afr_local_t   * local = NULL; +	call_frame_t  * transaction_frame = NULL; + +	int ret = -1; + +	int op_ret   = -1; +	int op_errno = 0; + +	VALIDATE_OR_GOTO (frame, out); +	VALIDATE_OR_GOTO (this, out); +	VALIDATE_OR_GOTO (this->private, out); + +	priv = this->private; + +	transaction_frame = copy_frame (frame); +	if (!transaction_frame) { +		gf_log (this->name, GF_LOG_ERROR, +			"out of memory :("); +		goto out; +	} + +	ALLOC_OR_GOTO (local, afr_local_t, out); + +	ret = AFR_LOCAL_INIT (local, priv); +	if (ret < 0) { +		op_errno = -ret; +		goto out; +	} + +	transaction_frame->local = local; + +	loc_copy (&local->loc,    oldloc); +	loc_copy (&local->newloc, newloc); + +	local->cont.link.ino = oldloc->inode->ino; + +	local->transaction.fop    = afr_link_wind; +	local->transaction.done   = afr_link_done; +	local->transaction.unwind = afr_link_unwind; + +	afr_build_parent_loc (&local->transaction.parent_loc, oldloc); + +	local->transaction.main_frame   = frame; +	local->transaction.basename     = AFR_BASENAME (oldloc->path); +	local->transaction.new_basename = AFR_BASENAME (newloc->path); +	local->transaction.pending      = AFR_ENTRY_PENDING; + +	afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + +	op_ret = 0; +out: +	if (op_ret == -1) { +		if (transaction_frame) +			AFR_STACK_DESTROY (transaction_frame); +		AFR_STACK_UNWIND (frame, op_ret, op_errno); +	} + +	return 0; +} + +/* }}} */ + +/* {{{ symlink */ + + +int +afr_symlink_unwind (call_frame_t *frame, xlator_t *this) +{ +	call_frame_t *main_frame = NULL; +	afr_local_t  *local = NULL; + +	local = frame->local; + +	LOCK (&frame->lock); +	{ +		if (local->transaction.main_frame) { +			main_frame = 
local->transaction.main_frame; +		} +		local->transaction.main_frame = NULL; +	} +	UNLOCK (&frame->lock); + +	if (main_frame) +		AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, +				  local->cont.symlink.inode, +				  &local->cont.symlink.buf); +	return 0; +} + + +int +afr_symlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,  +		      int32_t op_ret, int32_t op_errno, inode_t *inode, +		      struct stat *buf) +{ +	afr_local_t *   local = NULL; +	afr_private_t * priv  = NULL; + +	int call_count = -1; +	int child_index = -1; + +	local = frame->local; +	priv = this->private; + +	child_index = (long) cookie; + +	LOCK (&frame->lock); +	{ +		if (child_went_down (op_ret, op_errno)) +			afr_transaction_child_died (frame, this, child_index); +		 +		if (op_ret != -1) { +			local->op_ret   = op_ret; + +			if ((local->success_count == 0) +			    || (child_index == priv->read_child)) { +				local->cont.symlink.buf        = *buf; +				local->cont.symlink.buf.st_ino =  +					afr_itransform (buf->st_ino, priv->child_count, +							child_index); +			} +			local->cont.symlink.inode    = inode; + +			local->success_count++; +		} + +		local->op_errno = op_errno; +	} +	UNLOCK (&frame->lock); + +	call_count = afr_frame_return (frame); + +	if (call_count == 0) { +		local->transaction.unwind (frame, this); + +		local->transaction.resume (frame, this); +	} +	 +	return 0; +} + + +int +afr_symlink_wind (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t *local = NULL; +	afr_private_t *priv = NULL; + +	int call_count = -1; +	int i = 0; + +	local = frame->local; +	priv = this->private; + +	call_count = afr_up_children_count (priv->child_count, local->child_up); + +	if (call_count == 0) { +		local->transaction.resume (frame, this); +		return 0; +	} + +	local->call_count = call_count; + +	for (i = 0; i < priv->child_count; i++) {				 +		if (local->child_up[i]) { +			STACK_WIND_COOKIE (frame, afr_symlink_wind_cbk, +					   (void *) (long) i,	 +					   
priv->children[i],  +					   priv->children[i]->fops->symlink, +					   local->cont.symlink.linkpath, +					   &local->loc); + +			if (!--call_count) +				break; + +		} +	} +	 +	return 0; +} + + +int +afr_symlink_done (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t * local = frame->local; + +	local->transaction.unwind (frame, this); + +	AFR_STACK_DESTROY (frame); +	 +	return 0; +} + + +int +afr_symlink (call_frame_t *frame, xlator_t *this, +	     const char *linkpath, loc_t *loc) +{ +	afr_private_t * priv  = NULL; +	afr_local_t   * local = NULL; +	call_frame_t  * transaction_frame = NULL; + +	int ret = -1; + +	int op_ret   = -1; +	int op_errno = 0; + +	VALIDATE_OR_GOTO (frame, out); +	VALIDATE_OR_GOTO (this, out); +	VALIDATE_OR_GOTO (this->private, out); + +	priv = this->private; + +	transaction_frame = copy_frame (frame); +	if (!transaction_frame) { +		gf_log (this->name, GF_LOG_ERROR, +			"out of memory :("); +		goto out; +	} + +	ALLOC_OR_GOTO (local, afr_local_t, out); + +	ret = AFR_LOCAL_INIT (local, priv); +	if (ret < 0) { +		op_errno = -ret; +		goto out; +	} + +	transaction_frame->local = local; +	 +	loc_copy (&local->loc, loc); + +	local->cont.symlink.ino      = loc->inode->ino; +	local->cont.symlink.linkpath = strdup (linkpath); + +	local->transaction.fop    = afr_symlink_wind; +	local->transaction.done   = afr_symlink_done; +	local->transaction.unwind = afr_symlink_unwind; + +	afr_build_parent_loc (&local->transaction.parent_loc, loc); + +	local->transaction.main_frame   = frame; +	local->transaction.basename     = AFR_BASENAME (loc->path); +	local->transaction.pending      = AFR_ENTRY_PENDING; + +	afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + +	op_ret = 0; +out: +	if (op_ret == -1) { +		if (transaction_frame) +			AFR_STACK_DESTROY (transaction_frame); +		AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL, NULL); +	} + +	return 0; +} + +/* }}} */ + +/* {{{ rename */ + +int +afr_rename_unwind (call_frame_t *frame, xlator_t *this) 
+{ +	call_frame_t *main_frame = NULL; +	afr_local_t  *local = NULL; + +	local = frame->local; + +	LOCK (&frame->lock); +	{ +		if (local->transaction.main_frame) { +			main_frame = local->transaction.main_frame; +		} +		local->transaction.main_frame = NULL; +	} +	UNLOCK (&frame->lock); + +	if (main_frame) { +		local->cont.rename.buf.st_ino = local->cont.rename.ino; + +		AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno,  +				  &local->cont.rename.buf); +	} + +	return 0; +} + + +int +afr_rename_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,  +		     int32_t op_ret, int32_t op_errno, struct stat *buf) +{ +	afr_local_t *   local = NULL; +	afr_private_t * priv  = NULL; + +	int call_count = -1; +	int child_index = -1; + +	local = frame->local; +	priv  = this->private; + +	child_index = (long) cookie; + +	LOCK (&frame->lock); +	{ +		if (child_went_down (op_ret, op_errno)) +			afr_transaction_child_died (frame, this, child_index); + +		if ((op_ret != -1) && (local->success_count == 0)) { +			local->op_ret = op_ret; +		 +			if (buf) { +				local->cont.rename.buf = *buf; +				local->cont.rename.buf.st_ino =  +					afr_itransform (buf->st_ino, priv->child_count, +							child_index); +			} +			local->success_count++; +		} + +		local->op_errno = op_errno; +	} +	UNLOCK (&frame->lock); + +	call_count = afr_frame_return (frame); + +	if (call_count == 0) { +		local->transaction.unwind (frame, this); + +		local->transaction.resume (frame, this); +	} +	 +	return 0; +} + + +int32_t +afr_rename_wind (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t *local = NULL; +	afr_private_t *priv = NULL; + +	int call_count = -1; +	int i = 0; + +	local = frame->local; +	priv = this->private; + +	call_count = afr_up_children_count (priv->child_count, local->child_up); + +	if (call_count == 0) { +		local->transaction.resume (frame, this); +		return 0; +	} + +	local->call_count = call_count; + +	for (i = 0; i < priv->child_count; i++) {				 +		if (local->child_up[i]) { 
+			STACK_WIND_COOKIE (frame, afr_rename_wind_cbk,  +					   (void *) (long) i,	 +					   priv->children[i],  +					   priv->children[i]->fops->rename, +					   &local->loc, +					   &local->newloc); +			if (!--call_count) +				break; +		} +	} +	 +	return 0; +} + + +int +afr_rename_done (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t * local = frame->local; + +	local->transaction.unwind (frame, this); + +	AFR_STACK_DESTROY (frame); +	 +	return 0; +} + + +int +afr_rename (call_frame_t *frame, xlator_t *this, +	    loc_t *oldloc, loc_t *newloc) +{ +	afr_private_t * priv  = NULL; +	afr_local_t   * local = NULL; +	call_frame_t  * transaction_frame = NULL; + +	int ret = -1; + +	int op_ret   = -1; +	int op_errno = 0; + +	VALIDATE_OR_GOTO (frame, out); +	VALIDATE_OR_GOTO (this, out); +	VALIDATE_OR_GOTO (this->private, out); + +	priv = this->private; + +	transaction_frame = copy_frame (frame); +	if (!transaction_frame) { +		gf_log (this->name, GF_LOG_ERROR, +			"out of memory :("); +		goto out; +	} + +	ALLOC_OR_GOTO (local, afr_local_t, out); + +	ret = AFR_LOCAL_INIT (local, priv); +	if (ret < 0) { +		op_errno = -ret; +		goto out; +	} + +	transaction_frame->local = local; + +	loc_copy (&local->loc,    oldloc); +	loc_copy (&local->newloc, newloc); + +	local->cont.rename.ino = oldloc->inode->ino; + +	local->transaction.fop    = afr_rename_wind; +	local->transaction.done   = afr_rename_done; +	local->transaction.unwind = afr_rename_unwind; + +	afr_build_parent_loc (&local->transaction.parent_loc, oldloc); +	afr_build_parent_loc (&local->transaction.new_parent_loc, newloc); + +	local->transaction.main_frame   = frame; +	local->transaction.basename     = AFR_BASENAME (oldloc->path); +	local->transaction.new_basename = AFR_BASENAME (newloc->path); +	local->transaction.pending      = AFR_ENTRY_PENDING; + +	afr_transaction (transaction_frame, this, AFR_ENTRY_RENAME_TRANSACTION); + +	op_ret = 0; +out: +	if (op_ret == -1) { +		if (transaction_frame) +			AFR_STACK_DESTROY 
(transaction_frame); + +		AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); +	} + +	return 0; +} + +/* }}} */ + +/* {{{ unlink */ + +int +afr_unlink_unwind (call_frame_t *frame, xlator_t *this) +{ +	call_frame_t *main_frame = NULL; +	afr_local_t  *local = NULL; + +	local = frame->local; + +	LOCK (&frame->lock); +	{ +		if (local->transaction.main_frame) { +			main_frame = local->transaction.main_frame; +		} +		local->transaction.main_frame = NULL; +	} +	UNLOCK (&frame->lock); + +	if (main_frame) +		AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno); + +	return 0; +} + + +int +afr_unlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,  +		     int32_t op_ret, int32_t op_errno) +{ +	afr_local_t *   local = NULL; +	afr_private_t * priv  = NULL; + +	int call_count  = -1; +	int child_index = (long) cookie; +	int need_unwind = 0; + +	local = frame->local; +	priv  = this->private; +	 +	LOCK (&frame->lock); +	{ +		if (child_went_down (op_ret, op_errno)) +			afr_transaction_child_died (frame, this, child_index); + +		if (op_ret != -1) { +			if (local->success_count == 0) { +				local->op_ret   = op_ret; +			} +			local->success_count++; + +			if (local->success_count == priv->wait_count) { +				need_unwind = 1; +			} +		} + +		local->op_errno = op_errno; +	} +	UNLOCK (&frame->lock); + +	call_count = afr_frame_return (frame); + +	if (call_count == 0) { +		local->transaction.unwind (frame, this); + +		local->transaction.resume (frame, this); +	} +	 +	return 0; +} + + +int32_t +afr_unlink_wind (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t *local = NULL; +	afr_private_t *priv = NULL; + +	int call_count = -1; +	int i = 0; + +	local = frame->local; +	priv  = this->private; + +	call_count = afr_up_children_count (priv->child_count, local->child_up); + +	if (call_count == 0) { +		local->transaction.resume (frame, this); +		return 0; +	} + +	local->call_count = call_count; + +	for (i = 0; i < priv->child_count; i++) {				 +		if (local->child_up[i]) { 
+			STACK_WIND_COOKIE (frame, afr_unlink_wind_cbk,	 +					   (void *) (long) i, +					   priv->children[i],  +					   priv->children[i]->fops->unlink, +					   &local->loc); +			 +			if (!--call_count) +				break; +		} +	} +	 +	return 0; +} + + +int32_t +afr_unlink_done (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t * local = frame->local; + +	local->transaction.unwind (frame, this); + +	AFR_STACK_DESTROY (frame); +	 +	return 0; +} + + +int32_t +afr_unlink (call_frame_t *frame, xlator_t *this, +	    loc_t *loc) +{ +	afr_private_t * priv  = NULL; +	afr_local_t   * local = NULL; +	call_frame_t  * transaction_frame = NULL; + +	int ret = -1; + +	int op_ret   = -1; +	int op_errno = 0; + +	VALIDATE_OR_GOTO (frame, out); +	VALIDATE_OR_GOTO (this, out); +	VALIDATE_OR_GOTO (this->private, out); + +	priv = this->private; + +	transaction_frame = copy_frame (frame); +	if (!transaction_frame) { +		gf_log (this->name, GF_LOG_ERROR, +			"out of memory :("); +		goto out; +	} + +	ALLOC_OR_GOTO (local, afr_local_t, out); + +	ret = AFR_LOCAL_INIT (local, priv); +	if (ret < 0) { +		op_errno = -ret; +		goto out; +	} + +	transaction_frame->local = local; + +	loc_copy (&local->loc, loc); + +	local->transaction.fop    = afr_unlink_wind; +	local->transaction.done   = afr_unlink_done; +	local->transaction.unwind = afr_unlink_unwind; + +	afr_build_parent_loc (&local->transaction.parent_loc, loc); + +	local->transaction.main_frame = frame; +	local->transaction.basename = AFR_BASENAME (loc->path); +	local->transaction.pending  = AFR_ENTRY_PENDING; + +	afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + +	op_ret = 0; +out: +	if (op_ret == -1) { +		if (transaction_frame) +			AFR_STACK_DESTROY (transaction_frame); +		AFR_STACK_UNWIND (frame, op_ret, op_errno); +	} + +	return 0; +} + +/* }}} */ + +/* {{{ rmdir */ + + + +int +afr_rmdir_unwind (call_frame_t *frame, xlator_t *this) +{ +	call_frame_t *main_frame = NULL; +	afr_local_t  *local = NULL; + +	local = frame->local; + 
+	LOCK (&frame->lock); +	{ +		if (local->transaction.main_frame) { +			main_frame = local->transaction.main_frame; +		} +		local->transaction.main_frame = NULL; +	} +	UNLOCK (&frame->lock); + +	if (main_frame) +		AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno); + +	return 0; +} + + +int +afr_rmdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,  +		    int32_t op_ret, int32_t op_errno) +{ +	afr_local_t *   local = NULL; +	afr_private_t * priv  = NULL; + +	int call_count  = -1; +	int child_index = (long) cookie; +	int need_unwind = 0; + +	local = frame->local; +	priv = this->private; + +	LOCK (&frame->lock); +	{ +		if (child_went_down (op_ret, op_errno)) +			afr_transaction_child_died (frame, this, child_index); + +		if (op_ret != -1) { +			if (local->success_count == 0) { +				local->op_ret = op_ret; +			} +			local->success_count++; + +			if (local->success_count == priv->wait_count) +				need_unwind = 1; +		} + +		local->op_errno = op_errno; +	} +	UNLOCK (&frame->lock); + +	call_count = afr_frame_return (frame); + +	if (call_count == 0) { +		local->transaction.unwind (frame, this); + +		local->transaction.resume (frame, this); +	} +	 +	return 0; +} + + +int +afr_rmdir_wind (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t *local = NULL; +	afr_private_t *priv = NULL; + +	int call_count = -1; +	int i = 0; + +	local = frame->local; +	priv  = this->private; + +	call_count = afr_up_children_count (priv->child_count, local->child_up); + +	if (call_count == 0) { +		local->transaction.resume (frame, this); +		return 0; +	} + +	local->call_count = call_count; + +	for (i = 0; i < priv->child_count; i++) {				 +		if (local->child_up[i]) { +			STACK_WIND_COOKIE (frame, afr_rmdir_wind_cbk,	 +					   (void *) (long) i, +					   priv->children[i],  +					   priv->children[i]->fops->rmdir, +					   &local->loc); + +			if (!--call_count) +				break; +		} +	} +	 +	return 0; +} + + +int +afr_rmdir_done (call_frame_t *frame, xlator_t *this) +{ +	
afr_local_t * local = frame->local; + +	local->transaction.unwind (frame, this); + +	AFR_STACK_DESTROY (frame); +	 +	return 0; +} + + +int +afr_rmdir (call_frame_t *frame, xlator_t *this, +	   loc_t *loc) +{ +	afr_private_t * priv  = NULL; +	afr_local_t   * local = NULL; +	call_frame_t  * transaction_frame = NULL; +	 +	int ret = -1; + +	int op_ret   = -1; +	int op_errno = 0; + +	VALIDATE_OR_GOTO (frame, out); +	VALIDATE_OR_GOTO (this, out); +	VALIDATE_OR_GOTO (this->private, out); + +	priv = this->private; + +	transaction_frame = copy_frame (frame); +	if (!transaction_frame) { +		gf_log (this->name, GF_LOG_ERROR, +			"out of memory :("); +		goto out; +	} + +	ALLOC_OR_GOTO (local, afr_local_t, out); + +	ret = AFR_LOCAL_INIT (local, priv); +	if (ret < 0) { +		op_errno = -ret; +		goto out; +	} + +	transaction_frame->local = local; + +	loc_copy (&local->loc, loc); + +	local->transaction.fop    = afr_rmdir_wind; +	local->transaction.done   = afr_rmdir_done; +	local->transaction.unwind = afr_rmdir_unwind; + +	afr_build_parent_loc (&local->transaction.parent_loc, loc); + +	local->transaction.main_frame = frame; +	local->transaction.basename = AFR_BASENAME (loc->path); +	local->transaction.pending  = AFR_ENTRY_PENDING; + +	afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + +	op_ret = 0; +out: +	if (op_ret == -1) { +		if (transaction_frame) +			AFR_STACK_DESTROY (transaction_frame); +		AFR_STACK_UNWIND (frame, op_ret, op_errno); +	} + +	return 0; +} + +/* }}} */ + +/* {{{ setdents */ + +int32_t +afr_setdents_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,  +		       int32_t op_ret, int32_t op_errno) +{ +	afr_local_t *   local = NULL; +	afr_private_t * priv  = NULL; + +	int call_count  = -1; +	int child_index = (long) cookie; + +	local = frame->local; +	priv = this->private; + +	LOCK (&frame->lock); +	{ +		if (child_went_down (op_ret, op_errno)) +			afr_transaction_child_died (frame, this, child_index); + +		if ((op_ret != -1) && 
(local->success_count == 0)) { +			local->op_ret = op_ret; +			local->success_count++; +		} + +		local->op_errno = op_errno; +	} +	UNLOCK (&frame->lock); + +	call_count = afr_frame_return (frame); + +	if (call_count == 0) { +		local->transaction.resume (frame, this); +	} +	 +	return 0; +} + + +int32_t +afr_setdents_wind (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t *local = NULL; +	afr_private_t *priv = NULL; + +	int call_count = -1; +	int i = 0; + +	local = frame->local; +	priv  = this->private; + +	call_count = afr_up_children_count (priv->child_count, local->child_up); + +	if (call_count == 0) { +		local->transaction.resume (frame, this); +		return 0; +	} + +	local->call_count = call_count; + +	for (i = 0; i < priv->child_count; i++) {				 +		if (local->child_up[i]) { +			STACK_WIND_COOKIE (frame, afr_setdents_wind_cbk,	 +					   (void *) (long) i, +					   priv->children[i],  +					   priv->children[i]->fops->setdents, +					   local->fd, local->cont.setdents.flags, +					   local->cont.setdents.entries,  +					   local->cont.setdents.count); +			 +			if (!--call_count) +				break; +		} +	} +	 +	return 0; +} + + +int32_t +afr_setdents_done (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t * local = frame->local; + +	AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno); +	 +	return 0; +} + + +int32_t +afr_setdents (call_frame_t *frame, xlator_t *this, +	      fd_t *fd, int32_t flags, dir_entry_t *entries, int32_t count) +{ +	afr_private_t * priv  = NULL; +	afr_local_t   * local = NULL; +	 +	int ret = -1; + +	int op_ret   = -1; +	int op_errno = 0; + +	VALIDATE_OR_GOTO (frame, out); +	VALIDATE_OR_GOTO (this, out); +	VALIDATE_OR_GOTO (this->private, out); + +	priv = this->private; + +	ALLOC_OR_GOTO (local, afr_local_t, out); + +	ret = AFR_LOCAL_INIT (local, priv); +	if (ret < 0) { +		op_errno = -ret; +		goto out; +	} + +	frame->local = local; + +	local->fd = fd_ref (fd); + +	local->cont.setdents.flags   = flags; +	local->cont.setdents.entries = 
entries; +	local->cont.setdents.count   = count; + +	local->transaction.fop  = afr_setdents_wind; +	local->transaction.done = afr_setdents_done; + +	local->transaction.basename = NULL; +	local->transaction.pending  = AFR_ENTRY_PENDING; + +	afr_transaction (frame, this, AFR_ENTRY_TRANSACTION); + +	op_ret = 0; +out: +	if (op_ret == -1) { +		AFR_STACK_UNWIND (frame, op_ret, op_errno); +	} + +	return 0; +} + +/* }}} */ diff --git a/xlators/cluster/afr/src/afr-dir-write.h b/xlators/cluster/afr/src/afr-dir-write.h new file mode 100644 index 00000000000..e6e8a5e797c --- /dev/null +++ b/xlators/cluster/afr/src/afr-dir-write.h @@ -0,0 +1,59 @@ +/* +   Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> +   This file is part of GlusterFS. + +   GlusterFS is free software; you can redistribute it and/or modify +   it under the terms of the GNU General Public License as published +   by the Free Software Foundation; either version 3 of the License, +   or (at your option) any later version. + +   GlusterFS is distributed in the hope that it will be useful, but +   WITHOUT ANY WARRANTY; without even the implied warranty of +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU +   General Public License for more details. + +   You should have received a copy of the GNU General Public License +   along with this program.  If not, see +   <http://www.gnu.org/licenses/>. 
+*/ + +#ifndef __DIR_WRITE_H__ +#define __DIR_WRITE_H__ + +int32_t +afr_create (call_frame_t *frame, xlator_t *this, +	    loc_t *loc, int32_t flags, mode_t mode, fd_t *fd); + +int32_t +afr_mknod (call_frame_t *frame, xlator_t *this, +	   loc_t *loc, mode_t mode, dev_t dev); + +int32_t +afr_mkdir (call_frame_t *frame, xlator_t *this, +	   loc_t *loc, mode_t mode); + +int32_t +afr_unlink (call_frame_t *frame, xlator_t *this, +	    loc_t *loc); + +int32_t +afr_rmdir (call_frame_t *frame, xlator_t *this, +	   loc_t *loc); + +int32_t +afr_link (call_frame_t *frame, xlator_t *this, +	  loc_t *oldloc, loc_t *newloc); + +int32_t +afr_rename (call_frame_t *frame, xlator_t *this, +	    loc_t *oldloc, loc_t *newloc); + +int32_t +afr_symlink (call_frame_t *frame, xlator_t *this, +	     const char *linkpath, loc_t *oldloc); + +int32_t +afr_setdents (call_frame_t *frame, xlator_t *this, +	      fd_t *fd, int32_t flags, dir_entry_t *entries, int32_t count); + +#endif /* __DIR_WRITE_H__ */ diff --git a/xlators/cluster/afr/src/afr-inode-read.c b/xlators/cluster/afr/src/afr-inode-read.c new file mode 100644 index 00000000000..a6c99ec0576 --- /dev/null +++ b/xlators/cluster/afr/src/afr-inode-read.c @@ -0,0 +1,721 @@ +/* +   Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> +   This file is part of GlusterFS. + +   GlusterFS is free software; you can redistribute it and/or modify +   it under the terms of the GNU General Public License as published +   by the Free Software Foundation; either version 3 of the License, +   or (at your option) any later version. + +   GlusterFS is distributed in the hope that it will be useful, but +   WITHOUT ANY WARRANTY; without even the implied warranty of +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU +   General Public License for more details. + +   You should have received a copy of the GNU General Public License +   along with this program.  If not, see +   <http://www.gnu.org/licenses/>. 
+*/ + + +#include <libgen.h> +#include <unistd.h> +#include <fnmatch.h> +#include <sys/time.h> +#include <stdlib.h> +#include <signal.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "afr.h" +#include "dict.h" +#include "xlator.h" +#include "hashfn.h" +#include "logging.h" +#include "stack.h" +#include "list.h" +#include "call-stub.h" +#include "defaults.h" +#include "common-utils.h" +#include "compat-errno.h" +#include "compat.h" + +#include "afr.h" + + +/** + * Common algorithm for inode read calls: + *  + * - Try the fop on the first child that is up + * - if we have failed due to ENOTCONN: + *     try the next child + * + * Applicable to: access, stat, fstat, readlink, getxattr + */ + +/* {{{ access */ + +int32_t +afr_access_cbk (call_frame_t *frame, void *cookie, +		xlator_t *this, int32_t op_ret, int32_t op_errno) +{ +	afr_private_t * priv     = NULL; +	afr_local_t *   local    = NULL; +	xlator_t **     children = NULL; + +	int unwind     = 1; +	int last_tried = -1; +	int this_try = -1; + +	priv     = this->private; +	children = priv->children; + +	local = frame->local; + +	if (op_ret == -1) { +		last_tried = local->cont.access.last_tried; + +		if (all_tried (last_tried, priv->child_count)) { +			goto out; +		} +		this_try    = ++local->cont.access.last_tried; + +		unwind = 0; + +		STACK_WIND_COOKIE (frame, afr_access_cbk, +				   (void *) (long) this_try, +				   children[this_try],  +				   children[this_try]->fops->access, +				   &local->loc, local->cont.access.mask); +	} + +out: +	if (unwind) { +		AFR_STACK_UNWIND (frame, op_ret, op_errno); +	} + +	return 0; +} + + +int32_t +afr_access (call_frame_t *frame, xlator_t *this, +	    loc_t *loc, int32_t mask) +{ +	afr_private_t * priv       = NULL; +	xlator_t **     children   = NULL; +	int             call_child = 0; +	afr_local_t     *local     = NULL; + +	int32_t op_ret   = -1; +	int32_t op_errno = 0; + +	VALIDATE_OR_GOTO (frame, out); +	
VALIDATE_OR_GOTO (this, out); +	VALIDATE_OR_GOTO (this->private, out); + +	priv     = this->private; +	VALIDATE_OR_GOTO (priv->children, out); + +	children = priv->children; + +	ALLOC_OR_GOTO (local, afr_local_t, out); + +	call_child = afr_first_up_child (priv); +	if (call_child == -1) { +		op_errno = ENOTCONN; +		gf_log (this->name, GF_LOG_ERROR, +			"no child is up :("); +		goto out; +	} + +	local->cont.access.last_tried = call_child; +	loc_copy (&local->loc, loc); +	local->cont.access.mask       = mask; + +	STACK_WIND_COOKIE (frame, afr_access_cbk, +			   (void *) (long) call_child, +			   children[call_child], children[call_child]->fops->access, +			   loc, mask); + +	op_ret = 0; +out: +	if (op_ret == -1) { +		AFR_STACK_UNWIND (frame, op_ret, op_errno); +	} +	return 0; +} + + +/* }}} */ + +/* {{{ stat */ + +int32_t +afr_stat_cbk (call_frame_t *frame, void *cookie, +	      xlator_t *this, int32_t op_ret, int32_t op_errno, +	      struct stat *buf) +{ +	afr_private_t * priv     = NULL; +	afr_local_t *   local    = NULL; +	xlator_t **     children = NULL; + +	int deitransform_child = -1; + +	int unwind     = 1; +	int last_tried = -1; +	int this_try = -1; + +	priv     = this->private; +	children = priv->children; + +	deitransform_child = (long) cookie; + +	local = frame->local; + +	if (op_ret == -1) { +	retry: +		last_tried = local->cont.stat.last_tried; + +		if (all_tried (last_tried, priv->child_count)) { +			goto out; +		} +		this_try = ++local->cont.stat.last_tried; + +		if (this_try == deitransform_child) { +			goto retry; +		} + +		unwind = 0; + +		STACK_WIND_COOKIE (frame, afr_stat_cbk, +				   (void *) (long) deitransform_child, +				   children[this_try],  +				   children[this_try]->fops->stat, +				   &local->loc); +	} + +out: +	if (unwind) { +		if (op_ret != -1) +			buf->st_ino = local->cont.stat.ino; + +		AFR_STACK_UNWIND (frame, op_ret, op_errno, buf); +	} + +	return 0; +} + + +int32_t +afr_stat (call_frame_t *frame, xlator_t *this, +	  loc_t *loc) +{ 
+	afr_private_t * priv       = NULL; +	afr_local_t   * local      = NULL; +	xlator_t **     children   = NULL; + +	int             call_child = 0; + +	int32_t         op_ret     = -1; +	int32_t         op_errno   = 0; + +	VALIDATE_OR_GOTO (frame, out); +	VALIDATE_OR_GOTO (this, out); +	VALIDATE_OR_GOTO (this->private, out); + +	priv     = this->private; +	VALIDATE_OR_GOTO (priv->children, out); + +	children = priv->children; + +	ALLOC_OR_GOTO (local, afr_local_t, out); + +	frame->local = local; + +	call_child = afr_deitransform (loc->inode->ino, priv->child_count); +	loc_copy (&local->loc, loc); + +	/*  +	   if stat fails from the deitranform'd child, we try +	   all children starting with the first one +	*/ +	local->cont.stat.last_tried = -1; +	local->cont.stat.ino = loc->inode->ino; + +	STACK_WIND_COOKIE (frame, afr_stat_cbk, (void *) (long) call_child, +			   children[call_child], +			   children[call_child]->fops->stat, +			   loc); + +	op_ret = 0; +out: +	if (op_ret == -1) { +		AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); +	} + +	return 0; +} + + +/* }}} */ + +/* {{{ fstat */ + +int32_t +afr_fstat_cbk (call_frame_t *frame, void *cookie, +	       xlator_t *this, int32_t op_ret, int32_t op_errno, +	       struct stat *buf) +{ +	afr_private_t * priv     = NULL; +	afr_local_t *   local    = NULL; +	xlator_t **     children = NULL; + +	int deitransform_child = -1; + +	int unwind     = 1; +	int last_tried = -1; +	int this_try = -1; + +	priv     = this->private; +	children = priv->children; + +	deitransform_child = (long) cookie; + +	local = frame->local; + +	if (op_ret == -1) { +	retry: +		last_tried = local->cont.fstat.last_tried; + +		if (all_tried (last_tried, priv->child_count)) { +			goto out; +		} +		this_try   = ++local->cont.fstat.last_tried; + +		if (this_try == deitransform_child) { +			/*  +			   skip the deitransform'd child since if we are here +			   we must have already tried that child +			*/ +			goto retry; +		} +	        + +		unwind = 0; + +		
STACK_WIND_COOKIE (frame, afr_fstat_cbk, +				   (void *) (long) deitransform_child, +				   children[this_try],  +				   children[this_try]->fops->fstat, +				   local->fd); +	} + +out: +	if (unwind) { +		if (op_ret != -1) +			buf->st_ino = local->cont.fstat.ino; + +		AFR_STACK_UNWIND (frame, op_ret, op_errno, buf); +	} + +	return 0; +} + + +int32_t +afr_fstat (call_frame_t *frame, xlator_t *this, +	   fd_t *fd) +{ +	afr_private_t * priv       = NULL; +	afr_local_t   * local      = NULL; +	xlator_t **     children   = NULL; + +	int             call_child = 0; + +	int32_t         op_ret     = -1; +	int32_t         op_errno   = 0; + +	VALIDATE_OR_GOTO (frame, out); +	VALIDATE_OR_GOTO (this, out); +	VALIDATE_OR_GOTO (fd, out); +	VALIDATE_OR_GOTO (this->private, out); + +	priv     = this->private; +	VALIDATE_OR_GOTO (priv->children, out); + +	children = priv->children; + +	ALLOC_OR_GOTO (local, afr_local_t, out); + +	frame->local = local; + +	VALIDATE_OR_GOTO (fd->inode, out); + +	call_child = afr_deitransform (fd->inode->ino, priv->child_count); + +	/*  +	   if fstat fails from the deitranform'd child, we try +	   all children starting with the first one +	*/ +	local->cont.fstat.last_tried = -1; +	local->cont.fstat.ino = fd->inode->ino; +	local->fd = fd_ref (fd); + +	STACK_WIND_COOKIE (frame, afr_fstat_cbk, (void *) (long) call_child, +			   children[call_child], +			   children[call_child]->fops->fstat, +			   fd); + +	op_ret = 0; +out: +	if (op_ret == -1) { +		AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); +	} + +	return 0; +} + +/* }}} */ + +/* {{{ readlink */ + +int32_t +afr_readlink_cbk (call_frame_t *frame, void *cookie, +		  xlator_t *this, int32_t op_ret, int32_t op_errno, +		  const char *buf) +{ +	afr_private_t * priv     = NULL; +	afr_local_t *   local    = NULL; +	xlator_t **     children = NULL; + +	int unwind     = 1; +	int last_tried = -1; +	int this_try = -1; + +	priv     = this->private; +	children = priv->children; + +	local = frame->local; + +	
if (op_ret == -1) { +		last_tried = local->cont.readlink.last_tried; + +		if (all_tried (last_tried, priv->child_count)) { +			goto out; +		} +		this_try = ++local->cont.readlink.last_tried; + +		unwind = 0; +		STACK_WIND_COOKIE (frame, afr_readlink_cbk, +				   (void *) (long) this_try, +				   children[this_try],  +				   children[this_try]->fops->readlink, +				   &local->loc, +				   local->cont.readlink.size); +	} + +out: +	if (unwind) { +		AFR_STACK_UNWIND (frame, op_ret, op_errno, buf); +	} + +	return 0; +} + + +int32_t +afr_readlink (call_frame_t *frame, xlator_t *this, +	      loc_t *loc, size_t size) +{ +	afr_private_t * priv       = NULL; +	xlator_t **     children   = NULL; +	int             call_child = 0; +	afr_local_t     *local     = NULL; + +	int32_t op_ret   = -1; +	int32_t op_errno = 0; + +	VALIDATE_OR_GOTO (frame, out); +	VALIDATE_OR_GOTO (this, out); +	VALIDATE_OR_GOTO (this->private, out); + +	priv     = this->private; +	VALIDATE_OR_GOTO (priv->children, out); + +	children = priv->children; + +	ALLOC_OR_GOTO (local, afr_local_t, out); + +	frame->local = local; + +	call_child = afr_first_up_child (priv); +	if (call_child == -1) { +		op_errno = ENOTCONN; +		gf_log (this->name, GF_LOG_ERROR, +			"no child is up :("); +		goto out; +	} + +	local->cont.readlink.last_tried = call_child; +	loc_copy (&local->loc, loc); +	local->cont.readlink.size       = size; + +	STACK_WIND_COOKIE (frame, afr_readlink_cbk, +			   (void *) (long) call_child, +			   children[call_child], children[call_child]->fops->readlink, +			   loc, size); + +	op_ret = 0; +out: +	if (op_ret == -1) { +		AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); +	} +	return 0; +} + + +/* }}} */ + +/* {{{ getxattr */ + +int32_t +afr_getxattr_cbk (call_frame_t *frame, void *cookie, +		  xlator_t *this, int32_t op_ret, int32_t op_errno, +		  dict_t *dict) +{ +	afr_private_t * priv     = NULL; +	afr_local_t *   local    = NULL; +	xlator_t **     children = NULL; + +	int unwind     = 1; +	int 
last_tried = -1; +	int this_try = -1; + +	priv     = this->private; +	children = priv->children; + +	local = frame->local; + +	if (op_ret == -1) { +		last_tried = local->cont.getxattr.last_tried; + +		if (all_tried (last_tried, priv->child_count)) { +			goto out; +		} +		this_try = ++local->cont.getxattr.last_tried; + +		unwind = 0; +		STACK_WIND_COOKIE (frame, afr_getxattr_cbk, +				   (void *) (long) this_try, +				   children[this_try],  +				   children[this_try]->fops->getxattr, +				   &local->loc, +				   local->cont.getxattr.name); +	} + +out: +	if (unwind) { +		AFR_STACK_UNWIND (frame, op_ret, op_errno, dict); +	} + +	return 0; +} + + +int32_t +afr_getxattr (call_frame_t *frame, xlator_t *this, +	      loc_t *loc, const char *name) +{ +	afr_private_t *   priv       = NULL; +	xlator_t **       children   = NULL; +	int               call_child = 0; +	afr_local_t     * local      = NULL; + +	int32_t op_ret   = -1; +	int32_t op_errno = 0; + +	VALIDATE_OR_GOTO (frame, out); +	VALIDATE_OR_GOTO (this, out); +	VALIDATE_OR_GOTO (this->private, out); + +	priv     = this->private; +	VALIDATE_OR_GOTO (priv->children, out); + +	children = priv->children; + +	ALLOC_OR_GOTO (local, afr_local_t, out); +	frame->local = local; + +	call_child = afr_first_up_child (priv); +	if (call_child == -1) { +		op_errno = ENOTCONN; +		gf_log (this->name, GF_LOG_ERROR, +			"no child is up :("); +		goto out; +	} + +	local->cont.getxattr.last_tried = call_child; +	loc_copy (&local->loc, loc); +	if (name) +	  local->cont.getxattr.name       = strdup (name); + +	STACK_WIND_COOKIE (frame, afr_getxattr_cbk, +			   (void *) (long) call_child, +			   children[call_child], children[call_child]->fops->getxattr, +			   loc, name); + +	op_ret = 0; +out: +	if (op_ret == -1) { +		AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); +	} +	return 0; +} + + +/* }}} */ + +/* {{{ readv */ + +/** + * read algorithm: + *  + * if the user has specified a read subvolume, use it + * otherwise - + *   use the inode 
number to hash it to one of the subvolumes, and + *   read from there (to balance read load) + * + * if any of the above read's fail, try the children in sequence + * beginning at the beginning + */ +  +int32_t +afr_readv_cbk (call_frame_t *frame, void *cookie, +	       xlator_t *this, int32_t op_ret, int32_t op_errno, +	       struct iovec *vector, int32_t count, struct stat *buf) +{ +	afr_private_t * priv     = NULL; +	afr_local_t *   local    = NULL; +	xlator_t **     children = NULL; + +	int unwind     = 1; +	int last_tried = -1; +	int this_try = -1; + +	VALIDATE_OR_GOTO (frame, out); +	VALIDATE_OR_GOTO (this, out); +	VALIDATE_OR_GOTO (this->private, out); + +	priv     = this->private; +	VALIDATE_OR_GOTO (priv->children, out); + +	children = priv->children; + +	local = frame->local; + +	if (op_ret == -1) { +	retry: +		last_tried = local->cont.readv.last_tried; + +		if (all_tried (last_tried, priv->child_count)) { +			goto out; +		} +		this_try = ++local->cont.readv.last_tried; + +		if (this_try == priv->read_child) { +			/*  +			   skip the read child since if we are here +			   we must have already tried that child +			*/ +			goto retry; +		} + +		unwind = 0; + +		STACK_WIND_COOKIE (frame, afr_readv_cbk, +				   (void *) (long) this_try, +				   children[this_try],  +				   children[this_try]->fops->readv, +				   local->fd, local->cont.readv.size, +				   local->cont.readv.offset); +	} + +out: +	if (unwind) { +		AFR_STACK_UNWIND (frame, op_ret, op_errno, vector, count, buf); +	} + +	return 0; +} + + +int32_t +afr_readv (call_frame_t *frame, xlator_t *this, +	   fd_t *fd, size_t size, off_t offset) +{ +	afr_private_t * priv       = NULL; +	afr_local_t   * local      = NULL; +	xlator_t **     children   = NULL; + +	int             call_child = 0; + +	int32_t         op_ret     = -1; +	int32_t         op_errno   = 0; + +	VALIDATE_OR_GOTO (frame, out); +	VALIDATE_OR_GOTO (this, out); +	VALIDATE_OR_GOTO (this->private, out); +	VALIDATE_OR_GOTO (fd, out); + +	priv  
   = this->private; +	children = priv->children; + +	ALLOC_OR_GOTO (local, afr_local_t, out); + +	frame->local = local; + +	if (priv->read_child != -1) { +		call_child = priv->read_child; + +		/*  +		   if read fails from the read child, we try +		   all children starting with the first one +		*/ +		local->cont.readv.last_tried = -1; +	} else { +		call_child = afr_first_up_child (priv); +		if (call_child == -1) { +			op_errno = ENOTCONN; +			gf_log (this->name, GF_LOG_ERROR, +				"no child is up :("); +			goto out; +		} + +		local->cont.readv.last_tried = call_child; +	} + +	local->fd                    = fd_ref (fd); + +	local->cont.readv.size       = size; +	local->cont.readv.offset     = offset; + +	STACK_WIND_COOKIE (frame, afr_readv_cbk, +			   (void *) (long) call_child, +			   children[call_child], +			   children[call_child]->fops->readv, +			   fd, size, offset); + +	op_ret = 0; +out: +	if (op_ret == -1) { +		AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL, 0, NULL); +	} +	return 0; +} + +/* }}} */ diff --git a/xlators/cluster/afr/src/afr-inode-read.h b/xlators/cluster/afr/src/afr-inode-read.h new file mode 100644 index 00000000000..6b3bd2da850 --- /dev/null +++ b/xlators/cluster/afr/src/afr-inode-read.h @@ -0,0 +1,47 @@ +/* +   Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> +   This file is part of GlusterFS. + +   GlusterFS is free software; you can redistribute it and/or modify +   it under the terms of the GNU General Public License as published +   by the Free Software Foundation; either version 3 of the License, +   or (at your option) any later version. + +   GlusterFS is distributed in the hope that it will be useful, but +   WITHOUT ANY WARRANTY; without even the implied warranty of +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU +   General Public License for more details. + +   You should have received a copy of the GNU General Public License +   along with this program.  
If not, see +   <http://www.gnu.org/licenses/>. +*/ + +#ifndef __INODE_READ_H__ +#define __INODE_READ_H__ + +int32_t +afr_access (call_frame_t *frame, xlator_t *this, +	    loc_t *loc, int32_t mask); + +int32_t +afr_stat (call_frame_t *frame, xlator_t *this, +	  loc_t *loc); + +int32_t +afr_fstat (call_frame_t *frame, xlator_t *this, +	   fd_t *fd); + +int32_t +afr_readlink (call_frame_t *frame, xlator_t *this, +	      loc_t *loc, size_t size); + +int32_t +afr_readv (call_frame_t *frame, xlator_t *this, +	   fd_t *fd, size_t size, off_t offset); + +int32_t +afr_getxattr (call_frame_t *frame, xlator_t *this, +	      loc_t *loc, const char *name); + +#endif /* __INODE_READ_H__ */ diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c new file mode 100644 index 00000000000..267350b2c4a --- /dev/null +++ b/xlators/cluster/afr/src/afr-inode-write.c @@ -0,0 +1,2024 @@ +/* +   Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> +   This file is part of GlusterFS. + +   GlusterFS is free software; you can redistribute it and/or modify +   it under the terms of the GNU General Public License as published +   by the Free Software Foundation; either version 3 of the License, +   or (at your option) any later version. + +   GlusterFS is distributed in the hope that it will be useful, but +   WITHOUT ANY WARRANTY; without even the implied warranty of +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU +   General Public License for more details. + +   You should have received a copy of the GNU General Public License +   along with this program.  If not, see +   <http://www.gnu.org/licenses/>. 
+*/ + + +#include <libgen.h> +#include <unistd.h> +#include <fnmatch.h> +#include <sys/time.h> +#include <stdlib.h> +#include <signal.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "afr.h" +#include "dict.h" +#include "xlator.h" +#include "hashfn.h" +#include "logging.h" +#include "stack.h" +#include "list.h" +#include "call-stub.h" +#include "defaults.h" +#include "common-utils.h" +#include "compat-errno.h" +#include "compat.h" + +#include "afr.h" +#include "afr-transaction.h" + + +/* {{{ chmod */ + + +int +afr_chmod_unwind (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t *   local = NULL; +	afr_private_t * priv  = NULL; +	call_frame_t   *main_frame = NULL; + +	local = frame->local; +	priv  = this->private; + +	LOCK (&frame->lock); +	{ +		if (local->transaction.main_frame) +			main_frame = local->transaction.main_frame; +		local->transaction.main_frame = NULL; +	} +	UNLOCK (&frame->lock); + +	if (main_frame) { +		local->cont.chmod.buf.st_ino = local->cont.chmod.ino; +		AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, +				  &local->cont.chmod.buf); +	} +	return 0; +} + + +int +afr_chmod_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,  +		    int32_t op_ret, int32_t op_errno, struct stat *buf) +{ +	afr_local_t *   local = NULL; +	afr_private_t * priv  = NULL; + +	int call_count  = -1; +	int child_index = (long) cookie; +	int need_unwind = 0; + +	local = frame->local; +	priv  = this->private; + +	LOCK (&frame->lock); +	{ +		if (child_went_down (op_ret, op_errno)) +			afr_transaction_child_died (frame, this, child_index); + +		if (op_ret != -1) { +			if (local->success_count == 0) { +				local->op_ret = op_ret; +				local->cont.chmod.buf = *buf; +			} +			local->success_count++; + +			if (local->success_count == priv->wait_count) { +				need_unwind = 1; +			} +		} + +		local->op_errno = op_errno; +	} +	UNLOCK (&frame->lock); + +	if (need_unwind) +		afr_chmod_unwind (frame, 
this); + +	call_count = afr_frame_return (frame); + +	if (call_count == 0) { +		local->transaction.resume (frame, this); +	} +	 +	return 0; +} + + +int +afr_chmod_wind (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t *   local = NULL; +	afr_private_t * priv  = NULL; +	 +	int i = 0; +	int call_count = -1; + +	local = frame->local; +	priv  = this->private; + +	call_count = afr_up_children_count (priv->child_count, local->child_up); + +	if (call_count == 0) { +		local->transaction.resume (frame, this); +		return 0; +	} + +	local->call_count = call_count; + +	for (i = 0; i < priv->child_count; i++) {				 +		if (local->child_up[i]) { +			STACK_WIND_COOKIE (frame, afr_chmod_wind_cbk, (void *) (long) i, +					   priv->children[i],  +					   priv->children[i]->fops->chmod, +					   &local->loc,  +					   local->cont.chmod.mode);  +		 +			if (!--call_count) +				break; +		} +	} +	 +	return 0; +} + + +int +afr_chmod_done (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t * local = NULL; + +	local = frame->local; + +	local->transaction.unwind (frame, this); + +	AFR_STACK_DESTROY (frame); +	 +	return 0; +} + + +int32_t +afr_chmod (call_frame_t *frame, xlator_t *this, +	   loc_t *loc, mode_t mode) +{ +	afr_private_t * priv  = NULL; +	afr_local_t   * local = NULL; +	call_frame_t  * transaction_frame = NULL; + +	int ret = -1; + +	int op_ret   = -1; +	int op_errno = 0; + +	VALIDATE_OR_GOTO (frame, out); +	VALIDATE_OR_GOTO (this, out); +	VALIDATE_OR_GOTO (this->private, out); + +	priv = this->private; + +	transaction_frame = copy_frame (frame); +	if (!transaction_frame) { +		gf_log (this->name, GF_LOG_ERROR, +			"out of memory :("); +		goto out; +	} + +	ALLOC_OR_GOTO (local, afr_local_t, out); +	ret = AFR_LOCAL_INIT (local, priv); +	if (ret < 0) { +		op_errno = -ret; +		goto out; +	} + +	transaction_frame->local = local; + +	local->cont.chmod.mode = mode; +	local->cont.chmod.ino  = loc->inode->ino; + +	local->transaction.fop    = afr_chmod_wind; +	local->transaction.done  
 = afr_chmod_done; +	local->transaction.unwind = afr_chmod_unwind; + +	loc_copy (&local->loc, loc); +	 +	local->transaction.main_frame = frame; +	local->transaction.start   = 0; +	local->transaction.len     = 0; +	local->transaction.pending = AFR_METADATA_PENDING; + +	afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + +	op_ret = 0; +out: +	if (op_ret == -1) { +		if (transaction_frame) +			AFR_STACK_DESTROY (transaction_frame); +		AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); +	} + +	return 0; +} + +/* }}} */ + + +/* {{{ fchmod */ + +int +afr_fchmod_unwind (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t *   local = NULL; +	afr_private_t * priv  = NULL; +	call_frame_t   *main_frame = NULL; + +	local = frame->local; +	priv  = this->private; + +	LOCK (&frame->lock); +	{ +		if (local->transaction.main_frame) +			main_frame = local->transaction.main_frame; +		local->transaction.main_frame = NULL; +	} +	UNLOCK (&frame->lock); + +	if (main_frame) { +		local->cont.fchmod.buf.st_ino = local->cont.fchmod.ino; +		AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, +				  &local->cont.fchmod.buf); +	} +	return 0; +} + + +int +afr_fchmod_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,  +		    int32_t op_ret, int32_t op_errno, struct stat *buf) +{ +	afr_local_t *   local = NULL; +	afr_private_t * priv  = NULL; + +	int call_count  = -1; +	int child_index = (long) cookie; +	int need_unwind = 0; + +	local = frame->local; +	priv  = this->private; + +	LOCK (&frame->lock); +	{ +		if (child_went_down (op_ret, op_errno)) +			afr_transaction_child_died (frame, this, child_index); + +		if (op_ret != -1) { +			if (local->success_count == 0) { +				local->op_ret = op_ret; +				local->cont.fchmod.buf = *buf; +			} +			local->success_count++; + +			if (local->success_count == priv->wait_count) { +				need_unwind = 1; +			} +		} + +		local->op_errno = op_errno; +	} +	UNLOCK (&frame->lock); + +	if (need_unwind) +		afr_fchmod_unwind (frame, 
this); + +	call_count = afr_frame_return (frame); + +	if (call_count == 0) { +		local->transaction.resume (frame, this); +	} +	 +	return 0; +} + + +int +afr_fchmod_wind (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t *   local = NULL; +	afr_private_t * priv  = NULL; +	 +	int i = 0; +	int call_count = -1; + +	local = frame->local; +	priv  = this->private; + +	call_count = afr_up_children_count (priv->child_count, local->child_up); + +	if (call_count == 0) { +		local->transaction.resume (frame, this); +		return 0; +	} + +	local->call_count = call_count; + +	for (i = 0; i < priv->child_count; i++) {				 +		if (local->child_up[i]) { +			STACK_WIND_COOKIE (frame, afr_fchmod_wind_cbk, (void *) (long) i, +					   priv->children[i],  +					   priv->children[i]->fops->fchmod, +					   local->fd,  +					   local->cont.fchmod.mode);  +		 +			if (!--call_count) +				break; +		} +	} +	 +	return 0; +} + + +int +afr_fchmod_done (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t * local = NULL; + +	local = frame->local; + +	local->transaction.unwind (frame, this); + +	AFR_STACK_DESTROY (frame); +	 +	return 0; +} + + +int32_t +afr_fchmod (call_frame_t *frame, xlator_t *this, +	    fd_t *fd, mode_t mode) +{ +	afr_private_t * priv  = NULL; +	afr_local_t   * local = NULL; +	call_frame_t  * transaction_frame = NULL; + +	int ret = -1; + +	int op_ret   = -1; +	int op_errno = 0; + +	VALIDATE_OR_GOTO (frame, out); +	VALIDATE_OR_GOTO (this, out); +	VALIDATE_OR_GOTO (this->private, out); + +	priv = this->private; + +	transaction_frame = copy_frame (frame); +	if (!transaction_frame) { +		gf_log (this->name, GF_LOG_ERROR, +			"out of memory :("); +		goto out; +	} + +	ALLOC_OR_GOTO (local, afr_local_t, out); +	ret = AFR_LOCAL_INIT (local, priv); +	if (ret < 0) { +		op_errno = -ret; +		goto out; +	} + +	transaction_frame->local = local; + +	local->cont.fchmod.mode = mode; +	local->cont.fchmod.ino  = fd->inode->ino; + +	local->transaction.fop    = afr_fchmod_wind; +	
local->transaction.done   = afr_fchmod_done; +	local->transaction.unwind = afr_fchmod_unwind; + +	local->fd = fd_ref (fd); +	 +	local->transaction.main_frame = frame; +	local->transaction.start   = 0; +	local->transaction.len     = 0; +	local->transaction.pending = AFR_METADATA_PENDING; + +	afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + +	op_ret = 0; +out: +	if (op_ret == -1) { +		if (transaction_frame) +			AFR_STACK_DESTROY (transaction_frame); +		AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); +	} + +	return 0; +} + +/* }}} */ + +/* {{{ chown */ + +int +afr_chown_unwind (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t *   local = NULL; +	afr_private_t * priv  = NULL; +	call_frame_t   *main_frame = NULL; + +	local = frame->local; +	priv  = this->private; + +	LOCK (&frame->lock); +	{ +		if (local->transaction.main_frame) +			main_frame = local->transaction.main_frame; +		local->transaction.main_frame = NULL; +	} +	UNLOCK (&frame->lock); + +	if (main_frame) { +		local->cont.chown.buf.st_ino = local->cont.chown.ino; +		AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, +				  &local->cont.chown.buf); +	} +	return 0; +} + + +int +afr_chown_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,  +		    int32_t op_ret, int32_t op_errno, struct stat *buf) +{ +	afr_local_t *   local = NULL; +	afr_private_t * priv  = NULL; +  +	int call_count  = -1; +	int child_index = (long) cookie; +	int need_unwind = 0; + +	local = frame->local; +	priv = this->private; + +	LOCK (&frame->lock); +	{ +		if (child_went_down (op_ret, op_errno)) +			afr_transaction_child_died (frame, this, child_index); + +		if (op_ret != -1) { +			if (local->success_count == 0) { +				local->op_ret = op_ret; +				local->cont.chown.buf = *buf; +			} +			local->success_count++; + +			if (local->success_count == priv->wait_count) { +				need_unwind = 1; +			} +		} + +		local->op_errno = op_errno; +	} +	UNLOCK (&frame->lock); + +	if (need_unwind) { +		
local->transaction.unwind (frame, this); +	} + +	call_count = afr_frame_return (frame); + +	if (call_count == 0) { +		local->transaction.resume (frame, this); +	} +	 +	return 0; +} + + +int +afr_chown_wind (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t *   local = NULL; +	afr_private_t * priv  = NULL; + +	int call_count = -1; +	int i = 0; + +	local = frame->local; +	priv  = this->private; + +	call_count = afr_up_children_count (priv->child_count, local->child_up); + +	if (call_count == 0) { +		local->transaction.resume (frame, this); +		return 0; +	} + +	local->call_count = call_count; + +	for (i = 0; i < priv->child_count; i++) {				 +		if (local->child_up[i]) { +			STACK_WIND_COOKIE (frame, afr_chown_wind_cbk, (void *) (long) i, +					   priv->children[i],  +					   priv->children[i]->fops->chown, +					   &local->loc, local->cont.chown.uid, +					   local->cont.chown.gid);  + +			if (!--call_count) +				break; +		} +	} +	 +	return 0; +} + + +int +afr_chown_done (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t *local = NULL; + +	local = frame->local; + +	local->transaction.unwind (frame, this); + +	AFR_STACK_DESTROY (frame); + +	return 0; +} + + +int +afr_chown (call_frame_t *frame, xlator_t *this, +	   loc_t *loc, uid_t uid, gid_t gid) +{ +	afr_private_t * priv  = NULL; +	afr_local_t   * local = NULL; +	call_frame_t   *transaction_frame = NULL; + +	int ret = -1; + +	int op_ret   = -1; +	int op_errno = 0; + +	VALIDATE_OR_GOTO (frame, out); +	VALIDATE_OR_GOTO (this, out); +	VALIDATE_OR_GOTO (this->private, out); + +	priv = this->private; + +	transaction_frame = copy_frame (frame); +	if (!transaction_frame) { +		gf_log (this->name, GF_LOG_ERROR, +			"out of memory :("); +		goto out; +	} + +	ALLOC_OR_GOTO (local, afr_local_t, out); + +	ret = AFR_LOCAL_INIT (local, priv); +	if (ret < 0) { +		op_errno = -ret; +		goto out; +	} + +	transaction_frame->local = local; + +	local->cont.chown.uid  = uid; +	local->cont.chown.gid  = gid; +	local->cont.chown.ino  = 
loc->inode->ino; + +	local->transaction.fop    = afr_chown_wind; +	local->transaction.done   = afr_chown_done; +	local->transaction.unwind = afr_chown_unwind; + +	loc_copy (&local->loc, loc); + +	local->transaction.main_frame = frame; +	local->transaction.start   = 0; +	local->transaction.len     = 0; +	local->transaction.pending = AFR_METADATA_PENDING; + +	afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + +	op_ret = 0; +out: +	if (op_ret == -1) { +		if (transaction_frame) +			AFR_STACK_DESTROY (transaction_frame); +		AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); +	} + +	return 0; +} + + +/* }}} */ + +/* {{{ fchown */ + +/* + * Unwind the caller's frame for an fchown transaction, at most once. + * transaction.main_frame is claimed and cleared under frame->lock, so a + * later call finds it NULL and does nothing. The inode number saved in + * cont.fchown.ino is written into the stat buf before it is handed back + * together with local->op_ret and local->op_errno. + */ +int +afr_fchown_unwind (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t *   local = NULL; +	afr_private_t * priv  = NULL; +	call_frame_t   *main_frame = NULL; + +	local = frame->local; +	priv  = this->private; + +	LOCK (&frame->lock); +	{ +		if (local->transaction.main_frame) +			main_frame = local->transaction.main_frame; +		local->transaction.main_frame = NULL; +	} +	UNLOCK (&frame->lock); + +	if (main_frame) { +		local->cont.fchown.buf.st_ino = local->cont.fchown.ino; +		AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, +				  &local->cont.fchown.buf); +	} +	return 0; +} + + +int +afr_fchown_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,  +		    int32_t op_ret, int32_t op_errno, struct stat *buf) +{ +	afr_local_t *   local = NULL; +	afr_private_t * priv  = NULL; +  +	int call_count  = -1; +	int child_index = (long) cookie; +	int need_unwind = 0; + +	local = frame->local; +	priv = this->private; + +	LOCK (&frame->lock); +	{ +		if (child_went_down (op_ret, op_errno)) +			afr_transaction_child_died (frame, this, child_index); + +		if (op_ret != -1) { +			if (local->success_count == 0) { +				local->op_ret = op_ret; +				local->cont.fchown.buf = *buf; +			} +			local->success_count++; + +			if (local->success_count == priv->wait_count) { +				need_unwind = 1; +			} +		} + +		local->op_errno = 
op_errno; +	} +	UNLOCK (&frame->lock); + +	if (need_unwind) { +		local->transaction.unwind (frame, this); +	} + +	call_count = afr_frame_return (frame); + +	if (call_count == 0) { +		local->transaction.resume (frame, this); +	} +	 +	return 0; +} + + +int +afr_fchown_wind (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t *   local = NULL; +	afr_private_t * priv  = NULL; + +	int call_count = -1; +	int i = 0; + +	local = frame->local; +	priv  = this->private; + +	call_count = afr_up_children_count (priv->child_count, local->child_up); + +	if (call_count == 0) { +		local->transaction.resume (frame, this); +		return 0; +	} + +	local->call_count = call_count; + +	for (i = 0; i < priv->child_count; i++) {				 +		if (local->child_up[i]) { +			STACK_WIND_COOKIE (frame, afr_fchown_wind_cbk, (void *) (long) i, +					   priv->children[i],  +					   priv->children[i]->fops->fchown, +					   local->fd, local->cont.fchown.uid, +					   local->cont.fchown.gid);  + +			if (!--call_count) +				break; +		} +	} +	 +	return 0; +} + + +int +afr_fchown_done (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t *local = NULL; + +	local = frame->local; + +	local->transaction.unwind (frame, this); + +	AFR_STACK_DESTROY (frame); + +	return 0; +} + + +int +afr_fchown (call_frame_t *frame, xlator_t *this, +	    fd_t *fd, uid_t uid, gid_t gid) +{ +	afr_private_t * priv  = NULL; +	afr_local_t   * local = NULL; +	call_frame_t   *transaction_frame = NULL; + +	int ret = -1; + +	int op_ret   = -1; +	int op_errno = 0; + +	VALIDATE_OR_GOTO (frame, out); +	VALIDATE_OR_GOTO (this, out); +	VALIDATE_OR_GOTO (this->private, out); + +	priv = this->private; + +	transaction_frame = copy_frame (frame); +	if (!transaction_frame) { +		gf_log (this->name, GF_LOG_ERROR, +			"out of memory :("); +		goto out; +	} + +	ALLOC_OR_GOTO (local, afr_local_t, out); + +	ret = AFR_LOCAL_INIT (local, priv); +	if (ret < 0) { +		op_errno = -ret; +		goto out; +	} + +	transaction_frame->local = local; + +	
local->cont.fchown.uid  = uid; +	local->cont.fchown.gid  = gid; +	local->cont.fchown.ino  = fd->inode->ino; + +	local->transaction.fop    = afr_fchown_wind; +	local->transaction.done   = afr_fchown_done; +	local->transaction.unwind = afr_fchown_unwind; + +	local->fd = fd_ref (fd); + +	local->transaction.main_frame = frame; +	local->transaction.start   = 0; +	local->transaction.len     = 0; +	local->transaction.pending = AFR_METADATA_PENDING; + +	afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + +	op_ret = 0; +out: +	if (op_ret == -1) { +		if (transaction_frame) +			AFR_STACK_DESTROY (transaction_frame); +		AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); +	} + +	return 0; +} + +/* }}} */ + +/* {{{ writev */ + +int +afr_writev_unwind (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t *   local = NULL; +	afr_private_t * priv  = NULL; +	call_frame_t   *main_frame = NULL; + +	local = frame->local; +	priv  = this->private; + +	LOCK (&frame->lock); +	{ +		if (local->transaction.main_frame) +			main_frame = local->transaction.main_frame; +		local->transaction.main_frame = NULL; +	} +	UNLOCK (&frame->lock); + +	if (main_frame) { +		local->cont.writev.buf.st_ino = local->cont.writev.ino; +		AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, +				  &local->cont.writev.buf); +	} +	return 0; +} + + +int +afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,  +		     int32_t op_ret, int32_t op_errno, struct stat *buf) +{ +	afr_local_t *   local = NULL; +	afr_private_t * priv  = NULL; + +	int child_index = (long) cookie; +	int call_count  = -1; +	int need_unwind = 0; + +	local = frame->local; +	priv = this->private; + +	LOCK (&frame->lock); +	{ +		if (child_went_down (op_ret, op_errno)) +			afr_transaction_child_died (frame, this, child_index); + +		if (op_ret != -1) { +			if (local->success_count == 0) { +				local->op_ret   = op_ret; +				local->cont.writev.buf = *buf; +			} +			local->success_count++; + +			if 
(local->success_count == priv->wait_count) { +				need_unwind = 1; +			} +		} + +		local->op_errno = op_errno; +	} +	UNLOCK (&frame->lock); + +	call_count = afr_frame_return (frame); + +	if (call_count == 0) { +		local->transaction.unwind (frame, this); + +		local->transaction.resume (frame, this); +	} +	 +	return 0; +} + + +int +afr_writev_wind (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t *local = NULL; +	afr_private_t *priv = NULL; +	 +	int i = 0; +	int call_count = -1; + +	local = frame->local; +	priv = this->private; + +	call_count = afr_up_children_count (priv->child_count, local->child_up); + +	if (call_count == 0) { +		local->transaction.resume (frame, this); +		return 0; +	} + +	local->call_count = call_count; + +	for (i = 0; i < priv->child_count; i++) {				 +		if (local->child_up[i]) { +			STACK_WIND_COOKIE (frame, afr_writev_wind_cbk,  +					   (void *) (long) i,	 +					   priv->children[i],  +					   priv->children[i]->fops->writev, +					   local->fd,  +					   local->cont.writev.vector, +					   local->cont.writev.count,  +					   local->cont.writev.offset);  +		 +			if (!--call_count) +				break; +		} +	} +	 +	return 0; +} + + +int +afr_writev_done (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t *local = NULL; + +	local = frame->local; + +	if (local->cont.writev.refs) +		dict_unref (local->cont.writev.refs); +	local->cont.writev.refs = NULL; + +	local->transaction.unwind (frame, this); + +	AFR_STACK_DESTROY (frame); + +	return 0; +} + + +int +afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,  +	    struct iovec *vector, int32_t count, off_t offset) +{ +	afr_private_t * priv  = NULL; +	afr_local_t   * local = NULL; +	call_frame_t   *transaction_frame = NULL; + +	int ret = -1; + +	int op_ret   = -1; +	int op_errno = 0; + +	VALIDATE_OR_GOTO (frame, out); +	VALIDATE_OR_GOTO (this, out); +	VALIDATE_OR_GOTO (this->private, out); + +	priv = this->private; + +	transaction_frame = copy_frame (frame); +	if (!transaction_frame) { +		
gf_log (this->name, GF_LOG_ERROR, +			"out of memory :("); +		goto out; +	} + +	ALLOC_OR_GOTO (local, afr_local_t, out); + +	ret = AFR_LOCAL_INIT (local, priv); +	if (ret < 0) { +		op_errno = -ret; +		goto out; +	} + +	transaction_frame->local = local; + +	local->op = GF_FOP_WRITE; +	local->cont.writev.vector  = iov_dup (vector, count); +	local->cont.writev.count   = count; +	local->cont.writev.offset  = offset; +	local->cont.writev.ino     = fd->inode->ino; + +	if (frame->root->req_refs) +		local->cont.writev.refs = dict_ref (frame->root->req_refs); + +	local->transaction.fop    = afr_writev_wind; +	local->transaction.done   = afr_writev_done; +	local->transaction.unwind = afr_writev_unwind; + +	local->fd                = fd_ref (fd); + +	local->transaction.main_frame = frame; +	if (fd->flags & O_APPEND) { +		local->transaction.start   = 0; +		local->transaction.len     = 0; +	} else { +		local->transaction.start   = offset; +		local->transaction.len     = iov_length (vector, count); +	} + +	local->transaction.pending = AFR_DATA_PENDING; + +	afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + +	op_ret = 0; +out: +	if (op_ret == -1) { +		if (transaction_frame) +			AFR_STACK_DESTROY (transaction_frame); +		AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); +	} + +	return 0; +} + + +/* }}} */ + +/* {{{ truncate */ + +int +afr_truncate_unwind (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t *   local = NULL; +	afr_private_t * priv  = NULL; +	call_frame_t   *main_frame = NULL; + +	local = frame->local; +	priv  = this->private; + +	LOCK (&frame->lock); +	{ +		if (local->transaction.main_frame) +			main_frame = local->transaction.main_frame; +		local->transaction.main_frame = NULL; +	} +	UNLOCK (&frame->lock); + +	if (main_frame) { +		local->cont.truncate.buf.st_ino = local->cont.truncate.ino; +		AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, +				  &local->cont.truncate.buf); +	} +	return 0; +} + + +int +afr_truncate_wind_cbk 
(call_frame_t *frame, void *cookie, xlator_t *this,  +		       int32_t op_ret, int32_t op_errno, struct stat *buf) +{ +	afr_local_t *   local = NULL; +	afr_private_t * priv  = NULL; + +	int child_index = (long) cookie; +	int call_count  = -1; +	int need_unwind = 0; + +	local = frame->local; +	priv  = this->private; + +	LOCK (&frame->lock); +	{ +		if (child_went_down (op_ret, op_errno)) +			afr_transaction_child_died (frame, this, child_index); + +		if (op_ret != -1) { +			if (local->success_count == 0) { +				local->op_ret = op_ret; +				local->cont.truncate.buf = *buf; +			} +			local->success_count++; + +			if (local->success_count == priv->wait_count) { +				need_unwind = 1; +			} +		} +		local->op_errno = op_errno; +	} +	UNLOCK (&frame->lock); + +	if (need_unwind) +		local->transaction.unwind (frame, this); + +	call_count = afr_frame_return (frame); + +	if (call_count == 0) { +		local->transaction.resume (frame, this); +	} +	 +	return 0; +} + + +int32_t +afr_truncate_wind (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t *local = NULL; +	afr_private_t *priv = NULL; +	 +	int call_count = -1; +	int i = 0; + +	local = frame->local; +	priv = this->private; + +	call_count = afr_up_children_count (priv->child_count, local->child_up); + +	if (call_count == 0) { +		local->transaction.resume (frame, this); +		return 0; +	} + +	local->call_count = call_count; + +	for (i = 0; i < priv->child_count; i++) {				 +		if (local->child_up[i]) { +			STACK_WIND_COOKIE (frame, afr_truncate_wind_cbk, +					   (void *) (long) i,	 +					   priv->children[i],  +					   priv->children[i]->fops->truncate, +					   &local->loc,  +					   local->cont.truncate.offset); + +			if (!--call_count) +				break; +		} +	} +	 +	return 0; +} + + +int +afr_truncate_done (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t *local = NULL; + +	local = frame->local; + +	local->transaction.unwind (frame, this); + +	AFR_STACK_DESTROY (frame); + +	return 0; +} + + +int +afr_truncate (call_frame_t 
*frame, xlator_t *this, +	      loc_t *loc, off_t offset) +{ +	afr_private_t * priv  = NULL; +	afr_local_t   * local = NULL; +	call_frame_t   *transaction_frame = NULL; + +	int ret = -1; + +	int op_ret   = -1; +	int op_errno = 0; + +	VALIDATE_OR_GOTO (frame, out); +	VALIDATE_OR_GOTO (this, out); +	VALIDATE_OR_GOTO (this->private, out); + +	priv = this->private; + +	transaction_frame = copy_frame (frame); +	if (!transaction_frame) { +		gf_log (this->name, GF_LOG_ERROR, +			"out of memory :("); +		goto out; +	} + +	ALLOC_OR_GOTO (local, afr_local_t, out); + +	ret = AFR_LOCAL_INIT (local, priv); +	if (ret < 0) { +		op_errno = -ret; +		goto out; +	} + +	transaction_frame->local = local; + +	local->op_ret = -1; + +	local->cont.truncate.offset  = offset; +	local->cont.truncate.ino     = loc->inode->ino; + +	local->transaction.fop    = afr_truncate_wind; +	local->transaction.done   = afr_truncate_done; +	local->transaction.unwind = afr_truncate_unwind; + +	loc_copy (&local->loc, loc); + +	local->transaction.main_frame = frame; +	local->transaction.start   = 0; +	local->transaction.len     = offset; +	local->transaction.pending = AFR_DATA_PENDING; + +	afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + +	op_ret = 0; +out: +	if (op_ret == -1) { +		if (transaction_frame) +			AFR_STACK_DESTROY (transaction_frame); +		AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); +	} + +	return 0; +} + + +/* }}} */ + +/* {{{ ftruncate */ + + +int +afr_ftruncate_unwind (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t *   local = NULL; +	afr_private_t * priv  = NULL; +	call_frame_t   *main_frame = NULL; + +	local = frame->local; +	priv  = this->private; + +	LOCK (&frame->lock); +	{ +		if (local->transaction.main_frame) +			main_frame = local->transaction.main_frame; +		local->transaction.main_frame = NULL; +	} +	UNLOCK (&frame->lock); + +	if (main_frame) { +		local->cont.ftruncate.buf.st_ino = local->cont.ftruncate.ino; +		AFR_STACK_UNWIND (main_frame, local->op_ret, 
local->op_errno, +				  &local->cont.ftruncate.buf); +	} +	return 0; +} + + +int +afr_ftruncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,  +			int32_t op_ret, int32_t op_errno, struct stat *buf) +{ +	afr_local_t *   local = NULL; +	afr_private_t * priv  = NULL; + +	int child_index = (long) cookie; +	int call_count  = -1; +	int need_unwind = 0; + +	local = frame->local; +	priv  = this->private; + +	LOCK (&frame->lock); +	{ +		if (child_went_down (op_ret, op_errno)) +			afr_transaction_child_died (frame, this, child_index); + +		if (op_ret != -1) { +			if (local->success_count == 0) { +				local->op_ret = op_ret; +				local->cont.ftruncate.buf = *buf; +			} +			local->success_count++; + +			if (local->success_count == priv->wait_count) { +				need_unwind = 1; +			} +		} +		local->op_errno = op_errno; +	} +	UNLOCK (&frame->lock); + +	if (need_unwind) +		local->transaction.unwind (frame, this); + +	call_count = afr_frame_return (frame); + +	if (call_count == 0) { +		local->transaction.resume (frame, this); +	} +	 +	return 0; +} + + +int +afr_ftruncate_wind (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t *local = NULL; +	afr_private_t *priv = NULL; +	 +	int call_count = -1; +	int i = 0; + +	local = frame->local; +	priv = this->private; + +	call_count = afr_up_children_count (priv->child_count, local->child_up); + +	if (call_count == 0) { +		local->transaction.resume (frame, this); +		return 0; +	} + +	local->call_count = call_count; + +	for (i = 0; i < priv->child_count; i++) {				 +		if (local->child_up[i]) { +			STACK_WIND_COOKIE (frame, afr_ftruncate_wind_cbk, +					   (void *) (long) i,	 +					   priv->children[i],  +					   priv->children[i]->fops->ftruncate, +					   local->fd, local->cont.ftruncate.offset); + +			if (!--call_count) +				break; +		} +	} +	 +	return 0; +} + + +int +afr_ftruncate_done (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t *local = NULL; + +	local = frame->local; + +	local->transaction.unwind (frame, this); 
+ +	AFR_STACK_DESTROY (frame); + +	return 0; +} + + +int +afr_ftruncate (call_frame_t *frame, xlator_t *this, +	       fd_t *fd, off_t offset) +{ +	afr_private_t * priv  = NULL; +	afr_local_t   * local = NULL; +	call_frame_t   *transaction_frame = NULL; + +	int ret = -1; + +	int op_ret   = -1; +	int op_errno = 0; + +	VALIDATE_OR_GOTO (frame, out); +	VALIDATE_OR_GOTO (this, out); +	VALIDATE_OR_GOTO (this->private, out); + +	priv = this->private; + +	transaction_frame = copy_frame (frame); +	if (!transaction_frame) { +		gf_log (this->name, GF_LOG_ERROR, +			"out of memory :("); +		goto out; +	} + +	ALLOC_OR_GOTO (local, afr_local_t, out); + +	ret = AFR_LOCAL_INIT (local, priv); +	if (ret < 0) { +		op_errno = -ret; +		goto out; +	} + +	transaction_frame->local = local; + +	local->op = GF_FOP_FTRUNCATE; +	local->op_ret = -1; + +	local->cont.ftruncate.offset  = offset; +	local->cont.ftruncate.ino     = fd->inode->ino; + +	local->transaction.fop    = afr_ftruncate_wind; +	local->transaction.done   = afr_ftruncate_done; +	local->transaction.unwind = afr_ftruncate_unwind; + +	local->fd = fd_ref (fd); + +	local->transaction.main_frame = frame; +	local->transaction.start   = 0; +	local->transaction.len     = offset; +	local->transaction.pending = AFR_DATA_PENDING; + +	afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + +	op_ret = 0; +out: +	if (op_ret == -1) { +		if (transaction_frame) +			AFR_STACK_DESTROY (transaction_frame); +		AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); +	} + +	return 0; +} + +/* }}} */ + +/* {{{ utimens */ + + +int +afr_utimens_unwind (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t *   local = NULL; +	afr_private_t * priv  = NULL; +	call_frame_t   *main_frame = NULL; + +	local = frame->local; +	priv  = this->private; + +	LOCK (&frame->lock); +	{ +		if (local->transaction.main_frame) +			main_frame = local->transaction.main_frame; +		local->transaction.main_frame = NULL; +	} +	UNLOCK (&frame->lock); + +	if (main_frame) { +		
local->cont.utimens.buf.st_ino = local->cont.utimens.ino; +		AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, +				  &local->cont.utimens.buf); +	} +	return 0; +} + + +/* + * Per-child reply handler for the utimens calls wound by afr_utimens_wind. + * cookie carries the index of the replying child. The first successful + * reply supplies local->op_ret and the stat buf; the caller is unwound + * early once success_count reaches priv->wait_count, and the transaction + * is resumed when afr_frame_return () reports no replies outstanding. + */ +int +afr_utimens_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,  +		      int32_t op_ret, int32_t op_errno, struct stat *buf) +{ +	afr_local_t *   local = NULL; +	afr_private_t * priv  = NULL; + +	int child_index = (long) cookie; +	int call_count  = -1; +	/* must start at 0: starting at 1 unwound the caller on the very +	   first reply, even a failed one, and bypassed wait_count */ +	int need_unwind = 0; + +	local = frame->local; +	priv = this->private; + +	LOCK (&frame->lock); +	{ +		if (child_went_down (op_ret, op_errno)) +			afr_transaction_child_died (frame, this, child_index); + +		if (op_ret != -1) { +			if (local->success_count == 0) { +				local->op_ret = op_ret; +				local->cont.utimens.buf = *buf; +			} +			local->success_count++; + +			if (local->success_count == priv->wait_count) { +				need_unwind = 1; +			} +		} + +		local->op_errno = op_errno; +	} +	UNLOCK (&frame->lock); + +	if (need_unwind) +		local->transaction.unwind (frame, this); + +	call_count = afr_frame_return (frame); + +	if (call_count == 0) { +		local->transaction.resume (frame, this); +	} +	 +	return 0; +} + + +/* + * Wind utimens to every child flagged in local->child_up, passing the + * child index as cookie. If no child is up the transaction is resumed + * immediately without winding anything. + */ +int +afr_utimens_wind (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t *local = NULL; +	afr_private_t *priv = NULL; +	 +	int call_count = -1; +	int i = 0; + +	local = frame->local; +	priv = this->private; + +	call_count = afr_up_children_count (priv->child_count, local->child_up); + +	if (call_count == 0) { +		local->transaction.resume (frame, this); +		return 0; +	} + +	local->call_count = call_count; + +	for (i = 0; i < priv->child_count; i++) {				 +		if (local->child_up[i]) { +			STACK_WIND_COOKIE (frame, afr_utimens_wind_cbk, +					   (void *) (long) i,	 +					   priv->children[i],  +					   priv->children[i]->fops->utimens, +					   &local->loc,  +					   local->cont.utimens.tv);  + +			/* stop scanning once every up child has been wound */ +			if (!--call_count) +				break; +		} +	} +	 +	return 0; +} + + +int +afr_utimens_done (call_frame_t *frame, xlator_t 
*this) +{ +	afr_local_t * local = NULL; + +	local = frame->local; + +	local->transaction.unwind (frame, this); + +	AFR_STACK_DESTROY (frame); + +	return 0; +} + + +int +afr_utimens (call_frame_t *frame, xlator_t *this, +	     loc_t *loc, struct timespec tv[2]) +{ +	afr_private_t * priv  = NULL; +	afr_local_t   * local = NULL; +	call_frame_t   *transaction_frame = NULL; + +	int ret = -1; + +	int op_ret   = -1; +	int op_errno = 0; + +	VALIDATE_OR_GOTO (frame, out); +	VALIDATE_OR_GOTO (this, out); +	VALIDATE_OR_GOTO (this->private, out); + +	priv = this->private; + +	transaction_frame = copy_frame (frame); +	if (!transaction_frame) { +		gf_log (this->name, GF_LOG_ERROR, +			"out of memory :("); +		goto out; +	} + +	ALLOC_OR_GOTO (local, afr_local_t, out); + +	ret = AFR_LOCAL_INIT (local, priv); +	if (ret < 0) { +		op_errno = -ret; +		goto out; +	} + +	transaction_frame->local = local; + +	local->op_ret = -1; + +	local->cont.utimens.tv[0] = tv[0]; +	local->cont.utimens.tv[1] = tv[1]; + +	local->cont.utimens.ino  = loc->inode->ino; + +	local->transaction.fop    = afr_utimens_wind; +	local->transaction.done   = afr_utimens_done; +	local->transaction.unwind = afr_utimens_unwind; + +	loc_copy (&local->loc, loc); +	 +	local->transaction.main_frame = frame; +	local->transaction.start   = 0; +	local->transaction.len     = 0; +	local->transaction.pending = AFR_METADATA_PENDING; + +	afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + +	op_ret = 0; +out: +	if (op_ret == -1) { +		if (transaction_frame) +			AFR_STACK_DESTROY (transaction_frame); +		AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); +	} + +	return 0; +} + +/* }}} */ + +/* {{{ setxattr */ + + +int +afr_setxattr_unwind (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t *   local = NULL; +	afr_private_t * priv  = NULL; +	call_frame_t   *main_frame = NULL; + +	local = frame->local; +	priv  = this->private; + +	LOCK (&frame->lock); +	{ +		if (local->transaction.main_frame) +			main_frame = 
local->transaction.main_frame; +		local->transaction.main_frame = NULL; +	} +	UNLOCK (&frame->lock); + +	if (main_frame) { +		AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno) +	} +	return 0; +} + + +int +afr_setxattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,  +		       int32_t op_ret, int32_t op_errno) +{ +	afr_local_t *   local = NULL; +	afr_private_t * priv  = NULL; + +	int call_count  = -1; +	int need_unwind = 0; + +	local = frame->local; +	priv = this->private; + +	LOCK (&frame->lock); +	{ +		if (op_ret != -1) { +			if (local->success_count == 0) { +				local->op_ret = op_ret; +			} +			local->success_count++; + +			if (local->success_count == priv->wait_count) { +				need_unwind = 1; +			} +		} + +		local->op_errno = op_errno; +	} +	UNLOCK (&frame->lock); + +	if (need_unwind) +		local->transaction.unwind (frame, this); + +	call_count = afr_frame_return (frame); + +	if (call_count == 0) { +		local->transaction.resume (frame, this); +	} +	 +	return 0; +} + + +int +afr_setxattr_wind (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t *local = NULL; +	afr_private_t *priv = NULL; + +	int call_count = -1; +	int i = 0; + +	local = frame->local; +	priv = this->private; + +	call_count = afr_up_children_count (priv->child_count, local->child_up); + +	if (call_count == 0) { +		local->transaction.resume (frame, this); +		return 0; +	} + +	local->call_count = call_count; + +	for (i = 0; i < priv->child_count; i++) {				 +		if (local->child_up[i]) { +			STACK_WIND_COOKIE (frame, afr_setxattr_wind_cbk, +					   (void *) (long) i,	 +					   priv->children[i],  +					   priv->children[i]->fops->setxattr, +					   &local->loc,  +					   local->cont.setxattr.dict, +					   local->cont.setxattr.flags);  + +			if (!--call_count) +				break; +		} +	} +	 +	return 0; +} + + +int +afr_setxattr_done (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t * local = frame->local; + +	local->transaction.unwind (frame, this); + +	AFR_STACK_DESTROY (frame); 
+	 +	return 0; +} + + +int +afr_setxattr (call_frame_t *frame, xlator_t *this, +	      loc_t *loc, dict_t *dict, int32_t flags) +{ +	afr_private_t * priv  = NULL; +	afr_local_t   * local = NULL; +	call_frame_t   *transaction_frame = NULL; + +	int ret = -1; + +	int op_ret   = -1; +	int op_errno = 0; + +	VALIDATE_OR_GOTO (frame, out); +	VALIDATE_OR_GOTO (this, out); +	VALIDATE_OR_GOTO (this->private, out); + +	priv = this->private; + +	transaction_frame = copy_frame (frame); +	if (!transaction_frame) { +		gf_log (this->name, GF_LOG_ERROR, +			"out of memory :("); +		goto out; +	} + +	ALLOC_OR_GOTO (local, afr_local_t, out); + +	ret = AFR_LOCAL_INIT (local, priv); +	if (ret < 0) { +		op_errno = -ret; +		goto out; +	} + +	transaction_frame->local = local; + +	local->op_ret = -1; + +	local->cont.setxattr.dict  = dict_ref (dict); +	local->cont.setxattr.flags = flags; + +	local->transaction.fop    = afr_setxattr_wind; +	local->transaction.done   = afr_setxattr_done; +	local->transaction.unwind = afr_setxattr_unwind; + +	loc_copy (&local->loc, loc); + +	local->transaction.main_frame = frame; +	local->transaction.start   = 0; +	local->transaction.len     = 0; +	local->transaction.pending = AFR_METADATA_PENDING; + +	afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + +	op_ret = 0; +out: +	if (op_ret == -1) { +		if (transaction_frame) +			AFR_STACK_DESTROY (transaction_frame); +		AFR_STACK_UNWIND (frame, op_ret, op_errno); +	} + +	return 0; +} + +/* }}} */ + +/* {{{ removexattr */ + + +int +afr_removexattr_unwind (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t *   local = NULL; +	afr_private_t * priv  = NULL; +	call_frame_t   *main_frame = NULL; + +	local = frame->local; +	priv  = this->private; + +	LOCK (&frame->lock); +	{ +		if (local->transaction.main_frame) +			main_frame = local->transaction.main_frame; +		local->transaction.main_frame = NULL; +	} +	UNLOCK (&frame->lock); + +	if (main_frame) { +		AFR_STACK_UNWIND (main_frame, local->op_ret, 
local->op_errno) +	} +	return 0; +} + + +int +afr_removexattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,  +			  int32_t op_ret, int32_t op_errno) +{ +	afr_local_t *   local = NULL; +	afr_private_t * priv  = NULL; + +	int call_count  = -1; +	int need_unwind = 0; + +	local = frame->local; +	priv = this->private; + +	LOCK (&frame->lock); +	{ +		if (op_ret != -1) { +			if (local->success_count == 0) { +				local->op_ret = op_ret; +			} +			local->success_count++; + +			if (local->success_count == priv->wait_count) { +				need_unwind = 1; +			} +		} + +		local->op_errno = op_errno; +	} +	UNLOCK (&frame->lock); + +	if (need_unwind) +		local->transaction.unwind (frame, this); + +	call_count = afr_frame_return (frame); + +	if (call_count == 0) { +		local->transaction.resume (frame, this); +	} +	 +	return 0; +} + + +int32_t +afr_removexattr_wind (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t *local = NULL; +	afr_private_t *priv = NULL; + +	int call_count = -1; +	int i = 0; + +	local = frame->local; +	priv = this->private; + +	call_count = afr_up_children_count (priv->child_count, local->child_up); + +	if (call_count == 0) { +		local->transaction.resume (frame, this); +		return 0; +	} + +	local->call_count = call_count; + +	for (i = 0; i < priv->child_count; i++) {				 +		if (local->child_up[i]) { +			STACK_WIND_COOKIE (frame, afr_removexattr_wind_cbk,  +					   (void *) (long) i,	 +					   priv->children[i],  +					   priv->children[i]->fops->removexattr, +					   &local->loc,  +					   local->cont.removexattr.name); + +			if (!--call_count) +				break; +		} +	} +	 +	return 0; +} + + +int +afr_removexattr_done (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t * local = frame->local; + +	local->transaction.unwind (frame, this); + +	AFR_STACK_DESTROY (frame); +	 +	return 0; +} + + +int +afr_removexattr (call_frame_t *frame, xlator_t *this, +		 loc_t *loc, const char *name) +{ +	afr_private_t * priv  = NULL; +	afr_local_t   * local = NULL; +	
call_frame_t   *transaction_frame = NULL; + +	int ret = -1; + +	int op_ret   = -1; +	int op_errno = 0; + +	VALIDATE_OR_GOTO (frame, out); +	VALIDATE_OR_GOTO (this, out); +	VALIDATE_OR_GOTO (this->private, out); +	VALIDATE_OR_GOTO (loc, out); + +	priv = this->private; + +	transaction_frame = copy_frame (frame); +	if (!transaction_frame) { +		gf_log (this->name, GF_LOG_ERROR, +			"out of memory :("); +		goto out; +	} + +	ALLOC_OR_GOTO (local, afr_local_t, out); + +	ret = AFR_LOCAL_INIT (local, priv); +	if (ret < 0) { +		op_errno = -ret; +		goto out; +	} + +	transaction_frame->local = local; + +	local->op_ret = -1; + +	local->cont.removexattr.name = strdup (name); + +	local->transaction.fop    = afr_removexattr_wind; +	local->transaction.done   = afr_removexattr_done; +	local->transaction.unwind = afr_removexattr_unwind; + +	loc_copy (&local->loc, loc); + +	local->transaction.main_frame = frame; +	local->transaction.start   = 0; +	local->transaction.len     = 0; +	local->transaction.pending = AFR_METADATA_PENDING; + +	afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + +	op_ret = 0; +out: +	if (op_ret == -1) { +		if (transaction_frame) +			AFR_STACK_DESTROY (transaction_frame); +		AFR_STACK_UNWIND (frame, op_ret, op_errno); +	} + +	return 0; +} diff --git a/xlators/cluster/afr/src/afr-inode-write.h b/xlators/cluster/afr/src/afr-inode-write.h new file mode 100644 index 00000000000..9c0b5cad314 --- /dev/null +++ b/xlators/cluster/afr/src/afr-inode-write.h @@ -0,0 +1,63 @@ +/* +   Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> +   This file is part of GlusterFS. + +   GlusterFS is free software; you can redistribute it and/or modify +   it under the terms of the GNU General Public License as published +   by the Free Software Foundation; either version 3 of the License, +   or (at your option) any later version. 
+ +   GlusterFS is distributed in the hope that it will be useful, but +   WITHOUT ANY WARRANTY; without even the implied warranty of +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU +   General Public License for more details. + +   You should have received a copy of the GNU General Public License +   along with this program.  If not, see +   <http://www.gnu.org/licenses/>. +*/ + +#ifndef __INODE_WRITE_H__ +#define __INODE_WRITE_H__ + +int32_t +afr_chmod (call_frame_t *frame, xlator_t *this, +	   loc_t *loc, mode_t mode); + +int32_t +afr_chown (call_frame_t *frame, xlator_t *this, +	   loc_t *loc, uid_t uid, gid_t gid); + +int +afr_fchown (call_frame_t *frame, xlator_t *this, +	    fd_t *fd, uid_t uid, gid_t gid); + +int32_t +afr_fchmod (call_frame_t *frame, xlator_t *this, +	    fd_t *fd, mode_t mode); + +int32_t +afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,  +	    struct iovec *vector, int32_t count, off_t offset); + +int32_t +afr_truncate (call_frame_t *frame, xlator_t *this, +	      loc_t *loc, off_t offset); + +int32_t +afr_ftruncate (call_frame_t *frame, xlator_t *this, +	       fd_t *fd, off_t offset); + +int32_t +afr_utimens (call_frame_t *frame, xlator_t *this, +	     loc_t *loc, struct timespec tv[2]); + +int32_t +afr_setxattr (call_frame_t *frame, xlator_t *this, +	      loc_t *loc, dict_t *dict, int32_t flags); + +int32_t +afr_removexattr (call_frame_t *frame, xlator_t *this, +		 loc_t *loc, const char *name); + +#endif /* __INODE_WRITE_H__ */ diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c new file mode 100644 index 00000000000..45d06516965 --- /dev/null +++ b/xlators/cluster/afr/src/afr-self-heal-common.c @@ -0,0 +1,1073 @@ +/* +  Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> +  This file is part of GlusterFS. 
/**
 * afr_sh_select_source - pick the child to heal from.
 *
 * Returns the index of the first non-zero entry in sources[], or -1 when
 * none of the first child_count entries is set.
 * TODO: take into account option 'favorite-child'
 */

int
afr_sh_select_source (int sources[], int child_count)
{
	int idx = 0;

	while (idx < child_count) {
		if (sources[idx] != 0)
			return idx;
		idx++;
	}

	return -1;
}


/**
 * afr_sh_sink_count - number of entries in sources[] that are zero,
 * i.e. children that are not marked as a source.
 */

int
afr_sh_sink_count (int sources[], int child_count)
{
	int idx   = 0;
	int total = 0;

	for (idx = 0; idx < child_count; idx++) {
		if (sources[idx] == 0)
			total++;
	}

	return total;
}


/**
 * afr_sh_source_count - number of entries in sources[] that are non-zero.
 */

int
afr_sh_source_count (int sources[], int child_count)
{
	int idx   = 0;
	int total = 0;

	for (idx = 0; idx < child_count; idx++) {
		if (sources[idx] != 0)
			total++;
	}

	return total;
}


/**
 * afr_sh_supress_errenous_children - clear the source mark of every child
 * whose child_errno[] entry is non-zero.  Always returns 0.
 */

int
afr_sh_supress_errenous_children (int sources[], int child_errno[],
				  int child_count)
{
	int idx = 0;

	for (idx = 0; idx < child_count; idx++) {
		if (child_errno[idx] == 0 || sources[idx] == 0)
			continue;

		sources[idx] = 0;
	}

	return 0;
}
+	int      all_xattr_missing = 1; + +	/* if the file was created by afr with xattrs */ +	for (i = 0; i < child_count; i++) { +		if (!xattr[i]) +			continue; + +		ret = dict_get_ptr (xattr[i], (char *)key, VOID(&pending)); +		if (ret != 0) { +			continue; +		} + +		all_xattr_missing = 0; +		break; +	} + +	if (all_xattr_missing) { +		/* supress 0byte files.. this avoids empty file created +		   by dir selfheal to overwrite the 'good' file */ +		for (i = 0; i < child_count; i++) { +			if (!buf[i].st_size) +				sources[i] = 0; +		} +		goto out; +	} + + +	for (i = 0; i < child_count; i++) { +		if (!xattr[i]) { +			sources[i] = 0; +			continue; +		} + +		ret = dict_get_ptr (xattr[i], (char *)key, VOID(&pending)); +		if (ret != 0) { +			sources[i] = 0; +			continue; +		} + +		if (!pending) { +			sources[i] = 0; +			continue; +		} +	} + +out: +	return 0; +} + + +void +afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this) +{ +	afr_private_t * priv = this->private; + +	char *buf = NULL; +	char *ptr = NULL; + +	int i, j; + +        /* 10 digits per entry + 1 space + '[' and ']' */ +	buf = MALLOC (priv->child_count * 11 + 8);  + +	for (i = 0; i < priv->child_count; i++) { +		ptr = buf; +		ptr += sprintf (ptr, "[ "); +		for (j = 0; j < priv->child_count; j++) { +			ptr += sprintf (ptr, "%d ", pending_matrix[i][j]); +		} +		ptr += sprintf (ptr, "]"); +		gf_log (this->name, GF_LOG_DEBUG, +			"pending_matrix: %s", buf); +	} + +	FREE (buf); +} + + +void +afr_sh_build_pending_matrix (int32_t *pending_matrix[], dict_t *xattr[], +			     int child_count, const char *key) +{ +	int i = 0; +	int j = 0; +	int32_t *pending = NULL; +	int ret = -1; + +	/* start clean */ +	for (i = 0; i < child_count; i++) { +		for (j = 0; j < child_count; j++) { +			pending_matrix[i][j] = 0; +		} +	} + +	for (i = 0; i < child_count; i++) { +		if (!xattr[i]) +			continue; + +		pending = NULL; + +		ret = dict_get_ptr (xattr[i], (char *) key, +				    VOID(&pending)); +		if (ret != 0) +			
/**
 * afr_sh_mark_sources - mark every child that no child holds pending
 * counts against.
 *
 * The diagonal of pending_matrix is cleared first, so a child's entry about
 * itself is ignored.  Column c names a source when pending_matrix[r][c] is
 * zero for every row r.  sources[c] is set to 1 for those children and to
 * 0 for all others.
 *
 * Returns the number of sources found.
 */

int
afr_sh_mark_sources (int32_t *pending_matrix[], int sources[], int child_count)
{
	int row     = 0;
	int col     = 0;
	int found   = 0;
	int accused = 0;

	/* reset the result and drop self-referring entries */
	for (col = 0; col < child_count; col++) {
		sources[col] = 0;
		pending_matrix[col][col] = 0;
	}

	for (col = 0; col < child_count; col++) {
		accused = 0;

		for (row = 0; row < child_count && !accused; row++)
			accused = (pending_matrix[row][col] != 0);

		if (accused)
			continue;

		sources[col] = 1;
		found++;
	}

	return found;
}


/**
 * afr_sh_pending_to_delta - build the matrix that cancels pending counts.
 *
 * delta_matrix is zeroed, then for every column j with success[j] set,
 * delta_matrix[i][j] becomes the negation of pending_matrix[i][j].
 */
void
afr_sh_pending_to_delta (int32_t *pending_matrix[], int32_t *delta_matrix[],
			 int success[], int child_count)
{
	int row = 0;
	int col = 0;

	/* start clean */
	for (row = 0; row < child_count; row++)
		for (col = 0; col < child_count; col++)
			delta_matrix[row][col] = 0;

	for (col = 0; col < child_count; col++) {
		if (!success[col])
			continue;

		for (row = 0; row < child_count; row++)
			delta_matrix[row][col] = -pending_matrix[row][col];
	}
}
NULL; +	void          *tmp_pending = NULL; /* This is required to remove 'type-punned' warnings from gcc */ + +	int           ret = -1; +	int            i  = 0; + +	priv = this->private; + +	ret = dict_get_ptr (xattr, AFR_METADATA_PENDING, &tmp_pending); + +	if (ret != 0) +		return 0; + +	pending = tmp_pending; +	for (i = 0; i < priv->child_count; i++) { +		if (i == child_count) +			continue; +		if (pending[i]) +			return 1; +	} + +	return 0; +} + + +int +afr_sh_has_data_pending (dict_t *xattr, int child_count, xlator_t *this) +{ +	afr_private_t *priv = NULL; +	int32_t       *pending = NULL; +	void          *tmp_pending = NULL; /* This is required to remove 'type-punned' warnings from gcc */ + +	int          ret = -1; +	int            i = 0; + +	priv = this->private; + +	ret = dict_get_ptr (xattr, AFR_DATA_PENDING, &tmp_pending); + +	if (ret != 0) +		return 0; + +	pending = tmp_pending; +	for (i = 0; i < priv->child_count; i++) { +		if (i == child_count) +			continue; +		if (pending[i]) +			return 1; +	} + +	return 0; +} + + +int +afr_sh_has_entry_pending (dict_t *xattr, int child_count, xlator_t *this) +{ +	afr_private_t *priv = NULL; +	int32_t       *pending = NULL; +	void          *tmp_pending = NULL; /* This is required to remove 'type-punned' warnings from gcc */ +	 +	int          ret = -1; +	int            i = 0; + +	priv = this->private; + +	ret = dict_get_ptr (xattr, AFR_ENTRY_PENDING, &tmp_pending); + +	if (ret != 0) +		return 0; + +	pending = tmp_pending; +	for (i = 0; i < priv->child_count; i++) { +		if (i == child_count) +			continue; +		if (pending[i]) +			return 1; +	} + +	return 0; +} + + + +/** + * is_matrix_zero - return true if pending matrix is all zeroes + */ + +int +afr_sh_is_matrix_zero (int32_t *pending_matrix[], int child_count) +{ +	int i, j; + +	for (i = 0; i < child_count; i++)  +		for (j = 0; j < child_count; j++)  +			if (pending_matrix[i][j])  +				return 0; +	return 1; +} + + +int +afr_sh_missing_entries_done (call_frame_t *frame, 
xlator_t *this) +{ +	afr_local_t     *local = NULL; +	afr_self_heal_t *sh = NULL; +	afr_private_t   *priv = NULL; +	int              i = 0; + +	local = frame->local; +	sh = &local->self_heal; +	priv = this->private; + +//	memset (sh->child_errno, 0, sizeof (int) * priv->child_count); +	memset (sh->buf, 0, sizeof (struct stat) * priv->child_count); +	 +	for (i = 0; i < priv->child_count; i++) { +		if (sh->xattr[i]) +			dict_unref (sh->xattr[i]); +		sh->xattr[i] = NULL; +	} + +	if (local->govinda_gOvinda) { +		gf_log (this->name, GF_LOG_WARNING, +			"aborting selfheal of %s", +			local->loc.path); +		sh->completion_cbk (frame, this); +	} else { +		gf_log (this->name, GF_LOG_DEBUG, +			"proceeding to metadata check on %s", +			local->loc.path); +		afr_self_heal_metadata (frame, this); +	} + +	return 0; +} + + +int +sh_missing_entries_unlck_cbk (call_frame_t *frame, void *cookie, +			      xlator_t *this, +			      int32_t op_ret, int32_t op_errno) +{ +	afr_local_t     *local = NULL; +	afr_self_heal_t *sh = NULL; +	afr_private_t   *priv = NULL; +	int              call_count = 0; + + +	local = frame->local; +	sh = &local->self_heal; +	priv = this->private; + +	LOCK (&frame->lock); +	{ +	} +	UNLOCK (&frame->lock); + +	call_count = afr_frame_return (frame); + +	if (call_count == 0) { +		afr_sh_missing_entries_done (frame, this); +	} + +	return 0; +} +			       + +static int +sh_missing_entries_finish (call_frame_t *frame, xlator_t *this) +{ +	afr_private_t      *priv = NULL; +	afr_local_t        *local = NULL; +	int                 i = 0; +	int                 call_count = 0; +	afr_self_heal_t    *sh = NULL; + + +	local = frame->local; +	sh = &local->self_heal; +	priv = this->private; + +	call_count = local->child_count; + +	local->call_count = call_count; + +	for (i = 0; i < priv->child_count; i++) { +		if (local->child_up[i]) { +			gf_log (this->name, GF_LOG_DEBUG, +				"unlocking %"PRId64"/%s on subvolume %s", +				sh->parent_loc.inode->ino, local->loc.name, +				
priv->children[i]->name); + +			STACK_WIND (frame, sh_missing_entries_unlck_cbk, +				    priv->children[i], +				    priv->children[i]->fops->entrylk, +				    &sh->parent_loc, local->loc.name, +				    ENTRYLK_UNLOCK, ENTRYLK_WRLCK); + +			if (!--call_count) +				break; +		} +	} +	return 0; +} + + +static int +sh_destroy_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +		int32_t op_ret, int op_errno, struct stat *stbuf) +{ +	STACK_DESTROY (frame->root); +	return 0; +} + + +static int +sh_missing_entries_newentry_cbk (call_frame_t *frame, void *cookie, +				 xlator_t *this, +				 int32_t op_ret, int32_t op_errno, +				 inode_t *inode, struct stat *stbuf) +{ +	afr_local_t     *local = NULL; +	afr_self_heal_t *sh = NULL; +	afr_private_t   *priv = NULL; +	call_frame_t    *chown_frame = NULL; +	int              call_count = 0; +	int              child_index = 0; +	struct stat     *buf = NULL; + + +	local = frame->local; +	sh = &local->self_heal; +	priv = this->private; + +	buf = &sh->buf[sh->source]; +	child_index = (long) cookie; + +	if (op_ret == 0) { +		chown_frame = copy_frame (frame); + +		gf_log (this->name, GF_LOG_DEBUG, +			"chown %s to %d %d on subvolume %s", +			local->loc.path, buf->st_uid, buf->st_gid, +			priv->children[child_index]->name); + +		STACK_WIND (chown_frame, sh_destroy_cbk, +			    priv->children[child_index], +			    priv->children[child_index]->fops->chown, +			    &local->loc, +			    buf->st_uid, buf->st_gid); +	} + +	LOCK (&frame->lock); +	{ +	} +	UNLOCK (&frame->lock); + +	call_count = afr_frame_return (frame); + +	if (call_count == 0) { +		sh_missing_entries_finish (frame, this); +	} + +	return 0; +} + + +static int +sh_missing_entries_mknod (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t     *local = NULL; +	afr_self_heal_t *sh = NULL; +	afr_private_t   *priv = NULL; +	int              i = 0; +	int              enoent_count = 0; +	int              call_count = 0; +	mode_t           st_mode = 0; +	dev_t            st_dev 
= 0; + + +	local = frame->local; +	sh = &local->self_heal; +	priv = this->private; + +	for (i = 0; i < priv->child_count; i++) +		if (sh->child_errno[i] == ENOENT) +			enoent_count++; + +	call_count = enoent_count; +	local->call_count = call_count; + +	st_mode = sh->buf[sh->source].st_mode; +	st_dev  = sh->buf[sh->source].st_dev; + +	gf_log (this->name, GF_LOG_DEBUG, +		"mknod %s mode 0%o on %d subvolumes", +		local->loc.path, st_mode, enoent_count); + +	for (i = 0; i < priv->child_count; i++) { +		if (sh->child_errno[i] == ENOENT) { +			STACK_WIND_COOKIE (frame, +					   sh_missing_entries_newentry_cbk, +					   (void *) (long) i, +					   priv->children[i], +					   priv->children[i]->fops->mknod, +					   &local->loc, st_mode, st_dev); +			if (!--call_count) +				break; +		} +	} + +	return 0; +} + + +static int +sh_missing_entries_mkdir (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t     *local = NULL; +	afr_self_heal_t *sh = NULL; +	afr_private_t   *priv = NULL; +	int              i = 0; +	int              enoent_count = 0; +	int              call_count = 0; +	mode_t           st_mode = 0; + + +	local = frame->local; +	sh = &local->self_heal; +	priv = this->private; + +	for (i = 0; i < priv->child_count; i++) +		if (sh->child_errno[i] == ENOENT) +			enoent_count++; + +	call_count = enoent_count; +	local->call_count = call_count; + +	st_mode = sh->buf[sh->source].st_mode; + +	gf_log (this->name, GF_LOG_DEBUG, +		"mkdir %s mode 0%o on %d subvolumes", +		local->loc.path, st_mode, enoent_count); + +	for (i = 0; i < priv->child_count; i++) { +		if (sh->child_errno[i] == ENOENT) { +			STACK_WIND_COOKIE (frame, +					   sh_missing_entries_newentry_cbk, +					   (void *) (long) i, +					   priv->children[i], +					   priv->children[i]->fops->mkdir, +					   &local->loc, st_mode); +			if (!--call_count) +				break; +		} +	} + +	return 0; +} + + +static int +sh_missing_entries_symlink (call_frame_t *frame, xlator_t *this, +			    const char *link) +{ +	afr_local_t 
    *local = NULL; +	afr_self_heal_t *sh = NULL; +	afr_private_t   *priv = NULL; +	int              i = 0; +	int              enoent_count = 0; +	int              call_count = 0; + + +	local = frame->local; +	sh = &local->self_heal; +	priv = this->private; + +	for (i = 0; i < priv->child_count; i++) +		if (sh->child_errno[i] == ENOENT) +			enoent_count++; + +	call_count = enoent_count; +	local->call_count = call_count; + +	gf_log (this->name, GF_LOG_DEBUG, +		"symlink %s -> %s on %d subvolumes", +		local->loc.path, link, enoent_count); + +	for (i = 0; i < priv->child_count; i++) { +		if (sh->child_errno[i] == ENOENT) { +			STACK_WIND_COOKIE (frame, +					   sh_missing_entries_newentry_cbk, +					   (void *) (long) i, +					   priv->children[i], +					   priv->children[i]->fops->symlink, +					   link, &local->loc); +			if (!--call_count) +				break; +		} +	} + +	return 0; +} + + +static int +sh_missing_entries_readlink_cbk (call_frame_t *frame, void *cookie, +				 xlator_t *this, +				 int32_t op_ret, int32_t op_errno, +				 const char *link) +{ +	if (op_ret > 0) +		sh_missing_entries_symlink (frame, this, link); +	else +		sh_missing_entries_finish (frame, this); + +	return 0; +} + + +static int +sh_missing_entries_readlink (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t     *local = NULL; +	afr_self_heal_t *sh = NULL; +	afr_private_t   *priv = NULL; + + +	local = frame->local; +	sh = &local->self_heal; +	priv = this->private; + +	STACK_WIND (frame, sh_missing_entries_readlink_cbk, +		    priv->children[sh->source], +		    priv->children[sh->source]->fops->readlink, +		    &local->loc, 4096); + +	return 0; +} + + +static int +sh_missing_entries_create (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t     *local = NULL; +	afr_self_heal_t *sh = NULL; +	int              type = 0; +	int              i = 0; +	afr_private_t   *priv = NULL; +	int              enoent_count = 0; +	int              govinda_gOvinda = 0; + + +	local = frame->local; +	sh = 
&local->self_heal; +	priv = this->private; + +	for (i = 0; i < priv->child_count; i++) { +		if (sh->child_errno[i]) { +			if (sh->child_errno[i] == ENOENT) +				enoent_count++; +		} else { +			if (type) { +				if (type != (sh->buf[i].st_mode & S_IFMT)) +					govinda_gOvinda = 1; +			} else { +				sh->source = i; +				type = sh->buf[i].st_mode & S_IFMT; +			} +		} +	} + +	if (govinda_gOvinda) { +		gf_log (this->name, GF_LOG_ERROR, +			"conflicing filetypes exist for path %s. returning.", +			local->loc.path); + +		local->govinda_gOvinda = 1; +		sh_missing_entries_finish (frame, this); +		return 0; +	} + +	if (!type) { +		gf_log (this->name, GF_LOG_ERROR, +			"no source found for %s. all nodes down?. returning.", +			local->loc.path); +		/* subvolumes down and/or file does not exist */ +		sh_missing_entries_finish (frame, this); +		return 0; +	} + +	if (enoent_count == 0) { +		gf_log (this->name, GF_LOG_ERROR, +			"no missing files - %s. proceeding to metadata check", +			local->loc.path); +		/* proceed to next step - metadata self-heal */ +		sh_missing_entries_finish (frame, this); +		return 0; +	} + +	switch (type) { +	case S_IFSOCK: +	case S_IFREG: +	case S_IFBLK: +	case S_IFCHR: +	case S_IFIFO: +		sh_missing_entries_mknod (frame, this); +		break; +	case S_IFLNK: +		sh_missing_entries_readlink (frame, this); +		break; +	case S_IFDIR: +		sh_missing_entries_mkdir (frame, this); +		break; +	default: +		gf_log (this->name, GF_LOG_ERROR, +			"unknown file type: 0%o", type); +		local->govinda_gOvinda = 1; +		sh_missing_entries_finish (frame, this); +	} + +	return 0; +} + + +static int +sh_missing_entries_lookup_cbk (call_frame_t *frame, void *cookie, +			       xlator_t *this, +			       int32_t op_ret, int32_t op_errno, +			       inode_t *inode, struct stat *buf, dict_t *xattr) +{ +	int              child_index = 0; +	afr_local_t     *local = NULL; +	int              call_count = 0; +	afr_self_heal_t *sh = NULL; +	afr_private_t   *priv = NULL; + + +	local = 
frame->local; +	sh = &local->self_heal; +	priv = this->private; + +	child_index = (long) cookie; + +	LOCK (&frame->lock); +	{ +		if (op_ret == 0) { +			gf_log (this->name, GF_LOG_DEBUG, +				"path %s on subvolume %s is of mode 0%o", +				local->loc.path, +				priv->children[child_index]->name, +				buf->st_mode); + +			local->self_heal.buf[child_index] = *buf; +		} else { +			gf_log (this->name, GF_LOG_WARNING, +				"path %s on subvolume %s => -1 (%s)", +				local->loc.path, +				priv->children[child_index]->name, +				strerror (op_errno)); + +			local->self_heal.child_errno[child_index] = op_errno; +		} + +	} +	UNLOCK (&frame->lock); + +	call_count = afr_frame_return (frame); + +	if (call_count == 0) { +		sh_missing_entries_create (frame, this); +	} + +	return 0; +} + + +static int +sh_missing_entries_lookup (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t    *local = NULL; +	int             i = 0; +	int             call_count = 0; +	afr_private_t  *priv = NULL; +	dict_t         *xattr_req = NULL; +	int             ret = -1; + +	local = frame->local; +	call_count = local->child_count; +	priv = this->private; + +	local->call_count = call_count; +	 +	xattr_req = dict_new(); +	 +	if (xattr_req) +		ret = dict_set_uint64 (xattr_req, AFR_ENTRY_PENDING, +				       priv->child_count * sizeof(int32_t)); + +	for (i = 0; i < priv->child_count; i++) { +		if (local->child_up[i]) { +			gf_log (this->name, GF_LOG_DEBUG, +				"looking up %s on subvolume %s", +				local->loc.path, priv->children[i]->name); + +			STACK_WIND_COOKIE (frame, +					   sh_missing_entries_lookup_cbk, +					   (void *) (long) i, +					   priv->children[i], +					   priv->children[i]->fops->lookup, +					   &local->loc, xattr_req); + +			if (!--call_count) +				break; +		} +	} +	 +	if (xattr_req) +		dict_unref (xattr_req); + +	return 0; +} + + +static int +sh_missing_entries_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +			   int32_t op_ret, int32_t op_errno) +{ +	afr_local_t     
*local = NULL; +	afr_self_heal_t *sh = NULL; +	int              call_count = 0; +	int              child_index = (long) cookie; + + +	local = frame->local; +	sh    = &local->self_heal; + +	LOCK (&frame->lock); +	{ +		if (op_ret == -1) { +			sh->op_failed = 1; + +			gf_log (this->name, +				(op_errno == EAGAIN ? GF_LOG_DEBUG : GF_LOG_ERROR), +				"locking inode of %s on child %d failed: %s", +				local->loc.path, child_index, +				strerror (op_errno)); +		} else { +			gf_log (this->name, GF_LOG_DEBUG, +				"inode of %s on child %d locked", +				local->loc.path, child_index); +		} +	} +	UNLOCK (&frame->lock); + +	call_count = afr_frame_return (frame); + +	if (call_count == 0) { +		if (sh->op_failed == 1) { +			sh_missing_entries_finish (frame, this); +			return 0; +		} + +		sh_missing_entries_lookup (frame, this); +	} + +	return 0; +} + + +static int +afr_self_heal_missing_entries (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t     *local = NULL; +	afr_self_heal_t *sh = NULL; +	afr_private_t   *priv = NULL; +	int              i = 0; +	int              call_count = 0; + + +	local = frame->local; +	sh = &local->self_heal; +	priv = this->private; + +	gf_log (this->name, GF_LOG_DEBUG, +		"attempting to recreate missing entries for path=%s", +		local->loc.path); + +	afr_build_parent_loc (&sh->parent_loc, &local->loc); + +	call_count = local->child_count; + +	local->call_count = call_count; + +	for (i = 0; i < priv->child_count; i++) { +		if (local->child_up[i]) { +			STACK_WIND (frame, sh_missing_entries_lk_cbk, +				    priv->children[i], +				    priv->children[i]->fops->entrylk, +				    &sh->parent_loc, local->loc.name, +				    ENTRYLK_LOCK_NB, ENTRYLK_WRLCK); +			if (!--call_count) +				break; +		} + 	} + +	return 0; +} + + +int +afr_self_heal (call_frame_t *frame, xlator_t *this, +	       int (*completion_cbk) (call_frame_t *, xlator_t *)) +{ +	afr_local_t     *local = NULL; +	afr_self_heal_t *sh = NULL; +	afr_private_t   *priv = NULL; +	int              i = 
0; + +	/* Body of afr_self_heal: record the completion callback, allocate the per-child self-heal state, then either recreate missing entries or go on to afr_sh_missing_entries_done. */ +	local = frame->local; +	sh = &local->self_heal; +	priv = this->private; + +	gf_log (this->name, GF_LOG_DEBUG, +		"performing self heal on %s (metadata=%d data=%d entry=%d)", +		local->loc.path, +		local->need_metadata_self_heal, +		local->need_data_self_heal, +		local->need_entry_self_heal); + +	sh->completion_cbk = completion_cbk; +	/* one slot per child in each array; NOTE(review): none of the CALLOC results below is checked for NULL before use */ +	sh->buf = CALLOC (priv->child_count, sizeof (struct stat)); +	sh->child_errno = CALLOC (priv->child_count, sizeof (int)); +	sh->success = CALLOC (priv->child_count, sizeof (int)); +	sh->xattr = CALLOC (priv->child_count, sizeof (dict_t *)); +	sh->sources = CALLOC (sizeof (*sh->sources), priv->child_count); +	/* child_count x child_count matrices of int32_t, zero-filled by CALLOC */ +	sh->pending_matrix = CALLOC (sizeof (int32_t *), priv->child_count); +	for (i = 0; i < priv->child_count; i++) { +		sh->pending_matrix[i] = CALLOC (sizeof (int32_t), +						priv->child_count); +	} + +	sh->delta_matrix = CALLOC (sizeof (int32_t *), priv->child_count); +	for (i = 0; i < priv->child_count; i++) { +		sh->delta_matrix[i] = CALLOC (sizeof (int32_t), +					      priv->child_count); +	} +	/* presumably: the lookup succeeded on some children and returned ENOENT on others -- confirm where success_count/enoent_count are set */ +	if (local->success_count && local->enoent_count) { +		afr_self_heal_missing_entries (frame, this); +	} else { +		gf_log (this->name, GF_LOG_DEBUG, +			"proceeding to metadata check on %s", +			local->loc.path); +		afr_sh_missing_entries_done (frame, this); +	} + +	return 0; +} diff --git a/xlators/cluster/afr/src/afr-self-heal-common.h b/xlators/cluster/afr/src/afr-self-heal-common.h new file mode 100644 index 00000000000..9dd597f0787 --- /dev/null +++ b/xlators/cluster/afr/src/afr-self-heal-common.h @@ -0,0 +1,66 @@ +/* +  Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> +  This file is part of GlusterFS. + +  GlusterFS is free software; you can redistribute it and/or modify +  it under the terms of the GNU General Public License as published +  by the Free Software Foundation; either version 3 of the License, +  or (at your option) any later version. 
+ +  GlusterFS is distributed in the hope that it will be useful, but +  WITHOUT ANY WARRANTY; without even the implied warranty of +  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU +  General Public License for more details. + +  You should have received a copy of the GNU General Public License +  along with this program.  If not, see +  <http://www.gnu.org/licenses/>. +*/ + +#ifndef __AFR_SELF_HEAL_COMMON_H__ +#define __AFR_SELF_HEAL_COMMON_H__ +/* True when the stat buffer's st_size is larger than st_blocks * 512, i.e. the file reports more bytes than it has blocks allocated for. buf is a pointer to struct stat. */ +#define FILE_HAS_HOLES(buf) (((buf)->st_size) > ((buf)->st_blocks * 512)) +/* In the prototypes below sources[], child_errno[], xattr[], success[] and the matrix rows are indexed by child; the callers in afr-self-heal-data.c pass priv->child_count as child_count. */ +int +afr_sh_select_source (int sources[], int child_count); + +int +afr_sh_sink_count (int sources[], int child_count); + +int +afr_sh_source_count (int sources[], int child_count); + +int +afr_sh_supress_errenous_children (int sources[], int child_errno[], +				  int child_count); + +int +afr_sh_supress_empty_children (int sources[], dict_t *xattr[], +			       struct stat *buf, +			       int child_count, const char *key); + +void +afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this); + +void +afr_sh_build_pending_matrix (int32_t *pending_matrix[], dict_t *xattr[], +			     int child_count, const char *key); + +void +afr_sh_pending_to_delta (int32_t *pending_matrix[], int32_t *delta_matrix[], +			 int32_t success[], int child_count); + +int +afr_sh_mark_sources (int32_t *pending_matrix[], int sources[], +		     int child_count); + +int +afr_sh_delta_to_xattr (int32_t *delta_matrix[], dict_t *xattr[], +		       int child_count, const char *key); + +int +afr_sh_is_matrix_zero (int32_t *pending_matrix[], int child_count); + + +#endif /* __AFR_SELF_HEAL_COMMON_H__ */ diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c new file mode 100644 index 00000000000..3a48da48587 --- /dev/null +++ b/xlators/cluster/afr/src/afr-self-heal-data.c @@ -0,0 +1,1030 @@ +/* +   Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> +   This file is part of GlusterFS. 
+ +   GlusterFS is free software; you can redistribute it and/or modify +   it under the terms of the GNU General Public License as published +   by the Free Software Foundation; either version 3 of the License, +   or (at your option) any later version. + +   GlusterFS is distributed in the hope that it will be useful, but +   WITHOUT ANY WARRANTY; without even the implied warranty of +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU +   General Public License for more details. + +   You should have received a copy of the GNU General Public License +   along with this program.  If not, see +   <http://www.gnu.org/licenses/>. +*/ + +#include <libgen.h> +#include <unistd.h> +#include <fnmatch.h> +#include <sys/time.h> +#include <stdlib.h> +#include <signal.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "afr.h" +#include "dict.h" +#include "xlator.h" +#include "hashfn.h" +#include "logging.h" +#include "stack.h" +#include "list.h" +#include "call-stub.h" +#include "defaults.h" +#include "common-utils.h" +#include "compat-errno.h" +#include "compat.h" +#include "byte-order.h" + +#include "afr-transaction.h" +#include "afr-self-heal.h" +#include "afr-self-heal-common.h" + + +/* Last step of data self-heal: log completion and hand control back through sh->completion_cbk. The sh-> arrays are not released here (see the TODO). Always returns 0. */ +int +afr_sh_data_done (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t     *local = NULL; +	afr_self_heal_t *sh = NULL; +	afr_private_t   *priv = NULL; + +	local = frame->local; +	sh = &local->self_heal; +	priv = this->private; + +	/*  +	   TODO: cleanup sh->*  +	 */ + +	gf_log (this->name, GF_LOG_DEBUG, +		"self heal of %s completed", +		local->loc.path); + +	sh->completion_cbk (frame, this); + +	return 0; +} + +/* Callback for the flush calls wound by afr_sh_data_close; op_ret/op_errno are not inspected. */ +int +afr_sh_data_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +		       int32_t op_ret, int32_t op_errno) +{ +	afr_local_t     *local = NULL; +	afr_private_t   *priv  = NULL; +	afr_self_heal_t *sh = NULL; +	int              call_count = 0; + +	local = frame->local; +	sh = 
&local->self_heal; +	priv = this->private; + +	LOCK (&frame->lock); +	{ +	} +	UNLOCK (&frame->lock); + +	call_count = afr_frame_return (frame); + +	if (call_count == 0) { +		/* last flush reply: drop the healing fd and finish */ +		fd_unref (sh->healing_fd); +		sh->healing_fd = NULL; +		afr_sh_data_done (frame, this); +	} + +	return 0; +} + +/* Flush the healing fd on the source and on every up child that is not a source, expecting sh->active_sinks + 1 replies in afr_sh_data_flush_cbk. With no healing fd, go straight to afr_sh_data_done. */ +int +afr_sh_data_close (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t     *local = NULL; +	afr_private_t   *priv  = NULL; +	afr_self_heal_t *sh  = NULL; +	int              i = 0; +	int              call_count = 0; + + +	local = frame->local; +	sh = &local->self_heal; +	priv = this->private; + +	if (!sh->healing_fd) { +		afr_sh_data_done (frame, this); +		return 0; +	} + +	call_count = sh->active_sinks + 1; +	local->call_count = call_count; + + +	/* flush on the source first */ +	gf_log (this->name, GF_LOG_DEBUG, +		"closing fd of %s on %s", +		local->loc.path, priv->children[sh->source]->name); + +	STACK_WIND_COOKIE (frame, afr_sh_data_flush_cbk, +			   (void *) (long) sh->source, +			   priv->children[sh->source], +			   priv->children[sh->source]->fops->flush, +			   sh->healing_fd); +	call_count--; + +	for (i = 0; i < priv->child_count; i++) { +		if (sh->sources[i] || !local->child_up[i]) +			continue; + +		gf_log (this->name, GF_LOG_DEBUG, +			"closing fd of %s on %s", +			local->loc.path, priv->children[i]->name); + +		STACK_WIND_COOKIE (frame, afr_sh_data_flush_cbk, +				   (void *) (long) i, +				   priv->children[i], +				   priv->children[i]->fops->flush, +				   sh->healing_fd); +		if (!--call_count) +			break; +	} + +	return 0; +} + +/* Callback for the inodelk unlock requests wound by afr_sh_data_unlock: log the outcome per child (cookie is the child index) and, after the last reply, go on to afr_sh_data_close. An unlock failure is only logged. */ +int +afr_sh_data_unlck_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +		       int32_t op_ret, int32_t op_errno) +{ +	afr_local_t * local = NULL; +	int           call_count = 0; +	int           child_index = (long) cookie; + +	 +	local = frame->local; + +	LOCK (&frame->lock); +	{ +		if (op_ret == -1) { +			gf_log (this->name, GF_LOG_ERROR,  +				"unlocking inode of %s on child %d failed: %s", +				local->loc.path, child_index, +				
strerror (op_errno)); +		} else { +			gf_log (this->name, GF_LOG_DEBUG, +				"unlocked inode of %s on child %d", +				local->loc.path, child_index); +		} +	} +	UNLOCK (&frame->lock); + +	call_count = afr_frame_return (frame); + +	if (call_count == 0) { +		afr_sh_data_close (frame, this); +	} + +	return 0; +} + +/* Send an F_UNLCK inodelk covering the whole file (l_start = 0, l_len = 0) to every child that is up; replies go to afr_sh_data_unlck_cbk. */ +int +afr_sh_data_unlock (call_frame_t *frame, xlator_t *this) +{ +	struct flock flock = {0, };	/* zero l_whence and l_pid too instead of sending stack garbage */ +	int i = 0;				 +	int call_count = 0;		      + +	afr_local_t *   local = NULL; +	afr_private_t * priv  = NULL; +	afr_self_heal_t * sh  = NULL; + + +	local = frame->local; +	sh = &local->self_heal; +	priv = this->private; + +	call_count = local->child_count; + +	local->call_count = call_count;		 + +	flock.l_start = 0; +	flock.l_len   = 0; +	flock.l_type  = F_UNLCK; + +	for (i = 0; i < priv->child_count; i++) { +		if (local->child_up[i]) { +			gf_log (this->name, GF_LOG_DEBUG, +				"unlocking %s on subvolume %s", +				local->loc.path, priv->children[i]->name); + +			STACK_WIND_COOKIE (frame, afr_sh_data_unlck_cbk, +					   (void *) (long) i, +					   priv->children[i],  +					   priv->children[i]->fops->inodelk, +					   &local->loc, F_SETLK, &flock);  +			if (!--call_count) +				break; +		} +	} + +	return 0; +} + +/* Common exit of data self-heal, for success and failure alike: log and start the unlock -> close -> done chain. */ +int +afr_sh_data_finish (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t   *local = NULL; + +	local = frame->local; + +	gf_log (this->name, GF_LOG_DEBUG, +		"finishing data selfheal of %s", local->loc.path); + +	afr_sh_data_unlock (frame, this); + +	return 0; +} + +/* Callback for the xattrop calls wound by afr_sh_data_erase_pending; the result is not inspected. After the last reply it calls afr_sh_data_finish. */ +int +afr_sh_data_erase_pending_cbk (call_frame_t *frame, void *cookie, +			       xlator_t *this, int32_t op_ret, +			       int32_t op_errno, dict_t *xattr) +{ +	afr_local_t     *local = NULL; +	afr_self_heal_t *sh = NULL; +	afr_private_t   *priv = NULL; +	int             call_count = 0; + +	local = frame->local; +	sh = &local->self_heal; +	priv = this->private; + +	LOCK (&frame->lock); +	{ +	} +	UNLOCK (&frame->lock); + +	call_count = afr_frame_return (frame); + +	if (call_count == 0) +		
afr_sh_data_finish (frame, this); + +	return 0; +} + +/* Turn the pending matrix into deltas (afr_sh_pending_to_delta), pack them under AFR_DATA_PENDING (afr_sh_delta_to_xattr) and apply them with xattrop GF_XATTROP_ADD_ARRAY on every child whose lookup returned an xattr dict. Replies go to afr_sh_data_erase_pending_cbk. */ +int +afr_sh_data_erase_pending (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t     *local = NULL; +	afr_self_heal_t *sh = NULL; +	afr_private_t   *priv = NULL; +	int              call_count = 0; +	int              i = 0; +	dict_t          **erase_xattr = NULL; + + +	local = frame->local; +	sh = &local->self_heal; +	priv = this->private; + + +	afr_sh_pending_to_delta (sh->pending_matrix, sh->delta_matrix, +				 sh->success, priv->child_count); +	/* NOTE(review): neither this CALLOC nor get_new_dict() below is checked for NULL */ +	erase_xattr = CALLOC (sizeof (*erase_xattr), priv->child_count); + +	for (i = 0; i < priv->child_count; i++) { +		if (sh->xattr[i]) { +			call_count++; + +			erase_xattr[i] = get_new_dict(); +			dict_ref (erase_xattr[i]); +		} +	} + +	afr_sh_delta_to_xattr (sh->delta_matrix, erase_xattr, +			       priv->child_count, AFR_DATA_PENDING); + +	local->call_count = call_count; +	for (i = 0; i < priv->child_count; i++) { +		if (!erase_xattr[i]) +			continue; + +		gf_log (this->name, GF_LOG_DEBUG, +			"erasing pending flags from %s on %s", +			local->loc.path, priv->children[i]->name); + +		STACK_WIND_COOKIE (frame, afr_sh_data_erase_pending_cbk, +				   (void *) (long) i, +				   priv->children[i], +				   priv->children[i]->fops->xattrop, +				   &local->loc, +				   GF_XATTROP_ADD_ARRAY, erase_xattr[i]); +		if (!--call_count) +			break; +	} +	/* drop the references taken above and free the temporary array */ +	for (i = 0; i < priv->child_count; i++) { +		if (erase_xattr[i]) { +			dict_unref (erase_xattr[i]); +		} +	} +	FREE (erase_xattr); + +	return 0; +} + +/* Callback for the ftruncate calls wound by afr_sh_data_trim_sinks; cookie carries the child index. */ +int +afr_sh_data_trim_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +		      int32_t op_ret, int32_t op_errno, struct stat *buf) +{ +	afr_private_t * priv = NULL; +	afr_local_t * local  = NULL; +	afr_self_heal_t *sh  = NULL; +	int              call_count = 0; +	int              child_index = 0; + +	priv = this->private; +	local = frame->local; +	sh = &local->self_heal; + +	child_index = (long) cookie; + +	LOCK (&frame->lock); +	{ +		if (op_ret == -1) +			gf_log 
(this->name, GF_LOG_ERROR, +				"ftruncate of %s on subvolume %s failed (%s)", +				local->loc.path, +				priv->children[child_index]->name, +				strerror (op_errno)); +		else +			gf_log (this->name, GF_LOG_DEBUG, +				"ftruncate of %s on subvolume %s completed", +				local->loc.path, +				priv->children[child_index]->name); +	} +	UNLOCK (&frame->lock); + +	call_count = afr_frame_return (frame); +	/* NOTE(review): a failed ftruncate is only logged; the pending flags are erased regardless */ +	if (call_count == 0) { +		afr_sh_data_erase_pending (frame, this); +	} + +	return 0; +} + +/* Truncate the healing fd to sh->file_size on every up child that is not a source; the sh->active_sinks replies go to afr_sh_data_trim_cbk. */ +int +afr_sh_data_trim_sinks (call_frame_t *frame, xlator_t *this) +{ +	afr_private_t * priv = NULL; +	afr_local_t * local  = NULL; +	afr_self_heal_t *sh  = NULL; +	int             *sources = NULL; +	int              call_count = 0; +	int              i = 0; + + +	priv = this->private; +	local = frame->local; +	sh = &local->self_heal; + +	sources = sh->sources; +	call_count = sh->active_sinks; + +	local->call_count = call_count; + +	for (i = 0; i < priv->child_count; i++) { +		if (sources[i] || !local->child_up[i]) +			continue; + +		STACK_WIND_COOKIE (frame, afr_sh_data_trim_cbk, +				   (void *) (long) i, +				   priv->children[i],  +				   priv->children[i]->fops->ftruncate, +				   sh->healing_fd, sh->file_size);  + +		if (!--call_count) +			break; +	} + +	return 0; +} + + +int +afr_sh_data_read_write_iter (call_frame_t *frame, xlator_t *this); + +/* Callback for the writev to one sink (cookie = child index): a failure sets sh->op_failed. After the last sink replies, afr_sh_data_read_write_iter decides whether to continue. */ +int +afr_sh_data_write_cbk (call_frame_t *frame, void *cookie, xlator_t *this,  +		       int32_t op_ret, int32_t op_errno, struct stat *buf) +{ +	afr_private_t * priv = NULL; +	afr_local_t * local  = NULL; +	afr_self_heal_t *sh  = NULL; + +	int child_index = (long) cookie; +	int call_count = 0; + +	priv = this->private; +	local = frame->local; +	sh = &local->self_heal; + +	gf_log (this->name, GF_LOG_DEBUG,  +		"wrote %d bytes of data from %s to child %d, offset %"PRId64"",  +		op_ret, local->loc.path, child_index, sh->offset - op_ret); + +	LOCK (&frame->lock); +	{ +		if (op_ret == -1) { +			gf_log (this->name, GF_LOG_ERROR, +				
"write to %s failed on subvolume %s (%s)", +				local->loc.path, +				priv->children[child_index]->name, +				strerror (op_errno)); +			sh->op_failed = 1; +		} +	} +	UNLOCK (&frame->lock); + +	call_count = afr_frame_return (frame); + +	if (call_count == 0) { +		afr_sh_data_read_write_iter (frame, this); +	} + +	return 0; +} + +/* Callback for the readv on the source (cookie = source child index). A read error aborts the heal through afr_sh_data_finish; end of file (op_ret == 0) moves on to trimming the sinks; otherwise sh->offset advances by op_ret and the data is written to every sink at the offset it was read from. An all-zero block of a file with holes is skipped instead of written. Always returns 0. */ +int +afr_sh_data_read_cbk (call_frame_t *frame, void *cookie, +		      xlator_t *this, int32_t op_ret, int32_t op_errno, +		      struct iovec *vector, int32_t count, struct stat *buf) +{ +	afr_private_t * priv = NULL; +	afr_local_t * local  = NULL; +	afr_self_heal_t *sh  = NULL; + +	int child_index = (long) cookie; +	int i = 0; +	int call_count = 0; + +	off_t offset; + +	priv = this->private; +	local = frame->local; +	sh = &local->self_heal; + +	call_count = sh->active_sinks; + +	local->call_count = call_count; + +	gf_log (this->name, GF_LOG_DEBUG,  +		"read %d bytes of data from %s on child %d, offset %"PRId64"", +		op_ret, local->loc.path, child_index, sh->offset); + +	if (op_ret < 0) { +		/* a failed read is not end-of-file: abort like a failed write does, +		   instead of truncating the sinks and erasing the pending flags */ +		gf_log (this->name, GF_LOG_ERROR, +			"read from %s failed on subvolume %s (%s)", +			local->loc.path, +			priv->children[child_index]->name, +			strerror (op_errno)); +		sh->op_failed = 1; +		afr_sh_data_finish (frame, this); +		return 0; +	} + +	if (op_ret == 0) { +		/* end of file on the source */ +		afr_sh_data_trim_sinks (frame, this); +		return 0; +	} + +	/* what if we read less than block size? 
*/ +	offset = sh->offset; +	sh->offset += op_ret; + +	frame->root->req_refs = frame->root->rsp_refs; + +	if (sh->file_has_holes) { +		if (iov_0filled (vector, count) == 0) { +			/* the iter function depends on the +			   sh->offset already being updated  +			   above +			*/ +			afr_sh_data_read_write_iter (frame, this); +			goto out; +		} +	} + +	for (i = 0; i < priv->child_count; i++) { +		if (sh->sources[i] || !local->child_up[i]) +			continue; + +		/* this is a sink, so write to it */ +		STACK_WIND_COOKIE (frame, afr_sh_data_write_cbk, +				   (void *) (long) i, +				   priv->children[i], +				   priv->children[i]->fops->writev, +				   sh->healing_fd, vector, count, offset); + +		if (!--call_count) +			break; +	} + +out: +	return 0; +} + +/* Read the next sh->block_size bytes at sh->offset from the source child; the reply goes to afr_sh_data_read_cbk. */ +int +afr_sh_data_read_write (call_frame_t *frame, xlator_t *this) +{ +	afr_private_t * priv = NULL; +	afr_local_t * local  = NULL; +	afr_self_heal_t *sh  = NULL; + +	priv = this->private; +	local = frame->local; +	sh = &local->self_heal; + +	STACK_WIND_COOKIE (frame, afr_sh_data_read_cbk, +			   (void *) (long) sh->source, +			   priv->children[sh->source], +			   priv->children[sh->source]->fops->readv, +			   sh->healing_fd, sh->block_size, +			   sh->offset); + +	return 0; +} + +/* One step of the copy loop: stop through afr_sh_data_finish if sh->op_failed is set, trim the sinks once sh->offset has reached sh->file_size, otherwise read the next block. */ +int +afr_sh_data_read_write_iter (call_frame_t *frame, xlator_t *this) +{ +	afr_private_t * priv = NULL; +	afr_local_t * local  = NULL; +	afr_self_heal_t *sh  = NULL; + +	priv = this->private; +	local = frame->local; +	sh = &local->self_heal; + +	if (sh->op_failed) { +		afr_sh_data_finish (frame, this); +		goto out; +	} + +	if (sh->offset >= sh->file_size) { +		gf_log (this->name, GF_LOG_DEBUG,  +			"closing fd's of %s", +			local->loc.path); +		afr_sh_data_trim_sinks (frame, this); + +		goto out; +	} + +	afr_sh_data_read_write (frame, this); + +out: +	return 0; +} + + +int +afr_sh_data_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +		      int32_t op_ret, int32_t op_errno, fd_t *fd) +{ +	afr_local_t     *local = NULL; +	
afr_self_heal_t *sh = NULL; +	afr_private_t   *priv = NULL; +	int              call_count = 0; +	int              child_index = 0; + +	local = frame->local; +	sh = &local->self_heal; +	priv = this->private; + +	child_index = (long) cookie; + +	/* TODO: some of the open's might fail. +	   In that case, modify cleanup fn to send flush on those  +	   fd's which are already open */ + +	LOCK (&frame->lock); +	{ +		if (op_ret == -1) { +			gf_log (this->name, GF_LOG_ERROR, +				"open of %s failed on child %s (%s)", +				local->loc.path, +				priv->children[child_index]->name, +				strerror (op_errno)); +			sh->op_failed = 1; +		} + +	} +	UNLOCK (&frame->lock); + +	call_count = afr_frame_return (frame); +	/* after the last open reply: abort if any open failed, otherwise start copying from the source */ +	if (call_count == 0) { +		if (sh->op_failed) { +			afr_sh_data_finish (frame, this); +			return 0; +		} +		gf_log (this->name, GF_LOG_DEBUG, +			"fd for %s opened, commencing sync", +			local->loc.path); + +		gf_log (this->name, GF_LOG_WARNING, +			"sourcing file %s from %s to other sinks", +			local->loc.path, priv->children[sh->source]->name); + +		afr_sh_data_read_write (frame, this); +	} + +	return 0; +} + +/* Create the healing fd, record the block size (65536), the source's file size and whether the source has holes, then open the file read-only on the source and write-only on every up child that is not a source. The sh->active_sinks + 1 replies go to afr_sh_data_open_cbk. */ +int +afr_sh_data_open (call_frame_t *frame, xlator_t *this) +{ +	int i = 0;				 +	int call_count = 0;		      + +	int source = -1; +	int *sources = NULL; + +	fd_t *fd = NULL; + +	afr_local_t *   local = NULL; +	afr_private_t * priv  = NULL; +	afr_self_heal_t *sh = NULL; + +	local = frame->local; +	sh = &local->self_heal; +	priv = this->private; + +	call_count = sh->active_sinks + 1; +	local->call_count = call_count; +	/* NOTE(review): the fd_create result is not checked for NULL */ +	fd = fd_create (local->loc.inode, frame->root->pid); +	sh->healing_fd = fd; + +	source  = local->self_heal.source; +	sources = local->self_heal.sources; + +	sh->block_size = 65536; +	sh->file_size  = sh->buf[source].st_size; + +	if (FILE_HAS_HOLES (&sh->buf[source])) +		sh->file_has_holes = 1; + +	/* open source */ +	STACK_WIND_COOKIE (frame, afr_sh_data_open_cbk, +			   (void *) (long) source, +			   priv->children[source], +			   
priv->children[source]->fops->open, +			   &local->loc, O_RDONLY|O_LARGEFILE, fd); +	call_count--; + +	/* open sinks */ +	for (i = 0; i < priv->child_count; i++) { +		if(sources[i] || !local->child_up[i]) +			continue; + +		STACK_WIND_COOKIE (frame, afr_sh_data_open_cbk, +				   (void *) (long) i, +				   priv->children[i],  +				   priv->children[i]->fops->open, +				   &local->loc,  +				   O_WRONLY|O_LARGEFILE, fd);  + +		if (!--call_count) +			break; +	} + +	return 0; +} + +/* Count the active sinks (up children that are not sources), set sh->success[] for them and for the source, and start the sync with afr_sh_data_open. With no active sink, go straight to afr_sh_data_finish. */ +int +afr_sh_data_sync_prepare (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t     *local = NULL; +	afr_self_heal_t *sh = NULL; +	afr_private_t   *priv = NULL; +	int              active_sinks = 0; +	int              source = 0; +	int              i = 0; + +	local = frame->local; +	sh = &local->self_heal; +	priv = this->private; + +	source = sh->source; + +	for (i = 0; i < priv->child_count; i++) { +		if (sh->sources[i] == 0 && local->child_up[i] == 1) { +			active_sinks++; +			sh->success[i] = 1; +		} +	} +	sh->success[source] = 1; + +	if (active_sinks == 0) { +		gf_log (this->name, GF_LOG_DEBUG, +			"no active sinks for performing self-heal on file %s", +			local->loc.path); +		afr_sh_data_finish (frame, this); +		return 0; +	} +	sh->active_sinks = active_sinks; + +	gf_log (this->name, GF_LOG_DEBUG, +		"syncing data of %s from subvolume %s to %d active sinks", +		local->loc.path, priv->children[source]->name, active_sinks); + +	afr_sh_data_open (frame, this); + +	return 0; +} + +/* Decide which children are sources and which are sinks from the AFR_DATA_PENDING xattrs gathered by the lookup, then start the sync. */ +int +afr_sh_data_fix (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t     *local = NULL; +	afr_self_heal_t *sh = NULL; +	afr_private_t   *priv = NULL; +	int              nsources = 0; +	int              source = 0; +	int              i = 0; + +	local = frame->local; +	sh = &local->self_heal; +	priv = this->private; + +	afr_sh_build_pending_matrix (sh->pending_matrix, sh->xattr,  +				     priv->child_count, AFR_DATA_PENDING); + +	afr_sh_print_pending_matrix (sh->pending_matrix, this); + + +	
afr_sh_mark_sources (sh->pending_matrix, sh->sources,  +			     priv->child_count); + +	afr_sh_supress_empty_children (sh->sources, sh->xattr, sh->buf, +				       priv->child_count, AFR_DATA_PENDING); + +	afr_sh_supress_errenous_children (sh->sources, sh->child_errno, +					  priv->child_count); + +	nsources = afr_sh_source_count (sh->sources, priv->child_count); +	/* no source could be determined: fall back to the configured favorite child, provided it reported no error */ +	if ((nsources == 0) +	    && (priv->favorite_child != -1) +	    && (sh->child_errno[priv->favorite_child] == 0)) { + +		gf_log (this->name, GF_LOG_WARNING, +			"Picking favorite child %s as authentic source to resolve conflicting data of %s", +			priv->children[priv->favorite_child]->name, +			local->loc.path); + +		sh->sources[priv->favorite_child] = 1; + +		nsources = afr_sh_source_count (sh->sources, +						priv->child_count); +	} +	/* still no source: give up, flag it in local->govinda_gOvinda and finish without syncing */ +	if (nsources == 0) { +		gf_log (this->name, GF_LOG_ERROR, +			"Unable to resolve conflicting data of %s. " +			"Please resolve manually by deleting the file %s " +			"from all but the preferred subvolume. 
" +			"Please consider 'option favorite-child <>'", +			local->loc.path, local->loc.path); + +		local->govinda_gOvinda = 1; + +		afr_sh_data_finish (frame, this); +		return 0; +	} + +	source = afr_sh_select_source (sh->sources, priv->child_count); +	sh->source = source; + +	/* detect changes not visible through pending flags -- JIC: a child whose stat fails SIZE_DIFFERS against the chosen source is demoted to a sink */ +	for (i = 0; i < priv->child_count; i++) { +		if (i == source || sh->child_errno[i]) +			continue; + +		if (SIZE_DIFFERS (&sh->buf[i], &sh->buf[source])) +			sh->sources[i] = 0; +	} + +	afr_sh_data_sync_prepare (frame, this); + +	return 0; +} + +/* Callback for the lookups wound by afr_sh_data_lookup (cookie = child index): on success keep a reference to the returned xattr dict and a copy of the stat buffer for that child. After the last reply, run afr_sh_data_fix. NOTE(review): a failed lookup is not recorded in sh->child_errno here. */ +int +afr_sh_data_lookup_cbk (call_frame_t *frame, void *cookie, +			xlator_t *this, int32_t op_ret, int32_t op_errno, +			inode_t *inode, struct stat *buf, dict_t *xattr) +{ +	afr_private_t   *priv  = NULL; +	afr_local_t     *local = NULL; +	afr_self_heal_t *sh = NULL; + +	int call_count  = -1; +	int child_index = (long) cookie; + +	local = frame->local; +	sh = &local->self_heal; +	priv = this->private; + +	LOCK (&frame->lock); +	{ +		if (op_ret != -1) { +			sh->xattr[child_index] = dict_ref (xattr); +			sh->buf[child_index] = *buf; +		} +	} +	UNLOCK (&frame->lock); + +	call_count = afr_frame_return (frame); + +	if (call_count == 0) { +		afr_sh_data_fix (frame, this); +	} + +	return 0; +} + +/* Look the file up on every child that is up, asking for the AFR_DATA_PENDING xattr (request value: child_count * sizeof(int32_t)). The lookups are sent even if the request dict could not be created, and ret is never checked. */ +int +afr_sh_data_lookup (call_frame_t *frame, xlator_t *this) +{ +	afr_self_heal_t *sh    = NULL;  +	afr_local_t     *local = NULL; +	afr_private_t   *priv  = NULL; +	dict_t          *xattr_req = NULL; + +	int call_count = 0; +	int i = 0; +	int ret = 0; + +	priv  = this->private; +	local = frame->local; +	sh    = &local->self_heal; + +	call_count = local->child_count; + +	local->call_count = call_count; +	 +	xattr_req = dict_new(); +	if (xattr_req) +		ret = dict_set_uint64 (xattr_req, AFR_DATA_PENDING, +				       priv->child_count * sizeof(int32_t)); + +	for (i = 0; i < priv->child_count; i++) { +		if (local->child_up[i]) { +			STACK_WIND_COOKIE (frame, afr_sh_data_lookup_cbk, +					   (void *) 
(long) i, +					   priv->children[i],  +					   priv->children[i]->fops->lookup, +					   &local->loc, xattr_req); +			if (!--call_count) +				break; +		} +	} +	 +	if (xattr_req) +		dict_unref (xattr_req); + +	return 0; +} + +/* Callback for the inodelk requests wound by afr_sh_data_lock (cookie = child index): a failure sets sh->op_failed. After the last reply, either abort through afr_sh_data_finish or continue with afr_sh_data_lookup. */ +int +afr_sh_data_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +		      int32_t op_ret, int32_t op_errno) +{ +	afr_local_t     *local = NULL; +	afr_self_heal_t *sh = NULL; +	int              call_count = 0; +	int              child_index = (long) cookie; + +	/* TODO: what if lock fails? */ +	 +	local = frame->local; +	sh = &local->self_heal; + +	LOCK (&frame->lock); +	{ +		if (op_ret == -1) { +			sh->op_failed = 1; + +			gf_log (this->name, +				(op_errno == EAGAIN ? GF_LOG_DEBUG : GF_LOG_ERROR), +				"locking of %s on child %d failed: %s", +				local->loc.path, child_index, +				strerror (op_errno)); +		} else { +			gf_log (this->name, GF_LOG_DEBUG, +				"inode of %s on child %d locked", +				local->loc.path, child_index); +		} +	} +	UNLOCK (&frame->lock); + +	call_count = afr_frame_return (frame); + +	if (call_count == 0) { +		if (sh->op_failed) { +			afr_sh_data_finish (frame, this); +			return 0; +		} + +		afr_sh_data_lookup (frame, this); +	} + +	return 0; +} + +/* Request a non-blocking (F_SETLK) write lock covering the whole file (l_start = 0, l_len = 0) from every child that is up; replies go to afr_sh_data_lock_cbk. Always returns 0. */ +int +afr_sh_data_lock (call_frame_t *frame, xlator_t *this) +{ +	struct flock flock = {0, };	/* zero l_whence and l_pid too instead of sending stack garbage */ +	int i = 0;				 +	int call_count = 0;		      + +	afr_local_t *   local = NULL; +	afr_private_t * priv  = NULL; +	afr_self_heal_t * sh  = NULL; + + +	local = frame->local; +	sh = &local->self_heal; +	priv = this->private; + +	call_count = local->child_count; + +	local->call_count = call_count;		 + +	flock.l_start = 0; +	flock.l_len   = 0; +	flock.l_type  = F_WRLCK;			 + +	for (i = 0; i < priv->child_count; i++) { +		if (local->child_up[i]) { +			gf_log (this->name, GF_LOG_DEBUG, +				"locking %s on subvolume %s", +				local->loc.path, priv->children[i]->name); + +			STACK_WIND_COOKIE (frame, afr_sh_data_lock_cbk, +					   (void *) (long) i, +					   priv->children[i], 
+					   priv->children[i]->fops->inodelk, +					   &local->loc, F_SETLK, &flock);  +			if (!--call_count) +				break; +		} +	} + +	return 0; +} + +/* Entry point of data self-heal: start by locking when the file needs a data heal and priv->data_self_heal is enabled, otherwise report completion at once through afr_sh_data_done. */ +int +afr_self_heal_data (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t   *local = NULL; +	afr_self_heal_t *sh = NULL; +	afr_private_t *priv = this->private; + + +	local = frame->local; +	sh = &local->self_heal; + +	if (local->need_data_self_heal && priv->data_self_heal) { +		afr_sh_data_lock (frame, this); +	} else { +		gf_log (this->name, GF_LOG_DEBUG, +			"not doing data self heal on %s", +			local->loc.path); +		afr_sh_data_done (frame, this); +	} + +	return 0; +} + diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c new file mode 100644 index 00000000000..ec341922ee7 --- /dev/null +++ b/xlators/cluster/afr/src/afr-self-heal-entry.c @@ -0,0 +1,2038 @@ +/* +  Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> +  This file is part of GlusterFS. + +  GlusterFS is free software; you can redistribute it and/or modify +  it under the terms of the GNU General Public License as published +  by the Free Software Foundation; either version 3 of the License, +  or (at your option) any later version. + +  GlusterFS is distributed in the hope that it will be useful, but +  WITHOUT ANY WARRANTY; without even the implied warranty of +  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU +  General Public License for more details. + +  You should have received a copy of the GNU General Public License +  along with this program.  If not, see +  <http://www.gnu.org/licenses/>. 
+*/ + +#include <libgen.h> +#include <unistd.h> +#include <fnmatch.h> +#include <sys/time.h> +#include <stdlib.h> +#include <signal.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "afr.h" +#include "dict.h" +#include "xlator.h" +#include "hashfn.h" +#include "logging.h" +#include "stack.h" +#include "list.h" +#include "call-stub.h" +#include "defaults.h" +#include "common-utils.h" +#include "compat-errno.h" +#include "compat.h" +#include "byte-order.h" + +#include "afr-transaction.h" +#include "afr-self-heal.h" +#include "afr-self-heal-common.h" + + +/* Last step of entry self-heal: log completion and hand control back through sh->completion_cbk. The sh-> arrays are not released here (see the TODO). */ +int +afr_sh_entry_done (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t     *local = NULL; +	afr_self_heal_t *sh = NULL; +	afr_private_t   *priv = NULL; + +	local = frame->local; +	sh = &local->self_heal; +	priv = this->private; + +	/*  +	   TODO: cleanup sh->*  +	*/ + +	gf_log (this->name, GF_LOG_DEBUG, +		"self heal of %s completed", +		local->loc.path); + +	sh->completion_cbk (frame, this); + +	return 0; +} + +/* Callback for the entrylk unlock requests wound by afr_sh_entry_unlock (cookie = child index): log the outcome and, after the last reply, drop the healing fd if there is one and finish through afr_sh_entry_done. */ +int +afr_sh_entry_unlck_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +			int32_t op_ret, int32_t op_errno) +{ +	afr_local_t     *local = NULL; +	afr_self_heal_t *sh = NULL; +	int           call_count = 0; +	int           child_index = (long) cookie; + +	/* TODO: what if the unlock fails? At present it is only logged. 
*/ +	 +	local = frame->local; +	sh = &local->self_heal; + +	LOCK (&frame->lock); +	{ +		if (op_ret == -1) { +			gf_log (this->name, GF_LOG_ERROR,  +				"unlocking inode of %s on child %d failed: %s", +				local->loc.path, child_index, +				strerror (op_errno)); +		} else { +			gf_log (this->name, GF_LOG_DEBUG, +				"unlocked inode of %s on child %d", +				local->loc.path, child_index); +		} +	} +	UNLOCK (&frame->lock); + +	call_count = afr_frame_return (frame); + +	if (call_count == 0) { +		if (sh->healing_fd) +			fd_unref (sh->healing_fd); +		sh->healing_fd = NULL; +		afr_sh_entry_done (frame, this); +	} + +	return 0; +} + +/* Send an ENTRYLK_UNLOCK with a NULL basename on local->loc to every child that is up; replies go to afr_sh_entry_unlck_cbk. */ +int +afr_sh_entry_unlock (call_frame_t *frame, xlator_t *this) +{ +	int i = 0;				 +	int call_count = 0;		      + +	afr_local_t *   local = NULL; +	afr_private_t * priv  = NULL; +	afr_self_heal_t * sh  = NULL; + + +	local = frame->local; +	sh = &local->self_heal; +	priv = this->private; + +	call_count = local->child_count; + +	local->call_count = call_count;		 + +	for (i = 0; i < priv->child_count; i++) { +		if (local->child_up[i]) { +			gf_log (this->name, GF_LOG_DEBUG, +				"unlocking %s on subvolume %s", +				local->loc.path, priv->children[i]->name); + +			STACK_WIND_COOKIE (frame, afr_sh_entry_unlck_cbk, +					   (void *) (long) i, +					   priv->children[i],  +					   priv->children[i]->fops->entrylk, +					   &local->loc, NULL, +					   ENTRYLK_UNLOCK, ENTRYLK_WRLCK); +			if (!--call_count) +				break; +		} +	} + +	return 0; +} + +/* Common exit of entry self-heal: log and start the unlock -> done chain. */ +int +afr_sh_entry_finish (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t   *local = NULL; + +	local = frame->local; + +	gf_log (this->name, GF_LOG_DEBUG, +		"finishing entry selfheal of %s", local->loc.path); + +	afr_sh_entry_unlock (frame, this); + +	return 0; +} + +/* Callback for the xattrop calls wound by afr_sh_entry_erase_pending; the result is not inspected. After the last reply it calls afr_sh_entry_finish. */ +int +afr_sh_entry_erase_pending_cbk (call_frame_t *frame, void *cookie, +				xlator_t *this, int32_t op_ret, +				int32_t op_errno, dict_t *xattr) +{ +	afr_local_t     *local = NULL; +	afr_self_heal_t *sh = NULL; +	afr_private_t   
*priv = NULL; +	int             call_count = 0; + +	local = frame->local; +	sh = &local->self_heal; +	priv = this->private; + +	LOCK (&frame->lock); +	{ +	} +	UNLOCK (&frame->lock); + +	call_count = afr_frame_return (frame); + +	if (call_count == 0) +		afr_sh_entry_finish (frame, this); + +	return 0; +} + +/* Entry counterpart of afr_sh_data_erase_pending: turn the pending matrix into deltas, pack them under AFR_ENTRY_PENDING and apply them with xattrop GF_XATTROP_ADD_ARRAY on every child that has an xattr dict in sh->xattr[]. */ +int +afr_sh_entry_erase_pending (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t     *local = NULL; +	afr_self_heal_t *sh = NULL; +	afr_private_t   *priv = NULL; +	int              call_count = 0; +	int              i = 0; +	dict_t          **erase_xattr = NULL; + + +	local = frame->local; +	sh = &local->self_heal; +	priv = this->private; + + +	afr_sh_pending_to_delta (sh->pending_matrix, sh->delta_matrix, +				 sh->success, priv->child_count); +	/* NOTE(review): neither this CALLOC nor get_new_dict() below is checked for NULL */ +	erase_xattr = CALLOC (sizeof (*erase_xattr), priv->child_count); + +	for (i = 0; i < priv->child_count; i++) { +		if (sh->xattr[i]) { +			call_count++; + +			erase_xattr[i] = get_new_dict(); +			dict_ref (erase_xattr[i]); +		} +	} + +	afr_sh_delta_to_xattr (sh->delta_matrix, erase_xattr, +			       priv->child_count, AFR_ENTRY_PENDING); + +	local->call_count = call_count; +	for (i = 0; i < priv->child_count; i++) { +		if (!erase_xattr[i]) +			continue; + +		gf_log (this->name, GF_LOG_DEBUG, +			"erasing pending flags from %s on %s", +			local->loc.path, priv->children[i]->name); + +		STACK_WIND_COOKIE (frame, afr_sh_entry_erase_pending_cbk, +				   (void *) (long) i, +				   priv->children[i], +				   priv->children[i]->fops->xattrop, +				   &local->loc, +				   GF_XATTROP_ADD_ARRAY, erase_xattr[i]); +		if (!--call_count) +			break; +	} +	/* drop the references taken above and free the temporary array */ +	for (i = 0; i < priv->child_count; i++) { +		if (erase_xattr[i]) { +			dict_unref (erase_xattr[i]); +		} +	} +	FREE (erase_xattr); + +	return 0; +} + + +/* Return the child to use as source after current_active_source, or -1 when there is none. With a definite source (sh->source != -1) that source is returned once and -1 afterwards; otherwise the up, non-source children are returned in increasing index order. */ +static int +next_active_source (call_frame_t *frame, xlator_t *this, +		    int current_active_source) +{ +	afr_private_t   *priv = NULL; +	afr_local_t     *local  = NULL; +	afr_self_heal_t *sh  = NULL; +	int              
source = -1; +	int              next_active_source = -1; +	int              i = 0; + +	priv = this->private; +	local = frame->local; +	sh = &local->self_heal; + +	source = sh->source; + +	if (source != -1) { +		if (current_active_source != source) +			next_active_source = source; +		goto out; +	} + +	/* +	  the next active sink becomes the source for the +	  'conservative decision' of merging all entries +	*/ + +	for (i = 0; i < priv->child_count; i++) { +		if ((sh->sources[i] == 0) +		    && (local->child_up[i] == 1) +		    && (i > current_active_source)) { + +			next_active_source = i; +			break; +		} +	} +out: +	return next_active_source; +} + + +/* Return the lowest child index greater than current_active_sink that is up and not marked as a source, or -1 when there is none. */ +static int +next_active_sink (call_frame_t *frame, xlator_t *this, +		  int current_active_sink) +{ +	afr_private_t   *priv = NULL; +	afr_local_t     *local  = NULL; +	afr_self_heal_t *sh  = NULL; +	int              next_active_sink = -1; +	int              i = 0; + +	priv = this->private; +	local = frame->local; +	sh = &local->self_heal; + +	/* +	  scan upwards from current_active_sink for the next child +	  that is up and is not a source +	*/ + +	for (i = 0; i < priv->child_count; i++) { +		if ((sh->sources[i] == 0) +		    && (local->child_up[i] == 1) +		    && (i > current_active_sink)) { + +			next_active_sink = i; +			break; +		} +	} + +	return next_active_sink; +} + + +int +build_child_loc (xlator_t *this, loc_t *child, loc_t *parent, char *name) +{ +	int   ret = -1; + +	if (!child) { +		goto out; +	} + +	if (strcmp (parent->path, "/") == 0) +		asprintf ((char **)&child->path, "/%s", name); +	else +		asprintf ((char **)&child->path, "%s/%s", parent->path, name); + +	if (!child->path) { +		gf_log (this->name, GF_LOG_ERROR, +			"out of memory :("); +		goto out; +	} + +	child->name = strrchr (child->path, '/'); +	if (child->name) +		child->name++; + +	child->parent = inode_ref (parent->inode); +	child->inode = inode_new (parent->inode->table); + +	if (!child->inode) { +		gf_log (this->name, GF_LOG_ERROR, +			
"out of memory :("); +		goto out; +	} + +	ret = 0; +out: +	if (ret == -1) +		loc_wipe (child); + +	return ret; +} + + +int +afr_sh_entry_expunge_all (call_frame_t *frame, xlator_t *this); + +int +afr_sh_entry_expunge_subvol (call_frame_t *frame, xlator_t *this, +			     int active_src); + +int +afr_sh_entry_expunge_entry_done (call_frame_t *frame, xlator_t *this, +				 int active_src) +{ +	afr_private_t   *priv = NULL; +	afr_local_t     *local  = NULL; +	afr_self_heal_t *sh  = NULL; +	int              call_count = 0; + +	priv = this->private; +	local = frame->local; +	sh = &local->self_heal; + +	LOCK (&frame->lock); +	{ +	} +	UNLOCK (&frame->lock); + +	call_count = afr_frame_return (frame); + +	if (call_count == 0) +		afr_sh_entry_expunge_subvol (frame, this, active_src); + +	return 0; +} + + +int +afr_sh_entry_expunge_remove_cbk (call_frame_t *expunge_frame, void *cookie, +				 xlator_t *this, +				 int32_t op_ret, int32_t op_errno) +{ +	afr_private_t   *priv = NULL; +	afr_local_t     *expunge_local = NULL; +	afr_self_heal_t *expunge_sh = NULL; +	int              active_src = 0; +	call_frame_t    *frame = NULL; + + +	priv = this->private; +	expunge_local = expunge_frame->local; +	expunge_sh = &expunge_local->self_heal; +	frame = expunge_sh->sh_frame; + +	active_src = (long) cookie; + +	if (op_ret == 0) { +		gf_log (this->name, GF_LOG_DEBUG, +			"removed %s on %s", +			expunge_local->loc.path, +			priv->children[active_src]->name); +	} else { +		gf_log (this->name, GF_LOG_ERROR, +			"removing %s on %s failed (%s)", +			expunge_local->loc.path, +			priv->children[active_src]->name, +			strerror (op_errno)); +	} + +	AFR_STACK_DESTROY (expunge_frame); +	afr_sh_entry_expunge_entry_done (frame, this, active_src); + +	return 0; +} + + +int +afr_sh_entry_expunge_rmdir (call_frame_t *expunge_frame, xlator_t *this, +			     int active_src) +{ +	afr_private_t   *priv = NULL; +	afr_local_t     *expunge_local = NULL; + +	priv = this->private; +	expunge_local = 
expunge_frame->local; + +	gf_log (this->name, GF_LOG_WARNING, +		"removing directory %s on %s", +		expunge_local->loc.path, priv->children[active_src]->name); + +	STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_remove_cbk, +			   (void *) (long) active_src, +			   priv->children[active_src], +			   priv->children[active_src]->fops->rmdir, +			   &expunge_local->loc); + +	return 0; +} + + +int +afr_sh_entry_expunge_unlink (call_frame_t *expunge_frame, xlator_t *this, +			     int active_src) +{ +	afr_private_t   *priv = NULL; +	afr_local_t     *expunge_local = NULL; + +	priv = this->private; +	expunge_local = expunge_frame->local; + +	gf_log (this->name, GF_LOG_WARNING, +		"unlinking file %s on %s", +		expunge_local->loc.path, priv->children[active_src]->name); +	 +	STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_remove_cbk, +			   (void *) (long) active_src, +			   priv->children[active_src], +			   priv->children[active_src]->fops->unlink, +			   &expunge_local->loc); + +	return 0; +} + + +int +afr_sh_entry_expunge_remove (call_frame_t *expunge_frame, xlator_t *this, +			     int active_src, struct stat *buf) +{ +	afr_private_t   *priv = NULL; +	afr_local_t     *expunge_local = NULL; +	afr_self_heal_t *expunge_sh = NULL; +	int              source = 0; +	call_frame_t    *frame = NULL; +	int              type = 0; + +	priv = this->private; +	expunge_local = expunge_frame->local; +	expunge_sh = &expunge_local->self_heal; +	frame = expunge_sh->sh_frame; +	source = expunge_sh->source; + +	type = (buf->st_mode & S_IFMT); + +	switch (type) { +	case S_IFSOCK: +	case S_IFREG: +	case S_IFBLK: +	case S_IFCHR: +	case S_IFIFO: +	case S_IFLNK: +		afr_sh_entry_expunge_unlink (expunge_frame, this, active_src); + +		break; +	case S_IFDIR: +		afr_sh_entry_expunge_rmdir (expunge_frame, this, active_src); +		break; +	default: +		gf_log (this->name, GF_LOG_ERROR, +			"%s has unknown file type on %s: 0%o", +			expunge_local->loc.path, +			priv->children[source]->name, 
type); +		goto out; +		break; +	} + +	return 0; +out: +	AFR_STACK_DESTROY (expunge_frame); +	afr_sh_entry_expunge_entry_done (frame, this, active_src); + +	return 0; +} + + +int +afr_sh_entry_expunge_lookup_cbk (call_frame_t *expunge_frame, void *cookie, +				xlator_t *this, +				int32_t op_ret,	int32_t op_errno, +				inode_t *inode, struct stat *buf, dict_t *x) +{ +	afr_private_t   *priv = NULL; +	afr_local_t     *expunge_local = NULL; +	afr_self_heal_t *expunge_sh = NULL; +	call_frame_t    *frame = NULL; +	int              active_src = 0; + +	priv = this->private; +	expunge_local = expunge_frame->local; +	expunge_sh = &expunge_local->self_heal; +	frame = expunge_sh->sh_frame; +	active_src = (long) cookie; + +	if (op_ret == -1) { +		gf_log (this->name, GF_LOG_ERROR, +			"lookup of %s on %s failed (%s)", +			expunge_local->loc.path, +			priv->children[active_src]->name, +			strerror (op_errno)); +		goto out; +	} + +	afr_sh_entry_expunge_remove (expunge_frame, this, active_src, buf); + +	return 0; +out: +	AFR_STACK_DESTROY (expunge_frame); +	afr_sh_entry_expunge_entry_done (frame, this, active_src); + +	return 0; +} + + +int +afr_sh_entry_expunge_purge (call_frame_t *expunge_frame, xlator_t *this, +			    int active_src) +{ +	afr_private_t   *priv = NULL; +	afr_local_t     *expunge_local = NULL; + +	priv = this->private; +	expunge_local = expunge_frame->local; + +	gf_log (this->name, GF_LOG_DEBUG, +		"looking up %s on %s", +		expunge_local->loc.path, priv->children[active_src]->name); +	 +	STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_lookup_cbk, +			   (void *) (long) active_src, +			   priv->children[active_src], +			   priv->children[active_src]->fops->lookup, +			   &expunge_local->loc, 0); + +	return 0; +} + + +int +afr_sh_entry_expunge_entry_cbk (call_frame_t *expunge_frame, void *cookie, +				xlator_t *this, +				int32_t op_ret,	int32_t op_errno, +				inode_t *inode, struct stat *buf, dict_t *x) +{ +	afr_private_t   *priv = NULL; +	afr_local_t     
*expunge_local = NULL; +	afr_self_heal_t *expunge_sh = NULL; +	int              source = 0; +	call_frame_t    *frame = NULL; +	int              active_src = 0; + + +	priv = this->private; +	expunge_local = expunge_frame->local; +	expunge_sh = &expunge_local->self_heal; +	frame = expunge_sh->sh_frame; +	active_src = expunge_sh->active_source; +	source = (long) cookie; + +	if (op_ret == -1 && op_errno == ENOENT) { + +		gf_log (this->name, GF_LOG_DEBUG, +			"missing entry %s on %s", +			expunge_local->loc.path, +			priv->children[source]->name); + +		afr_sh_entry_expunge_purge (expunge_frame, this, active_src); + +		return 0; +	} + +	if (op_ret == 0) { +		gf_log (this->name, GF_LOG_DEBUG, +			"%s exists under %s", +			expunge_local->loc.path, +			priv->children[source]->name); +	} else { +		gf_log (this->name, GF_LOG_ERROR, +			"looking up %s under %s failed (%s)", +			expunge_local->loc.path, +			priv->children[source]->name, +			strerror (op_errno)); +	} + +	AFR_STACK_DESTROY (expunge_frame); +	afr_sh_entry_expunge_entry_done (frame, this, active_src); + +	return 0; +} + + +int +afr_sh_entry_expunge_entry (call_frame_t *frame, xlator_t *this, +			    char *name) +{ +	afr_private_t   *priv = NULL; +	afr_local_t     *local  = NULL; +	afr_self_heal_t *sh  = NULL; +	int              ret = -1; +	call_frame_t    *expunge_frame = NULL; +	afr_local_t     *expunge_local = NULL; +	afr_self_heal_t *expunge_sh = NULL; +	int              active_src = 0; +	int              source = 0; +	int              op_errno = 0; + +	priv = this->private; +	local = frame->local; +	sh = &local->self_heal; + +	active_src = sh->active_source; +	source = sh->source; + +	if ((strcmp (name, ".") == 0) +	    || (strcmp (name, "..") == 0)) { +		gf_log (this->name, GF_LOG_DEBUG, +			"skipping inspection of %s under %s", +			name, local->loc.path); +		goto out; +	} + +	gf_log (this->name, GF_LOG_DEBUG, +		"inspecting existance of %s under %s", +		name, local->loc.path); + +	expunge_frame = copy_frame 
(frame); +	if (!expunge_frame) { +		gf_log (this->name, GF_LOG_ERROR, +			"out of memory :("); +		goto out; +	} + +	ALLOC_OR_GOTO (expunge_local, afr_local_t, out); + +	expunge_frame->local = expunge_local; +	expunge_sh = &expunge_local->self_heal; +	expunge_sh->sh_frame = frame; +	expunge_sh->active_source = active_src; + +	ret = build_child_loc (this, &expunge_local->loc, &local->loc, name); +	if (ret != 0) { +		goto out; +	} + +	gf_log (this->name, GF_LOG_DEBUG, +		"looking up %s on %s", expunge_local->loc.path, +		priv->children[source]->name); + +	STACK_WIND_COOKIE (expunge_frame, +			   afr_sh_entry_expunge_entry_cbk, +			   (void *) (long) source, +			   priv->children[source], +			   priv->children[source]->fops->lookup, +			   &expunge_local->loc, 0); + +	ret = 0; +out: +	if (ret == -1) +		afr_sh_entry_expunge_entry_done (frame, this, active_src); + +	return 0; +} + + +int +afr_sh_entry_expunge_readdir_cbk (call_frame_t *frame, void *cookie, +				  xlator_t *this, +				  int32_t op_ret, int32_t op_errno, +				  gf_dirent_t *entries) +{ +	afr_private_t   *priv = NULL; +	afr_local_t     *local  = NULL; +	afr_self_heal_t *sh  = NULL; +	gf_dirent_t     *entry = NULL; +	off_t            last_offset = 0; +	int              active_src = 0; +	int              entry_count = 0; + +	priv = this->private; +	local = frame->local; +	sh = &local->self_heal; + +	active_src = sh->active_source; + +	if (op_ret <= 0) { +		if (op_ret < 0) { +			gf_log (this->name, GF_LOG_ERROR, +				"readdir of %s on subvolume %s failed (%s)", +				local->loc.path, +				priv->children[active_src]->name, +				strerror (op_errno)); +		} else { +			gf_log (this->name, GF_LOG_DEBUG, +				"readdir of %s on subvolume %s complete", +				local->loc.path, +				priv->children[active_src]->name); +		} + +		afr_sh_entry_expunge_all (frame, this); +		return 0; +	} + +	list_for_each_entry (entry, &entries->list, list) { +		last_offset = entry->d_off; +		entry_count++; +	} + +	gf_log (this->name, 
GF_LOG_DEBUG, +		"readdir'ed %d entries from %s", +		entry_count, priv->children[active_src]->name); + +	sh->offset = last_offset; +	local->call_count = entry_count; + +	list_for_each_entry (entry, &entries->list, list) { +		afr_sh_entry_expunge_entry (frame, this, entry->d_name); +	} + +	return 0; +} + +int +afr_sh_entry_expunge_subvol (call_frame_t *frame, xlator_t *this, +			     int active_src) +{ +	afr_private_t   *priv = NULL; +	afr_local_t     *local  = NULL; +	afr_self_heal_t *sh  = NULL; + +	priv = this->private; +	local = frame->local; +	sh = &local->self_heal; + +	STACK_WIND (frame, afr_sh_entry_expunge_readdir_cbk, +		    priv->children[active_src], +		    priv->children[active_src]->fops->readdir, +		    sh->healing_fd, sh->block_size, sh->offset); + +	return 0; +} + + +int +afr_sh_entry_expunge_all (call_frame_t *frame, xlator_t *this) +{ +	afr_private_t   *priv = NULL; +	afr_local_t     *local  = NULL; +	afr_self_heal_t *sh  = NULL; +	int              active_src = -1; + +	priv = this->private; +	local = frame->local; +	sh = &local->self_heal; + +	sh->offset = 0; + +	if (sh->source == -1) { +		gf_log (this->name, GF_LOG_DEBUG, +			"no active sources for %s to expunge entries", +			local->loc.path); +		goto out; +	} + +	active_src = next_active_sink (frame, this, sh->active_source); +	sh->active_source = active_src; + +	if (sh->op_failed) { +		goto out; +	} + +	if (active_src == -1) { +		/* completed creating missing files on all subvolumes */ +		goto out; +	} + +	gf_log (this->name, GF_LOG_DEBUG, +		"expunging entries of %s on %s to other sinks", +		local->loc.path, priv->children[active_src]->name); + +	afr_sh_entry_expunge_subvol (frame, this, active_src); + +	return 0; +out: +	afr_sh_entry_erase_pending (frame, this); +	return 0; + +} + + +int +afr_sh_entry_impunge_all (call_frame_t *frame, xlator_t *this); + +int +afr_sh_entry_impunge_subvol (call_frame_t *frame, xlator_t *this, +			     int active_src); + +int +afr_sh_entry_impunge_entry_done 
(call_frame_t *frame, xlator_t *this, +				 int active_src) +{ +	afr_private_t   *priv = NULL; +	afr_local_t     *local  = NULL; +	afr_self_heal_t *sh  = NULL; +	int              call_count = 0; + +	priv = this->private; +	local = frame->local; +	sh = &local->self_heal; + +	LOCK (&frame->lock); +	{ +	} +	UNLOCK (&frame->lock); + +	call_count = afr_frame_return (frame); + +	if (call_count == 0) +		afr_sh_entry_impunge_subvol (frame, this, active_src); + +	return 0; +} + + +int +afr_sh_entry_impunge_utimens_cbk (call_frame_t *impunge_frame, void *cookie, +				  xlator_t *this, int32_t op_ret, +				  int32_t op_errno, struct stat *stbuf) +{ +	int              call_count = 0; +	afr_private_t   *priv = NULL; +	afr_local_t     *impunge_local = NULL; +	afr_self_heal_t *impunge_sh = NULL; +	call_frame_t    *frame = NULL; +	int              active_src = 0; +	int              child_index = 0; + +	priv = this->private; +	impunge_local = impunge_frame->local; +	impunge_sh = &impunge_local->self_heal; +	frame = impunge_sh->sh_frame; +	child_index = (long) cookie; + +	if (op_ret == 0) { +		gf_log (this->name, GF_LOG_DEBUG, +			"utimes set for %s on %s", +			impunge_local->loc.path, +			priv->children[child_index]->name); +	} else { +		gf_log (this->name, GF_LOG_ERROR, +			"setting utimes of %s on %s failed (%s)", +			impunge_local->loc.path, +			priv->children[child_index]->name, +			strerror (op_errno)); +	} + +	LOCK (&impunge_frame->lock); +	{ +		call_count = --impunge_local->call_count; +	} +	UNLOCK (&impunge_frame->lock); + +	if (call_count == 0) { +		AFR_STACK_DESTROY (impunge_frame); +		afr_sh_entry_impunge_entry_done (frame, this, active_src); +	} + +	return 0; +} + + +int +afr_sh_entry_impunge_chown_cbk (call_frame_t *impunge_frame, void *cookie, +				xlator_t *this, int32_t op_ret, +				int32_t op_errno, struct stat *stbuf) +{ +	int              call_count = 0; +	afr_private_t   *priv = NULL; +	afr_local_t     *impunge_local = NULL; +	afr_self_heal_t *impunge_sh = NULL; 
+	call_frame_t    *frame = NULL; +	int              active_src = 0; +	int              child_index = 0; +	struct timespec  ts[2]; + + +	priv = this->private; +	impunge_local = impunge_frame->local; +	impunge_sh = &impunge_local->self_heal; +	frame = impunge_sh->sh_frame; +	child_index = (long) cookie; + +	if (op_ret == 0) { +		gf_log (this->name, GF_LOG_DEBUG, +			"ownership of %s on %s changed", +			impunge_local->loc.path, +			priv->children[child_index]->name); +	} else { +		gf_log (this->name, GF_LOG_ERROR, +			"setting ownership of %s on %s failed (%s)", +			impunge_local->loc.path, +			priv->children[child_index]->name, +			strerror (op_errno)); +		goto out; +	} + +#ifdef HAVE_STRUCT_STAT_ST_ATIM_TV_NSEC +	ts[0] = impunge_local->cont.lookup.buf.st_atim; +	ts[1] = impunge_local->cont.lookup.buf.st_mtim; +#elif HAVE_STRUCT_STAT_ST_ATIMESPEC_TV_NSEC +	ts[0] = impunge_local->cont.lookup.buf.st_atimespec; +	ts[1] = impunge_local->cont.lookup.buf.st_mtimespec; +#else +	ts[0].tv_sec = impunge_local->cont.lookup.buf.st_atime; +	ts[1].tv_sec = impunge_local->cont.lookup.buf.st_mtime; +#endif +	STACK_WIND_COOKIE (impunge_frame, +			   afr_sh_entry_impunge_utimens_cbk, +			   (void *) (long) child_index, +			   priv->children[child_index], +			   priv->children[child_index]->fops->utimens, +			   &impunge_local->loc, ts); + +	return 0; + +out: +	LOCK (&impunge_frame->lock); +	{ +		call_count = --impunge_local->call_count; +	} +	UNLOCK (&impunge_frame->lock); + +	if (call_count == 0) { +		AFR_STACK_DESTROY (impunge_frame); +		afr_sh_entry_impunge_entry_done (frame, this, active_src); +	} + +	return 0; +} + + +int +afr_sh_entry_impunge_newfile_cbk (call_frame_t *impunge_frame, void *cookie, +				  xlator_t *this, +				  int32_t op_ret, int32_t op_errno, +				  inode_t *inode, struct stat *stbuf) +{ +	int              call_count = 0; +	afr_private_t   *priv = NULL; +	afr_local_t     *impunge_local = NULL; +	afr_self_heal_t *impunge_sh = NULL; +	call_frame_t    *frame = 
NULL; +	int              active_src = 0; +	int              child_index = 0; + +	priv = this->private; +	impunge_local = impunge_frame->local; +	impunge_sh = &impunge_local->self_heal; +	frame = impunge_sh->sh_frame; + +	child_index = (long) cookie; + +	if (op_ret == -1) { +		gf_log (this->name, GF_LOG_ERROR, +			"creation of %s on %s failed (%s)", +			impunge_local->loc.path, +			priv->children[child_index]->name, +			strerror (op_errno)); +		goto out; +	} + +	gf_log (this->name, GF_LOG_DEBUG, +		"setting ownership of %s on %s to %d/%d", +		impunge_local->loc.path, +		priv->children[child_index]->name, +		impunge_local->cont.lookup.buf.st_uid, +		impunge_local->cont.lookup.buf.st_gid); + +	inode->st_mode = stbuf->st_mode; + +	STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_chown_cbk, +			   (void *) (long) child_index, +			   priv->children[child_index], +			   priv->children[child_index]->fops->chown, +			   &impunge_local->loc, +			   impunge_local->cont.lookup.buf.st_uid, +			   impunge_local->cont.lookup.buf.st_gid); +	return 0; + +out: +	LOCK (&impunge_frame->lock); +	{ +		call_count = --impunge_local->call_count; +	} +	UNLOCK (&impunge_frame->lock); + +	if (call_count == 0) { +		AFR_STACK_DESTROY (impunge_frame); +		afr_sh_entry_impunge_entry_done (frame, this, active_src); +	} + +	return 0; +} + + +int +afr_sh_entry_impunge_mknod (call_frame_t *impunge_frame, xlator_t *this, +			    int child_index, struct stat *stbuf) +{ +	afr_private_t   *priv = NULL; +	afr_local_t     *impunge_local = NULL; +	afr_self_heal_t *impunge_sh = NULL; + + +	priv = this->private; +	impunge_local = impunge_frame->local; +	impunge_sh = &impunge_local->self_heal; + +	gf_log (this->name, GF_LOG_WARNING, +		"creating file %s mode=0%o dev=0x%"GF_PRI_DEV" on %s", +		impunge_local->loc.path, +		stbuf->st_mode, stbuf->st_rdev, +		priv->children[child_index]->name); + +	STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk, +			   (void *) (long) child_index, +			   
priv->children[child_index], +			   priv->children[child_index]->fops->mknod, +			   &impunge_local->loc, +			   stbuf->st_mode, stbuf->st_rdev); + +	return 0; +} + + + +int +afr_sh_entry_impunge_mkdir (call_frame_t *impunge_frame, xlator_t *this, +			    int child_index, struct stat *stbuf) +{ +	afr_private_t   *priv = NULL; +	afr_local_t     *impunge_local = NULL; +	afr_self_heal_t *impunge_sh = NULL; + + +	priv = this->private; +	impunge_local = impunge_frame->local; +	impunge_sh = &impunge_local->self_heal; + +	gf_log (this->name, GF_LOG_WARNING, +		"creating directory %s mode=0%o on %s", +		impunge_local->loc.path, +		stbuf->st_mode, +		priv->children[child_index]->name); + +	STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk, +			   (void *) (long) child_index, +			   priv->children[child_index], +			   priv->children[child_index]->fops->mkdir, +			   &impunge_local->loc, stbuf->st_mode); + +	return 0; +} + + +int +afr_sh_entry_impunge_symlink (call_frame_t *impunge_frame, xlator_t *this, +			      int child_index, const char *linkname) +{ +	afr_private_t   *priv = NULL; +	afr_local_t     *impunge_local = NULL; +	afr_self_heal_t *impunge_sh = NULL; + + +	priv = this->private; +	impunge_local = impunge_frame->local; +	impunge_sh = &impunge_local->self_heal; + +	gf_log (this->name, GF_LOG_WARNING, +		"creating symlink %s -> %s on %s", +		impunge_local->loc.path, linkname, +		priv->children[child_index]->name); + +	STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk, +			   (void *) (long) child_index, +			   priv->children[child_index], +			   priv->children[child_index]->fops->symlink, +			   linkname, &impunge_local->loc); + +	return 0; +} + + +int +afr_sh_entry_impunge_readlink_cbk (call_frame_t *impunge_frame, void *cookie, +				   xlator_t *this, +				   int32_t op_ret, int32_t op_errno, +				   const char *linkname) +{ +	afr_private_t   *priv = NULL; +	afr_local_t     *impunge_local = NULL; +	afr_self_heal_t *impunge_sh = 
NULL; +	int              child_index = -1; +	call_frame_t    *frame = NULL; +	int              call_count = -1; +	int              active_src = -1; + +	priv = this->private; +	impunge_local = impunge_frame->local; +	impunge_sh = &impunge_local->self_heal; +	frame = impunge_sh->sh_frame; +	active_src = impunge_sh->active_source; + +	child_index = (long) cookie; + +	if (op_ret == -1) { +		gf_log (this->name, GF_LOG_ERROR, +			"readlink of %s on %s failed (%s)", +			impunge_local->loc.path, +			priv->children[active_src]->name, +			strerror (op_errno)); +		goto out; +	} + +	afr_sh_entry_impunge_symlink (impunge_frame, this, child_index, +				      linkname); +	return 0; + +out: +	LOCK (&impunge_frame->lock); +	{ +		call_count = --impunge_local->call_count; +	} +	UNLOCK (&impunge_frame->lock); + +	if (call_count == 0) { +		AFR_STACK_DESTROY (impunge_frame); +		afr_sh_entry_impunge_entry_done (frame, this, active_src); +	} + +	return 0; +} + + +int +afr_sh_entry_impunge_readlink (call_frame_t *impunge_frame, xlator_t *this, +			       int child_index, struct stat *stbuf) +{ +	afr_private_t   *priv = NULL; +	afr_local_t     *impunge_local = NULL; +	afr_self_heal_t *impunge_sh = NULL; +	int              active_src = -1; + +	priv = this->private; +	impunge_local = impunge_frame->local; +	impunge_sh = &impunge_local->self_heal; +	active_src = impunge_sh->active_source; + +	STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_readlink_cbk, +			   (void *) (long) child_index, +			   priv->children[active_src], +			   priv->children[active_src]->fops->readlink, +			   &impunge_local->loc, 4096); + +	return 0; +} + + +int +afr_sh_entry_impunge_recreate_lookup_cbk (call_frame_t *impunge_frame, +					  void *cookie, xlator_t *this, +					  int32_t op_ret, int32_t op_errno, +					  inode_t *inode, struct stat *buf, +					  dict_t *xattr) +{ +	afr_private_t   *priv = NULL; +	afr_local_t     *impunge_local = NULL; +	afr_self_heal_t *impunge_sh = NULL; +	int              active_src 
= 0; +	int              type = 0; +	int              child_index = 0; +	call_frame_t    *frame = NULL; +	int              call_count = 0; + +	priv = this->private; +	impunge_local = impunge_frame->local; +	impunge_sh = &impunge_local->self_heal; +	frame = impunge_sh->sh_frame; + +	child_index = (long) cookie; + +	active_src = impunge_sh->active_source; + +	if (op_ret != 0) { +		gf_log (this->name, GF_LOG_ERROR, +			"looking up %s on %s (for %s) failed (%s)", +			impunge_local->loc.path, +			priv->children[active_src]->name, +			priv->children[child_index]->name, +			strerror (op_errno)); +		goto out; +	} + +	impunge_local->cont.lookup.buf = *buf; +	type = (buf->st_mode & S_IFMT); + +	switch (type) { +	case S_IFSOCK: +	case S_IFREG: +	case S_IFBLK: +	case S_IFCHR: +	case S_IFIFO: +		afr_sh_entry_impunge_mknod (impunge_frame, this, +					    child_index, buf); +		break; +	case S_IFLNK: +		afr_sh_entry_impunge_readlink (impunge_frame, this, +					       child_index, buf); +		break; +	case S_IFDIR: +		afr_sh_entry_impunge_mkdir (impunge_frame, this, +					    child_index, buf); +		break; +	default: +		gf_log (this->name, GF_LOG_ERROR, +			"%s has unknown file type on %s: 0%o", +			impunge_local->loc.path, +			priv->children[active_src]->name, type); +		goto out; +		break; +	} + +	return 0; + +out: +	LOCK (&impunge_frame->lock); +	{ +		call_count = --impunge_local->call_count; +	} +	UNLOCK (&impunge_frame->lock); + +	if (call_count == 0) { +		AFR_STACK_DESTROY (impunge_frame); +		afr_sh_entry_impunge_entry_done (frame, this, active_src); +	} + +	return 0; +} + + +int +afr_sh_entry_impunge_recreate (call_frame_t *impunge_frame, xlator_t *this, +			       int child_index) +{ +	afr_private_t   *priv = NULL; +	afr_local_t     *impunge_local = NULL; +	afr_self_heal_t *impunge_sh = NULL; +	int              active_src = 0; + + +	priv = this->private; +	impunge_local = impunge_frame->local; +	impunge_sh = &impunge_local->self_heal; + +	active_src = impunge_sh->active_source; + 
+	STACK_WIND_COOKIE (impunge_frame, +			   afr_sh_entry_impunge_recreate_lookup_cbk, +			   (void *) (long) child_index, +			   priv->children[active_src], +			   priv->children[active_src]->fops->lookup, +			   &impunge_local->loc, 0); + +	return 0; +} + + +int +afr_sh_entry_impunge_entry_cbk (call_frame_t *impunge_frame, void *cookie, +				xlator_t *this, +				int32_t op_ret,	int32_t op_errno, +				inode_t *inode, struct stat *buf, dict_t *x) +{ +	afr_private_t   *priv = NULL; +	afr_local_t     *impunge_local = NULL; +	afr_self_heal_t *impunge_sh = NULL; +	int              call_count = 0; +	int              child_index = 0; +	call_frame_t    *frame = NULL; +	int              active_src = 0; + +	priv = this->private; +	impunge_local = impunge_frame->local; +	impunge_sh = &impunge_local->self_heal; +	frame = impunge_sh->sh_frame; +	child_index = (long) cookie; +	active_src = impunge_sh->active_source; + +	if (op_ret == -1 && op_errno == ENOENT) { +		/* decrease call_count in recreate-callback */ +		gf_log (this->name, GF_LOG_DEBUG, +			"missing entry %s on %s", +			impunge_local->loc.path, +			priv->children[child_index]->name); + +		afr_sh_entry_impunge_recreate (impunge_frame, this, +					       child_index); +		return 0; +	} + +	if (op_ret == 0) { +		gf_log (this->name, GF_LOG_DEBUG, +			"%s exists under %s", +			impunge_local->loc.path, +			priv->children[child_index]->name); +	} else { +		gf_log (this->name, GF_LOG_ERROR, +			"looking up %s under %s failed (%s)", +			impunge_local->loc.path, +			priv->children[child_index]->name, +			strerror (op_errno)); +	} + +	LOCK (&impunge_frame->lock); +	{ +		call_count = --impunge_local->call_count; +	} +	UNLOCK (&impunge_frame->lock); + +	if (call_count == 0) { +		AFR_STACK_DESTROY (impunge_frame); +		afr_sh_entry_impunge_entry_done (frame, this, active_src); +	} + +	return 0; +} + + +int +afr_sh_entry_impunge_entry (call_frame_t *frame, xlator_t *this, +			    char *name) +{ +	afr_private_t   *priv = NULL; +	
afr_local_t     *local  = NULL; +	afr_self_heal_t *sh  = NULL; +	int              ret = -1; +	call_frame_t    *impunge_frame = NULL; +	afr_local_t     *impunge_local = NULL; +	afr_self_heal_t *impunge_sh = NULL; +	int              active_src = 0; +	int              i = 0; +	int              call_count = 0; +	int              op_errno = 0; + +	priv = this->private; +	local = frame->local; +	sh = &local->self_heal; + +	active_src = sh->active_source; + +	if ((strcmp (name, ".") == 0) +	    || (strcmp (name, "..") == 0)) { +		gf_log (this->name, GF_LOG_DEBUG, +			"skipping inspection of %s under %s", +			name, local->loc.path); +		goto out; +	} + +	gf_log (this->name, GF_LOG_DEBUG, +		"inspecting existance of %s under %s", +		name, local->loc.path); + +	impunge_frame = copy_frame (frame); +	if (!impunge_frame) { +		gf_log (this->name, GF_LOG_ERROR, +			"out of memory :("); +		goto out; +	} + +	ALLOC_OR_GOTO (impunge_local, afr_local_t, out); + +	impunge_frame->local = impunge_local; +	impunge_sh = &impunge_local->self_heal; +	impunge_sh->sh_frame = frame; +	impunge_sh->active_source = active_src; + +	ret = build_child_loc (this, &impunge_local->loc, &local->loc, name); +	if (ret != 0) { +		goto out; +	} + +	for (i = 0; i < priv->child_count; i++) { +		if (i == active_src) +			continue; +		if (local->child_up[i] == 0) +			continue; +		if (sh->sources[i] == 1) +			continue; +		call_count++; +	} + +	impunge_local->call_count = call_count; + +	for (i = 0; i < priv->child_count; i++) { +		if (i == active_src) +			continue; +		if (local->child_up[i] == 0) +			continue; +		if (sh->sources[i] == 1) +			continue; + +		gf_log (this->name, GF_LOG_DEBUG, +			"looking up %s on %s", impunge_local->loc.path, +			priv->children[i]->name); + +		STACK_WIND_COOKIE (impunge_frame, +				   afr_sh_entry_impunge_entry_cbk, +				   (void *) (long) i, +				   priv->children[i], +				   priv->children[i]->fops->lookup, +				   &impunge_local->loc, 0); + +		if (!--call_count) +			break; +	} + +	
ret = 0; +out: +	if (ret == -1) +		afr_sh_entry_impunge_entry_done (frame, this, active_src); +	 +	return 0; +} + + +int +afr_sh_entry_impunge_readdir_cbk (call_frame_t *frame, void *cookie, +				  xlator_t *this, +				  int32_t op_ret, int32_t op_errno, +				  gf_dirent_t *entries) +{ +	afr_private_t   *priv = NULL; +	afr_local_t     *local  = NULL; +	afr_self_heal_t *sh  = NULL; +	gf_dirent_t     *entry = NULL; +	off_t            last_offset = 0; +	int              active_src = 0; +	int              entry_count = 0; + +	priv = this->private; +	local = frame->local; +	sh = &local->self_heal; + +	active_src = sh->active_source; + +	if (op_ret <= 0) { +		if (op_ret < 0) { +			gf_log (this->name, GF_LOG_ERROR, +				"readdir of %s on subvolume %s failed (%s)", +				local->loc.path, +				priv->children[active_src]->name, +				strerror (op_errno)); +		} else { +			gf_log (this->name, GF_LOG_DEBUG, +				"readdir of %s on subvolume %s complete", +				local->loc.path, +				priv->children[active_src]->name); +		} + +		afr_sh_entry_impunge_all (frame, this); +		return 0; +	} + +	list_for_each_entry (entry, &entries->list, list) { +		last_offset = entry->d_off; +		entry_count++; +	} + +	gf_log (this->name, GF_LOG_DEBUG, +		"readdir'ed %d entries from %s", +		entry_count, priv->children[active_src]->name); + +	sh->offset = last_offset; +	local->call_count = entry_count; + +	list_for_each_entry (entry, &entries->list, list) { +		afr_sh_entry_impunge_entry (frame, this, entry->d_name); +	} + +	return 0; +} +				   + +int +afr_sh_entry_impunge_subvol (call_frame_t *frame, xlator_t *this, +			     int active_src) +{ +	afr_private_t   *priv = NULL; +	afr_local_t     *local  = NULL; +	afr_self_heal_t *sh  = NULL; + +	priv = this->private; +	local = frame->local; +	sh = &local->self_heal; + +	STACK_WIND (frame, afr_sh_entry_impunge_readdir_cbk, +		    priv->children[active_src], +		    priv->children[active_src]->fops->readdir, +		    sh->healing_fd, sh->block_size, sh->offset); + +	
return 0; +} + + +int +afr_sh_entry_impunge_all (call_frame_t *frame, xlator_t *this) +{ +	afr_private_t   *priv = NULL; +	afr_local_t     *local  = NULL; +	afr_self_heal_t *sh  = NULL; +	int              active_src = -1; + +	priv = this->private; +	local = frame->local; +	sh = &local->self_heal; + +	sh->offset = 0; + +	active_src = next_active_source (frame, this, sh->active_source); +	sh->active_source = active_src; + +	if (sh->op_failed) { +		afr_sh_entry_finish (frame, this); +		return 0; +	} + +	if (active_src == -1) { +		/* completed creating missing files on all subvolumes */ +		afr_sh_entry_expunge_all (frame, this); +		return 0; +	} + +	gf_log (this->name, GF_LOG_DEBUG, +		"impunging entries of %s on %s to other sinks", +		local->loc.path, priv->children[active_src]->name); + +	afr_sh_entry_impunge_subvol (frame, this, active_src); + +	return 0; +} + + +int +afr_sh_entry_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +			  int32_t op_ret, int32_t op_errno, fd_t *fd) +{ +	afr_local_t     *local = NULL; +	afr_self_heal_t *sh = NULL; +	afr_private_t   *priv = NULL; +	int              call_count = 0; +	int              child_index = 0; + +	local = frame->local; +	sh = &local->self_heal; +	priv = this->private; + +	child_index = (long) cookie; + +	/* TODO: some of the open's might fail. 
+	   In that case, modify cleanup fn to send flush on those  +	   fd's which are already open */ + +	LOCK (&frame->lock); +	{ +		if (op_ret == -1) { +			gf_log (this->name, GF_LOG_ERROR, +				"opendir of %s failed on child %s (%s)", +				local->loc.path, +				priv->children[child_index]->name, +				strerror (op_errno)); +			sh->op_failed = 1; +		} +	} +	UNLOCK (&frame->lock); + +	call_count = afr_frame_return (frame); + +	if (call_count == 0) { +		if (sh->op_failed) { +			afr_sh_entry_finish (frame, this); +			return 0; +		} +		gf_log (this->name, GF_LOG_DEBUG, +			"fd for %s opened, commencing sync", +			local->loc.path); + +		sh->active_source = -1; +		afr_sh_entry_impunge_all (frame, this); +	} + +	return 0; +} + + +int +afr_sh_entry_open (call_frame_t *frame, xlator_t *this) +{ +	int i = 0;				 +	int call_count = 0;		      + +	int source = -1; +	int *sources = NULL; + +	fd_t *fd = NULL; + +	afr_local_t *   local = NULL; +	afr_private_t * priv  = NULL; +	afr_self_heal_t *sh = NULL; + +	local = frame->local; +	sh = &local->self_heal; +	priv = this->private; + +	source  = local->self_heal.source; +	sources = local->self_heal.sources; + +	sh->block_size = 131072; +	sh->offset = 0; + +	call_count = sh->active_sinks; +	if (source != -1) +		call_count++; + +	local->call_count = call_count; + +	fd = fd_create (local->loc.inode, frame->root->pid); +	sh->healing_fd = fd; + +	if (source != -1) { +		gf_log (this->name, GF_LOG_DEBUG, +			"opening directory %s on subvolume %s (source)", +			local->loc.path, priv->children[source]->name); + +		/* open source */ +		STACK_WIND_COOKIE (frame, afr_sh_entry_opendir_cbk, +				   (void *) (long) source, +				   priv->children[source], +				   priv->children[source]->fops->opendir, +				   &local->loc, fd); +		call_count--; +	} + +	/* open sinks */ +	for (i = 0; i < priv->child_count; i++) { +		if (sources[i] || !local->child_up[i]) +			continue; + +		gf_log (this->name, GF_LOG_DEBUG, +			"opening directory %s on subvolume %s 
(sink)", +			local->loc.path, priv->children[i]->name); + +		STACK_WIND_COOKIE (frame, afr_sh_entry_opendir_cbk, +				   (void *) (long) i, +				   priv->children[i],  +				   priv->children[i]->fops->opendir, +				   &local->loc, fd); + +		if (!--call_count) +			break; +	} + +	return 0; +} + + +int +afr_sh_entry_sync_prepare (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t     *local = NULL; +	afr_self_heal_t *sh = NULL; +	afr_private_t   *priv = NULL; +	int              active_sinks = 0; +	int              source = 0; +	int              i = 0; + +	local = frame->local; +	sh = &local->self_heal; +	priv = this->private; + +	source = sh->source; + +	for (i = 0; i < priv->child_count; i++) { +		if (sh->sources[i] == 0 && local->child_up[i] == 1) { +			active_sinks++; +			sh->success[i] = 1; +		} +	} +	if (source != -1) +		sh->success[source] = 1; + +	if (active_sinks == 0) { +		gf_log (this->name, GF_LOG_DEBUG, +			"no active sinks for self-heal on dir %s", +			local->loc.path); +		afr_sh_entry_finish (frame, this); +		return 0; +	} +	if (source == -1 && active_sinks < 2) { +		gf_log (this->name, GF_LOG_WARNING, +			"cannot sync with 0 sources and 1 sink on dir %s", +			local->loc.path); +		afr_sh_entry_finish (frame, this); +		return 0; +	} +	sh->active_sinks = active_sinks; + +	if (source != -1) +		gf_log (this->name, GF_LOG_DEBUG, +			"syncing %s from subvolume %s to %d active sinks", +			local->loc.path, priv->children[source]->name, +			active_sinks); +	else +		gf_log (this->name, GF_LOG_DEBUG, +			"no active sources for %s found. 
" +			"merging all entries as a conservative decision", +			local->loc.path); + +	afr_sh_entry_open (frame, this); + +	return 0; +} + + +int +afr_sh_entry_fix (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t     *local = NULL; +	afr_self_heal_t *sh = NULL; +	afr_private_t   *priv = NULL; +	int              source = 0; + +	local = frame->local; +	sh = &local->self_heal; +	priv = this->private; + +	afr_sh_build_pending_matrix (sh->pending_matrix, sh->xattr,  +				     priv->child_count, AFR_ENTRY_PENDING); + +	afr_sh_print_pending_matrix (sh->pending_matrix, this); + + +	afr_sh_mark_sources (sh->pending_matrix, sh->sources,  +			     priv->child_count); + +	afr_sh_supress_errenous_children (sh->sources, sh->child_errno, +					  priv->child_count); + +	source = afr_sh_select_source (sh->sources, priv->child_count); +	sh->source = source; + +	afr_sh_entry_sync_prepare (frame, this); + +	return 0; +} + + + +int +afr_sh_entry_lookup_cbk (call_frame_t *frame, void *cookie, +			 xlator_t *this, int32_t op_ret, int32_t op_errno, +			 inode_t *inode, struct stat *buf, dict_t *xattr) +{ +	afr_private_t   *priv  = NULL; +	afr_local_t     *local = NULL; +	afr_self_heal_t *sh = NULL; + +	int call_count  = -1; +	int child_index = (long) cookie; + +	local = frame->local; +	sh = &local->self_heal; +	priv = this->private; + +	LOCK (&frame->lock); +	{ +		if (op_ret != -1) { +			sh->xattr[child_index] = dict_ref (xattr); +			sh->buf[child_index] = *buf; +		} +	} +	UNLOCK (&frame->lock); + +	call_count = afr_frame_return (frame); + +	if (call_count == 0) { +		afr_sh_entry_fix (frame, this); +	} + +	return 0; +} + + + +int +afr_sh_entry_lookup (call_frame_t *frame, xlator_t *this) +{ +	afr_self_heal_t * sh    = NULL;  +	afr_local_t    *  local = NULL; +	afr_private_t  *  priv  = NULL; +	dict_t         *xattr_req = NULL; +	int ret = 0; +	int call_count = 0; +	int i = 0; + +	priv  = this->private; +	local = frame->local; +	sh    = &local->self_heal; + +	call_count = 
local->child_count; + +	local->call_count = call_count; +	 +	xattr_req = dict_new(); +	if (xattr_req) +		ret = dict_set_uint64 (xattr_req, AFR_ENTRY_PENDING, +				       priv->child_count * sizeof(int32_t)); + +	for (i = 0; i < priv->child_count; i++) { +		if (local->child_up[i]) { +			STACK_WIND_COOKIE (frame, +					   afr_sh_entry_lookup_cbk, +					   (void *) (long) i, +					   priv->children[i],  +					   priv->children[i]->fops->lookup, +					   &local->loc, xattr_req); +			if (!--call_count) +				break; +		} +	} +	 +	if (xattr_req) +		dict_unref (xattr_req); + +	return 0; +} + + + +int +afr_sh_entry_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +		       int32_t op_ret, int32_t op_errno) +{ +	afr_local_t     *local = NULL; +	afr_self_heal_t *sh = NULL; +	int              call_count = 0; +	int              child_index = (long) cookie; + +	/* TODO: what if lock fails? */ +	 +	local = frame->local; +	sh    = &local->self_heal; + +	LOCK (&frame->lock); +	{ +		if (op_ret == -1) { +			sh->op_failed = 1; + +			gf_log (this->name, +				(op_errno == EAGAIN ? 
GF_LOG_DEBUG : GF_LOG_ERROR), +				"locking inode of %s on child %d failed: %s", +				local->loc.path, child_index, +				strerror (op_errno)); +		} else { +			gf_log (this->name, GF_LOG_DEBUG, +				"inode of %s on child %d locked", +				local->loc.path, child_index); +		} +	} +	UNLOCK (&frame->lock); + +	call_count = afr_frame_return (frame); + +	if (call_count == 0) { +		if (sh->op_failed == 1) { +			afr_sh_entry_finish (frame, this); +			return 0; +		} + +		afr_sh_entry_lookup (frame, this); +	} + +	return 0; +} + + +int +afr_sh_entry_lock (call_frame_t *frame, xlator_t *this) +{ +	int i = 0;				 +	int call_count = 0;		      + +	afr_local_t *   local = NULL; +	afr_private_t * priv  = NULL; +	afr_self_heal_t * sh  = NULL; + + +	local = frame->local; +	sh = &local->self_heal; +	priv = this->private; + +	call_count = local->child_count; + +	local->call_count = call_count;		 + +	for (i = 0; i < priv->child_count; i++) { +		if (local->child_up[i]) { +			gf_log (this->name, GF_LOG_DEBUG, +				"locking %s on subvolume %s", +				local->loc.path, priv->children[i]->name); + +			STACK_WIND_COOKIE (frame, afr_sh_entry_lock_cbk, +					   (void *) (long) i, +					   priv->children[i],  +					   priv->children[i]->fops->entrylk, +					   &local->loc, NULL, +					   ENTRYLK_LOCK_NB, ENTRYLK_WRLCK); +			if (!--call_count) +				break; +		} +	} + +	return 0; +} + + +int +afr_self_heal_entry (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t   *local = NULL; +	afr_self_heal_t *sh = NULL; +	afr_private_t   *priv = NULL; + + +	priv = this->private; +	local = frame->local; +	sh = &local->self_heal; + +	if (local->need_entry_self_heal && priv->entry_self_heal) { +		afr_sh_entry_lock (frame, this); +	} else { +		gf_log (this->name, GF_LOG_DEBUG, +			"proceeding to completion on %s", +			local->loc.path); +		afr_sh_entry_done (frame, this); +	} + +	return 0; +} + diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c new file 
mode 100644 index 00000000000..e65a426db6c --- /dev/null +++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c @@ -0,0 +1,791 @@ +/* +   Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> +   This file is part of GlusterFS. + +   GlusterFS is free software; you can redistribute it and/or modify +   it under the terms of the GNU General Public License as published +   by the Free Software Foundation; either version 3 of the License, +   or (at your option) any later version. + +   GlusterFS is distributed in the hope that it will be useful, but +   WITHOUT ANY WARRANTY; without even the implied warranty of +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU +   General Public License for more details. + +   You should have received a copy of the GNU General Public License +   along with this program.  If not, see +   <http://www.gnu.org/licenses/>. +*/ + +#include <libgen.h> +#include <unistd.h> +#include <fnmatch.h> +#include <sys/time.h> +#include <stdlib.h> +#include <signal.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "afr.h" +#include "dict.h" +#include "xlator.h" +#include "hashfn.h" +#include "logging.h" +#include "stack.h" +#include "list.h" +#include "call-stub.h" +#include "defaults.h" +#include "common-utils.h" +#include "compat-errno.h" +#include "compat.h" +#include "byte-order.h" + +#include "afr-transaction.h" +#include "afr-self-heal.h" +#include "afr-self-heal-common.h" + + +int +afr_sh_metadata_done (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t     *local = NULL; +	afr_self_heal_t *sh = NULL; +	afr_private_t   *priv = NULL; +	int              i = 0; + +	local = frame->local; +	sh = &local->self_heal; +	priv = this->private; + +//	memset (sh->child_errno, 0, sizeof (int) * priv->child_count); +	memset (sh->buf, 0, sizeof (struct stat) * priv->child_count); +	memset (sh->success, 0, sizeof (int) * priv->child_count); +	 +	for (i = 0; i < 
priv->child_count; i++) { +		if (sh->xattr[i]) +			dict_unref (sh->xattr[i]); +		sh->xattr[i] = NULL; +	} + +	if (local->govinda_gOvinda) { +		gf_log (this->name, GF_LOG_WARNING, +			"aborting selfheal of %s", +			local->loc.path); +		sh->completion_cbk (frame, this); +	} else { +		if (S_ISREG (local->cont.lookup.buf.st_mode)) { +			gf_log (this->name, GF_LOG_DEBUG, +				"proceeding to data check on %s", +				local->loc.path); +			afr_self_heal_data (frame, this); +			return 0; +		} + +		if (S_ISDIR (local->cont.lookup.buf.st_mode)) { +			gf_log (this->name, GF_LOG_DEBUG, +				"proceeding to entry check on %s", +				local->loc.path); +			afr_self_heal_entry (frame, this); +			return 0; +		} +		gf_log (this->name, GF_LOG_DEBUG, +			"completed self heal of %s", +			local->loc.path); + +		sh->completion_cbk (frame, this); +	} + +	return 0; +} + + +int +afr_sh_metadata_unlck_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +			   int32_t op_ret, int32_t op_errno) +{ +	afr_local_t      *local = NULL; +	int               call_count = 0; + + +	local = frame->local; + +	LOCK (&frame->lock); +	{ +	} +	UNLOCK (&frame->lock); + +	call_count = afr_frame_return (frame); + +	if (call_count == 0) +		afr_sh_metadata_done (frame, this); + +	return 0; +} + + +int +afr_sh_metadata_finish (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t     *local = NULL; +	afr_self_heal_t *sh = NULL; +	afr_private_t   *priv = NULL; +	int              i = 0; +	int              call_count = 0; +	struct flock     flock = {0, }; + + +	local = frame->local; +	sh = &local->self_heal; +	priv = this->private; + +	call_count = local->child_count; +	local->call_count = call_count; + +	for (i = 0; i < priv->child_count; i++) { +		flock.l_start   = 0; +		flock.l_len     = 0; +		flock.l_type    = F_UNLCK; + +		if (local->child_up[i]) { +			gf_log (this->name, GF_LOG_DEBUG, +				"unlocking %s on subvolume %s", +				local->loc.path, priv->children[i]->name); + +			STACK_WIND (frame, 
afr_sh_metadata_unlck_cbk, +				    priv->children[i], +				    priv->children[i]->fops->inodelk, +				    &local->loc, F_SETLK, &flock); + +			if (!--call_count) +				break; +		} +	} + +	return 0; +} + + +int +afr_sh_metadata_erase_pending_cbk (call_frame_t *frame, void *cookie, +				   xlator_t *this, int32_t op_ret, +				   int32_t op_errno, dict_t *xattr) +{ +	afr_local_t     *local = NULL; +	afr_self_heal_t *sh = NULL; +	afr_private_t   *priv = NULL; +	int             call_count = 0; + +	local = frame->local; +	sh = &local->self_heal; +	priv = this->private; + +	LOCK (&frame->lock); +	{ +	} +	UNLOCK (&frame->lock); + +	call_count = afr_frame_return (frame); + +	if (call_count == 0) +		afr_sh_metadata_finish (frame, this); + +	return 0; +} + + +int +afr_sh_metadata_erase_pending (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t     *local = NULL; +	afr_self_heal_t *sh = NULL; +	afr_private_t   *priv = NULL; +	int              call_count = 0; +	int              i = 0; +	dict_t          **erase_xattr = NULL; + + +	local = frame->local; +	sh = &local->self_heal; +	priv = this->private; + + +	afr_sh_pending_to_delta (sh->pending_matrix, sh->delta_matrix, +				 sh->success, priv->child_count); + +	erase_xattr = CALLOC (sizeof (*erase_xattr), priv->child_count); + +	for (i = 0; i < priv->child_count; i++) { +		if (sh->xattr[i]) { +			call_count++; + +			erase_xattr[i] = get_new_dict(); +			dict_ref (erase_xattr[i]); +		} +	} + +	afr_sh_delta_to_xattr (sh->delta_matrix, erase_xattr, +			       priv->child_count, AFR_METADATA_PENDING); + +	local->call_count = call_count; + +	if (call_count == 0) { +		gf_log (this->name, GF_LOG_WARNING, +			"metadata of %s not healed on any subvolume", +			local->loc.path); + +		afr_sh_metadata_finish (frame, this); +	} + +	for (i = 0; i < priv->child_count; i++) { +		if (!erase_xattr[i]) +			continue; + +		gf_log (this->name, GF_LOG_DEBUG, +			"erasing pending flags from %s on %s", +			local->loc.path, priv->children[i]->name); + 
+		STACK_WIND_COOKIE (frame, afr_sh_metadata_erase_pending_cbk, +				   (void *) (long) i, +				   priv->children[i], +				   priv->children[i]->fops->xattrop, +				   &local->loc, +				   GF_XATTROP_ADD_ARRAY, erase_xattr[i]); +		if (!--call_count) +			break; +	} + +	for (i = 0; i < priv->child_count; i++) { +		if (erase_xattr[i]) { +			dict_unref (erase_xattr[i]); +		} +	} +	FREE (erase_xattr); + +	return 0; +} + + +int +afr_sh_metadata_sync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +			  int32_t op_ret, int32_t op_errno) +{ +	afr_local_t     *local = NULL; +	afr_self_heal_t *sh = NULL; +	afr_private_t   *priv = NULL; +	int              call_count = 0; +	int              child_index = 0; + + +	local = frame->local; +	sh = &local->self_heal; +	priv = this->private; + +	child_index = (long) cookie; + +	LOCK (&frame->lock); +	{ +		if (op_ret == -1) { +			gf_log (this->name, GF_LOG_ERROR, +				"setting attributes failed for %s on %s (%s)", +				local->loc.path, +				priv->children[child_index]->name, +				strerror (op_errno)); + +			sh->success[child_index] = 0; +		} +	} +	UNLOCK (&frame->lock); + +	call_count = afr_frame_return (frame); + +	if (call_count == 0) +		afr_sh_metadata_erase_pending (frame, this); + +	return 0; +} + + +int +afr_sh_metadata_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +			  int32_t op_ret, int32_t op_errno, struct stat *buf) +{ +	afr_sh_metadata_sync_cbk (frame, cookie, this, op_ret, op_errno); + +	return 0; +} + + +int +afr_sh_metadata_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +			   int32_t op_ret, int32_t op_errno) +{ +	afr_sh_metadata_sync_cbk (frame, cookie, this, op_ret, op_errno); + +	return 0; +} + + +int +afr_sh_metadata_sync (call_frame_t *frame, xlator_t *this, dict_t *xattr) +{ +	afr_local_t     *local = NULL; +	afr_self_heal_t *sh = NULL; +	afr_private_t   *priv = NULL; +	int              source = 0; +	int              active_sinks = 0; +	int              call_count = 0; +	int  
            i = 0; +	struct timespec  ts[2]; + + +	local = frame->local; +	sh = &local->self_heal; +	priv = this->private; + +	source = sh->source; +	active_sinks = sh->active_sinks; + +	/* +	 * 4 calls per sink - chown, chmod, utimes, setxattr +	 */ +	if (xattr) +		call_count = active_sinks * 4; +	else +		call_count = active_sinks * 3; + +	local->call_count = call_count; + +#ifdef HAVE_STRUCT_STAT_ST_ATIM_TV_NSEC +	ts[0] = sh->buf[source].st_atim; +	ts[1] = sh->buf[source].st_mtim; +#elif HAVE_STRUCT_STAT_ST_ATIMESPEC_TV_NSEC +	ts[0] = sh->buf[source].st_atimespec; +	ts[1] = sh->buf[source].st_mtimespec; +#else +	ts[0].tv_sec = sh->buf[source].st_atime; +	ts[1].tv_sec = sh->buf[source].st_mtime; +#endif + +	for (i = 0; i < priv->child_count; i++) { +		if (call_count == 0) { +			break; +		} +		if (sh->sources[i] || !local->child_up[i]) +			continue; + +		gf_log (this->name, GF_LOG_DEBUG, +			"syncing metadata of %s from %s to %s", +			local->loc.path, priv->children[source]->name, +			priv->children[i]->name); + +		STACK_WIND_COOKIE (frame, afr_sh_metadata_attr_cbk, +				   (void *) (long) i, +				   priv->children[i], +				   priv->children[i]->fops->chown, +				   &local->loc, +				   sh->buf[source].st_uid, +				   sh->buf[source].st_gid); + +		STACK_WIND_COOKIE (frame, afr_sh_metadata_attr_cbk, +				   (void *) (long) i, +				   priv->children[i], +				   priv->children[i]->fops->chmod, +				   &local->loc, sh->buf[source].st_mode); + +		STACK_WIND_COOKIE (frame, afr_sh_metadata_attr_cbk, +				   (void *) (long) i, +				   priv->children[i], +				   priv->children[i]->fops->utimens, +				   &local->loc, ts); + +		call_count = call_count - 3; + +		if (!xattr) +			continue; + +		STACK_WIND_COOKIE (frame, afr_sh_metadata_xattr_cbk, +				   (void *) (long) i, +				   priv->children[i], +				   priv->children[i]->fops->setxattr, +				   &local->loc, xattr, 0); +		call_count--; +	} + +	return 0; +} + + +int +afr_sh_metadata_getxattr_cbk (call_frame_t *frame, void 
*cookie, +			      xlator_t *this, +			      int32_t op_ret, int32_t op_errno, dict_t *xattr) +{ +	afr_local_t     *local = NULL; +	afr_self_heal_t *sh = NULL; +	afr_private_t   *priv = NULL; +	int              source = 0; + +	local = frame->local; +	sh = &local->self_heal; +	priv = this->private; + +	source = sh->source; + +	if (op_ret == -1) { +		gf_log (this->name, GF_LOG_ERROR, +			"getxattr of %s failed on subvolume %s (%s). proceeding without xattr", +			local->loc.path, priv->children[source]->name, +			strerror (op_errno)); + +		afr_sh_metadata_sync (frame, this, NULL); +	} else { +		dict_del (xattr, AFR_DATA_PENDING); +		dict_del (xattr, AFR_METADATA_PENDING); +		dict_del (xattr, AFR_ENTRY_PENDING); +		afr_sh_metadata_sync (frame, this, xattr); +	} + +	return 0; +} + + +int +afr_sh_metadata_sync_prepare (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t     *local = NULL; +	afr_self_heal_t *sh = NULL; +	afr_private_t   *priv = NULL; +	int              active_sinks = 0; +	int              source = 0; +	int              i = 0; + +	local = frame->local; +	sh = &local->self_heal; +	priv = this->private; + +	source = sh->source; + +	for (i = 0; i < priv->child_count; i++) { +		if (sh->sources[i] == 0 && local->child_up[i] == 1) { +			active_sinks++; +			sh->success[i] = 1; +		} +	} +	sh->success[source] = 1; + +	if (active_sinks == 0) { +		gf_log (this->name, GF_LOG_DEBUG, +			"no active sinks for performing self-heal on file %s", +			local->loc.path); +		afr_sh_metadata_finish (frame, this); +		return 0; +	} +	sh->active_sinks = active_sinks; + +	gf_log (this->name, GF_LOG_DEBUG, +		"syncing metadata of %s from subvolume %s to %d active sinks", +		local->loc.path, priv->children[source]->name, active_sinks); + +	STACK_WIND (frame, afr_sh_metadata_getxattr_cbk, +		    priv->children[source], +		    priv->children[source]->fops->getxattr, +		    &local->loc, NULL); + +	return 0; +} + + +int +afr_sh_metadata_fix (call_frame_t *frame, xlator_t *this) +{ +	
afr_local_t     *local = NULL; +	afr_self_heal_t *sh = NULL; +	afr_private_t   *priv = NULL; +	int              nsources = 0; +	int              source = 0; +	int              i = 0; + +	local = frame->local; +	sh = &local->self_heal; +	priv = this->private; + +	afr_sh_build_pending_matrix (sh->pending_matrix, sh->xattr,  +				     priv->child_count, AFR_METADATA_PENDING); + +	afr_sh_print_pending_matrix (sh->pending_matrix, this); + +	afr_sh_mark_sources (sh->pending_matrix, sh->sources,  +			     priv->child_count); + +	afr_sh_supress_errenous_children (sh->sources, sh->child_errno, +					  priv->child_count); + +	nsources = afr_sh_source_count (sh->sources, priv->child_count); + +	if ((nsources == 0) +	    && (priv->favorite_child != -1) +	    && (sh->child_errno[priv->favorite_child] == 0)) { + +		gf_log (this->name, GF_LOG_WARNING, +			"Picking favorite child %s as authentic source to resolve conflicting metadata of %s", +			priv->children[priv->favorite_child]->name, +			local->loc.path); + +		sh->sources[priv->favorite_child] = 1; + +		nsources = afr_sh_source_count (sh->sources, +						priv->child_count); +	} + +	if (nsources == 0) { +		gf_log (this->name, GF_LOG_ERROR, +			"Unable to resolve conflicting metadata of %s. " +			"Please resolve manually by fixing the " +			"permissions/ownership of %s on your subvolumes. 
" +			"You can also consider 'option favorite-child <>'", +			local->loc.path, local->loc.path); + +		local->govinda_gOvinda = 1; + +		afr_sh_metadata_finish (frame, this); +		return 0; +	} + +	source = afr_sh_select_source (sh->sources, priv->child_count); +	sh->source = source; + +	/* detect changes not visible through pending flags -- JIC */ +	for (i = 0; i < priv->child_count; i++) { +		if (i == source || sh->child_errno[i]) +			continue; + +		if (PERMISSION_DIFFERS (&sh->buf[i], &sh->buf[source])) +			sh->sources[i] = 0; + +		if (OWNERSHIP_DIFFERS (&sh->buf[i], &sh->buf[source])) +			sh->sources[i] = 0; +	} + +	afr_sh_metadata_sync_prepare (frame, this); + +	return 0; +} + + +int +afr_sh_metadata_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +			    int32_t op_ret, int32_t op_errno, +			    inode_t *inode, struct stat *buf, dict_t *xattr) +{ +	afr_local_t     *local = NULL; +	afr_self_heal_t *sh = NULL; +	afr_private_t   *priv = NULL; +	int              call_count = 0; +	int              child_index = 0; + + +	local = frame->local; +	sh = &local->self_heal; +	priv = this->private; + +	child_index = (long) cookie; + +	LOCK (&frame->lock); +	{ +		if (op_ret == 0) { +			gf_log (this->name, GF_LOG_DEBUG, +				"path %s on subvolume %s is of mode 0%o", +				local->loc.path, +				priv->children[child_index]->name, +				buf->st_mode); + +			sh->buf[child_index] = *buf; +			if (xattr) +				sh->xattr[child_index] = dict_ref (xattr); +		} else { +			gf_log (this->name, GF_LOG_DEBUG, +				"path %s on subvolume %s => -1 (%s)", +				local->loc.path, +				priv->children[child_index]->name, +				strerror (op_errno)); + +			sh->child_errno[child_index] = op_errno; +		} +	} +	UNLOCK (&frame->lock); + +	call_count = afr_frame_return (frame); + +	if (call_count == 0) +		afr_sh_metadata_fix (frame, this); + +	return 0; +} + + +int +afr_sh_metadata_lookup (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t     *local = NULL; +	afr_self_heal_t *sh = NULL; +	
afr_private_t   *priv = NULL; +	int              i = 0; +	int              call_count = 0; +	dict_t          *xattr_req = NULL; +	int              ret = 0; + +	local = frame->local; +	sh = &local->self_heal; +	priv = this->private; + +	call_count = local->child_count; +	local->call_count = call_count; +	 +	xattr_req = dict_new(); +	 +	if (xattr_req) +		ret = dict_set_uint64 (xattr_req, AFR_METADATA_PENDING, +				       priv->child_count * sizeof(int32_t)); + +	for (i = 0; i < priv->child_count; i++) { +		if (local->child_up[i]) { +			gf_log (this->name, GF_LOG_DEBUG, +				"looking up %s on %s", +				local->loc.path, priv->children[i]->name); + +			STACK_WIND_COOKIE (frame, afr_sh_metadata_lookup_cbk, +					   (void *) (long) i, +					   priv->children[i], +					   priv->children[i]->fops->lookup, +					   &local->loc, xattr_req); +			if (!--call_count) +				break; +		} +	} +	 +	if (xattr_req) +		dict_unref (xattr_req); + +	return 0; +} + + +int +afr_sh_metadata_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +			int32_t op_ret, int32_t op_errno) +{ +	afr_local_t     *local = NULL; +	afr_self_heal_t *sh = NULL; +	afr_private_t   *priv = NULL; +	int              call_count = 0; +	int              child_index = (long) cookie; + +	/* TODO: what if lock fails? */ +	 +	local = frame->local; +	sh = &local->self_heal; +	priv = this->private; + +	LOCK (&frame->lock); +	{ +		if (op_ret == -1) { +			sh->op_failed = 1; + +			gf_log (this->name, +				(op_errno == EAGAIN ? 
GF_LOG_DEBUG : GF_LOG_ERROR), +				"locking of %s on child %d failed: %s", +				local->loc.path, child_index, +				strerror (op_errno)); +		} else { +			gf_log (this->name, GF_LOG_DEBUG, +				"inode of %s on child %d locked", +				local->loc.path, child_index); +		} +	} +	UNLOCK (&frame->lock); + +	call_count = afr_frame_return (frame); + +	if (call_count == 0) { +		if (sh->op_failed) { +			afr_sh_metadata_finish (frame, this); +			return 0; +		} + +		afr_sh_metadata_lookup (frame, this); +	} + +	return 0; +} + + +int +afr_sh_metadata_lock (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t     *local = NULL; +	afr_self_heal_t *sh = NULL; +	afr_private_t   *priv = NULL; +	int              i = 0; +	int              call_count = 0; +	struct flock     flock = {0, }; + + +	local = frame->local; +	sh = &local->self_heal; +	priv = this->private; + +	call_count = local->child_count; +	local->call_count = call_count; + +	for (i = 0; i < priv->child_count; i++) { +		flock.l_start   = 0; +		flock.l_len     = 0; +		flock.l_type    = F_WRLCK; + +		if (local->child_up[i]) { +			gf_log (this->name, GF_LOG_DEBUG, +				"locking %s on subvolume %s", +				local->loc.path, priv->children[i]->name); + +			STACK_WIND_COOKIE (frame, afr_sh_metadata_lk_cbk, +					   (void *) (long) i, +					   priv->children[i], +					   priv->children[i]->fops->inodelk, +					   &local->loc, F_SETLK, &flock); + +			if (!--call_count) +				break; +		} +	} + +	return 0; +} + + +int +afr_self_heal_metadata (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t   *local = NULL; +	afr_self_heal_t *sh = NULL; +	afr_private_t *priv = this->private; + + +	local = frame->local; +	sh = &local->self_heal; + +	if (local->need_metadata_self_heal && priv->metadata_self_heal) { +		afr_sh_metadata_lock (frame, this); +	} else { +		gf_log (this->name, GF_LOG_DEBUG, +			"proceeding to data check on %s", +			local->loc.path); +		afr_sh_metadata_done (frame, this); +	} + +	return 0; +} + diff --git 
a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h new file mode 100644 index 00000000000..1c97a9bc11b --- /dev/null +++ b/xlators/cluster/afr/src/afr-self-heal.h @@ -0,0 +1,52 @@ +/* +   Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> +   This file is part of GlusterFS. + +   GlusterFS is free software; you can redistribute it and/or modify +   it under the terms of the GNU General Public License as published +   by the Free Software Foundation; either version 3 of the License, +   or (at your option) any later version. + +   GlusterFS is distributed in the hope that it will be useful, but +   WITHOUT ANY WARRANTY; without even the implied warranty of +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU +   General Public License for more details. + +   You should have received a copy of the GNU General Public License +   along with this program.  If not, see +   <http://www.gnu.org/licenses/>. +*/ + +#ifndef __AFR_SELF_HEAL_H__ +#define __AFR_SELF_HEAL_H__ + +#include <sys/stat.h> + +#define FILETYPE_DIFFERS(buf1,buf2) ((S_IFMT & ((struct stat *)buf1)->st_mode) != (S_IFMT & ((struct stat *)buf2)->st_mode)) +#define PERMISSION_DIFFERS(buf1,buf2) ((((struct stat *)buf1)->st_mode) != (((struct stat *)buf2)->st_mode)) +#define OWNERSHIP_DIFFERS(buf1,buf2) ((((struct stat *)buf1)->st_uid) != (((struct stat *)buf2)->st_uid) || (((struct stat *)buf1)->st_gid != (((struct stat *)buf2)->st_gid))) +#define SIZE_DIFFERS(buf1,buf2) ((((struct stat *)buf1)->st_size) != (((struct stat *)buf2)->st_size)) + + + +int +afr_sh_has_metadata_pending (dict_t *xattr, int child_count, xlator_t *this); +int +afr_sh_has_entry_pending (dict_t *xattr, int child_count, xlator_t *this); +int +afr_sh_has_data_pending (dict_t *xattr, int child_count, xlator_t *this); + +int +afr_self_heal_entry (call_frame_t *frame, xlator_t *this); + +int +afr_self_heal_data (call_frame_t *frame, xlator_t *this); + +int +afr_self_heal_metadata 
(call_frame_t *frame, xlator_t *this); + +int +afr_self_heal (call_frame_t *frame, xlator_t *this, +	       int (*completion_cbk) (call_frame_t *, xlator_t *)); + +#endif /* __AFR_SELF_HEAL_H__ */ diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c new file mode 100644 index 00000000000..3df9f07e5a3 --- /dev/null +++ b/xlators/cluster/afr/src/afr-transaction.c @@ -0,0 +1,957 @@ +/* +  Copyright (c) 2007, 2008, 2009 Z RESEARCH, Inc. <http://www.zresearch.com> +  This file is part of GlusterFS. + +  GlusterFS is free software; you can redistribute it and/or modify +  it under the terms of the GNU General Public License as published +  by the Free Software Foundation; either version 3 of the License, +  or (at your option) any later version. + +  GlusterFS is distributed in the hope that it will be useful, but +  WITHOUT ANY WARRANTY; without even the implied warranty of +  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU +  General Public License for more details. + +  You should have received a copy of the GNU General Public License +  along with this program.  If not, see +  <http://www.gnu.org/licenses/>. 
+*/ + +#include "dict.h" +#include "byte-order.h" + +#include "afr.h" +#include "afr-transaction.h" + +#include <signal.h> + + +static void +__mark_all_pending (int32_t *pending, int child_count) +{	 +	int i; +	 +	for (i = 0; i < child_count; i++) +		pending[i] = hton32 (1); +} + + +static void +__mark_child_dead (int32_t *pending, int child_count, int child) +{ +	pending[child] = 0; +} + + +static void +__mark_down_children (int32_t *pending, int child_count, unsigned char *child_up) +{ +	int i; +	 +	for (i = 0; i < child_count; i++) +		if (!child_up[i]) +			pending[i] = 0; +} + + +static void +__mark_all_success (int32_t *pending, int child_count) +{ +	int i; +	 +	for (i = 0; i < child_count; i++) +		pending[i] = hton32 (-1); +} + + +static int +__is_first_write_on_fd (xlator_t *this, fd_t *fd) +{ +	int op_ret     = 0; +	int _ret       = -1; + +	_ret = fd_ctx_get (fd, this, NULL); +	if (_ret < 0) { +		gf_log (this->name, GF_LOG_DEBUG, +			"first writev() on fd=%p, writing changelog", +			fd); + +		_ret = fd_ctx_set (fd, this, 0xaf1); +		op_ret = 1; +	} + +	return op_ret; +} + + +static int +__changelog_enabled (afr_private_t *priv, afr_transaction_type type) +{ +	int ret = 0; + +	switch (type) { +	case AFR_DATA_TRANSACTION: +		if (priv->data_change_log) +			ret = 1; +		 +		break; + +	case AFR_METADATA_TRANSACTION: +		if (priv->metadata_change_log) +			ret = 1; + +		break; + +	case AFR_ENTRY_TRANSACTION: +	case AFR_ENTRY_RENAME_TRANSACTION: +		if (priv->entry_change_log) +			ret = 1; + +		break; +		 +	case AFR_FLUSH_TRANSACTION: +		ret = 1; +	} + +	return ret; +} + + +static int +__changelog_needed_pre_op (call_frame_t *frame, xlator_t *this) +{ +	afr_private_t * priv  = NULL; +	afr_local_t   * local = NULL; +	fd_t *          fd    = NULL; + +	int op_ret   = 0; + +	priv  = this->private; +	local = frame->local; +	 +	if (__changelog_enabled (priv, local->transaction.type)) { +		switch (local->op) { + +		case GF_FOP_WRITE: +		case GF_FOP_FTRUNCATE: +			/*  +			   if 
it's a data transaction, we write the changelog +			   only on the first write on an fd  +			*/ +			 +			fd = local->fd; +			if (!fd || __is_first_write_on_fd (this, fd)) +				op_ret = 1; + +			break; + +		case GF_FOP_FLUSH: +			/* only do post-op on flush() */ + +			op_ret = 0; +			break; + +		default: +			op_ret = 1; +		} +	} + +	return op_ret; +} + + +static int +__changelog_needed_post_op (call_frame_t *frame, xlator_t *this) +{ +	afr_private_t * priv  = NULL; +	afr_local_t   * local = NULL; + +	int ret = 0; +	afr_transaction_type type = -1; + +	priv  = this->private; +	local = frame->local; +	type  = local->transaction.type; + +	if (__changelog_enabled (priv, type) +	    && (local->op != GF_FOP_WRITE) +	    && (local->op != GF_FOP_FTRUNCATE)) +		ret = 1; +	 +	return ret; +} + + +static int +afr_lock_server_count (afr_private_t *priv, afr_transaction_type type) +{ +	int ret = 0; + +	switch (type) { +	case AFR_FLUSH_TRANSACTION: +	case AFR_DATA_TRANSACTION: +		ret = priv->data_lock_server_count; +		break; + +	case AFR_METADATA_TRANSACTION: +		ret = priv->metadata_lock_server_count; +		break; + +	case AFR_ENTRY_TRANSACTION: +	case AFR_ENTRY_RENAME_TRANSACTION: +		ret = priv->entry_lock_server_count; +		break; +	} + +	return ret; +} + + +/* {{{ unlock */ + +int32_t +afr_unlock_common_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +		       int32_t op_ret, int32_t op_errno) +{ +	afr_local_t *local; +	int call_count = 0; + +	local = frame->local; + +	LOCK (&frame->lock); +	{ +		call_count = --local->call_count; +	} +	UNLOCK (&frame->lock); + +	if (call_count == 0) { +		local->transaction.done (frame, this); +	} +	 +	return 0; +} + + +int +afr_unlock (call_frame_t *frame, xlator_t *this) +{ +	struct flock flock;			 + +	int i = 0;				 +	int call_count = 0;		      + +	afr_local_t *local = NULL; +	afr_private_t * priv = this->private; + +	local = frame->local; +	 +	call_count = afr_locked_nodes_count (local->transaction.locked_nodes,  +					     
priv->child_count); +	 +	if (call_count == 0) { +		local->transaction.done (frame, this); +		return 0; +	} + +	if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION)  +		call_count *= 2; + +	local->call_count = call_count;		 + +	for (i = 0; i < priv->child_count; i++) {				 +		flock.l_start = local->transaction.start;			 +		flock.l_len   = local->transaction.len; +		flock.l_type  = F_UNLCK;			 + +		if (local->transaction.locked_nodes[i]) { +			switch (local->transaction.type) { +			case AFR_DATA_TRANSACTION: +			case AFR_METADATA_TRANSACTION: +			case AFR_FLUSH_TRANSACTION: + +				if (local->fd) { +					STACK_WIND (frame, afr_unlock_common_cbk,	 +						    priv->children[i],  +						    priv->children[i]->fops->finodelk,  +						    local->fd, F_SETLK, &flock);  +				} else { +					STACK_WIND (frame, afr_unlock_common_cbk,	 +						    priv->children[i],  +						    priv->children[i]->fops->inodelk,  +						    &local->loc,  F_SETLK, &flock);  +				} +				 +				break; + +			case AFR_ENTRY_RENAME_TRANSACTION: +				 +				STACK_WIND (frame, afr_unlock_common_cbk,	 +					    priv->children[i],  +					    priv->children[i]->fops->entrylk,  +					    &local->transaction.new_parent_loc,  +					    local->transaction.new_basename, +					    ENTRYLK_UNLOCK, ENTRYLK_WRLCK); + +				call_count--; + +				/* fall through */ + +			case AFR_ENTRY_TRANSACTION: +				if (local->fd) { +					STACK_WIND (frame, afr_unlock_common_cbk,	 +						    priv->children[i],  +						    priv->children[i]->fops->fentrylk,  +						    local->fd,  +						    local->transaction.basename, +						    ENTRYLK_UNLOCK, ENTRYLK_WRLCK); +				} else { +					STACK_WIND (frame, afr_unlock_common_cbk,	 +						    priv->children[i],  +						    priv->children[i]->fops->entrylk,  +						    &local->transaction.parent_loc,  +						    local->transaction.basename, +						    ENTRYLK_UNLOCK, ENTRYLK_WRLCK); + +				} +				break; +			} +			 +			if (!--call_count) +				break; +		} +	} + +	return 0; +} + +/* }}} */ 
+ + +/* {{{ pending */ + +int32_t +afr_changelog_post_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +			   int32_t op_ret, int32_t op_errno, dict_t *xattr) +{ +	afr_private_t * priv  = NULL; +	afr_local_t *   local = NULL; +	 +	int call_count = -1; + +	priv  = this->private; +	local = frame->local; + +	LOCK (&frame->lock); +	{ +		call_count = --local->call_count; +	} +	UNLOCK (&frame->lock); + +	if (call_count == 0) { +		if (afr_lock_server_count (priv, local->transaction.type) == 0) { +			local->transaction.done (frame, this); +		} else { +			afr_unlock (frame, this); +		} +	} + +	return 0;	 +} + + +int  +afr_changelog_post_op (call_frame_t *frame, xlator_t *this) +{ +	afr_private_t * priv = this->private; + +	int ret        = 0; +	int i          = 0;				 +	int call_count = 0; +	 +	afr_local_t *  local = NULL;	 +	dict_t *       xattr = dict_ref (get_new_dict ()); + +	local = frame->local; + +	__mark_all_success (local->pending_array, priv->child_count); +	__mark_down_children (local->pending_array, priv->child_count, local->child_up); + +	call_count = afr_up_children_count (priv->child_count, local->child_up);  + +	if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) { +		call_count *= 2; +	} + +	local->call_count = call_count;		 + +	if (call_count == 0) { +		/* no child is up */ +		dict_unref (xattr); +		afr_unlock (frame, this); +		return 0; +	} + +	for (i = 0; i < priv->child_count; i++) {					 +		if (local->child_up[i]) { +			ret = dict_set_static_bin (xattr, local->transaction.pending,  +						   local->pending_array,  +						   priv->child_count * sizeof (int32_t)); +			if (ret < 0) +				gf_log (this->name, GF_LOG_ERROR,  +					"failed to set pending entry"); + + +			switch (local->transaction.type) { +			case AFR_DATA_TRANSACTION: +			case AFR_METADATA_TRANSACTION: +			case AFR_FLUSH_TRANSACTION: +			{ +				if (local->fd) +					STACK_WIND (frame, afr_changelog_post_op_cbk, +						    priv->children[i],  +						    
priv->children[i]->fops->fxattrop, +						    local->fd,  +						    GF_XATTROP_ADD_ARRAY, xattr); +				else  +					STACK_WIND (frame, afr_changelog_post_op_cbk, +						    priv->children[i],  +						    priv->children[i]->fops->xattrop, +						    &local->loc,  +						    GF_XATTROP_ADD_ARRAY, xattr); +			} +			break; + +			case AFR_ENTRY_RENAME_TRANSACTION: +			{ +				STACK_WIND_COOKIE (frame, afr_changelog_post_op_cbk, +						   (void *) (long) i, +						   priv->children[i], +						   priv->children[i]->fops->xattrop, +						   &local->transaction.new_parent_loc, +						   GF_XATTROP_ADD_ARRAY, xattr); +				 +				call_count--; +			} + +			/*  +			   set it again because previous stack_wind +			   might have already returned (think of case +			   where subvolume is posix) and would have +			   used the dict as placeholder for return +			   value +			*/ +			ret = dict_set_static_bin (xattr, local->transaction.pending,  +						   local->pending_array,  +						   priv->child_count * sizeof (int32_t)); +			if (ret < 0) +				gf_log (this->name, GF_LOG_ERROR,  +					"failed to set pending entry"); + +			/* fall through */ + +			case AFR_ENTRY_TRANSACTION: +			{ +				if (local->fd) +					STACK_WIND (frame, afr_changelog_post_op_cbk, +						    priv->children[i],  +						    priv->children[i]->fops->fxattrop, +						    local->fd,  +						    GF_XATTROP_ADD_ARRAY, xattr); +				else  +					STACK_WIND (frame, afr_changelog_post_op_cbk, +						    priv->children[i],  +						    priv->children[i]->fops->xattrop, +						    &local->transaction.parent_loc,  +						    GF_XATTROP_ADD_ARRAY, xattr); +			} +			break; +			} + +			if (!--call_count) +				break; +		} +	} +	 +	dict_unref (xattr); +	return 0; +} + + +int32_t +afr_changelog_pre_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +			      int32_t op_ret, int32_t op_errno, dict_t *xattr) +{ +	afr_local_t *   local = NULL; +	afr_private_t * priv  = this->private; +	loc_t       *   loc   = NULL; + +	int call_count 
 = -1; +	int child_index = (long) cookie; + +	local = frame->local; +	loc   = &local->loc; + +	LOCK (&frame->lock); +	{ +		if (op_ret == -1) { +			local->child_up[child_index] = 0; +			 +			if (op_errno == ENOTSUP) { +				gf_log (this->name, GF_LOG_ERROR, +					"xattrop not supported by %s", +					priv->children[child_index]->name); +				local->op_ret = -1; +			} else if (!child_went_down (op_ret, op_errno)) { +				gf_log (this->name, GF_LOG_ERROR, +					"xattrop failed on child %s: %s", +					priv->children[child_index]->name,  +					strerror (op_errno)); +			} +			local->op_errno = op_errno; +		} + +		call_count = --local->call_count; +	} +	UNLOCK (&frame->lock); + +	if (call_count == 0) { +		if ((local->op_ret == -1) &&  +		    (local->op_errno == ENOTSUP)) { +			local->transaction.resume (frame, this); +		} else { +			local->transaction.fop (frame, this); +		} +	} + +	return 0;	 +} + + +int  +afr_changelog_pre_op (call_frame_t *frame, xlator_t *this) +{ +	afr_private_t * priv = this->private; + +	int i = 0;				 +	int ret = 0; +	int call_count = 0;		      +	dict_t *xattr = NULL; + +	afr_local_t *local = NULL; + +	local = frame->local; +	xattr = get_new_dict (); +	dict_ref (xattr); + +	call_count = afr_up_children_count (priv->child_count,  +					    local->child_up);  + +	if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) { +		call_count *= 2; +	} + +	if (call_count == 0) { +		/* no child is up */ +		dict_unref (xattr); +		afr_unlock (frame, this); +		return 0; +	} + +	local->call_count = call_count;		 + +	__mark_all_pending (local->pending_array, priv->child_count); + +	for (i = 0; i < priv->child_count; i++) { +		if (local->child_up[i]) { +			ret = dict_set_static_bin (xattr,  +						   local->transaction.pending,  +						   local->pending_array,  +						   (priv->child_count *  +						    sizeof (int32_t))); +			if (ret < 0) +				gf_log (this->name, GF_LOG_ERROR,  +					"failed to set pending entry"); + + +			switch (local->transaction.type) { +			
case AFR_DATA_TRANSACTION: +			case AFR_METADATA_TRANSACTION: +			case AFR_FLUSH_TRANSACTION: +			{ +				if (local->fd) +					STACK_WIND_COOKIE (frame,  +							   afr_changelog_pre_op_cbk, +							   (void *) (long) i, +							   priv->children[i],  +							   priv->children[i]->fops->fxattrop, +							   local->fd, +							   GF_XATTROP_ADD_ARRAY, xattr); +				else +					STACK_WIND_COOKIE (frame,  +							   afr_changelog_pre_op_cbk, +							   (void *) (long) i, +							   priv->children[i],  +							   priv->children[i]->fops->xattrop, +							   &(local->loc),  +							   GF_XATTROP_ADD_ARRAY, xattr); +			} +			break; +				 +			case AFR_ENTRY_RENAME_TRANSACTION:  +			{ +				STACK_WIND_COOKIE (frame,  +						   afr_changelog_pre_op_cbk, +						   (void *) (long) i, +						   priv->children[i],  +						   priv->children[i]->fops->xattrop, +						   &local->transaction.new_parent_loc,  +						   GF_XATTROP_ADD_ARRAY, xattr); + +				call_count--; +			} + + +			/*  +			   set it again because previous stack_wind +			   might have already returned (think of case +			   where subvolume is posix) and would have +			   used the dict as placeholder for return +			   value +			*/ + +			ret = dict_set_static_bin (xattr, local->transaction.pending,  +						   local->pending_array,  +						   priv->child_count * sizeof (int32_t)); +			if (ret < 0) +				gf_log (this->name, GF_LOG_ERROR,  +					"failed to set pending entry"); + +			/* fall through */ +				 +			case AFR_ENTRY_TRANSACTION: +			{ +				if (local->fd) +					STACK_WIND_COOKIE (frame,  +							   afr_changelog_pre_op_cbk, +							   (void *) (long) i, +							   priv->children[i],  +							   priv->children[i]->fops->fxattrop, +							   local->fd,  +							   GF_XATTROP_ADD_ARRAY, xattr); +				else +					STACK_WIND_COOKIE (frame,  +							   afr_changelog_pre_op_cbk, +							   (void *) (long) i, +							   priv->children[i],  +							   priv->children[i]->fops->xattrop, +							   &local->transaction.parent_loc,  
+							   GF_XATTROP_ADD_ARRAY, xattr); +			} + +			break; +			} + +			if (!--call_count) +				break; +		} +	} + +	dict_unref (xattr); +	return 0; +} + +/* }}} */ + +/* {{{ lock */ + +static +int afr_lock_rec (call_frame_t *frame, xlator_t *this, int child_index); + +int32_t +afr_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +	      int32_t op_ret, int32_t op_errno) +{ +	afr_local_t *   local = NULL; +	afr_private_t * priv = NULL; +	int done = 0; +	int child_index = (long) cookie; + +	int call_count = 0; + +	local = frame->local; +	priv  = this->private; + +	LOCK (&frame->lock); +	{ +		if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) { +			/* wait for the other lock to return */ +			call_count = --local->call_count; +		} + +		if (op_ret == -1) { +			if (op_errno == ENOSYS) { +				/* return ENOTSUP */ +				gf_log (this->name, GF_LOG_ERROR, +					"subvolume does not support locking. " +					"please load features/posix-locks xlator on server"); +				local->op_ret   = op_ret; +				done = 1; +			} + +			local->child_up[child_index] = 0; +			local->op_errno = op_errno; +		} +	} +	UNLOCK (&frame->lock); +	 +	if (call_count == 0) { +		if ((local->op_ret == -1) && +		    (local->op_errno == ENOSYS)) { +			afr_unlock (frame, this); +		} else { +			local->transaction.locked_nodes[child_index] = 1; +			local->transaction.lock_count++; +			afr_lock_rec (frame, this, child_index + 1); +		} +	} + +	return 0; +} + + +static loc_t * +lower_path (loc_t *l1, const char *b1, loc_t *l2, const char *b2) +{ +	int ret = 0; + +	ret = strcmp (l1->path, l2->path); +	 +	if (ret == 0)  +		ret = strcmp (b1, b2); + +	if (ret <= 0) +		return l1; +	else +		return l2; +} + + +static +int afr_lock_rec (call_frame_t *frame, xlator_t *this, int child_index) +{ +	afr_local_t *   local = NULL; +	afr_private_t * priv  = NULL; + +	struct flock flock; + +	loc_t * lower  = NULL; +	loc_t * higher = NULL; + +	const char *lower_name  = NULL; +	const char *higher_name = NULL; + +	
local = frame->local; +	priv  = this->private; + +	flock.l_start = local->transaction.start; +	flock.l_len   = local->transaction.len; +	flock.l_type  = F_WRLCK; + +	/* skip over children that are down */ +	while ((child_index < priv->child_count) +	       && !local->child_up[child_index]) +		child_index++; + +	if ((child_index == priv->child_count) && +	    local->transaction.lock_count == 0) { + +		gf_log (this->name, GF_LOG_DEBUG, +			"unable to lock on even one child"); + +		local->op_ret   = -1; +		local->op_errno = EAGAIN; + +		local->transaction.done (frame, this); +		 +		return 0; + +	} + +	if ((child_index == priv->child_count)  +	    || (local->transaction.lock_count ==  +		afr_lock_server_count (priv, local->transaction.type))) { + +		/* we're done locking */ + +		if (__changelog_needed_pre_op (frame, this)) { +			afr_changelog_pre_op (frame, this); +		} else { +			local->transaction.fop (frame, this); +		} + +		return 0; +	} + +	switch (local->transaction.type) { +	case AFR_DATA_TRANSACTION:		 +	case AFR_METADATA_TRANSACTION: +	case AFR_FLUSH_TRANSACTION: + +		if (local->fd) { +			STACK_WIND_COOKIE (frame, afr_lock_cbk, +					   (void *) (long) child_index, +					   priv->children[child_index],  +					   priv->children[child_index]->fops->finodelk, +					   local->fd, F_SETLKW, &flock); +			 +		} else { +			STACK_WIND_COOKIE (frame, afr_lock_cbk, +					   (void *) (long) child_index, +					   priv->children[child_index],  +					   priv->children[child_index]->fops->inodelk, +					   &local->loc, F_SETLKW, &flock); +		} +		 +		break; +		 +	case AFR_ENTRY_RENAME_TRANSACTION: +	{ +		local->call_count = 2; + +		lower = lower_path (&local->transaction.parent_loc,  +				    local->transaction.basename, +				    &local->transaction.new_parent_loc, +				    local->transaction.new_basename); +		 +		lower_name = (lower == &local->transaction.parent_loc ?  
+			      local->transaction.basename : +			      local->transaction.new_basename); + +		higher = (lower == &local->transaction.parent_loc ?  +			  &local->transaction.new_parent_loc : +			  &local->transaction.parent_loc); + +		higher_name = (higher == &local->transaction.parent_loc ?  +			       local->transaction.basename : +			       local->transaction.new_basename); + + +		/* TODO: these locks should be blocking */ + +		STACK_WIND_COOKIE (frame, afr_lock_cbk, +				   (void *) (long) child_index, +				   priv->children[child_index],  +				   priv->children[child_index]->fops->entrylk,  +				   lower, lower_name, +				   ENTRYLK_LOCK, ENTRYLK_WRLCK); + +		STACK_WIND_COOKIE (frame, afr_lock_cbk, +				   (void *) (long) child_index, +				   priv->children[child_index],  +				   priv->children[child_index]->fops->entrylk,  +				   higher, higher_name, +				   ENTRYLK_LOCK, ENTRYLK_WRLCK); + +		break; +	} +		 +	case AFR_ENTRY_TRANSACTION: +		if (local->fd) { +			STACK_WIND_COOKIE (frame, afr_lock_cbk, +					   (void *) (long) child_index,	 +					   priv->children[child_index],  +					   priv->children[child_index]->fops->fentrylk,  +					   local->fd,  +					   local->transaction.basename, +					   ENTRYLK_LOCK, ENTRYLK_WRLCK); +		} else { +			STACK_WIND_COOKIE (frame, afr_lock_cbk, +					   (void *) (long) child_index,	 +					   priv->children[child_index],  +					   priv->children[child_index]->fops->entrylk,  +					   &local->transaction.parent_loc,  +					   local->transaction.basename, +					   ENTRYLK_LOCK, ENTRYLK_WRLCK); +		} + +		break; +	} + +	return 0; +} + + +int32_t afr_lock (call_frame_t *frame, xlator_t *this) +{ +	return afr_lock_rec (frame, this, 0); +} + + +/* }}} */ + +int32_t +afr_transaction_resume (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t *   local = NULL; +	afr_private_t * priv  = NULL; + +	local = frame->local; +	priv  = this->private; + +	if (__changelog_needed_post_op (frame, this)) { +		afr_changelog_post_op (frame, this); +	} 
else { +		if (afr_lock_server_count (priv, local->transaction.type) == 0) { +			local->transaction.done (frame, this); +		} else { +			afr_unlock (frame, this); +		} +	} + +	return 0; +} + + +/** + * afr_transaction_child_died - inform that a child died during an fop + */ + +void +afr_transaction_child_died (call_frame_t *frame, xlator_t *this, int child_index) +{ +	afr_local_t *   local = NULL; +	afr_private_t * priv  = NULL; + +	local = frame->local; +	priv  = this->private; + +	__mark_child_dead (local->pending_array, priv->child_count, child_index); +} + + +int32_t +afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type) +{ +	afr_local_t *   local = NULL; +	afr_private_t * priv  = NULL; + +	local = frame->local; +	priv  = this->private; + +	afr_transaction_local_init (local, priv); + +	local->transaction.resume = afr_transaction_resume; +	local->transaction.type   = type; + +	if (afr_lock_server_count (priv, local->transaction.type) == 0) { +		if (__changelog_needed_pre_op (frame, this)) { +			afr_changelog_pre_op (frame, this); +		} else { +			local->transaction.fop (frame, this); +		} +	} else { +		afr_lock (frame, this); +	} + +	return 0; +} diff --git a/xlators/cluster/afr/src/afr-transaction.h b/xlators/cluster/afr/src/afr-transaction.h new file mode 100644 index 00000000000..49cdd219f25 --- /dev/null +++ b/xlators/cluster/afr/src/afr-transaction.h @@ -0,0 +1,36 @@ +/* +   Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> +   This file is part of GlusterFS. + +   GlusterFS is free software; you can redistribute it and/or modify +   it under the terms of the GNU General Public License as published +   by the Free Software Foundation; either version 3 of the License, +   or (at your option) any later version. + +   GlusterFS is distributed in the hope that it will be useful, but +   WITHOUT ANY WARRANTY; without even the implied warranty of +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
See the GNU +   General Public License for more details. + +   You should have received a copy of the GNU General Public License +   along with this program.  If not, see +   <http://www.gnu.org/licenses/>. +*/ + +#ifndef __TRANSACTION_H__ +#define __TRANSACTION_H__ + +#define AFR_METADATA_PENDING "trusted.glusterfs.afr.metadata-pending" + +#define AFR_DATA_PENDING "trusted.glusterfs.afr.data-pending" + +#define AFR_ENTRY_PENDING "trusted.glusterfs.afr.entry-pending" + +void +afr_transaction_child_died (call_frame_t *frame, xlator_t *this, +			    int child_index); + +int32_t +afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type); + +#endif /* __TRANSACTION_H__ */ diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c new file mode 100644 index 00000000000..e4c1a847985 --- /dev/null +++ b/xlators/cluster/afr/src/afr.c @@ -0,0 +1,2338 @@ +/* +   Copyright (c) 2007, 2008, 2009 Z RESEARCH, Inc. <http://www.zresearch.com> +   This file is part of GlusterFS. + +   GlusterFS is free software; you can redistribute it and/or modify +   it under the terms of the GNU General Public License as published +   by the Free Software Foundation; either version 3 of the License, +   or (at your option) any later version. + +   GlusterFS is distributed in the hope that it will be useful, but +   WITHOUT ANY WARRANTY; without even the implied warranty of +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU +   General Public License for more details. + +   You should have received a copy of the GNU General Public License +   along with this program.  If not, see +   <http://www.gnu.org/licenses/>. 
+*/ + +#include <libgen.h> +#include <unistd.h> +#include <fnmatch.h> +#include <sys/time.h> +#include <stdlib.h> +#include <signal.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "afr.h" +#include "dict.h" +#include "xlator.h" +#include "hashfn.h" +#include "logging.h" +#include "stack.h" +#include "list.h" +#include "call-stub.h" +#include "defaults.h" +#include "common-utils.h" +#include "compat-errno.h" +#include "compat.h" +#include "byte-order.h" + +#include "afr-inode-read.h" +#include "afr-inode-write.h" +#include "afr-dir-read.h" +#include "afr-dir-write.h" +#include "afr-transaction.h" + +#include "afr-self-heal.h" + + +/** + * afr_local_cleanup - cleanup everything in frame->local + */ + +void +afr_local_sh_cleanup (afr_local_t *local, xlator_t *this) +{ +	afr_self_heal_t *sh = NULL; +	afr_private_t   *priv = NULL; +	int              i = 0; + + +	sh = &local->self_heal; +	priv = this->private; + +	if (sh->buf) +		FREE (sh->buf); + +	if (sh->xattr) { +		for (i = 0; i < priv->child_count; i++) { +			if (sh->xattr[i]) { +				dict_unref (sh->xattr[i]); +				sh->xattr[i] = NULL; +			} +		} +		FREE (sh->xattr); +	} + +	if (sh->child_errno) +		FREE (sh->child_errno); + +	if (sh->pending_matrix) { +		for (i = 0; i < priv->child_count; i++) { +			FREE (sh->pending_matrix[i]); +		} +		FREE (sh->pending_matrix); +	} + +	if (sh->delta_matrix) { +		for (i = 0; i < priv->child_count; i++) { +			FREE (sh->delta_matrix[i]); +		} +		FREE (sh->delta_matrix); +	} + +	if (sh->sources) +		FREE (sh->sources); + +	if (sh->success) +		FREE (sh->success); + +	if (sh->healing_fd) { +		fd_unref (sh->healing_fd); +		sh->healing_fd = NULL; +	} + +	loc_wipe (&sh->parent_loc); +} + + +void  +afr_local_cleanup (afr_local_t *local, xlator_t *this) +{ +	if (!local) +		return; + +	afr_local_sh_cleanup (local, this); + +	FREE (local->child_errno); +	FREE (local->pending_array); + +	loc_wipe (&local->loc); +	loc_wipe 
(&local->newloc); + +	FREE (local->transaction.locked_nodes); +	FREE (local->transaction.child_errno); + +	FREE (local->transaction.basename); +	FREE (local->transaction.new_basename); + +	loc_wipe (&local->transaction.parent_loc);	 +	loc_wipe (&local->transaction.new_parent_loc); + +	if (local->fd) +		fd_unref (local->fd); +	 +	if (local->xattr_req) +		dict_unref (local->xattr_req); + +	FREE (local->child_up); + +	{ /* lookup */ +		if (local->cont.lookup.xattr) +			dict_unref (local->cont.lookup.xattr); +	} + +	{ /* getxattr */ +		if (local->cont.getxattr.name) +			FREE (local->cont.getxattr.name); +	} + +	{ /* lk */ +		if (local->cont.lk.locked_nodes) +			FREE (local->cont.lk.locked_nodes); +	} + +	{ /* checksum */ +		if (local->cont.checksum.file_checksum) +			FREE (local->cont.checksum.file_checksum); +		if (local->cont.checksum.dir_checksum) +			FREE (local->cont.checksum.dir_checksum); +	} + +	{ /* create */ +		if (local->cont.create.fd) +			fd_unref (local->cont.create.fd); +	} + +	{ /* writev */ +		FREE (local->cont.writev.vector); +	} + +	{ /* setxattr */ +		if (local->cont.setxattr.dict) +			dict_unref (local->cont.setxattr.dict); +	} + +	{ /* removexattr */ +		FREE (local->cont.removexattr.name); +	} + +	{ /* symlink */ +		FREE (local->cont.symlink.linkpath); +	} +} + + +int +afr_frame_return (call_frame_t *frame) +{ +	afr_local_t *local = NULL; +	int          call_count = 0; + +	local = frame->local; + +	LOCK (&frame->lock); +	{ +		call_count = --local->call_count; +	} +	UNLOCK (&frame->lock); + +	return call_count; +} + +/** + * first_up_child - return the index of the first child that is up + */ + +int +afr_first_up_child (afr_private_t *priv) +{ +	xlator_t ** children = NULL; +	int         ret      = -1; +	int         i        = 0; + +	LOCK (&priv->lock); +	{ +		children = priv->children; +		for (i = 0; i < priv->child_count; i++) { +			if (priv->child_up[i]) { +				ret = i; +				break; +			} +		} +	} +	UNLOCK (&priv->lock); + +	return ret; +} + + 
+/** + * up_children_count - return the number of children that are up + */ + +int +afr_up_children_count (int child_count, unsigned char *child_up) +{ +	int i   = 0; +	int ret = 0; + +	for (i = 0; i < child_count; i++) +		if (child_up[i]) +			ret++; +	return ret; +} + + +int +afr_locked_nodes_count (unsigned char *locked_nodes, int child_count) +{ +	int ret = 0; +	int i; + +	for (i = 0; i < child_count; i++) +		if (locked_nodes[i]) +			ret++; + +	return ret; +} + + +ino64_t +afr_itransform (ino64_t ino, int child_count, int child_index) +{ +	ino64_t scaled_ino = -1; + +	if (ino == ((uint64_t) -1)) { +		scaled_ino = ((uint64_t) -1); +		goto out; +	} + +	scaled_ino = (ino * child_count) + child_index; + +out: +	return scaled_ino; +} + + +int +afr_deitransform_orig (ino64_t ino, int child_count) +{ +	int index = -1; + +	index = ino % child_count; + +	return index; +} + + +int +afr_deitransform (ino64_t ino, int child_count) +{ +	return 0; +} + + +int +afr_self_heal_cbk (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t *local = NULL; +	int ret = -1; + +	local = frame->local; + +	if (local->govinda_gOvinda) { +		ret = inode_ctx_put (local->cont.lookup.inode, this, 1); + +		if (ret < 0) { +			local->op_ret   = -1; +			local->op_errno = -ret; +		} +	} else { +		inode_ctx_del (local->cont.lookup.inode, this, NULL); +	} + +	AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, +			  local->cont.lookup.inode, +			  &local->cont.lookup.buf, +			  local->cont.lookup.xattr); + +	return 0; +} + + +int +afr_lookup_cbk (call_frame_t *frame, void *cookie, +		xlator_t *this,	int32_t op_ret,	int32_t op_errno, +		inode_t *inode,	struct stat *buf, dict_t *xattr) +{ +	afr_local_t *   local = NULL; +	afr_private_t * priv  = NULL; +	struct stat *   lookup_buf = NULL; +	int             call_count = -1; +	int             child_index = -1; +	int             prev_child_index = -1; +	uint32_t        open_fd_count = 0; +	int             ret = 0; + +	child_index = (long) cookie; +	priv 
= this->private; + +	LOCK (&frame->lock); +	{ +		local = frame->local; + +		lookup_buf = &local->cont.lookup.buf; + +		if (op_ret == -1) { +			if (op_errno == ENOENT) +				local->enoent_count++; +			 +			if (op_errno != ENOTCONN) +				local->op_errno = op_errno; + +			goto unlock; +		} + +		if (afr_sh_has_metadata_pending (xattr, child_index, this)) +			local->need_metadata_self_heal = 1; + +		if (afr_sh_has_entry_pending (xattr, child_index, this)) +			local->need_entry_self_heal = 1; + +		if (afr_sh_has_data_pending (xattr, child_index, this)) +			local->need_data_self_heal = 1; + +		ret = dict_get_uint32 (xattr, GLUSTERFS_OPEN_FD_COUNT, +				       &open_fd_count); +		local->open_fd_count += open_fd_count; + +		/* in case of revalidate, we need to send stat of the +		 * child whose stat was sent during the first lookup. +		 * (so that time stamp does not vary with revalidate. +		 * in case it is down, stat of the fist success will +		 * be replied */ + +		/* inode number should be preserved across revalidates */ + +		if (local->success_count == 0) { +			local->op_ret   = op_ret; +				 +			local->cont.lookup.inode = inode; +			local->cont.lookup.xattr = dict_ref (xattr); + +			*lookup_buf = *buf; +			lookup_buf->st_ino = afr_itransform (buf->st_ino, +							     priv->child_count, +							     child_index); +		} else { +			if (FILETYPE_DIFFERS (buf, lookup_buf)) { +				/* mismatching filetypes with same name +				   -- Govinda !! GOvinda !!! 
+				*/ +				local->govinda_gOvinda = 1; +			} + +			if (PERMISSION_DIFFERS (buf, lookup_buf)) { +				/* mismatching permissions */ +				local->need_metadata_self_heal = 1; +			} + +			if (OWNERSHIP_DIFFERS (buf, lookup_buf)) { +				/* mismatching permissions */ +				local->need_metadata_self_heal = 1; +			} + +			if (SIZE_DIFFERS (buf, lookup_buf) +			    && S_ISREG (buf->st_mode)) { +				local->need_data_self_heal = 1; +			} + +			prev_child_index = afr_deitransform_orig (lookup_buf->st_ino,  +								  priv->child_count); +			if (child_index < prev_child_index) { +				*lookup_buf = *buf; +				lookup_buf->st_ino = afr_itransform (buf->st_ino, +								     priv->child_count, +								     child_index); +			} +		} + +		local->success_count++; +	} +unlock: +	UNLOCK (&frame->lock); + +	call_count = afr_frame_return (frame); + +	if (call_count == 0) { +		if (local->op_ret == 0) { +			/* KLUDGE: assuming DHT will not itransform in  +			   revalidate */ +			if (local->cont.lookup.inode->ino) +				lookup_buf->st_ino =  +					local->cont.lookup.inode->ino; +		} + +		if (local->success_count && local->enoent_count) { +			local->need_metadata_self_heal = 1; +			local->need_data_self_heal = 1; +			local->need_entry_self_heal = 1; +		} + +		if (local->success_count) { +			/* check for govinda_gOvinda case in previous lookup */ +			if (!inode_ctx_get (local->cont.lookup.inode,  +					   this, NULL)) +				local->need_data_self_heal = 1; +		} + +		if ((local->need_metadata_self_heal +		     || local->need_data_self_heal +		     || local->need_entry_self_heal) +		    && (!local->open_fd_count)) { + +			if (!local->cont.lookup.inode->st_mode) { +				/* fix for RT #602 */ +				local->cont.lookup.inode->st_mode = +					lookup_buf->st_mode; +			} + +			afr_self_heal (frame, this, afr_self_heal_cbk); +		} else { +			AFR_STACK_UNWIND (frame, local->op_ret, +					  local->op_errno, +					  local->cont.lookup.inode,  +					  &local->cont.lookup.buf, +					  local->cont.lookup.xattr); +		
} +	} + +	return 0; +} + + +int +afr_lookup (call_frame_t *frame, xlator_t *this, +	    loc_t *loc, dict_t *xattr_req) +{ +	afr_private_t *priv = NULL; +	afr_local_t   *local = NULL; +	int            ret = -1; +	int            i = 0; +	int32_t        op_errno = 0; + + +	priv = this->private; + +	ALLOC_OR_GOTO (local, afr_local_t, out); + +	local->op_ret = -1; + +	frame->local = local; + +	loc_copy (&local->loc, loc); + +	local->reval_child_index = 0; + +	local->call_count = priv->child_count; + +	local->child_up = memdup (priv->child_up, priv->child_count); +	local->child_count = afr_up_children_count (priv->child_count, +						    local->child_up); + +	/* By default assume ENOTCONN. On success it will be set to 0. */ +	local->op_errno = ENOTCONN; +	 +	if ((xattr_req == NULL) +	    && (priv->metadata_self_heal +		|| priv->data_self_heal +		|| priv->entry_self_heal)) +		local->xattr_req = dict_new (); +	else +		local->xattr_req = dict_ref (xattr_req); + +	if (priv->metadata_self_heal) { +		ret = dict_set_uint64 (local->xattr_req, AFR_METADATA_PENDING, +				       priv->child_count * sizeof(int32_t)); +	} +	 +	if (priv->data_self_heal) { +		ret = dict_set_uint64 (local->xattr_req, AFR_DATA_PENDING, +				       priv->child_count * sizeof(int32_t)); +	} +	 +	if (priv->entry_self_heal) { +		ret = dict_set_uint64 (local->xattr_req, AFR_ENTRY_PENDING, +				       priv->child_count * sizeof(int32_t)); +	} + +	ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_OPEN_FD_COUNT, 0); + +	for (i = 0; i < priv->child_count; i++) { +		STACK_WIND_COOKIE (frame, afr_lookup_cbk, (void *) (long) i, +				   priv->children[i], +				   priv->children[i]->fops->lookup, +				   loc, local->xattr_req); +	} + +	ret = 0; +out: +	if (ret == -1) +		AFR_STACK_UNWIND (frame, -1, ENOMEM, NULL, NULL, NULL); + +	return 0; +} + + +/* {{{ open */ + +int +afr_open_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,  +			int32_t op_ret, int32_t op_errno, struct stat *buf) +{ +	afr_local_t * 
local = frame->local; + +	AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, +			  local->fd); +	return 0; +} + + +int +afr_open_cbk (call_frame_t *frame, void *cookie, +	      xlator_t *this, int32_t op_ret, int32_t op_errno, +	      fd_t *fd) +{ +	afr_local_t *  local = NULL; +	afr_private_t * priv = NULL; + +	int call_count = -1; +	 +	priv  = this->private; +	local = frame->local; + +	LOCK (&frame->lock); +	{ +		if (op_ret == -1) { +			local->op_errno = op_errno; +		} + +		if (op_ret >= 0) { +			local->op_ret = op_ret; +		} +	} +	UNLOCK (&frame->lock); + +	call_count = afr_frame_return (frame); + +	if (call_count == 0) { +		if ((local->cont.open.flags & O_TRUNC) +		    && (local->op_ret >= 0)) { +			STACK_WIND (frame, afr_open_ftruncate_cbk, +				    this, this->fops->ftruncate, +				    fd, 0); +		} else { +			AFR_STACK_UNWIND (frame, local->op_ret, +					  local->op_errno, local->fd); +		} +	} + +	return 0; +} + + +int +afr_open (call_frame_t *frame, xlator_t *this, +	  loc_t *loc, int32_t flags, fd_t *fd) +{ +	afr_private_t * priv  = NULL; +	afr_local_t *   local = NULL; +	 +	int     i = 0; +	int   ret = -1; + +	int32_t call_count = 0;	 +	int32_t op_ret   = -1; +	int32_t op_errno = 0; +	int32_t wind_flags = flags & (~O_TRUNC); + +	VALIDATE_OR_GOTO (frame, out); +	VALIDATE_OR_GOTO (this, out); +	VALIDATE_OR_GOTO (this->private, out); +	VALIDATE_OR_GOTO (loc, out); +	 +	priv = this->private; + +	ret = inode_ctx_get (loc->inode, this, NULL); +	if (ret == 0) { +		/* if ctx is set it means self-heal failed */ + +		gf_log (this->name, GF_LOG_WARNING,  +			"returning EIO, file has to be manually corrected " +			"in backend"); +		op_errno = EIO; +		goto out; +	} + +	ALLOC_OR_GOTO (local, afr_local_t, out); +	 +	ret = AFR_LOCAL_INIT (local, priv); +	if (ret < 0) { +		op_errno = -ret; +		goto out; +	} + +	frame->local = local; +	call_count   = local->call_count; + +	local->cont.open.flags = flags; +	local->fd = fd_ref (fd); + +	for (i = 0; i < priv->child_count; 
i++) { +		if (local->child_up[i]) { +			STACK_WIND_COOKIE (frame, afr_open_cbk, (void *) (long) i, +					   priv->children[i], +					   priv->children[i]->fops->open, +					   loc, wind_flags, fd); +			 +			if (!--call_count) +				break; +		} +	} + +	op_ret = 0; +out: +	if (op_ret == -1) { +		AFR_STACK_UNWIND (frame, op_ret, op_errno, fd); +	} + +	return 0; +} + +/* }}} */ + +/* {{{ flush */ + +int +afr_flush_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,  +		      int32_t op_ret, int32_t op_errno) +{ +	afr_local_t *   local = NULL; + +	int call_count  = -1; + +	local = frame->local; + +	LOCK (&frame->lock); +	{ +		if (op_ret == 0) +			local->op_ret = 0; + +		local->op_errno = op_errno; +	} +	UNLOCK (&frame->lock); + +	call_count = afr_frame_return (frame); + +	if (call_count == 0) { +		local->transaction.resume (frame, this); +	} +	 +	return 0; +} + + +int +afr_flush_wind (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t *local = NULL; +	afr_private_t *priv = NULL; +	 +	int i = 0; +	int call_count = -1; + +	local = frame->local; +	priv = this->private; + +	call_count = afr_up_children_count (priv->child_count, local->child_up); + +	if (call_count == 0) { +		local->transaction.resume (frame, this); +		return 0; +	} + +	local->call_count = call_count; + +	for (i = 0; i < priv->child_count; i++) {				 +		if (local->child_up[i]) { +			STACK_WIND_COOKIE (frame, afr_flush_wind_cbk,  +					   (void *) (long) i,	 +					   priv->children[i],  +					   priv->children[i]->fops->flush, +					   local->fd); +		 +			if (!--call_count) +				break; +		} +	} +	 +	return 0; +} + + +int +afr_flush_done (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t *local = NULL; + +	local = frame->local; + +	AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno); + +	return 0; +} + + +int +afr_simple_flush_cbk (call_frame_t *frame, void *cookie, +		      xlator_t *this, int32_t op_ret, int32_t op_errno) +{ +        afr_local_t *local = NULL; +         +        int 
call_count = -1; +	 +        local = frame->local; +	 +        LOCK (&frame->lock); +        { +                if (op_ret == 0) +                        local->op_ret = 0; + +                local->op_errno = op_errno; +        } +        UNLOCK (&frame->lock); +	 +        call_count = afr_frame_return (frame); +	 +        if (call_count == 0) +                AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno); +	 +        return 0; +} + + +static int +__is_fd_ctx_set (xlator_t *this, fd_t *fd) +{ +	int _ret   = 0; +	int op_ret = 0; + +	_ret = fd_ctx_get (fd, this, NULL); +	if (_ret == 0) +		op_ret = 1; + +	return op_ret; +} + + +int +afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd) +{ +	afr_private_t * priv  = NULL; +	afr_local_t   * local = NULL; + +	int ret        = -1; +	int i          = 0; +	int call_count = 0; + +	int op_ret   = -1; +	int op_errno = 0; + +	VALIDATE_OR_GOTO (frame, out); +	VALIDATE_OR_GOTO (this, out); +	VALIDATE_OR_GOTO (this->private, out); + +	priv = this->private; + +	ALLOC_OR_GOTO (local, afr_local_t, out); + +	ret = AFR_LOCAL_INIT (local, priv); +	if (ret < 0) { +		op_errno = -ret; +		goto out; +	} + +	frame->local = local; + +	if (__is_fd_ctx_set (this, fd)) { +		local->op = GF_FOP_FLUSH; +		local->transaction.fop    = afr_flush_wind; +		local->transaction.done   = afr_flush_done; +		 +		local->fd                 = fd_ref (fd); +		 +		local->transaction.start  = 0; +		local->transaction.len    = 0; +		 +		local->transaction.pending = AFR_DATA_PENDING; +		 +		afr_transaction (frame, this, AFR_FLUSH_TRANSACTION); +	} else { +		/* +		 * if fd's ctx is not set, then there is no need +		 * to erase changelog. 
So just send the flush +		 */ + +		call_count = local->call_count; + +		for (i = 0; i < priv->child_count; i++) { +			if (local->child_up[i]) { +				STACK_WIND (frame, afr_simple_flush_cbk, +					    priv->children[i], +					    priv->children[i]->fops->flush, +					    fd); + +				if (!--call_count) +					break; +			} +		} +	} + +	op_ret = 0; +out: +	if (op_ret == -1) { +		AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); +	} + +	return 0; +} + +/* }}} */ + +/* {{{ fsync */ + +int +afr_fsync_cbk (call_frame_t *frame, void *cookie, +	       xlator_t *this, int32_t op_ret, int32_t op_errno) +{ +	afr_local_t *local = NULL; +	 +	int call_count = -1; + +	local = frame->local; + +	LOCK (&frame->lock); +	{ +		if (op_ret == 0) +			local->op_ret = 0; + +		local->op_errno = op_errno; +	} +	UNLOCK (&frame->lock); + +	call_count = afr_frame_return (frame); + +	if (call_count == 0) +		AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno); + +	return 0; +} + + +int +afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, +	   int32_t datasync) +{ +	afr_private_t *priv = NULL; +	afr_local_t *local = NULL; + +	int ret = -1; + +	int i = 0; +	int32_t call_count = 0; +	int32_t op_ret   = -1; +	int32_t op_errno = 0; + +	VALIDATE_OR_GOTO (frame, out); +	VALIDATE_OR_GOTO (this, out); +	VALIDATE_OR_GOTO (this->private, out); + +	priv = this->private; + +	ALLOC_OR_GOTO (local, afr_local_t, out); + +	ret = AFR_LOCAL_INIT (local, priv); +	if (ret < 0) { +		op_errno = -ret; +		goto out; +	} + +	call_count = local->call_count; +	frame->local = local; + +	for (i = 0; i < priv->child_count; i++) { +		if (local->child_up[i]) { +			STACK_WIND (frame, afr_fsync_cbk, +				    priv->children[i], +				    priv->children[i]->fops->fsync, +				    fd, datasync); +			if (!--call_count) +				break; +		} +	} + +	op_ret = 0; +out: +	if (op_ret == -1) { +		AFR_STACK_UNWIND (frame, op_ret, op_errno); +	} +	return 0; +} + +/* }}} */ + +/* {{{ fsync */ + +int32_t +afr_fsyncdir_cbk (call_frame_t *frame, 
void *cookie, +		  xlator_t *this, int32_t op_ret, int32_t op_errno) +{ +	afr_local_t *local = NULL; +	 +	int call_count = -1; + +	local = frame->local; + +	LOCK (&frame->lock); +	{ +		if (op_ret == 0) +			local->op_ret = 0; + +		local->op_errno = op_errno; +	} +	UNLOCK (&frame->lock); + +	call_count = afr_frame_return (frame); + +	if (call_count == 0) +		AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno); + +	return 0; +} + + +int32_t +afr_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, +	      int32_t datasync) +{ +	afr_private_t *priv = NULL; +	afr_local_t *local = NULL; + +	int ret = -1; + +	int i = 0; +	int32_t call_count = 0; +	int32_t op_ret   = -1; +	int32_t op_errno = 0; + +	VALIDATE_OR_GOTO (frame, out); +	VALIDATE_OR_GOTO (this, out); +	VALIDATE_OR_GOTO (this->private, out); + +	priv = this->private; + +	ALLOC_OR_GOTO (local, afr_local_t, out); + +	ret = AFR_LOCAL_INIT (local, priv); +	if (ret < 0) { +		op_errno = -ret; +		goto out; +	} + +	call_count = local->call_count; +	frame->local = local; + +	for (i = 0; i < priv->child_count; i++) { +		if (local->child_up[i]) { +			STACK_WIND (frame, afr_fsync_cbk, +				    priv->children[i], +				    priv->children[i]->fops->fsyncdir, +				    fd, datasync); +			if (!--call_count) +				break; +		} +	} + +	op_ret = 0; +out: +	if (op_ret == -1) { +		AFR_STACK_UNWIND (frame, op_ret, op_errno); +	} +	return 0; +} + +/* }}} */ + +/* {{{ xattrop */ + +int32_t +afr_xattrop_cbk (call_frame_t *frame, void *cookie, +		 xlator_t *this, int32_t op_ret, int32_t op_errno, +		 dict_t *xattr) +{ +	afr_local_t *local = NULL; +	 +	int call_count = -1; + +	local = frame->local; + +	LOCK (&frame->lock); +	{ +		if (op_ret == 0) +			local->op_ret = 0; + +		local->op_errno = op_errno; +	} +	UNLOCK (&frame->lock); + +	call_count = afr_frame_return (frame); + +	if (call_count == 0) +		AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, xattr); + +	return 0; +} + + +int32_t +afr_xattrop (call_frame_t *frame, xlator_t 
*this, loc_t *loc, +	     gf_xattrop_flags_t optype, dict_t *xattr) +{ +	afr_private_t *priv = NULL; +	afr_local_t *local  = NULL; + +	int ret = -1; + +	int i = 0; +	int32_t call_count = 0; +	int32_t op_ret   = -1; +	int32_t op_errno = 0; + +	VALIDATE_OR_GOTO (frame, out); +	VALIDATE_OR_GOTO (this, out); +	VALIDATE_OR_GOTO (this->private, out); + +	priv = this->private; + +	ALLOC_OR_GOTO (local, afr_local_t, out); + +	ret = AFR_LOCAL_INIT (local, priv); +	if (ret < 0) { +		op_errno = -ret; +		goto out; +	} + +	call_count = local->call_count; +	frame->local = local; + +	for (i = 0; i < priv->child_count; i++) { +		if (local->child_up[i]) { +			STACK_WIND (frame, afr_xattrop_cbk, +				    priv->children[i], +				    priv->children[i]->fops->xattrop, +				    loc, optype, xattr); +			if (!--call_count) +				break; +		} +	} + +	op_ret = 0; +out: +	if (op_ret == -1) { +		AFR_STACK_UNWIND (frame, op_ret, op_errno); +	} +	return 0; +} + +/* }}} */ + +/* {{{ fxattrop */ + +int32_t +afr_fxattrop_cbk (call_frame_t *frame, void *cookie, +		  xlator_t *this, int32_t op_ret, int32_t op_errno, +		  dict_t *xattr) +{ +	afr_local_t *local = NULL; +	 +	int call_count = -1; + +	local = frame->local; + +	LOCK (&frame->lock); +	{ +		if (op_ret == 0) +			local->op_ret = 0; + +		local->op_errno = op_errno; +	} +	UNLOCK (&frame->lock); + +	call_count = afr_frame_return (frame); + +	if (call_count == 0) +		AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, xattr); + +	return 0; +} + + +int32_t +afr_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd, +	      gf_xattrop_flags_t optype, dict_t *xattr) +{ +	afr_private_t *priv = NULL; +	afr_local_t *local  = NULL; + +	int ret = -1; + +	int i = 0; +	int32_t call_count = 0; +	int32_t op_ret   = -1; +	int32_t op_errno = 0; + +	VALIDATE_OR_GOTO (frame, out); +	VALIDATE_OR_GOTO (this, out); +	VALIDATE_OR_GOTO (this->private, out); + +	priv = this->private; + +	ALLOC_OR_GOTO (local, afr_local_t, out); + +	ret = AFR_LOCAL_INIT (local, 
priv); +	if (ret < 0) { +		op_errno = -ret; +		goto out; +	} + +	call_count = local->call_count; +	frame->local = local; + +	for (i = 0; i < priv->child_count; i++) { +		if (local->child_up[i]) { +			STACK_WIND (frame, afr_fxattrop_cbk, +				    priv->children[i], +				    priv->children[i]->fops->fxattrop, +				    fd, optype, xattr); +			if (!--call_count) +				break; +		} +	} + +	op_ret = 0; +out: +	if (op_ret == -1) { +		AFR_STACK_UNWIND (frame, op_ret, op_errno); +	} +	return 0; +} + +/* }}} */ + + +int32_t +afr_inodelk_cbk (call_frame_t *frame, void *cookie, +		 xlator_t *this, int32_t op_ret, int32_t op_errno) +		 +{ +	afr_local_t *local = NULL; +	 +	int call_count = -1; + +	local = frame->local; + +	LOCK (&frame->lock); +	{ +		if (op_ret == 0) +			local->op_ret = 0; + +		local->op_errno = op_errno; +	} +	UNLOCK (&frame->lock); + +	call_count = afr_frame_return (frame); + +	if (call_count == 0) +		AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno); + +	return 0; +} + + +int32_t +afr_inodelk (call_frame_t *frame, xlator_t *this, loc_t *loc, +	     int32_t cmd, struct flock *flock) +{ +	afr_private_t *priv = NULL; +	afr_local_t *local  = NULL; + +	int ret = -1; + +	int i = 0; +	int32_t call_count = 0; +	int32_t op_ret   = -1; +	int32_t op_errno = 0; + +	VALIDATE_OR_GOTO (frame, out); +	VALIDATE_OR_GOTO (this, out); +	VALIDATE_OR_GOTO (this->private, out); + +	priv = this->private; + +	ALLOC_OR_GOTO (local, afr_local_t, out); + +	ret = AFR_LOCAL_INIT (local, priv); +	if (ret < 0) { +		op_errno = -ret; +		goto out; +	} + +	call_count = local->call_count; +	frame->local = local; + +	for (i = 0; i < priv->child_count; i++) { +		if (local->child_up[i]) { +			STACK_WIND (frame, afr_inodelk_cbk, +				    priv->children[i], +				    priv->children[i]->fops->inodelk, +				    loc, cmd, flock); + +			if (!--call_count) +				break; +		} +	} + +	op_ret = 0; +out: +	if (op_ret == -1) { +		AFR_STACK_UNWIND (frame, op_ret, op_errno); +	} +	return 0; +} + + +int32_t 
+afr_finodelk_cbk (call_frame_t *frame, void *cookie, +		  xlator_t *this, int32_t op_ret, int32_t op_errno) +		 +{ +	afr_local_t *local = NULL; +	 +	int call_count = -1; + +	local = frame->local; + +	LOCK (&frame->lock); +	{ +		if (op_ret == 0) +			local->op_ret = 0; + +		local->op_errno = op_errno; +	} +	UNLOCK (&frame->lock); + +	call_count = afr_frame_return (frame); + +	if (call_count == 0) +		AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno); + +	return 0; +} + + +int32_t +afr_finodelk (call_frame_t *frame, xlator_t *this, fd_t *fd, +	      int32_t cmd, struct flock *flock) +{ +	afr_private_t *priv = NULL; +	afr_local_t *local  = NULL; + +	int ret = -1; + +	int i = 0; +	int32_t call_count = 0; +	int32_t op_ret   = -1; +	int32_t op_errno = 0; + +	VALIDATE_OR_GOTO (frame, out); +	VALIDATE_OR_GOTO (this, out); +	VALIDATE_OR_GOTO (this->private, out); + +	priv = this->private; + +	ALLOC_OR_GOTO (local, afr_local_t, out); + +	ret = AFR_LOCAL_INIT (local, priv); +	if (ret < 0) { +		op_errno = -ret; +		goto out; +	} + +	call_count = local->call_count; +	frame->local = local; + +	for (i = 0; i < priv->child_count; i++) { +		if (local->child_up[i]) { +			STACK_WIND (frame, afr_finodelk_cbk, +				    priv->children[i], +				    priv->children[i]->fops->finodelk, +				    fd, cmd, flock); + +			if (!--call_count) +				break; +		} +	} + +	op_ret = 0; +out: +	if (op_ret == -1) { +		AFR_STACK_UNWIND (frame, op_ret, op_errno); +	} +	return 0; +} + + +int32_t +afr_entrylk_cbk (call_frame_t *frame, void *cookie, +		 xlator_t *this, int32_t op_ret, int32_t op_errno) +		 +{ +	afr_local_t *local = NULL; +	 +	int call_count = -1; + +	local = frame->local; + +	LOCK (&frame->lock); +	{ +		if (op_ret == 0) +			local->op_ret = 0; + +		local->op_errno = op_errno; +	} +	UNLOCK (&frame->lock); + +	call_count = afr_frame_return (frame); + +	if (call_count == 0) +		AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno); + +	return 0; +} + + +int32_t +afr_entrylk (call_frame_t 
*frame, xlator_t *this, loc_t *loc, +	     const char *basename, entrylk_cmd cmd, entrylk_type type) +{ +	afr_private_t *priv = NULL; +	afr_local_t *local  = NULL; + +	int ret = -1; + +	int i = 0; +	int32_t call_count = 0; +	int32_t op_ret   = -1; +	int32_t op_errno = 0; + +	VALIDATE_OR_GOTO (frame, out); +	VALIDATE_OR_GOTO (this, out); +	VALIDATE_OR_GOTO (this->private, out); + +	priv = this->private; + +	ALLOC_OR_GOTO (local, afr_local_t, out); + +	ret = AFR_LOCAL_INIT (local, priv); +	if (ret < 0) { +		op_errno = -ret; +		goto out; +	} + +	call_count = local->call_count; +	frame->local = local; + +	for (i = 0; i < priv->child_count; i++) { +		if (local->child_up[i]) { +			STACK_WIND (frame, afr_entrylk_cbk, +				    priv->children[i], +				    priv->children[i]->fops->entrylk, +				    loc, basename, cmd, type); + +			if (!--call_count) +				break; +		} +	} + +	op_ret = 0; +out: +	if (op_ret == -1) { +		AFR_STACK_UNWIND (frame, op_ret, op_errno); +	} +	return 0; +} + + + +int32_t +afr_fentrylk_cbk (call_frame_t *frame, void *cookie, +		 xlator_t *this, int32_t op_ret, int32_t op_errno) +		 +{ +	afr_local_t *local = NULL; +	 +	int call_count = -1; + +	local = frame->local; + +	LOCK (&frame->lock); +	{ +		if (op_ret == 0) +			local->op_ret = 0; + +		local->op_errno = op_errno; +	} +	UNLOCK (&frame->lock); + +	call_count = afr_frame_return (frame); + +	if (call_count == 0) +		AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno); + +	return 0; +} + + +int32_t +afr_fentrylk (call_frame_t *frame, xlator_t *this, fd_t *fd, +	      const char *basename, entrylk_cmd cmd, entrylk_type type) +{ +	afr_private_t *priv = NULL; +	afr_local_t *local  = NULL; + +	int ret = -1; + +	int i = 0; +	int32_t call_count = 0; +	int32_t op_ret   = -1; +	int32_t op_errno = 0; + +	VALIDATE_OR_GOTO (frame, out); +	VALIDATE_OR_GOTO (this, out); +	VALIDATE_OR_GOTO (this->private, out); + +	priv = this->private; + +	ALLOC_OR_GOTO (local, afr_local_t, out); + +	ret = AFR_LOCAL_INIT (local, 
priv); +	if (ret < 0) { +		op_errno = -ret; +		goto out; +	} + +	call_count = local->call_count; +	frame->local = local; + +	for (i = 0; i < priv->child_count; i++) { +		if (local->child_up[i]) { +			STACK_WIND (frame, afr_fentrylk_cbk, +				    priv->children[i], +				    priv->children[i]->fops->fentrylk, +				    fd, basename, cmd, type); + +			if (!--call_count) +				break; +		} +	} + +	op_ret = 0; +out: +	if (op_ret == -1) { +		AFR_STACK_UNWIND (frame, op_ret, op_errno); +	} +	return 0; +} + + +int32_t +afr_checksum_cbk (call_frame_t *frame, void *cookie, +		  xlator_t *this, int32_t op_ret, int32_t op_errno, +		  uint8_t *file_checksum, uint8_t *dir_checksum) +		 +{ +	afr_local_t *local = NULL; +	 +	int call_count = -1; + +	local = frame->local; + +	LOCK (&frame->lock); +	{ +		if (op_ret == 0 && (local->op_ret != 0)) { +			local->op_ret = 0; + +			local->cont.checksum.file_checksum = MALLOC (ZR_FILENAME_MAX); +			memcpy (local->cont.checksum.file_checksum, file_checksum,  +				ZR_FILENAME_MAX); + +			local->cont.checksum.dir_checksum = MALLOC (ZR_FILENAME_MAX); +			memcpy (local->cont.checksum.dir_checksum, dir_checksum,  +				ZR_FILENAME_MAX); + +		} + +		local->op_errno = op_errno; +	} +	UNLOCK (&frame->lock); + +	call_count = afr_frame_return (frame); + +	if (call_count == 0) +		AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, +				  local->cont.checksum.file_checksum,  +				  local->cont.checksum.dir_checksum); + +	return 0; +} + + +int32_t +afr_checksum (call_frame_t *frame, xlator_t *this, loc_t *loc, +	      int32_t flag) +{ +	afr_private_t *priv = NULL; +	afr_local_t *local  = NULL; + +	int ret = -1; + +	int i = 0; +	int32_t call_count = 0; +	int32_t op_ret   = -1; +	int32_t op_errno = 0; + +	VALIDATE_OR_GOTO (frame, out); +	VALIDATE_OR_GOTO (this, out); +	VALIDATE_OR_GOTO (this->private, out); + +	priv = this->private; + +	ALLOC_OR_GOTO (local, afr_local_t, out); + +	ret = AFR_LOCAL_INIT (local, priv); +	if (ret < 0) { +		op_errno = -ret; +		
goto out; +	} + +	call_count = local->call_count; +	frame->local = local; + +	for (i = 0; i < priv->child_count; i++) { +		if (local->child_up[i]) { +			STACK_WIND (frame, afr_checksum_cbk, +				    priv->children[i], +				    priv->children[i]->fops->checksum, +				    loc, flag); + +			if (!--call_count) +				break; +		} +	} + +	op_ret = 0; +out: +	if (op_ret == -1) { +		AFR_STACK_UNWIND (frame, op_ret, op_errno); +	} +	return 0; +} + + +int32_t +afr_statfs_cbk (call_frame_t *frame, void *cookie, +		xlator_t *this, int32_t op_ret, int32_t op_errno, +		struct statvfs *statvfs) +{ +	afr_local_t *local = NULL; + +	int call_count = 0; + +	LOCK (&frame->lock); +	{ +		local = frame->local; + +		if (op_ret == 0) { +			local->op_ret   = op_ret; +			 +			if (local->cont.statfs.buf_set) { +				if (statvfs->f_bavail < local->cont.statfs.buf.f_bavail) +					local->cont.statfs.buf = *statvfs; +			} else { +				local->cont.statfs.buf = *statvfs; +				local->cont.statfs.buf_set = 1; +			} +		} + +		if (op_ret == -1) +			local->op_errno = op_errno; + +	} +	UNLOCK (&frame->lock); + +	call_count = afr_frame_return (frame); + +	if (call_count == 0) +		AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno,  +				  &local->cont.statfs.buf); + +	return 0; +} + + +int32_t +afr_statfs (call_frame_t *frame, xlator_t *this, +	    loc_t *loc) +{ +	afr_private_t *  priv        = NULL; +	int              child_count = 0; +	afr_local_t   *  local       = NULL; +	int              i           = 0; + +	int ret = -1; +	int              call_count = 0; +	int32_t          op_ret      = -1; +	int32_t          op_errno    = 0; + +	VALIDATE_OR_GOTO (this, out); +	VALIDATE_OR_GOTO (this->private, out); +	VALIDATE_OR_GOTO (loc, out); + +	priv = this->private; +	child_count = priv->child_count; + +	ALLOC_OR_GOTO (local, afr_local_t, out); +	 +	ret = AFR_LOCAL_INIT (local, priv); +	if (ret < 0) { +		op_errno = -ret; +		goto out; +	} + +	frame->local = local; +	call_count = local->call_count; + +	for (i 
= 0; i < child_count; i++) { +		if (local->child_up[i]) { +			STACK_WIND (frame, afr_statfs_cbk, +				    priv->children[i], +				    priv->children[i]->fops->statfs,  +				    loc); +			if (!--call_count) +				break; +		} +	} +	 +	op_ret = 0; +out: +	if (op_ret == -1) { +		AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); +	} +	return 0; +} + + +int32_t +afr_lk_unlock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,  +		   int32_t op_ret, int32_t op_errno, struct flock *lock) +{ +	afr_local_t * local = NULL; + +	int call_count = -1; + +	local = frame->local; +	call_count = afr_frame_return (frame); + +	if (call_count == 0) +		AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, +				  lock); + +	return 0; +} + + +int32_t  +afr_lk_unlock (call_frame_t *frame, xlator_t *this) +{ +	afr_local_t   * local = NULL; +	afr_private_t * priv  = NULL; + +	int i; +	int call_count = 0; + +	local = frame->local; +	priv  = this->private; + +	call_count = afr_locked_nodes_count (local->cont.lk.locked_nodes,  +					     priv->child_count); + +	if (call_count == 0) { +		AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, +				  &local->cont.lk.flock); +		return 0; +	} + +	local->call_count = call_count; + +	local->cont.lk.flock.l_type = F_UNLCK; + +	for (i = 0; i < priv->child_count; i++) { +		if (local->cont.lk.locked_nodes[i]) { +			STACK_WIND (frame, afr_lk_unlock_cbk, +				    priv->children[i], +				    priv->children[i]->fops->lk, +				    local->fd, F_SETLK,  +				    &local->cont.lk.flock); + +			if (!--call_count) +				break; +		} +	} + +	return 0; +} + + +int32_t +afr_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,  +	    int32_t op_ret, int32_t op_errno, struct flock *lock) +{ +	afr_local_t *local = NULL; +	afr_private_t *priv = NULL; + +	int call_count  = -1; +	int child_index = -1; + +	local = frame->local; +	priv  = this->private; + +	child_index = (long) cookie; + +	call_count = --local->call_count; + +	if (!child_went_down (op_ret, 
op_errno) && (op_ret == -1)) { +		local->op_ret   = -1; +		local->op_errno = op_errno; + +		afr_lk_unlock (frame, this); +		return 0; +	} + +	if (op_ret == 0) { +		local->op_ret        = 0; +		local->op_errno      = 0; +		local->cont.lk.flock = *lock; +		local->cont.lk.locked_nodes[child_index] = 1; +	} + +	child_index++; + +	if (child_index < priv->child_count) { +		STACK_WIND_COOKIE (frame, afr_lk_cbk, (void *) (long) child_index, +				   priv->children[child_index], +				   priv->children[child_index]->fops->lk, +				   local->fd, local->cont.lk.cmd,  +				   &local->cont.lk.flock); +	} else if (local->op_ret == -1) { +		/* all nodes have gone down */ +		 +		AFR_STACK_UNWIND (frame, -1, ENOTCONN, &local->cont.lk.flock); +	} else { +		/* locking has succeeded on all nodes that are up */ +		 +		AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, +			      &local->cont.lk.flock); +	} + +	return 0; +} + + +int +afr_lk (call_frame_t *frame, xlator_t *this, +	fd_t *fd, int32_t cmd, +	struct flock *flock) +{ +	afr_private_t *priv = NULL; +	afr_local_t *local = NULL; + +	int i = 0; + +	int32_t op_ret   = -1; +	int32_t op_errno = 0; + +	VALIDATE_OR_GOTO (frame, out); +	VALIDATE_OR_GOTO (this, out); +	VALIDATE_OR_GOTO (this->private, out); + +	priv = this->private; + +	ALLOC_OR_GOTO (local, afr_local_t, out); +	AFR_LOCAL_INIT (local, priv); + +	frame->local  = local; + +	local->cont.lk.locked_nodes = CALLOC (priv->child_count,  +					      sizeof (*local->cont.lk.locked_nodes)); +	 +	if (!local->cont.lk.locked_nodes) { +		gf_log (this->name, GF_LOG_ERROR, "out of memory :("); +		op_errno = ENOMEM; +		goto out; +	} + +	local->fd            = fd_ref (fd); +	local->cont.lk.cmd   = cmd; +	local->cont.lk.flock = *flock; + +	STACK_WIND_COOKIE (frame, afr_lk_cbk, (void *) (long) 0, +			   priv->children[i], +			   priv->children[i]->fops->lk, +			   fd, cmd, flock); + +	op_ret = 0; +out: +	if (op_ret == -1) { +		AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); +	} +	return 
0; +} + + +/** + * find_child_index - find the child's index in the array of subvolumes + * @this: AFR + * @child: child + */ + +static int +find_child_index (xlator_t *this, xlator_t *child) +{ +	afr_private_t *priv = NULL; + +	int i = -1; + +	priv = this->private; + +	for (i = 0; i < priv->child_count; i++) { +		if ((xlator_t *) child == priv->children[i]) +			break; +	} + +	return i; +} + + +int32_t +notify (xlator_t *this, int32_t event, +	void *data, ...) +{ +	afr_private_t *     priv     = NULL; +	unsigned char *     child_up = NULL; + +	int i           = -1; +	int up_children = 0; + +	priv = this->private; + +	if (!priv) +		return 0; + +	child_up = priv->child_up; + +	switch (event) { +	case GF_EVENT_CHILD_UP: +		i = find_child_index (this, data); + +		child_up[i] = 1; + +		/*  +		   if all the children were down, and one child came up,  +		   send notify to parent +		*/ + +		for (i = 0; i < priv->child_count; i++) +			if (child_up[i]) +				up_children++; + +		if (up_children == 1) +			default_notify (this, event, data); + +		break; + +	case GF_EVENT_CHILD_DOWN: +		i = find_child_index (this, data); + +		child_up[i] = 0; +		 +		/*  +		   if all children are down, and this was the last to go down, +		   send notify to parent +		*/ + +		for (i = 0; i < priv->child_count; i++) +			if (child_up[i]) +				up_children++; + +		if (up_children == 0) +			default_notify (this, event, data); + +		break; + +	default: +		default_notify (this, event, data); +	} + +	return 0; +} + + +static const char *favorite_child_warning_str = "You have specified subvolume '%s' " +	"as the 'favorite child'. This means that if a discrepancy in the content " +	"or attributes (ownership, permission, etc.) of a file is detected among " +	"the subvolumes, the file on '%s' will be considered the definitive " +	"version and its contents will OVERWRITE the contents of the file on other " +	"subvolumes. 
All versions of the file except that on '%s' " +	"WILL BE LOST."; + +static const char *no_lock_servers_warning_str = "You have set lock-server-count = 0. " +	"This means correctness is NO LONGER GUARANTEED in all cases. If two or more " +	"applications write to the same region of a file, there is a possibility that " +	"its copies will be INCONSISTENT. Set it to a value greater than 0 unless you " +	"are ABSOLUTELY SURE of what you are doing and WILL NOT HOLD GlusterFS " +	"RESPOSIBLE for inconsistent data. If you are in doubt, set it to a value " +	"greater than 0."; + +int32_t  +init (xlator_t *this) +{ +	afr_private_t * priv        = NULL; +	int             child_count = 0; +	xlator_list_t * trav        = NULL; +	int             i           = 0; +	int             ret         = -1; +	int             op_errno    = 0; + +	char * read_subvol = NULL; +	char * fav_child   = NULL; +	char * self_heal   = NULL; +	char * change_log  = NULL; + +	int32_t lock_server_count = 1; + +	int    fav_ret       = -1; +	int    read_ret      = -1; +	int    dict_ret      = -1; + +	if (!this->children) { +		gf_log (this->name, GF_LOG_ERROR, +			"AFR needs more than one child defined"); +		return -1; +	} +   +	if (!this->parents) { +		gf_log (this->name, GF_LOG_WARNING, +			"dangling volume. 
check volfile "); +	} + +	ALLOC_OR_GOTO (this->private, afr_private_t, out); + +	priv = this->private; + +	read_ret = dict_get_str (this->options, "read-subvolume", &read_subvol); +	priv->read_child = -1; + +	fav_ret = dict_get_str (this->options, "favorite-child", &fav_child); +	priv->favorite_child = -1; + +	/* Default values */ + +	priv->data_self_heal     = 1; +	priv->metadata_self_heal = 1; +	priv->entry_self_heal    = 1; + +	dict_ret = dict_get_str (this->options, "data-self-heal", &self_heal); +	if (dict_ret == 0) { +		ret = gf_string2boolean (self_heal, &priv->data_self_heal); +		if (ret < 0) { +			gf_log (this->name, GF_LOG_WARNING, +				"invalid 'option data-self-heal %s' " +				"defaulting to data-self-heal as 'on'", +				self_heal); +			priv->data_self_heal = 1; +		}  +	} + +	dict_ret = dict_get_str (this->options, "metadata-self-heal", +				 &self_heal); +	if (dict_ret == 0) { +		ret = gf_string2boolean (self_heal, &priv->metadata_self_heal); +		if (ret < 0) { +			gf_log (this->name, GF_LOG_WARNING, +				"invalid 'option metadata-self-heal %s' " +				"defaulting to metadata-self-heal as 'on'",  +				self_heal); +			priv->metadata_self_heal = 1; +		}  +	} + +	dict_ret = dict_get_str (this->options, "entry-self-heal", &self_heal); +	if (dict_ret == 0) { +		ret = gf_string2boolean (self_heal, &priv->entry_self_heal); +		if (ret < 0) { +			gf_log (this->name, GF_LOG_WARNING, +				"invalid 'option entry-self-heal %s' " +				"defaulting to entry-self-heal as 'on'",  +				self_heal); +			priv->entry_self_heal = 1; +		}  +	} + +	/* Change log options */ + +	priv->data_change_log     = 1; +	priv->metadata_change_log = 0; +	priv->entry_change_log    = 1; + +	dict_ret = dict_get_str (this->options, "data-change-log", +				 &change_log); +	if (dict_ret == 0) { +		ret = gf_string2boolean (change_log, &priv->data_change_log); +		if (ret < 0) { +			gf_log (this->name, GF_LOG_WARNING, +				"invalid 'option data-change-log %s'. 
" +				"defaulting to data-change-log as 'on'",  +				change_log); +			priv->data_change_log = 1; +		}  +	} + +	dict_ret = dict_get_str (this->options, "metadata-change-log", +				 &change_log); +	if (dict_ret == 0) { +		ret = gf_string2boolean (change_log, +					 &priv->metadata_change_log); +		if (ret < 0) { +			gf_log (this->name, GF_LOG_WARNING, +				"invalid 'option metadata-change-log %s'. " +				"defaulting to metadata-change-log as 'off'", +				change_log); +			priv->metadata_change_log = 0; +		}  +	} + +	dict_ret = dict_get_str (this->options, "entry-change-log", +				 &change_log); +	if (dict_ret == 0) { +		ret = gf_string2boolean (change_log, &priv->entry_change_log); +		if (ret < 0) { +			gf_log (this->name, GF_LOG_WARNING, +				"invalid 'option entry-change-log %s'. " +				"defaulting to entry-change-log as 'on'",  +				change_log); +			priv->entry_change_log = 1; +		}  +	} + +	/* Locking options */ + +	priv->data_lock_server_count = 1; +	priv->metadata_lock_server_count = 0; +	priv->entry_lock_server_count = 1; + +	dict_ret = dict_get_int32 (this->options, "data-lock-server-count",  +				   &lock_server_count); +	if (dict_ret == 0) { +		gf_log (this->name, GF_LOG_DEBUG, +			"setting data lock server count to %d", +			lock_server_count); + +		if (lock_server_count == 0)  +			gf_log (this->name, GF_LOG_WARNING, +				no_lock_servers_warning_str); + +		priv->data_lock_server_count = lock_server_count; +	} + + +	dict_ret = dict_get_int32 (this->options, +				   "metadata-lock-server-count",  +				   &lock_server_count); +	if (dict_ret == 0) { +		gf_log (this->name, GF_LOG_DEBUG, +			"setting metadata lock server count to %d", +			lock_server_count); +		priv->metadata_lock_server_count = lock_server_count; +	} + + +	dict_ret = dict_get_int32 (this->options, "entry-lock-server-count",  +				   &lock_server_count); +	if (dict_ret == 0) { +		gf_log (this->name, GF_LOG_DEBUG, +			"setting entry lock server count to %d", +			lock_server_count); + +		
priv->entry_lock_server_count = lock_server_count; +	} + + +	trav = this->children; +	while (trav) { +		if (!read_ret && !strcmp (read_subvol, trav->xlator->name)) { +			gf_log (this->name, GF_LOG_DEBUG, +				"subvolume '%s' specified as read child", +				trav->xlator->name); + +			priv->read_child = child_count; +		} + +		if (fav_ret == 0 && !strcmp (fav_child, trav->xlator->name)) { +			gf_log (this->name, GF_LOG_WARNING, +				favorite_child_warning_str, trav->xlator->name, +				trav->xlator->name, trav->xlator->name); +			priv->favorite_child = child_count; +		} + +		child_count++; +		trav = trav->next; +	} + +	/* XXX: return inode numbers from 1st subvolume till +	   afr supports read-subvolume based on inode's ctx  +	   (and not itransform) for this reason afr_deitransform()  +	   returns 0 always +	*/ +	priv->read_child = 0; + +	priv->wait_count = 1; + +	priv->child_count = child_count; +	LOCK_INIT (&priv->lock); + +	priv->child_up = CALLOC (sizeof (unsigned char), child_count); +	if (!priv->child_up) { +		gf_log (this->name, GF_LOG_ERROR,	 +			"out of memory :(");		 +		op_errno = ENOMEM;			 +		goto out; +	} + +	priv->children = CALLOC (sizeof (xlator_t *), child_count); +	if (!priv->children) { +		gf_log (this->name, GF_LOG_ERROR,	 +			"out of memory :(");		 +		op_errno = ENOMEM;			 +		goto out; +	} + +	trav = this->children; +	i = 0; +	while (i < child_count) { +		priv->children[i] = trav->xlator; + +		trav = trav->next; +		i++; +	} + +	ret = 0; +out: +	return ret; +} + + +int +fini (xlator_t *this) +{ +	return 0; +} + + +struct xlator_fops fops = { +	.lookup      = afr_lookup, +	.open        = afr_open, +	.lk          = afr_lk, +	.flush       = afr_flush, +	.statfs      = afr_statfs, +	.fsync       = afr_fsync, +	.fsyncdir    = afr_fsyncdir, +	.xattrop     = afr_xattrop, +	.fxattrop    = afr_fxattrop, +	.inodelk     = afr_inodelk, +	.finodelk    = afr_finodelk, +	.entrylk     = afr_entrylk, +	.fentrylk    = afr_fentrylk, +	.checksum    = afr_checksum, + +	
/* inode read */ +	.access      = afr_access, +	.stat        = afr_stat, +	.fstat       = afr_fstat, +	.readlink    = afr_readlink, +	.getxattr    = afr_getxattr, +	.readv       = afr_readv, + +	/* inode write */ +	.chmod       = afr_chmod, +	.chown       = afr_chown, +	.fchmod      = afr_fchmod, +	.fchown      = afr_fchown, +	.writev      = afr_writev, +	.truncate    = afr_truncate, +	.ftruncate   = afr_ftruncate, +	.utimens     = afr_utimens, +	.setxattr    = afr_setxattr, +	.removexattr = afr_removexattr, + +	/* dir read */ +	.opendir     = afr_opendir, +	.readdir     = afr_readdir, +	.getdents    = afr_getdents, + +	/* dir write */ +	.create      = afr_create, +	.mknod       = afr_mknod, +	.mkdir       = afr_mkdir, +	.unlink      = afr_unlink, +	.rmdir       = afr_rmdir, +	.link        = afr_link, +	.symlink     = afr_symlink, +	.rename      = afr_rename, +	.setdents    = afr_setdents, +}; + + +struct xlator_mops mops = { +}; + + +struct xlator_cbks cbks = { +}; + +struct volume_options options[] = { +	{ .key  = {"read-subvolume" },  +	  .type = GF_OPTION_TYPE_XLATOR +	}, +	{ .key  = {"favorite-child"},  +	  .type = GF_OPTION_TYPE_XLATOR +	}, +	{ .key  = {"data-self-heal"},   +	  .type = GF_OPTION_TYPE_BOOL  +	}, +	{ .key  = {"metadata-self-heal"},   +	  .type = GF_OPTION_TYPE_BOOL +	}, +	{ .key  = {"entry-self-heal"},   +	  .type = GF_OPTION_TYPE_BOOL  +	}, +	{ .key  = {"data-change-log"},   +	  .type = GF_OPTION_TYPE_BOOL  +	}, +	{ .key  = {"metadata-change-log"},   +	  .type = GF_OPTION_TYPE_BOOL +	}, +	{ .key  = {"entry-change-log"},   +	  .type = GF_OPTION_TYPE_BOOL +	}, +	{ .key  = {"data-lock-server-count"},   +	  .type = GF_OPTION_TYPE_INT,  +	  .min  = 0 +	}, +	{ .key  = {"metadata-lock-server-count"},   +	  .type = GF_OPTION_TYPE_INT,  +	  .min  = 0 +	}, +	{ .key  = {"entry-lock-server-count"},   +	  .type = GF_OPTION_TYPE_INT, +	  .min  = 0 +	}, +	{ .key  = {NULL} }, +}; diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h new 
file mode 100644 index 00000000000..4cf6cdf9dfe --- /dev/null +++ b/xlators/cluster/afr/src/afr.h @@ -0,0 +1,523 @@ +/* +   Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> +   This file is part of GlusterFS. + +   GlusterFS is free software; you can redistribute it and/or modify +   it under the terms of the GNU General Public License as published +   by the Free Software Foundation; either version 3 of the License, +   or (at your option) any later version. + +   GlusterFS is distributed in the hope that it will be useful, but +   WITHOUT ANY WARRANTY; without even the implied warranty of +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU +   General Public License for more details. + +   You should have received a copy of the GNU General Public License +   along with this program.  If not, see +   <http://www.gnu.org/licenses/>. +*/ + + +#ifndef __AFR_H__ +#define __AFR_H__ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "scheduler.h" +#include "call-stub.h" +#include "compat-errno.h" + + +typedef struct _afr_private { +	gf_lock_t lock;               /* to guard access to child_count, etc */ +	unsigned int child_count;     /* total number of children   */ + +	xlator_t **children; + +	unsigned char *child_up; + +	gf_boolean_t data_self_heal;       /* on/off */ +	gf_boolean_t metadata_self_heal;   /* on/off */ +	gf_boolean_t entry_self_heal;      /* on/off */ + + +	gf_boolean_t data_change_log;       /* on/off */ +	gf_boolean_t metadata_change_log;   /* on/off */ +	gf_boolean_t entry_change_log;      /* on/off */ + +	unsigned int read_child;      /* read-subvolume */ +	unsigned int favorite_child;  /* subvolume to be preferred in resolving +					 split-brain cases */ + +	unsigned int data_lock_server_count; +	unsigned int metadata_lock_server_count; +	unsigned int entry_lock_server_count; + +	unsigned int wait_count;      /* # of servers to wait for success */ +} afr_private_t; + +typedef struct { +	
/* array of stat's, one for each child */ +	struct stat *buf; + +	/* array of xattr's, one for each child */ +	dict_t **xattr; + +	/* array of errno's, one for each child */ +	int *child_errno; + +	int32_t **pending_matrix; +	int32_t **delta_matrix; + +	int *sources; +	int source; +	int active_source; +	int active_sinks; +	int *success; + +	fd_t *healing_fd; +	int   op_failed; + +	int   file_has_holes; +	blksize_t block_size; +	off_t file_size; +	off_t offset; + +	loc_t parent_loc; +	int (*completion_cbk) (call_frame_t *frame, xlator_t *this); +	call_frame_t *sh_frame; +} afr_self_heal_t; + + +typedef enum { +	AFR_DATA_TRANSACTION,          /* truncate, write, ... */ +	AFR_METADATA_TRANSACTION,      /* chmod, chown, ... */ +	AFR_ENTRY_TRANSACTION,         /* create, rmdir, ... */ +	AFR_ENTRY_RENAME_TRANSACTION,  /* rename */ +	AFR_FLUSH_TRANSACTION,         /* flush */ +} afr_transaction_type; + +typedef struct _afr_local { +	unsigned int call_count; +	unsigned int success_count; +	unsigned int enoent_count; + +	unsigned int need_metadata_self_heal; +	unsigned int need_entry_self_heal; +	unsigned int need_data_self_heal; +	unsigned int govinda_gOvinda; + +	unsigned int reval_child_index; +	int32_t op_ret; +	int32_t op_errno; + +	int32_t *pending_array; + +	loc_t loc; +	loc_t newloc; + +	fd_t *fd; + +	glusterfs_fop_t fop; + +	unsigned char *child_up;  +	int            child_count; + +	int32_t *child_errno; +	 +	dict_t  *xattr_req; +	int      open_fd_count; +	/*  +	   This struct contains the arguments for the "continuation" +	   (scheme-like) of fops +	*/ + +	int   op; +	struct { +		struct { +			unsigned char buf_set; +			struct statvfs buf; +		} statfs; + +		struct { +			inode_t *inode; +			struct stat buf; +			dict_t *xattr; +		} lookup; + +		struct { +			int32_t flags; +		} open; + +		struct { +			int32_t cmd; +			struct flock flock; +			unsigned char *locked_nodes; +		} lk; + +		struct { +			uint8_t *file_checksum; +			uint8_t *dir_checksum; +		} checksum; + +		
/* inode read */ + +		struct { +			int32_t mask; +			int last_tried;  /* index of the child we tried previously */ +		} access; + +		struct { +			int last_tried; +			ino_t ino; +		} stat; + +		struct { +			int last_tried; +			ino_t ino; +		} fstat; + +		struct { +			size_t size; +			int last_tried; +		} readlink; + +		struct { +			const char *name; +			int last_tried; +		} getxattr; + +		struct { +			size_t size; +			off_t offset; +			int last_tried; +		} readv; + +		/* dir read */ + +		struct { +			int success_count; +			int32_t op_ret; +			int32_t op_errno; +		} opendir; + +		struct { +			int32_t op_ret; +			int32_t op_errno; +			size_t size; +			off_t offset; + +			int last_tried; +		} readdir; + +		struct { +			int32_t op_ret; +			int32_t op_errno; + +			size_t size; +			off_t offset; +			int32_t flag; + +			int last_tried; +		} getdents; + +		/* inode write */ + +		struct { +			ino_t ino; +			mode_t mode; +			struct stat buf; +		} chmod; + +		struct { +			ino_t ino; +			mode_t mode; +			struct stat buf; +		} fchmod; + +		struct { +			ino_t ino; +			uid_t uid; +			gid_t gid; +			struct stat buf; +		} chown; + +		struct { +			ino_t ino; +			uid_t uid; +			gid_t gid; +			struct stat buf; +		} fchown; +		 +		struct { +			ino_t ino; +			struct stat buf; + +			int32_t op_ret; + +			struct iovec *vector; +			dict_t *refs; +			int32_t count; +			off_t offset; +		} writev; + +		struct { +			ino_t ino; +			off_t offset; +			struct stat buf; +		} truncate; + +		struct { +			ino_t ino; +			off_t offset; +			struct stat buf; +		} ftruncate; + +		struct { +			ino_t ino; +			struct timespec tv[2]; +			struct stat buf; +		} utimens; + +		struct { +			dict_t *dict; +			int32_t flags; +		} setxattr; + +		struct { +			const char *name; +		} removexattr; + +		/* dir write */ +		 +		struct { +			ino_t ino; +			fd_t *fd; +			int32_t flags; +			mode_t mode; +			inode_t *inode; +			struct stat buf; +		} create; + +		struct { +			ino_t ino; +			dev_t dev; +			mode_t mode; +			inode_t 
*inode; +			struct stat buf; +		} mknod; + +		struct { +			ino_t ino; +			int32_t mode; +			inode_t *inode; +			struct stat buf; +		} mkdir; + +		struct { +			int32_t op_ret; +			int32_t op_errno; +		} unlink; + +		struct { +			int32_t op_ret; +			int32_t op_errno; +		} rmdir; + +		struct { +			ino_t ino; +			struct stat buf; +		} rename; + +		struct { +			ino_t ino; +			inode_t *inode; +			struct stat buf; +		} link; + +		struct { +			ino_t ino; +			inode_t *inode; +			struct stat buf; +			char *linkpath; +		} symlink; + +		struct { +			int32_t flags; +			dir_entry_t *entries; +			int32_t count; +		} setdents; +	} cont; +	 +	struct { +		off_t start, len; + +		unsigned char *locked_nodes; +		int lock_count; + +		const char *basename; +		const char *new_basename; + +		char *pending; + +		loc_t parent_loc; +		loc_t new_parent_loc; + +		afr_transaction_type type; + +		int success_count; +		int erase_pending; +		int failure_count; + +		int last_tried; +		int32_t *child_errno; + +		call_frame_t *main_frame; + +		int (*fop) (call_frame_t *frame, xlator_t *this); + +		int (*done) (call_frame_t *frame, xlator_t *this); + +		int (*resume) (call_frame_t *frame, xlator_t *this); + +		int (*unwind) (call_frame_t *frame, xlator_t *this); +	} transaction; + +	afr_self_heal_t self_heal; +} afr_local_t; + +/* try alloc and if it fails, goto label */ +#define ALLOC_OR_GOTO(var, type, label) do {			\ +		var = CALLOC (sizeof (type), 1);		\ +		if (!var) {					\ +			gf_log (this->name, GF_LOG_ERROR,	\ +				"out of memory :(");		\ +			op_errno = ENOMEM;			\ +			goto label;				\ +		}						\ +	} while (0); + + +/* did a call fail due to a child failing? */ +#define child_went_down(op_ret, op_errno) (((op_ret) < 0) &&	      \ +					   ((op_errno == ENOTCONN) || \ +					    (op_errno == EBADFD))) + +/* have we tried all children? 
*/ +#define all_tried(i, count)  ((i) == (count) - 1) + +void +afr_build_parent_loc (loc_t *parent, loc_t *child); + +int +afr_up_children_count (int child_count, unsigned char *child_up); + +int +afr_locked_nodes_count (unsigned char *locked_nodes, int child_count); + +int +afr_first_up_child (afr_private_t *priv); + +ino64_t +afr_itransform (ino64_t ino, int child_count, int child_index); + +int +afr_deitransform (ino64_t ino, int child_count); + +void +afr_local_cleanup (afr_local_t *local, xlator_t *this); + +int +afr_frame_return (call_frame_t *frame); + +#define AFR_STACK_UNWIND(frame, params ...)		\ +	do {						\ +		afr_local_t *__local = NULL;		\ +		xlator_t    *__this = NULL;		\ +		__local = frame->local;			\ +		__this = frame->this;			\ +		frame->local = NULL;                    \ +		STACK_UNWIND (frame, params);		\ +		afr_local_cleanup (__local, __this);	\ +		free (__local);				\ +} while (0);					 + +#define AFR_STACK_DESTROY(frame)			\ +	do {						\ +		afr_local_t *__local = NULL;		\ +		xlator_t    *__this = NULL;		\ +		__local = frame->local;			\ +		__this = frame->this;			\ +		frame->local = NULL;                    \ +		STACK_DESTROY (frame->root);		\ +		afr_local_cleanup (__local, __this);	\ +		free (__local);				\ +} while (0);					 + +/* allocate and return a string that is the basename of argument */ +static inline char *  +AFR_BASENAME (const char *str)						 +{ +	char *__tmp_str = NULL;				 +	char *__basename_str = NULL;			 +	__tmp_str = strdup (str);			 +	__basename_str = strdup (basename (__tmp_str));	 +	FREE (__tmp_str); +	return __basename_str; +} + +/* initialize local_t */ +static inline int +AFR_LOCAL_INIT (afr_local_t *local, afr_private_t *priv) +{ +	local->child_up = CALLOC (sizeof (*local->child_up), +				  priv->child_count); +	if (!local->child_up) { +		return -ENOMEM; +	} + +	memcpy (local->child_up, priv->child_up,  +		sizeof (*local->child_up) * priv->child_count); + + +	local->call_count = afr_up_children_count 
(priv->child_count, local->child_up); +	if (local->call_count == 0) +		return -ENOTCONN; + +	local->transaction.erase_pending = 1; + +	local->op_ret = -1; +	local->op_errno = EUCLEAN; + +	return 0; +} + + +static inline int +afr_transaction_local_init (afr_local_t *local, afr_private_t *priv) +{ +	local->child_errno = CALLOC (sizeof (*local->child_errno), +				     priv->child_count); +	if (!local->child_errno) { +		return -ENOMEM; +	} + +	local->pending_array = CALLOC (sizeof (*local->pending_array), +				       priv->child_count); +	if (!local->pending_array) { +		return -ENOMEM; +	} + +	local->transaction.locked_nodes = CALLOC (sizeof (*local->transaction.locked_nodes), +						  priv->child_count); + +	local->transaction.child_errno = CALLOC (sizeof (*local->transaction.child_errno), +						  priv->child_count); + +	return 0; +} + +#endif /* __AFR_H__ */  | 
