summaryrefslogtreecommitdiffstats
path: root/xlators/cluster/afr/src
diff options
context:
space:
mode:
Diffstat (limited to 'xlators/cluster/afr/src')
-rw-r--r--xlators/cluster/afr/src/Makefile.am34
-rw-r--r--xlators/cluster/afr/src/afr-common.c3629
-rw-r--r--xlators/cluster/afr/src/afr-dir-read.c547
-rw-r--r--xlators/cluster/afr/src/afr-dir-read.h37
-rw-r--r--xlators/cluster/afr/src/afr-dir-write.c2531
-rw-r--r--xlators/cluster/afr/src/afr-dir-write.h46
-rw-r--r--xlators/cluster/afr/src/afr-inode-read.c1916
-rw-r--r--xlators/cluster/afr/src/afr-inode-read.h39
-rw-r--r--xlators/cluster/afr/src/afr-inode-write.c2681
-rw-r--r--xlators/cluster/afr/src/afr-inode-write.h72
-rw-r--r--xlators/cluster/afr/src/afr-lk-common.c1667
-rw-r--r--xlators/cluster/afr/src/afr-mem-types.h49
-rw-r--r--xlators/cluster/afr/src/afr-open.c337
-rw-r--r--xlators/cluster/afr/src/afr-read-txn.c239
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-common.c1754
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-common.h70
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-data.c1302
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-entry.c2263
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-metadata.c918
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-name.c457
-rw-r--r--xlators/cluster/afr/src/afr-self-heal.h181
-rw-r--r--xlators/cluster/afr/src/afr-self-heald.c1256
-rw-r--r--xlators/cluster/afr/src/afr-self-heald.h72
-rw-r--r--xlators/cluster/afr/src/afr-transaction.c2261
-rw-r--r--xlators/cluster/afr/src/afr-transaction.h55
-rw-r--r--xlators/cluster/afr/src/afr.c2974
-rw-r--r--xlators/cluster/afr/src/afr.h1202
-rw-r--r--xlators/cluster/afr/src/pump.c2473
-rw-r--r--xlators/cluster/afr/src/pump.h81
29 files changed, 19077 insertions, 12066 deletions
diff --git a/xlators/cluster/afr/src/Makefile.am b/xlators/cluster/afr/src/Makefile.am
index 1bde9e5ba..ea5a90abb 100644
--- a/xlators/cluster/afr/src/Makefile.am
+++ b/xlators/cluster/afr/src/Makefile.am
@@ -1,20 +1,38 @@
-xlator_LTLIBRARIES = afr.la
+xlator_LTLIBRARIES = afr.la pump.la
xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster
-afr_la_LDFLAGS = -module -avoidversion
+afr_common_source = afr-dir-read.c afr-dir-write.c afr-inode-read.c \
+ afr-inode-write.c afr-open.c afr-transaction.c afr-lk-common.c \
+ afr-read-txn.c \
+ $(top_builddir)/xlators/lib/src/libxlator.c
-afr_la_SOURCES = afr.c afr-dir-read.c afr-dir-write.c afr-inode-read.c afr-inode-write.c afr-transaction.c afr-self-heal-data.c afr-self-heal-common.c afr-self-heal-metadata.c afr-self-heal-entry.c
+AFR_SELFHEAL_SOURCES = afr-self-heal-common.c afr-self-heal-data.c \
+ afr-self-heal-entry.c afr-self-heal-metadata.c afr-self-heald.c \
+ afr-self-heal-name.c
+
+afr_la_LDFLAGS = -module -avoid-version
+afr_la_SOURCES = $(afr_common_source) $(AFR_SELFHEAL_SOURCES) afr.c
afr_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
-noinst_HEADERS = afr.h afr-transaction.h afr-inode-write.h afr-inode-read.h afr-dir-read.h afr-dir-write.h afr-self-heal.h afr-self-heal-common.h
+pump_la_LDFLAGS = -module -avoid-version
+pump_la_SOURCES = $(afr_common_source) $(AFR_SELFHEAL_SOURCES) pump.c
+pump_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+noinst_HEADERS = afr.h afr-transaction.h afr-inode-write.h afr-inode-read.h \
+ afr-dir-read.h afr-dir-write.h afr-self-heal.h afr-mem-types.h \
+ afr-common.c afr-self-heald.h pump.h \
+ $(top_builddir)/xlators/lib/src/libxlator.h
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) \
+ -I$(top_srcdir)/libglusterfs/src -I$(top_srcdir)/xlators/lib/src \
+ -I$(top_srcdir)/rpc/rpc-lib/src
-AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \
- -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS)
+AM_CFLAGS = -Wall $(GF_CFLAGS)
-CLEANFILES =
+CLEANFILES =
uninstall-local:
rm -f $(DESTDIR)$(xlatordir)/replicate.so
install-data-hook:
- ln -sf afr.so $(DESTDIR)$(xlatordir)/replicate.so \ No newline at end of file
+ ln -sf afr.so $(DESTDIR)$(xlatordir)/replicate.so
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
new file mode 100644
index 000000000..164a651ba
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -0,0 +1,3629 @@
+/*
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include <libgen.h>
+#include <unistd.h>
+#include <fnmatch.h>
+#include <sys/time.h>
+#include <stdlib.h>
+#include <signal.h>
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "afr.h"
+#include "dict.h"
+#include "xlator.h"
+#include "hashfn.h"
+#include "logging.h"
+#include "stack.h"
+#include "list.h"
+#include "call-stub.h"
+#include "defaults.h"
+#include "common-utils.h"
+#include "compat-errno.h"
+#include "compat.h"
+#include "byte-order.h"
+#include "statedump.h"
+#include "inode.h"
+
+#include "fd.h"
+
+#include "afr-inode-read.h"
+#include "afr-inode-write.h"
+#include "afr-dir-read.h"
+#include "afr-dir-write.h"
+#include "afr-transaction.h"
+#include "afr-self-heal.h"
+#include "afr-self-heald.h"
+
+
+call_frame_t *
+afr_copy_frame (call_frame_t *base)
+{
+ afr_local_t *local = NULL;
+ call_frame_t *frame = NULL;
+ int op_errno = 0;
+
+ frame = copy_frame (base);
+ if (!frame)
+ return NULL;
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local) {
+ AFR_STACK_DESTROY (frame);
+ return NULL;
+ }
+
+ return frame;
+}
+
+/*
+ * INODE CTX 64-bit VALUE FORMAT FOR SMALL (<= 16) SUBVOL COUNTS:
+ *
+ * |<---------- 64bit ------------>|
+ * 63 32 31 16 15 0
+ * | EVENT_GEN | DATA | METADATA |
+ *
+ *
+ * METADATA (bit-0 .. bit-15): bitmap representing subvolumes from which
+ * metadata can be attempted to be read.
+ *
+ * bit-0 => priv->subvolumes[0]
+ * bit-1 => priv->subvolumes[1]
+ * ... etc. till bit-15
+ *
+ * DATA (bit-16 .. bit-31): bitmap representing subvolumes from which data
+ * can be attempted to be read.
+ *
+ * bit-16 => priv->subvolumes[0]
+ * bit-17 => priv->subvolumes[1]
+ * ... etc. till bit-31
+ *
+ * EVENT_GEN (bit-32 .. bit-63): event generation (i.e priv->event_generation)
+ * when DATA and METADATA was last updated.
+ *
+ * If EVENT_GEN is < priv->event_generation,
+ * or is 0, it means afr_inode_refresh() needs
+ * to be called to recalculate the bitmaps.
+ */
+
+int
+__afr_inode_read_subvol_get_small (inode_t *inode, xlator_t *this,
+ unsigned char *data, unsigned char *metadata,
+ int *event_p)
+{
+ afr_private_t *priv = NULL;
+ int ret = -1;
+ uint16_t datamap = 0;
+ uint16_t metadatamap = 0;
+ uint32_t event = 0;
+ uint64_t val = 0;
+ int i = 0;
+
+ priv = this->private;
+
+ ret = __inode_ctx_get (inode, this, &val);
+ if (ret < 0)
+ return ret;
+
+ metadatamap = (val & 0x000000000000ffff);
+ datamap = (val & 0x00000000ffff0000) >> 16;
+ event = (val & 0xffffffff00000000) >> 32;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (metadata)
+ metadata[i] = (metadatamap >> i) & 1;
+ if (data)
+ data[i] = (datamap >> i) & 1;
+ }
+
+ if (event_p)
+ *event_p = event;
+ return ret;
+}
+
+
+int
+__afr_inode_read_subvol_set_small (inode_t *inode, xlator_t *this,
+ unsigned char *data, unsigned char *metadata,
+ int event)
+{
+ afr_private_t *priv = NULL;
+ uint16_t datamap = 0;
+ uint16_t metadatamap = 0;
+ uint64_t val = 0;
+ int i = 0;
+
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (data[i])
+ datamap |= (1 << i);
+ if (metadata[i])
+ metadatamap |= (1 << i);
+ }
+
+ val = ((uint64_t) metadatamap) |
+ (((uint64_t) datamap) << 16) |
+ (((uint64_t) event) << 32);
+
+ return __inode_ctx_set (inode, this, &val);
+}
+
+
+int
+__afr_inode_read_subvol_reset_small (inode_t *inode, xlator_t *this)
+{
+ int ret = -1;
+ uint16_t datamap = 0;
+ uint16_t metadatamap = 0;
+ uint32_t event = 0;
+ uint64_t val = 0;
+
+ ret = __inode_ctx_get (inode, this, &val);
+ (void) ret;
+
+ metadatamap = (val & 0x000000000000ffff) >> 0;
+ datamap = (val & 0x00000000ffff0000) >> 16;
+ event = 0;
+
+ val = ((uint64_t) metadatamap) |
+ (((uint64_t) datamap) << 16) |
+ (((uint64_t) event) << 32);
+
+ return __inode_ctx_set (inode, this, &val);
+}
+
+
+int
+__afr_inode_read_subvol_get (inode_t *inode, xlator_t *this,
+ unsigned char *data, unsigned char *metadata,
+ int *event_p)
+{
+ afr_private_t *priv = NULL;
+ int ret = -1;
+
+ priv = this->private;
+
+ if (priv->child_count <= 16)
+ ret = __afr_inode_read_subvol_get_small (inode, this, data,
+ metadata, event_p);
+ else
+ /* TBD: allocate structure with array and read from it */
+ ret = -1;
+
+ return ret;
+}
+
+
+int
+__afr_inode_read_subvol_set (inode_t *inode, xlator_t *this, unsigned char *data,
+ unsigned char *metadata, int event)
+{
+ afr_private_t *priv = NULL;
+ int ret = -1;
+
+ priv = this->private;
+
+ if (priv->child_count <= 16)
+ ret = __afr_inode_read_subvol_set_small (inode, this, data,
+ metadata, event);
+ else
+ ret = -1;
+
+ return ret;
+}
+
+
+int
+__afr_inode_read_subvol_reset (inode_t *inode, xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ int ret = -1;
+
+ priv = this->private;
+
+ if (priv->child_count <= 16)
+ ret = __afr_inode_read_subvol_reset_small (inode, this);
+ else
+ ret = -1;
+
+ return ret;
+}
+
+
+int
+afr_inode_read_subvol_get (inode_t *inode, xlator_t *this, unsigned char *data,
+ unsigned char *metadata, int *event_p)
+{
+ int ret = -1;
+
+ LOCK(&inode->lock);
+ {
+ ret = __afr_inode_read_subvol_get (inode, this, data,
+ metadata, event_p);
+ }
+ UNLOCK(&inode->lock);
+
+ return ret;
+}
+
+
+int
+afr_inode_read_subvol_set (inode_t *inode, xlator_t *this, unsigned char *data,
+ unsigned char *metadata, int event)
+{
+ int ret = -1;
+
+ LOCK(&inode->lock);
+ {
+ ret = __afr_inode_read_subvol_set (inode, this, data, metadata,
+ event);
+ }
+ UNLOCK(&inode->lock);
+
+ return ret;
+}
+
+
+int
+afr_inode_read_subvol_reset (inode_t *inode, xlator_t *this)
+{
+ int ret = -1;
+
+ LOCK(&inode->lock);
+ {
+ ret = __afr_inode_read_subvol_reset (inode, this);
+ }
+ UNLOCK(&inode->lock);
+
+ return ret;
+}
+
+
+int
+afr_accused_fill (xlator_t *this, dict_t *xdata, unsigned char *accused,
+ afr_transaction_type type)
+{
+ afr_private_t *priv = NULL;
+ int i = 0;
+ int idx = afr_index_for_transaction_type (type);
+ void *pending_raw = NULL;
+ int pending[3];
+ int ret = 0;
+
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ ret = dict_get_ptr (xdata, priv->pending_key[i],
+ &pending_raw);
+ if (ret) /* no pending flags */
+ continue;
+ memcpy (pending, pending_raw, sizeof(pending));
+
+ if (ntoh32 (pending[idx]))
+ accused[i] = 1;
+ }
+
+ return 0;
+}
+
+
+int
+afr_accuse_smallfiles (xlator_t *this, struct afr_reply *replies,
+ unsigned char *data_accused)
+{
+ int i = 0;
+ afr_private_t *priv = NULL;
+ uint64_t maxsize = 0;
+
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (data_accused[i])
+ continue;
+ if (replies[i].poststat.ia_size > maxsize)
+ maxsize = replies[i].poststat.ia_size;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (data_accused[i])
+ continue;
+ if (replies[i].poststat.ia_size < maxsize)
+ data_accused[i] = 1;
+ }
+
+ return 0;
+}
+
+
+int
+afr_replies_interpret (call_frame_t *frame, xlator_t *this, inode_t *inode)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ struct afr_reply *replies = NULL;
+ int event_generation = 0;
+ int i = 0;
+ unsigned char *data_accused = NULL;
+ unsigned char *metadata_accused = NULL;
+ unsigned char *data_readable = NULL;
+ unsigned char *metadata_readable = NULL;
+ int ret = 0;
+
+ local = frame->local;
+ priv = this->private;
+ replies = local->replies;
+ event_generation = local->event_generation;
+
+ data_accused = alloca0 (priv->child_count);
+ data_readable = alloca0 (priv->child_count);
+ metadata_accused = alloca0 (priv->child_count);
+ metadata_readable = alloca0 (priv->child_count);
+
+ for (i = 0; i < priv->child_count; i++) {
+ data_readable[i] = 1;
+ metadata_readable[i] = 1;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid) {
+ data_readable[i] = 0;
+ metadata_readable[i] = 0;
+ continue;
+ }
+
+ if (replies[i].op_ret == -1) {
+ data_readable[i] = 0;
+ metadata_readable[i] = 0;
+ continue;
+ }
+
+ afr_accused_fill (this, replies[i].xdata, data_accused,
+ (inode->ia_type == IA_IFDIR) ?
+ AFR_ENTRY_TRANSACTION : AFR_DATA_TRANSACTION);
+
+ afr_accused_fill (this, replies[i].xdata,
+ metadata_accused, AFR_METADATA_TRANSACTION);
+
+ }
+
+ if (inode->ia_type != IA_IFDIR)
+ afr_accuse_smallfiles (this, replies, data_accused);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (data_accused[i]) {
+ data_readable[i] = 0;
+ ret = 1;
+ }
+ if (metadata_accused[i]) {
+ metadata_readable[i] = 0;
+ ret = 1;
+ }
+ }
+
+ afr_inode_read_subvol_set (inode, this, data_readable,
+ metadata_readable, event_generation);
+ return ret;
+}
+
+
+
+int
+afr_refresh_selfheal_done (int ret, call_frame_t *heal, void *opaque)
+{
+ if (heal)
+ STACK_DESTROY (heal->root);
+ return 0;
+}
+
+int
+afr_inode_refresh_err (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ int err = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->replies[i].valid && !local->replies[i].op_ret) {
+ err = 0;
+ goto ret;
+ }
+ }
+
+ err = afr_final_errno (local, priv);
+ret:
+ return -err;
+}
+
+
+int
+afr_refresh_selfheal_wrap (void *opaque)
+{
+ call_frame_t *frame = opaque;
+ afr_local_t *local = NULL;
+ xlator_t *this = NULL;
+ int err = 0;
+
+ local = frame->local;
+ this = frame->this;
+
+ afr_selfheal (frame->this, local->refreshinode->gfid);
+
+ afr_selfheal_unlocked_discover (frame, local->refreshinode,
+ local->refreshinode->gfid,
+ local->replies);
+
+ afr_replies_interpret (frame, this, local->refreshinode);
+
+ err = afr_inode_refresh_err (frame, this);
+
+ afr_replies_wipe (local, this->private);
+
+ local->refreshfn (frame, this, err);
+
+ return 0;
+}
+
+
+gf_boolean_t
+afr_selfheal_enabled (xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ gf_boolean_t data = _gf_false;
+
+ priv = this->private;
+
+ gf_string2boolean (priv->data_self_heal, &data);
+
+ return data || priv->metadata_self_heal || priv->entry_self_heal;
+}
+
+
+
+int
+afr_inode_refresh_done (call_frame_t *frame, xlator_t *this)
+{
+ call_frame_t *heal = NULL;
+ afr_local_t *local = NULL;
+ int ret = 0;
+ int err = 0;
+
+ local = frame->local;
+
+ ret = afr_replies_interpret (frame, this, local->refreshinode);
+
+ err = afr_inode_refresh_err (frame, this);
+
+ afr_replies_wipe (local, this->private);
+
+ if (ret && afr_selfheal_enabled (this)) {
+ heal = copy_frame (frame);
+ if (heal)
+ heal->root->pid = -1;
+ ret = synctask_new (this->ctx->env, afr_refresh_selfheal_wrap,
+ afr_refresh_selfheal_done, heal, frame);
+ if (ret)
+ goto refresh_done;
+ } else {
+ refresh_done:
+ local->refreshfn (frame, this, err);
+ }
+
+ return 0;
+}
+
+
+int
+afr_inode_refresh_subvol_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, inode_t *inode,
+ struct iatt *buf, dict_t *xdata, struct iatt *par)
+{
+ afr_local_t *local = NULL;
+ int call_child = (long) cookie;
+ int call_count = 0;
+
+ local = frame->local;
+
+ local->replies[call_child].valid = 1;
+ local->replies[call_child].op_ret = op_ret;
+ local->replies[call_child].op_errno = op_errno;
+ if (op_ret != -1) {
+ local->replies[call_child].poststat = *buf;
+ local->replies[call_child].postparent = *par;
+ local->replies[call_child].xdata = dict_ref (xdata);
+ }
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0)
+ afr_inode_refresh_done (frame, this);
+
+ return 0;
+}
+
+
+int
+afr_inode_refresh_subvol (call_frame_t *frame, xlator_t *this, int i,
+ inode_t *inode, dict_t *xdata)
+{
+ loc_t loc = {0, };
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+
+ loc.inode = inode;
+ uuid_copy (loc.gfid, inode->gfid);
+
+ STACK_WIND_COOKIE (frame, afr_inode_refresh_subvol_cbk,
+ (void *) (long) i, priv->children[i],
+ priv->children[i]->fops->lookup, &loc, xdata);
+ return 0;
+}
+
+
+int
+afr_inode_refresh_do (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int call_count = 0;
+ int i = 0;
+ dict_t *xdata = NULL;
+
+ priv = this->private;
+ local = frame->local;
+
+ afr_replies_wipe (local, priv);
+
+ xdata = dict_new ();
+ if (!xdata) {
+ afr_inode_refresh_done (frame, this);
+ return 0;
+ }
+
+ if (afr_xattr_req_prepare (this, xdata) != 0) {
+ dict_unref (xdata);
+ afr_inode_refresh_done (frame, this);
+ return 0;
+ }
+
+ local->call_count = AFR_COUNT (local->child_up, priv->child_count);
+
+ call_count = local->call_count;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->child_up[i])
+ continue;
+
+ afr_inode_refresh_subvol (frame, this, i, local->refreshinode,
+ xdata);
+
+ if (!--call_count)
+ break;
+ }
+
+ dict_unref (xdata);
+
+ return 0;
+}
+
+
+int
+afr_inode_refresh (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ afr_inode_refresh_cbk_t refreshfn)
+{
+ afr_local_t *local = NULL;
+
+ local = frame->local;
+
+ local->refreshfn = refreshfn;
+
+ if (local->refreshinode) {
+ inode_unref (local->refreshinode);
+ local->refreshinode = NULL;
+ }
+
+ local->refreshinode = inode_ref (inode);
+
+ afr_inode_refresh_do (frame, this);
+
+ return 0;
+}
+
+
+int
+afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req)
+{
+ int i = 0;
+ afr_private_t *priv = NULL;
+ int ret = 0;
+
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ ret = dict_set_uint64 (xattr_req, priv->pending_key[i],
+ AFR_NUM_CHANGE_LOGS * sizeof(int));
+ if (ret < 0)
+ gf_log (this->name, GF_LOG_WARNING,
+ "Unable to set dict value for %s",
+ priv->pending_key[i]);
+ /* 3 = data+metadata+entry */
+ }
+ ret = dict_set_uint64 (xattr_req, AFR_DIRTY,
+ AFR_NUM_CHANGE_LOGS * sizeof(int));
+ if (ret) {
+ gf_log (this->name, GF_LOG_DEBUG, "failed to set dirty "
+ "query flag");
+ }
+
+ return ret;
+}
+
+int
+afr_lookup_xattr_req_prepare (afr_local_t *local, xlator_t *this,
+ dict_t *xattr_req, loc_t *loc)
+{
+ int ret = -ENOMEM;
+
+ local->xattr_req = dict_new ();
+ if (!local->xattr_req)
+ goto out;
+ if (xattr_req)
+ dict_copy (xattr_req, local->xattr_req);
+
+ ret = afr_xattr_req_prepare (this, local->xattr_req);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "%s: Unable to prepare xattr_req", loc->path);
+ }
+
+ ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_INODELK_COUNT, 0);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "%s: Unable to set dict value for %s",
+ loc->path, GLUSTERFS_INODELK_COUNT);
+ }
+ ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_ENTRYLK_COUNT, 0);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "%s: Unable to set dict value for %s",
+ loc->path, GLUSTERFS_ENTRYLK_COUNT);
+ }
+
+ ret = dict_set_uint32 (local->xattr_req, GLUSTERFS_PARENT_ENTRYLK, 0);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "%s: Unable to set dict value for %s",
+ loc->path, GLUSTERFS_PARENT_ENTRYLK);
+ }
+
+ ret = 0;
+out:
+ return ret;
+}
+
+
+int
+afr_hash_child (inode_t *inode, int32_t child_count, int hashmode)
+{
+ uuid_t gfid_copy = {0,};
+ pid_t pid;
+
+ if (!hashmode) {
+ return -1;
+ }
+
+ if (inode) {
+ uuid_copy (gfid_copy, inode->gfid);
+ }
+
+ if (hashmode > 1) {
+ /*
+ * Why getpid? Because it's one of the cheapest calls
+ * available - faster than gethostname etc. - and returns a
+ * constant-length value that's sure to be shorter than a UUID.
+ * It's still very unlikely to be the same across clients, so
+ * it still provides good mixing. We're not trying for
+ * perfection here. All we need is a low probability that
+ * multiple clients won't converge on the same subvolume.
+ */
+ pid = getpid();
+ memcpy (gfid_copy, &pid, sizeof(pid));
+ }
+
+ return SuperFastHash((char *)gfid_copy,
+ sizeof(gfid_copy)) % child_count;
+}
+
+
+int
+afr_read_subvol_select_by_policy (inode_t *inode, xlator_t *this,
+ unsigned char *readable)
+{
+ afr_private_t *priv = NULL;
+ int read_subvol = -1;
+ int i = 0;
+
+ priv = this->private;
+
+ /* first preference - explicitly specified or local subvolume */
+ if (priv->read_child >= 0 && readable[priv->read_child])
+ return priv->read_child;
+
+ /* second preference - use hashed mode */
+ read_subvol = afr_hash_child (inode, priv->child_count,
+ priv->hash_mode);
+ if (read_subvol >= 0 && readable[read_subvol])
+ return read_subvol;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (readable[i])
+ return i;
+ }
+
+ /* no readable subvolumes, either split brain or all subvols down */
+
+ return -1;
+}
+
+
+int
+afr_inode_read_subvol_type_get (inode_t *inode, xlator_t *this,
+ unsigned char *readable, int *event_p,
+ int type)
+{
+ int ret = -1;
+
+ if (type == AFR_METADATA_TRANSACTION)
+ ret = afr_inode_read_subvol_get (inode, this, 0, readable,
+ event_p);
+ else
+ ret = afr_inode_read_subvol_get (inode, this, readable, 0,
+ event_p);
+ return ret;
+}
+
+
+int
+afr_read_subvol_get (inode_t *inode, xlator_t *this, int *subvol_p,
+ int *event_p, afr_transaction_type type)
+{
+ afr_private_t *priv = NULL;
+ unsigned char *data_readable = NULL;
+ unsigned char *metadata_readable = NULL;
+ unsigned char *readable = NULL;
+ unsigned char *intersection = NULL;
+ int subvol = -1;
+ int event = 0;
+
+ priv = this->private;
+
+ readable = alloca0 (priv->child_count);
+ data_readable = alloca0 (priv->child_count);
+ metadata_readable = alloca0 (priv->child_count);
+ intersection = alloca0 (priv->child_count);
+
+ afr_inode_read_subvol_type_get (inode, this, readable, &event, type);
+
+ afr_inode_read_subvol_get (inode, this, data_readable, metadata_readable,
+ &event);
+
+ AFR_INTERSECT (intersection, data_readable, metadata_readable,
+ priv->child_count);
+
+ if (AFR_COUNT (intersection, priv->child_count) > 0)
+ subvol = afr_read_subvol_select_by_policy (inode, this,
+ intersection);
+ else
+ subvol = afr_read_subvol_select_by_policy (inode, this,
+ readable);
+ if (subvol_p)
+ *subvol_p = subvol;
+ if (event_p)
+ *event_p = event;
+ return subvol;
+}
+
+
+void
+afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ int i = 0;
+
+ priv = this->private;
+
+ afr_matrix_cleanup (local->pending, priv->child_count);
+
+ GF_FREE (local->internal_lock.locked_nodes);
+
+ for (i = 0; local->internal_lock.inodelk[i].domain; i++) {
+ GF_FREE (local->internal_lock.inodelk[i].locked_nodes);
+ }
+
+ GF_FREE (local->internal_lock.lower_locked_nodes);
+
+ afr_entry_lockee_cleanup (&local->internal_lock);
+
+ GF_FREE (local->transaction.pre_op);
+ GF_FREE (local->transaction.eager_lock);
+ GF_FREE (local->transaction.fop_subvols);
+ GF_FREE (local->transaction.failed_subvols);
+
+ GF_FREE (local->transaction.basename);
+ GF_FREE (local->transaction.new_basename);
+
+ loc_wipe (&local->transaction.parent_loc);
+ loc_wipe (&local->transaction.new_parent_loc);
+
+}
+
+
+void
+afr_replies_wipe (afr_local_t *local, afr_private_t *priv)
+{
+ int i;
+
+ if (!local->replies)
+ return;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->replies[i].xdata) {
+ dict_unref (local->replies[i].xdata);
+ local->replies[i].xdata = NULL;
+ }
+ }
+
+ memset (local->replies, 0, sizeof(*local->replies) * priv->child_count);
+}
+
+void
+afr_remove_eager_lock_stub (afr_local_t *local)
+{
+ LOCK (&local->fd->lock);
+ {
+ list_del_init (&local->transaction.eager_locked);
+ }
+ UNLOCK (&local->fd->lock);
+}
+
+void
+afr_local_cleanup (afr_local_t *local, xlator_t *this)
+{
+ afr_private_t * priv = NULL;
+
+ if (!local)
+ return;
+
+ syncbarrier_destroy (&local->barrier);
+
+ if (local->transaction.eager_lock_on &&
+ !list_empty (&local->transaction.eager_locked))
+ afr_remove_eager_lock_stub (local);
+
+ afr_local_transaction_cleanup (local, this);
+
+ priv = this->private;
+
+ loc_wipe (&local->loc);
+ loc_wipe (&local->newloc);
+
+ if (local->fd)
+ fd_unref (local->fd);
+
+ if (local->xattr_req)
+ dict_unref (local->xattr_req);
+
+ if (local->dict)
+ dict_unref (local->dict);
+
+ afr_replies_wipe (local, priv);
+ GF_FREE(local->replies);
+
+ GF_FREE (local->child_up);
+
+ GF_FREE (local->read_attempted);
+
+ GF_FREE (local->readable);
+
+ if (local->inode)
+ inode_unref (local->inode);
+
+ if (local->parent)
+ inode_unref (local->parent);
+
+ if (local->parent2)
+ inode_unref (local->parent2);
+
+ if (local->refreshinode)
+ inode_unref (local->refreshinode);
+
+ { /* getxattr */
+ GF_FREE (local->cont.getxattr.name);
+ }
+
+ { /* lk */
+ GF_FREE (local->cont.lk.locked_nodes);
+ }
+
+ { /* create */
+ if (local->cont.create.fd)
+ fd_unref (local->cont.create.fd);
+ if (local->cont.create.params)
+ dict_unref (local->cont.create.params);
+ }
+
+ { /* mknod */
+ if (local->cont.mknod.params)
+ dict_unref (local->cont.mknod.params);
+ }
+
+ { /* mkdir */
+ if (local->cont.mkdir.params)
+ dict_unref (local->cont.mkdir.params);
+ }
+
+ { /* symlink */
+ if (local->cont.symlink.params)
+ dict_unref (local->cont.symlink.params);
+ }
+
+ { /* writev */
+ GF_FREE (local->cont.writev.vector);
+ if (local->cont.writev.iobref)
+ iobref_unref (local->cont.writev.iobref);
+ }
+
+ { /* setxattr */
+ if (local->cont.setxattr.dict)
+ dict_unref (local->cont.setxattr.dict);
+ }
+
+ { /* fsetxattr */
+ if (local->cont.fsetxattr.dict)
+ dict_unref (local->cont.fsetxattr.dict);
+ }
+
+ { /* removexattr */
+ GF_FREE (local->cont.removexattr.name);
+ }
+ { /* xattrop */
+ if (local->cont.xattrop.xattr)
+ dict_unref (local->cont.xattrop.xattr);
+ }
+ { /* fxattrop */
+ if (local->cont.fxattrop.xattr)
+ dict_unref (local->cont.fxattrop.xattr);
+ }
+ { /* symlink */
+ GF_FREE (local->cont.symlink.linkpath);
+ }
+
+ { /* opendir */
+ GF_FREE (local->cont.opendir.checksum);
+ }
+
+ { /* readdirp */
+ if (local->cont.readdir.dict)
+ dict_unref (local->cont.readdir.dict);
+ }
+
+ if (local->xdata_req)
+ dict_unref (local->xdata_req);
+
+ if (local->xdata_rsp)
+ dict_unref (local->xdata_rsp);
+}
+
+
+int
+afr_frame_return (call_frame_t *frame)
+{
+ afr_local_t *local = NULL;
+ int call_count = 0;
+
+ local = frame->local;
+
+ LOCK (&frame->lock);
+ {
+ call_count = --local->call_count;
+ }
+ UNLOCK (&frame->lock);
+
+ return call_count;
+}
+
+
+gf_boolean_t
+afr_is_entry_possibly_under_txn (afr_local_t *local, xlator_t *this)
+{
+ int i = 0;
+ int tmp = 0;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].xdata)
+ continue;
+ if (dict_get_int32 (local->replies[i].xdata,
+ GLUSTERFS_PARENT_ENTRYLK,
+ &tmp) == 0)
+ if (tmp)
+ return _gf_true;
+ }
+
+ return _gf_false;
+}
+
+
+/*
+ * Quota size xattrs are not maintained by afr. There is a
+ * possibility that they differ even when both the directory changelog xattrs
+ * suggest everything is fine. So if there is at least one 'source' check among
+ * the sources which has the maximum quota size. Otherwise check among all the
+ * available ones for maximum quota size. This way if there is a source and
+ * stale copies it always votes for the 'source'.
+ * */
+
+static void
+afr_handle_quota_size (call_frame_t *frame, xlator_t *this)
+{
+ unsigned char *readable = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ struct afr_reply *replies = NULL;
+ int i = 0;
+ uint64_t size = 0;
+ uint64_t max_size = 0;
+ int readable_cnt = 0;
+
+ local = frame->local;
+ priv = this->private;
+ replies = local->replies;
+
+ readable = alloca0 (priv->child_count);
+
+ afr_inode_read_subvol_get (local->inode, this, readable, 0, 0);
+
+ readable_cnt = AFR_COUNT (readable, priv->child_count);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid || replies[i].op_ret == -1)
+ continue;
+ if (readable_cnt && !readable[i])
+ continue;
+ if (!replies[i].xdata)
+ continue;
+ if (dict_get_uint64 (replies[i].xdata, QUOTA_SIZE_KEY, &size))
+ continue;
+ if (size > max_size)
+ max_size = size;
+ }
+
+ if (!max_size)
+ return;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid || replies[i].op_ret == -1)
+ continue;
+ if (readable_cnt && !readable[i])
+ continue;
+ if (!replies[i].xdata)
+ continue;
+ if (dict_set_uint64 (replies[i].xdata, QUOTA_SIZE_KEY, max_size))
+ continue;
+ }
+}
+
+
+static void
+afr_lookup_done (call_frame_t *frame, xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int i = -1;
+ int op_errno = 0;
+ int read_subvol = 0;
+ unsigned char *readable = NULL;
+ int event = 0;
+ struct afr_reply *replies = NULL;
+ uuid_t read_gfid = {0, };
+ gf_boolean_t locked_entry = _gf_false;
+ gf_boolean_t can_interpret = _gf_true;
+
+ priv = this->private;
+ local = frame->local;
+ replies = local->replies;
+
+ locked_entry = afr_is_entry_possibly_under_txn (local, this);
+
+ readable = alloca0 (priv->child_count);
+
+ afr_inode_read_subvol_get (local->loc.parent, this, readable,
+ NULL, &event);
+
+ /* First, check if we have an ESTALE from somewhere,
+ If so, propagate that so that a revalidate can be
+ issued
+ */
+ op_errno = afr_final_errno (frame->local, this->private);
+ local->op_errno = op_errno;
+ if (op_errno == ESTALE) {
+ local->op_errno = op_errno;
+ local->op_ret = -1;
+ goto unwind;
+ }
+
+ read_subvol = -1;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid)
+ continue;
+
+ if (locked_entry && replies[i].op_ret == -1 &&
+ replies[i].op_errno == ENOENT) {
+ /* Second, check entry is still
+ "underway" in creation */
+ local->op_ret = -1;
+ local->op_errno = ENOENT;
+ read_subvol = i;
+ goto unwind;
+ }
+
+ if (replies[i].op_ret == -1)
+ continue;
+
+ if (read_subvol == -1 || !readable[read_subvol]) {
+ read_subvol = i;
+ uuid_copy (read_gfid, replies[i].poststat.ia_gfid);
+ local->op_ret = 0;
+ }
+ }
+
+ if (read_subvol == -1)
+ goto unwind;
+ /* We now have a read_subvol, which is readable[] (if there
+ were any). Next we look for GFID mismatches. We don't
+ consider a GFID mismatch as an error if read_subvol is
+ readable[] but the mismatching GFID subvol is not.
+ */
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid || replies[i].op_ret == -1) {
+ if (priv->child_up[i])
+ can_interpret = _gf_false;
+ continue;
+ }
+
+ if (!uuid_compare (replies[i].poststat.ia_gfid,
+ read_gfid))
+ continue;
+
+ can_interpret = _gf_false;
+
+ if (locked_entry)
+ continue;
+
+ /* Now GFIDs mismatch. It's OK as long as this subvol
+ is not readable[] but read_subvol is */
+ if (readable[read_subvol] && !readable[i])
+ continue;
+
+ /* LOG ERROR */
+ local->op_ret = -1;
+ local->op_errno = EIO;
+ goto unwind;
+ }
+
+ /* Forth, for the finalized GFID, pick the best subvolume
+ to return stats from.
+ */
+ if (can_interpret) {
+ /* It is safe to call afr_replies_interpret() because we have
+ a response from all the UP subvolumes and all of them resolved
+ to the same GFID
+ */
+ if (afr_replies_interpret (frame, this, local->inode)) {
+ read_subvol = afr_data_subvol_get (local->inode, this,
+ 0, 0);
+ afr_inode_read_subvol_reset (local->inode, this);
+ goto cant_interpret;
+ } else {
+ read_subvol = afr_data_subvol_get (local->inode, this,
+ 0, 0);
+ }
+ } else {
+ cant_interpret:
+ if (read_subvol == -1)
+ dict_del (replies[0].xdata, GF_CONTENT_KEY);
+ else
+ dict_del (replies[read_subvol].xdata, GF_CONTENT_KEY);
+ }
+
+ afr_handle_quota_size (frame, this);
+
+unwind:
+ if (read_subvol == -1)
+ read_subvol = 0;
+
+ AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno,
+ local->inode, &local->replies[read_subvol].poststat,
+ local->replies[read_subvol].xdata,
+ &local->replies[read_subvol].postparent);
+}
+
+/*
+ * During a lookup, some errors are more "important" than
+ * others in that they must be given higher priority while
+ * returning to the user.
+ *
+ * The hierarchy is ESTALE > ENOENT > others
+ */
+
+int
+afr_higher_errno (int32_t old_errno, int32_t new_errno)
+{
+ if (old_errno == ENODATA || new_errno == ENODATA)
+ return ENODATA;
+ if (old_errno == ESTALE || new_errno == ESTALE)
+ return ESTALE;
+ if (old_errno == ENOENT || new_errno == ENOENT)
+ return ENOENT;
+
+ return new_errno;
+}
+
+
+int
+afr_final_errno (afr_local_t *local, afr_private_t *priv)
+{
+ int i = 0;
+ int op_errno = 0;
+ int tmp_errno = 0;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].valid)
+ continue;
+ if (local->replies[i].op_ret == 0)
+ continue;
+ tmp_errno = local->replies[i].op_errno;
+ op_errno = afr_higher_errno (op_errno, tmp_errno);
+ }
+
+ return op_errno;
+}
+
+static int
+get_pathinfo_host (char *pathinfo, char *hostname, size_t size)
+{
+ char *start = NULL;
+ char *end = NULL;
+ int ret = -1;
+ int i = 0;
+
+ if (!pathinfo)
+ goto out;
+
+ start = strchr (pathinfo, ':');
+ if (!start)
+ goto out;
+ end = strrchr (pathinfo, ':');
+ if (start == end)
+ goto out;
+
+ memset (hostname, 0, size);
+ i = 0;
+ while (++start != end)
+ hostname[i++] = *start;
+ ret = 0;
+out:
+ return ret;
+}
+
+int
+afr_local_pathinfo (char *pathinfo, gf_boolean_t *local)
+{
+ int ret = 0;
+ char pathinfohost[1024] = {0};
+ char localhost[1024] = {0};
+ xlator_t *this = THIS;
+
+ *local = _gf_false;
+ ret = get_pathinfo_host (pathinfo, pathinfohost, sizeof (pathinfohost));
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Invalid pathinfo: %s",
+ pathinfo);
+ goto out;
+ }
+
+ ret = gethostname (localhost, sizeof (localhost));
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "gethostname() failed, "
+ "reason: %s", strerror (errno));
+ goto out;
+ }
+
+ if (!strcmp (localhost, pathinfohost))
+ *local = _gf_true;
+out:
+ return ret;
+}
+
+static int32_t
+afr_local_discovery_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata)
+{
+ int ret = 0;
+ char *pathinfo = NULL;
+ gf_boolean_t is_local = _gf_false;
+ afr_private_t *priv = NULL;
+ int32_t child_index = -1;
+
+ if (op_ret != 0) {
+ goto out;
+ }
+
+ priv = this->private;
+ child_index = (int32_t)(long)cookie;
+
+ ret = dict_get_str (dict, GF_XATTR_PATHINFO_KEY, &pathinfo);
+ if (ret != 0) {
+ goto out;
+ }
+
+ ret = afr_local_pathinfo (pathinfo, &is_local);
+ if (ret) {
+ goto out;
+ }
+
+ /*
+ * Note that one local subvolume will override another here. The only
+ * way to avoid that would be to retain extra information about whether
+ * the previous read_child is local, and it's just not worth it. Even
+ * the slowest local subvolume is far preferable to a remote one.
+ */
+ if (is_local) {
+ gf_log (this->name, GF_LOG_INFO,
+ "selecting local read_child %s",
+ priv->children[child_index]->name);
+ priv->read_child = child_index;
+ }
+out:
+ STACK_DESTROY(frame->root);
+ return 0;
+}
+
+static void
+afr_attempt_local_discovery (xlator_t *this, int32_t child_index)
+{
+ call_frame_t *newframe = NULL;
+ loc_t tmploc = {0,};
+ afr_private_t *priv = this->private;
+
+ newframe = create_frame(this,this->ctx->pool);
+ if (!newframe) {
+ return;
+ }
+
+ tmploc.gfid[sizeof(tmploc.gfid)-1] = 1;
+ STACK_WIND_COOKIE (newframe, afr_local_discovery_cbk,
+ (void *)(long)child_index,
+ priv->children[child_index],
+ priv->children[child_index]->fops->getxattr,
+ &tmploc, GF_XATTR_PATHINFO_KEY, NULL);
+}
+
+
+int
+afr_lookup_selfheal_wrap (void *opaque)
+{
+ call_frame_t *frame = opaque;
+ afr_local_t *local = NULL;
+ xlator_t *this = NULL;
+ inode_t *inode = NULL;
+
+ local = frame->local;
+ this = frame->this;
+
+ afr_selfheal_name (frame->this, local->loc.pargfid, local->loc.name);
+
+ afr_replies_wipe (local, this->private);
+
+ inode = afr_selfheal_unlocked_lookup_on (frame, local->loc.parent,
+ local->loc.name, local->replies,
+ local->child_up);
+ if (inode)
+ inode_unref (inode);
+ afr_lookup_done (frame, this);
+
+ return 0;
+}
+
+
+int
+afr_lookup_entry_heal (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ call_frame_t *heal = NULL;
+ int i = 0, first = -1;
+ gf_boolean_t need_heal = _gf_false;
+ struct afr_reply *replies = NULL;
+ int ret = 0;
+
+ local = frame->local;
+ replies = local->replies;
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid)
+ continue;
+
+ if (first == -1) {
+ first = i;
+ continue;
+ }
+
+ if (replies[i].op_ret != replies[first].op_ret) {
+ need_heal = _gf_true;
+ break;
+ }
+
+ if (uuid_compare (replies[i].poststat.ia_gfid,
+ replies[first].poststat.ia_gfid)) {
+ need_heal = _gf_true;
+ break;
+ }
+ }
+
+ if (need_heal) {
+ heal = copy_frame (frame);
+ if (heal)
+ heal->root->pid = -1;
+ ret = synctask_new (this->ctx->env, afr_lookup_selfheal_wrap,
+ afr_refresh_selfheal_done, heal, frame);
+ if (ret)
+ goto lookup_done;
+ } else {
+ lookup_done:
+ afr_lookup_done (frame, this);
+ }
+
+ return ret;
+}
+
+
+int
+afr_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, inode_t *inode, struct iatt *buf,
+ dict_t *xdata, struct iatt *postparent)
+{
+ afr_local_t * local = NULL;
+ int call_count = -1;
+ int child_index = -1;
+
+ child_index = (long) cookie;
+
+ local = frame->local;
+
+ local->replies[child_index].valid = 1;
+ local->replies[child_index].op_ret = op_ret;
+ local->replies[child_index].op_errno = op_errno;
+ if (op_ret != -1) {
+ local->replies[child_index].poststat = *buf;
+ local->replies[child_index].postparent = *postparent;
+ if (xdata)
+ local->replies[child_index].xdata = dict_ref (xdata);
+ }
+
+ call_count = afr_frame_return (frame);
+ if (call_count == 0) {
+ afr_lookup_entry_heal (frame, this);
+ }
+
+ return 0;
+}
+
+
+
+static void
+afr_discover_done (call_frame_t *frame, xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int i = -1;
+ int op_errno = 0;
+ int read_subvol = 0;
+
+ priv = this->private;
+ local = frame->local;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].valid)
+ continue;
+ if (local->replies[i].op_ret == 0)
+ local->op_ret = 0;
+ }
+
+ op_errno = afr_final_errno (frame->local, this->private);
+
+ if (local->op_ret < 0) {
+ local->op_errno = op_errno;
+ local->op_ret = -1;
+ goto unwind;
+ }
+
+ afr_replies_interpret (frame, this, local->inode);
+
+ read_subvol = afr_data_subvol_get (local->inode, this, 0, 0);
+ if (read_subvol == -1) {
+ gf_log (this->name, GF_LOG_WARNING, "no read subvols for %s",
+ local->loc.path);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].valid ||
+ local->replies[i].op_ret == -1)
+ continue;
+ read_subvol = i;
+ break;
+ }
+ }
+
+unwind:
+ if (read_subvol == -1)
+ read_subvol = 0;
+
+ AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno,
+ local->inode, &local->replies[read_subvol].poststat,
+ local->replies[read_subvol].xdata,
+ &local->replies[read_subvol].postparent);
+}
+
+
+int
+afr_discover_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, inode_t *inode, struct iatt *buf,
+ dict_t *xdata, struct iatt *postparent)
+{
+ afr_local_t * local = NULL;
+ int call_count = -1;
+ int child_index = -1;
+
+ child_index = (long) cookie;
+
+ local = frame->local;
+
+ local->replies[child_index].valid = 1;
+ local->replies[child_index].op_ret = op_ret;
+ local->replies[child_index].op_errno = op_errno;
+ if (op_ret != -1) {
+ local->replies[child_index].poststat = *buf;
+ local->replies[child_index].postparent = *postparent;
+ if (xdata)
+ local->replies[child_index].xdata = dict_ref (xdata);
+ }
+
+ if (local->do_discovery && (op_ret == 0))
+ afr_attempt_local_discovery (this, child_index);
+
+ call_count = afr_frame_return (frame);
+ if (call_count == 0) {
+ afr_discover_done (frame, this);
+ }
+
+ return 0;
+}
+
+
+int
+afr_discover_do (call_frame_t *frame, xlator_t *this, int err)
+{
+ int ret = 0;
+ int i = 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int call_count = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ if (err) {
+ local->op_errno = -err;
+ ret = -1;
+ goto out;
+ }
+
+ call_count = local->call_count = AFR_COUNT (local->child_up,
+ priv->child_count);
+
+ ret = afr_lookup_xattr_req_prepare (local, this, local->xattr_req,
+ &local->loc);
+ if (ret) {
+ local->op_errno = -ret;
+ ret = -1;
+ goto out;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND_COOKIE (frame, afr_discover_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->lookup,
+ &local->loc, local->xattr_req);
+ if (!--call_count)
+ break;
+ }
+ }
+
+ return 0;
+out:
+ AFR_STACK_UNWIND (lookup, frame, -1, local->op_errno, 0, 0, 0, 0);
+ return 0;
+}
+
+
+int
+afr_discover (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req)
+{
+ int op_errno = ENOMEM;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int event = 0;
+
+ priv = this->private;
+
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
+
+ if (!local->call_count) {
+ op_errno = ENOTCONN;
+ goto out;
+ }
+
+ if (__is_root_gfid (loc->inode->gfid)) {
+ if (!this->itable)
+ this->itable = loc->inode->table;
+ if (!priv->root_inode)
+ priv->root_inode = inode_ref (loc->inode);
+
+ if (priv->choose_local && !priv->did_discovery) {
+ /* Logic to detect which subvolumes of AFR are
+ local, in order to prefer them for reads
+ */
+ local->do_discovery = _gf_true;
+ priv->did_discovery = _gf_true;
+ }
+ }
+
+ local->op = GF_FOP_LOOKUP;
+
+ loc_copy (&local->loc, loc);
+
+ local->inode = inode_ref (loc->inode);
+
+ if (xattr_req)
+ /* If xattr_req was null, afr_lookup_xattr_req_prepare() will
+ allocate one for us */
+ local->xattr_req = dict_ref (xattr_req);
+
+ if (uuid_is_null (loc->inode->gfid)) {
+ afr_discover_do (frame, this, 0);
+ return 0;
+ }
+
+ afr_read_subvol_get (loc->inode, this, NULL, &event,
+ AFR_DATA_TRANSACTION);
+
+ if (event != local->event_generation)
+ afr_inode_refresh (frame, this, loc->inode, afr_discover_do);
+ else
+ afr_discover_do (frame, this, 0);
+
+ return 0;
+out:
+ AFR_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL);
+ return 0;
+}
+
+
+int
+afr_lookup_do (call_frame_t *frame, xlator_t *this, int err)
+{
+ int ret = 0;
+ int i = 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int call_count = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ if (err < 0) {
+ local->op_errno = -err;
+ ret = -1;
+ goto out;
+ }
+
+ call_count = local->call_count = AFR_COUNT (local->child_up,
+ priv->child_count);
+
+ ret = afr_lookup_xattr_req_prepare (local, this, local->xattr_req,
+ &local->loc);
+ if (ret) {
+ local->op_errno = -ret;
+ ret = -1;
+ goto out;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND_COOKIE (frame, afr_lookup_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->lookup,
+ &local->loc, local->xattr_req);
+ if (!--call_count)
+ break;
+ }
+ }
+ return 0;
+out:
+ AFR_STACK_UNWIND (lookup, frame, -1, local->op_errno, 0, 0, 0, 0);
+ return 0;
+}
+
+/*
+ * afr_lookup()
+ *
+ * The goal here is to figure out what the element getting looked up is.
+ * i.e what is the GFID, inode type and a conservative estimate of the
+ * inode attributes are.
+ *
+ * As we lookup, operations may be underway on the entry name and the
+ * inode. In lookup() we are primarily concerned only with the entry
+ * operations. If the entry is getting unlinked or renamed, we detect
+ * what operation is underway by querying for on-going transactions and
+ * pending self-healing on the entry through xdata.
+ *
+ * If the entry is a file/dir, it may need self-heal and/or in a
+ * split-brain condition. Lookup is not the place to worry about these
+ * conditions. Outcast marking will naturally handle them in the read
+ * paths.
+ *
+ * Here is a brief goal of what we are trying to achieve:
+ *
+ * - LOOKUP on all subvolumes concurrently, querying on-going transaction
+ * and pending self-heal info from the servers.
+ *
+ * - If all servers reply the same inode type and GFID, the overall call
+ * MUST be a success.
+ *
+ * - If inode types or GFIDs mismatch, and there IS either an on-going
+ * transaction or pending self-heal, inspect what the nature of the
+ * transaction or pending heal is, and select the appropriate subvolume's
+ * reply as the winner.
+ *
+ * - If inode types or GFIDs mismatch, and there are no on-going transactions
+ * or pending self-heal on the entry name on any of the servers, fail the
+ * lookup with EIO. Something has gone wrong beyond reasonable action.
+ */
+
+int
+afr_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req)
+{
+ afr_local_t *local = NULL;
+ int32_t op_errno = 0;
+ int event = 0;
+
+ if (!loc->parent) {
+ afr_discover (frame, this, loc, xattr_req);
+ return 0;
+ }
+
+ if (__is_root_gfid (loc->parent->gfid)) {
+ if (!strcmp (loc->name, GF_REPLICATE_TRASH_DIR)) {
+ op_errno = EPERM;
+ goto out;
+ }
+ }
+
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
+
+ if (!local->call_count) {
+ op_errno = ENOTCONN;
+ goto out;
+ }
+
+ local->op = GF_FOP_LOOKUP;
+
+ loc_copy (&local->loc, loc);
+
+ local->inode = inode_ref (loc->inode);
+
+ if (xattr_req)
+ /* If xattr_req was null, afr_lookup_xattr_req_prepare() will
+ allocate one for us */
+ local->xattr_req = dict_ref (xattr_req);
+
+ afr_read_subvol_get (loc->parent, this, NULL, &event,
+ AFR_DATA_TRANSACTION);
+
+ if (event != local->event_generation)
+ afr_inode_refresh (frame, this, loc->parent, afr_lookup_do);
+ else
+ afr_lookup_do (frame, this, 0);
+
+ return 0;
+out:
+ AFR_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL);
+
+ return 0;
+}
+
+
+/* {{{ open */
+
+afr_fd_ctx_t *
+__afr_fd_ctx_get (fd_t *fd, xlator_t *this)
+{
+ uint64_t ctx = 0;
+ int ret = 0;
+ afr_fd_ctx_t *fd_ctx = NULL;
+
+ ret = __fd_ctx_get (fd, this, &ctx);
+
+ if (ret < 0) {
+ ret = __afr_fd_ctx_set (this, fd);
+ if (ret < 0)
+ goto out;
+
+ ret = __fd_ctx_get (fd, this, &ctx);
+ if (ret < 0)
+ goto out;
+ }
+
+ fd_ctx = (afr_fd_ctx_t *)(long) ctx;
+out:
+ return fd_ctx;
+}
+
+
+afr_fd_ctx_t *
+afr_fd_ctx_get (fd_t *fd, xlator_t *this)
+{
+ afr_fd_ctx_t *fd_ctx = NULL;
+
+ LOCK(&fd->lock);
+ {
+ fd_ctx = __afr_fd_ctx_get (fd, this);
+ }
+ UNLOCK(&fd->lock);
+
+ return fd_ctx;
+}
+
+
+int
+__afr_fd_ctx_set (xlator_t *this, fd_t *fd)
+{
+ afr_private_t * priv = NULL;
+ int ret = -1;
+ uint64_t ctx = 0;
+ afr_fd_ctx_t * fd_ctx = NULL;
+ int i = 0;
+
+ VALIDATE_OR_GOTO (this->private, out);
+ VALIDATE_OR_GOTO (fd, out);
+
+ priv = this->private;
+
+ ret = __fd_ctx_get (fd, this, &ctx);
+
+ if (ret == 0)
+ goto out;
+
+ fd_ctx = GF_CALLOC (1, sizeof (afr_fd_ctx_t),
+ gf_afr_mt_afr_fd_ctx_t);
+ if (!fd_ctx) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) {
+ fd_ctx->pre_op_done[i] = GF_CALLOC (sizeof (*fd_ctx->pre_op_done[i]),
+ priv->child_count,
+ gf_afr_mt_int32_t);
+ if (!fd_ctx->pre_op_done[i]) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+
+ fd_ctx->opened_on = GF_CALLOC (sizeof (*fd_ctx->opened_on),
+ priv->child_count,
+ gf_afr_mt_int32_t);
+ if (!fd_ctx->opened_on) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (fd_is_anonymous (fd))
+ fd_ctx->opened_on[i] = AFR_FD_OPENED;
+ else
+ fd_ctx->opened_on[i] = AFR_FD_NOT_OPENED;
+ }
+
+ fd_ctx->lock_piggyback = GF_CALLOC (sizeof (*fd_ctx->lock_piggyback),
+ priv->child_count,
+ gf_afr_mt_char);
+ if (!fd_ctx->lock_piggyback) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ fd_ctx->lock_acquired = GF_CALLOC (sizeof (*fd_ctx->lock_acquired),
+ priv->child_count,
+ gf_afr_mt_char);
+ if (!fd_ctx->lock_acquired) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ pthread_mutex_init (&fd_ctx->delay_lock, NULL);
+
+ INIT_LIST_HEAD (&fd_ctx->eager_locked);
+
+ ret = __fd_ctx_set (fd, this, (uint64_t)(long) fd_ctx);
+ if (ret)
+ gf_log (this->name, GF_LOG_DEBUG,
+ "failed to set fd ctx (%p)", fd);
+out:
+ return ret;
+}
+
+
+int
+afr_fd_ctx_set (xlator_t *this, fd_t *fd)
+{
+ int ret = -1;
+
+ LOCK (&fd->lock);
+ {
+ ret = __afr_fd_ctx_set (this, fd);
+ }
+ UNLOCK (&fd->lock);
+
+ return ret;
+}
+
+/* {{{ flush */
+
+int
+afr_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int call_count = -1;
+
+ local = frame->local;
+
+ LOCK (&frame->lock);
+ {
+ if (op_ret != -1) {
+ local->op_ret = op_ret;
+ if (!local->xdata_rsp && xdata)
+ local->xdata_rsp = dict_ref (xdata);
+ } else {
+ local->op_errno = op_errno;
+ }
+ }
+ UNLOCK (&frame->lock);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0)
+ AFR_STACK_UNWIND (flush, frame, local->op_ret,
+ local->op_errno, local->xdata_rsp);
+
+ return 0;
+}
+
+static int
+afr_flush_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+ int i = 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int call_count = -1;
+
+ priv = this->private;
+ local = frame->local;
+ call_count = local->call_count;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND_COOKIE (frame, afr_flush_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->flush,
+ local->fd, xdata);
+ if (!--call_count)
+ break;
+
+ }
+ }
+
+ return 0;
+}
+
+int
+afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ call_stub_t *stub = NULL;
+ int op_errno = ENOMEM;
+
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
+
+ if (!local->call_count) {
+ op_errno = ENOTCONN;
+ goto out;
+ }
+
+ local->fd = fd_ref(fd);
+
+ stub = fop_flush_stub (frame, afr_flush_wrapper, fd, xdata);
+ if (!stub)
+ goto out;
+
+ afr_delayed_changelog_wake_resume (this, fd, stub);
+
+ return 0;
+out:
+ AFR_STACK_UNWIND (flush, frame, -1, op_errno, NULL);
+ return 0;
+}
+
+/* }}} */
+
+
+int
+afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd)
+{
+ uint64_t ctx = 0;
+ afr_fd_ctx_t *fd_ctx = NULL;
+ int ret = 0;
+ int i = 0;
+
+ ret = fd_ctx_get (fd, this, &ctx);
+ if (ret < 0)
+ goto out;
+
+ fd_ctx = (afr_fd_ctx_t *)(long) ctx;
+
+ if (fd_ctx) {
+ //no need to take any locks
+ if (!list_empty (&fd_ctx->eager_locked))
+ gf_log (this->name, GF_LOG_WARNING, "%s: Stale "
+ "Eager-lock stubs found",
+ uuid_utoa (fd->inode->gfid));
+
+ for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++)
+ GF_FREE (fd_ctx->pre_op_done[i]);
+
+ GF_FREE (fd_ctx->opened_on);
+
+ GF_FREE (fd_ctx->lock_piggyback);
+
+ GF_FREE (fd_ctx->lock_acquired);
+
+ pthread_mutex_destroy (&fd_ctx->delay_lock);
+
+ GF_FREE (fd_ctx);
+ }
+
+out:
+ return 0;
+}
+
+
+int
+afr_release (xlator_t *this, fd_t *fd)
+{
+ afr_cleanup_fd_ctx (this, fd);
+
+ return 0;
+}
+
+
+/* {{{ fsync */
+
+int
+afr_fsync_unwind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ AFR_STACK_UNWIND (fsync, frame, op_ret, op_errno, prebuf, postbuf,
+ xdata);
+ return 0;
+}
+
+int
+afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int call_count = -1;
+ int child_index = (long) cookie;
+ int read_subvol = 0;
+ call_stub_t *stub = NULL;
+
+ local = frame->local;
+
+ read_subvol = afr_data_subvol_get (local->inode, this, 0, 0);
+
+ LOCK (&frame->lock);
+ {
+ if (op_ret == 0) {
+ if (local->op_ret == -1) {
+ local->op_ret = 0;
+
+ local->cont.inode_wfop.prebuf = *prebuf;
+ local->cont.inode_wfop.postbuf = *postbuf;
+
+ if (xdata)
+ local->xdata_rsp = dict_ref (xdata);
+ }
+
+ if (child_index == read_subvol) {
+ local->cont.inode_wfop.prebuf = *prebuf;
+ local->cont.inode_wfop.postbuf = *postbuf;
+ if (xdata) {
+ if (local->xdata_rsp)
+ dict_unref (local->xdata_rsp);
+ local->xdata_rsp = dict_ref (xdata);
+ }
+ }
+ } else {
+ local->op_errno = op_errno;
+ }
+ }
+ UNLOCK (&frame->lock);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0) {
+ /* Make a stub out of the frame, and register it
+ with the waking up post-op. When the call-stub resumes,
+ we are guaranteed that there was no post-op pending
+ (i.e changelogs were unset in the server). This is an
+ essential "guarantee", that fsync() returns only after
+ completely finishing EVERYTHING, including the delayed
+ post-op. This guarantee is expected by FUSE graph switching
+ for example.
+ */
+ stub = fop_fsync_cbk_stub (frame, afr_fsync_unwind_cbk,
+ local->op_ret, local->op_errno,
+ &local->cont.inode_wfop.prebuf,
+ &local->cont.inode_wfop.postbuf,
+ local->xdata_rsp);
+ if (!stub) {
+ AFR_STACK_UNWIND (fsync, frame, -1, ENOMEM, 0, 0, 0);
+ return 0;
+ }
+
+ /* If no new unstable writes happened between the
+ time we cleared the unstable write witness flag in afr_fsync
+ and now, calling afr_delayed_changelog_wake_up() should
+ wake up and skip over the fsync phase and go straight to
+ afr_changelog_post_op_now()
+ */
+ afr_delayed_changelog_wake_resume (this, local->fd, stub);
+ }
+
+ return 0;
+}
+
+
+int
+afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
+ dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int i = 0;
+ int32_t call_count = 0;
+ int32_t op_errno = ENOMEM;
+
+ priv = this->private;
+
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
+
+ call_count = local->call_count;
+ if (!call_count) {
+ op_errno = ENOTCONN;
+ goto out;
+ }
+
+ local->fd = fd_ref (fd);
+
+ if (afr_fd_has_witnessed_unstable_write (this, fd)) {
+ /* don't care. we only wanted to CLEAR the bit */
+ }
+
+ local->inode = inode_ref (fd->inode);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND_COOKIE (frame, afr_fsync_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->fsync,
+ fd, datasync, xdata);
+ if (!--call_count)
+ break;
+ }
+ }
+
+ return 0;
+out:
+ AFR_STACK_UNWIND (fsync, frame, -1, op_errno, NULL, NULL, NULL);
+
+ return 0;
+}
+
+/* }}} */
+
+/* {{{ fsync */
+
+int
+afr_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int call_count = -1;
+
+ local = frame->local;
+
+ LOCK (&frame->lock);
+ {
+ if (op_ret == 0) {
+ local->op_ret = 0;
+ if (!local->xdata_rsp && xdata)
+ local->xdata_rsp = dict_ref (xdata);
+ } else {
+ local->op_errno = op_errno;
+ }
+ }
+ UNLOCK (&frame->lock);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0)
+ AFR_STACK_UNWIND (fsyncdir, frame, local->op_ret,
+ local->op_errno, local->xdata_rsp);
+
+ return 0;
+}
+
+
+int
+afr_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
+ dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int i = 0;
+ int32_t call_count = 0;
+ int32_t op_errno = ENOMEM;
+
+ priv = this->private;
+
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
+
+ call_count = local->call_count;
+ if (!call_count) {
+ op_errno = ENOTCONN;
+ goto out;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND (frame, afr_fsyncdir_cbk,
+ priv->children[i],
+ priv->children[i]->fops->fsyncdir,
+ fd, datasync, xdata);
+ if (!--call_count)
+ break;
+ }
+ }
+
+ return 0;
+out:
+ AFR_STACK_UNWIND (fsyncdir, frame, -1, op_errno, NULL);
+
+ return 0;
+}
+
+/* }}} */
+
+/* {{{ xattrop */
+
+int32_t
+afr_xattrop_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno,
+ dict_t *xattr, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int call_count = -1;
+
+ local = frame->local;
+
+ LOCK (&frame->lock);
+ {
+ if (op_ret == 0) {
+ if (!local->cont.xattrop.xattr)
+ local->cont.xattrop.xattr = dict_ref (xattr);
+
+ if (!local->xdata_rsp && xdata)
+ local->xdata_rsp = dict_ref (xdata);
+
+ local->op_ret = 0;
+ }
+
+ local->op_errno = op_errno;
+ }
+ UNLOCK (&frame->lock);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0)
+ AFR_STACK_UNWIND (xattrop, frame, local->op_ret, local->op_errno,
+ local->cont.xattrop.xattr, local->xdata_rsp);
+
+ return 0;
+}
+
+
+int32_t
+afr_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int i = 0;
+ int32_t call_count = 0;
+ int32_t op_errno = ENOMEM;
+
+ priv = this->private;
+
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
+
+ call_count = local->call_count;
+ if (!call_count) {
+ op_errno = ENOTCONN;
+ goto out;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND (frame, afr_xattrop_cbk,
+ priv->children[i],
+ priv->children[i]->fops->xattrop,
+ loc, optype, xattr, xdata);
+ if (!--call_count)
+ break;
+ }
+ }
+
+ return 0;
+out:
+ AFR_STACK_UNWIND (xattrop, frame, -1, op_errno, NULL, NULL);
+
+ return 0;
+}
+
+/* }}} */
+
+/* {{{ fxattrop */
+
+int32_t
+afr_fxattrop_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno,
+ dict_t *xattr, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+
+ int call_count = -1;
+
+ local = frame->local;
+
+ LOCK (&frame->lock);
+ {
+ if (op_ret == 0) {
+ if (!local->cont.fxattrop.xattr)
+ local->cont.fxattrop.xattr = dict_ref (xattr);
+
+ if (!local->xdata_rsp && xdata)
+ local->xdata_rsp = dict_ref (xdata);
+ local->op_ret = 0;
+ }
+
+ local->op_errno = op_errno;
+ }
+ UNLOCK (&frame->lock);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0)
+ AFR_STACK_UNWIND (fxattrop, frame, local->op_ret, local->op_errno,
+ local->cont.fxattrop.xattr, local->xdata_rsp);
+
+ return 0;
+}
+
+
+int32_t
+afr_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int i = 0;
+ int32_t call_count = 0;
+ int32_t op_errno = 0;
+
+ priv = this->private;
+
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
+
+ call_count = local->call_count;
+ if (!call_count) {
+ op_errno = ENOTCONN;
+ goto out;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND (frame, afr_fxattrop_cbk,
+ priv->children[i],
+ priv->children[i]->fops->fxattrop,
+ fd, optype, xattr, xdata);
+ if (!--call_count)
+ break;
+ }
+ }
+
+ return 0;
+out:
+ AFR_STACK_UNWIND (fxattrop, frame, -1, op_errno, NULL, NULL);
+
+ return 0;
+}
+
+/* }}} */
+
+
+int32_t
+afr_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+
+{
+ afr_local_t *local = NULL;
+ int call_count = -1;
+
+ local = frame->local;
+
+ LOCK (&frame->lock);
+ {
+ if (op_ret == 0)
+ local->op_ret = 0;
+
+ local->op_errno = op_errno;
+ }
+ UNLOCK (&frame->lock);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0)
+ AFR_STACK_UNWIND (inodelk, frame, local->op_ret,
+ local->op_errno, xdata);
+
+ return 0;
+}
+
+
+int32_t
+afr_inodelk (call_frame_t *frame, xlator_t *this,
+ const char *volume, loc_t *loc, int32_t cmd,
+ struct gf_flock *flock, dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int i = 0;
+ int32_t call_count = 0;
+ int32_t op_errno = ENOMEM;
+
+ priv = this->private;
+
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
+
+ call_count = local->call_count;
+ if (!call_count) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND (frame, afr_inodelk_cbk,
+ priv->children[i],
+ priv->children[i]->fops->inodelk,
+ volume, loc, cmd, flock, xdata);
+
+ if (!--call_count)
+ break;
+ }
+ }
+
+ return 0;
+out:
+ AFR_STACK_UNWIND (inodelk, frame, -1, op_errno, NULL);
+
+ return 0;
+}
+
+
+int32_t
+afr_finodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+
+{
+ afr_local_t *local = NULL;
+ int call_count = -1;
+
+ local = frame->local;
+
+ LOCK (&frame->lock);
+ {
+ if (op_ret == 0)
+ local->op_ret = 0;
+
+ local->op_errno = op_errno;
+ }
+ UNLOCK (&frame->lock);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0)
+ AFR_STACK_UNWIND (finodelk, frame, local->op_ret,
+ local->op_errno, xdata);
+
+ return 0;
+}
+
+
+int32_t
+afr_finodelk (call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd,
+ int32_t cmd, struct gf_flock *flock, dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int i = 0;
+ int32_t call_count = 0;
+ int32_t op_errno = ENOMEM;
+
+ priv = this->private;
+
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
+
+ call_count = local->call_count;
+ if (!call_count) {
+ op_errno = ENOTCONN;
+ goto out;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND (frame, afr_finodelk_cbk,
+ priv->children[i],
+ priv->children[i]->fops->finodelk,
+ volume, fd, cmd, flock, xdata);
+
+ if (!--call_count)
+ break;
+ }
+ }
+
+ return 0;
+out:
+ AFR_STACK_UNWIND (finodelk, frame, -1, op_errno, NULL);
+
+ return 0;
+}
+
+
+int32_t
+afr_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int call_count = -1;
+
+ local = frame->local;
+
+ LOCK (&frame->lock);
+ {
+ if (op_ret == 0)
+ local->op_ret = 0;
+
+ local->op_errno = op_errno;
+ }
+ UNLOCK (&frame->lock);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0)
+ AFR_STACK_UNWIND (entrylk, frame, local->op_ret,
+ local->op_errno, xdata);
+
+ return 0;
+}
+
+
+int
+afr_entrylk (call_frame_t *frame, xlator_t *this, const char *volume,
+ loc_t *loc, const char *basename, entrylk_cmd cmd,
+ entrylk_type type, dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int i = 0;
+ int32_t call_count = 0;
+ int32_t op_errno = 0;
+
+ priv = this->private;
+
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
+
+ call_count = local->call_count;
+ if (!call_count) {
+ op_errno = ENOTCONN;
+ goto out;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND (frame, afr_entrylk_cbk,
+ priv->children[i],
+ priv->children[i]->fops->entrylk,
+ volume, loc, basename, cmd, type, xdata);
+
+ if (!--call_count)
+ break;
+ }
+ }
+
+ return 0;
+out:
+ AFR_STACK_UNWIND (entrylk, frame, -1, op_errno, NULL);
+
+ return 0;
+}
+
+
+
+int
+afr_fentrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+
+{
+ afr_local_t *local = NULL;
+ int call_count = -1;
+
+ local = frame->local;
+
+ LOCK (&frame->lock);
+ {
+ if (op_ret == 0)
+ local->op_ret = 0;
+
+ local->op_errno = op_errno;
+ }
+ UNLOCK (&frame->lock);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0)
+ AFR_STACK_UNWIND (fentrylk, frame, local->op_ret,
+ local->op_errno, xdata);
+
+ return 0;
+}
+
+
+int
+afr_fentrylk (call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd,
+ const char *basename, entrylk_cmd cmd, entrylk_type type,
+ dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int i = 0;
+ int32_t call_count = 0;
+ int32_t op_errno = ENOMEM;
+
+ priv = this->private;
+
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
+
+ call_count = local->call_count;
+ if (!call_count) {
+ op_errno = ENOTCONN;
+ goto out;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND (frame, afr_fentrylk_cbk,
+ priv->children[i],
+ priv->children[i]->fops->fentrylk,
+ volume, fd, basename, cmd, type, xdata);
+
+ if (!--call_count)
+ break;
+ }
+ }
+
+ return 0;
+out:
+ AFR_STACK_UNWIND (fentrylk, frame, -1, op_errno, NULL);
+
+ return 0;
+}
+
+
+int
+afr_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, struct statvfs *statvfs, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int call_count = 0;
+ struct statvfs *buf = NULL;
+
+ LOCK (&frame->lock);
+ {
+ local = frame->local;
+
+ if (op_ret != 0) {
+ local->op_errno = op_errno;
+ goto unlock;
+ }
+
+ local->op_ret = op_ret;
+
+ buf = &local->cont.statfs.buf;
+ if (local->cont.statfs.buf_set) {
+ if (statvfs->f_bavail < buf->f_bavail) {
+ *buf = *statvfs;
+ if (xdata) {
+ if (local->xdata_rsp)
+ dict_unref (local->xdata_rsp);
+ local->xdata_rsp = dict_ref (xdata);
+ }
+ }
+ } else {
+ *buf = *statvfs;
+ local->cont.statfs.buf_set = 1;
+ if (xdata)
+ local->xdata_rsp = dict_ref (xdata);
+ }
+ }
+unlock:
+ UNLOCK (&frame->lock);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0)
+ AFR_STACK_UNWIND (statfs, frame, local->op_ret, local->op_errno,
+ &local->cont.statfs.buf, local->xdata_rsp);
+
+ return 0;
+}
+
+
+int
+afr_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+ afr_local_t * local = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ int call_count = 0;
+ int32_t op_errno = ENOMEM;
+
+ priv = this->private;
+
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
+
+ call_count = local->call_count;
+ if (!call_count) {
+ op_errno = ENOTCONN;
+ goto out;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND (frame, afr_statfs_cbk,
+ priv->children[i],
+ priv->children[i]->fops->statfs,
+ loc, xdata);
+ if (!--call_count)
+ break;
+ }
+ }
+
+ return 0;
+out:
+ AFR_STACK_UNWIND (statfs, frame, -1, op_errno, NULL, NULL);
+
+ return 0;
+}
+
+
+int32_t
+afr_lk_unlock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct gf_flock *lock,
+ dict_t *xdata)
+{
+ afr_local_t * local = NULL;
+ int call_count = -1;
+
+ local = frame->local;
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0)
+ AFR_STACK_UNWIND (lk, frame, local->op_ret, local->op_errno,
+ lock, xdata);
+
+ return 0;
+}
+
+
+int32_t
+afr_lk_unlock (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+ int i = 0;
+ int call_count = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ call_count = afr_locked_nodes_count (local->cont.lk.locked_nodes,
+ priv->child_count);
+
+ if (call_count == 0) {
+ AFR_STACK_UNWIND (lk, frame, local->op_ret, local->op_errno,
+ &local->cont.lk.ret_flock, NULL);
+ return 0;
+ }
+
+ local->call_count = call_count;
+
+ local->cont.lk.user_flock.l_type = F_UNLCK;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->cont.lk.locked_nodes[i]) {
+ STACK_WIND (frame, afr_lk_unlock_cbk,
+ priv->children[i],
+ priv->children[i]->fops->lk,
+ local->fd, F_SETLK,
+ &local->cont.lk.user_flock, NULL);
+
+ if (!--call_count)
+ break;
+ }
+ }
+
+ return 0;
+}
+
+
+int32_t
+afr_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct gf_flock *lock, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int child_index = -1;
+/* int ret = 0; */
+
+
+ local = frame->local;
+ priv = this->private;
+
+ child_index = (long) cookie;
+
+ if (!child_went_down (op_ret, op_errno) && (op_ret == -1)) {
+ local->op_ret = -1;
+ local->op_errno = op_errno;
+
+ afr_lk_unlock (frame, this);
+ return 0;
+ }
+
+ if (op_ret == 0) {
+ local->op_ret = 0;
+ local->op_errno = 0;
+ local->cont.lk.locked_nodes[child_index] = 1;
+ local->cont.lk.ret_flock = *lock;
+ }
+
+ child_index++;
+
+ if (child_index < priv->child_count) {
+ STACK_WIND_COOKIE (frame, afr_lk_cbk, (void *) (long) child_index,
+ priv->children[child_index],
+ priv->children[child_index]->fops->lk,
+ local->fd, local->cont.lk.cmd,
+ &local->cont.lk.user_flock, xdata);
+ } else if (local->op_ret == -1) {
+ /* all nodes have gone down */
+
+ AFR_STACK_UNWIND (lk, frame, -1, ENOTCONN,
+ &local->cont.lk.ret_flock, NULL);
+ } else {
+ AFR_STACK_UNWIND (lk, frame, local->op_ret, local->op_errno,
+ &local->cont.lk.ret_flock, NULL);
+ }
+
+ return 0;
+}
+
+
+int
+afr_lk (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, int32_t cmd, struct gf_flock *flock, dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int i = 0;
+ int32_t op_errno = ENOMEM;
+
+ priv = this->private;
+
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
+
+ local->cont.lk.locked_nodes = GF_CALLOC (priv->child_count,
+ sizeof (*local->cont.lk.locked_nodes),
+ gf_afr_mt_char);
+
+ if (!local->cont.lk.locked_nodes) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ local->fd = fd_ref (fd);
+ local->cont.lk.cmd = cmd;
+ local->cont.lk.user_flock = *flock;
+ local->cont.lk.ret_flock = *flock;
+
+ STACK_WIND_COOKIE (frame, afr_lk_cbk, (void *) (long) 0,
+ priv->children[i],
+ priv->children[i]->fops->lk,
+ fd, cmd, flock, xdata);
+
+ return 0;
+out:
+ AFR_STACK_UNWIND (lk, frame, -1, op_errno, NULL, NULL);
+
+ return 0;
+}
+
+int
+afr_forget (xlator_t *this, inode_t *inode)
+{
+ return 0;
+}
+
+int
+afr_priv_dump (xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ char key_prefix[GF_DUMP_MAX_BUF_LEN];
+ char key[GF_DUMP_MAX_BUF_LEN];
+ int i = 0;
+
+
+ GF_ASSERT (this);
+ priv = this->private;
+
+ GF_ASSERT (priv);
+ snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.%s", this->type, this->name);
+ gf_proc_dump_add_section(key_prefix);
+ gf_proc_dump_write("child_count", "%u", priv->child_count);
+ for (i = 0; i < priv->child_count; i++) {
+ sprintf (key, "child_up[%d]", i);
+ gf_proc_dump_write(key, "%d", priv->child_up[i]);
+ sprintf (key, "pending_key[%d]", i);
+ gf_proc_dump_write(key, "%s", priv->pending_key[i]);
+ }
+ gf_proc_dump_write("data_self_heal", "%s", priv->data_self_heal);
+ gf_proc_dump_write("metadata_self_heal", "%d", priv->metadata_self_heal);
+ gf_proc_dump_write("entry_self_heal", "%d", priv->entry_self_heal);
+ gf_proc_dump_write("data_change_log", "%d", priv->data_change_log);
+ gf_proc_dump_write("metadata_change_log", "%d", priv->metadata_change_log);
+ gf_proc_dump_write("entry-change_log", "%d", priv->entry_change_log);
+ gf_proc_dump_write("read_child", "%d", priv->read_child);
+ gf_proc_dump_write("favorite_child", "%d", priv->favorite_child);
+ gf_proc_dump_write("wait_count", "%u", priv->wait_count);
+
+ return 0;
+}
+
+
+/**
+ * find_child_index - find the child's index in the array of subvolumes
+ * @this: AFR
+ * @child: child
+ */
+
+static int
+find_child_index (xlator_t *this, xlator_t *child)
+{
+ afr_private_t *priv = NULL;
+ int i = -1;
+
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if ((xlator_t *) child == priv->children[i])
+ break;
+ }
+
+ return i;
+}
+
+int32_t
+afr_notify (xlator_t *this, int32_t event,
+ void *data, void *data2)
+{
+ afr_private_t *priv = NULL;
+ int i = -1;
+ int up_children = 0;
+ int down_children = 0;
+ int propagate = 0;
+ int had_heard_from_all = 0;
+ int have_heard_from_all = 0;
+ int idx = -1;
+ int ret = -1;
+ int call_psh = 0;
+ int up_child = -1;
+ dict_t *input = NULL;
+ dict_t *output = NULL;
+
+ priv = this->private;
+
+ if (!priv)
+ return 0;
+
+ /*
+ * We need to reset this in case children come up in "staggered"
+ * fashion, so that we discover a late-arriving local subvolume. Note
+ * that we could end up issuing N lookups to the first subvolume, and
+ * O(N^2) overall, but N is small for AFR so it shouldn't be an issue.
+ */
+ priv->did_discovery = _gf_false;
+
+ had_heard_from_all = 1;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!priv->last_event[i]) {
+ had_heard_from_all = 0;
+ }
+ }
+
+ /* parent xlators dont need to know about every child_up, child_down
+ * because of afr ha. If all subvolumes go down, child_down has
+ * to be triggered. In that state when 1 subvolume comes up child_up
+ * needs to be triggered. dht optimizes revalidate lookup by sending
+ * it only to one of its subvolumes. When child up/down happens
+ * for afr's subvolumes dht should be notified by child_modified. The
+ * subsequent revalidate lookup happens on all the dht's subvolumes
+ * which triggers afr self-heals if any.
+ */
+ idx = find_child_index (this, data);
+ if (idx < 0) {
+ gf_log (this->name, GF_LOG_ERROR, "Received child_up "
+ "from invalid subvolume");
+ goto out;
+ }
+
+ switch (event) {
+ case GF_EVENT_CHILD_UP:
+ LOCK (&priv->lock);
+ {
+ /*
+ * This only really counts if the child was never up
+ * (value = -1) or had been down (value = 0). See
+ * comment at GF_EVENT_CHILD_DOWN for a more detailed
+ * explanation.
+ */
+ if (priv->child_up[idx] != 1) {
+ priv->up_count++;
+ priv->event_generation++;
+ }
+ priv->child_up[idx] = 1;
+
+ call_psh = 1;
+ up_child = idx;
+ for (i = 0; i < priv->child_count; i++)
+ if (priv->child_up[i] == 1)
+ up_children++;
+ if (up_children == 1) {
+ gf_log (this->name, GF_LOG_INFO,
+ "Subvolume '%s' came back up; "
+ "going online.", ((xlator_t *)data)->name);
+ } else {
+ event = GF_EVENT_CHILD_MODIFIED;
+ }
+
+ priv->last_event[idx] = event;
+ }
+ UNLOCK (&priv->lock);
+
+ break;
+
+ case GF_EVENT_CHILD_DOWN:
+ LOCK (&priv->lock);
+ {
+ /*
+ * If a brick is down when we start, we'll get a
+ * CHILD_DOWN to indicate its initial state. There
+ * was never a CHILD_UP in this case, so if we
+ * increment "down_count" the difference between than
+ * and "up_count" will no longer be the number of
+ * children that are currently up. This has serious
+ * implications e.g. for quorum enforcement, so we
+ * don't increment these values unless the event
+ * represents an actual state transition between "up"
+ * (value = 1) and anything else.
+ */
+ if (priv->child_up[idx] == 1) {
+ priv->down_count++;
+ priv->event_generation++;
+ }
+ priv->child_up[idx] = 0;
+
+ for (i = 0; i < priv->child_count; i++)
+ if (priv->child_up[i] == 0)
+ down_children++;
+ if (down_children == priv->child_count) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "All subvolumes are down. Going offline "
+ "until atleast one of them comes back up.");
+ } else {
+ event = GF_EVENT_CHILD_MODIFIED;
+ }
+
+ priv->last_event[idx] = event;
+ }
+ UNLOCK (&priv->lock);
+
+ break;
+
+ case GF_EVENT_CHILD_CONNECTING:
+ LOCK (&priv->lock);
+ {
+ priv->last_event[idx] = event;
+ }
+ UNLOCK (&priv->lock);
+
+ break;
+
+ case GF_EVENT_TRANSLATOR_OP:
+ input = data;
+ output = data2;
+ if (!had_heard_from_all) {
+ ret = -1;
+ goto out;
+ }
+ ret = afr_xl_op (this, input, output);
+ goto out;
+ break;
+
+ default:
+ propagate = 1;
+ break;
+ }
+
+ /* have all subvolumes reported status once by now? */
+ have_heard_from_all = 1;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!priv->last_event[i])
+ have_heard_from_all = 0;
+ }
+
+ /* if all subvols have reported status, no need to hide anything
+ or wait for anything else. Just propagate blindly */
+ if (have_heard_from_all)
+ propagate = 1;
+
+ if (!had_heard_from_all && have_heard_from_all) {
+ /* This is the first event which completes aggregation
+ of events from all subvolumes. If at least one subvol
+ had come up, propagate CHILD_UP, but only this time
+ */
+ event = GF_EVENT_CHILD_DOWN;
+
+ LOCK (&priv->lock);
+ {
+ up_children = AFR_COUNT (priv->child_up, priv->child_count);
+ for (i = 0; i < priv->child_count; i++) {
+ if (priv->last_event[i] == GF_EVENT_CHILD_UP) {
+ event = GF_EVENT_CHILD_UP;
+ break;
+ }
+
+ if (priv->last_event[i] ==
+ GF_EVENT_CHILD_CONNECTING) {
+ event = GF_EVENT_CHILD_CONNECTING;
+ /* continue to check other events for CHILD_UP */
+ }
+ }
+ }
+ UNLOCK (&priv->lock);
+ }
+
+ ret = 0;
+ if (propagate)
+ ret = default_notify (this, event, data);
+
+ if (call_psh && priv->shd.iamshd) {
+ afr_selfheal_childup (this, up_child);
+ }
+out:
+ return ret;
+}
+
+
+int
+afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno)
+{
+ local->op_ret = -1;
+ local->op_errno = EUCLEAN;
+
+ syncbarrier_init (&local->barrier);
+
+ local->child_up = GF_CALLOC (priv->child_count,
+ sizeof (*local->child_up),
+ gf_afr_mt_char);
+ if (!local->child_up) {
+ if (op_errno)
+ *op_errno = ENOMEM;
+ goto out;
+ }
+
+ memcpy (local->child_up, priv->child_up,
+ sizeof (*local->child_up) * priv->child_count);
+ local->call_count = AFR_COUNT (local->child_up, priv->child_count);
+ if (local->call_count == 0) {
+ gf_log (THIS->name, GF_LOG_INFO, "no subvolumes up");
+ if (op_errno)
+ *op_errno = ENOTCONN;
+ goto out;
+ }
+ local->event_generation = priv->event_generation;
+
+ local->read_attempted = GF_CALLOC (priv->child_count, sizeof (char),
+ gf_afr_mt_char);
+ if (!local->read_attempted) {
+ if (op_errno)
+ *op_errno = ENOMEM;
+ goto out;
+ }
+
+ local->readable = GF_CALLOC (priv->child_count, sizeof (char),
+ gf_afr_mt_char);
+ if (!local->readable) {
+ if (op_errno)
+ *op_errno = ENOMEM;
+ goto out;
+ }
+
+ local->replies = GF_CALLOC(priv->child_count, sizeof(*local->replies),
+ gf_afr_mt_reply_t);
+ if (!local->replies) {
+ if (op_errno)
+ *op_errno = ENOMEM;
+ goto out;
+ }
+
+ return 0;
+out:
+ return -1;
+}
+
+int
+afr_internal_lock_init (afr_internal_lock_t *lk, size_t child_count,
+ transaction_lk_type_t lk_type)
+{
+ int ret = -ENOMEM;
+
+ lk->locked_nodes = GF_CALLOC (sizeof (*lk->locked_nodes),
+ child_count, gf_afr_mt_char);
+ if (NULL == lk->locked_nodes)
+ goto out;
+
+ lk->lower_locked_nodes = GF_CALLOC (sizeof (*lk->lower_locked_nodes),
+ child_count, gf_afr_mt_char);
+ if (NULL == lk->lower_locked_nodes)
+ goto out;
+
+ lk->lock_op_ret = -1;
+ lk->lock_op_errno = EUCLEAN;
+ lk->transaction_lk_type = lk_type;
+
+ ret = 0;
+out:
+ return ret;
+}
+
+void
+afr_matrix_cleanup (int32_t **matrix, unsigned int m)
+{
+ int i = 0;
+
+ if (!matrix)
+ goto out;
+ for (i = 0; i < m; i++) {
+ GF_FREE (matrix[i]);
+ }
+
+ GF_FREE (matrix);
+out:
+ return;
+}
+
+int32_t**
+afr_matrix_create (unsigned int m, unsigned int n)
+{
+ int32_t **matrix = NULL;
+ int i = 0;
+
+ matrix = GF_CALLOC (sizeof (*matrix), m, gf_afr_mt_int32_t);
+ if (!matrix)
+ goto out;
+
+ for (i = 0; i < m; i++) {
+ matrix[i] = GF_CALLOC (sizeof (*matrix[i]), n,
+ gf_afr_mt_int32_t);
+ if (!matrix[i])
+ goto out;
+ }
+ return matrix;
+out:
+ afr_matrix_cleanup (matrix, m);
+ return NULL;
+}
+
+int
+afr_inodelk_init (afr_inodelk_t *lk, char *dom, size_t child_count)
+{
+ int ret = -ENOMEM;
+
+ lk->domain = dom;
+ lk->locked_nodes = GF_CALLOC (sizeof (*lk->locked_nodes),
+ child_count, gf_afr_mt_char);
+ if (NULL == lk->locked_nodes)
+ goto out;
+ ret = 0;
+out:
+ return ret;
+}
+
+int
+afr_transaction_local_init (afr_local_t *local, xlator_t *this)
+{
+ int child_up_count = 0;
+ int ret = -ENOMEM;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+ ret = afr_internal_lock_init (&local->internal_lock, priv->child_count,
+ AFR_TRANSACTION_LK);
+ if (ret < 0)
+ goto out;
+
+ if ((local->transaction.type == AFR_DATA_TRANSACTION) ||
+ (local->transaction.type == AFR_METADATA_TRANSACTION)) {
+ ret = afr_inodelk_init (&local->internal_lock.inodelk[0],
+ this->name, priv->child_count);
+ if (ret < 0)
+ goto out;
+ }
+
+ ret = -ENOMEM;
+ child_up_count = AFR_COUNT (local->child_up, priv->child_count);
+ if (priv->optimistic_change_log && child_up_count == priv->child_count)
+ local->optimistic_change_log = 1;
+
+ local->pre_op_compat = priv->pre_op_compat;
+
+ local->transaction.eager_lock =
+ GF_CALLOC (sizeof (*local->transaction.eager_lock),
+ priv->child_count,
+ gf_afr_mt_int32_t);
+
+ if (!local->transaction.eager_lock)
+ goto out;
+
+ local->transaction.pre_op = GF_CALLOC (sizeof (*local->transaction.pre_op),
+ priv->child_count,
+ gf_afr_mt_char);
+ if (!local->transaction.pre_op)
+ goto out;
+
+ local->transaction.fop_subvols = GF_CALLOC (sizeof (*local->transaction.fop_subvols),
+ priv->child_count,
+ gf_afr_mt_char);
+ if (!local->transaction.fop_subvols)
+ goto out;
+
+ local->transaction.failed_subvols = GF_CALLOC (sizeof (*local->transaction.failed_subvols),
+ priv->child_count,
+ gf_afr_mt_char);
+ if (!local->transaction.failed_subvols)
+ goto out;
+
+ local->pending = afr_matrix_create (priv->child_count,
+ AFR_NUM_CHANGE_LOGS);
+ if (!local->pending)
+ goto out;
+
+ INIT_LIST_HEAD (&local->transaction.eager_locked);
+
+ ret = 0;
+out:
+ return ret;
+}
+
+
+void
+afr_set_low_priority (call_frame_t *frame)
+{
+ frame->root->pid = LOW_PRIO_PROC_PID;
+}
+
+
+gf_boolean_t
+afr_have_quorum (char *logname, afr_private_t *priv)
+{
+ unsigned int quorum = 0;
+
+ GF_VALIDATE_OR_GOTO(logname,priv,out);
+
+ quorum = priv->quorum_count;
+ if (quorum != AFR_QUORUM_AUTO) {
+ return (priv->up_count >= (priv->down_count + quorum));
+ }
+
+ quorum = priv->child_count / 2 + 1;
+ if (priv->up_count >= (priv->down_count + quorum)) {
+ return _gf_true;
+ }
+
+ /*
+ * Special case for even numbers of nodes: if we have exactly half
+ * and that includes the first ("senior-most") node, then that counts
+ * as quorum even if it wouldn't otherwise. This supports e.g. N=2
+ * while preserving the critical property that there can only be one
+ * such group.
+ */
+ if ((priv->child_count % 2) == 0) {
+ quorum = priv->child_count / 2;
+ if (priv->up_count >= (priv->down_count + quorum)) {
+ if (priv->child_up[0]) {
+ return _gf_true;
+ }
+ }
+ }
+
+out:
+ return _gf_false;
+}
+
+void
+afr_priv_destroy (afr_private_t *priv)
+{
+ int i = 0;
+
+ if (!priv)
+ goto out;
+ inode_unref (priv->root_inode);
+ GF_FREE (priv->last_event);
+ if (priv->pending_key) {
+ for (i = 0; i < priv->child_count; i++)
+ GF_FREE (priv->pending_key[i]);
+ }
+ GF_FREE (priv->pending_key);
+ GF_FREE (priv->children);
+ GF_FREE (priv->child_up);
+ LOCK_DESTROY (&priv->lock);
+
+ GF_FREE (priv);
+out:
+ return;
+}
+
+int
+xlator_subvolume_count (xlator_t *this)
+{
+ int i = 0;
+ xlator_list_t *list = NULL;
+
+ for (list = this->children; list; list = list->next)
+ i++;
+ return i;
+}
+
+
+void
+afr_handle_open_fd_count (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_fd_ctx_t *fd_ctx = NULL;
+
+ local = frame->local;
+
+ if (!local->fd)
+ return;
+
+ fd_ctx = afr_fd_ctx_get (local->fd, this);
+ if (!fd_ctx)
+ return;
+
+ fd_ctx->open_fd_count = local->open_fd_count;
+}
diff --git a/xlators/cluster/afr/src/afr-dir-read.c b/xlators/cluster/afr/src/afr-dir-read.c
index fd3fc1ba7..fa1da3958 100644
--- a/xlators/cluster/afr/src/afr-dir-read.c
+++ b/xlators/cluster/afr/src/afr-dir-read.c
@@ -1,20 +1,11 @@
/*
- Copyright (c) 2007-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
@@ -24,6 +15,7 @@
#include <sys/time.h>
#include <stdlib.h>
#include <signal.h>
+#include <string.h>
#ifndef _CONFIG_H
#define _CONFIG_H
@@ -42,304 +34,395 @@
#include "common-utils.h"
#include "compat-errno.h"
#include "compat.h"
+#include "checksum.h"
#include "afr.h"
+#include "afr-transaction.h"
int32_t
afr_opendir_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- fd_t *fd)
+ xlator_t *this, int32_t op_ret, int32_t op_errno,
+ fd_t *fd, dict_t *xdata)
{
- afr_local_t * local = NULL;
+ afr_local_t *local = NULL;
+ int call_count = -1;
+ int32_t child_index = 0;
+ afr_fd_ctx_t *fd_ctx = NULL;
+
+ local = frame->local;
+ fd_ctx = local->fd_ctx;
+ child_index = (long) cookie;
+
+ LOCK (&frame->lock);
+ {
+ if (op_ret == -1) {
+ local->op_errno = op_errno;
+ fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED;
+ } else {
+ local->op_ret = op_ret;
+ fd_ctx->opened_on[child_index] = AFR_FD_OPENED;
+ if (!local->xdata_rsp && xdata)
+ local->xdata_rsp = dict_ref (xdata);
+ }
+ }
+ UNLOCK (&frame->lock);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0)
+ AFR_STACK_UNWIND (opendir, frame, local->op_ret,
+ local->op_errno, local->fd, NULL);
+ return 0;
+}
- int call_count = -1;
- LOCK (&frame->lock);
- {
- local = frame->local;
+int
+afr_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd)
+{
+ afr_private_t * priv = NULL;
+ afr_local_t * local = NULL;
+ int i = 0;
+ int call_count = -1;
+ int32_t op_errno = ENOMEM;
+ afr_fd_ctx_t *fd_ctx = NULL;
- if (op_ret == 0)
- local->op_ret = 0;
+ priv = this->private;
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
- call_count = afr_frame_return (frame);
+ fd_ctx = afr_fd_ctx_get (fd, this);
+ if (!fd_ctx)
+ goto out;
- if (call_count == 0) {
- AFR_STACK_UNWIND (frame, local->op_ret,
- local->op_errno, local->fd);
- }
+ loc_copy (&local->loc, loc);
+
+ local->fd = fd_ref (fd);
+ local->fd_ctx = fd_ctx;
+
+ call_count = local->call_count;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND_COOKIE (frame, afr_opendir_cbk,
+ (void*) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->opendir,
+ loc, fd, NULL);
+
+ if (!--call_count)
+ break;
+ }
+ }
return 0;
+out:
+ AFR_STACK_UNWIND (opendir, frame, -1, op_errno, fd, NULL);
+ return 0;
}
-int32_t
-afr_opendir (call_frame_t *frame, xlator_t *this,
- loc_t *loc, fd_t *fd)
-{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
-
- int child_count = 0;
- int i = 0;
+#define BACKEND_D_OFF_BITS 63
+#define PRESENT_D_OFF_BITS 63
- int ret = -1;
- int call_count = -1;
+#define ONE 1ULL
+#define MASK (~0ULL)
+#define PRESENT_MASK (MASK >> (64 - PRESENT_D_OFF_BITS))
+#define BACKEND_MASK (MASK >> (64 - BACKEND_D_OFF_BITS))
- int32_t op_ret = -1;
- int32_t op_errno = 0;
+#define TOP_BIT (ONE << (PRESENT_D_OFF_BITS - 1))
+#define SHIFT_BITS (max (0, (BACKEND_D_OFF_BITS - PRESENT_D_OFF_BITS + 1)))
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+static uint64_t
+afr_bits_for (uint64_t num)
+{
+ uint64_t bits = 0, ctrl = 1;
- priv = this->private;
+ while (ctrl < num) {
+ ctrl *= 2;
+ bits ++;
+ }
- child_count = priv->child_count;
+ return bits;
+}
- ALLOC_OR_GOTO (local, afr_local_t, out);
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
+int
+afr_itransform (xlator_t *this, int subvol, uint64_t x, uint64_t *y_p)
+{
+ afr_private_t *conf = NULL;
+ int cnt = 0;
+ int max = 0;
+ uint64_t y = 0;
+ uint64_t hi_mask = 0;
+ uint64_t off_mask = 0;
+ int max_bits = 0;
+
+ if (x == ((uint64_t) -1)) {
+ y = (uint64_t) -1;
+ goto out;
+ }
+
+ conf = this->private;
+ if (!conf)
+ goto out;
+
+ max = conf->child_count;
+ cnt = subvol;
+
+ if (max == 1) {
+ y = x;
goto out;
}
- frame->local = local;
- local->fd = fd_ref (fd);
+ max_bits = afr_bits_for (max);
- call_count = local->call_count;
-
- for (i = 0; i < child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND (frame, afr_opendir_cbk,
- priv->children[i],
- priv->children[i]->fops->opendir,
- loc, fd);
+ hi_mask = ~(PRESENT_MASK >> (max_bits + 1));
- if (!--call_count)
- break;
- }
- }
+ if (x & hi_mask) {
+ /* HUGE d_off */
+ off_mask = MASK << max_bits;
+ y = TOP_BIT | ((x >> SHIFT_BITS) & off_mask) | cnt;
+ } else {
+ /* small d_off */
+ y = ((x * max) + cnt);
+ }
- op_ret = 0;
out:
- if (op_ret == -1) {
- AFR_STACK_UNWIND (frame, op_ret, op_errno, fd);
- }
+ if (y_p)
+ *y_p = y;
- return 0;
+ return 0;
}
-/**
- * Common algorithm for directory read calls:
- *
- * - Try the fop on the first child that is up
- * - if we have failed due to ENOTCONN:
- * try the next child
- *
- * Applicable to: readdir
- */
-
-int32_t
-afr_readdir_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- gf_dirent_t *buf)
+int
+afr_deitransform (xlator_t *this, uint64_t y, int *subvol_p,
+ uint64_t *x_p)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- xlator_t ** children = NULL;
+ afr_private_t *conf = NULL;
+ int cnt = 0;
+ int max = 0;
+ uint64_t x = 0;
+ int subvol = 0;
+ int max_bits = 0;
+ uint64_t off_mask = 0;
+ uint64_t host_mask = 0;
+
+ if (!this->private)
+ return -1;
+
+ conf = this->private;
+ max = conf->child_count;
+
+ if (max == 1) {
+ x = y;
+ cnt = 0;
+ goto out;
+ }
- int unwind = 1;
- int last_tried = -1;
- int this_try = -1;
+ if (y & TOP_BIT) {
+ /* HUGE d_off */
+ max_bits = afr_bits_for (max);
+ off_mask = (MASK << max_bits);
+ host_mask = ~(off_mask);
- priv = this->private;
- children = priv->children;
+ x = ((y & ~TOP_BIT) & off_mask) << SHIFT_BITS;
- local = frame->local;
+ cnt = y & host_mask;
+ } else {
+ /* small d_off */
+ cnt = y % max;
+ x = y / max;
+ }
- if (op_ret == -1) {
- last_tried = local->cont.readdir.last_tried;
+out:
+ subvol = cnt;
- if (all_tried (last_tried, priv->child_count)) {
- goto out;
- }
+ if (subvol_p)
+ *subvol_p = subvol;
- this_try = ++local->cont.readdir.last_tried;
- unwind = 0;
+ if (x_p)
+ *x_p = x;
- STACK_WIND (frame, afr_readdir_cbk,
- children[this_try],
- children[this_try]->fops->readdir,
- local->fd, local->cont.readdir.size,
- local->cont.readdir.offset);
- }
+ return 0;
+}
-out:
- if (unwind) {
- AFR_STACK_UNWIND (frame, op_ret, op_errno, buf);
- }
- return 0;
+static void
+afr_readdir_transform_entries (gf_dirent_t *subvol_entries, int subvol,
+ gf_dirent_t *entries, fd_t *fd)
+{
+ afr_private_t *priv = NULL;
+ gf_dirent_t *entry = NULL;
+ gf_dirent_t *tmp = NULL;
+ unsigned char *data_readable = NULL;
+ unsigned char *metadata_readable = NULL;
+ int gen = 0;
+
+ priv = THIS->private;
+
+ data_readable = alloca0 (priv->child_count);
+ metadata_readable = alloca0 (priv->child_count);
+
+ list_for_each_entry_safe (entry, tmp, &subvol_entries->list, list) {
+ if (__is_root_gfid (fd->inode->gfid) &&
+ !strcmp (entry->d_name, GF_REPLICATE_TRASH_DIR)) {
+ continue;
+ }
+
+ list_del_init (&entry->list);
+ afr_itransform (THIS, subvol, entry->d_off, &entry->d_off);
+ list_add_tail (&entry->list, &entries->list);
+
+ if (entry->inode) {
+ gen = 0;
+ afr_inode_read_subvol_get (entry->inode, THIS,
+ data_readable,
+ metadata_readable, &gen);
+
+ if (gen != priv->event_generation ||
+ !data_readable[subvol] ||
+ !metadata_readable[subvol]) {
+
+ inode_unref (entry->inode);
+ entry->inode = NULL;
+ }
+ }
+ }
}
int32_t
-afr_readdir (call_frame_t *frame, xlator_t *this,
- fd_t *fd, size_t size, off_t offset)
+afr_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, gf_dirent_t *subvol_entries,
+ dict_t *xdata)
{
- afr_private_t * priv = NULL;
- xlator_t ** children = NULL;
- int call_child = 0;
- afr_local_t *local = NULL;
+ afr_local_t *local = NULL;
+ gf_dirent_t entries;
- int ret = -1;
+ INIT_LIST_HEAD (&entries.list);
- int32_t op_ret = -1;
- int32_t op_errno = 0;
+ local = frame->local;
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
-
- priv = this->private;
- children = priv->children;
+ if (op_ret < 0 && !local->cont.readdir.offset) {
+ /* failover only if this was first readdir, detected
+ by offset == 0 */
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
- ALLOC_OR_GOTO (local, afr_local_t, out);
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
-
- frame->local = local;
-
- call_child = afr_first_up_child (priv);
- if (call_child == -1) {
- op_errno = ENOTCONN;
- gf_log (this->name, GF_LOG_ERROR,
- "no child is up :(");
- goto out;
+ afr_read_txn_continue (frame, this, (long) cookie);
+ return 0;
}
- local->cont.readdir.last_tried = call_child;
-
- local->fd = fd_ref (fd);
- local->cont.readdir.size = size;
- local->cont.readdir.offset = offset;
+ if (op_ret >= 0)
+ afr_readdir_transform_entries (subvol_entries, (long) cookie,
+ &entries, local->fd);
- STACK_WIND (frame, afr_readdir_cbk,
- children[call_child], children[call_child]->fops->readdir,
- fd, size, offset);
+ AFR_STACK_UNWIND (readdir, frame, op_ret, op_errno, &entries, xdata);
- op_ret = 0;
-out:
- if (op_ret == -1) {
- AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
- }
- return 0;
+ return 0;
}
-int32_t
-afr_getdents_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- dir_entry_t *entry, int32_t count)
+int
+afr_readdir_wind (call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- xlator_t ** children = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- int unwind = 1;
- int last_tried = -1;
- int this_try = -1;
+ priv = this->private;
+ local = frame->local;
- priv = this->private;
- children = priv->children;
+ if (subvol == -1) {
+ AFR_STACK_UNWIND (readdir, frame, local->op_ret,
+ local->op_errno, 0, 0);
+ return 0;
+ }
- local = frame->local;
+ if (local->op == GF_FOP_READDIR)
+ STACK_WIND_COOKIE (frame, afr_readdir_cbk,
+ (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->readdir,
+ local->fd, local->cont.readdir.size,
+ local->cont.readdir.offset,
+ local->xdata_req);
+ else
+ STACK_WIND_COOKIE (frame, afr_readdir_cbk,
+ (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->readdirp,
+ local->fd, local->cont.readdir.size,
+ local->cont.readdir.offset,
+ local->xdata_req);
+ return 0;
+}
- if (op_ret == -1) {
- last_tried = local->cont.getdents.last_tried;
- if (all_tried (last_tried, priv->child_count)) {
- goto out;
- }
+int
+afr_do_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t offset, int whichop, dict_t *dict)
+{
+ afr_local_t *local = NULL;
+ int32_t op_errno = 0;
+ int subvol = -1;
- this_try = ++local->cont.getdents.last_tried;
- unwind = 0;
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
- STACK_WIND (frame, afr_getdents_cbk,
- children[this_try],
- children[this_try]->fops->getdents,
- local->fd, local->cont.getdents.size,
- local->cont.getdents.offset, local->cont.getdents.flag);
+ local->op = whichop;
+ local->fd = fd_ref (fd);
+ local->cont.readdir.size = size;
+ local->cont.readdir.offset = offset;
+ local->xdata_req = (dict)? dict_ref (dict) : NULL;
+
+ if (offset == 0) {
+ /* First readdir has option of failing over and selecting
+ an appropriate read subvolume */
+ afr_read_txn (frame, this, fd->inode, afr_readdir_wind,
+ AFR_DATA_TRANSACTION);
+ } else {
+ /* But continued readdirs MUST stick to the same subvolume
+ without an option to failover */
+ afr_deitransform (this, offset, &subvol,
+ (uint64_t *)&local->cont.readdir.offset);
+ afr_readdir_wind (frame, this, subvol);
}
+ return 0;
out:
- if (unwind) {
- AFR_STACK_UNWIND (frame, op_ret, op_errno, entry, count);
- }
-
- return 0;
+ AFR_STACK_UNWIND (readdir, frame, -1, op_errno, NULL, NULL);
+ return 0;
}
int32_t
-afr_getdents (call_frame_t *frame, xlator_t *this,
- fd_t *fd, size_t size, off_t offset, int32_t flag)
+afr_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t offset, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- xlator_t ** children = NULL;
- int call_child = 0;
- afr_local_t *local = NULL;
-
- int32_t op_ret = -1;
- int32_t op_errno = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
-
- priv = this->private;
- children = priv->children;
+ afr_do_readdir (frame, this, fd, size, offset, GF_FOP_READDIR, xdata);
- ALLOC_OR_GOTO (local, afr_local_t, out);
-
- call_child = afr_first_up_child (priv);
- if (call_child == -1) {
- op_errno = ENOTCONN;
- gf_log (this->name, GF_LOG_ERROR,
- "no child is up :(");
- goto out;
- }
+ return 0;
+}
- local->cont.getdents.last_tried = call_child;
- local->fd = fd_ref (fd);
+int32_t
+afr_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t offset, dict_t *dict)
+{
+ afr_do_readdir (frame, this, fd, size, offset, GF_FOP_READDIRP, dict);
- local->cont.getdents.size = size;
- local->cont.getdents.offset = offset;
- local->cont.getdents.flag = flag;
-
- frame->local = local;
+ return 0;
+}
- STACK_WIND (frame, afr_getdents_cbk,
- children[call_child], children[call_child]->fops->getdents,
- fd, size, offset, flag);
- op_ret = 0;
-out:
- if (op_ret == -1) {
- AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
- }
+int32_t
+afr_releasedir (xlator_t *this, fd_t *fd)
+{
+ afr_cleanup_fd_ctx (this, fd);
- return 0;
+ return 0;
}
-
-
diff --git a/xlators/cluster/afr/src/afr-dir-read.h b/xlators/cluster/afr/src/afr-dir-read.h
index 6d981fdfd..09456d159 100644
--- a/xlators/cluster/afr/src/afr-dir-read.h
+++ b/xlators/cluster/afr/src/afr-dir-read.h
@@ -1,20 +1,11 @@
/*
- Copyright (c) 2007-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
#ifndef __DIR_READ_H__
@@ -23,25 +14,23 @@
int32_t
afr_opendir (call_frame_t *frame, xlator_t *this,
- loc_t *loc, fd_t *fd);
+ loc_t *loc, fd_t *fd, dict_t *xdata);
int32_t
-afr_closedir (call_frame_t *frame, xlator_t *this,
- fd_t *fd);
+afr_releasedir (xlator_t *this, fd_t *fd);
int32_t
afr_readdir (call_frame_t *frame, xlator_t *this,
- fd_t *fd, size_t size, off_t offset);
+ fd_t *fd, size_t size, off_t offset, dict_t *xdata);
int32_t
-afr_getdents (call_frame_t *frame, xlator_t *this,
- fd_t *fd, size_t size, off_t offset, int32_t flag);
-
+afr_readdirp (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, size_t size, off_t offset, dict_t *dict);
int32_t
afr_checksum (call_frame_t *frame, xlator_t *this,
- loc_t *loc, int32_t flags);
+ loc_t *loc, int32_t flags, dict_t *xdata);
#endif /* __DIR_READ_H__ */
diff --git a/xlators/cluster/afr/src/afr-dir-write.c b/xlators/cluster/afr/src/afr-dir-write.c
index e357c7b17..465dde54f 100644
--- a/xlators/cluster/afr/src/afr-dir-write.c
+++ b/xlators/cluster/afr/src/afr-dir-write.c
@@ -1,20 +1,11 @@
/*
- Copyright (c) 2007-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
@@ -43,1880 +34,1446 @@
#include "common-utils.h"
#include "compat-errno.h"
#include "compat.h"
+#include "byte-order.h"
#include "afr.h"
#include "afr-transaction.h"
-
void
-afr_build_parent_loc (loc_t *parent, loc_t *child)
-{
- char *tmp = NULL;
-
- if (!child->parent) {
- loc_copy (parent, child);
- return;
- }
-
- tmp = strdup (child->path);
- parent->path = strdup (dirname (tmp));
- FREE (tmp);
-
- parent->name = strrchr (parent->path, '/');
- if (parent->name)
- parent->name++;
-
- parent->inode = inode_ref (child->parent);
- parent->parent = inode_parent (parent->inode, 0, NULL);
- parent->ino = parent->inode->ino;
-}
-
-/* {{{ create */
+afr_mark_entry_pending_changelog (call_frame_t *frame, xlator_t *this);
int
-afr_create_unwind (call_frame_t *frame, xlator_t *this)
+afr_build_parent_loc (loc_t *parent, loc_t *child, int32_t *op_errno)
{
- call_frame_t *main_frame = NULL;
- afr_local_t *local = NULL;
+ int ret = -1;
+ char *child_path = NULL;
- local = frame->local;
+ if (!child->parent) {
+ if (op_errno)
+ *op_errno = EINVAL;
+ goto out;
+ }
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame) {
- main_frame = local->transaction.main_frame;
- }
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
+ child_path = gf_strdup (child->path);
+ if (!child_path) {
+ if (op_errno)
+ *op_errno = ENOMEM;
+ goto out;
+ }
- if (main_frame) {
- AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno,
- local->cont.create.fd,
- local->cont.create.inode,
- &local->cont.create.buf);
+ parent->path = gf_strdup (dirname (child_path));
+ if (!parent->path) {
+ if (op_errno)
+ *op_errno = ENOMEM;
+ goto out;
}
-
- return 0;
-}
+ parent->inode = inode_ref (child->parent);
+ uuid_copy (parent->gfid, child->pargfid);
-int
-afr_create_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- fd_t *fd, inode_t *inode, struct stat *buf)
-{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
+ ret = 0;
+out:
+ GF_FREE (child_path);
- int ret = 0;
+ return ret;
+}
- int call_count = -1;
- int child_index = -1;
- local = frame->local;
- priv = this->private;
+static void
+__afr_dir_write_finalize (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int inode_read_subvol = -1;
+ int parent_read_subvol = -1;
+ int parent2_read_subvol = -1;
+ int i = 0;
- child_index = (long) cookie;
+ local = frame->local;
+ priv = this->private;
- LOCK (&frame->lock);
- {
- if (afr_fop_failed (op_ret, op_errno))
- afr_transaction_fop_failed (frame, this, child_index);
-
- ret = afr_fd_ctx_set (this, fd);
-
- if (ret < 0) {
- gf_log (this->name, GF_LOG_ERROR,
- "could not set ctx on fd=%p", fd);
-
- local->op_ret = -1;
- local->op_errno = -ret;
- }
-
-
- if (op_ret != -1) {
- local->op_ret = op_ret;
-
- if (local->success_count == 0) {
- local->cont.create.buf = *buf;
- local->cont.create.buf.st_ino =
- afr_itransform (buf->st_ino,
- priv->child_count,
- child_index);
-
- if (priv->read_child >= 0) {
- afr_set_read_child (this, inode,
- priv->read_child);
- } else {
- afr_set_read_child (this, inode,
- local->read_child_index);
- }
- }
-
- if (child_index == local->read_child_index) {
- if (priv->read_child >= 0) {
- afr_set_read_child (this, inode,
- priv->read_child);
- } else {
- afr_set_read_child (this, inode,
- local->read_child_index);
- }
- }
-
- local->cont.create.inode = inode;
-
- local->success_count++;
+ if (local->inode) {
+ afr_replies_interpret (frame, this, local->inode);
+ inode_read_subvol = afr_data_subvol_get (local->inode, this,
+ NULL, NULL);
+ }
+ if (local->parent)
+ parent_read_subvol = afr_data_subvol_get (local->parent, this,
+ NULL, NULL);
+ if (local->parent2)
+ parent2_read_subvol = afr_data_subvol_get (local->parent2, this,
+ NULL, NULL);
+
+ local->op_ret = -1;
+ local->op_errno = afr_final_errno (local, priv);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].valid)
+ continue;
+ if (local->replies[i].op_ret < 0) {
+ if (local->inode)
+ afr_inode_read_subvol_reset (local->inode,
+ this);
+ if (local->parent)
+ afr_inode_read_subvol_reset (local->parent,
+ this);
+ if (local->parent2)
+ afr_inode_read_subvol_reset (local->parent2,
+ this);
+ continue;
}
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
+ if (local->op_ret == -1) {
+ local->op_ret = local->replies[i].op_ret;
+ local->op_errno = local->replies[i].op_errno;
+
+ local->cont.dir_fop.buf =
+ local->replies[i].poststat;
+ local->cont.dir_fop.preparent =
+ local->replies[i].preparent;
+ local->cont.dir_fop.postparent =
+ local->replies[i].postparent;
+ local->cont.dir_fop.prenewparent =
+ local->replies[i].preparent2;
+ local->cont.dir_fop.postnewparent =
+ local->replies[i].postparent2;
+ if (local->replies[i].xdata)
+ local->xdata_rsp =
+ dict_ref (local->replies[i].xdata);
+ continue;
+ }
- call_count = afr_frame_return (frame);
+ if (i == inode_read_subvol) {
+ local->cont.dir_fop.buf =
+ local->replies[i].poststat;
+ if (local->replies[i].xdata) {
+ if (local->xdata_rsp)
+ dict_unref (local->xdata_rsp);
+ local->xdata_rsp =
+ dict_ref (local->replies[i].xdata);
+ }
+ }
- if (call_count == 0) {
- local->transaction.unwind (frame, this);
+ if (i == parent_read_subvol) {
+ local->cont.dir_fop.preparent =
+ local->replies[i].preparent;
+ local->cont.dir_fop.postparent =
+ local->replies[i].postparent;
+ }
- local->transaction.resume (frame, this);
+ if (i == parent2_read_subvol) {
+ local->cont.dir_fop.prenewparent =
+ local->replies[i].preparent2;
+ local->cont.dir_fop.postnewparent =
+ local->replies[i].postparent2;
+ }
}
-
- return 0;
}
-int
-afr_create_wind (call_frame_t *frame, xlator_t *this)
+static void
+__afr_dir_write_fill (call_frame_t *frame, xlator_t *this, int child_index,
+ int op_ret, int op_errno, struct iatt *poststat,
+ struct iatt *preparent, struct iatt *postparent,
+ struct iatt *preparent2, struct iatt *postparent2,
+ dict_t *xdata)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_fd_ctx_t *fd_ctx = NULL;
+
+ local = frame->local;
+ fd_ctx = local->fd_ctx;
+
+ local->replies[child_index].valid = 1;
+ local->replies[child_index].op_ret = op_ret;
+ local->replies[child_index].op_errno = op_errno;
+
+ if (op_ret >= 0) {
+ if (poststat)
+ local->replies[child_index].poststat = *poststat;
+ if (preparent)
+ local->replies[child_index].preparent = *preparent;
+ if (postparent)
+ local->replies[child_index].postparent = *postparent;
+ if (preparent2)
+ local->replies[child_index].preparent2 = *preparent2;
+ if (postparent2)
+ local->replies[child_index].postparent2 = *postparent2;
+ if (xdata)
+ local->replies[child_index].xdata = dict_ref (xdata);
+
+ if (fd_ctx)
+ fd_ctx->opened_on[child_index] = AFR_FD_OPENED;
+ } else {
+ if (op_errno != ENOTEMPTY)
+ afr_transaction_fop_failed (frame, this, child_index);
+ if (fd_ctx)
+ fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED;
+ }
- int call_count = -1;
- int i = 0;
+ return;
+}
- local = frame->local;
- priv = this->private;
- call_count = afr_up_children_count (priv->child_count, local->child_up);
+static int
+__afr_dir_write_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *buf,
+ struct iatt *preparent, struct iatt *postparent,
+ struct iatt *preparent2, struct iatt *postparent2,
+ dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int child_index = (long) cookie;
+ int call_count = -1;
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
+ local = frame->local;
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_create_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->create,
- &local->loc,
- local->cont.create.flags,
- local->cont.create.mode,
- local->cont.create.fd);
- if (!--call_count)
- break;
- }
+ LOCK (&frame->lock);
+ {
+ __afr_dir_write_fill (frame, this, child_index, op_ret,
+ op_errno, buf, preparent, postparent,
+ preparent2, postparent2, xdata);
}
-
- return 0;
-}
+ UNLOCK (&frame->lock);
+ call_count = afr_frame_return (frame);
+ if (call_count == 0) {
+ __afr_dir_write_finalize (frame, this);
-int
-afr_create_done (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t * local = NULL;
-
- local = frame->local;
+ if (afr_txn_nothing_failed (frame, this))
+ local->transaction.unwind (frame, this);
- local->transaction.unwind (frame, this);
+ afr_mark_entry_pending_changelog (frame, this);
- AFR_STACK_DESTROY (frame);
+ local->transaction.resume (frame, this);
+ }
- return 0;
+ return 0;
}
int
-afr_create (call_frame_t *frame, xlator_t *this,
- loc_t *loc, int32_t flags, mode_t mode, fd_t *fd)
+afr_mark_new_entry_changelog_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int op_ret, int op_errno,
+ dict_t *xattr, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t * transaction_frame = NULL;
+ int call_count = 0;
- int ret = -1;
+ call_count = afr_frame_return (frame);
- int op_ret = -1;
- int op_errno = 0;
+ if (call_count == 0)
+ AFR_STACK_DESTROY (frame);
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ return 0;
+}
- priv = this->private;
- transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- gf_log (this->name, GF_LOG_ERROR,
- "out of memory :(");
+void
+afr_mark_new_entry_changelog (call_frame_t *frame, xlator_t *this)
+{
+ call_frame_t *new_frame = NULL;
+ afr_local_t *local = NULL;
+ afr_local_t *new_local = NULL;
+ afr_private_t *priv = NULL;
+ dict_t *xattr = NULL;
+ int32_t **changelog = NULL;
+ int i = 0;
+ int idx = 0;
+ int op_errno = ENOMEM;
+ unsigned char *pending = NULL;
+ int call_count = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ new_frame = copy_frame (frame);
+ if (!new_frame)
+ goto out;
+
+ new_local = AFR_FRAME_INIT (new_frame, op_errno);
+ if (!new_local)
goto out;
- }
- ALLOC_OR_GOTO (local, afr_local_t, out);
+ changelog = afr_matrix_create (priv->child_count, AFR_NUM_CHANGE_LOGS);
+ if (!changelog)
+ goto out;
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
+ xattr = dict_new ();
+ if (!xattr)
goto out;
- }
- transaction_frame->local = local;
+ idx = afr_index_for_transaction_type (AFR_DATA_TRANSACTION);
- loc_copy (&local->loc, loc);
+ pending = alloca0 (priv->child_count);
- LOCK (&priv->read_child_lock);
- {
- local->read_child_index = (++priv->read_child_rr)
- % (priv->child_count);
- }
- UNLOCK (&priv->read_child_lock);
-
- local->cont.create.flags = flags;
- local->cont.create.mode = mode;
- local->cont.create.fd = fd_ref (fd);
-
- local->transaction.fop = afr_create_wind;
- local->transaction.done = afr_create_done;
- local->transaction.unwind = afr_create_unwind;
-
- afr_build_parent_loc (&local->transaction.parent_loc, loc);
-
- local->transaction.main_frame = frame;
- local->transaction.basename = AFR_BASENAME (loc->path);
-
- afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->transaction.pre_op[i] &&
+ !local->transaction.failed_subvols[i]) {
+ call_count ++;
+ continue;
+ }
- op_ret = 0;
-out:
- if (op_ret == -1) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
+ changelog[i][idx] = hton32(1);
+ pending[i] = 1;
}
- return 0;
-}
+ new_local->pending = changelog;
+ uuid_copy (new_local->loc.gfid, local->cont.dir_fop.buf.ia_gfid);
+ new_local->loc.inode = inode_ref (local->inode);
-/* }}} */
-/* {{{ mknod */
+ afr_set_pending_dict (priv, xattr, changelog);
-int
-afr_mknod_unwind (call_frame_t *frame, xlator_t *this)
-{
- call_frame_t *main_frame = NULL;
- afr_local_t *local = NULL;
+ new_local->call_count = call_count;
- local = frame->local;
+ for (i = 0; i < priv->child_count; i++) {
+ if (pending[i])
+ continue;
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame) {
- main_frame = local->transaction.main_frame;
- }
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
-
- if (main_frame) {
- AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno,
- local->cont.mknod.inode,
- &local->cont.mknod.buf);
+ STACK_WIND_COOKIE (new_frame, afr_mark_new_entry_changelog_cbk,
+ (void *) (long) i, priv->children[i],
+ priv->children[i]->fops->xattrop,
+ &new_local->loc, GF_XATTROP_ADD_ARRAY,
+ xattr, NULL);
+ if (!--call_count)
+ break;
}
- return 0;
+ new_frame = NULL;
+out:
+ if (new_frame)
+ AFR_STACK_DESTROY (new_frame);
+ if (xattr)
+ dict_unref (xattr);
+ return;
}
-int
-afr_mknod_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- inode_t *inode, struct stat *buf)
+void
+afr_mark_entry_pending_changelog (call_frame_t *frame, xlator_t *this)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
-
- int call_count = -1;
- int child_index = -1;
-
- local = frame->local;
- priv = this->private;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int pre_op_count = 0;
+ int failed_count = 0;
- child_index = (long) cookie;
+ local = frame->local;
+ priv = this->private;
- LOCK (&frame->lock);
- {
- if (afr_fop_failed (op_ret, op_errno))
- afr_transaction_fop_failed (frame, this, child_index);
-
- if (op_ret != -1) {
- local->op_ret = op_ret;
-
- if (local->success_count == 0){
- local->cont.mknod.buf = *buf;
- local->cont.mknod.buf.st_ino =
- afr_itransform (buf->st_ino,
- priv->child_count,
- child_index);
-
- if (priv->read_child >= 0) {
- afr_set_read_child (this, inode,
- priv->read_child);
- } else {
- afr_set_read_child (this, inode,
- local->read_child_index);
- }
- }
+ if (local->op_ret < 0)
+ return;
- if (child_index == local->read_child_index) {
- if (priv->read_child >= 0) {
- afr_set_read_child (this, inode,
- priv->read_child);
- } else {
- afr_set_read_child (this, inode,
- local->read_child_index);
- }
- }
-
- local->cont.mknod.inode = inode;
-
- local->success_count++;
- }
+ if (local->op != GF_FOP_CREATE && local->op != GF_FOP_MKNOD)
+ return;
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
+ pre_op_count = AFR_COUNT (local->transaction.pre_op, priv->child_count);
+ failed_count = AFR_COUNT (local->transaction.failed_subvols,
+ priv->child_count);
- call_count = afr_frame_return (frame);
+ if (pre_op_count == priv->child_count && !failed_count)
+ return;
- if (call_count == 0) {
- local->transaction.unwind (frame, this);
+ afr_mark_new_entry_changelog (frame, this);
- local->transaction.resume (frame, this);
- }
-
- return 0;
+ return;
}
-int32_t
-afr_mknod_wind (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
+/* {{{ create */
- int call_count = -1;
- int i = 0;
+int
+afr_create_unwind (call_frame_t *frame, xlator_t *this)
+{
+ call_frame_t *main_frame = NULL;
+ afr_local_t *local = NULL;
- local = frame->local;
- priv = this->private;
+ local = frame->local;
- call_count = afr_up_children_count (priv->child_count, local->child_up);
+ main_frame = afr_transaction_detach_fop_frame (frame);
- if (call_count == 0) {
- local->transaction.resume (frame, this);
+ if (!main_frame)
return 0;
- }
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_mknod_wind_cbk, (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->mknod,
- &local->loc, local->cont.mknod.mode,
- local->cont.mknod.dev);
- if (!--call_count)
- break;
- }
- }
-
- return 0;
+ AFR_STACK_UNWIND (create, main_frame, local->op_ret, local->op_errno,
+ local->cont.create.fd, local->inode,
+ &local->cont.dir_fop.buf,
+ &local->cont.dir_fop.preparent,
+ &local->cont.dir_fop.postparent, local->xdata_rsp);
+ return 0;
}
int
-afr_mknod_done (call_frame_t *frame, xlator_t *this)
+afr_create_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ fd_t *fd, inode_t *inode, struct iatt *buf,
+ struct iatt *preparent, struct iatt *postparent,
+ dict_t *xdata)
{
- afr_local_t * local = NULL;
-
- local = frame->local;
+ return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, buf,
+ preparent, postparent, NULL, NULL, xdata);
+}
- local->transaction.unwind (frame, this);
- AFR_STACK_DESTROY (frame);
- return 0;
+int
+afr_create_wind (call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ STACK_WIND_COOKIE (frame, afr_create_wind_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->create,
+ &local->loc, local->cont.create.flags,
+ local->cont.create.mode, local->umask,
+ local->cont.create.fd, local->xdata_req);
+ return 0;
}
int
-afr_mknod (call_frame_t *frame, xlator_t *this,
- loc_t *loc, mode_t mode, dev_t dev)
+afr_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+ mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t * transaction_frame = NULL;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_internal_lock_t *int_lock = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
- int ret = -1;
+ priv = this->private;
- int op_ret = -1;
- int op_errno = 0;
+ QUORUM_CHECK(create,out);
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ transaction_frame = copy_frame (frame);
+ if (!transaction_frame)
+ goto out;
- priv = this->private;
-
- transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- gf_log (this->name, GF_LOG_ERROR,
- "out of memory :(");
+ local = AFR_FRAME_INIT (transaction_frame, op_errno);
+ if (!local)
goto out;
- }
- ALLOC_OR_GOTO (local, afr_local_t, out);
+ loc_copy (&local->loc, loc);
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
+ local->fd_ctx = afr_fd_ctx_get (fd, this);
+ if (!local->fd_ctx)
goto out;
- }
-
- transaction_frame->local = local;
- loc_copy (&local->loc, loc);
-
- LOCK (&priv->read_child_lock);
- {
- local->read_child_index = (++priv->read_child_rr)
- % (priv->child_count);
- }
- UNLOCK (&priv->read_child_lock);
+ local->inode = inode_ref (loc->inode);
+ local->parent = inode_ref (loc->parent);
- local->cont.mknod.mode = mode;
- local->cont.mknod.dev = dev;
+ local->op = GF_FOP_CREATE;
+ local->cont.create.flags = flags;
+ local->cont.create.mode = mode;
+ local->cont.create.fd = fd_ref (fd);
+ local->umask = umask;
- local->transaction.fop = afr_mknod_wind;
- local->transaction.done = afr_mknod_done;
- local->transaction.unwind = afr_mknod_unwind;
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref (xdata, NULL);
+ else
+ local->xdata_req = dict_new ();
- afr_build_parent_loc (&local->transaction.parent_loc, loc);
-
- local->transaction.main_frame = frame;
- local->transaction.basename = AFR_BASENAME (loc->path);
+ if (!local->xdata_req)
+ goto out;
- afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
+ local->transaction.wind = afr_create_wind;
+ local->transaction.fop = __afr_txn_write_fop;
+ local->transaction.done = __afr_txn_write_done;
+ local->transaction.unwind = afr_create_unwind;
+
+ ret = afr_build_parent_loc (&local->transaction.parent_loc, loc,
+ &op_errno);
+ if (ret)
+ goto out;
+
+ local->transaction.main_frame = frame;
+ local->transaction.basename = AFR_BASENAME (loc->path);
+ int_lock = &local->internal_lock;
+
+ int_lock->lockee_count = 0;
+ ret = afr_init_entry_lockee (&int_lock->lockee[0], local,
+ &local->transaction.parent_loc,
+ local->transaction.basename,
+ priv->child_count);
+ if (ret)
+ goto out;
+
+ int_lock->lockee_count++;
+ ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
- op_ret = 0;
+ return 0;
out:
- if (op_ret == -1) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
- }
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
- return 0;
+ AFR_STACK_UNWIND (create, frame, -1, op_errno, NULL, NULL, NULL, NULL,
+ NULL, NULL);
+ return 0;
}
/* }}} */
-/* {{{ mkdir */
-
+/* {{{ mknod */
int
-afr_mkdir_unwind (call_frame_t *frame, xlator_t *this)
+afr_mknod_unwind (call_frame_t *frame, xlator_t *this)
{
- call_frame_t *main_frame = NULL;
- afr_local_t *local = NULL;
+ call_frame_t *main_frame = NULL;
+ afr_local_t *local = NULL;
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame) {
- main_frame = local->transaction.main_frame;
- }
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
+ local = frame->local;
- if (main_frame) {
- AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno,
- local->cont.mkdir.inode,
- &local->cont.mkdir.buf);
- }
+ main_frame = afr_transaction_detach_fop_frame (frame);
+ if (!main_frame)
+ return 0;
- return 0;
+ AFR_STACK_UNWIND (mknod, main_frame, local->op_ret, local->op_errno,
+ local->inode, &local->cont.dir_fop.buf,
+ &local->cont.dir_fop.preparent,
+ &local->cont.dir_fop.postparent, local->xdata_rsp);
+ return 0;
}
int
-afr_mkdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- inode_t *inode, struct stat *buf)
+afr_mknod_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
-
- int call_count = -1;
- int child_index = -1;
-
- local = frame->local;
- priv = this->private;
-
- child_index = (long) cookie;
-
- LOCK (&frame->lock);
- {
- if (afr_fop_failed (op_ret, op_errno))
- afr_transaction_fop_failed (frame, this, child_index);
-
- if (op_ret != -1) {
- local->op_ret = op_ret;
-
- if (local->success_count == 0) {
- local->cont.mkdir.buf = *buf;
- local->cont.mkdir.buf.st_ino =
- afr_itransform (buf->st_ino, priv->child_count,
- child_index);
-
- if (priv->read_child >= 0) {
- afr_set_read_child (this, inode,
- priv->read_child);
- } else {
- afr_set_read_child (this, inode,
- local->read_child_index);
- }
- }
-
- if (child_index == local->read_child_index) {
- if (priv->read_child >= 0) {
- afr_set_read_child (this, inode,
- priv->read_child);
- } else {
- afr_set_read_child (this, inode,
- local->read_child_index);
- }
- }
-
- local->cont.mkdir.inode = inode;
-
- local->success_count++;
- }
-
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- local->transaction.unwind (frame, this);
-
- local->transaction.resume (frame, this);
- }
-
- return 0;
+ return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, buf,
+ preparent, postparent, NULL, NULL, xdata);
}
int
-afr_mkdir_wind (call_frame_t *frame, xlator_t *this)
+afr_mknod_wind (call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
-
- int call_count = -1;
- int i = 0;
-
- local = frame->local;
- priv = this->private;
-
- call_count = afr_up_children_count (priv->child_count, local->child_up);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
-
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_mkdir_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->mkdir,
- &local->loc, local->cont.mkdir.mode);
- if (!--call_count)
- break;
- }
- }
-
- return 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ STACK_WIND_COOKIE (frame, afr_mknod_wind_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->mknod,
+ &local->loc, local->cont.mknod.mode,
+ local->cont.mknod.dev, local->umask,
+ local->xdata_req);
+ return 0;
}
-
int
-afr_mkdir_done (call_frame_t *frame, xlator_t *this)
+afr_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+ dev_t dev, mode_t umask, dict_t *xdata)
{
- afr_local_t * local = NULL;
-
- local = frame->local;
-
- local->transaction.unwind (frame, this);
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_internal_lock_t *int_lock = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
- AFR_STACK_DESTROY (frame);
-
- return 0;
-}
+ priv = this->private;
+ QUORUM_CHECK(mknod,out);
-int
-afr_mkdir (call_frame_t *frame, xlator_t *this,
- loc_t *loc, mode_t mode)
-{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t * transaction_frame = NULL;
+ transaction_frame = copy_frame (frame);
+ if (!transaction_frame)
+ goto out;
- int ret = -1;
+ local = AFR_FRAME_INIT (transaction_frame, op_errno);
+ if (!local)
+ goto out;
- int op_ret = -1;
- int op_errno = 0;
+ loc_copy (&local->loc, loc);
+ local->inode = inode_ref (loc->inode);
+ local->parent = inode_ref (loc->parent);
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ local->op = GF_FOP_MKNOD;
+ local->cont.mknod.mode = mode;
+ local->cont.mknod.dev = dev;
+ local->umask = umask;
- priv = this->private;
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref (xdata, NULL);
+ else
+ local->xdata_req = dict_new ();
- transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- gf_log (this->name, GF_LOG_ERROR,
- "out of memory :(");
+ if (!local->xdata_req)
goto out;
- }
- ALLOC_OR_GOTO (local, afr_local_t, out);
-
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
+ local->transaction.wind = afr_mknod_wind;
+ local->transaction.fop = __afr_txn_write_fop;
+ local->transaction.done = __afr_txn_write_done;
+ local->transaction.unwind = afr_mknod_unwind;
+
+ ret = afr_build_parent_loc (&local->transaction.parent_loc, loc,
+ &op_errno);
+ if (ret)
+ goto out;
+
+ local->transaction.main_frame = frame;
+ local->transaction.basename = AFR_BASENAME (loc->path);
+ int_lock = &local->internal_lock;
+
+ int_lock->lockee_count = 0;
+ ret = afr_init_entry_lockee (&int_lock->lockee[0], local,
+ &local->transaction.parent_loc,
+ local->transaction.basename,
+ priv->child_count);
+ if (ret)
+ goto out;
+
+ int_lock->lockee_count++;
+ ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
+ if (ret < 0) {
op_errno = -ret;
goto out;
- }
-
- transaction_frame->local = local;
-
- loc_copy (&local->loc, loc);
-
- LOCK (&priv->read_child_lock);
- {
- local->read_child_index = (++priv->read_child_rr)
- % (priv->child_count);
}
- UNLOCK (&priv->read_child_lock);
-
- local->cont.mkdir.mode = mode;
-
- local->transaction.fop = afr_mkdir_wind;
- local->transaction.done = afr_mkdir_done;
- local->transaction.unwind = afr_mkdir_unwind;
- afr_build_parent_loc (&local->transaction.parent_loc, loc);
-
- local->transaction.main_frame = frame;
- local->transaction.basename = AFR_BASENAME (loc->path);
-
- afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
-
- op_ret = 0;
+ return 0;
out:
- if (op_ret == -1) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
- }
-
- return 0;
+ AFR_STACK_UNWIND (mknod, frame, -1, op_errno, NULL, NULL, NULL, NULL,
+ NULL);
+ return 0;
}
/* }}} */
-/* {{{ link */
+/* {{{ mkdir */
int
-afr_link_unwind (call_frame_t *frame, xlator_t *this)
+afr_mkdir_unwind (call_frame_t *frame, xlator_t *this)
{
- call_frame_t *main_frame = NULL;
- afr_local_t *local = NULL;
-
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame) {
- main_frame = local->transaction.main_frame;
- }
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
+ call_frame_t *main_frame = NULL;
+ afr_local_t *local = NULL;
- if (main_frame) {
- local->cont.link.buf.st_ino = local->cont.link.ino;
+ local = frame->local;
- AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno,
- local->cont.link.inode,
- &local->cont.link.buf);
- }
+ main_frame = afr_transaction_detach_fop_frame (frame);
+ if (!main_frame)
+ return 0;
- return 0;
+ AFR_STACK_UNWIND (mkdir, main_frame, local->op_ret, local->op_errno,
+ local->inode, &local->cont.dir_fop.buf,
+ &local->cont.dir_fop.preparent,
+ &local->cont.dir_fop.postparent, local->xdata_rsp);
+ return 0;
}
int
-afr_link_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, inode_t *inode,
- struct stat *buf)
+afr_mkdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
-
- int call_count = -1;
- int child_index = -1;
-
- local = frame->local;
- priv = this->private;
-
- child_index = (long) cookie;
-
- LOCK (&frame->lock);
- {
- if (afr_fop_failed (op_ret, op_errno))
- afr_transaction_fop_failed (frame, this, child_index);
-
- if (op_ret != -1) {
- local->op_ret = op_ret;
-
- if (local->success_count == 0) {
- local->cont.link.buf = *buf;
- local->cont.link.buf.st_ino =
- afr_itransform (buf->st_ino, priv->child_count,
- child_index);
-
- if (priv->read_child >= 0) {
- afr_set_read_child (this, inode,
- priv->read_child);
- } else {
- afr_set_read_child (this, inode,
- local->read_child_index);
- }
- }
-
- if (child_index == local->read_child_index) {
- if (priv->read_child >= 0) {
- afr_set_read_child (this, inode,
- priv->read_child);
- } else {
- afr_set_read_child (this, inode,
- local->read_child_index);
- }
- }
-
- local->cont.link.inode = inode;
-
- local->success_count++;
- }
-
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- local->transaction.unwind (frame, this);
-
- local->transaction.resume (frame, this);
- }
-
- return 0;
+ return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, buf,
+ preparent, postparent, NULL, NULL, xdata);
}
int
-afr_link_wind (call_frame_t *frame, xlator_t *this)
+afr_mkdir_wind (call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
-
- int call_count = -1;
- int i = 0;
-
- local = frame->local;
- priv = this->private;
-
- call_count = afr_up_children_count (priv->child_count, local->child_up);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
-
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_link_wind_cbk, (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->link,
- &local->loc,
- &local->newloc);
-
- if (!--call_count)
- break;
- }
- }
-
- return 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ STACK_WIND_COOKIE (frame, afr_mkdir_wind_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->mkdir, &local->loc,
+ local->cont.mkdir.mode, local->umask,
+ local->xdata_req);
+ return 0;
}
int
-afr_link_done (call_frame_t *frame, xlator_t *this)
+afr_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+ mode_t umask, dict_t *xdata)
{
- afr_local_t * local = frame->local;
-
- local->transaction.unwind (frame, this);
-
- AFR_STACK_DESTROY (frame);
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_internal_lock_t *int_lock = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
- return 0;
-}
+ priv = this->private;
+ QUORUM_CHECK(mkdir,out);
-int
-afr_link (call_frame_t *frame, xlator_t *this,
- loc_t *oldloc, loc_t *newloc)
-{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t * transaction_frame = NULL;
+ transaction_frame = copy_frame (frame);
+ if (!transaction_frame)
+ goto out;
- int ret = -1;
+ local = AFR_FRAME_INIT (transaction_frame, op_errno);
+ if (!local)
+ goto out;
- int op_ret = -1;
- int op_errno = 0;
+ loc_copy (&local->loc, loc);
+ local->inode = inode_ref (loc->inode);
+ local->parent = inode_ref (loc->parent);
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ local->cont.mkdir.mode = mode;
+ local->umask = umask;
- priv = this->private;
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref (xdata, NULL);
+ else
+ local->xdata_req = dict_new ();
- transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- gf_log (this->name, GF_LOG_ERROR,
- "out of memory :(");
+ if (!local->xdata_req)
goto out;
- }
-
- ALLOC_OR_GOTO (local, afr_local_t, out);
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
+ local->op = GF_FOP_MKDIR;
+ local->transaction.wind = afr_mkdir_wind;
+ local->transaction.fop = __afr_txn_write_fop;
+ local->transaction.done = __afr_txn_write_done;
+ local->transaction.unwind = afr_mkdir_unwind;
+
+ ret = afr_build_parent_loc (&local->transaction.parent_loc, loc,
+ &op_errno);
+ if (ret)
+ goto out;
+
+ local->transaction.main_frame = frame;
+ local->transaction.basename = AFR_BASENAME (loc->path);
+ int_lock = &local->internal_lock;
+
+ int_lock->lockee_count = 0;
+ ret = afr_init_entry_lockee (&int_lock->lockee[0], local,
+ &local->transaction.parent_loc,
+ local->transaction.basename,
+ priv->child_count);
+ if (ret)
+ goto out;
+
+ int_lock->lockee_count++;
+ ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
+ if (ret < 0) {
op_errno = -ret;
goto out;
- }
-
- transaction_frame->local = local;
-
- loc_copy (&local->loc, oldloc);
- loc_copy (&local->newloc, newloc);
-
- LOCK (&priv->read_child_lock);
- {
- local->read_child_index = (++priv->read_child_rr)
- % (priv->child_count);
}
- UNLOCK (&priv->read_child_lock);
-
- local->cont.link.ino = oldloc->inode->ino;
-
- local->transaction.fop = afr_link_wind;
- local->transaction.done = afr_link_done;
- local->transaction.unwind = afr_link_unwind;
-
- afr_build_parent_loc (&local->transaction.parent_loc, oldloc);
- local->transaction.main_frame = frame;
- local->transaction.basename = AFR_BASENAME (oldloc->path);
- local->transaction.new_basename = AFR_BASENAME (newloc->path);
-
- afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
-
- op_ret = 0;
+ return 0;
out:
- if (op_ret == -1) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (frame, op_ret, op_errno);
- }
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
- return 0;
+ AFR_STACK_UNWIND (mkdir, frame, -1, op_errno, NULL, NULL, NULL, NULL,
+ NULL);
+ return 0;
}
/* }}} */
-/* {{{ symlink */
+/* {{{ link */
int
-afr_symlink_unwind (call_frame_t *frame, xlator_t *this)
+afr_link_unwind (call_frame_t *frame, xlator_t *this)
{
- call_frame_t *main_frame = NULL;
- afr_local_t *local = NULL;
+ call_frame_t *main_frame = NULL;
+ afr_local_t *local = NULL;
- local = frame->local;
+ local = frame->local;
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame) {
- main_frame = local->transaction.main_frame;
- }
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
-
- if (main_frame) {
- AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno,
- local->cont.symlink.inode,
- &local->cont.symlink.buf);
- }
+ main_frame = afr_transaction_detach_fop_frame (frame);
+ if (!main_frame)
+ return 0;
- return 0;
+ AFR_STACK_UNWIND (link, main_frame, local->op_ret, local->op_errno,
+ local->inode, &local->cont.dir_fop.buf,
+ &local->cont.dir_fop.preparent,
+ &local->cont.dir_fop.postparent, local->xdata_rsp);
+ return 0;
}
int
-afr_symlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, inode_t *inode,
- struct stat *buf)
+afr_link_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
-
- int call_count = -1;
- int child_index = -1;
-
- local = frame->local;
- priv = this->private;
-
- child_index = (long) cookie;
-
- LOCK (&frame->lock);
- {
- if (afr_fop_failed (op_ret, op_errno))
- afr_transaction_fop_failed (frame, this, child_index);
-
- if (op_ret != -1) {
- local->op_ret = op_ret;
-
- if (local->success_count == 0) {
- local->cont.symlink.buf = *buf;
- local->cont.symlink.buf.st_ino =
- afr_itransform (buf->st_ino, priv->child_count,
- child_index);
-
- if (priv->read_child >= 0) {
- afr_set_read_child (this, inode,
- priv->read_child);
- } else {
- afr_set_read_child (this, inode,
- local->read_child_index);
- }
- }
-
- if (child_index == local->read_child_index) {
- if (priv->read_child >= 0) {
- afr_set_read_child (this, inode,
- priv->read_child);
- } else {
- afr_set_read_child (this, inode,
- local->read_child_index);
- }
- }
-
- local->cont.symlink.inode = inode;
-
- local->success_count++;
- }
-
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- local->transaction.unwind (frame, this);
-
- local->transaction.resume (frame, this);
- }
-
- return 0;
+ return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, buf,
+ preparent, postparent, NULL, NULL, xdata);
}
int
-afr_symlink_wind (call_frame_t *frame, xlator_t *this)
+afr_link_wind (call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
-
- int call_count = -1;
- int i = 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- local = frame->local;
- priv = this->private;
+ local = frame->local;
+ priv = this->private;
- call_count = afr_up_children_count (priv->child_count, local->child_up);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
-
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_symlink_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->symlink,
- local->cont.symlink.linkpath,
- &local->loc);
-
- if (!--call_count)
- break;
-
- }
- }
-
- return 0;
+ STACK_WIND_COOKIE (frame, afr_link_wind_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->link,
+ &local->loc, &local->newloc, local->xdata_req);
+ return 0;
}
int
-afr_symlink_done (call_frame_t *frame, xlator_t *this)
+afr_link (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+ dict_t *xdata)
{
- afr_local_t * local = frame->local;
-
- local->transaction.unwind (frame, this);
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_internal_lock_t *int_lock = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
- AFR_STACK_DESTROY (frame);
-
- return 0;
-}
+ priv = this->private;
+ QUORUM_CHECK(link,out);
-int
-afr_symlink (call_frame_t *frame, xlator_t *this,
- const char *linkpath, loc_t *loc)
-{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t * transaction_frame = NULL;
+ transaction_frame = copy_frame (frame);
+ if (!transaction_frame)
+ goto out;
- int ret = -1;
+ local = AFR_FRAME_INIT (transaction_frame, op_errno);
+ if (!local)
+ goto out;
- int op_ret = -1;
- int op_errno = 0;
+ loc_copy (&local->loc, oldloc);
+ loc_copy (&local->newloc, newloc);
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ local->inode = inode_ref (oldloc->inode);
+ local->parent = inode_ref (newloc->parent);
- priv = this->private;
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref (xdata, NULL);
+ else
+ local->xdata_req = dict_new ();
- transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- gf_log (this->name, GF_LOG_ERROR,
- "out of memory :(");
+ if (!local->xdata_req)
goto out;
- }
-
- ALLOC_OR_GOTO (local, afr_local_t, out);
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
+ local->op = GF_FOP_LINK;
+
+ local->transaction.wind = afr_link_wind;
+ local->transaction.fop = __afr_txn_write_fop;
+ local->transaction.done = __afr_txn_write_done;
+ local->transaction.unwind = afr_link_unwind;
+
+ ret = afr_build_parent_loc (&local->transaction.parent_loc, newloc,
+ &op_errno);
+ if (ret)
+ goto out;
+
+ local->transaction.main_frame = frame;
+ local->transaction.basename = AFR_BASENAME (newloc->path);
+ int_lock = &local->internal_lock;
+
+ int_lock->lockee_count = 0;
+ ret = afr_init_entry_lockee (&int_lock->lockee[0], local,
+ &local->transaction.parent_loc,
+ local->transaction.basename,
+ priv->child_count);
+ if (ret)
+ goto out;
+
+ int_lock->lockee_count++;
+ ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
+ if (ret < 0) {
op_errno = -ret;
goto out;
- }
-
- transaction_frame->local = local;
-
- loc_copy (&local->loc, loc);
-
- LOCK (&priv->read_child_lock);
- {
- local->read_child_index = (++priv->read_child_rr)
- % (priv->child_count);
}
- UNLOCK (&priv->read_child_lock);
-
- local->cont.symlink.ino = loc->inode->ino;
- local->cont.symlink.linkpath = strdup (linkpath);
-
- local->transaction.fop = afr_symlink_wind;
- local->transaction.done = afr_symlink_done;
- local->transaction.unwind = afr_symlink_unwind;
-
- afr_build_parent_loc (&local->transaction.parent_loc, loc);
-
- local->transaction.main_frame = frame;
- local->transaction.basename = AFR_BASENAME (loc->path);
- afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
-
- op_ret = 0;
+ return 0;
out:
- if (op_ret == -1) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL, NULL);
- }
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
- return 0;
+ AFR_STACK_UNWIND (link, frame, -1, op_errno, NULL, NULL, NULL, NULL,
+ NULL);
+ return 0;
}
/* }}} */
-/* {{{ rename */
+/* {{{ symlink */
+
int
-afr_rename_unwind (call_frame_t *frame, xlator_t *this)
+afr_symlink_unwind (call_frame_t *frame, xlator_t *this)
{
- call_frame_t *main_frame = NULL;
- afr_local_t *local = NULL;
+ call_frame_t *main_frame = NULL;
+ afr_local_t *local = NULL;
- local = frame->local;
+ local = frame->local;
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame) {
- main_frame = local->transaction.main_frame;
- }
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
-
- if (main_frame) {
- local->cont.rename.buf.st_ino = local->cont.rename.ino;
-
- AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno,
- &local->cont.rename.buf);
- }
+ main_frame = afr_transaction_detach_fop_frame (frame);
+ if (!main_frame)
+ return 0;
- return 0;
+ AFR_STACK_UNWIND (symlink, main_frame, local->op_ret, local->op_errno,
+ local->inode, &local->cont.dir_fop.buf,
+ &local->cont.dir_fop.preparent,
+ &local->cont.dir_fop.postparent, local->xdata_rsp);
+ return 0;
}
int
-afr_rename_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct stat *buf)
+afr_symlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
-
- int call_count = -1;
- int child_index = -1;
-
- local = frame->local;
- priv = this->private;
-
- child_index = (long) cookie;
-
- LOCK (&frame->lock);
- {
- if (afr_fop_failed (op_ret, op_errno))
- afr_transaction_fop_failed (frame, this, child_index);
-
- if ((op_ret != -1) && (local->success_count == 0)) {
- local->op_ret = op_ret;
-
- if (buf) {
- local->cont.rename.buf = *buf;
- local->cont.rename.buf.st_ino =
- afr_itransform (buf->st_ino, priv->child_count,
- child_index);
- }
- local->success_count++;
- }
-
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- local->transaction.unwind (frame, this);
-
- local->transaction.resume (frame, this);
- }
-
- return 0;
+ return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, buf,
+ preparent, postparent, NULL, NULL, xdata);
}
-int32_t
-afr_rename_wind (call_frame_t *frame, xlator_t *this)
+int
+afr_symlink_wind (call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
-
- int call_count = -1;
- int i = 0;
-
- local = frame->local;
- priv = this->private;
-
- call_count = afr_up_children_count (priv->child_count, local->child_up);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
-
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_rename_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->rename,
- &local->loc,
- &local->newloc);
- if (!--call_count)
- break;
- }
- }
-
- return 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ STACK_WIND_COOKIE (frame, afr_symlink_wind_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->symlink,
+ local->cont.symlink.linkpath, &local->loc,
+ local->umask, local->xdata_req);
+ return 0;
}
int
-afr_rename_done (call_frame_t *frame, xlator_t *this)
+afr_symlink (call_frame_t *frame, xlator_t *this, const char *linkpath,
+ loc_t *loc, mode_t umask, dict_t *xdata)
{
- afr_local_t * local = frame->local;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_internal_lock_t *int_lock = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
- local->transaction.unwind (frame, this);
+ priv = this->private;
- AFR_STACK_DESTROY (frame);
-
- return 0;
-}
+ QUORUM_CHECK(symlink,out);
+ transaction_frame = copy_frame (frame);
+ if (!transaction_frame)
+ goto out;
-int
-afr_rename (call_frame_t *frame, xlator_t *this,
- loc_t *oldloc, loc_t *newloc)
-{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t * transaction_frame = NULL;
-
- int ret = -1;
+ local = AFR_FRAME_INIT (transaction_frame, op_errno);
+ if (!local)
+ goto out;
- int op_ret = -1;
- int op_errno = 0;
+ loc_copy (&local->loc, loc);
+ local->inode = inode_ref (loc->inode);
+ local->parent = inode_ref (loc->parent);
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ local->cont.symlink.linkpath = gf_strdup (linkpath);
+ local->umask = umask;
- priv = this->private;
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref (xdata, NULL);
+ else
+ local->xdata_req = dict_new ();
- transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- gf_log (this->name, GF_LOG_ERROR,
- "out of memory :(");
+ if (!local->xdata_req)
goto out;
- }
- ALLOC_OR_GOTO (local, afr_local_t, out);
-
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
+ local->op = GF_FOP_SYMLINK;
+ local->transaction.wind = afr_symlink_wind;
+ local->transaction.fop = __afr_txn_write_fop;
+ local->transaction.done = __afr_txn_write_done;
+ local->transaction.unwind = afr_symlink_unwind;
+
+ ret = afr_build_parent_loc (&local->transaction.parent_loc, loc,
+ &op_errno);
+ if (ret)
+ goto out;
+
+ local->transaction.main_frame = frame;
+ local->transaction.basename = AFR_BASENAME (loc->path);
+ int_lock = &local->internal_lock;
+
+ int_lock->lockee_count = 0;
+ ret = afr_init_entry_lockee (&int_lock->lockee[0], local,
+ &local->transaction.parent_loc,
+ local->transaction.basename,
+ priv->child_count);
+ if (ret)
+ goto out;
+
+ int_lock->lockee_count++;
+ ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
+ if (ret < 0) {
op_errno = -ret;
goto out;
- }
-
- transaction_frame->local = local;
-
- loc_copy (&local->loc, oldloc);
- loc_copy (&local->newloc, newloc);
-
- local->cont.rename.ino = oldloc->inode->ino;
-
- local->transaction.fop = afr_rename_wind;
- local->transaction.done = afr_rename_done;
- local->transaction.unwind = afr_rename_unwind;
-
- afr_build_parent_loc (&local->transaction.parent_loc, oldloc);
- afr_build_parent_loc (&local->transaction.new_parent_loc, newloc);
-
- local->transaction.main_frame = frame;
- local->transaction.basename = AFR_BASENAME (oldloc->path);
- local->transaction.new_basename = AFR_BASENAME (newloc->path);
-
- afr_transaction (transaction_frame, this, AFR_ENTRY_RENAME_TRANSACTION);
+ }
- op_ret = 0;
+ return 0;
out:
- if (op_ret == -1) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
-
- AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
- }
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
- return 0;
+ AFR_STACK_UNWIND (symlink, frame, -1, op_errno, NULL, NULL, NULL,
+ NULL, NULL);
+ return 0;
}
/* }}} */
-/* {{{ unlink */
+/* {{{ rename */
int
-afr_unlink_unwind (call_frame_t *frame, xlator_t *this)
+afr_rename_unwind (call_frame_t *frame, xlator_t *this)
{
- call_frame_t *main_frame = NULL;
- afr_local_t *local = NULL;
+ call_frame_t *main_frame = NULL;
+ afr_local_t *local = NULL;
- local = frame->local;
+ local = frame->local;
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame) {
- main_frame = local->transaction.main_frame;
- }
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
-
- if (main_frame)
- AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno);
+ main_frame = afr_transaction_detach_fop_frame (frame);
+ if (!main_frame)
+ return 0;
- return 0;
+ AFR_STACK_UNWIND (rename, main_frame, local->op_ret, local->op_errno,
+ &local->cont.dir_fop.buf,
+ &local->cont.dir_fop.preparent,
+ &local->cont.dir_fop.postparent,
+ &local->cont.dir_fop.prenewparent,
+ &local->cont.dir_fop.postnewparent, local->xdata_rsp);
+ return 0;
}
int
-afr_unlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
+afr_rename_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *buf,
+ struct iatt *preoldparent, struct iatt *postoldparent,
+ struct iatt *prenewparent, struct iatt *postnewparent,
+ dict_t *xdata)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
-
- int call_count = -1;
- int child_index = (long) cookie;
- int need_unwind = 0;
-
- local = frame->local;
- priv = this->private;
-
- LOCK (&frame->lock);
- {
- if (afr_fop_failed (op_ret, op_errno))
- afr_transaction_fop_failed (frame, this, child_index);
-
- if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- }
- local->success_count++;
-
- if (local->success_count == priv->wait_count) {
- need_unwind = 1;
- }
- }
-
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- local->transaction.unwind (frame, this);
-
- local->transaction.resume (frame, this);
- }
-
- return 0;
+ return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, buf,
+ preoldparent, postoldparent, prenewparent,
+ postnewparent, xdata);
}
-int32_t
-afr_unlink_wind (call_frame_t *frame, xlator_t *this)
+int
+afr_rename_wind (call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
-
- int call_count = -1;
- int i = 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
local = frame->local;
- priv = this->private;
-
- call_count = afr_up_children_count (priv->child_count, local->child_up);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
+ priv = this->private;
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_unlink_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->unlink,
- &local->loc);
-
- if (!--call_count)
- break;
- }
- }
-
- return 0;
+ STACK_WIND_COOKIE (frame, afr_rename_wind_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->rename,
+ &local->loc, &local->newloc, local->xdata_req);
+ return 0;
}
-int32_t
-afr_unlink_done (call_frame_t *frame, xlator_t *this)
+int
+afr_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+ dict_t *xdata)
{
- afr_local_t * local = frame->local;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_internal_lock_t *int_lock = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
+ int nlockee = 0;
- local->transaction.unwind (frame, this);
+ priv = this->private;
- AFR_STACK_DESTROY (frame);
-
- return 0;
-}
+ QUORUM_CHECK(rename,out);
+ transaction_frame = copy_frame (frame);
+ if (!transaction_frame)
+ op_errno = ENOMEM;
-int32_t
-afr_unlink (call_frame_t *frame, xlator_t *this,
- loc_t *loc)
-{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t * transaction_frame = NULL;
-
- int ret = -1;
+ local = AFR_FRAME_INIT (transaction_frame, op_errno);
+ if (!local)
+ goto out;
- int op_ret = -1;
- int op_errno = 0;
+ loc_copy (&local->loc, oldloc);
+ loc_copy (&local->newloc, newloc);
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ local->inode = inode_ref (oldloc->inode);
+ local->parent = inode_ref (oldloc->parent);
+ local->parent2 = inode_ref (newloc->parent);
- priv = this->private;
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref (xdata, NULL);
+ else
+ local->xdata_req = dict_new ();
- transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- gf_log (this->name, GF_LOG_ERROR,
- "out of memory :(");
+ if (!local->xdata_req)
goto out;
- }
- ALLOC_OR_GOTO (local, afr_local_t, out);
+ local->op = GF_FOP_RENAME;
+ local->transaction.wind = afr_rename_wind;
+ local->transaction.fop = __afr_txn_write_fop;
+ local->transaction.done = __afr_txn_write_done;
+ local->transaction.unwind = afr_rename_unwind;
+
+ ret = afr_build_parent_loc (&local->transaction.parent_loc, oldloc,
+ &op_errno);
+ if (ret)
+ goto out;
+ ret = afr_build_parent_loc (&local->transaction.new_parent_loc, newloc,
+ &op_errno);
+ if (ret)
+ goto out;
+
+ local->transaction.main_frame = frame;
+ local->transaction.basename = AFR_BASENAME (oldloc->path);
+ local->transaction.new_basename = AFR_BASENAME (newloc->path);
+ int_lock = &local->internal_lock;
+
+ int_lock->lockee_count = nlockee = 0;
+ ret = afr_init_entry_lockee (&int_lock->lockee[nlockee], local,
+ &local->transaction.new_parent_loc,
+ local->transaction.new_basename,
+ priv->child_count);
+ if (ret)
+ goto out;
+
+ nlockee++;
+ ret = afr_init_entry_lockee (&int_lock->lockee[nlockee], local,
+ &local->transaction.parent_loc,
+ local->transaction.basename,
+ priv->child_count);
+ if (ret)
+ goto out;
+
+ nlockee++;
+ if (local->newloc.inode && IA_ISDIR (local->newloc.inode->ia_type)) {
+ ret = afr_init_entry_lockee (&int_lock->lockee[nlockee], local,
+ &local->newloc,
+ NULL,
+ priv->child_count);
+ if (ret)
+ goto out;
+
+ nlockee++;
+ }
+ qsort (int_lock->lockee, nlockee, sizeof (*int_lock->lockee),
+ afr_entry_lockee_cmp);
+ int_lock->lockee_count = nlockee;
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
+ ret = afr_transaction (transaction_frame, this, AFR_ENTRY_RENAME_TRANSACTION);
+ if (ret < 0) {
op_errno = -ret;
goto out;
- }
-
- transaction_frame->local = local;
-
- loc_copy (&local->loc, loc);
-
- local->transaction.fop = afr_unlink_wind;
- local->transaction.done = afr_unlink_done;
- local->transaction.unwind = afr_unlink_unwind;
-
- afr_build_parent_loc (&local->transaction.parent_loc, loc);
-
- local->transaction.main_frame = frame;
- local->transaction.basename = AFR_BASENAME (loc->path);
-
- afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
+ }
- op_ret = 0;
+ return 0;
out:
- if (op_ret == -1) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (frame, op_ret, op_errno);
- }
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
- return 0;
+ AFR_STACK_UNWIND (rename, frame, -1, op_errno, NULL, NULL, NULL, NULL,
+ NULL, NULL);
+ return 0;
}
/* }}} */
-/* {{{ rmdir */
-
-
+/* {{{ unlink */
int
-afr_rmdir_unwind (call_frame_t *frame, xlator_t *this)
+afr_unlink_unwind (call_frame_t *frame, xlator_t *this)
{
- call_frame_t *main_frame = NULL;
- afr_local_t *local = NULL;
+ call_frame_t *main_frame = NULL;
+ afr_local_t *local = NULL;
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame) {
- main_frame = local->transaction.main_frame;
- }
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
+ local = frame->local;
- if (main_frame)
- AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno);
+ main_frame = afr_transaction_detach_fop_frame (frame);
+ if (!main_frame)
+ return 0;
- return 0;
+ AFR_STACK_UNWIND (unlink, main_frame, local->op_ret, local->op_errno,
+ &local->cont.dir_fop.preparent,
+ &local->cont.dir_fop.postparent, local->xdata_rsp);
+ return 0;
}
int
-afr_rmdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
+afr_unlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
-
- int call_count = -1;
- int child_index = (long) cookie;
- int need_unwind = 0;
-
- local = frame->local;
- priv = this->private;
-
- LOCK (&frame->lock);
- {
- if (afr_fop_failed (op_ret, op_errno))
- afr_transaction_fop_failed (frame, this, child_index);
-
- if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- }
- local->success_count++;
-
- if (local->success_count == priv->wait_count)
- need_unwind = 1;
- }
-
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- local->transaction.unwind (frame, this);
-
- local->transaction.resume (frame, this);
- }
-
- return 0;
+ return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, NULL,
+ preparent, postparent, NULL, NULL, xdata);
}
int
-afr_rmdir_wind (call_frame_t *frame, xlator_t *this)
+afr_unlink_wind (call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
-
- int call_count = -1;
- int i = 0;
-
- local = frame->local;
- priv = this->private;
-
- call_count = afr_up_children_count (priv->child_count, local->child_up);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
-
- local->call_count = call_count;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_rmdir_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->rmdir,
- &local->loc);
+ local = frame->local;
+ priv = this->private;
- if (!--call_count)
- break;
- }
- }
-
- return 0;
+ STACK_WIND_COOKIE (frame, afr_unlink_wind_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->unlink,
+ &local->loc, local->xflag, local->xdata_req);
+ return 0;
}
int
-afr_rmdir_done (call_frame_t *frame, xlator_t *this)
+afr_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
+ dict_t *xdata)
{
- afr_local_t * local = frame->local;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_internal_lock_t *int_lock = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
- local->transaction.unwind (frame, this);
+ priv = this->private;
- AFR_STACK_DESTROY (frame);
-
- return 0;
-}
+ QUORUM_CHECK(unlink,out);
+ transaction_frame = copy_frame (frame);
+ if (!transaction_frame)
+ goto out;
-int
-afr_rmdir (call_frame_t *frame, xlator_t *this,
- loc_t *loc)
-{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t * transaction_frame = NULL;
-
- int ret = -1;
+ local = AFR_FRAME_INIT (transaction_frame, op_errno);
+ if (!local)
+ goto out;
- int op_ret = -1;
- int op_errno = 0;
+ loc_copy (&local->loc, loc);
+ local->xflag = xflag;
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ local->inode = inode_ref (loc->inode);
+ local->parent = inode_ref (loc->parent);
- priv = this->private;
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref (xdata, NULL);
+ else
+ local->xdata_req = dict_new ();
- transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- gf_log (this->name, GF_LOG_ERROR,
- "out of memory :(");
+ if (!local->xdata_req)
goto out;
- }
- ALLOC_OR_GOTO (local, afr_local_t, out);
-
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
+ local->op = GF_FOP_UNLINK;
+ local->transaction.wind = afr_unlink_wind;
+ local->transaction.fop = __afr_txn_write_fop;
+ local->transaction.done = __afr_txn_write_done;
+ local->transaction.unwind = afr_unlink_unwind;
+
+ ret = afr_build_parent_loc (&local->transaction.parent_loc, loc,
+ &op_errno);
+ if (ret)
+ goto out;
+
+ local->transaction.main_frame = frame;
+ local->transaction.basename = AFR_BASENAME (loc->path);
+ int_lock = &local->internal_lock;
+
+ int_lock->lockee_count = 0;
+ ret = afr_init_entry_lockee (&int_lock->lockee[0], local,
+ &local->transaction.parent_loc,
+ local->transaction.basename,
+ priv->child_count);
+ if (ret)
+ goto out;
+
+ int_lock->lockee_count++;
+ ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
+ if (ret < 0) {
op_errno = -ret;
goto out;
- }
-
- transaction_frame->local = local;
-
- loc_copy (&local->loc, loc);
-
- local->transaction.fop = afr_rmdir_wind;
- local->transaction.done = afr_rmdir_done;
- local->transaction.unwind = afr_rmdir_unwind;
-
- afr_build_parent_loc (&local->transaction.parent_loc, loc);
-
- local->transaction.main_frame = frame;
- local->transaction.basename = AFR_BASENAME (loc->path);
-
- afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
+ }
- op_ret = 0;
+ return 0;
out:
- if (op_ret == -1) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (frame, op_ret, op_errno);
- }
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
- return 0;
+ AFR_STACK_UNWIND (unlink, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
}
/* }}} */
-/* {{{ setdents */
-
-int32_t
-afr_setdents_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
-{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
+/* {{{ rmdir */
- int call_count = -1;
- int child_index = (long) cookie;
- local = frame->local;
- priv = this->private;
-
- LOCK (&frame->lock);
- {
- if (afr_fop_failed (op_ret, op_errno))
- afr_transaction_fop_failed (frame, this, child_index);
- if ((op_ret != -1) && (local->success_count == 0)) {
- local->op_ret = op_ret;
- local->success_count++;
- }
+int
+afr_rmdir_unwind (call_frame_t *frame, xlator_t *this)
+{
+ call_frame_t *main_frame = NULL;
+ afr_local_t *local = NULL;
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
+ local = frame->local;
- call_count = afr_frame_return (frame);
+ main_frame = afr_transaction_detach_fop_frame (frame);
+ if (!main_frame)
+ return 0;
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- }
-
- return 0;
+ AFR_STACK_UNWIND (rmdir, main_frame, local->op_ret, local->op_errno,
+ &local->cont.dir_fop.preparent,
+ &local->cont.dir_fop.postparent, local->xdata_rsp);
+ return 0;
}
-int32_t
-afr_setdents_wind (call_frame_t *frame, xlator_t *this)
+int
+afr_rmdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
-
- int call_count = -1;
- int i = 0;
-
- local = frame->local;
- priv = this->private;
-
- call_count = afr_up_children_count (priv->child_count, local->child_up);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
-
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_setdents_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->setdents,
- local->fd, local->cont.setdents.flags,
- local->cont.setdents.entries,
- local->cont.setdents.count);
-
- if (!--call_count)
- break;
- }
- }
-
- return 0;
+ return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, NULL,
+ preparent, postparent, NULL, NULL, xdata);
}
-int32_t
-afr_setdents_done (call_frame_t *frame, xlator_t *this)
+int
+afr_rmdir_wind (call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_local_t * local = frame->local;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno);
-
- return 0;
+ local = frame->local;
+ priv = this->private;
+
+ STACK_WIND_COOKIE (frame, afr_rmdir_wind_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->rmdir,
+ &local->loc, local->cont.rmdir.flags, local->xdata_req);
+ return 0;
}
-int32_t
-afr_setdents (call_frame_t *frame, xlator_t *this,
- fd_t *fd, int32_t flags, dir_entry_t *entries, int32_t count)
+int
+afr_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
+ dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
-
- int ret = -1;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_internal_lock_t *int_lock = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
+ int nlockee = 0;
- int op_ret = -1;
- int op_errno = 0;
+ priv = this->private;
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
-
- priv = this->private;
+ QUORUM_CHECK(rmdir,out);
- ALLOC_OR_GOTO (local, afr_local_t, out);
+ transaction_frame = copy_frame (frame);
+ if (!transaction_frame)
+ goto out;
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
+ local = AFR_FRAME_INIT (transaction_frame, op_errno);
+ if (!local)
goto out;
- }
- frame->local = local;
- local->fd = fd_ref (fd);
+ loc_copy (&local->loc, loc);
+ local->inode = inode_ref (loc->inode);
+ local->parent = inode_ref (loc->parent);
- local->cont.setdents.flags = flags;
- local->cont.setdents.entries = entries;
- local->cont.setdents.count = count;
+ local->cont.rmdir.flags = flags;
- local->transaction.fop = afr_setdents_wind;
- local->transaction.done = afr_setdents_done;
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref (xdata, NULL);
+ else
+ local->xdata_req = dict_new ();
- local->transaction.basename = NULL;
+ if (!local->xdata_req)
+ goto out;
- afr_transaction (frame, this, AFR_ENTRY_TRANSACTION);
+ local->op = GF_FOP_RMDIR;
+ local->transaction.wind = afr_rmdir_wind;
+ local->transaction.fop = __afr_txn_write_fop;
+ local->transaction.done = __afr_txn_write_done;
+ local->transaction.unwind = afr_rmdir_unwind;
+
+ ret = afr_build_parent_loc (&local->transaction.parent_loc, loc,
+ &op_errno);
+ if (ret)
+ goto out;
+
+ local->transaction.main_frame = frame;
+ local->transaction.basename = AFR_BASENAME (loc->path);
+ int_lock = &local->internal_lock;
+
+ int_lock->lockee_count = nlockee = 0;
+ ret = afr_init_entry_lockee (&int_lock->lockee[nlockee], local,
+ &local->transaction.parent_loc,
+ local->transaction.basename,
+ priv->child_count);
+ if (ret)
+ goto out;
+
+ nlockee++;
+ ret = afr_init_entry_lockee (&int_lock->lockee[nlockee], local,
+ &local->loc,
+ NULL,
+ priv->child_count);
+ if (ret)
+ goto out;
+
+ nlockee++;
+ qsort (int_lock->lockee, nlockee, sizeof (*int_lock->lockee),
+ afr_entry_lockee_cmp);
+ int_lock->lockee_count = nlockee;
+
+ ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
- op_ret = 0;
+ return 0;
out:
- if (op_ret == -1) {
- AFR_STACK_UNWIND (frame, op_ret, op_errno);
- }
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
- return 0;
+ AFR_STACK_UNWIND (rmdir, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
}
/* }}} */
diff --git a/xlators/cluster/afr/src/afr-dir-write.h b/xlators/cluster/afr/src/afr-dir-write.h
index 76b4b60fa..02f0a3682 100644
--- a/xlators/cluster/afr/src/afr-dir-write.h
+++ b/xlators/cluster/afr/src/afr-dir-write.h
@@ -1,20 +1,11 @@
/*
- Copyright (c) 2007-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
#ifndef __DIR_WRITE_H__
@@ -22,38 +13,35 @@
int32_t
afr_create (call_frame_t *frame, xlator_t *this,
- loc_t *loc, int32_t flags, mode_t mode, fd_t *fd);
+ loc_t *loc, int32_t flags, mode_t mode,
+ mode_t umask, fd_t *fd, dict_t *xdata);
int32_t
afr_mknod (call_frame_t *frame, xlator_t *this,
- loc_t *loc, mode_t mode, dev_t dev);
+ loc_t *loc, mode_t mode, dev_t dev, mode_t umask, dict_t *xdata);
int32_t
afr_mkdir (call_frame_t *frame, xlator_t *this,
- loc_t *loc, mode_t mode);
+ loc_t *loc, mode_t mode, mode_t umask, dict_t *xdata);
int32_t
afr_unlink (call_frame_t *frame, xlator_t *this,
- loc_t *loc);
+ loc_t *loc, int xflag, dict_t *xdata);
int32_t
afr_rmdir (call_frame_t *frame, xlator_t *this,
- loc_t *loc);
+ loc_t *loc, int flags, dict_t *xdata);
int32_t
afr_link (call_frame_t *frame, xlator_t *this,
- loc_t *oldloc, loc_t *newloc);
+ loc_t *oldloc, loc_t *newloc, dict_t *xdata);
int32_t
afr_rename (call_frame_t *frame, xlator_t *this,
- loc_t *oldloc, loc_t *newloc);
+ loc_t *oldloc, loc_t *newloc, dict_t *xdata);
-int32_t
+int
afr_symlink (call_frame_t *frame, xlator_t *this,
- const char *linkpath, loc_t *oldloc);
-
-int32_t
-afr_setdents (call_frame_t *frame, xlator_t *this,
- fd_t *fd, int32_t flags, dir_entry_t *entries, int32_t count);
+ const char *linkpath, loc_t *oldloc, mode_t umask, dict_t *params);
#endif /* __DIR_WRITE_H__ */
diff --git a/xlators/cluster/afr/src/afr-inode-read.c b/xlators/cluster/afr/src/afr-inode-read.c
index 2fa8f9e36..4cb219246 100644
--- a/xlators/cluster/afr/src/afr-inode-read.c
+++ b/xlators/cluster/afr/src/afr-inode-read.c
@@ -1,20 +1,11 @@
/*
- Copyright (c) 2007-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
@@ -44,769 +35,1630 @@
#include "compat-errno.h"
#include "compat.h"
-#include "afr.h"
-
+#include "afr-transaction.h"
-/**
- * Common algorithm for inode read calls:
- *
- * - Try the fop on the first child that is up
- * - if we have failed due to ENOTCONN:
- * try the next child
- *
- * Applicable to: access, stat, fstat, readlink, getxattr
- */
/* {{{ access */
-int32_t
-afr_access_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno)
+int
+afr_access_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- xlator_t ** children = NULL;
+ afr_local_t *local = NULL;
- int unwind = 1;
- int last_tried = -1;
- int this_try = -1;
- int read_child = -1;
+ local = frame->local;
- priv = this->private;
- children = priv->children;
+ if (op_ret < 0) {
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
- local = frame->local;
+ afr_read_txn_continue (frame, this, (long) cookie);
+ return 0;
+ }
- read_child = (long) cookie;
+ AFR_STACK_UNWIND (access, frame, op_ret, op_errno, xdata);
- if (op_ret == -1) {
- retry:
- last_tried = local->cont.access.last_tried;
+ return 0;
+}
- if (all_tried (last_tried, priv->child_count)) {
- goto out;
- }
- this_try = ++local->cont.access.last_tried;
- if (this_try == read_child) {
- goto retry;
- }
+int
+afr_access_wind (call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
- unwind = 0;
+ priv = this->private;
+ local = frame->local;
- STACK_WIND_COOKIE (frame, afr_access_cbk,
- (void *) (long) read_child,
- children[this_try],
- children[this_try]->fops->access,
- &local->loc, local->cont.access.mask);
+ if (subvol == -1) {
+ AFR_STACK_UNWIND (access, frame, local->op_ret,
+ local->op_errno, 0);
+ return 0;
}
-out:
- if (unwind) {
- AFR_STACK_UNWIND (frame, op_ret, op_errno);
- }
+ STACK_WIND_COOKIE (frame, afr_access_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->access,
+ &local->loc, local->cont.access.mask,
+ local->xdata_req);
+ return 0;
+}
+
+int
+afr_access (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ int mask, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int op_errno = 0;
+
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
+
+ local->op = GF_FOP_ACCESS;
+ loc_copy (&local->loc, loc);
+ local->cont.access.mask = mask;
+ if (xdata)
+ local->xdata_req = dict_ref (xdata);
+
+ afr_read_txn (frame, this, loc->inode, afr_access_wind,
+ AFR_METADATA_TRANSACTION);
return 0;
+out:
+ AFR_STACK_UNWIND (access, frame, -1, op_errno, NULL);
+
+ return 0;
}
+/* }}} */
-int32_t
-afr_access (call_frame_t *frame, xlator_t *this,
- loc_t *loc, int32_t mask)
+/* {{{ stat */
+
+int
+afr_stat_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno,
+ struct iatt *buf, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- xlator_t ** children = NULL;
- int call_child = 0;
- afr_local_t *local = NULL;
+ afr_local_t *local = NULL;
- int32_t read_child = -1;
+ local = frame->local;
+ if (op_ret < 0) {
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
- int32_t op_ret = -1;
- int32_t op_errno = 0;
+ afr_read_txn_continue (frame, this, (long) cookie);
+ return 0;
+ }
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ AFR_STACK_UNWIND (stat, frame, op_ret, op_errno, buf, xdata);
- priv = this->private;
- VALIDATE_OR_GOTO (priv->children, out);
+ return 0;
+}
- children = priv->children;
- ALLOC_OR_GOTO (local, afr_local_t, out);
+int
+afr_stat_wind (call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
- read_child = afr_read_child (this, loc->inode);
-
- if (read_child >= 0) {
- call_child = read_child;
+ priv = this->private;
+ local = frame->local;
- local->cont.access.last_tried = -1;
+ if (subvol == -1) {
+ AFR_STACK_UNWIND (stat, frame, local->op_ret, local->op_errno,
+ 0, 0);
+ return 0;
+ }
- } else {
- call_child = afr_first_up_child (priv);
- if (call_child == -1) {
- op_errno = ENOTCONN;
- gf_log (this->name, GF_LOG_ERROR,
- "no child is up :(");
- goto out;
- }
+ STACK_WIND_COOKIE (frame, afr_stat_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->stat,
+ &local->loc, local->xdata_req);
+ return 0;
+}
- local->cont.access.last_tried = call_child;
- }
+int
+afr_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int op_errno = 0;
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
+
+ local->op = GF_FOP_STAT;
loc_copy (&local->loc, loc);
- local->cont.access.mask = mask;
+ if (xdata)
+ local->xdata_req = dict_ref (xdata);
- STACK_WIND_COOKIE (frame, afr_access_cbk,
- (void *) (long) call_child,
- children[call_child], children[call_child]->fops->access,
- loc, mask);
+ afr_read_txn (frame, this, loc->inode, afr_stat_wind,
+ AFR_DATA_TRANSACTION);
- op_ret = 0;
-out:
- if (op_ret == -1) {
- AFR_STACK_UNWIND (frame, op_ret, op_errno);
- }
return 0;
+out:
+ AFR_STACK_UNWIND (stat, frame, -1, op_errno, NULL, NULL);
+
+ return 0;
}
/* }}} */
-/* {{{ stat */
+/* {{{ fstat */
-int32_t
-afr_stat_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- struct stat *buf)
+int
+afr_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *buf,
+ dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- xlator_t ** children = NULL;
-
- int unwind = 1;
- int last_tried = -1;
- int this_try = -1;
- int read_child = -1;
+ afr_local_t *local = NULL;
- priv = this->private;
- children = priv->children;
+ local = frame->local;
- read_child = (long) cookie;
+ if (op_ret < 0) {
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
- local = frame->local;
+ afr_read_txn_continue (frame, this, (long) cookie);
+ return 0;
+ }
- if (op_ret == -1) {
- retry:
- last_tried = local->cont.stat.last_tried;
+ AFR_STACK_UNWIND (fstat, frame, op_ret, op_errno, buf, xdata);
- if (all_tried (last_tried, priv->child_count)) {
- goto out;
- }
- this_try = ++local->cont.stat.last_tried;
-
- if (this_try == read_child) {
- goto retry;
- }
+ return 0;
+}
- unwind = 0;
- STACK_WIND_COOKIE (frame, afr_stat_cbk,
- (void *) (long) read_child,
- children[this_try],
- children[this_try]->fops->stat,
- &local->loc);
- }
+int
+afr_fstat_wind (call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
-out:
- if (unwind) {
- if (op_ret != -1)
- buf->st_ino = local->cont.stat.ino;
+ priv = this->private;
+ local = frame->local;
- AFR_STACK_UNWIND (frame, op_ret, op_errno, buf);
+ if (subvol == -1) {
+ AFR_STACK_UNWIND (fstat, frame, local->op_ret, local->op_errno,
+ 0, 0);
+ return 0;
}
+ STACK_WIND_COOKIE (frame, afr_fstat_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->fstat,
+ local->fd, local->xdata_req);
return 0;
}
int32_t
-afr_stat (call_frame_t *frame, xlator_t *this,
- loc_t *loc)
+afr_fstat (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- xlator_t ** children = NULL;
+ afr_local_t *local = NULL;
+ int op_errno = 0;
- int32_t read_child = -1;
- int call_child = 0;
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
- int32_t op_ret = -1;
- int32_t op_errno = 0;
+ local->op = GF_FOP_FSTAT;
+ local->fd = fd_ref (fd);
+ if (xdata)
+ local->xdata_req = dict_ref (xdata);
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ afr_fix_open (fd, this);
- priv = this->private;
- VALIDATE_OR_GOTO (priv->children, out);
+ afr_read_txn (frame, this, fd->inode, afr_fstat_wind,
+ AFR_DATA_TRANSACTION);
- children = priv->children;
+ return 0;
+out:
+ AFR_STACK_UNWIND (fstat, frame, -1, op_errno, NULL, NULL);
- ALLOC_OR_GOTO (local, afr_local_t, out);
+ return 0;
+}
- frame->local = local;
+/* }}} */
- read_child = afr_read_child (this, loc->inode);
+/* {{{ readlink */
- if (read_child >= 0) {
- call_child = read_child;
+int
+afr_readlink_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno,
+ const char *buf, struct iatt *sbuf, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
- local->cont.stat.last_tried = -1;
+ local = frame->local;
- } else {
- call_child = afr_first_up_child (priv);
- if (call_child == -1) {
- op_errno = ENOTCONN;
- gf_log (this->name, GF_LOG_ERROR,
- "no child is up :(");
- goto out;
- }
+ if (op_ret < 0) {
+ local->op_ret = -1;
+ local->op_errno = op_errno;
- local->cont.stat.last_tried = call_child;
+ afr_read_txn_continue (frame, this, (long) cookie);
+ return 0;
}
- loc_copy (&local->loc, loc);
+ AFR_STACK_UNWIND (readlink, frame, op_ret, op_errno,
+ buf, sbuf, xdata);
+ return 0;
+}
- local->cont.stat.ino = loc->inode->ino;
+int
+afr_readlink_wind (call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- STACK_WIND_COOKIE (frame, afr_stat_cbk, (void *) (long) call_child,
- children[call_child],
- children[call_child]->fops->stat,
- loc);
+ local = frame->local;
+ priv = this->private;
- op_ret = 0;
-out:
- if (op_ret == -1) {
- AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
+ if (subvol == -1) {
+ AFR_STACK_UNWIND (readlink, frame, local->op_ret,
+ local->op_errno, 0, 0, 0);
+ return 0;
}
+ STACK_WIND_COOKIE (frame, afr_readlink_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->readlink,
+ &local->loc, local->cont.readlink.size,
+ local->xdata_req);
+ return 0;
+}
+
+
+int
+afr_readlink (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, size_t size, dict_t *xdata)
+{
+ afr_local_t * local = NULL;
+ int32_t op_errno = 0;
+
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
+
+ local->op = GF_FOP_READLINK;
+ loc_copy (&local->loc, loc);
+ local->cont.readlink.size = size;
+ if (xdata)
+ local->xdata_req = dict_ref (xdata);
+
+ afr_read_txn (frame, this, loc->inode, afr_readlink_wind,
+ AFR_DATA_TRANSACTION);
+
+ return 0;
+out:
+ AFR_STACK_UNWIND(readlink, frame, -1, op_errno, 0, 0, 0);
+
return 0;
}
/* }}} */
-/* {{{ fstat */
+/* {{{ getxattr */
-int32_t
-afr_fstat_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- struct stat *buf)
-{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- xlator_t ** children = NULL;
+struct _xattr_key {
+ char *key;
+ struct list_head list;
+};
- int unwind = 1;
- int last_tried = -1;
- int this_try = -1;
- int read_child = -1;
- priv = this->private;
- children = priv->children;
+int
+__gather_xattr_keys (dict_t *dict, char *key, data_t *value,
+ void *data)
+{
+ struct list_head * list = data;
+ struct _xattr_key * xkey = NULL;
- local = frame->local;
+ if (!strncmp (key, AFR_XATTR_PREFIX,
+ strlen (AFR_XATTR_PREFIX))) {
- read_child = (long) cookie;
+ xkey = GF_CALLOC (1, sizeof (*xkey), gf_afr_mt_xattr_key);
+ if (!xkey)
+ return -1;
- if (op_ret == -1) {
- retry:
- last_tried = local->cont.fstat.last_tried;
+ xkey->key = key;
+ INIT_LIST_HEAD (&xkey->list);
- if (all_tried (last_tried, priv->child_count)) {
- goto out;
- }
- this_try = ++local->cont.fstat.last_tried;
+ list_add_tail (&xkey->list, list);
+ }
+ return 0;
+}
- if (this_try == read_child) {
- goto retry;
- }
- unwind = 0;
+void
+afr_filter_xattrs (dict_t *dict)
+{
+ struct list_head keys = {0,};
+ struct _xattr_key *key = NULL;
+ struct _xattr_key *tmp = NULL;
- STACK_WIND_COOKIE (frame, afr_fstat_cbk,
- (void *) (long) read_child,
- children[this_try],
- children[this_try]->fops->fstat,
- local->fd);
- }
+ INIT_LIST_HEAD (&keys);
-out:
- if (unwind) {
- if (op_ret != -1)
- buf->st_ino = local->cont.fstat.ino;
+ dict_foreach (dict, __gather_xattr_keys,
+ (void *) &keys);
- AFR_STACK_UNWIND (frame, op_ret, op_errno, buf);
- }
+ list_for_each_entry_safe (key, tmp, &keys, list) {
+ dict_del (dict, key->key);
- return 0;
+ list_del_init (&key->list);
+
+ GF_FREE (key);
+ }
}
-int32_t
-afr_fstat (call_frame_t *frame, xlator_t *this,
- fd_t *fd)
+int
+afr_getxattr_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno,
+ dict_t *dict, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- xlator_t ** children = NULL;
+ afr_local_t *local = NULL;
+
+ local = frame->local;
- int call_child = 0;
- int32_t read_child = -1;
+ if (op_ret < 0) {
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
- int32_t op_ret = -1;
- int32_t op_errno = 0;
+ afr_read_txn_continue (frame, this, (long) cookie);
+ return 0;
+ }
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (fd, out);
- VALIDATE_OR_GOTO (this->private, out);
+ if (dict)
+ afr_filter_xattrs (dict);
- priv = this->private;
- VALIDATE_OR_GOTO (priv->children, out);
+ AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict, xdata);
- children = priv->children;
+ return 0;
+}
- ALLOC_OR_GOTO (local, afr_local_t, out);
- frame->local = local;
+int
+afr_getxattr_wind (call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- VALIDATE_OR_GOTO (fd->inode, out);
+ local = frame->local;
+ priv = this->private;
- read_child = afr_read_child (this, fd->inode);
+ if (subvol == -1) {
+ AFR_STACK_UNWIND (getxattr, frame, local->op_ret,
+ local->op_errno, NULL, NULL);
+ return 0;
+ }
- if (read_child >= 0) {
- call_child = read_child;
+ STACK_WIND_COOKIE (frame, afr_getxattr_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->getxattr,
+ &local->loc, local->cont.getxattr.name,
+ local->xdata_req);
+ return 0;
+}
- local->cont.fstat.last_tried = -1;
- } else {
- call_child = afr_first_up_child (priv);
- if (call_child == -1) {
- op_errno = ENOTCONN;
+int32_t
+afr_getxattr_unwind (call_frame_t *frame, int op_ret, int op_errno,
+ dict_t *dict, dict_t *xdata)
+
+{
+ AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict, xdata);
+ return 0;
+}
+
+int32_t
+afr_fgetxattr_clrlk_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno,
+ dict_t *dict, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ xlator_t **children = NULL;
+ dict_t *xattr = NULL;
+ char *tmp_report = NULL;
+ char lk_summary[1024] = {0,};
+ int serz_len = 0;
+ int32_t callcnt = 0;
+ long int cky = 0;
+ int ret = 0;
+
+ priv = this->private;
+ children = priv->children;
+
+ local = frame->local;
+ cky = (long) cookie;
+
+ LOCK (&frame->lock);
+ {
+ callcnt = --local->call_count;
+ if (op_ret == -1)
+ local->replies[cky].op_errno = op_errno;
+
+ if (!local->dict)
+ local->dict = dict_new ();
+ if (local->dict) {
+ ret = dict_get_str (dict, local->cont.getxattr.name,
+ &tmp_report);
+ if (ret)
+ goto unlock;
+ ret = dict_set_dynstr (local->dict,
+ children[cky]->name,
+ gf_strdup (tmp_report));
+ if (ret)
+ goto unlock;
+ }
+ }
+unlock:
+ UNLOCK (&frame->lock);
+
+ if (!callcnt) {
+ xattr = dict_new ();
+ if (!xattr) {
+ op_ret = -1;
+ op_errno = ENOMEM;
+ goto unwind;
+ }
+ ret = dict_serialize_value_with_delim (local->dict,
+ lk_summary,
+ &serz_len, '\n');
+ if (ret) {
+ op_ret = -1;
+ op_errno = ENOMEM;
gf_log (this->name, GF_LOG_ERROR,
- "no child is up :(");
- goto out;
- }
+ "Error serializing dictionary");
+ goto unwind;
+ }
+ if (serz_len == -1)
+ snprintf (lk_summary, sizeof (lk_summary),
+ "No locks cleared.");
+ ret = dict_set_dynstr (xattr, local->cont.getxattr.name,
+ gf_strdup (lk_summary));
+ if (ret) {
+ op_ret = -1;
+ op_errno = ENOMEM;
+ gf_log (this->name, GF_LOG_ERROR,
+ "Error setting dictionary");
+ goto unwind;
+ }
- local->cont.fstat.last_tried = call_child;
+ unwind:
+ // Updating child_errno with more recent 'events'
+ op_errno = afr_final_errno (local, priv);
+
+ AFR_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, xattr,
+ xdata);
+ if (xattr)
+ dict_unref (xattr);
}
- local->cont.fstat.ino = fd->inode->ino;
- local->fd = fd_ref (fd);
+ return ret;
+}
- STACK_WIND_COOKIE (frame, afr_fstat_cbk, (void *) (long) call_child,
- children[call_child],
- children[call_child]->fops->fstat,
- fd);
+int32_t
+afr_getxattr_clrlk_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno,
+ dict_t *dict, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ xlator_t **children = NULL;
+ dict_t *xattr = NULL;
+ char *tmp_report = NULL;
+ char lk_summary[1024] = {0,};
+ int serz_len = 0;
+ int32_t callcnt = 0;
+ long int cky = 0;
+ int ret = 0;
+
+ priv = this->private;
+ children = priv->children;
+
+ local = frame->local;
+ cky = (long) cookie;
+
+ LOCK (&frame->lock);
+ {
+ callcnt = --local->call_count;
+ if (op_ret == -1)
+ local->replies[cky].op_errno = op_errno;
+
+ if (!local->dict)
+ local->dict = dict_new ();
+ if (local->dict) {
+ ret = dict_get_str (dict, local->cont.getxattr.name,
+ &tmp_report);
+ if (ret)
+ goto unlock;
+ ret = dict_set_dynstr (local->dict,
+ children[cky]->name,
+ gf_strdup (tmp_report));
+ if (ret)
+ goto unlock;
+ }
+ }
+unlock:
+ UNLOCK (&frame->lock);
+
+ if (!callcnt) {
+ xattr = dict_new ();
+ if (!xattr) {
+ op_ret = -1;
+ op_errno = ENOMEM;
+ goto unwind;
+ }
+ ret = dict_serialize_value_with_delim (local->dict,
+ lk_summary,
+ &serz_len, '\n');
+ if (ret) {
+ op_ret = -1;
+ op_errno = ENOMEM;
+ gf_log (this->name, GF_LOG_ERROR,
+ "Error serializing dictionary");
+ goto unwind;
+ }
+ if (serz_len == -1)
+ snprintf (lk_summary, sizeof (lk_summary),
+ "No locks cleared.");
+ ret = dict_set_dynstr (xattr, local->cont.getxattr.name,
+ gf_strdup (lk_summary));
+ if (ret) {
+ op_ret = -1;
+ op_errno = ENOMEM;
+ gf_log (this->name, GF_LOG_ERROR,
+ "Error setting dictionary");
+ goto unwind;
+ }
- op_ret = 0;
-out:
- if (op_ret == -1) {
- AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
- }
+ unwind:
+ // Updating child_errno with more recent 'events'
+ op_errno = afr_final_errno (local, priv);
- return 0;
+ AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, xattr, xdata);
+
+ if (xattr)
+ dict_unref (xattr);
+ }
+
+ return ret;
}
-/* }}} */
+/**
+ * node-uuid cbk uses next child querying mechanism
+ */
+int32_t
+afr_getxattr_node_uuid_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno,
+ dict_t *dict, dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ xlator_t **children = NULL;
+ int unwind = 1;
+ int curr_call_child = 0;
+
+ priv = this->private;
+ children = priv->children;
+
+ local = frame->local;
+
+ if (op_ret == -1) { /** query the _next_ child */
+
+ /**
+ * _current_ becomes _next_
+ * If done with all childs and yet no success; give up !
+ */
+ curr_call_child = (int) ((long)cookie);
+ if (++curr_call_child == priv->child_count)
+ goto unwind;
+
+ gf_log (this->name, GF_LOG_WARNING,
+ "op_ret (-1): Re-querying afr-child (%d/%d)",
+ curr_call_child, priv->child_count);
+
+ unwind = 0;
+ STACK_WIND_COOKIE (frame, afr_getxattr_node_uuid_cbk,
+ (void *) (long) curr_call_child,
+ children[curr_call_child],
+ children[curr_call_child]->fops->getxattr,
+ &local->loc,
+ local->cont.getxattr.name,
+ NULL);
+ }
-/* {{{ readlink */
+ unwind:
+ if (unwind)
+ AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict,
+ NULL);
+
+ return 0;
+}
int32_t
-afr_readlink_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- const char *buf)
+afr_getxattr_lockinfo_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno,
+ dict_t *dict, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- xlator_t ** children = NULL;
+ int call_cnt = 0, len = 0;
+ char *lockinfo_buf = NULL;
+ dict_t *lockinfo = NULL, *newdict = NULL;
+ afr_local_t *local = NULL;
- int unwind = 1;
- int last_tried = -1;
- int this_try = -1;
- int read_child = -1;
+ LOCK (&frame->lock);
+ {
+ local = frame->local;
- priv = this->private;
- children = priv->children;
+ call_cnt = --local->call_count;
- local = frame->local;
+ if ((op_ret < 0) || (!dict && !xdata)) {
+ goto unlock;
+ }
- read_child = (long) cookie;
+ if (xdata) {
+ if (!local->xdata_rsp) {
+ local->xdata_rsp = dict_new ();
+ if (!local->xdata_rsp) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ goto unlock;
+ }
+ }
+ }
- if (op_ret == -1) {
- retry:
- last_tried = local->cont.readlink.last_tried;
+ if (!dict) {
+ goto unlock;
+ }
- if (all_tried (last_tried, priv->child_count)) {
- goto out;
- }
- this_try = ++local->cont.readlink.last_tried;
+ op_ret = dict_get_ptr_and_len (dict, GF_XATTR_LOCKINFO_KEY,
+ (void **)&lockinfo_buf, &len);
- if (this_try == read_child) {
- goto retry;
+ if (!lockinfo_buf) {
+ goto unlock;
}
- unwind = 0;
- STACK_WIND_COOKIE (frame, afr_readlink_cbk,
- (void *) (long) read_child,
- children[this_try],
- children[this_try]->fops->readlink,
- &local->loc,
- local->cont.readlink.size);
- }
+ if (!local->dict) {
+ local->dict = dict_new ();
+ if (!local->dict) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ goto unlock;
+ }
+ }
+ }
+unlock:
+ UNLOCK (&frame->lock);
+
+ if (lockinfo_buf != NULL) {
+ lockinfo = dict_new ();
+ if (lockinfo == NULL) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ } else {
+ op_ret = dict_unserialize (lockinfo_buf, len,
+ &lockinfo);
+
+ if (lockinfo && local->dict) {
+ dict_copy (lockinfo, local->dict);
+ }
+ }
+ }
-out:
- if (unwind) {
- AFR_STACK_UNWIND (frame, op_ret, op_errno, buf);
- }
+ if (xdata && local->xdata_rsp) {
+ dict_copy (xdata, local->xdata_rsp);
+ }
- return 0;
-}
+ if (!call_cnt) {
+ newdict = dict_new ();
+ if (!newdict) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ goto unwind;
+ }
+
+ len = dict_serialized_length (local->dict);
+ if (len == 0) {
+ goto unwind;
+ }
+ lockinfo_buf = GF_CALLOC (1, len, gf_common_mt_char);
+ if (!lockinfo_buf) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ goto unwind;
+ }
+
+ op_ret = dict_serialize (local->dict, lockinfo_buf);
+ if (op_ret < 0) {
+ local->op_ret = -1;
+ local->op_errno = -op_ret;
+ }
+
+ op_ret = dict_set_dynptr (newdict, GF_XATTR_LOCKINFO_KEY,
+ (void *)lockinfo_buf, len);
+ if (op_ret < 0) {
+ local->op_ret = -1;
+ local->op_errno = -op_ret;
+ goto unwind;
+ }
+
+ unwind:
+ AFR_STACK_UNWIND (getxattr, frame, op_ret,
+ op_errno, newdict,
+ local->xdata_rsp);
+ }
+
+ dict_unref (lockinfo);
+
+ return 0;
+}
int32_t
-afr_readlink (call_frame_t *frame, xlator_t *this,
- loc_t *loc, size_t size)
+afr_fgetxattr_lockinfo_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno,
+ dict_t *dict, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- xlator_t ** children = NULL;
- int call_child = 0;
- afr_local_t *local = NULL;
+ int call_cnt = 0, len = 0;
+ char *lockinfo_buf = NULL;
+ dict_t *lockinfo = NULL, *newdict = NULL;
+ afr_local_t *local = NULL;
- int32_t read_child = -1;
+ LOCK (&frame->lock);
+ {
+ local = frame->local;
- int32_t op_ret = -1;
- int32_t op_errno = 0;
+ call_cnt = --local->call_count;
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ if ((op_ret < 0) || (!dict && !xdata)) {
+ goto unlock;
+ }
- priv = this->private;
- VALIDATE_OR_GOTO (priv->children, out);
+ if (xdata) {
+ if (!local->xdata_rsp) {
+ local->xdata_rsp = dict_new ();
+ if (!local->xdata_rsp) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ goto unlock;
+ }
+ }
+ }
+
+ if (!dict) {
+ goto unlock;
+ }
- children = priv->children;
+ op_ret = dict_get_ptr_and_len (dict, GF_XATTR_LOCKINFO_KEY,
+ (void **)&lockinfo_buf, &len);
+
+ if (!lockinfo_buf) {
+ goto unlock;
+ }
- ALLOC_OR_GOTO (local, afr_local_t, out);
+ if (!local->dict) {
+ local->dict = dict_new ();
+ if (!local->dict) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ goto unlock;
+ }
+ }
+ }
+unlock:
+ UNLOCK (&frame->lock);
+
+ if (lockinfo_buf != NULL) {
+ lockinfo = dict_new ();
+ if (lockinfo == NULL) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ } else {
+ op_ret = dict_unserialize (lockinfo_buf, len,
+ &lockinfo);
+
+ if (lockinfo && local->dict) {
+ dict_copy (lockinfo, local->dict);
+ }
+ }
+ }
- frame->local = local;
+ if (xdata && local->xdata_rsp) {
+ dict_copy (xdata, local->xdata_rsp);
+ }
- read_child = afr_read_child (this, loc->inode);
+ if (!call_cnt) {
+ newdict = dict_new ();
+ if (!newdict) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ goto unwind;
+ }
- if (read_child >= 0) {
- call_child = read_child;
+ len = dict_serialized_length (local->dict);
+ if (len <= 0) {
+ goto unwind;
+ }
- local->cont.readlink.last_tried = -1;
+ lockinfo_buf = GF_CALLOC (1, len, gf_common_mt_char);
+ if (!lockinfo_buf) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ goto unwind;
+ }
- } else {
- call_child = afr_first_up_child (priv);
+ op_ret = dict_serialize (local->dict, lockinfo_buf);
+ if (op_ret < 0) {
+ local->op_ret = -1;
+ local->op_errno = -op_ret;
+ }
- if (call_child == -1) {
- op_errno = ENOTCONN;
- gf_log (this->name, GF_LOG_ERROR,
- "no child is up :(");
- goto out;
+ op_ret = dict_set_dynptr (newdict, GF_XATTR_LOCKINFO_KEY,
+ (void *)lockinfo_buf, len);
+ if (op_ret < 0) {
+ local->op_ret = -1;
+ local->op_errno = -op_ret;
+ goto unwind;
}
- local->cont.readlink.last_tried = call_child;
+ unwind:
+ AFR_STACK_UNWIND (fgetxattr, frame, op_ret,
+ op_errno, newdict,
+ local->xdata_rsp);
}
- loc_copy (&local->loc, loc);
- local->cont.readlink.size = size;
+ dict_unref (lockinfo);
+
+ return 0;
+}
- STACK_WIND_COOKIE (frame, afr_readlink_cbk,
- (void *) (long) call_child,
- children[call_child], children[call_child]->fops->readlink,
- loc, size);
+int32_t
+afr_fgetxattr_pathinfo_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno,
+ dict_t *dict, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int32_t callcnt = 0;
+ int ret = 0;
+ char *xattr = NULL;
+ char *xattr_serz = NULL;
+ char xattr_cky[1024] = {0,};
+ dict_t *nxattr = NULL;
+ long cky = 0;
+ int32_t padding = 0;
+ int32_t tlen = 0;
+
+ if (!frame || !frame->local || !this) {
+ gf_log ("", GF_LOG_ERROR, "possible NULL deref");
+ goto out;
+ }
- op_ret = 0;
+ local = frame->local;
+ cky = (long) cookie;
+
+ LOCK (&frame->lock);
+ {
+ callcnt = --local->call_count;
+
+ if (op_ret < 0) {
+ local->op_errno = op_errno;
+ } else {
+ local->op_ret = op_ret;
+ if (!local->xdata_rsp && xdata)
+ local->xdata_rsp = dict_ref (xdata);
+ }
+
+ if (!dict || (op_ret < 0))
+ goto out;
+
+ if (!local->dict)
+ local->dict = dict_new ();
+
+ if (local->dict) {
+ ret = dict_get_str (dict,
+ local->cont.getxattr.name,
+ &xattr);
+ if (ret)
+ goto out;
+
+ xattr = gf_strdup (xattr);
+
+ (void)snprintf (xattr_cky, 1024, "%s-%ld",
+ local->cont.getxattr.name, cky);
+ ret = dict_set_dynstr (local->dict,
+ xattr_cky, xattr);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Cannot set xattr cookie key");
+ goto out;
+ }
+
+ local->cont.getxattr.xattr_len
+ += strlen (xattr) + 1;
+ }
+ }
out:
- if (op_ret == -1) {
- AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
- }
- return 0;
-}
+ UNLOCK (&frame->lock);
+
+ if (!callcnt) {
+ if (!local->cont.getxattr.xattr_len)
+ goto unwind;
+
+ nxattr = dict_new ();
+ if (!nxattr)
+ goto unwind;
+
+ /* extra bytes for decorations (brackets and <>'s) */
+ padding += strlen (this->name)
+ + strlen (AFR_PATHINFO_HEADER) + 4;
+ local->cont.getxattr.xattr_len += (padding + 2);
+
+ xattr_serz = GF_CALLOC (local->cont.getxattr.xattr_len,
+ sizeof (char), gf_common_mt_char);
+
+ if (!xattr_serz)
+ goto unwind;
+
+ /* the xlator info */
+ (void) sprintf (xattr_serz, "(<"AFR_PATHINFO_HEADER"%s> ",
+ this->name);
+
+ /* actual series of pathinfo */
+ ret = dict_serialize_value_with_delim (local->dict,
+ xattr_serz
+ + strlen (xattr_serz),
+ &tlen, ' ');
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Error serializing"
+ " dictionary");
+ goto unwind;
+ }
+ /* closing part */
+ *(xattr_serz + padding + tlen) = ')';
+ *(xattr_serz + padding + tlen + 1) = '\0';
-/* }}} */
+ ret = dict_set_dynstr (nxattr, local->cont.getxattr.name,
+ xattr_serz);
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR, "Cannot set pathinfo"
+ " key in dict");
-/* {{{ getxattr */
+ unwind:
+ AFR_STACK_UNWIND (fgetxattr, frame, local->op_ret,
+ local->op_errno, nxattr, local->xdata_rsp);
+
+ if (nxattr)
+ dict_unref (nxattr);
+ }
+
+ return ret;
+}
int32_t
-afr_getxattr_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- dict_t *dict)
+afr_getxattr_pathinfo_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno,
+ dict_t *dict, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- xlator_t ** children = NULL;
+ afr_local_t *local = NULL;
+ int32_t callcnt = 0;
+ int ret = 0;
+ char *xattr = NULL;
+ char *xattr_serz = NULL;
+ char xattr_cky[1024] = {0,};
+ dict_t *nxattr = NULL;
+ long cky = 0;
+ int32_t padding = 0;
+ int32_t tlen = 0;
+
+ if (!frame || !frame->local || !this) {
+ gf_log ("", GF_LOG_ERROR, "possible NULL deref");
+ goto out;
+ }
- int unwind = 1;
- int last_tried = -1;
- int this_try = -1;
- int read_child = -1;
+ local = frame->local;
+ cky = (long) cookie;
+
+ LOCK (&frame->lock);
+ {
+ callcnt = --local->call_count;
+
+ if (op_ret < 0) {
+ local->op_errno = op_errno;
+ } else {
+ local->op_ret = op_ret;
+ if (!local->xdata_rsp && xdata)
+ local->xdata_rsp = dict_ref (xdata);
+ }
+
+ if (!dict || (op_ret < 0))
+ goto out;
+
+ if (!local->dict)
+ local->dict = dict_new ();
+
+ if (local->dict) {
+ ret = dict_get_str (dict,
+ local->cont.getxattr.name,
+ &xattr);
+ if (ret)
+ goto out;
+
+ xattr = gf_strdup (xattr);
+
+ (void)snprintf (xattr_cky, 1024, "%s-%ld",
+ local->cont.getxattr.name, cky);
+ ret = dict_set_dynstr (local->dict,
+ xattr_cky, xattr);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Cannot set xattr cookie key");
+ goto out;
+ }
+
+ local->cont.getxattr.xattr_len += strlen (xattr) + 1;
+ }
+ }
+ out:
+ UNLOCK (&frame->lock);
+
+ if (!callcnt) {
+ if (!local->cont.getxattr.xattr_len)
+ goto unwind;
+
+ nxattr = dict_new ();
+ if (!nxattr)
+ goto unwind;
+
+ /* extra bytes for decorations (brackets and <>'s) */
+ padding += strlen (this->name) + strlen (AFR_PATHINFO_HEADER) + 4;
+ local->cont.getxattr.xattr_len += (padding + 2);
+
+ xattr_serz = GF_CALLOC (local->cont.getxattr.xattr_len,
+ sizeof (char), gf_common_mt_char);
+
+ if (!xattr_serz)
+ goto unwind;
+
+ /* the xlator info */
+ (void) sprintf (xattr_serz, "(<"AFR_PATHINFO_HEADER"%s> ",
+ this->name);
+
+ /* actual series of pathinfo */
+ ret = dict_serialize_value_with_delim (local->dict,
+ xattr_serz + strlen (xattr_serz),
+ &tlen, ' ');
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Error serializing"
+ " dictionary");
+ goto unwind;
+ }
- priv = this->private;
- children = priv->children;
+ /* closing part */
+ *(xattr_serz + padding + tlen) = ')';
+ *(xattr_serz + padding + tlen + 1) = '\0';
- local = frame->local;
+ ret = dict_set_dynstr (nxattr, local->cont.getxattr.name,
+ xattr_serz);
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR, "Cannot set pathinfo"
+ " key in dict");
- read_child = (long) cookie;
+ unwind:
+ AFR_STACK_UNWIND (getxattr, frame, local->op_ret,
+ local->op_errno, nxattr, local->xdata_rsp);
- if (op_ret == -1) {
- retry:
- last_tried = local->cont.getxattr.last_tried;
+ if (nxattr)
+ dict_unref (nxattr);
+ }
- if (all_tried (last_tried, priv->child_count)) {
- goto out;
- }
- this_try = ++local->cont.getxattr.last_tried;
+ return ret;
+}
- if (this_try == read_child) {
- goto retry;
+static int
+afr_aggregate_stime_xattr (dict_t *this, char *key, data_t *value, void *data)
+{
+ int ret = 0;
+
+ if (fnmatch (GF_XATTR_STIME_PATTERN, key, FNM_NOESCAPE) == 0)
+ ret = gf_get_max_stime (THIS, data, key, value);
+
+ return ret;
+}
+
+int32_t
+afr_common_getxattr_stime_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno,
+ dict_t *dict, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int32_t callcnt = 0;
+
+ if (!frame || !frame->local || !this) {
+ gf_log ("", GF_LOG_ERROR, "possible NULL deref");
+ goto out;
+ }
+
+ local = frame->local;
+
+ LOCK (&frame->lock);
+ {
+ callcnt = --local->call_count;
+
+ if (!dict || (op_ret < 0)) {
+ local->op_errno = op_errno;
+ goto cleanup;
}
- unwind = 0;
- STACK_WIND_COOKIE (frame, afr_getxattr_cbk,
- (void *) (long) read_child,
- children[this_try],
- children[this_try]->fops->getxattr,
- &local->loc,
- local->cont.getxattr.name);
- }
+ if (!local->dict)
+ local->dict = dict_copy_with_ref (dict, NULL);
+ else
+ dict_foreach (dict, afr_aggregate_stime_xattr,
+ local->dict);
+ local->op_ret = 0;
+ }
+
+cleanup:
+ UNLOCK (&frame->lock);
+
+ if (!callcnt) {
+ AFR_STACK_UNWIND (getxattr, frame, local->op_ret,
+ local->op_errno, local->dict, xdata);
+ }
out:
- if (unwind) {
- AFR_STACK_UNWIND (frame, op_ret, op_errno, dict);
- }
+ return 0;
+}
- return 0;
+
+static gf_boolean_t
+afr_is_special_xattr (const char *name, fop_getxattr_cbk_t *cbk,
+ gf_boolean_t is_fgetxattr)
+{
+ gf_boolean_t is_spl = _gf_true;
+
+ GF_ASSERT (cbk);
+ if (!cbk || !name) {
+ is_spl = _gf_false;
+ goto out;
+ }
+
+ if (!strcmp (name, GF_XATTR_PATHINFO_KEY) ||
+ !strcmp (name, GF_XATTR_USER_PATHINFO_KEY)) {
+ if (is_fgetxattr) {
+ *cbk = afr_fgetxattr_pathinfo_cbk;
+ } else {
+ *cbk = afr_getxattr_pathinfo_cbk;
+ }
+ } else if (!strncmp (name, GF_XATTR_CLRLK_CMD,
+ strlen (GF_XATTR_CLRLK_CMD))) {
+ if (is_fgetxattr) {
+ *cbk = afr_fgetxattr_clrlk_cbk;
+ } else {
+ *cbk = afr_getxattr_clrlk_cbk;
+ }
+ } else if (!strncmp (name, GF_XATTR_LOCKINFO_KEY,
+ strlen (GF_XATTR_LOCKINFO_KEY))) {
+ if (is_fgetxattr) {
+ *cbk = afr_fgetxattr_lockinfo_cbk;
+ } else {
+ *cbk = afr_getxattr_lockinfo_cbk;
+ }
+ } else if (fnmatch (GF_XATTR_STIME_PATTERN, name, FNM_NOESCAPE) == 0) {
+ *cbk = afr_common_getxattr_stime_cbk;
+ } else {
+ is_spl = _gf_false;
+ }
+
+out:
+ return is_spl;
}
+static void
+afr_getxattr_all_subvols (xlator_t *this, call_frame_t *frame,
+ const char *name, loc_t *loc,
+ fop_getxattr_cbk_t cbk)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int i = 0;
+ int call_count = 0;
+
+ priv = this->private;
+
+ local = frame->local;
+ //local->call_count set in afr_local_init
+ call_count = local->call_count;
+
+ //If up-children count is 0, afr_local_init would have failed already
+ //and the call would have unwound so not handling it here.
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND_COOKIE (frame, cbk,
+ (void *) (long) i, priv->children[i],
+ priv->children[i]->fops->getxattr,
+ loc, name, NULL);
+ if (!--call_count)
+ break;
+ }
+ }
+ return;
+}
int32_t
afr_getxattr (call_frame_t *frame, xlator_t *this,
- loc_t *loc, const char *name)
+ loc_t *loc, const char *name, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- xlator_t ** children = NULL;
- int call_child = 0;
- afr_local_t * local = NULL;
+ afr_private_t *priv = NULL;
+ xlator_t **children = NULL;
+ afr_local_t *local = NULL;
+ xlator_list_t *trav = NULL;
+ xlator_t **sub_volumes = NULL;
+ int i = 0;
+ int32_t op_errno = 0;
+ int ret = -1;
+ fop_getxattr_cbk_t cbk = NULL;
+ int afr_xtime_gauge[MCNT_MAX] = {0,};
- int read_child = -1;
- int32_t op_ret = -1;
- int32_t op_errno = 0;
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ priv = this->private;
- priv = this->private;
- VALIDATE_OR_GOTO (priv->children, out);
+ children = priv->children;
- children = priv->children;
+ loc_copy (&local->loc, loc);
- ALLOC_OR_GOTO (local, afr_local_t, out);
- frame->local = local;
+ local->op = GF_FOP_GETXATTR;
- read_child = afr_read_child (this, loc->inode);
+ if (xdata)
+ local->xdata_req = dict_ref (xdata);
- if (read_child >= 0) {
- call_child = read_child;
+ if (!name)
+ goto no_name;
- local->cont.getxattr.last_tried = -1;
- } else {
- call_child = afr_first_up_child (priv);
+ local->cont.getxattr.name = gf_strdup (name);
- if (call_child == -1) {
- op_errno = ENOTCONN;
- gf_log (this->name, GF_LOG_ERROR,
- "no child is up :(");
+ if (!local->cont.getxattr.name) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ if (!strncmp (name, AFR_XATTR_PREFIX,
+ strlen (AFR_XATTR_PREFIX))) {
+ gf_log (this->name, GF_LOG_INFO,
+ "%s: no data present for key %s",
+ loc->path, name);
+ op_errno = ENODATA;
+ goto out;
+ }
+ if ((strcmp (GF_XATTR_MARKER_KEY, name) == 0)
+ && (GF_CLIENT_PID_GSYNCD == frame->root->pid)) {
+
+ local->marker.call_count = priv->child_count;
+
+ sub_volumes = alloca ( priv->child_count * sizeof (xlator_t *));
+ for (i = 0, trav = this->children; trav ;
+ trav = trav->next, i++) {
+
+ *(sub_volumes + i) = trav->xlator;
+ }
+
+ if (cluster_getmarkerattr (frame, this, loc, name,
+ local, afr_getxattr_unwind,
+ sub_volumes,
+ priv->child_count,
+ MARKER_UUID_TYPE,
+ marker_uuid_default_gauge,
+ priv->vol_uuid)) {
+
+ gf_log (this->name, GF_LOG_INFO,
+ "%s: failed to get marker attr (%s)",
+ loc->path, name);
+ op_errno = EINVAL;
goto out;
}
- local->cont.getxattr.last_tried = call_child;
+ return 0;
}
- loc_copy (&local->loc, loc);
- if (name)
- local->cont.getxattr.name = strdup (name);
+ /*
+ * if we are doing getxattr with pathinfo as the key then we
+ * collect information from all childs
+ */
+ if (afr_is_special_xattr (name, &cbk, 0)) {
+ afr_getxattr_all_subvols (this, frame, name, loc, cbk);
+ return 0;
+ }
- STACK_WIND_COOKIE (frame, afr_getxattr_cbk,
- (void *) (long) call_child,
- children[call_child], children[call_child]->fops->getxattr,
- loc, name);
+ if (XATTR_IS_NODE_UUID (name)) {
+ i = 0;
+ STACK_WIND_COOKIE (frame, afr_getxattr_node_uuid_cbk,
+ (void *) (long) i,
+ children[i],
+ children[i]->fops->getxattr,
+ loc, name, xdata);
+ return 0;
+ }
- op_ret = 0;
-out:
- if (op_ret == -1) {
- AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
- }
- return 0;
-}
+ if (*priv->vol_uuid) {
+ if ((match_uuid_local (name, priv->vol_uuid) == 0)
+ && (GF_CLIENT_PID_GSYNCD == frame->root->pid)) {
+ local->marker.call_count = priv->child_count;
+
+ sub_volumes = alloca ( priv->child_count
+ * sizeof (xlator_t *));
+ for (i = 0, trav = this->children; trav ;
+ trav = trav->next, i++) {
+
+ *(sub_volumes + i) = trav->xlator;
+
+ }
+
+ /* don't err out on getting ENOTCONN (brick down)
+ * from a subset of the bricks
+ */
+ memcpy (afr_xtime_gauge, marker_xtime_default_gauge,
+ sizeof (afr_xtime_gauge));
+ afr_xtime_gauge[MCNT_NOTFOUND] = 0;
+ afr_xtime_gauge[MCNT_ENOTCONN] = 0;
+ if (cluster_getmarkerattr (frame, this, loc,
+ name, local,
+ afr_getxattr_unwind,
+ sub_volumes,
+ priv->child_count,
+ MARKER_XTIME_TYPE,
+ afr_xtime_gauge,
+ priv->vol_uuid)) {
+ gf_log (this->name, GF_LOG_INFO,
+ "%s: failed to get marker attr (%s)",
+ loc->path, name);
+ op_errno = EINVAL;
+ goto out;
+ }
+
+ return 0;
+ }
+ }
+no_name:
-/* }}} */
+ afr_read_txn (frame, this, local->loc.inode, afr_getxattr_wind,
+ AFR_METADATA_TRANSACTION);
-/* {{{ readv */
+ ret = 0;
+out:
+ if (ret < 0)
+ AFR_STACK_UNWIND (getxattr, frame, -1, op_errno, NULL, NULL);
+ return 0;
+}
+
+/* {{{ fgetxattr */
-/**
- * read algorithm:
- *
- * if the user has specified a read subvolume, use it
- * otherwise -
- * use the inode number to hash it to one of the subvolumes, and
- * read from there (to balance read load)
- *
- * if any of the above read's fail, try the children in sequence
- * beginning at the beginning
- */
int32_t
-afr_readv_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- struct iovec *vector, int32_t count, struct stat *buf,
- struct iobref *iobref)
+afr_fgetxattr_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno,
+ dict_t *dict, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- xlator_t ** children = NULL;
+ afr_local_t *local = NULL;
- int unwind = 1;
- int last_tried = -1;
- int this_try = -1;
- int read_child = -1;
+ local = frame->local;
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ if (op_ret < 0) {
+ local->op_ret = -1;
+ local->op_errno = op_errno;
- priv = this->private;
- VALIDATE_OR_GOTO (priv->children, out);
+ afr_read_txn_continue (frame, this, (long) cookie);
+ return 0;
+ }
+
+ if (dict)
+ afr_filter_xattrs (dict);
+
+ AFR_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, dict, xdata);
- children = priv->children;
+ return 0;
+}
+
+int
+afr_fgetxattr_wind (call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
local = frame->local;
+ priv = this->private;
+
+ if (subvol == -1) {
+ AFR_STACK_UNWIND (fgetxattr, frame, local->op_ret,
+ local->op_errno, NULL, NULL);
+ return 0;
+ }
+
+ STACK_WIND_COOKIE (frame, afr_fgetxattr_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->fgetxattr,
+ local->fd, local->cont.getxattr.name,
+ local->xdata_req);
+ return 0;
+}
+
- read_child = (long) cookie;
+static void
+afr_fgetxattr_all_subvols (xlator_t *this, call_frame_t *frame,
+ fop_fgetxattr_cbk_t cbk)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int i = 0;
+ int call_count = 0;
+
+ priv = this->private;
+
+ local = frame->local;
+ //local->call_count set in afr_local_init
+ call_count = local->call_count;
+
+ //If up-children count is 0, afr_local_init would have failed already
+ //and the call would have unwound so not handling it here.
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND_COOKIE (frame, cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->fgetxattr,
+ local->fd, local->cont.getxattr.name,
+ NULL);
+ if (!--call_count)
+ break;
+ }
+ }
- if (op_ret == -1) {
- retry:
- last_tried = local->cont.readv.last_tried;
+ return;
+}
- if (all_tried (last_tried, priv->child_count)) {
+
+int
+afr_fgetxattr (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, const char *name, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int32_t op_errno = 0;
+ fop_fgetxattr_cbk_t cbk = NULL;
+
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
+
+ local->op = GF_FOP_FGETXATTR;
+ local->fd = fd_ref (fd);
+ if (name) {
+ local->cont.getxattr.name = gf_strdup (name);
+ if (!local->cont.getxattr.name) {
+ op_errno = ENOMEM;
goto out;
}
- this_try = ++local->cont.readv.last_tried;
-
- if (this_try == read_child) {
- /*
- skip the read child since if we are here
- we must have already tried that child
- */
- goto retry;
- }
+ }
+ if (xdata)
+ local->xdata_req = dict_ref (xdata);
+
+ /* pathinfo gets handled only in getxattr(), but we need to handle
+ * lockinfo.
+ * If we are doing fgetxattr with lockinfo as the key then we
+ * collect information from all children.
+ */
+ if (afr_is_special_xattr (name, &cbk, 1)) {
+ afr_fgetxattr_all_subvols (this, frame, cbk);
+ return 0;
+ }
- unwind = 0;
+ afr_fix_open (fd, this);
- STACK_WIND_COOKIE (frame, afr_readv_cbk,
- (void *) (long) read_child,
- children[this_try],
- children[this_try]->fops->readv,
- local->fd, local->cont.readv.size,
- local->cont.readv.offset);
- }
+ afr_read_txn (frame, this, fd->inode, afr_fgetxattr_wind,
+ AFR_METADATA_TRANSACTION);
+ return 0;
out:
- if (unwind) {
- AFR_STACK_UNWIND (frame, op_ret, op_errno, vector, count, buf,
- iobref);
- }
+ AFR_STACK_UNWIND (fgetxattr, frame, -1, op_errno, NULL, NULL);
- return 0;
+ return 0;
}
-int32_t
-afr_readv (call_frame_t *frame, xlator_t *this,
- fd_t *fd, size_t size, off_t offset)
+/* }}} */
+
+/* {{{ readv */
+
+int
+afr_readv_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno,
+ struct iovec *vector, int32_t count, struct iatt *buf,
+ struct iobref *iobref, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- xlator_t ** children = NULL;
+ afr_local_t *local = NULL;
- int32_t read_child = -1;
- int call_child = 0;
+ local = frame->local;
- int32_t op_ret = -1;
- int32_t op_errno = 0;
+ if (op_ret < 0) {
+ local->op_ret = -1;
+ local->op_errno = op_errno;
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
- VALIDATE_OR_GOTO (fd, out);
+ afr_read_txn_continue (frame, this, (long) cookie);
+ return 0;
+ }
- priv = this->private;
- children = priv->children;
+ AFR_STACK_UNWIND (readv, frame, op_ret, op_errno,
+ vector, count, buf, iobref, xdata);
+ return 0;
+}
- ALLOC_OR_GOTO (local, afr_local_t, out);
- frame->local = local;
+int
+afr_readv_wind (call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
- read_child = afr_read_child (this, fd->inode);
+ if (subvol == -1) {
+ AFR_STACK_UNWIND (readv, frame, local->op_ret, local->op_errno,
+ 0, 0, 0, 0, 0);
+ return 0;
+ }
- if (read_child >= 0) {
- call_child = read_child;
+ STACK_WIND_COOKIE (frame, afr_readv_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->readv,
+ local->fd, local->cont.readv.size,
+ local->cont.readv.offset, local->cont.readv.flags,
+ local->xdata_req);
+ return 0;
+}
- /*
- if read fails from the read child, we try
- all children starting with the first one
- */
- local->cont.readv.last_tried = -1;
- } else {
- call_child = afr_first_up_child (priv);
- if (call_child == -1) {
- op_errno = ENOTCONN;
- gf_log (this->name, GF_LOG_ERROR,
- "no child is up :(");
- goto out;
- }
+int
+afr_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t offset, uint32_t flags, dict_t *xdata)
+{
+ afr_local_t * local = NULL;
+ int32_t op_errno = 0;
- local->cont.readv.last_tried = call_child;
- }
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
- local->fd = fd_ref (fd);
+ local->op = GF_FOP_READ;
+ local->fd = fd_ref (fd);
+ local->cont.readv.size = size;
+ local->cont.readv.offset = offset;
+ local->cont.readv.flags = flags;
+ if (xdata)
+ local->xdata_req = dict_ref (xdata);
- local->cont.readv.size = size;
- local->cont.readv.offset = offset;
+ afr_fix_open (fd, this);
- STACK_WIND_COOKIE (frame, afr_readv_cbk,
- (void *) (long) call_child,
- children[call_child],
- children[call_child]->fops->readv,
- fd, size, offset);
+ afr_read_txn (frame, this, fd->inode, afr_readv_wind,
+ AFR_DATA_TRANSACTION);
- op_ret = 0;
+ return 0;
out:
- if (op_ret == -1) {
- AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL, 0, NULL,
- NULL);
- }
+ AFR_STACK_UNWIND(readv, frame, -1, op_errno, 0, 0, 0, 0, 0);
+
return 0;
}
diff --git a/xlators/cluster/afr/src/afr-inode-read.h b/xlators/cluster/afr/src/afr-inode-read.h
index 1127cb652..e4091a793 100644
--- a/xlators/cluster/afr/src/afr-inode-read.h
+++ b/xlators/cluster/afr/src/afr-inode-read.h
@@ -1,20 +1,11 @@
/*
- Copyright (c) 2007-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
#ifndef __INODE_READ_H__
@@ -22,26 +13,30 @@
int32_t
afr_access (call_frame_t *frame, xlator_t *this,
- loc_t *loc, int32_t mask);
+ loc_t *loc, int32_t mask, dict_t *xdata);
int32_t
afr_stat (call_frame_t *frame, xlator_t *this,
- loc_t *loc);
+ loc_t *loc, dict_t *xdata);
int32_t
afr_fstat (call_frame_t *frame, xlator_t *this,
- fd_t *fd);
+ fd_t *fd, dict_t *xdata);
int32_t
afr_readlink (call_frame_t *frame, xlator_t *this,
- loc_t *loc, size_t size);
+ loc_t *loc, size_t size, dict_t *xdata);
int32_t
afr_readv (call_frame_t *frame, xlator_t *this,
- fd_t *fd, size_t size, off_t offset);
+ fd_t *fd, size_t size, off_t offset, uint32_t flags, dict_t *xdata);
int32_t
afr_getxattr (call_frame_t *frame, xlator_t *this,
- loc_t *loc, const char *name);
+ loc_t *loc, const char *name, dict_t *xdata);
+
+int32_t
+afr_fgetxattr (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, const char *name, dict_t *xdata);
#endif /* __INODE_READ_H__ */
diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c
index 8b568a0ea..00e0d2676 100644
--- a/xlators/cluster/afr/src/afr-inode-write.c
+++ b/xlators/cluster/afr/src/afr-inode-write.c
@@ -1,20 +1,11 @@
/*
- Copyright (c) 2007-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
@@ -48,1965 +39,1713 @@
#include "afr-transaction.h"
-/* {{{ chmod */
-
-
-int
-afr_chmod_unwind (call_frame_t *frame, xlator_t *this)
+static void
+__afr_inode_write_finalize (call_frame_t *frame, xlator_t *this)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
- call_frame_t *main_frame = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int read_subvol = 0;
+ int i = 0;
local = frame->local;
- priv = this->private;
+ priv = this->private;
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame)
- main_frame = local->transaction.main_frame;
- local->transaction.main_frame = NULL;
+ if (local->inode) {
+ if (local->transaction.type == AFR_METADATA_TRANSACTION)
+ read_subvol = afr_metadata_subvol_get (local->inode, this,
+ NULL, NULL);
+ else
+ read_subvol = afr_data_subvol_get (local->inode, this,
+ NULL, NULL);
}
- UNLOCK (&frame->lock);
- if (main_frame) {
- local->cont.chmod.buf.st_ino = local->cont.chmod.ino;
- AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno,
- &local->cont.chmod.buf);
- }
- return 0;
-}
-
-
-int
-afr_chmod_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct stat *buf)
-{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
-
- int call_count = -1;
- int child_index = (long) cookie;
- int need_unwind = 0;
-
- local = frame->local;
- priv = this->private;
-
- LOCK (&frame->lock);
- {
- if (afr_fop_failed (op_ret, op_errno))
- afr_transaction_fop_failed (frame, this, child_index);
-
- if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- local->cont.chmod.buf = *buf;
- }
- local->success_count++;
+ local->op_ret = -1;
+ local->op_errno = afr_final_errno (local, priv);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].valid)
+ continue;
+ if (local->replies[i].op_ret < 0) {
+ afr_inode_read_subvol_reset (local->inode, this);
+ continue;
+ }
- if (local->success_count == priv->wait_count) {
- need_unwind = 1;
+ /* Order of checks in the compound conditional
+ below is important.
+
+ - Highest precedence: largest op_ret
+ - Next precendence: if all op_rets are equal, read subvol
+ - Least precedence: any succeeded subvol
+ */
+ if ((local->op_ret < local->replies[i].op_ret) ||
+ ((local->op_ret == local->replies[i].op_ret) &&
+ (i == read_subvol))) {
+
+ local->op_ret = local->replies[i].op_ret;
+ local->op_errno = local->replies[i].op_errno;
+
+ local->cont.inode_wfop.prebuf =
+ local->replies[i].prestat;
+ local->cont.inode_wfop.postbuf =
+ local->replies[i].poststat;
+
+ if (local->replies[i].xdata) {
+ if (local->xdata_rsp)
+ dict_unref (local->xdata_rsp);
+ local->xdata_rsp =
+ dict_ref (local->replies[i].xdata);
}
}
-
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
-
- if (need_unwind)
- afr_chmod_unwind (frame, this);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
}
-
- return 0;
}
-int
-afr_chmod_wind (call_frame_t *frame, xlator_t *this)
+static void
+__afr_inode_write_fill (call_frame_t *frame, xlator_t *this, int child_index,
+ int op_ret, int op_errno,
+ struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
-
- int i = 0;
- int call_count = -1;
+ afr_local_t *local = NULL;
- local = frame->local;
- priv = this->private;
+ local = frame->local;
- call_count = afr_up_children_count (priv->child_count, local->child_up);
+ local->replies[child_index].valid = 1;
+ local->replies[child_index].op_ret = op_ret;
+ local->replies[child_index].op_errno = op_errno;
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
+ if (op_ret >= 0) {
+ if (prebuf)
+ local->replies[child_index].prestat = *prebuf;
+ if (postbuf)
+ local->replies[child_index].poststat = *postbuf;
+ if (xdata)
+ local->replies[child_index].xdata = dict_ref (xdata);
+ } else {
+ afr_transaction_fop_failed (frame, this, child_index);
}
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_chmod_wind_cbk, (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->chmod,
- &local->loc,
- local->cont.chmod.mode);
-
- if (!--call_count)
- break;
- }
- }
-
- return 0;
+ return;
}
-int
-afr_chmod_done (call_frame_t *frame, xlator_t *this)
+static int
+__afr_inode_write_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
{
- afr_local_t * local = NULL;
+ afr_local_t *local = NULL;
+ int child_index = (long) cookie;
+ int call_count = -1;
- local = frame->local;
+ local = frame->local;
- local->transaction.unwind (frame, this);
+ LOCK (&frame->lock);
+ {
+ __afr_inode_write_fill (frame, this, child_index, op_ret,
+ op_errno, prebuf, postbuf, xdata);
+ }
+ UNLOCK (&frame->lock);
- AFR_STACK_DESTROY (frame);
-
- return 0;
-}
+ call_count = afr_frame_return (frame);
+ if (call_count == 0) {
+ __afr_inode_write_finalize (frame, this);
-int32_t
-afr_chmod (call_frame_t *frame, xlator_t *this,
- loc_t *loc, mode_t mode)
-{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t * transaction_frame = NULL;
-
- int ret = -1;
-
- int op_ret = -1;
- int op_errno = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
-
- priv = this->private;
-
- transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- gf_log (this->name, GF_LOG_ERROR,
- "out of memory :(");
- goto out;
- }
-
- ALLOC_OR_GOTO (local, afr_local_t, out);
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
+ if (afr_txn_nothing_failed (frame, this))
+ local->transaction.unwind (frame, this);
- transaction_frame->local = local;
+ local->transaction.resume (frame, this);
+ }
- local->cont.chmod.mode = mode;
- local->cont.chmod.ino = loc->inode->ino;
-
- local->transaction.fop = afr_chmod_wind;
- local->transaction.done = afr_chmod_done;
- local->transaction.unwind = afr_chmod_unwind;
-
- loc_copy (&local->loc, loc);
-
- local->transaction.main_frame = frame;
- local->transaction.start = 0;
- local->transaction.len = 0;
-
- afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
+ return 0;
+}
- op_ret = 0;
-out:
- if (op_ret == -1) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
- }
+/* {{{ writev */
- return 0;
+void
+afr_writev_copy_outvars (call_frame_t *src_frame, call_frame_t *dst_frame)
+{
+ afr_local_t *src_local = NULL;
+ afr_local_t *dst_local = NULL;
+
+ src_local = src_frame->local;
+ dst_local = dst_frame->local;
+
+ dst_local->op_ret = src_local->op_ret;
+ dst_local->op_errno = src_local->op_errno;
+ dst_local->cont.inode_wfop.prebuf = src_local->cont.inode_wfop.prebuf;
+ dst_local->cont.inode_wfop.postbuf = src_local->cont.inode_wfop.postbuf;
+ if (src_local->xdata_rsp)
+ dst_local->xdata_rsp = dict_ref (src_local->xdata_rsp);
}
-/* }}} */
-
+void
+afr_writev_unwind (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t * local = NULL;
+ local = frame->local;
+
+ AFR_STACK_UNWIND (writev, frame,
+ local->op_ret, local->op_errno,
+ &local->cont.inode_wfop.prebuf,
+ &local->cont.inode_wfop.postbuf,
+ local->xdata_rsp);
+}
-/* {{{ fchmod */
int
-afr_fchmod_unwind (call_frame_t *frame, xlator_t *this)
+afr_transaction_writev_unwind (call_frame_t *frame, xlator_t *this)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
- call_frame_t *main_frame = NULL;
-
- local = frame->local;
- priv = this->private;
+ call_frame_t *fop_frame = NULL;
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame)
- main_frame = local->transaction.main_frame;
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
+ fop_frame = afr_transaction_detach_fop_frame (frame);
- if (main_frame) {
- local->cont.fchmod.buf.st_ino = local->cont.fchmod.ino;
- AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno,
- &local->cont.fchmod.buf);
- }
- return 0;
+ if (fop_frame) {
+ afr_writev_copy_outvars (frame, fop_frame);
+ afr_writev_unwind (fop_frame, this);
+ }
+ return 0;
}
+static void
+afr_writev_handle_short_writes (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+
+ local = frame->local;
+ priv = this->private;
+ /*
+ * We already have the best case result of the writev calls staged
+ * as the return value. Any writev that returns some value less
+ * than the best case is now out of sync, so mark the fop as
+ * failed. Note that fops that have returned with errors have
+ * already been marked as failed.
+ */
+ for (i = 0; i < priv->child_count; i++) {
+ if ((!local->replies[i].valid) ||
+ (local->replies[i].op_ret == -1))
+ continue;
+
+ if (local->replies[i].op_ret < local->op_ret)
+ afr_transaction_fop_failed(frame, this, i);
+ }
+}
int
-afr_fchmod_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct stat *buf)
+afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
-
- int call_count = -1;
- int child_index = (long) cookie;
- int need_unwind = 0;
-
- local = frame->local;
- priv = this->private;
-
- LOCK (&frame->lock);
- {
- if (afr_fop_failed (op_ret, op_errno))
- afr_transaction_fop_failed (frame, this, child_index);
-
- if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- local->cont.fchmod.buf = *buf;
- }
- local->success_count++;
-
- if (local->success_count == priv->wait_count) {
- need_unwind = 1;
- }
+ afr_local_t * local = NULL;
+ call_frame_t *fop_frame = NULL;
+ int child_index = (long) cookie;
+ int call_count = -1;
+ int ret = 0;
+ uint32_t open_fd_count = 0;
+ uint32_t write_is_append = 0;
+
+ local = frame->local;
+
+ LOCK (&frame->lock);
+ {
+ __afr_inode_write_fill (frame, this, child_index, op_ret,
+ op_errno, prebuf, postbuf, xdata);
+ if (op_ret == -1 || !xdata)
+ goto unlock;
+
+ write_is_append = 0;
+ ret = dict_get_uint32 (xdata, GLUSTERFS_WRITE_IS_APPEND,
+ &write_is_append);
+ if (ret || !write_is_append)
+ local->append_write = _gf_false;
+
+ ret = dict_get_uint32 (xdata, GLUSTERFS_OPEN_FD_COUNT,
+ &open_fd_count);
+ if (ret == -1)
+ goto unlock;
+ if ((open_fd_count > local->open_fd_count)) {
+ local->open_fd_count = open_fd_count;
+ local->update_open_fd_count = _gf_true;
}
+ }
+unlock:
+ UNLOCK (&frame->lock);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0) {
+ if (!local->stable_write && !local->append_write)
+ /* An appended write removes the necessity to
+ fsync() the file. This is because self-heal
+ has the logic to check for larger file when
+ the xattrs are not reliably pointing at
+ a stale file.
+ */
+ afr_fd_report_unstable_write (this, local->fd);
+
+ __afr_inode_write_finalize (frame, this);
+
+ afr_writev_handle_short_writes (frame, this);
+
+ if (local->update_open_fd_count)
+ afr_handle_open_fd_count (frame, this);
+
+ if (!afr_txn_nothing_failed (frame, this)) {
+ //Don't unwind until post-op is complete
+ local->transaction.resume (frame, this);
+ } else {
+ /*
+ * Generally inode-write fops do transaction.unwind then
+ * transaction.resume, but writev needs to make sure that
+ * delayed post-op frame is placed in fdctx before unwind
+ * happens. This prevents the race of flush doing the
+ * changelog wakeup first in fuse thread and then this
+ * writev placing its delayed post-op frame in fdctx.
+ * This helps flush make sure all the delayed post-ops are
+ * completed.
+ */
+
+ fop_frame = afr_transaction_detach_fop_frame (frame);
+ afr_writev_copy_outvars (frame, fop_frame);
+ local->transaction.resume (frame, this);
+ afr_writev_unwind (fop_frame, this);
+ }
+ }
+ return 0;
+}
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
-
- if (need_unwind)
- afr_fchmod_unwind (frame, this);
-
- call_count = afr_frame_return (frame);
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- }
-
- return 0;
+int
+afr_writev_wind (call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ STACK_WIND_COOKIE (frame, afr_writev_wind_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->writev,
+ local->fd, local->cont.writev.vector,
+ local->cont.writev.count, local->cont.writev.offset,
+ local->cont.writev.flags, local->cont.writev.iobref,
+ local->xdata_req);
+ return 0;
}
int
-afr_fchmod_wind (call_frame_t *frame, xlator_t *this)
+afr_do_writev (call_frame_t *frame, xlator_t *this)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
-
- int i = 0;
- int call_count = -1;
+ call_frame_t *transaction_frame = NULL;
+ afr_local_t *local = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
- local = frame->local;
- priv = this->private;
+ transaction_frame = copy_frame (frame);
+ if (!transaction_frame)
+ goto out;
- call_count = afr_up_children_count (priv->child_count, local->child_up);
+ local = frame->local;
+ transaction_frame->local = local;
+ frame->local = NULL;
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
+ if (!AFR_FRAME_INIT (frame, op_errno))
+ goto out;
+
+ local->op = GF_FOP_WRITE;
+
+ local->transaction.wind = afr_writev_wind;
+ local->transaction.fop = __afr_txn_write_fop;
+ local->transaction.done = __afr_txn_write_done;
+ local->transaction.unwind = afr_transaction_writev_unwind;
+
+ local->transaction.main_frame = frame;
+
+ if (local->fd->flags & O_APPEND) {
+ /*
+ * Backend vfs ignores the 'offset' for append mode fd so
+ * locking just the region provided for the writev does not
+ * give consistency gurantee. The actual write may happen at a
+ * completely different range than the one provided by the
+ * offset, len in the fop. So lock the entire file.
+ */
+ local->transaction.start = 0;
+ local->transaction.len = 0;
+ } else {
+ local->transaction.start = local->cont.writev.offset;
+ local->transaction.len = iov_length (local->cont.writev.vector,
+ local->cont.writev.count);
+ }
+
+ ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_fchmod_wind_cbk, (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->fchmod,
- local->fd,
- local->cont.fchmod.mode);
-
- if (!--call_count)
- break;
- }
- }
-
return 0;
+out:
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+
+ AFR_STACK_UNWIND (writev, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
}
int
-afr_fchmod_done (call_frame_t *frame, xlator_t *this)
+afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ struct iovec *vector, int32_t count, off_t offset,
+ uint32_t flags, struct iobref *iobref, dict_t *xdata)
{
- afr_local_t * local = NULL;
-
- local = frame->local;
-
- local->transaction.unwind (frame, this);
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int op_errno = ENOMEM;
- AFR_STACK_DESTROY (frame);
-
- return 0;
-}
+ priv = this->private;
+ QUORUM_CHECK(writev,out);
-int32_t
-afr_fchmod (call_frame_t *frame, xlator_t *this,
- fd_t *fd, mode_t mode)
-{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t * transaction_frame = NULL;
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
- int ret = -1;
+ local->cont.writev.vector = iov_dup (vector, count);
+ if (!local->cont.writev.vector)
+ goto out;
+ local->cont.writev.count = count;
+ local->cont.writev.offset = offset;
+ local->cont.writev.flags = flags;
+ local->cont.writev.iobref = iobref_ref (iobref);
- int op_ret = -1;
- int op_errno = 0;
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref (xdata, NULL);
+ else
+ local->xdata_req = dict_new ();
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ if (!local->xdata_req)
+ goto out;
- priv = this->private;
+ local->fd = fd_ref (fd);
+ local->inode = inode_ref (fd->inode);
- transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- gf_log (this->name, GF_LOG_ERROR,
- "out of memory :(");
+ if (dict_set_uint32 (local->xdata_req, GLUSTERFS_OPEN_FD_COUNT, 4)) {
+ op_errno = ENOMEM;
goto out;
}
- ALLOC_OR_GOTO (local, afr_local_t, out);
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
+ if (dict_set_uint32 (local->xdata_req, GLUSTERFS_WRITE_IS_APPEND, 4)) {
+ op_errno = ENOMEM;
goto out;
}
- transaction_frame->local = local;
+ /* Set append_write to be true speculatively. If on any
+ server it turns not be true, we unset it in the
+ callback.
+ */
+ local->append_write = _gf_true;
- local->cont.fchmod.mode = mode;
- local->cont.fchmod.ino = fd->inode->ino;
+ /* detect here, but set it in writev_wind_cbk *after* the unstable
+ write is performed
+ */
+ local->stable_write = !!((fd->flags|flags)&(O_SYNC|O_DSYNC));
- local->transaction.fop = afr_fchmod_wind;
- local->transaction.done = afr_fchmod_done;
- local->transaction.unwind = afr_fchmod_unwind;
+ afr_fix_open (fd, this);
- local->fd = fd_ref (fd);
-
- local->transaction.main_frame = frame;
- local->transaction.start = 0;
- local->transaction.len = 0;
+ afr_do_writev (frame, this);
- afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
-
- op_ret = 0;
+ return 0;
out:
- if (op_ret == -1) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
- }
+ AFR_STACK_UNWIND (writev, frame, -1, op_errno, NULL, NULL, NULL);
- return 0;
+ return 0;
}
+
/* }}} */
-/* {{{ chown */
+/* {{{ truncate */
int
-afr_chown_unwind (call_frame_t *frame, xlator_t *this)
+afr_truncate_unwind (call_frame_t *frame, xlator_t *this)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
- call_frame_t *main_frame = NULL;
+ afr_local_t * local = NULL;
+ call_frame_t *main_frame = NULL;
- local = frame->local;
- priv = this->private;
+ local = frame->local;
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame)
- main_frame = local->transaction.main_frame;
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
+ main_frame = afr_transaction_detach_fop_frame (frame);
+ if (!main_frame)
+ return 0;
- if (main_frame) {
- local->cont.chown.buf.st_ino = local->cont.chown.ino;
- AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno,
- &local->cont.chown.buf);
- }
- return 0;
+ AFR_STACK_UNWIND (truncate, main_frame, local->op_ret, local->op_errno,
+ &local->cont.inode_wfop.prebuf,
+ &local->cont.inode_wfop.postbuf, local->xdata_rsp);
+ return 0;
}
int
-afr_chown_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct stat *buf)
+afr_truncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
-
- int call_count = -1;
- int child_index = (long) cookie;
- int need_unwind = 0;
+ afr_local_t *local = NULL;
local = frame->local;
- priv = this->private;
-
- LOCK (&frame->lock);
- {
- if (afr_fop_failed (op_ret, op_errno))
- afr_transaction_fop_failed (frame, this, child_index);
- if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- local->cont.chown.buf = *buf;
- }
- local->success_count++;
-
- if (local->success_count == priv->wait_count) {
- need_unwind = 1;
- }
- }
-
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
-
- if (need_unwind) {
- local->transaction.unwind (frame, this);
- }
+ if (op_ret == 0 && prebuf->ia_size != postbuf->ia_size)
+ local->stable_write = _gf_false;
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- }
-
- return 0;
+ return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno,
+ prebuf, postbuf, xdata);
}
int
-afr_chown_wind (call_frame_t *frame, xlator_t *this)
+afr_truncate_wind (call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
-
- int call_count = -1;
- int i = 0;
-
- local = frame->local;
- priv = this->private;
-
- call_count = afr_up_children_count (priv->child_count, local->child_up);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
-
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_chown_wind_cbk, (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->chown,
- &local->loc, local->cont.chown.uid,
- local->cont.chown.gid);
-
- if (!--call_count)
- break;
- }
- }
-
- return 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ STACK_WIND_COOKIE (frame, afr_truncate_wind_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->truncate,
+ &local->loc, local->cont.truncate.offset,
+ local->xdata_req);
+ return 0;
}
int
-afr_chown_done (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
-
- local = frame->local;
-
- local->transaction.unwind (frame, this);
-
- AFR_STACK_DESTROY (frame);
-
- return 0;
-}
-
-
-int
-afr_chown (call_frame_t *frame, xlator_t *this,
- loc_t *loc, uid_t uid, gid_t gid)
+afr_truncate (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, off_t offset, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t *transaction_frame = NULL;
-
+ afr_private_t * priv = NULL;
+ afr_local_t * local = NULL;
+ call_frame_t *transaction_frame = NULL;
int ret = -1;
+ int op_errno = ENOMEM;
- int op_ret = -1;
- int op_errno = 0;
+ priv = this->private;
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ QUORUM_CHECK(truncate,out);
- priv = this->private;
+ transaction_frame = copy_frame (frame);
+ if (!transaction_frame)
+ goto out;
- transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- gf_log (this->name, GF_LOG_ERROR,
- "out of memory :(");
+ local = AFR_FRAME_INIT (transaction_frame, op_errno);
+ if (!local)
goto out;
- }
- ALLOC_OR_GOTO (local, afr_local_t, out);
+ local->cont.truncate.offset = offset;
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref (xdata, NULL);
+ else
+ local->xdata_req = dict_new ();
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
+ if (!local->xdata_req)
goto out;
- }
- transaction_frame->local = local;
+ local->transaction.wind = afr_truncate_wind;
+ local->transaction.fop = __afr_txn_write_fop;
+ local->transaction.done = __afr_txn_write_done;
+ local->transaction.unwind = afr_truncate_unwind;
- local->cont.chown.uid = uid;
- local->cont.chown.gid = gid;
- local->cont.chown.ino = loc->inode->ino;
+ loc_copy (&local->loc, loc);
+ local->inode = inode_ref (loc->inode);
- local->transaction.fop = afr_chown_wind;
- local->transaction.done = afr_chown_done;
- local->transaction.unwind = afr_chown_unwind;
+ local->op = GF_FOP_TRUNCATE;
- loc_copy (&local->loc, loc);
+ local->transaction.main_frame = frame;
+ local->transaction.start = offset;
+ local->transaction.len = 0;
- local->transaction.main_frame = frame;
- local->transaction.start = 0;
- local->transaction.len = 0;
+ /* Set it true speculatively, will get reset in afr_truncate_wind_cbk
+ if truncate was not a NOP */
+ local->stable_write = _gf_true;
- afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
+ ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
- op_ret = 0;
+ return 0;
out:
- if (op_ret == -1) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
- }
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
- return 0;
+ AFR_STACK_UNWIND (truncate, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
}
/* }}} */
-/* {{{ chown */
-
-int
-afr_fchown_unwind (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
- call_frame_t *main_frame = NULL;
-
- local = frame->local;
- priv = this->private;
-
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame)
- main_frame = local->transaction.main_frame;
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
-
- if (main_frame) {
- local->cont.fchown.buf.st_ino = local->cont.fchown.ino;
- AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno,
- &local->cont.fchown.buf);
- }
- return 0;
-}
+/* {{{ ftruncate */
int
-afr_fchown_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct stat *buf)
+afr_ftruncate_unwind (call_frame_t *frame, xlator_t *this)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
-
- int call_count = -1;
- int child_index = (long) cookie;
- int need_unwind = 0;
-
- local = frame->local;
- priv = this->private;
-
- LOCK (&frame->lock);
- {
- if (afr_fop_failed (op_ret, op_errno))
- afr_transaction_fop_failed (frame, this, child_index);
-
- if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- local->cont.fchown.buf = *buf;
- }
- local->success_count++;
-
- if (local->success_count == priv->wait_count) {
- need_unwind = 1;
- }
- }
+ afr_local_t * local = NULL;
+ call_frame_t *main_frame = NULL;
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
+ local = frame->local;
- if (need_unwind) {
- local->transaction.unwind (frame, this);
- }
-
- call_count = afr_frame_return (frame);
+ main_frame = afr_transaction_detach_fop_frame (frame);
+ if (!main_frame)
+ return 0;
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- }
-
- return 0;
+ AFR_STACK_UNWIND (ftruncate, main_frame, local->op_ret, local->op_errno,
+ &local->cont.inode_wfop.prebuf,
+ &local->cont.inode_wfop.postbuf, local->xdata_rsp);
+ return 0;
}
int
-afr_fchown_wind (call_frame_t *frame, xlator_t *this)
+afr_ftruncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
-
- int call_count = -1;
- int i = 0;
+ afr_local_t *local = NULL;
local = frame->local;
- priv = this->private;
- call_count = afr_up_children_count (priv->child_count, local->child_up);
+ if (op_ret == 0 && prebuf->ia_size != postbuf->ia_size)
+ local->stable_write = _gf_false;
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
-
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_fchown_wind_cbk, (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->fchown,
- local->fd, local->cont.fchown.uid,
- local->cont.fchown.gid);
-
- if (!--call_count)
- break;
- }
- }
-
- return 0;
+ return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno,
+ prebuf, postbuf, xdata);
}
int
-afr_fchown_done (call_frame_t *frame, xlator_t *this)
+afr_ftruncate_wind (call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_local_t *local = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
local = frame->local;
+ priv = this->private;
- local->transaction.unwind (frame, this);
-
- AFR_STACK_DESTROY (frame);
-
- return 0;
+ STACK_WIND_COOKIE (frame, afr_ftruncate_wind_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->ftruncate,
+ local->fd, local->cont.ftruncate.offset,
+ local->xdata_req);
+ return 0;
}
int
-afr_fchown (call_frame_t *frame, xlator_t *this,
- fd_t *fd, uid_t uid, gid_t gid)
+afr_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t *transaction_frame = NULL;
-
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
int ret = -1;
+ int op_errno = ENOMEM;
- int op_ret = -1;
- int op_errno = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ priv = this->private;
- priv = this->private;
+ QUORUM_CHECK(ftruncate,out);
transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- gf_log (this->name, GF_LOG_ERROR,
- "out of memory :(");
+ if (!frame)
goto out;
- }
- ALLOC_OR_GOTO (local, afr_local_t, out);
+ local = AFR_FRAME_INIT (transaction_frame, op_errno);
+ if (!local)
+ goto out;
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
+ local->cont.ftruncate.offset = offset;
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref (xdata, NULL);
+ else
+ local->xdata_req = dict_new ();
+
+ if (!local->xdata_req)
goto out;
- }
- transaction_frame->local = local;
+ local->fd = fd_ref (fd);
+ local->inode = inode_ref (fd->inode);
- local->cont.fchown.uid = uid;
- local->cont.fchown.gid = gid;
- local->cont.fchown.ino = fd->inode->ino;
+ local->op = GF_FOP_FTRUNCATE;
- local->transaction.fop = afr_fchown_wind;
- local->transaction.done = afr_fchown_done;
- local->transaction.unwind = afr_fchown_unwind;
+ local->transaction.wind = afr_ftruncate_wind;
+ local->transaction.fop = __afr_txn_write_fop;
+ local->transaction.done = __afr_txn_write_done;
+ local->transaction.unwind = afr_ftruncate_unwind;
- local->fd = fd_ref (fd);
+ local->transaction.main_frame = frame;
- local->transaction.main_frame = frame;
- local->transaction.start = 0;
- local->transaction.len = 0;
+ local->transaction.start = local->cont.ftruncate.offset;
+ local->transaction.len = 0;
- afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
+ afr_fix_open (fd, this);
- op_ret = 0;
-out:
- if (op_ret == -1) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
- }
+ /* Set it true speculatively, will get reset in afr_ftruncate_wind_cbk
+ if truncate was not a NOP */
+ local->stable_write = _gf_true;
+
+ ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
return 0;
+out:
+ AFR_STACK_UNWIND (ftruncate, frame, -1, op_errno, NULL, NULL, NULL);
+
+ return 0;
}
/* }}} */
-/* {{{ writev */
+/* {{{ setattr */
int
-afr_writev_unwind (call_frame_t *frame, xlator_t *this)
+afr_setattr_unwind (call_frame_t *frame, xlator_t *this)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
- call_frame_t *main_frame = NULL;
+ afr_local_t *local = NULL;
+ call_frame_t *main_frame = NULL;
- local = frame->local;
- priv = this->private;
+ local = frame->local;
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame)
- main_frame = local->transaction.main_frame;
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
+ main_frame = afr_transaction_detach_fop_frame (frame);
+ if (!main_frame)
+ return 0;
- if (main_frame) {
- local->cont.writev.buf.st_ino = local->cont.writev.ino;
- AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno,
- &local->cont.writev.buf);
- }
- return 0;
+ AFR_STACK_UNWIND (setattr, main_frame, local->op_ret, local->op_errno,
+ &local->cont.inode_wfop.prebuf,
+ &local->cont.inode_wfop.postbuf,
+ local->xdata_rsp);
+ return 0;
}
int
-afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct stat *buf)
+afr_setattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno,
+ struct iatt *preop, struct iatt *postop, dict_t *xdata)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
+ return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno,
+ preop, postop, xdata);
+}
- int child_index = (long) cookie;
- int call_count = -1;
- int need_unwind = 0;
- local = frame->local;
- priv = this->private;
+int
+afr_setattr_wind (call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ STACK_WIND_COOKIE (frame, afr_setattr_wind_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->setattr,
+ &local->loc, &local->cont.setattr.in_buf,
+ local->cont.setattr.valid, local->xdata_req);
+ return 0;
+}
- LOCK (&frame->lock);
- {
- if (afr_fop_failed (op_ret, op_errno))
- afr_transaction_fop_failed (frame, this, child_index);
- if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- local->cont.writev.buf = *buf;
- }
- local->success_count++;
+int
+afr_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *buf,
+ int32_t valid, dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
- if (local->success_count == priv->wait_count) {
- need_unwind = 1;
- }
- }
+ priv = this->private;
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
+ QUORUM_CHECK(setattr,out);
- call_count = afr_frame_return (frame);
+ transaction_frame = copy_frame (frame);
+ if (!transaction_frame)
+ goto out;
- if (call_count == 0) {
- local->transaction.unwind (frame, this);
+ local = AFR_FRAME_INIT (transaction_frame, op_errno);
+ if (!local)
+ goto out;
- local->transaction.resume (frame, this);
- }
-
- return 0;
-}
+ local->cont.setattr.in_buf = *buf;
+ local->cont.setattr.valid = valid;
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref (xdata, NULL);
+ else
+ local->xdata_req = dict_new ();
+ if (!local->xdata_req)
+ goto out;
-int
-afr_writev_wind (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
-
- int i = 0;
- int call_count = -1;
+ local->transaction.wind = afr_setattr_wind;
+ local->transaction.fop = __afr_txn_write_fop;
+ local->transaction.done = __afr_txn_write_done;
+ local->transaction.unwind = afr_setattr_unwind;
- local = frame->local;
- priv = this->private;
+ loc_copy (&local->loc, loc);
+ local->inode = inode_ref (loc->inode);
- call_count = afr_up_children_count (priv->child_count, local->child_up);
+ local->op = GF_FOP_SETATTR;
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
+ local->transaction.main_frame = frame;
+ local->transaction.start = LLONG_MAX - 1;
+ local->transaction.len = 0;
+
+ ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_writev_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->writev,
- local->fd,
- local->cont.writev.vector,
- local->cont.writev.count,
- local->cont.writev.offset,
- local->cont.writev.iobref);
-
- if (!--call_count)
- break;
- }
- }
-
return 0;
+out:
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+
+ AFR_STACK_UNWIND (setattr, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
}
+/* {{{ fsetattr */
int
-afr_writev_done (call_frame_t *frame, xlator_t *this)
+afr_fsetattr_unwind (call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local = NULL;
+ afr_local_t * local = NULL;
+ call_frame_t *main_frame = NULL;
- local = frame->local;
+ local = frame->local;
- iobref_unref (local->cont.writev.iobref);
- local->cont.writev.iobref = NULL;
+ main_frame = afr_transaction_detach_fop_frame (frame);
+ if (!main_frame)
+ return 0;
- local->transaction.unwind (frame, this);
+ AFR_STACK_UNWIND (fsetattr, main_frame, local->op_ret, local->op_errno,
+ &local->cont.inode_wfop.prebuf,
+ &local->cont.inode_wfop.postbuf, local->xdata_rsp);
+ return 0;
+}
- AFR_STACK_DESTROY (frame);
- return 0;
+int
+afr_fsetattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ struct iatt *preop, struct iatt *postop, dict_t *xdata)
+{
+ return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno,
+ preop, postop, xdata);
}
int
-afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
- struct iovec *vector, int32_t count, off_t offset,
- struct iobref *iobref)
+afr_fsetattr_wind (call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t *transaction_frame = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ STACK_WIND_COOKIE (frame, afr_fsetattr_wind_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->fsetattr,
+ local->fd, &local->cont.fsetattr.in_buf,
+ local->cont.fsetattr.valid, local->xdata_req);
+ return 0;
+}
- int ret = -1;
- int op_ret = -1;
- int op_errno = 0;
+int
+afr_fsetattr (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, struct iatt *buf, int32_t valid, dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ priv = this->private;
- priv = this->private;
+ QUORUM_CHECK(fsetattr,out);
- transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- gf_log (this->name, GF_LOG_ERROR,
- "out of memory :(");
+ transaction_frame = copy_frame (frame);
+ if (!transaction_frame)
+ goto out;
+
+ local = AFR_FRAME_INIT (transaction_frame, op_errno);
+ if (!local)
goto out;
- }
- ALLOC_OR_GOTO (local, afr_local_t, out);
+ local->cont.fsetattr.in_buf = *buf;
+ local->cont.fsetattr.valid = valid;
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref (xdata, NULL);
+ else
+ local->xdata_req = dict_new ();
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
+ if (!local->xdata_req)
goto out;
- }
- transaction_frame->local = local;
+ local->transaction.wind = afr_fsetattr_wind;
+ local->transaction.fop = __afr_txn_write_fop;
+ local->transaction.done = __afr_txn_write_done;
+ local->transaction.unwind = afr_fsetattr_unwind;
- local->op = GF_FOP_WRITE;
- local->cont.writev.vector = iov_dup (vector, count);
- local->cont.writev.count = count;
- local->cont.writev.offset = offset;
- local->cont.writev.ino = fd->inode->ino;
- local->cont.writev.iobref = iobref_ref (iobref);
+ local->fd = fd_ref (fd);
+ local->inode = inode_ref (fd->inode);
- local->transaction.fop = afr_writev_wind;
- local->transaction.done = afr_writev_done;
- local->transaction.unwind = afr_writev_unwind;
+ local->op = GF_FOP_FSETATTR;
- local->fd = fd_ref (fd);
+ afr_fix_open (fd, this);
- local->transaction.main_frame = frame;
- if (fd->flags & O_APPEND) {
- local->transaction.start = 0;
- local->transaction.len = 0;
- } else {
- local->transaction.start = offset;
- local->transaction.len = iov_length (vector, count);
- }
+ local->transaction.main_frame = frame;
+ local->transaction.start = LLONG_MAX - 1;
+ local->transaction.len = 0;
- afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION);
+ ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
- op_ret = 0;
+ return 0;
out:
- if (op_ret == -1) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
- }
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
- return 0;
+ AFR_STACK_UNWIND (fsetattr, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
}
-/* }}} */
+/* {{{ setxattr */
-/* {{{ truncate */
int
-afr_truncate_unwind (call_frame_t *frame, xlator_t *this)
+afr_setxattr_unwind (call_frame_t *frame, xlator_t *this)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
- call_frame_t *main_frame = NULL;
+ afr_local_t * local = NULL;
+ call_frame_t *main_frame = NULL;
- local = frame->local;
- priv = this->private;
+ local = frame->local;
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame)
- main_frame = local->transaction.main_frame;
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
+ main_frame = afr_transaction_detach_fop_frame (frame);
+ if (!main_frame)
+ return 0;
- if (main_frame) {
- local->cont.truncate.buf.st_ino = local->cont.truncate.ino;
- AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno,
- &local->cont.truncate.buf);
- }
- return 0;
+ AFR_STACK_UNWIND (setxattr, main_frame, local->op_ret, local->op_errno,
+ local->xdata_rsp);
+ return 0;
}
int
-afr_truncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct stat *buf)
+afr_setxattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
-
- int child_index = (long) cookie;
- int call_count = -1;
- int need_unwind = 0;
-
- local = frame->local;
- priv = this->private;
-
- LOCK (&frame->lock);
- {
- if (afr_fop_failed (op_ret, op_errno))
- afr_transaction_fop_failed (frame, this, child_index);
-
- if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- local->cont.truncate.buf = *buf;
- }
- local->success_count++;
-
- if (local->success_count == priv->wait_count) {
- need_unwind = 1;
- }
- }
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
-
- if (need_unwind)
- local->transaction.unwind (frame, this);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- }
-
- return 0;
+ return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno,
+ NULL, NULL, xdata);
}
-int32_t
-afr_truncate_wind (call_frame_t *frame, xlator_t *this)
+int
+afr_setxattr_wind (call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
-
- int call_count = -1;
- int i = 0;
-
- local = frame->local;
- priv = this->private;
-
- call_count = afr_up_children_count (priv->child_count, local->child_up);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
-
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_truncate_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->truncate,
- &local->loc,
- local->cont.truncate.offset);
-
- if (!--call_count)
- break;
- }
- }
-
- return 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ STACK_WIND_COOKIE (frame, afr_setxattr_wind_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->setxattr,
+ &local->loc, local->cont.setxattr.dict,
+ local->cont.setxattr.flags, local->xdata_req);
+ return 0;
}
int
-afr_truncate_done (call_frame_t *frame, xlator_t *this)
+afr_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
+ int32_t flags, dict_t *xdata)
{
- afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = EINVAL;
- local = frame->local;
+ GF_IF_INTERNAL_XATTR_GOTO ("trusted.afr.*", dict,
+ op_errno, out);
- local->transaction.unwind (frame, this);
+ GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.afr.*", dict,
+ op_errno, out);
- AFR_STACK_DESTROY (frame);
+ priv = this->private;
- return 0;
-}
+ QUORUM_CHECK(setxattr,out);
+ transaction_frame = copy_frame (frame);
+ if (!transaction_frame)
+ goto out;
-int
-afr_truncate (call_frame_t *frame, xlator_t *this,
- loc_t *loc, off_t offset)
-{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t *transaction_frame = NULL;
+ local = AFR_FRAME_INIT (transaction_frame, op_errno);
+ if (!local)
+ goto out;
- int ret = -1;
+ local->cont.setxattr.dict = dict_ref (dict);
+ local->cont.setxattr.flags = flags;
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref (xdata, NULL);
+ else
+ local->xdata_req = dict_new ();
- int op_ret = -1;
- int op_errno = 0;
+ if (!local->xdata_req)
+ goto out;
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ local->transaction.wind = afr_setxattr_wind;
+ local->transaction.fop = __afr_txn_write_fop;
+ local->transaction.done = __afr_txn_write_done;
+ local->transaction.unwind = afr_setxattr_unwind;
- priv = this->private;
+ loc_copy (&local->loc, loc);
+ local->inode = inode_ref (loc->inode);
- transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- gf_log (this->name, GF_LOG_ERROR,
- "out of memory :(");
- goto out;
- }
+ local->transaction.main_frame = frame;
+ local->transaction.start = LLONG_MAX - 1;
+ local->transaction.len = 0;
- ALLOC_OR_GOTO (local, afr_local_t, out);
+ local->op = GF_FOP_SETXATTR;
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
+ ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
+ if (ret < 0) {
op_errno = -ret;
goto out;
- }
-
- transaction_frame->local = local;
-
- local->op_ret = -1;
-
- local->cont.truncate.offset = offset;
- local->cont.truncate.ino = loc->inode->ino;
-
- local->transaction.fop = afr_truncate_wind;
- local->transaction.done = afr_truncate_done;
- local->transaction.unwind = afr_truncate_unwind;
-
- loc_copy (&local->loc, loc);
-
- local->transaction.main_frame = frame;
- local->transaction.start = 0;
- local->transaction.len = offset;
-
- afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION);
-
- op_ret = 0;
-out:
- if (op_ret == -1) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
- }
+ }
return 0;
-}
+out:
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+ AFR_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL);
-/* }}} */
+ return 0;
+}
-/* {{{ ftruncate */
+/* {{{ fsetxattr */
int
-afr_ftruncate_unwind (call_frame_t *frame, xlator_t *this)
+afr_fsetxattr_unwind (call_frame_t *frame, xlator_t *this)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
- call_frame_t *main_frame = NULL;
+ afr_local_t *local = NULL;
+ call_frame_t *main_frame = NULL;
- local = frame->local;
- priv = this->private;
+ local = frame->local;
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame)
- main_frame = local->transaction.main_frame;
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
+ main_frame = afr_transaction_detach_fop_frame (frame);
+ if (!main_frame)
+ return 0;
- if (main_frame) {
- local->cont.ftruncate.buf.st_ino = local->cont.ftruncate.ino;
- AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno,
- &local->cont.ftruncate.buf);
- }
- return 0;
+ AFR_STACK_UNWIND (fsetxattr, main_frame, local->op_ret, local->op_errno,
+ local->xdata_rsp);
+ return 0;
}
int
-afr_ftruncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct stat *buf)
+afr_fsetxattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
-
- int child_index = (long) cookie;
- int call_count = -1;
- int need_unwind = 0;
-
- local = frame->local;
- priv = this->private;
-
- LOCK (&frame->lock);
- {
- if (afr_fop_failed (op_ret, op_errno))
- afr_transaction_fop_failed (frame, this, child_index);
-
- if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- local->cont.ftruncate.buf = *buf;
- }
- local->success_count++;
-
- if (local->success_count == priv->wait_count) {
- need_unwind = 1;
- }
- }
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
-
- if (need_unwind)
- local->transaction.unwind (frame, this);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- }
-
- return 0;
+ return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno,
+ NULL, NULL, xdata);
}
int
-afr_ftruncate_wind (call_frame_t *frame, xlator_t *this)
+afr_fsetxattr_wind (call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
-
- int call_count = -1;
- int i = 0;
-
- local = frame->local;
- priv = this->private;
-
- call_count = afr_up_children_count (priv->child_count, local->child_up);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
-
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_ftruncate_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->ftruncate,
- local->fd, local->cont.ftruncate.offset);
-
- if (!--call_count)
- break;
- }
- }
-
- return 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ STACK_WIND_COOKIE (frame, afr_fsetxattr_wind_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->fsetxattr,
+ local->fd, local->cont.fsetxattr.dict,
+ local->cont.fsetxattr.flags, local->xdata_req);
+ return 0;
}
int
-afr_ftruncate_done (call_frame_t *frame, xlator_t *this)
+afr_fsetxattr (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, dict_t *dict, int32_t flags, dict_t *xdata)
{
- afr_local_t *local = NULL;
-
- local = frame->local;
-
- local->transaction.unwind (frame, this);
-
- AFR_STACK_DESTROY (frame);
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
- return 0;
-}
-
-
-int
-afr_ftruncate (call_frame_t *frame, xlator_t *this,
- fd_t *fd, off_t offset)
-{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t *transaction_frame = NULL;
+ GF_IF_INTERNAL_XATTR_GOTO ("trusted.afr.*", dict,
+ op_errno, out);
- int ret = -1;
+ GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.afr.*", dict,
+ op_errno, out);
- int op_ret = -1;
- int op_errno = 0;
+ priv = this->private;
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ QUORUM_CHECK(fsetxattr,out);
- priv = this->private;
+ transaction_frame = copy_frame (frame);
+ if (!transaction_frame)
+ goto out;
- transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- gf_log (this->name, GF_LOG_ERROR,
- "out of memory :(");
+ local = AFR_FRAME_INIT (transaction_frame, op_errno);
+ if (!local)
goto out;
- }
- ALLOC_OR_GOTO (local, afr_local_t, out);
+ local->cont.fsetxattr.dict = dict_ref (dict);
+ local->cont.fsetxattr.flags = flags;
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref (xdata, NULL);
+ else
+ local->xdata_req = dict_new ();
- transaction_frame->local = local;
-
- local->op = GF_FOP_FTRUNCATE;
- local->op_ret = -1;
+ if (!local->xdata_req)
+ goto out;
- local->cont.ftruncate.offset = offset;
- local->cont.ftruncate.ino = fd->inode->ino;
+ local->transaction.wind = afr_fsetxattr_wind;
+ local->transaction.fop = __afr_txn_write_fop;
+ local->transaction.done = __afr_txn_write_done;
+ local->transaction.unwind = afr_fsetxattr_unwind;
- local->transaction.fop = afr_ftruncate_wind;
- local->transaction.done = afr_ftruncate_done;
- local->transaction.unwind = afr_ftruncate_unwind;
+ local->fd = fd_ref (fd);
+ local->inode = inode_ref (fd->inode);
- local->fd = fd_ref (fd);
+ local->op = GF_FOP_FSETXATTR;
- local->transaction.main_frame = frame;
- local->transaction.start = 0;
- local->transaction.len = offset;
+ local->transaction.main_frame = frame;
+ local->transaction.start = LLONG_MAX - 1;
+ local->transaction.len = 0;
- afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION);
+ ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
- op_ret = 0;
+ return 0;
out:
- if (op_ret == -1) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
- }
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
- return 0;
+ AFR_STACK_UNWIND (fsetxattr, frame, -1, op_errno, NULL);
+ return 0;
}
/* }}} */
-/* {{{ utimens */
+
+/* {{{ removexattr */
int
-afr_utimens_unwind (call_frame_t *frame, xlator_t *this)
+afr_removexattr_unwind (call_frame_t *frame, xlator_t *this)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
- call_frame_t *main_frame = NULL;
+ afr_local_t * local = NULL;
+ call_frame_t *main_frame = NULL;
- local = frame->local;
- priv = this->private;
+ local = frame->local;
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame)
- main_frame = local->transaction.main_frame;
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
+ main_frame = afr_transaction_detach_fop_frame (frame);
+ if (!main_frame)
+ return 0;
- if (main_frame) {
- local->cont.utimens.buf.st_ino = local->cont.utimens.ino;
- AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno,
- &local->cont.utimens.buf);
- }
- return 0;
+ AFR_STACK_UNWIND (removexattr, main_frame, local->op_ret, local->op_errno,
+ local->xdata_rsp);
+ return 0;
}
int
-afr_utimens_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct stat *buf)
+afr_removexattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
-
- int child_index = (long) cookie;
- int call_count = -1;
- int need_unwind = 1;
-
- local = frame->local;
- priv = this->private;
-
- LOCK (&frame->lock);
- {
- if (afr_fop_failed (op_ret, op_errno))
- afr_transaction_fop_failed (frame, this, child_index);
-
- if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- local->cont.utimens.buf = *buf;
- }
- local->success_count++;
-
- if (local->success_count == priv->wait_count) {
- need_unwind = 1;
- }
- }
-
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
-
- if (need_unwind)
- local->transaction.unwind (frame, this);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- }
-
- return 0;
+ return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno,
+ NULL, NULL, xdata);
}
int
-afr_utimens_wind (call_frame_t *frame, xlator_t *this)
+afr_removexattr_wind (call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
-
- int call_count = -1;
- int i = 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
local = frame->local;
priv = this->private;
- call_count = afr_up_children_count (priv->child_count, local->child_up);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
-
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_utimens_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->utimens,
- &local->loc,
- local->cont.utimens.tv);
-
- if (!--call_count)
- break;
- }
- }
-
- return 0;
+ STACK_WIND_COOKIE (frame, afr_removexattr_wind_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->removexattr,
+ &local->loc, local->cont.removexattr.name,
+ local->xdata_req);
+ return 0;
}
int
-afr_utimens_done (call_frame_t *frame, xlator_t *this)
+afr_removexattr (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, const char *name, dict_t *xdata)
{
- afr_local_t * local = NULL;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
- local = frame->local;
+ GF_IF_NATIVE_XATTR_GOTO ("trusted.afr.*",
+ name, op_errno, out);
- local->transaction.unwind (frame, this);
+ GF_IF_NATIVE_XATTR_GOTO ("trusted.glusterfs.afr.*",
+ name, op_errno, out);
- AFR_STACK_DESTROY (frame);
+ priv = this->private;
- return 0;
-}
+ QUORUM_CHECK(removexattr,out);
+ transaction_frame = copy_frame (frame);
+ if (!transaction_frame)
+ goto out;
-int
-afr_utimens (call_frame_t *frame, xlator_t *this,
- loc_t *loc, struct timespec tv[2])
-{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t *transaction_frame = NULL;
+ local = AFR_FRAME_INIT (transaction_frame, op_errno);
+ if (!local)
+ goto out;
- int ret = -1;
+ local->cont.removexattr.name = gf_strdup (name);
- int op_ret = -1;
- int op_errno = 0;
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref (xdata, NULL);
+ else
+ local->xdata_req = dict_new ();
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ if (!local->xdata_req)
+ goto out;
- priv = this->private;
+ local->transaction.wind = afr_removexattr_wind;
+ local->transaction.fop = __afr_txn_write_fop;
+ local->transaction.done = __afr_txn_write_done;
+ local->transaction.unwind = afr_removexattr_unwind;
- transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- gf_log (this->name, GF_LOG_ERROR,
- "out of memory :(");
- goto out;
- }
+ loc_copy (&local->loc, loc);
+ local->inode = inode_ref (loc->inode);
+
+ local->op = GF_FOP_REMOVEXATTR;
- ALLOC_OR_GOTO (local, afr_local_t, out);
+ local->transaction.main_frame = frame;
+ local->transaction.start = LLONG_MAX - 1;
+ local->transaction.len = 0;
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
+ ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
+ if (ret < 0) {
op_errno = -ret;
goto out;
- }
+ }
- transaction_frame->local = local;
-
- local->op_ret = -1;
+ return 0;
+out:
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
- local->cont.utimens.tv[0] = tv[0];
- local->cont.utimens.tv[1] = tv[1];
+ AFR_STACK_UNWIND (removexattr, frame, -1, op_errno, NULL);
+ return 0;
+}
- local->cont.utimens.ino = loc->inode->ino;
+/* ffremovexattr */
+int
+afr_fremovexattr_unwind (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t * local = NULL;
+ call_frame_t *main_frame = NULL;
- local->transaction.fop = afr_utimens_wind;
- local->transaction.done = afr_utimens_done;
- local->transaction.unwind = afr_utimens_unwind;
+ local = frame->local;
- loc_copy (&local->loc, loc);
-
- local->transaction.main_frame = frame;
- local->transaction.start = 0;
- local->transaction.len = 0;
+ main_frame = afr_transaction_detach_fop_frame (frame);
+ if (!main_frame)
+ return 0;
- afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
+ AFR_STACK_UNWIND (fremovexattr, main_frame, local->op_ret, local->op_errno,
+ local->xdata_rsp);
+ return 0;
+}
- op_ret = 0;
-out:
- if (op_ret == -1) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
- }
- return 0;
+int
+afr_fremovexattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno,
+ NULL, NULL, xdata);
}
-/* }}} */
-/* {{{ setxattr */
+int
+afr_fremovexattr_wind (call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ STACK_WIND_COOKIE (frame, afr_fremovexattr_wind_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->fremovexattr,
+ local->fd, local->cont.removexattr.name,
+ local->xdata_req);
+ return 0;
+}
int
-afr_setxattr_unwind (call_frame_t *frame, xlator_t *this)
+afr_fremovexattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ const char *name, dict_t *xdata)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
- call_frame_t *main_frame = NULL;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
- local = frame->local;
- priv = this->private;
+ GF_IF_NATIVE_XATTR_GOTO ("trusted.afr.*",
+ name, op_errno, out);
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame)
- main_frame = local->transaction.main_frame;
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
+ GF_IF_NATIVE_XATTR_GOTO ("trusted.glusterfs.afr.*",
+ name, op_errno, out);
- if (main_frame) {
- AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno)
- }
- return 0;
-}
+ priv = this->private;
+ QUORUM_CHECK(fremovexattr, out);
-int
-afr_setxattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
-{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
+ transaction_frame = copy_frame (frame);
+ if (!transaction_frame)
+ goto out;
- int call_count = -1;
- int need_unwind = 0;
+ local = AFR_FRAME_INIT (transaction_frame, op_errno);
+ if (!local)
+ goto out;
- local = frame->local;
- priv = this->private;
+ local->cont.removexattr.name = gf_strdup (name);
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref (xdata, NULL);
+ else
+ local->xdata_req = dict_new ();
- LOCK (&frame->lock);
- {
- if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- }
- local->success_count++;
+ if (!local->xdata_req)
+ goto out;
- if (local->success_count == priv->wait_count) {
- need_unwind = 1;
- }
- }
+ local->transaction.wind = afr_fremovexattr_wind;
+ local->transaction.fop = __afr_txn_write_fop;
+ local->transaction.done = __afr_txn_write_done;
+ local->transaction.unwind = afr_fremovexattr_unwind;
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
+ local->fd = fd_ref (fd);
+ local->inode = inode_ref (fd->inode);
- if (need_unwind)
- local->transaction.unwind (frame, this);
+ local->op = GF_FOP_FREMOVEXATTR;
- call_count = afr_frame_return (frame);
+ local->transaction.main_frame = frame;
+ local->transaction.start = LLONG_MAX - 1;
+ local->transaction.len = 0;
+
+ ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- }
-
return 0;
+out:
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+
+ AFR_STACK_UNWIND (fremovexattr, frame, -1, op_errno, NULL);
+
+ return 0;
}
int
-afr_setxattr_wind (call_frame_t *frame, xlator_t *this)
+afr_fallocate_unwind (call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
-
- int call_count = -1;
- int i = 0;
+ afr_local_t * local = NULL;
+ call_frame_t *main_frame = NULL;
- local = frame->local;
- priv = this->private;
-
- call_count = afr_up_children_count (priv->child_count, local->child_up);
+ local = frame->local;
- if (call_count == 0) {
- local->transaction.resume (frame, this);
+ main_frame = afr_transaction_detach_fop_frame (frame);
+ if (!main_frame)
return 0;
- }
-
- local->call_count = call_count;
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_setxattr_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->setxattr,
- &local->loc,
- local->cont.setxattr.dict,
- local->cont.setxattr.flags);
-
- if (!--call_count)
- break;
- }
- }
-
- return 0;
+ AFR_STACK_UNWIND (fallocate, main_frame, local->op_ret, local->op_errno,
+ &local->cont.inode_wfop.prebuf,
+ &local->cont.inode_wfop.postbuf, local->xdata_rsp);
+ return 0;
}
int
-afr_setxattr_done (call_frame_t *frame, xlator_t *this)
+afr_fallocate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
{
- afr_local_t * local = frame->local;
+ return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno,
+ prebuf, postbuf, xdata);
+}
- local->transaction.unwind (frame, this);
- AFR_STACK_DESTROY (frame);
-
- return 0;
+int
+afr_fallocate_wind (call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ STACK_WIND_COOKIE (frame, afr_fallocate_wind_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->fallocate,
+ local->fd, local->cont.fallocate.mode,
+ local->cont.fallocate.offset,
+ local->cont.fallocate.len, local->xdata_req);
+ return 0;
}
int
-afr_setxattr (call_frame_t *frame, xlator_t *this,
- loc_t *loc, dict_t *dict, int32_t flags)
+afr_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
+ off_t offset, size_t len, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t *transaction_frame = NULL;
-
- int ret = -1;
+ afr_private_t *priv = NULL;
+ call_frame_t *transaction_frame = NULL;
+ afr_local_t *local = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
- int op_ret = -1;
- int op_errno = 0;
+ priv = this->private;
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ QUORUM_CHECK(fallocate,out);
- priv = this->private;
+ transaction_frame = copy_frame (frame);
+ if (!transaction_frame)
+ goto out;
- transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- gf_log (this->name, GF_LOG_ERROR,
- "out of memory :(");
+ local = AFR_FRAME_INIT (transaction_frame, op_errno);
+ if (!local)
goto out;
- }
- ALLOC_OR_GOTO (local, afr_local_t, out);
+ local->cont.fallocate.mode = mode;
+ local->cont.fallocate.offset = offset;
+ local->cont.fallocate.len = len;
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
+ local->fd = fd_ref (fd);
+ local->inode = inode_ref (fd->inode);
- transaction_frame->local = local;
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref (xdata, NULL);
+ else
+ local->xdata_req = dict_new ();
- local->op_ret = -1;
+ if (!local->xdata_req)
+ goto out;
- local->cont.setxattr.dict = dict_ref (dict);
- local->cont.setxattr.flags = flags;
+ local->op = GF_FOP_FALLOCATE;
- local->transaction.fop = afr_setxattr_wind;
- local->transaction.done = afr_setxattr_done;
- local->transaction.unwind = afr_setxattr_unwind;
+ local->transaction.wind = afr_fallocate_wind;
+ local->transaction.fop = __afr_txn_write_fop;
+ local->transaction.done = __afr_txn_write_done;
+ local->transaction.unwind = afr_fallocate_unwind;
- loc_copy (&local->loc, loc);
+ local->transaction.main_frame = frame;
- local->transaction.main_frame = frame;
- local->transaction.start = 0;
- local->transaction.len = 0;
+ local->transaction.start = local->cont.fallocate.offset;
+ local->transaction.len = 0;
- afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
+ afr_fix_open (fd, this);
- op_ret = 0;
-out:
- if (op_ret == -1) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (frame, op_ret, op_errno);
- }
+ ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
return 0;
+out:
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+
+ AFR_STACK_UNWIND (fallocate, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
}
-/* }}} */
-/* {{{ removexattr */
+/* }}} */
+/* {{{ discard */
int
-afr_removexattr_unwind (call_frame_t *frame, xlator_t *this)
+afr_discard_unwind (call_frame_t *frame, xlator_t *this)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
- call_frame_t *main_frame = NULL;
+ afr_local_t * local = NULL;
+ call_frame_t *main_frame = NULL;
- local = frame->local;
- priv = this->private;
+ local = frame->local;
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame)
- main_frame = local->transaction.main_frame;
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
+ main_frame = afr_transaction_detach_fop_frame (frame);
+ if (!main_frame)
+ return 0;
- if (main_frame) {
- AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno)
- }
- return 0;
+ AFR_STACK_UNWIND (discard, main_frame, local->op_ret, local->op_errno,
+ &local->cont.inode_wfop.prebuf,
+ &local->cont.inode_wfop.postbuf, local->xdata_rsp);
+ return 0;
}
int
-afr_removexattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
+afr_discard_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
+ return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno,
+ prebuf, postbuf, xdata);
+}
- int call_count = -1;
- int need_unwind = 0;
- local = frame->local;
- priv = this->private;
+int
+afr_discard_wind (call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ STACK_WIND_COOKIE (frame, afr_discard_wind_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->discard,
+ local->fd, local->cont.discard.offset,
+ local->cont.discard.len, local->xdata_req);
+ return 0;
+}
- LOCK (&frame->lock);
- {
- if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- }
- local->success_count++;
- if (local->success_count == priv->wait_count) {
- need_unwind = 1;
- }
- }
+int
+afr_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
+ priv = this->private;
- if (need_unwind)
- local->transaction.unwind (frame, this);
+ QUORUM_CHECK(discard, out);
- call_count = afr_frame_return (frame);
+ transaction_frame = copy_frame (frame);
+ if (!transaction_frame)
+ goto out;
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- }
-
- return 0;
-}
+ local = AFR_FRAME_INIT (transaction_frame, op_errno);
+ if (!local)
+ goto out;
+ local->cont.discard.offset = offset;
+ local->cont.discard.len = len;
-int32_t
-afr_removexattr_wind (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
+ local->fd = fd_ref (fd);
+ local->inode = inode_ref (fd->inode);
- int call_count = -1;
- int i = 0;
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref (xdata, NULL);
+ else
+ local->xdata_req = dict_new ();
- local = frame->local;
- priv = this->private;
+ if (!local->xdata_req)
+ goto out;
- call_count = afr_up_children_count (priv->child_count, local->child_up);
+ local->op = GF_FOP_DISCARD;
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
+ local->transaction.wind = afr_discard_wind;
+ local->transaction.fop = __afr_txn_write_fop;
+ local->transaction.done = __afr_txn_write_done;
+ local->transaction.unwind = afr_discard_unwind;
- local->call_count = call_count;
+ local->transaction.main_frame = frame;
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_removexattr_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->removexattr,
- &local->loc,
- local->cont.removexattr.name);
+ local->transaction.start = local->cont.discard.offset;
+ local->transaction.len = 0;
+
+ afr_fix_open (fd, this);
+
+ ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
- if (!--call_count)
- break;
- }
- }
-
return 0;
+out:
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+
+ AFR_STACK_UNWIND (discard, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
}
+/* {{{ zerofill */
+
int
-afr_removexattr_done (call_frame_t *frame, xlator_t *this)
+afr_zerofill_unwind (call_frame_t *frame, xlator_t *this)
{
- afr_local_t * local = frame->local;
+ afr_local_t * local = NULL;
+ call_frame_t *main_frame = NULL;
- local->transaction.unwind (frame, this);
+ local = frame->local;
- AFR_STACK_DESTROY (frame);
-
- return 0;
+ main_frame = afr_transaction_detach_fop_frame (frame);
+ if (!main_frame)
+ return 0;
+
+ AFR_STACK_UNWIND (discard, main_frame, local->op_ret, local->op_errno,
+ &local->cont.inode_wfop.prebuf,
+ &local->cont.inode_wfop.postbuf, local->xdata_rsp);
+ return 0;
}
int
-afr_removexattr (call_frame_t *frame, xlator_t *this,
- loc_t *loc, const char *name)
+afr_zerofill_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t *transaction_frame = NULL;
+ return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno,
+ prebuf, postbuf, xdata);
+}
- int ret = -1;
- int op_ret = -1;
- int op_errno = 0;
+int
+afr_zerofill_wind (call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ STACK_WIND_COOKIE (frame, afr_zerofill_wind_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->zerofill,
+ local->fd, local->cont.zerofill.offset,
+ local->cont.zerofill.len, local->xdata_req);
+ return 0;
+}
+
+int
+afr_zerofill (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
- VALIDATE_OR_GOTO (loc, out);
+ priv = this->private;
- priv = this->private;
+ QUORUM_CHECK(discard, out);
transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- gf_log (this->name, GF_LOG_ERROR,
- "out of memory :(");
+ if (!transaction_frame)
goto out;
- }
- ALLOC_OR_GOTO (local, afr_local_t, out);
-
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
+ local = AFR_FRAME_INIT (transaction_frame, op_errno);
+ if (!local)
goto out;
- }
- transaction_frame->local = local;
+ local->cont.zerofill.offset = offset;
+ local->cont.zerofill.len = len;
- local->op_ret = -1;
+ local->fd = fd_ref (fd);
+ local->inode = inode_ref (fd->inode);
+
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref (xdata, NULL);
+ else
+ local->xdata_req = dict_new ();
- local->cont.removexattr.name = strdup (name);
+ if (!local->xdata_req)
+ goto out;
- local->transaction.fop = afr_removexattr_wind;
- local->transaction.done = afr_removexattr_done;
- local->transaction.unwind = afr_removexattr_unwind;
+ local->op = GF_FOP_ZEROFILL;
- loc_copy (&local->loc, loc);
+ local->transaction.wind = afr_zerofill_wind;
+ local->transaction.fop = __afr_txn_write_fop;
+ local->transaction.done = __afr_txn_write_done;
+ local->transaction.unwind = afr_zerofill_unwind;
- local->transaction.main_frame = frame;
- local->transaction.start = 0;
- local->transaction.len = 0;
+ local->transaction.main_frame = frame;
- afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
+ local->transaction.start = local->cont.discard.offset;
+ local->transaction.len = len;
- op_ret = 0;
-out:
- if (op_ret == -1) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (frame, op_ret, op_errno);
- }
+ afr_fix_open (fd, this);
+
+ ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
return 0;
+out:
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+
+ AFR_STACK_UNWIND (zerofill, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
}
+
+/* }}} */
diff --git a/xlators/cluster/afr/src/afr-inode-write.h b/xlators/cluster/afr/src/afr-inode-write.h
index 358d25b52..7b1fc5528 100644
--- a/xlators/cluster/afr/src/afr-inode-write.h
+++ b/xlators/cluster/afr/src/afr-inode-write.h
@@ -1,20 +1,11 @@
/*
- Copyright (c) 2007-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
#ifndef __INODE_WRITE_H__
@@ -22,43 +13,70 @@
int32_t
afr_chmod (call_frame_t *frame, xlator_t *this,
- loc_t *loc, mode_t mode);
+ loc_t *loc, mode_t mode, dict_t *xdata);
int32_t
afr_chown (call_frame_t *frame, xlator_t *this,
- loc_t *loc, uid_t uid, gid_t gid);
+ loc_t *loc, uid_t uid, gid_t gid, dict_t *xdata);
int
afr_fchown (call_frame_t *frame, xlator_t *this,
- fd_t *fd, uid_t uid, gid_t gid);
+ fd_t *fd, uid_t uid, gid_t gid, dict_t *xdata);
int32_t
afr_fchmod (call_frame_t *frame, xlator_t *this,
- fd_t *fd, mode_t mode);
+ fd_t *fd, mode_t mode, dict_t *xdata);
int32_t
-afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
+afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
struct iovec *vector, int32_t count, off_t offset,
- struct iobref *iobref);
+ uint32_t flags, struct iobref *iobref, dict_t *xdata);
int32_t
afr_truncate (call_frame_t *frame, xlator_t *this,
- loc_t *loc, off_t offset);
+ loc_t *loc, off_t offset, dict_t *xdata);
int32_t
afr_ftruncate (call_frame_t *frame, xlator_t *this,
- fd_t *fd, off_t offset);
+ fd_t *fd, off_t offset, dict_t *xdata);
int32_t
afr_utimens (call_frame_t *frame, xlator_t *this,
- loc_t *loc, struct timespec tv[2]);
+ loc_t *loc, struct timespec tv[2], dict_t *xdata);
+
+int
+afr_setattr (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, struct iatt *buf, int32_t valid, dict_t *xdata);
+
+int
+afr_fsetattr (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, struct iatt *buf, int32_t valid, dict_t *xdata);
int32_t
afr_setxattr (call_frame_t *frame, xlator_t *this,
- loc_t *loc, dict_t *dict, int32_t flags);
+ loc_t *loc, dict_t *dict, int32_t flags, dict_t *xdata);
+
+int32_t
+afr_fsetxattr (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, dict_t *dict, int32_t flags, dict_t *xdata);
int32_t
afr_removexattr (call_frame_t *frame, xlator_t *this,
- loc_t *loc, const char *name);
+ loc_t *loc, const char *name, dict_t *xdata);
+int32_t
+afr_fremovexattr (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, const char *name, dict_t *xdata);
+
+int
+afr_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata);
+
+int
+afr_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
+ off_t offset, size_t len, dict_t *xdata);
+
+int
+afr_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ off_t len, dict_t *xdata);
#endif /* __INODE_WRITE_H__ */
diff --git a/xlators/cluster/afr/src/afr-lk-common.c b/xlators/cluster/afr/src/afr-lk-common.c
new file mode 100644
index 000000000..a2a758f35
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-lk-common.c
@@ -0,0 +1,1667 @@
+/*
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include "dict.h"
+#include "byte-order.h"
+#include "common-utils.h"
+
+#include "afr.h"
+#include "afr-transaction.h"
+
+#include <signal.h>
+
+
+#define LOCKED_NO 0x0 /* no lock held */
+#define LOCKED_YES 0x1 /* for DATA, METADATA, ENTRY and higher_path */
+#define LOCKED_LOWER 0x2 /* for lower path */
+
+#define AFR_TRACE_INODELK_IN(frame, this, params ...) \
+ do { \
+ afr_private_t *_priv = this->private; \
+ if (!_priv->inodelk_trace) \
+ break; \
+ afr_trace_inodelk_in (frame, this, params); \
+ } while (0);
+
+#define AFR_TRACE_INODELK_OUT(frame, this, params ...) \
+ do { \
+ afr_private_t *_priv = this->private; \
+ if (!_priv->inodelk_trace) \
+ break; \
+ afr_trace_inodelk_out (frame, this, params); \
+ } while (0);
+
+#define AFR_TRACE_ENTRYLK_IN(frame, this, params ...) \
+ do { \
+ afr_private_t *_priv = this->private; \
+ if (!_priv->entrylk_trace) \
+ break; \
+ afr_trace_entrylk_in (frame, this, params); \
+ } while (0);
+
+#define AFR_TRACE_ENTRYLK_OUT(frame, this, params ...) \
+ do { \
+ afr_private_t *_priv = this->private; \
+ if (!_priv->entrylk_trace) \
+ break; \
+ afr_trace_entrylk_out (frame, this, params); \
+ } while (0);
+
+int
+afr_entry_lockee_cmp (const void *l1, const void *l2)
+{
+ const afr_entry_lockee_t *r1 = l1;
+ const afr_entry_lockee_t *r2 = l2;
+ int ret = 0;
+ uuid_t gfid1 = {0};
+ uuid_t gfid2 = {0};
+
+ loc_gfid ((loc_t*)&r1->loc, gfid1);
+ loc_gfid ((loc_t*)&r2->loc, gfid2);
+ ret = uuid_compare (gfid1, gfid2);
+ /*Entrylks with NULL basename are the 'smallest'*/
+ if (ret == 0) {
+ if (!r1->basename)
+ return -1;
+ if (!r2->basename)
+ return 1;
+ ret = strcmp (r1->basename, r2->basename);
+ }
+
+ if (ret <= 0)
+ return -1;
+ else
+ return 1;
+}
+
+int afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index);
+
+static int
+afr_copy_locked_nodes (call_frame_t *frame, xlator_t *this);
+
+static uint64_t afr_lock_number = 1;
+
+static uint64_t
+get_afr_lock_number ()
+{
+ return (++afr_lock_number);
+}
+
+int
+afr_set_lock_number (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_internal_lock_t *int_lock = NULL;
+
+ local = frame->local;
+ int_lock = &local->internal_lock;
+
+ int_lock->lock_number = get_afr_lock_number ();
+
+ return 0;
+}
+
+void
+afr_set_lk_owner (call_frame_t *frame, xlator_t *this, void *lk_owner)
+{
+ gf_log (this->name, GF_LOG_TRACE,
+ "Setting lk-owner=%llu",
+ (unsigned long long) (unsigned long)lk_owner);
+
+ set_lk_owner_from_ptr (&frame->root->lk_owner, lk_owner);
+}
+
+static int
+is_afr_lock_selfheal (afr_local_t *local)
+{
+ afr_internal_lock_t *int_lock = NULL;
+ int ret = -1;
+
+ int_lock = &local->internal_lock;
+
+ switch (int_lock->selfheal_lk_type) {
+ case AFR_DATA_SELF_HEAL_LK:
+ case AFR_METADATA_SELF_HEAL_LK:
+ ret = 1;
+ break;
+ case AFR_ENTRY_SELF_HEAL_LK:
+ ret = 0;
+ break;
+ }
+
+ return ret;
+
+}
+
+int32_t
+internal_lock_count (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int32_t call_count = 0;
+ int i = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i])
+ ++call_count;
+ }
+
+ return call_count;
+}
+
+static void
+afr_print_inodelk (char *str, int size, int cmd,
+ struct gf_flock *flock, gf_lkowner_t *owner)
+{
+ char *cmd_str = NULL;
+ char *type_str = NULL;
+
+ switch (cmd) {
+#if F_GETLK != F_GETLK64
+ case F_GETLK64:
+#endif
+ case F_GETLK:
+ cmd_str = "GETLK";
+ break;
+
+#if F_SETLK != F_SETLK64
+ case F_SETLK64:
+#endif
+ case F_SETLK:
+ cmd_str = "SETLK";
+ break;
+
+#if F_SETLKW != F_SETLKW64
+ case F_SETLKW64:
+#endif
+ case F_SETLKW:
+ cmd_str = "SETLKW";
+ break;
+
+ default:
+ cmd_str = "<null>";
+ break;
+ }
+
+ switch (flock->l_type) {
+ case F_RDLCK:
+ type_str = "READ";
+ break;
+ case F_WRLCK:
+ type_str = "WRITE";
+ break;
+ case F_UNLCK:
+ type_str = "UNLOCK";
+ break;
+ default:
+ type_str = "UNKNOWN";
+ break;
+ }
+
+ snprintf (str, size, "lock=INODELK, cmd=%s, type=%s, "
+ "start=%llu, len=%llu, pid=%llu, lk-owner=%s",
+ cmd_str, type_str, (unsigned long long) flock->l_start,
+ (unsigned long long) flock->l_len,
+ (unsigned long long) flock->l_pid,
+ lkowner_utoa (owner));
+
+}
+
+static void
+afr_print_lockee (char *str, int size, loc_t *loc, fd_t *fd,
+ int child_index)
+{
+ snprintf (str, size, "path=%s, fd=%p, child=%d",
+ loc->path ? loc->path : "<nul>",
+ fd ? fd : NULL,
+ child_index);
+}
+
+void
+afr_print_entrylk (char *str, int size, const char *basename,
+ gf_lkowner_t *owner)
+{
+ snprintf (str, size, "Basename=%s, lk-owner=%s",
+ basename ? basename : "<nul>",
+ lkowner_utoa (owner));
+}
+
+static void
+afr_print_verdict (int op_ret, int op_errno, char *str)
+{
+ if (op_ret < 0) {
+ if (op_errno == EAGAIN)
+ strcpy (str, "EAGAIN");
+ else
+ strcpy (str, "FAILED");
+ }
+ else
+ strcpy (str, "GRANTED");
+}
+
+static void
+afr_set_lock_call_type (afr_lock_call_type_t lock_call_type,
+ char *lock_call_type_str,
+ afr_internal_lock_t *int_lock)
+{
+ switch (lock_call_type) {
+ case AFR_INODELK_TRANSACTION:
+ if (int_lock->transaction_lk_type == AFR_TRANSACTION_LK)
+ strcpy (lock_call_type_str, "AFR_INODELK_TRANSACTION");
+ else
+ strcpy (lock_call_type_str, "AFR_INODELK_SELFHEAL");
+ break;
+ case AFR_INODELK_NB_TRANSACTION:
+ if (int_lock->transaction_lk_type == AFR_TRANSACTION_LK)
+ strcpy (lock_call_type_str, "AFR_INODELK_NB_TRANSACTION");
+ else
+ strcpy (lock_call_type_str, "AFR_INODELK_NB_SELFHEAL");
+ break;
+ case AFR_ENTRYLK_TRANSACTION:
+ if (int_lock->transaction_lk_type == AFR_TRANSACTION_LK)
+ strcpy (lock_call_type_str, "AFR_ENTRYLK_TRANSACTION");
+ else
+ strcpy (lock_call_type_str, "AFR_ENTRYLK_SELFHEAL");
+ break;
+ case AFR_ENTRYLK_NB_TRANSACTION:
+ if (int_lock->transaction_lk_type == AFR_TRANSACTION_LK)
+ strcpy (lock_call_type_str, "AFR_ENTRYLK_NB_TRANSACTION");
+ else
+ strcpy (lock_call_type_str, "AFR_ENTRYLK_NB_SELFHEAL");
+ break;
+ default:
+ strcpy (lock_call_type_str, "UNKNOWN");
+ break;
+ }
+
+}
+
+static void
+afr_trace_inodelk_out (call_frame_t *frame, xlator_t *this,
+ afr_lock_call_type_t lock_call_type,
+ afr_lock_op_type_t lk_op_type, struct gf_flock *flock,
+ int op_ret, int op_errno, int32_t child_index)
+{
+ afr_internal_lock_t *int_lock = NULL;
+ afr_local_t *local = NULL;
+
+ char lockee[256];
+ char lock_call_type_str[256];
+ char verdict[16];
+
+ local = frame->local;
+ int_lock = &local->internal_lock;
+
+ afr_print_lockee (lockee, 256, &local->loc, local->fd, child_index);
+
+ afr_set_lock_call_type (lock_call_type, lock_call_type_str, int_lock);
+
+ afr_print_verdict (op_ret, op_errno, verdict);
+
+ gf_log (this->name, GF_LOG_INFO,
+ "[%s %s] [%s] lk-owner=%s Lockee={%s} Number={%llu}",
+ lock_call_type_str,
+ lk_op_type == AFR_LOCK_OP ? "LOCK REPLY" : "UNLOCK REPLY",
+ verdict, lkowner_utoa (&frame->root->lk_owner), lockee,
+ (unsigned long long) int_lock->lock_number);
+
+}
+
+static void
+afr_trace_inodelk_in (call_frame_t *frame, xlator_t *this,
+ afr_lock_call_type_t lock_call_type,
+ afr_lock_op_type_t lk_op_type, struct gf_flock *flock,
+ int32_t cmd, int32_t child_index)
+{
+ afr_local_t *local = NULL;
+ afr_internal_lock_t *int_lock = NULL;
+
+ char lock[256];
+ char lockee[256];
+ char lock_call_type_str[256];
+
+ local = frame->local;
+ int_lock = &local->internal_lock;
+
+ afr_print_inodelk (lock, 256, cmd, flock, &frame->root->lk_owner);
+ afr_print_lockee (lockee, 256, &local->loc, local->fd, child_index);
+
+ afr_set_lock_call_type (lock_call_type, lock_call_type_str, int_lock);
+
+ gf_log (this->name, GF_LOG_INFO,
+ "[%s %s] Lock={%s} Lockee={%s} Number={%llu}",
+ lock_call_type_str,
+ lk_op_type == AFR_LOCK_OP ? "LOCK REQUEST" : "UNLOCK REQUEST",
+ lock, lockee,
+ (unsigned long long) int_lock->lock_number);
+
+}
+
+static void
+afr_trace_entrylk_in (call_frame_t *frame, xlator_t *this,
+ afr_lock_call_type_t lock_call_type,
+ afr_lock_op_type_t lk_op_type, const char *basename,
+ int32_t cookie)
+{
+ afr_local_t *local = NULL;
+ afr_internal_lock_t *int_lock = NULL;
+ afr_private_t *priv = NULL;
+ int child_index = 0;
+ int lockee_no = 0;
+
+ char lock[256];
+ char lockee[256];
+ char lock_call_type_str[256];
+
+ local = frame->local;
+ int_lock = &local->internal_lock;
+ priv = this->private;
+
+ if (!priv->entrylk_trace) {
+ return;
+ }
+ lockee_no = cookie / priv->child_count;
+ child_index = cookie % priv->child_count;
+
+ afr_print_entrylk (lock, 256, basename, &frame->root->lk_owner);
+ afr_print_lockee (lockee, 256, &int_lock->lockee[lockee_no].loc, local->fd,
+ child_index);
+
+ afr_set_lock_call_type (lock_call_type, lock_call_type_str, int_lock);
+
+ gf_log (this->name, GF_LOG_INFO,
+ "[%s %s] Lock={%s} Lockee={%s} Number={%llu}, Cookie={%d}",
+ lock_call_type_str,
+ lk_op_type == AFR_LOCK_OP ? "LOCK REQUEST" : "UNLOCK REQUEST",
+ lock, lockee,
+ (unsigned long long) int_lock->lock_number,
+ cookie);
+}
+
+static void
+afr_trace_entrylk_out (call_frame_t *frame, xlator_t *this,
+ afr_lock_call_type_t lock_call_type,
+ afr_lock_op_type_t lk_op_type, const char *basename,
+ int op_ret, int op_errno, int32_t cookie)
+{
+ afr_internal_lock_t *int_lock = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int lockee_no = 0;
+ int child_index = 0;
+
+ char lock[256];
+ char lockee[256];
+ char lock_call_type_str[256];
+ char verdict[16];
+
+ local = frame->local;
+ int_lock = &local->internal_lock;
+ priv = this->private;
+
+ if (!priv->entrylk_trace) {
+ return;
+ }
+ lockee_no = cookie / priv->child_count;
+ child_index = cookie % priv->child_count;
+
+ afr_print_entrylk (lock, 256, basename, &frame->root->lk_owner);
+ afr_print_lockee (lockee, 256, &int_lock->lockee[lockee_no].loc, local->fd,
+ child_index);
+
+ afr_set_lock_call_type (lock_call_type, lock_call_type_str, int_lock);
+
+ afr_print_verdict (op_ret, op_errno, verdict);
+
+ gf_log (this->name, GF_LOG_INFO,
+ "[%s %s] [%s] Lock={%s} Lockee={%s} Number={%llu} Cookie={%d}",
+ lock_call_type_str,
+ lk_op_type == AFR_LOCK_OP ? "LOCK REPLY" : "UNLOCK REPLY",
+ verdict,
+ lock, lockee,
+ (unsigned long long) int_lock->lock_number,
+ cookie);
+
+}
+
+static int
+transaction_lk_op (afr_local_t *local)
+{
+ afr_internal_lock_t *int_lock = NULL;
+ int ret = -1;
+
+ int_lock = &local->internal_lock;
+
+ if (int_lock->transaction_lk_type == AFR_TRANSACTION_LK) {
+ gf_log (THIS->name, GF_LOG_DEBUG,
+ "lk op is for a transaction");
+ ret = 1;
+ }
+ else if (int_lock->transaction_lk_type == AFR_SELFHEAL_LK) {
+ gf_log (THIS->name, GF_LOG_DEBUG,
+ "lk op is for a self heal");
+
+ ret = 0;
+ }
+
+ if (ret == -1)
+ gf_log (THIS->name, GF_LOG_DEBUG,
+ "lk op is not set");
+
+ return ret;
+
+}
+
+static int
+is_afr_lock_transaction (afr_local_t *local)
+{
+ int ret = 0;
+
+ switch (local->transaction.type) {
+ case AFR_DATA_TRANSACTION:
+ case AFR_METADATA_TRANSACTION:
+ ret = 1;
+ break;
+
+ case AFR_ENTRY_RENAME_TRANSACTION:
+ case AFR_ENTRY_TRANSACTION:
+ ret = 0;
+ break;
+
+ }
+
+ return ret;
+}
+
+int
+afr_init_entry_lockee (afr_entry_lockee_t *lockee, afr_local_t *local,
+ loc_t *loc, char *basename, int child_count)
+{
+ int ret = -1;
+
+ loc_copy (&lockee->loc, loc);
+ lockee->basename = (basename)? gf_strdup (basename): NULL;
+ if (basename && !lockee->basename)
+ goto out;
+
+ lockee->locked_count = 0;
+ lockee->locked_nodes = GF_CALLOC (child_count,
+ sizeof (*lockee->locked_nodes),
+ gf_afr_mt_afr_node_character);
+
+ if (!lockee->locked_nodes)
+ goto out;
+
+ ret = 0;
+out:
+ return ret;
+
+}
+
+void
+afr_entry_lockee_cleanup (afr_internal_lock_t *int_lock)
+{
+ int i = 0;
+
+ for (i = 0; i < int_lock->lockee_count; i++) {
+ loc_wipe (&int_lock->lockee[i].loc);
+ if (int_lock->lockee[i].basename)
+ GF_FREE (int_lock->lockee[i].basename);
+ if (int_lock->lockee[i].locked_nodes)
+ GF_FREE (int_lock->lockee[i].locked_nodes);
+ }
+
+ return;
+}
+
+static int
+initialize_entrylk_variables (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_internal_lock_t *int_lock = NULL;
+ afr_private_t *priv = NULL;
+
+ int i = 0;
+
+ priv = this->private;
+ local = frame->local;
+ int_lock = &local->internal_lock;
+
+ int_lock->entrylk_lock_count = 0;
+ int_lock->lock_op_ret = -1;
+ int_lock->lock_op_errno = 0;
+
+ for (i = 0; i < AFR_LOCKEE_COUNT_MAX; i++) {
+ if (!int_lock->lockee[i].locked_nodes)
+ break;
+ int_lock->lockee[i].locked_count = 0;
+ memset (int_lock->lockee[i].locked_nodes, 0,
+ sizeof (*int_lock->lockee[i].locked_nodes) *
+ priv->child_count);
+ }
+
+ return 0;
+}
+
+static int
+initialize_inodelk_variables (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_internal_lock_t *int_lock = NULL;
+ afr_private_t *priv = NULL;
+ afr_inodelk_t *inodelk = NULL;
+
+ priv = this->private;
+ local = frame->local;
+ int_lock = &local->internal_lock;
+
+ inodelk = afr_get_inodelk (int_lock, int_lock->domain);
+
+ inodelk->lock_count = 0;
+ int_lock->lk_attempted_count = 0;
+ int_lock->lock_op_ret = -1;
+ int_lock->lock_op_errno = 0;
+
+ memset (inodelk->locked_nodes, 0,
+ sizeof (*inodelk->locked_nodes) * priv->child_count);
+ memset (int_lock->locked_nodes, 0,
+ sizeof (*int_lock->locked_nodes) * priv->child_count);
+
+ return 0;
+}
+
+int
+afr_lockee_locked_nodes_count (afr_internal_lock_t *int_lock)
+{
+ int call_count = 0;
+ int i = 0;
+
+ for (i = 0; i < int_lock->lockee_count; i++)
+ call_count += int_lock->lockee[i].locked_count;
+
+ return call_count;
+}
+
+int
+afr_locked_nodes_count (unsigned char *locked_nodes, int child_count)
+
+{
+ int i = 0;
+ int call_count = 0;
+
+ for (i = 0; i < child_count; i++) {
+ if (locked_nodes[i] & LOCKED_YES)
+ call_count++;
+ }
+
+ return call_count;
+}
+
+/* FIXME: What if UNLOCK fails */
+static int32_t
+afr_unlock_common_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ afr_internal_lock_t *int_lock = NULL;
+ int call_count = 0;
+
+ local = frame->local;
+ int_lock = &local->internal_lock;
+
+ LOCK (&frame->lock);
+ {
+ call_count = --int_lock->lk_call_count;
+ }
+ UNLOCK (&frame->lock);
+
+ if (call_count == 0) {
+ gf_log (this->name, GF_LOG_TRACE,
+ "All internal locks unlocked");
+ int_lock->lock_cbk (frame, this);
+ }
+
+ return 0;
+}
+
+static int32_t
+afr_unlock_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ afr_internal_lock_t *int_lock = NULL;
+ afr_inodelk_t *inodelk = NULL;
+ int32_t child_index = (long)cookie;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ int_lock = &local->internal_lock;
+
+ AFR_TRACE_INODELK_OUT (frame, this, AFR_INODELK_TRANSACTION,
+ AFR_UNLOCK_OP, NULL, op_ret,
+ op_errno, child_index);
+
+ priv = this->private;
+
+ if (op_ret < 0 && op_errno != ENOTCONN && op_errno != EBADFD) {
+ gf_log (this->name, GF_LOG_INFO, "%s: unlock failed on subvolume %s "
+ "with lock owner %s", local->loc.path,
+ priv->children[child_index]->name,
+ lkowner_utoa (&frame->root->lk_owner));
+ }
+
+
+ inodelk = afr_get_inodelk (int_lock, int_lock->domain);
+ inodelk->locked_nodes[child_index] &= LOCKED_NO;
+ if (local->transaction.eager_lock)
+ local->transaction.eager_lock[child_index] = 0;
+
+ afr_unlock_common_cbk (frame, cookie, this, op_ret, op_errno, xdata);
+
+ return 0;
+
+}
+
+static int
+afr_unlock_inodelk (call_frame_t *frame, xlator_t *this)
+{
+ afr_internal_lock_t *int_lock = NULL;
+ afr_inodelk_t *inodelk = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ struct gf_flock flock = {0,};
+ struct gf_flock full_flock = {0,};
+ struct gf_flock *flock_use = NULL;
+ int call_count = 0;
+ int i = 0;
+ int piggyback = 0;
+ afr_fd_ctx_t *fd_ctx = NULL;
+
+
+ local = frame->local;
+ int_lock = &local->internal_lock;
+ priv = this->private;
+
+ inodelk = afr_get_inodelk (int_lock, int_lock->domain);
+
+ flock.l_start = inodelk->flock.l_start;
+ flock.l_len = inodelk->flock.l_len;
+ flock.l_type = F_UNLCK;
+
+ full_flock.l_type = F_UNLCK;
+ call_count = afr_locked_nodes_count (inodelk->locked_nodes,
+ priv->child_count);
+
+ int_lock->lk_call_count = call_count;
+
+ if (!call_count) {
+ gf_log (this->name, GF_LOG_TRACE,
+ "No internal locks unlocked");
+ int_lock->lock_cbk (frame, this);
+ goto out;
+ }
+
+ if (local->fd)
+ fd_ctx = afr_fd_ctx_get (local->fd, this);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if ((inodelk->locked_nodes[i] & LOCKED_YES) != LOCKED_YES)
+ continue;
+
+ if (local->fd) {
+ flock_use = &flock;
+ if (!local->transaction.eager_lock[i]) {
+ goto wind;
+ }
+
+ piggyback = 0;
+
+ LOCK (&local->fd->lock);
+ {
+ if (fd_ctx->lock_piggyback[i]) {
+ fd_ctx->lock_piggyback[i]--;
+ piggyback = 1;
+ } else {
+ fd_ctx->lock_acquired[i]--;
+ }
+ }
+ UNLOCK (&local->fd->lock);
+
+ if (piggyback) {
+ afr_unlock_inodelk_cbk (frame, (void *) (long) i,
+ this, 1, 0, NULL);
+ if (!--call_count)
+ break;
+ continue;
+ }
+
+ flock_use = &full_flock;
+ wind:
+ AFR_TRACE_INODELK_IN (frame, this,
+ AFR_INODELK_TRANSACTION,
+ AFR_UNLOCK_OP, flock_use, F_SETLK,
+ i);
+
+ STACK_WIND_COOKIE (frame, afr_unlock_inodelk_cbk,
+ (void *) (long)i,
+ priv->children[i],
+ priv->children[i]->fops->finodelk,
+ int_lock->domain, local->fd,
+ F_SETLK, flock_use, NULL);
+
+ if (!--call_count)
+ break;
+
+ } else {
+ AFR_TRACE_INODELK_IN (frame, this,
+ AFR_INODELK_TRANSACTION,
+ AFR_UNLOCK_OP, &flock, F_SETLK, i);
+
+ STACK_WIND_COOKIE (frame, afr_unlock_inodelk_cbk,
+ (void *) (long)i,
+ priv->children[i],
+ priv->children[i]->fops->inodelk,
+ int_lock->domain, &local->loc,
+ F_SETLK, &flock, NULL);
+
+ if (!--call_count)
+ break;
+ }
+ }
+out:
+ return 0;
+}
+
+static int32_t
+afr_unlock_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ afr_internal_lock_t *int_lock = NULL;
+ int32_t child_index = 0;
+ int lockee_no = 0;
+
+ priv = this->private;
+ lockee_no = (int)((long) cookie) / priv->child_count;
+ child_index = (int) ((long) cookie) % priv->child_count;
+
+ local = frame->local;
+ int_lock = &local->internal_lock;
+
+ AFR_TRACE_ENTRYLK_OUT (frame, this, AFR_ENTRYLK_TRANSACTION,
+ AFR_UNLOCK_OP,
+ int_lock->lockee[lockee_no].basename, op_ret,
+ op_errno, (int) ((long)cookie));
+
+ if (op_ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "%s: unlock failed on %d, reason: %s",
+ local->loc.path, child_index, strerror (op_errno));
+ }
+
+ int_lock->lockee[lockee_no].locked_nodes[child_index] &= LOCKED_NO;
+ afr_unlock_common_cbk (frame, cookie, this, op_ret, op_errno, NULL);
+
+ return 0;
+}
+
+static int
+afr_unlock_entrylk (call_frame_t *frame, xlator_t *this)
+{
+ afr_internal_lock_t *int_lock = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int call_count = 0;
+ int index = 0;
+ int lockee_no = 0;
+ int copies = 0;
+ int i = -1;
+
+ local = frame->local;
+ int_lock = &local->internal_lock;
+ priv = this->private;
+ copies = priv->child_count;
+
+ call_count = afr_lockee_locked_nodes_count (int_lock);
+
+ int_lock->lk_call_count = call_count;
+
+ if (!call_count){
+ gf_log (this->name, GF_LOG_TRACE,
+ "No internal locks unlocked");
+ int_lock->lock_cbk (frame, this);
+ goto out;
+ }
+
+ for (i = 0; i < int_lock->lockee_count * priv->child_count; i++) {
+ lockee_no = i / copies;
+ index = i % copies;
+ if (int_lock->lockee[lockee_no].locked_nodes[index] & LOCKED_YES) {
+ AFR_TRACE_ENTRYLK_IN (frame, this, AFR_ENTRYLK_NB_TRANSACTION,
+ AFR_UNLOCK_OP,
+ int_lock->lockee[lockee_no].basename,
+ i);
+
+ STACK_WIND_COOKIE (frame, afr_unlock_entrylk_cbk,
+ (void *) (long) i,
+ priv->children[index],
+ priv->children[index]->fops->entrylk,
+ int_lock->domain,
+ &int_lock->lockee[lockee_no].loc,
+ int_lock->lockee[lockee_no].basename,
+ ENTRYLK_UNLOCK, ENTRYLK_WRLCK, NULL);
+
+ if (!--call_count)
+ break;
+ }
+ }
+
+out:
+ return 0;
+
+}
+
+static int32_t
+afr_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ afr_internal_lock_t *int_lock = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int cky = (long) cookie;
+ int child_index = 0;
+ int lockee_no = 0;
+
+ priv = this->private;
+ local = frame->local;
+ int_lock = &local->internal_lock;
+
+ child_index = ((int)cky) % priv->child_count;
+ lockee_no = ((int)cky) / priv->child_count;
+
+ LOCK (&frame->lock);
+ {
+ if (op_ret == -1) {
+ if (op_errno == ENOSYS) {
+ /* return ENOTSUP */
+ gf_log (this->name, GF_LOG_ERROR,
+ "subvolume does not support locking. "
+ "please load features/locks xlator on server");
+ local->op_ret = op_ret;
+ int_lock->lock_op_ret = op_ret;
+ }
+
+ local->op_errno = op_errno;
+ int_lock->lock_op_errno = op_errno;
+ }
+
+ int_lock->lk_attempted_count++;
+ }
+ UNLOCK (&frame->lock);
+
+ if ((op_ret == -1) &&
+ (op_errno == ENOSYS)) {
+ afr_unlock (frame, this);
+ } else {
+ if (op_ret == 0) {
+ if (local->transaction.type == AFR_ENTRY_TRANSACTION ||
+ local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) {
+ int_lock->lockee[lockee_no].locked_nodes[child_index] |= LOCKED_YES;
+ int_lock->lockee[lockee_no].locked_count++;
+ int_lock->entrylk_lock_count++;
+ } else {
+ int_lock->locked_nodes[child_index] |= LOCKED_YES;
+ int_lock->lock_count++;
+ }
+ }
+ afr_lock_blocking (frame, this, cky + 1);
+ }
+
+ return 0;
+}
+
+static int32_t
+afr_blocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ AFR_TRACE_INODELK_OUT (frame, this, AFR_INODELK_TRANSACTION,
+ AFR_LOCK_OP, NULL, op_ret,
+ op_errno, (long) cookie);
+
+ afr_lock_cbk (frame, cookie, this, op_ret, op_errno, xdata);
+ return 0;
+
+}
+
+static int32_t
+afr_blocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ AFR_TRACE_ENTRYLK_OUT (frame, this, AFR_ENTRYLK_TRANSACTION,
+ AFR_LOCK_OP, NULL, op_ret,
+ op_errno, (long)cookie);
+
+ afr_lock_cbk (frame, cookie, this, op_ret, op_errno, xdata);
+ return 0;
+}
+
+static int
+afr_copy_locked_nodes (call_frame_t *frame, xlator_t *this)
+{
+ afr_internal_lock_t *int_lock = NULL;
+ afr_inodelk_t *inodelk = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+ local = frame->local;
+ int_lock = &local->internal_lock;
+
+ switch (local->transaction.type) {
+ case AFR_DATA_TRANSACTION:
+ case AFR_METADATA_TRANSACTION:
+ inodelk = afr_get_inodelk (int_lock, int_lock->domain);
+ memcpy (inodelk->locked_nodes, int_lock->locked_nodes,
+ sizeof (*inodelk->locked_nodes) * priv->child_count);
+ inodelk->lock_count = int_lock->lock_count;
+ break;
+
+ case AFR_ENTRY_RENAME_TRANSACTION:
+ case AFR_ENTRY_TRANSACTION:
+ /*entrylk_count is being used in both non-blocking and blocking
+ * modes */
+ break;
+ }
+
+ return 0;
+
+}
+
+static inline gf_boolean_t
+afr_is_entrylk (afr_internal_lock_t *int_lock,
+ afr_transaction_type trans_type)
+{
+ gf_boolean_t is_entrylk = _gf_false;
+
+ if ((int_lock->transaction_lk_type == AFR_SELFHEAL_LK) &&
+ int_lock->selfheal_lk_type == AFR_ENTRY_SELF_HEAL_LK) {
+
+ is_entrylk = _gf_true;
+
+ } else if ((int_lock->transaction_lk_type == AFR_TRANSACTION_LK) &&
+ (trans_type == AFR_ENTRY_TRANSACTION ||
+ trans_type == AFR_ENTRY_RENAME_TRANSACTION)) {
+
+ is_entrylk = _gf_true;
+
+ } else {
+ is_entrylk = _gf_false;
+ }
+
+ return is_entrylk;
+}
+
+static gf_boolean_t
+_is_lock_wind_needed (afr_local_t *local, int child_index)
+{
+ if (!local->child_up[child_index])
+ return _gf_false;
+
+ return _gf_true;
+}
+
+int
+afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie)
+{
+ afr_internal_lock_t *int_lock = NULL;
+ afr_inodelk_t *inodelk = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ struct gf_flock flock = {0,};
+ uint64_t ctx = 0;
+ int ret = 0;
+ int child_index = 0;
+ int lockee_no = 0;
+ gf_boolean_t is_entrylk = _gf_false;
+
+ local = frame->local;
+ int_lock = &local->internal_lock;
+ priv = this->private;
+ child_index = cookie % priv->child_count;
+ lockee_no = cookie / priv->child_count;
+ is_entrylk = afr_is_entrylk (int_lock, local->transaction.type);
+
+
+ if (!is_entrylk) {
+ inodelk = afr_get_inodelk (int_lock, int_lock->domain);
+ flock.l_start = inodelk->flock.l_start;
+ flock.l_len = inodelk->flock.l_len;
+ flock.l_type = inodelk->flock.l_type;
+ }
+
+ if (local->fd) {
+ ret = fd_ctx_get (local->fd, this, &ctx);
+
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_INFO,
+ "unable to get fd ctx for fd=%p",
+ local->fd);
+
+ local->op_ret = -1;
+ int_lock->lock_op_ret = -1;
+
+ afr_copy_locked_nodes (frame, this);
+
+ afr_unlock (frame, this);
+
+ return 0;
+ }
+ }
+
+ if (int_lock->lk_expected_count == int_lock->lk_attempted_count) {
+ if ((is_entrylk && int_lock->entrylk_lock_count == 0) ||
+ (!is_entrylk && int_lock->lock_count == 0)) {
+ gf_log (this->name, GF_LOG_INFO,
+ "unable to lock on even one child");
+
+ local->op_ret = -1;
+ int_lock->lock_op_ret = -1;
+
+ afr_copy_locked_nodes (frame, this);
+
+ afr_unlock(frame, this);
+
+ return 0;
+ }
+ }
+
+ if (int_lock->lk_expected_count == int_lock->lk_attempted_count) {
+ /* we're done locking */
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "we're done locking");
+
+ afr_copy_locked_nodes (frame, this);
+
+ int_lock->lock_op_ret = 0;
+ int_lock->lock_cbk (frame, this);
+ return 0;
+ }
+
+ if (!_is_lock_wind_needed (local, child_index)) {
+ afr_lock_blocking (frame, this, cookie + 1);
+ return 0;
+ }
+
+ switch (local->transaction.type) {
+ case AFR_DATA_TRANSACTION:
+ case AFR_METADATA_TRANSACTION:
+
+ if (local->fd) {
+ AFR_TRACE_INODELK_IN (frame, this,
+ AFR_INODELK_TRANSACTION,
+ AFR_LOCK_OP, &flock, F_SETLKW,
+ child_index);
+
+ STACK_WIND_COOKIE (frame, afr_blocking_inodelk_cbk,
+ (void *) (long) child_index,
+ priv->children[child_index],
+ priv->children[child_index]->fops->finodelk,
+ int_lock->domain, local->fd,
+ F_SETLKW, &flock, NULL);
+
+ } else {
+ AFR_TRACE_INODELK_IN (frame, this,
+ AFR_INODELK_TRANSACTION,
+ AFR_LOCK_OP, &flock, F_SETLKW,
+ child_index);
+
+ STACK_WIND_COOKIE (frame, afr_blocking_inodelk_cbk,
+ (void *) (long) child_index,
+ priv->children[child_index],
+ priv->children[child_index]->fops->inodelk,
+ int_lock->domain, &local->loc,
+ F_SETLKW, &flock, NULL);
+ }
+
+ break;
+
+ case AFR_ENTRY_RENAME_TRANSACTION:
+ case AFR_ENTRY_TRANSACTION:
+ /*Accounting for child_index increments on 'down'
+ *and 'fd-less' children */
+
+ if (local->fd) {
+ AFR_TRACE_ENTRYLK_IN (frame, this, AFR_ENTRYLK_TRANSACTION,
+ AFR_LOCK_OP,
+ int_lock->lockee[lockee_no].basename,
+ cookie);
+
+ STACK_WIND_COOKIE (frame, afr_blocking_entrylk_cbk,
+ (void *) (long) cookie,
+ priv->children[child_index],
+ priv->children[child_index]->fops->fentrylk,
+ int_lock->domain, local->fd,
+ int_lock->lockee[lockee_no].basename,
+ ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL);
+ } else {
+ AFR_TRACE_ENTRYLK_IN (frame, this,
+ AFR_ENTRYLK_TRANSACTION,
+ AFR_LOCK_OP, local->transaction.basename,
+ child_index);
+
+ STACK_WIND_COOKIE (frame, afr_blocking_entrylk_cbk,
+ (void *) (long) cookie,
+ priv->children[child_index],
+ priv->children[child_index]->fops->entrylk,
+ int_lock->domain,
+ &int_lock->lockee[lockee_no].loc,
+ int_lock->lockee[lockee_no].basename,
+ ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL);
+ }
+
+ break;
+ }
+
+ return 0;
+}
+
+int32_t
+afr_blocking_lock (call_frame_t *frame, xlator_t *this)
+{
+ afr_internal_lock_t *int_lock = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int up_count = 0;
+
+ priv = this->private;
+ local = frame->local;
+ int_lock = &local->internal_lock;
+
+ switch (local->transaction.type) {
+ case AFR_DATA_TRANSACTION:
+ case AFR_METADATA_TRANSACTION:
+ initialize_inodelk_variables (frame, this);
+ break;
+
+ case AFR_ENTRY_RENAME_TRANSACTION:
+ case AFR_ENTRY_TRANSACTION:
+ up_count = AFR_COUNT (local->child_up, priv->child_count);
+ int_lock->lk_call_count = int_lock->lk_expected_count
+ = (int_lock->lockee_count *
+ up_count);
+ initialize_entrylk_variables (frame, this);
+ break;
+ }
+
+ afr_lock_blocking (frame, this, 0);
+
+ return 0;
+}
+
+static int32_t
+afr_nonblocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ afr_internal_lock_t *int_lock = NULL;
+ afr_local_t *local = NULL;
+ int call_count = 0;
+ int child_index = (long) cookie;
+ int copies = 0;
+ int index = 0;
+ int lockee_no = 0;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+
+ copies = priv->child_count;
+ index = child_index % copies;
+ lockee_no = child_index / copies;
+
+ local = frame->local;
+ int_lock = &local->internal_lock;
+
+ AFR_TRACE_ENTRYLK_OUT (frame, this, AFR_ENTRYLK_TRANSACTION,
+ AFR_LOCK_OP,
+ int_lock->lockee[lockee_no].basename, op_ret,
+ op_errno, (long) cookie);
+
+ LOCK (&frame->lock);
+ {
+ if (op_ret < 0 ) {
+ if (op_errno == ENOSYS) {
+ /* return ENOTSUP */
+ gf_log (this->name, GF_LOG_ERROR,
+ "subvolume does not support locking. "
+ "please load features/locks xlator on server");
+ local->op_ret = op_ret;
+ int_lock->lock_op_ret = op_ret;
+
+ int_lock->lock_op_errno = op_errno;
+ local->op_errno = op_errno;
+ }
+ } else if (op_ret == 0) {
+ int_lock->lockee[lockee_no].locked_nodes[index] |= \
+ LOCKED_YES;
+ int_lock->lockee[lockee_no].locked_count++;
+ int_lock->entrylk_lock_count++;
+ }
+
+ call_count = --int_lock->lk_call_count;
+ }
+ UNLOCK (&frame->lock);
+
+ if (call_count == 0) {
+ gf_log (this->name, GF_LOG_TRACE,
+ "Last locking reply received");
+ /* all locks successful. Proceed to call FOP */
+ if (int_lock->entrylk_lock_count ==
+ int_lock->lk_expected_count) {
+ gf_log (this->name, GF_LOG_TRACE,
+ "All servers locked. Calling the cbk");
+ int_lock->lock_op_ret = 0;
+ int_lock->lock_cbk (frame, this);
+ }
+ /* Not all locks were successful. Unlock and try locking
+ again, this time with serially blocking locks */
+ else {
+ gf_log (this->name, GF_LOG_TRACE,
+ "%d servers locked. Trying again with blocking calls",
+ int_lock->lock_count);
+
+ afr_unlock(frame, this);
+ }
+ }
+
+ return 0;
+}
+
+int
+afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this)
+{
+ afr_internal_lock_t *int_lock = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ afr_fd_ctx_t *fd_ctx = NULL;
+ int copies = 0;
+ int index = 0;
+ int lockee_no = 0;
+ int32_t call_count = 0;
+ int i = 0;
+
+ local = frame->local;
+ int_lock = &local->internal_lock;
+ priv = this->private;
+
+ copies = priv->child_count;
+ initialize_entrylk_variables (frame, this);
+
+ if (local->fd) {
+ fd_ctx = afr_fd_ctx_get (local->fd, this);
+ if (!fd_ctx) {
+ gf_log (this->name, GF_LOG_INFO,
+ "unable to get fd ctx for fd=%p",
+ local->fd);
+
+ local->op_ret = -1;
+ int_lock->lock_op_ret = -1;
+ local->op_errno = EINVAL;
+ int_lock->lock_op_errno = EINVAL;
+
+ afr_unlock (frame, this);
+ return -1;
+ }
+
+ call_count = int_lock->lockee_count * internal_lock_count (frame, this);
+ int_lock->lk_call_count = call_count;
+ int_lock->lk_expected_count = call_count;
+
+ if (!call_count) {
+ gf_log (this->name, GF_LOG_INFO,
+ "fd not open on any subvolumes. aborting.");
+ afr_unlock (frame, this);
+ goto out;
+ }
+
+ /* Send non-blocking entrylk calls only on up children
+ and where the fd has been opened */
+ for (i = 0; i < int_lock->lockee_count*priv->child_count; i++) {
+ index = i%copies;
+ lockee_no = i/copies;
+ if (local->child_up[index]) {
+ AFR_TRACE_ENTRYLK_IN (frame, this, AFR_ENTRYLK_NB_TRANSACTION,
+ AFR_LOCK_OP,
+ int_lock->lockee[lockee_no].basename,
+ i);
+
+ STACK_WIND_COOKIE (frame, afr_nonblocking_entrylk_cbk,
+ (void *) (long) i,
+ priv->children[index],
+ priv->children[index]->fops->fentrylk,
+ this->name, local->fd,
+ int_lock->lockee[lockee_no].basename,
+ ENTRYLK_LOCK_NB, ENTRYLK_WRLCK,
+ NULL);
+ if (!--call_count)
+ break;
+ }
+ }
+ } else {
+ call_count = int_lock->lockee_count * internal_lock_count (frame, this);
+ int_lock->lk_call_count = call_count;
+ int_lock->lk_expected_count = call_count;
+
+ for (i = 0; i < int_lock->lockee_count*priv->child_count; i++) {
+ index = i%copies;
+ lockee_no = i/copies;
+ if (local->child_up[index]) {
+ AFR_TRACE_ENTRYLK_IN (frame, this, AFR_ENTRYLK_NB_TRANSACTION,
+ AFR_LOCK_OP,
+ int_lock->lockee[lockee_no].basename,
+ i);
+
+ STACK_WIND_COOKIE (frame, afr_nonblocking_entrylk_cbk,
+ (void *) (long) i,
+ priv->children[index],
+ priv->children[index]->fops->entrylk,
+ this->name, &int_lock->lockee[lockee_no].loc,
+ int_lock->lockee[lockee_no].basename,
+ ENTRYLK_LOCK_NB, ENTRYLK_WRLCK,
+ NULL);
+
+ if (!--call_count)
+ break;
+ }
+ }
+ }
+out:
+ return 0;
+}
+
+int32_t
+afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ afr_internal_lock_t *int_lock = NULL;
+ afr_inodelk_t *inodelk = NULL;
+ afr_local_t *local = NULL;
+ int call_count = 0;
+ int child_index = (long) cookie;
+ afr_fd_ctx_t *fd_ctx = NULL;
+
+
+ local = frame->local;
+ int_lock = &local->internal_lock;
+ inodelk = afr_get_inodelk (int_lock, int_lock->domain);
+
+ AFR_TRACE_INODELK_OUT (frame, this, AFR_INODELK_NB_TRANSACTION,
+ AFR_LOCK_OP, NULL, op_ret,
+ op_errno, (long) cookie);
+
+ if (local->fd)
+ fd_ctx = afr_fd_ctx_get (local->fd, this);
+
+ LOCK (&frame->lock);
+ {
+ if (op_ret < 0) {
+ if (op_errno == ENOSYS) {
+ /* return ENOTSUP */
+ gf_log (this->name, GF_LOG_ERROR,
+ "subvolume does not support locking. "
+ "please load features/locks xlator on "
+ "server");
+ local->op_ret = op_ret;
+ int_lock->lock_op_ret = op_ret;
+ int_lock->lock_op_errno = op_errno;
+ local->op_errno = op_errno;
+ }
+ if (local->transaction.eager_lock)
+ local->transaction.eager_lock[child_index] = 0;
+ } else {
+ inodelk->locked_nodes[child_index] |= LOCKED_YES;
+ inodelk->lock_count++;
+
+ if (local->transaction.eager_lock &&
+ local->transaction.eager_lock[child_index] &&
+ local->fd) {
+ /* piggybacked */
+ if (op_ret == 1) {
+ /* piggybacked */
+ } else if (op_ret == 0) {
+ /* lock acquired from server */
+ fd_ctx->lock_acquired[child_index]++;
+ }
+ }
+ }
+
+ call_count = --int_lock->lk_call_count;
+ }
+ UNLOCK (&frame->lock);
+
+ if (call_count == 0) {
+ gf_log (this->name, GF_LOG_TRACE,
+ "Last inode locking reply received");
+ /* all locks successful. Proceed to call FOP */
+ if (inodelk->lock_count == int_lock->lk_expected_count) {
+ gf_log (this->name, GF_LOG_TRACE,
+ "All servers locked. Calling the cbk");
+ int_lock->lock_op_ret = 0;
+ int_lock->lock_cbk (frame, this);
+ }
+ /* Not all locks were successful. Unlock and try locking
+ again, this time with serially blocking locks */
+ else {
+ gf_log (this->name, GF_LOG_TRACE,
+ "%d servers locked. Trying again with blocking calls",
+ int_lock->lock_count);
+
+ afr_unlock(frame, this);
+ }
+ }
+
+ return 0;
+}
+
+int
+afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this)
+{
+ afr_internal_lock_t *int_lock = NULL;
+ afr_inodelk_t *inodelk = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ afr_fd_ctx_t *fd_ctx = NULL;
+ int32_t call_count = 0;
+ int i = 0;
+ int ret = 0;
+ struct gf_flock flock = {0,};
+ struct gf_flock full_flock = {0,};
+ struct gf_flock *flock_use = NULL;
+ int piggyback = 0;
+
+ local = frame->local;
+ int_lock = &local->internal_lock;
+ priv = this->private;
+
+ inodelk = afr_get_inodelk (int_lock, int_lock->domain);
+
+ flock.l_start = inodelk->flock.l_start;
+ flock.l_len = inodelk->flock.l_len;
+ flock.l_type = inodelk->flock.l_type;
+
+ full_flock.l_type = inodelk->flock.l_type;
+
+ initialize_inodelk_variables (frame, this);
+
+ if (local->fd) {
+ fd_ctx = afr_fd_ctx_get (local->fd, this);
+ if (!fd_ctx) {
+ gf_log (this->name, GF_LOG_INFO,
+ "unable to get fd ctx for fd=%p",
+ local->fd);
+
+ local->op_ret = -1;
+ int_lock->lock_op_ret = -1;
+ local->op_errno = EINVAL;
+ int_lock->lock_op_errno = EINVAL;
+
+ afr_unlock (frame, this);
+ ret = -1;
+ goto out;
+ }
+
+ call_count = internal_lock_count (frame, this);
+ int_lock->lk_call_count = call_count;
+ int_lock->lk_expected_count = call_count;
+
+ if (!call_count) {
+ gf_log (this->name, GF_LOG_INFO,
+ "fd not open on any subvolumes. aborting.");
+ afr_unlock (frame, this);
+ goto out;
+ }
+
+ /* Send non-blocking inodelk calls only on up children
+ and where the fd has been opened */
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->child_up[i])
+ continue;
+
+ flock_use = &flock;
+ if (!local->transaction.eager_lock_on) {
+ goto wind;
+ }
+
+ piggyback = 0;
+ local->transaction.eager_lock[i] = 1;
+
+ afr_set_delayed_post_op (frame, this);
+
+ LOCK (&local->fd->lock);
+ {
+ if (fd_ctx->lock_acquired[i]) {
+ fd_ctx->lock_piggyback[i]++;
+ piggyback = 1;
+ }
+ }
+ UNLOCK (&local->fd->lock);
+
+ if (piggyback) {
+ /* (op_ret == 1) => indicate piggybacked lock */
+ afr_nonblocking_inodelk_cbk (frame, (void *) (long) i,
+ this, 1, 0, NULL);
+ if (!--call_count)
+ break;
+ continue;
+ }
+ flock_use = &full_flock;
+ wind:
+ AFR_TRACE_INODELK_IN (frame, this,
+ AFR_INODELK_NB_TRANSACTION,
+ AFR_LOCK_OP, flock_use, F_SETLK, i);
+
+ STACK_WIND_COOKIE (frame, afr_nonblocking_inodelk_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->finodelk,
+ int_lock->domain, local->fd,
+ F_SETLK, flock_use, NULL);
+
+ if (!--call_count)
+ break;
+ }
+ } else {
+ call_count = internal_lock_count (frame, this);
+ int_lock->lk_call_count = call_count;
+ int_lock->lk_expected_count = call_count;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->child_up[i])
+ continue;
+ AFR_TRACE_INODELK_IN (frame, this,
+ AFR_INODELK_NB_TRANSACTION,
+ AFR_LOCK_OP, &flock, F_SETLK, i);
+
+ STACK_WIND_COOKIE (frame, afr_nonblocking_inodelk_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->inodelk,
+ int_lock->domain, &local->loc,
+ F_SETLK, &flock, NULL);
+
+ if (!--call_count)
+ break;
+ }
+ }
+out:
+ return ret;
+}
+
+int32_t
+afr_unlock (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+
+ local = frame->local;
+
+ if (transaction_lk_op (local)) {
+ if (is_afr_lock_transaction (local))
+ afr_unlock_inodelk (frame, this);
+ else
+ afr_unlock_entrylk (frame, this);
+
+ } else {
+ if (is_afr_lock_selfheal (local))
+ afr_unlock_inodelk (frame, this);
+ else
+ afr_unlock_entrylk (frame, this);
+ }
+
+ return 0;
+}
+
+int
+afr_lk_transfer_datalock (call_frame_t *dst, call_frame_t *src, char *dom,
+ unsigned int child_count)
+{
+ afr_local_t *dst_local = NULL;
+ afr_local_t *src_local = NULL;
+ afr_internal_lock_t *dst_lock = NULL;
+ afr_internal_lock_t *src_lock = NULL;
+ afr_inodelk_t *dst_inodelk = NULL;
+ afr_inodelk_t *src_inodelk = NULL;
+ int ret = -1;
+
+ src_local = src->local;
+ src_lock = &src_local->internal_lock;
+ src_inodelk = afr_get_inodelk (src_lock, dom);
+ dst_local = dst->local;
+ dst_lock = &dst_local->internal_lock;
+ dst_inodelk = afr_get_inodelk (dst_lock, dom);
+ if (!dst_inodelk || !src_inodelk)
+ goto out;
+ if (src_inodelk->locked_nodes) {
+ memcpy (dst_inodelk->locked_nodes, src_inodelk->locked_nodes,
+ sizeof (*dst_inodelk->locked_nodes) * child_count);
+ memset (src_inodelk->locked_nodes, 0,
+ sizeof (*src_inodelk->locked_nodes) * child_count);
+ }
+
+ dst_lock->transaction_lk_type = src_lock->transaction_lk_type;
+ dst_lock->selfheal_lk_type = src_lock->selfheal_lk_type;
+ dst_inodelk->lock_count = src_inodelk->lock_count;
+ src_inodelk->lock_count = 0;
+ ret = 0;
+out:
+ return ret;
+}
diff --git a/xlators/cluster/afr/src/afr-mem-types.h b/xlators/cluster/afr/src/afr-mem-types.h
new file mode 100644
index 000000000..05df90cc0
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-mem-types.h
@@ -0,0 +1,49 @@
+/*
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+
+#ifndef __AFR_MEM_TYPES_H__
+#define __AFR_MEM_TYPES_H__
+
+#include "mem-types.h"
+
+enum gf_afr_mem_types_ {
+ gf_afr_mt_iovec = gf_common_mt_end + 1,
+ gf_afr_mt_afr_fd_ctx_t,
+ gf_afr_mt_afr_private_t,
+ gf_afr_mt_int32_t,
+ gf_afr_mt_char,
+ gf_afr_mt_xattr_key,
+ gf_afr_mt_dict_t,
+ gf_afr_mt_xlator_t,
+ gf_afr_mt_iatt,
+ gf_afr_mt_int,
+ gf_afr_mt_afr_node_character,
+ gf_afr_mt_sh_diff_loop_state,
+ gf_afr_mt_uint8_t,
+ gf_afr_mt_loc_t,
+ gf_afr_mt_entry_name,
+ gf_afr_mt_pump_priv,
+ gf_afr_mt_locked_fd,
+ gf_afr_mt_inode_ctx_t,
+ gf_afr_fd_paused_call_t,
+ gf_afr_mt_crawl_data_t,
+ gf_afr_mt_brick_pos_t,
+ gf_afr_mt_shd_bool_t,
+ gf_afr_mt_shd_timer_t,
+ gf_afr_mt_shd_event_t,
+ gf_afr_mt_time_t,
+ gf_afr_mt_pos_data_t,
+ gf_afr_mt_reply_t,
+ gf_afr_mt_subvol_healer_t,
+ gf_afr_mt_end
+};
+#endif
+
diff --git a/xlators/cluster/afr/src/afr-open.c b/xlators/cluster/afr/src/afr-open.c
new file mode 100644
index 000000000..f86aa7fd8
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-open.c
@@ -0,0 +1,337 @@
+/*
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include <libgen.h>
+#include <unistd.h>
+#include <fnmatch.h>
+#include <sys/time.h>
+#include <stdlib.h>
+#include <signal.h>
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "afr.h"
+#include "dict.h"
+#include "xlator.h"
+#include "hashfn.h"
+#include "logging.h"
+#include "stack.h"
+#include "list.h"
+#include "call-stub.h"
+#include "defaults.h"
+#include "common-utils.h"
+#include "compat-errno.h"
+#include "compat.h"
+#include "byte-order.h"
+#include "statedump.h"
+
+#include "fd.h"
+
+#include "afr-inode-read.h"
+#include "afr-inode-write.h"
+#include "afr-dir-read.h"
+#include "afr-dir-write.h"
+#include "afr-transaction.h"
+
+
+gf_boolean_t
+afr_is_fd_fixable (fd_t *fd)
+{
+ if (!fd || !fd->inode)
+ return _gf_false;
+ else if (fd_is_anonymous (fd))
+ return _gf_false;
+ else if (uuid_is_null (fd->inode->gfid))
+ return _gf_false;
+
+ return _gf_true;
+}
+
+
+int
+afr_open_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ afr_local_t * local = frame->local;
+
+ AFR_STACK_UNWIND (open, frame, local->op_ret, local->op_errno,
+ local->fd, xdata);
+ return 0;
+}
+
+
+int
+afr_open_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno,
+ fd_t *fd, dict_t *xdata)
+{
+ afr_local_t * local = NULL;
+ int call_count = -1;
+ int child_index = (long) cookie;
+ afr_fd_ctx_t *fd_ctx = NULL;
+
+ local = frame->local;
+ fd_ctx = local->fd_ctx;
+
+ LOCK (&frame->lock);
+ {
+ if (op_ret == -1) {
+ local->op_errno = op_errno;
+ fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED;
+ } else {
+ local->op_ret = op_ret;
+ fd_ctx->opened_on[child_index] = AFR_FD_OPENED;
+ if (!local->xdata_rsp && xdata)
+ local->xdata_rsp = dict_ref (xdata);
+ }
+ }
+ UNLOCK (&frame->lock);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0) {
+ if ((fd_ctx->flags & O_TRUNC) && (local->op_ret >= 0)) {
+ STACK_WIND (frame, afr_open_ftruncate_cbk,
+ this, this->fops->ftruncate,
+ fd, 0, NULL);
+ } else {
+ AFR_STACK_UNWIND (open, frame, local->op_ret,
+ local->op_errno, local->fd,
+ local->xdata_rsp);
+ }
+ }
+
+ return 0;
+}
+
+int
+afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+ fd_t *fd, dict_t *xdata)
+{
+ afr_private_t * priv = NULL;
+ afr_local_t * local = NULL;
+ int i = 0;
+ int32_t call_count = 0;
+ int32_t op_errno = 0;
+ afr_fd_ctx_t *fd_ctx = NULL;
+
+ //We can't let truncation to happen outside transaction.
+
+ priv = this->private;
+
+ if (flags & (O_CREAT|O_TRUNC)) {
+ QUORUM_CHECK(open,out);
+ }
+
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
+
+ fd_ctx = afr_fd_ctx_get (fd, this);
+ if (!fd_ctx) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ local->fd = fd_ref (fd);
+ local->fd_ctx = fd_ctx;
+ fd_ctx->flags = flags;
+
+ call_count = local->call_count;
+
+ local->cont.open.flags = flags;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND_COOKIE (frame, afr_open_cbk, (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->open,
+ loc, (flags & ~O_TRUNC), fd, xdata);
+ if (!--call_count)
+ break;
+ }
+ }
+
+ return 0;
+out:
+ AFR_STACK_UNWIND (open, frame, -1, op_errno, fd, NULL);
+
+ return 0;
+}
+
+int
+afr_openfd_fix_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, fd_t *fd,
+ dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ afr_fd_ctx_t *fd_ctx = NULL;
+ int call_count = 0;
+ int child_index = (long) cookie;
+
+ priv = this->private;
+ local = frame->local;
+
+ if (op_ret >= 0) {
+ gf_log (this->name, GF_LOG_DEBUG, "fd for %s opened "
+ "successfully on subvolume %s", local->loc.path,
+ priv->children[child_index]->name);
+ } else {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to open %s "
+ "on subvolume %s", local->loc.path,
+ priv->children[child_index]->name);
+ }
+
+ fd_ctx = local->fd_ctx;
+
+ LOCK (&local->fd->lock);
+ {
+ if (op_ret >= 0) {
+ fd_ctx->opened_on[child_index] = AFR_FD_OPENED;
+ } else {
+ fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED;
+ }
+ }
+ UNLOCK (&local->fd->lock);
+
+ call_count = afr_frame_return (frame);
+ if (call_count == 0)
+ AFR_STACK_DESTROY (frame);
+
+ return 0;
+}
+
+
+static int
+afr_fd_ctx_need_open (fd_t *fd, xlator_t *this, unsigned char *need_open)
+{
+ afr_fd_ctx_t *fd_ctx = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ int count = 0;
+
+ priv = this->private;
+
+ fd_ctx = afr_fd_ctx_get (fd, this);
+ if (!fd_ctx)
+ return 0;
+
+ LOCK (&fd->lock);
+ {
+ for (i = 0; i < priv->child_count; i++) {
+ if (fd_ctx->opened_on[i] == AFR_FD_NOT_OPENED &&
+ priv->child_up[i]) {
+ fd_ctx->opened_on[i] = AFR_FD_OPENING;
+ need_open[i] = 1;
+ count++;
+ } else {
+ need_open[i] = 0;
+ }
+ }
+ }
+ UNLOCK (&fd->lock);
+
+ return count;
+}
+
+
+void
+afr_fix_open (fd_t *fd, xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ int i = 0;
+ call_frame_t *frame = NULL;
+ afr_local_t *local = NULL;
+ int ret = -1;
+ int32_t op_errno = 0;
+ afr_fd_ctx_t *fd_ctx = NULL;
+ unsigned char *need_open = NULL;
+ int call_count = 0;
+
+ priv = this->private;
+
+ if (!afr_is_fd_fixable (fd))
+ goto out;
+
+ fd_ctx = afr_fd_ctx_get (fd, this);
+ if (!fd_ctx)
+ goto out;
+
+ need_open = alloca0 (priv->child_count);
+
+ call_count = afr_fd_ctx_need_open (fd, this, need_open);
+ if (!call_count)
+ goto out;
+
+ frame = create_frame (this, this->ctx->pool);
+ if (!frame)
+ goto out;
+
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
+
+ local->loc.inode = inode_ref (fd->inode);
+ ret = loc_path (&local->loc, NULL);
+ if (ret < 0)
+ goto out;
+
+ local->fd = fd_ref (fd);
+ local->fd_ctx = fd_ctx;
+
+ local->call_count = call_count;
+
+ gf_log (this->name, GF_LOG_DEBUG, "need open count: %d",
+ call_count);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!need_open[i])
+ continue;
+
+ if (IA_IFDIR == fd->inode->ia_type) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "opening fd for dir %s on subvolume %s",
+ local->loc.path, priv->children[i]->name);
+
+ STACK_WIND_COOKIE (frame, afr_openfd_fix_open_cbk,
+ (void*) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->opendir,
+ &local->loc, local->fd,
+ NULL);
+ } else {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "opening fd for file %s on subvolume %s",
+ local->loc.path, priv->children[i]->name);
+
+ STACK_WIND_COOKIE (frame, afr_openfd_fix_open_cbk,
+ (void *)(long) i,
+ priv->children[i],
+ priv->children[i]->fops->open,
+ &local->loc,
+ fd_ctx->flags & (~O_TRUNC),
+ local->fd, NULL);
+ }
+
+ if (!--call_count)
+ break;
+ }
+
+ return;
+out:
+ if (frame)
+ AFR_STACK_DESTROY (frame);
+}
diff --git a/xlators/cluster/afr/src/afr-read-txn.c b/xlators/cluster/afr/src/afr-read-txn.c
new file mode 100644
index 000000000..186f68c33
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-read-txn.c
@@ -0,0 +1,239 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include "afr.h"
+#include "afr-transaction.h"
+
+int
+afr_read_txn_next_subvol (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ int subvol = -1;
+
+ local = frame->local;
+ priv = this->private;
+
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->readable[i]) {
+ /* don't even bother trying here.
+ just mark as attempted and move on. */
+ local->read_attempted[i] = 1;
+ continue;
+ }
+
+ if (!local->read_attempted[i]) {
+ subvol = i;
+ break;
+ }
+ }
+
+ /* If no more subvols were available for reading, we leave
+ @subvol as -1, which is an indication we have run out of
+ readable subvols. */
+ if (subvol != -1)
+ local->read_attempted[subvol] = 1;
+ local->readfn (frame, this, subvol);
+
+ return 0;
+}
+
+
+int
+afr_read_txn_refresh_done (call_frame_t *frame, xlator_t *this, int err)
+{
+ afr_local_t *local = NULL;
+ int read_subvol = 0;
+ int event_generation = 0;
+ inode_t *inode = NULL;
+ int ret = -1;
+
+ local = frame->local;
+ inode = local->inode;
+
+ if (err) {
+ local->op_errno = -err;
+ local->op_ret = -1;
+ read_subvol = -1;
+ goto readfn;
+ }
+
+ ret = afr_inode_read_subvol_type_get (inode, this, local->readable,
+ &event_generation,
+ local->transaction.type);
+
+ if (ret == -1 || !event_generation) {
+ /* Even after refresh, we don't have a good
+ read subvolume. Time to bail */
+ local->op_ret = -1;
+ local->op_errno = EIO;
+ read_subvol = -1;
+ goto readfn;
+ }
+
+ read_subvol = afr_read_subvol_select_by_policy (inode, this,
+ local->readable);
+
+ if (read_subvol == -1) {
+ local->op_ret = -1;
+ local->op_errno = EIO;
+ goto readfn;
+ }
+
+ if (local->read_attempted[read_subvol]) {
+ afr_read_txn_next_subvol (frame, this);
+ return 0;
+ }
+
+ local->read_attempted[read_subvol] = 1;
+readfn:
+ local->readfn (frame, this, read_subvol);
+
+ return 0;
+}
+
+
+int
+afr_read_txn_continue (call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_local_t *local = NULL;
+
+ local = frame->local;
+
+ if (!local->refreshed) {
+ local->refreshed = _gf_true;
+ afr_inode_refresh (frame, this, local->inode,
+ afr_read_txn_refresh_done);
+ } else {
+ afr_read_txn_next_subvol (frame, this);
+ }
+
+ return 0;
+}
+
+
+/* afr_read_txn_wipe:
+
+ clean internal variables in @local in order to make
+ it possible to call afr_read_txn() multiple times from
+ the same frame
+*/
+
+void
+afr_read_txn_wipe (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ local->readfn = NULL;
+
+ if (local->inode)
+ inode_unref (local->inode);
+
+ for (i = 0; i < priv->child_count; i++) {
+ local->read_attempted[i] = 0;
+ local->readable[i] = 0;
+ }
+}
+
+
+/*
+ afr_read_txn:
+
+ This is the read transaction function. The way it works:
+
+ - Determine read-subvolume from inode ctx.
+
+ - If read-subvolume's generation was stale, refresh ctx once by
+ calling afr_inode_refresh()
+
+ Else make an attempt to read on read-subvolume.
+
+ - If attempted read on read-subvolume fails, refresh ctx once
+ by calling afr_inode_refresh()
+
+ - After ctx refresh, query read-subvolume freshly and attempt
+ read once.
+
+ - If read fails, try every other readable[] subvolume before
+ finally giving up. readable[] elements are set by afr_inode_refresh()
+ based on dirty and pending flags.
+
+ - If file is in split brain in the backend, generation will be
+ kept 0 by afr_inode_refresh() and readable[] will be set 0 for
+ all elements. Therefore reads always fail.
+*/
+
+int
+afr_read_txn (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ afr_read_txn_wind_t readfn, afr_transaction_type type)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int read_subvol = -1;
+ int event_generation = 0;
+ int ret = -1;
+
+ priv = this->private;
+ local = frame->local;
+
+ afr_read_txn_wipe (frame, this);
+
+ local->readfn = readfn;
+ local->inode = inode_ref (inode);
+
+ local->transaction.type = type;
+ ret = afr_inode_read_subvol_type_get (inode, this, local->readable,
+ &event_generation, type);
+ if (ret == -1)
+ /* very first transaction on this inode */
+ goto refresh;
+
+ if (local->event_generation != event_generation)
+ /* servers have disconnected / reconnected, and possibly
+ rebooted, very likely changing the state of freshness
+ of copies */
+ goto refresh;
+
+ read_subvol = afr_read_subvol_select_by_policy (inode, this,
+ local->readable);
+
+ if (read_subvol < 0 || read_subvol > priv->child_count) {
+ gf_log (this->name, GF_LOG_WARNING, "Unreadable subvolume %d "
+ "found with event generation %d", read_subvol,
+ event_generation);
+ goto refresh;
+ }
+
+ if (!local->child_up[read_subvol]) {
+ /* should never happen, just in case */
+ gf_log (this->name, GF_LOG_WARNING, "subvolume %d is the "
+ "read subvolume in this generation, but is not up",
+ read_subvol);
+ goto refresh;
+ }
+
+ local->read_attempted[read_subvol] = 1;
+
+ local->readfn (frame, this, read_subvol);
+
+ return 0;
+
+refresh:
+ afr_inode_refresh (frame, this, inode, afr_read_txn_refresh_done);
+
+ return 0;
+}
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c
index f6a4bf9fb..4dac83113 100644
--- a/xlators/cluster/afr/src/afr-self-heal-common.c
+++ b/xlators/cluster/afr/src/afr-self-heal-common.c
@@ -1,584 +1,383 @@
/*
- Copyright (c) 2008-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
-#include "glusterfs.h"
-#include "xlator.h"
-#include "byte-order.h"
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
#include "afr.h"
-#include "afr-transaction.h"
-#include "afr-self-heal-common.h"
#include "afr-self-heal.h"
+#include "byte-order.h"
-/**
- * select_source - select a source and return it
- * TODO: take into account option 'favorite-child'
- */
-
int
-afr_sh_select_source (int sources[], int child_count)
+afr_selfheal_post_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xattr, dict_t *xdata)
{
- int i;
- for (i = 0; i < child_count; i++)
- if (sources[i])
- return i;
-
- return -1;
-}
+ afr_local_t *local = NULL;
+ local = frame->local;
-/**
- * sink_count - return number of sinks in sources array
- */
+ syncbarrier_wake (&local->barrier);
-int
-afr_sh_sink_count (int sources[], int child_count)
-{
- int i;
- int sinks = 0;
- for (i = 0; i < child_count; i++)
- if (!sources[i])
- sinks++;
- return sinks;
+ return 0;
}
+
int
-afr_sh_source_count (int sources[], int child_count)
+afr_selfheal_post_op (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ int subvol, dict_t *xattr)
{
- int i;
- int nsource = 0;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ loc_t loc = {0, };
- for (i = 0; i < child_count; i++)
- if (sources[i])
- nsource++;
- return nsource;
-}
+ priv = this->private;
+ local = frame->local;
+ loc.inode = inode_ref (inode);
+ uuid_copy (loc.gfid, inode->gfid);
-int
-afr_sh_supress_errenous_children (int sources[], int child_errno[],
- int child_count)
-{
- int i = 0;
+ STACK_WIND (frame, afr_selfheal_post_op_cbk, priv->children[subvol],
+ priv->children[subvol]->fops->xattrop, &loc,
+ GF_XATTROP_ADD_ARRAY, xattr, NULL);
- for (i = 0; i < child_count; i++) {
- if (child_errno[i] && sources[i]) {
- sources[i] = 0;
- }
- }
+ syncbarrier_wait (&local->barrier, 1);
return 0;
}
-void
-afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this)
+dict_t *
+afr_selfheal_output_xattr (xlator_t *this, afr_transaction_type type,
+ int *output_dirty, int **output_matrix, int subvol)
{
- afr_private_t * priv = this->private;
+ dict_t *xattr = NULL;
+ afr_private_t *priv = NULL;
+ int j = 0;
+ int idx = 0;
+ int ret = 0;
+ int *raw = 0;
- char *buf = NULL;
- char *ptr = NULL;
+ priv = this->private;
+ idx = afr_index_for_transaction_type (type);
- int i, j;
+ xattr = dict_new ();
+ if (!xattr)
+ return NULL;
- /* 10 digits per entry + 1 space + '[' and ']' */
- buf = MALLOC (priv->child_count * 11 + 8);
+ if (output_dirty[subvol]) {
+ /* clear dirty */
+ raw = GF_CALLOC (sizeof(int), AFR_NUM_CHANGE_LOGS, gf_afr_mt_int32_t);
+ if (!raw)
+ goto err;
- for (i = 0; i < priv->child_count; i++) {
- ptr = buf;
- ptr += sprintf (ptr, "[ ");
- for (j = 0; j < priv->child_count; j++) {
- ptr += sprintf (ptr, "%d ", pending_matrix[i][j]);
- }
- ptr += sprintf (ptr, "]");
- gf_log (this->name, GF_LOG_DEBUG,
- "pending_matrix: %s", buf);
+ raw[idx] = hton32 (output_dirty[subvol]);
+ ret = dict_set_bin (xattr, AFR_DIRTY, raw,
+ sizeof(int) * AFR_NUM_CHANGE_LOGS);
+ if (ret)
+ goto err;
}
- FREE (buf);
-}
-
-
-void
-afr_sh_build_pending_matrix (afr_private_t *priv,
- int32_t *pending_matrix[], dict_t *xattr[],
- int child_count, afr_transaction_type type)
-{
- int i, j, k;
-
- int32_t *pending = NULL;
- int ret = -1;
+ /* clear/set pending */
+ for (j = 0; j < priv->child_count; j++) {
+ if (!output_matrix[subvol][j])
+ continue;
- unsigned char *ignorant_subvols = NULL;
+ raw = GF_CALLOC (sizeof(int), AFR_NUM_CHANGE_LOGS,
+ gf_afr_mt_int32_t);
+ if (!raw)
+ goto err;
- ignorant_subvols = CALLOC (sizeof (*ignorant_subvols), child_count);
+ raw[idx] = hton32 (output_matrix[subvol][j]);
- /* start clean */
- for (i = 0; i < child_count; i++) {
- for (j = 0; j < child_count; j++) {
- pending_matrix[i][j] = 0;
- }
+ ret = dict_set_bin (xattr, priv->pending_key[j],
+ raw, sizeof(int) * AFR_NUM_CHANGE_LOGS);
+ if (ret)
+ goto err;
}
- for (i = 0; i < child_count; i++) {
- pending = NULL;
-
- for (j = 0; j < child_count; j++) {
- ret = dict_get_ptr (xattr[i], priv->pending_key[j],
- VOID(&pending));
-
- if (ret != 0) {
- /*
- * There is no xattr present. This means this
- * subvolume should be considered an 'ignorant'
- * subvolume.
- */
-
- ignorant_subvols[i] = 1;
- continue;
- }
-
- k = afr_index_for_transaction_type (type);
-
- pending_matrix[i][j] = ntoh32 (pending[k]);
- }
- }
-
- /*
- * Make all non-ignorant subvols point towards the ignorant
- * subvolumes.
- */
-
- for (i = 0; i < child_count; i++) {
- if (ignorant_subvols[i]) {
- for (j = 0; j < child_count; j++) {
- if (!ignorant_subvols[j])
- pending_matrix[j][i] += 1;
- }
- }
- }
-
- FREE (ignorant_subvols);
+ return xattr;
+err:
+ if (xattr)
+ dict_unref (xattr);
+ return NULL;
}
-/**
- * mark_sources: Mark all 'source' nodes and return number of source
- * nodes found
- *
- * A node (a row in the pending matrix) belongs to one of
- * three categories:
- *
- * M is the pending matrix.
- *
- * 'innocent' - M[i] is all zeroes
- * 'fool' - M[i] has i'th element = 1 (self-reference)
- * 'wise' - M[i] has i'th element = 0, others are 1 or 0.
- *
- * All 'innocent' nodes are sinks. If all nodes are innocent, no self-heal is
- * needed.
- *
- * A 'wise' node can be a source. If two 'wise' nodes conflict, it is
- * a split-brain. If one wise node refers to the other but the other doesn't
- * refer back, the referrer is a source.
- *
- * All fools are sinks, unless there are no 'wise' nodes. In that case,
- * one of the fools is made a source.
- */
-
-typedef enum {
- AFR_NODE_INNOCENT,
- AFR_NODE_FOOL,
- AFR_NODE_WISE
-} afr_node_type;
-
-typedef struct {
- afr_node_type type;
- int wisdom;
-} afr_node_character;
-
-
-static int
-afr_sh_is_innocent (int32_t *array, int child_count)
+int
+afr_selfheal_undo_pending (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ unsigned char *sources, unsigned char *sinks,
+ unsigned char *healed_sinks, afr_transaction_type type,
+ struct afr_reply *replies, unsigned char *locked_on)
{
- int i = 0;
- int ret = 1; /* innocent until proven guilty */
-
- for (i = 0; i < child_count; i++) {
- if (array[i]) {
- ret = 0;
- break;
- }
- }
+ afr_private_t *priv = NULL;
+ int i = 0;
+ int j = 0;
+ unsigned char *pending = NULL;
+ int *input_dirty = NULL;
+ int **input_matrix = NULL;
+ int *output_dirty = NULL;
+ int **output_matrix = NULL;
+ dict_t *xattr = NULL;
- return ret;
-}
+ priv = this->private;
+ pending = alloca0 (priv->child_count);
-static int
-afr_sh_is_fool (int32_t *array, int i, int child_count)
-{
- return array[i]; /* fool if accuses itself */
-}
+ input_dirty = alloca0 (priv->child_count * sizeof (int));
+ input_matrix = ALLOC_MATRIX (priv->child_count, int);
+ output_dirty = alloca0 (priv->child_count * sizeof (int));
+ output_matrix = ALLOC_MATRIX (priv->child_count, int);
+ afr_selfheal_extract_xattr (this, replies, type, input_dirty,
+ input_matrix);
-static int
-afr_sh_is_wise (int32_t *array, int i, int child_count)
-{
- return !array[i]; /* wise if does not accuse itself */
-}
-
+ for (i = 0; i < priv->child_count; i++)
+ if (sinks[i] && !healed_sinks[i])
+ pending[i] = 1;
-static int
-afr_sh_all_nodes_innocent (afr_node_character *characters,
- int child_count)
-{
- int i = 0;
- int ret = 1;
+ for (i = 0; i < priv->child_count; i++) {
+ for (j = 0; j < priv->child_count; j++) {
+ if (pending[j])
+ output_matrix[i][j] = 1;
+ else
+ output_matrix[i][j] = -input_matrix[i][j];
+ }
+ }
- for (i = 0; i < child_count; i++) {
- if (characters[i].type != AFR_NODE_INNOCENT) {
- ret = 0;
- break;
- }
- }
+ for (i = 0; i < priv->child_count; i++) {
+ if (!pending[i])
+ output_dirty[i] = -input_dirty[i];
+ }
- return ret;
-}
+ for (i = 0; i < priv->child_count; i++) {
+ if (!locked_on[i])
+ /* perform post-op only on subvols we had locked
+ and inspected on.
+ */
+ continue;
+ xattr = afr_selfheal_output_xattr (this, type, output_dirty,
+ output_matrix, i);
+ if (!xattr) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "unable to allocate xdata for subvol %d", i);
+ continue;
+ }
-static int
-afr_sh_wise_nodes_exist (afr_node_character *characters, int child_count)
-{
- int i = 0;
- int ret = 0;
+ afr_selfheal_post_op (frame, this, inode, i, xattr);
- for (i = 0; i < child_count; i++) {
- if (characters[i].type == AFR_NODE_WISE) {
- ret = 1;
- break;
- }
- }
+ dict_unref (xattr);
+ }
- return ret;
+ return 0;
}
-/*
- * The 'wisdom' of a wise node is 0 if any other wise node accuses it.
- * It is 1 if no other wise node accuses it.
- * Only wise nodes with wisdom 1 are sources.
- *
- * If no nodes with wisdom 1 exist, a split-brain has occured.
- */
-
-static void
-afr_sh_compute_wisdom (int32_t *pending_matrix[],
- afr_node_character characters[], int child_count)
+void
+afr_replies_copy (struct afr_reply *dst, struct afr_reply *src, int count)
{
- int i = 0;
- int j = 0;
-
- for (i = 0; i < child_count; i++) {
- if (characters[i].type == AFR_NODE_WISE) {
- characters[i].wisdom = 1;
-
- for (j = 0; j < child_count; j++) {
- if ((characters[j].type == AFR_NODE_WISE)
- && pending_matrix[j][i]) {
-
- characters[i].wisdom = 0;
- }
- }
- }
- }
+ int i = 0;
+ dict_t *xdata = NULL;
+
+ if (dst == src)
+ return;
+
+ for (i = 0; i < count; i++) {
+ dst[i].valid = src[i].valid;
+ dst[i].op_ret = src[i].op_ret;
+ dst[i].op_errno = src[i].op_errno;
+ dst[i].prestat = src[i].prestat;
+ dst[i].poststat = src[i].poststat;
+ dst[i].preparent = src[i].preparent;
+ dst[i].postparent = src[i].postparent;
+ dst[i].preparent2 = src[i].preparent2;
+ dst[i].postparent2 = src[i].postparent2;
+ if (src[i].xdata)
+ xdata = dict_ref (src[i].xdata);
+ else
+ xdata = NULL;
+ if (dst[i].xdata)
+ dict_unref (dst[i].xdata);
+ dst[i].xdata = xdata;
+ memcpy (dst[i].checksum, src[i].checksum,
+ MD5_DIGEST_LENGTH);
+ }
}
-static int
-afr_sh_wise_nodes_conflict (afr_node_character *characters,
- int child_count)
+int
+afr_selfheal_fill_dirty (xlator_t *this, int *dirty, int subvol,
+ int idx, dict_t *xdata)
{
- int i = 0;
- int ret = 1;
+ void *pending_raw = NULL;
+ int pending[3] = {0, };
- for (i = 0; i < child_count; i++) {
- if ((characters[i].type == AFR_NODE_WISE)
- && characters[i].wisdom == 1) {
+ if (dict_get_ptr (xdata, AFR_DIRTY, &pending_raw))
+ return -1;
- /* There is atleast one bona-fide wise node */
- ret = 0;
- break;
- }
- }
+ if (!pending_raw)
+ return -1;
- return ret;
-}
+ memcpy (pending, pending_raw, sizeof(pending));
+ dirty[subvol] = ntoh32 (pending[idx]);
-static int
-afr_sh_mark_wisest_as_sources (int sources[],
- afr_node_character *characters,
- int child_count)
-{
- int nsources = 0;
-
- int i = 0;
-
- for (i = 0; i < child_count; i++) {
- if (characters[i].wisdom == 1) {
- sources[i] = 1;
- nsources++;
- }
- }
-
- return nsources;
+ return 0;
}
-static int
-afr_sh_mark_if_size_differs (afr_self_heal_t *sh, int child_count)
+int
+afr_selfheal_fill_matrix (xlator_t *this, int **matrix, int subvol,
+ int idx, dict_t *xdata)
{
- int32_t ** pending_matrix;
- int i, j;
-
- int size_differs = 0;
-
- pending_matrix = sh->pending_matrix;
-
- for (i = 0; i < child_count; i++) {
- for (j = 0; j < child_count; j++) {
- if (SIZE_DIFFERS (&sh->buf[i], &sh->buf[j])
- && (pending_matrix[i][j] == 0)
- && (pending_matrix[j][i] == 0)) {
-
- pending_matrix[i][j] = 1;
- pending_matrix[j][i] = 1;
-
- size_differs = 1;
- }
- }
- }
-
- return size_differs;
-}
+ int i = 0;
+ void *pending_raw = NULL;
+ int pending[3] = {0, };
+ afr_private_t *priv = NULL;
-
-static int
-afr_sh_mark_biggest_fool_as_source (afr_self_heal_t *sh,
- afr_node_character *characters,
- int child_count)
-{
- int i = 0;
- int biggest = 0;
-
- for (i = 0; i < child_count; i++) {
- if (characters[i].type == AFR_NODE_FOOL) {
- biggest = i;
- break;
- }
- }
-
- for (i = 0; i < child_count; i++) {
- if (characters[i].type != AFR_NODE_FOOL)
- continue;
-
- if (SIZE_GREATER (&sh->buf[i], &sh->buf[biggest])) {
- biggest = i;
- }
- }
-
- sh->sources[biggest] = 1;
-
- return 1;
-}
+ priv = this->private;
+ for (i = 0; i < priv->child_count; i++) {
+ if (dict_get_ptr (xdata, priv->pending_key[i], &pending_raw))
+ continue;
-static int
-afr_sh_mark_biggest_as_source (afr_self_heal_t *sh, int child_count)
-{
- int biggest = 0;
- int i;
+ if (!pending_raw)
+ continue;
- for (i = 0; i < child_count; i++) {
- if (SIZE_GREATER (&sh->buf[i], &sh->buf[biggest])) {
- biggest = i;
- }
- }
+ memcpy (pending, pending_raw, sizeof(pending));
- sh->sources[biggest] = 1;
+ matrix[subvol][i] = ntoh32 (pending[idx]);
+ }
- return 1;
+ return 0;
}
int
-afr_sh_mark_sources (afr_self_heal_t *sh, int child_count,
- afr_self_heal_type type)
+afr_selfheal_extract_xattr (xlator_t *this, struct afr_reply *replies,
+ afr_transaction_type type, int *dirty, int **matrix)
{
+ afr_private_t *priv = NULL;
int i = 0;
+ dict_t *xdata = NULL;
+ int idx = -1;
- int32_t ** pending_matrix;
- int * sources;
+ idx = afr_index_for_transaction_type (type);
- int size_differs = 0;
-
- pending_matrix = sh->pending_matrix;
- sources = sh->sources;
+ priv = this->private;
- int nsources = 0;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].xdata)
+ continue;
- /* stores the 'characters' (innocent, fool, wise) of the nodes */
- afr_node_character *
- characters = CALLOC (sizeof (afr_node_character),
- child_count);
+ xdata = replies[i].xdata;
- /* start clean */
- for (i = 0; i < child_count; i++) {
- sources[i] = 0;
+ afr_selfheal_fill_dirty (this, dirty, i, idx, xdata);
+ afr_selfheal_fill_matrix (this, matrix, i, idx, xdata);
}
-
- for (i = 0; i < child_count; i++) {
- if (afr_sh_is_innocent (pending_matrix[i], child_count)) {
- characters[i].type = AFR_NODE_INNOCENT;
-
- } else if (afr_sh_is_fool (pending_matrix[i], i, child_count)) {
- characters[i].type = AFR_NODE_FOOL;
-
- } else if (afr_sh_is_wise (pending_matrix[i], i, child_count)) {
- characters[i].type = AFR_NODE_WISE;
-
- } else {
- gf_log ("[module:afr]", GF_LOG_ERROR,
- "node %d is diabolical! "
- "(This message should never appear."
- " Please file a bug report.)", i);
- }
- }
-
- if (type == AFR_SELF_HEAL_DATA) {
- size_differs = afr_sh_mark_if_size_differs (sh, child_count);
- }
-
- if (afr_sh_all_nodes_innocent (characters, child_count)) {
- if (size_differs) {
- nsources = afr_sh_mark_biggest_as_source (sh,
- child_count);
- }
-
- } else if (afr_sh_wise_nodes_exist (characters, child_count)) {
- afr_sh_compute_wisdom (pending_matrix, characters, child_count);
-
- if (afr_sh_wise_nodes_conflict (characters, child_count)) {
- /* split-brain */
-
- nsources = -1;
- goto out;
-
- } else {
- nsources = afr_sh_mark_wisest_as_sources (sources,
- characters,
- child_count);
- }
- } else {
- nsources = afr_sh_mark_biggest_fool_as_source (sh, characters,
- child_count);
- }
-
-out:
- FREE (characters);
- return nsources;
+ return 0;
}
-void
-afr_sh_pending_to_delta (afr_private_t *priv, dict_t **xattr,
- int32_t *delta_matrix[], int success[],
- int child_count, afr_transaction_type type)
-{
- int i = 0;
- int j = 0;
- int k = 0;
-
- int32_t * pending = NULL;
- int ret = 0;
-
- /* start clean */
- for (i = 0; i < child_count; i++) {
- for (j = 0; j < child_count; j++) {
- delta_matrix[i][j] = 0;
- }
- }
-
- for (i = 0; i < child_count; i++) {
- pending = NULL;
-
- for (j = 0; j < child_count; j++) {
- ret = dict_get_ptr (xattr[i], priv->pending_key[j],
- VOID(&pending));
-
- if (!success[j])
- continue;
-
- k = afr_index_for_transaction_type (type);
-
- if (pending) {
- delta_matrix[i][j] = -(ntoh32 (pending[k]));
- } else {
- delta_matrix[i][j] = 0;
- }
-
- }
- }
-}
+/*
+ * This function determines if a self-heal is required for a given inode,
+ * and if needed, in what direction.
+ *
+ * locked_on[] is the array representing servers which have been locked and
+ * from which xattrs have been fetched for analysis.
+ *
+ * The output of the function is by filling the arrays sources[] and sinks[].
+ *
+ * sources[i] is set if i'th server is an eligible source for a selfheal.
+ *
+ * sinks[i] is set if i'th server needs to be healed.
+ *
+ * if sources[0..N] are all set, there is no need for a selfheal.
+ *
+ * if sinks[0..N] are all set, the inode is in split brain.
+ *
+ */
int
-afr_sh_delta_to_xattr (afr_private_t *priv,
- int32_t *delta_matrix[], dict_t *xattr[],
- int child_count, afr_transaction_type type)
+afr_selfheal_find_direction (call_frame_t *frame, xlator_t *this,
+ struct afr_reply *replies,
+ afr_transaction_type type, unsigned char *locked_on,
+ unsigned char *sources, unsigned char *sinks)
{
+ afr_private_t *priv = NULL;
int i = 0;
int j = 0;
- int k = 0;
+ int *dirty = NULL;
+ int **matrix = NULL;
+ char *accused = NULL;
- int ret = 0;
+ priv = this->private;
- int32_t *pending = 0;
+ dirty = alloca0 (priv->child_count * sizeof (int));
+ accused = alloca0 (priv->child_count);
+ matrix = ALLOC_MATRIX(priv->child_count, int);
- for (i = 0; i < child_count; i++) {
- if (!xattr[i])
- continue;
+ /* First construct the pending matrix for further analysis */
+ afr_selfheal_extract_xattr (this, replies, type, dirty, matrix);
- for (j = 0; j < child_count; j++) {
- pending = CALLOC (sizeof (int32_t), 3);
- /* 3 = data+metadata+entry */
+ /* Next short list all accused to exclude them from being sources */
+ for (i = 0; i < priv->child_count; i++) {
+ for (j = 0; j < priv->child_count; j++) {
+ if (matrix[i][j])
+ accused[j] = 1;
+ }
+ }
- k = afr_index_for_transaction_type (type);
+ /* Short list all non-accused as sources */
+ memset (sources, 0, priv->child_count);
+ for (i = 0; i < priv->child_count; i++) {
+ if (!accused[i] && locked_on[i])
+ sources[i] = 1;
+ }
- pending[k] = hton32 (delta_matrix[i][j]);
+ /* Everyone accused by sources are sinks */
+ memset (sinks, 0, priv->child_count);
+ for (i = 0; i < priv->child_count; i++) {
+ if (!sources[i])
+ continue;
+ for (j = 0; j < priv->child_count; j++) {
+ if (matrix[i][j])
+ sinks[j] = 1;
+ }
+ }
- ret = dict_set_bin (xattr[i], priv->pending_key[j],
- pending,
- 3 * sizeof (int32_t));
+ /* If any source has 'dirty' bit, pick first
+ 'dirty' source and make everybody else sinks */
+ for (i = 0; i < priv->child_count; i++) {
+ if (sources[i] && dirty[i]) {
+ for (j = 0; j < priv->child_count; j++) {
+ if (j != i) {
+ sources[j] = 0;
+ sinks[j] = 1;
+ }
+ }
+ break;
+ }
+ }
+
+ /* If no sources, all locked nodes are sinks - split brain */
+ if (AFR_COUNT (sources, priv->child_count) == 0) {
+ for (i = 0; i < priv->child_count; i++) {
+ if (locked_on[i])
+ sinks[i] = 1;
}
}
@@ -587,783 +386,624 @@ afr_sh_delta_to_xattr (afr_private_t *priv,
int
-afr_sh_has_metadata_pending (dict_t *xattr, int child_count, xlator_t *this)
+afr_selfheal_discover_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, inode_t *inode,
+ struct iatt *buf, dict_t *xdata, struct iatt *parbuf)
{
- afr_private_t *priv = NULL;
- int32_t *pending = NULL;
- void *tmp_pending = NULL; /* This is required to remove 'type-punned' warnings from gcc */
-
- int ret = -1;
- int i = 0;
- int j = 0;
+ afr_local_t *local = NULL;
+ int i = -1;
- priv = this->private;
-
- for (i = 0; i < priv->child_count; i++) {
- ret = dict_get_ptr (xattr, priv->pending_key[i],
- &tmp_pending);
-
- if (ret != 0)
- return 0;
-
- pending = tmp_pending;
+ local = frame->local;
+ i = (long) cookie;
- j = afr_index_for_transaction_type (AFR_METADATA_TRANSACTION);
+ local->replies[i].valid = 1;
+ local->replies[i].op_ret = op_ret;
+ local->replies[i].op_errno = op_errno;
+ if (buf)
+ local->replies[i].poststat = *buf;
+ if (parbuf)
+ local->replies[i].postparent = *parbuf;
+ if (xdata)
+ local->replies[i].xdata = dict_ref (xdata);
- if (pending[j])
- return 1;
- }
+ syncbarrier_wake (&local->barrier);
return 0;
}
-int
-afr_sh_has_data_pending (dict_t *xattr, int child_count, xlator_t *this)
+inode_t *
+afr_selfheal_unlocked_lookup_on (call_frame_t *frame, inode_t *parent,
+ const char *name, struct afr_reply *replies,
+ unsigned char *lookup_on)
{
+ loc_t loc = {0, };
+ dict_t *xattr_req = NULL;
+ afr_local_t *local = NULL;
afr_private_t *priv = NULL;
- int32_t *pending = NULL;
- void *tmp_pending = NULL; /* This is required to remove 'type-punned' warnings from gcc */
+ inode_t *inode = NULL;
+
+ local = frame->local;
+ priv = frame->this->private;
- int ret = -1;
- int i = 0;
- int j = 0;
+ xattr_req = dict_new ();
+ if (!xattr_req)
+ return NULL;
- priv = this->private;
+ if (afr_xattr_req_prepare (frame->this, xattr_req) != 0) {
+ dict_destroy (xattr_req);
+ return NULL;
+ }
- for (i = 0; i < priv->child_count; i++) {
- ret = dict_get_ptr (xattr, priv->pending_key[i],
- &tmp_pending);
+ inode = inode_new (parent->table);
+ if (!inode) {
+ dict_destroy (xattr_req);
+ return NULL;
+ }
- if (ret != 0)
- return 0;
-
- pending = tmp_pending;
+ loc.parent = inode_ref (parent);
+ uuid_copy (loc.pargfid, parent->gfid);
+ loc.name = name;
+ loc.inode = inode_ref (inode);
- j = afr_index_for_transaction_type (AFR_DATA_TRANSACTION);
+ AFR_ONLIST (lookup_on, frame, afr_selfheal_discover_cbk, lookup, &loc,
+ xattr_req);
- if (pending[j])
- return 1;
- }
+ afr_replies_copy (replies, local->replies, priv->child_count);
- return 0;
+ loc_wipe (&loc);
+ dict_unref (xattr_req);
+
+ return inode;
}
int
-afr_sh_has_entry_pending (dict_t *xattr, int child_count, xlator_t *this)
+afr_selfheal_unlocked_discover_on (call_frame_t *frame, inode_t *inode,
+ uuid_t gfid, struct afr_reply *replies,
+ unsigned char *discover_on)
{
- afr_private_t *priv = NULL;
- int32_t *pending = NULL;
- void *tmp_pending = NULL; /* This is required to remove 'type-punned' warnings from gcc */
+ loc_t loc = {0, };
+ dict_t *xattr_req = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- int ret = -1;
- int i = 0;
- int j = 0;
+ local = frame->local;
+ priv = frame->this->private;
- priv = this->private;
+ xattr_req = dict_new ();
+ if (!xattr_req)
+ return -ENOMEM;
+
+ if (afr_xattr_req_prepare (frame->this, xattr_req) != 0) {
+ dict_destroy (xattr_req);
+ return -ENOMEM;
+ }
- for (i = 0; i < priv->child_count; i++) {
- ret = dict_get_ptr (xattr, priv->pending_key[i],
- &tmp_pending);
+ loc.inode = inode_ref (inode);
+ uuid_copy (loc.gfid, gfid);
- if (ret != 0)
- return 0;
-
- pending = tmp_pending;
+ AFR_ONLIST (discover_on, frame, afr_selfheal_discover_cbk, lookup, &loc,
+ xattr_req);
- j = afr_index_for_transaction_type (AFR_ENTRY_TRANSACTION);
+ afr_replies_copy (replies, local->replies, priv->child_count);
- if (pending[j])
- return 1;
- }
+ loc_wipe (&loc);
+ dict_unref (xattr_req);
return 0;
}
-
-/**
- * is_matrix_zero - return true if pending matrix is all zeroes
- */
-
int
-afr_sh_is_matrix_zero (int32_t *pending_matrix[], int child_count)
+afr_selfheal_unlocked_discover (call_frame_t *frame, inode_t *inode,
+ uuid_t gfid, struct afr_reply *replies)
{
- int i, j;
+ afr_private_t *priv = NULL;
+
+ priv = frame->this->private;
- for (i = 0; i < child_count; i++)
- for (j = 0; j < child_count; j++)
- if (pending_matrix[i][j])
- return 0;
- return 1;
+ return afr_selfheal_unlocked_discover_on (frame, inode, gfid, replies,
+ priv->child_up);
}
int
-afr_sh_missing_entries_done (call_frame_t *frame, xlator_t *this)
+afr_selfheal_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xdata)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int i = 0;
+ afr_local_t *local = NULL;
+ int i = 0;
local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
+ i = (long) cookie;
-// memset (sh->child_errno, 0, sizeof (int) * priv->child_count);
- memset (sh->buf, 0, sizeof (struct stat) * priv->child_count);
-
- for (i = 0; i < priv->child_count; i++) {
- if (sh->xattr[i])
- dict_unref (sh->xattr[i]);
- sh->xattr[i] = NULL;
- }
+ local->replies[i].valid = 1;
+ local->replies[i].op_ret = op_ret;
+ local->replies[i].op_errno = op_errno;
- if (local->govinda_gOvinda) {
- gf_log (this->name, GF_LOG_WARNING,
- "aborting selfheal of %s",
- local->loc.path);
- sh->completion_cbk (frame, this);
- } else {
- gf_log (this->name, GF_LOG_DEBUG,
- "proceeding to metadata check on %s",
- local->loc.path);
- afr_self_heal_metadata (frame, this);
- }
+ syncbarrier_wake (&local->barrier);
return 0;
}
int
-sh_missing_entries_unlck_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno)
+afr_selfheal_locked_fill (call_frame_t *frame, xlator_t *this,
+ unsigned char *locked_on)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int call_count = 0;
-
+ int i = 0;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int count = 0;
local = frame->local;
- sh = &local->self_heal;
priv = this->private;
- LOCK (&frame->lock);
- {
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- afr_sh_missing_entries_done (frame, this);
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->replies[i].valid && local->replies[i].op_ret == 0) {
+ locked_on[i] = 1;
+ count++;
+ } else {
+ locked_on[i] = 0;
+ }
}
- return 0;
+ return count;
}
-
-static int
-sh_missing_entries_finish (call_frame_t *frame, xlator_t *this)
-{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- int i = 0;
- int call_count = 0;
- afr_self_heal_t *sh = NULL;
+int
+afr_selfheal_tryinodelk (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, off_t off, size_t size,
+ unsigned char *locked_on)
+{
+ loc_t loc = {0,};
+ struct gf_flock flock = {0, };
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- call_count = local->child_count;
+ loc.inode = inode_ref (inode);
+ uuid_copy (loc.gfid, inode->gfid);
- local->call_count = call_count;
+ flock.l_type = F_WRLCK;
+ flock.l_start = off;
+ flock.l_len = size;
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- gf_log (this->name, GF_LOG_DEBUG,
- "unlocking %"PRId64"/%s on subvolume %s",
- sh->parent_loc.inode->ino, local->loc.name,
- priv->children[i]->name);
-
- STACK_WIND (frame, sh_missing_entries_unlck_cbk,
- priv->children[i],
- priv->children[i]->fops->entrylk,
- this->name,
- &sh->parent_loc, local->loc.name,
- ENTRYLK_UNLOCK, ENTRYLK_WRLCK);
-
- if (!--call_count)
- break;
- }
- }
- return 0;
-}
+ AFR_ONALL (frame, afr_selfheal_lock_cbk, inodelk, dom,
+ &loc, F_SETLK, &flock, NULL);
+ loc_wipe (&loc);
-static int
-sh_destroy_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int op_errno, struct stat *stbuf)
-{
- STACK_DESTROY (frame->root);
- return 0;
+ return afr_selfheal_locked_fill (frame, this, locked_on);
}
-static int
-sh_missing_entries_newentry_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- inode_t *inode, struct stat *stbuf)
+int
+afr_selfheal_inodelk (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, off_t off, size_t size,
+ unsigned char *locked_on)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- call_frame_t *chown_frame = NULL;
- int call_count = 0;
- int child_index = 0;
- struct stat *buf = NULL;
-
+ loc_t loc = {0,};
+ struct gf_flock flock = {0, };
+ afr_local_t *local = NULL;
+ int i = 0;
+ afr_private_t *priv = NULL;
- local = frame->local;
- sh = &local->self_heal;
priv = this->private;
+ local = frame->local;
- buf = &sh->buf[sh->source];
- child_index = (long) cookie;
-
- if (op_ret == 0) {
- chown_frame = copy_frame (frame);
+ loc.inode = inode_ref (inode);
+ uuid_copy (loc.gfid, inode->gfid);
- gf_log (this->name, GF_LOG_DEBUG,
- "chown %s to %d %d on subvolume %s",
- local->loc.path, buf->st_uid, buf->st_gid,
- priv->children[child_index]->name);
+ flock.l_type = F_WRLCK;
+ flock.l_start = off;
+ flock.l_len = size;
- STACK_WIND (chown_frame, sh_destroy_cbk,
- priv->children[child_index],
- priv->children[child_index]->fops->chown,
- &local->loc,
- buf->st_uid, buf->st_gid);
- }
+ AFR_ONALL (frame, afr_selfheal_lock_cbk, inodelk, dom,
+ &loc, F_SETLK, &flock, NULL);
- LOCK (&frame->lock);
- {
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->replies[i].op_ret == -1 &&
+ local->replies[i].op_errno == EAGAIN) {
+ afr_selfheal_locked_fill (frame, this, locked_on);
+ afr_selfheal_uninodelk (frame, this, inode, dom, off,
+ size, locked_on);
+
+ AFR_SEQ (frame, afr_selfheal_lock_cbk, inodelk, dom,
+ &loc, F_SETLKW, &flock, NULL);
+ break;
+ }
}
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
- if (call_count == 0) {
- sh_missing_entries_finish (frame, this);
- }
+ loc_wipe (&loc);
- return 0;
+ return afr_selfheal_locked_fill (frame, this, locked_on);
}
-static int
-sh_missing_entries_mknod (call_frame_t *frame, xlator_t *this)
+int
+afr_selfheal_uninodelk (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, off_t off, size_t size,
+ const unsigned char *locked_on)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int i = 0;
- int enoent_count = 0;
- int call_count = 0;
- mode_t st_mode = 0;
- dev_t st_dev = 0;
-
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
+ loc_t loc = {0,};
+ struct gf_flock flock = {0, };
- for (i = 0; i < priv->child_count; i++)
- if (sh->child_errno[i] == ENOENT)
- enoent_count++;
- call_count = enoent_count;
- local->call_count = call_count;
+ loc.inode = inode_ref (inode);
+ uuid_copy (loc.gfid, inode->gfid);
- st_mode = sh->buf[sh->source].st_mode;
- st_dev = sh->buf[sh->source].st_dev;
+ flock.l_type = F_UNLCK;
+ flock.l_start = off;
+ flock.l_len = size;
- gf_log (this->name, GF_LOG_DEBUG,
- "mknod %s mode 0%o on %d subvolumes",
- local->loc.path, st_mode, enoent_count);
+ AFR_ONLIST (locked_on, frame, afr_selfheal_lock_cbk, inodelk,
+ dom, &loc, F_SETLK, &flock, NULL);
- for (i = 0; i < priv->child_count; i++) {
- if (sh->child_errno[i] == ENOENT) {
- STACK_WIND_COOKIE (frame,
- sh_missing_entries_newentry_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->mknod,
- &local->loc, st_mode, st_dev);
- if (!--call_count)
- break;
- }
- }
+ loc_wipe (&loc);
return 0;
}
-static int
-sh_missing_entries_mkdir (call_frame_t *frame, xlator_t *this)
+int
+afr_selfheal_tryentrylk (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, const char *name, unsigned char *locked_on)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int i = 0;
- int enoent_count = 0;
- int call_count = 0;
- mode_t st_mode = 0;
+ loc_t loc = {0,};
+ loc.inode = inode_ref (inode);
+ uuid_copy (loc.gfid, inode->gfid);
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- for (i = 0; i < priv->child_count; i++)
- if (sh->child_errno[i] == ENOENT)
- enoent_count++;
-
- call_count = enoent_count;
- local->call_count = call_count;
-
- st_mode = sh->buf[sh->source].st_mode;
+ AFR_ONALL (frame, afr_selfheal_lock_cbk, entrylk, dom,
+ &loc, name, ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, NULL);
- gf_log (this->name, GF_LOG_DEBUG,
- "mkdir %s mode 0%o on %d subvolumes",
- local->loc.path, st_mode, enoent_count);
+ loc_wipe (&loc);
- for (i = 0; i < priv->child_count; i++) {
- if (sh->child_errno[i] == ENOENT) {
- STACK_WIND_COOKIE (frame,
- sh_missing_entries_newentry_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->mkdir,
- &local->loc, st_mode);
- if (!--call_count)
- break;
- }
- }
-
- return 0;
+ return afr_selfheal_locked_fill (frame, this, locked_on);
}
-static int
-sh_missing_entries_symlink (call_frame_t *frame, xlator_t *this,
- const char *link)
+int
+afr_selfheal_entrylk (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, const char *name, unsigned char *locked_on)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int i = 0;
- int enoent_count = 0;
- int call_count = 0;
-
+ loc_t loc = {0,};
+ afr_local_t *local = NULL;
+ int i = 0;
+ afr_private_t *priv = NULL;
- local = frame->local;
- sh = &local->self_heal;
priv = this->private;
+ local = frame->local;
- for (i = 0; i < priv->child_count; i++)
- if (sh->child_errno[i] == ENOENT)
- enoent_count++;
-
- call_count = enoent_count;
- local->call_count = call_count;
+ loc.inode = inode_ref (inode);
+ uuid_copy (loc.gfid, inode->gfid);
- gf_log (this->name, GF_LOG_DEBUG,
- "symlink %s -> %s on %d subvolumes",
- local->loc.path, link, enoent_count);
+ AFR_ONALL (frame, afr_selfheal_lock_cbk, entrylk, dom, &loc,
+ name, ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, NULL);
for (i = 0; i < priv->child_count; i++) {
- if (sh->child_errno[i] == ENOENT) {
- STACK_WIND_COOKIE (frame,
- sh_missing_entries_newentry_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->symlink,
- link, &local->loc);
- if (!--call_count)
- break;
+ if (local->replies[i].op_ret == -1 &&
+ local->replies[i].op_errno == EAGAIN) {
+ afr_selfheal_locked_fill (frame, this, locked_on);
+ afr_selfheal_unentrylk (frame, this, inode, dom, name,
+ locked_on);
+
+ AFR_SEQ (frame, afr_selfheal_lock_cbk, entrylk, dom,
+ &loc, name, ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL);
+ break;
}
}
- return 0;
-}
-
+ loc_wipe (&loc);
-static int
-sh_missing_entries_readlink_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- const char *link)
-{
- if (op_ret > 0)
- sh_missing_entries_symlink (frame, this, link);
- else
- sh_missing_entries_finish (frame, this);
-
- return 0;
+ return afr_selfheal_locked_fill (frame, this, locked_on);
}
-static int
-sh_missing_entries_readlink (call_frame_t *frame, xlator_t *this)
+int
+afr_selfheal_unentrylk (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, const char *name, unsigned char *locked_on)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
+ loc_t loc = {0,};
+ loc.inode = inode_ref (inode);
+ uuid_copy (loc.gfid, inode->gfid);
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
+ AFR_ONLIST (locked_on, frame, afr_selfheal_lock_cbk, entrylk,
+ dom, &loc, name, ENTRYLK_UNLOCK, ENTRYLK_WRLCK, NULL);
- STACK_WIND (frame, sh_missing_entries_readlink_cbk,
- priv->children[sh->source],
- priv->children[sh->source]->fops->readlink,
- &local->loc, 4096);
+ loc_wipe (&loc);
return 0;
}
-static int
-sh_missing_entries_create (call_frame_t *frame, xlator_t *this)
+gf_boolean_t
+afr_is_pending_set (xlator_t *this, dict_t *xdata, int type)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- int type = 0;
- int i = 0;
- afr_private_t *priv = NULL;
- int enoent_count = 0;
- int govinda_gOvinda = 0;
-
+ int idx = -1;
+ afr_private_t *priv = NULL;
+ void *pending_raw = NULL;
+ int *pending_int = NULL;
+ int i = 0;
- local = frame->local;
- sh = &local->self_heal;
priv = this->private;
+ idx = afr_index_for_transaction_type (type);
- for (i = 0; i < priv->child_count; i++) {
- if (sh->child_errno[i]) {
- if (sh->child_errno[i] == ENOENT)
- enoent_count++;
- } else {
- if (type) {
- if (type != (sh->buf[i].st_mode & S_IFMT))
- govinda_gOvinda = 1;
- } else {
- sh->source = i;
- type = sh->buf[i].st_mode & S_IFMT;
- }
+ if (dict_get_ptr (xdata, AFR_DIRTY, &pending_raw) == 0) {
+ if (pending_raw) {
+ pending_int = pending_raw;
+
+ if (ntoh32 (pending_int[idx]))
+ return _gf_true;
}
}
- if (govinda_gOvinda) {
- gf_log (this->name, GF_LOG_ERROR,
- "conflicing filetypes exist for path %s. returning.",
- local->loc.path);
+ for (i = 0; i < priv->child_count; i++) {
+ if (dict_get_ptr (xdata, priv->pending_key[i],
+ &pending_raw))
+ continue;
+ if (!pending_raw)
+ continue;
+ pending_int = pending_raw;
- local->govinda_gOvinda = 1;
- sh_missing_entries_finish (frame, this);
- return 0;
+ if (ntoh32 (pending_int[idx]))
+ return _gf_true;
}
- if (!type) {
- gf_log (this->name, GF_LOG_ERROR,
- "no source found for %s. all nodes down?. returning.",
- local->loc.path);
- /* subvolumes down and/or file does not exist */
- sh_missing_entries_finish (frame, this);
- return 0;
- }
+ return _gf_false;
+}
- if (enoent_count == 0) {
- gf_log (this->name, GF_LOG_ERROR,
- "no missing files - %s. proceeding to metadata check",
- local->loc.path);
- /* proceed to next step - metadata self-heal */
- sh_missing_entries_finish (frame, this);
- return 0;
- }
- switch (type) {
- case S_IFSOCK:
- case S_IFREG:
- case S_IFBLK:
- case S_IFCHR:
- case S_IFIFO:
- sh_missing_entries_mknod (frame, this);
- break;
- case S_IFLNK:
- sh_missing_entries_readlink (frame, this);
- break;
- case S_IFDIR:
- sh_missing_entries_mkdir (frame, this);
- break;
- default:
- gf_log (this->name, GF_LOG_ERROR,
- "unknown file type: 0%o", type);
- local->govinda_gOvinda = 1;
- sh_missing_entries_finish (frame, this);
- }
+gf_boolean_t
+afr_is_data_set (xlator_t *this, dict_t *xdata)
+{
+ return afr_is_pending_set (this, xdata, AFR_DATA_TRANSACTION);
+}
- return 0;
+gf_boolean_t
+afr_is_metadata_set (xlator_t *this, dict_t *xdata)
+{
+ return afr_is_pending_set (this, xdata, AFR_METADATA_TRANSACTION);
+}
+
+gf_boolean_t
+afr_is_entry_set (xlator_t *this, dict_t *xdata)
+{
+ return afr_is_pending_set (this, xdata, AFR_ENTRY_TRANSACTION);
}
-static int
-sh_missing_entries_lookup_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- inode_t *inode, struct stat *buf, dict_t *xattr)
+void
+afr_inode_link (inode_t *inode, struct iatt *iatt)
{
- int child_index = 0;
- afr_local_t *local = NULL;
- int call_count = 0;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
+ inode_t *linked_inode = NULL;
+ linked_inode = inode_link (inode, NULL, NULL, iatt);
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
+ uuid_copy (inode->gfid, iatt->ia_gfid);
+ inode->ia_type = iatt->ia_type;
- child_index = (long) cookie;
+ if (linked_inode) {
+ inode_lookup (linked_inode);
+ inode_unref (linked_inode);
+ }
+}
- LOCK (&frame->lock);
- {
- if (op_ret == 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "path %s on subvolume %s is of mode 0%o",
- local->loc.path,
- priv->children[child_index]->name,
- buf->st_mode);
- local->self_heal.buf[child_index] = *buf;
- } else {
- gf_log (this->name, GF_LOG_WARNING,
- "path %s on subvolume %s => -1 (%s)",
- local->loc.path,
- priv->children[child_index]->name,
- strerror (op_errno));
+/*
+ * This function inspects the looked up replies (in an unlocked manner)
+ * and decides whether a locked verification and possible healing is
+ * required or not. It updates the three booleans for each type
+ * of healing. If the boolean flag gets set to FALSE, then we are sure
+ * no healing is required. If the boolean flag gets set to TRUE then
+ * we have to proceed with locked reinspection.
+ */
- local->self_heal.child_errno[child_index] = op_errno;
- }
+int
+afr_selfheal_unlocked_inspect (call_frame_t *frame, xlator_t *this,
+ inode_t *inode, uuid_t gfid,
+ gf_boolean_t *data_selfheal,
+ gf_boolean_t *metadata_selfheal,
+ gf_boolean_t *entry_selfheal)
+{
+ afr_private_t *priv = NULL;
+ int i = 0;
+ int valid_cnt = 0;
+ struct iatt first = {0, };
+ struct afr_reply *replies = NULL;
+ int ret = -1;
- }
- UNLOCK (&frame->lock);
+ priv = this->private;
- call_count = afr_frame_return (frame);
+ replies = alloca0 (sizeof (*replies) * priv->child_count);
- if (call_count == 0) {
- sh_missing_entries_create (frame, this);
- }
+ ret = afr_selfheal_unlocked_discover (frame, inode, gfid, replies);
+ if (ret)
+ return ret;
- return 0;
-}
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid)
+ continue;
+ if (replies[i].op_ret == -1)
+ continue;
+ if (afr_is_data_set (this, replies[i].xdata))
+ *data_selfheal = _gf_true;
-static int
-sh_missing_entries_lookup (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- int i = 0;
- int call_count = 0;
- afr_private_t *priv = NULL;
- dict_t *xattr_req = NULL;
- int ret = -1;
+ if (afr_is_metadata_set (this, replies[i].xdata))
+ *metadata_selfheal = _gf_true;
- local = frame->local;
- call_count = local->child_count;
- priv = this->private;
+ if (afr_is_entry_set (this, replies[i].xdata))
+ *entry_selfheal = _gf_true;
+
+ valid_cnt ++;
+ if (valid_cnt == 1) {
+ first = replies[i].poststat;
+ continue;
+ }
- local->call_count = call_count;
-
- xattr_req = dict_new();
-
- if (xattr_req) {
- for (i = 0; i < priv->child_count; i++) {
- ret = dict_set_uint64 (xattr_req,
- priv->pending_key[i],
- 3 * sizeof(int32_t));
- }
- }
+ if (!IA_EQUAL (first, replies[i].poststat, type)) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "TYPE mismatch %d vs %d on %s for gfid:%s",
+ (int) first.ia_type,
+ (int) replies[i].poststat.ia_type,
+ priv->children[i]->name,
+ uuid_utoa (replies[i].poststat.ia_gfid));
+ return -EIO;
+ }
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
+ if (!IA_EQUAL (first, replies[i].poststat, uid)) {
gf_log (this->name, GF_LOG_DEBUG,
- "looking up %s on subvolume %s",
- local->loc.path, priv->children[i]->name);
-
- STACK_WIND_COOKIE (frame,
- sh_missing_entries_lookup_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->lookup,
- &local->loc, xattr_req);
-
- if (!--call_count)
- break;
+ "UID mismatch %d vs %d on %s for gfid:%s",
+ (int) first.ia_uid,
+ (int) replies[i].poststat.ia_uid,
+ priv->children[i]->name,
+ uuid_utoa (replies[i].poststat.ia_gfid));
+
+ *metadata_selfheal = _gf_true;
}
- }
-
- if (xattr_req)
- dict_unref (xattr_req);
- return 0;
-}
+ if (!IA_EQUAL (first, replies[i].poststat, gid)) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "GID mismatch %d vs %d on %s for gfid:%s",
+ (int) first.ia_uid,
+ (int) replies[i].poststat.ia_uid,
+ priv->children[i]->name,
+ uuid_utoa (replies[i].poststat.ia_gfid));
+ *metadata_selfheal = _gf_true;
+ }
-static int
-sh_missing_entries_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- int call_count = 0;
- int child_index = (long) cookie;
+ if (!IA_EQUAL (first, replies[i].poststat, prot)) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "MODE mismatch %d vs %d on %s for gfid:%s",
+ (int) st_mode_from_ia (first.ia_prot, 0),
+ (int) st_mode_from_ia (replies[i].poststat.ia_prot, 0),
+ priv->children[i]->name,
+ uuid_utoa (replies[i].poststat.ia_gfid));
+ *metadata_selfheal = _gf_true;
+ }
- local = frame->local;
- sh = &local->self_heal;
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- sh->op_failed = 1;
-
- gf_log (this->name,
- (op_errno == EAGAIN ? GF_LOG_DEBUG : GF_LOG_ERROR),
- "locking inode of %s on child %d failed: %s",
- local->loc.path, child_index,
- strerror (op_errno));
- } else {
+ if (IA_ISREG(first.ia_type) &&
+ !IA_EQUAL (first, replies[i].poststat, size)) {
gf_log (this->name, GF_LOG_DEBUG,
- "inode of %s on child %d locked",
- local->loc.path, child_index);
+ "SIZE mismatch %lld vs %lld on %s for gfid:%s",
+ (long long) first.ia_size,
+ (long long) replies[i].poststat.ia_size,
+ priv->children[i]->name,
+ uuid_utoa (replies[i].poststat.ia_gfid));
+
+ *data_selfheal = _gf_true;
}
}
- UNLOCK (&frame->lock);
- call_count = afr_frame_return (frame);
+ if (valid_cnt > 0)
+ afr_inode_link (inode, &first);
- if (call_count == 0) {
- if (sh->op_failed == 1) {
- sh_missing_entries_finish (frame, this);
- return 0;
- }
-
- sh_missing_entries_lookup (frame, this);
- }
+ if (valid_cnt < 2)
+ return -ENOTCONN;
return 0;
}
-static int
-afr_self_heal_missing_entries (call_frame_t *frame, xlator_t *this)
+inode_t *
+afr_inode_find (xlator_t *this, uuid_t gfid)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int i = 0;
- int call_count = 0;
+ inode_table_t *table = NULL;
+ inode_t *inode = NULL;
+ table = this->itable;
+ if (!table)
+ return NULL;
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
+ inode = inode_find (table, gfid);
+ if (inode)
+ return inode;
- gf_log (this->name, GF_LOG_DEBUG,
- "attempting to recreate missing entries for path=%s",
- local->loc.path);
+ inode = inode_new (table);
+ if (!inode)
+ return NULL;
- afr_build_parent_loc (&sh->parent_loc, &local->loc);
+ uuid_copy (inode->gfid, gfid);
- call_count = local->child_count;
+ return inode;
+}
- local->call_count = call_count;
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND (frame, sh_missing_entries_lk_cbk,
- priv->children[i],
- priv->children[i]->fops->entrylk,
- this->name,
- &sh->parent_loc, local->loc.name,
- ENTRYLK_LOCK_NB, ENTRYLK_WRLCK);
- if (!--call_count)
- break;
- }
- }
+call_frame_t *
+afr_frame_create (xlator_t *this)
+{
+ call_frame_t *frame = NULL;
+ afr_local_t *local = NULL;
+ int op_errno = 0;
+ pid_t pid = -1;
- return 0;
+ frame = create_frame (this, this->ctx->pool);
+ if (!frame)
+ return NULL;
+
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local) {
+ STACK_DESTROY (frame->root);
+ return NULL;
+ }
+
+ syncopctx_setfspid (&pid);
+
+ frame->root->pid = pid;
+
+ afr_set_lk_owner (frame, this, frame->root);
+
+ return frame;
}
+/*
+ * This is the entry point for healing a given GFID
+ */
+
int
-afr_self_heal (call_frame_t *frame, xlator_t *this,
- int (*completion_cbk) (call_frame_t *, xlator_t *))
+afr_selfheal (xlator_t *this, uuid_t gfid)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int i = 0;
+ inode_t *inode = NULL;
+ call_frame_t *frame = NULL;
+ int ret = -1;
+ gf_boolean_t data_selfheal = _gf_false;
+ gf_boolean_t metadata_selfheal = _gf_false;
+ gf_boolean_t entry_selfheal = _gf_false;
+ inode = afr_inode_find (this, gfid);
+ if (!inode)
+ goto out;
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
+ frame = afr_frame_create (this);
+ if (!frame)
+ goto out;
- gf_log (this->name, GF_LOG_DEBUG,
- "performing self heal on %s (metadata=%d data=%d entry=%d)",
- local->loc.path,
- local->need_metadata_self_heal,
- local->need_data_self_heal,
- local->need_entry_self_heal);
+ ret = afr_selfheal_unlocked_inspect (frame, this, inode, gfid,
+ &data_selfheal,
+ &metadata_selfheal,
+ &entry_selfheal);
+ if (ret)
+ goto out;
- sh->completion_cbk = completion_cbk;
+ if (data_selfheal)
+ afr_selfheal_data (frame, this, inode);
- sh->buf = CALLOC (priv->child_count, sizeof (struct stat));
- sh->child_errno = CALLOC (priv->child_count, sizeof (int));
- sh->success = CALLOC (priv->child_count, sizeof (int));
- sh->xattr = CALLOC (priv->child_count, sizeof (dict_t *));
- sh->sources = CALLOC (sizeof (*sh->sources), priv->child_count);
+ if (metadata_selfheal)
+ afr_selfheal_metadata (frame, this, inode);
- sh->pending_matrix = CALLOC (sizeof (int32_t *), priv->child_count);
- for (i = 0; i < priv->child_count; i++) {
- sh->pending_matrix[i] = CALLOC (sizeof (int32_t),
- priv->child_count);
- }
+ if (entry_selfheal)
+ afr_selfheal_entry (frame, this, inode);
- sh->delta_matrix = CALLOC (sizeof (int32_t *), priv->child_count);
- for (i = 0; i < priv->child_count; i++) {
- sh->delta_matrix[i] = CALLOC (sizeof (int32_t),
- priv->child_count);
- }
-
- if (local->success_count && local->enoent_count) {
- afr_self_heal_missing_entries (frame, this);
- } else {
- gf_log (this->name, GF_LOG_DEBUG,
- "proceeding to metadata check on %s",
- local->loc.path);
- afr_sh_missing_entries_done (frame, this);
- }
+ inode_forget (inode, 1);
+out:
+ if (inode)
+ inode_unref (inode);
+ if (frame)
+ AFR_STACK_DESTROY (frame);
- return 0;
+ return ret;
}
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.h b/xlators/cluster/afr/src/afr-self-heal-common.h
deleted file mode 100644
index a311cdf5e..000000000
--- a/xlators/cluster/afr/src/afr-self-heal-common.h
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- Copyright (c) 2008-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
-*/
-
-#ifndef __AFR_SELF_HEAL_COMMON_H__
-#define __AFR_SELF_HEAL_COMMON_H__
-
-#define FILE_HAS_HOLES(buf) (((buf)->st_size) > ((buf)->st_blocks * 512))
-
-typedef enum {
- AFR_SELF_HEAL_ENTRY,
- AFR_SELF_HEAL_METADATA,
- AFR_SELF_HEAL_DATA,
-} afr_self_heal_type;
-
-int
-afr_sh_select_source (int sources[], int child_count);
-
-int
-afr_sh_sink_count (int sources[], int child_count);
-
-int
-afr_sh_source_count (int sources[], int child_count);
-
-int
-afr_sh_supress_errenous_children (int sources[], int child_errno[],
- int child_count);
-
-void
-afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this);
-
-void
-afr_sh_build_pending_matrix (afr_private_t *priv,
- int32_t *pending_matrix[], dict_t *xattr[],
- int child_count, afr_transaction_type type);
-
-void
-afr_sh_pending_to_delta (afr_private_t *priv, dict_t **xattr,
- int32_t *delta_matrix[], int success[],
- int child_count, afr_transaction_type type);
-
-int
-afr_sh_mark_sources (afr_self_heal_t *sh, int child_count,
- afr_self_heal_type type);
-
-int
-afr_sh_delta_to_xattr (afr_private_t *priv,
- int32_t *delta_matrix[], dict_t *xattr[],
- int child_count, afr_transaction_type type);
-
-int
-afr_sh_is_matrix_zero (int32_t *pending_matrix[], int child_count);
-
-
-#endif /* __AFR_SELF_HEAL_COMMON_H__ */
diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c
index a7a3d44f7..c0548d995 100644
--- a/xlators/cluster/afr/src/afr-self-heal-data.c
+++ b/xlators/cluster/afr/src/afr-self-heal-data.c
@@ -1,1037 +1,635 @@
/*
- Copyright (c) 2008-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
-#include <libgen.h>
-#include <unistd.h>
-#include <fnmatch.h>
-#include <sys/time.h>
-#include <stdlib.h>
-#include <signal.h>
#ifndef _CONFIG_H
#define _CONFIG_H
#include "config.h"
#endif
-#include "glusterfs.h"
#include "afr.h"
-#include "dict.h"
-#include "xlator.h"
-#include "hashfn.h"
-#include "logging.h"
-#include "stack.h"
-#include "list.h"
-#include "call-stub.h"
-#include "defaults.h"
-#include "common-utils.h"
-#include "compat-errno.h"
-#include "compat.h"
-#include "byte-order.h"
-
-#include "afr-transaction.h"
#include "afr-self-heal.h"
-#include "afr-self-heal-common.h"
+#include "byte-order.h"
+enum {
+ AFR_SELFHEAL_DATA_FULL = 0,
+ AFR_SELFHEAL_DATA_DIFF,
+};
-int
-afr_sh_data_done (call_frame_t *frame, xlator_t *this)
+#define HAS_HOLES(i) ((i->ia_blocks * 512) < (i->ia_size))
+static int
+__checksum_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, uint32_t weak, uint8_t *strong,
+ dict_t *xdata)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int i = (long) cookie;
local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
- /*
- TODO: cleanup sh->*
- */
-
- gf_log (this->name, GF_LOG_DEBUG,
- "self heal of %s completed",
- local->loc.path);
-
- sh->completion_cbk (frame, this);
+ local->replies[i].valid = 1;
+ local->replies[i].op_ret = op_ret;
+ local->replies[i].op_errno = op_errno;
+ if (strong)
+ memcpy (local->replies[i].checksum, strong, MD5_DIGEST_LENGTH);
+ syncbarrier_wake (&local->barrier);
return 0;
}
-int
-afr_sh_data_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
+static int
+attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *pre, struct iatt *post,
+ dict_t *xdata)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- afr_self_heal_t *sh = NULL;
- int call_count = 0;
+ int i = (long) cookie;
+ afr_local_t *local = NULL;
local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
- LOCK (&frame->lock);
- {
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
+ local->replies[i].valid = 1;
+ local->replies[i].op_ret = op_ret;
+ local->replies[i].op_errno = op_errno;
+ if (pre)
+ local->replies[i].prestat = *pre;
+ if (post)
+ local->replies[i].poststat = *post;
+ if (xdata)
+ local->replies[i].xdata = dict_ref (xdata);
- if (call_count == 0) {
- fd_unref (sh->healing_fd);
- sh->healing_fd = NULL;
- afr_sh_data_done (frame, this);
- }
+ syncbarrier_wake (&local->barrier);
return 0;
}
-int
-afr_sh_data_close (call_frame_t *frame, xlator_t *this)
+static gf_boolean_t
+__afr_selfheal_data_checksums_match (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, int source,
+ unsigned char *healed_sinks,
+ off_t offset, size_t size)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- afr_self_heal_t *sh = NULL;
- int i = 0;
- int call_count = 0;
-
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ unsigned char *wind_subvols = NULL;
+ int i = 0;
- local = frame->local;
- sh = &local->self_heal;
priv = this->private;
+ local = frame->local;
- if (!sh->healing_fd) {
- afr_sh_data_done (frame, this);
- return 0;
+ wind_subvols = alloca0 (priv->child_count);
+ for (i = 0; i < priv->child_count; i++) {
+ if (i == source || healed_sinks[i])
+ wind_subvols[i] = 1;
}
- call_count = sh->active_sinks + 1;
- local->call_count = call_count;
-
+ AFR_ONLIST (wind_subvols, frame, __checksum_cbk, rchecksum, fd,
+ offset, size, NULL);
- /* closed source */
- gf_log (this->name, GF_LOG_DEBUG,
- "closing fd of %s on %s",
- local->loc.path, priv->children[sh->source]->name);
-
- STACK_WIND_COOKIE (frame, afr_sh_data_flush_cbk,
- (void *) (long) sh->source,
- priv->children[sh->source],
- priv->children[sh->source]->fops->flush,
- sh->healing_fd);
- call_count--;
+ if (!local->replies[source].valid || local->replies[source].op_ret != 0)
+ return _gf_false;
for (i = 0; i < priv->child_count; i++) {
- if (sh->sources[i] || !local->child_up[i])
+ if (i == source)
continue;
-
- gf_log (this->name, GF_LOG_DEBUG,
- "closing fd of %s on %s",
- local->loc.path, priv->children[i]->name);
-
- STACK_WIND_COOKIE (frame, afr_sh_data_flush_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->flush,
- sh->healing_fd);
- if (!--call_count)
- break;
- }
-
- return 0;
-}
-
-
-int
-afr_sh_data_unlck_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
-{
- afr_local_t * local = NULL;
- int call_count = 0;
- int child_index = (long) cookie;
-
-
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "locking inode of %s on child %d failed: %s",
- local->loc.path, child_index,
- strerror (op_errno));
- } else {
- gf_log (this->name, GF_LOG_DEBUG,
- "inode of %s on child %d locked",
- local->loc.path, child_index);
- }
+ if (memcmp (local->replies[source].checksum,
+ local->replies[i].checksum,
+ MD5_DIGEST_LENGTH))
+ return _gf_false;
}
- UNLOCK (&frame->lock);
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- afr_sh_data_close (frame, this);
- }
-
- return 0;
+ return _gf_true;
}
-int
-afr_sh_data_unlock (call_frame_t *frame, xlator_t *this)
+static int
+__afr_selfheal_data_read_write (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ int source, unsigned char *healed_sinks,
+ off_t offset, size_t size,
+ struct afr_reply *replies)
{
- struct flock flock;
- int i = 0;
- int call_count = 0;
-
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
- afr_self_heal_t * sh = NULL;
-
+ struct iovec *iovec = NULL;
+ int count = 0;
+ struct iobref *iobref = NULL;
+ int ret = 0;
+ int i = 0;
+ afr_private_t *priv = NULL;
- local = frame->local;
- sh = &local->self_heal;
priv = this->private;
- call_count = local->child_count;
+ ret = syncop_readv (priv->children[source], fd, size, offset, 0,
+ &iovec, &count, &iobref);
+ if (ret <= 0)
+ return ret;
- local->call_count = call_count;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!healed_sinks[i])
+ continue;
- flock.l_start = 0;
- flock.l_len = 0;
- flock.l_type = F_UNLCK;
+ /*
+ * TODO: Use fiemap() and discard() to heal holes
+ * in the future.
+ *
+ * For now,
+ *
+ * - if the source had any holes at all,
+ * AND
+ * - if we are writing past the original file size
+ * of the sink
+ * AND
+ * - is NOT the last block of the source file. if
+ * the block contains EOF, it has to be written
+ * in order to set the file size even if the
+ * last block is 0-filled.
+ * AND
+ * - if the read buffer is filled with only 0's
+ *
+ * then, skip writing to this source. We don't depend
+ * on the write to happen to update the size as we
+ * have performed an ftruncate() upfront anyways.
+ */
+#define is_last_block(o,b,s) ((s >= o) && (s <= (o + b)))
+ if (HAS_HOLES ((&replies[source].poststat)) &&
+ offset >= replies[i].poststat.ia_size &&
+ !is_last_block (offset, size,
+ replies[source].poststat.ia_size) &&
+ (iov_0filled (iovec, count) == 0))
+ continue;
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- gf_log (this->name, GF_LOG_DEBUG,
- "unlocking %s on subvolume %s",
- local->loc.path, priv->children[i]->name);
-
- STACK_WIND_COOKIE (frame, afr_sh_data_unlck_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->inodelk,
- this->name,
- &local->loc, F_SETLK, &flock);
- if (!--call_count)
- break;
+ ret = syncop_writev (priv->children[i], fd, iovec, count,
+ offset, iobref, 0);
+ if (ret != iov_length (iovec, count)) {
+ /* write() failed on this sink. unset the corresponding
+ member in sinks[] (which is healed_sinks[] in the
+ caller) so that this server does NOT get considered
+ as successfully healed.
+ */
+ healed_sinks[i] = 0;
}
}
+ if (iobref)
+ iobref_unref (iobref);
- return 0;
-}
-
-
-int
-afr_sh_data_finish (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
-
- local = frame->local;
-
- gf_log (this->name, GF_LOG_DEBUG,
- "finishing data selfheal of %s", local->loc.path);
-
- afr_sh_data_unlock (frame, this);
-
- return 0;
+ return ret;
}
-int
-afr_sh_data_erase_pending_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret,
- int32_t op_errno, dict_t *xattr)
+static int
+afr_selfheal_data_block (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ int source, unsigned char *healed_sinks, off_t offset,
+ size_t size, int type, struct afr_reply *replies)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int call_count = 0;
+ int ret = -1;
+ int sink_count = 0;
+ afr_private_t *priv = NULL;
+ unsigned char *data_lock = NULL;
- local = frame->local;
- sh = &local->self_heal;
priv = this->private;
+ sink_count = AFR_COUNT (healed_sinks, priv->child_count);
+ data_lock = alloca0 (priv->child_count);
- LOCK (&frame->lock);
+ ret = afr_selfheal_inodelk (frame, this, fd->inode, this->name,
+ offset, size, data_lock);
{
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0)
- afr_sh_data_finish (frame, this);
-
- return 0;
-}
-
-
-int
-afr_sh_data_erase_pending (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int call_count = 0;
- int i = 0;
- dict_t **erase_xattr = NULL;
-
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- afr_sh_pending_to_delta (priv, sh->xattr, sh->delta_matrix, sh->success,
- priv->child_count, AFR_DATA_TRANSACTION);
-
- erase_xattr = CALLOC (sizeof (*erase_xattr), priv->child_count);
-
- for (i = 0; i < priv->child_count; i++) {
- if (sh->xattr[i]) {
- call_count++;
-
- erase_xattr[i] = get_new_dict();
- dict_ref (erase_xattr[i]);
+ if (ret < sink_count) {
+ ret = -ENOTCONN;
+ goto unlock;
}
- }
-
- afr_sh_delta_to_xattr (priv, sh->delta_matrix, erase_xattr,
- priv->child_count, AFR_DATA_TRANSACTION);
-
- local->call_count = call_count;
- for (i = 0; i < priv->child_count; i++) {
- if (!erase_xattr[i])
- continue;
-
- gf_log (this->name, GF_LOG_DEBUG,
- "erasing pending flags from %s on %s",
- local->loc.path, priv->children[i]->name);
-
- STACK_WIND_COOKIE (frame, afr_sh_data_erase_pending_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->xattrop,
- &local->loc,
- GF_XATTROP_ADD_ARRAY, erase_xattr[i]);
- if (!--call_count)
- break;
- }
- for (i = 0; i < priv->child_count; i++) {
- if (erase_xattr[i]) {
- dict_unref (erase_xattr[i]);
+ if (type == AFR_SELFHEAL_DATA_DIFF &&
+ __afr_selfheal_data_checksums_match (frame, this, fd, source,
+ healed_sinks, offset, size)) {
+ ret = 0;
+ goto unlock;
}
- }
- FREE (erase_xattr);
- return 0;
+ ret = __afr_selfheal_data_read_write (frame, this, fd, source,
+ healed_sinks, offset, size,
+ replies);
+ }
+unlock:
+ afr_selfheal_uninodelk (frame, this, fd->inode, this->name,
+ offset, size, data_lock);
+ return ret;
}
-int
-afr_sh_data_trim_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct stat *buf)
+
+static int
+afr_selfheal_data_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ unsigned char *healed_sinks)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- afr_self_heal_t *sh = NULL;
- int call_count = 0;
- int child_index = 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
- priv = this->private;
local = frame->local;
- sh = &local->self_heal;
-
- child_index = (long) cookie;
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1)
- gf_log (this->name, GF_LOG_ERROR,
- "ftruncate of %s on subvolume %s failed (%s)",
- local->loc.path,
- priv->children[child_index]->name,
- strerror (op_errno));
- else
- gf_log (this->name, GF_LOG_DEBUG,
- "ftruncate of %s on subvolume %s completed",
- local->loc.path,
- priv->children[child_index]->name);
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
+ priv = this->private;
- if (call_count == 0) {
- afr_sh_data_erase_pending (frame, this);
- }
+ AFR_ONLIST (healed_sinks, frame, attr_cbk, fsync, fd, 0, NULL);
+ for (i = 0; i < priv->child_count; i++)
+ if (healed_sinks[i] && local->replies[i].op_ret != 0)
+ /* fsync() failed. Do NOT consider this server
+ as successfully healed. Mark it so.
+ */
+ healed_sinks[i] = 0;
return 0;
}
-int
-afr_sh_data_trim_sinks (call_frame_t *frame, xlator_t *this)
+static int
+afr_selfheal_data_restore_time (call_frame_t *frame, xlator_t *this,
+ inode_t *inode, int source,
+ unsigned char *healed_sinks,
+ struct afr_reply *replies)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- afr_self_heal_t *sh = NULL;
- int *sources = NULL;
- int call_count = 0;
- int i = 0;
-
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- sources = sh->sources;
- call_count = sh->active_sinks;
-
- local->call_count = call_count;
+ loc_t loc = {0, };
- for (i = 0; i < priv->child_count; i++) {
- if (sources[i] || !local->child_up[i])
- continue;
+ loc.inode = inode_ref (inode);
+ uuid_copy (loc.gfid, inode->gfid);
- STACK_WIND_COOKIE (frame, afr_sh_data_trim_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->ftruncate,
- sh->healing_fd, sh->file_size);
+ AFR_ONLIST (healed_sinks, frame, attr_cbk, setattr, &loc,
+ &replies[source].poststat,
+ (GF_SET_ATTR_ATIME|GF_SET_ATTR_MTIME), NULL);
- if (!--call_count)
- break;
- }
+ loc_wipe (&loc);
return 0;
}
-
-int
-afr_sh_data_read_write_iter (call_frame_t *frame, xlator_t *this);
-
-int
-afr_sh_data_write_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct stat *buf)
+static int
+afr_data_self_heal_type_get (afr_private_t *priv, unsigned char *healed_sinks,
+ int source, struct afr_reply *replies)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- afr_self_heal_t *sh = NULL;
+ int type = AFR_SELFHEAL_DATA_FULL;
+ int i = 0;
- int child_index = (long) cookie;
- int call_count = 0;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- gf_log (this->name, GF_LOG_DEBUG,
- "wrote %d bytes of data from %s to child %d, offset %"PRId64"",
- op_ret, local->loc.path, child_index, sh->offset - op_ret);
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "write to %s failed on subvolume %s (%s)",
- local->loc.path,
- priv->children[child_index]->name,
- strerror (op_errno));
- sh->op_failed = 1;
- }
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- afr_sh_data_read_write_iter (frame, this);
- }
-
- return 0;
+ if (priv->data_self_heal_algorithm == NULL) {
+ type = AFR_SELFHEAL_DATA_FULL;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!healed_sinks[i] && i != source)
+ continue;
+ if (replies[i].poststat.ia_size) {
+ type = AFR_SELFHEAL_DATA_DIFF;
+ break;
+ }
+ }
+ } else if (strcmp (priv->data_self_heal_algorithm, "full") == 0) {
+ type = AFR_SELFHEAL_DATA_FULL;
+ } else if (strcmp (priv->data_self_heal_algorithm, "diff") == 0) {
+ type = AFR_SELFHEAL_DATA_DIFF;
+ }
+ return type;
}
-
-int
-afr_sh_data_read_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- struct iovec *vector, int32_t count, struct stat *buf,
- struct iobref *iobref)
+static int
+afr_selfheal_data_do (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ int source, unsigned char *healed_sinks,
+ struct afr_reply *replies)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- afr_self_heal_t *sh = NULL;
-
- int child_index = (long) cookie;
+ afr_private_t *priv = NULL;
int i = 0;
- int call_count = 0;
-
- off_t offset;
+ off_t off = 0;
+ size_t block = 128 * 1024;
+ int type = AFR_SELFHEAL_DATA_FULL;
+ int ret = -1;
+ call_frame_t *iter_frame = NULL;
+ char *sinks_str = NULL;
+ char *p = NULL;
priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- call_count = sh->active_sinks;
-
- local->call_count = call_count;
-
- gf_log (this->name, GF_LOG_DEBUG,
- "read %d bytes of data from %s on child %d, offset %"PRId64"",
- op_ret, local->loc.path, child_index, sh->offset);
-
- if (op_ret <= 0) {
- afr_sh_data_trim_sinks (frame, this);
- return 0;
- }
-
- /* what if we read less than block size? */
- offset = sh->offset;
- sh->offset += op_ret;
-
- if (sh->file_has_holes) {
- if (iov_0filled (vector, count) == 0) {
- /* the iter function depends on the
- sh->offset already being updated
- above
- */
- afr_sh_data_read_write_iter (frame, this);
- goto out;
- }
- }
+ sinks_str = alloca0 (priv->child_count * 8);
+ p = sinks_str;
for (i = 0; i < priv->child_count; i++) {
- if (sh->sources[i] || !local->child_up[i])
+ if (!healed_sinks[i])
continue;
-
- /* this is a sink, so write to it */
- STACK_WIND_COOKIE (frame, afr_sh_data_write_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->writev,
- sh->healing_fd, vector, count, offset,
- iobref);
-
- if (!--call_count)
- break;
+ p += sprintf (p, "%d ", i);
}
-out:
- return 0;
-}
-
-
-int
-afr_sh_data_read_write (call_frame_t *frame, xlator_t *this)
-{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- afr_self_heal_t *sh = NULL;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
+ gf_log (this->name, GF_LOG_INFO, "performing data selfheal on %s. "
+ "source=%d sinks=%s",
+ uuid_utoa (fd->inode->gfid), source, sinks_str);
- STACK_WIND_COOKIE (frame, afr_sh_data_read_cbk,
- (void *) (long) sh->source,
- priv->children[sh->source],
- priv->children[sh->source]->fops->readv,
- sh->healing_fd, sh->block_size,
- sh->offset);
+ type = afr_data_self_heal_type_get (priv, healed_sinks, source,
+ replies);
- return 0;
-}
+ iter_frame = afr_copy_frame (frame);
+ if (!iter_frame)
+ return -ENOMEM;
+ for (off = 0; off < replies[source].poststat.ia_size; off += block) {
+ ret = afr_selfheal_data_block (iter_frame, this, fd, source,
+ healed_sinks, off, block, type,
+ replies);
+ if (ret < 0)
+ goto out;
-int
-afr_sh_data_read_write_iter (call_frame_t *frame, xlator_t *this)
-{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- afr_self_heal_t *sh = NULL;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- if (sh->op_failed) {
- afr_sh_data_finish (frame, this);
- goto out;
+ AFR_STACK_RESET (iter_frame);
}
- if (sh->offset >= sh->file_size) {
- gf_log (this->name, GF_LOG_DEBUG,
- "closing fd's of %s",
- local->loc.path);
- afr_sh_data_trim_sinks (frame, this);
-
- goto out;
- }
+ afr_selfheal_data_restore_time (frame, this, fd->inode, source,
+ healed_sinks, replies);
- afr_sh_data_read_write (frame, this);
+ ret = afr_selfheal_data_fsync (frame, this, fd, healed_sinks);
out:
- return 0;
+ if (iter_frame)
+ AFR_STACK_DESTROY (iter_frame);
+ return ret;
}
-int
-afr_sh_data_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, fd_t *fd)
+static int
+__afr_selfheal_truncate_sinks (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, unsigned char *healed_sinks,
+ struct afr_reply *replies, uint64_t size)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int call_count = 0;
- int child_index = 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ unsigned char *larger_sinks = 0;
+ int i = 0;
local = frame->local;
- sh = &local->self_heal;
priv = this->private;
- child_index = (long) cookie;
-
- /* TODO: some of the open's might fail.
- In that case, modify cleanup fn to send flush on those
- fd's which are already open */
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "open of %s failed on child %s (%s)",
- local->loc.path,
- priv->children[child_index]->name,
- strerror (op_errno));
- sh->op_failed = 1;
- }
-
+ larger_sinks = alloca0 (priv->child_count);
+ for (i = 0; i < priv->child_count; i++) {
+ if (healed_sinks[i] && replies[i].poststat.ia_size > size)
+ larger_sinks[i] = 1;
}
- UNLOCK (&frame->lock);
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- if (sh->op_failed) {
- afr_sh_data_finish (frame, this);
- return 0;
- }
- gf_log (this->name, GF_LOG_DEBUG,
- "fd for %s opened, commencing sync",
- local->loc.path);
-
- gf_log (this->name, GF_LOG_WARNING,
- "sourcing file %s from %s to other sinks",
- local->loc.path, priv->children[sh->source]->name);
-
- afr_sh_data_read_write (frame, this);
- }
+ AFR_ONLIST (larger_sinks, frame, attr_cbk, ftruncate, fd, size, NULL);
+ for (i = 0; i < priv->child_count; i++)
+ if (healed_sinks[i] && local->replies[i].op_ret == -1)
+ /* truncate() failed. Do NOT consider this server
+ as successfully healed. Mark it so.
+ */
+ healed_sinks[i] = 0;
return 0;
}
-
-int
-afr_sh_data_open (call_frame_t *frame, xlator_t *this)
+/*
+ * If by chance there are multiple sources with differing sizes, select
+ * the largest file as the source.
+ *
+ * This can only happen if data was directly modified in the backend.
+ */
+static int
+__afr_selfheal_data_finalize_source (xlator_t *this, unsigned char *sources,
+ unsigned char *sinks,
+ unsigned char *healed_sinks,
+ unsigned char *locked_on,
+ struct afr_reply *replies)
{
- int i = 0;
- int call_count = 0;
-
+ int i = 0;
+ afr_private_t *priv = NULL;
+ uint64_t size = 0;
int source = -1;
- int *sources = NULL;
-
- fd_t *fd = NULL;
+ int locked_count = 0;
+ int sources_count = 0;
+ int healed_sinks_count = 0;
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
priv = this->private;
- call_count = sh->active_sinks + 1;
- local->call_count = call_count;
+ locked_count = AFR_COUNT (locked_on, priv->child_count);
+ sources_count = AFR_COUNT (sources, priv->child_count);
+ healed_sinks_count = AFR_COUNT (healed_sinks, priv->child_count);
- fd = fd_create (local->loc.inode, frame->root->pid);
- sh->healing_fd = fd;
-
- source = local->self_heal.source;
- sources = local->self_heal.sources;
-
- sh->block_size = 65536;
- sh->file_size = sh->buf[source].st_size;
-
- if (FILE_HAS_HOLES (&sh->buf[source]))
- sh->file_has_holes = 1;
-
- /* open source */
- STACK_WIND_COOKIE (frame, afr_sh_data_open_cbk,
- (void *) (long) source,
- priv->children[source],
- priv->children[source]->fops->open,
- &local->loc, O_RDONLY|O_LARGEFILE, fd);
- call_count--;
+ if (locked_count == healed_sinks_count || !sources_count) {
+ /* split brain */
+ return -EIO;
+ }
- /* open sinks */
for (i = 0; i < priv->child_count; i++) {
- if(sources[i] || !local->child_up[i])
+ if (!sources[i])
continue;
-
- STACK_WIND_COOKIE (frame, afr_sh_data_open_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->open,
- &local->loc,
- O_WRONLY|O_LARGEFILE, fd);
-
- if (!--call_count)
- break;
+ if (size <= replies[i].poststat.ia_size) {
+ size = replies[i].poststat.ia_size;
+ source = i;
+ }
}
- return 0;
-}
-
-
-int
-afr_sh_data_sync_prepare (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int active_sinks = 0;
- int source = 0;
- int i = 0;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- source = sh->source;
-
for (i = 0; i < priv->child_count; i++) {
- if (sh->sources[i] == 0 && local->child_up[i] == 1) {
- active_sinks++;
- sh->success[i] = 1;
+ if (!sources[i])
+ continue;
+ if (replies[i].poststat.ia_size < size) {
+ sources[i] = 0;
+ sinks[i] = 1;
}
}
- sh->success[source] = 1;
-
- if (active_sinks == 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no active sinks for performing self-heal on file %s",
- local->loc.path);
- afr_sh_data_finish (frame, this);
- return 0;
- }
- sh->active_sinks = active_sinks;
-
- gf_log (this->name, GF_LOG_DEBUG,
- "syncing data of %s from subvolume %s to %d active sinks",
- local->loc.path, priv->children[source]->name, active_sinks);
-
- afr_sh_data_open (frame, this);
- return 0;
+ return source;
}
-
-int
-afr_sh_data_fix (call_frame_t *frame, xlator_t *this)
+/*
+ * __afr_selfheal_data_prepare:
+ *
+ * This function inspects the on-disk xattrs and determines which subvols
+ * are sources and sinks.
+ *
+ * The return value is the index of the subvolume to be used as the source
+ * for self-healing, or -1 if no healing is necessary/split brain.
+ */
+static int
+__afr_selfheal_data_prepare (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ unsigned char *locked_on, unsigned char *sources,
+ unsigned char *sinks, unsigned char *healed_sinks,
+ struct afr_reply *replies)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int nsources = 0;
- int source = 0;
- int i = 0;
+ int ret = -1;
+ int source = -1;
+ afr_private_t *priv = NULL;
+ int i = 0;
- local = frame->local;
- sh = &local->self_heal;
priv = this->private;
- afr_sh_build_pending_matrix (priv, sh->pending_matrix, sh->xattr,
- priv->child_count, AFR_DATA_TRANSACTION);
-
- afr_sh_print_pending_matrix (sh->pending_matrix, this);
-
- nsources = afr_sh_mark_sources (sh, priv->child_count,
- AFR_SELF_HEAL_DATA);
-
- afr_sh_supress_errenous_children (sh->sources, sh->child_errno,
- priv->child_count);
-
- if (nsources == 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "No self-heal needed for %s",
- local->loc.path);
-
- afr_sh_data_finish (frame, this);
- return 0;
- }
-
- if ((nsources == -1)
- && (priv->favorite_child != -1)
- && (sh->child_errno[priv->favorite_child] == 0)) {
-
- gf_log (this->name, GF_LOG_WARNING,
- "Picking favorite child %s as authentic source to resolve conflicting data of %s",
- priv->children[priv->favorite_child]->name,
- local->loc.path);
-
- sh->sources[priv->favorite_child] = 1;
-
- nsources = afr_sh_source_count (sh->sources,
- priv->child_count);
- }
-
- if (nsources == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "Unable to resolve conflicting data of %s. "
- "Please resolve manually by deleting the file %s "
- "from all but the preferred subvolume. "
- "Please consider 'option favorite-child <>'",
- local->loc.path, local->loc.path);
-
- local->govinda_gOvinda = 1;
-
- afr_sh_data_finish (frame, this);
- return 0;
- }
-
- source = afr_sh_select_source (sh->sources, priv->child_count);
- sh->source = source;
-
- /* detect changes not visible through pending flags -- JIC */
- for (i = 0; i < priv->child_count; i++) {
- if (i == source || sh->child_errno[i])
- continue;
-
- if (SIZE_DIFFERS (&sh->buf[i], &sh->buf[source]))
- sh->sources[i] = 0;
- }
-
- afr_sh_data_sync_prepare (frame, this);
-
- return 0;
+ ret = afr_selfheal_unlocked_discover (frame, fd->inode, fd->inode->gfid,
+ replies);
+ if (ret)
+ return ret;
+
+ ret = afr_selfheal_find_direction (frame, this, replies,
+ AFR_DATA_TRANSACTION,
+ locked_on, sources, sinks);
+ if (ret)
+ return ret;
+
+ source = __afr_selfheal_data_finalize_source (this, sources, sinks,
+ healed_sinks, locked_on,
+ replies);
+ if (source < 0)
+ return -EIO;
+
+ for (i = 0; i < priv->child_count; i++)
+ /* Initialize the healed_sinks[] array optimistically to
+ the intersection of to-be-healed (i.e sinks[]) and
+ the list of servers which are up (i.e locked_on[]).
+
+ As we encounter failures in the healing process, we
+ will unmark the respective servers in the healed_sinks[]
+ array.
+ */
+ healed_sinks[i] = sinks[i] && locked_on[i];
+
+ return source;
}
-int
-afr_sh_data_lookup_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- inode_t *inode, struct stat *buf, dict_t *xattr)
+static int
+__afr_selfheal_data (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ unsigned char *locked_on)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
-
- int call_count = -1;
- int child_index = (long) cookie;
+ afr_private_t *priv = NULL;
+ int ret = -1;
+ unsigned char *sources = NULL;
+ unsigned char *sinks = NULL;
+ unsigned char *data_lock = NULL;
+ unsigned char *healed_sinks = NULL;
+ struct afr_reply *locked_replies = NULL;
+ int source = -1;
+ gf_boolean_t compat = _gf_false;
+ unsigned char *compat_lock = NULL;
- local = frame->local;
- sh = &local->self_heal;
priv = this->private;
- LOCK (&frame->lock);
+ sources = alloca0 (priv->child_count);
+ sinks = alloca0 (priv->child_count);
+ healed_sinks = alloca0 (priv->child_count);
+ data_lock = alloca0 (priv->child_count);
+ compat_lock = alloca0 (priv->child_count);
+
+ locked_replies = alloca0 (sizeof (*locked_replies) * priv->child_count);
+
+ ret = afr_selfheal_inodelk (frame, this, fd->inode, this->name, 0, 0,
+ data_lock);
{
- if (op_ret != -1) {
- sh->xattr[child_index] = dict_ref (xattr);
- sh->buf[child_index] = *buf;
+ if (ret < 2) {
+ ret = -ENOTCONN;
+ goto unlock;
}
- }
- UNLOCK (&frame->lock);
- call_count = afr_frame_return (frame);
+ ret = __afr_selfheal_data_prepare (frame, this, fd, data_lock,
+ sources, sinks, healed_sinks,
+ locked_replies);
+ if (ret < 0)
+ goto unlock;
+
+ source = ret;
+
+ ret = __afr_selfheal_truncate_sinks (frame, this, fd, healed_sinks,
+ locked_replies,
+ locked_replies[source].poststat.ia_size);
+ if (ret < 0)
+ goto unlock;
+
+ ret = 0;
+
+ /* Locking from (LLONG_MAX - 2) to (LLONG_MAX - 1) is for
+ compatibility with older self-heal clients which do not
+ hold a lock in the @priv->sh_domain domain to guard
+ against concurrent ongoing self-heals
+ */
+ afr_selfheal_inodelk (frame, this, fd->inode, this->name,
+ LLONG_MAX - 2, 1, compat_lock);
+ compat = _gf_true;
+ }
+unlock:
+ afr_selfheal_uninodelk (frame, this, fd->inode, this->name, 0, 0,
+ data_lock);
+ if (ret < 0)
+ goto out;
- if (call_count == 0) {
- afr_sh_data_fix (frame, this);
- }
+ ret = afr_selfheal_data_do (frame, this, fd, source, healed_sinks,
+ locked_replies);
+ if (ret)
+ goto out;
- return 0;
+ ret = afr_selfheal_undo_pending (frame, this, fd->inode, sources, sinks,
+ healed_sinks, AFR_DATA_TRANSACTION,
+ locked_replies, data_lock);
+out:
+ if (compat)
+ afr_selfheal_uninodelk (frame, this, fd->inode, this->name,
+ LLONG_MAX - 2, 1, compat_lock);
+ return ret;
}
-int
-afr_sh_data_lookup (call_frame_t *frame, xlator_t *this)
+static fd_t *
+afr_selfheal_data_open (xlator_t *this, inode_t *inode)
{
- afr_self_heal_t *sh = NULL;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- dict_t *xattr_req = NULL;
-
- int call_count = 0;
- int i = 0;
+ loc_t loc = {0,};
int ret = 0;
+ fd_t *fd = NULL;
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- call_count = local->child_count;
-
- local->call_count = call_count;
-
- xattr_req = dict_new();
- if (xattr_req) {
- for (i = 0; i < priv->child_count; i++) {
- ret = dict_set_uint64 (xattr_req, priv->pending_key[i],
- 3 * sizeof(int32_t));
- }
- }
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_sh_data_lookup_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->lookup,
- &local->loc, xattr_req);
- if (!--call_count)
- break;
- }
- }
-
- if (xattr_req)
- dict_unref (xattr_req);
-
- return 0;
-}
-
-
-int
-afr_sh_data_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- int call_count = 0;
- int child_index = (long) cookie;
+ fd = fd_create (inode, 0);
+ if (!fd)
+ return NULL;
- /* TODO: what if lock fails? */
-
- local = frame->local;
- sh = &local->self_heal;
+ loc.inode = inode_ref (inode);
+ uuid_copy (loc.gfid, inode->gfid);
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- sh->op_failed = 1;
-
- gf_log (this->name,
- (op_errno == EAGAIN ? GF_LOG_DEBUG : GF_LOG_ERROR),
- "locking of %s on child %d failed: %s",
- local->loc.path, child_index,
- strerror (op_errno));
- } else {
- gf_log (this->name, GF_LOG_DEBUG,
- "inode of %s on child %d locked",
- local->loc.path, child_index);
- }
+ ret = syncop_open (this, &loc, O_RDWR|O_LARGEFILE, fd);
+ if (ret) {
+ fd_unref (fd);
+ fd = NULL;
+ } else {
+ fd_bind (fd);
}
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
- if (call_count == 0) {
- if (sh->op_failed) {
- afr_sh_data_finish (frame, this);
- return 0;
- }
-
- afr_sh_data_lookup (frame, this);
- }
+ loc_wipe (&loc);
- return 0;
+ return fd;
}
int
-afr_sh_data_lock (call_frame_t *frame, xlator_t *this)
+afr_selfheal_data (call_frame_t *frame, xlator_t *this, inode_t *inode)
{
- struct flock flock;
- int i = 0;
- int call_count = 0;
-
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
- afr_self_heal_t * sh = NULL;
-
+ afr_private_t *priv = NULL;
+ unsigned char *locked_on = NULL;
+ int ret = 0;
+ fd_t *fd = NULL;
- local = frame->local;
- sh = &local->self_heal;
priv = this->private;
- call_count = local->child_count;
-
- local->call_count = call_count;
+ fd = afr_selfheal_data_open (this, inode);
+ if (!fd)
+ return -EIO;
- flock.l_start = 0;
- flock.l_len = 0;
- flock.l_type = F_WRLCK;
+ locked_on = alloca0 (priv->child_count);
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- gf_log (this->name, GF_LOG_DEBUG,
- "locking %s on subvolume %s",
- local->loc.path, priv->children[i]->name);
-
- STACK_WIND_COOKIE (frame, afr_sh_data_lock_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->inodelk,
- this->name,
- &local->loc, F_SETLK, &flock);
- if (!--call_count)
- break;
+ ret = afr_selfheal_tryinodelk (frame, this, inode, priv->sh_domain, 0, 0,
+ locked_on);
+ {
+ if (ret < 2) {
+ /* Either less than two subvols available, or another
+ selfheal (from another server) is in progress. Skip
+ for now in any case there isn't anything to do.
+ */
+ ret = -ENOTCONN;
+ goto unlock;
}
- }
-
- return 0;
-}
-
-int
-afr_self_heal_data (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = this->private;
-
-
- local = frame->local;
- sh = &local->self_heal;
-
- if (local->need_data_self_heal && priv->data_self_heal) {
- afr_sh_data_lock (frame, this);
- } else {
- gf_log (this->name, GF_LOG_DEBUG,
- "not doing data self heal on %s",
- local->loc.path);
- afr_sh_data_done (frame, this);
+ ret = __afr_selfheal_data (frame, this, fd, locked_on);
}
+unlock:
+ afr_selfheal_uninodelk (frame, this, inode, priv->sh_domain, 0, 0, locked_on);
- return 0;
-}
+ if (fd)
+ fd_unref (fd);
+ return ret;
+}
diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c
index 70edd5bab..9e714b026 100644
--- a/xlators/cluster/afr/src/afr-self-heal-entry.c
+++ b/xlators/cluster/afr/src/afr-self-heal-entry.c
@@ -1,2042 +1,629 @@
/*
- Copyright (c) 2008-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
-#include <libgen.h>
-#include <unistd.h>
-#include <fnmatch.h>
-#include <sys/time.h>
-#include <stdlib.h>
-#include <signal.h>
#ifndef _CONFIG_H
#define _CONFIG_H
#include "config.h"
#endif
-#include "glusterfs.h"
#include "afr.h"
-#include "dict.h"
-#include "xlator.h"
-#include "hashfn.h"
-#include "logging.h"
-#include "stack.h"
-#include "list.h"
-#include "call-stub.h"
-#include "defaults.h"
-#include "common-utils.h"
-#include "compat-errno.h"
-#include "compat.h"
+#include "afr-self-heal.h"
#include "byte-order.h"
-
#include "afr-transaction.h"
-#include "afr-self-heal.h"
-#include "afr-self-heal-common.h"
-
-
-
-int
-afr_sh_entry_done (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- /*
- TODO: cleanup sh->*
- */
-
- gf_log (this->name, GF_LOG_DEBUG,
- "self heal of %s completed",
- local->loc.path);
-
- sh->completion_cbk (frame, this);
-
- return 0;
-}
-
-
-int
-afr_sh_entry_unlck_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- int call_count = 0;
- int child_index = (long) cookie;
-
- /* TODO: what if lock fails? */
-
- local = frame->local;
- sh = &local->self_heal;
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "unlocking inode of %s on child %d failed: %s",
- local->loc.path, child_index,
- strerror (op_errno));
- } else {
- gf_log (this->name, GF_LOG_DEBUG,
- "unlocked inode of %s on child %d",
- local->loc.path, child_index);
- }
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- if (sh->healing_fd)
- fd_unref (sh->healing_fd);
- sh->healing_fd = NULL;
- afr_sh_entry_done (frame, this);
- }
-
- return 0;
-}
-
-
-int
-afr_sh_entry_unlock (call_frame_t *frame, xlator_t *this)
-{
- int i = 0;
- int call_count = 0;
-
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
- afr_self_heal_t * sh = NULL;
-
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- call_count = local->child_count;
-
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- gf_log (this->name, GF_LOG_DEBUG,
- "unlocking %s on subvolume %s",
- local->loc.path, priv->children[i]->name);
-
- STACK_WIND_COOKIE (frame, afr_sh_entry_unlck_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->entrylk,
- this->name,
- &local->loc, NULL,
- ENTRYLK_UNLOCK, ENTRYLK_WRLCK);
- if (!--call_count)
- break;
- }
- }
-
- return 0;
-}
-
-
-int
-afr_sh_entry_finish (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
-
- local = frame->local;
-
- gf_log (this->name, GF_LOG_DEBUG,
- "finishing entry selfheal of %s", local->loc.path);
-
- afr_sh_entry_unlock (frame, this);
-
- return 0;
-}
-
-
-int
-afr_sh_entry_erase_pending_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret,
- int32_t op_errno, dict_t *xattr)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int call_count = 0;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- LOCK (&frame->lock);
- {
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0)
- afr_sh_entry_finish (frame, this);
-
- return 0;
-}
-
-
-int
-afr_sh_entry_erase_pending (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int call_count = 0;
- int i = 0;
- dict_t **erase_xattr = NULL;
-
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- afr_sh_pending_to_delta (priv, sh->xattr, sh->delta_matrix, sh->success,
- priv->child_count, AFR_ENTRY_TRANSACTION);
-
- erase_xattr = CALLOC (sizeof (*erase_xattr), priv->child_count);
-
- for (i = 0; i < priv->child_count; i++) {
- if (sh->xattr[i]) {
- call_count++;
-
- erase_xattr[i] = get_new_dict();
- dict_ref (erase_xattr[i]);
- }
- }
-
- afr_sh_delta_to_xattr (priv, sh->delta_matrix, erase_xattr,
- priv->child_count, AFR_ENTRY_TRANSACTION);
-
- local->call_count = call_count;
- for (i = 0; i < priv->child_count; i++) {
- if (!erase_xattr[i])
- continue;
-
- gf_log (this->name, GF_LOG_DEBUG,
- "erasing pending flags from %s on %s",
- local->loc.path, priv->children[i]->name);
-
- STACK_WIND_COOKIE (frame, afr_sh_entry_erase_pending_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->xattrop,
- &local->loc,
- GF_XATTROP_ADD_ARRAY, erase_xattr[i]);
- if (!--call_count)
- break;
- }
-
- for (i = 0; i < priv->child_count; i++) {
- if (erase_xattr[i]) {
- dict_unref (erase_xattr[i]);
- }
- }
- FREE (erase_xattr);
-
- return 0;
-}
-
static int
-next_active_source (call_frame_t *frame, xlator_t *this,
- int current_active_source)
+afr_selfheal_entry_delete (call_frame_t *frame, xlator_t *this, inode_t *dir,
+ const char *name, inode_t *inode, int child,
+ struct afr_reply *replies)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- int source = -1;
- int next_active_source = -1;
- int i = 0;
+ afr_private_t *priv = NULL;
+ xlator_t *subvol = NULL;
+ int ret = 0;
+ loc_t loc = {0, };
+ char g[64];
priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- source = sh->source;
- if (source != -1) {
- if (current_active_source != source)
- next_active_source = source;
- goto out;
- }
+ subvol = priv->children[child];
- /*
- the next active sink becomes the source for the
- 'conservative decision' of merging all entries
- */
+ loc.parent = inode_ref (dir);
+ uuid_copy (loc.pargfid, dir->gfid);
+ loc.name = name;
+ loc.inode = inode_ref (inode);
- for (i = 0; i < priv->child_count; i++) {
- if ((sh->sources[i] == 0)
- && (local->child_up[i] == 1)
- && (i > current_active_source)) {
-
- next_active_source = i;
+ if (replies[child].valid && replies[child].op_ret == 0) {
+ switch (replies[child].poststat.ia_type) {
+ case IA_IFDIR:
+ gf_log (this->name, GF_LOG_WARNING,
+ "expunging dir %s/%s (%s) on %s",
+ uuid_utoa (dir->gfid), name,
+ uuid_utoa_r (replies[child].poststat.ia_gfid, g),
+ subvol->name);
+ ret = syncop_rmdir (subvol, &loc, 1);
break;
- }
- }
-out:
- return next_active_source;
-}
-
-
-
-static int
-next_active_sink (call_frame_t *frame, xlator_t *this,
- int current_active_sink)
-{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- int next_active_sink = -1;
- int i = 0;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- /*
- the next active sink becomes the source for the
- 'conservative decision' of merging all entries
- */
-
- for (i = 0; i < priv->child_count; i++) {
- if ((sh->sources[i] == 0)
- && (local->child_up[i] == 1)
- && (i > current_active_sink)) {
-
- next_active_sink = i;
+ default:
+ gf_log (this->name, GF_LOG_WARNING,
+ "expunging file %s/%s (%s) on %s",
+ uuid_utoa (dir->gfid), name,
+ uuid_utoa_r (replies[child].poststat.ia_gfid, g),
+ subvol->name);
+ ret = syncop_unlink (subvol, &loc);
break;
}
}
- return next_active_sink;
-}
-
-
-int
-build_child_loc (xlator_t *this, loc_t *child, loc_t *parent, char *name)
-{
- int ret = -1;
-
- if (!child) {
- goto out;
- }
-
- if (strcmp (parent->path, "/") == 0)
- asprintf ((char **)&child->path, "/%s", name);
- else
- asprintf ((char **)&child->path, "%s/%s", parent->path, name);
-
- if (!child->path) {
- gf_log (this->name, GF_LOG_ERROR,
- "out of memory :(");
- goto out;
- }
-
- child->name = strrchr (child->path, '/');
- if (child->name)
- child->name++;
-
- child->parent = inode_ref (parent->inode);
- child->inode = inode_new (parent->inode->table);
-
- if (!child->inode) {
- gf_log (this->name, GF_LOG_ERROR,
- "out of memory :(");
- goto out;
- }
-
- ret = 0;
-out:
- if (ret == -1)
- loc_wipe (child);
+ loc_wipe (&loc);
return ret;
}
int
-afr_sh_entry_expunge_all (call_frame_t *frame, xlator_t *this);
-
-int
-afr_sh_entry_expunge_subvol (call_frame_t *frame, xlator_t *this,
- int active_src);
-
-int
-afr_sh_entry_expunge_entry_done (call_frame_t *frame, xlator_t *this,
- int active_src)
-{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- int call_count = 0;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- LOCK (&frame->lock);
- {
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0)
- afr_sh_entry_expunge_subvol (frame, this, active_src);
-
- return 0;
-}
-
-
-int
-afr_sh_entry_expunge_remove_cbk (call_frame_t *expunge_frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno)
+afr_selfheal_recreate_entry (call_frame_t *frame, xlator_t *this, int dst,
+ int source, inode_t *dir, const char *name,
+ inode_t *inode, struct afr_reply *replies)
{
- afr_private_t *priv = NULL;
- afr_local_t *expunge_local = NULL;
- afr_self_heal_t *expunge_sh = NULL;
- int active_src = 0;
- call_frame_t *frame = NULL;
-
-
- priv = this->private;
- expunge_local = expunge_frame->local;
- expunge_sh = &expunge_local->self_heal;
- frame = expunge_sh->sh_frame;
-
- active_src = (long) cookie;
-
- if (op_ret == 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "removed %s on %s",
- expunge_local->loc.path,
- priv->children[active_src]->name);
- } else {
- gf_log (this->name, GF_LOG_ERROR,
- "removing %s on %s failed (%s)",
- expunge_local->loc.path,
- priv->children[active_src]->name,
- strerror (op_errno));
- }
-
- AFR_STACK_DESTROY (expunge_frame);
- afr_sh_entry_expunge_entry_done (frame, this, active_src);
-
- return 0;
-}
-
-
-int
-afr_sh_entry_expunge_rmdir (call_frame_t *expunge_frame, xlator_t *this,
- int active_src)
-{
- afr_private_t *priv = NULL;
- afr_local_t *expunge_local = NULL;
+ int ret = 0;
+ loc_t loc = {0,};
+ loc_t srcloc = {0,};
+ afr_private_t *priv = NULL;
+ dict_t *xdata = NULL;
+ struct iatt *iatt = NULL;
+ char *linkname = NULL;
+ mode_t mode = 0;
+ struct iatt newent = {0,};
priv = this->private;
- expunge_local = expunge_frame->local;
- gf_log (this->name, GF_LOG_WARNING,
- "removing directory %s on %s",
- expunge_local->loc.path, priv->children[active_src]->name);
+ xdata = dict_new();
+ if (!xdata)
+ return -ENOMEM;
- STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_remove_cbk,
- (void *) (long) active_src,
- priv->children[active_src],
- priv->children[active_src]->fops->rmdir,
- &expunge_local->loc);
-
- return 0;
-}
+ loc.parent = inode_ref (dir);
+ uuid_copy (loc.pargfid, dir->gfid);
+ loc.name = name;
+ loc.inode = inode_ref (inode);
+ ret = afr_selfheal_entry_delete (frame, this, dir, name, inode, dst,
+ replies);
+ if (ret)
+ goto out;
-int
-afr_sh_entry_expunge_unlink (call_frame_t *expunge_frame, xlator_t *this,
- int active_src)
-{
- afr_private_t *priv = NULL;
- afr_local_t *expunge_local = NULL;
-
- priv = this->private;
- expunge_local = expunge_frame->local;
-
- gf_log (this->name, GF_LOG_WARNING,
- "unlinking file %s on %s",
- expunge_local->loc.path, priv->children[active_src]->name);
-
- STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_remove_cbk,
- (void *) (long) active_src,
- priv->children[active_src],
- priv->children[active_src]->fops->unlink,
- &expunge_local->loc);
-
- return 0;
-}
+ ret = dict_set_static_bin (xdata, "gfid-req",
+ replies[source].poststat.ia_gfid, 16);
+ if (ret)
+ goto out;
+ iatt = &replies[source].poststat;
-int
-afr_sh_entry_expunge_remove (call_frame_t *expunge_frame, xlator_t *this,
- int active_src, struct stat *buf)
-{
- afr_private_t *priv = NULL;
- afr_local_t *expunge_local = NULL;
- afr_self_heal_t *expunge_sh = NULL;
- int source = 0;
- call_frame_t *frame = NULL;
- int type = 0;
+ srcloc.inode = inode_ref (inode);
+ uuid_copy (srcloc.gfid, iatt->ia_gfid);
- priv = this->private;
- expunge_local = expunge_frame->local;
- expunge_sh = &expunge_local->self_heal;
- frame = expunge_sh->sh_frame;
- source = expunge_sh->source;
-
- type = (buf->st_mode & S_IFMT);
-
- switch (type) {
- case S_IFSOCK:
- case S_IFREG:
- case S_IFBLK:
- case S_IFCHR:
- case S_IFIFO:
- case S_IFLNK:
- afr_sh_entry_expunge_unlink (expunge_frame, this, active_src);
+ mode = st_mode_from_ia (iatt->ia_prot, iatt->ia_type);
+ switch (iatt->ia_type) {
+ case IA_IFDIR:
+ ret = syncop_mkdir (priv->children[dst], &loc, mode, xdata, 0);
break;
- case S_IFDIR:
- afr_sh_entry_expunge_rmdir (expunge_frame, this, active_src);
+ case IA_IFLNK:
+ ret = syncop_lookup (priv->children[dst], &srcloc, 0, 0, 0, 0);
+ if (ret == 0) {
+ ret = syncop_link (priv->children[dst], &srcloc, &loc);
+ } else {
+ ret = syncop_readlink (priv->children[source], &srcloc,
+ &linkname, 4096);
+ if (ret <= 0)
+ goto out;
+ ret = syncop_symlink (priv->children[dst], &loc, linkname,
+ xdata, NULL);
+ }
break;
default:
- gf_log (this->name, GF_LOG_ERROR,
- "%s has unknown file type on %s: 0%o",
- expunge_local->loc.path,
- priv->children[source]->name, type);
- goto out;
+ ret = dict_set_int32 (xdata, GLUSTERFS_INTERNAL_FOP_KEY, 1);
+ if (ret)
+ goto out;
+ ret = syncop_mknod (priv->children[dst], &loc, mode,
+ iatt->ia_rdev, xdata, &newent);
+ if (ret == 0 && iatt->ia_size && !newent.ia_size) {
+ /* New entry created. Mark @dst pending on all sources */
+ ret = 1;
+ }
break;
}
- return 0;
-out:
- AFR_STACK_DESTROY (expunge_frame);
- afr_sh_entry_expunge_entry_done (frame, this, active_src);
-
- return 0;
-}
-
-
-int
-afr_sh_entry_expunge_lookup_cbk (call_frame_t *expunge_frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- inode_t *inode, struct stat *buf, dict_t *x)
-{
- afr_private_t *priv = NULL;
- afr_local_t *expunge_local = NULL;
- afr_self_heal_t *expunge_sh = NULL;
- call_frame_t *frame = NULL;
- int active_src = 0;
-
- priv = this->private;
- expunge_local = expunge_frame->local;
- expunge_sh = &expunge_local->self_heal;
- frame = expunge_sh->sh_frame;
- active_src = (long) cookie;
-
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "lookup of %s on %s failed (%s)",
- expunge_local->loc.path,
- priv->children[active_src]->name,
- strerror (op_errno));
- goto out;
- }
-
- afr_sh_entry_expunge_remove (expunge_frame, this, active_src, buf);
-
- return 0;
out:
- AFR_STACK_DESTROY (expunge_frame);
- afr_sh_entry_expunge_entry_done (frame, this, active_src);
-
- return 0;
+ if (xdata)
+ dict_unref (xdata);
+ loc_wipe (&loc);
+ loc_wipe (&srcloc);
+ return ret;
}
-int
-afr_sh_entry_expunge_purge (call_frame_t *expunge_frame, xlator_t *this,
- int active_src)
+static int
+afr_selfheal_newentry_mark (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ int source, struct afr_reply *replies,
+ unsigned char *sources, unsigned char *newentry)
{
- afr_private_t *priv = NULL;
- afr_local_t *expunge_local = NULL;
+ int ret = 0;
+ int i = 0;
+ afr_private_t *priv = NULL;
+ dict_t *xattr = NULL;
+ int **changelog = NULL;
+ int idx = 0;
priv = this->private;
- expunge_local = expunge_frame->local;
-
- gf_log (this->name, GF_LOG_DEBUG,
- "looking up %s on %s",
- expunge_local->loc.path, priv->children[active_src]->name);
-
- STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_lookup_cbk,
- (void *) (long) active_src,
- priv->children[active_src],
- priv->children[active_src]->fops->lookup,
- &expunge_local->loc, 0);
-
- return 0;
-}
-
-int
-afr_sh_entry_expunge_entry_cbk (call_frame_t *expunge_frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- inode_t *inode, struct stat *buf, dict_t *x)
-{
- afr_private_t *priv = NULL;
- afr_local_t *expunge_local = NULL;
- afr_self_heal_t *expunge_sh = NULL;
- int source = 0;
- call_frame_t *frame = NULL;
- int active_src = 0;
-
-
- priv = this->private;
- expunge_local = expunge_frame->local;
- expunge_sh = &expunge_local->self_heal;
- frame = expunge_sh->sh_frame;
- active_src = expunge_sh->active_source;
- source = (long) cookie;
+ idx = afr_index_for_transaction_type (AFR_DATA_TRANSACTION);
- if (op_ret == -1 && op_errno == ENOENT) {
+ uuid_copy (inode->gfid, replies[source].poststat.ia_gfid);
- gf_log (this->name, GF_LOG_DEBUG,
- "missing entry %s on %s",
- expunge_local->loc.path,
- priv->children[source]->name);
+ changelog = afr_matrix_create (priv->child_count, AFR_NUM_CHANGE_LOGS);
- afr_sh_entry_expunge_purge (expunge_frame, this, active_src);
+ xattr = dict_new();
+ if (!xattr)
+ return -ENOMEM;
- return 0;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!newentry[i])
+ continue;
+ changelog[i][idx] = hton32(1);
}
- if (op_ret == 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "%s exists under %s",
- expunge_local->loc.path,
- priv->children[source]->name);
- } else {
- gf_log (this->name, GF_LOG_ERROR,
- "looking up %s under %s failed (%s)",
- expunge_local->loc.path,
- priv->children[source]->name,
- strerror (op_errno));
- }
+ afr_set_pending_dict (priv, xattr, changelog);
- AFR_STACK_DESTROY (expunge_frame);
- afr_sh_entry_expunge_entry_done (frame, this, active_src);
+ for (i = 0; i < priv->child_count; i++) {
+ if (!sources[i])
+ continue;
+ afr_selfheal_post_op (frame, this, inode, i, xattr);
+ }
- return 0;
+ dict_unref (xattr);
+ return ret;
}
-int
-afr_sh_entry_expunge_entry (call_frame_t *frame, xlator_t *this,
- char *name)
+static int
+__afr_selfheal_heal_dirent (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ char *name, inode_t *inode, int source,
+ unsigned char *sources, unsigned char *healed_sinks,
+ unsigned char *locked_on, struct afr_reply *replies)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- int ret = -1;
- call_frame_t *expunge_frame = NULL;
- afr_local_t *expunge_local = NULL;
- afr_self_heal_t *expunge_sh = NULL;
- int active_src = 0;
- int source = 0;
- int op_errno = 0;
+ int ret = 0;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ unsigned char *newentry = NULL;
priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- active_src = sh->active_source;
- source = sh->source;
-
- if ((strcmp (name, ".") == 0)
- || (strcmp (name, "..") == 0)) {
- gf_log (this->name, GF_LOG_DEBUG,
- "skipping inspection of %s under %s",
- name, local->loc.path);
- goto out;
- }
-
- gf_log (this->name, GF_LOG_DEBUG,
- "inspecting existance of %s under %s",
- name, local->loc.path);
-
- expunge_frame = copy_frame (frame);
- if (!expunge_frame) {
- gf_log (this->name, GF_LOG_ERROR,
- "out of memory :(");
- goto out;
- }
-
- ALLOC_OR_GOTO (expunge_local, afr_local_t, out);
-
- expunge_frame->local = expunge_local;
- expunge_sh = &expunge_local->self_heal;
- expunge_sh->sh_frame = frame;
- expunge_sh->active_source = active_src;
-
- ret = build_child_loc (this, &expunge_local->loc, &local->loc, name);
- if (ret != 0) {
- goto out;
- }
-
- gf_log (this->name, GF_LOG_DEBUG,
- "looking up %s on %s", expunge_local->loc.path,
- priv->children[source]->name);
-
- STACK_WIND_COOKIE (expunge_frame,
- afr_sh_entry_expunge_entry_cbk,
- (void *) (long) source,
- priv->children[source],
- priv->children[source]->fops->lookup,
- &expunge_local->loc, 0);
-
- ret = 0;
-out:
- if (ret == -1)
- afr_sh_entry_expunge_entry_done (frame, this, active_src);
+ newentry = alloca0 (priv->child_count);
- return 0;
-}
-
-
-int
-afr_sh_entry_expunge_readdir_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- gf_dirent_t *entries)
-{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- gf_dirent_t *entry = NULL;
- off_t last_offset = 0;
- int active_src = 0;
- int entry_count = 0;
+ if (!replies[source].valid)
+ return -EIO;
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- active_src = sh->active_source;
-
- if (op_ret <= 0) {
- if (op_ret < 0) {
- gf_log (this->name, GF_LOG_ERROR,
- "readdir of %s on subvolume %s failed (%s)",
- local->loc.path,
- priv->children[active_src]->name,
- strerror (op_errno));
+ for (i = 0; i < priv->child_count; i++) {
+ if (!healed_sinks[i])
+ continue;
+ if (replies[source].op_ret == -1 &&
+ replies[source].op_errno == ENOENT) {
+ ret = afr_selfheal_entry_delete (frame, this, fd->inode,
+ name, inode, i, replies);
} else {
- gf_log (this->name, GF_LOG_DEBUG,
- "readdir of %s on subvolume %s complete",
- local->loc.path,
- priv->children[active_src]->name);
+ if (!uuid_compare (replies[i].poststat.ia_gfid,
+ replies[source].poststat.ia_gfid))
+ continue;
+
+ ret = afr_selfheal_recreate_entry (frame, this, i, source,
+ fd->inode, name, inode,
+ replies);
+ if (ret > 0) {
+ newentry[i] = 1;
+ ret = 0;
+ }
}
-
- afr_sh_entry_expunge_all (frame, this);
- return 0;
- }
-
- list_for_each_entry (entry, &entries->list, list) {
- last_offset = entry->d_off;
- entry_count++;
- }
-
- gf_log (this->name, GF_LOG_DEBUG,
- "readdir'ed %d entries from %s",
- entry_count, priv->children[active_src]->name);
-
- sh->offset = last_offset;
- local->call_count = entry_count;
-
- list_for_each_entry (entry, &entries->list, list) {
- afr_sh_entry_expunge_entry (frame, this, entry->d_name);
- }
-
- return 0;
-}
-
-int
-afr_sh_entry_expunge_subvol (call_frame_t *frame, xlator_t *this,
- int active_src)
-{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- STACK_WIND (frame, afr_sh_entry_expunge_readdir_cbk,
- priv->children[active_src],
- priv->children[active_src]->fops->readdir,
- sh->healing_fd, sh->block_size, sh->offset);
-
- return 0;
-}
-
-
-int
-afr_sh_entry_expunge_all (call_frame_t *frame, xlator_t *this)
-{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- int active_src = -1;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- sh->offset = 0;
-
- if (sh->source == -1) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no active sources for %s to expunge entries",
- local->loc.path);
- goto out;
- }
-
- active_src = next_active_sink (frame, this, sh->active_source);
- sh->active_source = active_src;
-
- if (sh->op_failed) {
- goto out;
- }
-
- if (active_src == -1) {
- /* completed creating missing files on all subvolumes */
- goto out;
- }
-
- gf_log (this->name, GF_LOG_DEBUG,
- "expunging entries of %s on %s to other sinks",
- local->loc.path, priv->children[active_src]->name);
-
- afr_sh_entry_expunge_subvol (frame, this, active_src);
-
- return 0;
-out:
- afr_sh_entry_erase_pending (frame, this);
- return 0;
-
-}
-
-
-int
-afr_sh_entry_impunge_all (call_frame_t *frame, xlator_t *this);
-
-int
-afr_sh_entry_impunge_subvol (call_frame_t *frame, xlator_t *this,
- int active_src);
-
-int
-afr_sh_entry_impunge_entry_done (call_frame_t *frame, xlator_t *this,
- int active_src)
-{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- int call_count = 0;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- LOCK (&frame->lock);
- {
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0)
- afr_sh_entry_impunge_subvol (frame, this, active_src);
-
- return 0;
-}
-
-
-int
-afr_sh_entry_impunge_utimens_cbk (call_frame_t *impunge_frame, void *cookie,
- xlator_t *this, int32_t op_ret,
- int32_t op_errno, struct stat *stbuf)
-{
- int call_count = 0;
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
- afr_self_heal_t *impunge_sh = NULL;
- call_frame_t *frame = NULL;
- int active_src = 0;
- int child_index = 0;
-
- priv = this->private;
- impunge_local = impunge_frame->local;
- impunge_sh = &impunge_local->self_heal;
- frame = impunge_sh->sh_frame;
- child_index = (long) cookie;
-
- if (op_ret == 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "utimes set for %s on %s",
- impunge_local->loc.path,
- priv->children[child_index]->name);
- } else {
- gf_log (this->name, GF_LOG_ERROR,
- "setting utimes of %s on %s failed (%s)",
- impunge_local->loc.path,
- priv->children[child_index]->name,
- strerror (op_errno));
- }
-
- LOCK (&impunge_frame->lock);
- {
- call_count = --impunge_local->call_count;
- }
- UNLOCK (&impunge_frame->lock);
-
- if (call_count == 0) {
- AFR_STACK_DESTROY (impunge_frame);
- afr_sh_entry_impunge_entry_done (frame, this, active_src);
+ if (ret < 0)
+ break;
}
- return 0;
+ if (AFR_COUNT (newentry, priv->child_count))
+ afr_selfheal_newentry_mark (frame, this, inode, source, replies,
+ sources, newentry);
+ return ret;
}
-int
-afr_sh_entry_impunge_chown_cbk (call_frame_t *impunge_frame, void *cookie,
- xlator_t *this, int32_t op_ret,
- int32_t op_errno, struct stat *stbuf)
+static int
+__afr_selfheal_merge_dirent (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ char *name, inode_t *inode, unsigned char *sources,
+ unsigned char *healed_sinks, unsigned char *locked_on,
+ struct afr_reply *replies)
{
- int call_count = 0;
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
- afr_self_heal_t *impunge_sh = NULL;
- call_frame_t *frame = NULL;
- int active_src = 0;
- int child_index = 0;
- struct timespec ts[2];
-
+ int ret = 0;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ int source = -1;
priv = this->private;
- impunge_local = impunge_frame->local;
- impunge_sh = &impunge_local->self_heal;
- frame = impunge_sh->sh_frame;
- child_index = (long) cookie;
-
- if (op_ret == 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "ownership of %s on %s changed",
- impunge_local->loc.path,
- priv->children[child_index]->name);
- } else {
- gf_log (this->name, GF_LOG_ERROR,
- "setting ownership of %s on %s failed (%s)",
- impunge_local->loc.path,
- priv->children[child_index]->name,
- strerror (op_errno));
- goto out;
- }
-
-#ifdef HAVE_STRUCT_STAT_ST_ATIM_TV_NSEC
- ts[0] = impunge_local->cont.lookup.buf.st_atim;
- ts[1] = impunge_local->cont.lookup.buf.st_mtim;
-#elif HAVE_STRUCT_STAT_ST_ATIMESPEC_TV_NSEC
- ts[0] = impunge_local->cont.lookup.buf.st_atimespec;
- ts[1] = impunge_local->cont.lookup.buf.st_mtimespec;
-#else
- ts[0].tv_sec = impunge_local->cont.lookup.buf.st_atime;
- ts[1].tv_sec = impunge_local->cont.lookup.buf.st_mtime;
-#endif
- STACK_WIND_COOKIE (impunge_frame,
- afr_sh_entry_impunge_utimens_cbk,
- (void *) (long) child_index,
- priv->children[child_index],
- priv->children[child_index]->fops->utimens,
- &impunge_local->loc, ts);
- return 0;
-
-out:
- LOCK (&impunge_frame->lock);
- {
- call_count = --impunge_local->call_count;
- }
- UNLOCK (&impunge_frame->lock);
-
- if (call_count == 0) {
- AFR_STACK_DESTROY (impunge_frame);
- afr_sh_entry_impunge_entry_done (frame, this, active_src);
+ for (i = 0; i < priv->child_count; i++) {
+ if (replies[i].valid && replies[i].op_ret == 0) {
+ source = i;
+ break;
+ }
}
- return 0;
-}
-
-
-int
-afr_sh_entry_impunge_newfile_cbk (call_frame_t *impunge_frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- inode_t *inode, struct stat *stbuf)
-{
- int call_count = 0;
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
- afr_self_heal_t *impunge_sh = NULL;
- call_frame_t *frame = NULL;
- int active_src = 0;
- int child_index = 0;
-
- priv = this->private;
- impunge_local = impunge_frame->local;
- impunge_sh = &impunge_local->self_heal;
- frame = impunge_sh->sh_frame;
-
- child_index = (long) cookie;
-
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "creation of %s on %s failed (%s)",
- impunge_local->loc.path,
- priv->children[child_index]->name,
- strerror (op_errno));
- goto out;
+ if (source == -1) {
+ /* entry got deleted in the mean time? */
+ return 0;
}
- gf_log (this->name, GF_LOG_DEBUG,
- "setting ownership of %s on %s to %d/%d",
- impunge_local->loc.path,
- priv->children[child_index]->name,
- impunge_local->cont.lookup.buf.st_uid,
- impunge_local->cont.lookup.buf.st_gid);
-
- inode->st_mode = stbuf->st_mode;
-
- STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_chown_cbk,
- (void *) (long) child_index,
- priv->children[child_index],
- priv->children[child_index]->fops->chown,
- &impunge_local->loc,
- impunge_local->cont.lookup.buf.st_uid,
- impunge_local->cont.lookup.buf.st_gid);
- return 0;
+ for (i = 0; i < priv->child_count; i++) {
+ if (i == source || !healed_sinks[i])
+ continue;
-out:
- LOCK (&impunge_frame->lock);
- {
- call_count = --impunge_local->call_count;
- }
- UNLOCK (&impunge_frame->lock);
+ if (replies[i].op_errno != ENOENT)
+ continue;
- if (call_count == 0) {
- AFR_STACK_DESTROY (impunge_frame);
- afr_sh_entry_impunge_entry_done (frame, this, active_src);
+ ret = afr_selfheal_recreate_entry (frame, this, i, source,
+ fd->inode, name, inode,
+ replies);
}
- return 0;
-}
-
-
-int
-afr_sh_entry_impunge_mknod (call_frame_t *impunge_frame, xlator_t *this,
- int child_index, struct stat *stbuf)
-{
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
- afr_self_heal_t *impunge_sh = NULL;
-
-
- priv = this->private;
- impunge_local = impunge_frame->local;
- impunge_sh = &impunge_local->self_heal;
-
- gf_log (this->name, GF_LOG_WARNING,
- "creating file %s mode=0%o dev=0x%"GF_PRI_DEV" on %s",
- impunge_local->loc.path,
- stbuf->st_mode, stbuf->st_rdev,
- priv->children[child_index]->name);
-
- STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk,
- (void *) (long) child_index,
- priv->children[child_index],
- priv->children[child_index]->fops->mknod,
- &impunge_local->loc,
- stbuf->st_mode, stbuf->st_rdev);
-
- return 0;
+ return ret;
}
-
-int
-afr_sh_entry_impunge_mkdir (call_frame_t *impunge_frame, xlator_t *this,
- int child_index, struct stat *stbuf)
+static int
+__afr_selfheal_entry_dirent (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ char *name, inode_t *inode, int source,
+ unsigned char *sources, unsigned char *healed_sinks,
+ unsigned char *locked_on, struct afr_reply *replies)
{
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
- afr_self_heal_t *impunge_sh = NULL;
-
+ int ret = -1;
- priv = this->private;
- impunge_local = impunge_frame->local;
- impunge_sh = &impunge_local->self_heal;
-
- gf_log (this->name, GF_LOG_WARNING,
- "creating directory %s mode=0%o on %s",
- impunge_local->loc.path,
- stbuf->st_mode,
- priv->children[child_index]->name);
-
- STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk,
- (void *) (long) child_index,
- priv->children[child_index],
- priv->children[child_index]->fops->mkdir,
- &impunge_local->loc, stbuf->st_mode);
-
- return 0;
+ if (source < 0)
+ ret = __afr_selfheal_merge_dirent (frame, this, fd, name, inode,
+ sources, healed_sinks,
+ locked_on, replies);
+ else
+ ret = __afr_selfheal_heal_dirent (frame, this, fd, name, inode,
+ source, sources, healed_sinks,
+ locked_on, replies);
+ return ret;
}
-int
-afr_sh_entry_impunge_symlink (call_frame_t *impunge_frame, xlator_t *this,
- int child_index, const char *linkname)
+static int
+afr_selfheal_entry_dirent (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ int source, unsigned char *sources,
+ unsigned char *healed_sinks, char *name)
{
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
- afr_self_heal_t *impunge_sh = NULL;
-
+ afr_private_t *priv = NULL;
+ int ret = 0;
+ unsigned char *locked_on = NULL;
+ struct afr_reply *replies = NULL;
+ inode_t *inode = NULL;
priv = this->private;
- impunge_local = impunge_frame->local;
- impunge_sh = &impunge_local->self_heal;
-
- gf_log (this->name, GF_LOG_WARNING,
- "creating symlink %s -> %s on %s",
- impunge_local->loc.path, linkname,
- priv->children[child_index]->name);
- STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk,
- (void *) (long) child_index,
- priv->children[child_index],
- priv->children[child_index]->fops->symlink,
- linkname, &impunge_local->loc);
+ locked_on = alloca0 (priv->child_count);
- return 0;
-}
-
-
-int
-afr_sh_entry_impunge_readlink_cbk (call_frame_t *impunge_frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- const char *linkname)
-{
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
- afr_self_heal_t *impunge_sh = NULL;
- int child_index = -1;
- call_frame_t *frame = NULL;
- int call_count = -1;
- int active_src = -1;
+ replies = alloca0 (priv->child_count * sizeof(*replies));
- priv = this->private;
- impunge_local = impunge_frame->local;
- impunge_sh = &impunge_local->self_heal;
- frame = impunge_sh->sh_frame;
- active_src = impunge_sh->active_source;
-
- child_index = (long) cookie;
-
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "readlink of %s on %s failed (%s)",
- impunge_local->loc.path,
- priv->children[active_src]->name,
- strerror (op_errno));
- goto out;
- }
-
- afr_sh_entry_impunge_symlink (impunge_frame, this, child_index,
- linkname);
- return 0;
-
-out:
- LOCK (&impunge_frame->lock);
+ ret = afr_selfheal_entrylk (frame, this, fd->inode, this->name,
+ name, locked_on);
{
- call_count = --impunge_local->call_count;
- }
- UNLOCK (&impunge_frame->lock);
-
- if (call_count == 0) {
- AFR_STACK_DESTROY (impunge_frame);
- afr_sh_entry_impunge_entry_done (frame, this, active_src);
- }
-
- return 0;
-}
-
-
-int
-afr_sh_entry_impunge_readlink (call_frame_t *impunge_frame, xlator_t *this,
- int child_index, struct stat *stbuf)
-{
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
- afr_self_heal_t *impunge_sh = NULL;
- int active_src = -1;
-
- priv = this->private;
- impunge_local = impunge_frame->local;
- impunge_sh = &impunge_local->self_heal;
- active_src = impunge_sh->active_source;
-
- STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_readlink_cbk,
- (void *) (long) child_index,
- priv->children[active_src],
- priv->children[active_src]->fops->readlink,
- &impunge_local->loc, 4096);
-
- return 0;
-}
-
-
-int
-afr_sh_entry_impunge_recreate_lookup_cbk (call_frame_t *impunge_frame,
- void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- inode_t *inode, struct stat *buf,
- dict_t *xattr)
-{
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
- afr_self_heal_t *impunge_sh = NULL;
- int active_src = 0;
- int type = 0;
- int child_index = 0;
- call_frame_t *frame = NULL;
- int call_count = 0;
-
- priv = this->private;
- impunge_local = impunge_frame->local;
- impunge_sh = &impunge_local->self_heal;
- frame = impunge_sh->sh_frame;
-
- child_index = (long) cookie;
-
- active_src = impunge_sh->active_source;
-
- if (op_ret != 0) {
- gf_log (this->name, GF_LOG_ERROR,
- "looking up %s on %s (for %s) failed (%s)",
- impunge_local->loc.path,
- priv->children[active_src]->name,
- priv->children[child_index]->name,
- strerror (op_errno));
- goto out;
- }
-
- impunge_local->cont.lookup.buf = *buf;
- type = (buf->st_mode & S_IFMT);
-
- switch (type) {
- case S_IFSOCK:
- case S_IFREG:
- case S_IFBLK:
- case S_IFCHR:
- case S_IFIFO:
- afr_sh_entry_impunge_mknod (impunge_frame, this,
- child_index, buf);
- break;
- case S_IFLNK:
- afr_sh_entry_impunge_readlink (impunge_frame, this,
- child_index, buf);
- break;
- case S_IFDIR:
- afr_sh_entry_impunge_mkdir (impunge_frame, this,
- child_index, buf);
- break;
- default:
- gf_log (this->name, GF_LOG_ERROR,
- "%s has unknown file type on %s: 0%o",
- impunge_local->loc.path,
- priv->children[active_src]->name, type);
- goto out;
- break;
- }
-
- return 0;
+ if (ret < 2) {
+ ret = -ENOTCONN;
+ goto unlock;
+ }
-out:
- LOCK (&impunge_frame->lock);
- {
- call_count = --impunge_local->call_count;
- }
- UNLOCK (&impunge_frame->lock);
+ inode = afr_selfheal_unlocked_lookup_on (frame, fd->inode, name,
+ replies, locked_on);
+ if (!inode) {
+ ret = -ENOMEM;
+ goto unlock;
+ }
- if (call_count == 0) {
- AFR_STACK_DESTROY (impunge_frame);
- afr_sh_entry_impunge_entry_done (frame, this, active_src);
+ ret = __afr_selfheal_entry_dirent (frame, this, fd, name, inode,
+ source, sources, healed_sinks,
+ locked_on, replies);
}
-
- return 0;
+unlock:
+ afr_selfheal_unentrylk (frame, this, fd->inode, this->name, name,
+ locked_on);
+ if (inode)
+ inode_unref (inode);
+ return ret;
}
-int
-afr_sh_entry_impunge_recreate (call_frame_t *impunge_frame, xlator_t *this,
- int child_index)
+static int
+afr_selfheal_entry_do_subvol (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ int child, int source, unsigned char *sources,
+ unsigned char *healed_sinks)
{
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
- afr_self_heal_t *impunge_sh = NULL;
- int active_src = 0;
-
+ int ret = 0;
+ gf_dirent_t entries;
+ gf_dirent_t *entry = NULL;
+ off_t offset = 0;
+ call_frame_t *iter_frame = NULL;
+ xlator_t *subvol = NULL;
+ afr_private_t *priv = NULL;
priv = this->private;
- impunge_local = impunge_frame->local;
- impunge_sh = &impunge_local->self_heal;
+ subvol = priv->children[child];
- active_src = impunge_sh->active_source;
+ INIT_LIST_HEAD (&entries.list);
- STACK_WIND_COOKIE (impunge_frame,
- afr_sh_entry_impunge_recreate_lookup_cbk,
- (void *) (long) child_index,
- priv->children[active_src],
- priv->children[active_src]->fops->lookup,
- &impunge_local->loc, 0);
+ iter_frame = afr_copy_frame (frame);
+ if (!iter_frame)
+ return -ENOMEM;
- return 0;
-}
+ while ((ret = syncop_readdir (subvol, fd, 131072, offset, &entries))) {
+ if (ret > 0)
+ ret = 0;
+ list_for_each_entry (entry, &entries.list, list) {
+ offset = entry->d_off;
+ if (!strcmp (entry->d_name, ".") ||
+ !strcmp (entry->d_name, ".."))
+ continue;
-int
-afr_sh_entry_impunge_entry_cbk (call_frame_t *impunge_frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- inode_t *inode, struct stat *buf, dict_t *x)
-{
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
- afr_self_heal_t *impunge_sh = NULL;
- int call_count = 0;
- int child_index = 0;
- call_frame_t *frame = NULL;
- int active_src = 0;
-
- priv = this->private;
- impunge_local = impunge_frame->local;
- impunge_sh = &impunge_local->self_heal;
- frame = impunge_sh->sh_frame;
- child_index = (long) cookie;
- active_src = impunge_sh->active_source;
-
- if (op_ret == -1 && op_errno == ENOENT) {
- /* decrease call_count in recreate-callback */
- gf_log (this->name, GF_LOG_DEBUG,
- "missing entry %s on %s",
- impunge_local->loc.path,
- priv->children[child_index]->name);
-
- afr_sh_entry_impunge_recreate (impunge_frame, this,
- child_index);
- return 0;
- }
+ if (__is_root_gfid (fd->inode->gfid) &&
+ !strcmp (entry->d_name, GF_REPLICATE_TRASH_DIR))
+ continue;
- if (op_ret == 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "%s exists under %s",
- impunge_local->loc.path,
- priv->children[child_index]->name);
- } else {
- gf_log (this->name, GF_LOG_ERROR,
- "looking up %s under %s failed (%s)",
- impunge_local->loc.path,
- priv->children[child_index]->name,
- strerror (op_errno));
- }
+ ret = afr_selfheal_entry_dirent (iter_frame, this, fd,
+ source, sources,
+ healed_sinks,
+ entry->d_name);
+ AFR_STACK_RESET (iter_frame);
- LOCK (&impunge_frame->lock);
- {
- call_count = --impunge_local->call_count;
- }
- UNLOCK (&impunge_frame->lock);
+ if (ret)
+ break;
+ }
- if (call_count == 0) {
- AFR_STACK_DESTROY (impunge_frame);
- afr_sh_entry_impunge_entry_done (frame, this, active_src);
+ gf_dirent_free (&entries);
+ if (ret)
+ break;
}
- return 0;
+ AFR_STACK_DESTROY (iter_frame);
+ return ret;
}
-
-int
-afr_sh_entry_impunge_entry (call_frame_t *frame, xlator_t *this,
- char *name)
+static int
+afr_selfheal_entry_do (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ int source, unsigned char *sources,
+ unsigned char *healed_sinks,
+ struct afr_reply *locked_replies)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- int ret = -1;
- call_frame_t *impunge_frame = NULL;
- afr_local_t *impunge_local = NULL;
- afr_self_heal_t *impunge_sh = NULL;
- int active_src = 0;
- int i = 0;
- int call_count = 0;
- int op_errno = 0;
+ int i = 0;
+ afr_private_t *priv = NULL;
+ int ret = 0;
priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- active_src = sh->active_source;
-
- if ((strcmp (name, ".") == 0)
- || (strcmp (name, "..") == 0)) {
- gf_log (this->name, GF_LOG_DEBUG,
- "skipping inspection of %s under %s",
- name, local->loc.path);
- goto out;
- }
-
- gf_log (this->name, GF_LOG_DEBUG,
- "inspecting existance of %s under %s",
- name, local->loc.path);
-
- impunge_frame = copy_frame (frame);
- if (!impunge_frame) {
- gf_log (this->name, GF_LOG_ERROR,
- "out of memory :(");
- goto out;
- }
-
- ALLOC_OR_GOTO (impunge_local, afr_local_t, out);
-
- impunge_frame->local = impunge_local;
- impunge_sh = &impunge_local->self_heal;
- impunge_sh->sh_frame = frame;
- impunge_sh->active_source = active_src;
- ret = build_child_loc (this, &impunge_local->loc, &local->loc, name);
- if (ret != 0) {
- goto out;
- }
-
- for (i = 0; i < priv->child_count; i++) {
- if (i == active_src)
- continue;
- if (local->child_up[i] == 0)
- continue;
- if (sh->sources[i] == 1)
- continue;
- call_count++;
- }
-
- impunge_local->call_count = call_count;
+ gf_log (this->name, GF_LOG_INFO, "performing entry selfheal on %s",
+ uuid_utoa (fd->inode->gfid));
for (i = 0; i < priv->child_count; i++) {
- if (i == active_src)
- continue;
- if (local->child_up[i] == 0)
+ if (i != source && !healed_sinks[i])
continue;
- if (sh->sources[i] == 1)
- continue;
-
- gf_log (this->name, GF_LOG_DEBUG,
- "looking up %s on %s", impunge_local->loc.path,
- priv->children[i]->name);
-
- STACK_WIND_COOKIE (impunge_frame,
- afr_sh_entry_impunge_entry_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->lookup,
- &impunge_local->loc, 0);
-
- if (!--call_count)
+ ret = afr_selfheal_entry_do_subvol (frame, this, fd, i, source,
+ sources, healed_sinks);
+ if (ret)
break;
}
-
- ret = 0;
-out:
- if (ret == -1)
- afr_sh_entry_impunge_entry_done (frame, this, active_src);
-
- return 0;
-}
-
-
-int
-afr_sh_entry_impunge_readdir_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- gf_dirent_t *entries)
-{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- gf_dirent_t *entry = NULL;
- off_t last_offset = 0;
- int active_src = 0;
- int entry_count = 0;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- active_src = sh->active_source;
-
- if (op_ret <= 0) {
- if (op_ret < 0) {
- gf_log (this->name, GF_LOG_ERROR,
- "readdir of %s on subvolume %s failed (%s)",
- local->loc.path,
- priv->children[active_src]->name,
- strerror (op_errno));
- } else {
- gf_log (this->name, GF_LOG_DEBUG,
- "readdir of %s on subvolume %s complete",
- local->loc.path,
- priv->children[active_src]->name);
- }
-
- afr_sh_entry_impunge_all (frame, this);
- return 0;
- }
-
- list_for_each_entry (entry, &entries->list, list) {
- last_offset = entry->d_off;
- entry_count++;
- }
-
- gf_log (this->name, GF_LOG_DEBUG,
- "readdir'ed %d entries from %s",
- entry_count, priv->children[active_src]->name);
-
- sh->offset = last_offset;
- local->call_count = entry_count;
-
- list_for_each_entry (entry, &entries->list, list) {
- afr_sh_entry_impunge_entry (frame, this, entry->d_name);
- }
-
- return 0;
-}
-
-
-int
-afr_sh_entry_impunge_subvol (call_frame_t *frame, xlator_t *this,
- int active_src)
-{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- STACK_WIND (frame, afr_sh_entry_impunge_readdir_cbk,
- priv->children[active_src],
- priv->children[active_src]->fops->readdir,
- sh->healing_fd, sh->block_size, sh->offset);
-
- return 0;
-}
-
-
-int
-afr_sh_entry_impunge_all (call_frame_t *frame, xlator_t *this)
-{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- int active_src = -1;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- sh->offset = 0;
-
- active_src = next_active_source (frame, this, sh->active_source);
- sh->active_source = active_src;
-
- if (sh->op_failed) {
- afr_sh_entry_finish (frame, this);
- return 0;
- }
-
- if (active_src == -1) {
- /* completed creating missing files on all subvolumes */
- afr_sh_entry_expunge_all (frame, this);
- return 0;
- }
-
- gf_log (this->name, GF_LOG_DEBUG,
- "impunging entries of %s on %s to other sinks",
- local->loc.path, priv->children[active_src]->name);
-
- afr_sh_entry_impunge_subvol (frame, this, active_src);
-
- return 0;
-}
-
-
-int
-afr_sh_entry_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, fd_t *fd)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int call_count = 0;
- int child_index = 0;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- child_index = (long) cookie;
-
- /* TODO: some of the open's might fail.
- In that case, modify cleanup fn to send flush on those
- fd's which are already open */
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "opendir of %s failed on child %s (%s)",
- local->loc.path,
- priv->children[child_index]->name,
- strerror (op_errno));
- sh->op_failed = 1;
- }
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- if (sh->op_failed) {
- afr_sh_entry_finish (frame, this);
- return 0;
- }
- gf_log (this->name, GF_LOG_DEBUG,
- "fd for %s opened, commencing sync",
- local->loc.path);
-
- sh->active_source = -1;
- afr_sh_entry_impunge_all (frame, this);
- }
-
- return 0;
+ return ret;
}
-int
-afr_sh_entry_open (call_frame_t *frame, xlator_t *this)
+static int
+__afr_selfheal_entry_finalize_source (xlator_t *this, unsigned char *sources,
+ unsigned char *sinks,
+ unsigned char *locked_on,
+ struct afr_reply *replies)
{
- int i = 0;
- int call_count = 0;
-
+ int i = 0;
+ afr_private_t *priv = NULL;
int source = -1;
- int *sources = NULL;
-
- fd_t *fd = NULL;
+ int locked_count = 0;
+ int sources_count = 0;
+ int sinks_count = 0;
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
priv = this->private;
- source = local->self_heal.source;
- sources = local->self_heal.sources;
-
- sh->block_size = 131072;
- sh->offset = 0;
-
- call_count = sh->active_sinks;
- if (source != -1)
- call_count++;
-
- local->call_count = call_count;
-
- fd = fd_create (local->loc.inode, frame->root->pid);
- sh->healing_fd = fd;
+ locked_count = AFR_COUNT (locked_on, priv->child_count);
+ sources_count = AFR_COUNT (sources, priv->child_count);
+ sinks_count = AFR_COUNT (sinks, priv->child_count);
- if (source != -1) {
- gf_log (this->name, GF_LOG_DEBUG,
- "opening directory %s on subvolume %s (source)",
- local->loc.path, priv->children[source]->name);
-
- /* open source */
- STACK_WIND_COOKIE (frame, afr_sh_entry_opendir_cbk,
- (void *) (long) source,
- priv->children[source],
- priv->children[source]->fops->opendir,
- &local->loc, fd);
- call_count--;
+ if (locked_count == sinks_count || !sources_count) {
+ return -1;
}
- /* open sinks */
for (i = 0; i < priv->child_count; i++) {
- if (sources[i] || !local->child_up[i])
- continue;
-
- gf_log (this->name, GF_LOG_DEBUG,
- "opening directory %s on subvolume %s (sink)",
- local->loc.path, priv->children[i]->name);
-
- STACK_WIND_COOKIE (frame, afr_sh_entry_opendir_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->opendir,
- &local->loc, fd);
-
- if (!--call_count)
+ if (sources[i]) {
+ source = i;
break;
- }
-
- return 0;
-}
-
-
-int
-afr_sh_entry_sync_prepare (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int active_sinks = 0;
- int source = 0;
- int i = 0;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- source = sh->source;
-
- for (i = 0; i < priv->child_count; i++) {
- if (sh->sources[i] == 0 && local->child_up[i] == 1) {
- active_sinks++;
- sh->success[i] = 1;
}
}
- if (source != -1)
- sh->success[source] = 1;
-
- if (active_sinks == 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no active sinks for self-heal on dir %s",
- local->loc.path);
- afr_sh_entry_finish (frame, this);
- return 0;
- }
- if (source == -1 && active_sinks < 2) {
- gf_log (this->name, GF_LOG_WARNING,
- "cannot sync with 0 sources and 1 sink on dir %s",
- local->loc.path);
- afr_sh_entry_finish (frame, this);
- return 0;
- }
- sh->active_sinks = active_sinks;
- if (source != -1)
- gf_log (this->name, GF_LOG_DEBUG,
- "syncing %s from subvolume %s to %d active sinks",
- local->loc.path, priv->children[source]->name,
- active_sinks);
- else
- gf_log (this->name, GF_LOG_DEBUG,
- "no active sources for %s found. "
- "merging all entries as a conservative decision",
- local->loc.path);
-
- afr_sh_entry_open (frame, this);
-
- return 0;
+ return source;
}
-int
-afr_sh_entry_fix (call_frame_t *frame, xlator_t *this)
+static int
+__afr_selfheal_entry_prepare (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ unsigned char *locked_on, unsigned char *sources,
+ unsigned char *sinks, unsigned char *healed_sinks,
+ struct afr_reply *replies, int *source_p)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int source = 0;
+ int ret = -1;
+ int source = -1;
+ afr_private_t *priv = NULL;
+ int i = 0;
- local = frame->local;
- sh = &local->self_heal;
priv = this->private;
- afr_sh_build_pending_matrix (priv, sh->pending_matrix, sh->xattr,
- priv->child_count, AFR_ENTRY_TRANSACTION);
-
- afr_sh_print_pending_matrix (sh->pending_matrix, this);
+ ret = afr_selfheal_unlocked_discover (frame, fd->inode, fd->inode->gfid,
+ replies);
+ if (ret)
+ return ret;
- afr_sh_mark_sources (sh, priv->child_count,
- AFR_SELF_HEAL_ENTRY);
+ ret = afr_selfheal_find_direction (frame, this, replies,
+ AFR_ENTRY_TRANSACTION,
+ locked_on, sources, sinks);
+ if (ret)
+ return ret;
- afr_sh_supress_errenous_children (sh->sources, sh->child_errno,
- priv->child_count);
+ source = __afr_selfheal_entry_finalize_source (this, sources, sinks,
+ locked_on, replies);
+ if (source < 0) {
+ /* If source is < 0 (typically split-brain), we perform a
+ conservative merge of entries rather than erroring out */
+ }
+ *source_p = source;
- source = afr_sh_select_source (sh->sources, priv->child_count);
- sh->source = source;
+ for (i = 0; i < priv->child_count; i++)
+ /* Initialize the healed_sinks[] array optimistically to
+ the intersection of to-be-healed (i.e sinks[]) and
+ the list of servers which are up (i.e locked_on[]).
- afr_sh_entry_sync_prepare (frame, this);
+ As we encounter failures in the healing process, we
+ will unmark the respective servers in the healed_sinks[]
+ array.
+ */
+ healed_sinks[i] = sinks[i] && locked_on[i];
- return 0;
+ return ret;
}
+static int
+__afr_selfheal_entry (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ unsigned char *locked_on)
+{
+ afr_private_t *priv = NULL;
+ int ret = -1;
+ unsigned char *sources = NULL;
+ unsigned char *sinks = NULL;
+ unsigned char *data_lock = NULL;
+ unsigned char *healed_sinks = NULL;
+ struct afr_reply *locked_replies = NULL;
+ int source = -1;
-int
-afr_sh_entry_lookup_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- inode_t *inode, struct stat *buf, dict_t *xattr)
-{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
+ priv = this->private;
- int call_count = -1;
- int child_index = (long) cookie;
+ sources = alloca0 (priv->child_count);
+ sinks = alloca0 (priv->child_count);
+ healed_sinks = alloca0 (priv->child_count);
+ data_lock = alloca0 (priv->child_count);
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
+ locked_replies = alloca0 (sizeof (*locked_replies) * priv->child_count);
- LOCK (&frame->lock);
+ ret = afr_selfheal_entrylk (frame, this, fd->inode, this->name, NULL,
+ data_lock);
{
- if (op_ret != -1) {
- sh->xattr[child_index] = dict_ref (xattr);
- sh->buf[child_index] = *buf;
+ if (ret < 2) {
+ ret = -ENOTCONN;
+ goto unlock;
}
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
- if (call_count == 0) {
- afr_sh_entry_fix (frame, this);
+ ret = __afr_selfheal_entry_prepare (frame, this, fd, data_lock,
+ sources, sinks, healed_sinks,
+ locked_replies, &source);
}
+unlock:
+ afr_selfheal_unentrylk (frame, this, fd->inode, this->name, NULL,
+ data_lock);
+ if (ret < 0)
+ goto out;
- return 0;
-}
-
-
-
-int
-afr_sh_entry_lookup (call_frame_t *frame, xlator_t *this)
-{
- afr_self_heal_t * sh = NULL;
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
- dict_t *xattr_req = NULL;
- int ret = 0;
- int call_count = 0;
- int i = 0;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- call_count = local->child_count;
-
- local->call_count = call_count;
-
- xattr_req = dict_new();
- if (xattr_req) {
- for (i = 0; i < priv->child_count; i++) {
- ret = dict_set_uint64 (xattr_req,
- priv->pending_key[i],
- 3 * sizeof(int32_t));
- }
- }
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame,
- afr_sh_entry_lookup_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->lookup,
- &local->loc, xattr_req);
- if (!--call_count)
- break;
- }
- }
-
- if (xattr_req)
- dict_unref (xattr_req);
+ ret = afr_selfheal_entry_do (frame, this, fd, source, sources,
+ healed_sinks, locked_replies);
+ if (ret)
+ goto out;
- return 0;
+ ret = afr_selfheal_undo_pending (frame, this, fd->inode, sources, sinks,
+ healed_sinks, AFR_ENTRY_TRANSACTION,
+ locked_replies, data_lock);
+out:
+ return ret;
}
-
-int
-afr_sh_entry_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
+static fd_t *
+afr_selfheal_data_opendir (xlator_t *this, inode_t *inode)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- int call_count = 0;
- int child_index = (long) cookie;
-
- /* TODO: what if lock fails? */
-
- local = frame->local;
- sh = &local->self_heal;
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- sh->op_failed = 1;
-
- gf_log (this->name,
- (op_errno == EAGAIN ? GF_LOG_DEBUG : GF_LOG_ERROR),
- "locking inode of %s on child %d failed: %s",
- local->loc.path, child_index,
- strerror (op_errno));
- } else {
- gf_log (this->name, GF_LOG_DEBUG,
- "inode of %s on child %d locked",
- local->loc.path, child_index);
- }
- }
- UNLOCK (&frame->lock);
+ loc_t loc = {0,};
+ int ret = 0;
+ fd_t *fd = NULL;
- call_count = afr_frame_return (frame);
+ fd = fd_create (inode, 0);
+ if (!fd)
+ return NULL;
- if (call_count == 0) {
- if (sh->op_failed == 1) {
- afr_sh_entry_finish (frame, this);
- return 0;
- }
+ loc.inode = inode_ref (inode);
+ uuid_copy (loc.gfid, inode->gfid);
- afr_sh_entry_lookup (frame, this);
+ ret = syncop_opendir (this, &loc, fd);
+ if (ret) {
+ fd_unref (fd);
+ fd = NULL;
+ } else {
+ fd_bind (fd);
}
- return 0;
+ loc_wipe (&loc);
+ return fd;
}
int
-afr_sh_entry_lock (call_frame_t *frame, xlator_t *this)
+afr_selfheal_entry (call_frame_t *frame, xlator_t *this, inode_t *inode)
{
- int i = 0;
- int call_count = 0;
-
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
- afr_self_heal_t * sh = NULL;
-
+ afr_private_t *priv = NULL;
+ unsigned char *locked_on = NULL;
+ fd_t *fd = NULL;
+ int ret = 0;
- local = frame->local;
- sh = &local->self_heal;
priv = this->private;
- call_count = local->child_count;
+ fd = afr_selfheal_data_opendir (this, inode);
+ if (!fd)
+ return -EIO;
- local->call_count = call_count;
+ locked_on = alloca0 (priv->child_count);
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- gf_log (this->name, GF_LOG_DEBUG,
- "locking %s on subvolume %s",
- local->loc.path, priv->children[i]->name);
-
- STACK_WIND_COOKIE (frame, afr_sh_entry_lock_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->entrylk,
- this->name,
- &local->loc, NULL,
- ENTRYLK_LOCK_NB, ENTRYLK_WRLCK);
- if (!--call_count)
- break;
+ ret = afr_selfheal_tryentrylk (frame, this, inode, priv->sh_domain, NULL,
+ locked_on);
+ {
+ if (ret < 2) {
+ /* Either less than two subvols available, or another
+ selfheal (from another server) is in progress. Skip
+ for now in any case there isn't anything to do.
+ */
+ ret = -ENOTCONN;
+ goto unlock;
}
- }
-
- return 0;
-}
-
-int
-afr_self_heal_entry (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
-
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- if (local->need_entry_self_heal && priv->entry_self_heal) {
- afr_sh_entry_lock (frame, this);
- } else {
- gf_log (this->name, GF_LOG_DEBUG,
- "proceeding to completion on %s",
- local->loc.path);
- afr_sh_entry_done (frame, this);
+ ret = __afr_selfheal_entry (frame, this, fd, locked_on);
}
+unlock:
+ afr_selfheal_unentrylk (frame, this, inode, priv->sh_domain, NULL, locked_on);
- return 0;
-}
+ if (fd)
+ fd_unref (fd);
+ return ret;
+}
diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c
index 8e832698f..83628297f 100644
--- a/xlators/cluster/afr/src/afr-self-heal-metadata.c
+++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c
@@ -1,808 +1,280 @@
/*
- Copyright (c) 2008-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
-#include <libgen.h>
-#include <unistd.h>
-#include <fnmatch.h>
-#include <sys/time.h>
-#include <stdlib.h>
-#include <signal.h>
#ifndef _CONFIG_H
#define _CONFIG_H
#include "config.h"
#endif
-#include "glusterfs.h"
#include "afr.h"
-#include "dict.h"
-#include "xlator.h"
-#include "hashfn.h"
-#include "logging.h"
-#include "stack.h"
-#include "list.h"
-#include "call-stub.h"
-#include "defaults.h"
-#include "common-utils.h"
-#include "compat-errno.h"
-#include "compat.h"
-#include "byte-order.h"
-
-#include "afr-transaction.h"
#include "afr-self-heal.h"
-#include "afr-self-heal-common.h"
-
-
-int
-afr_sh_metadata_done (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int i = 0;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
-// memset (sh->child_errno, 0, sizeof (int) * priv->child_count);
- memset (sh->buf, 0, sizeof (struct stat) * priv->child_count);
- memset (sh->success, 0, sizeof (int) * priv->child_count);
-
- for (i = 0; i < priv->child_count; i++) {
- if (sh->xattr[i])
- dict_unref (sh->xattr[i]);
- sh->xattr[i] = NULL;
- }
-
- if (local->govinda_gOvinda) {
- gf_log (this->name, GF_LOG_WARNING,
- "aborting selfheal of %s",
- local->loc.path);
- sh->completion_cbk (frame, this);
- } else {
- if (S_ISREG (local->cont.lookup.buf.st_mode)) {
- gf_log (this->name, GF_LOG_DEBUG,
- "proceeding to data check on %s",
- local->loc.path);
- afr_self_heal_data (frame, this);
- return 0;
- }
-
- if (S_ISDIR (local->cont.lookup.buf.st_mode)) {
- gf_log (this->name, GF_LOG_DEBUG,
- "proceeding to entry check on %s",
- local->loc.path);
- afr_self_heal_entry (frame, this);
- return 0;
- }
- gf_log (this->name, GF_LOG_DEBUG,
- "completed self heal of %s",
- local->loc.path);
-
- sh->completion_cbk (frame, this);
- }
-
- return 0;
-}
-
-
-int
-afr_sh_metadata_unlck_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
-{
- afr_local_t *local = NULL;
- int call_count = 0;
-
-
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0)
- afr_sh_metadata_done (frame, this);
-
- return 0;
-}
-
-
-int
-afr_sh_metadata_finish (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int i = 0;
- int call_count = 0;
- struct flock flock = {0, };
-
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- call_count = local->child_count;
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- flock.l_start = 0;
- flock.l_len = 0;
- flock.l_type = F_UNLCK;
-
- if (local->child_up[i]) {
- gf_log (this->name, GF_LOG_DEBUG,
- "unlocking %s on subvolume %s",
- local->loc.path, priv->children[i]->name);
-
- STACK_WIND (frame, afr_sh_metadata_unlck_cbk,
- priv->children[i],
- priv->children[i]->fops->inodelk,
- this->name,
- &local->loc, F_SETLK, &flock);
-
- if (!--call_count)
- break;
- }
- }
-
- return 0;
-}
-
-
-int
-afr_sh_metadata_erase_pending_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret,
- int32_t op_errno, dict_t *xattr)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int call_count = 0;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- LOCK (&frame->lock);
- {
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0)
- afr_sh_metadata_finish (frame, this);
-
- return 0;
-}
+#include "byte-order.h"
+#define AFR_HEAL_ATTR (GF_SET_ATTR_UID|GF_SET_ATTR_GID|GF_SET_ATTR_MODE)
int
-afr_sh_metadata_erase_pending (call_frame_t *frame, xlator_t *this)
+afr_selfheal_metadata_do (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ int source, unsigned char *healed_sinks,
+ struct afr_reply *locked_replies)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int call_count = 0;
- int i = 0;
- dict_t **erase_xattr = NULL;
+ int ret = -1;
+ loc_t loc = {0,};
+ dict_t *xattr = NULL;
+ dict_t *old_xattr = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
-
- local = frame->local;
- sh = &local->self_heal;
priv = this->private;
- afr_sh_pending_to_delta (priv, sh->xattr, sh->delta_matrix,
- sh->success, priv->child_count,
- AFR_METADATA_TRANSACTION);
-
- erase_xattr = CALLOC (sizeof (*erase_xattr), priv->child_count);
+ loc.inode = inode_ref (inode);
+ uuid_copy (loc.gfid, inode->gfid);
- for (i = 0; i < priv->child_count; i++) {
- if (sh->xattr[i]) {
- call_count++;
+ gf_log (this->name, GF_LOG_INFO, "performing metadata selfheal on %s",
+ uuid_utoa (inode->gfid));
- erase_xattr[i] = get_new_dict();
- dict_ref (erase_xattr[i]);
- }
+ ret = syncop_getxattr (priv->children[source], &loc, &xattr, NULL);
+ if (ret < 0) {
+ loc_wipe (&loc);
+ return -EIO;
}
- afr_sh_delta_to_xattr (priv, sh->delta_matrix, erase_xattr,
- priv->child_count, AFR_METADATA_TRANSACTION);
-
- local->call_count = call_count;
-
- if (call_count == 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "metadata of %s not healed on any subvolume",
- local->loc.path);
-
- afr_sh_metadata_finish (frame, this);
- }
+ afr_filter_xattrs (xattr);
+ dict_del (xattr, GF_SELINUX_XATTR_KEY);
for (i = 0; i < priv->child_count; i++) {
- if (!erase_xattr[i])
+ if (!healed_sinks[i])
continue;
- gf_log (this->name, GF_LOG_DEBUG,
- "erasing pending flags from %s on %s",
- local->loc.path, priv->children[i]->name);
-
- STACK_WIND_COOKIE (frame, afr_sh_metadata_erase_pending_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->xattrop,
- &local->loc,
- GF_XATTROP_ADD_ARRAY, erase_xattr[i]);
- if (!--call_count)
- break;
- }
-
- for (i = 0; i < priv->child_count; i++) {
- if (erase_xattr[i]) {
- dict_unref (erase_xattr[i]);
+ ret = syncop_setattr (priv->children[i], &loc,
+ &locked_replies[source].poststat,
+ AFR_HEAL_ATTR, NULL, NULL);
+ if (ret)
+ healed_sinks[i] = 0;
+
+ old_xattr = NULL;
+ ret = syncop_getxattr (priv->children[i], &loc, &old_xattr, 0);
+ if (old_xattr) {
+ dict_del (old_xattr, GF_SELINUX_XATTR_KEY);
+ afr_filter_xattrs (old_xattr);
+ ret = syncop_removexattr (priv->children[i], &loc, "",
+ old_xattr);
}
- }
- FREE (erase_xattr);
- return 0;
-}
-
-
-int
-afr_sh_metadata_sync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int call_count = 0;
- int child_index = 0;
-
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- child_index = (long) cookie;
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "setting attributes failed for %s on %s (%s)",
- local->loc.path,
- priv->children[child_index]->name,
- strerror (op_errno));
-
- sh->success[child_index] = 0;
- }
+ ret = syncop_setxattr (priv->children[i], &loc, xattr, 0);
+ if (ret)
+ healed_sinks[i] = 0;
}
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0)
- afr_sh_metadata_erase_pending (frame, this);
-
- return 0;
-}
-
-
-int
-afr_sh_metadata_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct stat *buf)
-{
- afr_sh_metadata_sync_cbk (frame, cookie, this, op_ret, op_errno);
-
- return 0;
-}
-
-
-int
-afr_sh_metadata_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
-{
- afr_sh_metadata_sync_cbk (frame, cookie, this, op_ret, op_errno);
-
- return 0;
-}
-
-int
-afr_sh_metadata_sync (call_frame_t *frame, xlator_t *this, dict_t *xattr)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int source = 0;
- int active_sinks = 0;
- int call_count = 0;
- int i = 0;
- struct timespec ts[2];
-
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- source = sh->source;
- active_sinks = sh->active_sinks;
-
- /*
- * 4 calls per sink - chown, chmod, utimes, setxattr
- */
+ loc_wipe (&loc);
if (xattr)
- call_count = active_sinks * 4;
- else
- call_count = active_sinks * 3;
-
- local->call_count = call_count;
-
-#ifdef HAVE_STRUCT_STAT_ST_ATIM_TV_NSEC
- ts[0] = sh->buf[source].st_atim;
- ts[1] = sh->buf[source].st_mtim;
-#elif HAVE_STRUCT_STAT_ST_ATIMESPEC_TV_NSEC
- ts[0] = sh->buf[source].st_atimespec;
- ts[1] = sh->buf[source].st_mtimespec;
-#else
- ts[0].tv_sec = sh->buf[source].st_atime;
- ts[1].tv_sec = sh->buf[source].st_mtime;
-#endif
-
- for (i = 0; i < priv->child_count; i++) {
- if (call_count == 0) {
- break;
- }
- if (sh->sources[i] || !local->child_up[i])
- continue;
-
- gf_log (this->name, GF_LOG_DEBUG,
- "syncing metadata of %s from %s to %s",
- local->loc.path, priv->children[source]->name,
- priv->children[i]->name);
-
- STACK_WIND_COOKIE (frame, afr_sh_metadata_attr_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->chown,
- &local->loc,
- sh->buf[source].st_uid,
- sh->buf[source].st_gid);
-
- STACK_WIND_COOKIE (frame, afr_sh_metadata_attr_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->chmod,
- &local->loc, sh->buf[source].st_mode);
-
- STACK_WIND_COOKIE (frame, afr_sh_metadata_attr_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->utimens,
- &local->loc, ts);
-
- call_count = call_count - 3;
-
- if (!xattr)
- continue;
-
- STACK_WIND_COOKIE (frame, afr_sh_metadata_xattr_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->setxattr,
- &local->loc, xattr, 0);
- call_count--;
- }
+ dict_unref (xattr);
return 0;
}
-int
-afr_sh_metadata_getxattr_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xattr)
+/*
+ * Look for mismatching uid/gid or mode even if xattrs don't say so, and
+ * pick one arbitrarily as winner.
+ */
+
+static int
+__afr_selfheal_metadata_finalize_source (xlator_t *this, unsigned char *sources,
+ unsigned char *sinks,
+ unsigned char *locked_on,
+ struct afr_reply *replies)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int source = 0;
-
- int i;
+ int i = 0;
+ afr_private_t *priv = NULL;
+ struct iatt first = {0, };
+ int source = -1;
+ int locked_count = 0;
+ int sources_count = 0;
+ int sinks_count = 0;
- local = frame->local;
- sh = &local->self_heal;
priv = this->private;
- source = sh->source;
+ locked_count = AFR_COUNT (locked_on, priv->child_count);
+ sources_count = AFR_COUNT (sources, priv->child_count);
+ sinks_count = AFR_COUNT (sinks, priv->child_count);
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "getxattr of %s failed on subvolume %s (%s). proceeding without xattr",
- local->loc.path, priv->children[source]->name,
- strerror (op_errno));
-
- afr_sh_metadata_sync (frame, this, NULL);
- } else {
- for (i = 0; i < priv->child_count; i++) {
- dict_del (xattr, priv->pending_key[i]);
- }
-
- afr_sh_metadata_sync (frame, this, xattr);
+ if (locked_count == sinks_count || !sources_count) {
+ if (!priv->metadata_splitbrain_forced_heal) {
+ return -EIO;
+ }
+ /* Metadata split brain, select one subvol
+ arbitrarily */
+ for (i = 0; i < priv->child_count; i++) {
+ if (locked_on[i] && sinks[i]) {
+ sources[i] = 1;
+ sinks[i] = 0;
+ break;
+ }
+ }
}
- return 0;
-}
-
-
-int
-afr_sh_metadata_sync_prepare (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int active_sinks = 0;
- int source = 0;
- int i = 0;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- source = sh->source;
-
for (i = 0; i < priv->child_count; i++) {
- if (sh->sources[i] == 0 && local->child_up[i] == 1) {
- active_sinks++;
- sh->success[i] = 1;
+ if (!sources[i])
+ continue;
+ if (source == -1) {
+ source = i;
+ first = replies[i].poststat;
}
}
- sh->success[source] = 1;
-
- if (active_sinks == 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no active sinks for performing self-heal on file %s",
- local->loc.path);
- afr_sh_metadata_finish (frame, this);
- return 0;
- }
- sh->active_sinks = active_sinks;
-
- gf_log (this->name, GF_LOG_DEBUG,
- "syncing metadata of %s from subvolume %s to %d active sinks",
- local->loc.path, priv->children[source]->name, active_sinks);
-
- STACK_WIND (frame, afr_sh_metadata_getxattr_cbk,
- priv->children[source],
- priv->children[source]->fops->getxattr,
- &local->loc, NULL);
-
- return 0;
-}
-
-
-int
-afr_sh_metadata_fix (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int nsources = 0;
- int source = 0;
- int i = 0;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- afr_sh_build_pending_matrix (priv, sh->pending_matrix, sh->xattr,
- priv->child_count,
- AFR_METADATA_TRANSACTION);
-
- afr_sh_print_pending_matrix (sh->pending_matrix, this);
-
- nsources = afr_sh_mark_sources (sh, priv->child_count,
- AFR_SELF_HEAL_METADATA);
-
- afr_sh_supress_errenous_children (sh->sources, sh->child_errno,
- priv->child_count);
-
- if (nsources == 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "No self-heal needed for %s",
- local->loc.path);
-
- afr_sh_metadata_finish (frame, this);
- return 0;
- }
-
- if ((nsources == -1)
- && (priv->favorite_child != -1)
- && (sh->child_errno[priv->favorite_child] == 0)) {
-
- gf_log (this->name, GF_LOG_WARNING,
- "Picking favorite child %s as authentic source to resolve conflicting metadata of %s",
- priv->children[priv->favorite_child]->name,
- local->loc.path);
-
- sh->sources[priv->favorite_child] = 1;
-
- nsources = afr_sh_source_count (sh->sources,
- priv->child_count);
- }
- if (nsources == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "Unable to resolve conflicting metadata of %s. "
- "Please resolve manually by fixing the "
- "permissions/ownership of %s on your subvolumes. "
- "You can also consider 'option favorite-child <>'",
- local->loc.path, local->loc.path);
-
- local->govinda_gOvinda = 1;
-
- afr_sh_metadata_finish (frame, this);
- return 0;
- }
-
- source = afr_sh_select_source (sh->sources, priv->child_count);
- sh->source = source;
-
- /* detect changes not visible through pending flags -- JIC */
for (i = 0; i < priv->child_count; i++) {
- if (i == source || sh->child_errno[i])
+ if (!sources[i])
continue;
-
- if (PERMISSION_DIFFERS (&sh->buf[i], &sh->buf[source]))
- sh->sources[i] = 0;
-
- if (OWNERSHIP_DIFFERS (&sh->buf[i], &sh->buf[source]))
- sh->sources[i] = 0;
+ if (!IA_EQUAL (first, replies[i].poststat, type) ||
+ !IA_EQUAL (first, replies[i].poststat, uid) ||
+ !IA_EQUAL (first, replies[i].poststat, gid) ||
+ !IA_EQUAL (first, replies[i].poststat, prot)) {
+ sources[i] = 0;
+ sinks[i] = 1;
+ }
}
- afr_sh_metadata_sync_prepare (frame, this);
-
- return 0;
+ return source;
}
-int
-afr_sh_metadata_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- inode_t *inode, struct stat *buf, dict_t *xattr)
+static int
+__afr_selfheal_metadata_prepare (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ unsigned char *locked_on, unsigned char *sources,
+ unsigned char *sinks, unsigned char *healed_sinks,
+ struct afr_reply *replies)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int call_count = 0;
- int child_index = 0;
+ int ret = -1;
+ int source = -1;
+ afr_private_t *priv = NULL;
+ int i = 0;
-
- local = frame->local;
- sh = &local->self_heal;
priv = this->private;
- child_index = (long) cookie;
-
- LOCK (&frame->lock);
- {
- if (op_ret == 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "path %s on subvolume %s is of mode 0%o",
- local->loc.path,
- priv->children[child_index]->name,
- buf->st_mode);
-
- sh->buf[child_index] = *buf;
- if (xattr)
- sh->xattr[child_index] = dict_ref (xattr);
- } else {
- gf_log (this->name, GF_LOG_DEBUG,
- "path %s on subvolume %s => -1 (%s)",
- local->loc.path,
- priv->children[child_index]->name,
- strerror (op_errno));
-
- sh->child_errno[child_index] = op_errno;
- }
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0)
- afr_sh_metadata_fix (frame, this);
-
- return 0;
+ ret = afr_selfheal_unlocked_discover (frame, inode, inode->gfid,
+ replies);
+ if (ret)
+ return ret;
+
+ ret = afr_selfheal_find_direction (frame, this, replies,
+ AFR_METADATA_TRANSACTION,
+ locked_on, sources, sinks);
+ if (ret)
+ return ret;
+
+ source = __afr_selfheal_metadata_finalize_source (this, sources, sinks,
+ locked_on, replies);
+ if (source < 0)
+ return -EIO;
+
+ for (i = 0; i < priv->child_count; i++)
+ /* Initialize the healed_sinks[] array optimistically to
+ the intersection of to-be-healed (i.e sinks[]) and
+ the list of servers which are up (i.e locked_on[]).
+
+ As we encounter failures in the healing process, we
+ will unmark the respective servers in the healed_sinks[]
+ array.
+ */
+ healed_sinks[i] = sinks[i] && locked_on[i];
+
+ return source;
}
-int
-afr_sh_metadata_lookup (call_frame_t *frame, xlator_t *this)
+static int
+__afr_selfheal_metadata (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ unsigned char *locked_on)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int i = 0;
- int call_count = 0;
- dict_t *xattr_req = NULL;
- int ret = 0;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- call_count = local->child_count;
- local->call_count = call_count;
-
- xattr_req = dict_new();
-
- if (xattr_req) {
- for (i = 0; i < priv->child_count; i++) {
- ret = dict_set_uint64 (xattr_req,
- priv->pending_key[i],
- 3 * sizeof(int32_t));
- }
- }
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- gf_log (this->name, GF_LOG_DEBUG,
- "looking up %s on %s",
- local->loc.path, priv->children[i]->name);
-
- STACK_WIND_COOKIE (frame, afr_sh_metadata_lookup_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->lookup,
- &local->loc, xattr_req);
- if (!--call_count)
- break;
- }
- }
-
- if (xattr_req)
- dict_unref (xattr_req);
+ afr_private_t *priv = NULL;
+ int ret = -1;
+ unsigned char *sources = NULL;
+ unsigned char *sinks = NULL;
+ unsigned char *data_lock = NULL;
+ unsigned char *healed_sinks = NULL;
+ struct afr_reply *locked_replies = NULL;
+ int source = -1;
- return 0;
-}
+ priv = this->private;
+ sources = alloca0 (priv->child_count);
+ sinks = alloca0 (priv->child_count);
+ healed_sinks = alloca0 (priv->child_count);
+ data_lock = alloca0 (priv->child_count);
-int
-afr_sh_metadata_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int call_count = 0;
- int child_index = (long) cookie;
-
- /* TODO: what if lock fails? */
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
+ locked_replies = alloca0 (sizeof (*locked_replies) * priv->child_count);
- LOCK (&frame->lock);
+ ret = afr_selfheal_inodelk (frame, this, inode, this->name,
+ LLONG_MAX - 1, 0, data_lock);
{
- if (op_ret == -1) {
- sh->op_failed = 1;
-
- gf_log (this->name,
- (op_errno == EAGAIN ? GF_LOG_DEBUG : GF_LOG_ERROR),
- "locking of %s on child %d failed: %s",
- local->loc.path, child_index,
- strerror (op_errno));
- } else {
- gf_log (this->name, GF_LOG_DEBUG,
- "inode of %s on child %d locked",
- local->loc.path, child_index);
+ if (ret < 2) {
+ ret = -ENOTCONN;
+ goto unlock;
}
- }
- UNLOCK (&frame->lock);
- call_count = afr_frame_return (frame);
+ ret = __afr_selfheal_metadata_prepare (frame, this, inode, data_lock,
+ sources, sinks, healed_sinks,
+ locked_replies);
+ if (ret < 0)
+ goto unlock;
- if (call_count == 0) {
- if (sh->op_failed) {
- afr_sh_metadata_finish (frame, this);
- return 0;
- }
-
- afr_sh_metadata_lookup (frame, this);
+ source = ret;
+ ret = 0;
}
-
- return 0;
+unlock:
+ afr_selfheal_uninodelk (frame, this, inode, this->name,
+ LLONG_MAX -1, 0, data_lock);
+ if (ret < 0)
+ goto out;
+
+ ret = afr_selfheal_metadata_do (frame, this, inode, source, healed_sinks,
+ locked_replies);
+ if (ret)
+ goto out;
+
+ ret = afr_selfheal_undo_pending (frame, this, inode, sources, sinks,
+ healed_sinks, AFR_METADATA_TRANSACTION,
+ locked_replies, data_lock);
+out:
+ return ret;
}
int
-afr_sh_metadata_lock (call_frame_t *frame, xlator_t *this)
+afr_selfheal_metadata (call_frame_t *frame, xlator_t *this, inode_t *inode)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int i = 0;
- int call_count = 0;
- struct flock flock = {0, };
+ afr_private_t *priv = NULL;
+ unsigned char *locked_on = NULL;
+ int ret = 0;
-
- local = frame->local;
- sh = &local->self_heal;
priv = this->private;
- call_count = local->child_count;
- local->call_count = call_count;
+ locked_on = alloca0 (priv->child_count);
- for (i = 0; i < priv->child_count; i++) {
- flock.l_start = 0;
- flock.l_len = 0;
- flock.l_type = F_WRLCK;
-
- if (local->child_up[i]) {
- gf_log (this->name, GF_LOG_DEBUG,
- "locking %s on subvolume %s",
- local->loc.path, priv->children[i]->name);
-
- STACK_WIND_COOKIE (frame, afr_sh_metadata_lk_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->inodelk,
- this->name,
- &local->loc, F_SETLK, &flock);
-
- if (!--call_count)
- break;
+ ret = afr_selfheal_tryinodelk (frame, this, inode, priv->sh_domain, 0, 0,
+ locked_on);
+ {
+ if (ret < 2) {
+ /* Either less than two subvols available, or another
+ selfheal (from another server) is in progress. Skip
+ for now in any case there isn't anything to do.
+ */
+ ret = -ENOTCONN;
+ goto unlock;
}
- }
- return 0;
-}
-
-
-int
-afr_self_heal_metadata (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = this->private;
-
-
- local = frame->local;
- sh = &local->self_heal;
-
- if (local->need_metadata_self_heal && priv->metadata_self_heal) {
- afr_sh_metadata_lock (frame, this);
- } else {
- gf_log (this->name, GF_LOG_DEBUG,
- "proceeding to data check on %s",
- local->loc.path);
- afr_sh_metadata_done (frame, this);
+ ret = __afr_selfheal_metadata (frame, this, inode, locked_on);
}
+unlock:
+ afr_selfheal_uninodelk (frame, this, inode, priv->sh_domain, 0, 0, locked_on);
- return 0;
+ return ret;
}
-
diff --git a/xlators/cluster/afr/src/afr-self-heal-name.c b/xlators/cluster/afr/src/afr-self-heal-name.c
new file mode 100644
index 000000000..ce80b8da3
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-self-heal-name.c
@@ -0,0 +1,457 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "afr.h"
+#include "afr-self-heal.h"
+
+
+int
+__afr_selfheal_assign_gfid (call_frame_t *frame, xlator_t *this, inode_t *parent,
+ uuid_t pargfid, const char *bname, inode_t *inode,
+ struct afr_reply *replies, int gfid_idx)
+{
+ int i = 0;
+ afr_private_t *priv = NULL;
+ dict_t *xdata = NULL;
+ int ret = 0;
+ loc_t loc = {0, };
+
+ priv = this->private;
+
+ uuid_copy (parent->gfid, pargfid);
+
+ xdata = dict_new ();
+ if (!xdata) {
+ return -ENOMEM;
+ }
+
+ ret = dict_set_static_bin (xdata, "gfid-req",
+ replies[gfid_idx].poststat.ia_gfid, 16);
+ if (ret) {
+ dict_destroy (xdata);
+ return -ENOMEM;
+ }
+
+ loc.parent = inode_ref (parent);
+ loc.inode = inode_ref (inode);
+ uuid_copy (loc.pargfid, pargfid);
+ loc.name = bname;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (replies[i].op_ret == 0 || replies[i].op_errno != ENODATA)
+ continue;
+
+ ret = syncop_lookup (priv->children[i], &loc, xdata, 0, 0, 0);
+ }
+
+ loc_wipe (&loc);
+ dict_unref (xdata);
+
+ return ret;
+}
+
+
+int
+__afr_selfheal_name_impunge (call_frame_t *frame, xlator_t *this, inode_t *parent,
+ uuid_t pargfid, const char *bname, inode_t *inode,
+ struct afr_reply *replies, int gfid_idx)
+{
+ int i = 0;
+ afr_private_t *priv = NULL;
+ int ret = 0;
+
+ priv = this->private;
+
+ uuid_copy (parent->gfid, pargfid);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid)
+ continue;
+
+ if (uuid_compare (replies[i].poststat.ia_gfid,
+ replies[gfid_idx].poststat.ia_gfid) == 0)
+ continue;
+
+ ret |= afr_selfheal_recreate_entry (frame, this, i, gfid_idx,
+ parent, bname, inode, replies);
+ }
+
+ return ret;
+}
+
+
+int
+__afr_selfheal_name_expunge (call_frame_t *frame, xlator_t *this, inode_t *parent,
+ uuid_t pargfid, const char *bname, inode_t *inode,
+ struct afr_reply *replies)
+{
+ loc_t loc = {0, };
+ int i = 0;
+ afr_private_t *priv = NULL;
+ char g[64];
+ int ret = 0;
+
+ priv = this->private;
+
+ loc.parent = inode_ref (parent);
+ uuid_copy (loc.pargfid, pargfid);
+ loc.name = bname;
+ loc.inode = inode_ref (inode);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid)
+ continue;
+
+ if (replies[i].op_ret)
+ continue;
+
+ switch (replies[i].poststat.ia_type) {
+ case IA_IFDIR:
+ gf_log (this->name, GF_LOG_WARNING,
+ "expunging dir %s/%s (%s) on %s",
+ uuid_utoa (pargfid), bname,
+ uuid_utoa_r (replies[i].poststat.ia_gfid, g),
+ priv->children[i]->name);
+ ret |= syncop_rmdir (priv->children[i], &loc, 1);
+ break;
+ default:
+ gf_log (this->name, GF_LOG_WARNING,
+ "expunging file %s/%s (%s) on %s",
+ uuid_utoa (pargfid), bname,
+ uuid_utoa_r (replies[i].poststat.ia_gfid, g),
+ priv->children[i]->name);
+ ret |= syncop_unlink (priv->children[i], &loc);
+ break;
+ }
+ }
+
+ loc_wipe (&loc);
+
+ return ret;
+
+}
+
+
+int
+__afr_selfheal_name_do (call_frame_t *frame, xlator_t *this, inode_t *parent,
+ uuid_t pargfid, const char *bname, inode_t *inode,
+ unsigned char *sources, unsigned char *sinks,
+ unsigned char *healed_sinks, int source,
+ unsigned char *locked_on, struct afr_reply *replies)
+{
+ int i = 0;
+ afr_private_t *priv = NULL;
+ uuid_t gfid = {0, };
+ int gfid_idx = -1;
+ gf_boolean_t source_is_empty = _gf_true;
+ gf_boolean_t need_heal = _gf_false;
+ int first_idx = -1;
+ char g1[64],g2[64];
+
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid)
+ continue;
+
+ if (first_idx == -1) {
+ first_idx = i;
+ continue;
+ }
+
+ if (replies[i].op_ret != replies[first_idx].op_ret)
+ need_heal = _gf_true;
+
+ if (uuid_compare (replies[i].poststat.ia_gfid,
+ replies[first_idx].poststat.ia_gfid))
+ need_heal = _gf_true;
+ }
+
+ if (!need_heal)
+ return 0;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid)
+ continue;
+
+ if (!replies[i].op_ret && (source == -1 || sources[i])) {
+ source_is_empty = _gf_false;
+ break;
+ }
+ }
+
+ if (source_is_empty) {
+ return __afr_selfheal_name_expunge (frame, this, parent, pargfid,
+ bname, inode, replies);
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid)
+ continue;
+
+ if (uuid_is_null (replies[i].poststat.ia_gfid))
+ continue;
+
+ if (uuid_is_null (gfid)) {
+ uuid_copy (gfid, replies[i].poststat.ia_gfid);
+ gfid_idx = i;
+ continue;
+ }
+
+ if (sources[i] || source == -1) {
+ if (gfid_idx != -1 &&
+ (sources[gfid_idx] || source == -1) &&
+ uuid_compare (gfid, replies[i].poststat.ia_gfid)) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "GFID mismatch for <gfid:%s>/%s "
+ "%s on %s and %s on %s",
+ uuid_utoa (pargfid), bname,
+ uuid_utoa_r (replies[i].poststat.ia_gfid, g1),
+ priv->children[i]->name,
+ uuid_utoa_r (replies[gfid_idx].poststat.ia_gfid, g2),
+ priv->children[gfid_idx]->name);
+ return -1;
+ }
+
+ uuid_copy (gfid, replies[i].poststat.ia_gfid);
+ gfid_idx = i;
+ continue;
+ }
+ }
+
+ if (gfid_idx == -1)
+ return -1;
+
+ __afr_selfheal_assign_gfid (frame, this, parent, pargfid, bname, inode,
+ replies, gfid_idx);
+
+ return __afr_selfheal_name_impunge (frame, this, parent, pargfid,
+ bname, inode, replies, gfid_idx);
+}
+
+
+int
+__afr_selfheal_name_finalize_source (xlator_t *this, unsigned char *sources,
+ unsigned char *sinks, unsigned char *locked_on,
+ struct afr_reply *replies)
+{
+ int i = 0;
+ afr_private_t *priv = NULL;
+ int source = -1;
+ int locked_count = 0;
+ int sources_count = 0;
+ int sinks_count = 0;
+
+ priv = this->private;
+
+ locked_count = AFR_COUNT (locked_on, priv->child_count);
+ sources_count = AFR_COUNT (sources, priv->child_count);
+ sinks_count = AFR_COUNT (sinks, priv->child_count);
+
+ if (locked_count == sinks_count || !sources_count) {
+ return -1;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (sources[i]) {
+ source = i;
+ break;
+ }
+ }
+
+ return source;
+}
+
+
+int
+__afr_selfheal_name_prepare (call_frame_t *frame, xlator_t *this, inode_t *parent,
+ uuid_t pargfid, unsigned char *locked_on,
+ unsigned char *sources, unsigned char *sinks,
+ unsigned char *healed_sinks, struct afr_reply *replies,
+ int *source_p)
+{
+ int ret = -1;
+ int source = -1;
+ afr_private_t *priv = NULL;
+ int i = 0;
+
+ priv = this->private;
+
+ ret = afr_selfheal_unlocked_discover (frame, parent, pargfid, replies);
+ if (ret)
+ return ret;
+
+ ret = afr_selfheal_find_direction (frame, this, replies,
+ AFR_ENTRY_TRANSACTION,
+ locked_on, sources, sinks);
+ if (ret)
+ return ret;
+
+ source = __afr_selfheal_name_finalize_source (this, sources, sinks,
+ locked_on, replies);
+ if (source < 0) {
+ /* If source is < 0 (typically split-brain), we perform a
+ conservative merge of entries rather than erroring out */
+ }
+ *source_p = source;
+
+ for (i = 0; i < priv->child_count; i++)
+ /* Initialize the healed_sinks[] array optimistically to
+ the intersection of to-be-healed (i.e sinks[]) and
+ the list of servers which are up (i.e locked_on[]).
+
+ As we encounter failures in the healing process, we
+ will unmark the respective servers in the healed_sinks[]
+ array.
+ */
+ healed_sinks[i] = sinks[i] && locked_on[i];
+
+ return ret;
+}
+
+
+int
+afr_selfheal_name_do (call_frame_t *frame, xlator_t *this, inode_t *parent,
+ uuid_t pargfid, const char *bname)
+{
+ afr_private_t *priv = NULL;
+ unsigned char *sources = NULL;
+ unsigned char *sinks = NULL;
+ unsigned char *healed_sinks = NULL;
+ unsigned char *locked_on = NULL;
+ int source = -1;
+ struct afr_reply *replies = NULL;
+ int ret = -1;
+ inode_t *inode = NULL;
+
+ priv = this->private;
+
+ locked_on = alloca0 (priv->child_count);
+ sources = alloca0 (priv->child_count);
+ sinks = alloca0 (priv->child_count);
+ healed_sinks = alloca0 (priv->child_count);
+
+ replies = alloca0 (priv->child_count * sizeof(*replies));
+
+ ret = afr_selfheal_entrylk (frame, this, parent, this->name, bname,
+ locked_on);
+ {
+ if (ret < 2) {
+ ret = -ENOTCONN;
+ goto unlock;
+ }
+
+ ret = __afr_selfheal_name_prepare (frame, this, parent, pargfid,
+ locked_on, sources, sinks,
+ healed_sinks, replies,
+ &source);
+ if (ret)
+ goto unlock;
+
+ inode = afr_selfheal_unlocked_lookup_on (frame, parent, bname,
+ replies, locked_on);
+ if (!inode) {
+ ret = -ENOMEM;
+ goto unlock;
+ }
+
+ ret = __afr_selfheal_name_do (frame, this, parent, pargfid, bname,
+ inode, sources, sinks, healed_sinks,
+ source, locked_on, replies);
+ }
+unlock:
+ afr_selfheal_unentrylk (frame, this, parent, this->name, bname,
+ locked_on);
+ if (inode)
+ inode_unref (inode);
+
+ return ret;
+}
+
+
+int
+afr_selfheal_name_unlocked_inspect (call_frame_t *frame, xlator_t *this,
+ inode_t *parent, uuid_t pargfid,
+ const char *bname, gf_boolean_t *need_heal)
+{
+ afr_private_t *priv = NULL;
+ int i = 0;
+ struct afr_reply *replies = NULL;
+ inode_t *inode = NULL;
+ int first_idx = -1;
+
+ priv = this->private;
+
+ replies = alloca0 (sizeof (*replies) * priv->child_count);
+
+ inode = afr_selfheal_unlocked_lookup_on (frame, parent, bname,
+ replies, priv->child_up);
+ if (!inode)
+ return -ENOMEM;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid)
+ continue;
+
+ if (first_idx == -1) {
+ first_idx = i;
+ continue;
+ }
+
+ if (replies[i].op_ret != replies[first_idx].op_ret)
+ *need_heal = _gf_true;
+
+ if (uuid_compare (replies[i].poststat.ia_gfid,
+ replies[first_idx].poststat.ia_gfid))
+ *need_heal = _gf_true;
+ }
+
+ if (inode)
+ inode_unref (inode);
+ return 0;
+}
+
+int
+afr_selfheal_name (xlator_t *this, uuid_t pargfid, const char *bname)
+{
+ inode_t *parent = NULL;
+ call_frame_t *frame = NULL;
+ int ret = -1;
+ gf_boolean_t need_heal = _gf_false;
+
+ parent = afr_inode_find (this, pargfid);
+ if (!parent)
+ goto out;
+
+ frame = afr_frame_create (this);
+ if (!frame)
+ goto out;
+
+ ret = afr_selfheal_name_unlocked_inspect (frame, this, parent, pargfid,
+ bname, &need_heal);
+ if (ret)
+ goto out;
+
+ if (need_heal)
+ afr_selfheal_name_do (frame, this, parent, pargfid, bname);
+out:
+ if (parent)
+ inode_unref (parent);
+ if (frame)
+ AFR_STACK_DESTROY (frame);
+
+ return ret;
+}
diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h
index 73abd8fb9..a1b972ac3 100644
--- a/xlators/cluster/afr/src/afr-self-heal.h
+++ b/xlators/cluster/afr/src/afr-self-heal.h
@@ -1,52 +1,167 @@
/*
- Copyright (c) 2008-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
-#ifndef __AFR_SELF_HEAL_H__
-#define __AFR_SELF_HEAL_H__
-#include <sys/stat.h>
+#ifndef _AFR_SELFHEAL_H
+#define _AFR_SELFHEAL_H
+
+
+/* Perform fop on all UP subvolumes and wait for all callbacks to return */
+
+#define AFR_ONALL(frame, rfn, fop, args ...) do { \
+ afr_local_t *__local = frame->local; \
+ afr_private_t *__priv = frame->this->private; \
+ int __i = 0, __count = 0; \
+ \
+ afr_replies_wipe (__local, __priv); \
+ \
+ for (__i = 0; __i < __priv->child_count; __i++) { \
+ if (!__priv->child_up[__i]) continue; \
+ STACK_WIND_COOKIE (frame, rfn, (void *)(long) __i, \
+ __priv->children[__i], \
+ __priv->children[__i]->fops->fop, args); \
+ __count++; \
+ } \
+ syncbarrier_wait (&__local->barrier, __count); \
+ } while (0)
+
+
+/* Perform fop on all subvolumes represented by list[] array and wait
+ for all callbacks to return */
+
+#define AFR_ONLIST(list, frame, rfn, fop, args ...) do { \
+ afr_local_t *__local = frame->local; \
+ afr_private_t *__priv = frame->this->private; \
+ int __i = 0, __count = 0; \
+ \
+ afr_replies_wipe (__local, __priv); \
+ \
+ for (__i = 0; __i < __priv->child_count; __i++) { \
+ if (!list[__i]) continue; \
+ STACK_WIND_COOKIE (frame, rfn, (void *)(long) __i, \
+ __priv->children[__i], \
+ __priv->children[__i]->fops->fop, args); \
+ __count++; \
+ } \
+ syncbarrier_wait (&__local->barrier, __count); \
+ } while (0)
+
+
+#define AFR_SEQ(frame, rfn, fop, args ...) do { \
+ afr_local_t *__local = frame->local; \
+ afr_private_t *__priv = frame->this->private; \
+ int __i = 0; \
+ \
+ afr_replies_wipe (__local, __priv); \
+ \
+ for (__i = 0; __i < __priv->child_count; __i++) { \
+ if (!__priv->child_up[__i]) continue; \
+ STACK_WIND_COOKIE (frame, rfn, (void *)(long) __i, \
+ __priv->children[__i], \
+ __priv->children[__i]->fops->fop, args); \
+ syncbarrier_wait (&__local->barrier, 1); \
+ } \
+ } while (0)
+
+
+#define ALLOC_MATRIX(n, type) ({type **__ptr = NULL; \
+ int __i; \
+ __ptr = alloca0 (n * sizeof(type *)); \
+ for (__i = 0; __i < n; __i++) __ptr[__i] = alloca0 (n * sizeof(type)); \
+ __ptr;})
+
+
+#define IA_EQUAL(f,s,field) (memcmp (&(f.ia_##field), &(s.ia_##field), sizeof (s.ia_##field)) == 0)
+
+
+int
+afr_selfheal (xlator_t *this, uuid_t gfid);
+
+int
+afr_selfheal_name (xlator_t *this, uuid_t gfid, const char *name);
+
+int
+afr_selfheal_data (call_frame_t *frame, xlator_t *this, inode_t *inode);
+
+int
+afr_selfheal_metadata (call_frame_t *frame, xlator_t *this, inode_t *inode);
+
+int
+afr_selfheal_entry (call_frame_t *frame, xlator_t *this, inode_t *inode);
-#define FILETYPE_DIFFERS(buf1,buf2) ((S_IFMT & ((struct stat *)buf1)->st_mode) != (S_IFMT & ((struct stat *)buf2)->st_mode))
-#define PERMISSION_DIFFERS(buf1,buf2) ((((struct stat *)buf1)->st_mode) != (((struct stat *)buf2)->st_mode))
-#define OWNERSHIP_DIFFERS(buf1,buf2) ((((struct stat *)buf1)->st_uid) != (((struct stat *)buf2)->st_uid) || (((struct stat *)buf1)->st_gid != (((struct stat *)buf2)->st_gid)))
-#define SIZE_DIFFERS(buf1,buf2) ((((struct stat *)buf1)->st_size) != (((struct stat *)buf2)->st_size))
-#define SIZE_GREATER(buf1,buf2) ((((struct stat *)buf1)->st_size > (((struct stat *)buf2)->st_size)))
+int
+afr_selfheal_inodelk (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, off_t off, size_t size,
+ unsigned char *locked_on);
int
-afr_sh_has_metadata_pending (dict_t *xattr, int child_count, xlator_t *this);
+afr_selfheal_tryinodelk (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, off_t off, size_t size,
+ unsigned char *locked_on);
+
int
-afr_sh_has_entry_pending (dict_t *xattr, int child_count, xlator_t *this);
+afr_selfheal_uninodelk (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, off_t off, size_t size,
+ const unsigned char *locked_on);
+
int
-afr_sh_has_data_pending (dict_t *xattr, int child_count, xlator_t *this);
+afr_selfheal_entrylk (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, const char *name, unsigned char *locked_on);
int
-afr_self_heal_entry (call_frame_t *frame, xlator_t *this);
+afr_selfheal_tryentrylk (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, const char *name, unsigned char *locked_on);
int
-afr_self_heal_data (call_frame_t *frame, xlator_t *this);
+afr_selfheal_unentrylk (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, const char *name, unsigned char *locked_on);
int
-afr_self_heal_metadata (call_frame_t *frame, xlator_t *this);
+afr_selfheal_unlocked_discover (call_frame_t *frame, inode_t *inode,
+ uuid_t gfid, struct afr_reply *replies);
+
+inode_t *
+afr_selfheal_unlocked_lookup_on (call_frame_t *frame, inode_t *parent,
+ const char *name, struct afr_reply *replies,
+ unsigned char *lookup_on);
int
-afr_self_heal (call_frame_t *frame, xlator_t *this,
- int (*completion_cbk) (call_frame_t *, xlator_t *));
+afr_selfheal_find_direction (call_frame_t *frame, xlator_t *this,
+ struct afr_reply *replies,
+ afr_transaction_type type, unsigned char *locked_on,
+ unsigned char *sources, unsigned char *sinks);
+
+int
+afr_selfheal_extract_xattr (xlator_t *this, struct afr_reply *replies,
+ afr_transaction_type type, int *dirty, int **matrix);
+
+int
+afr_selfheal_undo_pending (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ unsigned char *sources, unsigned char *sinks,
+ unsigned char *healed_sinks, afr_transaction_type type,
+ struct afr_reply *replies, unsigned char *locked_on);
+
+int
+afr_selfheal_recreate_entry (call_frame_t *frame, xlator_t *this, int dst,
+ int source, inode_t *dir, const char *name,
+ inode_t *inode, struct afr_reply *replies);
+
+int
+afr_selfheal_post_op (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ int subvol, dict_t *xattr);
+
+call_frame_t *
+afr_frame_create (xlator_t *this);
+
+inode_t *
+afr_inode_find (xlator_t *this, uuid_t gfid);
-#endif /* __AFR_SELF_HEAL_H__ */
+#endif /* !_AFR_SELFHEAL_H */
diff --git a/xlators/cluster/afr/src/afr-self-heald.c b/xlators/cluster/afr/src/afr-self-heald.c
new file mode 100644
index 000000000..4bfe909bc
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-self-heald.c
@@ -0,0 +1,1256 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "afr.h"
+#include "afr-self-heal.h"
+#include "afr-self-heald.h"
+#include "protocol-common.h"
+
+#define SHD_INODE_LRU_LIMIT 2048
+#define AFR_EH_HEALED_LIMIT 1024
+#define AFR_EH_HEAL_FAIL_LIMIT 1024
+#define AFR_EH_SPLIT_BRAIN_LIMIT 1024
+#define AFR_STATISTICS_HISTORY_SIZE 50
+
+
+#define ASSERT_LOCAL(this, healer) \
+ if (!afr_shd_is_subvol_local(this, healer->subvol)) { \
+ healer->local = _gf_false; \
+ if (safe_break (healer)) { \
+ break; \
+ } else { \
+ continue; \
+ } \
+ } else { \
+ healer->local = _gf_true; \
+ }
+
+
+#define NTH_INDEX_HEALER(this, n) &((((afr_private_t *)this->private))->shd.index_healers[n])
+#define NTH_FULL_HEALER(this, n) &((((afr_private_t *)this->private))->shd.full_healers[n])
+
+int afr_shd_gfid_to_path (xlator_t *this, xlator_t *subvol, uuid_t gfid, char **path_p);
+
+char *
+afr_subvol_name (xlator_t *this, int subvol)
+{
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+ if (subvol < 0 || subvol > priv->child_count)
+ return NULL;
+
+ return priv->children[subvol]->name;
+}
+
+
+void
+afr_destroy_crawl_event_data (void *data)
+{
+ return;
+}
+
+
+void
+afr_destroy_shd_event_data (void *data)
+{
+ shd_event_t *shd_event = data;
+
+ if (!shd_event)
+ return;
+ GF_FREE (shd_event->path);
+
+ return;
+}
+
+
+gf_boolean_t
+afr_shd_is_subvol_local (xlator_t *this, int subvol)
+{
+ char *pathinfo = NULL;
+ afr_private_t *priv = NULL;
+ dict_t *xattr = NULL;
+ int ret = 0;
+ gf_boolean_t is_local = _gf_false;
+ loc_t loc = {0, };
+
+ priv = this->private;
+
+ loc.inode = this->itable->root;
+ uuid_copy (loc.gfid, loc.inode->gfid);
+
+ ret = syncop_getxattr (priv->children[subvol], &loc, &xattr,
+ GF_XATTR_PATHINFO_KEY);
+ if (ret)
+ return _gf_false;
+ if (!xattr)
+ return _gf_false;
+
+ ret = dict_get_str (xattr, GF_XATTR_PATHINFO_KEY, &pathinfo);
+ if (ret)
+ return _gf_false;
+
+ afr_local_pathinfo (pathinfo, &is_local);
+
+ gf_log (this->name, GF_LOG_DEBUG, "subvol %s is %slocal",
+ priv->children[subvol]->name, is_local? "" : "not ");
+
+ return is_local;
+}
+
+
+int
+__afr_shd_healer_wait (struct subvol_healer *healer)
+{
+ afr_private_t *priv = NULL;
+ struct timespec wait_till = {0, };
+ int ret = 0;
+
+ priv = healer->this->private;
+
+disabled_loop:
+ wait_till.tv_sec = time (NULL) + 60;
+
+ while (!healer->rerun) {
+ ret = pthread_cond_timedwait (&healer->cond,
+ &healer->mutex,
+ &wait_till);
+ if (ret == ETIMEDOUT)
+ break;
+ }
+
+ ret = healer->rerun;
+ healer->rerun = 0;
+
+ if (!priv->shd.enabled)
+ goto disabled_loop;
+
+ return ret;
+}
+
+
+int
+afr_shd_healer_wait (struct subvol_healer *healer)
+{
+ int ret = 0;
+
+ pthread_mutex_lock (&healer->mutex);
+ {
+ ret = __afr_shd_healer_wait (healer);
+ }
+ pthread_mutex_unlock (&healer->mutex);
+
+ return ret;
+}
+
+
+gf_boolean_t
+safe_break (struct subvol_healer *healer)
+{
+ gf_boolean_t ret = _gf_false;
+
+ pthread_mutex_lock (&healer->mutex);
+ {
+ if (healer->rerun)
+ goto unlock;
+
+ healer->running = _gf_false;
+ ret = _gf_true;
+ }
+unlock:
+ pthread_mutex_unlock (&healer->mutex);
+
+ return ret;
+}
+
+
+inode_t *
+afr_shd_inode_find (xlator_t *this, xlator_t *subvol, uuid_t gfid)
+{
+ inode_t *inode = NULL;
+ int ret = 0;
+ loc_t loc = {0, };
+ struct iatt iatt = {0, };
+
+ inode = inode_find (this->itable, gfid);
+ if (inode)
+ goto out;
+
+ loc.inode = inode_new (this->itable);
+ if (!loc.inode)
+ goto out;
+ uuid_copy (loc.gfid, gfid);
+
+ ret = syncop_lookup (subvol, &loc, NULL, &iatt, NULL, NULL);
+ if (ret < 0)
+ goto out;
+
+ inode = inode_link (loc.inode, NULL, NULL, &iatt);
+ if (inode)
+ inode_lookup (inode);
+out:
+ loc_wipe (&loc);
+ return inode;
+}
+
+
+fd_t *
+afr_shd_index_opendir (xlator_t *this, int child)
+{
+ fd_t *fd = NULL;
+ afr_private_t *priv = NULL;
+ xlator_t *subvol = NULL;
+ loc_t rootloc = {0, };
+ inode_t *inode = NULL;
+ int ret = 0;
+ dict_t *xattr = NULL;
+ void *index_gfid = NULL;
+
+ priv = this->private;
+ subvol = priv->children[child];
+
+ rootloc.inode = inode_ref (this->itable->root);
+ uuid_copy (rootloc.gfid, rootloc.inode->gfid);
+
+ ret = syncop_getxattr (subvol, &rootloc, &xattr,
+ GF_XATTROP_INDEX_GFID);
+ if (ret || !xattr) {
+ errno = -ret;
+ goto out;
+ }
+
+ ret = dict_get_ptr (xattr, GF_XATTROP_INDEX_GFID, &index_gfid);
+ if (ret)
+ goto out;
+
+ gf_log (this->name, GF_LOG_DEBUG, "index-dir gfid for %s: %s",
+ subvol->name, uuid_utoa (index_gfid));
+
+ inode = afr_shd_inode_find (this, subvol, index_gfid);
+ if (!inode)
+ goto out;
+ fd = fd_anonymous (inode);
+out:
+ loc_wipe (&rootloc);
+ if (xattr)
+ dict_unref (xattr);
+ return fd;
+}
+
+
+int
+afr_shd_index_purge (xlator_t *subvol, inode_t *inode, char *name)
+{
+ loc_t loc = {0, };
+ int ret = 0;
+
+ loc.parent = inode_ref (inode);
+ loc.name = name;
+
+ ret = syncop_unlink (subvol, &loc);
+
+ loc_wipe (&loc);
+ return ret;
+}
+
+
+int
+afr_shd_selfheal_name (struct subvol_healer *healer, int child, uuid_t parent,
+ const char *bname)
+{
+ int ret = -1;
+
+ ret = afr_selfheal_name (THIS, parent, bname);
+
+ return ret;
+}
+
+int
+afr_shd_selfheal (struct subvol_healer *healer, int child, uuid_t gfid)
+{
+ int ret = 0;
+ eh_t *eh = NULL;
+ afr_private_t *priv = NULL;
+ afr_self_heald_t *shd = NULL;
+ shd_event_t *shd_event = NULL;
+ char *path = NULL;
+ xlator_t *subvol = NULL;
+ xlator_t *this = NULL;
+ crawl_event_t *crawl_event = NULL;
+
+ this = healer->this;
+ priv = this->private;
+ shd = &priv->shd;
+ crawl_event = &healer->crawl_event;
+
+ subvol = priv->children[child];
+
+ ret = afr_selfheal (this, gfid);
+
+ if (ret == -EIO) {
+ eh = shd->split_brain;
+ crawl_event->split_brain_count++;
+ } else if (ret < 0) {
+ eh = shd->heal_failed;
+ crawl_event->heal_failed_count++;
+ } else if (ret == 0) {
+ eh = shd->healed;
+ crawl_event->healed_count++;
+ }
+
+ afr_shd_gfid_to_path (this, subvol, gfid, &path);
+ if (!path)
+ return ret;
+
+ if (eh) {
+ shd_event = GF_CALLOC (1, sizeof(*shd_event),
+ gf_afr_mt_shd_event_t);
+ if (!shd_event) {
+ GF_FREE (path);
+ return ret;
+ }
+
+ shd_event->child = child;
+ shd_event->path = path;
+
+ if (eh_save_history (eh, shd_event) < 0) {
+ GF_FREE (shd_event);
+ GF_FREE (path);
+ }
+ }
+ return ret;
+}
+
+
+void
+afr_shd_sweep_prepare (struct subvol_healer *healer)
+{
+ crawl_event_t *event = NULL;
+
+ event = &healer->crawl_event;
+
+ event->healed_count = 0;
+ event->split_brain_count = 0;
+ event->heal_failed_count = 0;
+
+ time (&event->start_time);
+ event->end_time = 0;
+}
+
+
+void
+afr_shd_sweep_done (struct subvol_healer *healer)
+{
+ crawl_event_t *event = NULL;
+ crawl_event_t *history = NULL;
+ afr_self_heald_t *shd = NULL;
+
+ event = &healer->crawl_event;
+ shd = &(((afr_private_t *)healer->this->private)->shd);
+
+ time (&event->end_time);
+ history = memdup (event, sizeof (*event));
+ event->start_time = 0;
+
+ if (!history)
+ return;
+
+ if (eh_save_history (shd->statistics[healer->subvol], history) < 0)
+ GF_FREE (history);
+}
+
+
+int
+afr_shd_index_sweep (struct subvol_healer *healer)
+{
+ xlator_t *this = NULL;
+ int child = -1;
+ fd_t *fd = NULL;
+ xlator_t *subvol = NULL;
+ afr_private_t *priv = NULL;
+ off_t offset = 0;
+ gf_dirent_t entries;
+ gf_dirent_t *entry = NULL;
+ uuid_t gfid;
+ int ret = 0;
+ int count = 0;
+
+ this = healer->this;
+ child = healer->subvol;
+ priv = this->private;
+ subvol = priv->children[child];
+
+ fd = afr_shd_index_opendir (this, child);
+ if (!fd) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "unable to opendir index-dir on %s", subvol->name);
+ return -errno;
+ }
+
+ INIT_LIST_HEAD (&entries.list);
+
+ while ((ret = syncop_readdir (subvol, fd, 131072, offset, &entries))) {
+ if (ret > 0)
+ ret = 0;
+ list_for_each_entry (entry, &entries.list, list) {
+ offset = entry->d_off;
+
+ if (!priv->shd.enabled) {
+ ret = -EBUSY;
+ break;
+ }
+
+ if (!strcmp (entry->d_name, ".") ||
+ !strcmp (entry->d_name, ".."))
+ continue;
+
+ gf_log (this->name, GF_LOG_DEBUG, "got entry: %s",
+ entry->d_name);
+
+ ret = uuid_parse (entry->d_name, gfid);
+ if (ret)
+ continue;
+
+ ret = afr_shd_selfheal (healer, child, gfid);
+ if (ret == 0)
+ count++;
+
+ if (ret == -ENOENT || ret == -ESTALE) {
+ afr_shd_index_purge (subvol, fd->inode,
+ entry->d_name);
+ ret = 0;
+ }
+ }
+
+ gf_dirent_free (&entries);
+ if (ret)
+ break;
+ }
+
+ if (fd)
+ fd_unref (fd);
+ if (!ret)
+ ret = count;
+ return ret;
+}
+
+
+int
+afr_shd_full_sweep (struct subvol_healer *healer, inode_t *inode)
+{
+ fd_t *fd = NULL;
+ xlator_t *this = NULL;
+ xlator_t *subvol = NULL;
+ afr_private_t *priv = NULL;
+ off_t offset = 0;
+ gf_dirent_t entries;
+ gf_dirent_t *entry = NULL;
+ int ret = 0;
+
+ this = healer->this;
+ priv = this->private;
+ subvol = priv->children[healer->subvol];
+
+ fd = fd_anonymous (inode);
+ if (!fd)
+ return -errno;
+
+ INIT_LIST_HEAD (&entries.list);
+
+ while ((ret = syncop_readdirp (subvol, fd, 131072, offset, 0, &entries))) {
+ if (ret < 0)
+ break;
+
+ ret = gf_link_inodes_from_dirent (this, fd->inode, &entries);
+ if (ret)
+ break;
+
+ list_for_each_entry (entry, &entries.list, list) {
+ offset = entry->d_off;
+
+ if (!priv->shd.enabled) {
+ ret = -EBUSY;
+ break;
+ }
+
+ if (!strcmp (entry->d_name, ".") ||
+ !strcmp (entry->d_name, ".."))
+ continue;
+
+ afr_shd_selfheal_name (healer, healer->subvol,
+ inode->gfid, entry->d_name);
+
+ afr_shd_selfheal (healer, healer->subvol,
+ entry->d_stat.ia_gfid);
+
+ if (entry->d_stat.ia_type == IA_IFDIR) {
+ ret = afr_shd_full_sweep (healer, entry->inode);
+ if (ret)
+ break;
+ }
+ }
+
+ gf_dirent_free (&entries);
+ if (ret)
+ break;
+ }
+
+ if (fd)
+ fd_unref (fd);
+ return ret;
+}
+
+
+void *
+afr_shd_index_healer (void *data)
+{
+ struct subvol_healer *healer = NULL;
+ xlator_t *this = NULL;
+ int ret = 0;
+
+ healer = data;
+ THIS = this = healer->this;
+
+ for (;;) {
+ afr_shd_healer_wait (healer);
+
+ ASSERT_LOCAL(this, healer);
+
+ do {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "starting index sweep on subvol %s",
+ afr_subvol_name (this, healer->subvol));
+
+ afr_shd_sweep_prepare (healer);
+
+ ret = afr_shd_index_sweep (healer);
+
+ afr_shd_sweep_done (healer);
+ /*
+ As long as at least one gfid was
+ healed, keep retrying. We may have
+ just healed a directory and thereby
+ created entries for other gfids which
+ could not be healed thus far.
+ */
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "finished index sweep on subvol %s",
+ afr_subvol_name (this, healer->subvol));
+ /*
+ Give a pause before retrying to avoid a busy loop
+ in case the only entry in index is because of
+ an ongoing I/O.
+ */
+ sleep (1);
+ } while (ret > 0);
+ }
+
+ return NULL;
+}
+
+
+void *
+afr_shd_full_healer (void *data)
+{
+ struct subvol_healer *healer = NULL;
+ xlator_t *this = NULL;
+ int run = 0;
+
+ healer = data;
+ THIS = this = healer->this;
+
+ for (;;) {
+ pthread_mutex_lock (&healer->mutex);
+ {
+ run = __afr_shd_healer_wait (healer);
+ if (!run)
+ healer->running = _gf_false;
+ }
+ pthread_mutex_unlock (&healer->mutex);
+
+ if (!run)
+ break;
+
+ ASSERT_LOCAL(this, healer);
+
+ gf_log (this->name, GF_LOG_INFO,
+ "starting full sweep on subvol %s",
+ afr_subvol_name (this, healer->subvol));
+
+ afr_shd_sweep_prepare (healer);
+
+ afr_shd_full_sweep (healer, this->itable->root);
+
+ afr_shd_sweep_done (healer);
+
+ gf_log (this->name, GF_LOG_INFO,
+ "finished full sweep on subvol %s",
+ afr_subvol_name (this, healer->subvol));
+ }
+
+ return NULL;
+}
+
+
+int
+afr_shd_healer_init (xlator_t *this, struct subvol_healer *healer)
+{
+ int ret = 0;
+
+ ret = pthread_mutex_init (&healer->mutex, NULL);
+ if (ret)
+ goto out;
+
+ ret = pthread_cond_init (&healer->cond, NULL);
+ if (ret)
+ goto out;
+
+ healer->this = this;
+ healer->running = _gf_false;
+ healer->rerun = _gf_false;
+ healer->local = _gf_false;
+out:
+ return ret;
+}
+
+
+int
+afr_shd_healer_spawn (xlator_t *this, struct subvol_healer *healer,
+ void *(threadfn)(void *))
+{
+ int ret = 0;
+
+ pthread_mutex_lock (&healer->mutex);
+ {
+ if (healer->running) {
+ pthread_cond_signal (&healer->cond);
+ } else {
+ ret = gf_thread_create (&healer->thread, NULL,
+ threadfn, healer);
+ if (ret)
+ goto unlock;
+ healer->running = 1;
+ }
+
+ healer->rerun = 1;
+ }
+unlock:
+ pthread_mutex_unlock (&healer->mutex);
+
+ return ret;
+}
+
+
+int
+afr_shd_full_healer_spawn (xlator_t *this, int subvol)
+{
+ return afr_shd_healer_spawn (this, NTH_FULL_HEALER (this, subvol),
+ afr_shd_full_healer);
+}
+
+
+int
+afr_shd_index_healer_spawn (xlator_t *this, int subvol)
+{
+ return afr_shd_healer_spawn (this, NTH_INDEX_HEALER (this, subvol),
+ afr_shd_index_healer);
+}
+
+
+int
+afr_shd_dict_add_crawl_event (xlator_t *this, dict_t *output,
+ crawl_event_t *crawl_event)
+{
+ int ret = 0;
+ uint64_t count = 0;
+ char key[256] = {0};
+ int xl_id = 0;
+ uint64_t healed_count = 0;
+ uint64_t split_brain_count = 0;
+ uint64_t heal_failed_count = 0;
+ char *start_time_str = 0;
+ char *end_time_str = NULL;
+ char *crawl_type = NULL;
+ int progress = -1;
+ int child = -1;
+
+ child = crawl_event->child;
+ healed_count = crawl_event->healed_count;
+ split_brain_count = crawl_event->split_brain_count;
+ heal_failed_count = crawl_event->heal_failed_count;
+ crawl_type = crawl_event->crawl_type;
+
+ if (!crawl_event->start_time)
+ goto out;
+
+ start_time_str = gf_strdup (ctime (&crawl_event->start_time));
+
+ if (crawl_event->end_time)
+ end_time_str = gf_strdup (ctime (&crawl_event->end_time));
+
+ ret = dict_get_int32 (output, this->name, &xl_id);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "xl does not have id");
+ goto out;
+ }
+
+ snprintf (key, sizeof (key), "statistics-%d-%d-count", xl_id, child);
+ ret = dict_get_uint64 (output, key, &count);
+
+
+ snprintf (key, sizeof (key), "statistics_healed_cnt-%d-%d-%"PRIu64,
+ xl_id, child, count);
+ ret = dict_set_uint64(output, key, healed_count);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Could not add statistics_healed_count to outout");
+ goto out;
+ }
+
+ snprintf (key, sizeof (key), "statistics_sb_cnt-%d-%d-%"PRIu64,
+ xl_id, child, count);
+ ret = dict_set_uint64 (output, key, split_brain_count);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Could not add statistics_split_brain_count to outout");
+ goto out;
+ }
+
+ snprintf (key, sizeof (key), "statistics_crawl_type-%d-%d-%"PRIu64,
+ xl_id, child, count);
+ ret = dict_set_str (output, key, crawl_type);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Could not add statistics_crawl_type to output");
+ goto out;
+ }
+
+ snprintf (key, sizeof (key), "statistics_heal_failed_cnt-%d-%d-%"PRIu64,
+ xl_id, child, count);
+ ret = dict_set_uint64 (output, key, heal_failed_count);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Could not add statistics_healed_failed_count to outout");
+ goto out;
+ }
+
+ snprintf (key, sizeof (key), "statistics_strt_time-%d-%d-%"PRIu64,
+ xl_id, child, count);
+ ret = dict_set_dynstr (output, key, start_time_str);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Could not add statistics_crawl_start_time to outout");
+ goto out;
+ } else {
+ start_time_str = NULL;
+ }
+
+ if (!end_time_str)
+ progress = 1;
+ else
+ progress = 0;
+
+ snprintf (key, sizeof (key), "statistics_end_time-%d-%d-%"PRIu64,
+ xl_id, child, count);
+ if (!end_time_str)
+ end_time_str = gf_strdup ("Could not determine the end time");
+ ret = dict_set_dynstr (output, key, end_time_str);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Could not add statistics_crawl_end_time to outout");
+ goto out;
+ } else {
+ end_time_str = NULL;
+ }
+
+ snprintf (key, sizeof (key), "statistics_inprogress-%d-%d-%"PRIu64,
+ xl_id, child, count);
+
+ ret = dict_set_int32 (output, key, progress);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Could not add statistics_inprogress to outout");
+ goto out;
+ }
+
+ snprintf (key, sizeof (key), "statistics-%d-%d-count", xl_id, child);
+ ret = dict_set_uint64 (output, key, count + 1);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Could not increment the counter.");
+ goto out;
+ }
+out:
+ GF_FREE (start_time_str);
+ GF_FREE (end_time_str);
+ return ret;
+}
+
+
+int
+afr_shd_dict_add_path (xlator_t *this, dict_t *output, int child, char *path,
+ struct timeval *tv)
+{
+ int ret = -1;
+ uint64_t count = 0;
+ char key[256] = {0};
+ int xl_id = 0;
+
+ ret = dict_get_int32 (output, this->name, &xl_id);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "xl does not have id");
+ goto out;
+ }
+
+ snprintf (key, sizeof (key), "%d-%d-count", xl_id, child);
+ ret = dict_get_uint64 (output, key, &count);
+
+ snprintf (key, sizeof (key), "%d-%d-%"PRIu64, xl_id, child, count);
+ ret = dict_set_dynstr (output, key, path);
+
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "%s: Could not add to output",
+ path);
+ goto out;
+ }
+
+ if (tv) {
+ snprintf (key, sizeof (key), "%d-%d-%"PRIu64"-time", xl_id,
+ child, count);
+ ret = dict_set_uint32 (output, key, tv->tv_sec);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "%s: Could not set time",
+ path);
+ goto out;
+ }
+ }
+
+ snprintf (key, sizeof (key), "%d-%d-count", xl_id, child);
+
+ ret = dict_set_uint64 (output, key, count + 1);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Could not increment count");
+ goto out;
+ }
+
+ ret = 0;
+out:
+ return ret;
+}
+
+
+int
+afr_shd_gfid_to_path (xlator_t *this, xlator_t *subvol, uuid_t gfid, char **path_p)
+{
+ loc_t loc = {0,};
+ char *path = NULL;
+ dict_t *xattr = NULL;
+ int ret = 0;
+
+ uuid_copy (loc.gfid, gfid);
+ loc.inode = inode_new (this->itable);
+
+ ret = syncop_getxattr (subvol, &loc, &xattr, GFID_TO_PATH_KEY);
+ loc_wipe (&loc);
+ if (ret)
+ return ret;
+
+ ret = dict_get_str (xattr, GFID_TO_PATH_KEY, &path);
+ if (ret || !path)
+ return -EINVAL;
+
+ *path_p = gf_strdup (path);
+ if (!*path_p)
+ return -ENOMEM;
+ return 0;
+}
+
+
+int
+afr_shd_gather_index_entries (xlator_t *this, int child, dict_t *output)
+{
+ fd_t *fd = NULL;
+ xlator_t *subvol = NULL;
+ afr_private_t *priv = NULL;
+ off_t offset = 0;
+ gf_dirent_t entries;
+ gf_dirent_t *entry = NULL;
+ uuid_t gfid;
+ int ret = 0;
+ int count = 0;
+ char *path = NULL;
+
+ priv = this->private;
+ subvol = priv->children[child];
+
+ fd = afr_shd_index_opendir (this, child);
+ if (!fd) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "unable to opendir index-dir on %s", subvol->name);
+ return -errno;
+ }
+
+ INIT_LIST_HEAD (&entries.list);
+
+ while ((ret = syncop_readdir (subvol, fd, 131072, offset, &entries))) {
+ if (ret > 0)
+ ret = 0;
+ list_for_each_entry (entry, &entries.list, list) {
+ offset = entry->d_off;
+
+ if (!strcmp (entry->d_name, ".") ||
+ !strcmp (entry->d_name, ".."))
+ continue;
+
+ gf_log (this->name, GF_LOG_DEBUG, "got entry: %s",
+ entry->d_name);
+
+ ret = uuid_parse (entry->d_name, gfid);
+ if (ret)
+ continue;
+
+ path = NULL;
+ ret = afr_shd_gfid_to_path (this, subvol, gfid, &path);
+
+ if (ret == -ENOENT || ret == -ESTALE) {
+ afr_shd_index_purge (subvol, fd->inode,
+ entry->d_name);
+ ret = 0;
+ continue;
+ }
+
+ ret = afr_shd_dict_add_path (this, output, child, path,
+ NULL);
+ }
+
+ gf_dirent_free (&entries);
+ if (ret)
+ break;
+ }
+
+ if (fd)
+ fd_unref (fd);
+ if (!ret)
+ ret = count;
+ return ret;
+}
+
+
+int
+afr_add_shd_event (circular_buffer_t *cb, void *data)
+{
+ dict_t *output = NULL;
+ xlator_t *this = THIS;
+ afr_private_t *priv = NULL;
+ afr_self_heald_t *shd = NULL;
+ shd_event_t *shd_event = NULL;
+ char *path = NULL;
+
+ output = data;
+ priv = this->private;
+ shd = &priv->shd;
+ shd_event = cb->data;
+
+ if (!shd->index_healers[shd_event->child].local)
+ return 0;
+
+ path = gf_strdup (shd_event->path);
+ if (!path)
+ return -ENOMEM;
+
+ afr_shd_dict_add_path (this, output, shd_event->child, path,
+ &cb->tv);
+ return 0;
+}
+
+int
+afr_add_crawl_event (circular_buffer_t *cb, void *data)
+{
+ dict_t *output = NULL;
+ xlator_t *this = THIS;
+ afr_private_t *priv = NULL;
+ afr_self_heald_t *shd = NULL;
+ crawl_event_t *crawl_event = NULL;
+
+ output = data;
+ priv = this->private;
+ shd = &priv->shd;
+ crawl_event = cb->data;
+
+ if (!shd->index_healers[crawl_event->child].local)
+ return 0;
+
+ afr_shd_dict_add_crawl_event (this, output, crawl_event);
+
+ return 0;
+}
+
+
+int
+afr_selfheal_daemon_init (xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ afr_self_heald_t *shd = NULL;
+ int ret = -1;
+ int i = 0;
+
+ priv = this->private;
+ shd = &priv->shd;
+
+ this->itable = inode_table_new (SHD_INODE_LRU_LIMIT, this);
+ if (!this->itable)
+ goto out;
+
+ shd->index_healers = GF_CALLOC (sizeof(*shd->index_healers),
+ priv->child_count,
+ gf_afr_mt_subvol_healer_t);
+ if (!shd->index_healers)
+ goto out;
+
+ for (i = 0; i < priv->child_count; i++) {
+ shd->index_healers[i].subvol = i;
+ ret = afr_shd_healer_init (this, &shd->index_healers[i]);
+ if (ret)
+ goto out;
+ }
+
+ shd->full_healers = GF_CALLOC (sizeof(*shd->full_healers),
+ priv->child_count,
+ gf_afr_mt_subvol_healer_t);
+ if (!shd->full_healers)
+ goto out;
+ for (i = 0; i < priv->child_count; i++) {
+ shd->full_healers[i].subvol = i;
+ ret = afr_shd_healer_init (this, &shd->full_healers[i]);
+ if (ret)
+ goto out;
+ }
+
+ shd->healed = eh_new (AFR_EH_HEALED_LIMIT, _gf_false,
+ afr_destroy_shd_event_data);
+ if (!shd->healed)
+ goto out;
+
+ shd->heal_failed = eh_new (AFR_EH_HEAL_FAIL_LIMIT, _gf_false,
+ afr_destroy_shd_event_data);
+ if (!shd->heal_failed)
+ goto out;
+
+ shd->split_brain = eh_new (AFR_EH_SPLIT_BRAIN_LIMIT, _gf_false,
+ afr_destroy_shd_event_data);
+ if (!shd->split_brain)
+ goto out;
+
+ shd->statistics = GF_CALLOC (sizeof(eh_t *), priv->child_count,
+ gf_common_mt_eh_t);
+ if (!shd->statistics)
+ goto out;
+
+ for (i = 0; i < priv->child_count ; i++) {
+ shd->statistics[i] = eh_new (AFR_STATISTICS_HISTORY_SIZE,
+ _gf_false,
+ afr_destroy_crawl_event_data);
+ if (!shd->statistics[i])
+ goto out;
+ shd->full_healers[i].crawl_event.child = i;
+ shd->full_healers[i].crawl_event.crawl_type = "FULL";
+ shd->index_healers[i].crawl_event.child = i;
+ shd->index_healers[i].crawl_event.crawl_type = "INDEX";
+ }
+
+ ret = 0;
+out:
+ return ret;
+}
+
+
+int
+afr_selfheal_childup (xlator_t *this, int subvol)
+{
+ afr_shd_index_healer_spawn (this, subvol);
+
+ return 0;
+}
+
+
+int64_t
+afr_shd_get_index_count (xlator_t *this, int i)
+{
+ afr_private_t *priv = NULL;
+ xlator_t *subvol = NULL;
+ uint64_t count = 0;
+ loc_t rootloc = {0, };
+ dict_t *xattr = NULL;
+ int ret = -1;
+
+ priv = this->private;
+ subvol = priv->children[i];
+
+ rootloc.inode = inode_ref (this->itable->root);
+ uuid_copy (rootloc.gfid, rootloc.inode->gfid);
+
+ ret = syncop_getxattr (subvol, &rootloc, &xattr,
+ GF_XATTROP_INDEX_COUNT);
+ loc_wipe (&rootloc);
+
+ if (ret < 0)
+ return -1;
+
+ ret = dict_get_uint64 (xattr, GF_XATTROP_INDEX_COUNT, &count);
+ if (ret)
+ return -1;
+ return count;
+}
+
+
+int
+afr_xl_op (xlator_t *this, dict_t *input, dict_t *output)
+{
+ gf_xl_afr_op_t op = GF_AFR_OP_INVALID;
+ int ret = 0;
+ int xl_id = 0;
+ afr_private_t *priv = NULL;
+ afr_self_heald_t *shd = NULL;
+ struct subvol_healer *healer = NULL;
+ int i = 0;
+ char key[64];
+ int op_ret = 0;
+ int64_t cnt = 0;
+
+ priv = this->private;
+ shd = &priv->shd;
+
+ for (i = 0; i < priv->child_count; i++)
+ if (priv->child_up[i] == -1)
+ goto out;
+
+ ret = dict_get_int32 (input, "xl-op", (int32_t*)&op);
+ if (ret)
+ goto out;
+ ret = dict_get_int32 (input, this->name, &xl_id);
+ if (ret)
+ goto out;
+ ret = dict_set_int32 (output, this->name, xl_id);
+ if (ret)
+ goto out;
+ switch (op) {
+ case GF_AFR_OP_HEAL_INDEX:
+ op_ret = -1;
+
+ for (i = 0; i < priv->child_count; i++) {
+ healer = &shd->index_healers[i];
+ snprintf (key, 64, "%d-%d-status", xl_id, i);
+
+ if (!priv->child_up[i]) {
+ ret = dict_set_str (output, key,
+ "Brick is not connected");
+ } else if (AFR_COUNT (priv->child_up,
+ priv->child_count) < 2) {
+ ret = dict_set_str (output, key,
+ "< 2 bricks in replica are up");
+ } else if (!afr_shd_is_subvol_local (this, healer->subvol)) {
+ ret = dict_set_str (output, key,
+ "Brick is remote");
+ } else {
+ ret = dict_set_str (output, key,
+ "Started self-heal");
+ afr_shd_index_healer_spawn (this, i);
+ op_ret = 0;
+ }
+ }
+ break;
+ case GF_AFR_OP_HEAL_FULL:
+ op_ret = -1;
+
+ for (i = 0; i < priv->child_count; i++) {
+ healer = &shd->full_healers[i];
+ snprintf (key, 64, "%d-%d-status", xl_id, i);
+
+ if (!priv->child_up[i]) {
+ ret = dict_set_str (output, key,
+ "Brick is not connected");
+ } else if (AFR_COUNT (priv->child_up,
+ priv->child_count) < 2) {
+ ret = dict_set_str (output, key,
+ "< 2 bricks in replica are up");
+ } else if (!afr_shd_is_subvol_local (this, healer->subvol)) {
+ ret = dict_set_str (output, key,
+ "Brick is remote");
+ } else {
+ ret = dict_set_str (output, key,
+ "Started self-heal");
+ afr_shd_full_healer_spawn (this, i);
+ op_ret = 0;
+ }
+ }
+ break;
+ case GF_AFR_OP_INDEX_SUMMARY:
+ for (i = 0; i < priv->child_count; i++)
+ if (shd->index_healers[i].local)
+ afr_shd_gather_index_entries (this, i, output);
+ break;
+ case GF_AFR_OP_HEALED_FILES:
+ eh_dump (shd->healed, output, afr_add_shd_event);
+ break;
+ case GF_AFR_OP_HEAL_FAILED_FILES:
+ eh_dump (shd->heal_failed, output, afr_add_shd_event);
+ break;
+ case GF_AFR_OP_SPLIT_BRAIN_FILES:
+ eh_dump (shd->split_brain, output, afr_add_shd_event);
+ break;
+ case GF_AFR_OP_STATISTICS:
+ for (i = 0; i < priv->child_count; i++) {
+ eh_dump (shd->statistics[i], output,
+ afr_add_crawl_event);
+ afr_shd_dict_add_crawl_event (this, output,
+ &shd->index_healers[i].crawl_event);
+ afr_shd_dict_add_crawl_event (this, output,
+ &shd->full_healers[i].crawl_event);
+ }
+ break;
+ case GF_AFR_OP_STATISTICS_HEAL_COUNT:
+ case GF_AFR_OP_STATISTICS_HEAL_COUNT_PER_REPLICA:
+ op_ret = -1;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!priv->child_up[i]) {
+ snprintf (key, 64, "%d-%d-status", xl_id, i);
+ ret = dict_set_str (output, key,
+ "Brick is not connected");
+ } else {
+ snprintf (key, 64, "%d-%d-hardlinks", xl_id, i);
+ cnt = afr_shd_get_index_count (this, i);
+ if (cnt >= 0) {
+ ret = dict_set_uint64 (output, key, cnt);
+ }
+ op_ret = 0;
+ }
+ }
+
+// ret = _do_crawl_op_on_local_subvols (this, INDEX_TO_BE_HEALED,
+// STATISTICS_TO_BE_HEALED,
+// output);
+ break;
+
+ default:
+ gf_log (this->name, GF_LOG_ERROR, "Unknown set op %d", op);
+ break;
+ }
+out:
+ dict_del (output, this->name);
+ return op_ret;
+}
diff --git a/xlators/cluster/afr/src/afr-self-heald.h b/xlators/cluster/afr/src/afr-self-heald.h
new file mode 100644
index 000000000..10e229ee7
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-self-heald.h
@@ -0,0 +1,72 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+
+#ifndef _AFR_SELF_HEALD_H
+#define _AFR_SELF_HEALD_H
+
+#include <pthread.h>
+
+
+typedef struct {
+ int child;
+ char *path;
+} shd_event_t;
+
+typedef struct {
+ int child;
+ uint64_t healed_count;
+ uint64_t split_brain_count;
+ uint64_t heal_failed_count;
+
+ /* If start_time is 0, it means crawler is not in progress
+ and stats are not valid */
+ time_t start_time;
+ /* If start_time is NOT 0 and end_time is 0, it means
+ cralwer is in progress */
+ time_t end_time;
+ char *crawl_type;
+} crawl_event_t;
+
+struct subvol_healer {
+ xlator_t *this;
+ int subvol;
+ gf_boolean_t local;
+ gf_boolean_t running;
+ gf_boolean_t rerun;
+ crawl_event_t crawl_event;
+ pthread_mutex_t mutex;
+ pthread_cond_t cond;
+ pthread_t thread;
+};
+
+typedef struct {
+ gf_boolean_t iamshd;
+ gf_boolean_t enabled;
+ struct subvol_healer *index_healers;
+ struct subvol_healer *full_healers;
+
+ eh_t *healed;
+ eh_t *heal_failed;
+ eh_t *split_brain;
+ eh_t **statistics;
+} afr_self_heald_t;
+
+
+int
+afr_selfheal_childup (xlator_t *this, int subvol);
+
+int
+afr_selfheal_daemon_init (xlator_t *this);
+
+int
+afr_xl_op (xlator_t *this, dict_t *input, dict_t *output);
+
+#endif /* !_AFR_SELF_HEALD_H */
diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c
index d779bacc3..205ff759e 100644
--- a/xlators/cluster/afr/src/afr-transaction.c
+++ b/xlators/cluster/afr/src/afr-transaction.c
@@ -1,1117 +1,1579 @@
/*
- Copyright (c) 2007-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
#include "dict.h"
#include "byte-order.h"
+#include "common-utils.h"
+#include "timer.h"
#include "afr.h"
#include "afr-transaction.h"
#include <signal.h>
+gf_boolean_t
+afr_changelog_pre_op_uninherit (call_frame_t *frame, xlator_t *this);
-static void
-afr_pid_save (call_frame_t *frame)
-{
- afr_local_t * local = NULL;
-
- local = frame->local;
+gf_boolean_t
+afr_changelog_pre_op_update (call_frame_t *frame, xlator_t *this);
- local->saved_pid = frame->root->pid;
-}
+int
+afr_changelog_do (call_frame_t *frame, xlator_t *this, dict_t *xattr,
+ afr_changelog_resume_t changelog_resume);
-static void
-afr_pid_restore (call_frame_t *frame)
+int
+__afr_txn_write_fop (call_frame_t *frame, xlator_t *this)
{
- afr_local_t * local = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int call_count = -1;
+ int i = 0;
local = frame->local;
+ priv = this->private;
- frame->root->pid = local->saved_pid;
-}
+ call_count = AFR_COUNT (local->transaction.pre_op, priv->child_count);
+ if (call_count == 0) {
+ local->transaction.resume (frame, this);
+ return 0;
+ }
-static void
-__mark_all_pending (int32_t *pending[], int child_count,
- afr_transaction_type type)
-{
- int i;
- int j;
+ local->call_count = call_count;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->transaction.pre_op[i]) {
+ local->transaction.wind (frame, this, i);
- for (i = 0; i < child_count; i++) {
- j = afr_index_for_transaction_type (type);
- pending[i][j] = hton32 (1);
+ if (!--call_count)
+ break;
+ }
}
+
+ return 0;
}
-static void
-__mark_child_dead (int32_t *pending[], int child_count, int child,
- afr_transaction_type type)
+int
+__afr_txn_write_done (call_frame_t *frame, xlator_t *this)
{
- int j;
+ afr_local_t *local = NULL;
- j = afr_index_for_transaction_type (type);
-
- pending[child][j] = 0;
-}
+ local = frame->local;
+ local->transaction.unwind (frame, this);
-static void
-__mark_fop_failed_on_fd (fd_t *fd, xlator_t *this,
- int child_index)
-{
- uint64_t ctx;
- afr_fd_ctx_t * fd_ctx = NULL;
+ AFR_STACK_DESTROY (frame);
- int ret = 0;
+ return 0;
+}
- ret = fd_ctx_get (fd, this, &ctx);
- if (ret < 0)
- goto out;
+call_frame_t*
+afr_transaction_detach_fop_frame (call_frame_t *frame)
+{
+ afr_local_t * local = NULL;
+ call_frame_t *fop_frame = NULL;
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
+ local = frame->local;
- fd_ctx->child_failed[child_index] = 1;
-out:
- return;
+ LOCK (&frame->lock);
+ {
+ fop_frame = local->transaction.main_frame;
+ local->transaction.main_frame = NULL;
+ }
+ UNLOCK (&frame->lock);
+
+ return fop_frame;
}
static void
-__mark_failed_children (int32_t *pending[], int child_count,
- xlator_t *this, fd_t *fd, afr_transaction_type type)
+afr_save_lk_owner (call_frame_t *frame)
{
- uint64_t ctx;
- afr_fd_ctx_t * fd_ctx = NULL;
+ afr_local_t * local = NULL;
- int ret = 0;
- int i = 0;
- int j = 0;
+ local = frame->local;
- ret = fd_ctx_get (fd, this, &ctx);
+ local->saved_lk_owner = frame->root->lk_owner;
+}
- if (ret < 0)
- goto out;
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
+static void
+afr_restore_lk_owner (call_frame_t *frame)
+{
+ afr_local_t * local = NULL;
- for (i = 0; i < child_count; i++) {
- j = afr_index_for_transaction_type (type);
+ local = frame->local;
- if (fd_ctx->child_failed[i])
- pending[i][j] = 0;
- }
-
-out:
- return;
+ frame->root->lk_owner = local->saved_lk_owner;
}
-
-static void
-__mark_down_children (int32_t *pending[], int child_count,
- unsigned char *child_up, afr_transaction_type type)
+void
+__mark_all_success (call_frame_t *frame, xlator_t *this)
{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
int i;
- int j;
- for (i = 0; i < child_count; i++) {
- j = afr_index_for_transaction_type (type);
+ local = frame->local;
+ priv = this->private;
- if (!child_up[i])
- pending[i][j] = 0;
- }
+ for (i = 0; i < priv->child_count; i++) {
+ local->transaction.failed_subvols[i] = 0;
+ }
}
-static void
-__mark_all_success (int32_t *pending[], int child_count,
- afr_transaction_type type)
+int
+afr_transaction_perform_fop (call_frame_t *frame, xlator_t *this)
{
- int i;
- int j;
+ afr_local_t *local = NULL;
+ fd_t *fd = NULL;
- for (i = 0; i < child_count; i++) {
- j = afr_index_for_transaction_type (type);
- pending[i][j] = hton32 (-1);
- }
+ local = frame->local;
+ fd = local->fd;
+
+ /* Perform fops with the lk-owner from top xlator.
+ * Eg: lk-owner of posix-lk and flush should be same,
+ * flush cant clear the posix-lks without that lk-owner.
+ */
+ afr_save_lk_owner (frame);
+ frame->root->lk_owner =
+ local->transaction.main_frame->root->lk_owner;
+
+ if (local->pre_op_compat)
+ /* old mode, pre-op was done as afr_changelog_do()
+ just now, before OP */
+ afr_changelog_pre_op_update (frame, this);
+
+ /* The wake up needs to happen independent of
+ what type of fop arrives here. If it was
+ a write, then it has already inherited the
+ lock and changelog. If it was not a write,
+ then the presumption of the optimization (of
+ optimizing for successive write operations)
+ fails.
+ */
+ if (fd)
+ afr_delayed_changelog_wake_up (this, fd);
+ local->transaction.fop (frame, this);
+
+ return 0;
}
static int
-__is_first_write_on_fd (xlator_t *this, fd_t *fd)
+__changelog_enabled (afr_private_t *priv, afr_transaction_type type)
{
- int op_ret = 0;
- int _ret = -1;
+ int ret = 0;
- uint64_t ctx;
- afr_fd_ctx_t * fd_ctx = NULL;
+ switch (type) {
+ case AFR_DATA_TRANSACTION:
+ if (priv->data_change_log)
+ ret = 1;
- LOCK (&fd->lock);
- {
- _ret = __fd_ctx_get (fd, this, &ctx);
-
- if (_ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "could not get fd ctx on fd=%p",
- fd);
- goto out;
- }
+ break;
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
+ case AFR_METADATA_TRANSACTION:
+ if (priv->metadata_change_log)
+ ret = 1;
- if (fd_ctx->pre_op_done == 0) {
- fd_ctx->pre_op_done = 1;
- op_ret = 1;
- }
+ break;
+
+ case AFR_ENTRY_TRANSACTION:
+ case AFR_ENTRY_RENAME_TRANSACTION:
+ if (priv->entry_change_log)
+ ret = 1;
+
+ break;
}
-out:
- UNLOCK (&fd->lock);
- return op_ret;
+ return ret;
}
static int
-__if_fd_pre_op_done (xlator_t *this, fd_t *fd)
+__fop_changelog_needed (call_frame_t *frame, xlator_t *this)
{
+ afr_private_t * priv = NULL;
+ afr_local_t * local = NULL;
int op_ret = 0;
- int _ret = -1;
+ afr_transaction_type type = -1;
- uint64_t ctx;
- afr_fd_ctx_t * fd_ctx = NULL;
+ priv = this->private;
+ local = frame->local;
+ type = local->transaction.type;
- LOCK (&fd->lock);
- {
- _ret = __fd_ctx_get (fd, this, &ctx);
+ if (__changelog_enabled (priv, type)) {
+ switch (local->op) {
- if (_ret < 0) {
- goto out;
- }
+ case GF_FOP_WRITE:
+ case GF_FOP_FTRUNCATE:
+ op_ret = 1;
+ break;
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
+ case GF_FOP_FLUSH:
+ op_ret = 0;
+ break;
- if (fd_ctx->pre_op_done) {
- fd_ctx->pre_op_done = 0;
+ default:
op_ret = 1;
}
}
-out:
- UNLOCK (&fd->lock);
return op_ret;
}
-static int
-__changelog_enabled (afr_private_t *priv, afr_transaction_type type)
+int
+afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int **pending)
{
- int ret = 0;
+ int i = 0;
+ int ret = 0;
+ int pending_zero[AFR_NUM_CHANGE_LOGS] = {0, };
- switch (type) {
- case AFR_DATA_TRANSACTION:
- if (priv->data_change_log)
- ret = 1;
-
- break;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!memcmp (pending_zero, pending[i], sizeof (pending_zero)))
+ /* don't set xattrs for non-pending servers */
+ continue;
- case AFR_METADATA_TRANSACTION:
- if (priv->metadata_change_log)
- ret = 1;
+ ret = dict_set_static_bin (xattr, priv->pending_key[i],
+ pending[i],
+ AFR_NUM_CHANGE_LOGS * sizeof (int));
+ /* 3 = data+metadata+entry */
- break;
+ if (ret)
+ break;
+ }
- case AFR_ENTRY_TRANSACTION:
- case AFR_ENTRY_RENAME_TRANSACTION:
- if (priv->entry_change_log)
- ret = 1;
+ return ret;
+}
- break;
-
- case AFR_FLUSH_TRANSACTION:
- ret = 1;
- }
+int
+afr_lock_server_count (afr_private_t *priv, afr_transaction_type type)
+{
+ int ret = 0;
- return ret;
-}
+ switch (type) {
+ case AFR_DATA_TRANSACTION:
+ ret = priv->child_count;
+ break;
+ case AFR_METADATA_TRANSACTION:
+ ret = priv->child_count;
+ break;
-static int
-__changelog_needed_pre_op (call_frame_t *frame, xlator_t *this)
-{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- fd_t * fd = NULL;
+ case AFR_ENTRY_TRANSACTION:
+ case AFR_ENTRY_RENAME_TRANSACTION:
+ ret = priv->child_count;
+ break;
+ }
- int op_ret = 0;
+ return ret;
+}
- priv = this->private;
- local = frame->local;
-
- if (__changelog_enabled (priv, local->transaction.type)) {
- switch (local->op) {
-
- case GF_FOP_WRITE:
- case GF_FOP_FTRUNCATE:
- /*
- if it's a data transaction, we write the changelog
- only on the first write on an fd
- */
-
- fd = local->fd;
- if (!fd || __is_first_write_on_fd (this, fd))
- op_ret = 1;
+/* {{{ pending */
- break;
- case GF_FOP_FLUSH:
- /* only do post-op on flush() */
+int
+afr_changelog_post_op_done (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ afr_internal_lock_t *int_lock = NULL;
- op_ret = 0;
- break;
+ local = frame->local;
+ priv = this->private;
+ int_lock = &local->internal_lock;
- default:
- op_ret = 1;
- }
+ if (local->transaction.resume_stub) {
+ call_resume (local->transaction.resume_stub);
+ local->transaction.resume_stub = NULL;
+ }
+
+ if (afr_lock_server_count (priv, local->transaction.type) == 0) {
+ local->transaction.done (frame, this);
+ } else {
+ int_lock->lock_cbk = local->transaction.done;
+ afr_unlock (frame, this);
}
- return op_ret;
+ return 0;
}
-static int
-__changelog_needed_post_op (call_frame_t *frame, xlator_t *this)
+afr_inodelk_t*
+afr_get_inodelk (afr_internal_lock_t *int_lock, char *dom)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
+ afr_inodelk_t *inodelk = NULL;
+ int i = 0;
- int op_ret = 0;
- afr_transaction_type type = -1;
+ for (i = 0; int_lock->inodelk[i].domain; i++) {
+ inodelk = &int_lock->inodelk[i];
+ if (strcmp (dom, inodelk->domain) == 0)
+ return inodelk;
+ }
+ return NULL;
+}
- priv = this->private;
- local = frame->local;
- type = local->transaction.type;
+unsigned char*
+afr_locked_nodes_get (afr_transaction_type type, afr_internal_lock_t *int_lock)
+{
+ unsigned char *locked_nodes = NULL;
+ afr_inodelk_t *inodelk = NULL;
+ switch (type) {
+ case AFR_DATA_TRANSACTION:
+ case AFR_METADATA_TRANSACTION:
+ inodelk = afr_get_inodelk (int_lock, int_lock->domain);
+ locked_nodes = inodelk->locked_nodes;
+ break;
+
+ case AFR_ENTRY_TRANSACTION:
+ case AFR_ENTRY_RENAME_TRANSACTION:
+ /*Because same set of subvols participate in all lockee
+ * entities*/
+ locked_nodes = int_lock->lockee[0].locked_nodes;
+ break;
+ }
+ return locked_nodes;
+}
- if (__changelog_enabled (priv, type)) {
- switch (local->op) {
- case GF_FOP_WRITE:
- case GF_FOP_FTRUNCATE:
- op_ret = 0;
- break;
+int
+afr_changelog_call_count (afr_transaction_type type,
+ unsigned char *pre_op_subvols,
+ unsigned int child_count)
+{
+ int call_count = 0;
- case GF_FOP_FLUSH:
- op_ret = __if_fd_pre_op_done (this, local->fd);
- break;
+ call_count = AFR_COUNT(pre_op_subvols, child_count);
- default:
- op_ret = 1;
- }
- }
+ if (type == AFR_ENTRY_RENAME_TRANSACTION)
+ call_count *= 2;
- return op_ret;
+ return call_count;
}
-static int
-afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending)
+gf_boolean_t
+afr_txn_nothing_failed (call_frame_t *frame, xlator_t *this)
{
- int i;
- int ret = 0;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int i = 0;
+
+ local = frame->local;
+ priv = this->private;
for (i = 0; i < priv->child_count; i++) {
- ret = dict_set_static_bin (xattr, priv->pending_key[i],
- pending[i], 3 * sizeof (int32_t));
- /* 3 = data+metadata+entry */
-
- if (ret < 0)
- goto out;
+ if (local->transaction.failed_subvols[i])
+ return _gf_false;
}
-out:
- return ret;
+ return _gf_true;
}
-static int
-afr_lock_server_count (afr_private_t *priv, afr_transaction_type type)
+void
+afr_handle_symmetric_errors (call_frame_t *frame, xlator_t *this)
{
- int ret = 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int op_errno = 0;
+ int i_errno = 0;
+ gf_boolean_t matching_errors = _gf_true;
+ int i = 0;
- switch (type) {
- case AFR_FLUSH_TRANSACTION:
- case AFR_DATA_TRANSACTION:
- ret = priv->data_lock_server_count;
- break;
+ priv = this->private;
+ local = frame->local;
- case AFR_METADATA_TRANSACTION:
- ret = priv->metadata_lock_server_count;
- break;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].valid)
+ continue;
+ if (local->replies[i].op_ret != -1) {
+ /* Operation succeeded on at least on subvol,
+ so it is not a failed-everywhere situation.
+ */
+ matching_errors = _gf_false;
+ break;
+ }
+ i_errno = local->replies[i].op_errno;
+
+ if (i_errno == ENOTCONN) {
+ /* ENOTCONN is not a symmetric error. We do not
+ know if the operation was performed on the
+ backend or not.
+ */
+ matching_errors = _gf_false;
+ break;
+ }
- case AFR_ENTRY_TRANSACTION:
- case AFR_ENTRY_RENAME_TRANSACTION:
- ret = priv->entry_lock_server_count;
- break;
+ if (!op_errno) {
+ op_errno = i_errno;
+ } else if (op_errno != i_errno) {
+ /* Mismatching op_errno's */
+ matching_errors = _gf_false;
+ break;
+ }
}
- return ret;
+ if (matching_errors)
+ __mark_all_success (frame, this);
}
-/* {{{ unlock */
-
-int32_t
-afr_unlock_common_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
+int
+afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local;
- int call_count = 0;
+ afr_private_t * priv = this->private;
+ int i = 0;
+ int ret = 0;
+ int idx = 0;
+ afr_local_t * local = NULL;
+ dict_t *xattr = NULL;
+ int nothing_failed = 1;
+ gf_boolean_t need_undirty = _gf_false;
- local = frame->local;
+ local = frame->local;
+ idx = afr_index_for_transaction_type (local->transaction.type);
- LOCK (&frame->lock);
- {
- call_count = --local->call_count;
+ nothing_failed = afr_txn_nothing_failed (frame, this);
+
+ if (afr_changelog_pre_op_uninherit (frame, this))
+ need_undirty = _gf_false;
+ else
+ need_undirty = _gf_true;
+
+ if (nothing_failed && !need_undirty) {
+ afr_changelog_post_op_done (frame, this);
+ goto out;
}
- UNLOCK (&frame->lock);
- if (call_count == 0) {
- local->transaction.done (frame, this);
+ xattr = dict_new ();
+ if (!xattr) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ afr_changelog_post_op_done (frame, this);
+ goto out;
}
-
- return 0;
-}
+ if (need_undirty) {
+ local->dirty[idx] = hton32(-1);
-int
-afr_unlock (call_frame_t *frame, xlator_t *this)
-{
- struct flock flock;
+ ret = dict_set_static_bin (xattr, AFR_DIRTY, local->dirty,
+ sizeof(int) * AFR_NUM_CHANGE_LOGS);
+ if (ret) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ afr_changelog_post_op_done (frame, this);
+ goto out;
+ }
- int i = 0;
- int call_count = 0;
+ }
+ if (!nothing_failed) {
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->transaction.failed_subvols[i])
+ local->pending[i][idx] = hton32(1);
+ }
+ ret = afr_set_pending_dict (priv, xattr, local->pending);
+ if (ret < 0) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ afr_changelog_post_op_done (frame, this);
+ goto out;
+ }
+
+ }
+
+ afr_changelog_do (frame, this, xattr, afr_changelog_post_op_done);
+out:
+ if (xattr)
+ dict_unref (xattr);
+
+ return 0;
+}
+
+
+gf_boolean_t
+afr_changelog_pre_op_uninherit (call_frame_t *frame, xlator_t *this)
+{
afr_local_t *local = NULL;
- afr_private_t * priv = this->private;
+ afr_private_t *priv = NULL;
+ fd_t *fd = NULL;
+ int i = 0;
+ gf_boolean_t ret = _gf_false;
+ afr_fd_ctx_t *fd_ctx = NULL;
+ int type = 0;
local = frame->local;
+ priv = this->private;
+ fd = local->fd;
- /*
- pid has been restored to saved_pid in the fop,
- so set it back to frame->root
- */
+ type = afr_index_for_transaction_type (local->transaction.type);
+ if (type != AFR_DATA_TRANSACTION)
+ return !local->transaction.dirtied;
- frame->root->pid = (long) frame->root;
+ if (!fd)
+ return !local->transaction.dirtied;
- call_count = afr_locked_nodes_count (local->transaction.locked_nodes,
- priv->child_count);
-
- if (call_count == 0) {
- local->transaction.done (frame, this);
- return 0;
- }
+ fd_ctx = afr_fd_ctx_get (fd, this);
+ if (!fd_ctx)
+ return _gf_false;
- if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION)
- call_count *= 2;
-
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- flock.l_start = local->transaction.start;
- flock.l_len = local->transaction.len;
- flock.l_type = F_UNLCK;
-
- if (local->transaction.locked_nodes[i]) {
- switch (local->transaction.type) {
- case AFR_DATA_TRANSACTION:
- case AFR_METADATA_TRANSACTION:
- case AFR_FLUSH_TRANSACTION:
-
- if (local->fd) {
- STACK_WIND (frame, afr_unlock_common_cbk,
- priv->children[i],
- priv->children[i]->fops->finodelk,
- this->name, local->fd,
- F_SETLK, &flock);
- } else {
- STACK_WIND (frame, afr_unlock_common_cbk,
- priv->children[i],
- priv->children[i]->fops->inodelk,
- this->name, &local->loc,
- F_SETLK, &flock);
- }
-
- break;
-
- case AFR_ENTRY_RENAME_TRANSACTION:
-
- STACK_WIND (frame, afr_unlock_common_cbk,
- priv->children[i],
- priv->children[i]->fops->entrylk,
- this->name,
- &local->transaction.new_parent_loc,
- local->transaction.new_basename,
- ENTRYLK_UNLOCK, ENTRYLK_WRLCK);
-
- call_count--;
-
- /* fall through */
-
- case AFR_ENTRY_TRANSACTION:
- if (local->fd) {
- STACK_WIND (frame, afr_unlock_common_cbk,
- priv->children[i],
- priv->children[i]->fops->fentrylk,
- this->name, local->fd,
- local->transaction.basename,
- ENTRYLK_UNLOCK, ENTRYLK_WRLCK);
- } else {
- STACK_WIND (frame, afr_unlock_common_cbk,
- priv->children[i],
- priv->children[i]->fops->entrylk,
- this->name,
- &local->transaction.parent_loc,
- local->transaction.basename,
- ENTRYLK_UNLOCK, ENTRYLK_WRLCK);
+ if (local->transaction.no_uninherit)
+ return _gf_false;
- }
- break;
+ /* This function must be idempotent. So check if we
+ were called before and return the same answer again.
+
+ It is important to keep this function idempotent for
+ the call in afr_changelog_post_op_safe() to not have
+ side effects on the call from afr_changelog_post_op_now()
+ */
+ if (local->transaction.uninherit_done)
+ return local->transaction.uninherit_value;
+
+ LOCK(&fd->lock);
+ {
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->transaction.pre_op[i] !=
+ fd_ctx->pre_op_done[type][i]) {
+ ret = !local->transaction.dirtied;
+ goto unlock;
}
-
- if (!--call_count)
- break;
}
- }
- return 0;
-}
+ if (fd_ctx->inherited[type]) {
+ ret = _gf_true;
+ fd_ctx->inherited[type]--;
+ } else if (fd_ctx->on_disk[type]) {
+ ret = _gf_false;
+ fd_ctx->on_disk[type]--;
+ } else {
+ /* ASSERT */
+ ret = _gf_false;
+ }
-/* }}} */
+ if (!fd_ctx->inherited[type] && !fd_ctx->on_disk[type]) {
+ for (i = 0; i < priv->child_count; i++)
+ fd_ctx->pre_op_done[type][i] = 0;
+ }
+ }
+unlock:
+ UNLOCK(&fd->lock);
+ local->transaction.uninherit_done = _gf_true;
+ local->transaction.uninherit_value = ret;
-/* {{{ pending */
+ return ret;
+}
-int32_t
-afr_changelog_post_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xattr)
+
+gf_boolean_t
+afr_changelog_pre_op_inherit (call_frame_t *frame, xlator_t *this)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
-
- int call_count = -1;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ fd_t *fd = NULL;
+ int i = 0;
+ gf_boolean_t ret = _gf_false;
+ afr_fd_ctx_t *fd_ctx = NULL;
+ int type = 0;
- priv = this->private;
local = frame->local;
+ priv = this->private;
+ fd = local->fd;
- LOCK (&frame->lock);
+ if (local->transaction.type != AFR_DATA_TRANSACTION)
+ return _gf_false;
+
+ type = afr_index_for_transaction_type (local->transaction.type);
+
+ if (!fd)
+ return _gf_false;
+
+ fd_ctx = afr_fd_ctx_get (fd, this);
+ if (!fd_ctx)
+ return _gf_false;
+
+ LOCK(&fd->lock);
{
- call_count = --local->call_count;
+ if (!fd_ctx->on_disk[type]) {
+ /* nothing to inherit yet */
+ ret = _gf_false;
+ goto unlock;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->transaction.pre_op[i] !=
+ fd_ctx->pre_op_done[type][i]) {
+ /* either inherit exactly, or don't */
+ ret = _gf_false;
+ goto unlock;
+ }
+ }
+
+ fd_ctx->inherited[type]++;
+
+ ret = _gf_true;
+
+ local->transaction.inherited = _gf_true;
}
- UNLOCK (&frame->lock);
+unlock:
+ UNLOCK(&fd->lock);
- if (call_count == 0) {
- if (afr_lock_server_count (priv, local->transaction.type) == 0) {
- local->transaction.done (frame, this);
+ return ret;
+}
+
+
+gf_boolean_t
+afr_changelog_pre_op_update (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ fd_t *fd = NULL;
+ afr_fd_ctx_t *fd_ctx = NULL;
+ int i = 0;
+ gf_boolean_t ret = _gf_false;
+ int type = 0;
+
+ local = frame->local;
+ priv = this->private;
+ fd = local->fd;
+
+ if (!fd)
+ return _gf_false;
+
+ fd_ctx = afr_fd_ctx_get (fd, this);
+ if (!fd_ctx)
+ return _gf_false;
+
+ if (local->transaction.inherited)
+ /* was already inherited in afr_changelog_pre_op */
+ return _gf_false;
+
+ if (!local->transaction.dirtied)
+ return _gf_false;
+
+ if (!afr_txn_nothing_failed (frame, this))
+ return _gf_false;
+
+ type = afr_index_for_transaction_type (local->transaction.type);
+
+ ret = _gf_false;
+
+ LOCK(&fd->lock);
+ {
+ if (!fd_ctx->on_disk[type]) {
+ for (i = 0; i < priv->child_count; i++)
+ fd_ctx->pre_op_done[type][i] =
+ local->transaction.pre_op[i];
} else {
- afr_unlock (frame, this);
+ for (i = 0; i < priv->child_count; i++)
+ if (fd_ctx->pre_op_done[type][i] !=
+ local->transaction.pre_op[i]) {
+ local->transaction.no_uninherit = 1;
+ goto unlock;
+ }
}
+ fd_ctx->on_disk[type]++;
+
+ ret = _gf_true;
}
+unlock:
+ UNLOCK(&fd->lock);
- return 0;
+ return ret;
}
-int
-afr_changelog_post_op (call_frame_t *frame, xlator_t *this)
+int
+afr_changelog_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xattr, dict_t *xdata)
{
- afr_private_t * priv = this->private;
+ afr_local_t *local = NULL;
+ int call_count = -1;
- int ret = 0;
- int i = 0;
- int call_count = 0;
-
- afr_local_t * local = NULL;
- dict_t * xattr = dict_ref (get_new_dict ());
+ local = frame->local;
- local = frame->local;
+ if (op_ret == -1)
+ afr_transaction_fop_failed (frame, this, (long) cookie);
- __mark_down_children (local->pending, priv->child_count,
- local->child_up, local->transaction.type);
-
- if (local->op == GF_FOP_FLUSH) {
- __mark_failed_children (local->pending, priv->child_count,
- this, local->fd,
- local->transaction.type);
- }
+ call_count = afr_frame_return (frame);
- call_count = afr_up_children_count (priv->child_count, local->child_up);
+ if (call_count == 0)
+ local->transaction.changelog_resume (frame, this);
- if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) {
- call_count *= 2;
- }
+ return 0;
+}
- local->call_count = call_count;
+
+int
+afr_changelog_do (call_frame_t *frame, xlator_t *this, dict_t *xattr,
+ afr_changelog_resume_t changelog_resume)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ int call_count = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ call_count = afr_changelog_call_count (local->transaction.type,
+ local->transaction.pre_op,
+ priv->child_count);
if (call_count == 0) {
- /* no child is up */
- dict_unref (xattr);
- afr_unlock (frame, this);
+ changelog_resume (frame, this);
return 0;
}
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- ret = afr_set_pending_dict (priv, xattr,
- local->pending);
-
- if (ret < 0)
- gf_log (this->name, GF_LOG_ERROR,
- "failed to set pending entry");
-
-
- switch (local->transaction.type) {
- case AFR_DATA_TRANSACTION:
- case AFR_METADATA_TRANSACTION:
- case AFR_FLUSH_TRANSACTION:
- {
- if (local->fd)
- STACK_WIND (frame, afr_changelog_post_op_cbk,
- priv->children[i],
- priv->children[i]->fops->fxattrop,
- local->fd,
- GF_XATTROP_ADD_ARRAY, xattr);
- else
- STACK_WIND (frame, afr_changelog_post_op_cbk,
- priv->children[i],
- priv->children[i]->fops->xattrop,
- &local->loc,
- GF_XATTROP_ADD_ARRAY, xattr);
- }
- break;
+ local->call_count = call_count;
+
+ local->transaction.changelog_resume = changelog_resume;
- case AFR_ENTRY_RENAME_TRANSACTION:
- {
- STACK_WIND_COOKIE (frame, afr_changelog_post_op_cbk,
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->transaction.pre_op[i])
+ continue;
+
+ switch (local->transaction.type) {
+ case AFR_DATA_TRANSACTION:
+ case AFR_METADATA_TRANSACTION:
+ if (!local->fd) {
+ STACK_WIND_COOKIE (frame, afr_changelog_cbk,
(void *) (long) i,
priv->children[i],
priv->children[i]->fops->xattrop,
- &local->transaction.new_parent_loc,
- GF_XATTROP_ADD_ARRAY, xattr);
-
- call_count--;
- }
-
- /*
- set it again because previous stack_wind
- might have already returned (think of case
- where subvolume is posix) and would have
- used the dict as placeholder for return
- value
- */
-
- ret = afr_set_pending_dict (priv, xattr,
- local->pending);
-
- if (ret < 0)
- gf_log (this->name, GF_LOG_ERROR,
- "failed to set pending entry");
-
- /* fall through */
-
- case AFR_ENTRY_TRANSACTION:
- {
- if (local->fd)
- STACK_WIND (frame, afr_changelog_post_op_cbk,
- priv->children[i],
- priv->children[i]->fops->fxattrop,
- local->fd,
- GF_XATTROP_ADD_ARRAY, xattr);
- else
- STACK_WIND (frame, afr_changelog_post_op_cbk,
- priv->children[i],
- priv->children[i]->fops->xattrop,
- &local->transaction.parent_loc,
- GF_XATTROP_ADD_ARRAY, xattr);
- }
+ &local->loc,
+ GF_XATTROP_ADD_ARRAY, xattr,
+ NULL);
+ } else {
+ STACK_WIND_COOKIE (frame, afr_changelog_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->fxattrop,
+ local->fd,
+ GF_XATTROP_ADD_ARRAY, xattr,
+ NULL);
+ }
+ break;
+ case AFR_ENTRY_RENAME_TRANSACTION:
+
+ STACK_WIND_COOKIE (frame, afr_changelog_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->xattrop,
+ &local->transaction.new_parent_loc,
+ GF_XATTROP_ADD_ARRAY, xattr,
+ NULL);
+ call_count--;
+
+ /* fall through */
+
+ case AFR_ENTRY_TRANSACTION:
+ if (local->fd)
+ STACK_WIND_COOKIE (frame, afr_changelog_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->fxattrop,
+ local->fd,
+ GF_XATTROP_ADD_ARRAY, xattr,
+ NULL);
+ else
+ STACK_WIND_COOKIE (frame, afr_changelog_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->xattrop,
+ &local->transaction.parent_loc,
+ GF_XATTROP_ADD_ARRAY, xattr,
+ NULL);
break;
- }
-
- if (!--call_count)
- break;
}
- }
-
- dict_unref (xattr);
+
+ if (!--call_count)
+ break;
+ }
+
return 0;
}
-int32_t
-afr_changelog_pre_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xattr)
+int
+afr_changelog_pre_op (call_frame_t *frame, xlator_t *this)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = this->private;
- loc_t * loc = NULL;
+ afr_private_t * priv = this->private;
+ int i = 0;
+ int ret = 0;
+ int call_count = 0;
+ int op_errno = 0;
+ afr_local_t *local = NULL;
+ afr_internal_lock_t *int_lock = NULL;
+ unsigned char *locked_nodes = NULL;
+ unsigned char *pending_subvols = NULL;
+ int idx = -1;
+ gf_boolean_t pre_nop = _gf_true;
+ dict_t *xdata_req = NULL;
+
+ local = frame->local;
+ int_lock = &local->internal_lock;
+ idx = afr_index_for_transaction_type (local->transaction.type);
- int call_count = -1;
- int child_index = (long) cookie;
+ locked_nodes = afr_locked_nodes_get (local->transaction.type, int_lock);
- local = frame->local;
- loc = &local->loc;
+ pending_subvols = alloca0 (priv->child_count);
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- local->child_up[child_index] = 0;
-
- if (op_errno == ENOTSUP) {
- gf_log (this->name, GF_LOG_ERROR,
- "xattrop not supported by %s",
- priv->children[child_index]->name);
- local->op_ret = -1;
- } else if (!child_went_down (op_ret, op_errno)) {
- gf_log (this->name, GF_LOG_ERROR,
- "xattrop failed on child %s: %s",
- priv->children[child_index]->name,
- strerror (op_errno));
- }
- local->op_errno = op_errno;
+ for (i = 0; i < priv->child_count; i++) {
+ if (locked_nodes[i]) {
+ local->transaction.pre_op[i] = 1;
+ call_count++;
+ } else {
+ pending_subvols[i] = 1;
}
+ }
+
+ /* TBD: quorum check w/ call_count */
- call_count = --local->call_count;
+ if (call_count == 0) {
+ op_errno = ENOTCONN;
+ goto err;
}
- UNLOCK (&frame->lock);
- if (call_count == 0) {
- if ((local->op_ret == -1) &&
- (local->op_errno == ENOTSUP)) {
- local->transaction.resume (frame, this);
- } else {
- __mark_all_success (local->pending, priv->child_count,
- local->transaction.type);
+ xdata_req = dict_new();
+ if (!xdata_req) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ pre_nop = _gf_true;
- afr_pid_restore (frame);
+ if (afr_changelog_pre_op_inherit (frame, this))
+ goto next;
- local->transaction.fop (frame, this);
+ if (call_count < priv->child_count) {
+ /* For subvols we are not performing operation on,
+ mark them as pending up-front along with the FOP
+ so that we can safely defer unmarking dirty until
+ later.
+ */
+ for (i = 0; i < priv->child_count; i++) {
+ if (pending_subvols[i])
+ local->pending[i][idx] = hton32(1);
+ }
+ ret = afr_set_pending_dict (priv, xdata_req,
+ local->pending);
+ if (ret < 0) {
+ op_errno = ENOMEM;
+ goto err;
}
+ pre_nop = _gf_false;
}
- return 0;
+ if (call_count > 1 &&
+ (local->transaction.type == AFR_DATA_TRANSACTION ||
+ !local->optimistic_change_log)) {
+
+ /* If we are performing change on only one subvol, no
+ need to mark dirty, because we are setting the pending
+ counts already anyways
+ */
+ local->dirty[idx] = hton32(1);
+
+ ret = dict_set_static_bin (xdata_req, AFR_DIRTY, local->dirty,
+ sizeof(int) * AFR_NUM_CHANGE_LOGS);
+ if (ret) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ pre_nop = _gf_false;
+ local->transaction.dirtied = 1;
+ }
+
+ if (pre_nop)
+ goto next;
+
+ if (!local->pre_op_compat) {
+ dict_copy (xdata_req, local->xdata_req);
+ goto next;
+ }
+
+ afr_changelog_do (frame, this, xdata_req, afr_transaction_perform_fop);
+
+ if (xdata_req)
+ dict_unref (xdata_req);
+
+ return 0;
+next:
+ afr_transaction_perform_fop (frame, this);
+
+ if (xdata_req)
+ dict_unref (xdata_req);
+
+ return 0;
+err:
+ local->internal_lock.lock_cbk = local->transaction.done;
+ local->op_ret = -1;
+ local->op_errno = op_errno;
+
+ afr_unlock (frame, this);
+
+ if (xdata_req)
+ dict_unref (xdata_req);
+
+ return 0;
}
-int
-afr_changelog_pre_op (call_frame_t *frame, xlator_t *this)
+int
+afr_post_blocking_inodelk_cbk (call_frame_t *frame, xlator_t *this)
{
- afr_private_t * priv = this->private;
+ afr_internal_lock_t *int_lock = NULL;
+ afr_local_t *local = NULL;
- int i = 0;
- int ret = 0;
- int call_count = 0;
- dict_t *xattr = NULL;
+ local = frame->local;
+ int_lock = &local->internal_lock;
- afr_local_t *local = NULL;
+ if (int_lock->lock_op_ret < 0) {
+ gf_log (this->name, GF_LOG_INFO,
+ "Blocking inodelks failed.");
+ local->transaction.done (frame, this);
+ } else {
- local = frame->local;
- xattr = get_new_dict ();
- dict_ref (xattr);
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Blocking inodelks done. Proceeding to FOP");
+ afr_internal_lock_finish (frame, this);
+ }
- call_count = afr_up_children_count (priv->child_count,
- local->child_up);
+ return 0;
+}
- if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) {
- call_count *= 2;
- }
- if (call_count == 0) {
- /* no child is up */
- dict_unref (xattr);
- afr_unlock (frame, this);
- return 0;
- }
+int
+afr_post_nonblocking_inodelk_cbk (call_frame_t *frame, xlator_t *this)
+{
+ afr_internal_lock_t *int_lock = NULL;
+ afr_local_t *local = NULL;
+
+ local = frame->local;
+ int_lock = &local->internal_lock;
+
+ /* Initiate blocking locks if non-blocking has failed */
+ if (int_lock->lock_op_ret < 0) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Non blocking inodelks failed. Proceeding to blocking");
+ int_lock->lock_cbk = afr_post_blocking_inodelk_cbk;
+ afr_blocking_lock (frame, this);
+ } else {
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Non blocking inodelks done. Proceeding to FOP");
+ afr_internal_lock_finish (frame, this);
+ }
- local->call_count = call_count;
+ return 0;
+}
- __mark_all_pending (local->pending, priv->child_count,
- local->transaction.type);
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- ret = afr_set_pending_dict (priv, xattr,
- local->pending);
-
- if (ret < 0)
- gf_log (this->name, GF_LOG_ERROR,
- "failed to set pending entry");
-
-
- switch (local->transaction.type) {
- case AFR_DATA_TRANSACTION:
- case AFR_METADATA_TRANSACTION:
- case AFR_FLUSH_TRANSACTION:
- {
- if (local->fd)
- STACK_WIND_COOKIE (frame,
- afr_changelog_pre_op_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->fxattrop,
- local->fd,
- GF_XATTROP_ADD_ARRAY, xattr);
- else
- STACK_WIND_COOKIE (frame,
- afr_changelog_pre_op_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->xattrop,
- &(local->loc),
- GF_XATTROP_ADD_ARRAY, xattr);
- }
- break;
-
- case AFR_ENTRY_RENAME_TRANSACTION:
- {
- STACK_WIND_COOKIE (frame,
- afr_changelog_pre_op_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->xattrop,
- &local->transaction.new_parent_loc,
- GF_XATTROP_ADD_ARRAY, xattr);
+int
+afr_post_blocking_entrylk_cbk (call_frame_t *frame, xlator_t *this)
+{
+ afr_internal_lock_t *int_lock = NULL;
+ afr_local_t *local = NULL;
- call_count--;
- }
+ local = frame->local;
+ int_lock = &local->internal_lock;
+ if (int_lock->lock_op_ret < 0) {
+ gf_log (this->name, GF_LOG_INFO,
+ "Blocking entrylks failed.");
+ local->transaction.done (frame, this);
+ } else {
- /*
- set it again because previous stack_wind
- might have already returned (think of case
- where subvolume is posix) and would have
- used the dict as placeholder for return
- value
- */
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Blocking entrylks done. Proceeding to FOP");
+ afr_internal_lock_finish (frame, this);
+ }
- ret = afr_set_pending_dict (priv, xattr,
- local->pending);
-
- if (ret < 0)
- gf_log (this->name, GF_LOG_ERROR,
- "failed to set pending entry");
-
- /* fall through */
-
- case AFR_ENTRY_TRANSACTION:
- {
- if (local->fd)
- STACK_WIND_COOKIE (frame,
- afr_changelog_pre_op_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->fxattrop,
- local->fd,
- GF_XATTROP_ADD_ARRAY, xattr);
- else
- STACK_WIND_COOKIE (frame,
- afr_changelog_pre_op_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->xattrop,
- &local->transaction.parent_loc,
- GF_XATTROP_ADD_ARRAY, xattr);
- }
+ return 0;
+}
- break;
- }
- if (!--call_count)
- break;
- }
- }
+int
+afr_post_nonblocking_entrylk_cbk (call_frame_t *frame, xlator_t *this)
+{
+ afr_internal_lock_t *int_lock = NULL;
+ afr_local_t *local = NULL;
- dict_unref (xattr);
- return 0;
+ local = frame->local;
+ int_lock = &local->internal_lock;
+
+ /* Initiate blocking locks if non-blocking has failed */
+ if (int_lock->lock_op_ret < 0) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Non blocking entrylks failed. Proceeding to blocking");
+ int_lock->lock_cbk = afr_post_blocking_entrylk_cbk;
+ afr_blocking_lock (frame, this);
+ } else {
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Non blocking entrylks done. Proceeding to FOP");
+ afr_internal_lock_finish (frame, this);
+ }
+
+ return 0;
+}
+
+
+int
+afr_post_blocking_rename_cbk (call_frame_t *frame, xlator_t *this)
+{
+ afr_internal_lock_t *int_lock = NULL;
+ afr_local_t *local = NULL;
+
+ local = frame->local;
+ int_lock = &local->internal_lock;
+
+ if (int_lock->lock_op_ret < 0) {
+ gf_log (this->name, GF_LOG_INFO,
+ "Blocking entrylks failed.");
+ local->transaction.done (frame, this);
+ } else {
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Blocking entrylks done. Proceeding to FOP");
+ afr_internal_lock_finish (frame, this);
+ }
+ return 0;
+}
+
+
+int
+afr_post_lower_unlock_cbk (call_frame_t *frame, xlator_t *this)
+{
+ afr_internal_lock_t *int_lock = NULL;
+ afr_local_t *local = NULL;
+
+ local = frame->local;
+ int_lock = &local->internal_lock;
+
+ GF_ASSERT (!int_lock->higher_locked);
+
+ int_lock->lock_cbk = afr_post_blocking_rename_cbk;
+ afr_blocking_lock (frame, this);
+
+ return 0;
+}
+
+
+int
+afr_set_transaction_flock (afr_local_t *local)
+{
+ afr_internal_lock_t *int_lock = NULL;
+ afr_inodelk_t *inodelk = NULL;
+
+ int_lock = &local->internal_lock;
+ inodelk = afr_get_inodelk (int_lock, int_lock->domain);
+
+ inodelk->flock.l_len = local->transaction.len;
+ inodelk->flock.l_start = local->transaction.start;
+ inodelk->flock.l_type = F_WRLCK;
+
+ return 0;
+}
+
+int
+afr_lock_rec (call_frame_t *frame, xlator_t *this)
+{
+ afr_internal_lock_t *int_lock = NULL;
+ afr_local_t *local = NULL;
+
+ local = frame->local;
+ int_lock = &local->internal_lock;
+
+ int_lock->transaction_lk_type = AFR_TRANSACTION_LK;
+ int_lock->domain = this->name;
+
+ switch (local->transaction.type) {
+ case AFR_DATA_TRANSACTION:
+ case AFR_METADATA_TRANSACTION:
+ afr_set_transaction_flock (local);
+
+ int_lock->lock_cbk = afr_post_nonblocking_inodelk_cbk;
+
+ afr_nonblocking_inodelk (frame, this);
+ break;
+
+ case AFR_ENTRY_RENAME_TRANSACTION:
+
+ int_lock->lock_cbk = afr_post_nonblocking_entrylk_cbk;
+ afr_nonblocking_entrylk (frame, this);
+ break;
+
+ case AFR_ENTRY_TRANSACTION:
+ int_lock->lk_basename = local->transaction.basename;
+ if (&local->transaction.parent_loc)
+ int_lock->lk_loc = &local->transaction.parent_loc;
+ else
+ GF_ASSERT (local->fd);
+
+ int_lock->lock_cbk = afr_post_nonblocking_entrylk_cbk;
+ afr_nonblocking_entrylk (frame, this);
+ break;
+ }
+
+ return 0;
}
+
+int
+afr_lock (call_frame_t *frame, xlator_t *this)
+{
+ afr_set_lock_number (frame, this);
+
+ return afr_lock_rec (frame, this);
+}
+
+
/* }}} */
-/* {{{ lock */
+int
+afr_internal_lock_finish (call_frame_t *frame, xlator_t *this)
+{
+ if (__fop_changelog_needed (frame, this)) {
+ afr_changelog_pre_op (frame, this);
+ } else {
+ afr_transaction_perform_fop (frame, this);
+ }
+
+ return 0;
+}
-static
-int afr_lock_rec (call_frame_t *frame, xlator_t *this, int child_index);
-int32_t
-afr_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
+void
+afr_set_delayed_post_op (call_frame_t *frame, xlator_t *this)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
- int done = 0;
- int child_index = (long) cookie;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- int call_count = 0;
+ /* call this function from any of the related optimizations
+ which benefit from delaying post op are enabled, namely:
- local = frame->local;
- priv = this->private;
+ - changelog piggybacking
+ - eager locking
+ */
- LOCK (&frame->lock);
- {
- if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) {
- /* wait for the other lock to return */
- call_count = --local->call_count;
- }
+ priv = this->private;
+ if (!priv)
+ return;
- if (op_ret == -1) {
- if (op_errno == ENOSYS) {
- /* return ENOTSUP */
- gf_log (this->name, GF_LOG_ERROR,
- "subvolume does not support locking. "
- "please load features/posix-locks xlator on server");
- local->op_ret = op_ret;
- done = 1;
- }
+ if (!priv->post_op_delay_secs)
+ return;
- local->child_up[child_index] = 0;
- local->op_errno = op_errno;
- }
- }
- UNLOCK (&frame->lock);
-
- if (call_count == 0) {
- if ((local->op_ret == -1) &&
- (local->op_errno == ENOSYS)) {
- afr_unlock (frame, this);
- } else {
- local->transaction.locked_nodes[child_index] = 1;
- local->transaction.lock_count++;
- afr_lock_rec (frame, this, child_index + 1);
- }
- }
+ local = frame->local;
+ if (!local->transaction.eager_lock_on)
+ return;
- return 0;
+ if (!local)
+ return;
+
+ if (!local->fd)
+ return;
+
+ if (local->op == GF_FOP_WRITE)
+ local->delayed_post_op = _gf_true;
}
+gf_boolean_t
+afr_are_multiple_fds_opened (fd_t *fd, xlator_t *this)
+{
+ afr_fd_ctx_t *fd_ctx = NULL;
+
+ if (!fd) {
+ /* If false is returned, it may keep on taking eager-lock
+ * which may lead to starvation, so return true to avoid that.
+ */
+ gf_log_callingfn (this->name, GF_LOG_ERROR, "Invalid fd");
+ return _gf_true;
+ }
+ /* Lets say mount1 has eager-lock(full-lock) and after the eager-lock
+ * is taken mount2 opened the same file, it won't be able to
+ * perform any data operations until mount1 releases eager-lock.
+ * To avoid such scenario do not enable eager-lock for this transaction
+ * if open-fd-count is > 1
+ */
+
+ fd_ctx = afr_fd_ctx_get (fd, this);
+ if (!fd_ctx)
+ return _gf_true;
-static loc_t *
-lower_path (loc_t *l1, const char *b1, loc_t *l2, const char *b2)
+ if (fd_ctx->open_fd_count > 1)
+ return _gf_true;
+
+ return _gf_false;
+}
+
+
+gf_boolean_t
+is_afr_delayed_changelog_post_op_needed (call_frame_t *frame, xlator_t *this)
{
- int ret = 0;
+ afr_local_t *local = NULL;
+ gf_boolean_t res = _gf_false;
- ret = strcmp (l1->path, l2->path);
-
- if (ret == 0)
- ret = strcmp (b1, b2);
+ local = frame->local;
+ if (!local)
+ goto out;
- if (ret <= 0)
- return l1;
- else
- return l2;
+ if (!local->delayed_post_op)
+ goto out;
+
+ //Mark pending changelog ASAP
+ if (!afr_txn_nothing_failed (frame, this))
+ goto out;
+
+ if (local->fd && afr_are_multiple_fds_opened (local->fd, this))
+ goto out;
+
+ res = _gf_true;
+out:
+ return res;
}
-static
-int afr_lock_rec (call_frame_t *frame, xlator_t *this, int child_index)
+void
+afr_delayed_changelog_post_op (xlator_t *this, call_frame_t *frame, fd_t *fd,
+ call_stub_t *stub);
+
+void
+afr_delayed_changelog_wake_up_cbk (void *data)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
+ fd_t *fd = NULL;
- struct flock flock;
+ fd = data;
- loc_t * lower = NULL;
- loc_t * higher = NULL;
+ afr_delayed_changelog_wake_up (THIS, fd);
+}
- const char *lower_name = NULL;
- const char *higher_name = NULL;
- local = frame->local;
- priv = this->private;
+/* SET operation */
+int
+afr_fd_report_unstable_write (xlator_t *this, fd_t *fd)
+{
+ afr_fd_ctx_t *fdctx = NULL;
- flock.l_start = local->transaction.start;
- flock.l_len = local->transaction.len;
- flock.l_type = F_WRLCK;
+ fdctx = afr_fd_ctx_get (fd, this);
- /* skip over children that are down */
- while ((child_index < priv->child_count)
- && !local->child_up[child_index])
- child_index++;
+ LOCK(&fd->lock);
+ {
+ fdctx->witnessed_unstable_write = _gf_true;
+ }
+ UNLOCK(&fd->lock);
- if ((child_index == priv->child_count) &&
- local->transaction.lock_count == 0) {
+ return 0;
+}
- gf_log (this->name, GF_LOG_DEBUG,
- "unable to lock on even one child");
+/* TEST and CLEAR operation */
+gf_boolean_t
+afr_fd_has_witnessed_unstable_write (xlator_t *this, fd_t *fd)
+{
+ afr_fd_ctx_t *fdctx = NULL;
+ gf_boolean_t witness = _gf_false;
- local->op_ret = -1;
- local->op_errno = EAGAIN;
+ fdctx = afr_fd_ctx_get (fd, this);
+ if (!fdctx)
+ return _gf_true;
- local->transaction.done (frame, this);
-
- return 0;
+ LOCK(&fd->lock);
+ {
+ if (fdctx->witnessed_unstable_write) {
+ witness = _gf_true;
+ fdctx->witnessed_unstable_write = _gf_false;
+ }
+ }
+ UNLOCK (&fd->lock);
+
+ return witness;
+}
+
+int
+afr_changelog_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *pre,
+ struct iatt *post, dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ int child_index = (long) cookie;
+ int call_count = -1;
+ afr_local_t *local = NULL;
+
+ priv = this->private;
+ local = frame->local;
+
+ if (op_ret != 0) {
+ /* Failure of fsync() is as good as failure of previous
+ write(). So treat it like one.
+ */
+ gf_log (this->name, GF_LOG_WARNING,
+ "fsync(%s) failed on subvolume %s. Transaction was %s",
+ uuid_utoa (local->fd->inode->gfid),
+ priv->children[child_index]->name,
+ gf_fop_list[local->op]);
+
+ afr_transaction_fop_failed (frame, this, child_index);
}
- if ((child_index == priv->child_count)
- || (local->transaction.lock_count ==
- afr_lock_server_count (priv, local->transaction.type))) {
+ call_count = afr_frame_return (frame);
- /* we're done locking */
+ if (call_count == 0)
+ afr_changelog_post_op_now (frame, this);
- if (__changelog_needed_pre_op (frame, this)) {
- afr_changelog_pre_op (frame, this);
- } else {
- __mark_all_success (local->pending, priv->child_count,
- local->transaction.type);
+ return 0;
+}
- afr_pid_restore (frame);
- local->transaction.fop (frame, this);
- }
+int
+afr_changelog_fsync (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ int i = 0;
+ int call_count = 0;
+ afr_private_t *priv = NULL;
+ dict_t *xdata = NULL;
+ GF_UNUSED int ret = -1;
- return 0;
- }
+ local = frame->local;
+ priv = this->private;
- switch (local->transaction.type) {
- case AFR_DATA_TRANSACTION:
- case AFR_METADATA_TRANSACTION:
- case AFR_FLUSH_TRANSACTION:
-
- if (local->fd) {
- STACK_WIND_COOKIE (frame, afr_lock_cbk,
- (void *) (long) child_index,
- priv->children[child_index],
- priv->children[child_index]->fops->finodelk,
- this->name, local->fd,
- F_SETLKW, &flock);
-
- } else {
- STACK_WIND_COOKIE (frame, afr_lock_cbk,
- (void *) (long) child_index,
- priv->children[child_index],
- priv->children[child_index]->fops->inodelk,
- this->name, &local->loc,
- F_SETLKW, &flock);
- }
-
- break;
-
- case AFR_ENTRY_RENAME_TRANSACTION:
+ call_count = AFR_COUNT (local->transaction.pre_op, priv->child_count);
+
+ if (!call_count) {
+ /* will go straight to unlock */
+ afr_changelog_post_op_now (frame, this);
+ return 0;
+ }
+
+ local->call_count = call_count;
+
+ xdata = dict_new();
+ if (xdata)
+ ret = dict_set_int32 (xdata, "batch-fsync", 1);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->transaction.pre_op[i])
+ continue;
+
+ STACK_WIND_COOKIE (frame, afr_changelog_fsync_cbk,
+ (void *) (long) i, priv->children[i],
+ priv->children[i]->fops->fsync, local->fd,
+ 1, xdata);
+ if (!--call_count)
+ break;
+ }
+
+ if (xdata)
+ dict_unref (xdata);
+
+ return 0;
+}
+
+
+int
+afr_changelog_post_op_safe (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ if (!local->fd || local->transaction.type != AFR_DATA_TRANSACTION) {
+ afr_changelog_post_op_now (frame, this);
+ return 0;
+ }
+
+ if (afr_changelog_pre_op_uninherit (frame, this) &&
+ afr_txn_nothing_failed (frame, this)) {
+ /* just detected that this post-op is about to
+ be optimized away as a new write() has
+ already piggybacked on this frame's changelog.
+ */
+ afr_changelog_post_op_now (frame, this);
+ return 0;
+ }
+
+ /* Calling afr_changelog_post_op_now() now will result in
+ issuing ->[f]xattrop().
+
+ Performing a hard POST-OP (->[f]xattrop() FOP) is a more
+ responsible operation that what it might appear on the surface.
+
+ The changelog of a file (in the xattr of the file on the server)
+ stores information (pending count) about the state of the file
+ on the OTHER server. This changelog is blindly trusted, and must
+ therefore be updated in such a way it remains trustworthy. This
+ implies that decrementing the pending count (essentially "clearing
+ the dirty flag") must be done STRICTLY after we are sure that the
+ operation on the other server has reached stable storage.
+
+ While the backend filesystem on that server will eventually flush
+ it to stable storage, we (being in userspace) have no mechanism
+ to get notified when the write became "stable".
+
+ This means we need take matter into our own hands and issue an
+ fsync() EVEN IF THE APPLICATION WAS PERFORMING UNSTABLE WRITES,
+ and get an acknowledgement for it. And we need to wait for the
+ fsync() acknowledgement before initiating the hard POST-OP.
+
+ However if the FD itself was opened in O_SYNC or O_DSYNC then
+ we are already guaranteed that the writes were made stable as
+ part of the FOP itself. The same holds true for NFS stable
+ writes which happen on an anonymous FD with O_DSYNC or O_SYNC
+ flag set in the writev() @flags param. For all other write types,
+ mark a flag in the fdctx whenever an unstable write is witnessed.
+ */
+
+ if (!afr_fd_has_witnessed_unstable_write (this, local->fd)) {
+ afr_changelog_post_op_now (frame, this);
+ return 0;
+ }
+
+ /* Check whether users want durability and perform fsync/post-op
+ * accordingly.
+ */
+ if (priv->ensure_durability) {
+ /* Time to fsync() */
+ afr_changelog_fsync (frame, this);
+ } else {
+ afr_changelog_post_op_now (frame, this);
+ }
+
+ return 0;
+}
+
+
+void
+afr_delayed_changelog_post_op (xlator_t *this, call_frame_t *frame, fd_t *fd,
+ call_stub_t *stub)
+{
+ afr_fd_ctx_t *fd_ctx = NULL;
+ call_frame_t *prev_frame = NULL;
+ struct timespec delta = {0, };
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+
+ priv = this->private;
+
+ fd_ctx = afr_fd_ctx_get (fd, this);
+ if (!fd_ctx)
+ goto out;
+
+ delta.tv_sec = priv->post_op_delay_secs;
+ delta.tv_nsec = 0;
+
+ pthread_mutex_lock (&fd_ctx->delay_lock);
{
- local->call_count = 2;
-
- lower = lower_path (&local->transaction.parent_loc,
- local->transaction.basename,
- &local->transaction.new_parent_loc,
- local->transaction.new_basename);
-
- lower_name = (lower == &local->transaction.parent_loc ?
- local->transaction.basename :
- local->transaction.new_basename);
-
- higher = (lower == &local->transaction.parent_loc ?
- &local->transaction.new_parent_loc :
- &local->transaction.parent_loc);
-
- higher_name = (higher == &local->transaction.parent_loc ?
- local->transaction.basename :
- local->transaction.new_basename);
-
-
- /* TODO: these locks should be blocking */
-
- STACK_WIND_COOKIE (frame, afr_lock_cbk,
- (void *) (long) child_index,
- priv->children[child_index],
- priv->children[child_index]->fops->entrylk,
- this->name, lower, lower_name,
- ENTRYLK_LOCK, ENTRYLK_WRLCK);
-
- STACK_WIND_COOKIE (frame, afr_lock_cbk,
- (void *) (long) child_index,
- priv->children[child_index],
- priv->children[child_index]->fops->entrylk,
- this->name, higher, higher_name,
- ENTRYLK_LOCK, ENTRYLK_WRLCK);
-
- break;
+ prev_frame = fd_ctx->delay_frame;
+ fd_ctx->delay_frame = NULL;
+ if (fd_ctx->delay_timer)
+ gf_timer_call_cancel (this->ctx, fd_ctx->delay_timer);
+ fd_ctx->delay_timer = NULL;
+ if (!frame)
+ goto unlock;
+ fd_ctx->delay_timer = gf_timer_call_after (this->ctx, delta,
+ afr_delayed_changelog_wake_up_cbk,
+ fd);
+ fd_ctx->delay_frame = frame;
}
-
- case AFR_ENTRY_TRANSACTION:
- if (local->fd) {
- STACK_WIND_COOKIE (frame, afr_lock_cbk,
- (void *) (long) child_index,
- priv->children[child_index],
- priv->children[child_index]->fops->fentrylk,
- this->name, local->fd,
- local->transaction.basename,
- ENTRYLK_LOCK, ENTRYLK_WRLCK);
- } else {
- STACK_WIND_COOKIE (frame, afr_lock_cbk,
- (void *) (long) child_index,
- priv->children[child_index],
- priv->children[child_index]->fops->entrylk,
- this->name,
- &local->transaction.parent_loc,
- local->transaction.basename,
- ENTRYLK_LOCK, ENTRYLK_WRLCK);
- }
+unlock:
+ pthread_mutex_unlock (&fd_ctx->delay_lock);
- break;
+out:
+ if (prev_frame) {
+ local = prev_frame->local;
+ local->transaction.resume_stub = stub;
+ afr_changelog_post_op_now (prev_frame, this);
+ } else if (stub) {
+ call_resume (stub);
}
-
- return 0;
}
-int32_t afr_lock (call_frame_t *frame, xlator_t *this)
+void
+afr_changelog_post_op (call_frame_t *frame, xlator_t *this)
{
- afr_pid_save (frame);
+ afr_local_t *local = NULL;
- frame->root->pid = (long) frame->root;
+ local = frame->local;
- return afr_lock_rec (frame, this, 0);
+ if (is_afr_delayed_changelog_post_op_needed (frame, this))
+ afr_delayed_changelog_post_op (this, frame, local->fd, NULL);
+ else
+ afr_changelog_post_op_safe (frame, this);
}
-/* }}} */
-int32_t
+/* Wake up the sleeping/delayed post-op, and also register
+ a stub to have it resumed after this transaction
+ completely finishes.
+
+ The @stub gets saved in @local and gets resumed in
+ afr_local_cleanup()
+ */
+void
+afr_delayed_changelog_wake_resume (xlator_t *this, fd_t *fd, call_stub_t *stub)
+{
+ afr_delayed_changelog_post_op (this, NULL, fd, stub);
+}
+
+
+void
+afr_delayed_changelog_wake_up (xlator_t *this, fd_t *fd)
+{
+ afr_delayed_changelog_post_op (this, NULL, fd, NULL);
+}
+
+
+int
afr_transaction_resume (call_frame_t *frame, xlator_t *this)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
+ afr_local_t *local = NULL;
- local = frame->local;
- priv = this->private;
+ local = frame->local;
- if (__changelog_needed_post_op (frame, this)) {
- afr_changelog_post_op (frame, this);
- } else {
- if (afr_lock_server_count (priv, local->transaction.type) == 0) {
- local->transaction.done (frame, this);
- } else {
- afr_unlock (frame, this);
- }
- }
+ if (local->transaction.eager_lock_on) {
+ /* We don't need to retain "local" in the
+ fd list anymore, writes to all subvols
+ are finished by now */
+ afr_remove_eager_lock_stub (local);
+ }
- return 0;
+ afr_restore_lk_owner (frame);
+
+ afr_handle_symmetric_errors (frame, this);
+
+ if (!local->pre_op_compat)
+ /* new mode, pre-op was done along
+ with OP */
+ afr_changelog_pre_op_update (frame, this);
+
+ if (__fop_changelog_needed (frame, this)) {
+ afr_changelog_post_op (frame, this);
+ } else {
+ afr_changelog_post_op_done (frame, this);
+ }
+
+ return 0;
}
@@ -1120,54 +1582,141 @@ afr_transaction_resume (call_frame_t *frame, xlator_t *this)
*/
void
-afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this, int child_index)
+afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this,
+ int child_index)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
+ afr_local_t * local = NULL;
- local = frame->local;
- priv = this->private;
+ local = frame->local;
- switch (local->op) {
- case GF_FOP_WRITE:
- __mark_fop_failed_on_fd (local->fd, this, child_index);
- break;
- default:
- __mark_child_dead (local->pending, priv->child_count,
- child_index, local->transaction.type);
- break;
+ local->transaction.failed_subvols[child_index] = 1;
+}
+
+
+
+ static gf_boolean_t
+afr_locals_overlap (afr_local_t *local1, afr_local_t *local2)
+{
+ uint64_t start1 = local1->transaction.start;
+ uint64_t start2 = local2->transaction.start;
+ uint64_t end1 = 0;
+ uint64_t end2 = 0;
+
+ if (local1->transaction.len)
+ end1 = start1 + local1->transaction.len - 1;
+ else
+ end1 = ULLONG_MAX;
+
+ if (local2->transaction.len)
+ end2 = start2 + local2->transaction.len - 1;
+ else
+ end2 = ULLONG_MAX;
+
+ return ((end1 >= start2) && (end2 >= start1));
+}
+
+void
+afr_transaction_eager_lock_init (afr_local_t *local, xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ afr_fd_ctx_t *fdctx = NULL;
+ afr_local_t *each = NULL;
+
+ priv = this->private;
+
+ if (!local->fd)
+ return;
+
+ if (local->transaction.type != AFR_DATA_TRANSACTION)
+ return;
+
+ if (!priv->eager_lock)
+ return;
+
+ fdctx = afr_fd_ctx_get (local->fd, this);
+ if (!fdctx)
+ return;
+
+ if (afr_are_multiple_fds_opened (local->fd, this))
+ return;
+ /*
+ * Once full file lock is acquired in eager-lock phase, overlapping
+ * writes do not compete for inode-locks, instead are transferred to the
+ * next writes. Because of this overlapping writes are not ordered.
+ * This can cause inconsistencies in replication.
+ * Example:
+ * Two overlapping writes w1, w2 are sent in parallel on same fd
+ * in two threads t1, t2.
+ * Both threads can execute afr_writev_wind in the following manner.
+ * t1 winds w1 on brick-0
+ * t2 winds w2 on brick-0
+ * t2 winds w2 on brick-1
+ * t1 winds w1 on brick-1
+ *
+ * This check makes sure the locks are not transferred for
+ * overlapping writes.
+ */
+ LOCK (&local->fd->lock);
+ {
+ list_for_each_entry (each, &fdctx->eager_locked,
+ transaction.eager_locked) {
+ if (afr_locals_overlap (each, local)) {
+ local->transaction.eager_lock_on = _gf_false;
+ goto unlock;
+ }
+ }
+
+ local->transaction.eager_lock_on = _gf_true;
+ list_add_tail (&local->transaction.eager_locked,
+ &fdctx->eager_locked);
}
+unlock:
+ UNLOCK (&local->fd->lock);
}
-int32_t
+int
afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
+ afr_local_t * local = NULL;
+ afr_private_t * priv = NULL;
+ fd_t *fd = NULL;
+ int ret = -1;
- local = frame->local;
- priv = this->private;
+ local = frame->local;
+ priv = this->private;
- afr_transaction_local_init (local, priv);
+ local->transaction.resume = afr_transaction_resume;
+ local->transaction.type = type;
- local->transaction.resume = afr_transaction_resume;
- local->transaction.type = type;
+ ret = afr_transaction_local_init (local, this);
+ if (ret < 0)
+ goto out;
- if (afr_lock_server_count (priv, local->transaction.type) == 0) {
- if (__changelog_needed_pre_op (frame, this)) {
- afr_changelog_pre_op (frame, this);
- } else {
- __mark_all_success (local->pending, priv->child_count,
- local->transaction.type);
+ afr_transaction_eager_lock_init (local, this);
- afr_pid_restore (frame);
+ if (local->fd && local->transaction.eager_lock_on)
+ afr_set_lk_owner (frame, this, local->fd);
+ else
+ afr_set_lk_owner (frame, this, frame->root);
- local->transaction.fop (frame, this);
- }
- } else {
- afr_lock (frame, this);
- }
+ if (!local->transaction.eager_lock_on && local->loc.inode) {
+ fd = fd_lookup (local->loc.inode, frame->root->pid);
+ if (fd == NULL)
+ fd = fd_lookup_anonymous (local->loc.inode);
- return 0;
+ if (fd) {
+ afr_delayed_changelog_wake_up (this, fd);
+ fd_unref (fd);
+ }
+ }
+
+ if (afr_lock_server_count (priv, local->transaction.type) == 0) {
+ afr_internal_lock_finish (frame, this);
+ } else {
+ afr_lock (frame, this);
+ }
+ ret = 0;
+out:
+ return ret;
}
diff --git a/xlators/cluster/afr/src/afr-transaction.h b/xlators/cluster/afr/src/afr-transaction.h
index c7a6490e7..77cc8eed0 100644
--- a/xlators/cluster/afr/src/afr-transaction.h
+++ b/xlators/cluster/afr/src/afr-transaction.h
@@ -1,30 +1,53 @@
/*
- Copyright (c) 2007-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
#ifndef __TRANSACTION_H__
#define __TRANSACTION_H__
+#include "afr.h"
+
void
afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this,
int child_index);
+int
+afr_lock_server_count (afr_private_t *priv, afr_transaction_type type);
+
+afr_inodelk_t*
+afr_get_inodelk (afr_internal_lock_t *int_lock, char *dom);
+
int32_t
afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type);
+int
+afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending);
+
+void
+afr_set_delayed_post_op (call_frame_t *frame, xlator_t *this);
+
+void
+afr_delayed_changelog_wake_up (xlator_t *this, fd_t *fd);
+
+void
+__mark_all_success (call_frame_t *frame, xlator_t *this);
+
+gf_boolean_t
+afr_txn_nothing_failed (call_frame_t *frame, xlator_t *this);
+
+int afr_read_txn (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ afr_read_txn_wind_t readfn, afr_transaction_type type);
+
+int afr_read_txn_continue (call_frame_t *frame, xlator_t *this, int subvol);
+
+int __afr_txn_write_fop (call_frame_t *frame, xlator_t *this);
+int __afr_txn_write_done (call_frame_t *frame, xlator_t *this);
+call_frame_t *afr_transaction_detach_fop_frame (call_frame_t *frame);
+
#endif /* __TRANSACTION_H__ */
diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c
index 270364ff9..ead08425f 100644
--- a/xlators/cluster/afr/src/afr.c
+++ b/xlators/cluster/afr/src/afr.c
@@ -1,20 +1,11 @@
/*
- Copyright (c) 2007-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
#include <libgen.h>
@@ -28,2514 +19,731 @@
#define _CONFIG_H
#include "config.h"
#endif
+#include "afr-common.c"
-#include "glusterfs.h"
-#include "afr.h"
-#include "dict.h"
-#include "xlator.h"
-#include "hashfn.h"
-#include "logging.h"
-#include "stack.h"
-#include "list.h"
-#include "call-stub.h"
-#include "defaults.h"
-#include "common-utils.h"
-#include "compat-errno.h"
-#include "compat.h"
-#include "byte-order.h"
-
-#include "fd.h"
-
-#include "afr-inode-read.h"
-#include "afr-inode-write.h"
-#include "afr-dir-read.h"
-#include "afr-dir-write.h"
-#include "afr-transaction.h"
-
-#include "afr-self-heal.h"
-
-
-uint64_t
-afr_is_split_brain (xlator_t *this, inode_t *inode)
-{
- int ret = 0;
-
- uint64_t ctx = 0;
- uint64_t split_brain = 0;
-
- LOCK (&inode->lock);
- {
- ret = __inode_ctx_get (inode, this, &ctx);
-
- if (ret < 0)
- goto unlock;
-
- split_brain = ctx & 0xFFFFFFFF00000000ULL;
- }
-unlock:
- UNLOCK (&inode->lock);
-
- return split_brain;
-}
-
+struct volume_options options[];
-void
-afr_set_split_brain (xlator_t *this, inode_t *inode, int32_t split_brain)
+int32_t
+notify (xlator_t *this, int32_t event,
+ void *data, ...)
{
- uint64_t ctx = 0;
- int ret = 0;
-
- LOCK (&inode->lock);
- {
- ret = __inode_ctx_get (inode, this, &ctx);
-
- if (ret < 0) {
- ctx = 0;
- }
+ int ret = -1;
+ va_list ap;
+ void *data2 = NULL;
- ctx = (0x00000000FFFFFFFFULL & ctx)
- | (split_brain & 0xFFFFFFFF00000000ULL);
+ va_start (ap, data);
+ data2 = va_arg (ap, dict_t*);
+ va_end (ap);
+ ret = afr_notify (this, event, data, data2);
- __inode_ctx_put (inode, this, ctx);
- }
- UNLOCK (&inode->lock);
-}
-
-
-uint64_t
-afr_read_child (xlator_t *this, inode_t *inode)
-{
- int ret = 0;
-
- uint64_t ctx = 0;
- uint64_t read_child = 0;
-
- LOCK (&inode->lock);
- {
- ret = __inode_ctx_get (inode, this, &ctx);
-
- if (ret < 0)
- goto unlock;
-
- read_child = ctx & 0x00000000FFFFFFFFULL;
- }
-unlock:
- UNLOCK (&inode->lock);
-
- return read_child;
+ return ret;
}
-
-void
-afr_set_read_child (xlator_t *this, inode_t *inode, int32_t read_child)
+int32_t
+mem_acct_init (xlator_t *this)
{
- uint64_t ctx = 0;
- int ret = 0;
+ int ret = -1;
- LOCK (&inode->lock);
- {
- ret = __inode_ctx_get (inode, this, &ctx);
+ if (!this)
+ return ret;
- if (ret < 0) {
- ctx = 0;
- }
-
- ctx = (0xFFFFFFFF00000000ULL & ctx)
- | (0x00000000FFFFFFFFULL & read_child);
+ ret = xlator_mem_acct_init (this, gf_afr_mt_end + 1);
- __inode_ctx_put (inode, this, ctx);
+ if (ret != 0) {
+ gf_log(this->name, GF_LOG_ERROR, "Memory accounting init"
+ "failed");
+ return ret;
}
- UNLOCK (&inode->lock);
-}
-
-
-/**
- * afr_local_cleanup - cleanup everything in frame->local
- */
-
-void
-afr_local_sh_cleanup (afr_local_t *local, xlator_t *this)
-{
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int i = 0;
-
- sh = &local->self_heal;
- priv = this->private;
-
- if (sh->buf)
- FREE (sh->buf);
-
- if (sh->xattr) {
- for (i = 0; i < priv->child_count; i++) {
- if (sh->xattr[i]) {
- dict_unref (sh->xattr[i]);
- sh->xattr[i] = NULL;
- }
- }
- FREE (sh->xattr);
- }
-
- if (sh->child_errno)
- FREE (sh->child_errno);
-
- if (sh->pending_matrix) {
- for (i = 0; i < priv->child_count; i++) {
- FREE (sh->pending_matrix[i]);
- }
- FREE (sh->pending_matrix);
- }
-
- if (sh->delta_matrix) {
- for (i = 0; i < priv->child_count; i++) {
- FREE (sh->delta_matrix[i]);
- }
- FREE (sh->delta_matrix);
- }
-
- if (sh->sources)
- FREE (sh->sources);
-
- if (sh->success)
- FREE (sh->success);
-
- if (sh->healing_fd) {
- fd_unref (sh->healing_fd);
- sh->healing_fd = NULL;
- }
-
- loc_wipe (&sh->parent_loc);
+ return ret;
}
-void
-afr_local_cleanup (afr_local_t *local, xlator_t *this)
+int
+xlator_subvolume_index (xlator_t *this, xlator_t *subvol)
{
- int i;
- afr_private_t * priv = NULL;
-
- if (!local)
- return;
-
- afr_local_sh_cleanup (local, this);
+ int index = -1;
+ int i = 0;
+ xlator_list_t *list = NULL;
- FREE (local->child_errno);
+ list = this->children;
- priv = this->private;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->pending && local->pending[i])
- FREE (local->pending[i]);
+ while (list) {
+ if (subvol == list->xlator ||
+ strcmp (subvol->name, list->xlator->name) == 0) {
+ index = i;
+ break;
+ }
+ list = list->next;
+ i++;
}
- FREE (local->pending);
-
- loc_wipe (&local->loc);
- loc_wipe (&local->newloc);
-
- FREE (local->transaction.locked_nodes);
- FREE (local->transaction.child_errno);
-
- FREE (local->transaction.basename);
- FREE (local->transaction.new_basename);
-
- loc_wipe (&local->transaction.parent_loc);
- loc_wipe (&local->transaction.new_parent_loc);
-
- if (local->fd)
- fd_unref (local->fd);
-
- if (local->xattr_req)
- dict_unref (local->xattr_req);
-
- FREE (local->child_up);
-
- { /* lookup */
- if (local->cont.lookup.xattr)
- dict_unref (local->cont.lookup.xattr);
- }
-
- { /* getxattr */
- if (local->cont.getxattr.name)
- FREE (local->cont.getxattr.name);
- }
-
- { /* lk */
- if (local->cont.lk.locked_nodes)
- FREE (local->cont.lk.locked_nodes);
- }
-
- { /* checksum */
- if (local->cont.checksum.file_checksum)
- FREE (local->cont.checksum.file_checksum);
- if (local->cont.checksum.dir_checksum)
- FREE (local->cont.checksum.dir_checksum);
- }
-
- { /* create */
- if (local->cont.create.fd)
- fd_unref (local->cont.create.fd);
- }
-
- { /* writev */
- FREE (local->cont.writev.vector);
- }
-
- { /* setxattr */
- if (local->cont.setxattr.dict)
- dict_unref (local->cont.setxattr.dict);
- }
-
- { /* removexattr */
- FREE (local->cont.removexattr.name);
- }
-
- { /* symlink */
- FREE (local->cont.symlink.linkpath);
- }
+ return index;
}
-
-int
-afr_frame_return (call_frame_t *frame)
+void
+fix_quorum_options (xlator_t *this, afr_private_t *priv, char *qtype)
{
- afr_local_t *local = NULL;
- int call_count = 0;
-
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- call_count = --local->call_count;
- }
- UNLOCK (&frame->lock);
-
- return call_count;
+ if (priv->quorum_count && strcmp(qtype,"fixed")) {
+ gf_log(this->name,GF_LOG_WARNING,
+ "quorum-type %s overriding quorum-count %u",
+ qtype, priv->quorum_count);
+ }
+ if (!strcmp(qtype,"none")) {
+ priv->quorum_count = 0;
+ }
+ else if (!strcmp(qtype,"auto")) {
+ priv->quorum_count = AFR_QUORUM_AUTO;
+ }
}
-/**
- * first_up_child - return the index of the first child that is up
- */
-
int
-afr_first_up_child (afr_private_t *priv)
+reconfigure (xlator_t *this, dict_t *options)
{
- xlator_t ** children = NULL;
- int ret = -1;
- int i = 0;
-
- LOCK (&priv->lock);
- {
- children = priv->children;
- for (i = 0; i < priv->child_count; i++) {
- if (priv->child_up[i]) {
- ret = i;
- break;
- }
- }
- }
- UNLOCK (&priv->lock);
-
- return ret;
-}
+ afr_private_t *priv = NULL;
+ xlator_t *read_subvol = NULL;
+ int read_subvol_index = -1;
+ int ret = -1;
+ int index = -1;
+ char *qtype = NULL;
+ priv = this->private;
-/**
- * up_children_count - return the number of children that are up
- */
+ GF_OPTION_RECONF ("afr-dirty-xattr",
+ priv->afr_dirty, options, str,
+ out);
-int
-afr_up_children_count (int child_count, unsigned char *child_up)
-{
- int i = 0;
- int ret = 0;
+ GF_OPTION_RECONF ("metadata-splitbrain-forced-heal",
+ priv->metadata_splitbrain_forced_heal, options, bool,
+ out);
- for (i = 0; i < child_count; i++)
- if (child_up[i])
- ret++;
- return ret;
-}
+ GF_OPTION_RECONF ("background-self-heal-count",
+ priv->background_self_heal_count, options, uint32,
+ out);
+ GF_OPTION_RECONF ("metadata-self-heal",
+ priv->metadata_self_heal, options, bool, out);
-int
-afr_locked_nodes_count (unsigned char *locked_nodes, int child_count)
-{
- int ret = 0;
- int i;
+ GF_OPTION_RECONF ("data-self-heal", priv->data_self_heal, options, str,
+ out);
- for (i = 0; i < child_count; i++)
- if (locked_nodes[i])
- ret++;
+ GF_OPTION_RECONF ("entry-self-heal", priv->entry_self_heal, options,
+ bool, out);
- return ret;
-}
+ GF_OPTION_RECONF ("data-self-heal-window-size",
+ priv->data_self_heal_window_size, options,
+ uint32, out);
+ GF_OPTION_RECONF ("data-change-log", priv->data_change_log, options,
+ bool, out);
-ino64_t
-afr_itransform (ino64_t ino, int child_count, int child_index)
-{
- ino64_t scaled_ino = -1;
+ GF_OPTION_RECONF ("metadata-change-log",
+ priv->metadata_change_log, options, bool, out);
- if (ino == ((uint64_t) -1)) {
- scaled_ino = ((uint64_t) -1);
- goto out;
- }
+ GF_OPTION_RECONF ("entry-change-log", priv->entry_change_log, options,
+ bool, out);
- scaled_ino = (ino * child_count) + child_index;
+ GF_OPTION_RECONF ("data-self-heal-algorithm",
+ priv->data_self_heal_algorithm, options, str, out);
-out:
- return scaled_ino;
-}
+ GF_OPTION_RECONF ("read-subvolume", read_subvol, options, xlator, out);
+ GF_OPTION_RECONF ("read-hash-mode", priv->hash_mode,
+ options, uint32, out);
-int
-afr_deitransform_orig (ino64_t ino, int child_count)
-{
- int index = -1;
+ if (read_subvol) {
+ index = xlator_subvolume_index (this, read_subvol);
+ if (index == -1) {
+ gf_log (this->name, GF_LOG_ERROR, "%s not a subvolume",
+ read_subvol->name);
+ goto out;
+ }
+ priv->read_child = index;
+ }
- index = ino % child_count;
+ GF_OPTION_RECONF ("read-subvolume-index",read_subvol_index, options,int32,out);
- return index;
-}
+ if (read_subvol_index >-1) {
+ index=read_subvol_index;
+ if (index >= priv->child_count) {
+ gf_log (this->name, GF_LOG_ERROR, "%d not a subvolume-index",
+ index);
+ goto out;
+ }
+ priv->read_child = index;
+ }
+ GF_OPTION_RECONF ("pre-op-compat", priv->pre_op_compat, options, bool, out);
-int
-afr_deitransform (ino64_t ino, int child_count)
-{
- return 0;
-}
+ GF_OPTION_RECONF ("eager-lock", priv->eager_lock, options, bool, out);
+ GF_OPTION_RECONF ("quorum-type", qtype, options, str, out);
+ GF_OPTION_RECONF ("quorum-count", priv->quorum_count, options,
+ uint32, out);
+ fix_quorum_options(this,priv,qtype);
+ GF_OPTION_RECONF ("post-op-delay-secs", priv->post_op_delay_secs, options,
+ uint32, out);
-int
-afr_self_heal_cbk (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- int ret = -1;
+ GF_OPTION_RECONF (AFR_SH_READDIR_SIZE_KEY, priv->sh_readdir_size,
+ options, size_uint64, out);
+ /* Reset this so we re-discover in case the topology changed. */
+ GF_OPTION_RECONF ("ensure-durability", priv->ensure_durability, options,
+ bool, out);
- local = frame->local;
+ GF_OPTION_RECONF ("self-heal-daemon", priv->shd.enabled, options,
+ bool, out);
- if (local->govinda_gOvinda) {
- afr_set_split_brain (this, local->cont.lookup.inode, 1);
+ GF_OPTION_RECONF ("iam-self-heal-daemon", priv->shd.iamshd, options,
+ bool, out);
- if (ret < 0) {
- local->op_ret = -1;
- local->op_errno = -ret;
- }
- } else {
- afr_set_split_brain (this, local->cont.lookup.inode, 0);
- }
+ priv->did_discovery = _gf_false;
- AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno,
- local->cont.lookup.inode,
- &local->cont.lookup.buf,
- local->cont.lookup.xattr);
+ ret = 0;
+out:
+ return ret;
- return 0;
}
-int
-afr_lookup_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- inode_t *inode, struct stat *buf, dict_t *xattr)
-{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
- struct stat * lookup_buf = NULL;
- int call_count = -1;
- int child_index = -1;
-
- uint32_t open_fd_count = 0;
- int ret = 0;
-
- child_index = (long) cookie;
- priv = this->private;
-
- LOCK (&frame->lock);
- {
- local = frame->local;
-
- lookup_buf = &local->cont.lookup.buf;
-
- if (op_ret == -1) {
- if (op_errno == ENOENT)
- local->enoent_count++;
-
- if (op_errno != ENOTCONN) {
- if (local->op_errno != ESTALE)
- local->op_errno = op_errno;
- }
-
- if (op_errno == ESTALE) {
- /* no matter what other subvolumes return for
- * this call, ESTALE _must_ be sent to parent
- */
- local->op_ret = -1;
- local->op_errno = ESTALE;
- }
- goto unlock;
- }
-
- if (afr_sh_has_metadata_pending (xattr, child_index, this))
- local->need_metadata_self_heal = 1;
-
- if (afr_sh_has_entry_pending (xattr, child_index, this))
- local->need_entry_self_heal = 1;
-
- if (afr_sh_has_data_pending (xattr, child_index, this))
- local->need_data_self_heal = 1;
-
- ret = dict_get_uint32 (xattr, GLUSTERFS_OPEN_FD_COUNT,
- &open_fd_count);
- local->open_fd_count += open_fd_count;
-
- /* in case of revalidate, we need to send stat of the
- * child whose stat was sent during the first lookup.
- * (so that time stamp does not vary with revalidate.
- * in case it is down, stat of the fist success will
- * be replied */
-
- /* inode number should be preserved across revalidates */
-
- if (local->success_count == 0) {
- if (local->op_errno != ESTALE)
- local->op_ret = op_ret;
-
- local->cont.lookup.inode = inode;
- local->cont.lookup.xattr = dict_ref (xattr);
-
- *lookup_buf = *buf;
- lookup_buf->st_ino = afr_itransform (buf->st_ino,
- priv->child_count,
- child_index);
-
- if (priv->read_child >= 0) {
- afr_set_read_child (this,
- local->cont.lookup.inode,
- priv->read_child);
- } else {
- afr_set_read_child (this,
- local->cont.lookup.inode,
- child_index);
- }
-
- } else {
- if ((local->op_ret == 0)
- && (child_index == local->read_child_index)) {
-
- /*
- lookup has succeeded on the read child.
- So use its inode number
- */
-
- local->op_ret = op_ret;
-
- if (local->cont.lookup.xattr)
- dict_unref (local->cont.lookup.xattr);
-
- local->cont.lookup.inode = inode;
- local->cont.lookup.xattr = dict_ref (xattr);
-
- *lookup_buf = *buf;
- lookup_buf->st_ino = afr_itransform (buf->st_ino,
- priv->child_count,
- child_index);
-
- if (priv->read_child >= 0) {
- afr_set_read_child (this,
- local->cont.lookup.inode,
- priv->read_child);
- } else {
- afr_set_read_child (this,
- local->cont.lookup.inode,
- local->read_child_index);
- }
- }
-
- if (FILETYPE_DIFFERS (buf, lookup_buf)) {
- /* mismatching filetypes with same name
- -- Govinda !! GOvinda !!!
- */
- local->govinda_gOvinda = 1;
- }
-
- if (PERMISSION_DIFFERS (buf, lookup_buf)) {
- /* mismatching permissions */
- local->need_metadata_self_heal = 1;
- }
-
- if (OWNERSHIP_DIFFERS (buf, lookup_buf)) {
- /* mismatching permissions */
- local->need_metadata_self_heal = 1;
- }
-
- if (SIZE_DIFFERS (buf, lookup_buf)
- && S_ISREG (buf->st_mode)) {
- local->need_data_self_heal = 1;
- }
- }
-
- local->success_count++;
- }
-unlock:
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- if (local->op_ret == 0) {
- /* KLUDGE: assuming DHT will not itransform in
- revalidate */
- if (local->cont.lookup.inode->ino)
- lookup_buf->st_ino =
- local->cont.lookup.inode->ino;
- }
-
- if (local->success_count && local->enoent_count) {
- local->need_metadata_self_heal = 1;
- local->need_data_self_heal = 1;
- local->need_entry_self_heal = 1;
- }
-
- if (local->success_count) {
- /* check for split-brain case in previous lookup */
- if (afr_is_split_brain (this,
- local->cont.lookup.inode))
- local->need_data_self_heal = 1;
- }
-
- if ((local->need_metadata_self_heal
- || local->need_data_self_heal
- || local->need_entry_self_heal)
- && (!local->open_fd_count)) {
-
- if (!local->cont.lookup.inode->st_mode) {
- /* fix for RT #602 */
- local->cont.lookup.inode->st_mode =
- lookup_buf->st_mode;
- }
-
- afr_self_heal (frame, this, afr_self_heal_cbk);
- } else {
- AFR_STACK_UNWIND (frame, local->op_ret,
- local->op_errno,
- local->cont.lookup.inode,
- &local->cont.lookup.buf,
- local->cont.lookup.xattr);
- }
- }
-
- return 0;
-}
+static const char *favorite_child_warning_str = "You have specified subvolume '%s' "
+ "as the 'favorite child'. This means that if a discrepancy in the content "
+ "or attributes (ownership, permission, etc.) of a file is detected among "
+ "the subvolumes, the file on '%s' will be considered the definitive "
+ "version and its contents will OVERWRITE the contents of the file on other "
+ "subvolumes. All versions of the file except that on '%s' "
+ "WILL BE LOST.";
-int
-afr_lookup (call_frame_t *frame, xlator_t *this,
- loc_t *loc, dict_t *xattr_req)
+int32_t
+init (xlator_t *this)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- int ret = -1;
- int i = 0;
-
- uint64_t ctx;
-
- int32_t op_errno = 0;
-
- priv = this->private;
-
- ALLOC_OR_GOTO (local, afr_local_t, out);
-
- local->op_ret = -1;
-
- frame->local = local;
-
- loc_copy (&local->loc, loc);
-
- ret = inode_ctx_get (loc->inode, this, &ctx);
- if (ret == 0) {
- /* lookup is a revalidate */
-
- local->read_child_index = afr_read_child (this, loc->inode);
- } else {
- LOCK (&priv->read_child_lock);
- {
- local->read_child_index = (++priv->read_child_rr)
- % (priv->child_count);
- }
- UNLOCK (&priv->read_child_lock);
+ afr_private_t *priv = NULL;
+ int child_count = 0;
+ xlator_list_t *trav = NULL;
+ int i = 0;
+ int ret = -1;
+ GF_UNUSED int op_errno = 0;
+ xlator_t *read_subvol = NULL;
+ int read_subvol_index = -1;
+ xlator_t *fav_child = NULL;
+ char *qtype = NULL;
+
+ if (!this->children) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "replicate translator needs more than one "
+ "subvolume defined.");
+ return -1;
}
- local->call_count = priv->child_count;
-
- local->child_up = memdup (priv->child_up, priv->child_count);
- local->child_count = afr_up_children_count (priv->child_count,
- local->child_up);
-
- /* By default assume ENOTCONN. On success it will be set to 0. */
- local->op_errno = ENOTCONN;
-
- if (xattr_req == NULL)
- local->xattr_req = dict_new ();
- else
- local->xattr_req = dict_ref (xattr_req);
-
- for (i = 0; i < priv->child_count; i++) {
- ret = dict_set_uint64 (local->xattr_req, priv->pending_key[i],
- 3 * sizeof(int32_t));
-
- /* 3 = data+metadata+entry */
+ if (!this->parents) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "Volume is dangling.");
}
- ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_OPEN_FD_COUNT, 0);
-
- for (i = 0; i < priv->child_count; i++) {
- STACK_WIND_COOKIE (frame, afr_lookup_cbk, (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->lookup,
- loc, local->xattr_req);
- }
-
- ret = 0;
-out:
- if (ret == -1)
- AFR_STACK_UNWIND (frame, -1, op_errno, NULL, NULL, NULL);
+ this->private = GF_CALLOC (1, sizeof (afr_private_t),
+ gf_afr_mt_afr_private_t);
+ if (!this->private)
+ goto out;
- return 0;
-}
+ priv = this->private;
+ LOCK_INIT (&priv->lock);
+ child_count = xlator_subvolume_count (this);
-/* {{{ open */
+ priv->child_count = child_count;
-int
-afr_fd_ctx_set (xlator_t *this, fd_t *fd)
-{
- afr_private_t * priv = NULL;
+ priv->read_child = -1;
- int op_ret = 0;
- int ret = 0;
+ GF_OPTION_INIT ("afr-dirty-xattr", priv->afr_dirty, str, out);
- uint64_t ctx;
- afr_fd_ctx_t * fd_ctx = NULL;
+ GF_OPTION_INIT ("metadata-splitbrain-forced-heal",
+ priv->metadata_splitbrain_forced_heal, bool, out);
- priv = this->private;
-
- LOCK (&fd->lock);
- {
- ret = __fd_ctx_get (fd, this, &ctx);
-
- if (ret == 0)
- goto out;
-
- fd_ctx = CALLOC (1, sizeof (afr_fd_ctx_t));
- if (!fd_ctx) {
- gf_log (this->name, GF_LOG_ERROR,
- "out of memory :(");
-
- op_ret = -ENOMEM;
+ GF_OPTION_INIT ("read-subvolume", read_subvol, xlator, out);
+ if (read_subvol) {
+ priv->read_child = xlator_subvolume_index (this, read_subvol);
+ if (priv->read_child == -1) {
+ gf_log (this->name, GF_LOG_ERROR, "%s not a subvolume",
+ read_subvol->name);
goto out;
}
-
- fd_ctx->child_failed = CALLOC (sizeof (*fd_ctx->child_failed),
- priv->child_count);
-
- if (!fd_ctx->child_failed) {
- gf_log (this->name, GF_LOG_ERROR,
- "out of memory :(");
-
- op_ret = -ENOMEM;
+ }
+ GF_OPTION_INIT ("read-subvolume-index",read_subvol_index,int32,out);
+ if (read_subvol_index > -1) {
+ if (read_subvol_index >= priv->child_count) {
+ gf_log (this->name, GF_LOG_ERROR, "%d not a subvolume-index",
+ read_subvol_index);
goto out;
}
-
- ret = __fd_ctx_set (fd, this, (uint64_t)(long) fd_ctx);
- if (ret < 0) {
- op_ret = ret;
- }
+ priv->read_child = read_subvol_index;
}
-out:
- UNLOCK (&fd->lock);
-
- return ret;
-}
-
-
-int
-afr_open_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct stat *buf)
-{
- afr_local_t * local = frame->local;
- int ret = 0;
+ GF_OPTION_INIT ("choose-local", priv->choose_local, bool, out);
- ret = afr_fd_ctx_set (this, local->fd);
+ GF_OPTION_INIT ("read-hash-mode", priv->hash_mode, uint32, out);
- if (ret < 0) {
- local->op_ret = -1;
- local->op_errno = -ret;
+ priv->favorite_child = -1;
+ GF_OPTION_INIT ("favorite-child", fav_child, xlator, out);
+ if (fav_child) {
+ priv->favorite_child = xlator_subvolume_index (this, fav_child);
+ if (priv->favorite_child == -1) {
+ gf_log (this->name, GF_LOG_ERROR, "%s not a subvolume",
+ fav_child->name);
+ goto out;
+ }
+ gf_log (this->name, GF_LOG_WARNING,
+ favorite_child_warning_str, fav_child->name,
+ fav_child->name, fav_child->name);
}
- AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno,
- local->fd);
- return 0;
-}
+ GF_OPTION_INIT ("background-self-heal-count",
+ priv->background_self_heal_count, uint32, out);
-int
-afr_open_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- fd_t *fd)
-{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
+ GF_OPTION_INIT ("data-self-heal", priv->data_self_heal, str, out);
- int ret = 0;
+ GF_OPTION_INIT ("data-self-heal-algorithm",
+ priv->data_self_heal_algorithm, str, out);
- int call_count = -1;
-
- priv = this->private;
- local = frame->local;
+ GF_OPTION_INIT ("data-self-heal-window-size",
+ priv->data_self_heal_window_size, uint32, out);
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- local->op_errno = op_errno;
- }
+ GF_OPTION_INIT ("metadata-self-heal", priv->metadata_self_heal, bool,
+ out);
- if (op_ret >= 0) {
- local->op_ret = op_ret;
- }
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- if ((local->cont.open.flags & O_TRUNC)
- && (local->op_ret >= 0)) {
- STACK_WIND (frame, afr_open_ftruncate_cbk,
- this, this->fops->ftruncate,
- fd, 0);
- } else {
- ret = afr_fd_ctx_set (this, fd);
-
- if (ret < 0) {
- gf_log (this->name, GF_LOG_ERROR,
- "could not set fd ctx for fd=%p",
- fd);
-
- local->op_ret = -1;
- local->op_errno = -ret;
- }
-
- AFR_STACK_UNWIND (frame, local->op_ret,
- local->op_errno, local->fd);
- }
- }
+ GF_OPTION_INIT ("entry-self-heal", priv->entry_self_heal, bool, out);
- return 0;
-}
+ GF_OPTION_INIT ("data-change-log", priv->data_change_log, bool, out);
+ GF_OPTION_INIT ("metadata-change-log", priv->metadata_change_log, bool,
+ out);
-int
-afr_open (call_frame_t *frame, xlator_t *this,
- loc_t *loc, int32_t flags, fd_t *fd)
-{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
+ GF_OPTION_INIT ("entry-change-log", priv->entry_change_log, bool, out);
- int i = 0;
- int ret = -1;
+ GF_OPTION_INIT ("optimistic-change-log", priv->optimistic_change_log,
+ bool, out);
- int32_t call_count = 0;
- int32_t op_ret = -1;
- int32_t op_errno = 0;
- int32_t wind_flags = flags & (~O_TRUNC);
+ GF_OPTION_INIT ("inodelk-trace", priv->inodelk_trace, bool, out);
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
- VALIDATE_OR_GOTO (loc, out);
+ GF_OPTION_INIT ("entrylk-trace", priv->entrylk_trace, bool, out);
- priv = this->private;
+ GF_OPTION_INIT ("pre-op-compat", priv->pre_op_compat, bool, out);
- if (afr_is_split_brain (this, loc->inode)) {
- /* self-heal failed */
+ GF_OPTION_INIT ("eager-lock", priv->eager_lock, bool, out);
+ GF_OPTION_INIT ("quorum-type", qtype, str, out);
+ GF_OPTION_INIT ("quorum-count", priv->quorum_count, uint32, out);
+ GF_OPTION_INIT (AFR_SH_READDIR_SIZE_KEY, priv->sh_readdir_size, size_uint64,
+ out);
+ fix_quorum_options(this,priv,qtype);
- gf_log (this->name, GF_LOG_WARNING,
- "returning EIO, file has to be manually corrected "
- "in the backend");
+ GF_OPTION_INIT ("post-op-delay-secs", priv->post_op_delay_secs, uint32, out);
+ GF_OPTION_INIT ("ensure-durability", priv->ensure_durability, bool,
+ out);
- op_errno = EIO;
- goto out;
- }
+ GF_OPTION_INIT ("self-heal-daemon", priv->shd.enabled, bool, out);
- ALLOC_OR_GOTO (local, afr_local_t, out);
+ GF_OPTION_INIT ("iam-self-heal-daemon", priv->shd.iamshd, bool, out);
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
-
- frame->local = local;
- call_count = local->call_count;
-
- local->cont.open.flags = flags;
- local->fd = fd_ref (fd);
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_open_cbk, (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->open,
- loc, wind_flags, fd);
-
- if (!--call_count)
- break;
- }
- }
+ priv->wait_count = 1;
- op_ret = 0;
-out:
- if (op_ret == -1) {
- AFR_STACK_UNWIND (frame, op_ret, op_errno, fd);
- }
-
- return 0;
-}
-
-/* }}} */
-
-/* {{{ flush */
-
-int
-afr_flush_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
-{
- afr_local_t * local = NULL;
-
- int call_count = -1;
-
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- if (op_ret == 0)
- local->op_ret = 0;
-
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- }
-
- return 0;
-}
-
-
-int
-afr_flush_wind (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
-
- int i = 0;
- int call_count = -1;
-
- local = frame->local;
- priv = this->private;
-
- call_count = afr_up_children_count (priv->child_count, local->child_up);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
-
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_flush_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->flush,
- local->fd);
-
- if (!--call_count)
- break;
- }
- }
-
- return 0;
-}
-
-
-int
-afr_flush_done (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
-
- local = frame->local;
-
- AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno);
-
- return 0;
-}
-
-
-int
-afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd)
-{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
-
- int ret = -1;
-
- int op_ret = -1;
- int op_errno = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
-
- priv = this->private;
-
- ALLOC_OR_GOTO (local, afr_local_t, out);
-
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
-
- frame->local = local;
-
- local->op = GF_FOP_FLUSH;
- local->transaction.fop = afr_flush_wind;
- local->transaction.done = afr_flush_done;
-
- local->fd = fd_ref (fd);
-
- local->transaction.start = 0;
- local->transaction.len = 0;
-
- afr_transaction (frame, this, AFR_FLUSH_TRANSACTION);
-
- op_ret = 0;
-out:
- if (op_ret == -1) {
- AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
- }
-
- return 0;
-}
-
-/* }}} */
-
-
-int
-afr_release (xlator_t *this, fd_t *fd)
-{
- uint64_t ctx;
- afr_fd_ctx_t * fd_ctx;
-
- int ret = 0;
-
- ret = fd_ctx_get (fd, this, &ctx);
-
- if (ret < 0)
+ priv->child_up = GF_CALLOC (sizeof (unsigned char), child_count,
+ gf_afr_mt_char);
+ if (!priv->child_up) {
+ ret = -ENOMEM;
goto out;
-
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
-
- if (fd_ctx) {
- if (fd_ctx->child_failed)
- FREE (fd_ctx->child_failed);
-
- FREE (fd_ctx);
}
-
-out:
- return 0;
-}
-
-
-/* {{{ fsync */
-
-int
-afr_fsync_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno)
-{
- afr_local_t *local = NULL;
-
- int call_count = -1;
-
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- if (op_ret == 0)
- local->op_ret = 0;
-
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0)
- AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno);
-
- return 0;
-}
-
-
-int
-afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd,
- int32_t datasync)
-{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
-
- int ret = -1;
-
- int i = 0;
- int32_t call_count = 0;
- int32_t op_ret = -1;
- int32_t op_errno = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
-
- priv = this->private;
-
- ALLOC_OR_GOTO (local, afr_local_t, out);
-
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
-
- call_count = local->call_count;
- frame->local = local;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND (frame, afr_fsync_cbk,
- priv->children[i],
- priv->children[i]->fops->fsync,
- fd, datasync);
- if (!--call_count)
- break;
- }
- }
-
- op_ret = 0;
-out:
- if (op_ret == -1) {
- AFR_STACK_UNWIND (frame, op_ret, op_errno);
- }
- return 0;
-}
-
-/* }}} */
-
-/* {{{ fsync */
-
-int32_t
-afr_fsyncdir_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno)
-{
- afr_local_t *local = NULL;
-
- int call_count = -1;
-
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- if (op_ret == 0)
- local->op_ret = 0;
-
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0)
- AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno);
-
- return 0;
-}
-
-
-int32_t
-afr_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd,
- int32_t datasync)
-{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
-
- int ret = -1;
-
- int i = 0;
- int32_t call_count = 0;
- int32_t op_ret = -1;
- int32_t op_errno = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
-
- priv = this->private;
-
- ALLOC_OR_GOTO (local, afr_local_t, out);
-
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
-
- call_count = local->call_count;
- frame->local = local;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND (frame, afr_fsync_cbk,
- priv->children[i],
- priv->children[i]->fops->fsyncdir,
- fd, datasync);
- if (!--call_count)
- break;
- }
- }
-
- op_ret = 0;
-out:
- if (op_ret == -1) {
- AFR_STACK_UNWIND (frame, op_ret, op_errno);
- }
- return 0;
-}
-
-/* }}} */
-
-/* {{{ xattrop */
-
-int32_t
-afr_xattrop_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- dict_t *xattr)
-{
- afr_local_t *local = NULL;
-
- int call_count = -1;
-
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- if (op_ret == 0)
- local->op_ret = 0;
-
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0)
- AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, xattr);
-
- return 0;
-}
-
-
-int32_t
-afr_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc,
- gf_xattrop_flags_t optype, dict_t *xattr)
-{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
-
- int ret = -1;
-
- int i = 0;
- int32_t call_count = 0;
- int32_t op_ret = -1;
- int32_t op_errno = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
-
- priv = this->private;
-
- ALLOC_OR_GOTO (local, afr_local_t, out);
-
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
-
- call_count = local->call_count;
- frame->local = local;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND (frame, afr_xattrop_cbk,
- priv->children[i],
- priv->children[i]->fops->xattrop,
- loc, optype, xattr);
- if (!--call_count)
- break;
- }
- }
-
- op_ret = 0;
-out:
- if (op_ret == -1) {
- AFR_STACK_UNWIND (frame, op_ret, op_errno);
- }
- return 0;
-}
-
-/* }}} */
-
-/* {{{ fxattrop */
-
-int32_t
-afr_fxattrop_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- dict_t *xattr)
-{
- afr_local_t *local = NULL;
-
- int call_count = -1;
-
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- if (op_ret == 0)
- local->op_ret = 0;
-
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0)
- AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, xattr);
-
- return 0;
-}
-
-
-int32_t
-afr_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd,
- gf_xattrop_flags_t optype, dict_t *xattr)
-{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
-
- int ret = -1;
-
- int i = 0;
- int32_t call_count = 0;
- int32_t op_ret = -1;
- int32_t op_errno = 0;
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
-
- priv = this->private;
-
- ALLOC_OR_GOTO (local, afr_local_t, out);
-
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
-
- call_count = local->call_count;
- frame->local = local;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND (frame, afr_fxattrop_cbk,
- priv->children[i],
- priv->children[i]->fops->fxattrop,
- fd, optype, xattr);
- if (!--call_count)
- break;
- }
- }
-
- op_ret = 0;
-out:
- if (op_ret == -1) {
- AFR_STACK_UNWIND (frame, op_ret, op_errno);
- }
- return 0;
-}
-
-/* }}} */
-
-
-int32_t
-afr_inodelk_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno)
-
-{
- afr_local_t *local = NULL;
-
- int call_count = -1;
-
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- if (op_ret == 0)
- local->op_ret = 0;
-
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0)
- AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno);
-
- return 0;
-}
-
-
-int32_t
-afr_inodelk (call_frame_t *frame, xlator_t *this,
- const char *volume, loc_t *loc, int32_t cmd, struct flock *flock)
-{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
-
- int ret = -1;
-
- int i = 0;
- int32_t call_count = 0;
- int32_t op_ret = -1;
- int32_t op_errno = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
-
- priv = this->private;
-
- ALLOC_OR_GOTO (local, afr_local_t, out);
-
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
-
- call_count = local->call_count;
- frame->local = local;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND (frame, afr_inodelk_cbk,
- priv->children[i],
- priv->children[i]->fops->inodelk,
- volume, loc, cmd, flock);
-
- if (!--call_count)
- break;
- }
- }
-
- op_ret = 0;
-out:
- if (op_ret == -1) {
- AFR_STACK_UNWIND (frame, op_ret, op_errno);
- }
- return 0;
-}
-
-
-int32_t
-afr_finodelk_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno)
-
-{
- afr_local_t *local = NULL;
-
- int call_count = -1;
-
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- if (op_ret == 0)
- local->op_ret = 0;
-
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0)
- AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno);
-
- return 0;
-}
-
-
-int32_t
-afr_finodelk (call_frame_t *frame, xlator_t *this,
- const char *volume, fd_t *fd, int32_t cmd, struct flock *flock)
-{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
-
- int ret = -1;
-
- int i = 0;
- int32_t call_count = 0;
- int32_t op_ret = -1;
- int32_t op_errno = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
-
- priv = this->private;
-
- ALLOC_OR_GOTO (local, afr_local_t, out);
-
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
-
- call_count = local->call_count;
- frame->local = local;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND (frame, afr_finodelk_cbk,
- priv->children[i],
- priv->children[i]->fops->finodelk,
- volume, fd, cmd, flock);
-
- if (!--call_count)
- break;
- }
- }
-
- op_ret = 0;
-out:
- if (op_ret == -1) {
- AFR_STACK_UNWIND (frame, op_ret, op_errno);
- }
- return 0;
-}
-
-
-int32_t
-afr_entrylk_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno)
-
-{
- afr_local_t *local = NULL;
-
- int call_count = -1;
-
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- if (op_ret == 0)
- local->op_ret = 0;
-
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0)
- AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno);
-
- return 0;
-}
-
-
-int32_t
-afr_entrylk (call_frame_t *frame, xlator_t *this,
- const char *volume, loc_t *loc,
- const char *basename, entrylk_cmd cmd, entrylk_type type)
-{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
-
- int ret = -1;
-
- int i = 0;
- int32_t call_count = 0;
- int32_t op_ret = -1;
- int32_t op_errno = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
-
- priv = this->private;
-
- ALLOC_OR_GOTO (local, afr_local_t, out);
-
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
-
- call_count = local->call_count;
- frame->local = local;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND (frame, afr_entrylk_cbk,
- priv->children[i],
- priv->children[i]->fops->entrylk,
- volume, loc, basename, cmd, type);
-
- if (!--call_count)
- break;
- }
- }
-
- op_ret = 0;
-out:
- if (op_ret == -1) {
- AFR_STACK_UNWIND (frame, op_ret, op_errno);
- }
- return 0;
-}
-
-
-
-int32_t
-afr_fentrylk_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno)
-
-{
- afr_local_t *local = NULL;
-
- int call_count = -1;
-
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- if (op_ret == 0)
- local->op_ret = 0;
-
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0)
- AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno);
-
- return 0;
-}
-
-
-int32_t
-afr_fentrylk (call_frame_t *frame, xlator_t *this,
- const char *volume, fd_t *fd,
- const char *basename, entrylk_cmd cmd, entrylk_type type)
-{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
-
- int ret = -1;
-
- int i = 0;
- int32_t call_count = 0;
- int32_t op_ret = -1;
- int32_t op_errno = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
-
- priv = this->private;
-
- ALLOC_OR_GOTO (local, afr_local_t, out);
-
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
-
- call_count = local->call_count;
- frame->local = local;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND (frame, afr_fentrylk_cbk,
- priv->children[i],
- priv->children[i]->fops->fentrylk,
- volume, fd, basename, cmd, type);
-
- if (!--call_count)
- break;
- }
- }
-
- op_ret = 0;
-out:
- if (op_ret == -1) {
- AFR_STACK_UNWIND (frame, op_ret, op_errno);
- }
- return 0;
-}
-
-
-int32_t
-afr_checksum_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- uint8_t *file_checksum, uint8_t *dir_checksum)
-
-{
- afr_local_t *local = NULL;
-
- int call_count = -1;
-
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- if (op_ret == 0 && (local->op_ret != 0)) {
- local->op_ret = 0;
-
- local->cont.checksum.file_checksum = MALLOC (ZR_FILENAME_MAX);
- memcpy (local->cont.checksum.file_checksum, file_checksum,
- ZR_FILENAME_MAX);
-
- local->cont.checksum.dir_checksum = MALLOC (ZR_FILENAME_MAX);
- memcpy (local->cont.checksum.dir_checksum, dir_checksum,
- ZR_FILENAME_MAX);
-
- }
-
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0)
- AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno,
- local->cont.checksum.file_checksum,
- local->cont.checksum.dir_checksum);
-
- return 0;
-}
-
-
-int32_t
-afr_checksum (call_frame_t *frame, xlator_t *this, loc_t *loc,
- int32_t flag)
-{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
-
- int ret = -1;
-
- int i = 0;
- int32_t call_count = 0;
- int32_t op_ret = -1;
- int32_t op_errno = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
-
- priv = this->private;
-
- ALLOC_OR_GOTO (local, afr_local_t, out);
-
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
-
- call_count = local->call_count;
- frame->local = local;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND (frame, afr_checksum_cbk,
- priv->children[i],
- priv->children[i]->fops->checksum,
- loc, flag);
-
- if (!--call_count)
- break;
- }
- }
-
- op_ret = 0;
-out:
- if (op_ret == -1) {
- AFR_STACK_UNWIND (frame, op_ret, op_errno);
- }
- return 0;
-}
-
-
-int32_t
-afr_statfs_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- struct statvfs *statvfs)
-{
- afr_local_t *local = NULL;
-
- int call_count = 0;
-
- LOCK (&frame->lock);
- {
- local = frame->local;
-
- if (op_ret == 0) {
- local->op_ret = op_ret;
-
- if (local->cont.statfs.buf_set) {
- if (statvfs->f_bavail < local->cont.statfs.buf.f_bavail)
- local->cont.statfs.buf = *statvfs;
- } else {
- local->cont.statfs.buf = *statvfs;
- local->cont.statfs.buf_set = 1;
- }
- }
-
- if (op_ret == -1)
- local->op_errno = op_errno;
-
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0)
- AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno,
- &local->cont.statfs.buf);
-
- return 0;
-}
-
-
-int32_t
-afr_statfs (call_frame_t *frame, xlator_t *this,
- loc_t *loc)
-{
- afr_private_t * priv = NULL;
- int child_count = 0;
- afr_local_t * local = NULL;
- int i = 0;
-
- int ret = -1;
- int call_count = 0;
- int32_t op_ret = -1;
- int32_t op_errno = 0;
-
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
- VALIDATE_OR_GOTO (loc, out);
-
- priv = this->private;
- child_count = priv->child_count;
-
- ALLOC_OR_GOTO (local, afr_local_t, out);
-
- ret = AFR_LOCAL_INIT (local, priv);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
-
- frame->local = local;
- call_count = local->call_count;
-
- for (i = 0; i < child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND (frame, afr_statfs_cbk,
- priv->children[i],
- priv->children[i]->fops->statfs,
- loc);
- if (!--call_count)
- break;
- }
- }
-
- op_ret = 0;
-out:
- if (op_ret == -1) {
- AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
- }
- return 0;
-}
-
-
-int32_t
-afr_lk_unlock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct flock *lock)
-{
- afr_local_t * local = NULL;
-
- int call_count = -1;
-
- local = frame->local;
- call_count = afr_frame_return (frame);
-
- if (call_count == 0)
- AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno,
- lock);
-
- return 0;
-}
-
-
-int32_t
-afr_lk_unlock (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
-
- int i;
- int call_count = 0;
-
- local = frame->local;
- priv = this->private;
-
- call_count = afr_locked_nodes_count (local->cont.lk.locked_nodes,
- priv->child_count);
-
- if (call_count == 0) {
- AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno,
- &local->cont.lk.flock);
- return 0;
- }
-
- local->call_count = call_count;
-
- local->cont.lk.flock.l_type = F_UNLCK;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->cont.lk.locked_nodes[i]) {
- STACK_WIND (frame, afr_lk_unlock_cbk,
- priv->children[i],
- priv->children[i]->fops->lk,
- local->fd, F_SETLK,
- &local->cont.lk.flock);
-
- if (!--call_count)
- break;
- }
- }
-
- return 0;
-}
-
-
-int32_t
-afr_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct flock *lock)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
-
- int call_count = -1;
- int child_index = -1;
-
- local = frame->local;
- priv = this->private;
-
- child_index = (long) cookie;
-
- call_count = --local->call_count;
-
- if (!child_went_down (op_ret, op_errno) && (op_ret == -1)) {
- local->op_ret = -1;
- local->op_errno = op_errno;
-
- afr_lk_unlock (frame, this);
- return 0;
- }
-
- if (op_ret == 0) {
- local->op_ret = 0;
- local->op_errno = 0;
- local->cont.lk.flock = *lock;
- local->cont.lk.locked_nodes[child_index] = 1;
- }
-
- child_index++;
-
- if (child_index < priv->child_count) {
- STACK_WIND_COOKIE (frame, afr_lk_cbk, (void *) (long) child_index,
- priv->children[child_index],
- priv->children[child_index]->fops->lk,
- local->fd, local->cont.lk.cmd,
- &local->cont.lk.flock);
- } else if (local->op_ret == -1) {
- /* all nodes have gone down */
-
- AFR_STACK_UNWIND (frame, -1, ENOTCONN, &local->cont.lk.flock);
- } else {
- /* locking has succeeded on all nodes that are up */
-
- AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno,
- &local->cont.lk.flock);
- }
-
- return 0;
-}
-
-
-int
-afr_lk (call_frame_t *frame, xlator_t *this,
- fd_t *fd, int32_t cmd,
- struct flock *flock)
-{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
-
- int i = 0;
-
- int32_t op_ret = -1;
- int32_t op_errno = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
-
- priv = this->private;
-
- ALLOC_OR_GOTO (local, afr_local_t, out);
- AFR_LOCAL_INIT (local, priv);
-
- frame->local = local;
-
- local->cont.lk.locked_nodes = CALLOC (priv->child_count,
- sizeof (*local->cont.lk.locked_nodes));
-
- if (!local->cont.lk.locked_nodes) {
- gf_log (this->name, GF_LOG_ERROR, "out of memory :(");
- op_errno = ENOMEM;
- goto out;
- }
-
- local->fd = fd_ref (fd);
- local->cont.lk.cmd = cmd;
- local->cont.lk.flock = *flock;
-
- STACK_WIND_COOKIE (frame, afr_lk_cbk, (void *) (long) 0,
- priv->children[i],
- priv->children[i]->fops->lk,
- fd, cmd, flock);
-
- op_ret = 0;
-out:
- if (op_ret == -1) {
- AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
- }
- return 0;
-}
-
-
-/**
- * find_child_index - find the child's index in the array of subvolumes
- * @this: AFR
- * @child: child
- */
-
-static int
-find_child_index (xlator_t *this, xlator_t *child)
-{
- afr_private_t *priv = NULL;
-
- int i = -1;
-
- priv = this->private;
-
- for (i = 0; i < priv->child_count; i++) {
- if ((xlator_t *) child == priv->children[i])
- break;
- }
-
- return i;
-}
-
-
-int32_t
-notify (xlator_t *this, int32_t event,
- void *data, ...)
-{
- afr_private_t * priv = NULL;
- unsigned char * child_up = NULL;
-
- int i = -1;
- int up_children = 0;
-
- priv = this->private;
-
- if (!priv)
- return 0;
-
- child_up = priv->child_up;
-
- switch (event) {
- case GF_EVENT_CHILD_UP:
- i = find_child_index (this, data);
-
- gf_log (this->name, GF_LOG_NORMAL,
- "subvolume %s came up", ((xlator_t *) data)->name);
-
- child_up[i] = 1;
-
- /*
- if all the children were down, and one child came up,
- send notify to parent
- */
-
- for (i = 0; i < priv->child_count; i++)
- if (child_up[i])
- up_children++;
-
- if (up_children == 1)
- default_notify (this, event, data);
-
- break;
-
- case GF_EVENT_CHILD_DOWN:
- i = find_child_index (this, data);
-
- gf_log (this->name, GF_LOG_NORMAL,
- "subvolume %s went down", ((xlator_t *) data)->name);
-
- child_up[i] = 0;
-
- /*
- if all children are down, and this was the last to go down,
- send notify to parent
- */
-
- for (i = 0; i < priv->child_count; i++)
- if (child_up[i])
- up_children++;
-
- if (up_children == 0)
- default_notify (this, event, data);
-
- break;
-
- default:
- default_notify (this, event, data);
- }
-
- return 0;
-}
-
-
-static const char *favorite_child_warning_str = "You have specified subvolume '%s' "
- "as the 'favorite child'. This means that if a discrepancy in the content "
- "or attributes (ownership, permission, etc.) of a file is detected among "
- "the subvolumes, the file on '%s' will be considered the definitive "
- "version and its contents will OVERWRITE the contents of the file on other "
- "subvolumes. All versions of the file except that on '%s' "
- "WILL BE LOST.";
-
-static const char *no_lock_servers_warning_str = "You have set lock-server-count = 0. "
- "This means correctness is NO LONGER GUARANTEED in all cases. If two or more "
- "applications write to the same region of a file, there is a possibility that "
- "its copies will be INCONSISTENT. Set it to a value greater than 0 unless you "
- "are ABSOLUTELY SURE of what you are doing and WILL NOT HOLD GlusterFS "
- "RESPONSIBLE for inconsistent data. If you are in doubt, set it to a value "
- "greater than 0.";
-
-int32_t
-init (xlator_t *this)
-{
- afr_private_t * priv = NULL;
- int child_count = 0;
- xlator_list_t * trav = NULL;
- int i = 0;
- int ret = -1;
- int op_errno = 0;
-
- char * read_subvol = NULL;
- char * fav_child = NULL;
- char * self_heal = NULL;
- char * change_log = NULL;
-
- int32_t lock_server_count = 1;
-
- int fav_ret = -1;
- int read_ret = -1;
- int dict_ret = -1;
-
- if (!this->children) {
- gf_log (this->name, GF_LOG_ERROR,
- "AFR needs more than one child defined");
- return -1;
- }
-
- if (!this->parents) {
- gf_log (this->name, GF_LOG_WARNING,
- "dangling volume. check volfile ");
- }
-
- ALLOC_OR_GOTO (this->private, afr_private_t, out);
-
- priv = this->private;
-
- read_ret = dict_get_str (this->options, "read-subvolume", &read_subvol);
- priv->read_child = -1;
-
- fav_ret = dict_get_str (this->options, "favorite-child", &fav_child);
- priv->favorite_child = -1;
-
- /* Default values */
-
- priv->data_self_heal = 1;
- priv->metadata_self_heal = 1;
- priv->entry_self_heal = 1;
-
- dict_ret = dict_get_str (this->options, "data-self-heal", &self_heal);
- if (dict_ret == 0) {
- ret = gf_string2boolean (self_heal, &priv->data_self_heal);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "invalid 'option data-self-heal %s' "
- "defaulting to data-self-heal as 'on'",
- self_heal);
- priv->data_self_heal = 1;
- }
- }
-
- dict_ret = dict_get_str (this->options, "metadata-self-heal",
- &self_heal);
- if (dict_ret == 0) {
- ret = gf_string2boolean (self_heal, &priv->metadata_self_heal);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "invalid 'option metadata-self-heal %s' "
- "defaulting to metadata-self-heal as 'on'",
- self_heal);
- priv->metadata_self_heal = 1;
- }
- }
-
- dict_ret = dict_get_str (this->options, "entry-self-heal", &self_heal);
- if (dict_ret == 0) {
- ret = gf_string2boolean (self_heal, &priv->entry_self_heal);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "invalid 'option entry-self-heal %s' "
- "defaulting to entry-self-heal as 'on'",
- self_heal);
- priv->entry_self_heal = 1;
- }
- }
-
- /* Change log options */
-
- priv->data_change_log = 1;
- priv->metadata_change_log = 0;
- priv->entry_change_log = 1;
-
- dict_ret = dict_get_str (this->options, "data-change-log",
- &change_log);
- if (dict_ret == 0) {
- ret = gf_string2boolean (change_log, &priv->data_change_log);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "invalid 'option data-change-log %s'. "
- "defaulting to data-change-log as 'on'",
- change_log);
- priv->data_change_log = 1;
- }
- }
-
- dict_ret = dict_get_str (this->options, "metadata-change-log",
- &change_log);
- if (dict_ret == 0) {
- ret = gf_string2boolean (change_log,
- &priv->metadata_change_log);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "invalid 'option metadata-change-log %s'. "
- "defaulting to metadata-change-log as 'off'",
- change_log);
- priv->metadata_change_log = 0;
- }
- }
-
- dict_ret = dict_get_str (this->options, "entry-change-log",
- &change_log);
- if (dict_ret == 0) {
- ret = gf_string2boolean (change_log, &priv->entry_change_log);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "invalid 'option entry-change-log %s'. "
- "defaulting to entry-change-log as 'on'",
- change_log);
- priv->entry_change_log = 1;
- }
- }
-
- /* Locking options */
-
- priv->data_lock_server_count = 1;
- priv->metadata_lock_server_count = 0;
- priv->entry_lock_server_count = 1;
-
- dict_ret = dict_get_int32 (this->options, "data-lock-server-count",
- &lock_server_count);
- if (dict_ret == 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "setting data lock server count to %d",
- lock_server_count);
-
- if (lock_server_count == 0)
- gf_log (this->name, GF_LOG_WARNING,
- no_lock_servers_warning_str);
-
- priv->data_lock_server_count = lock_server_count;
- }
-
-
- dict_ret = dict_get_int32 (this->options,
- "metadata-lock-server-count",
- &lock_server_count);
- if (dict_ret == 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "setting metadata lock server count to %d",
- lock_server_count);
- priv->metadata_lock_server_count = lock_server_count;
- }
-
-
- dict_ret = dict_get_int32 (this->options, "entry-lock-server-count",
- &lock_server_count);
- if (dict_ret == 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "setting entry lock server count to %d",
- lock_server_count);
+ for (i = 0; i < child_count; i++)
+ priv->child_up[i] = -1; /* start with unknown state.
+ this initialization needed
+ for afr_notify() to work
+ reliably
+ */
+
+ priv->children = GF_CALLOC (sizeof (xlator_t *), child_count,
+ gf_afr_mt_xlator_t);
+ if (!priv->children) {
+ ret = -ENOMEM;
+ goto out;
+ }
- priv->entry_lock_server_count = lock_server_count;
- }
+ priv->pending_key = GF_CALLOC (sizeof (*priv->pending_key),
+ child_count,
+ gf_afr_mt_char);
+ if (!priv->pending_key) {
+ ret = -ENOMEM;
+ goto out;
+ }
- trav = this->children;
- while (trav) {
- if (!read_ret && !strcmp (read_subvol, trav->xlator->name)) {
- gf_log (this->name, GF_LOG_DEBUG,
- "subvolume '%s' specified as read child",
- trav->xlator->name);
-
- priv->read_child = child_count;
- }
-
- if (fav_ret == 0 && !strcmp (fav_child, trav->xlator->name)) {
- gf_log (this->name, GF_LOG_WARNING,
- favorite_child_warning_str, trav->xlator->name,
- trav->xlator->name, trav->xlator->name);
- priv->favorite_child = child_count;
- }
-
- child_count++;
- trav = trav->next;
- }
+ trav = this->children;
+ i = 0;
+ while (i < child_count) {
+ priv->children[i] = trav->xlator;
- priv->wait_count = 1;
+ ret = gf_asprintf (&priv->pending_key[i], "%s.%s",
+ AFR_XATTR_PREFIX,
+ trav->xlator->name);
+ if (-1 == ret) {
+ ret = -ENOMEM;
+ goto out;
+ }
- priv->child_count = child_count;
+ trav = trav->next;
+ i++;
+ }
- LOCK_INIT (&priv->lock);
- LOCK_INIT (&priv->read_child_lock);
+ ret = gf_asprintf (&priv->sh_domain, AFR_SH_DATA_DOMAIN_FMT,
+ this->name);
+ if (-1 == ret) {
+ ret = -ENOMEM;
+ goto out;
+ }
- priv->child_up = CALLOC (sizeof (unsigned char), child_count);
- if (!priv->child_up) {
- gf_log (this->name, GF_LOG_ERROR,
- "out of memory :(");
- op_errno = ENOMEM;
- goto out;
- }
+ priv->last_event = GF_CALLOC (child_count, sizeof (*priv->last_event),
+ gf_afr_mt_int32_t);
+ if (!priv->last_event) {
+ ret = -ENOMEM;
+ goto out;
+ }
- priv->children = CALLOC (sizeof (xlator_t *), child_count);
- if (!priv->children) {
- gf_log (this->name, GF_LOG_ERROR,
- "out of memory :(");
- op_errno = ENOMEM;
+ ret = afr_selfheal_daemon_init (this);
+ if (ret) {
+ ret = -ENOMEM;
goto out;
}
- priv->pending_key = CALLOC (sizeof (*priv->pending_key), child_count);
- if (!priv->pending_key) {
+ /* keep more local here as we may need them for self-heal etc */
+ this->local_pool = mem_pool_new (afr_local_t, 512);
+ if (!this->local_pool) {
+ ret = -1;
gf_log (this->name, GF_LOG_ERROR,
- "out of memory :(");
- op_errno = ENOMEM;
+ "failed to create local_t's memory pool");
goto out;
}
- trav = this->children;
- i = 0;
- while (i < child_count) {
- priv->children[i] = trav->xlator;
+ priv->root_inode = NULL;
- asprintf (&priv->pending_key[i], "%s.%s", AFR_XATTR_PREFIX,
- trav->xlator->name);
-
- trav = trav->next;
- i++;
- }
-
- ret = 0;
+ ret = 0;
out:
- return ret;
+ return ret;
}
int
fini (xlator_t *this)
{
- return 0;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+ this->private = NULL;
+ afr_priv_destroy (priv);
+ //if (this->itable);//I dont see any destroy func
+
+ return 0;
}
struct xlator_fops fops = {
- .lookup = afr_lookup,
- .open = afr_open,
- .lk = afr_lk,
- .flush = afr_flush,
- .statfs = afr_statfs,
- .fsync = afr_fsync,
- .fsyncdir = afr_fsyncdir,
- .xattrop = afr_xattrop,
- .fxattrop = afr_fxattrop,
- .inodelk = afr_inodelk,
- .finodelk = afr_finodelk,
- .entrylk = afr_entrylk,
- .fentrylk = afr_fentrylk,
- .checksum = afr_checksum,
-
- /* inode read */
- .access = afr_access,
- .stat = afr_stat,
- .fstat = afr_fstat,
- .readlink = afr_readlink,
- .getxattr = afr_getxattr,
- .readv = afr_readv,
-
- /* inode write */
- .chmod = afr_chmod,
- .chown = afr_chown,
- .fchmod = afr_fchmod,
- .fchown = afr_fchown,
- .writev = afr_writev,
- .truncate = afr_truncate,
- .ftruncate = afr_ftruncate,
- .utimens = afr_utimens,
- .setxattr = afr_setxattr,
- .removexattr = afr_removexattr,
-
- /* dir read */
- .opendir = afr_opendir,
- .readdir = afr_readdir,
- .getdents = afr_getdents,
-
- /* dir write */
- .create = afr_create,
- .mknod = afr_mknod,
- .mkdir = afr_mkdir,
- .unlink = afr_unlink,
- .rmdir = afr_rmdir,
- .link = afr_link,
- .symlink = afr_symlink,
- .rename = afr_rename,
- .setdents = afr_setdents,
+ .lookup = afr_lookup,
+ .open = afr_open,
+ .lk = afr_lk,
+ .flush = afr_flush,
+ .statfs = afr_statfs,
+ .fsync = afr_fsync,
+ .fsyncdir = afr_fsyncdir,
+ .xattrop = afr_xattrop,
+ .fxattrop = afr_fxattrop,
+ .inodelk = afr_inodelk,
+ .finodelk = afr_finodelk,
+ .entrylk = afr_entrylk,
+ .fentrylk = afr_fentrylk,
+ .fallocate = afr_fallocate,
+ .discard = afr_discard,
+ .zerofill = afr_zerofill,
+
+ /* inode read */
+ .access = afr_access,
+ .stat = afr_stat,
+ .fstat = afr_fstat,
+ .readlink = afr_readlink,
+ .getxattr = afr_getxattr,
+ .fgetxattr = afr_fgetxattr,
+ .readv = afr_readv,
+
+ /* inode write */
+ .writev = afr_writev,
+ .truncate = afr_truncate,
+ .ftruncate = afr_ftruncate,
+ .setxattr = afr_setxattr,
+ .fsetxattr = afr_fsetxattr,
+ .setattr = afr_setattr,
+ .fsetattr = afr_fsetattr,
+ .removexattr = afr_removexattr,
+ .fremovexattr = afr_fremovexattr,
+
+ /* dir read */
+ .opendir = afr_opendir,
+ .readdir = afr_readdir,
+ .readdirp = afr_readdirp,
+
+ /* dir write */
+ .create = afr_create,
+ .mknod = afr_mknod,
+ .mkdir = afr_mkdir,
+ .unlink = afr_unlink,
+ .rmdir = afr_rmdir,
+ .link = afr_link,
+ .symlink = afr_symlink,
+ .rename = afr_rename,
};
-struct xlator_mops mops = {
+struct xlator_dumpops dumpops = {
+ .priv = afr_priv_dump,
};
struct xlator_cbks cbks = {
.release = afr_release,
+ .releasedir = afr_releasedir,
+ .forget = afr_forget,
};
struct volume_options options[] = {
- { .key = {"read-subvolume" },
- .type = GF_OPTION_TYPE_XLATOR
- },
- { .key = {"favorite-child"},
- .type = GF_OPTION_TYPE_XLATOR
- },
- { .key = {"data-self-heal"},
- .type = GF_OPTION_TYPE_BOOL
- },
- { .key = {"metadata-self-heal"},
- .type = GF_OPTION_TYPE_BOOL
- },
- { .key = {"entry-self-heal"},
- .type = GF_OPTION_TYPE_BOOL
- },
- { .key = {"data-change-log"},
- .type = GF_OPTION_TYPE_BOOL
- },
- { .key = {"metadata-change-log"},
- .type = GF_OPTION_TYPE_BOOL
- },
- { .key = {"entry-change-log"},
- .type = GF_OPTION_TYPE_BOOL
- },
- { .key = {"data-lock-server-count"},
- .type = GF_OPTION_TYPE_INT,
- .min = 0
+ { .key = {"read-subvolume" },
+ .type = GF_OPTION_TYPE_XLATOR,
+ .description = "inode-read fops happen only on one of the bricks in "
+ "replicate. Afr will prefer the one specified using "
+ "this option if it is not stale. Option value must be "
+ "one of the xlator names of the children. "
+ "Ex: <volname>-client-0 till "
+ "<volname>-client-<number-of-bricks - 1>"
+ },
+ { .key = {"read-subvolume-index" },
+ .type = GF_OPTION_TYPE_INT,
+ .default_value = "-1",
+ .description = "inode-read fops happen only on one of the bricks in "
+ "replicate. AFR will prefer the one specified using "
+ "this option if it is not stale. allowed options"
+ " include -1 till replica-count - 1"
+ },
+ { .key = {"read-hash-mode" },
+ .type = GF_OPTION_TYPE_INT,
+ .min = 0,
+ .max = 2,
+ .default_value = "1",
+ .description = "inode-read fops happen only on one of the bricks in "
+ "replicate. AFR will prefer the one computed using "
+ "the method specified using this option"
+ "0 = first up server, "
+ "1 = hash by GFID of file (all clients use "
+ "same subvolume), "
+ "2 = hash by GFID of file and client PID",
+ },
+ { .key = {"choose-local" },
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "true",
+ .description = "Choose a local subvolume (i.e. Brick) to read from"
+ " if read-subvolume is not explicitly set.",
+ },
+ { .key = {"favorite-child"},
+ .type = GF_OPTION_TYPE_XLATOR,
+ .description = "If a split-brain happens choose subvol/brick set by "
+ "this option as source."
+ },
+ { .key = {"background-self-heal-count"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 0,
+ .default_value = "16",
+ .validate = GF_OPT_VALIDATE_MIN,
+ .description = "This specifies the number of self-heals that can be "
+ " performed in background without blocking the fop"
+ },
+ { .key = {"data-self-heal"},
+ .type = GF_OPTION_TYPE_STR,
+ .value = {"1", "on", "yes", "true", "enable",
+ "0", "off", "no", "false", "disable",
+ "open"},
+ .default_value = "on",
+ .description = "Using this option we can enable/disable data "
+ "self-heal on the file. \"open\" means data "
+ "self-heal action will only be triggered by file "
+ "open operations."
+ },
+ { .key = {"data-self-heal-algorithm"},
+ .type = GF_OPTION_TYPE_STR,
+ .description = "Select between \"full\", \"diff\". The "
+ "\"full\" algorithm copies the entire file from "
+ "source to sink. The \"diff\" algorithm copies to "
+ "sink only those blocks whose checksums don't match "
+ "with those of source. If no option is configured "
+ "the option is chosen dynamically as follows: "
+ "If the file does not exist on one of the sinks "
+ "or empty file exists or if the source file size is "
+ "about the same as page size the entire file will "
+ "be read and written i.e \"full\" algo, "
+ "otherwise \"diff\" algo is chosen.",
+ .value = { "diff", "full"}
+ },
+ { .key = {"data-self-heal-window-size"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 1,
+ .max = 1024,
+ .default_value = "1",
+ .description = "Maximum number blocks per file for which self-heal "
+ "process would be applied simultaneously."
+ },
+ { .key = {"metadata-self-heal"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
+ .description = "Using this option we can enable/disable metadata "
+ "i.e. Permissions, ownerships, xattrs self-heal on "
+ "the file/directory."
+ },
+ { .key = {"entry-self-heal"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
+ .description = "Using this option we can enable/disable entry "
+ "self-heal on the directory."
+ },
+ { .key = {"data-change-log"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
+ .description = "Data fops like write/truncate will not perform "
+ "pre/post fop changelog operations in afr transaction "
+ "if this option is disabled"
+ },
+ { .key = {"metadata-change-log"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
+ .description = "Metadata fops like setattr/setxattr will not perform "
+ "pre/post fop changelog operations in afr transaction "
+ "if this option is disabled"
+ },
+ { .key = {"entry-change-log"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
+ .description = "Entry fops like create/unlink will not perform "
+ "pre/post fop changelog operations in afr transaction "
+ "if this option is disabled"
+ },
+ { .key = {"optimistic-change-log"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
+ .description = "Entry/Metadata fops will not perform "
+ "pre fop changelog operations in afr transaction "
+ "if this option is enabled."
+ },
+ { .key = {"inodelk-trace"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ .description = "Enabling this option logs inode lock/unlocks"
+ },
+ { .key = {"entrylk-trace"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ .description = "Enabling this option logs entry lock/unlocks"
+ },
+ { .key = {"pre-op-compat"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
+ .description = "Use separate pre-op xattrop() FOP rather than "
+ "overloading xdata of the OP"
},
- { .key = {"metadata-lock-server-count"},
- .type = GF_OPTION_TYPE_INT,
- .min = 0
+ { .key = {"eager-lock"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
+ .description = "Lock phase of a transaction has two sub-phases. "
+ "First is an attempt to acquire locks in parallel by "
+ "broadcasting non-blocking lock requests. If lock "
+ "acquisition fails on any server, then the held locks "
+ "are unlocked and revert to a blocking locked mode "
+ "sequentially on one server after another. If this "
+ "option is enabled the initial broadcasting lock "
+ "request attempt to acquire lock on the entire file. "
+ "If this fails, we revert back to the sequential "
+ "\"regional\" blocking lock as before. In the case "
+ "where such an \"eager\" lock is granted in the "
+ "non-blocking phase, it gives rise to an opportunity "
+ "for optimization. i.e, if the next write transaction "
+ "on the same FD arrives before the unlock phase of "
+ "the first transaction, it \"takes over\" the full "
+ "file lock. Similarly if yet another data transaction "
+ "arrives before the unlock phase of the \"optimized\" "
+ "transaction, that in turn \"takes over\" the lock as "
+ "well. The actual unlock now happens at the end of "
+ "the last \"optimized\" transaction."
+
+ },
+ { .key = {"self-heal-daemon"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
+ .description = "This option applies to only self-heal-daemon. "
+ "Index directory crawl and automatic healing of files "
+ "will not be performed if this option is turned off."
+ },
+ { .key = {"iam-self-heal-daemon"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ .description = "This option differentiates if the replicate "
+ "translator is running as part of self-heal-daemon "
+ "or not."
+ },
+ { .key = {"quorum-type"},
+ .type = GF_OPTION_TYPE_STR,
+ .value = { "none", "auto", "fixed"},
+ .default_value = "none",
+ .description = "If value is \"fixed\" only allow writes if "
+ "quorum-count bricks are present. If value is "
+ "\"auto\" only allow writes if more than half of "
+ "bricks, or exactly half including the first, are "
+ "present.",
+ },
+ { .key = {"quorum-count"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 1,
+ .max = INT_MAX,
+ .default_value = 0,
+ .description = "If quorum-type is \"fixed\" only allow writes if "
+ "this many bricks or present. Other quorum types "
+ "will OVERWRITE this value.",
+ },
+ { .key = {"node-uuid"},
+ .type = GF_OPTION_TYPE_STR,
+ .description = "Local glusterd uuid string, used in starting "
+ "self-heal-daemon so that it can crawl only on "
+ "local index directories.",
+ },
+ { .key = {"post-op-delay-secs"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 0,
+ .max = INT_MAX,
+ .default_value = "1",
+ .description = "Time interval induced artificially before "
+ "post-operation phase of the transaction to "
+ "enhance overlap of adjacent write operations.",
+ },
+ { .key = {AFR_SH_READDIR_SIZE_KEY},
+ .type = GF_OPTION_TYPE_SIZET,
+ .description = "readdirp size for performing entry self-heal",
+ .min = 1024,
+ .max = 131072,
+ .default_value = "1KB",
+ },
+ { .key = {"ensure-durability"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .description = "Afr performs fsyncs for transactions if this "
+ "option is on to make sure the changelogs/data is "
+ "written to the disk",
+ .default_value = "on",
+ },
+ { .key = {"afr-dirty-xattr"},
+ .type = GF_OPTION_TYPE_STR,
+ .default_value = AFR_DIRTY_DEFAULT,
},
- { .key = {"entry-lock-server-count"},
- .type = GF_OPTION_TYPE_INT,
- .min = 0
+ { .key = {"metadata-splitbrain-forced-heal"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
},
- { .key = {NULL} },
+ { .key = {NULL} },
};
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index 2871bfc47..36042f7b2 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -1,20 +1,11 @@
/*
- Copyright (c) 2008-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
- This file is part of GlusterFS.
-
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
@@ -26,85 +17,147 @@
#include "config.h"
#endif
-#include "scheduler.h"
#include "call-stub.h"
#include "compat-errno.h"
+#include "afr-mem-types.h"
-#define AFR_XATTR_PREFIX "trusted.afr"
+#include "libxlator.h"
+#include "timer.h"
+#include "syncop.h"
-typedef struct _afr_private {
- gf_lock_t lock; /* to guard access to child_count, etc */
- unsigned int child_count; /* total number of children */
+#include "afr-self-heald.h"
- unsigned int read_child_rr; /* round-robin index of the read_child */
- gf_lock_t read_child_lock; /* lock to protect above */
-
- xlator_t **children;
+#define AFR_XATTR_PREFIX "trusted.afr"
+#define AFR_PATHINFO_HEADER "REPLICATE:"
+#define AFR_SH_READDIR_SIZE_KEY "self-heal-readdir-size"
+#define AFR_SH_DATA_DOMAIN_FMT "%s:self-heal"
+#define AFR_DIRTY_DEFAULT AFR_XATTR_PREFIX ".dirty"
+#define AFR_DIRTY (((afr_private_t *) (THIS->private))->afr_dirty)
- unsigned char *child_up;
+#define AFR_LOCKEE_COUNT_MAX 3
+#define AFR_DOM_COUNT_MAX 3
+#define AFR_NUM_CHANGE_LOGS 3 /*data + metadata + entry*/
- char **pending_key;
+typedef int (*afr_lock_cbk_t) (call_frame_t *frame, xlator_t *this);
- gf_boolean_t data_self_heal; /* on/off */
- gf_boolean_t metadata_self_heal; /* on/off */
- gf_boolean_t entry_self_heal; /* on/off */
+typedef int (*afr_read_txn_wind_t) (call_frame_t *frame, xlator_t *this, int subvol);
- gf_boolean_t data_change_log; /* on/off */
- gf_boolean_t metadata_change_log; /* on/off */
- gf_boolean_t entry_change_log; /* on/off */
+typedef int (*afr_inode_refresh_cbk_t) (call_frame_t *frame, xlator_t *this, int err);
- int read_child; /* read-subvolume */
- unsigned int favorite_child; /* subvolume to be preferred in resolving
- split-brain cases */
+typedef int (*afr_changelog_resume_t) (call_frame_t *frame, xlator_t *this);
- unsigned int data_lock_server_count;
- unsigned int metadata_lock_server_count;
- unsigned int entry_lock_server_count;
+#define alloca0(size) ({void *__ptr; __ptr = alloca(size); memset(__ptr, 0, size); __ptr;})
+#define AFR_COUNT(array,max) ({int __i; int __res = 0; for (__i = 0; __i < max; __i++) if (array[__i]) __res++; __res;})
+#define AFR_INTERSECT(dst,src1,src2,max) ({int __i; for (__i = 0; __i < max; __i++) dst[__i] = src1[__i] && src2[__i];})
- unsigned int wait_count; /* # of servers to wait for success */
-} afr_private_t;
+typedef struct _afr_private {
+ gf_lock_t lock; /* to guard access to child_count, etc */
+ unsigned int child_count; /* total number of children */
-typedef struct {
- /* array of stat's, one for each child */
- struct stat *buf;
+ xlator_t **children;
- /* array of xattr's, one for each child */
- dict_t **xattr;
+ inode_t *root_inode;
- /* array of errno's, one for each child */
- int *child_errno;
+ unsigned char *child_up;
- int32_t **pending_matrix;
- int32_t **delta_matrix;
+ char **pending_key;
- int *sources;
- int source;
- int active_source;
- int active_sinks;
- int *success;
+ char *data_self_heal; /* on/off/open */
+ char * data_self_heal_algorithm; /* name of algorithm */
+ unsigned int data_self_heal_window_size; /* max number of pipelined
+ read/writes */
+
+ unsigned int background_self_heal_count;
+ unsigned int background_self_heals_started;
+ gf_boolean_t metadata_self_heal; /* on/off */
+ gf_boolean_t entry_self_heal; /* on/off */
+
+ gf_boolean_t data_change_log; /* on/off */
+ gf_boolean_t metadata_change_log; /* on/off */
+ gf_boolean_t entry_change_log; /* on/off */
+
+ gf_boolean_t metadata_splitbrain_forced_heal; /* on/off */
+ int read_child; /* read-subvolume */
+ unsigned int hash_mode; /* for when read_child is not set */
+ int favorite_child; /* subvolume to be preferred in resolving
+ split-brain cases */
+
+ gf_boolean_t inodelk_trace;
+ gf_boolean_t entrylk_trace;
+
+ unsigned int wait_count; /* # of servers to wait for success */
+
+ uint64_t up_count; /* number of CHILD_UPs we have seen */
+ uint64_t down_count; /* number of CHILD_DOWNs we have seen */
+
+ gf_boolean_t optimistic_change_log;
+ gf_boolean_t eager_lock;
+ gf_boolean_t pre_op_compat; /* on/off */
+ uint32_t post_op_delay_secs;
+ unsigned int quorum_count;
+
+ char vol_uuid[UUID_SIZE + 1];
+ int32_t *last_event;
+
+ /* @event_generation: Keeps count of number of events received which can
+ potentially impact consistency decisions. The events are CHILD_UP
+ and CHILD_DOWN, when we have to recalculate the freshness/staleness
+ of copies to detect if changes had happened while the other server
+ was down. CHILD_DOWN and CHILD_UP can also be received on network
+ disconnect/reconnects and not necessarily server going down/up.
+ Recalculating freshness/staleness on network events is equally
+ important as we might have had a network split brain.
+ */
+ uint32_t event_generation;
- fd_t *healing_fd;
- int op_failed;
+ gf_boolean_t choose_local;
+ gf_boolean_t did_discovery;
+ uint64_t sh_readdir_size;
+ gf_boolean_t ensure_durability;
+ char *sh_domain;
+ char *afr_dirty;
- int file_has_holes;
- blksize_t block_size;
- off_t file_size;
- off_t offset;
+ afr_self_heald_t shd;
- loc_t parent_loc;
- int (*completion_cbk) (call_frame_t *frame, xlator_t *this);
- call_frame_t *sh_frame;
-} afr_self_heal_t;
+ /* pump dependencies */
+ void *pump_private;
+ gf_boolean_t use_afr_in_pump;
+} afr_private_t;
typedef enum {
- AFR_DATA_TRANSACTION, /* truncate, write, ... */
- AFR_METADATA_TRANSACTION, /* chmod, chown, ... */
- AFR_ENTRY_TRANSACTION, /* create, rmdir, ... */
- AFR_ENTRY_RENAME_TRANSACTION, /* rename */
- AFR_FLUSH_TRANSACTION, /* flush */
+ AFR_DATA_TRANSACTION, /* truncate, write, ... */
+ AFR_METADATA_TRANSACTION, /* chmod, chown, ... */
+ AFR_ENTRY_TRANSACTION, /* create, rmdir, ... */
+ AFR_ENTRY_RENAME_TRANSACTION, /* rename */
} afr_transaction_type;
+typedef enum {
+ AFR_TRANSACTION_LK,
+ AFR_SELFHEAL_LK,
+} transaction_lk_type_t;
+
+typedef enum {
+ AFR_LOCK_OP,
+ AFR_UNLOCK_OP,
+} afr_lock_op_type_t;
+
+typedef enum {
+ AFR_DATA_SELF_HEAL_LK,
+ AFR_METADATA_SELF_HEAL_LK,
+ AFR_ENTRY_SELF_HEAL_LK,
+}selfheal_lk_type_t;
+
+typedef enum {
+ AFR_INODELK_TRANSACTION,
+ AFR_INODELK_NB_TRANSACTION,
+ AFR_ENTRYLK_TRANSACTION,
+ AFR_ENTRYLK_NB_TRANSACTION,
+ AFR_INODELK_SELFHEAL,
+ AFR_INODELK_NB_SELFHEAL,
+ AFR_ENTRYLK_SELFHEAL,
+ AFR_ENTRYLK_NB_SELFHEAL,
+} afr_lock_call_type_t;
/*
xattr format: trusted.afr.volume = [x y z]
@@ -117,9 +170,8 @@ static inline int
afr_index_for_transaction_type (afr_transaction_type type)
{
switch (type) {
-
+
case AFR_DATA_TRANSACTION:
- case AFR_FLUSH_TRANSACTION:
return 0;
case AFR_METADATA_TRANSACTION:
@@ -133,352 +185,636 @@ afr_index_for_transaction_type (afr_transaction_type type)
return -1; /* make gcc happy */
}
+typedef struct {
+ loc_t loc;
+ char *basename;
+ unsigned char *locked_nodes;
+ int locked_count;
-typedef struct _afr_local {
- unsigned int call_count;
- unsigned int success_count;
- unsigned int enoent_count;
+} afr_entry_lockee_t;
- unsigned int need_metadata_self_heal;
- unsigned int need_entry_self_heal;
- unsigned int need_data_self_heal;
- unsigned int govinda_gOvinda;
+int
+afr_entry_lockee_cmp (const void *l1, const void *l2);
- unsigned int read_child_index;
+typedef struct {
+ char *domain; /* Domain on which inodelk is taken */
+ struct gf_flock flock;
+ unsigned char *locked_nodes;
+ int32_t lock_count;
+} afr_inodelk_t;
- pid_t saved_pid;
+typedef struct {
+ loc_t *lk_loc;
+
+ int lockee_count;
+ afr_entry_lockee_t lockee[AFR_LOCKEE_COUNT_MAX];
+
+ afr_inodelk_t inodelk[AFR_DOM_COUNT_MAX];
+ const char *lk_basename;
+ const char *lower_basename;
+ const char *higher_basename;
+ char lower_locked;
+ char higher_locked;
+
+ unsigned char *locked_nodes;
+ unsigned char *lower_locked_nodes;
+
+ selfheal_lk_type_t selfheal_lk_type;
+ transaction_lk_type_t transaction_lk_type;
+
+ int32_t lock_count;
+ int32_t entrylk_lock_count;
+
+ uint64_t lock_number;
+ int32_t lk_call_count;
+ int32_t lk_expected_count;
+ int32_t lk_attempted_count;
+
+ int32_t lock_op_ret;
+ int32_t lock_op_errno;
+ afr_lock_cbk_t lock_cbk;
+ char *domain; /* Domain on which inode/entry lock/unlock in progress.*/
+} afr_internal_lock_t;
+
+struct afr_reply {
+ int valid;
+ int32_t op_ret;
+ int32_t op_errno;
+ dict_t *xdata;
+ struct iatt poststat;
+ struct iatt postparent;
+ struct iatt prestat;
+ struct iatt preparent;
+ struct iatt preparent2;
+ struct iatt postparent2;
+ uint8_t checksum[MD5_DIGEST_LENGTH];
+};
- int32_t op_ret;
- int32_t op_errno;
+typedef enum {
+ AFR_FD_NOT_OPENED,
+ AFR_FD_OPENED,
+ AFR_FD_OPENING
+} afr_fd_open_status_t;
- int32_t **pending;
+typedef struct {
+ unsigned int *pre_op_done[AFR_NUM_CHANGE_LOGS];
+ int inherited[AFR_NUM_CHANGE_LOGS];
+ int on_disk[AFR_NUM_CHANGE_LOGS];
+ afr_fd_open_status_t *opened_on; /* which subvolumes the fd is open on */
- loc_t loc;
- loc_t newloc;
+ unsigned int *lock_piggyback;
+ unsigned int *lock_acquired;
- fd_t *fd;
+ int flags;
- glusterfs_fop_t fop;
+ /* used for delayed-post-op optimization */
+ pthread_mutex_t delay_lock;
+ gf_timer_t *delay_timer;
+ call_frame_t *delay_frame;
- unsigned char *child_up;
- int child_count;
+ /* set if any write on this fd was a non stable write
+ (i.e, without O_SYNC or O_DSYNC)
+ */
+ gf_boolean_t witnessed_unstable_write;
- int32_t *child_errno;
-
- dict_t *xattr_req;
- int open_fd_count;
- /*
- This struct contains the arguments for the "continuation"
- (scheme-like) of fops
+ /* @open_fd_count:
+ Number of open FDs queried from the server, as queried through
+ xdata in FOPs. Currently, used to decide if eager-locking must be
+ temporarily disabled.
*/
+ uint32_t open_fd_count;
- int op;
- struct {
- struct {
- unsigned char buf_set;
- struct statvfs buf;
- } statfs;
- struct {
- inode_t *inode;
- struct stat buf;
- dict_t *xattr;
- } lookup;
+ /* list of frames currently in progress */
+ struct list_head eager_locked;
+} afr_fd_ctx_t;
- struct {
- int32_t flags;
- } open;
- struct {
- int32_t cmd;
- struct flock flock;
- unsigned char *locked_nodes;
- } lk;
+typedef struct _afr_local {
+ glusterfs_fop_t op;
+ unsigned int call_count;
- struct {
- uint8_t *file_checksum;
- uint8_t *dir_checksum;
- } checksum;
+ /* @event_generation: copy of priv->event_generation taken at the
+ time of starting the transaction. The copy is made so that we
+ have a stable value through the various phases of the transaction.
+ */
+ unsigned int event_generation;
- /* inode read */
+ uint32_t open_fd_count;
+ gf_boolean_t update_open_fd_count;
- struct {
- int32_t mask;
- int last_tried; /* index of the child we tried previously */
- } access;
+ gf_lkowner_t saved_lk_owner;
- struct {
- int last_tried;
- ino_t ino;
- } stat;
+ int32_t op_ret;
+ int32_t op_errno;
- struct {
- int last_tried;
- ino_t ino;
- } fstat;
+ int32_t **pending;
- struct {
- size_t size;
- int last_tried;
- } readlink;
+ int dirty[AFR_NUM_CHANGE_LOGS];
- struct {
- const char *name;
- int last_tried;
- } getxattr;
+ loc_t loc;
+ loc_t newloc;
- struct {
- size_t size;
- off_t offset;
- int last_tried;
- } readv;
+ fd_t *fd;
+ afr_fd_ctx_t *fd_ctx;
- /* dir read */
+ /* @child_up: copy of priv->child_up taken at the time of transaction
+ start. The copy is taken so that we have a stable child_up array
+ through the phases of the transaction as priv->child_up[i] can keep
+ changing through time.
+ */
+ unsigned char *child_up;
+
+ /* @read_attempted:
+ array of flags representing subvolumes where read operations of
+ the read transaction have already been attempted. The array is
+ first pre-filled with down subvolumes, and as reads are performed
+ on other subvolumes, those are set as well. This way if the read
+ operation fails we do not retry on that subvolume again.
+ */
+ unsigned char *read_attempted;
- struct {
- int success_count;
- int32_t op_ret;
- int32_t op_errno;
- } opendir;
+ /* @readfn:
- struct {
- int32_t op_ret;
- int32_t op_errno;
- size_t size;
- off_t offset;
+ pointer to function which will perform the read operation on a given
+ subvolume. Used in read transactions.
+ */
- int last_tried;
- } readdir;
+ afr_read_txn_wind_t readfn;
- struct {
- int32_t op_ret;
- int32_t op_errno;
+ /* @refreshed:
- size_t size;
- off_t offset;
- int32_t flag;
+ the inode was "refreshed" (i.e, pending xattrs from all subvols
+ freshly inspected and inode ctx updated accordingly) as part of
+ this transaction already.
+ */
+ gf_boolean_t refreshed;
- int last_tried;
- } getdents;
+ /* @inode:
- /* inode write */
+ the inode on which the read txn is performed on. ref'ed and copied
+ from either fd->inode or loc.inode
+ */
- struct {
- ino_t ino;
- mode_t mode;
- struct stat buf;
- } chmod;
+ inode_t *inode;
- struct {
- ino_t ino;
- mode_t mode;
- struct stat buf;
- } fchmod;
+ /* @parent[2]:
- struct {
- ino_t ino;
- uid_t uid;
- gid_t gid;
- struct stat buf;
- } chown;
+ parent inode[s] on which directory transactions are performed.
+ */
- struct {
- ino_t ino;
- uid_t uid;
- gid_t gid;
- struct stat buf;
- } fchown;
-
- struct {
- ino_t ino;
- struct stat buf;
+ inode_t *parent;
+ inode_t *parent2;
- int32_t op_ret;
+ /* @readable:
- struct iovec *vector;
- struct iobref *iobref;
- int32_t count;
- off_t offset;
- } writev;
+ array of flags representing servers from which a read can be
+ performed. This is the output of afr_inode_refresh()
+ */
+ unsigned char *readable;
+
+ afr_inode_refresh_cbk_t refreshfn;
+
+ /* @refreshinode:
+
+ Inode currently getting refreshed.
+ */
+ inode_t *refreshinode;
+
+ /*
+ @pre_op_compat:
+
+ compatibility mode of pre-op. send a separate pre-op and
+ op operations as part of transaction, rather than combining
+ */
+
+ gf_boolean_t pre_op_compat;
+
+ dict_t *xattr_req;
+
+ afr_internal_lock_t internal_lock;
+
+ dict_t *dict;
+
+ int optimistic_change_log;
+ gf_boolean_t delayed_post_op;
+
+ /* Is the current writev() going to perform a stable write?
+ i.e, is fd->flags or @flags writev param have O_SYNC or
+ O_DSYNC?
+ */
+ gf_boolean_t stable_write;
+
+ /* This write appended to the file. Nnot necessarily O_APPEND,
+ just means the offset of write was at the end of file.
+ */
+ gf_boolean_t append_write;
+
+ /*
+ This struct contains the arguments for the "continuation"
+ (scheme-like) of fops
+ */
+
+ struct {
+ struct {
+ unsigned char buf_set;
+ struct statvfs buf;
+ } statfs;
+
+ struct {
+ int32_t flags;
+ } open;
+
+ struct {
+ int32_t cmd;
+ struct gf_flock user_flock;
+ struct gf_flock ret_flock;
+ unsigned char *locked_nodes;
+ } lk;
+
+ /* inode read */
+
+ struct {
+ int32_t mask;
+ int last_index; /* index of the child we tried previously */
+ } access;
+
+ struct {
+ int last_index;
+ } stat;
+
+ struct {
+ int last_index;
+ } fstat;
+
+ struct {
+ size_t size;
+ int last_index;
+ } readlink;
+
+ struct {
+ char *name;
+ int last_index;
+ long xattr_len;
+ } getxattr;
+
+ struct {
+ size_t size;
+ off_t offset;
+ int last_index;
+ uint32_t flags;
+ } readv;
+
+ /* dir read */
+
+ struct {
+ int success_count;
+ int32_t op_ret;
+ int32_t op_errno;
+
+ uint32_t *checksum;
+ } opendir;
+
+ struct {
+ int32_t op_ret;
+ int32_t op_errno;
+ size_t size;
+ off_t offset;
+ dict_t *dict;
+ gf_boolean_t failed;
+ int last_index;
+ } readdir;
+ /* inode write */
+
+ struct {
+ struct iatt prebuf;
+ struct iatt postbuf;
+ } inode_wfop; //common structure for all inode-write-fops
+
+ struct {
+ int32_t op_ret;
+
+ struct iovec *vector;
+ struct iobref *iobref;
+ int32_t count;
+ off_t offset;
+ uint32_t flags;
+ } writev;
+
+ struct {
+ off_t offset;
+ } truncate;
+
+ struct {
+ off_t offset;
+ } ftruncate;
+
+ struct {
+ struct iatt in_buf;
+ int32_t valid;
+ } setattr;
+
+ struct {
+ struct iatt in_buf;
+ int32_t valid;
+ } fsetattr;
+
+ struct {
+ dict_t *dict;
+ int32_t flags;
+ } setxattr;
+
+ struct {
+ dict_t *dict;
+ int32_t flags;
+ } fsetxattr;
+
+ struct {
+ char *name;
+ } removexattr;
+
+ struct {
+ dict_t *xattr;
+ } xattrop;
+
+ struct {
+ dict_t *xattr;
+ } fxattrop;
+
+ /* dir write */
+
+ struct {
+ inode_t *inode;
+ struct iatt buf;
+ struct iatt preparent;
+ struct iatt postparent;
+ struct iatt prenewparent;
+ struct iatt postnewparent;
+ } dir_fop; //common structure for all dir fops
+
+ struct {
+ fd_t *fd;
+ dict_t *params;
+ int32_t flags;
+ mode_t mode;
+ } create;
+
+ struct {
+ dev_t dev;
+ mode_t mode;
+ dict_t *params;
+ } mknod;
+
+ struct {
+ int32_t mode;
+ dict_t *params;
+ } mkdir;
+
+ struct {
+ int flags;
+ } rmdir;
+
+ struct {
+ dict_t *params;
+ char *linkpath;
+ } symlink;
struct {
- ino_t ino;
+ int32_t mode;
off_t offset;
- struct stat buf;
- } truncate;
+ size_t len;
+ } fallocate;
struct {
- ino_t ino;
off_t offset;
- struct stat buf;
- } ftruncate;
+ size_t len;
+ } discard;
- struct {
- ino_t ino;
- struct timespec tv[2];
- struct stat buf;
- } utimens;
+ struct {
+ off_t offset;
+ off_t len;
+ struct iatt prebuf;
+ struct iatt postbuf;
+ } zerofill;
- struct {
- dict_t *dict;
- int32_t flags;
- } setxattr;
- struct {
- const char *name;
- } removexattr;
+ } cont;
- /* dir write */
-
- struct {
- ino_t ino;
- fd_t *fd;
- int32_t flags;
- mode_t mode;
- inode_t *inode;
- struct stat buf;
- } create;
+ struct {
+ off_t start, len;
- struct {
- ino_t ino;
- dev_t dev;
- mode_t mode;
- inode_t *inode;
- struct stat buf;
- } mknod;
+ gf_boolean_t eager_lock_on;
+ int *eager_lock;
- struct {
- ino_t ino;
- int32_t mode;
- inode_t *inode;
- struct stat buf;
- } mkdir;
+ char *basename;
+ char *new_basename;
- struct {
- int32_t op_ret;
- int32_t op_errno;
- } unlink;
+ loc_t parent_loc;
+ loc_t new_parent_loc;
- struct {
- int32_t op_ret;
- int32_t op_errno;
- } rmdir;
+ afr_transaction_type type;
- struct {
- ino_t ino;
- struct stat buf;
- } rename;
+ /* stub to resume on destruction
+ of the transaction frame */
+ call_stub_t *resume_stub;
- struct {
- ino_t ino;
- inode_t *inode;
- struct stat buf;
- } link;
+ struct list_head eager_locked;
- struct {
- ino_t ino;
- inode_t *inode;
- struct stat buf;
- char *linkpath;
- } symlink;
+ unsigned char *pre_op;
- struct {
- int32_t flags;
- dir_entry_t *entries;
- int32_t count;
- } setdents;
- } cont;
-
- struct {
- off_t start, len;
+ /* @fop_subvols: subvolumes on which FOP will be attempted */
+ unsigned char *fop_subvols;
- unsigned char *locked_nodes;
- int lock_count;
+ /* @failed_subvols: subvolumes on which FOP failed. Always
+ a subset of @fop_subvols */
+ unsigned char *failed_subvols;
- const char *basename;
- const char *new_basename;
+ /* @dirtied: flag which indicates whether we set dirty flag
+ in the OP. Typically true when we are performing operation
+ on more than one subvol and optimistic changelog is disabled
- loc_t parent_loc;
- loc_t new_parent_loc;
+ A 'true' value set in @dirtied flag means an 'undirtying'
+ has to be done in POST-OP phase.
+ */
+ gf_boolean_t dirtied;
- afr_transaction_type type;
+ /* @inherited: flag which indicates that the dirty flags
+ of the previous transaction were inherited
+ */
+ gf_boolean_t inherited;
- int success_count;
- int erase_pending;
- int failure_count;
+ /*
+ @no_uninherit: flag which indicates that a pre_op_uninherit()
+ must _not_ be attempted (and returned as failure) always. This
+ flag is set when a hard pre-op is performed, but not accounted
+ for it in fd_ctx->on_disk[]. Such transactions are "isolated"
+ from the pre-op piggybacking entirely and therefore uninherit
+ must not be attempted.
+ */
+ gf_boolean_t no_uninherit;
- int last_tried;
- int32_t *child_errno;
+ /* @uninherit_done:
+ @uninherit_value:
- call_frame_t *main_frame;
+ The above pair variables make pre_op_uninherit() idempotent.
+ Both are FALSE initially. The first call to pre_op_uninherit
+ sets @uninherit_done to TRUE and the return value to
+ @uninherit_value. Further calls will check for @uninherit_done
+ to be TRUE and if so will simply return @uninherit_value.
+ */
+ gf_boolean_t uninherit_done;
+ gf_boolean_t uninherit_value;
- int (*fop) (call_frame_t *frame, xlator_t *this);
+ /* @changelog_resume: function to be called after changlogging
+ (either pre-op or post-op) is done
+ */
- int (*done) (call_frame_t *frame, xlator_t *this);
+ afr_changelog_resume_t changelog_resume;
- int (*resume) (call_frame_t *frame, xlator_t *this);
+ call_frame_t *main_frame;
- int (*unwind) (call_frame_t *frame, xlator_t *this);
- } transaction;
+ int (*wind) (call_frame_t *frame, xlator_t *this, int subvol);
- afr_self_heal_t self_heal;
-} afr_local_t;
+ int (*fop) (call_frame_t *frame, xlator_t *this);
+ int (*done) (call_frame_t *frame, xlator_t *this);
-typedef struct {
- unsigned char pre_op_done;
- unsigned char *child_failed;
-} afr_fd_ctx_t;
+ int (*resume) (call_frame_t *frame, xlator_t *this);
+
+ int (*unwind) (call_frame_t *frame, xlator_t *this);
+
+ /* post-op hook */
+ } transaction;
+
+ syncbarrier_t barrier;
+
+ struct marker_str marker;
+ /* extra data for fops */
+ dict_t *xdata_req;
+ dict_t *xdata_rsp;
-/* try alloc and if it fails, goto label */
-#define ALLOC_OR_GOTO(var, type, label) do { \
- var = CALLOC (sizeof (type), 1); \
- if (!var) { \
- gf_log (this->name, GF_LOG_ERROR, \
- "out of memory :("); \
- op_errno = ENOMEM; \
- goto label; \
- } \
- } while (0);
+ mode_t umask;
+ int xflag;
+ gf_boolean_t do_discovery;
+ struct afr_reply *replies;
+} afr_local_t;
/* did a call fail due to a child failing? */
-#define child_went_down(op_ret, op_errno) (((op_ret) < 0) && \
- ((op_errno == ENOTCONN) || \
- (op_errno == EBADFD)))
+#define child_went_down(op_ret, op_errno) (((op_ret) < 0) && \
+ ((op_errno == ENOTCONN) || \
+ (op_errno == EBADFD)))
-#define afr_fop_failed(op_ret, op_errno) ((op_ret) == -1)
+int
+afr_inode_read_subvol_get (inode_t *inode, xlator_t *this,
+ unsigned char *data_subvols,
+ unsigned char *metadata_subvols,
+ int *event_generation);
+int
+__afr_inode_read_subvol_get (inode_t *inode, xlator_t *this,
+ unsigned char *data_subvols,
+ unsigned char *metadata_subvols,
+ int *event_generation);
-/* have we tried all children? */
-#define all_tried(i, count) ((i) == (count) - 1)
+int
+__afr_inode_read_subvol_set (inode_t *inode, xlator_t *this,
+ unsigned char *data_subvols,
+ unsigned char *metadata_subvol,
+ int event_generation);
+int
+afr_inode_read_subvol_set (inode_t *inode, xlator_t *this,
+ unsigned char *data_subvols,
+ unsigned char *metadata_subvols,
+ int event_generation);
int
-afr_fd_ctx_set (xlator_t *this, fd_t *fd);
+afr_inode_read_subvol_reset (inode_t *inode, xlator_t *this);
-uint64_t
-afr_read_child (xlator_t *this, inode_t *inode);
+int
+afr_read_subvol_select_by_policy (inode_t *inode, xlator_t *this,
+ unsigned char *readable);
+
+int
+afr_inode_read_subvol_type_get (inode_t *inode, xlator_t *this,
+ unsigned char *readable, int *event_p,
+ int type);
+int
+afr_read_subvol_get (inode_t *inode, xlator_t *this, int *subvol_p,
+ int *event_p, afr_transaction_type type);
+
+#define afr_data_subvol_get(i, t, s, e) \
+ afr_read_subvol_get(i, t, s, e, AFR_DATA_TRANSACTION)
+
+#define afr_metadata_subvol_get(i, t, s, e) \
+ afr_read_subvol_get(i, t, s, e, AFR_METADATA_TRANSACTION)
+
+int
+afr_inode_refresh (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ afr_inode_refresh_cbk_t cbk);
+
+int32_t
+afr_notify (xlator_t *this, int32_t event, void *data, void *data2);
+
+int
+afr_init_entry_lockee (afr_entry_lockee_t *lockee, afr_local_t *local,
+ loc_t *loc, char *basename, int child_count);
void
-afr_set_read_child (xlator_t *this, inode_t *inode, int32_t read_child);
+afr_entry_lockee_cleanup (afr_internal_lock_t *int_lock);
+
+int
+afr_attempt_lock_recovery (xlator_t *this, int32_t child_index);
+
+int
+afr_mark_locked_nodes (xlator_t *this, fd_t *fd,
+ unsigned char *locked_nodes);
void
-afr_build_parent_loc (loc_t *parent, loc_t *child);
+afr_set_lk_owner (call_frame_t *frame, xlator_t *this, void *lk_owner);
int
-afr_up_children_count (int child_count, unsigned char *child_up);
+afr_set_lock_number (call_frame_t *frame, xlator_t *this);
+
+int32_t
+afr_unlock (call_frame_t *frame, xlator_t *this);
int
-afr_locked_nodes_count (unsigned char *locked_nodes, int child_count);
+afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this);
+
+int
+afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this);
+
+int
+afr_blocking_lock (call_frame_t *frame, xlator_t *this);
+
+int
+afr_internal_lock_finish (call_frame_t *frame, xlator_t *this);
+
+int
+afr_lk_transfer_datalock (call_frame_t *dst, call_frame_t *src, char *dom,
+ unsigned int child_count);
int
-afr_first_up_child (afr_private_t *priv);
+__afr_fd_ctx_set (xlator_t *this, fd_t *fd);
+
+int
+afr_fd_ctx_set (xlator_t *this, fd_t *fd);
-ino64_t
-afr_itransform (ino64_t ino, int child_count, int child_index);
+afr_fd_ctx_t *
+afr_fd_ctx_get (fd_t *fd, xlator_t *this);
+
+int
+afr_build_parent_loc (loc_t *parent, loc_t *child, int32_t *op_errno);
+
+int
+afr_locked_nodes_count (unsigned char *locked_nodes, int child_count);
int
-afr_deitransform (ino64_t ino, int child_count);
+afr_replies_interpret (call_frame_t *frame, xlator_t *this, inode_t *inode);
+
+void
+afr_replies_wipe (afr_local_t *local, afr_private_t *priv);
void
afr_local_cleanup (afr_local_t *local, xlator_t *this);
@@ -486,101 +822,155 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this);
int
afr_frame_return (call_frame_t *frame);
-#define AFR_STACK_UNWIND(frame, params ...) \
- do { \
- afr_local_t *__local = NULL; \
- xlator_t *__this = NULL; \
- __local = frame->local; \
- __this = frame->this; \
- frame->local = NULL; \
- STACK_UNWIND (frame, params); \
- afr_local_cleanup (__local, __this); \
- free (__local); \
-} while (0);
-
-#define AFR_STACK_DESTROY(frame) \
- do { \
- afr_local_t *__local = NULL; \
- xlator_t *__this = NULL; \
- __local = frame->local; \
- __this = frame->this; \
- frame->local = NULL; \
- STACK_DESTROY (frame->root); \
- afr_local_cleanup (__local, __this); \
- free (__local); \
-} while (0);
+int
+afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+ fd_t *fd, dict_t *xdata);
+
+void
+afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this);
+
+int
+afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd);
+
+#define AFR_STACK_UNWIND(fop, frame, params ...) \
+ do { \
+ afr_local_t *__local = NULL; \
+ xlator_t *__this = NULL; \
+ if (frame) { \
+ __local = frame->local; \
+ __this = frame->this; \
+ frame->local = NULL; \
+ } \
+ STACK_UNWIND_STRICT (fop, frame, params); \
+ if (__local) { \
+ afr_local_cleanup (__local, __this); \
+ mem_put (__local); \
+ } \
+ } while (0)
+
+#define AFR_STACK_DESTROY(frame) \
+ do { \
+ afr_local_t *__local = NULL; \
+ xlator_t *__this = NULL; \
+ __local = frame->local; \
+ __this = frame->this; \
+ frame->local = NULL; \
+ STACK_DESTROY (frame->root); \
+ if (__local) { \
+ afr_local_cleanup (__local, __this); \
+ mem_put (__local); \
+ } \
+ } while (0);
+
+#define AFR_FRAME_INIT(frame, op_errno) \
+ ({frame->local = mem_get0 (THIS->local_pool); \
+ if (afr_local_init (frame->local, THIS->private, &op_errno)) { \
+ afr_local_cleanup (frame->local, THIS); \
+ mem_put (frame->local); \
+ frame->local = NULL; }; \
+ frame->local;})
+
+#define AFR_STACK_RESET(frame) do { int opr; STACK_RESET (frame->root); AFR_FRAME_INIT(frame, opr);} while (0)
/* allocate and return a string that is the basename of argument */
-static inline char *
-AFR_BASENAME (const char *str)
+static inline char *
+AFR_BASENAME (const char *str)
{
- char *__tmp_str = NULL;
- char *__basename_str = NULL;
- __tmp_str = strdup (str);
- __basename_str = strdup (basename (__tmp_str));
- FREE (__tmp_str);
- return __basename_str;
+ char *__tmp_str = NULL;
+ char *__basename_str = NULL;
+ __tmp_str = gf_strdup (str);
+ __basename_str = gf_strdup (basename (__tmp_str));
+ GF_FREE (__tmp_str);
+ return __basename_str;
}
-/* initialize local_t */
-static inline int
-AFR_LOCAL_INIT (afr_local_t *local, afr_private_t *priv)
-{
- local->child_up = CALLOC (sizeof (*local->child_up),
- priv->child_count);
- if (!local->child_up) {
- return -ENOMEM;
- }
+call_frame_t *
+afr_copy_frame (call_frame_t *base);
- memcpy (local->child_up, priv->child_up,
- sizeof (*local->child_up) * priv->child_count);
+int
+afr_transaction_local_init (afr_local_t *local, xlator_t *this);
+int32_t
+afr_marker_getxattr (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, const char *name,afr_local_t *local, afr_private_t *priv );
- local->call_count = afr_up_children_count (priv->child_count, local->child_up);
- if (local->call_count == 0)
- return -ENOTCONN;
+int
+afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno);
- local->transaction.erase_pending = 1;
+int
+afr_internal_lock_init (afr_internal_lock_t *lk, size_t child_count,
+ transaction_lk_type_t lk_type);
- local->op_ret = -1;
- local->op_errno = EUCLEAN;
+int
+afr_higher_errno (int32_t old_errno, int32_t new_errno);
- return 0;
-}
+int
+afr_final_errno (afr_local_t *local, afr_private_t *priv);
+int
+afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req);
-static inline int
-afr_transaction_local_init (afr_local_t *local, afr_private_t *priv)
-{
- int i;
-
- local->child_errno = CALLOC (sizeof (*local->child_errno),
- priv->child_count);
- if (!local->child_errno) {
- return -ENOMEM;
- }
-
- local->pending = CALLOC (sizeof (*local->pending),
- priv->child_count);
-
- if (!local->pending) {
- return -ENOMEM;
- }
-
- for (i = 0; i < priv->child_count; i++) {
- local->pending[i] = CALLOC (sizeof (*local->pending[i]),
- 3); /* data + metadata + entry */
- if (!local->pending[i])
- return -ENOMEM;
- }
-
- local->transaction.locked_nodes = CALLOC (sizeof (*local->transaction.locked_nodes),
- priv->child_count);
+void
+afr_fix_open (fd_t *fd, xlator_t *this);
- local->transaction.child_errno = CALLOC (sizeof (*local->transaction.child_errno),
- priv->child_count);
+afr_fd_ctx_t *
+afr_fd_ctx_get (fd_t *fd, xlator_t *this);
- return 0;
-}
+void
+afr_set_low_priority (call_frame_t *frame);
+int
+afr_child_fd_ctx_set (xlator_t *this, fd_t *fd, int32_t child,
+ int flags);
+gf_boolean_t
+afr_have_quorum (char *logname, afr_private_t *priv);
+
+void
+afr_matrix_cleanup (int32_t **pending, unsigned int m);
+
+int32_t**
+afr_matrix_create (unsigned int m, unsigned int n);
+
+void
+afr_filter_xattrs (dict_t *xattr);
+
+/*
+ * Special value indicating we should use the "auto" quorum method instead of
+ * a fixed value (including zero to turn off quorum enforcement).
+ */
+#define AFR_QUORUM_AUTO INT_MAX
+
+/*
+ * Having this as a macro will make debugging a bit weirder, but does reduce
+ * the probability of functions handling this check inconsistently.
+ */
+#define QUORUM_CHECK(_func,_label) do { \
+ if (priv->quorum_count && !afr_have_quorum(this->name,priv)) { \
+ gf_log(this->name,GF_LOG_WARNING, \
+ "failing "#_func" due to lack of quorum"); \
+ op_errno = EROFS; \
+ goto _label; \
+ } \
+} while (0);
+
+int
+afr_fd_report_unstable_write (xlator_t *this, fd_t *fd);
+
+gf_boolean_t
+afr_fd_has_witnessed_unstable_write (xlator_t *this, fd_t *fd);
+
+void
+afr_delayed_changelog_wake_resume (xlator_t *this, fd_t *fd, call_stub_t *stub);
+
+int
+afr_inodelk_init (afr_inodelk_t *lk, char *dom, size_t child_count);
+
+void
+afr_handle_open_fd_count (call_frame_t *frame, xlator_t *this);
+
+int
+afr_local_pathinfo (char *pathinfo, gf_boolean_t *is_local);
+
+void
+afr_remove_eager_lock_stub (afr_local_t *local);
#endif /* __AFR_H__ */
diff --git a/xlators/cluster/afr/src/pump.c b/xlators/cluster/afr/src/pump.c
new file mode 100644
index 000000000..eed509956
--- /dev/null
+++ b/xlators/cluster/afr/src/pump.c
@@ -0,0 +1,2473 @@
+/*
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include <unistd.h>
+#include <sys/time.h>
+#include <stdlib.h>
+#include <fnmatch.h>
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "afr-common.c"
+#include "defaults.c"
+#include "glusterfs.h"
+#include "pump.h"
+
+
+static int
+afr_set_dict_gfid (dict_t *dict, uuid_t gfid)
+{
+ int ret = 0;
+ uuid_t *pgfid = NULL;
+
+ GF_ASSERT (gfid);
+
+ pgfid = GF_CALLOC (1, sizeof (uuid_t), gf_common_mt_char);
+ if (!pgfid) {
+ ret = -1;
+ goto out;
+ }
+
+ uuid_copy (*pgfid, gfid);
+
+ ret = dict_set_dynptr (dict, "gfid-req", pgfid, sizeof (uuid_t));
+ if (ret)
+ gf_log (THIS->name, GF_LOG_ERROR, "gfid set failed");
+
+out:
+ if (ret && pgfid)
+ GF_FREE (pgfid);
+ return ret;
+}
+
+static int
+afr_set_root_gfid (dict_t *dict)
+{
+ uuid_t gfid;
+ int ret = 0;
+
+ memset (gfid, 0, 16);
+ gfid[15] = 1;
+
+ ret = afr_set_dict_gfid (dict, gfid);
+
+ return ret;
+}
+
+static int
+afr_build_child_loc (xlator_t *this, loc_t *child, loc_t *parent, char *name)
+{
+ int ret = -1;
+ uuid_t pargfid = {0};
+
+ if (!child)
+ goto out;
+
+ if (!uuid_is_null (parent->inode->gfid))
+ uuid_copy (pargfid, parent->inode->gfid);
+ else if (!uuid_is_null (parent->gfid))
+ uuid_copy (pargfid, parent->gfid);
+
+ if (uuid_is_null (pargfid))
+ goto out;
+
+ if (strcmp (parent->path, "/") == 0)
+ ret = gf_asprintf ((char **)&child->path, "/%s", name);
+ else
+ ret = gf_asprintf ((char **)&child->path, "%s/%s", parent->path,
+ name);
+
+ if (-1 == ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "asprintf failed while setting child path");
+ }
+
+ child->name = strrchr (child->path, '/');
+ if (child->name)
+ child->name++;
+
+ child->parent = inode_ref (parent->inode);
+ child->inode = inode_new (parent->inode->table);
+ uuid_copy (child->pargfid, pargfid);
+
+ if (!child->inode) {
+ ret = -1;
+ goto out;
+ }
+
+ ret = 0;
+out:
+ if ((ret == -1) && child)
+ loc_wipe (child);
+
+ return ret;
+}
+
+static void
+afr_build_root_loc (xlator_t *this, loc_t *loc)
+{
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+ loc->path = gf_strdup ("/");
+ loc->name = "";
+ loc->inode = inode_ref (priv->root_inode);
+ uuid_copy (loc->gfid, loc->inode->gfid);
+}
+
+static void
+afr_update_loc_gfids (loc_t *loc, struct iatt *buf, struct iatt *postparent)
+{
+ GF_ASSERT (loc);
+ GF_ASSERT (buf);
+
+ uuid_copy (loc->gfid, buf->ia_gfid);
+ if (postparent)
+ uuid_copy (loc->pargfid, postparent->ia_gfid);
+}
+
+static uint64_t pump_pid = 0;
+static inline void
+pump_fill_loc_info (loc_t *loc, struct iatt *iatt, struct iatt *parent)
+{
+ afr_update_loc_gfids (loc, iatt, parent);
+ uuid_copy (loc->inode->gfid, iatt->ia_gfid);
+}
+
+static int
+pump_mark_start_pending (xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ pump_private_t *pump_priv = NULL;
+
+ priv = this->private;
+ pump_priv = priv->pump_private;
+
+ pump_priv->pump_start_pending = 1;
+
+ return 0;
+}
+
+static int
+is_pump_start_pending (xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ pump_private_t *pump_priv = NULL;
+
+ priv = this->private;
+ pump_priv = priv->pump_private;
+
+ return (pump_priv->pump_start_pending);
+}
+
+static int
+pump_remove_start_pending (xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ pump_private_t *pump_priv = NULL;
+
+ priv = this->private;
+ pump_priv = priv->pump_private;
+
+ pump_priv->pump_start_pending = 0;
+
+ return 0;
+}
+
+static pump_state_t
+pump_get_state ()
+{
+ xlator_t *this = NULL;
+ afr_private_t *priv = NULL;
+ pump_private_t *pump_priv = NULL;
+
+ pump_state_t ret;
+
+ this = THIS;
+ priv = this->private;
+ pump_priv = priv->pump_private;
+
+ LOCK (&pump_priv->pump_state_lock);
+ {
+ ret = pump_priv->pump_state;
+ }
+ UNLOCK (&pump_priv->pump_state_lock);
+
+ return ret;
+}
+
+int
+pump_change_state (xlator_t *this, pump_state_t state)
+{
+ afr_private_t *priv = NULL;
+ pump_private_t *pump_priv = NULL;
+
+ pump_state_t state_old;
+ pump_state_t state_new;
+
+
+ priv = this->private;
+ pump_priv = priv->pump_private;
+
+ GF_ASSERT (pump_priv);
+
+ LOCK (&pump_priv->pump_state_lock);
+ {
+ state_old = pump_priv->pump_state;
+ state_new = state;
+
+ pump_priv->pump_state = state;
+
+ }
+ UNLOCK (&pump_priv->pump_state_lock);
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Pump changing state from %d to %d",
+ state_old,
+ state_new);
+
+ return 0;
+}
+
+static int
+pump_set_resume_path (xlator_t *this, const char *path)
+{
+ int ret = 0;
+
+ afr_private_t *priv = NULL;
+ pump_private_t *pump_priv = NULL;
+
+ priv = this->private;
+ pump_priv = priv->pump_private;
+
+ GF_ASSERT (pump_priv);
+
+ LOCK (&pump_priv->resume_path_lock);
+ {
+ strncpy (pump_priv->resume_path, path, strlen (path) + 1);
+ }
+ UNLOCK (&pump_priv->resume_path_lock);
+
+ return ret;
+}
+
+static int
+pump_save_path (xlator_t *this, const char *path)
+{
+ afr_private_t *priv = NULL;
+ pump_state_t state;
+ dict_t *dict = NULL;
+ loc_t loc = {0};
+ int dict_ret = 0;
+ int ret = -1;
+
+ state = pump_get_state ();
+ if (state == PUMP_STATE_RESUME)
+ return 0;
+
+ priv = this->private;
+
+ GF_ASSERT (priv->root_inode);
+
+ afr_build_root_loc (this, &loc);
+
+ dict = dict_new ();
+ dict_ret = dict_set_str (dict, PUMP_PATH, (char *)path);
+ if (dict_ret)
+ gf_log (this->name, GF_LOG_WARNING,
+ "%s: failed to set the key %s", path, PUMP_PATH);
+
+ ret = syncop_setxattr (PUMP_SOURCE_CHILD (this), &loc, dict, 0);
+
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_INFO,
+ "setxattr failed - could not save path=%s", path);
+ } else {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "setxattr succeeded - saved path=%s", path);
+ }
+
+ dict_unref (dict);
+
+ loc_wipe (&loc);
+ return 0;
+}
+
+static int
+pump_check_and_update_status (xlator_t *this)
+{
+ pump_state_t state;
+ int ret = -1;
+
+ state = pump_get_state ();
+
+ switch (state) {
+
+ case PUMP_STATE_RESUME:
+ case PUMP_STATE_RUNNING:
+ {
+ ret = 0;
+ break;
+ }
+ case PUMP_STATE_PAUSE:
+ {
+ ret = -1;
+ break;
+ }
+ case PUMP_STATE_ABORT:
+ {
+ pump_save_path (this, "/");
+ ret = -1;
+ break;
+ }
+ default:
+ {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Unknown pump state");
+ ret = -1;
+ break;
+ }
+
+ }
+
+ return ret;
+}
+
+static const char *
+pump_get_resume_path (xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ pump_private_t *pump_priv = NULL;
+
+ const char *resume_path = NULL;
+
+ priv = this->private;
+ pump_priv = priv->pump_private;
+
+ resume_path = pump_priv->resume_path;
+
+ return resume_path;
+}
+
+static int
+pump_update_resume_state (xlator_t *this, const char *path)
+{
+ pump_state_t state;
+ const char *resume_path = NULL;
+
+ state = pump_get_state ();
+
+ if (state == PUMP_STATE_RESUME) {
+ resume_path = pump_get_resume_path (this);
+ if (strcmp (resume_path, "/") == 0) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Reached the resume path (/). Proceeding to change state"
+ " to running");
+ pump_change_state (this, PUMP_STATE_RUNNING);
+ } else if (strcmp (resume_path, path) == 0) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Reached the resume path. Proceeding to change state"
+ " to running");
+ pump_change_state (this, PUMP_STATE_RUNNING);
+ } else {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Not yet hit the resume path:res-path=%s,path=%s",
+ resume_path, path);
+ }
+ }
+
+ return 0;
+}
+
+static gf_boolean_t
+is_pump_traversal_allowed (xlator_t *this, const char *path)
+{
+ pump_state_t state;
+ const char *resume_path = NULL;
+ gf_boolean_t ret = _gf_true;
+
+ state = pump_get_state ();
+
+ if (state == PUMP_STATE_RESUME) {
+ resume_path = pump_get_resume_path (this);
+ if (strstr (resume_path, path)) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "On the right path to resumption path");
+ ret = _gf_true;
+ } else {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Not the right path to resuming=> ignoring traverse");
+ ret = _gf_false;
+ }
+ }
+
+ return ret;
+}
+
+static int
+pump_save_file_stats (xlator_t *this, const char *path)
+{
+ afr_private_t *priv = NULL;
+ pump_private_t *pump_priv = NULL;
+
+ priv = this->private;
+ pump_priv = priv->pump_private;
+
+ LOCK (&pump_priv->resume_path_lock);
+ {
+ pump_priv->number_files_pumped++;
+
+ strncpy (pump_priv->current_file, path,
+ PATH_MAX);
+ }
+ UNLOCK (&pump_priv->resume_path_lock);
+
+ return 0;
+}
+
+static int
+gf_pump_traverse_directory (loc_t *loc)
+{
+ xlator_t *this = NULL;
+ fd_t *fd = NULL;
+ off_t offset = 0;
+ loc_t entry_loc = {0};
+ gf_dirent_t *entry = NULL;
+ gf_dirent_t *tmp = NULL;
+ gf_dirent_t entries;
+ struct iatt iatt = {0};
+ struct iatt parent = {0};
+ dict_t *xattr_rsp = NULL;
+ int ret = 0;
+ gf_boolean_t is_directory_empty = _gf_true;
+ gf_boolean_t free_entries = _gf_false;
+
+ INIT_LIST_HEAD (&entries.list);
+ this = THIS;
+
+ GF_ASSERT (loc->inode);
+
+ fd = fd_create (loc->inode, pump_pid);
+ if (!fd) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to create fd for %s", loc->path);
+ goto out;
+ }
+
+ ret = syncop_opendir (this, loc, fd);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "opendir failed on %s", loc->path);
+ goto out;
+ }
+
+ gf_log (this->name, GF_LOG_TRACE,
+ "pump opendir on %s returned=%d",
+ loc->path, ret);
+
+ while (syncop_readdirp (this, fd, 131072, offset, NULL, &entries)) {
+ free_entries = _gf_true;
+
+ if (list_empty (&entries.list)) {
+ gf_log (this->name, GF_LOG_TRACE,
+ "no more entries in directory");
+ goto out;
+ }
+
+ list_for_each_entry_safe (entry, tmp, &entries.list, list) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "found readdir entry=%s", entry->d_name);
+
+ offset = entry->d_off;
+ if (uuid_is_null (entry->d_stat.ia_gfid)) {
+ gf_log (this->name, GF_LOG_WARNING, "%s/%s: No "
+ "gfid present skipping",
+ loc->path, entry->d_name);
+ continue;
+ }
+ loc_wipe (&entry_loc);
+ ret = afr_build_child_loc (this, &entry_loc, loc,
+ entry->d_name);
+ if (ret)
+ goto out;
+
+ if ((strcmp (entry->d_name, ".") == 0) ||
+ (strcmp (entry->d_name, "..") == 0))
+ continue;
+
+ is_directory_empty = _gf_false;
+ gf_log (this->name, GF_LOG_DEBUG,
+ "lookup %s => %"PRId64,
+ entry_loc.path,
+ iatt.ia_ino);
+
+ ret = syncop_lookup (this, &entry_loc, NULL, &iatt,
+ &xattr_rsp, &parent);
+
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "%s: lookup failed", entry_loc.path);
+ continue;
+ }
+
+ ret = afr_selfheal_name (this, loc->gfid, entry->d_name);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "%s: name self-heal failed (%s/%s)",
+ entry_loc.path, uuid_utoa (loc->gfid),
+ entry->d_name);
+ continue;
+ }
+
+ ret = afr_selfheal (this, iatt.ia_gfid);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "%s: self-heal failed (%s)",
+ entry_loc.path, uuid_utoa (iatt.ia_gfid));
+ continue;
+ }
+
+ pump_fill_loc_info (&entry_loc, &iatt, &parent);
+
+ pump_update_resume_state (this, entry_loc.path);
+
+ pump_save_path (this, entry_loc.path);
+ pump_save_file_stats (this, entry_loc.path);
+
+ ret = pump_check_and_update_status (this);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Pump beginning to exit out");
+ goto out;
+ }
+
+ if (IA_ISDIR (iatt.ia_type)) {
+ if (is_pump_traversal_allowed (this, entry_loc.path)) {
+ gf_log (this->name, GF_LOG_TRACE,
+ "entering dir=%s", entry->d_name);
+ gf_pump_traverse_directory (&entry_loc);
+ }
+ }
+ }
+
+ gf_dirent_free (&entries);
+ free_entries = _gf_false;
+ gf_log (this->name, GF_LOG_TRACE, "offset incremented to %d",
+ (int32_t ) offset);
+
+ }
+
+ ret = syncop_close (fd);
+ if (ret < 0)
+ gf_log (this->name, GF_LOG_DEBUG, "closing the fd failed");
+
+ if (is_directory_empty && (strcmp (loc->path, "/") == 0)) {
+ pump_change_state (this, PUMP_STATE_RUNNING);
+ gf_log (this->name, GF_LOG_INFO, "Empty source brick. "
+ "Nothing to be done.");
+ }
+
+out:
+ if (entry_loc.path)
+ loc_wipe (&entry_loc);
+ if (free_entries)
+ gf_dirent_free (&entries);
+ return 0;
+}
+
+static int
+pump_update_resume_path (xlator_t *this)
+{
+ const char *resume_path = NULL;
+
+ resume_path = pump_get_resume_path (this);
+
+ if (resume_path) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Found a path to resume from: %s",
+ resume_path);
+
+ }else {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Did not find a path=> setting to '/'");
+ pump_set_resume_path (this, "/");
+ }
+
+ pump_change_state (this, PUMP_STATE_RESUME);
+
+ return 0;
+}
+
+static int32_t
+pump_xattr_cleaner (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ loc_t loc = {0};
+ int i = 0;
+ int ret = 0;
+ int source = 0;
+ int sink = 1;
+
+ priv = this->private;
+
+ afr_build_root_loc (this, &loc);
+
+ ret = syncop_removexattr (priv->children[source], &loc,
+ PUMP_PATH, 0);
+
+ ret = syncop_removexattr (priv->children[sink], &loc,
+ PUMP_SINK_COMPLETE, 0);
+
+ for (i = 0; i < priv->child_count; i++) {
+ ret = syncop_removexattr (priv->children[i], &loc,
+ PUMP_SOURCE_COMPLETE, 0);
+ if (ret) {
+ gf_log (this->name, GF_LOG_DEBUG, "removexattr "
+ "failed with %s", strerror (-ret));
+ }
+ }
+
+ loc_wipe (&loc);
+ return pump_command_reply (frame, this);
+}
+
+static int
+pump_complete_migration (xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ pump_private_t *pump_priv = NULL;
+ dict_t *dict = NULL;
+ pump_state_t state;
+ loc_t loc = {0};
+ int dict_ret = 0;
+ int ret = -1;
+
+ priv = this->private;
+ pump_priv = priv->pump_private;
+
+ GF_ASSERT (priv->root_inode);
+
+ afr_build_root_loc (this, &loc);
+
+ dict = dict_new ();
+
+ state = pump_get_state ();
+ if (state == PUMP_STATE_RUNNING) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Pump finished pumping");
+
+ pump_priv->pump_finished = _gf_true;
+
+ dict_ret = dict_set_str (dict, PUMP_SOURCE_COMPLETE, "jargon");
+ if (dict_ret)
+ gf_log (this->name, GF_LOG_WARNING,
+ "%s: failed to set the key %s",
+ loc.path, PUMP_SOURCE_COMPLETE);
+
+ ret = syncop_setxattr (PUMP_SOURCE_CHILD (this), &loc, dict, 0);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "setxattr failed - while notifying source complete");
+ }
+ dict_ret = dict_set_str (dict, PUMP_SINK_COMPLETE, "jargon");
+ if (dict_ret)
+ gf_log (this->name, GF_LOG_WARNING,
+ "%s: failed to set the key %s",
+ loc.path, PUMP_SINK_COMPLETE);
+
+ ret = syncop_setxattr (PUMP_SINK_CHILD (this), &loc, dict, 0);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "setxattr failed - while notifying sink complete");
+ }
+
+ pump_save_path (this, "/");
+
+ } else if (state == PUMP_STATE_ABORT) {
+ gf_log (this->name, GF_LOG_DEBUG, "Starting cleanup "
+ "of pump internal xattrs");
+ call_resume (pump_priv->cleaner);
+ }
+
+ loc_wipe (&loc);
+ return 0;
+}
+
+static int
+pump_lookup_sink (loc_t *loc)
+{
+ xlator_t *this = NULL;
+ struct iatt iatt, parent;
+ dict_t *xattr_rsp;
+ dict_t *xattr_req = NULL;
+ int ret = 0;
+
+ this = THIS;
+
+ xattr_req = dict_new ();
+
+ ret = afr_set_root_gfid (xattr_req);
+ if (ret)
+ goto out;
+
+ ret = syncop_lookup (PUMP_SINK_CHILD (this), loc,
+ xattr_req, &iatt, &xattr_rsp, &parent);
+
+ if (ret) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Lookup on sink child failed");
+ ret = -1;
+ goto out;
+ }
+
+out:
+ if (xattr_req)
+ dict_unref (xattr_req);
+
+ return ret;
+}
+
+static int
+pump_task (void *data)
+{
+ xlator_t *this = NULL;
+ afr_private_t *priv = NULL;
+
+
+ loc_t loc = {0};
+ struct iatt iatt, parent;
+ dict_t *xattr_rsp = NULL;
+ dict_t *xattr_req = NULL;
+
+ int ret = -1;
+
+ this = THIS;
+ priv = this->private;
+
+ GF_ASSERT (priv->root_inode);
+
+ afr_build_root_loc (this, &loc);
+ xattr_req = dict_new ();
+ if (!xattr_req) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Out of memory");
+ ret = -1;
+ goto out;
+ }
+
+ afr_set_root_gfid (xattr_req);
+ ret = syncop_lookup (this, &loc, xattr_req,
+ &iatt, &xattr_rsp, &parent);
+
+ gf_log (this->name, GF_LOG_TRACE,
+ "lookup: path=%s gfid=%s",
+ loc.path, uuid_utoa (loc.inode->gfid));
+
+ ret = pump_check_and_update_status (this);
+ if (ret < 0) {
+ goto out;
+ }
+
+ pump_update_resume_path (this);
+
+ afr_set_root_gfid (xattr_req);
+ ret = pump_lookup_sink (&loc);
+ if (ret) {
+ pump_update_resume_path (this);
+ goto out;
+ }
+
+ gf_pump_traverse_directory (&loc);
+
+ pump_complete_migration (this);
+out:
+ if (xattr_req)
+ dict_unref (xattr_req);
+
+ loc_wipe (&loc);
+ return 0;
+}
+
+
+static int
+pump_task_completion (int ret, call_frame_t *sync_frame, void *data)
+{
+ xlator_t *this = NULL;
+ afr_private_t *priv = NULL;
+
+ this = THIS;
+
+ priv = this->private;
+
+ inode_unref (priv->root_inode);
+ STACK_DESTROY (sync_frame->root);
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Pump xlator exiting");
+ return 0;
+}
+
+int
+pump_start (call_frame_t *pump_frame, xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ pump_private_t *pump_priv = NULL;
+
+ int ret = -1;
+
+ priv = this->private;
+ pump_priv = priv->pump_private;
+
+ afr_set_lk_owner (pump_frame, this, pump_frame->root);
+ pump_pid = (uint64_t) (unsigned long)pump_frame->root;
+
+ ret = synctask_new (pump_priv->env, pump_task,
+ pump_task_completion,
+ pump_frame, NULL);
+ if (ret == -1) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "starting pump failed");
+ pump_change_state (this, PUMP_STATE_ABORT);
+ goto out;
+ }
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "setting pump as started lk_owner: %s %"PRIu64,
+ lkowner_utoa (&pump_frame->root->lk_owner), pump_pid);
+
+ priv->use_afr_in_pump = 1;
+out:
+ return ret;
+}
+
+static int
+pump_start_synctask (xlator_t *this)
+{
+ call_frame_t *frame = NULL;
+ int ret = 0;
+
+ frame = create_frame (this, this->ctx->pool);
+ if (!frame) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Out of memory");
+ ret = -1;
+ goto out;
+ }
+
+ pump_change_state (this, PUMP_STATE_RUNNING);
+
+ ret = pump_start (frame, this);
+
+out:
+ return ret;
+}
+
+int32_t
+pump_cmd_start_setxattr_cbk (call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno, dict_t *xdata)
+
+{
+ call_frame_t *prev = NULL;
+ afr_local_t *local = NULL;
+ int ret = 0;
+
+ local = frame->local;
+
+ if (op_ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Could not initiate destination "
+ "brick connect");
+ ret = op_ret;
+ goto out;
+ }
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Successfully initiated destination "
+ "brick connect");
+
+ pump_mark_start_pending (this);
+
+ /* send the PARENT_UP as pump is ready now */
+ prev = cookie;
+ if (prev && prev->this)
+ prev->this->notify (prev->this, GF_EVENT_PARENT_UP, this);
+
+out:
+ local->op_ret = ret;
+ pump_command_reply (frame, this);
+
+ return 0;
+}
+
+static int
+pump_initiate_sink_connect (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ dict_t *dict = NULL;
+ data_t *data = NULL;
+ char *clnt_cmd = NULL;
+ loc_t loc = {0};
+
+ int ret = 0;
+
+ priv = this->private;
+ local = frame->local;
+
+ GF_ASSERT (priv->root_inode);
+
+ afr_build_root_loc (this, &loc);
+
+ data = data_ref (dict_get (local->dict, RB_PUMP_CMD_START));
+ if (!data) {
+ ret = -1;
+ gf_log (this->name, GF_LOG_ERROR,
+ "Could not get destination brick value");
+ goto out;
+ }
+
+ dict = dict_new ();
+ if (!dict) {
+ ret = -1;
+ goto out;
+ }
+
+ clnt_cmd = GF_CALLOC (1, data->len+1, gf_common_mt_char);
+ if (!clnt_cmd) {
+ ret = -1;
+ goto out;
+ }
+
+ memcpy (clnt_cmd, data->data, data->len);
+ clnt_cmd[data->len] = '\0';
+ gf_log (this->name, GF_LOG_DEBUG, "Got destination brick %s\n",
+ clnt_cmd);
+
+ ret = dict_set_dynstr (dict, CLIENT_CMD_CONNECT, clnt_cmd);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Could not inititiate destination brick "
+ "connect");
+ goto out;
+ }
+
+ STACK_WIND (frame,
+ pump_cmd_start_setxattr_cbk,
+ PUMP_SINK_CHILD(this),
+ PUMP_SINK_CHILD(this)->fops->setxattr,
+ &loc,
+ dict,
+ 0, NULL);
+
+ ret = 0;
+
+out:
+ if (dict)
+ dict_unref (dict);
+
+ if (data)
+ data_unref (data);
+
+ if (ret && clnt_cmd)
+ GF_FREE (clnt_cmd);
+
+ loc_wipe (&loc);
+ return ret;
+}
+
+static int
+is_pump_aborted (xlator_t *this)
+{
+ pump_state_t state;
+
+ state = pump_get_state ();
+
+ return ((state == PUMP_STATE_ABORT));
+}
+
+int32_t
+pump_cmd_start_getxattr_cbk (call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ dict_t *dict, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ char *path = NULL;
+
+ pump_state_t state;
+ int ret = 0;
+ int need_unwind = 0;
+ int dict_ret = -1;
+
+ local = frame->local;
+
+ if (op_ret < 0) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "getxattr failed - changing pump "
+ "state to RUNNING with '/'");
+ path = "/";
+ ret = op_ret;
+ } else {
+ gf_log (this->name, GF_LOG_TRACE,
+ "getxattr succeeded");
+
+ dict_ret = dict_get_str (dict, PUMP_PATH, &path);
+ if (dict_ret < 0)
+ path = "/";
+ }
+
+ state = pump_get_state ();
+ if ((state == PUMP_STATE_RUNNING) ||
+ (state == PUMP_STATE_RESUME)) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Pump is already started");
+ ret = -1;
+ goto out;
+ }
+
+ pump_set_resume_path (this, path);
+
+ if (is_pump_aborted (this))
+ /* We're re-starting pump afresh */
+ ret = pump_initiate_sink_connect (frame, this);
+ else {
+ /* We're re-starting pump from a previous
+ pause */
+ gf_log (this->name, GF_LOG_DEBUG,
+ "about to start synctask");
+ ret = pump_start_synctask (this);
+ need_unwind = 1;
+ }
+
+out:
+ if ((ret < 0) || (need_unwind == 1)) {
+ local->op_ret = ret;
+ pump_command_reply (frame, this);
+ }
+ return 0;
+}
+
+int
+pump_execute_status (call_frame_t *frame, xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ pump_private_t *pump_priv = NULL;
+
+ uint64_t number_files = 0;
+
+ char filename[PATH_MAX];
+ char summary[PATH_MAX+256];
+ char *dict_str = NULL;
+
+ int32_t op_ret = 0;
+ int32_t op_errno = 0;
+
+ dict_t *dict = NULL;
+ int ret = -1;
+
+ priv = this->private;
+ pump_priv = priv->pump_private;
+
+ LOCK (&pump_priv->resume_path_lock);
+ {
+ number_files = pump_priv->number_files_pumped;
+ strncpy (filename, pump_priv->current_file, PATH_MAX);
+ }
+ UNLOCK (&pump_priv->resume_path_lock);
+
+ dict_str = GF_CALLOC (1, PATH_MAX + 256, gf_afr_mt_char);
+ if (!dict_str) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Out of memory");
+ op_ret = -1;
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ if (pump_priv->pump_finished) {
+ snprintf (summary, PATH_MAX+256,
+ "no_of_files=%"PRIu64, number_files);
+ } else {
+ snprintf (summary, PATH_MAX+256,
+ "no_of_files=%"PRIu64":current_file=%s",
+ number_files, filename);
+ }
+ snprintf (dict_str, PATH_MAX+256, "status=%d:%s",
+ (pump_priv->pump_finished)?1:0, summary);
+
+ dict = dict_new ();
+
+ ret = dict_set_dynstr (dict, RB_PUMP_CMD_STATUS, dict_str);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "dict_set_dynstr returned negative value");
+ } else {
+ dict_str = NULL;
+ }
+
+ op_ret = 0;
+
+out:
+
+ AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict, NULL);
+
+ if (dict)
+ dict_unref (dict);
+
+ GF_FREE (dict_str);
+
+ return 0;
+}
+
+int
+pump_execute_pause (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+
+ local = frame->local;
+
+ pump_change_state (this, PUMP_STATE_PAUSE);
+
+ local->op_ret = 0;
+ pump_command_reply (frame, this);
+
+ return 0;
+}
+
+int
+pump_execute_start (call_frame_t *frame, xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+
+ int ret = 0;
+ loc_t loc = {0};
+
+ priv = this->private;
+ local = frame->local;
+
+ if (!priv->root_inode) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Pump xlator cannot be started without an initial "
+ "lookup");
+ ret = -1;
+ goto out;
+ }
+
+ GF_ASSERT (priv->root_inode);
+
+ afr_build_root_loc (this, &loc);
+
+ STACK_WIND (frame,
+ pump_cmd_start_getxattr_cbk,
+ PUMP_SOURCE_CHILD(this),
+ PUMP_SOURCE_CHILD(this)->fops->getxattr,
+ &loc,
+ PUMP_PATH, NULL);
+
+ ret = 0;
+
+out:
+ if (ret < 0) {
+ local->op_ret = ret;
+ pump_command_reply (frame, this);
+ }
+
+ loc_wipe (&loc);
+ return 0;
+}
+
+static int
+pump_cleanup_helper (void *data) {
+ call_frame_t *frame = data;
+
+ pump_xattr_cleaner (frame, 0, frame->this, 0, 0, NULL);
+
+ return 0;
+}
+
+static int
+pump_cleanup_done (int ret, call_frame_t *sync_frame, void *data)
+{
+ STACK_DESTROY (sync_frame->root);
+
+ return 0;
+}
+
+int
+pump_execute_commit (call_frame_t *frame, xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ pump_private_t *pump_priv = NULL;
+ afr_local_t *local = NULL;
+ call_frame_t *sync_frame = NULL;
+ int ret = 0;
+
+ priv = this->private;
+ pump_priv = priv->pump_private;
+ local = frame->local;
+
+ local->op_ret = 0;
+ if (pump_priv->pump_finished) {
+ pump_change_state (this, PUMP_STATE_COMMIT);
+ sync_frame = create_frame (this, this->ctx->pool);
+ ret = synctask_new (pump_priv->env, pump_cleanup_helper,
+ pump_cleanup_done, sync_frame, frame);
+ if (ret) {
+ gf_log (this->name, GF_LOG_DEBUG, "Couldn't create "
+ "synctask for cleaning up xattrs.");
+ }
+
+ } else {
+ gf_log (this->name, GF_LOG_ERROR, "Commit can't proceed. "
+ "Migration in progress");
+ local->op_ret = -1;
+ local->op_errno = EINPROGRESS;
+ pump_command_reply (frame, this);
+ }
+
+ return 0;
+}
+int
+pump_execute_abort (call_frame_t *frame, xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ pump_private_t *pump_priv = NULL;
+ afr_local_t *local = NULL;
+ call_frame_t *sync_frame = NULL;
+ int ret = 0;
+
+ priv = this->private;
+ pump_priv = priv->pump_private;
+ local = frame->local;
+
+ pump_change_state (this, PUMP_STATE_ABORT);
+
+ LOCK (&pump_priv->resume_path_lock);
+ {
+ pump_priv->number_files_pumped = 0;
+ pump_priv->current_file[0] = '\0';
+ }
+ UNLOCK (&pump_priv->resume_path_lock);
+
+ local->op_ret = 0;
+ if (pump_priv->pump_finished) {
+ sync_frame = create_frame (this, this->ctx->pool);
+ ret = synctask_new (pump_priv->env, pump_cleanup_helper,
+ pump_cleanup_done, sync_frame, frame);
+ if (ret) {
+ gf_log (this->name, GF_LOG_DEBUG, "Couldn't create "
+ "synctask for cleaning up xattrs.");
+ }
+
+ } else {
+ pump_priv->cleaner = fop_setxattr_cbk_stub (frame,
+ pump_xattr_cleaner,
+ 0, 0, NULL);
+ }
+
+ return 0;
+}
+
+gf_boolean_t
+pump_command_status (xlator_t *this, dict_t *dict)
+{
+ char *cmd = NULL;
+ int dict_ret = -1;
+ int ret = _gf_true;
+
+ dict_ret = dict_get_str (dict, RB_PUMP_CMD_STATUS, &cmd);
+ if (dict_ret < 0) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Not a pump status command");
+ ret = _gf_false;
+ goto out;
+ }
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Hit a pump command - status");
+ ret = _gf_true;
+
+out:
+ return ret;
+
+}
+
+gf_boolean_t
+pump_command_pause (xlator_t *this, dict_t *dict)
+{
+ char *cmd = NULL;
+ int dict_ret = -1;
+ int ret = _gf_true;
+
+ dict_ret = dict_get_str (dict, RB_PUMP_CMD_PAUSE, &cmd);
+ if (dict_ret < 0) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Not a pump pause command");
+ ret = _gf_false;
+ goto out;
+ }
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Hit a pump command - pause");
+ ret = _gf_true;
+
+out:
+ return ret;
+
+}
+
+gf_boolean_t
+pump_command_commit (xlator_t *this, dict_t *dict)
+{
+ char *cmd = NULL;
+ int dict_ret = -1;
+ int ret = _gf_true;
+
+ dict_ret = dict_get_str (dict, RB_PUMP_CMD_COMMIT, &cmd);
+ if (dict_ret < 0) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Not a pump commit command");
+ ret = _gf_false;
+ goto out;
+ }
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Hit a pump command - commit");
+ ret = _gf_true;
+
+out:
+ return ret;
+
+}
+
+gf_boolean_t
+pump_command_abort (xlator_t *this, dict_t *dict)
+{
+ char *cmd = NULL;
+ int dict_ret = -1;
+ int ret = _gf_true;
+
+ dict_ret = dict_get_str (dict, RB_PUMP_CMD_ABORT, &cmd);
+ if (dict_ret < 0) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Not a pump abort command");
+ ret = _gf_false;
+ goto out;
+ }
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Hit a pump command - abort");
+ ret = _gf_true;
+
+out:
+ return ret;
+
+}
+
+gf_boolean_t
+pump_command_start (xlator_t *this, dict_t *dict)
+{
+ char *cmd = NULL;
+ int dict_ret = -1;
+ int ret = _gf_true;
+
+ dict_ret = dict_get_str (dict, RB_PUMP_CMD_START, &cmd);
+ if (dict_ret < 0) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Not a pump start command");
+ ret = _gf_false;
+ goto out;
+ }
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Hit a pump command - start");
+ ret = _gf_true;
+
+out:
+ return ret;
+
+}
+
+int
+pump_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ const char *name, dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ int op_errno = 0;
+ int ret = 0;
+
+ priv = this->private;
+
+ if (!priv->use_afr_in_pump) {
+ STACK_WIND (frame, default_getxattr_cbk,
+ FIRST_CHILD (this),
+ (FIRST_CHILD (this))->fops->getxattr,
+ loc, name, xdata);
+ return 0;
+ }
+
+ if (name) {
+ if (!strncmp (name, AFR_XATTR_PREFIX,
+ strlen (AFR_XATTR_PREFIX))) {
+
+ op_errno = ENODATA;
+ goto out;
+ }
+
+ if (!strcmp (name, RB_PUMP_CMD_STATUS)) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Hit pump command - status");
+ pump_execute_status (frame, this);
+ ret = 0;
+ goto out;
+ }
+ }
+
+ afr_getxattr (frame, this, loc, name, xdata);
+
+ ret = 0;
+out:
+ if (ret < 0)
+ AFR_STACK_UNWIND (getxattr, frame, -1, op_errno, NULL, NULL);
+ return 0;
+}
+
+int
+pump_command_reply (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+
+ local = frame->local;
+
+ if (local->op_ret < 0)
+ gf_log (this->name, GF_LOG_INFO,
+ "Command failed");
+ else
+ gf_log (this->name, GF_LOG_INFO,
+ "Command succeeded");
+
+ AFR_STACK_UNWIND (setxattr,
+ frame,
+ local->op_ret,
+ local->op_errno, NULL);
+
+ return 0;
+}
+
+int
+pump_parse_command (call_frame_t *frame, xlator_t *this, dict_t *dict,
+ int *op_errno_p)
+{
+ afr_local_t *local = NULL;
+ int ret = -1;
+ int op_errno = 0;
+
+ if (pump_command_start (this, dict)) {
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
+ local->dict = dict_ref (dict);
+ ret = pump_execute_start (frame, this);
+
+ } else if (pump_command_pause (this, dict)) {
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
+ local->dict = dict_ref (dict);
+ ret = pump_execute_pause (frame, this);
+
+ } else if (pump_command_abort (this, dict)) {
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
+ local->dict = dict_ref (dict);
+ ret = pump_execute_abort (frame, this);
+
+ } else if (pump_command_commit (this, dict)) {
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
+ local->dict = dict_ref (dict);
+ ret = pump_execute_commit (frame, this);
+ }
+out:
+ if (op_errno_p)
+ *op_errno_p = op_errno;
+ return ret;
+}
+
+int
+pump_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
+ int32_t flags, dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ int ret = -1;
+ int op_errno = 0;
+
+ GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.pump*", dict, op_errno, out);
+
+ priv = this->private;
+ if (!priv->use_afr_in_pump) {
+ STACK_WIND (frame, default_setxattr_cbk,
+ FIRST_CHILD (this),
+ (FIRST_CHILD (this))->fops->setxattr,
+ loc, dict, flags, xdata);
+ return 0;
+ }
+
+ ret = pump_parse_command (frame, this, dict, &op_errno);
+ if (ret >= 0)
+ goto out;
+
+ afr_setxattr (frame, this, loc, dict, flags, xdata);
+
+ ret = 0;
+out:
+ if (ret < 0) {
+ AFR_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL);
+ }
+
+ return 0;
+}
+
+/* Defaults */
+static int32_t
+pump_lookup (call_frame_t *frame,
+ xlator_t *this,
+ loc_t *loc,
+ dict_t *xattr_req)
+{
+ afr_private_t *priv = NULL;
+ priv = this->private;
+ if (!priv->use_afr_in_pump) {
+ STACK_WIND (frame,
+ default_lookup_cbk,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->lookup,
+ loc,
+ xattr_req);
+ return 0;
+ }
+
+ afr_lookup (frame, this, loc, xattr_req);
+ return 0;
+}
+
+
+static int32_t
+pump_truncate (call_frame_t *frame,
+ xlator_t *this,
+ loc_t *loc,
+ off_t offset, dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ priv = this->private;
+ if (!priv->use_afr_in_pump) {
+ STACK_WIND (frame,
+ default_truncate_cbk,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->truncate,
+ loc,
+ offset, xdata);
+ return 0;
+ }
+
+ afr_truncate (frame, this, loc, offset, xdata);
+ return 0;
+}
+
+
+static int32_t
+pump_ftruncate (call_frame_t *frame,
+ xlator_t *this,
+ fd_t *fd,
+ off_t offset, dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ priv = this->private;
+ if (!priv->use_afr_in_pump) {
+ STACK_WIND (frame,
+ default_ftruncate_cbk,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->ftruncate,
+ fd,
+ offset, xdata);
+ return 0;
+ }
+
+ afr_ftruncate (frame, this, fd, offset, xdata);
+ return 0;
+}
+
+
+
+
+int
+pump_mknod (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, mode_t mode, dev_t rdev, mode_t umask, dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ priv = this->private;
+ if (!priv->use_afr_in_pump) {
+ STACK_WIND (frame, default_mknod_cbk,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->mknod,
+ loc, mode, rdev, umask, xdata);
+ return 0;
+ }
+ afr_mknod (frame, this, loc, mode, rdev, umask, xdata);
+ return 0;
+
+}
+
+
+
+int
+pump_mkdir (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, mode_t mode, mode_t umask, dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ priv = this->private;
+ if (!priv->use_afr_in_pump) {
+ STACK_WIND (frame, default_mkdir_cbk,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->mkdir,
+ loc, mode, umask, xdata);
+ return 0;
+ }
+ afr_mkdir (frame, this, loc, mode, umask, xdata);
+ return 0;
+
+}
+
+
+static int32_t
+pump_unlink (call_frame_t *frame,
+ xlator_t *this,
+ loc_t *loc, int xflag, dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ priv = this->private;
+ if (!priv->use_afr_in_pump) {
+ STACK_WIND (frame,
+ default_unlink_cbk,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->unlink,
+ loc, xflag, xdata);
+ return 0;
+ }
+ afr_unlink (frame, this, loc, xflag, xdata);
+ return 0;
+
+}
+
+
+static int
+pump_rmdir (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, int flags, dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+
+ if (!priv->use_afr_in_pump) {
+ STACK_WIND (frame, default_rmdir_cbk,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->rmdir,
+ loc, flags, xdata);
+ return 0;
+ }
+
+ afr_rmdir (frame, this, loc, flags, xdata);
+ return 0;
+
+}
+
+
+
+int
+pump_symlink (call_frame_t *frame, xlator_t *this,
+ const char *linkpath, loc_t *loc, mode_t umask, dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ priv = this->private;
+ if (!priv->use_afr_in_pump) {
+ STACK_WIND (frame, default_symlink_cbk,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->symlink,
+ linkpath, loc, umask, xdata);
+ return 0;
+ }
+ afr_symlink (frame, this, linkpath, loc, umask, xdata);
+ return 0;
+
+}
+
+
+static int32_t
+pump_rename (call_frame_t *frame,
+ xlator_t *this,
+ loc_t *oldloc,
+ loc_t *newloc, dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ priv = this->private;
+ if (!priv->use_afr_in_pump) {
+ STACK_WIND (frame,
+ default_rename_cbk,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->rename,
+ oldloc, newloc, xdata);
+ return 0;
+ }
+ afr_rename (frame, this, oldloc, newloc, xdata);
+ return 0;
+
+}
+
+
+static int32_t
+pump_link (call_frame_t *frame,
+ xlator_t *this,
+ loc_t *oldloc,
+ loc_t *newloc, dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ priv = this->private;
+ if (!priv->use_afr_in_pump) {
+ STACK_WIND (frame,
+ default_link_cbk,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->link,
+ oldloc, newloc, xdata);
+ return 0;
+ }
+ afr_link (frame, this, oldloc, newloc, xdata);
+ return 0;
+
+}
+
+
+static int32_t
+pump_create (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, int32_t flags, mode_t mode,
+ mode_t umask, fd_t *fd, dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ priv = this->private;
+ if (!priv->use_afr_in_pump) {
+ STACK_WIND (frame, default_create_cbk,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->create,
+ loc, flags, mode, umask, fd, xdata);
+ return 0;
+ }
+ afr_create (frame, this, loc, flags, mode, umask, fd, xdata);
+ return 0;
+
+}
+
+
+static int32_t
+pump_open (call_frame_t *frame,
+ xlator_t *this,
+ loc_t *loc,
+ int32_t flags, fd_t *fd, dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ priv = this->private;
+ if (!priv->use_afr_in_pump) {
+ STACK_WIND (frame,
+ default_open_cbk,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->open,
+ loc, flags, fd, xdata);
+ return 0;
+ }
+ afr_open (frame, this, loc, flags, fd, xdata);
+ return 0;
+
+}
+
+
+static int32_t
+pump_writev (call_frame_t *frame,
+ xlator_t *this,
+ fd_t *fd,
+ struct iovec *vector,
+ int32_t count,
+ off_t off, uint32_t flags,
+ struct iobref *iobref, dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ priv = this->private;
+ if (!priv->use_afr_in_pump) {
+ STACK_WIND (frame,
+ default_writev_cbk,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->writev,
+ fd,
+ vector,
+ count,
+ off, flags,
+ iobref, xdata);
+ return 0;
+ }
+
+ afr_writev (frame, this, fd, vector, count, off, flags, iobref, xdata);
+ return 0;
+}
+
+
+static int32_t
+pump_flush (call_frame_t *frame,
+ xlator_t *this,
+ fd_t *fd, dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ priv = this->private;
+ if (!priv->use_afr_in_pump) {
+ STACK_WIND (frame,
+ default_flush_cbk,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->flush,
+ fd, xdata);
+ return 0;
+ }
+ afr_flush (frame, this, fd, xdata);
+ return 0;
+
+}
+
+
+static int32_t
+pump_fsync (call_frame_t *frame,
+ xlator_t *this,
+ fd_t *fd,
+ int32_t flags, dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ priv = this->private;
+ if (!priv->use_afr_in_pump) {
+ STACK_WIND (frame,
+ default_fsync_cbk,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fsync,
+ fd,
+ flags, xdata);
+ return 0;
+ }
+ afr_fsync (frame, this, fd, flags, xdata);
+ return 0;
+
+}
+
+
+static int32_t
+pump_opendir (call_frame_t *frame,
+ xlator_t *this,
+ loc_t *loc, fd_t *fd, dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ priv = this->private;
+ if (!priv->use_afr_in_pump) {
+ STACK_WIND (frame,
+ default_opendir_cbk,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->opendir,
+ loc, fd, xdata);
+ return 0;
+ }
+ afr_opendir (frame, this, loc, fd, xdata);
+ return 0;
+
+}
+
+
+static int32_t
+pump_fsyncdir (call_frame_t *frame,
+ xlator_t *this,
+ fd_t *fd,
+ int32_t flags, dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ priv = this->private;
+ if (!priv->use_afr_in_pump) {
+ STACK_WIND (frame,
+ default_fsyncdir_cbk,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fsyncdir,
+ fd,
+ flags, xdata);
+ return 0;
+ }
+ afr_fsyncdir (frame, this, fd, flags, xdata);
+ return 0;
+
+}
+
+
+static int32_t
+pump_xattrop (call_frame_t *frame,
+ xlator_t *this,
+ loc_t *loc,
+ gf_xattrop_flags_t flags,
+ dict_t *dict, dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ priv = this->private;
+ if (!priv->use_afr_in_pump) {
+ STACK_WIND (frame,
+ default_xattrop_cbk,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->xattrop,
+ loc,
+ flags,
+ dict, xdata);
+ return 0;
+ }
+ afr_xattrop (frame, this, loc, flags, dict, xdata);
+ return 0;
+
+}
+
+static int32_t
+pump_fxattrop (call_frame_t *frame,
+ xlator_t *this,
+ fd_t *fd,
+ gf_xattrop_flags_t flags,
+ dict_t *dict, dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ priv = this->private;
+ if (!priv->use_afr_in_pump) {
+ STACK_WIND (frame,
+ default_fxattrop_cbk,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fxattrop,
+ fd,
+ flags,
+ dict, xdata);
+ return 0;
+ }
+ afr_fxattrop (frame, this, fd, flags, dict, xdata);
+ return 0;
+
+}
+
+
+static int32_t
+pump_removexattr (call_frame_t *frame,
+ xlator_t *this,
+ loc_t *loc,
+ const char *name, dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ int op_errno = -1;
+
+ VALIDATE_OR_GOTO (this, out);
+
+ GF_IF_NATIVE_XATTR_GOTO ("trusted.glusterfs.pump*",
+ name, op_errno, out);
+
+ op_errno = 0;
+ priv = this->private;
+ if (!priv->use_afr_in_pump) {
+ STACK_WIND (frame,
+ default_removexattr_cbk,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->removexattr,
+ loc,
+ name, xdata);
+ return 0;
+ }
+ afr_removexattr (frame, this, loc, name, xdata);
+
+ out:
+ if (op_errno)
+ AFR_STACK_UNWIND (removexattr, frame, -1, op_errno, NULL);
+ return 0;
+
+}
+
+
+
+static int32_t
+pump_readdir (call_frame_t *frame,
+ xlator_t *this,
+ fd_t *fd,
+ size_t size,
+ off_t off, dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ priv = this->private;
+ if (!priv->use_afr_in_pump) {
+ STACK_WIND (frame,
+ default_readdir_cbk,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->readdir,
+ fd, size, off, xdata);
+ return 0;
+ }
+ afr_readdir (frame, this, fd, size, off, xdata);
+ return 0;
+
+}
+
+
+static int32_t
+pump_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ size_t size, off_t off, dict_t *dict)
+{
+ afr_private_t *priv = NULL;
+ priv = this->private;
+ if (!priv->use_afr_in_pump) {
+ STACK_WIND (frame,
+ default_readdirp_cbk,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->readdirp,
+ fd, size, off, dict);
+ return 0;
+ }
+ afr_readdirp (frame, this, fd, size, off, dict);
+ return 0;
+
+}
+
+
+
+static int32_t
+pump_releasedir (xlator_t *this,
+ fd_t *fd)
+{
+ afr_private_t *priv = NULL;
+ priv = this->private;
+ if (priv->use_afr_in_pump)
+ afr_releasedir (this, fd);
+ return 0;
+
+}
+
+static int32_t
+pump_release (xlator_t *this,
+ fd_t *fd)
+{
+ afr_private_t *priv = NULL;
+ priv = this->private;
+ if (priv->use_afr_in_pump)
+ afr_release (this, fd);
+ return 0;
+
+}
+
+static int32_t
+pump_forget (xlator_t *this, inode_t *inode)
+{
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+ if (priv->use_afr_in_pump)
+ afr_forget (this, inode);
+
+ return 0;
+}
+
+static int32_t
+pump_setattr (call_frame_t *frame,
+ xlator_t *this,
+ loc_t *loc,
+ struct iatt *stbuf,
+ int32_t valid, dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ priv = this->private;
+ if (!priv->use_afr_in_pump) {
+ STACK_WIND (frame,
+ default_setattr_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->setattr,
+ loc, stbuf, valid, xdata);
+ return 0;
+ }
+ afr_setattr (frame, this, loc, stbuf, valid, xdata);
+ return 0;
+
+}
+
+
+static int32_t
+pump_fsetattr (call_frame_t *frame,
+ xlator_t *this,
+ fd_t *fd,
+ struct iatt *stbuf,
+ int32_t valid, dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ priv = this->private;
+ if (!priv->use_afr_in_pump) {
+ STACK_WIND (frame,
+ default_fsetattr_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->fsetattr,
+ fd, stbuf, valid, xdata);
+ return 0;
+ }
+ afr_fsetattr (frame, this, fd, stbuf, valid, xdata);
+ return 0;
+
+}
+
+
+/* End of defaults */
+
+
+int32_t
+mem_acct_init (xlator_t *this)
+{
+ int ret = -1;
+
+ if (!this)
+ return ret;
+
+ ret = xlator_mem_acct_init (this, gf_afr_mt_end + 1);
+
+ if (ret != 0) {
+ gf_log(this->name, GF_LOG_ERROR, "Memory accounting init"
+ "failed");
+ return ret;
+ }
+
+ return ret;
+}
+
+static int
+is_xlator_pump_sink (xlator_t *child)
+{
+ return (child == PUMP_SINK_CHILD(THIS));
+}
+
+static int
+is_xlator_pump_source (xlator_t *child)
+{
+ return (child == PUMP_SOURCE_CHILD(THIS));
+}
+
+int32_t
+notify (xlator_t *this, int32_t event,
+ void *data, ...)
+{
+ int ret = -1;
+ xlator_t *child_xl = NULL;
+
+ child_xl = (xlator_t *) data;
+
+ ret = afr_notify (this, event, data, NULL);
+
+ switch (event) {
+ case GF_EVENT_CHILD_DOWN:
+ if (is_xlator_pump_source (child_xl))
+ pump_change_state (this, PUMP_STATE_ABORT);
+ break;
+
+ case GF_EVENT_CHILD_UP:
+ if (is_xlator_pump_sink (child_xl))
+ if (is_pump_start_pending (this)) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "about to start synctask");
+ ret = pump_start_synctask (this);
+ if (ret < 0)
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Could not start pump "
+ "synctask");
+ else
+ pump_remove_start_pending (this);
+ }
+ }
+
+ return ret;
+}
+
+int32_t
+init (xlator_t *this)
+{
+ afr_private_t * priv = NULL;
+ pump_private_t *pump_priv = NULL;
+ int child_count = 0;
+ xlator_list_t * trav = NULL;
+ int i = 0;
+ int ret = -1;
+ GF_UNUSED int op_errno = 0;
+
+ int source_child = 0;
+
+ if (!this->children) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "pump translator needs a source and sink"
+ "subvolumes defined.");
+ return -1;
+ }
+
+ if (!this->parents) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "Volume is dangling.");
+ }
+
+ priv = GF_CALLOC (1, sizeof (afr_private_t), gf_afr_mt_afr_private_t);
+ if (!priv)
+ goto out;
+
+ LOCK_INIT (&priv->lock);
+
+ child_count = xlator_subvolume_count (this);
+ if (child_count != 2) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "There should be exactly 2 children - one source "
+ "and one sink");
+ return -1;
+ }
+ priv->child_count = child_count;
+
+ priv->read_child = source_child;
+ priv->favorite_child = source_child;
+ priv->background_self_heal_count = 0;
+
+ priv->data_self_heal = "on";
+ priv->metadata_self_heal = 1;
+ priv->entry_self_heal = 1;
+
+ priv->data_self_heal_window_size = 16;
+
+ priv->data_change_log = 1;
+ priv->metadata_change_log = 1;
+ priv->entry_change_log = 1;
+ priv->use_afr_in_pump = 1;
+ priv->sh_readdir_size = 65536;
+
+ /* Locking options */
+
+ /* Lock server count infact does not matter. Locks are held
+ on all subvolumes, in this case being the source
+ and the sink.
+ */
+
+ priv->child_up = GF_CALLOC (sizeof (unsigned char), child_count,
+ gf_afr_mt_char);
+ if (!priv->child_up) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Out of memory.");
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ priv->children = GF_CALLOC (sizeof (xlator_t *), child_count,
+ gf_afr_mt_xlator_t);
+ if (!priv->children) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Out of memory.");
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ priv->pending_key = GF_CALLOC (sizeof (*priv->pending_key),
+ child_count,
+ gf_afr_mt_char);
+ if (!priv->pending_key) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Out of memory.");
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ trav = this->children;
+ i = 0;
+ while (i < child_count) {
+ priv->children[i] = trav->xlator;
+
+ ret = gf_asprintf (&priv->pending_key[i], "%s.%s",
+ AFR_XATTR_PREFIX,
+ trav->xlator->name);
+ if (-1 == ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "asprintf failed to set pending key");
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ trav = trav->next;
+ i++;
+ }
+
+ ret = gf_asprintf (&priv->sh_domain, "%s-self-heal", this->name);
+ if (-1 == ret) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ priv->root_inode = NULL;
+
+ priv->last_event = GF_CALLOC (child_count, sizeof (*priv->last_event),
+ gf_afr_mt_int32_t);
+ if (!priv->last_event) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ pump_priv = GF_CALLOC (1, sizeof (*pump_priv),
+ gf_afr_mt_pump_priv);
+ if (!pump_priv) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Out of memory");
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ LOCK_INIT (&pump_priv->resume_path_lock);
+ LOCK_INIT (&pump_priv->pump_state_lock);
+
+ pump_priv->resume_path = GF_CALLOC (1, PATH_MAX,
+ gf_afr_mt_char);
+ if (!pump_priv->resume_path) {
+ gf_log (this->name, GF_LOG_ERROR, "Out of memory");
+ ret = -1;
+ goto out;
+ }
+
+ pump_priv->env = this->ctx->env;
+ if (!pump_priv->env) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Could not create new sync-environment");
+ ret = -1;
+ goto out;
+ }
+
+ /* keep more local here as we may need them for self-heal etc */
+ this->local_pool = mem_pool_new (afr_local_t, 128);
+ if (!this->local_pool) {
+ ret = -1;
+ gf_log (this->name, GF_LOG_ERROR,
+ "failed to create local_t's memory pool");
+ goto out;
+ }
+
+ priv->pump_private = pump_priv;
+ pump_priv = NULL;
+
+ this->private = priv;
+ priv = NULL;
+
+ pump_change_state (this, PUMP_STATE_ABORT);
+
+ ret = 0;
+out:
+
+ if (pump_priv) {
+ GF_FREE (pump_priv->resume_path);
+ LOCK_DESTROY (&pump_priv->resume_path_lock);
+ LOCK_DESTROY (&pump_priv->pump_state_lock);
+ GF_FREE (pump_priv);
+ }
+
+ if (priv) {
+ GF_FREE (priv->child_up);
+ GF_FREE (priv->children);
+ GF_FREE (priv->pending_key);
+ GF_FREE (priv->last_event);
+ LOCK_DESTROY (&priv->lock);
+ GF_FREE (priv);
+ }
+
+ return ret;
+}
+
+int
+fini (xlator_t *this)
+{
+ afr_private_t * priv = NULL;
+ pump_private_t *pump_priv = NULL;
+
+ priv = this->private;
+ this->private = NULL;
+ if (!priv)
+ goto out;
+
+ pump_priv = priv->pump_private;
+ if (!pump_priv)
+ goto afr_priv;
+
+ GF_FREE (pump_priv->resume_path);
+ LOCK_DESTROY (&pump_priv->resume_path_lock);
+ LOCK_DESTROY (&pump_priv->pump_state_lock);
+ GF_FREE (pump_priv);
+afr_priv:
+ afr_priv_destroy (priv);
+out:
+ return 0;
+}
+
+
+struct xlator_fops fops = {
+ .lookup = pump_lookup,
+ .open = pump_open,
+ .flush = pump_flush,
+ .fsync = pump_fsync,
+ .fsyncdir = pump_fsyncdir,
+ .xattrop = pump_xattrop,
+ .fxattrop = pump_fxattrop,
+ .getxattr = pump_getxattr,
+
+ /* inode write */
+ .writev = pump_writev,
+ .truncate = pump_truncate,
+ .ftruncate = pump_ftruncate,
+ .setxattr = pump_setxattr,
+ .setattr = pump_setattr,
+ .fsetattr = pump_fsetattr,
+ .removexattr = pump_removexattr,
+
+ /* dir read */
+ .opendir = pump_opendir,
+ .readdir = pump_readdir,
+ .readdirp = pump_readdirp,
+
+ /* dir write */
+ .create = pump_create,
+ .mknod = pump_mknod,
+ .mkdir = pump_mkdir,
+ .unlink = pump_unlink,
+ .rmdir = pump_rmdir,
+ .link = pump_link,
+ .symlink = pump_symlink,
+ .rename = pump_rename,
+};
+
+struct xlator_dumpops dumpops = {
+ .priv = afr_priv_dump,
+};
+
+
+struct xlator_cbks cbks = {
+ .release = pump_release,
+ .releasedir = pump_releasedir,
+ .forget = pump_forget,
+};
+
+struct volume_options options[] = {
+ { .key = {NULL} },
+};
diff --git a/xlators/cluster/afr/src/pump.h b/xlators/cluster/afr/src/pump.h
new file mode 100644
index 000000000..9d0b6db6a
--- /dev/null
+++ b/xlators/cluster/afr/src/pump.h
@@ -0,0 +1,81 @@
+/*
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef __PUMP_H__
+#define __PUMP_H__
+
+#include "syncop.h"
+
+/* FIXME: Needs to be defined in a common file */
+#define CLIENT_CMD_CONNECT "trusted.glusterfs.client-connect"
+#define CLIENT_CMD_DISCONNECT "trusted.glusterfs.client-disconnect"
+
+#define PUMP_SOURCE_COMPLETE "trusted.glusterfs.pump-source-complete"
+#define PUMP_SINK_COMPLETE "trusted.glusterfs.pump-sink-complete"
+
+#define PUMP_PATH "trusted.glusterfs.pump-path"
+
+#define PUMP_SOURCE_CHILD(xl) (xl->children->xlator)
+#define PUMP_SINK_CHILD(xl) (xl->children->next->xlator)
+
+typedef enum {
+ PUMP_STATE_RUNNING, /* Pump is running and migrating files */
+ PUMP_STATE_RESUME, /* Pump is resuming from a previous pause */
+ PUMP_STATE_PAUSE, /* Pump is paused */
+ PUMP_STATE_ABORT, /* Pump is aborted */
+ PUMP_STATE_COMMIT, /* Pump is commited */
+} pump_state_t;
+
+typedef struct _pump_private {
+ struct syncenv *env; /* The env pointer to the pump synctask */
+ char *resume_path; /* path to resume from the last pause */
+ gf_lock_t resume_path_lock; /* Synchronize resume_path changes */
+ gf_lock_t pump_state_lock; /* Synchronize pump_state changes */
+ pump_state_t pump_state; /* State of pump */
+ char current_file[PATH_MAX]; /* Current file being pumped */
+ uint64_t number_files_pumped; /* Number of files pumped */
+ gf_boolean_t pump_finished; /* Boolean to indicate pump termination */
+ char pump_start_pending; /* Boolean to mark start pending until
+ CHILD_UP */
+ call_stub_t *cleaner;
+} pump_private_t;
+
+void
+build_root_loc (inode_t *inode, loc_t *loc);
+int pump_start (call_frame_t *frame, xlator_t *this);
+
+gf_boolean_t
+pump_command_start (xlator_t *this, dict_t *dict);
+
+int
+pump_execute_start (call_frame_t *frame, xlator_t *this);
+
+gf_boolean_t
+pump_command_pause (xlator_t *this, dict_t *dict);
+
+int
+pump_execute_pause (call_frame_t *frame, xlator_t *this);
+
+gf_boolean_t
+pump_command_abort (xlator_t *this, dict_t *dict);
+
+int
+pump_execute_abort (call_frame_t *frame, xlator_t *this);
+
+gf_boolean_t
+pump_command_status (xlator_t *this, dict_t *dict);
+
+int
+pump_execute_status (call_frame_t *frame, xlator_t *this);
+
+int
+pump_command_reply (call_frame_t *frame, xlator_t *this);
+
+#endif /* __PUMP_H__ */