summary | refs | log | tree | commit | diff | stats
path: root/xlators/cluster/afr/src/afr-self-heal-data.c
diff options
context:
space:
mode:
Diffstat (limited to 'xlators/cluster/afr/src/afr-self-heal-data.c')
-rw-r--r-- xlators/cluster/afr/src/afr-self-heal-data.c 2094
1 files changed, 478 insertions, 1616 deletions
diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c
index 9de26ee569c..c0385153ff5 100644
--- a/xlators/cluster/afr/src/afr-self-heal-data.c
+++ b/xlators/cluster/afr/src/afr-self-heal-data.c
@@ -1,5 +1,5 @@
/*
- Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
This file is licensed to you under your choice of the GNU Lesser
@@ -8,1747 +8,609 @@
cases as published by the Free Software Foundation.
*/
-#include <libgen.h>
-#include <unistd.h>
-#include <fnmatch.h>
-#include <sys/time.h>
-#include <stdlib.h>
-#include <signal.h>
#ifndef _CONFIG_H
#define _CONFIG_H
#include "config.h"
#endif
-#include "glusterfs.h"
#include "afr.h"
-#include "dict.h"
-#include "xlator.h"
-#include "hashfn.h"
-#include "logging.h"
-#include "stack.h"
-#include "list.h"
-#include "call-stub.h"
-#include "defaults.h"
-#include "common-utils.h"
-#include "compat-errno.h"
-#include "compat.h"
-#include "byte-order.h"
-
-#include "afr-transaction.h"
#include "afr-self-heal.h"
-#include "afr-self-heal-common.h"
-#include "afr-self-heal-algorithm.h"
-
-int
-afr_sh_data_fail (call_frame_t *frame, xlator_t *this);
-
-static inline gf_boolean_t
-afr_sh_data_proceed (unsigned int success_count)
-{
- return (success_count >= AFR_SH_MIN_PARTICIPANTS);
-}
-
-extern int
-sh_loop_finish (call_frame_t *loop_frame, xlator_t *this);
+#include "byte-order.h"
-int
-afr_post_sh_big_lock_success (call_frame_t *frame, xlator_t *this);
+enum {
+ AFR_SELFHEAL_DATA_FULL = 0,
+ AFR_SELFHEAL_DATA_DIFF,
+};
-int
-afr_post_sh_big_lock_failure (call_frame_t *frame, xlator_t *this);
-int
-afr_sh_data_finish (call_frame_t *frame, xlator_t *this);
-
-int
-afr_sh_data_done (call_frame_t *frame, xlator_t *this)
+#define HAS_HOLES(i) ((i->ia_blocks * 512) < (i->ia_size))
+static int
+__checksum_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, uint32_t weak, uint8_t *strong,
+ dict_t *xdata)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
+ afr_local_t *local = NULL;
+ int i = (long) cookie;
- local = frame->local;
- sh = &local->self_heal;
+ local = frame->local;
- sh->completion_cbk (frame, this);
+ local->replies[i].valid = 1;
+ local->replies[i].op_ret = op_ret;
+ local->replies[i].op_errno = op_errno;
+ if (strong)
+ memcpy (local->replies[i].checksum, strong, MD5_DIGEST_LENGTH);
- return 0;
+ syncbarrier_wake (&local->barrier);
+ return 0;
}
-int
-afr_sh_data_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int call_count = 0;
- int child_index = (long) cookie;
-
- local = frame->local;
- priv = this->private;
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "flush failed on %s on subvolume %s: %s",
- local->loc.path, priv->children[child_index]->name,
- strerror (op_errno));
- }
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- afr_sh_data_done (frame, this);
- }
-
- return 0;
-}
-
-int
-afr_sh_data_close (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- afr_self_heal_t *sh = NULL;
- int i = 0;
- int call_count = 0;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- if (!sh->healing_fd) {
- //This happens when file is non-reg
- afr_sh_data_done (frame, this);
- return 0;
- }
- call_count = afr_set_elem_count_get (sh->success,
- priv->child_count);
- local->call_count = call_count;
-
- if (call_count == 0) {
- afr_sh_data_done (frame, this);
- return 0;
- }
-
- for (i = 0; i < priv->child_count; i++) {
- if (!sh->success[i])
- continue;
- gf_log (this->name, GF_LOG_DEBUG,
- "closing fd of %s on %s",
- local->loc.path, priv->children[i]->name);
-
- STACK_WIND_COOKIE (frame, afr_sh_data_flush_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->flush,
- sh->healing_fd, NULL);
-
- if (!--call_count)
- break;
- }
-
- return 0;
-}
-
-int
-afr_sh_dom_unlock (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- if (sh->sh_dom_lock_held)
- afr_sh_data_unlock (frame, this, priv->sh_domain,
- afr_sh_data_close);
- else
- afr_sh_data_close (frame, this);
- return 0;
-}
-
-int
-afr_sh_data_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *statpre,
- struct iatt *statpost, dict_t *xdata)
+static int
+attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *pre, struct iatt *post,
+ dict_t *xdata)
{
+ int i = (long) cookie;
+ afr_local_t *local = NULL;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int call_count = 0;
- int child_index = (long) cookie;
+ local = frame->local;
- local = frame->local;
- priv = this->private;
+ local->replies[i].valid = 1;
+ local->replies[i].op_ret = op_ret;
+ local->replies[i].op_errno = op_errno;
+ if (pre)
+ local->replies[i].prestat = *pre;
+ if (post)
+ local->replies[i].poststat = *post;
+ if (xdata)
+ local->replies[i].xdata = dict_ref (xdata);
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_INFO,
- "setattr failed on %s on subvolume %s: %s",
- local->loc.path, priv->children[child_index]->name,
- strerror (op_errno));
- }
- }
- UNLOCK (&frame->lock);
+ syncbarrier_wake (&local->barrier);
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- afr_sh_data_finish (frame, this);
- }
-
- return 0;
+ return 0;
}
-int
-afr_sh_data_setattr (call_frame_t *frame, xlator_t *this, struct iatt* stbuf)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- afr_self_heal_t *sh = NULL;
- int i = 0;
- int call_count = 0;
- int32_t valid = 0;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- valid = (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME);
-
- call_count = afr_set_elem_count_get (sh->success,
- priv->child_count);
- local->call_count = call_count;
-
- if (call_count == 0) {
- GF_ASSERT (0);
- afr_sh_data_finish (frame, this);
- return 0;
- }
-
- for (i = 0; i < priv->child_count; i++) {
- if (!sh->success[i])
- continue;
-
- STACK_WIND_COOKIE (frame, afr_sh_data_setattr_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->setattr,
- &local->loc, stbuf, valid, NULL);
-
- if (!--call_count)
- break;
- }
-
- return 0;
-}
-int
-afr_sh_data_setattr_fstat_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- struct iatt *buf, dict_t *xdata)
+static gf_boolean_t
+__afr_selfheal_data_checksums_match (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, int source,
+ unsigned char *healed_sinks,
+ off_t offset, size_t size)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- int child_index = (long) cookie;
-
- local = frame->local;
- sh = &local->self_heal;
-
- GF_ASSERT (sh->source == child_index);
- if (op_ret != -1) {
- sh->buf[child_index] = *buf;
- afr_sh_data_setattr (frame, this, buf);
- } else {
- gf_log (this->name, GF_LOG_ERROR, "%s: Failed to set "
- "time-stamps after self-heal", local->loc.path);
- afr_sh_data_fail (frame, this);
- }
-
- return 0;
-}
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ unsigned char *wind_subvols = NULL;
+ int i = 0;
-/*
- * If there are any writes after the self-heal is triggered then the
- * stbuf stored in local->self_heal.buf[] will be invalid so we do one more
- * stat on the source and then set the [am]times
- */
-int
-afr_sh_set_timestamps (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- STACK_WIND_COOKIE (frame, afr_sh_data_setattr_fstat_cbk,
- (void *) (long) sh->source,
- priv->children[sh->source],
- priv->children[sh->source]->fops->fstat,
- sh->healing_fd, NULL);
- return 0;
-}
-
-//Fun fact, lock_cbk is being used for both lock & unlock
-int
-afr_sh_data_unlock (call_frame_t *frame, xlator_t *this, char *dom,
- afr_lock_cbk_t lock_cbk)
-{
- afr_local_t *local = NULL;
- afr_internal_lock_t *int_lock = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int ret = 0;
-
- local = frame->local;
- int_lock = &local->internal_lock;
- sh = &local->self_heal;
- priv = this->private;
-
- if (strcmp (dom, this->name) == 0) {
- sh->data_lock_held = _gf_false;
- } else if (strcmp (dom, priv->sh_domain) == 0) {
- sh->sh_dom_lock_held = _gf_false;
- } else {
- ret = -1;
- goto out;
- }
- int_lock->lock_cbk = lock_cbk;
- int_lock->domain = dom;
- afr_unlock (frame, this);
-
-out:
- if (ret) {
- int_lock->lock_op_ret = -1;
- int_lock->lock_cbk (frame, this);
- }
- return 0;
-}
-
-int
-afr_sh_data_finish (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
-
- gf_log (this->name, GF_LOG_DEBUG,
- "finishing data selfheal of %s", local->loc.path);
-
- if (sh->data_lock_held)
- afr_sh_data_unlock (frame, this, this->name, afr_sh_dom_unlock);
- else
- afr_sh_dom_unlock (frame, this);
-
- return 0;
-}
-
-int
-afr_sh_data_fail (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
+ priv = this->private;
+ local = frame->local;
- local = frame->local;
- sh = &local->self_heal;
+ wind_subvols = alloca0 (priv->child_count);
+ for (i = 0; i < priv->child_count; i++) {
+ if (i == source || healed_sinks[i])
+ wind_subvols[i] = 1;
+ }
- gf_log (this->name, GF_LOG_DEBUG,
- "finishing failed data selfheal of %s", local->loc.path);
+ AFR_ONLIST (wind_subvols, frame, __checksum_cbk, rchecksum, fd,
+ offset, size, NULL);
- afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
- afr_sh_data_finish (frame, this);
- return 0;
-}
+ if (!local->replies[source].valid || local->replies[source].op_ret != 0)
+ return _gf_false;
-int
-afr_sh_data_erase_pending_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret,
- int32_t op_errno, dict_t *xattr, dict_t *xdata)
-{
- int call_count = 0;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int32_t child_index = (long) cookie;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
- if (op_ret < 0) {
- gf_log (this->name, GF_LOG_ERROR, "Erasing of pending change "
- "log failed on %s for subvol %s, reason: %s",
- local->loc.path, priv->children[child_index]->name,
- strerror (op_errno));
- afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
- }
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) {
- if (sh->old_loop_frame)
- sh_loop_finish (sh->old_loop_frame, this);
- sh->old_loop_frame = NULL;
- afr_sh_data_fail (frame, this);
- goto out;
- }
- if (!IA_ISREG (sh->type)) {
- afr_sh_data_finish (frame, this);
- goto out;
- }
- GF_ASSERT (sh->old_loop_frame);
- afr_sh_data_lock (frame, this, 0, 0, _gf_true, this->name,
- afr_post_sh_big_lock_success,
- afr_post_sh_big_lock_failure);
- }
-out:
- return 0;
-}
+ for (i = 0; i < priv->child_count; i++) {
+ if (i == source)
+ continue;
+ if (memcmp (local->replies[source].checksum,
+ local->replies[i].checksum,
+ MD5_DIGEST_LENGTH))
+ return _gf_false;
+ }
-int
-afr_sh_data_erase_pending (call_frame_t *frame, xlator_t *this)
-{
- afr_sh_erase_pending (frame, this, AFR_DATA_TRANSACTION,
- afr_sh_data_erase_pending_cbk,
- afr_sh_data_finish);
- return 0;
+ return _gf_true;
}
-int
-afr_sh_data_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, struct iatt *pre,
- struct iatt *post, dict_t *xdata)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- afr_self_heal_t *sh = NULL;
- int call_count = 0;
- int child_index = (long) cookie;
-
- local = frame->local;
- priv = this->private;
- sh = &local->self_heal;
-
- if (op_ret < 0) {
- gf_log (this->name, GF_LOG_ERROR, "%s: Failed to fsync on "
- "%s - %s", local->loc.path,
- priv->children[child_index]->name, strerror (op_errno));
- LOCK (&frame->lock);
- {
- afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
- }
- UNLOCK (&frame->lock);
- if (sh->old_loop_frame)
- sh_loop_finish (sh->old_loop_frame, this);
- sh->old_loop_frame = NULL;
- }
-
- call_count = afr_frame_return (frame);
- if (call_count == 0) {
- if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC))
- afr_sh_data_fail (frame, this);
- else
- afr_sh_data_erase_pending (frame, this);
- }
- return 0;
-}
-/*
- * Before erasing xattrs, make sure the data is written to disk
- */
-int
-afr_sh_data_fsync (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- afr_self_heal_t *sh = NULL;
- int i = 0;
- int call_count = 0;
-
- local = frame->local;
- priv = this->private;
- sh = &local->self_heal;
-
- call_count = sh->active_sinks;
- if (call_count == 0) {
- afr_sh_data_erase_pending (frame, this);
- return 0;
- }
-
- local->call_count = call_count;
- for (i = 0; i < priv->child_count; i++) {
- if (!sh->success[i] || sh->sources[i])
- continue;
-
- STACK_WIND_COOKIE (frame, afr_sh_data_fsync_cbk,
- (void *) (long) i, priv->children[i],
- priv->children[i]->fops->fsync,
- sh->healing_fd, 1, NULL);
- }
-
- return 0;
-}
-
-static struct afr_sh_algorithm *
-sh_algo_from_name (xlator_t *this, char *name)
+static int
+__afr_selfheal_data_read_write (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ int source, unsigned char *healed_sinks,
+ off_t offset, size_t size,
+ struct afr_reply *replies)
{
- int i = 0;
+ struct iovec *iovec = NULL;
+ int count = 0;
+ struct iobref *iobref = NULL;
+ int ret = 0;
+ int i = 0;
+ afr_private_t *priv = NULL;
- if (name == NULL)
- goto out;
+ priv = this->private;
- while (afr_self_heal_algorithms[i].name) {
- if (!strcmp (name, afr_self_heal_algorithms[i].name)) {
- return &afr_self_heal_algorithms[i];
- }
+ ret = syncop_readv (priv->children[source], fd, size, offset, 0,
+ &iovec, &count, &iobref);
+ if (ret <= 0)
+ return ret;
- i++;
- }
+ for (i = 0; i < priv->child_count; i++) {
+ if (!healed_sinks[i])
+ continue;
+
+ /*
+ * TODO: Use fiemap() and discard() to heal holes
+ * in the future.
+ *
+ * For now,
+ *
+ * - if the source had any holes at all,
+ * AND
+ * - if we are writing past the original file size
+ * of the sink
+ * AND
+ * - is NOT the last block of the source file. if
+ * the block contains EOF, it has to be written
+ * in order to set the file size even if the
+ * last block is 0-filled.
+ * AND
+ * - if the read buffer is filled with only 0's
+ *
+ * then, skip writing to this sink. We don't depend
+ * on the write to happen to update the size as we
+ * have performed an ftruncate() upfront anyways.
+ */
+#define is_last_block(o,b,s) ((s >= o) && (s <= (o + b)))
+ if (HAS_HOLES ((&replies[source].poststat)) &&
+ offset > replies[i].poststat.ia_size &&
+ !is_last_block (offset, size,
+ replies[source].poststat.ia_size) &&
+ (iov_0filled (iovec, count) == 0))
+ continue;
+
+ ret = syncop_writev (priv->children[i], fd, iovec, count,
+ offset, iobref, 0);
+ if (ret != iov_length (iovec, count)) {
+ /* write() failed on this sink. unset the corresponding
+ member in sinks[] (which is healed_sinks[] in the
+ caller) so that this server does NOT get considered
+ as successfully healed.
+ */
+ healed_sinks[i] = 0;
+ }
+ }
+ if (iobref)
+ iobref_unref (iobref);
-out:
- return NULL;
+ return ret;
}
static int
-sh_zero_byte_files_exist (afr_local_t *local, int child_count)
-{
- int i = 0;
- int ret = 0;
- afr_self_heal_t *sh = NULL;
-
- sh = &local->self_heal;
- for (i = 0; i < child_count; i++) {
- if (!local->child_up[i] || sh->child_errno[i])
- continue;
- if (sh->buf[i].ia_size == 0) {
- ret = 1;
- break;
- }
- }
-
- return ret;
-}
-
-
-struct afr_sh_algorithm *
-afr_sh_data_pick_algo (call_frame_t *frame, xlator_t *this)
-{
- afr_private_t * priv = NULL;
- struct afr_sh_algorithm * algo = NULL;
- afr_local_t * local = NULL;
- afr_self_heal_t * sh = NULL;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
- algo = sh_algo_from_name (this, priv->data_self_heal_algorithm);
-
- if (algo == NULL) {
- /* option not set, so fall back on heuristics */
-
- if (sh_zero_byte_files_exist (local, priv->child_count)
- || (sh->file_size <= (priv->data_self_heal_window_size *
- this->ctx->page_size))) {
-
- /*
- * If the file does not exist on one of the subvolumes,
- * or a zero-byte file exists (created by entry self-heal)
- * the entire content has to be copied anyway, so there
- * is no benefit from using the "diff" algorithm.
- *
- * If the file size is about the same as page size,
- * the entire file can be read and written with a few
- * (pipelined) STACK_WINDs, which will be faster
- * than "diff" which has to read checksums and then
- * read and write.
- */
-
- algo = sh_algo_from_name (this, "full");
-
- } else {
- algo = sh_algo_from_name (this, "diff");
- }
- }
-
- return algo;
-}
-
-
-int
-afr_sh_data_sync_prepare (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- struct afr_sh_algorithm *sh_algo = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
-
- sh->algo_completion_cbk = afr_sh_data_fsync;
- sh->algo_abort_cbk = afr_sh_data_fail;
-
- sh_algo = afr_sh_data_pick_algo (frame, this);
+afr_selfheal_data_block (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ int source, unsigned char *healed_sinks, off_t offset,
+ size_t size, int type, struct afr_reply *replies)
+{
+ int ret = -1;
+ int sink_count = 0;
+ afr_private_t *priv = NULL;
+ unsigned char *data_lock = NULL;
+
+ priv = this->private;
+ sink_count = AFR_COUNT (healed_sinks, priv->child_count);
+ data_lock = alloca0 (priv->child_count);
+
+ ret = afr_selfheal_inodelk (frame, this, fd->inode, this->name,
+ offset, size, data_lock);
+ {
+ if (ret < sink_count) {
+ ret = -ENOTCONN;
+ goto unlock;
+ }
- sh->algo = sh_algo;
- sh_algo->fn (frame, this);
+ if (type == AFR_SELFHEAL_DATA_DIFF &&
+ __afr_selfheal_data_checksums_match (frame, this, fd, source,
+ healed_sinks, offset, size)) {
+ ret = 0;
+ goto unlock;
+ }
- return 0;
+ ret = __afr_selfheal_data_read_write (frame, this, fd, source,
+ healed_sinks, offset, size,
+ replies);
+ }
+unlock:
+ afr_selfheal_uninodelk (frame, this, fd->inode, this->name,
+ offset, size, data_lock);
+ return ret;
}
-int
-afr_sh_data_trim_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
- struct iatt *postbuf, dict_t *xdata)
-{
- int call_count = 0;
- int child_index = 0;
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- child_index = (long) cookie;
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "ftruncate of %s on subvolume %s failed (%s)",
- local->loc.path,
- priv->children[child_index]->name,
- strerror (op_errno));
- afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
- } else {
- gf_log (this->name, GF_LOG_DEBUG,
- "ftruncate of %s on subvolume %s completed",
- local->loc.path,
- priv->children[child_index]->name);
- }
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC))
- afr_sh_data_fail (frame, this);
- else
- afr_sh_data_sync_prepare (frame, this);
- }
-
- return 0;
-}
-int
-afr_sh_data_trim_sinks (call_frame_t *frame, xlator_t *this)
+static int
+afr_selfheal_data_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ unsigned char *healed_sinks)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- afr_self_heal_t *sh = NULL;
- int *sources = NULL;
- int call_count = 0;
- int i = 0;
-
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- sources = sh->sources;
- call_count = sh->active_sinks;
-
- local->call_count = call_count;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
- for (i = 0; i < priv->child_count; i++) {
- if (sources[i] || !local->child_up[i])
- continue;
+ local = frame->local;
+ priv = this->private;
- STACK_WIND_COOKIE (frame, afr_sh_data_trim_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->ftruncate,
- sh->healing_fd, sh->file_size,
- NULL);
+ AFR_ONLIST (healed_sinks, frame, attr_cbk, fsync, fd, 0, NULL);
- if (!--call_count)
- break;
- }
-
- return 0;
+ for (i = 0; i < priv->child_count; i++)
+ if (healed_sinks[i] && local->replies[i].op_ret != 0)
+ /* fsync() failed. Do NOT consider this server
+ as successfully healed. Mark it so.
+ */
+ healed_sinks[i] = 0;
+ return 0;
}
-int
-afr_sh_inode_set_read_ctx (afr_self_heal_t *sh, xlator_t *this)
-{
- afr_private_t *priv = NULL;
- int ret = 0;
- int i = 0;
-
- priv = this->private;
- sh->source = afr_sh_select_source (sh->sources, priv->child_count);
- if (sh->source < 0) {
- ret = -1;
- goto out;
- }
-
- /* detect changes not visible through pending flags -- JIC */
- for (i = 0; i < priv->child_count; i++) {
- if (i == sh->source || sh->child_errno[i])
- continue;
-
- if (SIZE_DIFFERS (&sh->buf[i], &sh->buf[sh->source]))
- sh->sources[i] = 0;
- }
-
- afr_reset_children (sh->fresh_children, priv->child_count);
- afr_get_fresh_children (sh->success_children, sh->sources,
- sh->fresh_children, priv->child_count);
- afr_inode_set_read_ctx (this, sh->inode, sh->source,
- sh->fresh_children);
-out:
- return ret;
-}
-char*
-afr_get_sizes_str (afr_local_t *local, struct iatt *bufs, xlator_t *this)
-{
- afr_private_t *priv = NULL;
- int i = 0;
- char num[1024] = {0};
- size_t len = 0;
- char *sizes_str = NULL;
- size_t off = 0;
- char *fmt_str = "%llu bytes on %s, ";
- char *child_down = " %s,";
- char *child_unknown = " %s,";
- int down_child_present = 0;
- int down_count = 0;
- int unknown_count = 0;
- int unknown_child_present = 0;
- char *down_subvol_1 = " down subvolume is ";
- char *unknown_subvol_1 = " unknown subvolume is ";
- char *down_subvol_2 = " down subvolumes are ";
- char *unknown_subvol_2 = " unknown subvolumes are ";
-
- priv = this->private;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i] == 1) {
- len += snprintf (num, sizeof (num), fmt_str,
- (unsigned long long) bufs[i].ia_size,
- priv->children[i]->name);
- } else if (local->child_up[i] == 0) {
- len += snprintf (num, sizeof (num), child_down,
- priv->children[i]->name);
- if (!down_child_present)
- down_child_present = 1;
- down_count ++;
- } else if (local->child_up[i] == -1) {
- len += snprintf (num, sizeof (num), child_unknown,
- priv->children[i]->name);
- if (!unknown_child_present)
- unknown_child_present = 1;
- unknown_count++;
- }
-
- }
-
- if (down_child_present) {
- if (down_count > 1)
- len += snprintf (num, sizeof (num), "%s",
- down_subvol_2);
- else
- len += snprintf (num, sizeof (num), "%s",
- down_subvol_1);
- }
- if (unknown_child_present) {
- if (unknown_count > 1)
- len += snprintf (num, sizeof (num), "%s",
- unknown_subvol_2);
- else
- len += snprintf (num, sizeof (num), "%s",
- unknown_subvol_1);
- }
-
- len++;//for '\0'
-
- sizes_str = GF_CALLOC (len, sizeof (char), gf_common_mt_char);
-
- if (!sizes_str)
- return NULL;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i] == 1) {
- off += snprintf (sizes_str + off, len - off, fmt_str,
- (unsigned long long) bufs[i].ia_size,
- priv->children[i]->name);
- }
- }
-
- if (down_child_present) {
- if (down_count > 1) {
- off += snprintf (sizes_str + off, len - off, "%s",
- down_subvol_2);
- } else {
- off += snprintf (sizes_str + off, len - off, "%s",
- down_subvol_1);
- }
- }
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i] == 0) {
- off += snprintf (sizes_str + off, len - off, child_down,
- priv->children[i]->name);
- }
- }
-
- if (unknown_child_present) {
- if (unknown_count > 1) {
- off += snprintf (sizes_str + off, len - off, "%s",
- unknown_subvol_2);
- } else {
- off += snprintf (sizes_str + off, len - off, "%s",
- unknown_subvol_1);
- }
- }
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i] == -1) {
- off += snprintf (sizes_str + off, len - off,
- child_unknown,
- priv->children[i]->name);
-
- }
- }
-
- return sizes_str;
-}
-
-char*
-afr_get_sinks_str (xlator_t *this, afr_local_t *local, afr_self_heal_t *sh)
+static int
+afr_selfheal_data_restore_time (call_frame_t *frame, xlator_t *this,
+ inode_t *inode, int source,
+ unsigned char *healed_sinks,
+ struct afr_reply *replies)
{
- afr_private_t *priv = NULL;
- int i = 0;
- char num[1024] = {0};
- size_t len = 0;
- char *sinks_str = NULL;
- char *temp_str = " to sinks ";
- char *str_format = " %s,";
- char off = 0;
-
- priv = this->private;
-
- len += snprintf (num, sizeof (num), "%s", temp_str);
- for (i = 0; i < priv->child_count; i++) {
- if ((sh->sources[i] == 0) && (local->child_up[i] == 1)) {
- len += snprintf (num, sizeof (num), str_format,
- priv->children[i]->name);
- }
- }
+ loc_t loc = {0, };
- len ++;
+ loc.inode = inode_ref (inode);
+ uuid_copy (loc.gfid, inode->gfid);
- sinks_str = GF_CALLOC (len, sizeof (char), gf_common_mt_char);
+ AFR_ONLIST (healed_sinks, frame, attr_cbk, setattr, &loc,
+ &replies[source].poststat,
+ (GF_SET_ATTR_ATIME|GF_SET_ATTR_MTIME), NULL);
- if (!sinks_str)
- return NULL;
-
- off += snprintf (sinks_str + off, len - off, "%s", temp_str);
-
- for (i = 0; i < priv->child_count; i++) {
- if ((sh->sources[i] == 0) && (local->child_up[i] == 1)) {
- off += snprintf (sinks_str + off, len - off,
- str_format,
- priv->children[i]->name);
- }
- }
-
- return sinks_str;
+ loc_wipe (&loc);
+ return 0;
}
+static int
+afr_selfheal_data_do (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ int source, unsigned char *healed_sinks,
+ struct afr_reply *replies)
+{
+ afr_private_t *priv = NULL;
+ int i = 0;
+ off_t off = 0;
+ size_t block = 128 * 1024;
+ int type = AFR_SELFHEAL_DATA_FULL;
+ int ret = -1;
+ call_frame_t *iter_frame = NULL;
+ char *sinks_str = NULL;
+ char *p = NULL;
+
+ priv = this->private;
+
+ sinks_str = alloca0 (priv->child_count * 8);
+ p = sinks_str;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!healed_sinks[i])
+ continue;
+ p += sprintf (p, "%d ", i);
+ }
-void
-afr_set_data_sh_info_str (afr_local_t *local, afr_self_heal_t *sh, xlator_t *this)
-{
- char *pending_matrix_str = NULL;
- char *sizes_str = NULL;
- char *sinks_str = NULL;
- afr_private_t *priv = NULL;
-
- priv = this->private;
-
- pending_matrix_str = afr_get_pending_matrix_str (sh->pending_matrix,
- this);
- if (!pending_matrix_str)
- pending_matrix_str = "";
-
- sizes_str = afr_get_sizes_str (local, sh->buf, this);
- if (!sizes_str)
- sizes_str = "";
+ gf_log (this->name, GF_LOG_INFO, "performing data selfheal on %s. "
+ "source=%d sinks=%s",
+ uuid_utoa (fd->inode->gfid), source, sinks_str);
- sinks_str = afr_get_sinks_str (this, local, sh);
- if (!sinks_str)
- sinks_str = "";
+ for (i = 0; i < priv->child_count; i++) {
+ if (!healed_sinks[i] && i != source)
+ continue;
+ if (replies[i].poststat.ia_size) {
+ type = AFR_SELFHEAL_DATA_DIFF;
+ break;
+ }
+ }
- gf_asprintf (&sh->data_sh_info, " data self heal from %s %s with "
- "%s data %s", priv->children[sh->source]->name, sinks_str,
- sizes_str, pending_matrix_str);
+ iter_frame = afr_copy_frame (frame);
+ if (!iter_frame)
+ return -ENOMEM;
- if (pending_matrix_str && strcmp (pending_matrix_str, ""))
- GF_FREE (pending_matrix_str);
+ for (off = 0; off < replies[source].poststat.ia_size; off += block) {
+ ret = afr_selfheal_data_block (iter_frame, this, fd, source,
+ healed_sinks, off, block, type,
+ replies);
+ if (ret < 0)
+ goto out;
- if (sizes_str && strcmp (sizes_str, ""))
- GF_FREE (sizes_str);
-}
+ AFR_STACK_RESET (iter_frame);
+ }
-void
-afr_sh_data_fix (call_frame_t *frame, xlator_t *this)
-{
- int source = 0;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- source = sh->source;
- sh->block_size = this->ctx->page_size;
- sh->file_size = sh->buf[source].ia_size;
-
- if (FILE_HAS_HOLES (&sh->buf[source]))
- sh->file_has_holes = 1;
-
- if (sh->background && sh->unwind && !sh->unwound) {
- sh->unwind (sh->orig_frame, this, sh->op_ret, sh->op_errno,
- is_self_heal_failed (sh, AFR_CHECK_SPECIFIC));
- sh->unwound = _gf_true;
- }
-
- afr_sh_mark_source_sinks (frame, this);
- if (sh->active_sinks == 0) {
- gf_log (this->name, GF_LOG_INFO,
- "no active sinks for performing self-heal on file %s",
- local->loc.path);
- afr_sh_data_finish (frame, this);
- return;
- }
-
- gf_log (this->name, GF_LOG_DEBUG,
- "self-healing file %s from subvolume %s to %d other",
- local->loc.path, priv->children[sh->source]->name,
- sh->active_sinks);
-
- sh->actual_sh_started = _gf_true;
- afr_set_self_heal_status (sh, AFR_SELF_HEAL_SYNC_BEGIN);
- afr_sh_data_trim_sinks (frame, this);
-}
+ afr_selfheal_data_restore_time (frame, this, fd->inode, source,
+ healed_sinks, replies);
-int
-afr_sh_data_fxattrop_fstat_done (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int nsources = 0;
- int ret = 0;
- int *old_sources = NULL;
- int tstamp_source = 0;
- int i = 0;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- gf_log (this->name, GF_LOG_DEBUG, "Pending matrix for: %s",
- lkowner_utoa (&frame->root->lk_owner));
- if (sh->sync_done) {
- //store sources before sync so that mtime can be set using the
- //iatt buf from one of them.
- old_sources = alloca (priv->child_count*sizeof (*old_sources));
- memcpy (old_sources, sh->sources,
- priv->child_count * sizeof (*old_sources));
- }
-
- nsources = afr_build_sources (this, sh->xattr, sh->buf, sh->pending_matrix,
- sh->sources, sh->success_children,
- AFR_DATA_TRANSACTION, NULL, _gf_true);
- if ((nsources == -1)
- && (priv->favorite_child != -1)
- && (sh->child_errno[priv->favorite_child] == 0)) {
-
- gf_log (this->name, GF_LOG_DEBUG,
- "Picking favorite child %s as authentic source to "
- "resolve conflicting data of %s",
- priv->children[priv->favorite_child]->name,
- local->loc.path);
-
- sh->sources[priv->favorite_child] = 1;
-
- nsources = afr_sh_source_count (sh->sources,
- priv->child_count);
- }
-
- if (nsources == -1) {
- afr_sh_print_split_brain_log (sh->pending_matrix, this,
- local->loc.path);
- afr_set_split_brain (this, sh->inode, DONT_KNOW, SPB);
-
- afr_sh_data_fail (frame, this);
- return 0;
- }
-
- afr_set_split_brain (this, sh->inode, DONT_KNOW, NO_SPB);
-
- ret = afr_sh_inode_set_read_ctx (sh, this);
- if (ret) {
- gf_log (this->name, GF_LOG_DEBUG,
- "No active sources found.");
-
- afr_sh_data_fail (frame, this);
- return 0;
- }
-
- if (sh->sync_done) {
- /* Perform setattr from one of the old_sources if possible
- * Because only they have the correct mtime, the new sources
- * (i.e. old sinks) have mtime from last writev in sync.
- */
- tstamp_source = sh->source;
- for (i = 0; i < priv->child_count; i++) {
- if (old_sources[i] && sh->sources[i])
- tstamp_source = i;
- }
- afr_sh_data_setattr (frame, this, &sh->buf[tstamp_source]);
- } else {
- afr_set_data_sh_info_str (local, sh, this);
- if (nsources == 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "No self-heal needed for %s",
- local->loc.path);
-
- afr_sh_data_finish (frame, this);
- return 0;
- }
-
- if (sh->do_data_self_heal &&
- afr_data_self_heal_enabled (priv->data_self_heal))
- afr_sh_data_fix (frame, this);
- else
- afr_sh_data_finish (frame, this);
- }
- return 0;
-}
+ ret = afr_selfheal_data_fsync (frame, this, fd, healed_sinks);
-int
-afr_lookup_select_read_child_by_txn_type (xlator_t *this, afr_local_t *local,
- dict_t **xattr,
- afr_transaction_type txn_type,
- uuid_t gfid)
-{
- afr_private_t *priv = NULL;
- int read_child = -1;
- int32_t **pending_matrix = NULL;
- int32_t *sources = NULL;
- int32_t *success_children = NULL;
- struct iatt *bufs = NULL;
- int32_t nsources = 0;
- int32_t prev_read_child = -1;
- int32_t config_read_child = -1;
- int32_t subvol_status = 0;
-
- priv = this->private;
- bufs = local->cont.lookup.bufs;
- success_children = local->cont.lookup.success_children;
-
- pending_matrix = local->cont.lookup.pending_matrix;
- sources = local->cont.lookup.sources;
- memset (sources, 0, sizeof (*sources) * priv->child_count);
-
- nsources = afr_build_sources (this, xattr, bufs, pending_matrix,
- sources, success_children, txn_type,
- &subvol_status, _gf_false);
- if (subvol_status & SPLIT_BRAIN) {
- gf_log (this->name, GF_LOG_DEBUG, "%s: Possible split-brain",
- local->loc.path);
- switch (txn_type) {
- case AFR_DATA_TRANSACTION:
- local->cont.lookup.possible_spb = _gf_true;
- nsources = 1;
- sources[success_children[0]] = 1;
- break;
- case AFR_ENTRY_TRANSACTION:
- read_child = afr_get_no_xattr_dir_read_child (this,
- success_children,
- bufs);
- sources[read_child] = 1;
- nsources = 1;
- break;
- default:
- break;
- }
- }
- if (nsources < 0)
- goto out;
-
- prev_read_child = local->read_child_index;
- config_read_child = priv->read_child;
- read_child = afr_select_read_child_from_policy (success_children,
- priv->child_count,
- prev_read_child,
- config_read_child,
- sources,
- priv->hash_mode, gfid);
out:
- gf_log (this->name, GF_LOG_DEBUG, "returning read_child: %d",
- read_child);
- return read_child;
+ if (iter_frame)
+ AFR_STACK_DESTROY (iter_frame);
+ return ret;
}
-int
-afr_sh_data_fstat_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- struct iatt *buf, dict_t *xdata)
+
+static int
+__afr_selfheal_truncate_sinks (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, unsigned char *healed_sinks,
+ struct afr_reply *replies, uint64_t size)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- int call_count = -1;
- int child_index = (long) cookie;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- LOCK (&frame->lock);
- {
- if (op_ret != -1) {
- gf_log (this->name, GF_LOG_TRACE,
- "fstat of %s on %s succeeded",
- local->loc.path,
- priv->children[child_index]->name);
-
- sh->buf[child_index] = *buf;
- sh->success_children[sh->success_count] = child_index;
- sh->success_count++;
- } else {
- gf_log (this->name, GF_LOG_ERROR, "%s: fstat failed "
- "on %s, reason %s", local->loc.path,
- priv->children[child_index]->name,
- strerror (op_errno));
- sh->child_errno[child_index] = op_errno;
- }
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- /* Previous versions of glusterfs might have set
- * the pending data xattrs which need to be erased
- */
- if (!afr_sh_data_proceed (sh->success_count)) {
- gf_log (this->name, GF_LOG_ERROR, "inspecting metadata "
- "succeeded on < %d children, aborting "
- "self-heal for %s", AFR_SH_MIN_PARTICIPANTS,
- local->loc.path);
- afr_sh_data_fail (frame, this);
- goto out;
- }
- afr_sh_data_fxattrop_fstat_done (frame, this);
- }
-out:
- return 0;
-}
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ unsigned char *larger_sinks = 0;
+ int i = 0;
+ local = frame->local;
+ priv = this->private;
-int
-afr_sh_data_fstat (call_frame_t *frame, xlator_t *this)
-{
- afr_self_heal_t *sh = NULL;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int call_count = 0;
- int i = 0;
- int child = 0;
- int32_t *fstat_children = NULL;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- fstat_children = memdup (sh->success_children,
- sizeof (*fstat_children) * priv->child_count);
- if (!fstat_children) {
- afr_sh_data_fail (frame, this);
- goto out;
- }
- call_count = sh->success_count;
- local->call_count = call_count;
-
- memset (sh->buf, 0, sizeof (*sh->buf) * priv->child_count);
- afr_reset_children (sh->success_children, priv->child_count);
- sh->success_count = 0;
- for (i = 0; i < priv->child_count; i++) {
- child = fstat_children[i];
- if (child == -1)
- break;
- STACK_WIND_COOKIE (frame, afr_sh_data_fstat_cbk,
- (void *) (long) child,
- priv->children[child],
- priv->children[child]->fops->fstat,
- sh->healing_fd, NULL);
- --call_count;
- }
- GF_ASSERT (!call_count);
-out:
- GF_FREE (fstat_children);
- return 0;
-}
+ larger_sinks = alloca0 (priv->child_count);
+ for (i = 0; i < priv->child_count; i++) {
+ if (healed_sinks[i] && replies[i].poststat.ia_size > size)
+ larger_sinks[i] = 1;
+ }
-void
-afr_sh_common_fxattrop_resp_handler (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret,
- int32_t op_errno, dict_t *xattr)
-{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- int child_index = (long) cookie;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- LOCK (&frame->lock);
- {
- if (op_ret != -1) {
- gf_log (this->name, GF_LOG_TRACE,
- "fxattrop of %s on %s succeeded",
- local->loc.path,
- priv->children[child_index]->name);
-
- sh->xattr[child_index] = dict_ref (xattr);
- sh->success_children[sh->success_count] = child_index;
- sh->success_count++;
- } else {
- gf_log (this->name, GF_LOG_ERROR, "fxattrop of %s "
- "failed on %s, reason %s", local->loc.path,
- priv->children[child_index]->name,
- strerror (op_errno));
- sh->child_errno[child_index] = op_errno;
- }
- }
- UNLOCK (&frame->lock);
-}
+ AFR_ONLIST (larger_sinks, frame, attr_cbk, ftruncate, fd, size, NULL);
-int
-afr_sh_data_fxattrop_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- dict_t *xattr, dict_t *xdata)
-{
- int call_count = -1;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
-
- afr_sh_common_fxattrop_resp_handler (frame, cookie, this, op_ret,
- op_errno, xattr);
-
- call_count = afr_frame_return (frame);
- if (call_count == 0) {
- if (!afr_sh_data_proceed (sh->success_count)) {
- gf_log (this->name, GF_LOG_ERROR, "%s, inspecting "
- "change log succeeded on < %d children",
- local->loc.path, AFR_SH_MIN_PARTICIPANTS);
- afr_sh_data_fail (frame, this);
- goto out;
- }
- afr_sh_data_fstat (frame, this);
- }
-out:
- return 0;
+ for (i = 0; i < priv->child_count; i++)
+ if (healed_sinks[i] && local->replies[i].op_ret == -1)
+ /* truncate() failed. Do NOT consider this server
+ as successfully healed. Mark it so.
+ */
+ healed_sinks[i] = 0;
+ return 0;
}
-
-int
-afr_sh_data_fxattrop (call_frame_t *frame, xlator_t *this)
-{
- afr_self_heal_t *sh = NULL;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- dict_t **xattr_req;
- int32_t *zero_pending = NULL;
- int call_count = 0;
- int i = 0;
- int ret = 0;
- int j;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- call_count = afr_up_children_count (local->child_up,
- priv->child_count);
-
- local->call_count = call_count;
-
- xattr_req = GF_CALLOC(priv->child_count, sizeof(struct dict_t *),
- gf_afr_mt_dict_t);
- if (!xattr_req)
- goto out;
+/*
+ * If by chance there are multiple sources with differing sizes, select
+ * the largest file as the source.
+ *
+ * This can only happen if data was directly modified in the backend.
+ */
+static int
+__afr_selfheal_data_finalize_source (xlator_t *this, unsigned char *sources,
+ unsigned char *sinks,
+ unsigned char *healed_sinks,
+ unsigned char *locked_on,
+ struct afr_reply *replies)
+{
+ int i = 0;
+ afr_private_t *priv = NULL;
+ uint64_t size = 0;
+ int source = -1;
+ int locked_count = 0;
+ int sources_count = 0;
+ int healed_sinks_count = 0;
+
+ priv = this->private;
+
+ locked_count = AFR_COUNT (locked_on, priv->child_count);
+ sources_count = AFR_COUNT (sources, priv->child_count);
+ healed_sinks_count = AFR_COUNT (healed_sinks, priv->child_count);
+
+ if (locked_count == healed_sinks_count || !sources_count) {
+ /* split brain */
+ return -EIO;
+ }
for (i = 0; i < priv->child_count; i++) {
- xattr_req[i] = dict_new();
- if (!xattr_req[i]) {
- ret = -1;
- goto out;
+ if (!sources[i])
+ continue;
+ if (size <= replies[i].poststat.ia_size) {
+ size = replies[i].poststat.ia_size;
+ source = i;
}
}
for (i = 0; i < priv->child_count; i++) {
- for (j = 0; j < priv->child_count; j++) {
- zero_pending = GF_CALLOC (3, sizeof (*zero_pending),
- gf_afr_mt_int32_t);
- if (!zero_pending) {
- ret = -1;
- goto out;
- }
- ret = dict_set_dynptr (xattr_req[i], priv->pending_key[j],
- zero_pending,
- 3 * sizeof (*zero_pending));
- if (ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "Unable to set dict value");
- goto out;
- } else {
- zero_pending = NULL;
- }
+ if (!sources[i])
+ continue;
+ if (replies[i].poststat.ia_size < size) {
+ sources[i] = 0;
+ sinks[i] = 1;
}
}
- afr_reset_xattr (sh->xattr, priv->child_count);
- afr_reset_children (sh->success_children, priv->child_count);
- memset (sh->child_errno, 0,
- sizeof (*sh->child_errno) * priv->child_count);
- sh->success_count = 0;
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_sh_data_fxattrop_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->fxattrop,
- sh->healing_fd, GF_XATTROP_ADD_ARRAY,
- xattr_req[i], NULL);
-
- if (!--call_count)
- break;
- }
- }
-
-out:
- if (xattr_req) {
- for (i = 0; i < priv->child_count; i++)
- if (xattr_req[i])
- dict_unref(xattr_req[i]);
- GF_FREE(xattr_req);
- }
-
- if (ret) {
- GF_FREE (zero_pending);
- afr_sh_data_fail (frame, this);
- }
-
- return 0;
+ return source;
}
-int
-afr_sh_data_big_lock_success (call_frame_t *frame, xlator_t *this)
+/*
+ * __afr_selfheal_data_prepare:
+ *
+ * This function inspects the on-disk xattrs and determines which subvols
+ * are sources and sinks; it fills @sources, @sinks and @healed_sinks.
+ *
+ * Returns the index of the subvolume to be used as the source for
+ * self-healing, or an error code (-EIO when no source can be picked).
+ */
+static int
+__afr_selfheal_data_prepare (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ unsigned char *locked_on, unsigned char *sources,
+ unsigned char *sinks, unsigned char *healed_sinks,
+ struct afr_reply *replies)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
+ int ret = -1;
+ int source = -1;
+ afr_private_t *priv = NULL;
+ int i = 0;
- local = frame->local;
- sh = &local->self_heal;
+ priv = this->private;
- sh->data_lock_held = _gf_true;
- afr_sh_data_fxattrop (frame, this);
- return 0;
-}
+ ret = afr_selfheal_unlocked_discover (frame, fd->inode, fd->inode->gfid,
+ replies);
+ if (ret)
+ return ret;
-int
-afr_sh_dom_lock_success (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
+ ret = afr_selfheal_find_direction (frame, this, replies,
+ AFR_DATA_TRANSACTION,
+ locked_on, sources, sinks);
+ if (ret)
+ return ret;
- local = frame->local;
- sh = &local->self_heal;
+ /* Seed healed_sinks[] with the sinks that are locked (i.e. up) before
+ finalizing the source: the split-brain check there counts
+ healed_sinks[] and would otherwise always see an empty array. */
+ for (i = 0; i < priv->child_count; i++)
+ healed_sinks[i] = sinks[i] && locked_on[i];
- sh->sh_dom_lock_held = _gf_true;
- afr_sh_data_lock (frame, this, 0, 0, _gf_true, this->name,
- afr_sh_data_big_lock_success,
- afr_sh_data_fail);
- return 0;
-}
+ source = __afr_selfheal_data_finalize_source (this, sources, sinks,
+ healed_sinks, locked_on,
+ replies);
+ if (source < 0)
+ return -EIO;
-int
-afr_sh_data_post_blocking_inodelk_cbk (call_frame_t *frame, xlator_t *this)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
+ /* Finalizing may have demoted smaller sources to sinks; recompute.
+ Later failures in the heal unmark entries in healed_sinks[]. */
+ for (i = 0; i < priv->child_count; i++)
+ healed_sinks[i] = sinks[i] && locked_on[i];
- local = frame->local;
- int_lock = &local->internal_lock;
- sh = &local->self_heal;
+ return source;
+}
- if (int_lock->lock_op_ret < 0) {
- gf_log (this->name, GF_LOG_ERROR, "Blocking data inodelks "
- "failed for %s. by %s",
- local->loc.path, lkowner_utoa (&frame->root->lk_owner));
- sh->data_lock_failure_handler (frame, this);
- } else {
+static int
+__afr_selfheal_data (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ unsigned char *locked_on)
+{
+ afr_private_t *priv = NULL;
+ int ret = -1;
+ unsigned char *sources = NULL;
+ unsigned char *sinks = NULL;
+ unsigned char *data_lock = NULL;
+ unsigned char *healed_sinks = NULL;
+ struct afr_reply *locked_replies = NULL;
+ int source = -1;
+ gf_boolean_t compat = _gf_false;
+ unsigned char *compat_lock = NULL;
+
+ priv = this->private;
+
+ sources = alloca0 (priv->child_count);
+ sinks = alloca0 (priv->child_count);
+ healed_sinks = alloca0 (priv->child_count);
+ data_lock = alloca0 (priv->child_count);
+ compat_lock = alloca0 (priv->child_count);
+
+ locked_replies = alloca0 (sizeof (*locked_replies) * priv->child_count);
+
+ ret = afr_selfheal_inodelk (frame, this, fd->inode, this->name, 0, 0,
+ data_lock);
+ {
+ if (ret < 2) {
+ ret = -ENOTCONN;
+ goto unlock;
+ }
- gf_log (this->name, GF_LOG_DEBUG, "Blocking data inodelks "
- "done for %s by %s. Proceding to self-heal",
- local->loc.path, lkowner_utoa (&frame->root->lk_owner));
+ ret = __afr_selfheal_data_prepare (frame, this, fd, data_lock,
+ sources, sinks, healed_sinks,
+ locked_replies);
+ if (ret < 0)
+ goto unlock;
+
+ source = ret;
+
+ ret = __afr_selfheal_truncate_sinks (frame, this, fd, healed_sinks,
+ locked_replies,
+ locked_replies[source].poststat.ia_size);
+ if (ret < 0)
+ goto unlock;
+
+ ret = 0;
+
+ /* Locking from (LLONG_MAX - 2) to (LLONG_MAX - 1) is for
+ compatibility with older self-heal clients which do not
+ hold a lock in the @priv->sh_domain domain to guard
+ against concurrent ongoing self-heals
+ */
+ afr_selfheal_inodelk (frame, this, fd->inode, this->name,
+ LLONG_MAX - 2, 1, compat_lock);
+ compat = _gf_true;
+ }
+unlock:
+ afr_selfheal_uninodelk (frame, this, fd->inode, this->name, 0, 0,
+ data_lock);
+ if (ret < 0)
+ goto out;
- sh->data_lock_success_handler (frame, this);
- }
+ ret = afr_selfheal_data_do (frame, this, fd, source, healed_sinks,
+ locked_replies);
+ if (ret)
+ goto out;
- return 0;
+ ret = afr_selfheal_undo_pending (frame, this, fd->inode, sources, sinks,
+ healed_sinks, AFR_DATA_TRANSACTION,
+ locked_replies, data_lock);
+out:
+ if (compat)
+ afr_selfheal_uninodelk (frame, this, fd->inode, this->name,
+ LLONG_MAX - 2, 1, compat_lock);
+ return ret;
}
-int
-afr_sh_data_post_nonblocking_inodelk_cbk (call_frame_t *frame, xlator_t *this)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- int_lock = &local->internal_lock;
- sh = &local->self_heal;
-
- if (int_lock->lock_op_ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG, "Non Blocking data inodelks "
- "failed for %s. by %s",
- local->loc.path, lkowner_utoa (&frame->root->lk_owner));
-
- if (!sh->data_lock_block) {
- sh->data_lock_failure_handler(frame, this);
- } else {
- int_lock->lock_cbk =
- afr_sh_data_post_blocking_inodelk_cbk;
- afr_blocking_lock (frame, this);
- }
- } else {
-
- gf_log (this->name, GF_LOG_DEBUG, "Non Blocking data inodelks "
- "done for %s by %s. Proceeding to self-heal",
- local->loc.path, lkowner_utoa (&frame->root->lk_owner));
- sh->data_lock_success_handler (frame, this);
- }
-
- return 0;
-}
-int
-afr_sh_data_lock_rec (call_frame_t *frame, xlator_t *this, char *dom,
- off_t start, off_t len)
+static fd_t *
+afr_selfheal_data_open (xlator_t *this, inode_t *inode)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_inodelk_t *inodelk = NULL;
- afr_local_t *local = NULL;
-
- local = frame->local;
- int_lock = &local->internal_lock;
-
- int_lock->transaction_lk_type = AFR_SELFHEAL_LK;
- int_lock->selfheal_lk_type = AFR_DATA_SELF_HEAL_LK;
+ loc_t loc = {0,};
+ int ret = 0;
+ fd_t *fd = NULL;
- afr_set_lock_number (frame, this);
+ fd = fd_create (inode, 0);
+ if (!fd)
+ return NULL;
- int_lock->lock_cbk = afr_sh_data_post_nonblocking_inodelk_cbk;
+ loc.inode = inode_ref (inode);
+ uuid_copy (loc.gfid, inode->gfid);
- int_lock->domain = dom;
- inodelk = afr_get_inodelk (int_lock, int_lock->domain);
- inodelk->flock.l_start = start;
- inodelk->flock.l_len = len;
- inodelk->flock.l_type = F_WRLCK;
-
- afr_nonblocking_inodelk (frame, this);
-
- return 0;
-}
-
-int
-afr_post_sh_big_lock_success (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
-
- GF_ASSERT (sh->old_loop_frame);
- sh_loop_finish (sh->old_loop_frame, this);
- sh->old_loop_frame = NULL;
- sh->data_lock_held = _gf_true;
- sh->sync_done = _gf_true;
- afr_sh_data_fxattrop (frame, this);
- return 0;
-}
-
-int
-afr_post_sh_big_lock_failure (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
+ ret = syncop_open (this, &loc, O_RDWR|O_LARGEFILE, fd);
+ if (ret) {
+ fd_unref (fd);
+ fd = NULL;
+ } else {
+ fd_bind (fd);
+ }
- local = frame->local;
- sh = &local->self_heal;
+ loc_wipe (&loc);
- GF_ASSERT (sh->old_loop_frame);
- sh_loop_finish (sh->old_loop_frame, this);
- sh->old_loop_frame = NULL;
- afr_sh_set_timestamps (frame, this);
- return 0;
+ return fd;
}
int
-afr_sh_data_lock (call_frame_t *frame, xlator_t *this,
- off_t start, off_t len, gf_boolean_t block,
- char *dom, afr_lock_cbk_t success_handler,
- afr_lock_cbk_t failure_handler)
+afr_selfheal_data (call_frame_t *frame, xlator_t *this, inode_t *inode)
{
- afr_local_t * local = NULL;
- afr_self_heal_t * sh = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
-
- sh->data_lock_success_handler = success_handler;
- sh->data_lock_failure_handler = failure_handler;
- sh->data_lock_block = block;
- return afr_sh_data_lock_rec (frame, this, dom, start, len);
-}
+ afr_private_t *priv = NULL;
+ unsigned char *locked_on = NULL;
+ int ret = 0;
+ fd_t *fd = NULL;
-int
-afr_sh_data_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int call_count = 0;
- int child_index = 0;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- child_index = (long) cookie;
-
- /* TODO: some of the open's might fail.
- In that case, modify cleanup fn to send flush on those
- fd's which are already open */
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "open of %s failed on child %s (%s)",
- local->loc.path,
- priv->children[child_index]->name,
- strerror (op_errno));
- afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
- } else {
- gf_log (this->name, GF_LOG_TRACE,
- "open of %s succeeded on child %s",
- local->loc.path,
- priv->children[child_index]->name);
- }
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) {
- afr_sh_data_fail (frame, this);
- return 0;
- }
-
- gf_log (this->name, GF_LOG_TRACE,
- "fd for %s opened, commencing sync",
- local->loc.path);
-
- afr_sh_data_lock (frame, this, 0, 0, _gf_true, priv->sh_domain,
- afr_sh_dom_lock_success, afr_sh_data_fail);
- }
-
- return 0;
-}
+ priv = this->private;
+ fd = afr_selfheal_data_open (this, inode);
+ if (!fd)
+ return -EIO;
-int
-afr_sh_data_open (call_frame_t *frame, xlator_t *this)
-{
- int i = 0;
- int call_count = 0;
- fd_t *fd = NULL;
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- call_count = afr_up_children_count (local->child_up, priv->child_count);
- local->call_count = call_count;
-
- fd = fd_create (local->loc.inode, frame->root->pid);
- sh->healing_fd = fd;
-
- /* open sinks */
- for (i = 0; i < priv->child_count; i++) {
- if(!local->child_up[i])
- continue;
-
- STACK_WIND_COOKIE (frame, afr_sh_data_open_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->open,
- &local->loc,
- O_RDWR|O_LARGEFILE, fd, NULL);
-
- if (!--call_count)
- break;
- }
-
- return 0;
-}
+ locked_on = alloca0 (priv->child_count);
-void
-afr_sh_non_reg_fix (call_frame_t *frame, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
-{
- afr_private_t *priv = NULL;
- afr_self_heal_t *sh = NULL;
- afr_local_t *local = NULL;
- int i = 0;
-
- if (op_ret < 0) {
- afr_sh_data_fail (frame, this);
- return;
- }
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- for (i = 0; i < priv->child_count ; i++) {
- if (1 == local->child_up[i])
- sh->success[i] = 1;
- }
-
- afr_sh_erase_pending (frame, this, AFR_DATA_TRANSACTION,
- afr_sh_data_erase_pending_cbk,
- afr_sh_data_finish);
-}
+ ret = afr_selfheal_tryinodelk (frame, this, inode, priv->sh_domain, 0, 0,
+ locked_on);
+ {
+ if (ret < 2) {
+ /* Either fewer than two subvols are available, or another
+ selfheal (from another server) is in progress. Either way
+ there is nothing to do for now, so skip.
+ */
+ ret = -ENOTCONN;
+ goto unlock;
+ }
-int
-afr_sh_non_reg_lock_success (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
- sh->data_lock_held = _gf_true;
- afr_sh_common_lookup (frame, this, &local->loc,
- afr_sh_non_reg_fix, NULL,
- AFR_LOOKUP_FAIL_CONFLICTS |
- AFR_LOOKUP_FAIL_MISSING_GFIDS,
- NULL);
- return 0;
-}
+ ret = __afr_selfheal_data (frame, this, fd, locked_on);
+ }
+unlock:
+ afr_selfheal_uninodelk (frame, this, inode, priv->sh_domain, 0, 0, locked_on);
-gf_boolean_t
-afr_can_start_data_self_heal (afr_self_heal_t *sh, afr_private_t *priv)
-{
- if (sh->force_confirm_spb)
- return _gf_true;
- if (sh->do_data_self_heal &&
- afr_data_self_heal_enabled (priv->data_self_heal))
- return _gf_true;
- return _gf_false;
-}
+ if (fd)
+ fd_unref (fd);
-int
-afr_self_heal_data (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = this->private;
- int ret = -1;
-
- local = frame->local;
- sh = &local->self_heal;
-
- sh->sh_type_in_action = AFR_SELF_HEAL_DATA;
-
- if (afr_can_start_data_self_heal (sh, priv)) {
- afr_set_self_heal_status (sh, AFR_SELF_HEAL_STARTED);
- ret = afr_inodelk_init (&local->internal_lock.inodelk[1],
- priv->sh_domain, priv->child_count);
- if (ret < 0) {
- afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
- afr_sh_data_done (frame, this);
- return 0;
- }
-
- if (IA_ISREG (sh->type)) {
- afr_sh_data_open (frame, this);
- } else {
- afr_sh_data_lock (frame, this, 0, 0, _gf_true,
- this->name,
- afr_sh_non_reg_lock_success,
- afr_sh_data_fail);
- }
- } else {
- gf_log (this->name, GF_LOG_TRACE,
- "not doing data self heal on %s",
- local->loc.path);
- afr_sh_data_done (frame, this);
- }
-
- return 0;
+ return ret;
}