summaryrefslogtreecommitdiffstats
path: root/xlators/cluster/afr/src/afr-self-heal-common.c
diff options
context:
space:
mode:
Diffstat (limited to 'xlators/cluster/afr/src/afr-self-heal-common.c')
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-common.c4474
1 files changed, 2539 insertions, 1935 deletions
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c
index 377c72a88f2..a580a1584cc 100644
--- a/xlators/cluster/afr/src/afr-self-heal-common.c
+++ b/xlators/cluster/afr/src/afr-self-heal-common.c
@@ -1,2330 +1,2934 @@
/*
- Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com>
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
-#include "glusterfs.h"
-#include "xlator.h"
-#include "byte-order.h"
-
#include "afr.h"
-#include "afr-transaction.h"
-#include "afr-self-heal-common.h"
#include "afr-self-heal.h"
-#include "pump.h"
+#include <glusterfs/byte-order.h>
+#include "protocol-common.h"
+#include "afr-messages.h"
+#include <glusterfs/events.h>
-//Intersection[child]=1 if child is part of intersection
void
-afr_children_intersection_get (int32_t *set1, int32_t *set2,
- int *intersection, unsigned int child_count)
-{
- int i = 0;
+afr_heal_synctask(xlator_t *this, afr_local_t *local);
- memset (intersection, 0, sizeof (*intersection) * child_count);
- for (i = 0; i < child_count; i++) {
- intersection[i] = afr_is_child_present (set1, child_count, i)
- && afr_is_child_present (set2, child_count,
- i);
+int
+afr_lookup_and_heal_gfid(xlator_t *this, inode_t *parent, const char *name,
+ inode_t *inode, struct afr_reply *replies, int source,
+ unsigned char *sources, void *gfid, int *gfid_idx)
+{
+ afr_private_t *priv = NULL;
+ call_frame_t *frame = NULL;
+ afr_local_t *local = NULL;
+ unsigned char *wind_on = NULL;
+ ia_type_t ia_type = IA_INVAL;
+ dict_t *xdata = NULL;
+ loc_t loc = {
+ 0,
+ };
+ int ret = 0;
+ int i = 0;
+
+ priv = this->private;
+ wind_on = alloca0(priv->child_count);
+ if (source >= 0 && replies[source].valid && replies[source].op_ret == 0)
+ ia_type = replies[source].poststat.ia_type;
+
+ if (ia_type != IA_INVAL)
+ goto heal;
+
+ /* If ia_type is still invalid, it means either
+ * (a)'source' was -1, i.e. parent dir pending xattrs are in split-brain
+ * (or) (b) The parent dir pending xattrs are all zeroes (i.e. all bricks
+ * are sources) and the 'source' we selected earlier might be the one where
+ * the file is not actually present.
+ *
+ * In both cases, let us pick a brick with a successful reply and use its
+ * ia_type.
+ * */
+ for (i = 0; i < priv->child_count; i++) {
+ if (source == -1) {
+ /* case (a) above. */
+ if (replies[i].valid && replies[i].op_ret == 0 &&
+ replies[i].poststat.ia_type != IA_INVAL) {
+ ia_type = replies[i].poststat.ia_type;
+ break;
+ }
+ } else {
+ /* case (b) above. */
+ if (i == source)
+ continue;
+ if (sources[i] && replies[i].valid && replies[i].op_ret == 0 &&
+ replies[i].poststat.ia_type != IA_INVAL) {
+ ia_type = replies[i].poststat.ia_type;
+ break;
+ }
}
-}
+ }
-/**
- * select_source - select a source and return it
- */
+heal:
+ /* gfid heal on those subvolumes that do not have gfid associated
+ * with the inode and update those replies.
+ */
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid || replies[i].op_ret != 0)
+ continue;
-int
-afr_sh_select_source (int sources[], int child_count)
-{
- int i = 0;
- for (i = 0; i < child_count; i++)
- if (sources[i])
- return i;
+ if (gf_uuid_is_null(gfid) &&
+ !gf_uuid_is_null(replies[i].poststat.ia_gfid) &&
+ replies[i].poststat.ia_type == ia_type)
+ gfid = replies[i].poststat.ia_gfid;
- return -1;
-}
+ if (!gf_uuid_is_null(replies[i].poststat.ia_gfid) ||
+ replies[i].poststat.ia_type != ia_type)
+ continue;
+ wind_on[i] = 1;
+ }
-/**
- * sink_count - return number of sinks in sources array
- */
+ if (AFR_COUNT(wind_on, priv->child_count) == 0)
+ return 0;
-int
-afr_sh_sink_count (int sources[], int child_count)
-{
- int i = 0;
- int sinks = 0;
- for (i = 0; i < child_count; i++)
- if (!sources[i])
- sinks++;
- return sinks;
+ xdata = dict_new();
+ if (!xdata) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = dict_set_gfuuid(xdata, "gfid-req", gfid, true);
+ if (ret) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ frame = afr_frame_create(this, &ret);
+ if (!frame) {
+ ret = -ret;
+ goto out;
+ }
+
+ local = frame->local;
+ loc.parent = inode_ref(parent);
+ gf_uuid_copy(loc.pargfid, parent->gfid);
+ loc.name = name;
+ loc.inode = inode_ref(inode);
+
+ AFR_ONLIST(wind_on, frame, afr_selfheal_discover_cbk, lookup, &loc, xdata);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!wind_on[i])
+ continue;
+ afr_reply_wipe(&replies[i]);
+ afr_reply_copy(&replies[i], &local->replies[i]);
+ }
+ if (gfid_idx && (*gfid_idx == -1)) {
+ /*Pick a brick where the gifd heal was successful.*/
+ for (i = 0; i < priv->child_count; i++) {
+ if (!wind_on[i])
+ continue;
+ if (replies[i].valid && replies[i].op_ret == 0 &&
+ !gf_uuid_is_null(replies[i].poststat.ia_gfid)) {
+ *gfid_idx = i;
+ break;
+ }
+ }
+ }
+out:
+ if (gfid_idx && (*gfid_idx == -1) && (ret == 0) && local) {
+ ret = -afr_final_errno(local, priv);
+ }
+ loc_wipe(&loc);
+ if (frame)
+ AFR_STACK_DESTROY(frame);
+ if (xdata)
+ dict_unref(xdata);
+
+ return ret;
}
int
-afr_sh_source_count (int sources[], int child_count)
+afr_gfid_sbrain_source_from_src_brick(xlator_t *this, struct afr_reply *replies,
+ char *src_brick)
{
- int i = 0;
- int nsource = 0;
+ int i = 0;
+ afr_private_t *priv = NULL;
- for (i = 0; i < child_count; i++)
- if (sources[i])
- nsource++;
- return nsource;
-}
-
-void
-afr_sh_set_error (afr_self_heal_t *sh, int32_t op_errno)
-{
- sh->op_ret = -1;
- if (afr_error_more_important (sh->op_errno, op_errno))
- sh->op_errno = op_errno;
+ priv = this->private;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid || replies[i].op_ret == -1)
+ continue;
+ if (strcmp(priv->children[i]->name, src_brick) == 0)
+ return i;
+ }
+ return -1;
}
-void
-afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this)
+int
+afr_selfheal_gfid_mismatch_by_majority(struct afr_reply *replies,
+ int child_count)
{
- afr_private_t * priv = this->private;
- char *buf = NULL;
- char *ptr = NULL;
- int i = 0;
- int j = 0;
+ int j = 0;
+ int i = 0;
+ int votes;
- /* 10 digits per entry + 1 space + '[' and ']' */
- buf = GF_MALLOC (priv->child_count * 11 + 8, gf_afr_mt_char);
+ for (i = 0; i < child_count; i++) {
+ if (!replies[i].valid || replies[i].op_ret == -1)
+ continue;
- for (i = 0; i < priv->child_count; i++) {
- ptr = buf;
- ptr += sprintf (ptr, "[ ");
- for (j = 0; j < priv->child_count; j++) {
- ptr += sprintf (ptr, "%d ", pending_matrix[i][j]);
- }
- sprintf (ptr, "]");
- gf_log (this->name, GF_LOG_TRACE,
- "pending_matrix: %s", buf);
+ votes = 1;
+ for (j = i + 1; j < child_count; j++) {
+ if ((!gf_uuid_compare(replies[i].poststat.ia_gfid,
+ replies[j].poststat.ia_gfid)))
+ votes++;
+ if (votes > child_count / 2)
+ return i;
}
+ }
- GF_FREE (buf);
+ return -1;
}
-void
-afr_init_pending_matrix (int32_t **pending_matrix, size_t child_count)
+int
+afr_gfid_sbrain_source_from_bigger_file(struct afr_reply *replies,
+ int child_count)
{
- int i = 0;
- int j = 0;
-
- GF_ASSERT (pending_matrix);
+ int i = 0;
+ int src = -1;
+ uint64_t size = 0;
- for (i = 0; i < child_count; i++) {
- for (j = 0; j < child_count; j++) {
- pending_matrix[i][j] = 0;
- }
+ for (i = 0; i < child_count; i++) {
+ if (!replies[i].valid || replies[i].op_ret == -1)
+ continue;
+ if (size < replies[i].poststat.ia_size) {
+ src = i;
+ size = replies[i].poststat.ia_size;
+ } else if (replies[i].poststat.ia_size == size) {
+ src = -1;
}
+ }
+ return src;
}
-void
-afr_mark_ignorant_subvols_as_pending (int32_t **pending_matrix,
- unsigned char *ignorant_subvols,
- size_t child_count)
-{
- int i = 0;
- int j = 0;
-
- GF_ASSERT (pending_matrix);
- GF_ASSERT (ignorant_subvols);
-
- for (i = 0; i < child_count; i++) {
- if (ignorant_subvols[i]) {
- for (j = 0; j < child_count; j++) {
- if (!ignorant_subvols[j])
- pending_matrix[j][i] += 1;
- }
- }
- }
+int
+afr_gfid_sbrain_source_from_latest_mtime(struct afr_reply *replies,
+ int child_count)
+{
+ int i = 0;
+ int src = -1;
+ uint32_t mtime = 0;
+ uint32_t mtime_nsec = 0;
+
+ for (i = 0; i < child_count; i++) {
+ if (!replies[i].valid || replies[i].op_ret != 0)
+ continue;
+ if ((mtime < replies[i].poststat.ia_mtime) ||
+ ((mtime == replies[i].poststat.ia_mtime) &&
+ (mtime_nsec < replies[i].poststat.ia_mtime_nsec))) {
+ src = i;
+ mtime = replies[i].poststat.ia_mtime;
+ mtime_nsec = replies[i].poststat.ia_mtime_nsec;
+ } else if ((mtime == replies[i].poststat.ia_mtime) &&
+ (mtime_nsec == replies[i].poststat.ia_mtime_nsec)) {
+ src = -1;
+ }
+ }
+ return src;
}
int
-afr_build_pending_matrix (char **pending_key, int32_t **pending_matrix,
- dict_t *xattr[], afr_transaction_type type,
- size_t child_count)
-{
- /* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. */
- int32_t pending[3] = {0,};
- void *pending_raw = NULL;
- int ret = -1;
- int i = 0;
- int j = 0;
- int k = 0;
- unsigned char *ignorant_subvols = NULL;
-
- ignorant_subvols = GF_CALLOC (sizeof (*ignorant_subvols), child_count,
- gf_afr_mt_char);
- if (NULL == ignorant_subvols)
- goto out;
-
- afr_init_pending_matrix (pending_matrix, child_count);
+afr_gfid_split_brain_source(xlator_t *this, struct afr_reply *replies,
+ inode_t *inode, uuid_t pargfid, const char *bname,
+ int src_idx, int child_idx,
+ unsigned char *locked_on, int *src, dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ char g1[64] = {
+ 0,
+ };
+ char g2[64] = {
+ 0,
+ };
+ int up_count = 0;
+ int heal_op = -1;
+ int ret = -1;
+ char *src_brick = NULL;
+
+ *src = -1;
+ priv = this->private;
+ up_count = AFR_COUNT(locked_on, priv->child_count);
+ if (up_count != priv->child_count) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
+ "All the bricks should be up to resolve the gfid split "
+ "barin");
+ if (xdata) {
+ ret = dict_set_sizen_str_sizen(xdata, "gfid-heal-msg",
+ SALL_BRICKS_UP_TO_RESOLVE);
+ if (ret)
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_DICT_SET_FAILED,
+ "Error setting"
+ " gfid-heal-msg dict");
+ }
+ goto out;
+ }
+
+ if (xdata) {
+ ret = dict_get_int32_sizen(xdata, "heal-op", &heal_op);
+ if (ret)
+ goto fav_child;
+ } else {
+ goto fav_child;
+ }
+
+ switch (heal_op) {
+ case GF_SHD_OP_SBRAIN_HEAL_FROM_BIGGER_FILE:
+ *src = afr_gfid_sbrain_source_from_bigger_file(replies,
+ priv->child_count);
+ if (*src == -1) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
+ SNO_BIGGER_FILE);
+ if (xdata) {
+ ret = dict_set_sizen_str_sizen(xdata, "gfid-heal-msg",
+ SNO_BIGGER_FILE);
+ if (ret)
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ AFR_MSG_DICT_SET_FAILED,
+ "Error"
+ " setting gfid-heal-msg dict");
+ }
+ }
+ break;
+
+ case GF_SHD_OP_SBRAIN_HEAL_FROM_LATEST_MTIME:
+ *src = afr_gfid_sbrain_source_from_latest_mtime(replies,
+ priv->child_count);
+ if (*src == -1) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
+ SNO_DIFF_IN_MTIME);
+ if (xdata) {
+ ret = dict_set_sizen_str_sizen(xdata, "gfid-heal-msg",
+ SNO_DIFF_IN_MTIME);
+ if (ret)
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ AFR_MSG_DICT_SET_FAILED,
+ "Error"
+ "setting gfid-heal-msg dict");
+ }
+ }
+ break;
+
+ case GF_SHD_OP_SBRAIN_HEAL_FROM_BRICK:
+ ret = dict_get_str_sizen(xdata, "child-name", &src_brick);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
+ "Error getting the source "
+ "brick");
+ break;
+ }
+ *src = afr_gfid_sbrain_source_from_src_brick(this, replies,
+ src_brick);
+ if (*src == -1) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
+ SERROR_GETTING_SRC_BRICK);
+ if (xdata) {
+ ret = dict_set_sizen_str_sizen(xdata, "gfid-heal-msg",
+ SERROR_GETTING_SRC_BRICK);
+ if (ret)
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ AFR_MSG_DICT_SET_FAILED,
+ "Error"
+ " setting gfid-heal-msg dict");
+ }
+ }
+ break;
- for (i = 0; i < child_count; i++) {
- pending_raw = NULL;
+ default:
+ break;
+ }
+ goto out;
+
+fav_child:
+ switch (priv->fav_child_policy) {
+ case AFR_FAV_CHILD_BY_SIZE:
+ *src = afr_sh_fav_by_size(this, replies, inode);
+ break;
+ case AFR_FAV_CHILD_BY_MTIME:
+ *src = afr_sh_fav_by_mtime(this, replies, inode);
+ break;
+ case AFR_FAV_CHILD_BY_CTIME:
+ *src = afr_sh_fav_by_ctime(this, replies, inode);
+ break;
+ case AFR_FAV_CHILD_BY_MAJORITY:
+ if (priv->child_count != 2)
+ *src = afr_selfheal_gfid_mismatch_by_majority(
+ replies, priv->child_count);
+ else
+ *src = -1;
+
+ if (*src == -1) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
+ "No majority to resolve "
+ "gfid split brain");
+ }
+ break;
+ default:
+ break;
+ }
- for (j = 0; j < child_count; j++) {
- ret = dict_get_ptr (xattr[i], pending_key[j],
- &pending_raw);
+out:
+ if (*src == -1) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
+ "Gfid mismatch detected for <gfid:%s>/%s>, %s on %s and"
+ " %s on %s.",
+ uuid_utoa(pargfid), bname,
+ uuid_utoa_r(replies[child_idx].poststat.ia_gfid, g1),
+ priv->children[child_idx]->name,
+ uuid_utoa_r(replies[src_idx].poststat.ia_gfid, g2),
+ priv->children[src_idx]->name);
+ gf_event(EVENT_AFR_SPLIT_BRAIN,
+ "client-pid=%d;"
+ "subvol=%s;type=gfid;file="
+ "<gfid:%s>/%s>;count=2;child-%d=%s;gfid-%d=%s;"
+ "child-%d=%s;gfid-%d=%s",
+ this->ctx->cmd_args.client_pid, this->name, uuid_utoa(pargfid),
+ bname, child_idx, priv->children[child_idx]->name, child_idx,
+ uuid_utoa_r(replies[child_idx].poststat.ia_gfid, g1), src_idx,
+ priv->children[src_idx]->name, src_idx,
+ uuid_utoa_r(replies[src_idx].poststat.ia_gfid, g2));
+ return -1;
+ }
+ return 0;
+}
- if (ret != 0) {
- /*
- * There is no xattr present. This means this
- * subvolume should be considered an 'ignorant'
- * subvolume.
- */
+int
+afr_selfheal_post_op_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xattr, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
- ignorant_subvols[i] = 1;
- continue;
- }
+ local = frame->local;
- memcpy (pending, pending_raw, sizeof(pending));
- k = afr_index_for_transaction_type (type);
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+ syncbarrier_wake(&local->barrier);
- pending_matrix[i][j] = ntoh32 (pending[k]);
- }
- }
-
- afr_mark_ignorant_subvols_as_pending (pending_matrix,
- ignorant_subvols,
- child_count);
- GF_FREE (ignorant_subvols);
-out:
- return ret;
+ return 0;
}
-typedef enum {
- AFR_NODE_INNOCENT,
- AFR_NODE_FOOL,
- AFR_NODE_WISE,
- AFR_NODE_INVALID = -1,
-} afr_node_type;
+int
+afr_selfheal_post_op(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ int subvol, dict_t *xattr, dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ loc_t loc = {
+ 0,
+ };
+ int ret = 0;
-typedef struct {
- afr_node_type type;
- int wisdom;
-} afr_node_character;
+ priv = this->private;
+ local = frame->local;
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, inode->gfid);
-static int
-afr_sh_is_innocent (int32_t *array, int child_count)
-{
- int i = 0;
- int ret = 1; /* innocent until proven guilty */
+ local->op_ret = 0;
- for (i = 0; i < child_count; i++) {
- if (array[i]) {
- ret = 0;
- break;
- }
- }
+ STACK_WIND(frame, afr_selfheal_post_op_cbk, priv->children[subvol],
+ priv->children[subvol]->fops->xattrop, &loc,
+ GF_XATTROP_ADD_ARRAY, xattr, xdata);
- return ret;
-}
+ syncbarrier_wait(&local->barrier, 1);
+ if (local->op_ret < 0)
+ ret = -local->op_errno;
+ loc_wipe(&loc);
+ local->op_ret = 0;
-static int
-afr_sh_is_fool (int32_t *array, int i, int child_count)
-{
- return array[i]; /* fool if accuses itself */
+ return ret;
}
-
-static int
-afr_sh_is_wise (int32_t *array, int i, int child_count)
-{
- return !array[i]; /* wise if does not accuse itself */
+int
+afr_check_stale_error(struct afr_reply *replies, afr_private_t *priv)
+{
+ int i = 0;
+ int op_errno = 0;
+ int tmp_errno = 0;
+ int stale_count = 0;
+
+ for (i = 0; i < priv->child_count; i++) {
+ tmp_errno = replies[i].op_errno;
+ if (tmp_errno == ENOENT || tmp_errno == ESTALE) {
+ op_errno = afr_higher_errno(op_errno, tmp_errno);
+ stale_count++;
+ }
+ }
+ if (stale_count != priv->child_count)
+ return -ENOTCONN;
+ else
+ return -op_errno;
}
-
-static int
-afr_sh_all_nodes_innocent (afr_node_character *characters,
- int child_count)
+int
+afr_sh_generic_fop_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *pre,
+ struct iatt *post, dict_t *xdata)
{
- int i = 0;
- int ret = 1;
+ int i = (long)cookie;
+ afr_local_t *local = NULL;
- for (i = 0; i < child_count; i++) {
- if (characters[i].type != AFR_NODE_INNOCENT) {
- ret = 0;
- break;
- }
- }
+ local = frame->local;
- return ret;
-}
+ local->replies[i].valid = 1;
+ local->replies[i].op_ret = op_ret;
+ local->replies[i].op_errno = op_errno;
+ if (pre)
+ local->replies[i].prestat = *pre;
+ if (post)
+ local->replies[i].poststat = *post;
+ if (xdata)
+ local->replies[i].xdata = dict_ref(xdata);
+ syncbarrier_wake(&local->barrier);
-static int
-afr_sh_wise_nodes_exist (afr_node_character *characters, int child_count)
-{
- int i = 0;
- int ret = 0;
+ return 0;
+}
- for (i = 0; i < child_count; i++) {
- if (characters[i].type == AFR_NODE_WISE) {
- ret = 1;
- break;
- }
+int
+afr_selfheal_restore_time(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ int source, unsigned char *healed_sinks,
+ struct afr_reply *replies)
+{
+ loc_t loc = {
+ 0,
+ };
+
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, inode->gfid);
+
+ AFR_ONLIST(healed_sinks, frame, afr_sh_generic_fop_cbk, setattr, &loc,
+ &replies[source].poststat,
+ (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME | GF_SET_ATTR_CTIME),
+ NULL);
+
+ loc_wipe(&loc);
+
+ return 0;
+}
+
+dict_t *
+afr_selfheal_output_xattr(xlator_t *this, gf_boolean_t is_full_crawl,
+ afr_transaction_type type, int *output_dirty,
+ int **output_matrix, int subvol,
+ int **full_heal_mtx_out)
+{
+ int j = 0;
+ int idx = 0;
+ int d_idx = 0;
+ int ret = 0;
+ int *raw = 0;
+ dict_t *xattr = NULL;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+ idx = afr_index_for_transaction_type(type);
+ d_idx = afr_index_for_transaction_type(AFR_DATA_TRANSACTION);
+
+ xattr = dict_new();
+ if (!xattr)
+ return NULL;
+
+ /* clear dirty */
+ raw = GF_CALLOC(sizeof(int), AFR_NUM_CHANGE_LOGS, gf_afr_mt_int32_t);
+ if (!raw)
+ goto err;
+
+ raw[idx] = hton32(output_dirty[subvol]);
+ ret = dict_set_bin(xattr, AFR_DIRTY, raw,
+ sizeof(int) * AFR_NUM_CHANGE_LOGS);
+ if (ret) {
+ GF_FREE(raw);
+ goto err;
+ }
+
+ /* clear/set pending */
+ for (j = 0; j < priv->child_count; j++) {
+ raw = GF_CALLOC(sizeof(int), AFR_NUM_CHANGE_LOGS, gf_afr_mt_int32_t);
+ if (!raw)
+ goto err;
+
+ raw[idx] = hton32(output_matrix[subvol][j]);
+ if (is_full_crawl)
+ raw[d_idx] = hton32(full_heal_mtx_out[subvol][j]);
+
+ ret = dict_set_bin(xattr, priv->pending_key[j], raw,
+ sizeof(int) * AFR_NUM_CHANGE_LOGS);
+ if (ret) {
+ GF_FREE(raw);
+ goto err;
}
+ }
- return ret;
+ return xattr;
+err:
+ if (xattr)
+ dict_unref(xattr);
+ return NULL;
}
+int
+afr_selfheal_undo_pending(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ unsigned char *sources, unsigned char *sinks,
+ unsigned char *healed_sinks,
+ unsigned char *undid_pending,
+ afr_transaction_type type, struct afr_reply *replies,
+ unsigned char *locked_on)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int i = 0;
+ int j = 0;
+ unsigned char *pending = NULL;
+ int *input_dirty = NULL;
+ int **input_matrix = NULL;
+ int **full_heal_mtx_in = NULL;
+ int **full_heal_mtx_out = NULL;
+ int *output_dirty = NULL;
+ int **output_matrix = NULL;
+ dict_t *xattr = NULL;
+ dict_t *xdata = NULL;
+
+ priv = this->private;
+ local = frame->local;
+
+ pending = alloca0(priv->child_count);
+
+ input_dirty = alloca0(priv->child_count * sizeof(int));
+ input_matrix = ALLOC_MATRIX(priv->child_count, int);
+ full_heal_mtx_in = ALLOC_MATRIX(priv->child_count, int);
+ full_heal_mtx_out = ALLOC_MATRIX(priv->child_count, int);
+ output_dirty = alloca0(priv->child_count * sizeof(int));
+ output_matrix = ALLOC_MATRIX(priv->child_count, int);
+
+ xdata = dict_new();
+ if (!xdata)
+ return -1;
-/*
- * The 'wisdom' of a wise node is 0 if any other wise node accuses it.
- * It is 1 if no other wise node accuses it.
- * Only wise nodes with wisdom 1 are sources.
- *
- * If no nodes with wisdom 1 exist, a split-brain has occured.
- */
-
-static void
-afr_sh_compute_wisdom (int32_t *pending_matrix[],
- afr_node_character characters[], int child_count)
-{
- int i = 0;
- int j = 0;
+ afr_selfheal_extract_xattr(this, replies, type, input_dirty, input_matrix);
- for (i = 0; i < child_count; i++) {
- if (characters[i].type == AFR_NODE_WISE) {
- characters[i].wisdom = 1;
+ if (local->need_full_crawl)
+ afr_selfheal_extract_xattr(this, replies, AFR_DATA_TRANSACTION, NULL,
+ full_heal_mtx_in);
- for (j = 0; j < child_count; j++) {
- if ((characters[j].type == AFR_NODE_WISE)
- && pending_matrix[j][i]) {
+ for (i = 0; i < priv->child_count; i++)
+ if (sinks[i] && !healed_sinks[i])
+ pending[i] = 1;
- characters[i].wisdom = 0;
- }
- }
- }
+ for (i = 0; i < priv->child_count; i++) {
+ for (j = 0; j < priv->child_count; j++) {
+ if (pending[j]) {
+ output_matrix[i][j] = 1;
+ if (type == AFR_ENTRY_TRANSACTION)
+ full_heal_mtx_out[i][j] = 1;
+ } else if (locked_on[j]) {
+ output_matrix[i][j] = -input_matrix[i][j];
+ if (type == AFR_ENTRY_TRANSACTION)
+ full_heal_mtx_out[i][j] = -full_heal_mtx_in[i][j];
+ }
}
-}
+ }
+ for (i = 0; i < priv->child_count; i++) {
+ if (!pending[i])
+ output_dirty[i] = -input_dirty[i];
+ }
-static int
-afr_sh_wise_nodes_conflict (afr_node_character *characters,
- int child_count)
-{
- int i = 0;
- int ret = 1;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!locked_on[i])
+ /* perform post-op only on subvols we had locked
+ and inspected on.
+ */
+ continue;
+ if (undid_pending[i])
+ /* We already unset the pending xattrs in
+ * _afr_fav_child_reset_sink_xattrs(). */
+ continue;
- for (i = 0; i < child_count; i++) {
- if ((characters[i].type == AFR_NODE_WISE)
- && characters[i].wisdom == 1) {
-
- /* There is atleast one bona-fide wise node */
- ret = 0;
- break;
- }
+ xattr = afr_selfheal_output_xattr(this, local->need_full_crawl, type,
+ output_dirty, output_matrix, i,
+ full_heal_mtx_out);
+ if (!xattr) {
+ continue;
}
- return ret;
-}
+ if ((type == AFR_ENTRY_TRANSACTION) && (priv->esh_granular)) {
+ if (xdata && dict_set_int8(xdata, GF_XATTROP_PURGE_INDEX, 1))
+ gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_DICT_SET_FAILED,
+ "Failed to set"
+ " dict value for %s",
+ GF_XATTROP_PURGE_INDEX);
+ }
+ afr_selfheal_post_op(frame, this, inode, i, xattr, xdata);
+ dict_unref(xattr);
+ }
-static int
-afr_sh_mark_wisest_as_sources (int sources[],
- afr_node_character *characters,
- int child_count)
-{
- int nsources = 0;
- int i = 0;
+ if (xdata)
+ dict_unref(xdata);
- for (i = 0; i < child_count; i++) {
- if (characters[i].wisdom == 1) {
- sources[i] = 1;
- nsources++;
- }
- }
-
- return nsources;
+ return 0;
}
-static void
-afr_compute_witness_of_fools (int32_t *witnesses, int32_t **pending_matrix,
- afr_node_character *characters,
- int32_t child_count)
-{
- int i = 0;
- int j = 0;
- int witness = 0;
-
- GF_ASSERT (witnesses);
- GF_ASSERT (pending_matrix);
- GF_ASSERT (characters);
- GF_ASSERT (child_count > 0);
-
- for (i = 0; i < child_count; i++) {
- if (characters[i].type != AFR_NODE_FOOL)
- continue;
-
- witness = 0;
- for (j = 0; j < child_count; j++) {
- if (i == j)
- continue;
- witness += pending_matrix[i][j];
- }
- witnesses[i] = witness;
- }
+void
+afr_reply_copy(struct afr_reply *dst, struct afr_reply *src)
+{
+ dict_t *xdata = NULL;
+
+ dst->valid = src->valid;
+ dst->op_ret = src->op_ret;
+ dst->op_errno = src->op_errno;
+ dst->prestat = src->prestat;
+ dst->poststat = src->poststat;
+ dst->preparent = src->preparent;
+ dst->postparent = src->postparent;
+ dst->preparent2 = src->preparent2;
+ dst->postparent2 = src->postparent2;
+ if (src->xdata)
+ xdata = dict_ref(src->xdata);
+ else
+ xdata = NULL;
+ if (dst->xdata)
+ dict_unref(dst->xdata);
+ dst->xdata = xdata;
+ if (xdata && dict_get_str_boolean(xdata, "fips-mode-rchecksum",
+ _gf_false) == _gf_true) {
+ memcpy(dst->checksum, src->checksum, SHA256_DIGEST_LENGTH);
+ } else {
+ memcpy(dst->checksum, src->checksum, MD5_DIGEST_LENGTH);
+ }
+ dst->fips_mode_rchecksum = src->fips_mode_rchecksum;
}
-static int32_t
-afr_find_biggest_witness_among_fools (int32_t *witnesses,
- afr_node_character *characters,
- int32_t child_count)
+void
+afr_replies_copy(struct afr_reply *dst, struct afr_reply *src, int count)
{
- int i = 0;
- int biggest_witness = -1;
-
- GF_ASSERT (witnesses);
- GF_ASSERT (characters);
- GF_ASSERT (child_count > 0);
+ int i = 0;
- for (i = 0; i < child_count; i++) {
- if (characters[i].type != AFR_NODE_FOOL)
- continue;
+ if (dst == src)
+ return;
- if (biggest_witness < witnesses[i])
- biggest_witness = witnesses[i];
- }
- return biggest_witness;
+ for (i = 0; i < count; i++) {
+ afr_reply_copy(&dst[i], &src[i]);
+ }
}
int
-afr_mark_fool_as_source_by_witness (int32_t *sources, int32_t *witnesses,
- afr_node_character *characters,
- int32_t child_count, int32_t witness)
+afr_selfheal_fill_dirty(xlator_t *this, int *dirty, int subvol, int idx,
+ dict_t *xdata)
{
- int i = 0;
- int nsources = 0;
-
- GF_ASSERT (sources);
- GF_ASSERT (witnesses);
- GF_ASSERT (characters);
- GF_ASSERT (child_count > 0);
-
- for (i = 0; i < child_count; i++) {
- if (characters[i].type != AFR_NODE_FOOL)
- continue;
+ void *pending_raw = NULL;
+ int pending[3] = {
+ 0,
+ };
- if (witness == witnesses[i]) {
- sources[i] = 1;
- nsources++;
- }
- }
- return nsources;
-}
+ if (!dirty)
+ return 0;
-static int
-afr_mark_biggest_of_fools_as_source (int32_t *sources, int32_t **pending_matrix,
- afr_node_character *characters,
- int child_count)
-{
- int32_t biggest_witness = 0;
- int nsources = 0;
- int32_t *witnesses = NULL;
+ if (dict_get_ptr(xdata, AFR_DIRTY, &pending_raw))
+ return -1;
- GF_ASSERT (child_count > 0);
+ if (!pending_raw)
+ return -1;
- witnesses = GF_CALLOC (child_count, sizeof (*witnesses),
- gf_afr_mt_int32_t);
- if (NULL == witnesses) {
- nsources = -1;
- goto out;
- }
+ memcpy(pending, pending_raw, sizeof(pending));
- afr_compute_witness_of_fools (witnesses, pending_matrix, characters,
- child_count);
- biggest_witness = afr_find_biggest_witness_among_fools (witnesses,
- characters,
- child_count);
- nsources = afr_mark_fool_as_source_by_witness (sources, witnesses,
- characters, child_count,
- biggest_witness);
-out:
- if (witnesses)
- GF_FREE (witnesses);
- return nsources;
-}
+ dirty[subvol] = ntoh32(pending[idx]);
-int
-afr_mark_child_as_source_by_uid (int32_t *sources, struct iatt *bufs,
- int32_t *valid_children, int child_count,
- uint32_t uid)
-{
- int i = 0;
- int nsources = 0;
- int child = 0;
-
- GF_ASSERT (bufs);
- GF_ASSERT (valid_children);
- GF_ASSERT (sources);
- GF_ASSERT (child_count > 0);
-
- for (i = 0; i < child_count; i++) {
- if (-1 == valid_children[i])
- continue;
-
- child = valid_children[i];
- if (uid == bufs[child].ia_uid) {
- sources[child] = 1;
- nsources++;
- }
- }
- return nsources;
+ return 0;
}
int
-afr_get_child_with_lowest_uid (struct iatt *bufs, int32_t *valid_children,
- int child_count)
-{
- int i = 0;
- int smallest = -1;
- int child = 0;
-
- GF_ASSERT (bufs);
- GF_ASSERT (valid_children);
- GF_ASSERT (child_count > 0);
-
- for (i = 0; i < child_count; i++) {
- if (-1 == valid_children[i])
- continue;
- child = valid_children[i];
- if ((smallest == -1) ||
- (bufs[child].ia_uid < bufs[smallest].ia_uid)) {
- smallest = child;
- }
- }
- return smallest;
-}
-
-static int
-afr_sh_mark_lowest_uid_as_source (struct iatt *bufs, int32_t *valid_children,
- int child_count, int32_t *sources)
+afr_selfheal_fill_matrix(xlator_t *this, int **matrix, int subvol, int idx,
+ dict_t *xdata)
{
- int nsources = 0;
- int smallest = 0;
+ int i = 0;
+ void *pending_raw = NULL;
+ int pending[3] = {
+ 0,
+ };
+ afr_private_t *priv = NULL;
- smallest = afr_get_child_with_lowest_uid (bufs, valid_children,
- child_count);
- if (smallest < 0) {
- nsources = -1;
- goto out;
- }
- nsources = afr_mark_child_as_source_by_uid (sources, bufs,
- valid_children, child_count,
- bufs[smallest].ia_uid);
-out:
- return nsources;
-}
+ priv = this->private;
-char *
-afr_get_character_str (afr_node_type type)
-{
- char *character = NULL;
+ if (!matrix)
+ return 0;
- switch (type) {
- case AFR_NODE_INNOCENT:
- character = "innocent";
- break;
- case AFR_NODE_FOOL:
- character = "fool";
- break;
- case AFR_NODE_WISE:
- character = "wise";
- break;
- default:
- character = "invalid";
- break;
- }
- return character;
-}
+ for (i = 0; i < priv->child_count; i++) {
+ if (dict_get_ptr(xdata, priv->pending_key[i], &pending_raw))
+ continue;
-afr_node_type
-afr_find_child_character_type (int32_t *pending_row, int32_t child,
- int32_t child_count, const char *xlator_name)
-{
- afr_node_type type = AFR_NODE_INVALID;
+ if (!pending_raw)
+ continue;
- GF_ASSERT (pending_row);
- GF_ASSERT (child_count > 0);
- GF_ASSERT ((child >= 0) && (child < child_count));
+ memcpy(pending, pending_raw, sizeof(pending));
- if (afr_sh_is_innocent (pending_row, child_count))
- type = AFR_NODE_INNOCENT;
- else if (afr_sh_is_fool (pending_row, child, child_count))
- type = AFR_NODE_FOOL;
- else if (afr_sh_is_wise (pending_row, child, child_count))
- type = AFR_NODE_WISE;
- else
- GF_ASSERT (0);
+ matrix[subvol][i] = ntoh32(pending[idx]);
+ }
- gf_log (xlator_name, GF_LOG_DEBUG, "child %d character %s",
- child, afr_get_character_str (type));
- return type;
+ return 0;
}
int
-afr_build_sources (xlator_t *this, dict_t **xattr, struct iatt *bufs,
- int32_t **pending_matrix, int32_t *sources,
- int32_t *success_children, afr_transaction_type type,
- afr_source_flags_t *flags)
+afr_selfheal_extract_xattr(xlator_t *this, struct afr_reply *replies,
+ afr_transaction_type type, int *dirty, int **matrix)
{
- afr_private_t *priv = NULL;
- afr_self_heal_type sh_type = AFR_SELF_HEAL_INVALID;
- int nsources = -1;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ dict_t *xdata = NULL;
+ int idx = -1;
- priv = this->private;
+ idx = afr_index_for_transaction_type(type);
- if (afr_get_children_count (success_children, priv->child_count) == 0)
- goto out;
+ priv = this->private;
- afr_build_pending_matrix (priv->pending_key, pending_matrix,
- xattr, type, priv->child_count);
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid || replies[i].op_ret != 0)
+ continue;
- sh_type = afr_self_heal_type_for_transaction (type);
- if (AFR_SELF_HEAL_INVALID == sh_type)
- goto out;
+ if (!replies[i].xdata)
+ continue;
- afr_sh_print_pending_matrix (pending_matrix, this);
+ xdata = replies[i].xdata;
- nsources = afr_mark_sources (sources, pending_matrix, bufs,
- priv->child_count, sh_type,
- success_children, this->name, flags);
-out:
- return nsources;
-}
+ afr_selfheal_fill_dirty(this, dirty, i, idx, xdata);
+ afr_selfheal_fill_matrix(this, matrix, i, idx, xdata);
+ }
-void
-afr_mark_valid_children_sources (int32_t *sources, int32_t *valid_children,
- unsigned int child_count)
-{
- int i = 0;
- memset (sources, 0, sizeof (*sources) * child_count);
- for (i = 0; i < child_count; i++) {
- if (valid_children[i] == -1)
- break;
- sources[valid_children[i]] = 1;
- }
+ return 0;
}
-/**
- * mark_sources: Mark all 'source' nodes and return number of source
- * nodes found
- *
- * A node (a row in the pending matrix) belongs to one of
- * three categories:
- *
- * M is the pending matrix.
- *
- * 'innocent' - M[i] is all zeroes
- * 'fool' - M[i] has i'th element = 1 (self-reference)
- * 'wise' - M[i] has i'th element = 0, others are 1 or 0.
- *
- * All 'innocent' nodes are sinks. If all nodes are innocent, no self-heal is
- * needed.
- *
- * A 'wise' node can be a source. If two 'wise' nodes conflict, it is
- * a split-brain. If one wise node refers to the other but the other doesn't
- * refer back, the referrer is a source.
+/*
+ * If by chance there are multiple sources with differing sizes, select
+ * the largest file as the source.
*
- * All fools are sinks, unless there are no 'wise' nodes. if 'allfools' is NULL,
- * biggest fool(s) is/are marked as source.
+ * This can happen if data was directly modified in the backend or for snapshots
*/
-
-int
-afr_mark_sources (int32_t *sources, int32_t **pending_matrix, struct iatt *bufs,
- int32_t child_count, afr_self_heal_type type,
- int32_t *valid_children, const char *xlator_name,
- afr_source_flags_t *flags)
+void
+afr_mark_largest_file_as_source(xlator_t *this, unsigned char *sources,
+ struct afr_reply *replies)
{
- /* stores the 'characters' (innocent, fool, wise) of the nodes */
-
- afr_node_character *characters = NULL;
- int i = 0;
- int nsources = -1;
- xlator_t *this = NULL;
+ int i = 0;
+ afr_private_t *priv = NULL;
+ uint64_t size = 0;
- characters = GF_CALLOC (sizeof (afr_node_character),
- child_count, gf_afr_mt_afr_node_character);
- if (!characters)
- goto out;
-
- this = THIS;
-
- /* start clean */
- for (i = 0; i < child_count; i++) {
- sources[i] = 0;
+ /* Find source with biggest file size */
+ priv = this->private;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!sources[i])
+ continue;
+ if (!replies[i].valid || replies[i].op_ret != 0) {
+ sources[i] = 0;
+ continue;
}
-
- nsources = 0;
- for (i = 0; i < child_count; i++) {
- characters[i].type =
- afr_find_child_character_type (pending_matrix[i], i,
- child_count,
- xlator_name);
- if (AFR_NODE_INVALID == characters[i].type)
- gf_log (xlator_name, GF_LOG_WARNING,
- "child %d had invalid xattrs", i);
+ if (size <= replies[i].poststat.ia_size) {
+ size = replies[i].poststat.ia_size;
}
+ }
- if ((type == AFR_SELF_HEAL_METADATA)
- && afr_sh_all_nodes_innocent (characters, child_count)) {
+ /* Mark sources with less size as not source */
+ for (i = 0; i < priv->child_count; i++) {
+ if (!sources[i])
+ continue;
+ if (size > replies[i].poststat.ia_size)
+ sources[i] = 0;
+ }
+}
- nsources = afr_sh_mark_lowest_uid_as_source (bufs,
- valid_children,
- child_count,
- sources);
- goto out;
- }
+void
+afr_mark_latest_mtime_file_as_source(xlator_t *this, unsigned char *sources,
+ struct afr_reply *replies)
+{
+ int i = 0;
+ afr_private_t *priv = NULL;
+ uint32_t mtime = 0;
+ uint32_t mtime_nsec = 0;
+
+ priv = this->private;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!sources[i])
+ continue;
+ if (!replies[i].valid || replies[i].op_ret != 0) {
+ sources[i] = 0;
+ continue;
+ }
+ if ((mtime < replies[i].poststat.ia_mtime) ||
+ ((mtime == replies[i].poststat.ia_mtime) &&
+ (mtime_nsec < replies[i].poststat.ia_mtime_nsec))) {
+ mtime = replies[i].poststat.ia_mtime;
+ mtime_nsec = replies[i].poststat.ia_mtime_nsec;
+ }
+ }
+ for (i = 0; i < priv->child_count; i++) {
+ if (!sources[i])
+ continue;
+ if ((mtime > replies[i].poststat.ia_mtime) ||
+ ((mtime == replies[i].poststat.ia_mtime) &&
+ (mtime_nsec > replies[i].poststat.ia_mtime_nsec))) {
+ sources[i] = 0;
+ }
+ }
+}
- if (afr_sh_wise_nodes_exist (characters, child_count)) {
- afr_sh_compute_wisdom (pending_matrix, characters, child_count);
-
- if (afr_sh_wise_nodes_conflict (characters, child_count)) {
- /* split-brain */
- gf_log (this->name, GF_LOG_INFO,
- "split-brain possible, no source detected");
- if (flags)
- *flags |= AFR_SPLIT_BRAIN;
- nsources = -1;
-
- } else {
- nsources = afr_sh_mark_wisest_as_sources (sources,
- characters,
- child_count);
- }
- } else {
- if (flags) {
- *flags |= AFR_ALL_FOOLS;
- nsources = -1;
- goto out;
- }
- nsources = afr_mark_biggest_of_fools_as_source (sources,
- pending_matrix,
- characters,
- child_count);
- }
+void
+afr_mark_active_sinks(xlator_t *this, unsigned char *sources,
+ unsigned char *locked_on, unsigned char *sinks)
+{
+ int i = 0;
+ afr_private_t *priv = NULL;
-out:
- if (nsources == 0)
- afr_mark_valid_children_sources (sources, valid_children,
- child_count);
- if (characters)
- GF_FREE (characters);
+ priv = this->private;
- gf_log (this->name, GF_LOG_DEBUG, "Number of sources: %d", nsources);
- return nsources;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!sources[i] && locked_on[i])
+ sinks[i] = 1;
+ else
+ sinks[i] = 0;
+ }
}
-void
-afr_sh_pending_to_delta (afr_private_t *priv, dict_t **xattr,
- int32_t *delta_matrix[], int success[],
- int child_count, afr_transaction_type type)
-{
- /* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. */
- int32_t pending[3] = {0,};
- void *pending_raw = NULL;
- int ret = 0;
- int i = 0;
- int j = 0;
- int k = 0;
-
- /* start clean */
- for (i = 0; i < child_count; i++) {
- for (j = 0; j < child_count; j++) {
- delta_matrix[i][j] = 0;
- }
- }
+gf_boolean_t
+afr_dict_contains_heal_op(call_frame_t *frame)
+{
+ afr_local_t *local = NULL;
+ dict_t *xdata_req = NULL;
+ int ret = 0;
+ int heal_op = -1;
- for (i = 0; i < child_count; i++) {
- if (pending_raw)
- pending_raw = NULL;
+ local = frame->local;
+ xdata_req = local->xdata_req;
+ ret = dict_get_int32_sizen(xdata_req, "heal-op", &heal_op);
+ if (ret)
+ return _gf_false;
+ if (local->xdata_rsp == NULL) {
+ local->xdata_rsp = dict_new();
+ if (!local->xdata_rsp)
+ return _gf_true;
+ }
+ ret = dict_set_sizen_str_sizen(local->xdata_rsp, "sh-fail-msg",
+ SFILE_NOT_IN_SPLIT_BRAIN);
- for (j = 0; j < child_count; j++) {
- ret = dict_get_ptr (xattr[i], priv->pending_key[j],
- &pending_raw);
- if (ret < 0)
- gf_log (THIS->name, GF_LOG_DEBUG,
- "Unable to get dict value.");
- if (!success[j])
- continue;
+ return _gf_true;
+}
- k = afr_index_for_transaction_type (type);
+gf_boolean_t
+afr_can_decide_split_brain_source_sinks(struct afr_reply *replies,
+ int child_count)
+{
+ int i = 0;
- if (pending_raw != NULL) {
- memcpy (pending, pending_raw, sizeof(pending));
- delta_matrix[i][j] = -(ntoh32 (pending[k]));
- } else {
- delta_matrix[i][j] = 0;
- }
+ for (i = 0; i < child_count; i++)
+ if (replies[i].valid != 1 || replies[i].op_ret != 0)
+ return _gf_false;
- }
- }
+ return _gf_true;
}
+int
+afr_mark_split_brain_source_sinks_by_heal_op(
+ call_frame_t *frame, xlator_t *this, unsigned char *sources,
+ unsigned char *sinks, unsigned char *healed_sinks, unsigned char *locked_on,
+ struct afr_reply *replies, afr_transaction_type type, int heal_op)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ dict_t *xdata_req = NULL;
+ dict_t *xdata_rsp = NULL;
+ int ret = 0;
+ int i = 0;
+ char *name = NULL;
+ int source = -1;
+
+ local = frame->local;
+ priv = this->private;
+ xdata_req = local->xdata_req;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (locked_on[i])
+ if (sources[i] || !sinks[i] || !healed_sinks[i]) {
+ ret = -1;
+ goto out;
+ }
+ }
+ if (local->xdata_rsp == NULL) {
+ local->xdata_rsp = dict_new();
+ if (!local->xdata_rsp) {
+ ret = -1;
+ goto out;
+ }
+ }
+ xdata_rsp = local->xdata_rsp;
+
+ if (!afr_can_decide_split_brain_source_sinks(replies, priv->child_count)) {
+ ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg",
+ SBRAIN_HEAL_NO_GO_MSG);
+ ret = -1;
+ goto out;
+ }
+
+ for (i = 0; i < priv->child_count; i++)
+ if (locked_on[i])
+ sources[i] = 1;
+ switch (heal_op) {
+ case GF_SHD_OP_SBRAIN_HEAL_FROM_BIGGER_FILE:
+ if (type == AFR_METADATA_TRANSACTION) {
+ ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg",
+ SUSE_SOURCE_BRICK_TO_HEAL);
+ if (!ret)
+ ret = -1;
+ goto out;
+ }
+ afr_mark_largest_file_as_source(this, sources, replies);
+ if (AFR_COUNT(sources, priv->child_count) != 1) {
+ ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg",
+ SNO_BIGGER_FILE);
+ if (!ret)
+ ret = -1;
+ goto out;
+ }
+ break;
+ case GF_SHD_OP_SBRAIN_HEAL_FROM_LATEST_MTIME:
+ if (type == AFR_METADATA_TRANSACTION) {
+ ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg",
+ SUSE_SOURCE_BRICK_TO_HEAL);
+ if (!ret)
+ ret = -1;
+ goto out;
+ }
+ afr_mark_latest_mtime_file_as_source(this, sources, replies);
+ if (AFR_COUNT(sources, priv->child_count) != 1) {
+ ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg",
+ SNO_DIFF_IN_MTIME);
+ if (!ret)
+ ret = -1;
+ goto out;
+ }
+ break;
+ case GF_SHD_OP_SBRAIN_HEAL_FROM_BRICK:
+ ret = dict_get_str_sizen(xdata_req, "child-name", &name);
+ if (ret)
+ goto out;
+ source = afr_get_child_index_from_name(this, name);
+ if (source < 0) {
+ ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg",
+ SINVALID_BRICK_NAME);
+ if (!ret)
+ ret = -1;
+ goto out;
+ }
+ if (locked_on[source] != 1) {
+ ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg",
+ SBRICK_IS_NOT_UP);
+ if (!ret)
+ ret = -1;
+ goto out;
+ }
+ memset(sources, 0, sizeof(*sources) * priv->child_count);
+ sources[source] = 1;
+ break;
+ default:
+ ret = -1;
+ goto out;
+ }
+ for (i = 0; i < priv->child_count; i++) {
+ if (sources[i]) {
+ source = i;
+ break;
+ }
+ }
+ sinks[source] = 0;
+ healed_sinks[source] = 0;
+ ret = source;
+out:
+ if (ret < 0)
+ memset(sources, 0, sizeof(*sources) * priv->child_count);
+ return ret;
+}
int
-afr_sh_delta_to_xattr (afr_private_t *priv,
- int32_t *delta_matrix[], dict_t *xattr[],
- int child_count, afr_transaction_type type)
-{
- int i = 0;
- int j = 0;
- int k = 0;
- int ret = 0;
- int32_t *pending = NULL;
-
- for (i = 0; i < child_count; i++) {
- if (!xattr[i])
- continue;
-
- for (j = 0; j < child_count; j++) {
- pending = GF_CALLOC (sizeof (int32_t), 3,
- gf_afr_mt_int32_t);
-
- if (!pending)
- continue;
- /* 3 = data+metadata+entry */
-
- k = afr_index_for_transaction_type (type);
-
- pending[k] = hton32 (delta_matrix[i][j]);
-
- ret = dict_set_bin (xattr[i], priv->pending_key[j],
- pending,
- 3 * sizeof (int32_t));
- if (ret < 0)
- gf_log (THIS->name, GF_LOG_WARNING,
- "Unable to set dict value.");
+afr_sh_fav_by_majority(xlator_t *this, struct afr_reply *replies,
+ inode_t *inode)
+{
+ afr_private_t *priv;
+ int vote_count = -1;
+ int fav_child = -1;
+ int i = 0;
+ int k = 0;
+
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (replies[i].valid == 1) {
+ gf_msg_debug(this->name, 0,
+ "Child:%s mtime_sec = %" PRId64 ", size = %" PRIu64
+ " for gfid %s",
+ priv->children[i]->name, replies[i].poststat.ia_mtime,
+ replies[i].poststat.ia_size, uuid_utoa(inode->gfid));
+ vote_count = 0;
+ for (k = 0; k < priv->child_count; k++) {
+ if ((replies[k].poststat.ia_mtime ==
+ replies[i].poststat.ia_mtime) &&
+ (replies[k].poststat.ia_size ==
+ replies[i].poststat.ia_size)) {
+ vote_count++;
}
+ }
+ if (vote_count > priv->child_count / 2) {
+ fav_child = i;
+ break;
+ }
}
- return 0;
+ }
+ return fav_child;
}
-
+/*
+ * afr_sh_fav_by_mtime: Choose favorite child by mtime.
+ */
int
-afr_sh_has_metadata_pending (dict_t *xattr, xlator_t *this)
-{
- /* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. */
- int32_t pending[3] = {0,};
- void *pending_raw = NULL;
- afr_private_t *priv = NULL;
- int ret = -1;
- int i = 0;
- int j = 0;
-
- priv = this->private;
-
- for (i = 0; i < priv->child_count; i++) {
- ret = dict_get_ptr (xattr, priv->pending_key[i],
- &pending_raw);
-
- if (ret != 0)
- return 0;
-
- memcpy (pending, pending_raw, sizeof(pending));
- j = afr_index_for_transaction_type (AFR_METADATA_TRANSACTION);
-
- if (pending[j])
- return 1;
- }
-
- return 0;
+afr_sh_fav_by_mtime(xlator_t *this, struct afr_reply *replies, inode_t *inode)
+{
+ afr_private_t *priv;
+ int fav_child = -1;
+ int i = 0;
+ uint32_t cmp_mtime = 0;
+ uint32_t cmp_mtime_nsec = 0;
+
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (replies[i].valid == 1) {
+ gf_msg_debug(this->name, 0,
+ "Child:%s mtime = %" PRId64
+ ", mtime_nsec = %d for "
+ "gfid %s",
+ priv->children[i]->name, replies[i].poststat.ia_mtime,
+ replies[i].poststat.ia_mtime_nsec,
+ uuid_utoa(inode->gfid));
+ if (replies[i].poststat.ia_mtime > cmp_mtime) {
+ cmp_mtime = replies[i].poststat.ia_mtime;
+ cmp_mtime_nsec = replies[i].poststat.ia_mtime_nsec;
+ fav_child = i;
+ } else if ((replies[i].poststat.ia_mtime == cmp_mtime) &&
+ (replies[i].poststat.ia_mtime_nsec > cmp_mtime_nsec)) {
+ cmp_mtime = replies[i].poststat.ia_mtime;
+ cmp_mtime_nsec = replies[i].poststat.ia_mtime_nsec;
+ fav_child = i;
+ }
+ }
+ }
+ return fav_child;
}
-
+/*
+ * afr_sh_fav_by_ctime: Choose favorite child by ctime.
+ */
int
-afr_sh_has_data_pending (dict_t *xattr, xlator_t *this)
-{
- /* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. */
- int32_t pending[3] = {0,};
- void *pending_raw = NULL;
- afr_private_t *priv = NULL;
- int ret = -1;
- int i = 0;
- int j = 0;
-
- priv = this->private;
-
- for (i = 0; i < priv->child_count; i++) {
- ret = dict_get_ptr (xattr, priv->pending_key[i],
- &pending_raw);
-
- if (ret != 0)
- return 0;
-
- memcpy (pending, pending_raw, sizeof(pending));
- j = afr_index_for_transaction_type (AFR_DATA_TRANSACTION);
-
- if (pending[j])
- return 1;
- }
-
- return 0;
+afr_sh_fav_by_ctime(xlator_t *this, struct afr_reply *replies, inode_t *inode)
+{
+ afr_private_t *priv;
+ int fav_child = -1;
+ int i = 0;
+ uint32_t cmp_ctime = 0;
+ uint32_t cmp_ctime_nsec = 0;
+
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (replies[i].valid == 1) {
+ gf_msg_debug(this->name, 0,
+ "Child:%s ctime = %" PRId64
+ ", ctime_nsec = %d for "
+ "gfid %s",
+ priv->children[i]->name, replies[i].poststat.ia_ctime,
+ replies[i].poststat.ia_ctime_nsec,
+ uuid_utoa(inode->gfid));
+ if (replies[i].poststat.ia_ctime > cmp_ctime) {
+ cmp_ctime = replies[i].poststat.ia_ctime;
+ cmp_ctime_nsec = replies[i].poststat.ia_ctime_nsec;
+ fav_child = i;
+ } else if ((replies[i].poststat.ia_ctime == cmp_ctime) &&
+ (replies[i].poststat.ia_ctime_nsec > cmp_ctime_nsec)) {
+ cmp_ctime = replies[i].poststat.ia_ctime;
+ cmp_ctime_nsec = replies[i].poststat.ia_ctime_nsec;
+ fav_child = i;
+ }
+ }
+ }
+ return fav_child;
}
+/*
+ * afr_sh_fav_by_size: Choose favorite child by size
+ * when not all files are of zero size.
+ */
+int
+afr_sh_fav_by_size(xlator_t *this, struct afr_reply *replies, inode_t *inode)
+{
+ afr_private_t *priv;
+ int fav_child = -1;
+ int i = 0;
+ uint64_t cmp_sz = 0;
+
+ priv = this->private;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid) {
+ continue;
+ }
+ gf_msg_debug(this->name, 0,
+ "Child:%s file size = %" PRIu64 " for gfid %s",
+ priv->children[i]->name, replies[i].poststat.ia_size,
+ uuid_utoa(inode->gfid));
+ if (replies[i].poststat.ia_type == IA_IFDIR) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SBRAIN_FAV_CHILD_POLICY,
+ "Cannot perform selfheal on %s. "
+ "Size policy is not applicable to directories.",
+ uuid_utoa(inode->gfid));
+ break;
+ }
+ if (replies[i].poststat.ia_size > cmp_sz) {
+ cmp_sz = replies[i].poststat.ia_size;
+ fav_child = i;
+ } else if (replies[i].poststat.ia_size == cmp_sz) {
+ fav_child = -1;
+ }
+ }
+ if (fav_child == -1) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
+ "No bigger file");
+ }
+ return fav_child;
+}
int
-afr_sh_has_entry_pending (dict_t *xattr, xlator_t *this)
+afr_sh_get_fav_by_policy(xlator_t *this, struct afr_reply *replies,
+ inode_t *inode, char **policy_str)
{
- /* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. */
- int32_t pending[3] = {0,};
- void *pending_raw = NULL;
- afr_private_t *priv = NULL;
- int ret = -1;
- int i = 0;
- int j = 0;
+ afr_private_t *priv = NULL;
+ int fav_child = -1;
- priv = this->private;
+ priv = this->private;
+ if (!afr_can_decide_split_brain_source_sinks(replies, priv->child_count)) {
+ return -1;
+ }
+
+ switch (priv->fav_child_policy) {
+ case AFR_FAV_CHILD_BY_SIZE:
+ fav_child = afr_sh_fav_by_size(this, replies, inode);
+ if (policy_str && fav_child >= 0) {
+ *policy_str = "SIZE";
+ }
+ break;
+ case AFR_FAV_CHILD_BY_CTIME:
+ fav_child = afr_sh_fav_by_ctime(this, replies, inode);
+ if (policy_str && fav_child >= 0) {
+ *policy_str = "CTIME";
+ }
+ break;
+ case AFR_FAV_CHILD_BY_MTIME:
+ fav_child = afr_sh_fav_by_mtime(this, replies, inode);
+ if (policy_str && fav_child >= 0) {
+ *policy_str = "MTIME";
+ }
+ break;
+ case AFR_FAV_CHILD_BY_MAJORITY:
+ fav_child = afr_sh_fav_by_majority(this, replies, inode);
+ if (policy_str && fav_child >= 0) {
+ *policy_str = "MAJORITY";
+ }
+ break;
+ case AFR_FAV_CHILD_NONE:
+ default:
+ break;
+ }
- for (i = 0; i < priv->child_count; i++) {
- ret = dict_get_ptr (xattr, priv->pending_key[i],
- &pending_raw);
+ return fav_child;
+}
- if (ret != 0)
- return 0;
+int
+afr_mark_split_brain_source_sinks_by_policy(
+ call_frame_t *frame, xlator_t *this, inode_t *inode, unsigned char *sources,
+ unsigned char *sinks, unsigned char *healed_sinks, unsigned char *locked_on,
+ struct afr_reply *replies, afr_transaction_type type)
+{
+ afr_private_t *priv = NULL;
+ int fav_child = -1;
+ char mtime_str[256];
+ char ctime_str[256];
+ char *policy_str = NULL;
+ struct tm *tm_ptr;
+ time_t time;
+
+ priv = this->private;
+
+ fav_child = afr_sh_get_fav_by_policy(this, replies, inode, &policy_str);
+ if (fav_child == -1) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SBRAIN_FAV_CHILD_POLICY,
+ "No child selected by favorite-child policy.");
+ } else if (fav_child > priv->child_count - 1) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SBRAIN_FAV_CHILD_POLICY,
+ "Invalid child (%d) "
+ "selected by policy %s.",
+ fav_child, policy_str);
+ } else if (fav_child >= 0) {
+ time = replies[fav_child].poststat.ia_mtime;
+ tm_ptr = localtime(&time);
+ strftime(mtime_str, sizeof(mtime_str), "%Y-%m-%d %H:%M:%S", tm_ptr);
+ time = replies[fav_child].poststat.ia_ctime;
+ tm_ptr = localtime(&time);
+ strftime(ctime_str, sizeof(ctime_str), "%Y-%m-%d %H:%M:%S", tm_ptr);
+
+ gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_SBRAIN_FAV_CHILD_POLICY,
+ "Source %s selected as authentic to resolve conflicting data "
+ "in file (gfid:%s) by %s (%" PRIu64
+ " bytes @ %s mtime, %s "
+ "ctime).",
+ priv->children[fav_child]->name, uuid_utoa(inode->gfid),
+ policy_str, replies[fav_child].poststat.ia_size, mtime_str,
+ ctime_str);
+
+ sources[fav_child] = 1;
+ sinks[fav_child] = 0;
+ healed_sinks[fav_child] = 0;
+ }
+ return fav_child;
+}
- memcpy (pending, pending_raw, sizeof(pending));
- j = afr_index_for_transaction_type (AFR_ENTRY_TRANSACTION);
+gf_boolean_t
+afr_is_file_empty_on_all_children(afr_private_t *priv,
+ struct afr_reply *replies)
+{
+ int i = 0;
- if (pending[j])
- return 1;
- }
+ for (i = 0; i < priv->child_count; i++) {
+ if ((!replies[i].valid) || (replies[i].op_ret != 0) ||
+ (replies[i].poststat.ia_size != 0))
+ return _gf_false;
+ }
- return 0;
+ return _gf_true;
}
+int
+afr_mark_source_sinks_if_file_empty(xlator_t *this, unsigned char *sources,
+ unsigned char *sinks,
+ unsigned char *healed_sinks,
+ unsigned char *locked_on,
+ struct afr_reply *replies,
+ afr_transaction_type type)
+{
+ int source = -1;
+ int i = 0;
+ afr_private_t *priv = this->private;
+ struct iatt stbuf = {
+ 0,
+ };
+
+ if ((AFR_COUNT(locked_on, priv->child_count) < priv->child_count) ||
+ (afr_success_count(replies, priv->child_count) < priv->child_count))
+ return -1;
-/**
- * is_matrix_zero - return true if pending matrix is all zeroes
+ if (type == AFR_DATA_TRANSACTION) {
+ if (!afr_is_file_empty_on_all_children(priv, replies))
+ return -1;
+ goto mark;
+ }
+
+ /*For AFR_METADATA_TRANSACTION, metadata must be same on all bricks.*/
+ stbuf = replies[0].poststat;
+ for (i = 1; i < priv->child_count; i++) {
+ if ((!IA_EQUAL(stbuf, replies[i].poststat, type)) ||
+ (!IA_EQUAL(stbuf, replies[i].poststat, uid)) ||
+ (!IA_EQUAL(stbuf, replies[i].poststat, gid)) ||
+ (!IA_EQUAL(stbuf, replies[i].poststat, prot)))
+ return -1;
+ }
+ for (i = 1; i < priv->child_count; i++) {
+ if (!afr_xattrs_are_equal(replies[0].xdata, replies[i].xdata))
+ return -1;
+ }
+
+mark:
+ /* data/metadata is same on all bricks. Pick one of them as source. Rest
+ * are sinks.*/
+ for (i = 0; i < priv->child_count; i++) {
+ if (source == -1) {
+ source = i;
+ sources[i] = 1;
+ sinks[i] = 0;
+ healed_sinks[i] = 0;
+ continue;
+ }
+ sources[i] = 0;
+ sinks[i] = 1;
+ healed_sinks[i] = 1;
+ }
+
+ return source;
+}
+
+/* Return a source depending on the type of heal_op, and set sources[source],
+ * sinks[source] and healed_sinks[source] to 1, 0 and 0 respectively. Do so
+ * only if the following condition is met:
+ * ∀i((i ∈ locked_on[] ∧ i=1)==>(sources[i]=0 ∧ sinks[i]=1 ∧ healed_sinks[i]=1))
+ * i.e. for each locked node, sources[node] is 0; healed_sinks[node] and
+ * sinks[node] are 1. This should be the case if the file is in split-brain.
*/
+int
+afr_mark_split_brain_source_sinks(
+ call_frame_t *frame, xlator_t *this, inode_t *inode, unsigned char *sources,
+ unsigned char *sinks, unsigned char *healed_sinks, unsigned char *locked_on,
+ struct afr_reply *replies, afr_transaction_type type)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ dict_t *xdata_req = NULL;
+ int heal_op = -1;
+ int ret = -1;
+ int source = -1;
+
+ local = frame->local;
+ priv = this->private;
+ xdata_req = local->xdata_req;
+
+ source = afr_mark_source_sinks_if_file_empty(
+ this, sources, sinks, healed_sinks, locked_on, replies, type);
+ if (source >= 0)
+ return source;
+
+ ret = dict_get_int32_sizen(xdata_req, "heal-op", &heal_op);
+ if (ret)
+ goto autoheal;
+
+ source = afr_mark_split_brain_source_sinks_by_heal_op(
+ frame, this, sources, sinks, healed_sinks, locked_on, replies, type,
+ heal_op);
+ return source;
+
+autoheal:
+ /* Automatically heal if fav_child_policy is set. */
+ if (priv->fav_child_policy != AFR_FAV_CHILD_NONE) {
+ source = afr_mark_split_brain_source_sinks_by_policy(
+ frame, this, inode, sources, sinks, healed_sinks, locked_on,
+ replies, type);
+ if (source != -1) {
+ ret = dict_set_int32_sizen(xdata_req, "fav-child-policy", 1);
+ if (ret)
+ return -1;
+ }
+ }
+
+ return source;
+}
int
-afr_sh_is_matrix_zero (int32_t *pending_matrix[], int child_count)
-{
- int i = 0;
- int j = 0;
+_afr_fav_child_reset_sink_xattrs(call_frame_t *frame, xlator_t *this,
+ inode_t *inode, int source,
+ unsigned char *healed_sinks,
+ unsigned char *undid_pending,
+ afr_transaction_type type,
+ unsigned char *locked_on,
+ struct afr_reply *replies)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int *input_dirty = NULL;
+ int **input_matrix = NULL;
+ int *output_dirty = NULL;
+ int **output_matrix = NULL;
+ dict_t *xattr = NULL;
+ dict_t *xdata = NULL;
+ int i = 0;
+
+ priv = this->private;
+ local = frame->local;
+
+ if (!dict_get_sizen(local->xdata_req, "fav-child-policy"))
+ return 0;
- for (i = 0; i < child_count; i++)
- for (j = 0; j < child_count; j++)
- if (pending_matrix[i][j])
- return 0;
- return 1;
-}
+ xdata = dict_new();
+ if (!xdata)
+ return -1;
+ input_dirty = alloca0(priv->child_count * sizeof(int));
+ input_matrix = ALLOC_MATRIX(priv->child_count, int);
+ output_dirty = alloca0(priv->child_count * sizeof(int));
+ output_matrix = ALLOC_MATRIX(priv->child_count, int);
-int
-afr_sh_missing_entries_done (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int i = 0;
+ afr_selfheal_extract_xattr(this, replies, type, input_dirty, input_matrix);
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
+ for (i = 0; i < priv->child_count; i++) {
+ if (i == source || !healed_sinks[i])
+ continue;
+ output_dirty[i] = -input_dirty[i];
+ output_matrix[i][source] = -input_matrix[i][source];
+ }
-// memset (sh->child_errno, 0, sizeof (int) * priv->child_count);
- memset (sh->buf, 0, sizeof (struct iatt) * priv->child_count);
+ for (i = 0; i < priv->child_count; i++) {
+ if (!healed_sinks[i] || !locked_on[i])
+ continue;
+ xattr = afr_selfheal_output_xattr(this, _gf_false, type, output_dirty,
+ output_matrix, i, NULL);
- for (i = 0; i < priv->child_count; i++) {
- sh->locked_nodes[i] = 0;
- }
+ afr_selfheal_post_op(frame, this, inode, i, xattr, xdata);
- for (i = 0; i < priv->child_count; i++) {
- if (sh->xattr[i])
- dict_unref (sh->xattr[i]);
- sh->xattr[i] = NULL;
- }
+ undid_pending[i] = 1;
+ dict_unref(xattr);
+ }
- if (local->govinda_gOvinda || sh->op_failed) {
- gf_log (this->name, GF_LOG_INFO,
- "split brain found, aborting selfheal of %s",
- local->loc.path);
- sh->op_failed = 1;
- sh->completion_cbk (frame, this);
- } else {
- gf_log (this->name, GF_LOG_TRACE,
- "proceeding to metadata check on %s",
- local->loc.path);
- afr_self_heal_metadata (frame, this);
- }
+ if (xdata)
+ dict_unref(xdata);
- return 0;
+ return 0;
}
-
-static int
-afr_sh_missing_entries_finish (call_frame_t *frame, xlator_t *this)
+gf_boolean_t
+afr_does_witness_exist(xlator_t *this, uint64_t *witness)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
-
- local = frame->local;
- int_lock = &local->internal_lock;
+ int i = 0;
+ afr_private_t *priv = NULL;
- int_lock->lock_cbk = afr_sh_missing_entries_done;
- afr_unlock (frame, this);
+ priv = this->private;
- return 0;
+ for (i = 0; i < priv->child_count; i++) {
+ if (witness[i])
+ return _gf_true;
+ }
+ return _gf_false;
}
-int
-afr_sh_common_create (afr_self_heal_t *sh, unsigned int child_count)
+unsigned int
+afr_get_quorum_count(afr_private_t *priv)
{
- int ret = -ENOMEM;
- sh->buf = GF_CALLOC (child_count, sizeof (*sh->buf),
- gf_afr_mt_iatt);
- if (!sh->buf)
- goto out;
- sh->parentbufs = GF_CALLOC (child_count, sizeof (*sh->parentbufs),
- gf_afr_mt_iatt);
- if (!sh->parentbufs)
- goto out;
- sh->child_errno = GF_CALLOC (child_count, sizeof (*sh->child_errno),
- gf_afr_mt_int);
- if (!sh->child_errno)
- goto out;
- sh->child_success = afr_children_create (child_count);
- if (!sh->child_success)
- goto out;
- sh->fresh_children = afr_children_create (child_count);
- if (!sh->fresh_children)
- goto out;
- sh->xattr = GF_CALLOC (child_count, sizeof (*sh->xattr),
- gf_afr_mt_dict_t);
- if (!sh->xattr)
- goto out;
- ret = 0;
-out:
- return ret;
+ if (priv->quorum_count == AFR_QUORUM_AUTO) {
+ return priv->child_count / 2 + 1;
+ } else {
+ return priv->quorum_count;
+ }
}
void
-afr_sh_common_lookup_resp_handler (call_frame_t *frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- inode_t *inode, struct iatt *buf,
- dict_t *xattr, struct iatt *postparent,
- loc_t *loc)
-{
- int child_index = 0;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- priv = this->private;
- sh = &local->self_heal;
- child_index = (long) cookie;
-
- LOCK (&frame->lock);
- {
- if (op_ret == 0) {
- sh->buf[child_index] = *buf;
- sh->parentbuf = *postparent;
- sh->parentbufs[child_index] = *postparent;
- sh->child_success[sh->success_count] = child_index;
- sh->success_count++;
- sh->xattr[child_index] = dict_ref (xattr);
- } else {
- gf_log (this->name, GF_LOG_ERROR,
- "path %s on subvolume %s => -1 (%s)",
- loc->path, priv->children[child_index]->name,
- strerror (op_errno));
- local->self_heal.child_errno[child_index] = op_errno;
- }
- }
- UNLOCK (&frame->lock);
+afr_selfheal_post_op_failure_accounting(afr_private_t *priv, char *accused,
+ unsigned char *sources,
+ unsigned char *locked_on)
+{
+ int i = 0;
+ unsigned int quorum_count = 0;
+
+ if (AFR_COUNT(sources, priv->child_count) != 0)
return;
-}
-gf_boolean_t
-afr_valid_ia_type (ia_type_t ia_type)
-{
- switch (ia_type) {
- case IA_IFSOCK:
- case IA_IFREG:
- case IA_IFBLK:
- case IA_IFCHR:
- case IA_IFIFO:
- case IA_IFLNK:
- case IA_IFDIR:
- return _gf_true;
- default:
- return _gf_false;
+ quorum_count = afr_get_quorum_count(priv);
+ for (i = 0; i < priv->child_count; i++) {
+ if ((accused[i] < quorum_count) && locked_on[i]) {
+ sources[i] = 1;
}
- return _gf_false;
+ }
+ return;
}
-int
-afr_impunge_frame_create (call_frame_t *frame, xlator_t *this,
- int active_source, int ret_child, mode_t entry_mode,
- call_frame_t **impunge_frame)
-{
- afr_local_t *local = NULL;
- afr_local_t *impunge_local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_self_heal_t *impunge_sh = NULL;
- int32_t op_errno = 0;
- afr_private_t *priv = NULL;
- int ret = 0;
- call_frame_t *new_frame = NULL;
+/*
+ * This function determines if a self-heal is required for a given inode,
+ * and if needed, in what direction.
+ *
+ * locked_on[] is the array representing servers which have been locked and
+ * from which xattrs have been fetched for analysis.
+ *
+ * The output of the function is by filling the arrays sources[] and sinks[].
+ *
+ * sources[i] is set if i'th server is an eligible source for a selfheal.
+ *
+ * sinks[i] is set if i'th server needs to be healed.
+ *
+ * if sources[0..N] are all set, there is no need for a selfheal.
+ *
+ * if sinks[0..N] are all set, the inode is in split brain.
+ *
+ */
- op_errno = ENOMEM;
- priv = this->private;
- new_frame = copy_frame (frame);
- if (!new_frame) {
- goto out;
+int
+afr_selfheal_find_direction(call_frame_t *frame, xlator_t *this,
+ struct afr_reply *replies,
+ afr_transaction_type type, unsigned char *locked_on,
+ unsigned char *sources, unsigned char *sinks,
+ uint64_t *witness, unsigned char *pflag)
+{
+ afr_private_t *priv = NULL;
+ int i = 0;
+ int j = 0;
+ int *dirty = NULL; /* Denotes if dirty xattr is set */
+ int **matrix = NULL; /* Changelog matrix */
+ char *accused = NULL; /* Accused others without any self-accusal */
+ char *pending = NULL; /* Have pending operations on others */
+ char *self_accused = NULL; /* Accused itself */
+
+ priv = this->private;
+
+ dirty = alloca0(priv->child_count * sizeof(int));
+ accused = alloca0(priv->child_count);
+ pending = alloca0(priv->child_count);
+ self_accused = alloca0(priv->child_count);
+ matrix = ALLOC_MATRIX(priv->child_count, int);
+ memset(witness, 0, sizeof(*witness) * priv->child_count);
+
+ /* First construct the pending matrix for further analysis */
+ afr_selfheal_extract_xattr(this, replies, type, dirty, matrix);
+
+ if (pflag) {
+ for (i = 0; i < priv->child_count; i++) {
+ for (j = 0; j < priv->child_count; j++)
+ if (matrix[i][j])
+ *pflag |= PFLAG_PENDING;
+ if (*pflag)
+ break;
}
+ }
+
+ if (afr_success_count(replies, priv->child_count) < priv->child_count) {
+ /* Treat this just like locks not being acquired */
+ return -ENOTCONN;
+ }
+
+ /* short list all self-accused */
+ for (i = 0; i < priv->child_count; i++) {
+ if (matrix[i][i])
+ self_accused[i] = 1;
+ }
+
+ /* Next short list all accused to exclude them from being sources */
+ /* Self-accused can't accuse others as they are FOOLs */
+ for (i = 0; i < priv->child_count; i++) {
+ for (j = 0; j < priv->child_count; j++) {
+ if (matrix[i][j]) {
+ if (!self_accused[i])
+ accused[j] += 1;
+ if (i != j)
+ pending[i] += 1;
+ }
+ }
+ }
+
+ /* Short list all non-accused as sources */
+ for (i = 0; i < priv->child_count; i++) {
+ if (!accused[i] && locked_on[i])
+ sources[i] = 1;
+ else
+ sources[i] = 0;
+ }
+
+ /* Everyone accused by non-self-accused sources are sinks */
+ memset(sinks, 0, priv->child_count);
+ for (i = 0; i < priv->child_count; i++) {
+ if (!sources[i])
+ continue;
+ if (self_accused[i])
+ continue;
+ for (j = 0; j < priv->child_count; j++) {
+ if (matrix[i][j])
+ sinks[j] = 1;
+ }
+ }
+
+ /* For breaking ties provide with number of fops they witnessed */
+
+ /*
+ * count the pending fops witnessed from itself to others when it is
+ * self-accused
+ */
+ for (i = 0; i < priv->child_count; i++) {
+ if (!self_accused[i])
+ continue;
+ for (j = 0; j < priv->child_count; j++) {
+ if (i == j)
+ continue;
+ witness[i] += matrix[i][j];
+ }
+ }
+
+ if (type == AFR_DATA_TRANSACTION || type == AFR_METADATA_TRANSACTION)
+ afr_selfheal_post_op_failure_accounting(priv, accused, sources,
+ locked_on);
+
+ /* If no sources, all locked nodes are sinks - split brain */
+ if (AFR_COUNT(sources, priv->child_count) == 0) {
+ for (i = 0; i < priv->child_count; i++) {
+ if (locked_on[i])
+ sinks[i] = 1;
+ }
+ if (pflag)
+ *pflag |= PFLAG_SBRAIN;
+ }
+
+ /* One more class of witness similar to dirty in v2 is where no pending
+ * exists but we have self-accusing markers. This can happen in afr-v1
+ * if the brick crashes just after doing xattrop on self but
+ * before xattrop on the other xattrs on the brick in pre-op. */
+ if (AFR_COUNT(pending, priv->child_count) == 0) {
+ for (i = 0; i < priv->child_count; i++) {
+ if (self_accused[i])
+ witness[i] += matrix[i][i];
+ }
+ } else {
+ /* In afr-v1 if a file is self-accused and has pending
+ * operations on others then it is similar to 'dirty' in afr-v2.
+ * Consider such cases as witness.
+ */
+ for (i = 0; i < priv->child_count; i++) {
+ if (self_accused[i] && pending[i])
+ witness[i] += matrix[i][i];
+ }
+ }
- ALLOC_OR_GOTO (impunge_local, afr_local_t, out);
-
- local = frame->local;
- sh = &local->self_heal;
- new_frame->local = impunge_local;
- impunge_sh = &impunge_local->self_heal;
- impunge_sh->sh_frame = frame;
- impunge_sh->active_source = active_source;
- impunge_sh->impunge_ret_child = ret_child;
- impunge_sh->impunging_entry_mode = entry_mode;
- impunge_local->child_up = memdup (local->child_up,
- sizeof (*local->child_up) *
- priv->child_count);
- if (!impunge_local->child_up)
- goto out;
+ /* count the number of dirty fops witnessed */
+ for (i = 0; i < priv->child_count; i++)
+ witness[i] += dirty[i];
- ret = afr_sh_common_create (impunge_sh, priv->child_count);
- if (ret) {
- op_errno = -ret;
- goto out;
- }
- op_errno = 0;
- *impunge_frame = new_frame;
-out:
- if (op_errno && new_frame)
- AFR_STACK_DESTROY (new_frame);
- return -op_errno;
+ return 0;
}
void
-afr_sh_call_entry_impunge_recreate (call_frame_t *frame, xlator_t *this,
- int child_index, struct iatt *buf,
- struct iatt *postparent,
- afr_impunge_done_cbk_t impunge_done)
-{
- call_frame_t *impunge_frame = NULL;
- afr_local_t *local = NULL;
- afr_local_t *impunge_local = NULL;
- afr_self_heal_t *sh = NULL;
- int ret = 0;
- mode_t mode = 0;
-
- local = frame->local;
- sh = &local->self_heal;
- mode = st_mode_from_ia (buf->ia_prot, buf->ia_type);
- ret = afr_impunge_frame_create (frame, this, sh->source, child_index,
- mode, &impunge_frame);
- if (ret)
- goto out;
- impunge_local = impunge_frame->local;
- loc_copy (&impunge_local->loc, &local->loc);
- sh->impunge_done = impunge_done;
- impunge_local->call_count = 1;
- afr_sh_entry_impunge_create (impunge_frame, this, child_index, buf,
- postparent);
- return;
-out:
- gf_log (this->name, GF_LOG_ERROR, "impunge of %s failed, reason: %s",
- local->loc.path, strerror (-ret));
- impunge_done (frame, this, child_index, -1, -ret);
+afr_log_selfheal(uuid_t gfid, xlator_t *this, int ret, char *type, int source,
+ unsigned char *sources, unsigned char *healed_sinks)
+{
+ char *status = NULL;
+ char *sinks_str = NULL;
+ char *p = NULL;
+ char *sources_str = NULL;
+ char *q = NULL;
+ afr_private_t *priv = NULL;
+ gf_loglevel_t loglevel = GF_LOG_NONE;
+ int i = 0;
+
+ priv = this->private;
+ sinks_str = alloca0(priv->child_count * 8);
+ p = sinks_str;
+ sources_str = alloca0(priv->child_count * 8);
+ q = sources_str;
+ for (i = 0; i < priv->child_count; i++) {
+ if (healed_sinks[i])
+ p += sprintf(p, "%d ", i);
+ if (sources[i]) {
+ if (source == i) {
+ q += sprintf(q, "[%d] ", i);
+ } else {
+ q += sprintf(q, "%d ", i);
+ }
+ }
+ }
+
+ if (ret < 0) {
+ status = "Failed";
+ loglevel = GF_LOG_DEBUG;
+ } else {
+ status = "Completed";
+ loglevel = GF_LOG_INFO;
+ }
+
+ gf_msg(this->name, loglevel, 0, AFR_MSG_SELF_HEAL_INFO,
+ "%s %s selfheal on %s. "
+ "sources=%s sinks=%s",
+ status, type, uuid_utoa(gfid), sources_str, sinks_str);
}
int
-afr_sh_create_entry_cbk (call_frame_t *frame, xlator_t *this, int child,
- int32_t op_ret, int32_t op_errno)
+afr_selfheal_discover_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, inode_t *inode,
+ struct iatt *buf, dict_t *xdata, struct iatt *parbuf)
{
- int call_count = 0;
- afr_local_t *local = NULL;
+ afr_local_t *local = NULL;
+ int i = -1;
+ GF_UNUSED int ret = -1;
+ int8_t need_heal = 1;
- local = frame->local;
+ local = frame->local;
+ i = (long)cookie;
- if (op_ret == -1)
- gf_log (this->name, GF_LOG_ERROR,
- "create entry %s failed, on child %d reason, %s",
- local->loc.path, child, strerror (op_errno));
- call_count = afr_frame_return (frame);
- if (call_count == 0)
- afr_sh_missing_entries_finish (frame, this);
- return 0;
+ local->replies[i].valid = 1;
+ local->replies[i].op_ret = op_ret;
+ local->replies[i].op_errno = op_errno;
+ if (buf)
+ local->replies[i].poststat = *buf;
+ if (parbuf)
+ local->replies[i].postparent = *parbuf;
+ if (xdata) {
+ local->replies[i].xdata = dict_ref(xdata);
+ ret = dict_get_int8(xdata, "link-count", &need_heal);
+ }
+
+ local->replies[i].need_heal = need_heal;
+ syncbarrier_wake(&local->barrier);
+
+ return 0;
}
-static int
-sh_missing_entries_create (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- int type = 0;
- afr_private_t *priv = NULL;
- int enoent_count = 0;
- int i = 0;
- struct iatt *buf = NULL;
- struct iatt *postparent = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- enoent_count = afr_errno_count (NULL, sh->child_errno,
- priv->child_count, ENOENT);
- if (enoent_count == 0) {
- gf_log (this->name, GF_LOG_INFO,
- "no missing files - %s. proceeding to metadata check",
- local->loc.path);
- /* proceed to next step - metadata self-heal */
- afr_sh_missing_entries_finish (frame, this);
- return 0;
- }
+inode_t *
+afr_selfheal_unlocked_lookup_on(call_frame_t *frame, inode_t *parent,
+ const char *name, struct afr_reply *replies,
+ unsigned char *lookup_on, dict_t *xattr)
+{
+ loc_t loc = {
+ 0,
+ };
+ dict_t *xattr_req = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ inode_t *inode = NULL;
- buf = &sh->buf[sh->source];
- postparent = &sh->parentbufs[sh->source];
+ local = frame->local;
+ priv = frame->this->private;
- type = buf->ia_type;
- if (!afr_valid_ia_type (type)) {
- gf_log (this->name, GF_LOG_ERROR,
- "%s: unknown file type: 0%o", local->loc.path, type);
- local->govinda_gOvinda = 1;
- afr_sh_missing_entries_finish (frame, this);
- goto out;
- }
+ xattr_req = dict_new();
+ if (!xattr_req)
+ return NULL;
- local->call_count = enoent_count;
- for (i = 0; i < priv->child_count; i++) {
- //If !child_up errno will be zero
- if (sh->child_errno[i] != ENOENT)
- continue;
- afr_sh_call_entry_impunge_recreate (frame, this, i,
- buf, postparent,
- afr_sh_create_entry_cbk);
- enoent_count--;
- }
- GF_ASSERT (enoent_count == 0);
-out:
- return 0;
-}
+ if (xattr)
+ dict_copy(xattr, xattr_req);
-void
-afr_sh_missing_entries_lookup_done (call_frame_t *frame, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- ia_type_t ia_type = IA_INVAL;
- int32_t nsources = 0;
- loc_t *loc = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
- loc = &local->loc;
-
- if (op_ret < 0) {
- if (op_errno == EIO)
- local->govinda_gOvinda = 1;
- // EIO can happen if finding the fresh parent dir failed
- goto out;
- }
+ if (afr_xattr_req_prepare(frame->this, xattr_req) != 0) {
+ dict_unref(xattr_req);
+ return NULL;
+ }
- //now No chance for the ia_type to conflict
- ia_type = sh->buf[sh->child_success[0]].ia_type;
- nsources = afr_build_sources (this, sh->xattr, sh->buf,
- sh->pending_matrix, sh->sources,
- sh->child_success,
- afr_transaction_type_get (ia_type), NULL);
- if (nsources < 0) {
- gf_log (this->name, GF_LOG_INFO, "No sources for dir of %s,"
- " in missing entry self-heal, continuing with the rest"
- " of the self-heals", local->loc.path);
- op_errno = EIO;
- goto out;
- }
+ inode = inode_new(parent->table);
+ if (!inode) {
+ dict_unref(xattr_req);
+ return NULL;
+ }
- afr_get_fresh_children (sh->child_success, sh->sources,
- sh->fresh_children, priv->child_count);
- sh->source = sh->fresh_children[0];
- if (sh->source == -1) {
- gf_log (this->name, GF_LOG_DEBUG, "No active sources found.");
- op_errno = EIO;
- goto out;
- }
+ loc.parent = inode_ref(parent);
+ gf_uuid_copy(loc.pargfid, parent->gfid);
+ loc.name = name;
+ loc.inode = inode_ref(inode);
- if (sh->gfid_sh_success_cbk)
- sh->gfid_sh_success_cbk (frame, this);
- if (uuid_is_null (loc->inode->gfid))
- uuid_copy (loc->gfid, sh->buf[sh->source].ia_gfid);
- sh_missing_entries_create (frame, this);
- return;
-out:
- sh->op_failed = 1;
- afr_sh_set_error (sh, op_errno);
- afr_sh_missing_entries_finish (frame, this);
- return;
+ AFR_ONLIST(lookup_on, frame, afr_selfheal_discover_cbk, lookup, &loc,
+ xattr_req);
+
+ afr_replies_copy(replies, local->replies, priv->child_count);
+
+ loc_wipe(&loc);
+ dict_unref(xattr_req);
+
+ return inode;
}
static int
-afr_sh_common_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, inode_t *inode,
- struct iatt *buf, dict_t *xattr,
- struct iatt *postparent)
-{
- int call_count = 0;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- afr_sh_common_lookup_resp_handler (frame, cookie, this, op_ret,
- op_errno, inode, buf, xattr,
- postparent, &sh->lookup_loc);
- call_count = afr_frame_return (frame);
-
- if (call_count)
- goto out;
- op_ret = -1;
- if (!sh->success_count) {
- op_errno = afr_resultant_errno_get (NULL, sh->child_errno,
- priv->child_count);
- gf_log (this->name, GF_LOG_ERROR, "Failed to lookup %s, "
- "reason %s", sh->lookup_loc.path,
- strerror (op_errno));
- goto done;
- }
+afr_set_multi_dom_lock_count_request(xlator_t *this, dict_t *dict)
+{
+ int ret = 0;
+ afr_private_t *priv = NULL;
+ char *key1 = NULL;
+ char *key2 = NULL;
- if ((sh->lookup_flags & AFR_LOOKUP_FAIL_CONFLICTS) &&
- (afr_conflicting_iattrs (sh->buf, sh->child_success,
- priv->child_count,
- sh->lookup_loc.path, this->name))) {
- op_errno = EIO;
- gf_log (this->name, GF_LOG_ERROR, "Conflicting entries "
- "for %s", sh->lookup_loc.path);
- goto done;
- }
+ priv = this->private;
+ key1 = alloca0(strlen(GLUSTERFS_INODELK_DOM_PREFIX) + 2 +
+ strlen(this->name));
+ key2 = alloca0(strlen(GLUSTERFS_INODELK_DOM_PREFIX) + 2 +
+ strlen(priv->sh_domain));
- if ((sh->lookup_flags & AFR_LOOKUP_FAIL_MISSING_GFIDS) &&
- (afr_gfid_missing_count (this->name, sh->child_success,
- sh->buf, priv->child_count,
- sh->lookup_loc.path))) {
- op_errno = ENODATA;
- gf_log (this->name, GF_LOG_ERROR, "Missing Gfids "
- "for %s", sh->lookup_loc.path);
- goto done;
- }
- op_ret = 0;
+ ret = dict_set_uint32(dict, GLUSTERFS_MULTIPLE_DOM_LK_CNT_REQUESTS, 1);
+ if (ret)
+ return ret;
-done:
- sh->lookup_done (frame, this, op_ret, op_errno);
-out:
- return 0;
+ sprintf(key1, "%s:%s", GLUSTERFS_INODELK_DOM_PREFIX, this->name);
+ ret = dict_set_uint32(dict, key1, 1);
+ if (ret)
+ return ret;
+
+ sprintf(key2, "%s:%s", GLUSTERFS_INODELK_DOM_PREFIX, priv->sh_domain);
+ ret = dict_set_uint32(dict, key2, 1);
+ if (ret)
+ return ret;
+
+ return 0;
}
int
-afr_sh_remove_entry_cbk (call_frame_t *frame, xlator_t *this, int child,
- int32_t op_ret, int32_t op_errno)
-{
- int call_count = 0;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
-
- GF_ASSERT (sh->post_remove_call);
- if ((op_ret == -1) && (op_errno != ENOENT)) {
- gf_log (this->name, GF_LOG_ERROR,
- "purge entry %s failed, on child %d reason, %s",
- local->loc.path, child, strerror (op_errno));
- LOCK (&frame->lock);
- {
- afr_sh_set_error (sh, EIO);
- sh->op_failed = 1;
- }
- UNLOCK (&frame->lock);
- }
- call_count = afr_frame_return (frame);
- if (call_count == 0)
- sh->post_remove_call (frame, this);
- return 0;
-}
+afr_selfheal_unlocked_discover_on(call_frame_t *frame, inode_t *inode,
+ uuid_t gfid, struct afr_reply *replies,
+ unsigned char *discover_on, dict_t *dict)
+{
+ loc_t loc = {
+ 0,
+ };
+ dict_t *xattr_req = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = frame->this->private;
+
+ xattr_req = dict_new();
+ if (!xattr_req)
+ return -ENOMEM;
+ if (dict)
+ dict_copy(dict, xattr_req);
+
+ if (afr_xattr_req_prepare(frame->this, xattr_req) != 0) {
+ dict_unref(xattr_req);
+ return -ENOMEM;
+ }
+
+ if (afr_set_multi_dom_lock_count_request(frame->this, xattr_req)) {
+ dict_unref(xattr_req);
+ return -1;
+ }
-void
-afr_sh_call_entry_expunge_remove (call_frame_t *frame, xlator_t *this,
- int child_index, struct iatt *buf,
- afr_expunge_done_cbk_t expunge_done)
-{
- call_frame_t *expunge_frame = NULL;
- afr_local_t *local = NULL;
- afr_local_t *expunge_local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_self_heal_t *expunge_sh = NULL;
- int32_t op_errno = 0;
-
- expunge_frame = copy_frame (frame);
- if (!expunge_frame) {
- goto out;
- }
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, gfid);
- ALLOC_OR_GOTO (expunge_local, afr_local_t, out);
+ AFR_ONLIST(discover_on, frame, afr_selfheal_discover_cbk, lookup, &loc,
+ xattr_req);
- local = frame->local;
- sh = &local->self_heal;
- expunge_frame->local = expunge_local;
- expunge_sh = &expunge_local->self_heal;
- expunge_sh->sh_frame = frame;
- loc_copy (&expunge_local->loc, &local->loc);
- sh->expunge_done = expunge_done;
- afr_sh_entry_expunge_remove (expunge_frame, this, child_index, buf);
- return;
-out:
- gf_log (this->name, GF_LOG_ERROR, "Expunge of %s failed, reason: %s",
- local->loc.path, strerror (op_errno));
- expunge_done (frame, this, child_index, -1, op_errno);
-}
+ afr_replies_copy(replies, local->replies, priv->child_count);
-void
-afr_sh_remove_stale_lookup_info (afr_self_heal_t *sh, int32_t *success_children,
- int32_t *fresh_children,
- unsigned int child_count)
-{
- int i = 0;
-
- for (i = 0; i < child_count; i++) {
- if (afr_is_child_present (success_children, child_count, i) &&
- !afr_is_child_present (fresh_children, child_count, i)) {
- sh->child_errno[i] = ENOENT;
- GF_ASSERT (sh->xattr[i]);
- dict_unref (sh->xattr[i]);
- sh->xattr[i] = NULL;
- }
- }
+ loc_wipe(&loc);
+ dict_unref(xattr_req);
+
+ return 0;
}
int
-afr_sh_purge_stale_entries_done (call_frame_t *frame, xlator_t *this)
+afr_selfheal_unlocked_discover(call_frame_t *frame, inode_t *inode, uuid_t gfid,
+ struct afr_reply *replies)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ dict_t *dict = NULL;
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
+ local = frame->local;
- if (sh->op_failed) {
- afr_sh_missing_entries_finish (frame, this);
- } else {
- if (afr_gfid_missing_count (this->name, sh->fresh_children,
- sh->buf, priv->child_count,
- local->loc.path)) {
- afr_sh_common_lookup (frame, this, &local->loc,
- afr_sh_missing_entries_lookup_done,
- sh->sh_gfid_req,
- AFR_LOOKUP_FAIL_CONFLICTS|
- AFR_LOOKUP_FAIL_MISSING_GFIDS);
- } else {
- //No need to set gfid so goto missing entries lookup done
- //Behave as if you have done the lookup
- afr_sh_remove_stale_lookup_info (sh,
- sh->child_success,
- sh->fresh_children,
- priv->child_count);
- afr_children_copy (sh->child_success,
- sh->fresh_children,
- priv->child_count);
- afr_sh_missing_entries_lookup_done (frame, this, 0, 0);
- }
- }
- return 0;
+ if (local->xattr_req)
+ dict = local->xattr_req;
+
+ return afr_selfheal_unlocked_discover_on(frame, inode, gfid, replies,
+ local->child_up, dict);
}
-gf_boolean_t
-afr_sh_purge_entry_condition (afr_local_t *local, afr_private_t *priv,
- int child)
+unsigned int
+afr_success_count(struct afr_reply *replies, unsigned int count)
{
- afr_self_heal_t *sh = NULL;
-
- sh = &local->self_heal;
+ int i = 0;
+ unsigned int success = 0;
- if (local->child_up[child] &&
- (!afr_is_child_present (sh->fresh_parent_dirs, priv->child_count,
- child))
- && (sh->child_errno[child] != ENOENT))
- return _gf_true;
-
- return _gf_false;
+ for (i = 0; i < count; i++)
+ if (replies[i].valid && replies[i].op_ret == 0)
+ success++;
+ return success;
}
-gf_boolean_t
-afr_sh_purge_stale_entry_condition (afr_local_t *local, afr_private_t *priv,
- int child)
+int
+afr_selfheal_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xdata)
{
- afr_self_heal_t *sh = NULL;
+ afr_local_t *local = NULL;
+ int i = 0;
- sh = &local->self_heal;
+ local = frame->local;
+ i = (long)cookie;
- if (local->child_up[child] &&
- (!afr_is_child_present (sh->fresh_children, priv->child_count,
- child))
- && (sh->child_errno[child] != ENOENT))
- return _gf_true;
+ local->replies[i].valid = 1;
+ local->replies[i].op_ret = op_ret;
+ local->replies[i].op_errno = op_errno;
- return _gf_false;
+ syncbarrier_wake(&local->barrier);
+
+ return 0;
}
-void
-afr_sh_purge_entry_common (call_frame_t *frame, xlator_t *this,
- gf_boolean_t purge_condition (afr_local_t *local,
- afr_private_t *priv,
- int child))
+int
+afr_locked_fill(call_frame_t *frame, xlator_t *this, unsigned char *locked_on)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- afr_self_heal_t *sh = NULL;
- int i = 0;
- int call_count = 0;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
+ int i = 0;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int count = 0;
- for (i = 0; i < priv->child_count; i++) {
- if (purge_condition (local, priv, i))
- call_count++;
- }
+ local = frame->local;
+ priv = this->private;
- if (call_count == 0) {
- sh->post_remove_call (frame, this);
- goto out;
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->replies[i].valid && local->replies[i].op_ret == 0) {
+ locked_on[i] = 1;
+ count++;
+ } else {
+ locked_on[i] = 0;
}
+ }
- local->call_count = call_count;
- for (i = 0; i < priv->child_count; i++) {
- if (!purge_condition (local, priv, i))
- continue;
- afr_sh_call_entry_expunge_remove (frame, this,
- (long) i, &sh->buf[i],
- afr_sh_remove_entry_cbk);
- }
-out:
- return;
+ return count;
}
-void
-afr_sh_purge_entry (call_frame_t *frame, xlator_t *this)
+int
+afr_selfheal_tryinodelk(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, off_t off, size_t size,
+ unsigned char *locked_on)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
+ loc_t loc = {
+ 0,
+ };
+ struct gf_flock flock = {
+ 0,
+ };
+
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, inode->gfid);
+
+ flock.l_type = F_WRLCK;
+ flock.l_start = off;
+ flock.l_len = size;
- local = frame->local;
- sh = &local->self_heal;
- sh->post_remove_call = afr_sh_missing_entries_finish;
+ AFR_ONALL(frame, afr_selfheal_lock_cbk, inodelk, dom, &loc, F_SETLK, &flock,
+ NULL);
- afr_sh_purge_entry_common (frame, this, afr_sh_purge_entry_condition);
+ loc_wipe(&loc);
+
+ return afr_locked_fill(frame, this, locked_on);
}
-void
-afr_sh_purge_stale_entry (call_frame_t *frame, xlator_t *this)
+int
+afr_selfheal_inodelk(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, off_t off, size_t size,
+ unsigned char *locked_on)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int i = 0;
+ loc_t loc = {
+ 0,
+ };
+ struct gf_flock flock = {
+ 0,
+ };
+ afr_local_t *local = NULL;
+ int i = 0;
+ afr_private_t *priv = NULL;
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
+ priv = this->private;
+ local = frame->local;
- sh->post_remove_call = afr_sh_purge_stale_entries_done;
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, inode->gfid);
- for (i = 0; i < priv->child_count; i++) {
- if (afr_is_child_present (sh->fresh_children,
- priv->child_count, i))
- continue;
+ flock.l_type = F_WRLCK;
+ flock.l_start = off;
+ flock.l_len = size;
- if ((!local->child_up[i]) || sh->child_errno[i] != 0)
- continue;
+ AFR_ONALL(frame, afr_selfheal_lock_cbk, inodelk, dom, &loc, F_SETLK, &flock,
+ NULL);
- GF_ASSERT (!uuid_is_null (sh->entrybuf.ia_gfid) ||
- uuid_is_null (sh->buf[i].ia_gfid));
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->replies[i].op_ret == -1 &&
+ local->replies[i].op_errno == EAGAIN) {
+ afr_locked_fill(frame, this, locked_on);
+ afr_selfheal_uninodelk(frame, this, inode, dom, off, size,
+ locked_on);
- if ((sh->entrybuf.ia_type != sh->buf[i].ia_type) ||
- (uuid_compare (sh->buf[i].ia_gfid,
- sh->entrybuf.ia_gfid)))
- continue;
+ AFR_SEQ(frame, afr_selfheal_lock_cbk, inodelk, dom, &loc, F_SETLKW,
+ &flock, NULL);
+ break;
+ }
+ }
- afr_fresh_children_add_child (sh->fresh_children,
- i, priv->child_count);
+ loc_wipe(&loc);
- }
- afr_sh_purge_entry_common (frame, this,
- afr_sh_purge_stale_entry_condition);
+ return afr_locked_fill(frame, this, locked_on);
}
-void
-afr_sh_save_child_iatts_from_policy (int32_t *children, struct iatt *bufs,
- struct iatt *save,
- unsigned int child_count)
-{
- int i = 0;
- int child = 0;
- gf_boolean_t saved = _gf_false;
-
- GF_ASSERT (save);
- //if iatt buf with gfid exists sets it
- for (i = 0; i < child_count; i++) {
- child = children[i];
- if (child == -1)
- break;
- *save = bufs[child];
- saved = _gf_true;
- if (!uuid_is_null (save->ia_gfid))
- break;
+static void
+afr_get_lock_and_eagain_counts(afr_private_t *priv, struct afr_reply *replies,
+ int *lock_count, int *eagain_count)
+{
+ int i = 0;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid)
+ continue;
+ if (replies[i].op_ret == 0) {
+ (*lock_count)++;
+ } else if (replies[i].op_ret == -1 && replies[i].op_errno == EAGAIN) {
+ (*eagain_count)++;
}
- GF_ASSERT (saved);
+ }
}
-void
-afr_get_children_of_fresh_parent_dirs (afr_self_heal_t *sh,
- unsigned int child_count)
+/*Do blocking locks if number of locks acquired is majority and there were some
+ * EAGAINs. Useful for odd-way replication*/
+int
+afr_selfheal_tie_breaker_inodelk(call_frame_t *frame, xlator_t *this,
+ inode_t *inode, char *dom, off_t off,
+ size_t size, unsigned char *locked_on)
{
- afr_children_intersection_get (sh->child_success,
- sh->fresh_parent_dirs,
- sh->sources, child_count);
- afr_get_fresh_children (sh->child_success, sh->sources,
- sh->fresh_children, child_count);
- memset (sh->sources, 0, sizeof (*sh->sources) * child_count);
-}
+ loc_t loc = {
+ 0,
+ };
+ struct gf_flock flock = {
+ 0,
+ };
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int lock_count = 0;
+ int eagain_count = 0;
-void
-afr_sh_children_lookup_done (call_frame_t *frame, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int32_t fresh_child_enoents = 0;
- int32_t fresh_parent_count = 0;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- if (op_ret < 0)
- goto fail;
- afr_get_children_of_fresh_parent_dirs (sh, priv->child_count);
- fresh_parent_count = afr_get_children_count (sh->fresh_parent_dirs,
- priv->child_count);
- //we need the enoent count of the subvols present in fresh_parent_dirs
- fresh_child_enoents = afr_errno_count (sh->fresh_parent_dirs,
- sh->child_errno,
- priv->child_count, ENOENT);
- if (fresh_child_enoents == fresh_parent_count) {
- gf_log (this->name, GF_LOG_INFO, "Deleting stale file %s",
- local->loc.path);
- afr_sh_set_error (sh, ENOENT);
- sh->op_failed = 1;
- afr_sh_purge_entry (frame, this);
- } else if (!afr_conflicting_iattrs (sh->buf, sh->fresh_children,
- priv->child_count, local->loc.path,
- this->name)) {
- afr_sh_save_child_iatts_from_policy (sh->fresh_children,
- sh->buf, &sh->entrybuf,
- priv->child_count);
- afr_update_gfid_from_iatts (sh->sh_gfid_req, sh->buf,
- sh->fresh_children,
- priv->child_count);
- afr_sh_purge_stale_entry (frame, this);
- } else {
- op_errno = EIO;
- local->govinda_gOvinda = 1;
- goto fail;
- }
+ priv = this->private;
+ local = frame->local;
- return;
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, inode->gfid);
-fail:
- sh->op_failed = 1;
- afr_sh_set_error (sh, op_errno);
- afr_sh_missing_entries_finish (frame, this);
- return;
-}
+ flock.l_type = F_WRLCK;
+ flock.l_start = off;
+ flock.l_len = size;
-static void
-afr_sh_find_fresh_parents (call_frame_t *frame, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
-{
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- int enoent_count = 0;
- int nsources = 0;
- int source = -1;
- afr_source_flags_t flags = 0;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- /* If We can't find a fresh parent directory here,
- * we wont know which subvol is correct without finding a parent dir
- * upwards which has correct xattrs, for that we may have to
- * do lookups till root, we dont wanna do that,
- * instead make sure that if there are conflicting gfid
- * parent dirs, self-heal thus lookup is failed with EIO.
- * if there are missing entries we dont know whether to delete or
- * create so fail with EIO,
- * If there are conflicting xattr fail with EIO.
- */
- if (op_ret < 0)
- goto out;
- enoent_count = afr_errno_count (NULL, sh->child_errno,
- priv->child_count, ENOENT);
- if (enoent_count > 0) {
- gf_log (this->name, GF_LOG_ERROR, "Parent dir missing for %s,"
- " in missing entry self-heal, aborting self-heal",
- local->loc.path);
- goto out;
- }
+ AFR_ONALL(frame, afr_selfheal_lock_cbk, inodelk, dom, &loc, F_SETLK, &flock,
+ NULL);
- nsources = afr_build_sources (this, sh->xattr, sh->buf,
- sh->pending_matrix, sh->sources,
- sh->child_success,
- AFR_ENTRY_TRANSACTION, &flags);
- if ((nsources < 0) && !flags) {
- gf_log (this->name, GF_LOG_ERROR, "No sources for dir of %s,"
- " in missing entry self-heal, aborting self-heal",
- local->loc.path);
- goto out;
- }
+ afr_get_lock_and_eagain_counts(priv, local->replies, &lock_count,
+ &eagain_count);
- //if allfools/split-brain give the behavior of missing entry creation
- if (flags) {
- gf_log (this->name, GF_LOG_DEBUG, "%s: All subvols pending so "
- "do missing entry creation", local->loc.path);
- afr_mark_valid_children_sources (sh->sources, sh->child_success,
- priv->child_count);
- }
+ if (lock_count > priv->child_count / 2 && eagain_count) {
+ afr_locked_fill(frame, this, locked_on);
+ afr_selfheal_uninodelk(frame, this, inode, dom, off, size, locked_on);
- source = afr_sh_select_source (sh->sources, priv->child_count);
- if (source == -1) {
- GF_ASSERT (0);
- gf_log (this->name, GF_LOG_DEBUG, "No active sources found.");
- goto out;
- }
- afr_get_fresh_children (sh->child_success, sh->sources,
- sh->fresh_parent_dirs, priv->child_count);
- afr_sh_common_lookup (frame, this, &local->loc,
- afr_sh_children_lookup_done, NULL, 0);
- return;
+ AFR_SEQ(frame, afr_selfheal_lock_cbk, inodelk, dom, &loc, F_SETLKW,
+ &flock, NULL);
+ }
-out:
- afr_sh_set_error (sh, EIO);
- sh->op_failed = 1;
- afr_sh_missing_entries_finish (frame, this);
- return;
-}
+ loc_wipe(&loc);
-void
-afr_sh_common_reset (afr_self_heal_t *sh, unsigned int child_count)
-{
- int i = 0;
-
- for (i = 0; i < child_count; i++) {
- memset (&sh->buf[i], 0, sizeof (sh->buf[i]));
- memset (&sh->parentbufs[i], 0, sizeof (sh->parentbufs[i]));
- sh->child_errno[i] = 0;
- }
- memset (&sh->parentbuf, 0, sizeof (sh->parentbuf));
- sh->success_count = 0;
- afr_reset_children (sh->child_success, child_count);
- afr_reset_children (sh->fresh_children, child_count);
- afr_reset_xattr (sh->xattr, child_count);
- loc_wipe (&sh->lookup_loc);
+ return afr_locked_fill(frame, this, locked_on);
}
-/* afr self-heal state will be lost if this call is made
- * please check the afr_sh_common_reset that is called in this function
- */
int
-afr_sh_common_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc,
- afr_lookup_done_cbk_t lookup_done , uuid_t gfid,
- int32_t flags)
-{
- afr_local_t *local = NULL;
- int i = 0;
- int call_count = 0;
- afr_private_t *priv = NULL;
- dict_t *xattr_req = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- priv = this->private;
- sh = &local->self_heal;
-
- call_count = afr_up_children_count (priv->child_count,
- local->child_up);
-
- local->call_count = call_count;
-
- xattr_req = dict_new();
-
- if (xattr_req) {
- afr_xattr_req_prepare (this, xattr_req, loc->path);
- if (gfid) {
- gf_log (this->name, GF_LOG_DEBUG,
- "looking up %s with gfid: %s",
- loc->path, uuid_utoa (gfid));
- GF_ASSERT (!uuid_is_null (gfid));
- afr_set_dict_gfid (xattr_req, gfid);
- }
- }
+afr_selfheal_uninodelk(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, off_t off, size_t size,
+ const unsigned char *locked_on)
+{
+ loc_t loc = {
+ 0,
+ };
+ struct gf_flock flock = {
+ 0,
+ };
- afr_sh_common_reset (sh, priv->child_count);
- sh->lookup_done = lookup_done;
- loc_copy (&sh->lookup_loc, loc);
- sh->lookup_flags = flags;
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- gf_log (this->name, GF_LOG_DEBUG,
- "looking up %s on subvolume %s",
- local->loc.path, priv->children[i]->name);
-
- STACK_WIND_COOKIE (frame,
- afr_sh_common_lookup_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->lookup,
- loc, xattr_req);
-
- if (!--call_count)
- break;
- }
- }
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, inode->gfid);
- if (xattr_req)
- dict_unref (xattr_req);
+ flock.l_type = F_UNLCK;
+ flock.l_start = off;
+ flock.l_len = size;
- return 0;
-}
+ AFR_ONLIST(locked_on, frame, afr_selfheal_lock_cbk, inodelk, dom, &loc,
+ F_SETLK, &flock, NULL);
+ loc_wipe(&loc);
+ return 0;
+}
int
-afr_sh_post_nb_entrylk_conflicting_sh_cbk (call_frame_t *frame, xlator_t *this)
+afr_selfheal_tryentrylk(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, const char *name, unsigned char *locked_on)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
+ loc_t loc = {
+ 0,
+ };
- local = frame->local;
- int_lock = &local->internal_lock;
- sh = &local->self_heal;
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, inode->gfid);
- if (int_lock->lock_op_ret < 0) {
- gf_log (this->name, GF_LOG_INFO,
- "Non blocking entrylks failed.");
- sh->op_failed = -1;
- afr_sh_missing_entries_done (frame, this);
- } else {
+ AFR_ONALL(frame, afr_selfheal_lock_cbk, entrylk, dom, &loc, name,
+ ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, NULL);
- gf_log (this->name, GF_LOG_DEBUG,
- "Non blocking entrylks done. Proceeding to FOP");
- afr_sh_common_lookup (frame, this, &sh->parent_loc,
- afr_sh_find_fresh_parents,
- NULL, AFR_LOOKUP_FAIL_CONFLICTS);
- }
+ loc_wipe(&loc);
- return 0;
+ return afr_locked_fill(frame, this, locked_on);
}
int
-afr_sh_post_nb_entrylk_gfid_sh_cbk (call_frame_t *frame, xlator_t *this)
+afr_selfheal_entrylk(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, const char *name, unsigned char *locked_on)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
+ loc_t loc = {
+ 0,
+ };
+ afr_local_t *local = NULL;
+ int i = 0;
+ afr_private_t *priv = NULL;
- local = frame->local;
- sh = &local->self_heal;
- int_lock = &local->internal_lock;
+ priv = this->private;
+ local = frame->local;
- if (int_lock->lock_op_ret < 0) {
- gf_log (this->name, GF_LOG_INFO,
- "Non blocking entrylks failed.");
- afr_sh_missing_entries_done (frame, this);
- } else {
- gf_log (this->name, GF_LOG_DEBUG,
- "Non blocking entrylks done. Proceeding to FOP");
- afr_sh_common_lookup (frame, this, &local->loc,
- afr_sh_missing_entries_lookup_done,
- sh->sh_gfid_req, AFR_LOOKUP_FAIL_CONFLICTS|
- AFR_LOOKUP_FAIL_MISSING_GFIDS);
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, inode->gfid);
+
+ AFR_ONALL(frame, afr_selfheal_lock_cbk, entrylk, dom, &loc, name,
+ ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, NULL);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->replies[i].op_ret == -1 &&
+ local->replies[i].op_errno == EAGAIN) {
+ afr_locked_fill(frame, this, locked_on);
+ afr_selfheal_unentrylk(frame, this, inode, dom, name, locked_on,
+ NULL);
+
+ AFR_SEQ(frame, afr_selfheal_lock_cbk, entrylk, dom, &loc, name,
+ ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL);
+ break;
}
+ }
- return 0;
+ loc_wipe(&loc);
+
+ return afr_locked_fill(frame, this, locked_on);
}
int
-afr_sh_entrylk (call_frame_t *frame, xlator_t *this, loc_t *loc,
- char *base_name, afr_lock_cbk_t lock_cbk)
+afr_selfheal_tie_breaker_entrylk(call_frame_t *frame, xlator_t *this,
+ inode_t *inode, char *dom, const char *name,
+ unsigned char *locked_on)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
+ loc_t loc = {
+ 0,
+ };
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int lock_count = 0;
+ int eagain_count = 0;
- local = frame->local;
- int_lock = &local->internal_lock;
- sh = &local->self_heal;
+ priv = this->private;
+ local = frame->local;
- int_lock->transaction_lk_type = AFR_SELFHEAL_LK;
- int_lock->selfheal_lk_type = AFR_ENTRY_SELF_HEAL_LK;
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, inode->gfid);
- afr_set_lock_number (frame, this);
+ AFR_ONALL(frame, afr_selfheal_lock_cbk, entrylk, dom, &loc, name,
+ ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, NULL);
- int_lock->lk_basename = base_name;
- int_lock->lk_loc = loc;
- int_lock->lock_cbk = lock_cbk;
+ afr_get_lock_and_eagain_counts(priv, local->replies, &lock_count,
+ &eagain_count);
- afr_nonblocking_entrylk (frame, this);
+ if (lock_count > priv->child_count / 2 && eagain_count) {
+ afr_locked_fill(frame, this, locked_on);
+ afr_selfheal_unentrylk(frame, this, inode, dom, name, locked_on, NULL);
- return 0;
+ AFR_SEQ(frame, afr_selfheal_lock_cbk, entrylk, dom, &loc, name,
+ ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL);
+ }
+
+ loc_wipe(&loc);
+
+ return afr_locked_fill(frame, this, locked_on);
}
-static int
-afr_self_heal_parent_entrylk (call_frame_t *frame, xlator_t *this,
- afr_lock_cbk_t lock_cbk)
+int
+afr_selfheal_unentrylk(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, const char *name, unsigned char *locked_on,
+ dict_t *xdata)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
+ loc_t loc = {
+ 0,
+ };
- local = frame->local;
- int_lock = &local->internal_lock;
- sh = &local->self_heal;
- priv = this->private;
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, inode->gfid);
- gf_log (this->name, GF_LOG_TRACE,
- "attempting to recreate missing entries for path=%s",
- local->loc.path);
+ AFR_ONLIST(locked_on, frame, afr_selfheal_lock_cbk, entrylk, dom, &loc,
+ name, ENTRYLK_UNLOCK, ENTRYLK_WRLCK, xdata);
- GF_ASSERT (local->loc.parent);
- afr_build_parent_loc (&sh->parent_loc, &local->loc);
+ loc_wipe(&loc);
- afr_sh_entrylk (frame, this, &sh->parent_loc, NULL,
- lock_cbk);
- return 0;
+ return 0;
}
-static int
-afr_self_heal_conflicting_entries (call_frame_t *frame, xlator_t *this)
+gf_boolean_t
+afr_is_data_set(xlator_t *this, dict_t *xdata)
{
- afr_self_heal_parent_entrylk (frame, this,
- afr_sh_post_nb_entrylk_conflicting_sh_cbk);
- return 0;
+ return afr_is_pending_set(this, xdata, AFR_DATA_TRANSACTION);
}
-static int
-afr_self_heal_gfids (call_frame_t *frame, xlator_t *this)
+gf_boolean_t
+afr_is_metadata_set(xlator_t *this, dict_t *xdata)
{
- afr_self_heal_parent_entrylk (frame, this,
- afr_sh_post_nb_entrylk_gfid_sh_cbk);
- return 0;
+ return afr_is_pending_set(this, xdata, AFR_METADATA_TRANSACTION);
}
-afr_local_t *afr_local_copy (afr_local_t *l, xlator_t *this)
+gf_boolean_t
+afr_is_entry_set(xlator_t *this, dict_t *xdata)
{
- afr_private_t *priv = NULL;
- afr_local_t *lc = NULL;
- afr_self_heal_t *sh = NULL;
- afr_self_heal_t *shc = NULL;
+ return afr_is_pending_set(this, xdata, AFR_ENTRY_TRANSACTION);
+}
- priv = this->private;
+/*
+ * This function inspects the looked up replies (in an unlocked manner)
+ * and decides whether a locked verification and possible healing is
+ * required or not. It updates the three booleans for each type
+ * of healing. If the boolean flag gets set to FALSE, then we are sure
+ * no healing is required. If the boolean flag gets set to TRUE then
+ * we have to proceed with locked reinspection.
+ */
- sh = &l->self_heal;
+int
+afr_selfheal_unlocked_inspect(call_frame_t *frame, xlator_t *this, uuid_t gfid,
+ inode_t **link_inode, gf_boolean_t *data_selfheal,
+ gf_boolean_t *metadata_selfheal,
+ gf_boolean_t *entry_selfheal,
+ struct afr_reply *replies_dst)
+{
+ afr_private_t *priv = NULL;
+ inode_t *inode = NULL;
+ int i = 0;
+ int valid_cnt = 0;
+ struct iatt first = {
+ 0,
+ };
+ int first_idx = 0;
+ struct afr_reply *replies = NULL;
+ int ret = -1;
+
+ priv = this->private;
+
+ inode = afr_inode_find(this, gfid);
+ if (!inode)
+ goto out;
+
+ replies = alloca0(sizeof(*replies) * priv->child_count);
+
+ ret = afr_selfheal_unlocked_discover(frame, inode, gfid, replies);
+ if (ret)
+ goto out;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid)
+ continue;
+ if (replies[i].op_ret == -1)
+ continue;
+
+ /* The data segment of the changelog can be non-zero to indicate
+ * the directory needs a full heal. So the check below ensures
+ * it's not a directory before setting the data_selfheal boolean.
+ */
+ if (data_selfheal && !IA_ISDIR(replies[i].poststat.ia_type) &&
+ afr_is_data_set(this, replies[i].xdata))
+ *data_selfheal = _gf_true;
+
+ if (metadata_selfheal && afr_is_metadata_set(this, replies[i].xdata))
+ *metadata_selfheal = _gf_true;
+
+ if (entry_selfheal && afr_is_entry_set(this, replies[i].xdata))
+ *entry_selfheal = _gf_true;
+
+ valid_cnt++;
+ if (valid_cnt == 1) {
+ first = replies[i].poststat;
+ first_idx = i;
+ continue;
+ }
+
+ if (!IA_EQUAL(first, replies[i].poststat, type)) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
+ "TYPE mismatch %d vs %d on %s for gfid:%s",
+ (int)first.ia_type, (int)replies[i].poststat.ia_type,
+ priv->children[i]->name,
+ uuid_utoa(replies[i].poststat.ia_gfid));
+ gf_event(EVENT_AFR_SPLIT_BRAIN,
+ "client-pid=%d;"
+ "subvol=%s;"
+ "type=file;gfid=%s;"
+ "ia_type-%d=%s;ia_type-%d=%s",
+ this->ctx->cmd_args.client_pid, this->name,
+ uuid_utoa(replies[i].poststat.ia_gfid), first_idx,
+ gf_inode_type_to_str(first.ia_type), i,
+ gf_inode_type_to_str(replies[i].poststat.ia_type));
+ ret = -EIO;
+ goto out;
+ }
+
+ if (!IA_EQUAL(first, replies[i].poststat, uid)) {
+ gf_msg_debug(this->name, 0,
+ "UID mismatch "
+ "%d vs %d on %s for gfid:%s",
+ (int)first.ia_uid, (int)replies[i].poststat.ia_uid,
+ priv->children[i]->name,
+ uuid_utoa(replies[i].poststat.ia_gfid));
+
+ if (metadata_selfheal)
+ *metadata_selfheal = _gf_true;
+ }
+
+ if (!IA_EQUAL(first, replies[i].poststat, gid)) {
+ gf_msg_debug(this->name, 0,
+ "GID mismatch "
+ "%d vs %d on %s for gfid:%s",
+ (int)first.ia_uid, (int)replies[i].poststat.ia_uid,
+ priv->children[i]->name,
+ uuid_utoa(replies[i].poststat.ia_gfid));
+
+ if (metadata_selfheal)
+ *metadata_selfheal = _gf_true;
+ }
+
+ if (!IA_EQUAL(first, replies[i].poststat, prot)) {
+ gf_msg_debug(this->name, 0,
+ "MODE mismatch "
+ "%d vs %d on %s for gfid:%s",
+ (int)st_mode_from_ia(first.ia_prot, 0),
+ (int)st_mode_from_ia(replies[i].poststat.ia_prot, 0),
+ priv->children[i]->name,
+ uuid_utoa(replies[i].poststat.ia_gfid));
+
+ if (metadata_selfheal)
+ *metadata_selfheal = _gf_true;
+ }
+
+ if (IA_ISREG(first.ia_type) &&
+ !IA_EQUAL(first, replies[i].poststat, size)) {
+ gf_msg_debug(this->name, 0,
+ "SIZE mismatch "
+ "%lld vs %lld on %s for gfid:%s",
+ (long long)first.ia_size,
+ (long long)replies[i].poststat.ia_size,
+ priv->children[i]->name,
+ uuid_utoa(replies[i].poststat.ia_gfid));
+
+ if (data_selfheal)
+ *data_selfheal = _gf_true;
+ }
+ }
+
+ if (valid_cnt > 0 && link_inode) {
+ *link_inode = inode_link(inode, NULL, NULL, &first);
+ if (!*link_inode) {
+ ret = -EINVAL;
+ goto out;
+ }
+ } else if (valid_cnt < 2) {
+ ret = afr_check_stale_error(replies, priv);
+ goto out;
+ }
+
+ ret = 0;
+out:
+ if (replies && replies_dst)
+ afr_replies_copy(replies_dst, replies, priv->child_count);
+ if (inode)
+ inode_unref(inode);
+ if (replies)
+ afr_replies_wipe(replies, priv->child_count);
- lc = GF_CALLOC (1, sizeof (afr_local_t),
- gf_afr_mt_afr_local_t);
- if (!lc)
- goto out;
+ return ret;
+}
- shc = &lc->self_heal;
-
- shc->unwind = sh->unwind;
- shc->gfid_sh_success_cbk = sh->gfid_sh_success_cbk;
- shc->need_missing_entry_self_heal = sh->need_missing_entry_self_heal;
- shc->need_gfid_self_heal = sh->need_gfid_self_heal;
- shc->need_data_self_heal = sh->need_data_self_heal;
- shc->need_metadata_self_heal = sh->need_metadata_self_heal;
- shc->need_entry_self_heal = sh->need_entry_self_heal;
- shc->forced_merge = sh->forced_merge;
- shc->healing_fd_opened = sh->healing_fd_opened;
- shc->data_lock_held = sh->data_lock_held;
- if (sh->healing_fd && !sh->healing_fd_opened)
- shc->healing_fd = fd_ref (sh->healing_fd);
- else
- shc->healing_fd = sh->healing_fd;
- shc->background = sh->background;
- shc->type = sh->type;
-
- uuid_copy (shc->sh_gfid_req, sh->sh_gfid_req);
- if (l->loc.path)
- loc_copy (&lc->loc, &l->loc);
-
- lc->child_up = memdup (l->child_up, priv->child_count);
- if (l->xattr_req)
- lc->xattr_req = dict_ref (l->xattr_req);
-
- if (l->cont.lookup.inode)
- lc->cont.lookup.inode = inode_ref (l->cont.lookup.inode);
- if (l->cont.lookup.xattr)
- lc->cont.lookup.xattr = dict_ref (l->cont.lookup.xattr);
- if (l->internal_lock.inode_locked_nodes)
- lc->internal_lock.inode_locked_nodes =
- memdup (l->internal_lock.inode_locked_nodes,
- priv->child_count);
- else
- lc->internal_lock.inode_locked_nodes =
- GF_CALLOC (sizeof (*l->internal_lock.inode_locked_nodes),
- priv->child_count,
- gf_afr_mt_char);
- if (l->internal_lock.entry_locked_nodes)
- lc->internal_lock.entry_locked_nodes =
- memdup (l->internal_lock.entry_locked_nodes,
- priv->child_count);
- else
- lc->internal_lock.entry_locked_nodes =
- GF_CALLOC (sizeof (*l->internal_lock.entry_locked_nodes),
- priv->child_count,
- gf_afr_mt_char);
- if (l->internal_lock.locked_nodes)
- lc->internal_lock.locked_nodes =
- memdup (l->internal_lock.locked_nodes,
- priv->child_count);
- else
- lc->internal_lock.locked_nodes =
- GF_CALLOC (sizeof (*l->internal_lock.locked_nodes),
- priv->child_count,
- gf_afr_mt_char);
+inode_t *
+afr_inode_find(xlator_t *this, uuid_t gfid)
+{
+ inode_table_t *table = NULL;
+ inode_t *inode = NULL;
- lc->internal_lock.inodelk_lock_count =
- l->internal_lock.inodelk_lock_count;
- lc->internal_lock.entrylk_lock_count =
- l->internal_lock.entrylk_lock_count;
+ table = this->itable;
+ if (!table)
+ return NULL;
-out:
- return lc;
-}
+ inode = inode_find(table, gfid);
+ if (inode)
+ return inode;
-int
-afr_self_heal_completion_cbk (call_frame_t *bgsh_frame, xlator_t *this)
-{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- afr_self_heal_t * sh = NULL;
- char sh_type_str[256] = {0,};
- gf_boolean_t split_brain = _gf_false;
+ inode = inode_new(table);
+ if (!inode)
+ return NULL;
- priv = this->private;
- local = bgsh_frame->local;
- sh = &local->self_heal;
+ gf_uuid_copy(inode->gfid, gfid);
- if (local->govinda_gOvinda)
- split_brain = _gf_true;
+ return inode;
+}
- afr_set_split_brain (this, sh->inode, split_brain);
+call_frame_t *
+afr_frame_create(xlator_t *this, int32_t *op_errno)
+{
+ call_frame_t *frame = NULL;
+ afr_local_t *local = NULL;
+ pid_t pid = GF_CLIENT_PID_SELF_HEALD;
- afr_self_heal_type_str_get (sh, sh_type_str,
- sizeof(sh_type_str));
- if (sh->op_failed) {
- gf_log (this->name, GF_LOG_ERROR, "background %s self-heal "
- "failed on %s", sh_type_str, local->loc.path);
- } else {
- gf_log (this->name, GF_LOG_INFO, "background %s self-heal "
- "completed on %s", sh_type_str, local->loc.path);
- }
+ frame = create_frame(this, this->ctx->pool);
+ if (!frame) {
+ if (op_errno)
+ *op_errno = ENOMEM;
+ return NULL;
+ }
- FRAME_SU_UNDO (bgsh_frame, afr_local_t);
+ local = AFR_FRAME_INIT(frame, (*op_errno));
+ if (!local) {
+ STACK_DESTROY(frame->root);
+ return NULL;
+ }
- if (!sh->unwound) {
- sh->unwind (sh->orig_frame, this, sh->op_ret, sh->op_errno);
- }
+ syncopctx_setfspid(&pid);
- if (sh->background) {
- LOCK (&priv->lock);
- {
- priv->background_self_heals_started--;
- }
- UNLOCK (&priv->lock);
- }
+ frame->root->pid = pid;
- AFR_STACK_DESTROY (bgsh_frame);
+ afr_set_lk_owner(frame, this, frame->root);
- return 0;
+ return frame;
}
int
-afr_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int i = 0;
- int32_t op_errno = 0;
- int ret = 0;
- afr_self_heal_t *orig_sh = NULL;
- call_frame_t *sh_frame = NULL;
- afr_local_t *sh_local = NULL;
- loc_t *loc = NULL;
-
- local = frame->local;
- orig_sh = &local->self_heal;
- priv = this->private;
-
- GF_ASSERT (local->loc.path);
-
- gf_log (this->name, GF_LOG_TRACE,
- "performing self heal on %s (metadata=%d data=%d entry=%d)",
- local->loc.path,
- local->self_heal.need_metadata_self_heal,
- local->self_heal.need_data_self_heal,
- local->self_heal.need_entry_self_heal);
+afr_selfheal_newentry_mark(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ int source, struct afr_reply *replies,
+ unsigned char *sources, unsigned char *newentry)
+{
+ int ret = 0;
+ int i = 0;
+ afr_private_t *priv = NULL;
+ dict_t *xattr = NULL;
+ int **changelog = NULL;
- op_errno = ENOMEM;
- sh_frame = copy_frame (frame);
- sh_frame->root->pid = SELF_HEAL_PID;
- if (!sh_frame)
- goto out;
- afr_set_lk_owner (sh_frame, this);
+ priv = this->private;
- sh_local = afr_local_copy (local, this);
- if (!sh_local)
- goto out;
- sh_frame->local = sh_local;
- sh = &sh_local->self_heal;
+ gf_uuid_copy(inode->gfid, replies[source].poststat.ia_gfid);
- sh->orig_frame = frame;
- sh->inode = inode_ref (inode);
+ xattr = dict_new();
+ if (!xattr)
+ return -ENOMEM;
- sh->completion_cbk = afr_self_heal_completion_cbk;
+ changelog = afr_mark_pending_changelog(priv, newentry, xattr,
+ replies[source].poststat.ia_type);
- sh->success = GF_CALLOC (priv->child_count, sizeof (*sh->success),
- gf_afr_mt_char);
- if (!sh->success)
- goto out;
- sh->sources = GF_CALLOC (sizeof (*sh->sources), priv->child_count,
- gf_afr_mt_int);
- if (!sh->sources)
- goto out;
- sh->locked_nodes = GF_CALLOC (sizeof (*sh->locked_nodes),
- priv->child_count,
- gf_afr_mt_int);
- if (!sh->locked_nodes)
- goto out;
+ if (!changelog) {
+ ret = -ENOMEM;
+ goto out;
+ }
- sh->pending_matrix = GF_CALLOC (sizeof (int32_t *), priv->child_count,
- gf_afr_mt_int32_t);
- if (!sh->pending_matrix)
- goto out;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!sources[i])
+ continue;
+ ret |= afr_selfheal_post_op(frame, this, inode, i, xattr, NULL);
+ }
+out:
+ if (changelog)
+ afr_matrix_cleanup(changelog, priv->child_count);
+ if (xattr)
+ dict_unref(xattr);
+ return ret;
+}
- for (i = 0; i < priv->child_count; i++) {
- sh->pending_matrix[i] = GF_CALLOC (sizeof (int32_t),
- priv->child_count,
- gf_afr_mt_int32_t);
- if (!sh->pending_matrix[i])
- goto out;
- }
+int
+afr_selfheal_do(call_frame_t *frame, xlator_t *this, uuid_t gfid)
+{
+ int ret = -1;
+ int entry_ret = 1;
+ int metadata_ret = 1;
+ int data_ret = 1;
+ int or_ret = 0;
+ inode_t *inode = NULL;
+ fd_t *fd = NULL;
+ gf_boolean_t data_selfheal = _gf_false;
+ gf_boolean_t metadata_selfheal = _gf_false;
+ gf_boolean_t entry_selfheal = _gf_false;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+
+ ret = afr_selfheal_unlocked_inspect(frame, this, gfid, &inode,
+ &data_selfheal, &metadata_selfheal,
+ &entry_selfheal, NULL);
+ if (ret)
+ goto out;
+
+ if (!(data_selfheal || metadata_selfheal || entry_selfheal)) {
+ ret = 2;
+ goto out;
+ }
+
+ if (inode->ia_type == IA_IFREG) {
+ ret = afr_selfheal_data_open(this, inode, &fd);
+ if (!fd) {
+ ret = -EIO;
+ goto out;
+ }
+ }
+
+ if (data_selfheal && priv->data_self_heal)
+ data_ret = afr_selfheal_data(frame, this, fd);
+
+ if (metadata_selfheal && priv->metadata_self_heal)
+ metadata_ret = afr_selfheal_metadata(frame, this, inode);
+
+ if (entry_selfheal && priv->entry_self_heal)
+ entry_ret = afr_selfheal_entry(frame, this, inode);
+
+ or_ret = (data_ret | metadata_ret | entry_ret);
+
+ if (data_ret == -EIO || metadata_ret == -EIO || entry_ret == -EIO)
+ ret = -EIO;
+ else if (data_ret == 1 && metadata_ret == 1 && entry_ret == 1)
+ ret = 1;
+ else if (or_ret < 0)
+ ret = or_ret;
+ else
+ ret = 0;
- sh->delta_matrix = GF_CALLOC (sizeof (int32_t *), priv->child_count,
- gf_afr_mt_int32_t);
- if (!sh->delta_matrix)
- goto out;
- for (i = 0; i < priv->child_count; i++) {
- sh->delta_matrix[i] = GF_CALLOC (sizeof (int32_t),
- priv->child_count,
- gf_afr_mt_int32_t);
- if (!sh->delta_matrix)
- goto out;
- }
- sh->fresh_parent_dirs = afr_children_create (priv->child_count);
- if (!sh->fresh_parent_dirs)
- goto out;
- ret = afr_sh_common_create (sh, priv->child_count);
- if (ret) {
- op_errno = -ret;
- goto out;
- }
+out:
+ if (inode)
+ inode_unref(inode);
+ if (fd)
+ fd_unref(fd);
+ return ret;
+}
+/*
+ * This is the entry point for healing a given GFID. The return values for this
+ * function are as follows:
+ * '0' if the self-heal is successful
+ * '1' if the afr-xattrs are non-zero (due to on-going IO) and no heal is needed
+ * '2' if the afr-xattrs are all-zero and no heal is needed
+ * $errno if the heal on the gfid failed.
+ */
- if (local->self_heal.background) {
- LOCK (&priv->lock);
- {
- if (priv->background_self_heals_started
- < priv->background_self_heal_count) {
- priv->background_self_heals_started++;
+int
+afr_selfheal(xlator_t *this, uuid_t gfid)
+{
+ int ret = -1;
+ call_frame_t *frame = NULL;
+ afr_local_t *local = NULL;
+ frame = afr_frame_create(this, NULL);
+ if (!frame)
+ return ret;
- } else {
- local->self_heal.background = _gf_false;
- sh->background = _gf_false;
- }
- }
- UNLOCK (&priv->lock);
- }
+ local = frame->local;
+ local->xdata_req = dict_new();
- FRAME_SU_DO (sh_frame, afr_local_t);
- if (sh->need_missing_entry_self_heal) {
- afr_self_heal_conflicting_entries (sh_frame, this);
- } else if (sh->need_gfid_self_heal) {
- GF_ASSERT (!uuid_is_null (sh->sh_gfid_req));
- afr_self_heal_gfids (sh_frame, this);
- } else {
- loc = &sh_local->loc;
- if (uuid_is_null (loc->inode->gfid) && uuid_is_null (loc->gfid)) {
- if (!uuid_is_null (inode->gfid))
- GF_ASSERT (!uuid_compare (inode->gfid,
- sh->sh_gfid_req));
- uuid_copy (loc->gfid, sh->sh_gfid_req);
- }
- gf_log (this->name, GF_LOG_TRACE,
- "proceeding to metadata check on %s",
- local->loc.path);
+ ret = afr_selfheal_do(frame, this, gfid);
- afr_sh_missing_entries_done (sh_frame, this);
- }
- op_errno = 0;
+ if (frame)
+ AFR_STACK_DESTROY(frame);
-out:
- if (op_errno) {
- orig_sh->unwind (frame, this, -1, op_errno);
- }
- return 0;
+ return ret;
}
-void
-afr_self_heal_type_str_get (afr_self_heal_t *self_heal_p, char *str,
- size_t size)
+afr_local_t *
+__afr_dequeue_heals(afr_private_t *priv)
{
- GF_ASSERT (str && (size > strlen (" missing-entry gfid "
- "meta-data data entry")));
+ afr_local_t *local = NULL;
- if (self_heal_p->need_metadata_self_heal) {
- snprintf (str, size, " meta-data");
- }
+ if (list_empty(&priv->heal_waiting))
+ goto none;
+ if ((priv->background_self_heal_count > 0) &&
+ (priv->healers >= priv->background_self_heal_count))
+ goto none;
- if (self_heal_p->need_data_self_heal) {
- snprintf (str + strlen(str), size - strlen(str), " data");
- }
+ local = list_entry(priv->heal_waiting.next, afr_local_t, healer);
+ priv->heal_waiters--;
+ GF_ASSERT(priv->heal_waiters >= 0);
+ list_del_init(&local->healer);
+ list_add(&local->healer, &priv->healing);
+ priv->healers++;
+ return local;
+none:
+ gf_msg_debug(THIS->name, 0,
+ "Nothing dequeued. "
+ "Num healers: %d, Num Waiters: %d",
+ priv->healers, priv->heal_waiters);
+ return NULL;
+}
- if (self_heal_p->need_entry_self_heal) {
- snprintf (str + strlen(str), size - strlen(str), " entry");
- }
+int
+afr_refresh_selfheal_wrap(void *opaque)
+{
+ call_frame_t *heal_frame = opaque;
+ afr_local_t *local = heal_frame->local;
+ int ret = 0;
- if (self_heal_p->need_missing_entry_self_heal) {
- snprintf (str + strlen(str), size - strlen(str),
- " missing-entry");
- }
+ ret = afr_selfheal(heal_frame->this, local->refreshinode->gfid);
+ return ret;
+}
- if (self_heal_p->need_gfid_self_heal) {
- snprintf (str + strlen(str), size - strlen(str), " gfid");
- }
+int
+afr_refresh_heal_done(int ret, call_frame_t *frame, void *opaque)
+{
+ call_frame_t *heal_frame = opaque;
+ xlator_t *this = heal_frame->this;
+ afr_private_t *priv = this->private;
+ afr_local_t *local = heal_frame->local;
+
+ LOCK(&priv->lock);
+ {
+ list_del_init(&local->healer);
+ priv->healers--;
+ GF_ASSERT(priv->healers >= 0);
+ local = __afr_dequeue_heals(priv);
+ }
+ UNLOCK(&priv->lock);
+
+ AFR_STACK_DESTROY(heal_frame);
+
+ if (local)
+ afr_heal_synctask(this, local);
+ return 0;
}
-afr_self_heal_type
-afr_self_heal_type_for_transaction (afr_transaction_type type)
+void
+afr_heal_synctask(xlator_t *this, afr_local_t *local)
{
- afr_self_heal_type sh_type = AFR_SELF_HEAL_INVALID;
+ int ret = 0;
+ call_frame_t *heal_frame = NULL;
- switch (type) {
- case AFR_DATA_TRANSACTION:
- sh_type = AFR_SELF_HEAL_DATA;
- break;
- case AFR_METADATA_TRANSACTION:
- sh_type = AFR_SELF_HEAL_METADATA;
- break;
- case AFR_ENTRY_TRANSACTION:
- sh_type = AFR_SELF_HEAL_ENTRY;
- break;
- case AFR_ENTRY_RENAME_TRANSACTION:
- GF_ASSERT (0);
- break;
+ heal_frame = local->heal_frame;
+ ret = synctask_new(this->ctx->env, afr_refresh_selfheal_wrap,
+ afr_refresh_heal_done, heal_frame, heal_frame);
+ if (ret < 0)
+ /* Heal not launched. Will be queued when the next inode
+ * refresh happens and shd hasn't healed it yet. */
+ afr_refresh_heal_done(ret, heal_frame, heal_frame);
+}
+
+gf_boolean_t
+afr_throttled_selfheal(call_frame_t *frame, xlator_t *this)
+{
+ gf_boolean_t can_heal = _gf_true;
+ afr_private_t *priv = this->private;
+ afr_local_t *local = frame->local;
+
+ LOCK(&priv->lock);
+ {
+ if ((priv->background_self_heal_count > 0) &&
+ (priv->heal_wait_qlen + priv->background_self_heal_count) >
+ (priv->heal_waiters + priv->healers)) {
+ list_add_tail(&local->healer, &priv->heal_waiting);
+ priv->heal_waiters++;
+ local = __afr_dequeue_heals(priv);
+ } else {
+ can_heal = _gf_false;
}
- return sh_type;
+ }
+ UNLOCK(&priv->lock);
+
+ if (can_heal) {
+ if (local)
+ afr_heal_synctask(this, local);
+ else
+ gf_msg_debug(this->name, 0,
+ "Max number of heals are "
+ "pending, background self-heal rejected.");
+ }
+
+ return can_heal;
}
int
-afr_build_child_loc (xlator_t *this, loc_t *child, loc_t *parent, char *name, uuid_t gfid)
+afr_choose_source_by_policy(afr_private_t *priv, unsigned char *sources,
+ afr_transaction_type type)
{
- int ret = -1;
+ int source = -1;
+ int i = 0;
- if (!child) {
- goto out;
- }
+ /* Give preference to local child to save on bandwidth */
+ for (i = 0; i < priv->child_count; i++) {
+ if (priv->local[i] && sources[i]) {
+ if ((type == AFR_DATA_TRANSACTION) && AFR_IS_ARBITER_BRICK(priv, i))
+ continue;
- if (strcmp (parent->path, "/") == 0)
- ret = gf_asprintf ((char **)&child->path, "/%s", name);
- else
- ret = gf_asprintf ((char **)&child->path, "%s/%s", parent->path,
- name);
+ source = i;
+ goto out;
+ }
+ }
- if (-1 == ret) {
- gf_log (this->name, GF_LOG_ERROR,
- "asprintf failed while setting child path");
+ for (i = 0; i < priv->child_count; i++) {
+ if (sources[i]) {
+ source = i;
+ goto out;
}
+ }
+out:
+ return source;
+}
- if (!child->path) {
- goto out;
+static int
+afr_anon_inode_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ afr_local_t *local = frame->local;
+ int i = (long)cookie;
+
+ local->replies[i].valid = 1;
+ local->replies[i].op_ret = op_ret;
+ local->replies[i].op_errno = op_errno;
+ if (op_ret == 0) {
+ local->op_ret = 0;
+ local->replies[i].poststat = *buf;
+ local->replies[i].preparent = *preparent;
+ local->replies[i].postparent = *postparent;
+ }
+ if (xdata) {
+ local->replies[i].xdata = dict_ref(xdata);
+ }
+
+ syncbarrier_wake(&local->barrier);
+ return 0;
+}
+
+int
+afr_anon_inode_create(xlator_t *this, int child, inode_t **linked_inode)
+{
+ call_frame_t *frame = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = this->private;
+ unsigned char *mkdir_on = alloca0(priv->child_count);
+ unsigned char *lookup_on = alloca0(priv->child_count);
+ loc_t loc = {0};
+ int32_t op_errno = 0;
+ int32_t child_op_errno = 0;
+ struct iatt iatt = {0};
+ dict_t *xdata = NULL;
+ uuid_t anon_inode_gfid = {0};
+ int mkdir_count = 0;
+ int i = 0;
+
+ /*Try to mkdir everywhere and return success if the dir exists on 'child'
+ */
+
+ if (!priv->use_anon_inode) {
+ op_errno = EINVAL;
+ goto out;
+ }
+
+ frame = afr_frame_create(this, &op_errno);
+ if (op_errno) {
+ goto out;
+ }
+ local = frame->local;
+ if (!local->child_up[child]) {
+ /*Other bricks may need mkdir so don't error out yet*/
+ child_op_errno = ENOTCONN;
+ }
+ gf_uuid_parse(priv->anon_gfid_str, anon_inode_gfid);
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->child_up[i])
+ continue;
+
+ if (priv->anon_inode[i]) {
+ mkdir_on[i] = 0;
+ } else {
+ mkdir_on[i] = 1;
+ mkdir_count++;
}
+ }
- child->name = strrchr (child->path, '/');
- if (child->name)
- child->name++;
+ if (mkdir_count == 0) {
+ *linked_inode = inode_find(this->itable, anon_inode_gfid);
+ if (*linked_inode) {
+ op_errno = 0;
+ goto out;
+ }
+ }
- child->parent = inode_ref (parent->inode);
- child->inode = inode_new (parent->inode->table);
+ loc.parent = inode_ref(this->itable->root);
+ loc.name = priv->anon_inode_name;
+ loc.inode = inode_new(this->itable);
+ if (!loc.inode) {
+ op_errno = ENOMEM;
+ goto out;
+ }
- if (!child->inode) {
- ret = -1;
- goto out;
+ xdata = dict_new();
+ if (!xdata) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ op_errno = -dict_set_gfuuid(xdata, "gfid-req", anon_inode_gfid, _gf_true);
+ if (op_errno) {
+ goto out;
+ }
+
+ if (mkdir_count == 0) {
+ memcpy(lookup_on, local->child_up, priv->child_count);
+ goto lookup;
+ }
+
+ AFR_ONLIST(mkdir_on, frame, afr_anon_inode_mkdir_cbk, mkdir, &loc, 0755, 0,
+ xdata);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!mkdir_on[i]) {
+ continue;
+ }
+
+ if (local->replies[i].op_ret == 0) {
+ priv->anon_inode[i] = 1;
+ iatt = local->replies[i].poststat;
+ } else if (local->replies[i].op_ret < 0 &&
+ local->replies[i].op_errno == EEXIST) {
+ lookup_on[i] = 1;
+ } else if (i == child) {
+ child_op_errno = local->replies[i].op_errno;
+ }
+ }
+
+ if (AFR_COUNT(lookup_on, priv->child_count) == 0) {
+ goto link;
+ }
+
+lookup:
+ AFR_ONLIST(lookup_on, frame, afr_selfheal_discover_cbk, lookup, &loc,
+ xdata);
+ for (i = 0; i < priv->child_count; i++) {
+ if (!lookup_on[i]) {
+ continue;
+ }
+
+ if (local->replies[i].op_ret == 0) {
+ if (gf_uuid_compare(anon_inode_gfid,
+ local->replies[i].poststat.ia_gfid) == 0) {
+ priv->anon_inode[i] = 1;
+ iatt = local->replies[i].poststat;
+ } else {
+ if (i == child)
+ child_op_errno = EINVAL;
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_DATA,
+ "%s has gfid: %s", priv->anon_inode_name,
+ uuid_utoa(local->replies[i].poststat.ia_gfid));
+ }
+ } else if (i == child) {
+ child_op_errno = local->replies[i].op_errno;
+ }
+ }
+link:
+ if (!gf_uuid_is_null(iatt.ia_gfid)) {
+ *linked_inode = inode_link(loc.inode, loc.parent, loc.name, &iatt);
+ if (*linked_inode) {
+ op_errno = 0;
+ inode_lookup(*linked_inode);
+ } else {
+ op_errno = ENOMEM;
}
- uuid_copy (child->gfid, gfid);
+ goto out;
+ }
- ret = 0;
out:
- if (ret == -1)
- loc_wipe (child);
-
- return ret;
+ if (xdata)
+ dict_unref(xdata);
+ loc_wipe(&loc);
+ /*child_op_errno takes precedence*/
+ if (child_op_errno == 0) {
+ child_op_errno = op_errno;
+ }
+
+ if (child_op_errno && *linked_inode) {
+ inode_unref(*linked_inode);
+ *linked_inode = NULL;
+ }
+ if (frame)
+ AFR_STACK_DESTROY(frame);
+ return -child_op_errno;
}