diff options
Diffstat (limited to 'xlators/cluster/afr/src/afr-self-heal-common.c')
| -rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-common.c | 4055 |
1 files changed, 2584 insertions, 1471 deletions
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c index abc9ccb0fbe..a580a1584cc 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.c +++ b/xlators/cluster/afr/src/afr-self-heal-common.c @@ -1,1821 +1,2934 @@ /* - Copyright (c) 2008-2010 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ -#include "glusterfs.h" -#include "xlator.h" -#include "byte-order.h" - #include "afr.h" -#include "afr-transaction.h" -#include "afr-self-heal-common.h" #include "afr-self-heal.h" -#include "pump.h" +#include <glusterfs/byte-order.h> +#include "protocol-common.h" +#include "afr-messages.h" +#include <glusterfs/events.h> -/** - * select_source - select a source and return it - */ +void +afr_heal_synctask(xlator_t *this, afr_local_t *local); int -afr_sh_select_source (int sources[], int child_count) +afr_lookup_and_heal_gfid(xlator_t *this, inode_t *parent, const char *name, + inode_t *inode, struct afr_reply *replies, int source, + unsigned char *sources, void *gfid, int *gfid_idx) { - int i = 0; - for (i = 0; i < child_count; i++) - if (sources[i]) - return i; + afr_private_t *priv = NULL; + call_frame_t *frame = NULL; + afr_local_t *local = NULL; + unsigned char *wind_on = NULL; + ia_type_t ia_type = IA_INVAL; + dict_t *xdata = NULL; + loc_t loc = { + 0, + }; + int ret = 0; + int i = 0; + + priv = this->private; + wind_on = alloca0(priv->child_count); + if (source >= 0 && replies[source].valid && replies[source].op_ret == 0) + ia_type = replies[source].poststat.ia_type; + + if (ia_type != IA_INVAL) + goto heal; + + /* If ia_type is still invalid, it means either + * (a)'source' was -1, i.e. parent dir pending xattrs are in split-brain + * (or) (b) The parent dir pending xattrs are all zeroes (i.e. all bricks + * are sources) and the 'source' we selected earlier might be the one where + * the file is not actually present. + * + * In both cases, let us pick a brick with a successful reply and use its + * ia_type. + * */ + for (i = 0; i < priv->child_count; i++) { + if (source == -1) { + /* case (a) above. */ + if (replies[i].valid && replies[i].op_ret == 0 && + replies[i].poststat.ia_type != IA_INVAL) { + ia_type = replies[i].poststat.ia_type; + break; + } + } else { + /* case (b) above. */ + if (i == source) + continue; + if (sources[i] && replies[i].valid && replies[i].op_ret == 0 && + replies[i].poststat.ia_type != IA_INVAL) { + ia_type = replies[i].poststat.ia_type; + break; + } + } + } - return -1; -} +heal: + /* gfid heal on those subvolumes that do not have gfid associated + * with the inode and update those replies. + */ + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid || replies[i].op_ret != 0) + continue; + if (gf_uuid_is_null(gfid) && + !gf_uuid_is_null(replies[i].poststat.ia_gfid) && + replies[i].poststat.ia_type == ia_type) + gfid = replies[i].poststat.ia_gfid; -/** - * sink_count - return number of sinks in sources array - */ + if (!gf_uuid_is_null(replies[i].poststat.ia_gfid) || + replies[i].poststat.ia_type != ia_type) + continue; + + wind_on[i] = 1; + } + + if (AFR_COUNT(wind_on, priv->child_count) == 0) + return 0; + + xdata = dict_new(); + if (!xdata) { + ret = -ENOMEM; + goto out; + } + + ret = dict_set_gfuuid(xdata, "gfid-req", gfid, true); + if (ret) { + ret = -ENOMEM; + goto out; + } + + frame = afr_frame_create(this, &ret); + if (!frame) { + ret = -ret; + goto out; + } + + local = frame->local; + loc.parent = inode_ref(parent); + gf_uuid_copy(loc.pargfid, parent->gfid); + loc.name = name; + loc.inode = inode_ref(inode); + + AFR_ONLIST(wind_on, frame, afr_selfheal_discover_cbk, lookup, &loc, xdata); + + for (i = 0; i < priv->child_count; i++) { + if (!wind_on[i]) + continue; + afr_reply_wipe(&replies[i]); + afr_reply_copy(&replies[i], &local->replies[i]); + } + if (gfid_idx && (*gfid_idx == -1)) { + /*Pick a brick where the gifd heal was successful.*/ + for (i = 0; i < priv->child_count; i++) { + if (!wind_on[i]) + continue; + if (replies[i].valid && replies[i].op_ret == 0 && + !gf_uuid_is_null(replies[i].poststat.ia_gfid)) { + *gfid_idx = i; + break; + } + } + } +out: + if (gfid_idx && (*gfid_idx == -1) && (ret == 0) && local) { + ret = -afr_final_errno(local, priv); + } + loc_wipe(&loc); + if (frame) + AFR_STACK_DESTROY(frame); + if (xdata) + dict_unref(xdata); + + return ret; +} int -afr_sh_sink_count (int sources[], int child_count) +afr_gfid_sbrain_source_from_src_brick(xlator_t *this, struct afr_reply *replies, + char *src_brick) { - int i = 0; - int sinks = 0; - for (i = 0; i < child_count; i++) - if (!sources[i]) - sinks++; - return sinks; + int i = 0; + afr_private_t *priv = NULL; + + priv = this->private; + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid || replies[i].op_ret == -1) + continue; + if (strcmp(priv->children[i]->name, src_brick) == 0) + return i; + } + return -1; } int -afr_sh_source_count (int sources[], int child_count) +afr_selfheal_gfid_mismatch_by_majority(struct afr_reply *replies, + int child_count) { - int i = 0; - int nsource = 0; + int j = 0; + int i = 0; + int votes; - for (i = 0; i < child_count; i++) - if (sources[i]) - nsource++; - return nsource; + for (i = 0; i < child_count; i++) { + if (!replies[i].valid || replies[i].op_ret == -1) + continue; + + votes = 1; + for (j = i + 1; j < child_count; j++) { + if ((!gf_uuid_compare(replies[i].poststat.ia_gfid, + replies[j].poststat.ia_gfid))) + votes++; + if (votes > child_count / 2) + return i; + } + } + + return -1; } -void -afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this) +int +afr_gfid_sbrain_source_from_bigger_file(struct afr_reply *replies, + int child_count) { - afr_private_t * priv = this->private; - char *buf = NULL; - char *ptr = NULL; - int i = 0; - int j = 0; + int i = 0; + int src = -1; + uint64_t size = 0; + + for (i = 0; i < child_count; i++) { + if (!replies[i].valid || replies[i].op_ret == -1) + continue; + if (size < replies[i].poststat.ia_size) { + src = i; + size = replies[i].poststat.ia_size; + } else if (replies[i].poststat.ia_size == size) { + src = -1; + } + } + return src; +} - /* 10 digits per entry + 1 space + '[' and ']' */ - buf = GF_MALLOC (priv->child_count * 11 + 8, gf_afr_mt_char); +int +afr_gfid_sbrain_source_from_latest_mtime(struct afr_reply *replies, + int child_count) +{ + int i = 0; + int src = -1; + uint32_t mtime = 0; + uint32_t mtime_nsec = 0; + + for (i = 0; i < child_count; i++) { + if (!replies[i].valid || replies[i].op_ret != 0) + continue; + if ((mtime < replies[i].poststat.ia_mtime) || + ((mtime == replies[i].poststat.ia_mtime) && + (mtime_nsec < replies[i].poststat.ia_mtime_nsec))) { + src = i; + mtime = replies[i].poststat.ia_mtime; + mtime_nsec = replies[i].poststat.ia_mtime_nsec; + } else if ((mtime == replies[i].poststat.ia_mtime) && + (mtime_nsec == replies[i].poststat.ia_mtime_nsec)) { + src = -1; + } + } + return src; +} - for (i = 0; i < priv->child_count; i++) { - ptr = buf; - ptr += sprintf (ptr, "[ "); - for (j = 0; j < priv->child_count; j++) { - ptr += sprintf (ptr, "%d ", pending_matrix[i][j]); +int +afr_gfid_split_brain_source(xlator_t *this, struct afr_reply *replies, + inode_t *inode, uuid_t pargfid, const char *bname, + int src_idx, int child_idx, + unsigned char *locked_on, int *src, dict_t *xdata) +{ + afr_private_t *priv = NULL; + char g1[64] = { + 0, + }; + char g2[64] = { + 0, + }; + int up_count = 0; + int heal_op = -1; + int ret = -1; + char *src_brick = NULL; + + *src = -1; + priv = this->private; + up_count = AFR_COUNT(locked_on, priv->child_count); + if (up_count != priv->child_count) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN, + "All the bricks should be up to resolve the gfid split " + "barin"); + if (xdata) { + ret = dict_set_sizen_str_sizen(xdata, "gfid-heal-msg", + SALL_BRICKS_UP_TO_RESOLVE); + if (ret) + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_DICT_SET_FAILED, + "Error setting" + " gfid-heal-msg dict"); + } + goto out; + } + + if (xdata) { + ret = dict_get_int32_sizen(xdata, "heal-op", &heal_op); + if (ret) + goto fav_child; + } else { + goto fav_child; + } + + switch (heal_op) { + case GF_SHD_OP_SBRAIN_HEAL_FROM_BIGGER_FILE: + *src = afr_gfid_sbrain_source_from_bigger_file(replies, + priv->child_count); + if (*src == -1) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN, + SNO_BIGGER_FILE); + if (xdata) { + ret = dict_set_sizen_str_sizen(xdata, "gfid-heal-msg", + SNO_BIGGER_FILE); + if (ret) + gf_msg(this->name, GF_LOG_ERROR, 0, + AFR_MSG_DICT_SET_FAILED, + "Error" + " setting gfid-heal-msg dict"); } - sprintf (ptr, "]"); - gf_log (this->name, GF_LOG_TRACE, - "pending_matrix: %s", buf); - } + } + break; + + case GF_SHD_OP_SBRAIN_HEAL_FROM_LATEST_MTIME: + *src = afr_gfid_sbrain_source_from_latest_mtime(replies, + priv->child_count); + if (*src == -1) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN, + SNO_DIFF_IN_MTIME); + if (xdata) { + ret = dict_set_sizen_str_sizen(xdata, "gfid-heal-msg", + SNO_DIFF_IN_MTIME); + if (ret) + gf_msg(this->name, GF_LOG_ERROR, 0, + AFR_MSG_DICT_SET_FAILED, + "Error" + "setting gfid-heal-msg dict"); + } + } + break; + + case GF_SHD_OP_SBRAIN_HEAL_FROM_BRICK: + ret = dict_get_str_sizen(xdata, "child-name", &src_brick); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN, + "Error getting the source " + "brick"); + break; + } + *src = afr_gfid_sbrain_source_from_src_brick(this, replies, + src_brick); + if (*src == -1) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN, + SERROR_GETTING_SRC_BRICK); + if (xdata) { + ret = dict_set_sizen_str_sizen(xdata, "gfid-heal-msg", + SERROR_GETTING_SRC_BRICK); + if (ret) + gf_msg(this->name, GF_LOG_ERROR, 0, + AFR_MSG_DICT_SET_FAILED, + "Error" + " setting gfid-heal-msg dict"); + } + } + break; - GF_FREE (buf); + default: + break; + } + goto out; + +fav_child: + switch (priv->fav_child_policy) { + case AFR_FAV_CHILD_BY_SIZE: + *src = afr_sh_fav_by_size(this, replies, inode); + break; + case AFR_FAV_CHILD_BY_MTIME: + *src = afr_sh_fav_by_mtime(this, replies, inode); + break; + case AFR_FAV_CHILD_BY_CTIME: + *src = afr_sh_fav_by_ctime(this, replies, inode); + break; + case AFR_FAV_CHILD_BY_MAJORITY: + if (priv->child_count != 2) + *src = afr_selfheal_gfid_mismatch_by_majority( + replies, priv->child_count); + else + *src = -1; + + if (*src == -1) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN, + "No majority to resolve " + "gfid split brain"); + } + break; + default: + break; + } + +out: + if (*src == -1) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN, + "Gfid mismatch detected for <gfid:%s>/%s>, %s on %s and" + " %s on %s.", + uuid_utoa(pargfid), bname, + uuid_utoa_r(replies[child_idx].poststat.ia_gfid, g1), + priv->children[child_idx]->name, + uuid_utoa_r(replies[src_idx].poststat.ia_gfid, g2), + priv->children[src_idx]->name); + gf_event(EVENT_AFR_SPLIT_BRAIN, + "client-pid=%d;" + "subvol=%s;type=gfid;file=" + "<gfid:%s>/%s>;count=2;child-%d=%s;gfid-%d=%s;" + "child-%d=%s;gfid-%d=%s", + this->ctx->cmd_args.client_pid, this->name, uuid_utoa(pargfid), + bname, child_idx, priv->children[child_idx]->name, child_idx, + uuid_utoa_r(replies[child_idx].poststat.ia_gfid, g1), src_idx, + priv->children[src_idx]->name, src_idx, + uuid_utoa_r(replies[src_idx].poststat.ia_gfid, g2)); + return -1; + } + return 0; } -void -afr_init_pending_matrix (int32_t **pending_matrix, size_t child_count) +int +afr_selfheal_post_op_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xattr, dict_t *xdata) { - int i = 0; - int j = 0; + afr_local_t *local = NULL; - GF_ASSERT (pending_matrix); + local = frame->local; - for (i = 0; i < child_count; i++) { - for (j = 0; j < child_count; j++) { - pending_matrix[i][j] = 0; - } - } -} + local->op_ret = op_ret; + local->op_errno = op_errno; + syncbarrier_wake(&local->barrier); -void -afr_mark_ignorant_subvols_as_pending (int32_t **pending_matrix, - unsigned char *ignorant_subvols, - size_t child_count) -{ - int i = 0; - int j = 0; - - GF_ASSERT (pending_matrix); - GF_ASSERT (ignorant_subvols); - - for (i = 0; i < child_count; i++) { - if (ignorant_subvols[i]) { - for (j = 0; j < child_count; j++) { - if (!ignorant_subvols[j]) - pending_matrix[j][i] += 1; - } - } - } + return 0; } int -afr_build_pending_matrix (char **pending_key, int32_t **pending_matrix, - dict_t *xattr[], afr_transaction_type type, - size_t child_count) -{ - /* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. */ - int32_t pending[3] = {0,}; - void *pending_raw = NULL; - int ret = -1; - int i = 0; - int j = 0; - int k = 0; - unsigned char *ignorant_subvols = NULL; - - ignorant_subvols = GF_CALLOC (sizeof (*ignorant_subvols), child_count, - gf_afr_mt_char); - if (NULL == ignorant_subvols) - goto out; +afr_selfheal_post_op(call_frame_t *frame, xlator_t *this, inode_t *inode, + int subvol, dict_t *xattr, dict_t *xdata) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + loc_t loc = { + 0, + }; + int ret = 0; - afr_init_pending_matrix (pending_matrix, child_count); + priv = this->private; + local = frame->local; - for (i = 0; i < child_count; i++) { - pending_raw = NULL; + loc.inode = inode_ref(inode); + gf_uuid_copy(loc.gfid, inode->gfid); - for (j = 0; j < child_count; j++) { - ret = dict_get_ptr (xattr[i], pending_key[j], - &pending_raw); + local->op_ret = 0; - if (ret != 0) { - /* - * There is no xattr present. This means this - * subvolume should be considered an 'ignorant' - * subvolume. - */ + STACK_WIND(frame, afr_selfheal_post_op_cbk, priv->children[subvol], + priv->children[subvol]->fops->xattrop, &loc, + GF_XATTROP_ADD_ARRAY, xattr, xdata); - ignorant_subvols[i] = 1; - continue; - } + syncbarrier_wait(&local->barrier, 1); + if (local->op_ret < 0) + ret = -local->op_errno; - memcpy (pending, pending_raw, sizeof(pending)); - k = afr_index_for_transaction_type (type); + loc_wipe(&loc); + local->op_ret = 0; - pending_matrix[i][j] = ntoh32 (pending[k]); - } - } + return ret; +} - afr_mark_ignorant_subvols_as_pending (pending_matrix, - ignorant_subvols, - child_count); - GF_FREE (ignorant_subvols); -out: - return ret; +int +afr_check_stale_error(struct afr_reply *replies, afr_private_t *priv) +{ + int i = 0; + int op_errno = 0; + int tmp_errno = 0; + int stale_count = 0; + + for (i = 0; i < priv->child_count; i++) { + tmp_errno = replies[i].op_errno; + if (tmp_errno == ENOENT || tmp_errno == ESTALE) { + op_errno = afr_higher_errno(op_errno, tmp_errno); + stale_count++; + } + } + if (stale_count != priv->child_count) + return -ENOTCONN; + else + return -op_errno; } +int +afr_sh_generic_fop_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *pre, + struct iatt *post, dict_t *xdata) +{ + int i = (long)cookie; + afr_local_t *local = NULL; -/** - * mark_sources: Mark all 'source' nodes and return number of source - * nodes found - * - * A node (a row in the pending matrix) belongs to one of - * three categories: - * - * M is the pending matrix. - * - * 'innocent' - M[i] is all zeroes - * 'fool' - M[i] has i'th element = 1 (self-reference) - * 'wise' - M[i] has i'th element = 0, others are 1 or 0. - * - * All 'innocent' nodes are sinks. If all nodes are innocent, no self-heal is - * needed. - * - * A 'wise' node can be a source. If two 'wise' nodes conflict, it is - * a split-brain. If one wise node refers to the other but the other doesn't - * refer back, the referrer is a source. - * - * All fools are sinks, unless there are no 'wise' nodes. In that case, - * one of the fools is made a source. - */ + local = frame->local; -typedef enum { - AFR_NODE_INNOCENT, - AFR_NODE_FOOL, - AFR_NODE_WISE, - AFR_NODE_INVALID = -1, -} afr_node_type; + local->replies[i].valid = 1; + local->replies[i].op_ret = op_ret; + local->replies[i].op_errno = op_errno; + if (pre) + local->replies[i].prestat = *pre; + if (post) + local->replies[i].poststat = *post; + if (xdata) + local->replies[i].xdata = dict_ref(xdata); -typedef struct { - afr_node_type type; - int wisdom; -} afr_node_character; + syncbarrier_wake(&local->barrier); + return 0; +} -static int -afr_sh_is_innocent (int32_t *array, int child_count) +int +afr_selfheal_restore_time(call_frame_t *frame, xlator_t *this, inode_t *inode, + int source, unsigned char *healed_sinks, + struct afr_reply *replies) { - int i = 0; - int ret = 1; /* innocent until proven guilty */ + loc_t loc = { + 0, + }; - for (i = 0; i < child_count; i++) { - if (array[i]) { - ret = 0; - break; - } - } + loc.inode = inode_ref(inode); + gf_uuid_copy(loc.gfid, inode->gfid); - return ret; -} + AFR_ONLIST(healed_sinks, frame, afr_sh_generic_fop_cbk, setattr, &loc, + &replies[source].poststat, + (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME | GF_SET_ATTR_CTIME), + NULL); + loc_wipe(&loc); -static int -afr_sh_is_fool (int32_t *array, int i, int child_count) -{ - return array[i]; /* fool if accuses itself */ + return 0; } +dict_t * +afr_selfheal_output_xattr(xlator_t *this, gf_boolean_t is_full_crawl, + afr_transaction_type type, int *output_dirty, + int **output_matrix, int subvol, + int **full_heal_mtx_out) +{ + int j = 0; + int idx = 0; + int d_idx = 0; + int ret = 0; + int *raw = 0; + dict_t *xattr = NULL; + afr_private_t *priv = NULL; + + priv = this->private; + idx = afr_index_for_transaction_type(type); + d_idx = afr_index_for_transaction_type(AFR_DATA_TRANSACTION); + + xattr = dict_new(); + if (!xattr) + return NULL; + + /* clear dirty */ + raw = GF_CALLOC(sizeof(int), AFR_NUM_CHANGE_LOGS, gf_afr_mt_int32_t); + if (!raw) + goto err; + + raw[idx] = hton32(output_dirty[subvol]); + ret = dict_set_bin(xattr, AFR_DIRTY, raw, + sizeof(int) * AFR_NUM_CHANGE_LOGS); + if (ret) { + GF_FREE(raw); + goto err; + } + + /* clear/set pending */ + for (j = 0; j < priv->child_count; j++) { + raw = GF_CALLOC(sizeof(int), AFR_NUM_CHANGE_LOGS, gf_afr_mt_int32_t); + if (!raw) + goto err; + + raw[idx] = hton32(output_matrix[subvol][j]); + if (is_full_crawl) + raw[d_idx] = hton32(full_heal_mtx_out[subvol][j]); + + ret = dict_set_bin(xattr, priv->pending_key[j], raw, + sizeof(int) * AFR_NUM_CHANGE_LOGS); + if (ret) { + GF_FREE(raw); + goto err; + } + } + + return xattr; +err: + if (xattr) + dict_unref(xattr); + return NULL; +} -static int -afr_sh_is_wise (int32_t *array, int i, int child_count) +int +afr_selfheal_undo_pending(call_frame_t *frame, xlator_t *this, inode_t *inode, + unsigned char *sources, unsigned char *sinks, + unsigned char *healed_sinks, + unsigned char *undid_pending, + afr_transaction_type type, struct afr_reply *replies, + unsigned char *locked_on) { - return !array[i]; /* wise if does not accuse itself */ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int i = 0; + int j = 0; + unsigned char *pending = NULL; + int *input_dirty = NULL; + int **input_matrix = NULL; + int **full_heal_mtx_in = NULL; + int **full_heal_mtx_out = NULL; + int *output_dirty = NULL; + int **output_matrix = NULL; + dict_t *xattr = NULL; + dict_t *xdata = NULL; + + priv = this->private; + local = frame->local; + + pending = alloca0(priv->child_count); + + input_dirty = alloca0(priv->child_count * sizeof(int)); + input_matrix = ALLOC_MATRIX(priv->child_count, int); + full_heal_mtx_in = ALLOC_MATRIX(priv->child_count, int); + full_heal_mtx_out = ALLOC_MATRIX(priv->child_count, int); + output_dirty = alloca0(priv->child_count * sizeof(int)); + output_matrix = ALLOC_MATRIX(priv->child_count, int); + + xdata = dict_new(); + if (!xdata) + return -1; + + afr_selfheal_extract_xattr(this, replies, type, input_dirty, input_matrix); + + if (local->need_full_crawl) + afr_selfheal_extract_xattr(this, replies, AFR_DATA_TRANSACTION, NULL, + full_heal_mtx_in); + + for (i = 0; i < priv->child_count; i++) + if (sinks[i] && !healed_sinks[i]) + pending[i] = 1; + + for (i = 0; i < priv->child_count; i++) { + for (j = 0; j < priv->child_count; j++) { + if (pending[j]) { + output_matrix[i][j] = 1; + if (type == AFR_ENTRY_TRANSACTION) + full_heal_mtx_out[i][j] = 1; + } else if (locked_on[j]) { + output_matrix[i][j] = -input_matrix[i][j]; + if (type == AFR_ENTRY_TRANSACTION) + full_heal_mtx_out[i][j] = -full_heal_mtx_in[i][j]; + } + } + } + + for (i = 0; i < priv->child_count; i++) { + if (!pending[i]) + output_dirty[i] = -input_dirty[i]; + } + + for (i = 0; i < priv->child_count; i++) { + if (!locked_on[i]) + /* perform post-op only on subvols we had locked + and inspected on. + */ + continue; + if (undid_pending[i]) + /* We already unset the pending xattrs in + * _afr_fav_child_reset_sink_xattrs(). */ + continue; + + xattr = afr_selfheal_output_xattr(this, local->need_full_crawl, type, + output_dirty, output_matrix, i, + full_heal_mtx_out); + if (!xattr) { + continue; + } + + if ((type == AFR_ENTRY_TRANSACTION) && (priv->esh_granular)) { + if (xdata && dict_set_int8(xdata, GF_XATTROP_PURGE_INDEX, 1)) + gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_DICT_SET_FAILED, + "Failed to set" + " dict value for %s", + GF_XATTROP_PURGE_INDEX); + } + + afr_selfheal_post_op(frame, this, inode, i, xattr, xdata); + dict_unref(xattr); + } + + if (xdata) + dict_unref(xdata); + + return 0; } +void +afr_reply_copy(struct afr_reply *dst, struct afr_reply *src) +{ + dict_t *xdata = NULL; + + dst->valid = src->valid; + dst->op_ret = src->op_ret; + dst->op_errno = src->op_errno; + dst->prestat = src->prestat; + dst->poststat = src->poststat; + dst->preparent = src->preparent; + dst->postparent = src->postparent; + dst->preparent2 = src->preparent2; + dst->postparent2 = src->postparent2; + if (src->xdata) + xdata = dict_ref(src->xdata); + else + xdata = NULL; + if (dst->xdata) + dict_unref(dst->xdata); + dst->xdata = xdata; + if (xdata && dict_get_str_boolean(xdata, "fips-mode-rchecksum", + _gf_false) == _gf_true) { + memcpy(dst->checksum, src->checksum, SHA256_DIGEST_LENGTH); + } else { + memcpy(dst->checksum, src->checksum, MD5_DIGEST_LENGTH); + } + dst->fips_mode_rchecksum = src->fips_mode_rchecksum; +} -static int -afr_sh_all_nodes_innocent (afr_node_character *characters, - int child_count) +void +afr_replies_copy(struct afr_reply *dst, struct afr_reply *src, int count) { - int i = 0; - int ret = 1; + int i = 0; - for (i = 0; i < child_count; i++) { - if (characters[i].type != AFR_NODE_INNOCENT) { - ret = 0; - break; - } - } + if (dst == src) + return; - return ret; + for (i = 0; i < count; i++) { + afr_reply_copy(&dst[i], &src[i]); + } } - -static int -afr_sh_wise_nodes_exist (afr_node_character *characters, int child_count) +int +afr_selfheal_fill_dirty(xlator_t *this, int *dirty, int subvol, int idx, + dict_t *xdata) { - int i = 0; - int ret = 0; + void *pending_raw = NULL; + int pending[3] = { + 0, + }; - for (i = 0; i < child_count; i++) { - if (characters[i].type == AFR_NODE_WISE) { - ret = 1; - break; - } - } + if (!dirty) + return 0; - return ret; -} + if (dict_get_ptr(xdata, AFR_DIRTY, &pending_raw)) + return -1; + if (!pending_raw) + return -1; -/* - * The 'wisdom' of a wise node is 0 if any other wise node accuses it. - * It is 1 if no other wise node accuses it. - * Only wise nodes with wisdom 1 are sources. - * - * If no nodes with wisdom 1 exist, a split-brain has occured. - */ + memcpy(pending, pending_raw, sizeof(pending)); -static void -afr_sh_compute_wisdom (int32_t *pending_matrix[], - afr_node_character characters[], int child_count) + dirty[subvol] = ntoh32(pending[idx]); + + return 0; +} + +int +afr_selfheal_fill_matrix(xlator_t *this, int **matrix, int subvol, int idx, + dict_t *xdata) { - int i = 0; - int j = 0; + int i = 0; + void *pending_raw = NULL; + int pending[3] = { + 0, + }; + afr_private_t *priv = NULL; - for (i = 0; i < child_count; i++) { - if (characters[i].type == AFR_NODE_WISE) { - characters[i].wisdom = 1; + priv = this->private; - for (j = 0; j < child_count; j++) { - if ((characters[j].type == AFR_NODE_WISE) - && pending_matrix[j][i]) { + if (!matrix) + return 0; - characters[i].wisdom = 0; - } - } - } - } -} + for (i = 0; i < priv->child_count; i++) { + if (dict_get_ptr(xdata, priv->pending_key[i], &pending_raw)) + continue; + if (!pending_raw) + continue; -static int -afr_sh_wise_nodes_conflict (afr_node_character *characters, - int child_count) + memcpy(pending, pending_raw, sizeof(pending)); + + matrix[subvol][i] = ntoh32(pending[idx]); + } + + return 0; +} + +int +afr_selfheal_extract_xattr(xlator_t *this, struct afr_reply *replies, + afr_transaction_type type, int *dirty, int **matrix) { - int i = 0; - int ret = 1; + afr_private_t *priv = NULL; + int i = 0; + dict_t *xdata = NULL; + int idx = -1; - for (i = 0; i < child_count; i++) { - if ((characters[i].type == AFR_NODE_WISE) - && characters[i].wisdom == 1) { + idx = afr_index_for_transaction_type(type); - /* There is atleast one bona-fide wise node */ - ret = 0; - break; - } - } + priv = this->private; - return ret; -} + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid || replies[i].op_ret != 0) + continue; + if (!replies[i].xdata) + continue; -static int -afr_sh_mark_wisest_as_sources (int sources[], - afr_node_character *characters, - int child_count) -{ - int nsources = 0; - int i = 0; + xdata = replies[i].xdata; - for (i = 0; i < child_count; i++) { - if (characters[i].wisdom == 1) { - sources[i] = 1; - nsources++; - } - } + afr_selfheal_fill_dirty(this, dirty, i, idx, xdata); + afr_selfheal_fill_matrix(this, matrix, i, idx, xdata); + } - return nsources; + return 0; } -static void -afr_compute_witness_of_fools (int32_t *witnesses, int32_t **pending_matrix, - afr_node_character *characters, - int32_t child_count) -{ - int i = 0; - int j = 0; - int witness = 0; - - GF_ASSERT (witnesses); - GF_ASSERT (pending_matrix); - GF_ASSERT (characters); - GF_ASSERT (child_count > 0); - - for (i = 0; i < child_count; i++) { - if (characters[i].type != AFR_NODE_FOOL) - continue; - - witness = 0; - for (j = 0; j < child_count; j++) { - if (i == j) - continue; - witness += pending_matrix[i][j]; - } - witnesses[i] = witness; - } +/* + * If by chance there are multiple sources with differing sizes, select + * the largest file as the source. + * + * This can happen if data was directly modified in the backend or for snapshots + */ +void +afr_mark_largest_file_as_source(xlator_t *this, unsigned char *sources, + struct afr_reply *replies) +{ + int i = 0; + afr_private_t *priv = NULL; + uint64_t size = 0; + + /* Find source with biggest file size */ + priv = this->private; + for (i = 0; i < priv->child_count; i++) { + if (!sources[i]) + continue; + if (!replies[i].valid || replies[i].op_ret != 0) { + sources[i] = 0; + continue; + } + if (size <= replies[i].poststat.ia_size) { + size = replies[i].poststat.ia_size; + } + } + + /* Mark sources with less size as not source */ + for (i = 0; i < priv->child_count; i++) { + if (!sources[i]) + continue; + if (size > replies[i].poststat.ia_size) + sources[i] = 0; + } } -static int32_t -afr_find_biggest_witness_among_fools (int32_t *witnesses, - afr_node_character *characters, - int32_t child_count) +void +afr_mark_latest_mtime_file_as_source(xlator_t *this, unsigned char *sources, + struct afr_reply *replies) { - int i = 0; - int biggest_witness = -1; + int i = 0; + afr_private_t *priv = NULL; + uint32_t mtime = 0; + uint32_t mtime_nsec = 0; + + priv = this->private; + for (i = 0; i < priv->child_count; i++) { + if (!sources[i]) + continue; + if (!replies[i].valid || replies[i].op_ret != 0) { + sources[i] = 0; + continue; + } + if ((mtime < replies[i].poststat.ia_mtime) || + ((mtime == replies[i].poststat.ia_mtime) && + (mtime_nsec < replies[i].poststat.ia_mtime_nsec))) { + mtime = replies[i].poststat.ia_mtime; + mtime_nsec = replies[i].poststat.ia_mtime_nsec; + } + } + for (i = 0; i < priv->child_count; i++) { + if (!sources[i]) + continue; + if ((mtime > replies[i].poststat.ia_mtime) || + ((mtime == replies[i].poststat.ia_mtime) && + (mtime_nsec > replies[i].poststat.ia_mtime_nsec))) { + sources[i] = 0; + } + } +} - GF_ASSERT (witnesses); - GF_ASSERT (characters); - GF_ASSERT (child_count > 0); +void +afr_mark_active_sinks(xlator_t *this, unsigned char *sources, + unsigned char *locked_on, unsigned char *sinks) +{ + int i = 0; + afr_private_t *priv = NULL; - for (i = 0; i < child_count; i++) { - if (characters[i].type != AFR_NODE_FOOL) - continue; + priv = this->private; - if (biggest_witness < witnesses[i]) - biggest_witness = witnesses[i]; - } - return biggest_witness; + for (i = 0; i < priv->child_count; i++) { + if (!sources[i] && locked_on[i]) + sinks[i] = 1; + else + sinks[i] = 0; + } } -int -afr_mark_fool_as_source_by_witness (int32_t *sources, int32_t *witnesses, - afr_node_character *characters, - int32_t child_count, int32_t witness) +gf_boolean_t +afr_dict_contains_heal_op(call_frame_t *frame) { - int i = 0; - int nsources = 0; + afr_local_t *local = NULL; + dict_t *xdata_req = NULL; + int ret = 0; + int heal_op = -1; + + local = frame->local; + xdata_req = local->xdata_req; + ret = dict_get_int32_sizen(xdata_req, "heal-op", &heal_op); + if (ret) + return _gf_false; + if (local->xdata_rsp == NULL) { + local->xdata_rsp = dict_new(); + if (!local->xdata_rsp) + return _gf_true; + } + ret = dict_set_sizen_str_sizen(local->xdata_rsp, "sh-fail-msg", + SFILE_NOT_IN_SPLIT_BRAIN); + + return _gf_true; +} - GF_ASSERT (sources); - GF_ASSERT (witnesses); - GF_ASSERT (characters); - GF_ASSERT (child_count > 0); +gf_boolean_t +afr_can_decide_split_brain_source_sinks(struct afr_reply *replies, + int child_count) +{ + int i = 0; - for (i = 0; i < child_count; i++) { - if (characters[i].type != AFR_NODE_FOOL) - continue; + for (i = 0; i < child_count; i++) + if (replies[i].valid != 1 || replies[i].op_ret != 0) + return _gf_false; - if (witness == witnesses[i]) { - sources[i] = 1; - nsources++; - } - } - return nsources; + return _gf_true; } -static int -afr_mark_biggest_of_fools_as_source (int32_t *sources, int32_t **pending_matrix, - afr_node_character *characters, - int child_count) +int +afr_mark_split_brain_source_sinks_by_heal_op( + call_frame_t *frame, xlator_t *this, unsigned char *sources, + unsigned char *sinks, unsigned char *healed_sinks, unsigned char *locked_on, + struct afr_reply *replies, afr_transaction_type type, int heal_op) { - int32_t biggest_witness = 0; - int nsources = 0; - int32_t *witnesses = NULL; - - GF_ASSERT (child_count > 0); - - witnesses = GF_CALLOC (child_count, sizeof (*witnesses), - gf_afr_mt_int32_t); - if (NULL == witnesses) { - nsources = -1; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + dict_t *xdata_req = NULL; + dict_t *xdata_rsp = NULL; + int ret = 0; + int i = 0; + char *name = NULL; + int source = -1; + + local = frame->local; + priv = this->private; + xdata_req = local->xdata_req; + + for (i = 0; i < priv->child_count; i++) { + if (locked_on[i]) + if (sources[i] || !sinks[i] || !healed_sinks[i]) { + ret = -1; goto out; - } - - afr_compute_witness_of_fools (witnesses, pending_matrix, characters, - child_count); - biggest_witness = afr_find_biggest_witness_among_fools (witnesses, - characters, - child_count); - nsources = afr_mark_fool_as_source_by_witness (sources, witnesses, - characters, child_count, - biggest_witness); + } + } + if (local->xdata_rsp == NULL) { + local->xdata_rsp = dict_new(); + if (!local->xdata_rsp) { + ret = -1; + goto out; + } + } + xdata_rsp = local->xdata_rsp; + + if (!afr_can_decide_split_brain_source_sinks(replies, priv->child_count)) { + ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg", + SBRAIN_HEAL_NO_GO_MSG); + ret = -1; + goto out; + } + + for (i = 0; i < priv->child_count; i++) + if (locked_on[i]) + sources[i] = 1; + switch (heal_op) { + case GF_SHD_OP_SBRAIN_HEAL_FROM_BIGGER_FILE: + if (type == AFR_METADATA_TRANSACTION) { + ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg", + SUSE_SOURCE_BRICK_TO_HEAL); + if (!ret) + ret = -1; + goto out; + } + afr_mark_largest_file_as_source(this, sources, replies); + if (AFR_COUNT(sources, priv->child_count) != 1) { + ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg", + SNO_BIGGER_FILE); + if (!ret) + ret = -1; + goto out; + } + break; + case GF_SHD_OP_SBRAIN_HEAL_FROM_LATEST_MTIME: + if (type == AFR_METADATA_TRANSACTION) { + ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg", + SUSE_SOURCE_BRICK_TO_HEAL); + if (!ret) + ret = -1; + goto out; + } + afr_mark_latest_mtime_file_as_source(this, sources, replies); + if (AFR_COUNT(sources, priv->child_count) != 1) { + ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg", + SNO_DIFF_IN_MTIME); + if (!ret) + ret = -1; + goto out; + } + break; + case GF_SHD_OP_SBRAIN_HEAL_FROM_BRICK: + ret = dict_get_str_sizen(xdata_req, "child-name", &name); + if (ret) + goto out; + source = afr_get_child_index_from_name(this, name); + if (source < 0) { + ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg", + SINVALID_BRICK_NAME); + if (!ret) + ret = -1; + goto out; + } + if (locked_on[source] != 1) { + ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg", + SBRICK_IS_NOT_UP); + if (!ret) + ret = -1; + goto out; + } + memset(sources, 0, sizeof(*sources) * priv->child_count); + sources[source] = 1; + break; + default: + ret = -1; + goto out; + } + for (i = 0; i < priv->child_count; i++) { + if (sources[i]) { + source = i; + break; + } + } + sinks[source] = 0; + healed_sinks[source] = 0; + ret = source; out: - if (witnesses) - GF_FREE (witnesses); - return nsources; + if (ret < 0) + memset(sources, 0, sizeof(*sources) * priv->child_count); + return ret; } int -afr_mark_child_as_source_by_uid (int32_t *sources, struct iatt *bufs, - int32_t *valid_children, int child_count, - uint32_t uid) -{ - int i = 0; - int nsources = 0; - int child = 0; - - GF_ASSERT (bufs); - GF_ASSERT (valid_children); - GF_ASSERT (sources); - GF_ASSERT (child_count > 0); - - for (i = 0; i < child_count; i++) { - if (-1 == valid_children[i]) - continue; - - child = valid_children[i]; - if (uid == bufs[child].ia_uid) { - sources[child] = 1; - nsources++; +afr_sh_fav_by_majority(xlator_t *this, struct afr_reply *replies, + inode_t *inode) +{ + afr_private_t *priv; + int vote_count = -1; + int fav_child = -1; + int i = 0; + int k = 0; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (replies[i].valid == 1) { + gf_msg_debug(this->name, 0, + "Child:%s mtime_sec = %" PRId64 ", size = %" PRIu64 + " for gfid %s", + priv->children[i]->name, replies[i].poststat.ia_mtime, + replies[i].poststat.ia_size, uuid_utoa(inode->gfid)); + vote_count = 0; + for (k = 0; k < priv->child_count; k++) { + if ((replies[k].poststat.ia_mtime == + replies[i].poststat.ia_mtime) && + (replies[k].poststat.ia_size == + replies[i].poststat.ia_size)) { + vote_count++; } + } + if (vote_count > priv->child_count / 2) { + fav_child = i; + break; + } } - return nsources; + } + return fav_child; } +/* + * afr_sh_fav_by_mtime: Choose favorite child by mtime. + */ int -afr_get_child_with_lowest_uid (struct iatt *bufs, int32_t *valid_children, - int child_count) -{ - int i = 0; - int smallest = -1; - int child = 0; - - GF_ASSERT (bufs); - GF_ASSERT (valid_children); - GF_ASSERT (child_count > 0); - - for (i = 0; i < child_count; i++) { - if (-1 == valid_children[i]) - continue; - child = valid_children[i]; - if ((smallest == -1) || - (bufs[child].ia_uid < bufs[smallest].ia_uid)) { - smallest = child; - } - } - return smallest; +afr_sh_fav_by_mtime(xlator_t *this, struct afr_reply *replies, inode_t *inode) +{ + afr_private_t *priv; + int fav_child = -1; + int i = 0; + uint32_t cmp_mtime = 0; + uint32_t cmp_mtime_nsec = 0; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (replies[i].valid == 1) { + gf_msg_debug(this->name, 0, + "Child:%s mtime = %" PRId64 + ", mtime_nsec = %d for " + "gfid %s", + priv->children[i]->name, replies[i].poststat.ia_mtime, + replies[i].poststat.ia_mtime_nsec, + uuid_utoa(inode->gfid)); + if (replies[i].poststat.ia_mtime > cmp_mtime) { + cmp_mtime = replies[i].poststat.ia_mtime; + cmp_mtime_nsec = replies[i].poststat.ia_mtime_nsec; + fav_child = i; + } else if ((replies[i].poststat.ia_mtime == cmp_mtime) && + (replies[i].poststat.ia_mtime_nsec > cmp_mtime_nsec)) { + cmp_mtime = replies[i].poststat.ia_mtime; + cmp_mtime_nsec = replies[i].poststat.ia_mtime_nsec; + fav_child = i; + } + } + } + return fav_child; } -static int -afr_sh_mark_lowest_uid_as_source (struct iatt *bufs, int32_t *valid_children, - int child_count, int32_t *sources) +/* + * afr_sh_fav_by_ctime: Choose favorite child by ctime. + */ +int +afr_sh_fav_by_ctime(xlator_t *this, struct afr_reply *replies, inode_t *inode) { - int nsources = 0; - int smallest = 0; + afr_private_t *priv; + int fav_child = -1; + int i = 0; + uint32_t cmp_ctime = 0; + uint32_t cmp_ctime_nsec = 0; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (replies[i].valid == 1) { + gf_msg_debug(this->name, 0, + "Child:%s ctime = %" PRId64 + ", ctime_nsec = %d for " + "gfid %s", + priv->children[i]->name, replies[i].poststat.ia_ctime, + replies[i].poststat.ia_ctime_nsec, + uuid_utoa(inode->gfid)); + if (replies[i].poststat.ia_ctime > cmp_ctime) { + cmp_ctime = replies[i].poststat.ia_ctime; + cmp_ctime_nsec = replies[i].poststat.ia_ctime_nsec; + fav_child = i; + } else if ((replies[i].poststat.ia_ctime == cmp_ctime) && + (replies[i].poststat.ia_ctime_nsec > cmp_ctime_nsec)) { + cmp_ctime = replies[i].poststat.ia_ctime; + cmp_ctime_nsec = replies[i].poststat.ia_ctime_nsec; + fav_child = i; + } + } + } + return fav_child; +} - smallest = afr_get_child_with_lowest_uid (bufs, valid_children, - child_count); - if (smallest < 0) { - nsources = -1; - goto out; - } - nsources = afr_mark_child_as_source_by_uid (sources, bufs, - valid_children, child_count, - bufs[smallest].ia_uid); -out: - return nsources; +/* + * afr_sh_fav_by_size: Choose favorite child by size + * when not all files are of zero size. + */ +int +afr_sh_fav_by_size(xlator_t *this, struct afr_reply *replies, inode_t *inode) +{ + afr_private_t *priv; + int fav_child = -1; + int i = 0; + uint64_t cmp_sz = 0; + + priv = this->private; + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) { + continue; + } + gf_msg_debug(this->name, 0, + "Child:%s file size = %" PRIu64 " for gfid %s", + priv->children[i]->name, replies[i].poststat.ia_size, + uuid_utoa(inode->gfid)); + if (replies[i].poststat.ia_type == IA_IFDIR) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SBRAIN_FAV_CHILD_POLICY, + "Cannot perform selfheal on %s. " + "Size policy is not applicable to directories.", + uuid_utoa(inode->gfid)); + break; + } + if (replies[i].poststat.ia_size > cmp_sz) { + cmp_sz = replies[i].poststat.ia_size; + fav_child = i; + } else if (replies[i].poststat.ia_size == cmp_sz) { + fav_child = -1; + } + } + if (fav_child == -1) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN, + "No bigger file"); + } + return fav_child; } -char * -afr_get_character_str (afr_node_type type) +int +afr_sh_get_fav_by_policy(xlator_t *this, struct afr_reply *replies, + inode_t *inode, char **policy_str) { - char *character = NULL; + afr_private_t *priv = NULL; + int fav_child = -1; - switch (type) { - case AFR_NODE_INNOCENT: - character = "innocent"; - break; - case AFR_NODE_FOOL: - character = "fool"; - break; - case AFR_NODE_WISE: - character = "wise"; - break; + priv = this->private; + if (!afr_can_decide_split_brain_source_sinks(replies, priv->child_count)) { + return -1; + } + + switch (priv->fav_child_policy) { + case AFR_FAV_CHILD_BY_SIZE: + fav_child = afr_sh_fav_by_size(this, replies, inode); + if (policy_str && fav_child >= 0) { + *policy_str = "SIZE"; + } + break; + case AFR_FAV_CHILD_BY_CTIME: + fav_child = afr_sh_fav_by_ctime(this, replies, inode); + if (policy_str && fav_child >= 0) { + *policy_str = "CTIME"; + } + break; + case AFR_FAV_CHILD_BY_MTIME: + fav_child = afr_sh_fav_by_mtime(this, replies, inode); + if (policy_str && fav_child >= 0) { + *policy_str = "MTIME"; + } + break; + case AFR_FAV_CHILD_BY_MAJORITY: + fav_child = afr_sh_fav_by_majority(this, replies, inode); + if (policy_str && fav_child >= 0) { + *policy_str = "MAJORITY"; + } + break; + case AFR_FAV_CHILD_NONE: default: - character = "invalid"; - break; - } - return character; + break; + } + + return fav_child; } -afr_node_type -afr_find_child_character_type (int32_t *pending_row, int32_t child, - int32_t child_count, const char *xlator_name) +int +afr_mark_split_brain_source_sinks_by_policy( + call_frame_t *frame, xlator_t *this, inode_t *inode, unsigned char *sources, + unsigned char *sinks, unsigned char *healed_sinks, unsigned char *locked_on, + struct afr_reply *replies, afr_transaction_type type) { - afr_node_type type = AFR_NODE_INVALID; + afr_private_t *priv = NULL; + int fav_child = -1; + char mtime_str[256]; + char ctime_str[256]; + char *policy_str = NULL; + struct tm *tm_ptr; + time_t time; + + priv = this->private; + + fav_child = afr_sh_get_fav_by_policy(this, replies, inode, &policy_str); + if (fav_child == -1) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SBRAIN_FAV_CHILD_POLICY, + "No child selected by favorite-child policy."); + } else if (fav_child > priv->child_count - 1) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SBRAIN_FAV_CHILD_POLICY, + "Invalid child (%d) " + "selected by policy %s.", + fav_child, policy_str); + } else if (fav_child >= 0) { + time = replies[fav_child].poststat.ia_mtime; + tm_ptr = localtime(&time); + strftime(mtime_str, sizeof(mtime_str), "%Y-%m-%d %H:%M:%S", tm_ptr); + time = replies[fav_child].poststat.ia_ctime; + tm_ptr = localtime(&time); + strftime(ctime_str, sizeof(ctime_str), "%Y-%m-%d %H:%M:%S", tm_ptr); + + gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_SBRAIN_FAV_CHILD_POLICY, + "Source %s selected as authentic to resolve conflicting data " + "in file (gfid:%s) by %s (%" PRIu64 + " bytes @ %s mtime, %s " + "ctime).", + priv->children[fav_child]->name, uuid_utoa(inode->gfid), + policy_str, replies[fav_child].poststat.ia_size, mtime_str, + ctime_str); + + sources[fav_child] = 1; + sinks[fav_child] = 0; + healed_sinks[fav_child] = 0; + } + return fav_child; +} - GF_ASSERT (pending_row); - GF_ASSERT (child_count > 0); - GF_ASSERT ((child >= 0) && (child < child_count)); +gf_boolean_t +afr_is_file_empty_on_all_children(afr_private_t *priv, + struct afr_reply *replies) +{ + int i = 0; - if (afr_sh_is_innocent (pending_row, child_count)) - type = AFR_NODE_INNOCENT; - else if (afr_sh_is_fool (pending_row, child, child_count)) - type = AFR_NODE_FOOL; - else if (afr_sh_is_wise (pending_row, child, child_count)) - type = AFR_NODE_WISE; - else - GF_ASSERT (0); + for (i = 0; i < priv->child_count; i++) { + if ((!replies[i].valid) || (replies[i].op_ret != 0) || + (replies[i].poststat.ia_size != 0)) + return _gf_false; + } - gf_log (xlator_name, GF_LOG_DEBUG, "child %d character %s", - child, afr_get_character_str (type)); - return type; + return _gf_true; } int -afr_mark_sources (int32_t *sources, int32_t **pending_matrix, struct iatt *bufs, - int32_t child_count, afr_self_heal_type type, - int32_t *valid_children, const char *xlator_name) +afr_mark_source_sinks_if_file_empty(xlator_t *this, unsigned char *sources, + unsigned char *sinks, + unsigned char *healed_sinks, + unsigned char *locked_on, + struct afr_reply *replies, + afr_transaction_type type) { - /* stores the 'characters' (innocent, fool, wise) of the nodes */ + int source = -1; + int i = 0; + afr_private_t *priv = this->private; + struct iatt stbuf = { + 0, + }; + + if ((AFR_COUNT(locked_on, priv->child_count) < priv->child_count) || + (afr_success_count(replies, priv->child_count) < priv->child_count)) + return -1; - afr_node_character *characters = NULL; - int i = 0; - int nsources = -1; - xlator_t *this = NULL; + if (type == AFR_DATA_TRANSACTION) { + if (!afr_is_file_empty_on_all_children(priv, replies)) + return -1; + goto mark; + } + + /*For AFR_METADATA_TRANSACTION, metadata must be same on all bricks.*/ + stbuf = replies[0].poststat; + for (i = 1; i < priv->child_count; i++) { + if ((!IA_EQUAL(stbuf, replies[i].poststat, type)) || + (!IA_EQUAL(stbuf, replies[i].poststat, uid)) || + (!IA_EQUAL(stbuf, replies[i].poststat, gid)) || + (!IA_EQUAL(stbuf, replies[i].poststat, prot))) + return -1; + } + for (i = 1; i < priv->child_count; i++) { + if (!afr_xattrs_are_equal(replies[0].xdata, replies[i].xdata)) + return -1; + } + +mark: + /* data/metadata is same on all bricks. Pick one of them as source. Rest + * are sinks.*/ + for (i = 0; i < priv->child_count; i++) { + if (source == -1) { + source = i; + sources[i] = 1; + sinks[i] = 0; + healed_sinks[i] = 0; + continue; + } + sources[i] = 0; + sinks[i] = 1; + healed_sinks[i] = 1; + } + + return source; +} - characters = GF_CALLOC (sizeof (afr_node_character), - child_count, gf_afr_mt_afr_node_character); - if (!characters) - goto out; +/* Return a source depending on the type of heal_op, and set sources[source], + * sinks[source] and healed_sinks[source] to 1, 0 and 0 respectively. Do so + * only if the following condition is met: + * ∀i((i ∈ locked_on[] ∧ i=1)==>(sources[i]=0 ∧ sinks[i]=1 ∧ healed_sinks[i]=1)) + * i.e. for each locked node, sources[node] is 0; healed_sinks[node] and + * sinks[node] are 1. This should be the case if the file is in split-brain. + */ +int +afr_mark_split_brain_source_sinks( + call_frame_t *frame, xlator_t *this, inode_t *inode, unsigned char *sources, + unsigned char *sinks, unsigned char *healed_sinks, unsigned char *locked_on, + struct afr_reply *replies, afr_transaction_type type) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + dict_t *xdata_req = NULL; + int heal_op = -1; + int ret = -1; + int source = -1; + + local = frame->local; + priv = this->private; + xdata_req = local->xdata_req; + + source = afr_mark_source_sinks_if_file_empty( + this, sources, sinks, healed_sinks, locked_on, replies, type); + if (source >= 0) + return source; + + ret = dict_get_int32_sizen(xdata_req, "heal-op", &heal_op); + if (ret) + goto autoheal; + + source = afr_mark_split_brain_source_sinks_by_heal_op( + frame, this, sources, sinks, healed_sinks, locked_on, replies, type, + heal_op); + return source; + +autoheal: + /* Automatically heal if fav_child_policy is set. */ + if (priv->fav_child_policy != AFR_FAV_CHILD_NONE) { + source = afr_mark_split_brain_source_sinks_by_policy( + frame, this, inode, sources, sinks, healed_sinks, locked_on, + replies, type); + if (source != -1) { + ret = dict_set_int32_sizen(xdata_req, "fav-child-policy", 1); + if (ret) + return -1; + } + } + + return source; +} - this = THIS; +int +_afr_fav_child_reset_sink_xattrs(call_frame_t *frame, xlator_t *this, + inode_t *inode, int source, + unsigned char *healed_sinks, + unsigned char *undid_pending, + afr_transaction_type type, + unsigned char *locked_on, + struct afr_reply *replies) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int *input_dirty = NULL; + int **input_matrix = NULL; + int *output_dirty = NULL; + int **output_matrix = NULL; + dict_t *xattr = NULL; + dict_t *xdata = NULL; + int i = 0; + + priv = this->private; + local = frame->local; + + if (!dict_get_sizen(local->xdata_req, "fav-child-policy")) + return 0; - /* start clean */ - for (i = 0; i < child_count; i++) { - sources[i] = 0; - } + xdata = dict_new(); + if (!xdata) + return -1; - nsources = 0; - for (i = 0; i < child_count; i++) { - characters[i].type = - afr_find_child_character_type (pending_matrix[i], i, - child_count, - xlator_name); - if (AFR_NODE_INVALID == characters[i].type) - gf_log (xlator_name, GF_LOG_WARNING, - "child %d had invalid xattrs", i); - } + input_dirty = alloca0(priv->child_count * sizeof(int)); + input_matrix = ALLOC_MATRIX(priv->child_count, int); + output_dirty = alloca0(priv->child_count * sizeof(int)); + output_matrix = ALLOC_MATRIX(priv->child_count, int); - if ((type == AFR_SELF_HEAL_METADATA) - && afr_sh_all_nodes_innocent (characters, child_count)) { + afr_selfheal_extract_xattr(this, replies, type, input_dirty, input_matrix); - nsources = afr_sh_mark_lowest_uid_as_source (bufs, - valid_children, - child_count, - sources); - goto out; - } + for (i = 0; i < priv->child_count; i++) { + if (i == source || !healed_sinks[i]) + continue; + output_dirty[i] = -input_dirty[i]; + output_matrix[i][source] = -input_matrix[i][source]; + } - if (afr_sh_wise_nodes_exist (characters, child_count)) { - afr_sh_compute_wisdom (pending_matrix, characters, child_count); + for (i = 0; i < priv->child_count; i++) { + if (!healed_sinks[i] || !locked_on[i]) + continue; + xattr = afr_selfheal_output_xattr(this, _gf_false, type, output_dirty, + output_matrix, i, NULL); - if (afr_sh_wise_nodes_conflict (characters, child_count)) { - /* split-brain */ - gf_log (this->name, GF_LOG_INFO, - "split-brain possible, no source detected"); - nsources = -1; + afr_selfheal_post_op(frame, this, inode, i, xattr, xdata); - } else { - nsources = afr_sh_mark_wisest_as_sources (sources, - characters, - child_count); - } - } else { - nsources = afr_mark_biggest_of_fools_as_source (sources, - pending_matrix, - characters, - child_count); - } + undid_pending[i] = 1; + dict_unref(xattr); + } -out: - if (nsources == 0) { - for (i = 0; i < child_count; i++) { - if (valid_children[i] != -1) - sources[valid_children[i]] = 1; - } - } - if (characters) - GF_FREE (characters); + if (xdata) + dict_unref(xdata); - gf_log (this->name, GF_LOG_DEBUG, "Number of sources: %d", nsources); - return nsources; + return 0; } -void -afr_sh_pending_to_delta (afr_private_t *priv, dict_t **xattr, - int32_t *delta_matrix[], int success[], - int child_count, afr_transaction_type type) -{ - /* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. */ - int32_t pending[3] = {0,}; - void *pending_raw = NULL; - int ret = 0; - int i = 0; - int j = 0; - int k = 0; - - /* start clean */ - for (i = 0; i < child_count; i++) { - for (j = 0; j < child_count; j++) { - delta_matrix[i][j] = 0; - } - } - - for (i = 0; i < child_count; i++) { - if (pending_raw) - pending_raw = NULL; - - for (j = 0; j < child_count; j++) { - ret = dict_get_ptr (xattr[i], priv->pending_key[j], - &pending_raw); - if (ret < 0) - gf_log (THIS->name, GF_LOG_DEBUG, - "Unable to get dict value."); - if (!success[j]) - continue; +gf_boolean_t +afr_does_witness_exist(xlator_t *this, uint64_t *witness) +{ + int i = 0; + afr_private_t *priv = NULL; - k = afr_index_for_transaction_type (type); + priv = this->private; - if (pending_raw != NULL) { - memcpy (pending, pending_raw, sizeof(pending)); - delta_matrix[i][j] = -(ntoh32 (pending[k])); - } else { - delta_matrix[i][j] = 0; - } + for (i = 0; i < priv->child_count; i++) { + if (witness[i]) + return _gf_true; + } + return _gf_false; +} - } - } +unsigned int +afr_get_quorum_count(afr_private_t *priv) +{ + if (priv->quorum_count == AFR_QUORUM_AUTO) { + return priv->child_count / 2 + 1; + } else { + return priv->quorum_count; + } } +void +afr_selfheal_post_op_failure_accounting(afr_private_t *priv, char *accused, + unsigned char *sources, + unsigned char *locked_on) +{ + int i = 0; + unsigned int quorum_count = 0; -int -afr_sh_delta_to_xattr (afr_private_t *priv, - int32_t *delta_matrix[], dict_t *xattr[], - int child_count, afr_transaction_type type) -{ - int i = 0; - int j = 0; - int k = 0; - int ret = 0; - int32_t *pending = NULL; - - for (i = 0; i < child_count; i++) { - if (!xattr[i]) - continue; - - for (j = 0; j < child_count; j++) { - pending = GF_CALLOC (sizeof (int32_t), 3, - gf_afr_mt_int32_t); - - if (!pending) - continue; - /* 3 = data+metadata+entry */ - - k = afr_index_for_transaction_type (type); - - pending[k] = hton32 (delta_matrix[i][j]); - - ret = dict_set_bin (xattr[i], priv->pending_key[j], - pending, - 3 * sizeof (int32_t)); - if (ret < 0) - gf_log (THIS->name, GF_LOG_WARNING, - "Unable to set dict value."); - } + if (AFR_COUNT(sources, priv->child_count) != 0) + return; + + quorum_count = afr_get_quorum_count(priv); + for (i = 0; i < priv->child_count; i++) { + if ((accused[i] < quorum_count) && locked_on[i]) { + sources[i] = 1; } - return 0; + } + return; } +/* + * This function determines if a self-heal is required for a given inode, + * and if needed, in what direction. + * + * locked_on[] is the array representing servers which have been locked and + * from which xattrs have been fetched for analysis. + * + * The output of the function is by filling the arrays sources[] and sinks[]. + * + * sources[i] is set if i'th server is an eligible source for a selfheal. + * + * sinks[i] is set if i'th server needs to be healed. + * + * if sources[0..N] are all set, there is no need for a selfheal. + * + * if sinks[0..N] are all set, the inode is in split brain. + * + */ int -afr_sh_has_metadata_pending (dict_t *xattr, xlator_t *this) +afr_selfheal_find_direction(call_frame_t *frame, xlator_t *this, + struct afr_reply *replies, + afr_transaction_type type, unsigned char *locked_on, + unsigned char *sources, unsigned char *sinks, + uint64_t *witness, unsigned char *pflag) { - /* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. */ - int32_t pending[3] = {0,}; - void *pending_raw = NULL; - afr_private_t *priv = NULL; - int ret = -1; - int i = 0; - int j = 0; - - priv = this->private; - + afr_private_t *priv = NULL; + int i = 0; + int j = 0; + int *dirty = NULL; /* Denotes if dirty xattr is set */ + int **matrix = NULL; /* Changelog matrix */ + char *accused = NULL; /* Accused others without any self-accusal */ + char *pending = NULL; /* Have pending operations on others */ + char *self_accused = NULL; /* Accused itself */ + + priv = this->private; + + dirty = alloca0(priv->child_count * sizeof(int)); + accused = alloca0(priv->child_count); + pending = alloca0(priv->child_count); + self_accused = alloca0(priv->child_count); + matrix = ALLOC_MATRIX(priv->child_count, int); + memset(witness, 0, sizeof(*witness) * priv->child_count); + + /* First construct the pending matrix for further analysis */ + afr_selfheal_extract_xattr(this, replies, type, dirty, matrix); + + if (pflag) { for (i = 0; i < priv->child_count; i++) { - ret = dict_get_ptr (xattr, priv->pending_key[i], - &pending_raw); - - if (ret != 0) - return 0; - - memcpy (pending, pending_raw, sizeof(pending)); - j = afr_index_for_transaction_type (AFR_METADATA_TRANSACTION); - - if (pending[j]) - return 1; + for (j = 0; j < priv->child_count; j++) + if (matrix[i][j]) + *pflag |= PFLAG_PENDING; + if (*pflag) + break; + } + } + + if (afr_success_count(replies, priv->child_count) < priv->child_count) { + /* Treat this just like locks not being acquired */ + return -ENOTCONN; + } + + /* short list all self-accused */ + for (i = 0; i < priv->child_count; i++) { + if (matrix[i][i]) + self_accused[i] = 1; + } + + /* Next short list all accused to exclude them from being sources */ + /* Self-accused can't accuse others as they are FOOLs */ + for (i = 0; i < priv->child_count; i++) { + for (j = 0; j < priv->child_count; j++) { + if (matrix[i][j]) { + if (!self_accused[i]) + accused[j] += 1; + if (i != j) + pending[i] += 1; + } + } + } + + /* Short list all non-accused as sources */ + for (i = 0; i < priv->child_count; i++) { + if (!accused[i] && locked_on[i]) + sources[i] = 1; + else + sources[i] = 0; + } + + /* Everyone accused by non-self-accused sources are sinks */ + memset(sinks, 0, priv->child_count); + for (i = 0; i < priv->child_count; i++) { + if (!sources[i]) + continue; + if (self_accused[i]) + continue; + for (j = 0; j < priv->child_count; j++) { + if (matrix[i][j]) + sinks[j] = 1; + } + } + + /* For breaking ties provide with number of fops they witnessed */ + + /* + * count the pending fops witnessed from itself to others when it is + * self-accused + */ + for (i = 0; i < priv->child_count; i++) { + if (!self_accused[i]) + continue; + for (j = 0; j < priv->child_count; j++) { + if (i == j) + continue; + witness[i] += matrix[i][j]; + } + } + + if (type == AFR_DATA_TRANSACTION || type == AFR_METADATA_TRANSACTION) + afr_selfheal_post_op_failure_accounting(priv, accused, sources, + locked_on); + + /* If no sources, all locked nodes are sinks - split brain */ + if (AFR_COUNT(sources, priv->child_count) == 0) { + for (i = 0; i < priv->child_count; i++) { + if (locked_on[i]) + sinks[i] = 1; + } + if (pflag) + *pflag |= PFLAG_SBRAIN; + } + + /* One more class of witness similar to dirty in v2 is where no pending + * exists but we have self-accusing markers. This can happen in afr-v1 + * if the brick crashes just after doing xattrop on self but + * before xattrop on the other xattrs on the brick in pre-op. */ + if (AFR_COUNT(pending, priv->child_count) == 0) { + for (i = 0; i < priv->child_count; i++) { + if (self_accused[i]) + witness[i] += matrix[i][i]; + } + } else { + /* In afr-v1 if a file is self-accused and has pending + * operations on others then it is similar to 'dirty' in afr-v2. + * Consider such cases as witness. + */ + for (i = 0; i < priv->child_count; i++) { + if (self_accused[i] && pending[i]) + witness[i] += matrix[i][i]; } + } - return 0; + /* count the number of dirty fops witnessed */ + for (i = 0; i < priv->child_count; i++) + witness[i] += dirty[i]; + + return 0; } +void +afr_log_selfheal(uuid_t gfid, xlator_t *this, int ret, char *type, int source, + unsigned char *sources, unsigned char *healed_sinks) +{ + char *status = NULL; + char *sinks_str = NULL; + char *p = NULL; + char *sources_str = NULL; + char *q = NULL; + afr_private_t *priv = NULL; + gf_loglevel_t loglevel = GF_LOG_NONE; + int i = 0; + + priv = this->private; + sinks_str = alloca0(priv->child_count * 8); + p = sinks_str; + sources_str = alloca0(priv->child_count * 8); + q = sources_str; + for (i = 0; i < priv->child_count; i++) { + if (healed_sinks[i]) + p += sprintf(p, "%d ", i); + if (sources[i]) { + if (source == i) { + q += sprintf(q, "[%d] ", i); + } else { + q += sprintf(q, "%d ", i); + } + } + } + + if (ret < 0) { + status = "Failed"; + loglevel = GF_LOG_DEBUG; + } else { + status = "Completed"; + loglevel = GF_LOG_INFO; + } + + gf_msg(this->name, loglevel, 0, AFR_MSG_SELF_HEAL_INFO, + "%s %s selfheal on %s. " + "sources=%s sinks=%s", + status, type, uuid_utoa(gfid), sources_str, sinks_str); +} int -afr_sh_has_data_pending (dict_t *xattr, xlator_t *this) +afr_selfheal_discover_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, + struct iatt *buf, dict_t *xdata, struct iatt *parbuf) { - /* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. */ - int32_t pending[3] = {0,}; - void *pending_raw = NULL; - afr_private_t *priv = NULL; - int ret = -1; - int i = 0; - int j = 0; - - priv = this->private; + afr_local_t *local = NULL; + int i = -1; + GF_UNUSED int ret = -1; + int8_t need_heal = 1; + + local = frame->local; + i = (long)cookie; + + local->replies[i].valid = 1; + local->replies[i].op_ret = op_ret; + local->replies[i].op_errno = op_errno; + if (buf) + local->replies[i].poststat = *buf; + if (parbuf) + local->replies[i].postparent = *parbuf; + if (xdata) { + local->replies[i].xdata = dict_ref(xdata); + ret = dict_get_int8(xdata, "link-count", &need_heal); + } + + local->replies[i].need_heal = need_heal; + syncbarrier_wake(&local->barrier); + + return 0; +} - for (i = 0; i < priv->child_count; i++) { - ret = dict_get_ptr (xattr, priv->pending_key[i], - &pending_raw); +inode_t * +afr_selfheal_unlocked_lookup_on(call_frame_t *frame, inode_t *parent, + const char *name, struct afr_reply *replies, + unsigned char *lookup_on, dict_t *xattr) +{ + loc_t loc = { + 0, + }; + dict_t *xattr_req = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + inode_t *inode = NULL; - if (ret != 0) - return 0; + local = frame->local; + priv = frame->this->private; - memcpy (pending, pending_raw, sizeof(pending)); - j = afr_index_for_transaction_type (AFR_DATA_TRANSACTION); + xattr_req = dict_new(); + if (!xattr_req) + return NULL; - if (pending[j]) - return 1; - } + if (xattr) + dict_copy(xattr, xattr_req); - return 0; -} + if (afr_xattr_req_prepare(frame->this, xattr_req) != 0) { + dict_unref(xattr_req); + return NULL; + } + inode = inode_new(parent->table); + if (!inode) { + dict_unref(xattr_req); + return NULL; + } -int -afr_sh_has_entry_pending (dict_t *xattr, xlator_t *this) -{ - /* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. */ - int32_t pending[3] = {0,}; - void *pending_raw = NULL; - afr_private_t *priv = NULL; - int ret = -1; - int i = 0; - int j = 0; + loc.parent = inode_ref(parent); + gf_uuid_copy(loc.pargfid, parent->gfid); + loc.name = name; + loc.inode = inode_ref(inode); - priv = this->private; + AFR_ONLIST(lookup_on, frame, afr_selfheal_discover_cbk, lookup, &loc, + xattr_req); - for (i = 0; i < priv->child_count; i++) { - ret = dict_get_ptr (xattr, priv->pending_key[i], - &pending_raw); + afr_replies_copy(replies, local->replies, priv->child_count); - if (ret != 0) - return 0; + loc_wipe(&loc); + dict_unref(xattr_req); - memcpy (pending, pending_raw, sizeof(pending)); - j = afr_index_for_transaction_type (AFR_ENTRY_TRANSACTION); - - if (pending[j]) - return 1; - } - - return 0; + return inode; } +static int +afr_set_multi_dom_lock_count_request(xlator_t *this, dict_t *dict) +{ + int ret = 0; + afr_private_t *priv = NULL; + char *key1 = NULL; + char *key2 = NULL; + + priv = this->private; + key1 = alloca0(strlen(GLUSTERFS_INODELK_DOM_PREFIX) + 2 + + strlen(this->name)); + key2 = alloca0(strlen(GLUSTERFS_INODELK_DOM_PREFIX) + 2 + + strlen(priv->sh_domain)); + + ret = dict_set_uint32(dict, GLUSTERFS_MULTIPLE_DOM_LK_CNT_REQUESTS, 1); + if (ret) + return ret; -/** - * is_matrix_zero - return true if pending matrix is all zeroes - */ + sprintf(key1, "%s:%s", GLUSTERFS_INODELK_DOM_PREFIX, this->name); + ret = dict_set_uint32(dict, key1, 1); + if (ret) + return ret; -int -afr_sh_is_matrix_zero (int32_t *pending_matrix[], int child_count) -{ - int i = 0; - int j = 0; + sprintf(key2, "%s:%s", GLUSTERFS_INODELK_DOM_PREFIX, priv->sh_domain); + ret = dict_set_uint32(dict, key2, 1); + if (ret) + return ret; - for (i = 0; i < child_count; i++) - for (j = 0; j < child_count; j++) - if (pending_matrix[i][j]) - return 0; - return 1; + return 0; } - int -afr_sh_missing_entries_done (call_frame_t *frame, xlator_t *this) +afr_selfheal_unlocked_discover_on(call_frame_t *frame, inode_t *inode, + uuid_t gfid, struct afr_reply *replies, + unsigned char *discover_on, dict_t *dict) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int i = 0; + loc_t loc = { + 0, + }; + dict_t *xattr_req = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + priv = frame->this->private; + + xattr_req = dict_new(); + if (!xattr_req) + return -ENOMEM; + if (dict) + dict_copy(dict, xattr_req); + + if (afr_xattr_req_prepare(frame->this, xattr_req) != 0) { + dict_unref(xattr_req); + return -ENOMEM; + } + + if (afr_set_multi_dom_lock_count_request(frame->this, xattr_req)) { + dict_unref(xattr_req); + return -1; + } - local = frame->local; - sh = &local->self_heal; - priv = this->private; + loc.inode = inode_ref(inode); + gf_uuid_copy(loc.gfid, gfid); -// memset (sh->child_errno, 0, sizeof (int) * priv->child_count); - memset (sh->buf, 0, sizeof (struct iatt) * priv->child_count); + AFR_ONLIST(discover_on, frame, afr_selfheal_discover_cbk, lookup, &loc, + xattr_req); - for (i = 0; i < priv->child_count; i++) { - sh->locked_nodes[i] = 0; - } + afr_replies_copy(replies, local->replies, priv->child_count); - for (i = 0; i < priv->child_count; i++) { - if (sh->xattr[i]) - dict_unref (sh->xattr[i]); - sh->xattr[i] = NULL; - } + loc_wipe(&loc); + dict_unref(xattr_req); - if (local->govinda_gOvinda) { - gf_log (this->name, GF_LOG_INFO, - "split brain found, aborting selfheal of %s", - local->loc.path); - sh->op_failed = 1; - sh->completion_cbk (frame, this); - } else { - gf_log (this->name, GF_LOG_TRACE, - "proceeding to metadata check on %s", - local->loc.path); - afr_self_heal_metadata (frame, this); - } - - return 0; + return 0; } - -static int -sh_missing_entries_finish (call_frame_t *frame, xlator_t *this) +int +afr_selfheal_unlocked_discover(call_frame_t *frame, inode_t *inode, uuid_t gfid, + struct afr_reply *replies) { - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; + afr_local_t *local = NULL; + dict_t *dict = NULL; - local = frame->local; - int_lock = &local->internal_lock; + local = frame->local; - int_lock->lock_cbk = afr_sh_missing_entries_done; - afr_unlock (frame, this); + if (local->xattr_req) + dict = local->xattr_req; - return 0; + return afr_selfheal_unlocked_discover_on(frame, inode, gfid, replies, + local->child_up, dict); } - -static int -sh_destroy_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int op_errno, - struct iatt *preop, struct iatt *postop) +unsigned int +afr_success_count(struct afr_reply *replies, unsigned int count) { - afr_local_t *local = NULL; - loc_t *parent_loc = cookie; - int call_count = 0; + int i = 0; + unsigned int success = 0; - local = frame->local; + for (i = 0; i < count; i++) + if (replies[i].valid && replies[i].op_ret == 0) + success++; + return success; +} - if (op_ret == -1) { - gf_log (this->name, GF_LOG_INFO, - "setattr on %s failed: %s", - local->loc.path, strerror (op_errno)); - } +int +afr_selfheal_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) +{ + afr_local_t *local = NULL; + int i = 0; - if (parent_loc) { - loc_wipe (parent_loc); - GF_FREE (parent_loc); - } + local = frame->local; + i = (long)cookie; - call_count = afr_frame_return (frame); + local->replies[i].valid = 1; + local->replies[i].op_ret = op_ret; + local->replies[i].op_errno = op_errno; - if (call_count == 0) { - STACK_DESTROY (frame->root); - } + syncbarrier_wake(&local->barrier); - return 0; + return 0; } - -static int -sh_missing_entries_newentry_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *buf, - struct iatt *preparent, - struct iatt *postparent) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - call_frame_t *setattr_frame = NULL; - int call_count = 0; - int child_index = 0; - loc_t *parent_loc = NULL; - struct iatt stbuf = {0,}; - int32_t valid = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - child_index = (long) cookie; - - stbuf.ia_atime = sh->buf[sh->source].ia_atime; - stbuf.ia_atime_nsec = sh->buf[sh->source].ia_atime_nsec; - stbuf.ia_mtime = sh->buf[sh->source].ia_mtime; - stbuf.ia_mtime_nsec = sh->buf[sh->source].ia_mtime_nsec; - - stbuf.ia_uid = sh->buf[sh->source].ia_uid; - stbuf.ia_gid = sh->buf[sh->source].ia_gid; - - valid = GF_SET_ATTR_UID | GF_SET_ATTR_GID | - GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_INFO, - "%s: failed to mknod on %s (%s)", - local->loc.path, priv->children[child_index]->name, - strerror (op_errno)); +int +afr_locked_fill(call_frame_t *frame, xlator_t *this, unsigned char *locked_on) +{ + int i = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int count = 0; + + local = frame->local; + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (local->replies[i].valid && local->replies[i].op_ret == 0) { + locked_on[i] = 1; + count++; + } else { + locked_on[i] = 0; } + } - if (op_ret == 0) { - setattr_frame = copy_frame (frame); + return count; +} - setattr_frame->local = GF_CALLOC (1, sizeof (afr_local_t), - gf_afr_mt_afr_local_t); +int +afr_selfheal_tryinodelk(call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, off_t off, size_t size, + unsigned char *locked_on) +{ + loc_t loc = { + 0, + }; + struct gf_flock flock = { + 0, + }; - ((afr_local_t *)setattr_frame->local)->call_count = 2; + loc.inode = inode_ref(inode); + gf_uuid_copy(loc.gfid, inode->gfid); - gf_log (this->name, GF_LOG_TRACE, - "setattr (%s) on subvolume %s", - local->loc.path, priv->children[child_index]->name); + flock.l_type = F_WRLCK; + flock.l_start = off; + flock.l_len = size; - STACK_WIND_COOKIE (setattr_frame, sh_destroy_cbk, - (void *) (long) 0, - priv->children[child_index], - priv->children[child_index]->fops->setattr, - &local->loc, &stbuf, valid); + AFR_ONALL(frame, afr_selfheal_lock_cbk, inodelk, dom, &loc, F_SETLK, &flock, + NULL); - valid = GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME; - parent_loc = GF_CALLOC (1, sizeof (*parent_loc), - gf_afr_mt_loc_t); - afr_build_parent_loc (parent_loc, &local->loc); + loc_wipe(&loc); - STACK_WIND_COOKIE (setattr_frame, sh_destroy_cbk, - (void *) (long) parent_loc, - priv->children[child_index], - priv->children[child_index]->fops->setattr, - parent_loc, &sh->parentbuf, valid); - } + return afr_locked_fill(frame, this, locked_on); +} - call_count = afr_frame_return (frame); +int +afr_selfheal_inodelk(call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, off_t off, size_t size, + unsigned char *locked_on) +{ + loc_t loc = { + 0, + }; + struct gf_flock flock = { + 0, + }; + afr_local_t *local = NULL; + int i = 0; + afr_private_t *priv = NULL; - if (call_count == 0) { - sh_missing_entries_finish (frame, this); - } + priv = this->private; + local = frame->local; - return 0; -} + loc.inode = inode_ref(inode); + gf_uuid_copy(loc.gfid, inode->gfid); + flock.l_type = F_WRLCK; + flock.l_start = off; + flock.l_len = size; -static int -sh_missing_entries_mknod (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int i = 0; - int ret = 0; - int enoent_count = 0; - int call_count = 0; - mode_t st_mode = 0; - dev_t ia_rdev = 0; - dict_t *dict = NULL; - dev_t st_rdev = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - for (i = 0; i < priv->child_count; i++) - if (sh->child_errno[i] == ENOENT) - enoent_count++; - - call_count = enoent_count; - local->call_count = call_count; - - st_mode = st_mode_from_ia (sh->buf[sh->source].ia_prot, - sh->buf[sh->source].ia_type); - ia_rdev = sh->buf[sh->source].ia_rdev; - st_rdev = makedev (ia_major (ia_rdev), ia_minor (ia_rdev)); - - gf_log (this->name, GF_LOG_TRACE, - "mknod %s mode 0%o device type %"PRId64" on %d subvolumes", - local->loc.path, st_mode, (uint64_t)st_rdev, enoent_count); - - dict = dict_new (); - if (!dict) - gf_log (this->name, GF_LOG_ERROR, "out of memory"); - - ret = afr_set_dict_gfid (dict, sh->buf[sh->source].ia_gfid); - if (ret) - gf_log (this->name, GF_LOG_INFO, "%s: gfid set failed", - local->loc.path); + AFR_ONALL(frame, afr_selfheal_lock_cbk, inodelk, dom, &loc, F_SETLK, &flock, + NULL); - for (i = 0; i < priv->child_count; i++) { - if (sh->child_errno[i] == ENOENT) { - STACK_WIND_COOKIE (frame, - sh_missing_entries_newentry_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->mknod, - &local->loc, st_mode, st_rdev, dict); - if (!--call_count) - break; - } + for (i = 0; i < priv->child_count; i++) { + if (local->replies[i].op_ret == -1 && + local->replies[i].op_errno == EAGAIN) { + afr_locked_fill(frame, this, locked_on); + afr_selfheal_uninodelk(frame, this, inode, dom, off, size, + locked_on); + + AFR_SEQ(frame, afr_selfheal_lock_cbk, inodelk, dom, &loc, F_SETLKW, + &flock, NULL); + break; } + } - if (dict) - dict_unref (dict); + loc_wipe(&loc); - return 0; + return afr_locked_fill(frame, this, locked_on); } +static void +afr_get_lock_and_eagain_counts(afr_private_t *priv, struct afr_reply *replies, + int *lock_count, int *eagain_count) +{ + int i = 0; -static int -sh_missing_entries_mkdir (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - dict_t *dict = NULL; - int i = 0; - int ret = 0; - int enoent_count = 0; - int call_count = 0; - mode_t st_mode = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - for (i = 0; i < priv->child_count; i++) - if (sh->child_errno[i] == ENOENT) - enoent_count++; - - call_count = enoent_count; - local->call_count = call_count; - - st_mode = st_mode_from_ia (sh->buf[sh->source].ia_prot, - sh->buf[sh->source].ia_type); - - dict = dict_new (); - if (!dict) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - sh_missing_entries_finish (frame, this); - return 0; + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) + continue; + if (replies[i].op_ret == 0) { + (*lock_count)++; + } else if (replies[i].op_ret == -1 && replies[i].op_errno == EAGAIN) { + (*eagain_count)++; } + } +} - ret = afr_set_dict_gfid (dict, sh->buf[sh->source].ia_gfid); - if (ret) - gf_log (this->name, GF_LOG_INFO, - "%s: inode gfid set failed", local->loc.path); +/*Do blocking locks if number of locks acquired is majority and there were some + * EAGAINs. Useful for odd-way replication*/ +int +afr_selfheal_tie_breaker_inodelk(call_frame_t *frame, xlator_t *this, + inode_t *inode, char *dom, off_t off, + size_t size, unsigned char *locked_on) +{ + loc_t loc = { + 0, + }; + struct gf_flock flock = { + 0, + }; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int lock_count = 0; + int eagain_count = 0; + priv = this->private; + local = frame->local; - gf_log (this->name, GF_LOG_TRACE, - "mkdir %s mode 0%o on %d subvolumes", - local->loc.path, st_mode, enoent_count); + loc.inode = inode_ref(inode); + gf_uuid_copy(loc.gfid, inode->gfid); - for (i = 0; i < priv->child_count; i++) { - if (sh->child_errno[i] == ENOENT) { - if (!strcmp (local->loc.path, "/")) { - /* We shouldn't try to create "/" */ - - sh_missing_entries_finish (frame, this); - - return 0; - } else { - STACK_WIND_COOKIE (frame, - sh_missing_entries_newentry_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->mkdir, - &local->loc, st_mode, dict); - if (!--call_count) - break; - } - } - } + flock.l_type = F_WRLCK; + flock.l_start = off; + flock.l_len = size; - if (dict) - dict_unref (dict); + AFR_ONALL(frame, afr_selfheal_lock_cbk, inodelk, dom, &loc, F_SETLK, &flock, + NULL); - return 0; -} + afr_get_lock_and_eagain_counts(priv, local->replies, &lock_count, + &eagain_count); + if (lock_count > priv->child_count / 2 && eagain_count) { + afr_locked_fill(frame, this, locked_on); + afr_selfheal_uninodelk(frame, this, inode, dom, off, size, locked_on); -static int -sh_missing_entries_symlink (call_frame_t *frame, xlator_t *this, - const char *link, struct iatt *buf) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - dict_t *dict = NULL; - int i = 0; - int ret = 0; - int enoent_count = 0; - int call_count = 0; - - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - for (i = 0; i < priv->child_count; i++) - if (sh->child_errno[i] == ENOENT) - enoent_count++; - - call_count = enoent_count; - local->call_count = call_count; - - dict = dict_new (); - if (!dict) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - sh_missing_entries_finish (frame, this); - return 0; - } + AFR_SEQ(frame, afr_selfheal_lock_cbk, inodelk, dom, &loc, F_SETLKW, + &flock, NULL); + } - ret = afr_set_dict_gfid (dict, buf->ia_gfid); - if (ret) - gf_log (this->name, GF_LOG_DEBUG, - "%s: dict gfid set failed", local->loc.path); + loc_wipe(&loc); - gf_log (this->name, GF_LOG_TRACE, - "symlink %s -> %s on %d subvolumes", - local->loc.path, link, enoent_count); + return afr_locked_fill(frame, this, locked_on); +} - for (i = 0; i < priv->child_count; i++) { - if (sh->child_errno[i] == ENOENT) { - STACK_WIND_COOKIE (frame, - sh_missing_entries_newentry_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->symlink, - link, &local->loc, dict); - if (!--call_count) - break; - } - } +int +afr_selfheal_uninodelk(call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, off_t off, size_t size, + const unsigned char *locked_on) +{ + loc_t loc = { + 0, + }; + struct gf_flock flock = { + 0, + }; - return 0; -} + loc.inode = inode_ref(inode); + gf_uuid_copy(loc.gfid, inode->gfid); + flock.l_type = F_UNLCK; + flock.l_start = off; + flock.l_len = size; -static int -sh_missing_entries_readlink_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - const char *link, struct iatt *sbuf) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - if (op_ret > 0) - sh_missing_entries_symlink (frame, this, link, sbuf); - else { - gf_log (this->name, GF_LOG_INFO, - "%s: failed to do readlink on %s (%s)", - local->loc.path, priv->children[sh->source]->name, - strerror (op_errno)); - sh_missing_entries_finish (frame, this); - } + AFR_ONLIST(locked_on, frame, afr_selfheal_lock_cbk, inodelk, dom, &loc, + F_SETLK, &flock, NULL); - return 0; -} + loc_wipe(&loc); + return 0; +} -static int -sh_missing_entries_readlink (call_frame_t *frame, xlator_t *this) +int +afr_selfheal_tryentrylk(call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, const char *name, unsigned char *locked_on) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; + loc_t loc = { + 0, + }; - local = frame->local; - sh = &local->self_heal; - priv = this->private; + loc.inode = inode_ref(inode); + gf_uuid_copy(loc.gfid, inode->gfid); - STACK_WIND (frame, sh_missing_entries_readlink_cbk, - priv->children[sh->source], - priv->children[sh->source]->fops->readlink, - &local->loc, 4096); + AFR_ONALL(frame, afr_selfheal_lock_cbk, entrylk, dom, &loc, name, + ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, NULL); - return 0; -} + loc_wipe(&loc); + return afr_locked_fill(frame, this, locked_on); +} -static int -sh_missing_entries_create (call_frame_t *frame, xlator_t *this) +int +afr_selfheal_entrylk(call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, const char *name, unsigned char *locked_on) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int type = 0; - int i = 0; - afr_private_t *priv = NULL; - int enoent_count = 0; - int govinda_gOvinda = 0; + loc_t loc = { + 0, + }; + afr_local_t *local = NULL; + int i = 0; + afr_private_t *priv = NULL; - local = frame->local; - sh = &local->self_heal; - priv = this->private; + priv = this->private; + local = frame->local; - for (i = 0; i < priv->child_count; i++) { - if (!local->child_up[i]) - continue; - - if (sh->child_errno[i]) { - if (sh->child_errno[i] == ENOENT) - enoent_count++; - } else { - if (type) { - if (type != sh->buf[i].ia_type) { - gf_log (this->name, GF_LOG_DEBUG, - "file %s is not recoverable " - "automatically!", - local->loc.path); - - govinda_gOvinda = 1; - } - } else { - sh->source = i; - type = sh->buf[i].ia_type; - } - } - } + loc.inode = inode_ref(inode); + gf_uuid_copy(loc.gfid, inode->gfid); - if (govinda_gOvinda) { - gf_log (this->name, GF_LOG_ERROR, - "conflicting filetypes exist for path %s. returning.", - local->loc.path); + AFR_ONALL(frame, afr_selfheal_lock_cbk, entrylk, dom, &loc, name, + ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, NULL); - local->govinda_gOvinda = 1; - sh_missing_entries_finish (frame, this); - return 0; - } + for (i = 0; i < priv->child_count; i++) { + if (local->replies[i].op_ret == -1 && + local->replies[i].op_errno == EAGAIN) { + afr_locked_fill(frame, this, locked_on); + afr_selfheal_unentrylk(frame, this, inode, dom, name, locked_on, + NULL); - if (!type) { - gf_log (this->name, GF_LOG_ERROR, - "no source found for %s. all nodes down?. returning.", - local->loc.path); - /* subvolumes down and/or file does not exist */ - sh_missing_entries_finish (frame, this); - return 0; + AFR_SEQ(frame, afr_selfheal_lock_cbk, entrylk, dom, &loc, name, + ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL); + break; } + } - if (enoent_count == 0) { - gf_log (this->name, GF_LOG_INFO, - "no missing files - %s. proceeding to metadata check", - local->loc.path); - /* proceed to next step - metadata self-heal */ - sh_missing_entries_finish (frame, this); - return 0; - } - - switch (type) { - case IA_IFSOCK: - case IA_IFREG: - case IA_IFBLK: - case IA_IFCHR: - case IA_IFIFO: - sh_missing_entries_mknod (frame, this); - break; - case IA_IFLNK: - sh_missing_entries_readlink (frame, this); - break; - case IA_IFDIR: - sh_missing_entries_mkdir (frame, this); - break; - default: - gf_log (this->name, GF_LOG_ERROR, - "%s: unknown file type: 0%o", local->loc.path, type); - local->govinda_gOvinda = 1; - sh_missing_entries_finish (frame, this); - } + loc_wipe(&loc); - return 0; + return afr_locked_fill(frame, this, locked_on); } +int +afr_selfheal_tie_breaker_entrylk(call_frame_t *frame, xlator_t *this, + inode_t *inode, char *dom, const char *name, + unsigned char *locked_on) +{ + loc_t loc = { + 0, + }; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int lock_count = 0; + int eagain_count = 0; -static int -sh_missing_entries_lookup_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *buf, dict_t *xattr, - struct iatt *postparent) -{ - int child_index = 0; - afr_local_t *local = NULL; - int call_count = 0; - afr_private_t *priv = NULL; - mode_t st_mode = 0; - - local = frame->local; - priv = this->private; - - child_index = (long) cookie; - - if (buf) - st_mode = st_mode_from_ia (buf->ia_prot, buf->ia_type); - - LOCK (&frame->lock); - { - if (op_ret == 0) { - gf_log (this->name, GF_LOG_TRACE, - "path %s on subvolume %s is of mode 0%o", - local->loc.path, - priv->children[child_index]->name, - st_mode); - - local->self_heal.buf[child_index] = *buf; - local->self_heal.parentbuf = *postparent; - } else { - gf_log (this->name, GF_LOG_INFO, - "path %s on subvolume %s => -1 (%s)", - local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - - local->self_heal.child_errno[child_index] = op_errno; - } + priv = this->private; + local = frame->local; - } - UNLOCK (&frame->lock); + loc.inode = inode_ref(inode); + gf_uuid_copy(loc.gfid, inode->gfid); - call_count = afr_frame_return (frame); + AFR_ONALL(frame, afr_selfheal_lock_cbk, entrylk, dom, &loc, name, + ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, NULL); - if (call_count == 0) { - sh_missing_entries_create (frame, this); - } + afr_get_lock_and_eagain_counts(priv, local->replies, &lock_count, + &eagain_count); - return 0; -} + if (lock_count > priv->child_count / 2 && eagain_count) { + afr_locked_fill(frame, this, locked_on); + afr_selfheal_unentrylk(frame, this, inode, dom, name, locked_on, NULL); + AFR_SEQ(frame, afr_selfheal_lock_cbk, entrylk, dom, &loc, name, + ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL); + } -static int -sh_missing_entries_lookup (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - int i = 0; - int call_count = 0; - afr_private_t *priv = NULL; - dict_t *xattr_req = NULL; - int ret = -1; - - local = frame->local; - priv = this->private; - - call_count = afr_up_children_count (priv->child_count, - local->child_up); - - local->call_count = call_count; - - xattr_req = dict_new(); - - if (xattr_req) { - for (i = 0; i < priv->child_count; i++) { - ret = dict_set_uint64 (xattr_req, - priv->pending_key[i], - 3 * sizeof(int32_t)); - if (ret < 0) - gf_log (this->name, GF_LOG_WARNING, - "%s: failed to set value for %s", - local->loc.path, priv->pending_key[i]); - } - } + loc_wipe(&loc); - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - gf_log (this->name, GF_LOG_TRACE, - "looking up %s on subvolume %s", - local->loc.path, priv->children[i]->name); - - STACK_WIND_COOKIE (frame, - sh_missing_entries_lookup_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->lookup, - &local->loc, xattr_req); - - if (!--call_count) - break; - } - } - - if (xattr_req) - dict_unref (xattr_req); - - return 0; + return afr_locked_fill(frame, this, locked_on); } - - int -afr_sh_post_nonblocking_entrylk_cbk (call_frame_t *frame, xlator_t *this) +afr_selfheal_unentrylk(call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, const char *name, unsigned char *locked_on, + dict_t *xdata) { - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; + loc_t loc = { + 0, + }; - local = frame->local; - int_lock = &local->internal_lock; + loc.inode = inode_ref(inode); + gf_uuid_copy(loc.gfid, inode->gfid); - if (int_lock->lock_op_ret < 0) { - gf_log (this->name, GF_LOG_INFO, - "Non blocking entrylks failed."); - afr_sh_missing_entries_done (frame, this); - } else { + AFR_ONLIST(locked_on, frame, afr_selfheal_lock_cbk, entrylk, dom, &loc, + name, ENTRYLK_UNLOCK, ENTRYLK_WRLCK, xdata); - gf_log (this->name, GF_LOG_DEBUG, - "Non blocking entrylks done. Proceeding to FOP"); - sh_missing_entries_lookup (frame, this); - } + loc_wipe(&loc); - return 0; + return 0; } -static int -afr_sh_entrylk (call_frame_t *frame, xlator_t *this) +gf_boolean_t +afr_is_data_set(xlator_t *this, dict_t *xdata) { - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - int_lock = &local->internal_lock; - sh = &local->self_heal; - - int_lock->transaction_lk_type = AFR_SELFHEAL_LK; - int_lock->selfheal_lk_type = AFR_ENTRY_SELF_HEAL_LK; + return afr_is_pending_set(this, xdata, AFR_DATA_TRANSACTION); +} - afr_set_lock_number (frame, this); +gf_boolean_t +afr_is_metadata_set(xlator_t *this, dict_t *xdata) +{ + return afr_is_pending_set(this, xdata, AFR_METADATA_TRANSACTION); +} - int_lock->lk_basename = local->loc.name; - int_lock->lk_loc = &sh->parent_loc; - int_lock->lock_cbk = afr_sh_post_nonblocking_entrylk_cbk; +gf_boolean_t +afr_is_entry_set(xlator_t *this, dict_t *xdata) +{ + return afr_is_pending_set(this, xdata, AFR_ENTRY_TRANSACTION); +} - afr_nonblocking_entrylk (frame, this); +/* + * This function inspects the looked up replies (in an unlocked manner) + * and decides whether a locked verification and possible healing is + * required or not. It updates the three booleans for each type + * of healing. If the boolean flag gets set to FALSE, then we are sure + * no healing is required. If the boolean flag gets set to TRUE then + * we have to proceed with locked reinspection. + */ - return 0; +int +afr_selfheal_unlocked_inspect(call_frame_t *frame, xlator_t *this, uuid_t gfid, + inode_t **link_inode, gf_boolean_t *data_selfheal, + gf_boolean_t *metadata_selfheal, + gf_boolean_t *entry_selfheal, + struct afr_reply *replies_dst) +{ + afr_private_t *priv = NULL; + inode_t *inode = NULL; + int i = 0; + int valid_cnt = 0; + struct iatt first = { + 0, + }; + int first_idx = 0; + struct afr_reply *replies = NULL; + int ret = -1; + + priv = this->private; + + inode = afr_inode_find(this, gfid); + if (!inode) + goto out; + + replies = alloca0(sizeof(*replies) * priv->child_count); + + ret = afr_selfheal_unlocked_discover(frame, inode, gfid, replies); + if (ret) + goto out; + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) + continue; + if (replies[i].op_ret == -1) + continue; + + /* The data segment of the changelog can be non-zero to indicate + * the directory needs a full heal. So the check below ensures + * it's not a directory before setting the data_selfheal boolean. + */ + if (data_selfheal && !IA_ISDIR(replies[i].poststat.ia_type) && + afr_is_data_set(this, replies[i].xdata)) + *data_selfheal = _gf_true; + + if (metadata_selfheal && afr_is_metadata_set(this, replies[i].xdata)) + *metadata_selfheal = _gf_true; + + if (entry_selfheal && afr_is_entry_set(this, replies[i].xdata)) + *entry_selfheal = _gf_true; + + valid_cnt++; + if (valid_cnt == 1) { + first = replies[i].poststat; + first_idx = i; + continue; + } + + if (!IA_EQUAL(first, replies[i].poststat, type)) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN, + "TYPE mismatch %d vs %d on %s for gfid:%s", + (int)first.ia_type, (int)replies[i].poststat.ia_type, + priv->children[i]->name, + uuid_utoa(replies[i].poststat.ia_gfid)); + gf_event(EVENT_AFR_SPLIT_BRAIN, + "client-pid=%d;" + "subvol=%s;" + "type=file;gfid=%s;" + "ia_type-%d=%s;ia_type-%d=%s", + this->ctx->cmd_args.client_pid, this->name, + uuid_utoa(replies[i].poststat.ia_gfid), first_idx, + gf_inode_type_to_str(first.ia_type), i, + gf_inode_type_to_str(replies[i].poststat.ia_type)); + ret = -EIO; + goto out; + } + + if (!IA_EQUAL(first, replies[i].poststat, uid)) { + gf_msg_debug(this->name, 0, + "UID mismatch " + "%d vs %d on %s for gfid:%s", + (int)first.ia_uid, (int)replies[i].poststat.ia_uid, + priv->children[i]->name, + uuid_utoa(replies[i].poststat.ia_gfid)); + + if (metadata_selfheal) + *metadata_selfheal = _gf_true; + } + + if (!IA_EQUAL(first, replies[i].poststat, gid)) { + gf_msg_debug(this->name, 0, + "GID mismatch " + "%d vs %d on %s for gfid:%s", + (int)first.ia_uid, (int)replies[i].poststat.ia_uid, + priv->children[i]->name, + uuid_utoa(replies[i].poststat.ia_gfid)); + + if (metadata_selfheal) + *metadata_selfheal = _gf_true; + } + + if (!IA_EQUAL(first, replies[i].poststat, prot)) { + gf_msg_debug(this->name, 0, + "MODE mismatch " + "%d vs %d on %s for gfid:%s", + (int)st_mode_from_ia(first.ia_prot, 0), + (int)st_mode_from_ia(replies[i].poststat.ia_prot, 0), + priv->children[i]->name, + uuid_utoa(replies[i].poststat.ia_gfid)); + + if (metadata_selfheal) + *metadata_selfheal = _gf_true; + } + + if (IA_ISREG(first.ia_type) && + !IA_EQUAL(first, replies[i].poststat, size)) { + gf_msg_debug(this->name, 0, + "SIZE mismatch " + "%lld vs %lld on %s for gfid:%s", + (long long)first.ia_size, + (long long)replies[i].poststat.ia_size, + priv->children[i]->name, + uuid_utoa(replies[i].poststat.ia_gfid)); + + if (data_selfheal) + *data_selfheal = _gf_true; + } + } + + if (valid_cnt > 0 && link_inode) { + *link_inode = inode_link(inode, NULL, NULL, &first); + if (!*link_inode) { + ret = -EINVAL; + goto out; + } + } else if (valid_cnt < 2) { + ret = afr_check_stale_error(replies, priv); + goto out; + } + + ret = 0; +out: + if (replies && replies_dst) + afr_replies_copy(replies_dst, replies, priv->child_count); + if (inode) + inode_unref(inode); + if (replies) + afr_replies_wipe(replies, priv->child_count); + + return ret; } -static int -afr_self_heal_missing_entries (call_frame_t *frame, xlator_t *this) +inode_t * +afr_inode_find(xlator_t *this, uuid_t gfid) { - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; + inode_table_t *table = NULL; + inode_t *inode = NULL; - local = frame->local; - int_lock = &local->internal_lock; - sh = &local->self_heal; - priv = this->private; + table = this->itable; + if (!table) + return NULL; - gf_log (this->name, GF_LOG_TRACE, - "attempting to recreate missing entries for path=%s", - local->loc.path); + inode = inode_find(table, gfid); + if (inode) + return inode; - afr_build_parent_loc (&sh->parent_loc, &local->loc); + inode = inode_new(table); + if (!inode) + return NULL; - afr_sh_entrylk (frame, this); - return 0; + gf_uuid_copy(inode->gfid, gfid); + + return inode; } -afr_local_t *afr_local_copy (afr_local_t *l, xlator_t *this) +call_frame_t * +afr_frame_create(xlator_t *this, int32_t *op_errno) { - afr_private_t *priv = NULL; - afr_local_t *lc = NULL; - afr_self_heal_t *sh = NULL; - afr_self_heal_t *shc = NULL; + call_frame_t *frame = NULL; + afr_local_t *local = NULL; + pid_t pid = GF_CLIENT_PID_SELF_HEALD; - priv = this->private; + frame = create_frame(this, this->ctx->pool); + if (!frame) { + if (op_errno) + *op_errno = ENOMEM; + return NULL; + } - sh = &l->self_heal; + local = AFR_FRAME_INIT(frame, (*op_errno)); + if (!local) { + STACK_DESTROY(frame->root); + return NULL; + } - lc = GF_CALLOC (1, sizeof (afr_local_t), - gf_afr_mt_afr_local_t); - if (!lc) - goto out; + syncopctx_setfspid(&pid); - shc = &lc->self_heal; - - shc->unwind = sh->unwind; - shc->need_data_self_heal = sh->need_data_self_heal; - shc->need_metadata_self_heal = sh->need_metadata_self_heal; - shc->need_entry_self_heal = sh->need_entry_self_heal; - shc->forced_merge = sh->forced_merge; - shc->healing_fd_opened = sh->healing_fd_opened; - shc->data_lock_held = sh->data_lock_held; - if (sh->healing_fd && !sh->healing_fd_opened) - shc->healing_fd = fd_ref (sh->healing_fd); - else - shc->healing_fd = sh->healing_fd; - shc->background = sh->background; - shc->type = sh->type; - - if (l->loc.path) - loc_copy (&lc->loc, &l->loc); - - lc->child_up = memdup (l->child_up, priv->child_count); - if (l->xattr_req) - lc->xattr_req = dict_ref (l->xattr_req); - - if (l->cont.lookup.inode) - lc->cont.lookup.inode = inode_ref (l->cont.lookup.inode); - if (l->cont.lookup.xattr) - lc->cont.lookup.xattr = dict_ref (l->cont.lookup.xattr); - if (l->internal_lock.inode_locked_nodes) - lc->internal_lock.inode_locked_nodes = - memdup (l->internal_lock.inode_locked_nodes, - priv->child_count); - else - lc->internal_lock.inode_locked_nodes = - GF_CALLOC (sizeof (*l->internal_lock.inode_locked_nodes), - priv->child_count, - gf_afr_mt_char); - if (l->internal_lock.entry_locked_nodes) - lc->internal_lock.entry_locked_nodes = - memdup (l->internal_lock.entry_locked_nodes, - priv->child_count); - else - lc->internal_lock.entry_locked_nodes = - GF_CALLOC (sizeof (*l->internal_lock.entry_locked_nodes), - priv->child_count, - gf_afr_mt_char); - if (l->internal_lock.locked_nodes) - lc->internal_lock.locked_nodes = - memdup (l->internal_lock.locked_nodes, - priv->child_count); - else - lc->internal_lock.locked_nodes = - GF_CALLOC (sizeof (*l->internal_lock.locked_nodes), - priv->child_count, - gf_afr_mt_char); + frame->root->pid = pid; - lc->internal_lock.inodelk_lock_count = - l->internal_lock.inodelk_lock_count; - lc->internal_lock.entrylk_lock_count = - l->internal_lock.entrylk_lock_count; + afr_set_lk_owner(frame, this, frame->root); -out: - return lc; + return frame; } int -afr_self_heal_completion_cbk (call_frame_t *bgsh_frame, xlator_t *this) +afr_selfheal_newentry_mark(call_frame_t *frame, xlator_t *this, inode_t *inode, + int source, struct afr_reply *replies, + unsigned char *sources, unsigned char *newentry) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - afr_self_heal_t * sh = NULL; - char sh_type_str[256] = {0,}; + int ret = 0; + int i = 0; + afr_private_t *priv = NULL; + dict_t *xattr = NULL; + int **changelog = NULL; - priv = this->private; - local = bgsh_frame->local; - sh = &local->self_heal; + priv = this->private; - if (local->govinda_gOvinda) { - afr_set_split_brain (this, local->cont.lookup.inode, - _gf_true); - } else { - afr_set_split_brain (this, local->cont.lookup.inode, - _gf_false); - } + gf_uuid_copy(inode->gfid, replies[source].poststat.ia_gfid); - afr_self_heal_type_str_get (sh, sh_type_str, - sizeof(sh_type_str)); - if (sh->op_failed) { - gf_log (this->name, GF_LOG_ERROR, "background %s self-heal " - "failed on %s", sh_type_str, local->loc.path); - } else { - gf_log (this->name, GF_LOG_INFO, "background %s self-heal " - "completed on %s", sh_type_str, local->loc.path); - } + xattr = dict_new(); + if (!xattr) + return -ENOMEM; - FRAME_SU_UNDO (bgsh_frame, afr_local_t); + changelog = afr_mark_pending_changelog(priv, newentry, xattr, + replies[source].poststat.ia_type); - if (!sh->unwound) { - sh->unwind (sh->orig_frame, this); - } + if (!changelog) { + ret = -ENOMEM; + goto out; + } - if (sh->background) { - LOCK (&priv->lock); - { - priv->background_self_heals_started--; - } - UNLOCK (&priv->lock); - } + for (i = 0; i < priv->child_count; i++) { + if (!sources[i]) + continue; + ret |= afr_selfheal_post_op(frame, this, inode, i, xattr, NULL); + } +out: + if (changelog) + afr_matrix_cleanup(changelog, priv->child_count); + if (xattr) + dict_unref(xattr); + return ret; +} - AFR_STACK_DESTROY (bgsh_frame); +int +afr_selfheal_do(call_frame_t *frame, xlator_t *this, uuid_t gfid) +{ + int ret = -1; + int entry_ret = 1; + int metadata_ret = 1; + int data_ret = 1; + int or_ret = 0; + inode_t *inode = NULL; + fd_t *fd = NULL; + gf_boolean_t data_selfheal = _gf_false; + gf_boolean_t metadata_selfheal = _gf_false; + gf_boolean_t entry_selfheal = _gf_false; + afr_private_t *priv = NULL; + + priv = this->private; + + ret = afr_selfheal_unlocked_inspect(frame, this, gfid, &inode, + &data_selfheal, &metadata_selfheal, + &entry_selfheal, NULL); + if (ret) + goto out; + + if (!(data_selfheal || metadata_selfheal || entry_selfheal)) { + ret = 2; + goto out; + } + + if (inode->ia_type == IA_IFREG) { + ret = afr_selfheal_data_open(this, inode, &fd); + if (!fd) { + ret = -EIO; + goto out; + } + } + + if (data_selfheal && priv->data_self_heal) + data_ret = afr_selfheal_data(frame, this, fd); + + if (metadata_selfheal && priv->metadata_self_heal) + metadata_ret = afr_selfheal_metadata(frame, this, inode); + + if (entry_selfheal && priv->entry_self_heal) + entry_ret = afr_selfheal_entry(frame, this, inode); + + or_ret = (data_ret | metadata_ret | entry_ret); + + if (data_ret == -EIO || metadata_ret == -EIO || entry_ret == -EIO) + ret = -EIO; + else if (data_ret == 1 && metadata_ret == 1 && entry_ret == 1) + ret = 1; + else if (or_ret < 0) + ret = or_ret; + else + ret = 0; - return 0; +out: + if (inode) + inode_unref(inode); + if (fd) + fd_unref(fd); + return ret; } +/* + * This is the entry point for healing a given GFID. The return values for this + * function are as follows: + * '0' if the self-heal is successful + * '1' if the afr-xattrs are non-zero (due to on-going IO) and no heal is needed + * '2' if the afr-xattrs are all-zero and no heal is needed + * $errno if the heal on the gfid failed. + */ int -afr_self_heal (call_frame_t *frame, xlator_t *this) +afr_selfheal(xlator_t *this, uuid_t gfid) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int i = 0; + int ret = -1; + call_frame_t *frame = NULL; + afr_local_t *local = NULL; - call_frame_t *sh_frame = NULL; - afr_local_t *sh_local = NULL; - - local = frame->local; - priv = this->private; + frame = afr_frame_create(this, NULL); + if (!frame) + return ret; - GF_ASSERT (local->loc.path); + local = frame->local; + local->xdata_req = dict_new(); - afr_set_lk_owner (frame, this); + ret = afr_selfheal_do(frame, this, gfid); - if (local->self_heal.background) { - LOCK (&priv->lock); - { - if (priv->background_self_heals_started - < priv->background_self_heal_count) { - priv->background_self_heals_started++; + if (frame) + AFR_STACK_DESTROY(frame); + return ret; +} - } else { - local->self_heal.background = _gf_false; - } - } - UNLOCK (&priv->lock); - } +afr_local_t * +__afr_dequeue_heals(afr_private_t *priv) +{ + afr_local_t *local = NULL; + + if (list_empty(&priv->heal_waiting)) + goto none; + if ((priv->background_self_heal_count > 0) && + (priv->healers >= priv->background_self_heal_count)) + goto none; + + local = list_entry(priv->heal_waiting.next, afr_local_t, healer); + priv->heal_waiters--; + GF_ASSERT(priv->heal_waiters >= 0); + list_del_init(&local->healer); + list_add(&local->healer, &priv->healing); + priv->healers++; + return local; +none: + gf_msg_debug(THIS->name, 0, + "Nothing dequeued. " + "Num healers: %d, Num Waiters: %d", + priv->healers, priv->heal_waiters); + return NULL; +} - gf_log (this->name, GF_LOG_TRACE, - "performing self heal on %s (metadata=%d data=%d entry=%d)", - local->loc.path, - local->self_heal.need_metadata_self_heal, - local->self_heal.need_data_self_heal, - local->self_heal.need_entry_self_heal); - - sh_frame = copy_frame (frame); - sh_local = afr_local_copy (local, this); - sh_frame->local = sh_local; - sh = &sh_local->self_heal; - - sh->orig_frame = frame; - - sh->completion_cbk = afr_self_heal_completion_cbk; - - sh->buf = GF_CALLOC (priv->child_count, sizeof (struct iatt), - gf_afr_mt_iatt); - sh->child_errno = GF_CALLOC (priv->child_count, sizeof (int), - gf_afr_mt_int); - sh->success = GF_CALLOC (priv->child_count, sizeof (int), - gf_afr_mt_int); - sh->xattr = GF_CALLOC (priv->child_count, sizeof (dict_t *), - gf_afr_mt_dict_t); - sh->sources = GF_CALLOC (sizeof (*sh->sources), priv->child_count, - gf_afr_mt_int); - sh->locked_nodes = GF_CALLOC (sizeof (*sh->locked_nodes), - priv->child_count, - gf_afr_mt_int); - - sh->pending_matrix = GF_CALLOC (sizeof (int32_t *), priv->child_count, - gf_afr_mt_int32_t); +int +afr_refresh_selfheal_wrap(void *opaque) +{ + call_frame_t *heal_frame = opaque; + afr_local_t *local = heal_frame->local; + int ret = 0; - for (i = 0; i < priv->child_count; i++) { - sh->pending_matrix[i] = GF_CALLOC (sizeof (int32_t), - priv->child_count, - gf_afr_mt_int32_t); - } + ret = afr_selfheal(heal_frame->this, local->refreshinode->gfid); + return ret; +} - sh->delta_matrix = GF_CALLOC (sizeof (int32_t *), priv->child_count, - gf_afr_mt_int32_t); - for (i = 0; i < priv->child_count; i++) { - sh->delta_matrix[i] = GF_CALLOC (sizeof (int32_t), - priv->child_count, - gf_afr_mt_int32_t); - } - sh->child_success = GF_CALLOC (sizeof (*sh->child_success), - priv->child_count, gf_afr_mt_int32_t); +int +afr_refresh_heal_done(int ret, call_frame_t *frame, void *opaque) +{ + call_frame_t *heal_frame = opaque; + xlator_t *this = heal_frame->this; + afr_private_t *priv = this->private; + afr_local_t *local = heal_frame->local; + + LOCK(&priv->lock); + { + list_del_init(&local->healer); + priv->healers--; + GF_ASSERT(priv->healers >= 0); + local = __afr_dequeue_heals(priv); + } + UNLOCK(&priv->lock); + + AFR_STACK_DESTROY(heal_frame); + + if (local) + afr_heal_synctask(this, local); + return 0; +} +void +afr_heal_synctask(xlator_t *this, afr_local_t *local) +{ + int ret = 0; + call_frame_t *heal_frame = NULL; + + heal_frame = local->heal_frame; + ret = synctask_new(this->ctx->env, afr_refresh_selfheal_wrap, + afr_refresh_heal_done, heal_frame, heal_frame); + if (ret < 0) + /* Heal not launched. Will be queued when the next inode + * refresh happens and shd hasn't healed it yet. */ + afr_refresh_heal_done(ret, heal_frame, heal_frame); +} - FRAME_SU_DO (sh_frame, afr_local_t); - if (local->success_count && local->enoent_count) { - afr_self_heal_missing_entries (sh_frame, this); +gf_boolean_t +afr_throttled_selfheal(call_frame_t *frame, xlator_t *this) +{ + gf_boolean_t can_heal = _gf_true; + afr_private_t *priv = this->private; + afr_local_t *local = frame->local; + + LOCK(&priv->lock); + { + if ((priv->background_self_heal_count > 0) && + (priv->heal_wait_qlen + priv->background_self_heal_count) > + (priv->heal_waiters + priv->healers)) { + list_add_tail(&local->healer, &priv->heal_waiting); + priv->heal_waiters++; + local = __afr_dequeue_heals(priv); } else { - gf_log (this->name, GF_LOG_TRACE, - "proceeding to metadata check on %s", - local->loc.path); - - afr_sh_missing_entries_done (sh_frame, this); + can_heal = _gf_false; } + } + UNLOCK(&priv->lock); - return 0; + if (can_heal) { + if (local) + afr_heal_synctask(this, local); + else + gf_msg_debug(this->name, 0, + "Max number of heals are " + "pending, background self-heal rejected."); + } + + return can_heal; } -void -afr_self_heal_type_str_get (afr_self_heal_t *self_heal_p, char *str, - size_t size) +int +afr_choose_source_by_policy(afr_private_t *priv, unsigned char *sources, + afr_transaction_type type) { - GF_ASSERT (str && (size > strlen (" meta-data data entry"))); + int source = -1; + int i = 0; - if (self_heal_p->need_metadata_self_heal) { - snprintf(str, size, " meta-data"); - } + /* Give preference to local child to save on bandwidth */ + for (i = 0; i < priv->child_count; i++) { + if (priv->local[i] && sources[i]) { + if ((type == AFR_DATA_TRANSACTION) && AFR_IS_ARBITER_BRICK(priv, i)) + continue; - if (self_heal_p->need_data_self_heal) { - snprintf(str + strlen(str), size - strlen(str), " data"); + source = i; + goto out; } + } - if (self_heal_p->need_entry_self_heal) { - snprintf(str + strlen(str), size - strlen(str), " entry"); + for (i = 0; i < priv->child_count; i++) { + if (sources[i]) { + source = i; + goto out; } + } +out: + return source; } -afr_self_heal_type -afr_self_heal_type_for_transaction (afr_transaction_type type) +static int +afr_anon_inode_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - afr_self_heal_type sh_type = AFR_SELF_HEAL_INVALID; + afr_local_t *local = frame->local; + int i = (long)cookie; + + local->replies[i].valid = 1; + local->replies[i].op_ret = op_ret; + local->replies[i].op_errno = op_errno; + if (op_ret == 0) { + local->op_ret = 0; + local->replies[i].poststat = *buf; + local->replies[i].preparent = *preparent; + local->replies[i].postparent = *postparent; + } + if (xdata) { + local->replies[i].xdata = dict_ref(xdata); + } + + syncbarrier_wake(&local->barrier); + return 0; +} - switch (type) { - case AFR_DATA_TRANSACTION: - sh_type = AFR_SELF_HEAL_DATA; - break; - case AFR_METADATA_TRANSACTION: - sh_type = AFR_SELF_HEAL_METADATA; - break; - case AFR_ENTRY_TRANSACTION: - sh_type = AFR_SELF_HEAL_ENTRY; - break; - case AFR_ENTRY_RENAME_TRANSACTION: - GF_ASSERT (0); - break; +int +afr_anon_inode_create(xlator_t *this, int child, inode_t **linked_inode) +{ + call_frame_t *frame = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = this->private; + unsigned char *mkdir_on = alloca0(priv->child_count); + unsigned char *lookup_on = alloca0(priv->child_count); + loc_t loc = {0}; + int32_t op_errno = 0; + int32_t child_op_errno = 0; + struct iatt iatt = {0}; + dict_t *xdata = NULL; + uuid_t anon_inode_gfid = {0}; + int mkdir_count = 0; + int i = 0; + + /*Try to mkdir everywhere and return success if the dir exists on 'child' + */ + + if (!priv->use_anon_inode) { + op_errno = EINVAL; + goto out; + } + + frame = afr_frame_create(this, &op_errno); + if (op_errno) { + goto out; + } + local = frame->local; + if (!local->child_up[child]) { + /*Other bricks may need mkdir so don't error out yet*/ + child_op_errno = ENOTCONN; + } + gf_uuid_parse(priv->anon_gfid_str, anon_inode_gfid); + for (i = 0; i < priv->child_count; i++) { + if (!local->child_up[i]) + continue; + + if (priv->anon_inode[i]) { + mkdir_on[i] = 0; + } else { + mkdir_on[i] = 1; + mkdir_count++; + } + } + + if (mkdir_count == 0) { + *linked_inode = inode_find(this->itable, anon_inode_gfid); + if (*linked_inode) { + op_errno = 0; + goto out; + } + } + + loc.parent = inode_ref(this->itable->root); + loc.name = priv->anon_inode_name; + loc.inode = inode_new(this->itable); + if (!loc.inode) { + op_errno = ENOMEM; + goto out; + } + + xdata = dict_new(); + if (!xdata) { + op_errno = ENOMEM; + goto out; + } + + op_errno = -dict_set_gfuuid(xdata, "gfid-req", anon_inode_gfid, _gf_true); + if (op_errno) { + goto out; + } + + if (mkdir_count == 0) { + memcpy(lookup_on, local->child_up, priv->child_count); + goto lookup; + } + + AFR_ONLIST(mkdir_on, frame, afr_anon_inode_mkdir_cbk, mkdir, &loc, 0755, 0, + xdata); + + for (i = 0; i < priv->child_count; i++) { + if (!mkdir_on[i]) { + continue; + } + + if (local->replies[i].op_ret == 0) { + priv->anon_inode[i] = 1; + iatt = local->replies[i].poststat; + } else if (local->replies[i].op_ret < 0 && + local->replies[i].op_errno == EEXIST) { + lookup_on[i] = 1; + } else if (i == child) { + child_op_errno = local->replies[i].op_errno; + } + } + + if (AFR_COUNT(lookup_on, priv->child_count) == 0) { + goto link; + } + +lookup: + AFR_ONLIST(lookup_on, frame, afr_selfheal_discover_cbk, lookup, &loc, + xdata); + for (i = 0; i < priv->child_count; i++) { + if (!lookup_on[i]) { + continue; + } + + if (local->replies[i].op_ret == 0) { + if (gf_uuid_compare(anon_inode_gfid, + local->replies[i].poststat.ia_gfid) == 0) { + priv->anon_inode[i] = 1; + iatt = local->replies[i].poststat; + } else { + if (i == child) + child_op_errno = EINVAL; + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_DATA, + "%s has gfid: %s", priv->anon_inode_name, + uuid_utoa(local->replies[i].poststat.ia_gfid)); + } + } else if (i == child) { + child_op_errno = local->replies[i].op_errno; + } + } +link: + if (!gf_uuid_is_null(iatt.ia_gfid)) { + *linked_inode = inode_link(loc.inode, loc.parent, loc.name, &iatt); + if (*linked_inode) { + op_errno = 0; + inode_lookup(*linked_inode); + } else { + op_errno = ENOMEM; } - return sh_type; + goto out; + } + +out: + if (xdata) + dict_unref(xdata); + loc_wipe(&loc); + /*child_op_errno takes precedence*/ + if (child_op_errno == 0) { + child_op_errno = op_errno; + } + + if (child_op_errno && *linked_inode) { + inode_unref(*linked_inode); + *linked_inode = NULL; + } + if (frame) + AFR_STACK_DESTROY(frame); + return -child_op_errno; } |
