diff options
Diffstat (limited to 'xlators/cluster/afr/src/afr-self-heal-common.c')
| -rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-common.c | 1770 |
1 files changed, 690 insertions, 1080 deletions
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c index 7c2e403c7..4dac83113 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.c +++ b/xlators/cluster/afr/src/afr-self-heal-common.c @@ -1,1399 +1,1009 @@ /* - Copyright (c) 2008-2009 Z RESEARCH, Inc. <http://www.zresearch.com> + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ -#include "glusterfs.h" -#include "xlator.h" -#include "byte-order.h" + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif #include "afr.h" -#include "afr-transaction.h" -#include "afr-self-heal-common.h" #include "afr-self-heal.h" +#include "byte-order.h" -/** - * select_source - select a source and return it - * TODO: take into account option 'favorite-child' - */ - int -afr_sh_select_source (int sources[], int child_count) +afr_selfheal_post_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xattr, dict_t *xdata) { - int i; - for (i = 0; i < child_count; i++) - if (sources[i]) - return i; - - return -1; -} + afr_local_t *local = NULL; + local = frame->local; -/** - * sink_count - return number of sinks in sources array - */ + syncbarrier_wake (&local->barrier); -int -afr_sh_sink_count (int sources[], int child_count) -{ - int i; - int sinks = 0; - for (i = 0; i < child_count; i++) - if (!sources[i]) - sinks++; - return sinks; + return 0; } + int -afr_sh_source_count (int sources[], int child_count) +afr_selfheal_post_op (call_frame_t *frame, xlator_t *this, inode_t *inode, + int subvol, dict_t *xattr) { - int i; - int nsource = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + loc_t loc = {0, }; - for (i = 0; i < child_count; i++) - if (sources[i]) - nsource++; - return nsource; -} + priv = this->private; + local = frame->local; + loc.inode = inode_ref (inode); + uuid_copy (loc.gfid, inode->gfid); -int -afr_sh_supress_errenous_children (int sources[], int child_errno[], - int child_count) -{ - int i = 0; + STACK_WIND (frame, afr_selfheal_post_op_cbk, priv->children[subvol], + priv->children[subvol]->fops->xattrop, &loc, + GF_XATTROP_ADD_ARRAY, xattr, NULL); - for (i = 0; i < child_count; i++) { - if (child_errno[i] && sources[i]) { - sources[i] = 0; - } - } + syncbarrier_wait (&local->barrier, 1); return 0; } -int -afr_sh_supress_empty_children (int sources[], dict_t *xattr[], - struct stat *buf, - int child_count, const char *key) +dict_t * +afr_selfheal_output_xattr (xlator_t *this, afr_transaction_type type, + int *output_dirty, int **output_matrix, int subvol) { - int i = 0; - int32_t *pending = NULL; - int ret = 0; - int all_xattr_missing = 1; - - /* if the file was created by afr with xattrs */ - for (i = 0; i < child_count; i++) { - if (!xattr[i]) - continue; + dict_t *xattr = NULL; + afr_private_t *priv = NULL; + int j = 0; + int idx = 0; + int ret = 0; + int *raw = 0; - ret = dict_get_ptr (xattr[i], (char *)key, VOID(&pending)); - if (ret != 0) { - continue; - } + priv = this->private; + idx = afr_index_for_transaction_type (type); - all_xattr_missing = 0; - break; - } + xattr = dict_new (); + if (!xattr) + return NULL; - if (all_xattr_missing) { - /* supress 0byte files.. this avoids empty file created - by dir selfheal to overwrite the 'good' file */ - for (i = 0; i < child_count; i++) { - if (!buf[i].st_size) - sources[i] = 0; - } - goto out; - } + if (output_dirty[subvol]) { + /* clear dirty */ + raw = GF_CALLOC (sizeof(int), AFR_NUM_CHANGE_LOGS, gf_afr_mt_int32_t); + if (!raw) + goto err; + raw[idx] = hton32 (output_dirty[subvol]); + ret = dict_set_bin (xattr, AFR_DIRTY, raw, + sizeof(int) * AFR_NUM_CHANGE_LOGS); + if (ret) + goto err; + } - for (i = 0; i < child_count; i++) { - if (!xattr[i]) { - sources[i] = 0; + /* clear/set pending */ + for (j = 0; j < priv->child_count; j++) { + if (!output_matrix[subvol][j]) continue; - } - ret = dict_get_ptr (xattr[i], (char *)key, VOID(&pending)); - if (ret != 0) { - sources[i] = 0; - continue; - } + raw = GF_CALLOC (sizeof(int), AFR_NUM_CHANGE_LOGS, + gf_afr_mt_int32_t); + if (!raw) + goto err; - if (!pending) { - sources[i] = 0; - continue; - } + raw[idx] = hton32 (output_matrix[subvol][j]); + + ret = dict_set_bin (xattr, priv->pending_key[j], + raw, sizeof(int) * AFR_NUM_CHANGE_LOGS); + if (ret) + goto err; } -out: - return 0; + return xattr; +err: + if (xattr) + dict_unref (xattr); + return NULL; } -void -afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this) +int +afr_selfheal_undo_pending (call_frame_t *frame, xlator_t *this, inode_t *inode, + unsigned char *sources, unsigned char *sinks, + unsigned char *healed_sinks, afr_transaction_type type, + struct afr_reply *replies, unsigned char *locked_on) { - afr_private_t * priv = this->private; + afr_private_t *priv = NULL; + int i = 0; + int j = 0; + unsigned char *pending = NULL; + int *input_dirty = NULL; + int **input_matrix = NULL; + int *output_dirty = NULL; + int **output_matrix = NULL; + dict_t *xattr = NULL; + + priv = this->private; - char *buf = NULL; - char *ptr = NULL; + pending = alloca0 (priv->child_count); - int i, j; + input_dirty = alloca0 (priv->child_count * sizeof (int)); + input_matrix = ALLOC_MATRIX (priv->child_count, int); + output_dirty = alloca0 (priv->child_count * sizeof (int)); + output_matrix = ALLOC_MATRIX (priv->child_count, int); - /* 10 digits per entry + 1 space + '[' and ']' */ - buf = MALLOC (priv->child_count * 11 + 8); + afr_selfheal_extract_xattr (this, replies, type, input_dirty, + input_matrix); + + for (i = 0; i < priv->child_count; i++) + if (sinks[i] && !healed_sinks[i]) + pending[i] = 1; for (i = 0; i < priv->child_count; i++) { - ptr = buf; - ptr += sprintf (ptr, "[ "); for (j = 0; j < priv->child_count; j++) { - ptr += sprintf (ptr, "%d ", pending_matrix[i][j]); + if (pending[j]) + output_matrix[i][j] = 1; + else + output_matrix[i][j] = -input_matrix[i][j]; } - ptr += sprintf (ptr, "]"); - gf_log (this->name, GF_LOG_DEBUG, - "pending_matrix: %s", buf); } - FREE (buf); -} - - -void -afr_sh_build_pending_matrix (int32_t *pending_matrix[], dict_t *xattr[], - int child_count, const char *key) -{ - int i = 0; - int j = 0; - int32_t *pending = NULL; - int ret = -1; - - unsigned char *ignorant_subvols = NULL; - - ignorant_subvols = CALLOC (sizeof (*ignorant_subvols), child_count); - - /* start clean */ - for (i = 0; i < child_count; i++) { - for (j = 0; j < child_count; j++) { - pending_matrix[i][j] = 0; - } + for (i = 0; i < priv->child_count; i++) { + if (!pending[i]) + output_dirty[i] = -input_dirty[i]; } - for (i = 0; i < child_count; i++) { - if (!xattr[i]) + for (i = 0; i < priv->child_count; i++) { + if (!locked_on[i]) + /* perform post-op only on subvols we had locked + and inspected on. + */ continue; - pending = NULL; - - ret = dict_get_ptr (xattr[i], (char *) key, - VOID(&pending)); - if (ret != 0) { - /* - * There is no xattr present. This means this - * subvolume should be considered an 'ignorant' - * subvolume. - */ - - ignorant_subvols[i] = 1; + xattr = afr_selfheal_output_xattr (this, type, output_dirty, + output_matrix, i); + if (!xattr) { + gf_log (this->name, GF_LOG_ERROR, + "unable to allocate xdata for subvol %d", i); continue; - } - - for (j = 0; j < child_count; j++) { - pending_matrix[i][j] = ntoh32 (pending[j]); } - } - - /* - * Make all non-ignorant subvols point towards the ignorant - * subvolumes. - */ - - for (i = 0; i < child_count; i++) { - if (ignorant_subvols[i]) { - for (j = 0; j < child_count; j++) { - if (!ignorant_subvols[j]) - pending_matrix[j][i] += 1; - } - } - } -} + afr_selfheal_post_op (frame, this, inode, i, xattr); -/** - * mark_sources: Mark all 'source' nodes and return number of source - * nodes found - * - * A node (a row in the pending matrix) belongs to one of - * three categories: - * - * M is the pending matrix. - * - * 'innocent' - M[i] is all zeroes - * 'fool' - M[i] has i'th element = 1 (self-reference) - * 'wise' - M[i] has i'th element = 0, others are 1 or 0. - * - * All 'innocent' nodes are sinks. If all nodes are innocent, no self-heal is - * needed. - * - * A 'wise' node can be a source. If two 'wise' nodes conflict, it is - * a split-brain. If one wise node refers to the other but the other doesn't - * refer back, the referrer is a source. - * - * All fools are sinks, unless there are no 'wise' nodes. In that case, - * one of the fools is made a source. - */ - -typedef enum { - AFR_NODE_INNOCENT, - AFR_NODE_FOOL, - AFR_NODE_WISE -} afr_node_type; - -typedef struct { - afr_node_type type; - int wisdom; -} afr_node_character; - - -static int -afr_sh_is_innocent (int32_t *array, int child_count) -{ - int i = 0; - int ret = 1; /* innocent until proven guilty */ - - for (i = 0; i < child_count; i++) { - if (array[i]) { - ret = 0; - break; - } - } + dict_unref (xattr); + } - return ret; + return 0; } -static int -afr_sh_is_fool (int32_t *array, int i, int child_count) +void +afr_replies_copy (struct afr_reply *dst, struct afr_reply *src, int count) { - return array[i]; /* fool if accuses itself */ + int i = 0; + dict_t *xdata = NULL; + + if (dst == src) + return; + + for (i = 0; i < count; i++) { + dst[i].valid = src[i].valid; + dst[i].op_ret = src[i].op_ret; + dst[i].op_errno = src[i].op_errno; + dst[i].prestat = src[i].prestat; + dst[i].poststat = src[i].poststat; + dst[i].preparent = src[i].preparent; + dst[i].postparent = src[i].postparent; + dst[i].preparent2 = src[i].preparent2; + dst[i].postparent2 = src[i].postparent2; + if (src[i].xdata) + xdata = dict_ref (src[i].xdata); + else + xdata = NULL; + if (dst[i].xdata) + dict_unref (dst[i].xdata); + dst[i].xdata = xdata; + memcpy (dst[i].checksum, src[i].checksum, + MD5_DIGEST_LENGTH); + } } -static int -afr_sh_is_wise (int32_t *array, int i, int child_count) +int +afr_selfheal_fill_dirty (xlator_t *this, int *dirty, int subvol, + int idx, dict_t *xdata) { - return !array[i]; /* wise if does not accuse itself */ -} - + void *pending_raw = NULL; + int pending[3] = {0, }; -static int -afr_sh_all_nodes_innocent (afr_node_character *characters, - int child_count) -{ - int i = 0; - int ret = 1; + if (dict_get_ptr (xdata, AFR_DIRTY, &pending_raw)) + return -1; - for (i = 0; i < child_count; i++) { - if (characters[i].type != AFR_NODE_INNOCENT) { - ret = 0; - break; - } - } + if (!pending_raw) + return -1; - return ret; -} + memcpy (pending, pending_raw, sizeof(pending)); + dirty[subvol] = ntoh32 (pending[idx]); -static int -afr_sh_wise_nodes_exist (afr_node_character *characters, int child_count) -{ - int i = 0; - int ret = 0; - - for (i = 0; i < child_count; i++) { - if (characters[i].type == AFR_NODE_WISE) { - ret = 1; - break; - } - } - - return ret; + return 0; } -/* - * The 'wisdom' of a wise node is 0 if any other wise node accuses it. - * It is 1 if no other wise node accuses it. - * Only wise nodes with wisdom 1 are sources. - * - * If no nodes with wisdom 1 exist, a split-brain has occured. - */ - -static void -afr_sh_compute_wisdom (int32_t *pending_matrix[], - afr_node_character characters[], int child_count) +int +afr_selfheal_fill_matrix (xlator_t *this, int **matrix, int subvol, + int idx, dict_t *xdata) { - int i = 0; - int j = 0; - - for (i = 0; i < child_count; i++) { - if (characters[i].type == AFR_NODE_WISE) { - characters[i].wisdom = 1; - - for (j = 0; j < child_count; j++) { - if ((characters[j].type == AFR_NODE_WISE) - && pending_matrix[j][i]) { - - characters[i].wisdom = 0; - } - } - } - } -} - + int i = 0; + void *pending_raw = NULL; + int pending[3] = {0, }; + afr_private_t *priv = NULL; -static int -afr_sh_wise_nodes_conflict (afr_node_character *characters, - int child_count) -{ - int i = 0; - int ret = 1; + priv = this->private; - for (i = 0; i < child_count; i++) { - if ((characters[i].type == AFR_NODE_WISE) - && characters[i].wisdom == 1) { + for (i = 0; i < priv->child_count; i++) { + if (dict_get_ptr (xdata, priv->pending_key[i], &pending_raw)) + continue; - /* There is atleast one bona-fide wise node */ - ret = 0; - break; - } - } + if (!pending_raw) + continue; - return ret; -} + memcpy (pending, pending_raw, sizeof(pending)); + matrix[subvol][i] = ntoh32 (pending[idx]); + } -static int -afr_sh_mark_wisest_as_sources (int sources[], - afr_node_character *characters, - int child_count) -{ - int nsources = 0; - - int i = 0; - - for (i = 0; i < child_count; i++) { - if (characters[i].wisdom == 1) { - sources[i] = 1; - nsources++; - } - } - - return nsources; + return 0; } -static int -afr_sh_mark_if_size_differs (afr_self_heal_t *sh, int child_count) +int +afr_selfheal_extract_xattr (xlator_t *this, struct afr_reply *replies, + afr_transaction_type type, int *dirty, int **matrix) { - int32_t ** pending_matrix; - int i, j; + afr_private_t *priv = NULL; + int i = 0; + dict_t *xdata = NULL; + int idx = -1; - int size_differs = 0; + idx = afr_index_for_transaction_type (type); - pending_matrix = sh->pending_matrix; + priv = this->private; - for (i = 0; i < child_count; i++) { - for (j = 0; j < child_count; j++) { - if (SIZE_DIFFERS (&sh->buf[i], &sh->buf[j]) - && (pending_matrix[i][j] == 0) - && (pending_matrix[j][i] == 0)) { - - pending_matrix[i][j] = 1; - pending_matrix[j][i] = 1; + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].xdata) + continue; - size_differs = 1; - } - } - } + xdata = replies[i].xdata; - return size_differs; -} + afr_selfheal_fill_dirty (this, dirty, i, idx, xdata); + afr_selfheal_fill_matrix (this, matrix, i, idx, xdata); + } - -static int -afr_sh_mark_biggest_fool_as_source (afr_self_heal_t *sh, - afr_node_character *characters, - int child_count) -{ - int i = 0; - int biggest = 0; - - for (i = 0; i < child_count; i++) { - if (characters[i].type == AFR_NODE_FOOL) { - biggest = i; - break; - } - } - - for (i = 0; i < child_count; i++) { - if (characters[i].type != AFR_NODE_FOOL) - continue; - - if (SIZE_GREATER (&sh->buf[i], &sh->buf[biggest])) { - biggest = i; - } - } - - sh->sources[biggest] = 1; - - return 1; + return 0; } -static int -afr_sh_mark_biggest_as_source (afr_self_heal_t *sh, int child_count) -{ - int biggest = 0; - int i; - - for (i = 0; i < child_count; i++) { - if (SIZE_GREATER (&sh->buf[i], &sh->buf[biggest])) { - biggest = i; - } - } - - sh->sources[biggest] = 1; - - return 1; -} +/* + * This function determines if a self-heal is required for a given inode, + * and if needed, in what direction. + * + * locked_on[] is the array representing servers which have been locked and + * from which xattrs have been fetched for analysis. + * + * The output of the function is by filling the arrays sources[] and sinks[]. + * + * sources[i] is set if i'th server is an eligible source for a selfheal. + * + * sinks[i] is set if i'th server needs to be healed. + * + * if sources[0..N] are all set, there is no need for a selfheal. + * + * if sinks[0..N] are all set, the inode is in split brain. + * + */ int -afr_sh_mark_sources (afr_self_heal_t *sh, int child_count, - afr_self_heal_type type) +afr_selfheal_find_direction (call_frame_t *frame, xlator_t *this, + struct afr_reply *replies, + afr_transaction_type type, unsigned char *locked_on, + unsigned char *sources, unsigned char *sinks) { + afr_private_t *priv = NULL; int i = 0; + int j = 0; + int *dirty = NULL; + int **matrix = NULL; + char *accused = NULL; - int32_t ** pending_matrix; - int * sources; - - int size_differs = 0; - - pending_matrix = sh->pending_matrix; - sources = sh->sources; + priv = this->private; - int nsources = 0; + dirty = alloca0 (priv->child_count * sizeof (int)); + accused = alloca0 (priv->child_count); + matrix = ALLOC_MATRIX(priv->child_count, int); - /* stores the 'characters' (innocent, fool, wise) of the nodes */ - afr_node_character * - characters = CALLOC (sizeof (afr_node_character), - child_count); + /* First construct the pending matrix for further analysis */ + afr_selfheal_extract_xattr (this, replies, type, dirty, matrix); - /* start clean */ - for (i = 0; i < child_count; i++) { - sources[i] = 0; + /* Next short list all accused to exclude them from being sources */ + for (i = 0; i < priv->child_count; i++) { + for (j = 0; j < priv->child_count; j++) { + if (matrix[i][j]) + accused[j] = 1; + } } - - for (i = 0; i < child_count; i++) { - if (afr_sh_is_innocent (pending_matrix[i], child_count)) { - characters[i].type = AFR_NODE_INNOCENT; - - } else if (afr_sh_is_fool (pending_matrix[i], i, child_count)) { - characters[i].type = AFR_NODE_FOOL; - - } else if (afr_sh_is_wise (pending_matrix[i], i, child_count)) { - characters[i].type = AFR_NODE_WISE; - - } else { - gf_log ("[module:afr]", GF_LOG_ERROR, - "node %d is diabolical! " - "(This message should never appear." - " Please file a bug report.)", i); - } - } - - if (type == AFR_SELF_HEAL_DATA) { - size_differs = afr_sh_mark_if_size_differs (sh, child_count); - } - - if (afr_sh_all_nodes_innocent (characters, child_count)) { - if (size_differs) { - nsources = afr_sh_mark_biggest_as_source (sh, - child_count); - } - - } else if (afr_sh_wise_nodes_exist (characters, child_count)) { - afr_sh_compute_wisdom (pending_matrix, characters, child_count); - - if (afr_sh_wise_nodes_conflict (characters, child_count)) { - /* split-brain */ - - nsources = -1; - goto out; - - } else { - nsources = afr_sh_mark_wisest_as_sources (sources, - characters, - child_count); - } - } else { - nsources = afr_sh_mark_biggest_fool_as_source (sh, characters, - child_count); - } - -out: - return nsources; -} - - -void -afr_sh_pending_to_delta (dict_t **xattr, char *key, - int32_t *delta_matrix[], int success[], - int child_count) -{ - int i = 0; - int j = 0; - int32_t * pending = NULL; - int ret = 0; + /* Short list all non-accused as sources */ + memset (sources, 0, priv->child_count); + for (i = 0; i < priv->child_count; i++) { + if (!accused[i] && locked_on[i]) + sources[i] = 1; + } - /* start clean */ - for (i = 0; i < child_count; i++) { - for (j = 0; j < child_count; j++) { - delta_matrix[i][j] = 0; + /* Everyone accused by sources are sinks */ + memset (sinks, 0, priv->child_count); + for (i = 0; i < priv->child_count; i++) { + if (!sources[i]) + continue; + for (j = 0; j < priv->child_count; j++) { + if (matrix[i][j]) + sinks[j] = 1; } } - for (i = 0; i < child_count; i++) { - pending = NULL; - - ret = dict_get_ptr (xattr[i], (char *) key, - VOID (&pending)); - - for (j = 0; j < child_count; j++) { - if (!success[j]) - continue; + /* If any source has 'dirty' bit, pick first + 'dirty' source and make everybody else sinks */ + for (i = 0; i < priv->child_count; i++) { + if (sources[i] && dirty[i]) { + for (j = 0; j < priv->child_count; j++) { + if (j != i) { + sources[j] = 0; + sinks[j] = 1; + } + } + break; + } + } - if (pending) { - delta_matrix[i][j] = -(ntoh32 (pending[j])); - } else { - delta_matrix[i][j] = 0; - } + /* If no sources, all locked nodes are sinks - split brain */ + if (AFR_COUNT (sources, priv->child_count) == 0) { + for (i = 0; i < priv->child_count; i++) { + if (locked_on[i]) + sinks[i] = 1; } } + + return 0; } int -afr_sh_delta_to_xattr (int32_t *delta_matrix[], dict_t *xattr[], - int child_count, const char *key) +afr_selfheal_discover_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, + struct iatt *buf, dict_t *xdata, struct iatt *parbuf) { - int i = 0; - int j = 0; - - int ret = 0; + afr_local_t *local = NULL; + int i = -1; - int32_t *pending = 0; - - for (i = 0; i < child_count; i++) { - if (!xattr[i]) - continue; + local = frame->local; + i = (long) cookie; - pending = CALLOC (sizeof (int32_t), child_count); - for (j = 0; j < child_count; j++) { - pending[j] = hton32 (delta_matrix[i][j]); - } + local->replies[i].valid = 1; + local->replies[i].op_ret = op_ret; + local->replies[i].op_errno = op_errno; + if (buf) + local->replies[i].poststat = *buf; + if (parbuf) + local->replies[i].postparent = *parbuf; + if (xdata) + local->replies[i].xdata = dict_ref (xdata); - ret = dict_set_bin (xattr[i], (char *) key, pending, - child_count * sizeof (int32_t)); - } + syncbarrier_wake (&local->barrier); return 0; } -int -afr_sh_has_metadata_pending (dict_t *xattr, int child_count, xlator_t *this) +inode_t * +afr_selfheal_unlocked_lookup_on (call_frame_t *frame, inode_t *parent, + const char *name, struct afr_reply *replies, + unsigned char *lookup_on) { + loc_t loc = {0, }; + dict_t *xattr_req = NULL; + afr_local_t *local = NULL; afr_private_t *priv = NULL; - int32_t *pending = NULL; - void *tmp_pending = NULL; /* This is required to remove 'type-punned' warnings from gcc */ + inode_t *inode = NULL; - int ret = -1; - int i = 0; - - priv = this->private; + local = frame->local; + priv = frame->this->private; - ret = dict_get_ptr (xattr, AFR_METADATA_PENDING, &tmp_pending); + xattr_req = dict_new (); + if (!xattr_req) + return NULL; - if (ret != 0) - return 0; + if (afr_xattr_req_prepare (frame->this, xattr_req) != 0) { + dict_destroy (xattr_req); + return NULL; + } - pending = tmp_pending; - for (i = 0; i < priv->child_count; i++) { - if (i == child_count) - continue; - if (pending[i]) - return 1; + inode = inode_new (parent->table); + if (!inode) { + dict_destroy (xattr_req); + return NULL; } - return 0; + loc.parent = inode_ref (parent); + uuid_copy (loc.pargfid, parent->gfid); + loc.name = name; + loc.inode = inode_ref (inode); + + AFR_ONLIST (lookup_on, frame, afr_selfheal_discover_cbk, lookup, &loc, + xattr_req); + + afr_replies_copy (replies, local->replies, priv->child_count); + + loc_wipe (&loc); + dict_unref (xattr_req); + + return inode; } int -afr_sh_has_data_pending (dict_t *xattr, int child_count, xlator_t *this) +afr_selfheal_unlocked_discover_on (call_frame_t *frame, inode_t *inode, + uuid_t gfid, struct afr_reply *replies, + unsigned char *discover_on) { + loc_t loc = {0, }; + dict_t *xattr_req = NULL; + afr_local_t *local = NULL; afr_private_t *priv = NULL; - int32_t *pending = NULL; - void *tmp_pending = NULL; /* This is required to remove 'type-punned' warnings from gcc */ - int ret = -1; - int i = 0; + local = frame->local; + priv = frame->this->private; + + xattr_req = dict_new (); + if (!xattr_req) + return -ENOMEM; - priv = this->private; + if (afr_xattr_req_prepare (frame->this, xattr_req) != 0) { + dict_destroy (xattr_req); + return -ENOMEM; + } - ret = dict_get_ptr (xattr, AFR_DATA_PENDING, &tmp_pending); + loc.inode = inode_ref (inode); + uuid_copy (loc.gfid, gfid); - if (ret != 0) - return 0; + AFR_ONLIST (discover_on, frame, afr_selfheal_discover_cbk, lookup, &loc, + xattr_req); - pending = tmp_pending; - for (i = 0; i < priv->child_count; i++) { - if (i == child_count) - continue; - if (pending[i]) - return 1; - } + afr_replies_copy (replies, local->replies, priv->child_count); + + loc_wipe (&loc); + dict_unref (xattr_req); return 0; } - int -afr_sh_has_entry_pending (dict_t *xattr, int child_count, xlator_t *this) +afr_selfheal_unlocked_discover (call_frame_t *frame, inode_t *inode, + uuid_t gfid, struct afr_reply *replies) { afr_private_t *priv = NULL; - int32_t *pending = NULL; - void *tmp_pending = NULL; /* This is required to remove 'type-punned' warnings from gcc */ - - int ret = -1; - int i = 0; - - priv = this->private; - ret = dict_get_ptr (xattr, AFR_ENTRY_PENDING, &tmp_pending); + priv = frame->this->private; - if (ret != 0) - return 0; - - pending = tmp_pending; - for (i = 0; i < priv->child_count; i++) { - if (i == child_count) - continue; - if (pending[i]) - return 1; - } - - return 0; + return afr_selfheal_unlocked_discover_on (frame, inode, gfid, replies, + priv->child_up); } - -/** - * is_matrix_zero - return true if pending matrix is all zeroes - */ - int -afr_sh_is_matrix_zero (int32_t *pending_matrix[], int child_count) +afr_selfheal_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) { - int i, j; + afr_local_t *local = NULL; + int i = 0; + + local = frame->local; + i = (long) cookie; - for (i = 0; i < child_count; i++) - for (j = 0; j < child_count; j++) - if (pending_matrix[i][j]) - return 0; - return 1; + local->replies[i].valid = 1; + local->replies[i].op_ret = op_ret; + local->replies[i].op_errno = op_errno; + + syncbarrier_wake (&local->barrier); + + return 0; } int -afr_sh_missing_entries_done (call_frame_t *frame, xlator_t *this) +afr_selfheal_locked_fill (call_frame_t *frame, xlator_t *this, + unsigned char *locked_on) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int i = 0; + int i = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int count = 0; local = frame->local; - sh = &local->self_heal; priv = this->private; -// memset (sh->child_errno, 0, sizeof (int) * priv->child_count); - memset (sh->buf, 0, sizeof (struct stat) * priv->child_count); - for (i = 0; i < priv->child_count; i++) { - if (sh->xattr[i]) - dict_unref (sh->xattr[i]); - sh->xattr[i] = NULL; - } - - if (local->govinda_gOvinda) { - gf_log (this->name, GF_LOG_WARNING, - "aborting selfheal of %s", - local->loc.path); - sh->completion_cbk (frame, this); - } else { - gf_log (this->name, GF_LOG_DEBUG, - "proceeding to metadata check on %s", - local->loc.path); - afr_self_heal_metadata (frame, this); + if (local->replies[i].valid && local->replies[i].op_ret == 0) { + locked_on[i] = 1; + count++; + } else { + locked_on[i] = 0; + } } - return 0; + return count; } int -sh_missing_entries_unlck_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno) +afr_selfheal_tryinodelk (call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, off_t off, size_t size, + unsigned char *locked_on) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int call_count = 0; + loc_t loc = {0,}; + struct gf_flock flock = {0, }; + loc.inode = inode_ref (inode); + uuid_copy (loc.gfid, inode->gfid); - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - LOCK (&frame->lock); - { - } - UNLOCK (&frame->lock); + flock.l_type = F_WRLCK; + flock.l_start = off; + flock.l_len = size; - call_count = afr_frame_return (frame); + AFR_ONALL (frame, afr_selfheal_lock_cbk, inodelk, dom, + &loc, F_SETLK, &flock, NULL); - if (call_count == 0) { - afr_sh_missing_entries_done (frame, this); - } + loc_wipe (&loc); - return 0; + return afr_selfheal_locked_fill (frame, this, locked_on); } - -static int -sh_missing_entries_finish (call_frame_t *frame, xlator_t *this) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - int i = 0; - int call_count = 0; - afr_self_heal_t *sh = NULL; +int +afr_selfheal_inodelk (call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, off_t off, size_t size, + unsigned char *locked_on) +{ + loc_t loc = {0,}; + struct gf_flock flock = {0, }; + afr_local_t *local = NULL; + int i = 0; + afr_private_t *priv = NULL; - local = frame->local; - sh = &local->self_heal; priv = this->private; + local = frame->local; + + loc.inode = inode_ref (inode); + uuid_copy (loc.gfid, inode->gfid); - call_count = local->child_count; + flock.l_type = F_WRLCK; + flock.l_start = off; + flock.l_len = size; - local->call_count = call_count; + AFR_ONALL (frame, afr_selfheal_lock_cbk, inodelk, dom, + &loc, F_SETLK, &flock, NULL); for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - gf_log (this->name, GF_LOG_DEBUG, - "unlocking %"PRId64"/%s on subvolume %s", - sh->parent_loc.inode->ino, local->loc.name, - priv->children[i]->name); - - STACK_WIND (frame, sh_missing_entries_unlck_cbk, - priv->children[i], - priv->children[i]->fops->entrylk, - this->name, - &sh->parent_loc, local->loc.name, - ENTRYLK_UNLOCK, ENTRYLK_WRLCK); - - if (!--call_count) - break; + if (local->replies[i].op_ret == -1 && + local->replies[i].op_errno == EAGAIN) { + afr_selfheal_locked_fill (frame, this, locked_on); + afr_selfheal_uninodelk (frame, this, inode, dom, off, + size, locked_on); + + AFR_SEQ (frame, afr_selfheal_lock_cbk, inodelk, dom, + &loc, F_SETLKW, &flock, NULL); + break; } } - return 0; -} + loc_wipe (&loc); -static int -sh_destroy_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int op_errno, struct stat *stbuf) -{ - STACK_DESTROY (frame->root); - return 0; + return afr_selfheal_locked_fill (frame, this, locked_on); } -static int -sh_missing_entries_newentry_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct stat *stbuf) +int +afr_selfheal_uninodelk (call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, off_t off, size_t size, + const unsigned char *locked_on) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - call_frame_t *chown_frame = NULL; - int call_count = 0; - int child_index = 0; - struct stat *buf = NULL; + loc_t loc = {0,}; + struct gf_flock flock = {0, }; - local = frame->local; - sh = &local->self_heal; - priv = this->private; + loc.inode = inode_ref (inode); + uuid_copy (loc.gfid, inode->gfid); - buf = &sh->buf[sh->source]; - child_index = (long) cookie; + flock.l_type = F_UNLCK; + flock.l_start = off; + flock.l_len = size; - if (op_ret == 0) { - chown_frame = copy_frame (frame); + AFR_ONLIST (locked_on, frame, afr_selfheal_lock_cbk, inodelk, + dom, &loc, F_SETLK, &flock, NULL); - gf_log (this->name, GF_LOG_DEBUG, - "chown %s to %d %d on subvolume %s", - local->loc.path, buf->st_uid, buf->st_gid, - priv->children[child_index]->name); + loc_wipe (&loc); - STACK_WIND (chown_frame, sh_destroy_cbk, - priv->children[child_index], - priv->children[child_index]->fops->chown, - &local->loc, - buf->st_uid, buf->st_gid); - } + return 0; +} - LOCK (&frame->lock); - { - } - UNLOCK (&frame->lock); - call_count = afr_frame_return (frame); +int +afr_selfheal_tryentrylk (call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, const char *name, unsigned char *locked_on) +{ + loc_t loc = {0,}; + + loc.inode = inode_ref (inode); + uuid_copy (loc.gfid, inode->gfid); - if (call_count == 0) { - sh_missing_entries_finish (frame, this); - } + AFR_ONALL (frame, afr_selfheal_lock_cbk, entrylk, dom, + &loc, name, ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, NULL); - return 0; + loc_wipe (&loc); + + return afr_selfheal_locked_fill (frame, this, locked_on); } -static int -sh_missing_entries_mknod (call_frame_t *frame, xlator_t *this) +int +afr_selfheal_entrylk (call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, const char *name, unsigned char *locked_on) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int i = 0; - int enoent_count = 0; - int call_count = 0; - mode_t st_mode = 0; - dev_t st_dev = 0; - + loc_t loc = {0,}; + afr_local_t *local = NULL; + int i = 0; + afr_private_t *priv = NULL; - local = frame->local; - sh = &local->self_heal; priv = this->private; + local = frame->local; - for (i = 0; i < priv->child_count; i++) - if (sh->child_errno[i] == ENOENT) - enoent_count++; - - call_count = enoent_count; - local->call_count = call_count; - - st_mode = sh->buf[sh->source].st_mode; - st_dev = sh->buf[sh->source].st_dev; + loc.inode = inode_ref (inode); + uuid_copy (loc.gfid, inode->gfid); - gf_log (this->name, GF_LOG_DEBUG, - "mknod %s mode 0%o on %d subvolumes", - local->loc.path, st_mode, enoent_count); + AFR_ONALL (frame, afr_selfheal_lock_cbk, entrylk, dom, &loc, + name, ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, NULL); for (i = 0; i < priv->child_count; i++) { - if (sh->child_errno[i] == ENOENT) { - STACK_WIND_COOKIE (frame, - sh_missing_entries_newentry_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->mknod, - &local->loc, st_mode, st_dev); - if (!--call_count) - break; + if (local->replies[i].op_ret == -1 && + local->replies[i].op_errno == EAGAIN) { + afr_selfheal_locked_fill (frame, this, locked_on); + afr_selfheal_unentrylk (frame, this, inode, dom, name, + locked_on); + + AFR_SEQ (frame, afr_selfheal_lock_cbk, entrylk, dom, + &loc, name, ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL); + break; } } - return 0; + loc_wipe (&loc); + + return afr_selfheal_locked_fill (frame, this, locked_on); } -static int -sh_missing_entries_mkdir (call_frame_t *frame, xlator_t *this) +int +afr_selfheal_unentrylk (call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, const char *name, unsigned char *locked_on) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int i = 0; - int enoent_count = 0; - int call_count = 0; - mode_t st_mode = 0; - - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - for (i = 0; i < priv->child_count; i++) - if (sh->child_errno[i] == ENOENT) - enoent_count++; - - call_count = enoent_count; - local->call_count = call_count; + loc_t loc = {0,}; - st_mode = sh->buf[sh->source].st_mode; + loc.inode = inode_ref (inode); + uuid_copy (loc.gfid, inode->gfid); - gf_log (this->name, GF_LOG_DEBUG, - "mkdir %s mode 0%o on %d subvolumes", - local->loc.path, st_mode, enoent_count); + AFR_ONLIST (locked_on, frame, afr_selfheal_lock_cbk, entrylk, + dom, &loc, name, ENTRYLK_UNLOCK, ENTRYLK_WRLCK, NULL); - for (i = 0; i < priv->child_count; i++) { - if (sh->child_errno[i] == ENOENT) { - STACK_WIND_COOKIE (frame, - sh_missing_entries_newentry_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->mkdir, - &local->loc, st_mode); - if (!--call_count) - break; - } - } + loc_wipe (&loc); return 0; } -static int -sh_missing_entries_symlink (call_frame_t *frame, xlator_t *this, - const char *link) +gf_boolean_t +afr_is_pending_set (xlator_t *this, dict_t *xdata, int type) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int i = 0; - int enoent_count = 0; - int call_count = 0; - + int idx = -1; + afr_private_t *priv = NULL; + void *pending_raw = NULL; + int *pending_int = NULL; + int i = 0; - local = frame->local; - sh = &local->self_heal; priv = this->private; + idx = afr_index_for_transaction_type (type); - for (i = 0; i < priv->child_count; i++) - if (sh->child_errno[i] == ENOENT) - enoent_count++; - - call_count = enoent_count; - local->call_count = call_count; + if (dict_get_ptr (xdata, AFR_DIRTY, &pending_raw) == 0) { + if (pending_raw) { + pending_int = pending_raw; - gf_log (this->name, GF_LOG_DEBUG, - "symlink %s -> %s on %d subvolumes", - local->loc.path, link, enoent_count); + if (ntoh32 (pending_int[idx])) + return _gf_true; + } + } for (i = 0; i < priv->child_count; i++) { - if (sh->child_errno[i] == ENOENT) { - STACK_WIND_COOKIE (frame, - sh_missing_entries_newentry_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->symlink, - link, &local->loc); - if (!--call_count) - break; - } + if (dict_get_ptr (xdata, priv->pending_key[i], + &pending_raw)) + continue; + if (!pending_raw) + continue; + pending_int = pending_raw; + + if (ntoh32 (pending_int[idx])) + return _gf_true; } - return 0; + return _gf_false; } -static int -sh_missing_entries_readlink_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - const char *link) +gf_boolean_t +afr_is_data_set (xlator_t *this, dict_t *xdata) { - if (op_ret > 0) - sh_missing_entries_symlink (frame, this, link); - else - sh_missing_entries_finish (frame, this); - - return 0; + return afr_is_pending_set (this, xdata, AFR_DATA_TRANSACTION); } - -static int -sh_missing_entries_readlink (call_frame_t *frame, xlator_t *this) +gf_boolean_t +afr_is_metadata_set (xlator_t *this, dict_t *xdata) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - STACK_WIND (frame, sh_missing_entries_readlink_cbk, - priv->children[sh->source], - priv->children[sh->source]->fops->readlink, - &local->loc, 4096); - - return 0; + return afr_is_pending_set (this, xdata, AFR_METADATA_TRANSACTION); } - -static int -sh_missing_entries_create (call_frame_t *frame, xlator_t *this) +gf_boolean_t +afr_is_entry_set (xlator_t *this, dict_t *xdata) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int type = 0; - int i = 0; - afr_private_t *priv = NULL; - int enoent_count = 0; - int govinda_gOvinda = 0; + return afr_is_pending_set (this, xdata, AFR_ENTRY_TRANSACTION); +} - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - for (i = 0; i < priv->child_count; i++) { - if (sh->child_errno[i]) { - if (sh->child_errno[i] == ENOENT) - enoent_count++; - } else { - if (type) { - if (type != (sh->buf[i].st_mode & S_IFMT)) - govinda_gOvinda = 1; - } else { - sh->source = i; - type = sh->buf[i].st_mode & S_IFMT; - } - } - } +void +afr_inode_link (inode_t *inode, struct iatt *iatt) +{ + inode_t *linked_inode = NULL; - if (govinda_gOvinda) { - gf_log (this->name, GF_LOG_ERROR, - "conflicing filetypes exist for path %s. returning.", - local->loc.path); + linked_inode = inode_link (inode, NULL, NULL, iatt); - local->govinda_gOvinda = 1; - sh_missing_entries_finish (frame, this); - return 0; - } + uuid_copy (inode->gfid, iatt->ia_gfid); + inode->ia_type = iatt->ia_type; - if (!type) { - gf_log (this->name, GF_LOG_ERROR, - "no source found for %s. all nodes down?. returning.", - local->loc.path); - /* subvolumes down and/or file does not exist */ - sh_missing_entries_finish (frame, this); - return 0; + if (linked_inode) { + inode_lookup (linked_inode); + inode_unref (linked_inode); } +} - if (enoent_count == 0) { - gf_log (this->name, GF_LOG_ERROR, - "no missing files - %s. proceeding to metadata check", - local->loc.path); - /* proceed to next step - metadata self-heal */ - sh_missing_entries_finish (frame, this); - return 0; - } - switch (type) { - case S_IFSOCK: - case S_IFREG: - case S_IFBLK: - case S_IFCHR: - case S_IFIFO: - sh_missing_entries_mknod (frame, this); - break; - case S_IFLNK: - sh_missing_entries_readlink (frame, this); - break; - case S_IFDIR: - sh_missing_entries_mkdir (frame, this); - break; - default: - gf_log (this->name, GF_LOG_ERROR, - "unknown file type: 0%o", type); - local->govinda_gOvinda = 1; - sh_missing_entries_finish (frame, this); - } +/* + * This function inspects the looked up replies (in an unlocked manner) + * and decides whether a locked verification and possible healing is + * required or not. It updates the three booleans for each type + * of healing. If the boolean flag gets set to FALSE, then we are sure + * no healing is required. If the boolean flag gets set to TRUE then + * we have to proceed with locked reinspection. + */ - return 0; -} +int +afr_selfheal_unlocked_inspect (call_frame_t *frame, xlator_t *this, + inode_t *inode, uuid_t gfid, + gf_boolean_t *data_selfheal, + gf_boolean_t *metadata_selfheal, + gf_boolean_t *entry_selfheal) +{ + afr_private_t *priv = NULL; + int i = 0; + int valid_cnt = 0; + struct iatt first = {0, }; + struct afr_reply *replies = NULL; + int ret = -1; + priv = this->private; -static int -sh_missing_entries_lookup_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct stat *buf, dict_t *xattr) -{ - int child_index = 0; - afr_local_t *local = NULL; - int call_count = 0; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; + replies = alloca0 (sizeof (*replies) * priv->child_count); + ret = afr_selfheal_unlocked_discover (frame, inode, gfid, replies); + if (ret) + return ret; - local = frame->local; - sh = &local->self_heal; - priv = this->private; + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) + continue; + if (replies[i].op_ret == -1) + continue; - child_index = (long) cookie; + if (afr_is_data_set (this, replies[i].xdata)) + *data_selfheal = _gf_true; - LOCK (&frame->lock); - { - if (op_ret == 0) { - gf_log (this->name, GF_LOG_DEBUG, - "path %s on subvolume %s is of mode 0%o", - local->loc.path, - priv->children[child_index]->name, - buf->st_mode); + if (afr_is_metadata_set (this, replies[i].xdata)) + *metadata_selfheal = _gf_true; - local->self_heal.buf[child_index] = *buf; - } else { - gf_log (this->name, GF_LOG_WARNING, - "path %s on subvolume %s => -1 (%s)", - local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); + if (afr_is_entry_set (this, replies[i].xdata)) + *entry_selfheal = _gf_true; - local->self_heal.child_errno[child_index] = op_errno; + valid_cnt ++; + if (valid_cnt == 1) { + first = replies[i].poststat; + continue; } - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); + if (!IA_EQUAL (first, replies[i].poststat, type)) { + gf_log (this->name, GF_LOG_ERROR, + "TYPE mismatch %d vs %d on %s for gfid:%s", + (int) first.ia_type, + (int) replies[i].poststat.ia_type, + priv->children[i]->name, + uuid_utoa (replies[i].poststat.ia_gfid)); + return -EIO; + } - if (call_count == 0) { - sh_missing_entries_create (frame, this); - } + if (!IA_EQUAL (first, replies[i].poststat, uid)) { + gf_log (this->name, GF_LOG_DEBUG, + "UID mismatch %d vs %d on %s for gfid:%s", + (int) first.ia_uid, + (int) replies[i].poststat.ia_uid, + priv->children[i]->name, + uuid_utoa (replies[i].poststat.ia_gfid)); - return 0; -} + *metadata_selfheal = _gf_true; + } + if (!IA_EQUAL (first, replies[i].poststat, gid)) { + gf_log (this->name, GF_LOG_DEBUG, + "GID mismatch %d vs %d on %s for gfid:%s", + (int) first.ia_uid, + (int) replies[i].poststat.ia_uid, + priv->children[i]->name, + uuid_utoa (replies[i].poststat.ia_gfid)); -static int -sh_missing_entries_lookup (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - int i = 0; - int call_count = 0; - afr_private_t *priv = NULL; - dict_t *xattr_req = NULL; - int ret = -1; + *metadata_selfheal = _gf_true; + } - local = frame->local; - call_count = local->child_count; - priv = this->private; + if (!IA_EQUAL (first, replies[i].poststat, prot)) { + gf_log (this->name, GF_LOG_DEBUG, + "MODE mismatch %d vs %d on %s for gfid:%s", + (int) st_mode_from_ia (first.ia_prot, 0), + (int) st_mode_from_ia (replies[i].poststat.ia_prot, 0), + priv->children[i]->name, + uuid_utoa (replies[i].poststat.ia_gfid)); - local->call_count = call_count; - - xattr_req = dict_new(); - - if (xattr_req) - ret = dict_set_uint64 (xattr_req, AFR_ENTRY_PENDING, - priv->child_count * sizeof(int32_t)); + *metadata_selfheal = _gf_true; + } - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { + if (IA_ISREG(first.ia_type) && + !IA_EQUAL (first, replies[i].poststat, size)) { gf_log (this->name, GF_LOG_DEBUG, - "looking up %s on subvolume %s", - local->loc.path, priv->children[i]->name); - - STACK_WIND_COOKIE (frame, - sh_missing_entries_lookup_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->lookup, - &local->loc, xattr_req); - - if (!--call_count) - break; + "SIZE mismatch %lld vs %lld on %s for gfid:%s", + (long long) first.ia_size, + (long long) replies[i].poststat.ia_size, + priv->children[i]->name, + uuid_utoa (replies[i].poststat.ia_gfid)); + + *data_selfheal = _gf_true; } } - - if (xattr_req) - dict_unref (xattr_req); + + if (valid_cnt > 0) + afr_inode_link (inode, &first); + + if (valid_cnt < 2) + return -ENOTCONN; return 0; } -static int -sh_missing_entries_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno) +inode_t * +afr_inode_find (xlator_t *this, uuid_t gfid) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int call_count = 0; - int child_index = (long) cookie; - + inode_table_t *table = NULL; + inode_t *inode = NULL; - local = frame->local; - sh = &local->self_heal; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - sh->op_failed = 1; - - gf_log (this->name, - (op_errno == EAGAIN ? GF_LOG_DEBUG : GF_LOG_ERROR), - "locking inode of %s on child %d failed: %s", - local->loc.path, child_index, - strerror (op_errno)); - } else { - gf_log (this->name, GF_LOG_DEBUG, - "inode of %s on child %d locked", - local->loc.path, child_index); - } - } - UNLOCK (&frame->lock); + table = this->itable; + if (!table) + return NULL; - call_count = afr_frame_return (frame); + inode = inode_find (table, gfid); + if (inode) + return inode; - if (call_count == 0) { - if (sh->op_failed == 1) { - sh_missing_entries_finish (frame, this); - return 0; - } + inode = inode_new (table); + if (!inode) + return NULL; - sh_missing_entries_lookup (frame, this); - } + uuid_copy (inode->gfid, gfid); - return 0; + return inode; } -static int -afr_self_heal_missing_entries (call_frame_t *frame, xlator_t *this) +call_frame_t * +afr_frame_create (xlator_t *this) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int i = 0; - int call_count = 0; + call_frame_t *frame = NULL; + afr_local_t *local = NULL; + int op_errno = 0; + pid_t pid = -1; + frame = create_frame (this, this->ctx->pool); + if (!frame) + return NULL; - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - gf_log (this->name, GF_LOG_DEBUG, - "attempting to recreate missing entries for path=%s", - local->loc.path); - - afr_build_parent_loc (&sh->parent_loc, &local->loc); + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) { + STACK_DESTROY (frame->root); + return NULL; + } - call_count = local->child_count; + syncopctx_setfspid (&pid); - local->call_count = call_count; + frame->root->pid = pid; - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND (frame, sh_missing_entries_lk_cbk, - priv->children[i], - priv->children[i]->fops->entrylk, - this->name, - &sh->parent_loc, local->loc.name, - ENTRYLK_LOCK_NB, ENTRYLK_WRLCK); - if (!--call_count) - break; - } - } + afr_set_lk_owner (frame, this, frame->root); - return 0; + return frame; } +/* + * This is the entry point for healing a given GFID + */ + int -afr_self_heal (call_frame_t *frame, xlator_t *this, - int (*completion_cbk) (call_frame_t *, xlator_t *)) +afr_selfheal (xlator_t *this, uuid_t gfid) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int i = 0; - + inode_t *inode = NULL; + call_frame_t *frame = NULL; + int ret = -1; + gf_boolean_t data_selfheal = _gf_false; + gf_boolean_t metadata_selfheal = _gf_false; + gf_boolean_t entry_selfheal = _gf_false; - local = frame->local; - sh = &local->self_heal; - priv = this->private; + inode = afr_inode_find (this, gfid); + if (!inode) + goto out; - gf_log (this->name, GF_LOG_DEBUG, - "performing self heal on %s (metadata=%d data=%d entry=%d)", - local->loc.path, - local->need_metadata_self_heal, - local->need_data_self_heal, - local->need_entry_self_heal); + frame = afr_frame_create (this); + if (!frame) + goto out; - sh->completion_cbk = completion_cbk; + ret = afr_selfheal_unlocked_inspect (frame, this, inode, gfid, + &data_selfheal, + &metadata_selfheal, + &entry_selfheal); + if (ret) + goto out; - sh->buf = CALLOC (priv->child_count, sizeof (struct stat)); - sh->child_errno = CALLOC (priv->child_count, sizeof (int)); - sh->success = CALLOC (priv->child_count, sizeof (int)); - sh->xattr = CALLOC (priv->child_count, sizeof (dict_t *)); - sh->sources = CALLOC (sizeof (*sh->sources), priv->child_count); + if (data_selfheal) + afr_selfheal_data (frame, this, inode); - sh->pending_matrix = CALLOC (sizeof (int32_t *), priv->child_count); - for (i = 0; i < priv->child_count; i++) { - sh->pending_matrix[i] = CALLOC (sizeof (int32_t), - priv->child_count); - } + if (metadata_selfheal) + afr_selfheal_metadata (frame, this, inode); - sh->delta_matrix = CALLOC (sizeof (int32_t *), priv->child_count); - for (i = 0; i < priv->child_count; i++) { - sh->delta_matrix[i] = CALLOC (sizeof (int32_t), - priv->child_count); - } + if (entry_selfheal) + afr_selfheal_entry (frame, this, inode); - if (local->success_count && local->enoent_count) { - afr_self_heal_missing_entries (frame, this); - } else { - gf_log (this->name, GF_LOG_DEBUG, - "proceeding to metadata check on %s", - local->loc.path); - afr_sh_missing_entries_done (frame, this); - } + inode_forget (inode, 1); +out: + if (inode) + inode_unref (inode); + if (frame) + AFR_STACK_DESTROY (frame); - return 0; + return ret; } |
