summaryrefslogtreecommitdiffstats
path: root/xlators/cluster/afr/src/afr-self-heal-common.c
diff options
context:
space:
mode:
authorPranith Kumar K <pranithk@gluster.com>2011-07-18 12:30:17 +0530
committerVijay Bellur <vijay@gluster.com>2011-10-17 06:13:56 -0700
commitab2b2fefdcb10aa29749190b9e908b3bc81d86e5 (patch)
treed028703a64e86a9be31ec81f4c5ef4e7465817ee /xlators/cluster/afr/src/afr-self-heal-common.c
parent57abdb20ade542e9f57461ec953dad8ca93b23c7 (diff)
cluster/afr: Handle lookups when self-heal is off
Change-Id: Ibc23fef417bcf613850e03dc4dadcc88f89e2b6f BUG: 2586 Reviewed-on: http://review.gluster.com/59 Tested-by: Gluster Build System <jenkins@build.gluster.com> Reviewed-by: Vijay Bellur <vijay@gluster.com>
Diffstat (limited to 'xlators/cluster/afr/src/afr-self-heal-common.c')
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-common.c441
1 files changed, 287 insertions, 154 deletions
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c
index 4f50f35775b..b29deb8bca2 100644
--- a/xlators/cluster/afr/src/afr-self-heal-common.c
+++ b/xlators/cluster/afr/src/afr-self-heal-common.c
@@ -70,23 +70,6 @@ afr_sh_source_count (int sources[], int child_count)
return nsource;
}
-
-int
-afr_sh_supress_errenous_children (int sources[], int child_errno[],
- int child_count)
-{
- int i = 0;
-
- for (i = 0; i < child_count; i++) {
- if (child_errno[i] && sources[i]) {
- sources[i] = 0;
- }
- }
-
- return 0;
-}
-
-
void
afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this)
{
@@ -113,11 +96,46 @@ afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this)
GF_FREE (buf);
}
+void
+afr_init_pending_matrix (int32_t **pending_matrix, size_t child_count)
+{
+ int i = 0;
+ int j = 0;
+
+ GF_ASSERT (pending_matrix);
+
+ for (i = 0; i < child_count; i++) {
+ for (j = 0; j < child_count; j++) {
+ pending_matrix[i][j] = 0;
+ }
+ }
+}
void
-afr_sh_build_pending_matrix (afr_private_t *priv,
- int32_t *pending_matrix[], dict_t *xattr[],
- int child_count, afr_transaction_type type)
+afr_mark_ignorant_subvols_as_pending (int32_t **pending_matrix,
+ unsigned char *ignorant_subvols,
+ size_t child_count)
+{
+ int i = 0;
+ int j = 0;
+
+ GF_ASSERT (pending_matrix);
+ GF_ASSERT (ignorant_subvols);
+
+ for (i = 0; i < child_count; i++) {
+ if (ignorant_subvols[i]) {
+ for (j = 0; j < child_count; j++) {
+ if (!ignorant_subvols[j])
+ pending_matrix[j][i] += 1;
+ }
+ }
+ }
+}
+
+int
+afr_build_pending_matrix (char **pending_key, int32_t **pending_matrix,
+ dict_t *xattr[], afr_transaction_type type,
+ size_t child_count)
{
/* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. */
int32_t pending[3] = {0,};
@@ -130,19 +148,16 @@ afr_sh_build_pending_matrix (afr_private_t *priv,
ignorant_subvols = GF_CALLOC (sizeof (*ignorant_subvols), child_count,
gf_afr_mt_char);
+ if (NULL == ignorant_subvols)
+ goto out;
- /* start clean */
- for (i = 0; i < child_count; i++) {
- for (j = 0; j < child_count; j++) {
- pending_matrix[i][j] = 0;
- }
- }
+ afr_init_pending_matrix (pending_matrix, child_count);
for (i = 0; i < child_count; i++) {
pending_raw = NULL;
for (j = 0; j < child_count; j++) {
- ret = dict_get_ptr (xattr[i], priv->pending_key[j],
+ ret = dict_get_ptr (xattr[i], pending_key[j],
&pending_raw);
if (ret != 0) {
@@ -163,21 +178,12 @@ afr_sh_build_pending_matrix (afr_private_t *priv,
}
}
- /*
- * Make all non-ignorant subvols point towards the ignorant
- * subvolumes.
- */
-
- for (i = 0; i < child_count; i++) {
- if (ignorant_subvols[i]) {
- for (j = 0; j < child_count; j++) {
- if (!ignorant_subvols[j])
- pending_matrix[j][i] += 1;
- }
- }
- }
-
+ afr_mark_ignorant_subvols_as_pending (pending_matrix,
+ ignorant_subvols,
+ child_count);
GF_FREE (ignorant_subvols);
+out:
+ return ret;
}
@@ -208,7 +214,8 @@ afr_sh_build_pending_matrix (afr_private_t *priv,
typedef enum {
AFR_NODE_INNOCENT,
AFR_NODE_FOOL,
- AFR_NODE_WISE
+ AFR_NODE_WISE,
+ AFR_NODE_INVALID = -1,
} afr_node_type;
typedef struct {
@@ -353,182 +360,276 @@ afr_sh_mark_wisest_as_sources (int sources[],
return nsources;
}
-
-static int
-afr_sh_mark_if_size_differs (afr_self_heal_t *sh, int child_count)
+static void
+afr_compute_witness_of_fools (int32_t *witnesses, int32_t **pending_matrix,
+ afr_node_character *characters,
+ int32_t child_count)
{
- int32_t ** pending_matrix = NULL;
- int i = 0;
- int j = 0;
- int size_differs = 0;
+ int i = 0;
+ int j = 0;
+ int witness = 0;
- pending_matrix = sh->pending_matrix;
+ GF_ASSERT (witnesses);
+ GF_ASSERT (pending_matrix);
+ GF_ASSERT (characters);
+ GF_ASSERT (child_count > 0);
for (i = 0; i < child_count; i++) {
+ if (characters[i].type != AFR_NODE_FOOL)
+ continue;
+
+ witness = 0;
for (j = 0; j < child_count; j++) {
- if (!sh->buf)
- break;
+ if (i == j)
+ continue;
+ witness += pending_matrix[i][j];
+ }
+ witnesses[i] = witness;
+ }
+}
- if (SIZE_DIFFERS (&sh->buf[i], &sh->buf[j])
- && (pending_matrix[i][j] == 0)
- && (pending_matrix[j][i] == 0)) {
+static int32_t
+afr_find_biggest_witness_among_fools (int32_t *witnesses,
+ afr_node_character *characters,
+ int32_t child_count)
+{
+ int i = 0;
+ int biggest_witness = -1;
- pending_matrix[i][j] = 1;
- pending_matrix[j][i] = 1;
+ GF_ASSERT (witnesses);
+ GF_ASSERT (characters);
+ GF_ASSERT (child_count > 0);
- size_differs = 1;
- }
- }
- }
+ for (i = 0; i < child_count; i++) {
+ if (characters[i].type != AFR_NODE_FOOL)
+ continue;
- return size_differs;
+ if (biggest_witness < witnesses[i])
+ biggest_witness = witnesses[i];
+ }
+ return biggest_witness;
}
-
-static int
-afr_sh_mark_biggest_fool_as_source (afr_self_heal_t *sh,
+int
+afr_mark_fool_as_source_by_witness (int32_t *sources, int32_t *witnesses,
afr_node_character *characters,
- int child_count)
+ int32_t child_count, int32_t witness)
{
- int i = 0;
- int biggest = 0;
+ int i = 0;
+ int nsources = 0;
- for (i = 0; i < child_count; i++) {
- if (characters[i].type == AFR_NODE_FOOL) {
- biggest = i;
- break;
- }
- }
+ GF_ASSERT (sources);
+ GF_ASSERT (witnesses);
+ GF_ASSERT (characters);
+ GF_ASSERT (child_count > 0);
for (i = 0; i < child_count; i++) {
if (characters[i].type != AFR_NODE_FOOL)
continue;
- if (!sh->buf)
- break;
-
- if (SIZE_GREATER (&sh->buf[i], &sh->buf[biggest])) {
- biggest = i;
+ if (witness == witnesses[i]) {
+ sources[i] = 1;
+ nsources++;
}
}
+ return nsources;
+}
+
+static int
+afr_mark_biggest_of_fools_as_source (int32_t *sources, int32_t **pending_matrix,
+ afr_node_character *characters,
+ int child_count)
+{
+ int32_t biggest_witness = 0;
+ int nsources = 0;
+ int32_t *witnesses = NULL;
- sh->sources[biggest] = 1;
+ GF_ASSERT (child_count > 0);
- return 1;
-}
+ witnesses = GF_CALLOC (child_count, sizeof (*witnesses),
+ gf_afr_mt_int32_t);
+ if (NULL == witnesses) {
+ nsources = -1;
+ goto out;
+ }
+ afr_compute_witness_of_fools (witnesses, pending_matrix, characters,
+ child_count);
+ biggest_witness = afr_find_biggest_witness_among_fools (witnesses,
+ characters,
+ child_count);
+ nsources = afr_mark_fool_as_source_by_witness (sources, witnesses,
+ characters, child_count,
+ biggest_witness);
+out:
+ if (witnesses)
+ GF_FREE (witnesses);
+ return nsources;
+}
-static int
-afr_sh_mark_biggest_as_source (afr_self_heal_t *sh, int child_count)
+int
+afr_mark_child_as_source_by_uid (int32_t *sources, struct iatt *bufs,
+ int32_t *valid_children, int child_count,
+ uint32_t uid)
{
- int biggest = 0;
- int i = 0;
+ int i = 0;
+ int nsources = 0;
+ int child = 0;
+
+ GF_ASSERT (bufs);
+ GF_ASSERT (valid_children);
+ GF_ASSERT (sources);
+ GF_ASSERT (child_count > 0);
for (i = 0; i < child_count; i++) {
- if (!sh->buf)
- break;
+ if (-1 == valid_children[i])
+ continue;
- if (SIZE_GREATER (&sh->buf[i], &sh->buf[biggest])) {
- biggest = i;
+ child = valid_children[i];
+ if (uid == bufs[child].ia_uid) {
+ sources[child] = 1;
+ nsources++;
}
}
+ return nsources;
+}
- sh->sources[biggest] = 1;
+int
+afr_get_child_with_lowest_uid (struct iatt *bufs, int32_t *valid_children,
+ int child_count)
+{
+ int i = 0;
+ int smallest = -1;
+ int child = 0;
- return 1;
-}
+ GF_ASSERT (bufs);
+ GF_ASSERT (valid_children);
+ GF_ASSERT (child_count > 0);
+ for (i = 0; i < child_count; i++) {
+ if (-1 == valid_children[i])
+ continue;
+ child = valid_children[i];
+ if ((smallest == -1) ||
+ (bufs[child].ia_uid < bufs[smallest].ia_uid)) {
+ smallest = child;
+ }
+ }
+ return smallest;
+}
static int
-afr_sh_mark_loweia_uid_as_source (afr_self_heal_t *sh, int child_count)
+afr_sh_mark_lowest_uid_as_source (struct iatt *bufs, int32_t *valid_children,
+ int child_count, int32_t *sources)
{
- uid_t smallest = 0;
- int i = 0;
+ int nsources = 0;
+ int smallest = 0;
- for (i = 0; i < child_count; i++) {
- if (!sh->buf)
- break;
-
- if (sh->buf[i].ia_uid < sh->buf[smallest].ia_uid) {
- smallest = i;
- }
+ smallest = afr_get_child_with_lowest_uid (bufs, valid_children,
+ child_count);
+ if (smallest < 0) {
+ nsources = -1;
+ goto out;
}
+ nsources = afr_mark_child_as_source_by_uid (sources, bufs,
+ valid_children, child_count,
+ bufs[smallest].ia_uid);
+out:
+ return nsources;
+}
- sh->sources[smallest] = 1;
+char *
+afr_get_character_str (afr_node_type type)
+{
+ char *character = NULL;
- return 1;
+ switch (type) {
+ case AFR_NODE_INNOCENT:
+ character = "innocent";
+ break;
+ case AFR_NODE_FOOL:
+ character = "fool";
+ break;
+ case AFR_NODE_WISE:
+ character = "wise";
+ break;
+ default:
+ character = "invalid";
+ break;
+ }
+ return character;
}
+afr_node_type
+afr_find_child_character_type (int32_t *pending_row, int32_t child,
+ int32_t child_count, const char *xlator_name)
+{
+ afr_node_type type = AFR_NODE_INVALID;
+
+ GF_ASSERT (pending_row);
+ GF_ASSERT (child_count > 0);
+ GF_ASSERT ((child >= 0) && (child < child_count));
+
+ if (afr_sh_is_innocent (pending_row, child_count))
+ type = AFR_NODE_INNOCENT;
+ else if (afr_sh_is_fool (pending_row, child, child_count))
+ type = AFR_NODE_FOOL;
+ else if (afr_sh_is_wise (pending_row, child, child_count))
+ type = AFR_NODE_WISE;
+ else
+ GF_ASSERT (0);
+
+ gf_log (xlator_name, GF_LOG_DEBUG, "child %d character %s",
+ child, afr_get_character_str (type));
+ return type;
+}
int
-afr_sh_mark_sources (afr_self_heal_t *sh, int child_count,
- afr_self_heal_type type)
+afr_mark_sources (int32_t *sources, int32_t **pending_matrix, struct iatt *bufs,
+ int32_t child_count, afr_self_heal_type type,
+ int32_t *valid_children, const char *xlator_name)
{
/* stores the 'characters' (innocent, fool, wise) of the nodes */
- afr_node_character *characters = NULL;
+ afr_node_character *characters = NULL;
int i = 0;
- int32_t ** pending_matrix = NULL;
- int * sources = NULL;
- int size_differs = 0;
- int nsources = 0;
+ int nsources = -1;
xlator_t *this = NULL;
- afr_private_t *priv = NULL;
characters = GF_CALLOC (sizeof (afr_node_character),
- child_count,
- gf_afr_mt_afr_node_character) ;
+ child_count, gf_afr_mt_afr_node_character);
if (!characters)
goto out;
this = THIS;
- priv = this->private;
- pending_matrix = sh->pending_matrix;
- sources = sh->sources;
/* start clean */
for (i = 0; i < child_count; i++) {
sources[i] = 0;
}
+ nsources = 0;
for (i = 0; i < child_count; i++) {
- if (afr_sh_is_innocent (pending_matrix[i], child_count)) {
- characters[i].type = AFR_NODE_INNOCENT;
-
- } else if (afr_sh_is_fool (pending_matrix[i], i, child_count)) {
- characters[i].type = AFR_NODE_FOOL;
-
- } else if (afr_sh_is_wise (pending_matrix[i], i, child_count)) {
- characters[i].type = AFR_NODE_WISE;
-
- } else {
- gf_log (this->name, GF_LOG_CRITICAL,
- "Could not determine the state of subvolume %s!"
- " (This message should never appear."
- " Please file a bug report to "
- "<gluster-devel@nongnu.org>.)",
- priv->children[i]->name);
- }
- }
-
- if (type == AFR_SELF_HEAL_DATA) {
- size_differs = afr_sh_mark_if_size_differs (sh, child_count);
+ characters[i].type =
+ afr_find_child_character_type (pending_matrix[i], i,
+ child_count,
+ xlator_name);
+ if (AFR_NODE_INVALID == characters[i].type)
+ gf_log (xlator_name, GF_LOG_WARNING,
+ "child %d had invalid xattrs", i);
}
if ((type == AFR_SELF_HEAL_METADATA)
&& afr_sh_all_nodes_innocent (characters, child_count)) {
- nsources = afr_sh_mark_loweia_uid_as_source (sh, child_count);
+ nsources = afr_sh_mark_lowest_uid_as_source (bufs,
+ valid_children,
+ child_count,
+ sources);
goto out;
}
- if (afr_sh_all_nodes_innocent (characters, child_count)) {
- if (size_differs) {
- nsources = afr_sh_mark_biggest_as_source (sh,
- child_count);
- }
-
- } else if (afr_sh_wise_nodes_exist (characters, child_count)) {
+ if (afr_sh_wise_nodes_exist (characters, child_count)) {
afr_sh_compute_wisdom (pending_matrix, characters, child_count);
if (afr_sh_wise_nodes_conflict (characters, child_count)) {
@@ -536,7 +637,6 @@ afr_sh_mark_sources (afr_self_heal_t *sh, int child_count,
gf_log (this->name, GF_LOG_INFO,
"split-brain possible, no source detected");
nsources = -1;
- goto out;
} else {
nsources = afr_sh_mark_wisest_as_sources (sources,
@@ -544,18 +644,26 @@ afr_sh_mark_sources (afr_self_heal_t *sh, int child_count,
child_count);
}
} else {
- nsources = afr_sh_mark_biggest_fool_as_source (sh, characters,
- child_count);
+ nsources = afr_mark_biggest_of_fools_as_source (sources,
+ pending_matrix,
+ characters,
+ child_count);
}
out:
+ if (nsources == 0) {
+ for (i = 0; i < child_count; i++) {
+ if (valid_children[i] != -1)
+ sources[valid_children[i]] = 1;
+ }
+ }
if (characters)
GF_FREE (characters);
+ gf_log (this->name, GF_LOG_DEBUG, "Number of sources: %d", nsources);
return nsources;
}
-
void
afr_sh_pending_to_delta (afr_private_t *priv, dict_t **xattr,
int32_t *delta_matrix[], int success[],
@@ -643,7 +751,7 @@ afr_sh_delta_to_xattr (afr_private_t *priv,
int
-afr_sh_has_metadata_pending (dict_t *xattr, int child_count, xlator_t *this)
+afr_sh_has_metadata_pending (dict_t *xattr, xlator_t *this)
{
/* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. */
int32_t pending[3] = {0,};
@@ -674,7 +782,7 @@ afr_sh_has_metadata_pending (dict_t *xattr, int child_count, xlator_t *this)
int
-afr_sh_has_data_pending (dict_t *xattr, int child_count, xlator_t *this)
+afr_sh_has_data_pending (dict_t *xattr, xlator_t *this)
{
/* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. */
int32_t pending[3] = {0,};
@@ -705,7 +813,7 @@ afr_sh_has_data_pending (dict_t *xattr, int child_count, xlator_t *this)
int
-afr_sh_has_entry_pending (dict_t *xattr, int child_count, xlator_t *this)
+afr_sh_has_entry_pending (dict_t *xattr, xlator_t *this)
{
/* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. */
int32_t pending[3] = {0,};
@@ -1655,6 +1763,9 @@ afr_self_heal (call_frame_t *frame, xlator_t *this)
priv->child_count,
gf_afr_mt_int32_t);
}
+ sh->child_success = GF_CALLOC (sizeof (*sh->child_success),
+ priv->child_count, gf_afr_mt_int32_t);
+
FRAME_SU_DO (sh_frame, afr_local_t);
if (local->success_count && local->enoent_count) {
@@ -1688,3 +1799,25 @@ afr_self_heal_type_str_get (afr_self_heal_t *self_heal_p, char *str,
snprintf(str + strlen(str), size - strlen(str), " entry");
}
}
+
+afr_self_heal_type
+afr_self_heal_type_for_transaction (afr_transaction_type type)
+{
+ afr_self_heal_type sh_type = AFR_SELF_HEAL_INVALID;
+
+ switch (type) {
+ case AFR_DATA_TRANSACTION:
+ sh_type = AFR_SELF_HEAL_DATA;
+ break;
+ case AFR_METADATA_TRANSACTION:
+ sh_type = AFR_SELF_HEAL_METADATA;
+ break;
+ case AFR_ENTRY_TRANSACTION:
+ sh_type = AFR_SELF_HEAL_ENTRY;
+ break;
+ case AFR_ENTRY_RENAME_TRANSACTION:
+ GF_ASSERT (0);
+ break;
+ }
+ return sh_type;
+}