diff options
| author | Vikas Gorur <vikas@gluster.com> | 2009-03-24 05:44:59 -0700 | 
|---|---|---|
| committer | Anand V. Avati <avati@amp.gluster.com> | 2009-03-24 21:53:19 +0530 | 
| commit | 6e8017479fd9997ae47e7c8cbb74247d7e2b4fd0 (patch) | |
| tree | dbd48b0cd4c089db017f1f09ae04abb6a8f53d89 | |
| parent | 8f590ad883c170827cc06dcbcb17e5aaa1788996 (diff) | |
Made self heal logic more precise.
Discard earlier patch sent for the same error. This patch fixes
it more comprehensively.
This solves the spurious split-brain seen by many users.
Signed-off-by: Anand V. Avati <avati@amp.gluster.com>
| -rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-common.c | 255 | 
1 files changed, 233 insertions, 22 deletions
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c index aea31a748f5..04530ccb0a0 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.c +++ b/xlators/cluster/afr/src/afr-self-heal-common.c @@ -211,43 +211,254 @@ afr_sh_build_pending_matrix (int32_t *pending_matrix[], dict_t *xattr[],  /**   * mark_sources: Mark all 'source' nodes and return number of source   * nodes found + * + * A node (a row in the pending matrix) belongs to one of + * three categories: + * + * M is the pending matrix. + * + * 'innocent' - M[i] is all zeroes + * 'fool'     - M[i] has i'th element = 1 (self-reference) + * 'wise'     - M[i] has i'th element = 0, others are 1 or 0. + * + * All 'innocent' nodes are sinks. If all nodes are innocent, no self-heal is + * needed. + * + * A 'wise' node can be a source. If two 'wise' nodes conflict, it is  + * a split-brain. If one wise node refers to the other but the other doesn't + * refer back, the referrer is a source. + * + * All fools are sinks, unless there are no 'wise' nodes. In that case, + * one of the fools is made a source.   */ +typedef enum { +        AFR_NODE_INNOCENT, +        AFR_NODE_FOOL, +        AFR_NODE_WISE +} afr_node_type; + +typedef struct { +        afr_node_type type; +        int           wisdom; +} afr_node_character; + + +static int +afr_sh_is_innocent (int32_t *array, int child_count) +{ +        int i   = 0; +        int ret = 1;   /* innocent until proven guilty */ + +        for (i = 0; i < child_count; i++) { +                if (array[i]) { +                        ret = 0; +                        break; +                } +        } + +        return ret; +} + + +static int +afr_sh_is_fool (int32_t *array, int i, int child_count) +{ +        return array[i];   /* fool if accuses itself */ +} + + +static int +afr_sh_is_wise (int32_t *array, int i, int child_count) +{ +        return !array[i];  /* wise if does not accuse itself */ +} + + +static int +afr_sh_all_nodes_innocent (afr_node_character *characters,  +                           int child_count) +{ +        int i   = 0; +        int ret = 1; + +        for (i = 0; i < child_count; i++) { +                if (characters[i].type != AFR_NODE_INNOCENT) { +                        ret = 0; +                        break; +                } +        } + +        return ret; +} + + +static int +afr_sh_wise_nodes_exist (afr_node_character *characters, int child_count) +{ +        int i   = 0; +        int ret = 0; + +        for (i = 0; i < child_count; i++) { +                if (characters[i].type == AFR_NODE_WISE) { +                        ret = 1; +                        break; +                } +        } + +        return ret; +} + + +/* + * The 'wisdom' of a wise node is 0 if any other wise node accuses to it. + * It is 1 if no other wise node accuses it.  + * Only wise nodes with wisdom 1 are sources. + * + * If no nodes with wisdom 1 exist, a split-brain has occured. + */ + +static void +afr_sh_compute_wisdom (int32_t *pending_matrix[], +                       afr_node_character characters[], int child_count) +{ +        int i = 0; +        int j = 0; + +        for (i = 0; i < child_count; i++) { +                if (characters[i].type == AFR_NODE_WISE) { +                        characters[i].wisdom = 1; + +                        for (j = 0; j < child_count; j++) { +                                if ((characters[j].type == AFR_NODE_WISE) +                                    && pending_matrix[j][i]) { +                                         +                                        characters[i].wisdom = 0; +                                } +                        } +                } +        } +} + + +static int +afr_sh_wise_nodes_conflict (afr_node_character *characters,  +                            int child_count) +{ +        int i   = 0; +        int ret = 1; + +        for (i = 0; i < child_count; i++) { +                if ((characters[i].type == AFR_NODE_WISE) +                    && characters[i].wisdom == 1) { + +                        /* There is atleast one bona-fide wise node */ +                        ret = 0; +                        break; +                } +        } + +        return ret; +} + + +static int +afr_sh_mark_wisest_as_sources (int sources[],  +                               afr_node_character *characters,  +                               int child_count) +{ +        int nsources = 0; +         +        int i = 0; + +        for (i = 0; i < child_count; i++) { +                if (characters[i].wisdom == 1) { +                        sources[i] = 1; +                        nsources++; +                } +        } + +        return nsources; +} + +         +static int +afr_sh_mark_a_fool_as_source (int sources[], afr_node_character *characters,  +                              int child_count) +{ +        int i = 0; +         +        int nsources = 0; +         +        for (i = 0; i < child_count; i++) { +                if (characters[i].type == AFR_NODE_FOOL) { +                        sources[i] = 1; +                        nsources++; +                        break; +                } +        } + +        return nsources; +} + +          int  afr_sh_mark_sources (int32_t *pending_matrix[], int sources[], int child_count)  {  	int i = 0; -	int j = 0;  	int nsources = 0; +        /* stores the 'characters' (innocent, fool, wise) of the nodes */ +        afr_node_character * +                characters = CALLOC (sizeof (afr_node_character),  +                                     child_count);  	/* start clean */  	for (i = 0; i < child_count; i++) {  		sources[i] = 0;  	} +         +        for (i = 0; i < child_count; i++) { +                if (afr_sh_is_innocent (pending_matrix[i], child_count)) { +                        characters[i].type = AFR_NODE_INNOCENT; + +                } else if (afr_sh_is_fool (pending_matrix[i], i, child_count)) { +                        characters[i].type = AFR_NODE_FOOL; + +                } else if (afr_sh_is_wise (pending_matrix[i], i, child_count)) { +                        characters[i].type = AFR_NODE_WISE; + +                } else { +                        gf_log ("[module:afr]", GF_LOG_ERROR, +                                "node %d is diabolical! " +                                "(This message should never appear." +                                " Please file a bug report.)", i); +                } +        } + +        if (afr_sh_all_nodes_innocent (characters, child_count)) { +                /* no self-heal needed */ +                goto out; + +        } else if (afr_sh_wise_nodes_exist (characters, child_count)) { +                afr_sh_compute_wisdom (pending_matrix, characters, child_count); + +                if (afr_sh_wise_nodes_conflict (characters, child_count)) { +                        /* split-brain */ +                        goto out; + +                } else { +                        nsources = afr_sh_mark_wisest_as_sources (sources,  +                                                                  characters, +                                                                  child_count); +                } +        } else { +                nsources = afr_sh_mark_a_fool_as_source (sources, characters,  +                                                         child_count); +        } -	/* -	  Let's 'normalize' the pending matrix first, -	  by disregarding all pending entries that refer -	  to themselves -	*/ -	for (i = 0; i < child_count; i++) { -		pending_matrix[i][i] = 0; -	} - -	for (i = 0; i < child_count; i++) { -		for (j = 0; j < child_count; j++) { -			if (pending_matrix[j][i]) -				break; -		} - -		if (j == child_count) { -			nsources++; -			sources[i] = 1; -		} -	} - +out:  	return nsources;  }  | 
