summary | refs | log | tree | commit | diff | stats
path: root/xlators
diff options
context:
space:
mode:
author: Jeff Darcy <jdarcy@redhat.com> 2016-11-17 10:42:02 -0500
committer: Pranith Kumar Karampuri <pkarampu@redhat.com> 2016-11-28 22:41:24 -0800
commit: 77f03db0131c88d607886bb02dd2a4276ab584d4 (patch)
tree: 8a3c464e02f2c9b391c94281821a2c8605c8c612 /xlators
parent: 1876454d2e7950f25d1e5bb8e2c07ab27d521498 (diff)
afr: fix auto-quorum
(1) afr_have_quorum is dead code. It was copied to afr_has_quorum, and everything else uses that, but the original was never deleted (until now).

(2) Auto-quorum should be default for any N>2. Leaving quorum disabled is BAD, but apparently deemed acceptable for N=2 because there's no real quorum in that case. For any larger number (including arbiter configurations) there is such a thing as real quorum and we should use it by default. Note that for N=3 the answers we get from "N % 2" (the old check) and "N > 2" (the new one) are the same.

(3) The special case for even N in afr_has_quorum has been simplified and explained more thoroughly in a comment.

Change-Id: I48b33c15093512fecf516b26dcf09afecb7ae33b
Signed-off-by: Jeff Darcy <jdarcy@redhat.com>
Reviewed-on: http://review.gluster.org/15873
Smoke: Gluster Build System <jenkins@build.gluster.org>
CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
Reviewed-by: Vijay Bellur <vbellur@redhat.com>
NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
Reviewed-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
Diffstat (limited to 'xlators')
-rw-r--r--  xlators/cluster/afr/src/afr-common.c       | 37
-rw-r--r--  xlators/cluster/afr/src/afr-transaction.c  | 39
-rw-r--r--  xlators/cluster/afr/src/afr.c              |  4
3 files changed, 31 insertions, 49 deletions
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index 3521e63e6d0..77b68d34c18 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -5105,43 +5105,6 @@ afr_set_low_priority (call_frame_t *frame)
}
-gf_boolean_t
-afr_have_quorum (char *logname, afr_private_t *priv)
-{
- unsigned int quorum = 0;
- unsigned int up_children = 0;
-
- GF_VALIDATE_OR_GOTO(logname,priv,out);
-
- up_children = __afr_get_up_children_count (priv);
- quorum = priv->quorum_count;
- if (quorum != AFR_QUORUM_AUTO)
- return up_children >= quorum;
-
- quorum = priv->child_count / 2 + 1;
- if (up_children >= quorum)
- return _gf_true;
-
- /*
- * Special case for even numbers of nodes: if we have exactly half
- * and that includes the first ("senior-most") node, then that counts
- * as quorum even if it wouldn't otherwise. This supports e.g. N=2
- * while preserving the critical property that there can only be one
- * such group.
- */
- if ((priv->child_count % 2) == 0) {
- quorum = priv->child_count / 2;
- if (up_children >= quorum) {
- if (priv->child_up[0]) {
- return _gf_true;
- }
- }
- }
-
-out:
- return _gf_false;
-}
-
void
afr_priv_destroy (afr_private_t *priv)
{
diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c
index eb7571db5f1..d23654d8354 100644
--- a/xlators/cluster/afr/src/afr-transaction.c
+++ b/xlators/cluster/afr/src/afr-transaction.c
@@ -681,17 +681,36 @@ afr_has_quorum (unsigned char *subvols, xlator_t *this)
up_children_count = AFR_COUNT (subvols, priv->child_count);
if (priv->quorum_count == AFR_QUORUM_AUTO) {
- /*
- * Special case for even numbers of nodes in auto-quorum:
- * if we have exactly half children up
- * and that includes the first ("senior-most") node, then that counts
- * as quorum even if it wouldn't otherwise. This supports e.g. N=2
- * while preserving the critical property that there can only be one
- * such group.
- */
- if ((priv->child_count % 2 == 0) &&
- (up_children_count == (priv->child_count/2)))
+ /*
+ * Special case for auto-quorum with an even number of nodes.
+ *
+ * A replica set with even count N can only handle the same
+ * number of failures as odd N-1 before losing "vanilla"
+ * quorum, and the probability of more simultaneous failures is
+ * actually higher. For example, with a 1% chance of failure
+ * we'd have a 0.03% chance of two simultaneous failures with
+ * N=3 but a 0.06% chance with N=4. However, the special case
+ * is necessary for N=2 because there's no real quorum in that
+ * case (i.e. can't normally survive *any* failures). In that
+ * case, we treat the first node as a tie-breaker, allowing
+ * quorum to be retained in some cases while still honoring the
+ * all-important constraint that there can not simultaneously
+ * be two partitioned sets of nodes each believing they have
+ * quorum. Of two equally sized sets, the one without that
+ * first node will lose.
+ *
+ * It turns out that the special case is beneficial for higher
+ * values of N as well. Continuing the example above, the
+ * probability of losing quorum with N=4 and this type of
+ * quorum is (very) slightly lower than with N=3 and vanilla
+ * quorum. The difference becomes even more pronounced with
+ * higher N. Therefore, even though such replica counts are
+ * unlikely to be seen in practice, we might as well use the
+ * "special" quorum then as well.
+ */
+ if ((up_children_count * 2) == priv->child_count) {
return subvols[0];
+ }
}
if (priv->quorum_count == AFR_QUORUM_AUTO) {
diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c
index 0dbb209df1b..1df45b5a68f 100644
--- a/xlators/cluster/afr/src/afr.c
+++ b/xlators/cluster/afr/src/afr.c
@@ -91,8 +91,8 @@ fix_quorum_options (xlator_t *this, afr_private_t *priv, char *qtype,
{
if (dict_get (options, "quorum-type") == NULL) {
/* If user doesn't configure anything enable auto-quorum if the
- * replica has odd number of subvolumes */
- if (priv->child_count % 2)
+ * replica has more than two subvolumes */
+ if (priv->child_count > 2)
qtype = "auto";
}