| author | Richard Wareing <rwareing@fb.com> | 2014-07-08 20:07:54 -0700 |
|---|---|---|
| committer | Kevin Vigor <kvigor@fb.com> | 2016-12-27 12:16:06 -0800 |
| commit | 88ef24b83f49c7d670720d59832d4e0f09efbe78 | |
| tree | 1ec9c5b77308d8af57baa5ced91f916039e9cf5c | |
| parent | 3bb25b0882964b6c9c1623593f3a81902ff69aa0 | |
Add option to toggle x-halo fail-over
Summary:
- Adds "halo-failover-enabled" option to enable/disable failing over to a brick outside of the defined halo to satisfy min-replicas
- There are some use-cases where failing over to a brick which is out of region will be undesirable. I such cases we will more than likely opt to have more replicas within the region to tolerate the loss of a single replica in that region without losing quorum.
- Fixed quorum accounting problem as well, now correctly goes RO in case where we lose a brick and aren't able to swap one in for some reason (fail-over not enabled or otherwise)
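For illustration, a minimal sketch of how the new option is toggled together with the quorum settings exercised by the tests; the volume name `vol0` is hypothetical, and the option keys are the ones registered in glusterd-volume-set.c in this patch:

```sh
# Hypothetical volume name "vol0"; keys match the glusterd volopt map below.
# Halo replication with at most 2 in-region replicas.
gluster volume set vol0 cluster.halo-enabled True
gluster volume set vol0 cluster.halo-max-replicas 2

# Disable x-halo fail-over: never swap in a brick outside the halo
# to satisfy halo-min-replicas (the option defaults to off/False).
gluster volume set vol0 cluster.halo-failover-enabled off

# With fixed quorum, losing an in-halo brick without a swap-in now
# correctly drops the mount to read-only.
gluster volume set vol0 cluster.quorum-type fixed
gluster volume set vol0 cluster.quorum-count 2
```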
Test Plan:
- run prove -v tests/basic/halo.t
- run prove -v tests/basic/halo-disable.t
- run prove -v tests/basic/halo-failover-enabled.t
- run prove -v tests/basic/halo-failover-disabled.t
Reviewers: dph, cjh, jackl, mmckeen
Reviewed By: mmckeen
Conflicts:
xlators/cluster/afr/src/afr.h
xlators/mount/fuse/utils/mount.glusterfs.in
Change-Id: Ia3ebf83f34b53118ca4491a3c4b66a178cc9795e
Signed-off-by: Kevin Vigor <kvigor@fb.com>
Reviewed-on: http://review.gluster.org/16275
CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
Reviewed-by: Shreyas Siravara <sshreyas@fb.com>
Smoke: Gluster Build System <jenkins@build.gluster.org>
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | tests/basic/halo-failover-disabled.t | 67 |
| -rw-r--r-- | tests/basic/halo-failover-enabled.t (renamed from tests/basic/halo-failover.t) | 24 |
| -rw-r--r-- | xlators/cluster/afr/src/afr-common.c | 32 |
| -rw-r--r-- | xlators/cluster/afr/src/afr.c | 15 |
| -rw-r--r-- | xlators/cluster/afr/src/afr.h | 1 |
| -rw-r--r-- | xlators/mgmt/glusterd/src/glusterd-volume-set.c | 5 |
| -rwxr-xr-x | xlators/mount/fuse/utils/mount.glusterfs.in | 7 |
7 files changed, 132 insertions, 19 deletions
diff --git a/tests/basic/halo-failover-disabled.t b/tests/basic/halo-failover-disabled.t
new file mode 100644
index 00000000000..05ccd7e822a
--- /dev/null
+++ b/tests/basic/halo-failover-disabled.t
@@ -0,0 +1,67 @@
+#!/bin/bash
+#
+# Tests that fail-over works correctly for Halo Geo-replication
+#
+# 1. Create a volume @ 3x replication w/ halo + quorum enabled
+# 2. Write some data, background it & fail a brick
+# 3. The expected result is that the writes fail-over to the 3rd
+#    brick immediately, and md5s will show they are equal once
+#    the write completes.
+# 4. The mount should also be RW after the brick is killed as
+#    quorum will be immediately restored by swapping in the
+#    other brick.
+#
+. $(dirname $0)/../include.rc
+. $(dirname $0)/../volume.rc
+
+cleanup;
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
+TEST $CLI volume set $V0 cluster.background-self-heal-count 0
+TEST $CLI volume set $V0 cluster.shd-max-threads 1
+TEST $CLI volume set $V0 cluster.halo-enabled True
+TEST $CLI volume set $V0 cluster.halo-max-latency 9999
+TEST $CLI volume set $V0 cluster.halo-shd-max-latency 9999
+TEST $CLI volume set $V0 cluster.halo-max-replicas 2
+TEST $CLI volume set $V0 cluster.halo-failover-enabled off
+TEST $CLI volume set $V0 cluster.quorum-type fixed
+TEST $CLI volume set $V0 cluster.quorum-count 2
+TEST $CLI volume set $V0 cluster.heal-timeout 5
+TEST $CLI volume set $V0 cluster.entry-self-heal on
+TEST $CLI volume set $V0 cluster.data-self-heal on
+TEST $CLI volume set $V0 cluster.metadata-self-heal on
+TEST $CLI volume set $V0 cluster.self-heal-daemon on
+TEST $CLI volume set $V0 cluster.eager-lock off
+# Use a large ping time here so the spare brick is not marked up
+# based on the ping time.  The only way it can get marked up is
+# by being swapped in via the down event (which is what we are disabling).
+TEST $CLI volume set $V0 network.ping-timeout 1000
+TEST $CLI volume set $V0 cluster.choose-local off
+TEST $CLI volume start $V0
+TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0
+cd $M0
+
+# Write some data to the mount
+dd if=/dev/urandom of=$M0/test bs=1k count=200 oflag=sync &> /dev/null &
+
+sleep 0.5
+# Kill the first brick, fail-over to 3rd
+TEST kill_brick $V0 $H0 $B0/${V0}0
+
+# Test that quorum should fail and the mount is RO.  The reason here
+# is that although there _is_ another brick running which _could_
+# take the failed brick's place, it is not marked "up" so quorum
+# will not be fulfilled.  If we waited 1000 seconds the brick would
+# indeed be activated based on ping time, but for our test we want
+# the decision to be solely "down event" driven, not ping driven.
+TEST ! dd if=/dev/urandom of=$M0/test_rw bs=1M count=1
+
+TEST $CLI volume start $V0 force
+sleep 2
+
+# Test that quorum should be restored and the file is writable
+TEST dd if=/dev/urandom of=$M0/test_rw bs=1M count=1
+
+cleanup
diff --git a/tests/basic/halo-failover.t b/tests/basic/halo-failover-enabled.t
index 220fa1f2207..e897d076813 100644
--- a/tests/basic/halo-failover.t
+++ b/tests/basic/halo-failover-enabled.t
@@ -22,6 +22,7 @@ TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
 TEST $CLI volume set $V0 cluster.background-self-heal-count 0
 TEST $CLI volume set $V0 cluster.shd-max-threads 1
 TEST $CLI volume set $V0 cluster.halo-enabled True
+TEST $CLI volume set $V0 cluster.halo-failover-enabled on
 TEST $CLI volume set $V0 cluster.halo-max-replicas 2
 TEST $CLI volume set $V0 cluster.quorum-type fixed
 TEST $CLI volume set $V0 cluster.quorum-count 2
@@ -38,26 +39,29 @@ TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0
 cd $M0
 
 # Write some data to the mount
-dd if=/dev/urandom of=$M0/test bs=1k count=200 oflag=sync &> /dev/null &
+dd if=/dev/urandom of=$M0/test bs=1k count=200 conv=fsync
+
+# Calculate the MD5s on the two up volumes.
+MD5_B0=$(md5sum $B0/${V0}0/test | cut -d' ' -f1)
+MD5_B1=$(md5sum $B0/${V0}1/test | cut -d' ' -f1)
+
+# Verify they are the same
+TEST [ "$MD5_B0" == "$MD5_B1" ]
 
 sleep 0.5
 # Kill the first brick, fail-over to 3rd
 TEST kill_brick $V0 $H0 $B0/${V0}0
 
 # Test the mount is still RW (i.e. quorum works)
-TEST dd if=/dev/urandom of=$M0/test_rw bs=1M count=1
-
-# Wait for the dd to finish
-wait
-sleep 3
+TEST dd if=/dev/urandom of=$M0/test_rw bs=1M count=1 conv=fsync
 
 # Calculate the MD5s
-MD5_B0=$(md5sum $B0/${V0}0/test | cut -d' ' -f1)
-MD5_B1=$(md5sum $B0/${V0}1/test | cut -d' ' -f1)
-MD5_B2=$(md5sum $B0/${V0}2/test | cut -d' ' -f1)
+MD5_B0=$(md5sum $B0/${V0}0/test_rw | cut -d' ' -f1)
+MD5_B1=$(md5sum $B0/${V0}1/test_rw | cut -d' ' -f1)
+MD5_B2=$(md5sum $B0/${V0}2/test_rw | cut -d' ' -f1)
 
 # Verify they are the same
-TEST [ "$MD5_B1" == "$MD5_B2" ]
+TEST [ x"$MD5_B1" == x"$MD5_B2" ]
 
 # Verify the failed brick has a different MD5
 TEST [ x"$MD5_B0" != x"$MD5_B1" ]
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index 630bee80be3..7245c619b6a 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -4238,7 +4238,7 @@ find_worst_up_child (xlator_t *this)
         for (i = 0; i < priv->child_count; i++) {
                 if (priv->child_up[i] &&
                     priv->child_latency[i] >= 0 &&
-                    priv->child_latency[i] > worst_latency) {
+                    priv->child_latency[i] >= worst_latency) {
                         worst_child = i;
                         worst_latency = priv->child_latency[i];
                 }
@@ -4275,7 +4275,8 @@ _afr_handle_ping_event (xlator_t *this, xlator_t *child_xlator,
                     priv->child_up[idx] == 1 &&
                     up_children > priv->halo_min_replicas) {
                         if ((up_children - 1) <
-                            priv->halo_min_replicas) {
+                            priv->halo_min_replicas &&
+                            priv->halo_failover_enabled) {
                                 gf_log (child_xlator->name, GF_LOG_INFO,
                                         "Overriding halo threshold, "
                                         "min replicas: %d",
@@ -4318,6 +4319,7 @@ _afr_handle_child_up_event (xlator_t *this, xlator_t *child_xlator,
         int i = -1;
         int up_children = 0;
         int worst_up_child = -1;
+        gf_boolean_t was_down = _gf_false;
 
         priv = this->private;
@@ -4328,6 +4330,11 @@ _afr_handle_child_up_event (xlator_t *this, xlator_t *child_xlator,
          * explanation.
          */
         if (priv->child_up[idx] != 1) {
+                /*
+                 * Track the fact we did this, we may need to repeal this
+                 * if we later decide to mark this brick down.
+                 */
+                was_down = _gf_true;
                 priv->event_generation++;
         }
         priv->child_up[idx] = 1;
@@ -4351,6 +4358,11 @@ _afr_handle_child_up_event (xlator_t *this, xlator_t *child_xlator,
                 if (worst_up_child >= 0 &&
                     priv->child_latency[worst_up_child] >
                     halo_max_latency_msec) {
+                        if (was_down == _gf_true)
+                                priv->event_generation--;
+                        *call_psh = 0;
+                        priv->child_up[worst_up_child] = 0;
+                        up_children--;
                         gf_log (this->name, GF_LOG_DEBUG,
                                 "Marking child %d down, "
                                 "doesn't meet halo threshold "
@@ -4359,28 +4371,31 @@ _afr_handle_child_up_event (xlator_t *this, xlator_t *child_xlator,
                                 worst_up_child,
                                 halo_max_latency_msec,
                                 priv->halo_min_replicas);
-                        priv->child_up[worst_up_child] = 0;
-                        up_children--;
+                        goto out;
                 }
         }
         if (priv->halo_enabled == _gf_true &&
             up_children > priv->halo_max_replicas &&
             !priv->shd.iamshd) {
+                if (was_down == _gf_true)
+                        priv->event_generation--;
+                *call_psh = 0;
                 worst_up_child = find_worst_up_child (this);
                 if (worst_up_child < 0) {
                         worst_up_child = idx;
                 }
                 priv->child_up[worst_up_child] = 0;
                 up_children--;
-                gf_log (this->name, GF_LOG_DEBUG,
+                gf_log (this->name, GF_LOG_INFO,
                         "Marking child %d down, "
                         "up_children (%d) > "
                         "halo_max_replicas (%d)",
                         worst_up_child, up_children,
                         priv->halo_max_replicas);
+                goto out;
         }
-
+out:
         if (up_children == 1) {
                 gf_log (this->name, GF_LOG_INFO,
                         "Subvolume '%s' came back up; "
@@ -4445,10 +4460,11 @@ _afr_handle_child_down_event (xlator_t *this, xlator_t *child_xlator,
          * begin using it synchronously.
          */
         if (priv->halo_enabled == _gf_true &&
-            up_children < priv->halo_min_replicas) {
+            up_children < priv->halo_min_replicas &&
+            priv->halo_failover_enabled == _gf_true) {
                 best_down_child = find_best_down_child (this);
                 if (best_down_child >= 0) {
-                        gf_log (this->name, GF_LOG_DEBUG,
+                        gf_log (this->name, GF_LOG_INFO,
                                 "Swapping out child %d for "
                                 "child %d to satisfy "
                                 "halo_min_replicas (%d).",
diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c
index d9c740dc498..27309985e82 100644
--- a/xlators/cluster/afr/src/afr.c
+++ b/xlators/cluster/afr/src/afr.c
@@ -180,6 +180,10 @@ reconfigure (xlator_t *this, dict_t *options)
                           priv->halo_enabled, options, bool,
                           out);
 
+        GF_OPTION_RECONF ("halo-failover-enabled",
+                          priv->halo_failover_enabled, options, bool,
+                          out);
+
         GF_OPTION_RECONF ("halo-shd-max-latency",
                           priv->shd.halo_max_latency_msec, options,
                           uint32, out);
@@ -420,9 +424,11 @@ init (xlator_t *this)
         GF_OPTION_INIT ("halo-enabled",
                         priv->halo_enabled, bool, out);
 
+        GF_OPTION_INIT ("halo-failover-enabled",
+                        priv->halo_failover_enabled, bool, out);
+
         GF_OPTION_INIT ("halo-shd-max-latency",
                         priv->shd.halo_max_latency_msec, uint32, out);
-
         GF_OPTION_INIT ("halo-max-latency", priv->halo_max_latency_msec,
                         uint32, out);
         GF_OPTION_INIT ("halo-max-replicas", priv->halo_max_replicas, uint32,
@@ -719,6 +725,13 @@ struct volume_options options[] = {
           .default_value = "False",
           .description = "Enable Halo (geo) replication mode."
         },
+        { .key = {"halo-failover-enabled"},
+          .type = GF_OPTION_TYPE_BOOL,
+          .default_value = "False",
+          .description = "Enable x-halo failover: will allow failover "
+                         "to bricks outside the client or daemons' halo "
+                         "in an attempt to satisfy halo-min-replicas."
+        },
         { .key = {"halo-nfsd-max-latency"},
           .type = GF_OPTION_TYPE_INT,
           .min = 1,
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index 0a51c38d06e..eebcc12b1ee 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -149,6 +149,7 @@ typedef struct _afr_private {
         char                   *afr_dirty;
 
         gf_boolean_t            halo_enabled;
+        gf_boolean_t            halo_failover_enabled;
         uint32_t                halo_max_latency_msec;
         uint32_t                halo_max_replicas;
         uint32_t                halo_min_replicas;
diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
index ab782db9f5d..ab00914e3e2 100644
--- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c
+++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
@@ -3038,6 +3038,11 @@ struct volopt_map_entry glusterd_volopt_map[] = {
           .op_version = 2,
           .flags      = OPT_FLAG_CLIENT_OPT
         },
+        { .key        = "cluster.halo-failover-enabled",
+          .voltype    = "cluster/replicate",
+          .op_version = 2,
+          .flags      = OPT_FLAG_CLIENT_OPT
+        },
         { .key        = "cluster.halo-shd-max-latency",
           .voltype    = "cluster/replicate",
           .op_version = 2,
diff --git a/xlators/mount/fuse/utils/mount.glusterfs.in b/xlators/mount/fuse/utils/mount.glusterfs.in
index b5c90ba1dff..598f62fee7a 100755
--- a/xlators/mount/fuse/utils/mount.glusterfs.in
+++ b/xlators/mount/fuse/utils/mount.glusterfs.in
@@ -186,6 +186,10 @@ start_glusterfs ()
     fi
 
     #options with values start here
+    if [ -n "$halo_failover_enabled" ]; then
+        cmd_line=$(echo "$cmd_line --xlator-option \
+            *replicate*.halo-failover-enabled=$halo_failover_enabled");
+    fi
     if [ -n "$halo_max_latency" ]; then
         cmd_line=$(echo "$cmd_line --xlator-option \
             *replicate*.halo-max-latency=$halo_max_latency");
@@ -503,6 +507,9 @@ with_options()
         "halo-min-replicas")
             halo_min_replicas=$value
             ;;
+        "halo-failover-enabled")
+            halo_failover_enabled=$value
+            ;;
         x-*)
             # comments or userspace application-specific options, drop them
             ;;
