| author | Richard Wareing <rwareing@fb.com> | 2014-07-08 20:07:54 -0700 |
|---|---|---|
| committer | Kevin Vigor <kvigor@fb.com> | 2016-12-27 12:16:06 -0800 |
| commit | 88ef24b83f49c7d670720d59832d4e0f09efbe78 | |
| tree | 1ec9c5b77308d8af57baa5ced91f916039e9cf5c | |
| parent | 3bb25b0882964b6c9c1623593f3a81902ff69aa0 | |
Add option to toggle x-halo fail-over
Summary:
- Adds "halo-failover-enabled" option to enable/disable failing over to a brick outside of the defined halo to satisfy min-replicas
- There are some use-cases where failing over to a brick which is out of region will be undesirable. I such cases we will more than likely opt to have more replicas within the region to tolerate the loss of a single replica in that region without losing quorum.
- Fixed quorum accounting problem as well, now correctly goes RO in case where we lose a brick and aren't able to swap one in for some reason (fail-over not enabled or otherwise)
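For illustration, a minimal sketch of how the new option is toggled together with the quorum settings exercised by the tests; the volume name `vol0` is hypothetical, and the option keys are the ones registered in glusterd-volume-set.c in this patch:

```sh
# Hypothetical volume name "vol0"; keys match the glusterd volopt map below.
# Halo replication with at most 2 in-region replicas.
gluster volume set vol0 cluster.halo-enabled True
gluster volume set vol0 cluster.halo-max-replicas 2

# Disable x-halo fail-over: never swap in a brick outside the halo
# to satisfy halo-min-replicas (the option defaults to off/False).
gluster volume set vol0 cluster.halo-failover-enabled off

# With fixed quorum, losing an in-halo brick without a swap-in now
# correctly drops the mount to read-only.
gluster volume set vol0 cluster.quorum-type fixed
gluster volume set vol0 cluster.quorum-count 2
```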
Test Plan:
- run prove -v tests/basic/halo.t
- run prove -v tests/basic/halo-disable.t
- run prove -v tests/basic/halo-failover-enabled.t
- run prove -v tests/basic/halo-failover-disabled.t
Reviewers: dph, cjh, jackl, mmckeen
Reviewed By: mmckeen
Conflicts:
xlators/cluster/afr/src/afr.h
xlators/mount/fuse/utils/mount.glusterfs.in
Change-Id: Ia3ebf83f34b53118ca4491a3c4b66a178cc9795e
Signed-off-by: Kevin Vigor <kvigor@fb.com>
Reviewed-on: http://review.gluster.org/16275
CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
Reviewed-by: Shreyas Siravara <sshreyas@fb.com>
Smoke: Gluster Build System <jenkins@build.gluster.org>
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | tests/basic/halo-failover-disabled.t | 67 |
| -rw-r--r-- | tests/basic/halo-failover-enabled.t (renamed from tests/basic/halo-failover.t) | 24 |
| -rw-r--r-- | xlators/cluster/afr/src/afr-common.c | 32 |
| -rw-r--r-- | xlators/cluster/afr/src/afr.c | 15 |
| -rw-r--r-- | xlators/cluster/afr/src/afr.h | 1 |
| -rw-r--r-- | xlators/mgmt/glusterd/src/glusterd-volume-set.c | 5 |
| -rwxr-xr-x | xlators/mount/fuse/utils/mount.glusterfs.in | 7 |
7 files changed, 132 insertions, 19 deletions
diff --git a/tests/basic/halo-failover-disabled.t b/tests/basic/halo-failover-disabled.t
new file mode 100644
index 00000000000..05ccd7e822a
--- /dev/null
+++ b/tests/basic/halo-failover-disabled.t
@@ -0,0 +1,67 @@
+#!/bin/bash
+#
+# Tests that fail-over works correctly for Halo Geo-replication
+#
+# 1. Create a volume @ 3x replication w/ halo + quorum enabled
+# 2. Write some data, background it & fail a brick
+# 3. The expected result is that the writes fail-over to the 3rd
+#    brick immediately, and md5s will show they are equal once
+#    the write completes.
+# 4. The mount should also be RW after the brick is killed as
+#    quorum will be immediately restored by swapping in the
+#    other brick.
+#
+. $(dirname $0)/../include.rc
+. $(dirname $0)/../volume.rc
+
+cleanup;
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
+TEST $CLI volume set $V0 cluster.background-self-heal-count 0
+TEST $CLI volume set $V0 cluster.shd-max-threads 1
+TEST $CLI volume set $V0 cluster.halo-enabled True
+TEST $CLI volume set $V0 cluster.halo-max-latency 9999
+TEST $CLI volume set $V0 cluster.halo-shd-max-latency 9999
+TEST $CLI volume set $V0 cluster.halo-max-replicas 2
+TEST $CLI volume set $V0 cluster.halo-failover-enabled off
+TEST $CLI volume set $V0 cluster.quorum-type fixed
+TEST $CLI volume set $V0 cluster.quorum-count 2
+TEST $CLI volume set $V0 cluster.heal-timeout 5
+TEST $CLI volume set $V0 cluster.entry-self-heal on
+TEST $CLI volume set $V0 cluster.data-self-heal on
+TEST $CLI volume set $V0 cluster.metadata-self-heal on
+TEST $CLI volume set $V0 cluster.self-heal-daemon on
+TEST $CLI volume set $V0 cluster.eager-lock off
+# Use a large ping time here so the spare brick is not marked up
+# based on the ping time.  The only way it can get marked up is
+# by being swapped in via the down event (which is what we are disabling).
+TEST $CLI volume set $V0 network.ping-timeout 1000
+TEST $CLI volume set $V0 cluster.choose-local off
+TEST $CLI volume start $V0
+TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0
+cd $M0
+
+# Write some data to the mount
+dd if=/dev/urandom of=$M0/test bs=1k count=200 oflag=sync &> /dev/null &
+
+sleep 0.5
+# Kill the first brick, fail-over to 3rd
+TEST kill_brick $V0 $H0 $B0/${V0}0
+
+# Test that quorum should fail and the mount is RO.  The reason here
+# is that although there _is_ another brick running which _could_
+# take the failed brick's place, it is not marked "up" so quorum
+# will not be fulfilled.  If we waited 1000 seconds the brick would
+# indeed be activated based on ping time, but for our test we want
+# the decision to be solely "down event" driven, not ping driven.
+TEST ! dd if=/dev/urandom of=$M0/test_rw bs=1M count=1
+
+TEST $CLI volume start $V0 force
+sleep 2
+
+# Test that quorum should be restored and the file is writable
+TEST dd if=/dev/urandom of=$M0/test_rw bs=1M count=1
+
+cleanup
diff --git a/tests/basic/halo-failover.t b/tests/basic/halo-failover-enabled.t
index 220fa1f2207..e897d076813 100644
--- a/tests/basic/halo-failover.t
+++ b/tests/basic/halo-failover-enabled.t
@@ -22,6 +22,7 @@ TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
 TEST $CLI volume set $V0 cluster.background-self-heal-count 0
 TEST $CLI volume set $V0 cluster.shd-max-threads 1
 TEST $CLI volume set $V0 cluster.halo-enabled True
+TEST $CLI volume set $V0 cluster.halo-failover-enabled on
 TEST $CLI volume set $V0 cluster.halo-max-replicas 2
 TEST $CLI volume set $V0 cluster.quorum-type fixed
 TEST $CLI volume set $V0 cluster.quorum-count 2
@@ -38,26 +39,29 @@ TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0
 cd $M0
 
 # Write some data to the mount
-dd if=/dev/urandom of=$M0/test bs=1k count=200 oflag=sync &> /dev/null &
+dd if=/dev/urandom of=$M0/test bs=1k count=200 conv=fsync
+
+# Calculate the MD5s on the two up volumes.
+MD5_B0=$(md5sum $B0/${V0}0/test | cut -d' ' -f1)
+MD5_B1=$(md5sum $B0/${V0}1/test | cut -d' ' -f1)
+
+# Verify they are the same
+TEST [ "$MD5_B0" == "$MD5_B1" ]
 
 sleep 0.5
 # Kill the first brick, fail-over to 3rd
 TEST kill_brick $V0 $H0 $B0/${V0}0
 
 # Test the mount is still RW (i.e. quorum works)
-TEST dd if=/dev/urandom of=$M0/test_rw bs=1M count=1
-
-# Wait for the dd to finish
-wait
-sleep 3
+TEST dd if=/dev/urandom of=$M0/test_rw bs=1M count=1 conv=fsync
 
 # Calculate the MD5s
-MD5_B0=$(md5sum $B0/${V0}0/test | cut -d' ' -f1)
-MD5_B1=$(md5sum $B0/${V0}1/test | cut -d' ' -f1)
-MD5_B2=$(md5sum $B0/${V0}2/test | cut -d' ' -f1)
+MD5_B0=$(md5sum $B0/${V0}0/test_rw | cut -d' ' -f1)
+MD5_B1=$(md5sum $B0/${V0}1/test_rw | cut -d' ' -f1)
+MD5_B2=$(md5sum $B0/${V0}2/test_rw | cut -d' ' -f1)
 
 # Verify they are the same
-TEST [ "$MD5_B1" == "$MD5_B2" ]
+TEST [ x"$MD5_B1" == x"$MD5_B2" ]
 
 # Verify the failed brick has a different MD5
 TEST [ x"$MD5_B0" != x"$MD5_B1" ]
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index 630bee80be3..7245c619b6a 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -4238,7 +4238,7 @@ find_worst_up_child (xlator_t *this)
         for (i = 0; i < priv->child_count; i++) {
                 if (priv->child_up[i] &&
                     priv->child_latency[i] >= 0 &&
-                    priv->child_latency[i] > worst_latency) {
+                    priv->child_latency[i] >= worst_latency) {
                         worst_child = i;
                         worst_latency = priv->child_latency[i];
                 }
@@ -4275,7 +4275,8 @@ _afr_handle_ping_event (xlator_t *this, xlator_t *child_xlator,
                     priv->child_up[idx] == 1 &&
                     up_children > priv->halo_min_replicas) {
                         if ((up_children - 1) <
-                            priv->halo_min_replicas) {
+                            priv->halo_min_replicas &&
+                            priv->halo_failover_enabled) {
                                 gf_log (child_xlator->name, GF_LOG_INFO,
                                         "Overriding halo threshold, "
                                         "min replicas: %d",
@@ -4318,6 +4319,7 @@ _afr_handle_child_up_event (xlator_t *this, xlator_t *child_xlator,
         int i = -1;
         int up_children = 0;
         int worst_up_child = -1;
+        gf_boolean_t was_down = _gf_false;
 
         priv = this->private;
@@ -4328,6 +4330,11 @@ _afr_handle_child_up_event (xlator_t *this, xlator_t *child_xlator,
          * explanation.
          */
         if (priv->child_up[idx] != 1) {
+                /*
+                 * Track the fact we did this, we may need to repeal this
+                 * if we later decide to mark this brick down.
+                 */
+                was_down = _gf_true;
                 priv->event_generation++;
         }
         priv->child_up[idx] = 1;
@@ -4351,6 +4358,11 @@ _afr_handle_child_up_event (xlator_t *this, xlator_t *child_xlator,
                 if (worst_up_child >= 0 &&
                     priv->child_latency[worst_up_child] >
                     halo_max_latency_msec) {
+                        if (was_down == _gf_true)
+                                priv->event_generation--;
+                        *call_psh = 0;
+                        priv->child_up[worst_up_child] = 0;
+                        up_children--;
                         gf_log (this->name, GF_LOG_DEBUG,
                                 "Marking child %d down, "
                                 "doesn't meet halo threshold "
@@ -4359,28 +4371,31 @@ _afr_handle_child_up_event (xlator_t *this, xlator_t *child_xlator,
                                 worst_up_child,
                                 halo_max_latency_msec,
                                 priv->halo_min_replicas);
-                        priv->child_up[worst_up_child] = 0;
-                        up_children--;
+                        goto out;
                 }
         }
         if (priv->halo_enabled == _gf_true &&
             up_children > priv->halo_max_replicas &&
             !priv->shd.iamshd) {
+                if (was_down == _gf_true)
+                        priv->event_generation--;
+                *call_psh = 0;
                 worst_up_child = find_worst_up_child (this);
                 if (worst_up_child < 0) {
                         worst_up_child = idx;
                 }
                 priv->child_up[worst_up_child] = 0;
                 up_children--;
-                gf_log (this->name, GF_LOG_DEBUG,
+                gf_log (this->name, GF_LOG_INFO,
                         "Marking child %d down, "
                         "up_children (%d) > "
                         "halo_max_replicas (%d)",
                         worst_up_child, up_children,
                         priv->halo_max_replicas);
+                goto out;
         }
-
+out:
         if (up_children == 1) {
                 gf_log (this->name, GF_LOG_INFO,
                         "Subvolume '%s' came back up; "
@@ -4445,10 +4460,11 @@ _afr_handle_child_down_event (xlator_t *this, xlator_t *child_xlator,
          * begin using it synchronously.
          */
         if (priv->halo_enabled == _gf_true &&
-            up_children < priv->halo_min_replicas) {
+            up_children < priv->halo_min_replicas &&
+            priv->halo_failover_enabled == _gf_true) {
                 best_down_child = find_best_down_child (this);
                 if (best_down_child >= 0) {
-                        gf_log (this->name, GF_LOG_DEBUG,
+                        gf_log (this->name, GF_LOG_INFO,
                                 "Swapping out child %d for "
                                 "child %d to satisfy "
                                 "halo_min_replicas (%d).",
diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c
index d9c740dc498..27309985e82 100644
--- a/xlators/cluster/afr/src/afr.c
+++ b/xlators/cluster/afr/src/afr.c
@@ -180,6 +180,10 @@ reconfigure (xlator_t *this, dict_t *options)
                           priv->halo_enabled, options, bool,
                           out);
 
+        GF_OPTION_RECONF ("halo-failover-enabled",
+                          priv->halo_failover_enabled, options, bool,
+                          out);
+
         GF_OPTION_RECONF ("halo-shd-max-latency",
                           priv->shd.halo_max_latency_msec, options,
                           uint32, out);
@@ -420,9 +424,11 @@ init (xlator_t *this)
         GF_OPTION_INIT ("halo-enabled",
                         priv->halo_enabled, bool, out);
 
+        GF_OPTION_INIT ("halo-failover-enabled",
+                        priv->halo_failover_enabled, bool, out);
+
         GF_OPTION_INIT ("halo-shd-max-latency",
                         priv->shd.halo_max_latency_msec, uint32, out);
-
         GF_OPTION_INIT ("halo-max-latency", priv->halo_max_latency_msec,
                         uint32, out);
         GF_OPTION_INIT ("halo-max-replicas", priv->halo_max_replicas, uint32,
@@ -719,6 +725,13 @@ struct volume_options options[] = {
           .default_value = "False",
           .description = "Enable Halo (geo) replication mode."
         },
+        { .key = {"halo-failover-enabled"},
+          .type = GF_OPTION_TYPE_BOOL,
+          .default_value = "False",
+          .description = "Enable x-halo failover: will allow failover "
+                         "to bricks outside the client or daemons' halo "
+                         "in an attempt to satisfy halo-min-replicas."
+        },
         { .key = {"halo-nfsd-max-latency"},
           .type = GF_OPTION_TYPE_INT,
           .min = 1,
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index 0a51c38d06e..eebcc12b1ee 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -149,6 +149,7 @@ typedef struct _afr_private {
         char                   *afr_dirty;
 
         gf_boolean_t            halo_enabled;
+        gf_boolean_t            halo_failover_enabled;
         uint32_t                halo_max_latency_msec;
         uint32_t                halo_max_replicas;
         uint32_t                halo_min_replicas;
diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
index ab782db9f5d..ab00914e3e2 100644
--- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c
+++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
@@ -3038,6 +3038,11 @@ struct volopt_map_entry glusterd_volopt_map[] = {
           .op_version = 2,
           .flags      = OPT_FLAG_CLIENT_OPT
         },
+        { .key        = "cluster.halo-failover-enabled",
+          .voltype    = "cluster/replicate",
+          .op_version = 2,
+          .flags      = OPT_FLAG_CLIENT_OPT
+        },
         { .key        = "cluster.halo-shd-max-latency",
           .voltype    = "cluster/replicate",
           .op_version = 2,
diff --git a/xlators/mount/fuse/utils/mount.glusterfs.in b/xlators/mount/fuse/utils/mount.glusterfs.in
index b5c90ba1dff..598f62fee7a 100755
--- a/xlators/mount/fuse/utils/mount.glusterfs.in
+++ b/xlators/mount/fuse/utils/mount.glusterfs.in
@@ -186,6 +186,10 @@ start_glusterfs ()
     fi
 
     #options with values start here
+    if [ -n "$halo_failover_enabled" ]; then
+        cmd_line=$(echo "$cmd_line --xlator-option \
+            *replicate*.halo-failover-enabled=$halo_failover_enabled");
+    fi
     if [ -n "$halo_max_latency" ]; then
         cmd_line=$(echo "$cmd_line --xlator-option \
             *replicate*.halo-max-latency=$halo_max_latency");
@@ -503,6 +507,9 @@ with_options()
         "halo-min-replicas")
             halo_min_replicas=$value
             ;;
+        "halo-failover-enabled")
+            halo_failover_enabled=$value
+            ;;
         x-*)
             # comments or userspace application-specific options, drop them
             ;;
