summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- rpc/rpc-lib/src/rpc-clnt-ping.c 31
-rw-r--r-- tests/basic/halo-failover-disabled.t 6
-rw-r--r-- tests/basic/halo.t 1
-rw-r--r-- xlators/cluster/afr/src/afr-common.c 155
-rw-r--r-- xlators/cluster/afr/src/afr.c 17
-rw-r--r-- xlators/cluster/afr/src/afr.h 1
-rw-r--r-- xlators/mgmt/glusterd/src/glusterd-volume-set.c 5
-rw-r--r-- xlators/protocol/client/src/client.c 2
8 files changed, 161 insertions, 57 deletions
diff --git a/rpc/rpc-lib/src/rpc-clnt-ping.c b/rpc/rpc-lib/src/rpc-clnt-ping.c
index a48a265b71d..7ce066dec5f 100644
--- a/rpc/rpc-lib/src/rpc-clnt-ping.c
+++ b/rpc/rpc-lib/src/rpc-clnt-ping.c
@@ -173,17 +173,28 @@ out:
}
void
-_update_client_latency (call_frame_t *frame, double elapsed_usec)
+_update_client_latency (const rpc_clnt_connection_t *conn,
+ call_frame_t *frame,
+ uint64_t elapsed_usec)
{
fop_latency_t *lat;
lat = &frame->this->client_latency;
+ if (elapsed_usec < lat->min) {
+ lat->min = elapsed_usec;
+ }
+
+ if (elapsed_usec > lat->max) {
+ lat->max = elapsed_usec;
+ }
+
lat->total += elapsed_usec;
lat->count++;
lat->mean = lat->mean + (elapsed_usec - lat->mean) / lat->count;
- gf_log (THIS->name, GF_LOG_DEBUG, "Ping latency is %0.6lf ms, "
- "avg: %0.6lf ms, count:%ld", elapsed_usec / 1000.0,
+ gf_log (THIS->name, GF_LOG_DEBUG, "%s - Ping latency is %0.6lf ms, "
+ "avg: %0.6lf ms, count:%ld",
+ conn->trans->peerinfo.identifier, elapsed_usec / 1000.0,
lat->mean / 1000.0, lat->count);
}
@@ -217,13 +228,6 @@ rpc_clnt_ping_cbk (struct rpc_req *req, struct iovec *iov, int count,
pthread_mutex_lock (&conn->lock);
{
- timespec_now (&now);
- timespec_sub (&local->submit_time, &now, &delta);
- latency_usec = delta.tv_sec * 1000000UL +
- delta.tv_nsec / 1000UL;
-
- _update_client_latency (frame, (double)latency_usec);
-
if (req->rpc_status == -1) {
unref = rpc_clnt_remove_ping_timer_locked (local->rpc);
if (unref) {
@@ -240,6 +244,13 @@ rpc_clnt_ping_cbk (struct rpc_req *req, struct iovec *iov, int count,
goto unlock;
}
+ timespec_now (&now);
+ timespec_sub (&local->submit_time, &now, &delta);
+ latency_usec = delta.tv_sec * 1000000UL +
+ delta.tv_nsec / 1000UL;
+
+ _update_client_latency (conn, frame, latency_usec);
+ call_notify = _gf_true;
unref = rpc_clnt_remove_ping_timer_locked (local->rpc);
if (__rpc_clnt_rearm_ping_timer (local->rpc,
rpc_clnt_start_ping) == -1) {
diff --git a/tests/basic/halo-failover-disabled.t b/tests/basic/halo-failover-disabled.t
index 05ccd7e822a..31a1d166404 100644
--- a/tests/basic/halo-failover-disabled.t
+++ b/tests/basic/halo-failover-disabled.t
@@ -25,6 +25,7 @@ TEST $CLI volume set $V0 cluster.halo-enabled True
TEST $CLI volume set $V0 cluster.halo-max-latency 9999
TEST $CLI volume set $V0 cluster.halo-shd-max-latency 9999
TEST $CLI volume set $V0 cluster.halo-max-replicas 2
+TEST $CLI volume set $V0 cluster.halo-min-samples 1
TEST $CLI volume set $V0 cluster.halo-failover-enabled off
TEST $CLI volume set $V0 cluster.quorum-type fixed
TEST $CLI volume set $V0 cluster.quorum-count 2
@@ -44,9 +45,8 @@ TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0
cd $M0
# Write some data to the mount
-dd if=/dev/urandom of=$M0/test bs=1k count=200 oflag=sync &> /dev/null &
+TEST dd if=/dev/urandom of=$M0/test bs=1k count=200 conv=fsync
-sleep 0.5
# Kill the first brick, fail-over to 3rd
TEST kill_brick $V0 $H0 $B0/${V0}0
@@ -56,7 +56,7 @@ TEST kill_brick $V0 $H0 $B0/${V0}0
# will not be fullfilled. If we waited 1000 second the brick would
# indeed be activated based on ping time, but for our test we want
# the decision to be solely "down event" driven, not ping driven.
-TEST ! dd if=/dev/urandom of=$M0/test_rw bs=1M count=1
+TEST ! dd if=/dev/urandom of=$M0/test_rw bs=1M count=1 conv=fsync
TEST $CLI volume start $V0 force
sleep 2
diff --git a/tests/basic/halo.t b/tests/basic/halo.t
index 03fc0f88a19..25aca3442ab 100644
--- a/tests/basic/halo.t
+++ b/tests/basic/halo.t
@@ -23,6 +23,7 @@ TEST $CLI volume set $V0 cluster.background-self-heal-count 0
TEST $CLI volume set $V0 cluster.shd-max-threads 1
TEST $CLI volume set $V0 cluster.halo-enabled True
TEST $CLI volume set $V0 cluster.halo-max-replicas 2
+TEST $CLI volume set $V0 cluster.halo-min-samples 1
TEST $CLI volume set $V0 cluster.heal-timeout 5
TEST $CLI volume set $V0 cluster.self-heal-daemon off
TEST $CLI volume set $V0 cluster.eager-lock off
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index 6973433a03e..8e46117b025 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -43,6 +43,9 @@
#include "afr-self-heald.h"
#include "afr-messages.h"
+#define CHILD_UP_STR "UP"
+#define CHILD_DOWN_STR "DOWN"
+
call_frame_t *
afr_copy_frame (call_frame_t *base)
{
@@ -4250,48 +4253,76 @@ find_worst_up_child (xlator_t *this)
return worst_child;
}
-void
+static void
_afr_handle_ping_event (xlator_t *this, xlator_t *child_xlator,
- const int idx, int64_t halo_max_latency_msec, int32_t *event,
- int64_t *child_latency_msec)
+ const int idx, const int64_t halo_max_latency_msec,
+ int32_t *event, int64_t *child_latency_msec,
+ gf_boolean_t child_halo_enabled)
{
afr_private_t *priv = NULL;
int i = -1;
int up_children = 0;
+ int best_down_child = 0;
+ uint64_t latency_samples = 0;
+ char *child_state_str = NULL;
priv = this->private;
- *child_latency_msec = child_xlator->client_latency.mean / 1000.0;
+ /* Base it off the _minimum_ latency we've ever seen */
+ *child_latency_msec = child_xlator->client_latency.min / 1000.0;
+ latency_samples = child_xlator->client_latency.count;
priv->child_latency[idx] = *child_latency_msec;
- for (i = 0; i < priv->child_count; i++)
- if (priv->child_up[i] == 1)
+ for (i = 0; i < priv->child_count; i++) {
+ if (priv->child_up[i] == 1) {
up_children++;
+ child_state_str = CHILD_UP_STR;
+ } else {
+ child_state_str = CHILD_DOWN_STR;
+ }
+ gf_log (child_xlator->name, GF_LOG_DEBUG,
+ "Child %d halo state: %s (%"PRIi64"ms)",
+ i, child_state_str, priv->child_latency[i]);
+ }
- if (priv->halo_enabled &&
+ /* Don't do anything until you have some minimum number of
+ * latency samples */
+ if (priv->halo_enabled == _gf_true && child_halo_enabled == _gf_false) {
+ gf_log (child_xlator->name, GF_LOG_INFO, "In-sufficient "
+ " number of latency samples (%" PRIu64
+ " < %d), halo in-active.",
+ latency_samples, priv->halo_min_samples);
+ }
+
+ /*
+ * Case 1: This child's latency exceeds the maximum allowable
+ * for this halo.
+ */
+ if (child_halo_enabled &&
*child_latency_msec > halo_max_latency_msec &&
priv->child_up[idx] == 1 &&
up_children > priv->halo_min_replicas) {
- if ((up_children - 1) <
- priv->halo_min_replicas &&
- priv->halo_failover_enabled) {
+ if (find_worst_up_child (this) == idx) {
gf_log (child_xlator->name, GF_LOG_INFO,
- "Overriding halo threshold, "
- "min replicas: %d",
- priv->halo_min_replicas);
- } else {
- gf_log (child_xlator->name, GF_LOG_INFO,
- "Child latency (%ld ms) "
- "exceeds halo threshold (%ld), "
- "marking child down.",
- *child_latency_msec,
- halo_max_latency_msec);
+ "Child latency (%"PRIi64"ms) "
+ "exceeds halo threshold (%"PRIi64"), "
+ "marking child down, "
+ "min_replicas (%d) still "
+ "satisfied.",
+ *child_latency_msec,
+ halo_max_latency_msec,
+ priv->halo_min_replicas);
*event = GF_EVENT_CHILD_DOWN;
}
- } else if ((priv->halo_enabled == _gf_false ||
- *child_latency_msec < halo_max_latency_msec) &&
+ /*
+ * Case 2: Child latency is within halo and currently marked down,
+ * mark it up.
+ */
+ } else if ((child_halo_enabled == _gf_false ||
+ *child_latency_msec <= halo_max_latency_msec) &&
priv->child_up[idx] == 0) {
- if (up_children < priv->halo_max_replicas) {
+ if (child_halo_enabled == _gf_false ||
+ up_children < priv->halo_max_replicas) {
gf_log (child_xlator->name, GF_LOG_INFO,
"Child latency (%ld ms) "
"below halo threshold (%ld) or halo is "
@@ -4305,13 +4336,35 @@ _afr_handle_ping_event (xlator_t *this, xlator_t *child_xlator,
"max replicas (%d) reached.", idx,
priv->halo_max_replicas);
}
+ /*
+ * Case 3: Child latency is within halo, and currently marked up,
+ * mark it down if it's the highest latency child and the
+ * number of up children is greater than halo_max_replicas.
+ */
+ } else if ((child_halo_enabled == _gf_true &&
+ *child_latency_msec <= halo_max_latency_msec) &&
+ priv->child_up[idx] == 1) {
+ if (find_worst_up_child (this) == idx &&
+ up_children > priv->halo_max_replicas &&
+ !priv->shd.iamshd) {
+ gf_log (child_xlator->name, GF_LOG_INFO,
+ "Child latency (%"PRIi64"ms) "
+ "exceeds halo threshold (%"PRIi64"), "
+ "but halo_max_replicas (%d) exceeded, "
+ "marking child down.",
+ *child_latency_msec,
+ halo_max_latency_msec,
+ priv->halo_max_replicas);
+ *event = GF_EVENT_CHILD_DOWN;
+ }
}
}
void
_afr_handle_child_up_event (xlator_t *this, xlator_t *child_xlator,
const int idx, int64_t halo_max_latency_msec,
- int32_t *event, int32_t *call_psh, int32_t *up_child)
+ int32_t *event, int32_t *call_psh, int32_t *up_child,
+ gf_boolean_t child_halo_enabled)
{
afr_private_t *priv = NULL;
int i = -1;
@@ -4321,7 +4374,7 @@ _afr_handle_child_up_event (xlator_t *this, xlator_t *child_xlator,
priv = this->private;
- /*
+ /*
* This only really counts if the child was never up
* (value = -1) or had been down (value = 0). See
* comment at GF_EVENT_CHILD_DOWN for a more detailed
@@ -4350,8 +4403,8 @@ _afr_handle_child_up_event (xlator_t *this, xlator_t *child_xlator,
* halo_min_replicas even though it's latency exceeds
* halo_max_latency_msec.
*/
- if (priv->halo_enabled == _gf_true &&
- up_children > priv->halo_min_replicas) {
+ if (child_halo_enabled == _gf_true &&
+ up_children > priv->halo_min_replicas) {
worst_up_child = find_worst_up_child (this);
if (worst_up_child >= 0 &&
priv->child_latency[worst_up_child] >
@@ -4372,8 +4425,8 @@ _afr_handle_child_up_event (xlator_t *this, xlator_t *child_xlator,
goto out;
}
}
- if (priv->halo_enabled == _gf_true &&
- up_children > priv->halo_max_replicas &&
+ if (child_halo_enabled == _gf_true &&
+ up_children > priv->halo_max_replicas &&
!priv->shd.iamshd) {
if (was_down == _gf_true)
priv->event_generation--;
@@ -4383,7 +4436,6 @@ _afr_handle_child_up_event (xlator_t *this, xlator_t *child_xlator,
worst_up_child = idx;
}
priv->child_up[worst_up_child] = 0;
- up_children--;
gf_log (this->name, GF_LOG_INFO,
"Marking child %d down, "
"up_children (%d) > "
@@ -4391,6 +4443,7 @@ _afr_handle_child_up_event (xlator_t *this, xlator_t *child_xlator,
worst_up_child,
up_children,
priv->halo_max_replicas);
+ up_children--;
goto out;
}
out:
@@ -4408,14 +4461,17 @@ out:
void
_afr_handle_child_down_event (xlator_t *this, xlator_t *child_xlator,
- int idx, int64_t child_latency_msec, int32_t *event,
- int32_t *call_psh, int32_t *up_child)
+ int idx, int64_t child_latency_msec,
+ int64_t halo_max_latency_msec, int32_t *event,
+ int32_t *call_psh, int32_t *up_child,
+ gf_boolean_t child_halo_enabled)
{
afr_private_t *priv = NULL;
int i = -1;
int up_children = 0;
int down_children = 0;
int best_down_child = -1;
+ gf_boolean_t swap_child = _gf_false;
priv = this->private;
@@ -4457,10 +4513,19 @@ _afr_handle_child_down_event (xlator_t *this, xlator_t *child_xlator,
* as we want it to be up to date if we are going to
* begin using it synchronously.
*/
- if (priv->halo_enabled == _gf_true &&
- up_children < priv->halo_min_replicas &&
- priv->halo_failover_enabled == _gf_true) {
- best_down_child = find_best_down_child (this);
+ best_down_child = find_best_down_child (this);
+ if (child_halo_enabled == _gf_true) {
+ if (up_children < priv->halo_min_replicas &&
+ priv->halo_failover_enabled == _gf_true)
+ swap_child = _gf_true;
+ else if (up_children < priv->halo_max_replicas &&
+ priv->child_latency[best_down_child] <=
+ halo_max_latency_msec &&
+ priv->halo_failover_enabled == _gf_true)
+ swap_child = _gf_true;
+ }
+
+ if (swap_child) {
if (best_down_child >= 0) {
gf_log (this->name, GF_LOG_INFO,
"Swapping out child %d for "
@@ -4524,12 +4589,14 @@ afr_notify (xlator_t *this, int32_t event,
int ret = -1;
int call_psh = 0;
int up_child = -1;
+ uint64_t latency_samples = 0;
dict_t *input = NULL;
dict_t *output = NULL;
gf_boolean_t had_quorum = _gf_false;
gf_boolean_t has_quorum = _gf_false;
int64_t halo_max_latency_msec = 0;
int64_t child_latency_msec = -1;
+ gf_boolean_t child_halo_enabled = _gf_false;
child_xlator = (xlator_t *)data;
priv = this->private;
@@ -4544,7 +4611,7 @@ afr_notify (xlator_t *this, int32_t event,
* O(N^2) overall, but N is small for AFR so it shouldn't be an issue.
*/
priv->did_discovery = _gf_false;
-
+ latency_samples = child_xlator->client_latency.count;
/* parent xlators dont need to know about every child_up, child_down
* because of afr ha. If all subvolumes go down, child_down has
@@ -4565,9 +4632,12 @@ afr_notify (xlator_t *this, int32_t event,
had_quorum = priv->quorum_count && afr_has_quorum (priv->child_up,
this);
- if (!priv->halo_enabled) {
+ if (!priv->halo_enabled ||
+ latency_samples < priv->halo_min_samples) {
+ child_halo_enabled = _gf_false;
halo_max_latency_msec = INT64_MAX;
} else {
+ child_halo_enabled = _gf_true;
halo_max_latency_msec = _afr_get_halo_latency (this);
}
@@ -4578,7 +4648,7 @@ afr_notify (xlator_t *this, int32_t event,
{
_afr_handle_ping_event (this, child_xlator, idx,
halo_max_latency_msec, &event,
- &child_latency_msec);
+ &child_latency_msec, child_halo_enabled);
}
UNLOCK (&priv->lock);
}
@@ -4611,13 +4681,14 @@ afr_notify (xlator_t *this, int32_t event,
case GF_EVENT_CHILD_UP:
_afr_handle_child_up_event (this, child_xlator,
idx, halo_max_latency_msec, &event, &call_psh,
- &up_child);
+ &up_child, child_halo_enabled);
break;
case GF_EVENT_CHILD_DOWN:
_afr_handle_child_down_event (this, child_xlator, idx,
- child_latency_msec, &event, &call_psh,
- &up_child);
+ child_latency_msec, halo_max_latency_msec,
+ &event, &call_psh, &up_child,
+ child_halo_enabled);
break;
case GF_EVENT_CHILD_CONNECTING:
diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c
index 27309985e82..d4dd8ff8815 100644
--- a/xlators/cluster/afr/src/afr.c
+++ b/xlators/cluster/afr/src/afr.c
@@ -201,6 +201,9 @@ reconfigure (xlator_t *this, dict_t *options)
GF_OPTION_RECONF ("halo-min-replicas", priv->halo_min_replicas, options,
uint32, out);
+ GF_OPTION_RECONF ("halo-min-samples", priv->halo_min_samples, options,
+ uint32, out);
+
GF_OPTION_RECONF ("read-subvolume", read_subvol, options, xlator, out);
GF_OPTION_RECONF ("read-hash-mode", priv->hash_mode,
@@ -435,6 +438,8 @@ init (xlator_t *this)
out);
GF_OPTION_INIT ("halo-min-replicas", priv->halo_min_replicas, uint32,
out);
+ GF_OPTION_INIT ("halo-min-samples", priv->halo_min_samples, uint32,
+ out);
GF_OPTION_INIT ("halo-nfsd-max-latency",
priv->nfsd.halo_max_latency_msec, uint32, out);
@@ -762,8 +767,16 @@ struct volume_options options[] = {
.default_value = "2",
.description = "The minimum number of halo replicas, before adding "
"out of region replicas."
- },
- { .key = {"heal-wait-queue-length"},
+ },
+ { .key = {"halo-min-samples"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 1,
+ .max = 99999,
+ .default_value = "3",
+ .description = "The minimum number of halo latency samples, before "
+ "we start forming the halos."
+ },
+ { .key = {"heal-wait-queue-length"},
.type = GF_OPTION_TYPE_INT,
.min = 0,
.max = 10000, /*Around 100MB with sizeof(afr_local_t)= 10496 bytes*/
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index eebcc12b1ee..f66cdbbf56a 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -153,6 +153,7 @@ typedef struct _afr_private {
uint32_t halo_max_latency_msec;
uint32_t halo_max_replicas;
uint32_t halo_min_replicas;
+ uint32_t halo_min_samples;
afr_self_heald_t shd;
struct afr_nfsd nfsd;
diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
index ab00914e3e2..f28294301f8 100644
--- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c
+++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
@@ -3068,6 +3068,11 @@ struct volopt_map_entry glusterd_volopt_map[] = {
.op_version = 2,
.flags = OPT_FLAG_CLIENT_OPT
},
+ { .key = "cluster.halo-min-samples",
+ .voltype = "cluster/replicate",
+ .op_version = 2,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
{ .key = NULL
}
};
diff --git a/xlators/protocol/client/src/client.c b/xlators/protocol/client/src/client.c
index 7bde1c21410..3e18b4870ae 100644
--- a/xlators/protocol/client/src/client.c
+++ b/xlators/protocol/client/src/client.c
@@ -2750,6 +2750,8 @@ init (xlator_t *this)
this->private = conf;
+ this->client_latency.min = UINT64_MAX;
+
/* If it returns -1, then its a failure, if it returns +1 we need
have to understand that 'this' is subvolume of a xlator which,
will set the remote host and remote subvolume in a setxattr