diff options
| -rw-r--r-- | rpc/rpc-lib/src/rpc-clnt-ping.c | 31 | ||||
| -rw-r--r-- | tests/basic/halo-failover-disabled.t | 6 | ||||
| -rw-r--r-- | tests/basic/halo.t | 1 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-common.c | 155 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr.c | 17 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr.h | 1 | ||||
| -rw-r--r-- | xlators/mgmt/glusterd/src/glusterd-volume-set.c | 5 | ||||
| -rw-r--r-- | xlators/protocol/client/src/client.c | 2 |
8 files changed, 161 insertions, 57 deletions
diff --git a/rpc/rpc-lib/src/rpc-clnt-ping.c b/rpc/rpc-lib/src/rpc-clnt-ping.c index a48a265b71d..7ce066dec5f 100644 --- a/rpc/rpc-lib/src/rpc-clnt-ping.c +++ b/rpc/rpc-lib/src/rpc-clnt-ping.c @@ -173,17 +173,28 @@ out: } void -_update_client_latency (call_frame_t *frame, double elapsed_usec) +_update_client_latency (const rpc_clnt_connection_t *conn, + call_frame_t *frame, + uint64_t elapsed_usec) { fop_latency_t *lat; lat = &frame->this->client_latency; + if (elapsed_usec < lat->min) { + lat->min = elapsed_usec; + } + + if (elapsed_usec > lat->max) { + lat->max = elapsed_usec; + } + lat->total += elapsed_usec; lat->count++; lat->mean = lat->mean + (elapsed_usec - lat->mean) / lat->count; - gf_log (THIS->name, GF_LOG_DEBUG, "Ping latency is %0.6lf ms, " - "avg: %0.6lf ms, count:%ld", elapsed_usec / 1000.0, + gf_log (THIS->name, GF_LOG_DEBUG, "%s - Ping latency is %0.6lf ms, " + "avg: %0.6lf ms, count:%ld", + conn->trans->peerinfo.identifier, elapsed_usec / 1000.0, lat->mean / 1000.0, lat->count); } @@ -217,13 +228,6 @@ rpc_clnt_ping_cbk (struct rpc_req *req, struct iovec *iov, int count, pthread_mutex_lock (&conn->lock); { - timespec_now (&now); - timespec_sub (&local->submit_time, &now, &delta); - latency_usec = delta.tv_sec * 1000000UL + - delta.tv_nsec / 1000UL; - - _update_client_latency (frame, (double)latency_usec); - if (req->rpc_status == -1) { unref = rpc_clnt_remove_ping_timer_locked (local->rpc); if (unref) { @@ -240,6 +244,13 @@ rpc_clnt_ping_cbk (struct rpc_req *req, struct iovec *iov, int count, goto unlock; } + timespec_now (&now); + timespec_sub (&local->submit_time, &now, &delta); + latency_usec = delta.tv_sec * 1000000UL + + delta.tv_nsec / 1000UL; + + _update_client_latency (conn, frame, latency_usec); + call_notify = _gf_true; unref = rpc_clnt_remove_ping_timer_locked (local->rpc); if (__rpc_clnt_rearm_ping_timer (local->rpc, rpc_clnt_start_ping) == -1) { diff --git a/tests/basic/halo-failover-disabled.t b/tests/basic/halo-failover-disabled.t index 05ccd7e822a..31a1d166404 100644 --- a/tests/basic/halo-failover-disabled.t +++ b/tests/basic/halo-failover-disabled.t @@ -25,6 +25,7 @@ TEST $CLI volume set $V0 cluster.halo-enabled True TEST $CLI volume set $V0 cluster.halo-max-latency 9999 TEST $CLI volume set $V0 cluster.halo-shd-max-latency 9999 TEST $CLI volume set $V0 cluster.halo-max-replicas 2 +TEST $CLI volume set $V0 cluster.halo-min-samples 1 TEST $CLI volume set $V0 cluster.halo-failover-enabled off TEST $CLI volume set $V0 cluster.quorum-type fixed TEST $CLI volume set $V0 cluster.quorum-count 2 @@ -44,9 +45,8 @@ TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 cd $M0 # Write some data to the mount -dd if=/dev/urandom of=$M0/test bs=1k count=200 oflag=sync &> /dev/null & +TEST dd if=/dev/urandom of=$M0/test bs=1k count=200 conv=fsync -sleep 0.5 # Kill the first brick, fail-over to 3rd TEST kill_brick $V0 $H0 $B0/${V0}0 @@ -56,7 +56,7 @@ TEST kill_brick $V0 $H0 $B0/${V0}0 # will not be fullfilled. If we waited 1000 second the brick would # indeed be activated based on ping time, but for our test we want # the decision to be solely "down event" driven, not ping driven. -TEST ! dd if=/dev/urandom of=$M0/test_rw bs=1M count=1 +TEST ! dd if=/dev/urandom of=$M0/test_rw bs=1M count=1 conv=fsync TEST $CLI volume start $V0 force sleep 2 diff --git a/tests/basic/halo.t b/tests/basic/halo.t index 03fc0f88a19..25aca3442ab 100644 --- a/tests/basic/halo.t +++ b/tests/basic/halo.t @@ -23,6 +23,7 @@ TEST $CLI volume set $V0 cluster.background-self-heal-count 0 TEST $CLI volume set $V0 cluster.shd-max-threads 1 TEST $CLI volume set $V0 cluster.halo-enabled True TEST $CLI volume set $V0 cluster.halo-max-replicas 2 +TEST $CLI volume set $V0 cluster.halo-min-samples 1 TEST $CLI volume set $V0 cluster.heal-timeout 5 TEST $CLI volume set $V0 cluster.self-heal-daemon off TEST $CLI volume set $V0 cluster.eager-lock off diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index 6973433a03e..8e46117b025 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -43,6 +43,9 @@ #include "afr-self-heald.h" #include "afr-messages.h" +#define CHILD_UP_STR "UP" +#define CHILD_DOWN_STR "DOWN" + call_frame_t * afr_copy_frame (call_frame_t *base) { @@ -4250,48 +4253,76 @@ find_worst_up_child (xlator_t *this) return worst_child; } -void +static void _afr_handle_ping_event (xlator_t *this, xlator_t *child_xlator, - const int idx, int64_t halo_max_latency_msec, int32_t *event, - int64_t *child_latency_msec) + const int idx, const int64_t halo_max_latency_msec, + int32_t *event, int64_t *child_latency_msec, + gf_boolean_t child_halo_enabled) { afr_private_t *priv = NULL; int i = -1; int up_children = 0; + int best_down_child = 0; + uint64_t latency_samples = 0; + char *child_state_str = NULL; priv = this->private; - *child_latency_msec = child_xlator->client_latency.mean / 1000.0; + /* Base it off the _minimum_ latency we've ever seen */ + *child_latency_msec = child_xlator->client_latency.min / 1000.0; + latency_samples = child_xlator->client_latency.count; priv->child_latency[idx] = *child_latency_msec; - for (i = 0; i < priv->child_count; i++) - if (priv->child_up[i] == 1) + for (i = 0; i < priv->child_count; i++) { + if (priv->child_up[i] == 1) { up_children++; + child_state_str = CHILD_UP_STR; + } else { + child_state_str = CHILD_DOWN_STR; + } + gf_log (child_xlator->name, GF_LOG_DEBUG, + "Child %d halo state: %s (%"PRIi64"ms)", + i, child_state_str, priv->child_latency[i]); + } - if (priv->halo_enabled && + /* Don't do anything until you have some minimum numbner of + * latency samples */ + if (priv->halo_enabled == _gf_true && child_halo_enabled == _gf_false) { + gf_log (child_xlator->name, GF_LOG_INFO, "In-sufficient " + " number of latency samples (%" PRIu64 + " < %d), halo in-active.", + latency_samples, priv->halo_min_samples); + } + + /* + * Case 1: This child's latency exceeds the maximum allowable + * for this halo. + */ + if (child_halo_enabled && *child_latency_msec > halo_max_latency_msec && priv->child_up[idx] == 1 && up_children > priv->halo_min_replicas) { - if ((up_children - 1) < - priv->halo_min_replicas && - priv->halo_failover_enabled) { + if (find_worst_up_child (this) == idx) { gf_log (child_xlator->name, GF_LOG_INFO, - "Overriding halo threshold, " - "min replicas: %d", - priv->halo_min_replicas); - } else { - gf_log (child_xlator->name, GF_LOG_INFO, - "Child latency (%ld ms) " - "exceeds halo threshold (%ld), " - "marking child down.", - *child_latency_msec, - halo_max_latency_msec); + "Child latency (%"PRIi64"ms) " + "exceeds halo threshold (%"PRIi64"), " + "marking child down, " + "min_replicas (%d) still " + "satisfied.", + *child_latency_msec, + halo_max_latency_msec, + priv->halo_min_replicas); *event = GF_EVENT_CHILD_DOWN; } - } else if ((priv->halo_enabled == _gf_false || - *child_latency_msec < halo_max_latency_msec) && + /* + * Case 2: Child latency is within halo and currently marked down, + * mark it up. + */ + } else if ((child_halo_enabled == _gf_false || + *child_latency_msec <= halo_max_latency_msec) && priv->child_up[idx] == 0) { - if (up_children < priv->halo_max_replicas) { + if (child_halo_enabled == _gf_false || + up_children < priv->halo_max_replicas) { gf_log (child_xlator->name, GF_LOG_INFO, "Child latency (%ld ms) " "below halo threshold (%ld) or halo is " @@ -4305,13 +4336,35 @@ _afr_handle_ping_event (xlator_t *this, xlator_t *child_xlator, "max replicas (%d) reached.", idx, priv->halo_max_replicas); } + /* + * Case 3: Child latency is within halo,and currently marked up, + * mark it down if it's the highest latency child and the + * number of up children is greater than halo_max_replicas. + */ + } else if ((child_halo_enabled == _gf_true && + *child_latency_msec <= halo_max_latency_msec) && + priv->child_up[idx] == 1) { + if (find_worst_up_child (this) == idx && + up_children > priv->halo_max_replicas && + !priv->shd.iamshd) { + gf_log (child_xlator->name, GF_LOG_INFO, + "Child latency (%"PRIi64"ms) " + "exceeds halo threshold (%"PRIi64"), " + "but halo_max_replicas (%d) exceeded, " + "marking child down.", + *child_latency_msec, + halo_max_latency_msec, + priv->halo_max_replicas); + *event = GF_EVENT_CHILD_DOWN; + } } } void _afr_handle_child_up_event (xlator_t *this, xlator_t *child_xlator, const int idx, int64_t halo_max_latency_msec, - int32_t *event, int32_t *call_psh, int32_t *up_child) + int32_t *event, int32_t *call_psh, int32_t *up_child, + gf_boolean_t child_halo_enabled) { afr_private_t *priv = NULL; int i = -1; @@ -4321,7 +4374,7 @@ _afr_handle_child_up_event (xlator_t *this, xlator_t *child_xlator, priv = this->private; - /* + /* * This only really counts if the child was never up * (value = -1) or had been down (value = 0). See * comment at GF_EVENT_CHILD_DOWN for a more detailed @@ -4350,8 +4403,8 @@ _afr_handle_child_up_event (xlator_t *this, xlator_t *child_xlator, * halo_min_replicas even though it's latency exceeds * halo_max_latency_msec. */ - if (priv->halo_enabled == _gf_true && - up_children > priv->halo_min_replicas) { + if (child_halo_enabled == _gf_true && + up_children > priv->halo_min_replicas) { worst_up_child = find_worst_up_child (this); if (worst_up_child >= 0 && priv->child_latency[worst_up_child] > @@ -4372,8 +4425,8 @@ _afr_handle_child_up_event (xlator_t *this, xlator_t *child_xlator, goto out; } } - if (priv->halo_enabled == _gf_true && - up_children > priv->halo_max_replicas && + if (child_halo_enabled == _gf_true && + up_children > priv->halo_max_replicas && !priv->shd.iamshd) { if (was_down == _gf_true) priv->event_generation--; @@ -4383,7 +4436,6 @@ _afr_handle_child_up_event (xlator_t *this, xlator_t *child_xlator, worst_up_child = idx; } priv->child_up[worst_up_child] = 0; - up_children--; gf_log (this->name, GF_LOG_INFO, "Marking child %d down, " "up_children (%d) > " @@ -4391,6 +4443,7 @@ _afr_handle_child_up_event (xlator_t *this, xlator_t *child_xlator, worst_up_child, up_children, priv->halo_max_replicas); + up_children--; goto out; } out: @@ -4408,14 +4461,17 @@ out: void _afr_handle_child_down_event (xlator_t *this, xlator_t *child_xlator, - int idx, int64_t child_latency_msec, int32_t *event, - int32_t *call_psh, int32_t *up_child) + int idx, int64_t child_latency_msec, + int64_t halo_max_latency_msec, int32_t *event, + int32_t *call_psh, int32_t *up_child, + gf_boolean_t child_halo_enabled) { afr_private_t *priv = NULL; int i = -1; int up_children = 0; int down_children = 0; int best_down_child = -1; + gf_boolean_t swap_child = _gf_false; priv = this->private; @@ -4457,10 +4513,19 @@ _afr_handle_child_down_event (xlator_t *this, xlator_t *child_xlator, * as we want it to be up to date if we are going to * begin using it synchronously. */ - if (priv->halo_enabled == _gf_true && - up_children < priv->halo_min_replicas && - priv->halo_failover_enabled == _gf_true) { - best_down_child = find_best_down_child (this); + best_down_child = find_best_down_child (this); + if (child_halo_enabled == _gf_true) { + if (up_children < priv->halo_min_replicas && + priv->halo_failover_enabled == _gf_true) + swap_child = _gf_true; + else if (up_children < priv->halo_max_replicas && + priv->child_latency[best_down_child] <= + halo_max_latency_msec && + priv->halo_failover_enabled == _gf_true) + swap_child = _gf_true; + } + + if (swap_child) { if (best_down_child >= 0) { gf_log (this->name, GF_LOG_INFO, "Swapping out child %d for " @@ -4524,12 +4589,14 @@ afr_notify (xlator_t *this, int32_t event, int ret = -1; int call_psh = 0; int up_child = -1; + uint64_t latency_samples = 0; dict_t *input = NULL; dict_t *output = NULL; gf_boolean_t had_quorum = _gf_false; gf_boolean_t has_quorum = _gf_false; int64_t halo_max_latency_msec = 0; int64_t child_latency_msec = -1; + gf_boolean_t child_halo_enabled = _gf_false; child_xlator = (xlator_t *)data; priv = this->private; @@ -4544,7 +4611,7 @@ afr_notify (xlator_t *this, int32_t event, * O(N^2) overall, but N is small for AFR so it shouldn't be an issue. */ priv->did_discovery = _gf_false; - + latency_samples = child_xlator->client_latency.count; /* parent xlators dont need to know about every child_up, child_down * because of afr ha. If all subvolumes go down, child_down has @@ -4565,9 +4632,12 @@ afr_notify (xlator_t *this, int32_t event, had_quorum = priv->quorum_count && afr_has_quorum (priv->child_up, this); - if (!priv->halo_enabled) { + if (!priv->halo_enabled || + latency_samples < priv->halo_min_samples) { + child_halo_enabled = _gf_false; halo_max_latency_msec = INT64_MAX; } else { + child_halo_enabled = _gf_true; halo_max_latency_msec = _afr_get_halo_latency (this); } @@ -4578,7 +4648,7 @@ afr_notify (xlator_t *this, int32_t event, { _afr_handle_ping_event (this, child_xlator, idx, halo_max_latency_msec, &event, - &child_latency_msec); + &child_latency_msec, child_halo_enabled); } UNLOCK (&priv->lock); } @@ -4611,13 +4681,14 @@ afr_notify (xlator_t *this, int32_t event, case GF_EVENT_CHILD_UP: _afr_handle_child_up_event (this, child_xlator, idx, halo_max_latency_msec, &event, &call_psh, - &up_child); + &up_child, child_halo_enabled); break; case GF_EVENT_CHILD_DOWN: _afr_handle_child_down_event (this, child_xlator, idx, - child_latency_msec, &event, &call_psh, - &up_child); + child_latency_msec, halo_max_latency_msec, + &event, &call_psh, &up_child, + child_halo_enabled); break; case GF_EVENT_CHILD_CONNECTING: diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c index 27309985e82..d4dd8ff8815 100644 --- a/xlators/cluster/afr/src/afr.c +++ b/xlators/cluster/afr/src/afr.c @@ -201,6 +201,9 @@ reconfigure (xlator_t *this, dict_t *options) GF_OPTION_RECONF ("halo-min-replicas", priv->halo_min_replicas, options, uint32, out); + GF_OPTION_RECONF ("halo-min-samples", priv->halo_min_samples, options, + uint32, out); + GF_OPTION_RECONF ("read-subvolume", read_subvol, options, xlator, out); GF_OPTION_RECONF ("read-hash-mode", priv->hash_mode, @@ -435,6 +438,8 @@ init (xlator_t *this) out); GF_OPTION_INIT ("halo-min-replicas", priv->halo_min_replicas, uint32, out); + GF_OPTION_INIT ("halo-min-samples", priv->halo_min_samples, uint32, + out); GF_OPTION_INIT ("halo-nfsd-max-latency", priv->nfsd.halo_max_latency_msec, uint32, out); @@ -762,8 +767,16 @@ struct volume_options options[] = { .default_value = "2", .description = "The minimum number of halo replicas, before adding " "out of region replicas." - }, - { .key = {"heal-wait-queue-length"}, + }, + { .key = {"halo-min-samples"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = 99999, + .default_value = "3", + .description = "The minimum number of halo latency samples, before " + "we start forming the halos." + }, + { .key = {"heal-wait-queue-length"}, .type = GF_OPTION_TYPE_INT, .min = 0, .max = 10000, /*Around 100MB with sizeof(afr_local_t)= 10496 bytes*/ diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index eebcc12b1ee..f66cdbbf56a 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -153,6 +153,7 @@ typedef struct _afr_private { uint32_t halo_max_latency_msec; uint32_t halo_max_replicas; uint32_t halo_min_replicas; + uint32_t halo_min_samples; afr_self_heald_t shd; struct afr_nfsd nfsd; diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c index ab00914e3e2..f28294301f8 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c +++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c @@ -3068,6 +3068,11 @@ struct volopt_map_entry glusterd_volopt_map[] = { .op_version = 2, .flags = OPT_FLAG_CLIENT_OPT }, + { .key = "cluster.halo-min-samples", + .voltype = "cluster/replicate", + .op_version = 2, + .flags = OPT_FLAG_CLIENT_OPT + }, { .key = NULL } }; diff --git a/xlators/protocol/client/src/client.c b/xlators/protocol/client/src/client.c index 7bde1c21410..3e18b4870ae 100644 --- a/xlators/protocol/client/src/client.c +++ b/xlators/protocol/client/src/client.c @@ -2750,6 +2750,8 @@ init (xlator_t *this) this->private = conf; + this->client_latency.min = UINT64_MAX; + /* If it returns -1, then its a failure, if it returns +1 we need have to understand that 'this' is subvolume of a xlator which, will set the remote host and remote subvolume in a setxattr |
