8 files changed, 161 insertions, 57 deletions
diff --git a/rpc/rpc-lib/src/rpc-clnt-ping.c b/rpc/rpc-lib/src/rpc-clnt-ping.c
index a48a265b71d..7ce066dec5f 100644
--- a/rpc/rpc-lib/src/rpc-clnt-ping.c
+++ b/rpc/rpc-lib/src/rpc-clnt-ping.c
@@ -173,17 +173,28 @@ out:
 }
 
 void
-_update_client_latency (call_frame_t *frame, double elapsed_usec)
+_update_client_latency (const rpc_clnt_connection_t *conn,
+                        call_frame_t *frame,
+                        uint64_t elapsed_usec)
 {
         fop_latency_t *lat;
 
         lat = &frame->this->client_latency;
 
+        if (elapsed_usec < lat->min) {
+                lat->min = elapsed_usec;
+        }
+
+        if (elapsed_usec > lat->max) {
+                lat->max = elapsed_usec;
+        }
+
         lat->total += elapsed_usec;
         lat->count++;
         lat->mean = lat->mean + (elapsed_usec - lat->mean) / lat->count;
-        gf_log (THIS->name, GF_LOG_DEBUG, "Ping latency is %0.6lf ms, "
-                "avg: %0.6lf ms, count:%ld", elapsed_usec / 1000.0,
+        gf_log (THIS->name, GF_LOG_DEBUG, "%s - Ping latency is %0.6lf ms, "
+                "avg: %0.6lf ms, count:%ld",
+                conn->trans->peerinfo.identifier, elapsed_usec / 1000.0,
                 lat->mean / 1000.0, lat->count);
 }
 
@@ -217,13 +228,6 @@ rpc_clnt_ping_cbk (struct rpc_req *req, struct iovec *iov, int count,
 
         pthread_mutex_lock (&conn->lock);
         {
-                timespec_now (&now);
-                timespec_sub (&local->submit_time, &now, &delta);
-                latency_usec = delta.tv_sec * 1000000UL +
-                               delta.tv_nsec / 1000UL;
-
-                _update_client_latency (frame, (double)latency_usec);
-
                 if (req->rpc_status == -1) {
                         unref = rpc_clnt_remove_ping_timer_locked (local->rpc);
                         if (unref) {
@@ -240,6 +244,13 @@ rpc_clnt_ping_cbk (struct rpc_req *req, struct iovec *iov, int count,
                         goto unlock;
                 }
 
+                timespec_now (&now);
+                timespec_sub (&local->submit_time, &now, &delta);
+                latency_usec = delta.tv_sec * 1000000UL +
+                               delta.tv_nsec / 1000UL;
+
+                _update_client_latency (conn, frame, latency_usec);
+                call_notify = _gf_true;
                 unref = rpc_clnt_remove_ping_timer_locked (local->rpc);
                 if (__rpc_clnt_rearm_ping_timer (local->rpc,
                                                  rpc_clnt_start_ping) == -1) {
diff --git a/tests/basic/halo-failover-disabled.t b/tests/basic/halo-failover-disabled.t
index 05ccd7e822a..31a1d166404 100644
--- a/tests/basic/halo-failover-disabled.t
+++ b/tests/basic/halo-failover-disabled.t
@@ -25,6 +25,7 @@ TEST $CLI volume set $V0 cluster.halo-enabled True
 TEST $CLI volume set $V0 cluster.halo-max-latency 9999
 TEST $CLI volume set $V0 cluster.halo-shd-max-latency 9999
 TEST $CLI volume set $V0 cluster.halo-max-replicas 2
+TEST $CLI volume set $V0 cluster.halo-min-samples 1
 TEST $CLI volume set $V0 cluster.halo-failover-enabled off
 TEST $CLI volume set $V0 cluster.quorum-type fixed
 TEST $CLI volume set $V0 cluster.quorum-count 2
@@ -44,9 +45,8 @@ TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0
 cd $M0
 
 # Write some data to the mount
-dd if=/dev/urandom of=$M0/test bs=1k count=200 oflag=sync &> /dev/null &
+TEST dd if=/dev/urandom of=$M0/test bs=1k count=200 conv=fsync
 
-sleep 0.5
 # Kill the first brick, fail-over to 3rd
 TEST kill_brick $V0 $H0 $B0/${V0}0
 
@@ -56,7 +56,7 @@ TEST kill_brick $V0 $H0 $B0/${V0}0
 # will not be fullfilled.  If we waited 1000 second the brick would
 # indeed be activated based on ping time, but for our test we want
 # the decision to be solely "down event" driven, not ping driven.
-TEST ! dd if=/dev/urandom of=$M0/test_rw bs=1M count=1
+TEST ! dd if=/dev/urandom of=$M0/test_rw bs=1M count=1 conv=fsync
 
 TEST $CLI volume start $V0 force
 sleep 2
diff --git a/tests/basic/halo.t b/tests/basic/halo.t
index 03fc0f88a19..25aca3442ab 100644
--- a/tests/basic/halo.t
+++ b/tests/basic/halo.t
@@ -23,6 +23,7 @@ TEST $CLI volume set $V0 cluster.background-self-heal-count 0
 TEST $CLI volume set $V0 cluster.shd-max-threads 1
 TEST $CLI volume set $V0 cluster.halo-enabled True
 TEST $CLI volume set $V0 cluster.halo-max-replicas 2
+TEST $CLI volume set $V0 cluster.halo-min-samples 1
 TEST $CLI volume set $V0 cluster.heal-timeout 5
 TEST $CLI volume set $V0 cluster.self-heal-daemon off
 TEST $CLI volume set $V0 cluster.eager-lock off
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index 6973433a03e..8e46117b025 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -43,6 +43,9 @@
 #include "afr-self-heald.h"
 #include "afr-messages.h"
 
+#define CHILD_UP_STR "UP"
+#define CHILD_DOWN_STR "DOWN"
+
 call_frame_t *
 afr_copy_frame (call_frame_t *base)
 {
@@ -4250,48 +4253,76 @@ find_worst_up_child (xlator_t *this)
         return worst_child;
 }
 
-void
+static void
 _afr_handle_ping_event (xlator_t *this, xlator_t *child_xlator,
-                const int idx, int64_t halo_max_latency_msec, int32_t *event,
-                int64_t *child_latency_msec)
+                const int idx, const int64_t halo_max_latency_msec,
+                int32_t *event, int64_t *child_latency_msec,
+                gf_boolean_t child_halo_enabled)
 {
         afr_private_t   *priv               = NULL;
         int             i                   = -1;
         int             up_children         = 0;
+        int             best_down_child     = 0;
+        uint64_t        latency_samples     = 0;
+        char            *child_state_str    = NULL;
 
         priv = this->private;
 
-        *child_latency_msec = child_xlator->client_latency.mean / 1000.0;
+        /* Base it off the _minimum_ latency we've ever seen */
+        *child_latency_msec = child_xlator->client_latency.min / 1000.0;
+        latency_samples = child_xlator->client_latency.count;
         priv->child_latency[idx] = *child_latency_msec;
 
-        for (i = 0; i < priv->child_count; i++)
-                if (priv->child_up[i] == 1)
+        for (i = 0; i < priv->child_count; i++) {
+                if (priv->child_up[i] == 1) {
                         up_children++;
+                        child_state_str = CHILD_UP_STR;
+                } else {
+                    child_state_str = CHILD_DOWN_STR;
+                }
+                gf_log (child_xlator->name, GF_LOG_DEBUG,
+                        "Child %d halo state: %s (%"PRIi64"ms)",
+                        i, child_state_str, priv->child_latency[i]);
+        }
 
-        if (priv->halo_enabled &&
+        /* Don't do anything until you have some minimum number of
+         * latency samples */
+        if (priv->halo_enabled == _gf_true && child_halo_enabled == _gf_false) {
+                gf_log (child_xlator->name, GF_LOG_INFO, "In-sufficient "
+                        " number of latency samples (%" PRIu64
+                        " < %d), halo in-active.",
+                        latency_samples, priv->halo_min_samples);
+        }
+
+        /*
+         * Case 1: This child's latency exceeds the maximum allowable
+         * for this halo.
+         */
+        if (child_halo_enabled &&
             *child_latency_msec > halo_max_latency_msec &&
             priv->child_up[idx] == 1 &&
             up_children > priv->halo_min_replicas) {
-                if ((up_children - 1) <
-                    priv->halo_min_replicas &&
-                    priv->halo_failover_enabled) {
+                if (find_worst_up_child (this) == idx) {
                         gf_log (child_xlator->name, GF_LOG_INFO,
-                               "Overriding halo threshold, "
-                               "min replicas: %d",
-                               priv->halo_min_replicas);
-                } else {
-                        gf_log (child_xlator->name, GF_LOG_INFO,
-                                "Child latency (%ld ms) "
-                                "exceeds halo threshold (%ld), "
-                                "marking child down.",
-                                *child_latency_msec,
-                                halo_max_latency_msec);
+                                "Child latency (%"PRIi64"ms) "
+                                 "exceeds halo threshold (%"PRIi64"), "
+                                 "marking child down, "
+                                 "min_replicas (%d) still "
+                                 "satisfied.",
+                                 *child_latency_msec,
+                                 halo_max_latency_msec,
+                                 priv->halo_min_replicas);
                         *event = GF_EVENT_CHILD_DOWN;
                 }
-        } else if ((priv->halo_enabled == _gf_false ||
-                    *child_latency_msec < halo_max_latency_msec) &&
+        /*
+         * Case 2: Child latency is within halo and currently marked down,
+         * mark it up.
+         */
+        } else if ((child_halo_enabled == _gf_false ||
+                    *child_latency_msec <= halo_max_latency_msec) &&
                    priv->child_up[idx] == 0) {
-                if (up_children < priv->halo_max_replicas) {
+                if (child_halo_enabled == _gf_false ||
+                        up_children < priv->halo_max_replicas) {
                         gf_log (child_xlator->name, GF_LOG_INFO,
                                 "Child latency (%ld ms) "
                                 "below halo threshold (%ld) or halo is "
@@ -4305,13 +4336,35 @@ _afr_handle_ping_event (xlator_t *this, xlator_t *child_xlator,
                             "max replicas (%d) reached.", idx,
                             priv->halo_max_replicas);
                 }
+        /*
+         * Case 3: Child latency is within halo, and currently marked up,
+         * mark it down if it's the highest latency child and the
+         * number of up children is greater than halo_max_replicas.
+         */
+        } else if ((child_halo_enabled == _gf_true &&
+                        *child_latency_msec <= halo_max_latency_msec) &&
+                        priv->child_up[idx] == 1) {
+                if (find_worst_up_child (this) == idx &&
+                                up_children > priv->halo_max_replicas &&
+                                !priv->shd.iamshd) {
+                        gf_log (child_xlator->name, GF_LOG_INFO,
+                                "Child latency (%"PRIi64"ms) "
+                                "exceeds halo threshold (%"PRIi64"), "
+                                "but halo_max_replicas (%d) exceeded, "
+                                "marking child down.",
+                                *child_latency_msec,
+                                halo_max_latency_msec,
+                                priv->halo_max_replicas);
+                        *event = GF_EVENT_CHILD_DOWN;
+                }
         }
 }
 
 void
 _afr_handle_child_up_event (xlator_t *this, xlator_t *child_xlator,
                 const int idx, int64_t halo_max_latency_msec,
-                int32_t *event, int32_t *call_psh, int32_t *up_child)
+                int32_t *event, int32_t *call_psh, int32_t *up_child,
+                gf_boolean_t child_halo_enabled)
 {
         afr_private_t   *priv               = NULL;
         int             i                   = -1;
@@ -4321,7 +4374,7 @@ _afr_handle_child_up_event (xlator_t *this, xlator_t *child_xlator,
 
         priv = this->private;
 
-        /*
+       /*
          * This only really counts if the child was never up
          * (value = -1) or had been down (value = 0).  See
          * comment at GF_EVENT_CHILD_DOWN for a more detailed
@@ -4350,8 +4403,8 @@ _afr_handle_child_up_event (xlator_t *this, xlator_t *child_xlator,
          * halo_min_replicas even though it's latency exceeds
          * halo_max_latency_msec.
          */
-        if (priv->halo_enabled == _gf_true &&
-                        up_children > priv->halo_min_replicas) {
+        if (child_halo_enabled == _gf_true &&
+            up_children > priv->halo_min_replicas) {
                 worst_up_child = find_worst_up_child (this);
                 if (worst_up_child >= 0 &&
                     priv->child_latency[worst_up_child] >
@@ -4372,8 +4425,8 @@ _afr_handle_child_up_event (xlator_t *this, xlator_t *child_xlator,
                         goto out;
                 }
         }
-        if (priv->halo_enabled == _gf_true &&
-                        up_children > priv->halo_max_replicas &&
+        if (child_halo_enabled == _gf_true &&
+            up_children > priv->halo_max_replicas &&
             !priv->shd.iamshd) {
                 if (was_down == _gf_true)
                         priv->event_generation--;
@@ -4383,7 +4436,6 @@ _afr_handle_child_up_event (xlator_t *this, xlator_t *child_xlator,
                         worst_up_child = idx;
                 }
                 priv->child_up[worst_up_child] = 0;
-                up_children--;
                 gf_log (this->name, GF_LOG_INFO,
                         "Marking child %d down, "
                         "up_children (%d) > "
@@ -4391,6 +4443,7 @@ _afr_handle_child_up_event (xlator_t *this, xlator_t *child_xlator,
                         worst_up_child,
                         up_children,
                         priv->halo_max_replicas);
+                up_children--;
                 goto out;
         }
 out:
@@ -4408,14 +4461,17 @@ out:
 
 void
 _afr_handle_child_down_event (xlator_t *this, xlator_t *child_xlator,
-                int idx, int64_t child_latency_msec, int32_t *event,
-                int32_t *call_psh, int32_t *up_child)
+                int idx, int64_t child_latency_msec,
+                int64_t halo_max_latency_msec, int32_t *event,
+                int32_t *call_psh, int32_t *up_child,
+                gf_boolean_t child_halo_enabled)
 {
         afr_private_t   *priv               = NULL;
         int             i                   = -1;
         int             up_children         = 0;
         int             down_children       = 0;
         int             best_down_child     = -1;
+        gf_boolean_t    swap_child          = _gf_false;
 
         priv = this->private;
 
@@ -4457,10 +4513,19 @@ _afr_handle_child_down_event (xlator_t *this, xlator_t *child_xlator,
          * as we want it to be up to date if we are going to
          * begin using it synchronously.
          */
-        if (priv->halo_enabled == _gf_true &&
-            up_children < priv->halo_min_replicas &&
-            priv->halo_failover_enabled == _gf_true) {
-                best_down_child = find_best_down_child (this);
+        best_down_child = find_best_down_child (this);
+        if (child_halo_enabled == _gf_true) {
+                if (up_children < priv->halo_min_replicas &&
+                                priv->halo_failover_enabled == _gf_true)
+                        swap_child = _gf_true;
+                else if (up_children < priv->halo_max_replicas &&
+                                priv->child_latency[best_down_child] <=
+                                halo_max_latency_msec &&
+                                priv->halo_failover_enabled == _gf_true)
+                        swap_child = _gf_true;
+        }
+
+        if (swap_child) {
                 if (best_down_child >= 0) {
                         gf_log (this->name, GF_LOG_INFO,
                                 "Swapping out child %d for "
@@ -4524,12 +4589,14 @@ afr_notify (xlator_t *this, int32_t event,
         int             ret                 = -1;
         int             call_psh            = 0;
         int             up_child            = -1;
+        uint64_t        latency_samples     = 0;
         dict_t          *input              = NULL;
         dict_t          *output             = NULL;
         gf_boolean_t    had_quorum          = _gf_false;
         gf_boolean_t    has_quorum          = _gf_false;
         int64_t         halo_max_latency_msec = 0;
         int64_t         child_latency_msec   = -1;
+        gf_boolean_t    child_halo_enabled   = _gf_false;
 
         child_xlator = (xlator_t *)data;
         priv = this->private;
@@ -4544,7 +4611,7 @@ afr_notify (xlator_t *this, int32_t event,
          * O(N^2) overall, but N is small for AFR so it shouldn't be an issue.
          */
         priv->did_discovery = _gf_false;
-
+        latency_samples = child_xlator->client_latency.count;
 
         /* parent xlators dont need to know about every child_up, child_down
          * because of afr ha. If all subvolumes go down, child_down has
@@ -4565,9 +4632,12 @@ afr_notify (xlator_t *this, int32_t event,
         had_quorum = priv->quorum_count && afr_has_quorum (priv->child_up,
                                                            this);
 
-        if (!priv->halo_enabled) {
+        if (!priv->halo_enabled ||
+            latency_samples < priv->halo_min_samples) {
+                child_halo_enabled = _gf_false;
                 halo_max_latency_msec = INT64_MAX;
         } else {
+                child_halo_enabled = _gf_true;
                 halo_max_latency_msec = _afr_get_halo_latency (this);
         }
 
@@ -4578,7 +4648,7 @@ afr_notify (xlator_t *this, int32_t event,
                 {
                         _afr_handle_ping_event (this, child_xlator, idx,
                                 halo_max_latency_msec, &event,
-                                &child_latency_msec);
+                                &child_latency_msec, child_halo_enabled);
                 }
                 UNLOCK (&priv->lock);
         }
@@ -4611,13 +4681,14 @@ afr_notify (xlator_t *this, int32_t event,
                 case GF_EVENT_CHILD_UP:
                         _afr_handle_child_up_event (this, child_xlator,
                                 idx, halo_max_latency_msec, &event, &call_psh,
-                                &up_child);
+                                &up_child, child_halo_enabled);
                         break;
 
                 case GF_EVENT_CHILD_DOWN:
                         _afr_handle_child_down_event (this, child_xlator, idx,
-                                child_latency_msec, &event, &call_psh,
-                                &up_child);
+                                child_latency_msec, halo_max_latency_msec,
+                                &event, &call_psh, &up_child,
+                                child_halo_enabled);
                         break;
 
                 case GF_EVENT_CHILD_CONNECTING:
diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c
index 27309985e82..d4dd8ff8815 100644
--- a/xlators/cluster/afr/src/afr.c
+++ b/xlators/cluster/afr/src/afr.c
@@ -201,6 +201,9 @@ reconfigure (xlator_t *this, dict_t *options)
         GF_OPTION_RECONF ("halo-min-replicas", priv->halo_min_replicas, options,
                               uint32, out);
 
+        GF_OPTION_RECONF ("halo-min-samples", priv->halo_min_samples, options,
+                              uint32, out);
+
         GF_OPTION_RECONF ("read-subvolume", read_subvol, options, xlator, out);
 
         GF_OPTION_RECONF ("read-hash-mode", priv->hash_mode,
@@ -435,6 +438,8 @@ init (xlator_t *this)
                         out);
         GF_OPTION_INIT ("halo-min-replicas", priv->halo_min_replicas, uint32,
                         out);
+        GF_OPTION_INIT ("halo-min-samples", priv->halo_min_samples, uint32,
+                        out);
 
         GF_OPTION_INIT ("halo-nfsd-max-latency",
                         priv->nfsd.halo_max_latency_msec, uint32, out);
@@ -762,8 +767,16 @@ struct volume_options options[] = {
           .default_value = "2",
            .description = "The minimum number of halo replicas, before adding "
                           "out of region replicas."
-         },
-         { .key  = {"heal-wait-queue-length"},
+        },
+        { .key   = {"halo-min-samples"},
+          .type  = GF_OPTION_TYPE_INT,
+          .min   = 1,
+          .max   = 99999,
+          .default_value = "3",
+           .description = "The minimum number of halo latency samples, before "
+                          "we start forming the halos."
+        },
+        { .key  = {"heal-wait-queue-length"},
           .type = GF_OPTION_TYPE_INT,
           .min  = 0,
           .max  = 10000, /*Around 100MB with sizeof(afr_local_t)= 10496 bytes*/
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index eebcc12b1ee..f66cdbbf56a 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -153,6 +153,7 @@ typedef struct _afr_private {
         uint32_t               halo_max_latency_msec;
         uint32_t               halo_max_replicas;
         uint32_t               halo_min_replicas;
+        uint32_t               halo_min_samples;
 
         afr_self_heald_t       shd;
         struct afr_nfsd        nfsd;
diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
index ab00914e3e2..f28294301f8 100644
--- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c
+++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
@@ -3068,6 +3068,11 @@ struct volopt_map_entry glusterd_volopt_map[] = {
           .op_version = 2,
           .flags      = OPT_FLAG_CLIENT_OPT
         },
+        { .key        = "cluster.halo-min-samples",
+          .voltype    = "cluster/replicate",
+          .op_version = 2,
+          .flags      = OPT_FLAG_CLIENT_OPT
+        },
         { .key         = NULL
         }
 };
diff --git a/xlators/protocol/client/src/client.c b/xlators/protocol/client/src/client.c
index 7bde1c21410..3e18b4870ae 100644
--- a/xlators/protocol/client/src/client.c
+++ b/xlators/protocol/client/src/client.c
@@ -2750,6 +2750,8 @@ init (xlator_t *this)
 
         this->private = conf;
 
+        this->client_latency.min = UINT64_MAX;
+
         /* If it returns -1, then its a failure, if it returns +1 we need
            have to understand that 'this' is subvolume of a xlator which,
            will set the remote host and remote subvolume in a setxattr