afr/read: Implement latency based read child selection

Network latency is an important factor in selecting a read subvolume, so this patch adds two new policies. 1) We measure the latency of a child during a GF_DUMP rpc call, then use this latency to pick the read subvol having the least latency. 2) The second one is a hybrid mode, which calculates the effective latency by multiplying the number of outstanding pending read requests by the latency, and chooses the child with the least value. Change-Id: Ia49c8a08ab61f7dcdad8b8950aa4d338e7accf97 fixes: #520 Signed-off-by: Mohammed Rafi KC <rkavunga@redhat.com>
author: Mohammed Rafi KC <rkavunga@redhat.com> 2019-06-19 15:11:59 +0530
committer: Pranith Kumar Karampuri <pkarampu@redhat.com> 2019-06-20 12:30:59 +0000
commit: 2e11f61029d0b36893c4aa859f99c22b6202ad12 (patch)
tree: 632750b74d48b29d7ebf644886603467279eeb12 /xlators/cluster/afr
parent: bcdb77023e2efbbf06ad576851f0f38a0b8b11ab (diff)
3 files changed, 98 insertions, 27 deletions
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index 96f13ce2cee..6863bd02c50 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -1597,19 +1597,18 @@ out:
 }
 
 int
-afr_least_pending_reads_child(afr_private_t *priv)
+afr_least_pending_reads_child(afr_private_t *priv, unsigned char *readable)
 {
     int i = 0;
-    int child = 0;
+    int child = -1;
     int64_t read_iter = -1;
     int64_t pending_read = -1;
 
-    pending_read = GF_ATOMIC_GET(priv->pending_reads[0]);
-    for (i = 1; i < priv->child_count; i++) {
-        if (AFR_IS_ARBITER_BRICK(priv, i))
+    for (i = 0; i < priv->child_count; i++) {
+        if (AFR_IS_ARBITER_BRICK(priv, i) || !readable[i])
             continue;
         read_iter = GF_ATOMIC_GET(priv->pending_reads[i]);
-        if (read_iter < pending_read) {
+        if (child == -1 || read_iter < pending_read) {
             pending_read = read_iter;
             child = i;
         }
@@ -1618,8 +1617,54 @@ afr_least_pending_reads_child(afr_private_t *priv)
     return child;
 }
 
+static int32_t
+afr_least_latency_child(afr_private_t *priv, unsigned char *readable)
+{
+    int32_t i = 0;
+    int child = -1;
+
+    for (i = 0; i < priv->child_count; i++) {
+        if (AFR_IS_ARBITER_BRICK(priv, i) || !readable[i] ||
+            priv->child_latency[i] < 0)
+            continue;
+
+        if (child == -1 ||
+            priv->child_latency[i] < priv->child_latency[child]) {
+            child = i;
+        }
+    }
+    return child;
+}
+
+static int32_t
+afr_least_latency_times_pending_reads_child(afr_private_t *priv,
+                                            unsigned char *readable)
+{
+    int32_t i = 0;
+    int child = -1;
+    int64_t pending_read = 0;
+    int64_t latency = -1;
+    int64_t least_latency = -1;
+
+    for (i = 0; i < priv->child_count; i++) {
+        if (AFR_IS_ARBITER_BRICK(priv, i) || !readable[i] ||
+            priv->child_latency[i] < 0)
+            continue;
+
+        pending_read = GF_ATOMIC_GET(priv->pending_reads[i]);
+        latency = (pending_read + 1) * priv->child_latency[i];
+
+        if (child == -1 || latency < least_latency) {
+            least_latency = latency;
+            child = i;
+        }
+    }
+    return child;
+}
+
 int
-afr_hash_child(afr_read_subvol_args_t *args, afr_private_t *priv)
+afr_hash_child(afr_read_subvol_args_t *args, afr_private_t *priv,
+               unsigned char *readable)
 {
     uuid_t gfid_copy = {
         0,
@@ -1628,14 +1673,14 @@ afr_hash_child(afr_read_subvol_args_t *args, afr_private_t *priv)
     int child = -1;
 
     switch (priv->hash_mode) {
-        case 0:
+        case AFR_READ_POLICY_FIRST_UP:
             break;
-        case 1:
+        case AFR_READ_POLICY_GFID_HASH:
             gf_uuid_copy(gfid_copy, args->gfid);
             child = SuperFastHash((char *)gfid_copy, sizeof(gfid_copy)) %
                     priv->child_count;
             break;
-        case 2:
+        case AFR_READ_POLICY_GFID_PID_HASH:
             if (args->ia_type != IA_IFDIR) {
                 /*
                  * Why getpid?  Because it's one of the cheapest calls
@@ -1653,8 +1698,14 @@ afr_hash_child(afr_read_subvol_args_t *args, afr_private_t *priv)
             child = SuperFastHash((char *)gfid_copy, sizeof(gfid_copy)) %
                     priv->child_count;
             break;
-        case 3:
-            child = afr_least_pending_reads_child(priv);
+        case AFR_READ_POLICY_LESS_LOAD:
+            child = afr_least_pending_reads_child(priv, readable);
+            break;
+        case AFR_READ_POLICY_LEAST_LATENCY:
+            child = afr_least_latency_child(priv, readable);
+            break;
+        case AFR_READ_POLICY_LOAD_LATENCY_HYBRID:
+            child = afr_least_latency_times_pending_reads_child(priv, readable);
             break;
     }
 
@@ -1687,7 +1738,7 @@ afr_read_subvol_select_by_policy(inode_t *inode, xlator_t *this,
     }
 
     /* second preference - use hashed mode */
-    read_subvol = afr_hash_child(&local_args, priv);
+    read_subvol = afr_hash_child(&local_args, priv, readable);
     if (read_subvol >= 0 && readable[read_subvol])
         return read_subvol;
 
@@ -5174,7 +5225,10 @@ __afr_handle_child_up_event(xlator_t *this, xlator_t *child_xlator,
      * want to set the child_latency to MAX to indicate
      * the child needs ping data to be available before doing child-up
      */
-    if (child_latency_msec < 0 && priv->halo_enabled) {
+    if (!priv->halo_enabled)
+        goto out;
+
+    if (child_latency_msec < 0) {
         /*set to INT64_MAX-1 so that it is found for best_down_child*/
         priv->child_latency[idx] = AFR_HALO_MAX_LATENCY;
     }
@@ -5214,7 +5268,7 @@ __afr_handle_child_up_event(xlator_t *this, xlator_t *child_xlator,
                      "up_children (%d) > halo_max_replicas (%d)",
                      worst_up_child, up_children, priv->halo_max_replicas);
     }
-
+out:
     if (up_children == 1) {
         gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_SUBVOL_UP,
                "Subvolume '%s' came back up; "
@@ -5277,7 +5331,7 @@ __afr_handle_child_down_event(xlator_t *this, xlator_t *child_xlator, int idx,
      * as we want it to be up to date if we are going to
      * begin using it synchronously.
      */
-    if (up_children < priv->halo_min_replicas) {
+    if (priv->halo_enabled && up_children < priv->halo_min_replicas) {
         best_down_child = find_best_down_child(this);
         if (best_down_child >= 0) {
             gf_msg_debug(this->name, 0,
@@ -5289,7 +5343,6 @@ __afr_handle_child_down_event(xlator_t *this, xlator_t *child_xlator, int idx,
             *up_child = best_down_child;
         }
     }
-
     for (i = 0; i < priv->child_count; i++)
         if (priv->child_up[i] == 0)
             down_children++;
@@ -5461,13 +5514,13 @@ afr_notify(xlator_t *this, int32_t event, void *data, void *data2)
 
     had_quorum = priv->quorum_count &&
                  afr_has_quorum(priv->child_up, this, NULL);
-    if (priv->halo_enabled) {
-        halo_max_latency_msec = afr_get_halo_latency(this);
+    if (event == GF_EVENT_CHILD_PING) {
+        child_latency_msec = (int64_t)(uintptr_t)data2;
+        if (priv->halo_enabled) {
+            halo_max_latency_msec = afr_get_halo_latency(this);
 
-        if (event == GF_EVENT_CHILD_PING) {
             /* Calculates the child latency and sets event
              */
-            child_latency_msec = (int64_t)(uintptr_t)data2;
             LOCK(&priv->lock);
             {
                 __afr_handle_ping_event(this, child_xlator, idx,
@@ -5475,6 +5528,12 @@ afr_notify(xlator_t *this, int32_t event, void *data, void *data2)
                                         child_latency_msec);
             }
             UNLOCK(&priv->lock);
+        } else {
+            LOCK(&priv->lock);
+            {
+                priv->child_latency[idx] = child_latency_msec;
+            }
+            UNLOCK(&priv->lock);
         }
     }
 
diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c
index 67ff3409bb9..33a25cc5c0c 100644
--- a/xlators/cluster/afr/src/afr.c
+++ b/xlators/cluster/afr/src/afr.c
@@ -790,7 +790,7 @@ struct volume_options options[] = {
     {.key = {"read-hash-mode"},
      .type = GF_OPTION_TYPE_INT,
      .min = 0,
-     .max = 3,
+     .max = 5,
      .default_value = "1",
      .op_version = {2},
      .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
@@ -803,7 +803,10 @@ struct volume_options options[] = {
          "1 = hash by GFID of file (all clients use "
          "same subvolume).\n"
          "2 = hash by GFID of file and client PID.\n"
-         "3 = brick having the least outstanding read requests."},
+         "3 = brick having the least outstanding read requests.\n"
+         "4 = brick having the least network ping latency.\n"
+         "5 = Hybrid mode between 3 and 4, ie least value among "
+         "network-latency multiplied by outstanding-read-requests."},
     {
         .key = {"choose-local"},
         .type = GF_OPTION_TYPE_BOOL,
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index 1a409ec625b..db83b395e02 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -96,6 +96,15 @@ typedef int (*afr_changelog_resume_t)(call_frame_t *frame, xlator_t *this);
     } while (0)
 
 typedef enum {
+    AFR_READ_POLICY_FIRST_UP,
+    AFR_READ_POLICY_GFID_HASH,
+    AFR_READ_POLICY_GFID_PID_HASH,
+    AFR_READ_POLICY_LESS_LOAD,
+    AFR_READ_POLICY_LEAST_LATENCY,
+    AFR_READ_POLICY_LOAD_LATENCY_HYBRID,
+} afr_read_hash_mode_t;
+
+typedef enum {
     AFR_FAV_CHILD_NONE,
     AFR_FAV_CHILD_BY_SIZE,
     AFR_FAV_CHILD_BY_CTIME,
@@ -183,10 +192,10 @@ typedef struct _afr_private {
 
     gf_boolean_t metadata_splitbrain_forced_heal; /* on/off */
     int read_child;                               /* read-subvolume */
-    unsigned int hash_mode;     /* for when read_child is not set */
-    gf_atomic_t *pending_reads; /*No. of pending read cbks per child.*/
-    int favorite_child;         /* subvolume to be preferred in resolving
-                                            split-brain cases */
+    afr_read_hash_mode_t hash_mode; /* for when read_child is not set */
+    gf_atomic_t *pending_reads;     /*No. of pending read cbks per child.*/
+    int favorite_child;             /* subvolume to be preferred in resolving
+                                                split-brain cases */
 
     afr_favorite_child_policy fav_child_policy; /*Policy to use for automatic
                                               resolution of split-brains.*/
author	Mohammed Rafi KC <rkavunga@redhat.com>	2019-06-19 15:11:59 +0530
committer	Pranith Kumar Karampuri <pkarampu@redhat.com>	2019-06-20 12:30:59 +0000
commit	2e11f61029d0b36893c4aa859f99c22b6202ad12 (patch)
tree	632750b74d48b29d7ebf644886603467279eeb12 /xlators/cluster/afr
parent	bcdb77023e2efbbf06ad576851f0f38a0b8b11ab (diff)