From 2e11f61029d0b36893c4aa859f99c22b6202ad12 Mon Sep 17 00:00:00 2001 From: Mohammed Rafi KC Date: Wed, 19 Jun 2019 15:11:59 +0530 Subject: afr/read: Implement latency based read child selection Network latency is an important factor in selecting a read subvolume, so this patch adds two new read policies. 1) Least latency: the latency of each child is measured during a GF_DUMP rpc call, and the readable child with the least latency is picked as the read subvol. 2) Hybrid mode: an effective latency is calculated for each child by multiplying its latency by the number of outstanding pending read requests plus one, and the child with the least effective latency is chosen. Change-Id: Ia49c8a08ab61f7dcdad8b8950aa4d338e7accf97 fixes: #520 Signed-off-by: Mohammed Rafi KC --- xlators/cluster/afr/src/afr-common.c | 101 +++++++++++++++++++++++++++-------- 1 file changed, 80 insertions(+), 21 deletions(-) (limited to 'xlators/cluster/afr/src/afr-common.c') diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index 96f13ce2cee..6863bd02c50 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -1597,19 +1597,18 @@ out: } int -afr_least_pending_reads_child(afr_private_t *priv) +afr_least_pending_reads_child(afr_private_t *priv, unsigned char *readable) { int i = 0; - int child = 0; + int child = -1; int64_t read_iter = -1; int64_t pending_read = -1; - pending_read = GF_ATOMIC_GET(priv->pending_reads[0]); - for (i = 1; i < priv->child_count; i++) { - if (AFR_IS_ARBITER_BRICK(priv, i)) + for (i = 0; i < priv->child_count; i++) { + if (AFR_IS_ARBITER_BRICK(priv, i) || !readable[i]) continue; read_iter = GF_ATOMIC_GET(priv->pending_reads[i]); - if (read_iter < pending_read) { + if (child == -1 || read_iter < pending_read) { pending_read = read_iter; child = i; } @@ -1618,8 +1617,54 @@ afr_least_pending_reads_child(afr_private_t *priv) return child; } +static int32_t +afr_least_latency_child(afr_private_t *priv, unsigned char *readable) +{ + int32_t i = 0; + int child = -1; + + for (i
= 0; i < priv->child_count; i++) { + if (AFR_IS_ARBITER_BRICK(priv, i) || !readable[i] || + priv->child_latency[i] < 0) + continue; + + if (child == -1 || + priv->child_latency[i] < priv->child_latency[child]) { + child = i; + } + } + return child; +} + +static int32_t +afr_least_latency_times_pending_reads_child(afr_private_t *priv, + unsigned char *readable) +{ + int32_t i = 0; + int child = -1; + int64_t pending_read = 0; + int64_t latency = -1; + int64_t least_latency = -1; + + for (i = 0; i < priv->child_count; i++) { + if (AFR_IS_ARBITER_BRICK(priv, i) || !readable[i] || + priv->child_latency[i] < 0) + continue; + + pending_read = GF_ATOMIC_GET(priv->pending_reads[i]); + latency = (pending_read + 1) * priv->child_latency[i]; + + if (child == -1 || latency < least_latency) { + least_latency = latency; + child = i; + } + } + return child; +} + int -afr_hash_child(afr_read_subvol_args_t *args, afr_private_t *priv) +afr_hash_child(afr_read_subvol_args_t *args, afr_private_t *priv, + unsigned char *readable) { uuid_t gfid_copy = { 0, @@ -1628,14 +1673,14 @@ afr_hash_child(afr_read_subvol_args_t *args, afr_private_t *priv) int child = -1; switch (priv->hash_mode) { - case 0: + case AFR_READ_POLICY_FIRST_UP: break; - case 1: + case AFR_READ_POLICY_GFID_HASH: gf_uuid_copy(gfid_copy, args->gfid); child = SuperFastHash((char *)gfid_copy, sizeof(gfid_copy)) % priv->child_count; break; - case 2: + case AFR_READ_POLICY_GFID_PID_HASH: if (args->ia_type != IA_IFDIR) { /* * Why getpid? 
Because it's one of the cheapest calls @@ -1653,8 +1698,14 @@ afr_hash_child(afr_read_subvol_args_t *args, afr_private_t *priv) child = SuperFastHash((char *)gfid_copy, sizeof(gfid_copy)) % priv->child_count; break; - case 3: - child = afr_least_pending_reads_child(priv); + case AFR_READ_POLICY_LESS_LOAD: + child = afr_least_pending_reads_child(priv, readable); + break; + case AFR_READ_POLICY_LEAST_LATENCY: + child = afr_least_latency_child(priv, readable); + break; + case AFR_READ_POLICY_LOAD_LATENCY_HYBRID: + child = afr_least_latency_times_pending_reads_child(priv, readable); break; } @@ -1687,7 +1738,7 @@ afr_read_subvol_select_by_policy(inode_t *inode, xlator_t *this, } /* second preference - use hashed mode */ - read_subvol = afr_hash_child(&local_args, priv); + read_subvol = afr_hash_child(&local_args, priv, readable); if (read_subvol >= 0 && readable[read_subvol]) return read_subvol; @@ -5174,7 +5225,10 @@ __afr_handle_child_up_event(xlator_t *this, xlator_t *child_xlator, * want to set the child_latency to MAX to indicate * the child needs ping data to be available before doing child-up */ - if (child_latency_msec < 0 && priv->halo_enabled) { + if (!priv->halo_enabled) + goto out; + + if (child_latency_msec < 0) { /*set to INT64_MAX-1 so that it is found for best_down_child*/ priv->child_latency[idx] = AFR_HALO_MAX_LATENCY; } @@ -5214,7 +5268,7 @@ __afr_handle_child_up_event(xlator_t *this, xlator_t *child_xlator, "up_children (%d) > halo_max_replicas (%d)", worst_up_child, up_children, priv->halo_max_replicas); } - +out: if (up_children == 1) { gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_SUBVOL_UP, "Subvolume '%s' came back up; " @@ -5277,7 +5331,7 @@ __afr_handle_child_down_event(xlator_t *this, xlator_t *child_xlator, int idx, * as we want it to be up to date if we are going to * begin using it synchronously. 
*/ - if (up_children < priv->halo_min_replicas) { + if (priv->halo_enabled && up_children < priv->halo_min_replicas) { best_down_child = find_best_down_child(this); if (best_down_child >= 0) { gf_msg_debug(this->name, 0, @@ -5289,7 +5343,6 @@ __afr_handle_child_down_event(xlator_t *this, xlator_t *child_xlator, int idx, *up_child = best_down_child; } } - for (i = 0; i < priv->child_count; i++) if (priv->child_up[i] == 0) down_children++; @@ -5461,13 +5514,13 @@ afr_notify(xlator_t *this, int32_t event, void *data, void *data2) had_quorum = priv->quorum_count && afr_has_quorum(priv->child_up, this, NULL); - if (priv->halo_enabled) { - halo_max_latency_msec = afr_get_halo_latency(this); + if (event == GF_EVENT_CHILD_PING) { + child_latency_msec = (int64_t)(uintptr_t)data2; + if (priv->halo_enabled) { + halo_max_latency_msec = afr_get_halo_latency(this); - if (event == GF_EVENT_CHILD_PING) { /* Calculates the child latency and sets event */ - child_latency_msec = (int64_t)(uintptr_t)data2; LOCK(&priv->lock); { __afr_handle_ping_event(this, child_xlator, idx, @@ -5475,6 +5528,12 @@ afr_notify(xlator_t *this, int32_t event, void *data, void *data2) child_latency_msec); } UNLOCK(&priv->lock); + } else { + LOCK(&priv->lock); + { + priv->child_latency[idx] = child_latency_msec; + } + UNLOCK(&priv->lock); } } -- cgit