summaryrefslogtreecommitdiffstats
path: root/xlators/cluster/afr/src/afr-common.c
diff options
context:
space:
mode:
Diffstat (limited to 'xlators/cluster/afr/src/afr-common.c')
-rw-r--r--xlators/cluster/afr/src/afr-common.c101
1 files changed, 80 insertions, 21 deletions
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index 96f13ce2cee..6863bd02c50 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -1597,19 +1597,18 @@ out:
}
int
-afr_least_pending_reads_child(afr_private_t *priv)
+afr_least_pending_reads_child(afr_private_t *priv, unsigned char *readable)
{
int i = 0;
- int child = 0;
+ int child = -1;
int64_t read_iter = -1;
int64_t pending_read = -1;
- pending_read = GF_ATOMIC_GET(priv->pending_reads[0]);
- for (i = 1; i < priv->child_count; i++) {
- if (AFR_IS_ARBITER_BRICK(priv, i))
+ for (i = 0; i < priv->child_count; i++) {
+ if (AFR_IS_ARBITER_BRICK(priv, i) || !readable[i])
continue;
read_iter = GF_ATOMIC_GET(priv->pending_reads[i]);
- if (read_iter < pending_read) {
+ if (child == -1 || read_iter < pending_read) {
pending_read = read_iter;
child = i;
}
@@ -1618,8 +1617,54 @@ afr_least_pending_reads_child(afr_private_t *priv)
return child;
}
+static int32_t
+afr_least_latency_child(afr_private_t *priv, unsigned char *readable)
+{
+ int32_t i = 0;
+ int child = -1;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (AFR_IS_ARBITER_BRICK(priv, i) || !readable[i] ||
+ priv->child_latency[i] < 0)
+ continue;
+
+ if (child == -1 ||
+ priv->child_latency[i] < priv->child_latency[child]) {
+ child = i;
+ }
+ }
+ return child;
+}
+
+static int32_t
+afr_least_latency_times_pending_reads_child(afr_private_t *priv,
+ unsigned char *readable)
+{
+ int32_t i = 0;
+ int child = -1;
+ int64_t pending_read = 0;
+ int64_t latency = -1;
+ int64_t least_latency = -1;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (AFR_IS_ARBITER_BRICK(priv, i) || !readable[i] ||
+ priv->child_latency[i] < 0)
+ continue;
+
+ pending_read = GF_ATOMIC_GET(priv->pending_reads[i]);
+ latency = (pending_read + 1) * priv->child_latency[i];
+
+ if (child == -1 || latency < least_latency) {
+ least_latency = latency;
+ child = i;
+ }
+ }
+ return child;
+}
+
int
-afr_hash_child(afr_read_subvol_args_t *args, afr_private_t *priv)
+afr_hash_child(afr_read_subvol_args_t *args, afr_private_t *priv,
+ unsigned char *readable)
{
uuid_t gfid_copy = {
0,
@@ -1628,14 +1673,14 @@ afr_hash_child(afr_read_subvol_args_t *args, afr_private_t *priv)
int child = -1;
switch (priv->hash_mode) {
- case 0:
+ case AFR_READ_POLICY_FIRST_UP:
break;
- case 1:
+ case AFR_READ_POLICY_GFID_HASH:
gf_uuid_copy(gfid_copy, args->gfid);
child = SuperFastHash((char *)gfid_copy, sizeof(gfid_copy)) %
priv->child_count;
break;
- case 2:
+ case AFR_READ_POLICY_GFID_PID_HASH:
if (args->ia_type != IA_IFDIR) {
/*
* Why getpid? Because it's one of the cheapest calls
@@ -1653,8 +1698,14 @@ afr_hash_child(afr_read_subvol_args_t *args, afr_private_t *priv)
child = SuperFastHash((char *)gfid_copy, sizeof(gfid_copy)) %
priv->child_count;
break;
- case 3:
- child = afr_least_pending_reads_child(priv);
+ case AFR_READ_POLICY_LESS_LOAD:
+ child = afr_least_pending_reads_child(priv, readable);
+ break;
+ case AFR_READ_POLICY_LEAST_LATENCY:
+ child = afr_least_latency_child(priv, readable);
+ break;
+ case AFR_READ_POLICY_LOAD_LATENCY_HYBRID:
+ child = afr_least_latency_times_pending_reads_child(priv, readable);
break;
}
@@ -1687,7 +1738,7 @@ afr_read_subvol_select_by_policy(inode_t *inode, xlator_t *this,
}
/* second preference - use hashed mode */
- read_subvol = afr_hash_child(&local_args, priv);
+ read_subvol = afr_hash_child(&local_args, priv, readable);
if (read_subvol >= 0 && readable[read_subvol])
return read_subvol;
@@ -5174,7 +5225,10 @@ __afr_handle_child_up_event(xlator_t *this, xlator_t *child_xlator,
* want to set the child_latency to MAX to indicate
* the child needs ping data to be available before doing child-up
*/
- if (child_latency_msec < 0 && priv->halo_enabled) {
+ if (!priv->halo_enabled)
+ goto out;
+
+ if (child_latency_msec < 0) {
/*set to INT64_MAX-1 so that it is found for best_down_child*/
priv->child_latency[idx] = AFR_HALO_MAX_LATENCY;
}
@@ -5214,7 +5268,7 @@ __afr_handle_child_up_event(xlator_t *this, xlator_t *child_xlator,
"up_children (%d) > halo_max_replicas (%d)",
worst_up_child, up_children, priv->halo_max_replicas);
}
-
+out:
if (up_children == 1) {
gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_SUBVOL_UP,
"Subvolume '%s' came back up; "
@@ -5277,7 +5331,7 @@ __afr_handle_child_down_event(xlator_t *this, xlator_t *child_xlator, int idx,
* as we want it to be up to date if we are going to
* begin using it synchronously.
*/
- if (up_children < priv->halo_min_replicas) {
+ if (priv->halo_enabled && up_children < priv->halo_min_replicas) {
best_down_child = find_best_down_child(this);
if (best_down_child >= 0) {
gf_msg_debug(this->name, 0,
@@ -5289,7 +5343,6 @@ __afr_handle_child_down_event(xlator_t *this, xlator_t *child_xlator, int idx,
*up_child = best_down_child;
}
}
-
for (i = 0; i < priv->child_count; i++)
if (priv->child_up[i] == 0)
down_children++;
@@ -5461,13 +5514,13 @@ afr_notify(xlator_t *this, int32_t event, void *data, void *data2)
had_quorum = priv->quorum_count &&
afr_has_quorum(priv->child_up, this, NULL);
- if (priv->halo_enabled) {
- halo_max_latency_msec = afr_get_halo_latency(this);
+ if (event == GF_EVENT_CHILD_PING) {
+ child_latency_msec = (int64_t)(uintptr_t)data2;
+ if (priv->halo_enabled) {
+ halo_max_latency_msec = afr_get_halo_latency(this);
- if (event == GF_EVENT_CHILD_PING) {
/* Calculates the child latency and sets event
*/
- child_latency_msec = (int64_t)(uintptr_t)data2;
LOCK(&priv->lock);
{
__afr_handle_ping_event(this, child_xlator, idx,
@@ -5475,6 +5528,12 @@ afr_notify(xlator_t *this, int32_t event, void *data, void *data2)
child_latency_msec);
}
UNLOCK(&priv->lock);
+ } else {
+ LOCK(&priv->lock);
+ {
+ priv->child_latency[idx] = child_latency_msec;
+ }
+ UNLOCK(&priv->lock);
}
}