diff options
Diffstat (limited to 'xlators')
| -rw-r--r-- | xlators/cluster/afr/src/afr-common.c | 101 |
| -rw-r--r-- | xlators/cluster/afr/src/afr.c | 7 |
| -rw-r--r-- | xlators/cluster/afr/src/afr.h | 17 |
3 files changed, 98 insertions, 27 deletions
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index 96f13ce2cee..6863bd02c50 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -1597,19 +1597,18 @@ out:  }  int -afr_least_pending_reads_child(afr_private_t *priv) +afr_least_pending_reads_child(afr_private_t *priv, unsigned char *readable)  {      int i = 0; -    int child = 0; +    int child = -1;      int64_t read_iter = -1;      int64_t pending_read = -1; -    pending_read = GF_ATOMIC_GET(priv->pending_reads[0]); -    for (i = 1; i < priv->child_count; i++) { -        if (AFR_IS_ARBITER_BRICK(priv, i)) +    for (i = 0; i < priv->child_count; i++) { +        if (AFR_IS_ARBITER_BRICK(priv, i) || !readable[i])              continue;          read_iter = GF_ATOMIC_GET(priv->pending_reads[i]); -        if (read_iter < pending_read) { +        if (child == -1 || read_iter < pending_read) {              pending_read = read_iter;              child = i;          } @@ -1618,8 +1617,54 @@ afr_least_pending_reads_child(afr_private_t *priv)      return child;  } +static int32_t +afr_least_latency_child(afr_private_t *priv, unsigned char *readable) +{ +    int32_t i = 0; +    int child = -1; + +    for (i = 0; i < priv->child_count; i++) { +        if (AFR_IS_ARBITER_BRICK(priv, i) || !readable[i] || +            priv->child_latency[i] < 0) +            continue; + +        if (child == -1 || +            priv->child_latency[i] < priv->child_latency[child]) { +            child = i; +        } +    } +    return child; +} + +static int32_t +afr_least_latency_times_pending_reads_child(afr_private_t *priv, +                                            unsigned char *readable) +{ +    int32_t i = 0; +    int child = -1; +    int64_t pending_read = 0; +    int64_t latency = -1; +    int64_t least_latency = -1; + +    for (i = 0; i < priv->child_count; i++) { +        if (AFR_IS_ARBITER_BRICK(priv, i) || !readable[i] || +            
priv->child_latency[i] < 0) +            continue; + +        pending_read = GF_ATOMIC_GET(priv->pending_reads[i]); +        latency = (pending_read + 1) * priv->child_latency[i]; + +        if (child == -1 || latency < least_latency) { +            least_latency = latency; +            child = i; +        } +    } +    return child; +} +  int -afr_hash_child(afr_read_subvol_args_t *args, afr_private_t *priv) +afr_hash_child(afr_read_subvol_args_t *args, afr_private_t *priv, +               unsigned char *readable)  {      uuid_t gfid_copy = {          0, @@ -1628,14 +1673,14 @@ afr_hash_child(afr_read_subvol_args_t *args, afr_private_t *priv)      int child = -1;      switch (priv->hash_mode) { -        case 0: +        case AFR_READ_POLICY_FIRST_UP:              break; -        case 1: +        case AFR_READ_POLICY_GFID_HASH:              gf_uuid_copy(gfid_copy, args->gfid);              child = SuperFastHash((char *)gfid_copy, sizeof(gfid_copy)) %                      priv->child_count;              break; -        case 2: +        case AFR_READ_POLICY_GFID_PID_HASH:              if (args->ia_type != IA_IFDIR) {                  /*                   * Why getpid?  
Because it's one of the cheapest calls @@ -1653,8 +1698,14 @@ afr_hash_child(afr_read_subvol_args_t *args, afr_private_t *priv)              child = SuperFastHash((char *)gfid_copy, sizeof(gfid_copy)) %                      priv->child_count;              break; -        case 3: -            child = afr_least_pending_reads_child(priv); +        case AFR_READ_POLICY_LESS_LOAD: +            child = afr_least_pending_reads_child(priv, readable); +            break; +        case AFR_READ_POLICY_LEAST_LATENCY: +            child = afr_least_latency_child(priv, readable); +            break; +        case AFR_READ_POLICY_LOAD_LATENCY_HYBRID: +            child = afr_least_latency_times_pending_reads_child(priv, readable);              break;      } @@ -1687,7 +1738,7 @@ afr_read_subvol_select_by_policy(inode_t *inode, xlator_t *this,      }      /* second preference - use hashed mode */ -    read_subvol = afr_hash_child(&local_args, priv); +    read_subvol = afr_hash_child(&local_args, priv, readable);      if (read_subvol >= 0 && readable[read_subvol])          return read_subvol; @@ -5174,7 +5225,10 @@ __afr_handle_child_up_event(xlator_t *this, xlator_t *child_xlator,       * want to set the child_latency to MAX to indicate       * the child needs ping data to be available before doing child-up       */ -    if (child_latency_msec < 0 && priv->halo_enabled) { +    if (!priv->halo_enabled) +        goto out; + +    if (child_latency_msec < 0) {          /*set to INT64_MAX-1 so that it is found for best_down_child*/          priv->child_latency[idx] = AFR_HALO_MAX_LATENCY;      } @@ -5214,7 +5268,7 @@ __afr_handle_child_up_event(xlator_t *this, xlator_t *child_xlator,                       "up_children (%d) > halo_max_replicas (%d)",                       worst_up_child, up_children, priv->halo_max_replicas);      } - +out:      if (up_children == 1) {          gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_SUBVOL_UP,                 "Subvolume '%s' came back up; " @@ 
-5277,7 +5331,7 @@ __afr_handle_child_down_event(xlator_t *this, xlator_t *child_xlator, int idx,       * as we want it to be up to date if we are going to       * begin using it synchronously.       */ -    if (up_children < priv->halo_min_replicas) { +    if (priv->halo_enabled && up_children < priv->halo_min_replicas) {          best_down_child = find_best_down_child(this);          if (best_down_child >= 0) {              gf_msg_debug(this->name, 0, @@ -5289,7 +5343,6 @@ __afr_handle_child_down_event(xlator_t *this, xlator_t *child_xlator, int idx,              *up_child = best_down_child;          }      } -      for (i = 0; i < priv->child_count; i++)          if (priv->child_up[i] == 0)              down_children++; @@ -5461,13 +5514,13 @@ afr_notify(xlator_t *this, int32_t event, void *data, void *data2)      had_quorum = priv->quorum_count &&                   afr_has_quorum(priv->child_up, this, NULL); -    if (priv->halo_enabled) { -        halo_max_latency_msec = afr_get_halo_latency(this); +    if (event == GF_EVENT_CHILD_PING) { +        child_latency_msec = (int64_t)(uintptr_t)data2; +        if (priv->halo_enabled) { +            halo_max_latency_msec = afr_get_halo_latency(this); -        if (event == GF_EVENT_CHILD_PING) {              /* Calculates the child latency and sets event               */ -            child_latency_msec = (int64_t)(uintptr_t)data2;              LOCK(&priv->lock);              {                  __afr_handle_ping_event(this, child_xlator, idx, @@ -5475,6 +5528,12 @@ afr_notify(xlator_t *this, int32_t event, void *data, void *data2)                                          child_latency_msec);              }              UNLOCK(&priv->lock); +        } else { +            LOCK(&priv->lock); +            { +                priv->child_latency[idx] = child_latency_msec; +            } +            UNLOCK(&priv->lock);          }      } diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c index 
67ff3409bb9..33a25cc5c0c 100644 --- a/xlators/cluster/afr/src/afr.c +++ b/xlators/cluster/afr/src/afr.c @@ -790,7 +790,7 @@ struct volume_options options[] = {      {.key = {"read-hash-mode"},       .type = GF_OPTION_TYPE_INT,       .min = 0, -     .max = 3, +     .max = 5,       .default_value = "1",       .op_version = {2},       .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC, @@ -803,7 +803,10 @@ struct volume_options options[] = {           "1 = hash by GFID of file (all clients use "           "same subvolume).\n"           "2 = hash by GFID of file and client PID.\n" -         "3 = brick having the least outstanding read requests."}, +         "3 = brick having the least outstanding read requests.\n" +         "4 = brick having the least network ping latency.\n" +         "5 = Hybrid mode between 3 and 4, ie least value among " +         "network-latency multiplied by outstanding-read-requests."},      {          .key = {"choose-local"},          .type = GF_OPTION_TYPE_BOOL, diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index 1a409ec625b..db83b395e02 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -96,6 +96,15 @@ typedef int (*afr_changelog_resume_t)(call_frame_t *frame, xlator_t *this);      } while (0)  typedef enum { +    AFR_READ_POLICY_FIRST_UP, +    AFR_READ_POLICY_GFID_HASH, +    AFR_READ_POLICY_GFID_PID_HASH, +    AFR_READ_POLICY_LESS_LOAD, +    AFR_READ_POLICY_LEAST_LATENCY, +    AFR_READ_POLICY_LOAD_LATENCY_HYBRID, +} afr_read_hash_mode_t; + +typedef enum {      AFR_FAV_CHILD_NONE,      AFR_FAV_CHILD_BY_SIZE,      AFR_FAV_CHILD_BY_CTIME, @@ -183,10 +192,10 @@ typedef struct _afr_private {      gf_boolean_t metadata_splitbrain_forced_heal; /* on/off */      int read_child;                               /* read-subvolume */ -    unsigned int hash_mode;     /* for when read_child is not set */ -    gf_atomic_t *pending_reads; /*No. 
of pending read cbks per child.*/ -    int favorite_child;         /* subvolume to be preferred in resolving -                                            split-brain cases */ +    afr_read_hash_mode_t hash_mode; /* for when read_child is not set */ +    gf_atomic_t *pending_reads;     /*No. of pending read cbks per child.*/ +    int favorite_child;             /* subvolume to be preferred in resolving +                                                split-brain cases */      afr_favorite_child_policy fav_child_policy; /*Policy to use for automatic                                                resolution of split-brains.*/  | 
