From c87bd439ef12adc70dc580e75304121c3cd38e9a Mon Sep 17 00:00:00 2001
From: Ravishankar N
Date: Thu, 22 Mar 2018 17:55:15 +0530
Subject: afr: add new value for read-hash-mode volume option

Updates: #363

This new value (3) will try to wind read requests to the child of AFR
having the least amount of pending requests in its queue.

Change-Id: If6bda2aac9bf7aec3fc39622f78659313c4b6508
Signed-off-by: Ravishankar N
---
 xlators/cluster/afr/src/afr-common.c      | 103 +++++++++++++++++++++++-------
 xlators/cluster/afr/src/afr-mem-types.h   |   1 +
 xlators/cluster/afr/src/afr-read-txn.c    |  51 +++++++++++++--
 xlators/cluster/afr/src/afr-transaction.h |   6 +++
 xlators/cluster/afr/src/afr.c             |  15 +++--
 xlators/cluster/afr/src/afr.h             |   5 ++
 6 files changed, 149 insertions(+), 32 deletions(-)

(limited to 'xlators/cluster/afr')

diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index bfd8c2e8c2c..685a349ac49 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -1630,38 +1630,86 @@ out:
         return ret;
 }
 
-
+/*
+ * Pick the child with the fewest outstanding read requests. The scan is
+ * seeded with child 0, skips arbiter bricks among the remaining children,
+ * and resolves ties in favour of the lowest index.
+ */
 int
-afr_hash_child (afr_read_subvol_args_t *args, int32_t child_count, int hashmode)
+afr_least_pending_reads_child (afr_private_t *priv)
 {
-        uuid_t gfid_copy = {0,};
-        pid_t pid;
+        int i = 0;
+        int child = 0;
+        int64_t read_iter = -1;
+        int64_t pending_read = -1;
 
-        if (!hashmode) {
-                return -1;
+        pending_read = GF_ATOMIC_GET (priv->pending_reads[0]);
+        for (i = 1; i < priv->child_count; i++) {
+                if (AFR_IS_ARBITER_BRICK(priv, i))
+                        continue;
+                read_iter = GF_ATOMIC_GET(priv->pending_reads[i]);
+                if (read_iter < pending_read) {
+                        pending_read = read_iter;
+                        child = i;
+                }
         }
 
-        gf_uuid_copy (gfid_copy, args->gfid);
+        return child;
+}
 
-        if ((hashmode > 1) && (args->ia_type != IA_IFDIR)) {
-                /*
-                 * Why getpid?  Because it's one of the cheapest calls
-                 * available - faster than gethostname etc. - and returns a
-                 * constant-length value that's sure to be shorter than a UUID.
-                 * It's still very unlikely to be the same across clients, so
-                 * it still provides good mixing.  We're not trying for
-                 * perfection here.  All we need is a low probability that
-                 * multiple clients won't converge on the same subvolume.
-                 */
-                pid = getpid();
-                memcpy (gfid_copy, &pid, sizeof(pid));
+/*
+ * Choose the preferred read child according to priv->hash_mode:
+ *   0 - no preference, returns -1
+ *   1 - hash of the GFID
+ *   2 - hash of the GFID with the client PID copied over its first bytes
+ *       (directories hash on the GFID alone)
+ *   3 - child with the fewest pending reads
+ * Any other mode leaves the result at -1.
+ */
+int
+afr_hash_child (afr_read_subvol_args_t *args, afr_private_t *priv)
+{
+        uuid_t gfid_copy = {0,};
+        pid_t pid;
+        int child = -1;
+
+        switch (priv->hash_mode) {
+        case 0:
+                break;
+        case 1:
+                gf_uuid_copy (gfid_copy, args->gfid);
+                child = SuperFastHash((char *)gfid_copy,
+                                      sizeof(gfid_copy)) % priv->child_count;
+                break;
+        case 2:
+                /* Start from the GFID, as mode 1 does, so that only the
+                 * leading sizeof(pid) bytes are replaced by the PID. */
+                gf_uuid_copy (gfid_copy, args->gfid);
+                if (args->ia_type != IA_IFDIR) {
+                        /*
+                         * Why getpid? Because it's one of the cheapest calls
+                         * available - faster than gethostname etc. - and
+                         * returns a constant-length value that's sure to be
+                         * shorter than a UUID. It's still very unlikely to be
+                         * the same across clients, so it still provides good
+                         * mixing. We're not trying for perfection here. All we
+                         * need is a low probability that multiple clients
+                         * won't converge on the same subvolume.
+                         */
+                        pid = getpid();
+                        memcpy (gfid_copy, &pid, sizeof(pid));
+                }
+                child = SuperFastHash((char *)gfid_copy,
+                                      sizeof(gfid_copy)) % priv->child_count;
+                break;
+        case 3:
+                child = afr_least_pending_reads_child (priv);
+                break;
         }
 
-        return SuperFastHash((char *)gfid_copy,
-                             sizeof(gfid_copy)) % child_count;
+        return child;
 }
 
-
 int
 afr_read_subvol_select_by_policy (inode_t *inode, xlator_t *this,
                                   unsigned char *readable,
@@ -1686,8 +1734,7 @@ afr_read_subvol_select_by_policy (inode_t *inode, xlator_t *this,
         }
 
         /* second preference - use hashed mode */
-        read_subvol = afr_hash_child (&local_args, priv->child_count,
-                                      priv->hash_mode);
+        read_subvol = afr_hash_child (&local_args, priv);
         if (read_subvol >= 0 && readable[read_subvol])
                 return read_subvol;
 
@@ -4611,6 +4658,8 @@ afr_priv_dump (xlator_t *this)
                 gf_proc_dump_write(key, "%d", priv->child_up[i]);
                 sprintf (key, "pending_key[%d]", i);
                 gf_proc_dump_write(key, "%s", priv->pending_key[i]);
+                sprintf (key, "pending_reads[%d]", i);
+                gf_proc_dump_write(key, "%"PRId64, GF_ATOMIC_GET(priv->pending_reads[i]));
         }
         gf_proc_dump_write("data_self_heal", "%s", priv->data_self_heal);
         gf_proc_dump_write("metadata_self_heal", "%d", priv->metadata_self_heal);
@@ -4623,6 +4672,7 @@ afr_priv_dump (xlator_t *this)
         gf_proc_dump_write("background-self-heal-count", "%d",
                            priv->background_self_heal_count);
         gf_proc_dump_write("healers", "%d", priv->healers);
+        gf_proc_dump_write("read-hash-mode", "%d", priv->hash_mode);
         if (priv->quorum_count == AFR_QUORUM_AUTO) {
                 gf_proc_dump_write ("quorum-type", "auto");
         } else if (priv->quorum_count == 0) {
@@ -5325,6 +5375,8 @@ afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno)
                 goto out;
         }
 
+        local->read_subvol = -1;
+
         local->replies = GF_CALLOC(priv->child_count, sizeof(*local->replies),
                                    gf_afr_mt_reply_t);
         if (!local->replies) {
@@ -5474,9 +5526,12 @@ afr_priv_destroy (afr_private_t *priv)
                 for (i = 0; i < priv->child_count; i++)
                         GF_FREE (priv->pending_key[i]);
         }
+        GF_FREE (priv->pending_reads);
+        GF_FREE (priv->local);
         GF_FREE (priv->pending_key);
         GF_FREE (priv->children);
         GF_FREE (priv->child_up);
+        GF_FREE (priv->child_latency);
         LOCK_DESTROY (&priv->lock);
 
         GF_FREE (priv);
diff --git a/xlators/cluster/afr/src/afr-mem-types.h b/xlators/cluster/afr/src/afr-mem-types.h
index c7d6261b110..2e1117fc18c 100644
--- a/xlators/cluster/afr/src/afr-mem-types.h
+++ b/xlators/cluster/afr/src/afr-mem-types.h
@@ -47,6 +47,7 @@ enum gf_afr_mem_types_ {
         gf_afr_mt_spb_status_t,
         gf_afr_mt_empty_brick_t,
         gf_afr_mt_child_latency_t,
+        gf_afr_mt_atomic_t,
         gf_afr_mt_end
 };
 #endif
diff --git a/xlators/cluster/afr/src/afr-read-txn.c b/xlators/cluster/afr/src/afr-read-txn.c
index f6c491b713e..a8a4090efd1 100644
--- a/xlators/cluster/afr/src/afr-read-txn.c
+++ b/xlators/cluster/afr/src/afr-read-txn.c
@@ -12,6 +12,51 @@
 #include "afr-transaction.h"
 #include "afr-messages.h"
 
+/*
+ * Count one more read in flight on child_index. Indices outside
+ * [0, child_count) are ignored, which covers the -1 "no subvolume" value.
+ */
+void
+afr_pending_read_increment (afr_private_t *priv, int child_index)
+{
+        if (child_index < 0 || child_index >= priv->child_count)
+                return;
+
+        GF_ATOMIC_INC(priv->pending_reads[child_index]);
+}
+
+/*
+ * Drop one in-flight read from child_index's counter. Indices outside
+ * [0, child_count) are ignored, which covers the -1 "no subvolume" value.
+ */
+void
+afr_pending_read_decrement (afr_private_t *priv, int child_index)
+{
+        if (child_index < 0 || child_index >= priv->child_count)
+                return;
+
+        GF_ATOMIC_DEC(priv->pending_reads[child_index]);
+}
+
+/*
+ * Wind the read to subvol through local->readfn, moving this frame's
+ * pending-read accounting from the previously chosen child to subvol.
+ */
+void
+afr_read_txn_wind (call_frame_t *frame, xlator_t *this, int subvol)
+{
+        afr_local_t *local = NULL;
+        afr_private_t *priv = NULL;
+
+        local = frame->local;
+        priv = this->private;
+
+        afr_pending_read_decrement (priv, local->read_subvol);
+        local->read_subvol = subvol;
+        afr_pending_read_increment (priv, subvol);
+        local->readfn (frame, this, subvol);
+}
+
 int
 afr_read_txn_next_subvol (call_frame_t *frame, xlator_t *this)
 {
@@ -43,7 +88,7 @@ afr_read_txn_next_subvol (call_frame_t *frame, xlator_t *this)
            readable subvols. */
         if (subvol != -1)
                 local->read_attempted[subvol] = 1;
-        local->readfn (frame, this, subvol);
+        afr_read_txn_wind (frame, this, subvol);
 
         return 0;
 }
@@ -89,7 +134,7 @@ readfn:
         if (read_subvol == -1) {
                 AFR_SET_ERROR_AND_CHECK_SPLIT_BRAIN (-1, -err);
         }
-        local->readfn (frame, this, read_subvol);
+        afr_read_txn_wind (frame, this, read_subvol);
 
         return 0;
 }
@@ -246,7 +291,7 @@ afr_read_txn (call_frame_t *frame, xlator_t *this, inode_t *inode,
 
         local->read_attempted[read_subvol] = 1;
 read:
-        local->readfn (frame, this, read_subvol);
+        afr_read_txn_wind (frame, this, read_subvol);
 
         return 0;
 
diff --git a/xlators/cluster/afr/src/afr-transaction.h b/xlators/cluster/afr/src/afr-transaction.h
index a27e9a3c0b4..cb62c185938 100644
--- a/xlators/cluster/afr/src/afr-transaction.h
+++ b/xlators/cluster/afr/src/afr-transaction.h
@@ -37,6 +37,12 @@ int afr_read_txn (call_frame_t *frame, xlator_t *this, inode_t *inode,
 
 int afr_read_txn_continue (call_frame_t *frame, xlator_t *this, int subvol);
 
+void
+afr_pending_read_increment (afr_private_t *priv, int child_index);
+
+void
+afr_pending_read_decrement (afr_private_t *priv, int child_index);
+
 call_frame_t *afr_transaction_detach_fop_frame (call_frame_t *frame);
 gf_boolean_t afr_has_quorum (unsigned char *subvols, xlator_t *this);
 gf_boolean_t afr_needs_changelog_update (afr_local_t *local);
diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c
index cfba5d5d3c9..22ce0a35ece 100644
--- a/xlators/cluster/afr/src/afr.c
+++ b/xlators/cluster/afr/src/afr.c
@@ -429,6 +429,10 @@ init (xlator_t *this)
         }
         GF_OPTION_INIT ("choose-local", priv->choose_local, bool, out);
 
+        /* NOTE(review): result is not NULL-checked; confirm OOM handling. */
+        priv->pending_reads = GF_CALLOC (sizeof(*priv->pending_reads),
+                                         priv->child_count, gf_afr_mt_atomic_t);
+
         GF_OPTION_INIT ("read-hash-mode", priv->hash_mode, uint32, out);
 
         priv->favorite_child = -1;
@@ -703,18 +707,19 @@ struct volume_options options[] = {
         { .key = {"read-hash-mode" },
           .type = GF_OPTION_TYPE_INT,
           .min = 0,
-          .max = 2,
+          .max = 3,
           .default_value = "1",
           .op_version = {2},
           .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
           .tags = {"replicate"},
           .description = "inode-read fops happen only on one of the bricks in "
                          "replicate. AFR will prefer the one computed using "
-                         "the method specified using this option"
-                         "0 = first up server, "
+                         "the method specified using this option.\n"
+                         "0 = first readable child of AFR, starting from 1st child.\n"
                          "1 = hash by GFID of file (all clients use "
-                         "same subvolume), "
-                         "2 = hash by GFID of file and client PID",
+                         "same subvolume).\n"
+                         "2 = hash by GFID of file and client PID.\n"
+                         "3 = brick having the least outstanding read requests."
         },
         { .key = {"choose-local" },
           .type = GF_OPTION_TYPE_BOOL,
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index b2f3af136bd..129670517f3 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -113,6 +113,7 @@ typedef struct _afr_private {
         gf_boolean_t metadata_splitbrain_forced_heal; /* on/off */
         int read_child;               /* read-subvolume */
         unsigned int hash_mode;       /* for when read_child is not set */
+        gf_atomic_t *pending_reads; /*No. of pending read cbks per child.*/
         int favorite_child;  /* subvolume to be preferred in resolving
                                          split-brain cases */
 
@@ -425,6 +426,8 @@ typedef struct _afr_local {
         unsigned char *readable;
         unsigned char *readable2; /*For rename transaction*/
 
+        int read_subvol; /* Current read subvolume */
+
         afr_inode_refresh_cbk_t refreshfn;
 
         /* @refreshinode:
@@ -974,6 +977,8 @@ afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd);
                         __this = frame->this;                     \
                         afr_handle_inconsistent_fop (frame, &__op_ret,\
                                                      &__op_errno);\
+                        if (__local && __local->is_read_txn)      \
+                                afr_pending_read_decrement (__this->private, __local->read_subvol); \
                         frame->local = NULL;                      \
                 }                                                 \
                                                                   \
-- 
cgit