summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--tests/basic/afr/afr-read-hash-mode.t56
-rw-r--r--xlators/cluster/afr/src/afr-common.c86
-rw-r--r--xlators/cluster/afr/src/afr-mem-types.h1
-rw-r--r--xlators/cluster/afr/src/afr-read-txn.c39
-rw-r--r--xlators/cluster/afr/src/afr-transaction.h6
-rw-r--r--xlators/cluster/afr/src/afr.c14
-rw-r--r--xlators/cluster/afr/src/afr.h5
7 files changed, 175 insertions, 32 deletions
diff --git a/tests/basic/afr/afr-read-hash-mode.t b/tests/basic/afr/afr-read-hash-mode.t
new file mode 100644
index 00000000000..eeff10d8ebd
--- /dev/null
+++ b/tests/basic/afr/afr-read-hash-mode.t
@@ -0,0 +1,56 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+
+cleanup
+
+function reads_brick_count {
+ $CLI volume profile $V0 info incremental | grep -w READ | wc -l
+}
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 replica 3 arbiter 1 $H0:$B0/${V0}{0..2}
+
+TEST $CLI volume set $V0 cluster.choose-local off
+TEST $CLI volume set $V0 performance.quick-read off
+TEST $CLI volume set $V0 performance.io-cache off
+TEST $CLI volume set $V0 performance.write-behind off
+TEST $CLI volume set $V0 performance.stat-prefetch off
+TEST $CLI volume set $V0 performance.read-ahead off
+TEST $CLI volume start $V0
+
+# Mount with entry/attribute timeouts set to 0 and create the 8MB file to read back.
+TEST glusterfs --entry-timeout=0 --attribute-timeout=0 -s $H0 --volfile-id $V0 $M0
+TEST dd if=/dev/urandom of=$M0/FILE bs=1M count=8
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
+
+# Test that the option gives the intended behavior: read the file from the
+# mount into /dev/null and count the bricks that report READ fops. With
+# read-hash-mode 3, more than one brick should serve the reads of a given file,
+# whereas with the default read-hash-mode (i.e. 1) only one brick will.
+
+# read-hash-mode=1 (the default): exactly one brick must report READ fops.
+TEST glusterfs --entry-timeout=0 --attribute-timeout=0 -s $H0 --volfile-id $V0 $M0
+EXPECT "1" mount_get_option_value $M0 $V0-replicate-0 read-hash-mode
+TEST $CLI volume profile $V0 start
+TEST dd if=$M0/FILE of=/dev/null bs=1M
+count=`reads_brick_count`
+TEST [ $count -eq 1 ]
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
+
+# read-hash-mode=3: after a fresh mount and a cleared profile, two bricks must report READ fops.
+TEST glusterfs --entry-timeout=0 --attribute-timeout=0 -s $H0 --volfile-id $V0 $M0
+TEST $CLI volume set $V0 cluster.read-hash-mode 3
+EXPECT_WITHIN $CONFIG_UPDATE_TIMEOUT "3" mount_get_option_value $M0 $V0-replicate-0 read-hash-mode
+TEST $CLI volume profile $V0 info clear
+TEST dd if=$M0/FILE of=/dev/null bs=1M
+count=`reads_brick_count`
+TEST [ $count -eq 2 ]
+
+# Check that the arbiter (brick ${V0}2) did not serve any reads of FILE
+arbiter_reads=$($CLI volume top $V0 read brick $H0:$B0/${V0}2|grep FILE|awk '{print $1}')
+TEST [ -z $arbiter_reads ]
+
+cleanup;
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index bfd8c2e8c2c..685a349ac49 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -1630,38 +1630,69 @@ out:
return ret;
}
-
int
-afr_hash_child (afr_read_subvol_args_t *args, int32_t child_count, int hashmode)
+afr_least_pending_reads_child (afr_private_t *priv) /* returns the index of the child with the fewest pending reads */
{
- uuid_t gfid_copy = {0,};
- pid_t pid;
+ int i = 0;
+ int child = 0; /* child 0 is the initial candidate */
+ int64_t read_iter = -1;
+ int64_t pending_read = -1;
- if (!hashmode) {
- return -1;
+ pending_read = GF_ATOMIC_GET (priv->pending_reads[0]); /* baseline: child 0, taken without an arbiter check */
+ for (i = 1; i < priv->child_count; i++) {
+ if (AFR_IS_ARBITER_BRICK(priv, i))
+ continue; /* an arbiter brick at index >= 1 is never selected */
+ read_iter = GF_ATOMIC_GET(priv->pending_reads[i]);
+ if (read_iter < pending_read) { /* strict '<': ties keep the lower index */
+ pending_read = read_iter;
+ child = i;
+ }
}
- gf_uuid_copy (gfid_copy, args->gfid);
+ return child;
+}
- if ((hashmode > 1) && (args->ia_type != IA_IFDIR)) {
- /*
- * Why getpid? Because it's one of the cheapest calls
- * available - faster than gethostname etc. - and returns a
- * constant-length value that's sure to be shorter than a UUID.
- * It's still very unlikely to be the same across clients, so
- * it still provides good mixing. We're not trying for
- * perfection here. All we need is a low probability that
- * multiple clients won't converge on the same subvolume.
- */
- pid = getpid();
- memcpy (gfid_copy, &pid, sizeof(pid));
+/* Pick the preferred read child for priv->hash_mode, or -1 for mode 0:
+ * 1 = hash of the GFID, 2 = hash of the GFID overlaid with the client PID
+ * (non-directories only), 3 = child with the fewest pending reads. */
+int
+afr_hash_child (afr_read_subvol_args_t *args, afr_private_t *priv)
+{
+        uuid_t gfid_copy = {0,};
+        pid_t pid;
+        int child = -1;
+
+        switch (priv->hash_mode) {
+        case 0:
+                break;
+        case 1:
+                gf_uuid_copy (gfid_copy, args->gfid);
+                child = SuperFastHash((char *)gfid_copy,
+                                      sizeof(gfid_copy)) % priv->child_count;
+                break;
+        case 2:
+                /* Seed with the GFID so different files hash differently. */
+                gf_uuid_copy (gfid_copy, args->gfid);
+                if (args->ia_type != IA_IFDIR) {
+                        /*
+                         * Why getpid? It is cheap, constant-length, shorter
+                         * than a UUID and unlikely to match across clients,
+                         * so clients rarely converge on the same subvolume.
+                         */
+                        pid = getpid();
+                        memcpy (gfid_copy, &pid, sizeof(pid));
+                }
+                child = SuperFastHash((char *)gfid_copy,
+                                      sizeof(gfid_copy)) % priv->child_count;
+                break;
+        case 3:
+                child = afr_least_pending_reads_child (priv);
+                break;
}
- return SuperFastHash((char *)gfid_copy,
- sizeof(gfid_copy)) % child_count;
+        return child;
}
-
int
afr_read_subvol_select_by_policy (inode_t *inode, xlator_t *this,
unsigned char *readable,
@@ -1686,8 +1717,7 @@ afr_read_subvol_select_by_policy (inode_t *inode, xlator_t *this,
}
/* second preference - use hashed mode */
- read_subvol = afr_hash_child (&local_args, priv->child_count,
- priv->hash_mode);
+ read_subvol = afr_hash_child (&local_args, priv);
if (read_subvol >= 0 && readable[read_subvol])
return read_subvol;
@@ -4611,6 +4641,8 @@ afr_priv_dump (xlator_t *this)
gf_proc_dump_write(key, "%d", priv->child_up[i]);
sprintf (key, "pending_key[%d]", i);
gf_proc_dump_write(key, "%s", priv->pending_key[i]);
+ sprintf (key, "pending_reads[%d]", i);
+ gf_proc_dump_write(key, "%"PRId64, GF_ATOMIC_GET(priv->pending_reads[i]));
}
gf_proc_dump_write("data_self_heal", "%s", priv->data_self_heal);
gf_proc_dump_write("metadata_self_heal", "%d", priv->metadata_self_heal);
@@ -4623,6 +4655,7 @@ afr_priv_dump (xlator_t *this)
gf_proc_dump_write("background-self-heal-count", "%d",
priv->background_self_heal_count);
gf_proc_dump_write("healers", "%d", priv->healers);
+ gf_proc_dump_write("read-hash-mode", "%d", priv->hash_mode);
if (priv->quorum_count == AFR_QUORUM_AUTO) {
gf_proc_dump_write ("quorum-type", "auto");
} else if (priv->quorum_count == 0) {
@@ -5325,6 +5358,8 @@ afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno)
goto out;
}
+ local->read_subvol = -1;
+
local->replies = GF_CALLOC(priv->child_count, sizeof(*local->replies),
gf_afr_mt_reply_t);
if (!local->replies) {
@@ -5474,9 +5509,12 @@ afr_priv_destroy (afr_private_t *priv)
for (i = 0; i < priv->child_count; i++)
GF_FREE (priv->pending_key[i]);
}
+ GF_FREE (priv->pending_reads);
+ GF_FREE (priv->local);
GF_FREE (priv->pending_key);
GF_FREE (priv->children);
GF_FREE (priv->child_up);
+ GF_FREE (priv->child_latency);
LOCK_DESTROY (&priv->lock);
GF_FREE (priv);
diff --git a/xlators/cluster/afr/src/afr-mem-types.h b/xlators/cluster/afr/src/afr-mem-types.h
index c7d6261b110..2e1117fc18c 100644
--- a/xlators/cluster/afr/src/afr-mem-types.h
+++ b/xlators/cluster/afr/src/afr-mem-types.h
@@ -47,6 +47,7 @@ enum gf_afr_mem_types_ {
gf_afr_mt_spb_status_t,
gf_afr_mt_empty_brick_t,
gf_afr_mt_child_latency_t,
+ gf_afr_mt_atomic_t,
gf_afr_mt_end
};
#endif
diff --git a/xlators/cluster/afr/src/afr-read-txn.c b/xlators/cluster/afr/src/afr-read-txn.c
index f6c491b713e..a8a4090efd1 100644
--- a/xlators/cluster/afr/src/afr-read-txn.c
+++ b/xlators/cluster/afr/src/afr-read-txn.c
@@ -12,6 +12,39 @@
#include "afr-transaction.h"
#include "afr-messages.h"
+/* Bump the in-flight read count of child_index (valid: 0..child_count-1). */
+void
+afr_pending_read_increment (afr_private_t *priv, int child_index)
+{
+        if (child_index < 0 || child_index >= priv->child_count)
+                return;
+        GF_ATOMIC_INC(priv->pending_reads[child_index]);
+}
+
+/* Drop the in-flight read count of child_index (valid: 0..child_count-1). */
+void
+afr_pending_read_decrement (afr_private_t *priv, int child_index)
+{
+        if (child_index < 0 || child_index >= priv->child_count)
+                return;
+        GF_ATOMIC_DEC(priv->pending_reads[child_index]);
+}
+
+/* Wind the read to subvol, moving this txn's pending-read count onto it. */
+void
+afr_read_txn_wind (call_frame_t *frame, xlator_t *this, int subvol)
+{
+        afr_private_t *conf = this->private;
+        afr_local_t *txn = frame->local;
+
+        /* Release the count held on the previously recorded subvol (the
+         * helper ignores -1), then account for the new one before winding. */
+        afr_pending_read_decrement (conf, txn->read_subvol);
+        txn->read_subvol = subvol;
+        afr_pending_read_increment (conf, subvol);
+        txn->readfn (frame, this, subvol);
+}
+
int
afr_read_txn_next_subvol (call_frame_t *frame, xlator_t *this)
{
@@ -43,7 +76,7 @@ afr_read_txn_next_subvol (call_frame_t *frame, xlator_t *this)
readable subvols. */
if (subvol != -1)
local->read_attempted[subvol] = 1;
- local->readfn (frame, this, subvol);
+ afr_read_txn_wind (frame, this, subvol);
return 0;
}
@@ -89,7 +122,7 @@ readfn:
if (read_subvol == -1) {
AFR_SET_ERROR_AND_CHECK_SPLIT_BRAIN (-1, -err);
}
- local->readfn (frame, this, read_subvol);
+ afr_read_txn_wind (frame, this, read_subvol);
return 0;
}
@@ -246,7 +279,7 @@ afr_read_txn (call_frame_t *frame, xlator_t *this, inode_t *inode,
local->read_attempted[read_subvol] = 1;
read:
- local->readfn (frame, this, read_subvol);
+ afr_read_txn_wind (frame, this, read_subvol);
return 0;
diff --git a/xlators/cluster/afr/src/afr-transaction.h b/xlators/cluster/afr/src/afr-transaction.h
index a27e9a3c0b4..cb62c185938 100644
--- a/xlators/cluster/afr/src/afr-transaction.h
+++ b/xlators/cluster/afr/src/afr-transaction.h
@@ -37,6 +37,12 @@ int afr_read_txn (call_frame_t *frame, xlator_t *this, inode_t *inode,
int afr_read_txn_continue (call_frame_t *frame, xlator_t *this, int subvol);
+void
+afr_pending_read_increment (afr_private_t *priv, int child_index); /* atomically adds one to priv->pending_reads[child_index]; a negative child_index is ignored */
+
+void
+afr_pending_read_decrement (afr_private_t *priv, int child_index); /* atomically subtracts one from priv->pending_reads[child_index]; a negative child_index is ignored */
+
call_frame_t *afr_transaction_detach_fop_frame (call_frame_t *frame);
gf_boolean_t afr_has_quorum (unsigned char *subvols, xlator_t *this);
gf_boolean_t afr_needs_changelog_update (afr_local_t *local);
diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c
index cfba5d5d3c9..22ce0a35ece 100644
--- a/xlators/cluster/afr/src/afr.c
+++ b/xlators/cluster/afr/src/afr.c
@@ -429,6 +429,9 @@ init (xlator_t *this)
}
GF_OPTION_INIT ("choose-local", priv->choose_local, bool, out);
+ priv->pending_reads = GF_CALLOC (sizeof(*priv->pending_reads),
+ priv->child_count, gf_afr_mt_atomic_t);
+
GF_OPTION_INIT ("read-hash-mode", priv->hash_mode, uint32, out);
priv->favorite_child = -1;
@@ -703,18 +706,19 @@ struct volume_options options[] = {
{ .key = {"read-hash-mode" },
.type = GF_OPTION_TYPE_INT,
.min = 0,
- .max = 2,
+ .max = 3,
.default_value = "1",
.op_version = {2},
.flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
.tags = {"replicate"},
.description = "inode-read fops happen only on one of the bricks in "
"replicate. AFR will prefer the one computed using "
- "the method specified using this option"
- "0 = first up server, "
+ "the method specified using this option.\n"
+ "0 = first readable child of AFR, starting from 1st child.\n"
"1 = hash by GFID of file (all clients use "
- "same subvolume), "
- "2 = hash by GFID of file and client PID",
+ "same subvolume).\n"
+ "2 = hash by GFID of file and client PID.\n"
+ "3 = brick having the least outstanding read requests."
},
{ .key = {"choose-local" },
.type = GF_OPTION_TYPE_BOOL,
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index b2f3af136bd..129670517f3 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -113,6 +113,7 @@ typedef struct _afr_private {
gf_boolean_t metadata_splitbrain_forced_heal; /* on/off */
int read_child; /* read-subvolume */
unsigned int hash_mode; /* for when read_child is not set */
+ gf_atomic_t *pending_reads; /*No. of pending read cbks per child.*/
int favorite_child; /* subvolume to be preferred in resolving
split-brain cases */
@@ -425,6 +426,8 @@ typedef struct _afr_local {
unsigned char *readable;
unsigned char *readable2; /*For rename transaction*/
+ int read_subvol; /* Current read subvolume */
+
afr_inode_refresh_cbk_t refreshfn;
/* @refreshinode:
@@ -974,6 +977,8 @@ afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd);
__this = frame->this; \
afr_handle_inconsistent_fop (frame, &__op_ret,\
&__op_errno);\
+ if (__local && __local->is_read_txn) \
+ afr_pending_read_decrement (__this->private, __local->read_subvol); \
frame->local = NULL; \
} \
\