summaryrefslogtreecommitdiffstats
path: root/xlators/cluster
diff options
context:
space:
mode:
authorJeff Darcy <jdarcy@redhat.com>2012-03-12 09:32:40 -0400
committerAnand Avati <avati@redhat.com>2012-05-31 17:29:01 -0700
commitddc044bfa2840981de4003c3b9efcac84387dc2b (patch)
treea83d476702cac7ecc7ae59057c368f622a51af4c /xlators/cluster
parente066a5fea7bdaa5da78e49c9a5bf344af2f33d3c (diff)
replicate: add hashed read-child method.
Both the first-to-respond method and the round-robin method are susceptible to clients repeatedly choosing the same servers across a series of opens, creating hot spots. Also, the code to handle a replica being down will ignore both methods and just choose the first remaining (which is not an issue for two-way but can be otherwise). The hashed method more reliably avoids such hot spots. There are three values/modes. 0: use the old (broken) methods. 1: select a read-child based on a hash of the file's GFID, so all clients will choose the same subvolume for a file (ensuring maximum consistency) but will distribute load for a set of files. 2: select a read-child based on a hash of the file's GFID plus the client's PID, so different children will distribute load even for one file. Mode 2 will probably be optimal for most cases. Using response time when we open the file is problematic, both because a single sample might not have been representative even then and because load might have shifted in the hours or days since (for long-lived files). Trying to use more current load information can lead to "herd following" behavior which is just as bad. Pseudo-random distribution is likely to be the best we can reasonably do, just as it is for DHT. Change-Id: I798c2760411eacf32e82a85f03bb7b08a4a49461 BUG: 802513 Signed-off-by: Jeff Darcy <jdarcy@redhat.com> Reviewed-on: http://review.gluster.com/2926 Tested-by: Gluster Build System <jenkins@build.gluster.com> Reviewed-by: Anand Avati <avati@redhat.com>
Diffstat (limited to 'xlators/cluster')
-rw-r--r--xlators/cluster/afr/src/afr-common.c67
-rw-r--r--xlators/cluster/afr/src/afr-dir-write.c15
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-data.c6
-rw-r--r--xlators/cluster/afr/src/afr-self-heal.h3
-rw-r--r--xlators/cluster/afr/src/afr.c14
-rw-r--r--xlators/cluster/afr/src/afr.h6
6 files changed, 93 insertions, 18 deletions
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index 300ab92efaf..21a2be3dd6f 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -546,6 +546,10 @@ afr_is_read_child (int32_t *success_children, int32_t *sources,
gf_boolean_t success_child = _gf_false;
gf_boolean_t source = _gf_false;
+ if (child < 0) {
+ return _gf_false;
+ }
+
GF_ASSERT (success_children);
GF_ASSERT (child_count > 0);
@@ -562,13 +566,44 @@ out:
return (success_child && source);
}
+int32_t
+afr_hash_child (int32_t *success_children, int32_t child_count,
+ unsigned int hmode, uuid_t gfid)
+{
+ uuid_t gfid_copy = {0,};
+
+ if (!hmode) {
+ return -1;
+ }
+
+ if (gfid) {
+ uuid_copy(gfid_copy,gfid);
+ }
+ if (hmode > 1) {
+ /*
+ * Why getpid? Because it's one of the cheapest calls
+ * available - faster than gethostname etc. - and returns a
+ * constant-length value that's sure to be shorter than a UUID.
+ * It's still very unlikely to be the same across clients, so
+ * it still provides good mixing. We're not trying for
+ * perfection here. All we need is a low probability that
+ * multiple clients won't converge on the same subvolume.
+ */
+ *((pid_t *)gfid_copy) = getpid();
+ }
+
+ return SuperFastHash((char *)gfid_copy,
+ sizeof(gfid_copy)) % child_count;
+}
+
/* If sources is NULL the xattrs are assumed to be of source for all
* success_children.
*/
int
-afr_select_read_child_from_policy (int32_t *success_children, int32_t child_count,
- int32_t prev_read_child,
- int32_t config_read_child, int32_t *sources)
+afr_select_read_child_from_policy (int32_t *success_children,
+ int32_t child_count, int32_t prev_read_child,
+ int32_t config_read_child, int32_t *sources,
+ unsigned int hmode, uuid_t gfid)
{
int32_t read_child = -1;
int i = 0;
@@ -585,6 +620,13 @@ afr_select_read_child_from_policy (int32_t *success_children, int32_t child_coun
read_child))
goto out;
+ read_child = afr_hash_child (success_children, child_count,
+ hmode, gfid);
+ if (afr_is_read_child (success_children, sources, child_count,
+ read_child)) {
+ goto out;
+ }
+
for (i = 0; i < child_count; i++) {
read_child = success_children[i];
if (read_child < 0)
@@ -604,7 +646,7 @@ out:
void
afr_set_read_ctx_from_policy (xlator_t *this, inode_t *inode,
int32_t *fresh_children, int32_t prev_read_child,
- int32_t config_read_child)
+ int32_t config_read_child, uuid_t gfid)
{
int read_child = -1;
afr_private_t *priv = NULL;
@@ -614,7 +656,8 @@ afr_set_read_ctx_from_policy (xlator_t *this, inode_t *inode,
priv->child_count,
prev_read_child,
config_read_child,
- NULL);
+ NULL,
+ priv->hash_mode, gfid);
if (read_child >= 0)
afr_inode_set_read_ctx (this, inode, read_child,
fresh_children);
@@ -1271,6 +1314,7 @@ afr_lookup_select_read_child (afr_local_t *local, xlator_t *this,
dict_t **xattrs = NULL;
int32_t *success_children = NULL;
afr_transaction_type type = AFR_METADATA_TRANSACTION;
+ uuid_t *gfid = NULL;
GF_ASSERT (local);
GF_ASSERT (this);
@@ -1284,8 +1328,9 @@ afr_lookup_select_read_child (afr_local_t *local, xlator_t *this,
ia_type = local->cont.lookup.bufs[success_children[0]].ia_type;
type = afr_transaction_type_get (ia_type);
xattrs = local->cont.lookup.xattrs;
+ gfid = &local->cont.lookup.buf.ia_gfid;
source = afr_lookup_select_read_child_by_txn_type (this, local, xattrs,
- type);
+ type, *gfid);
if (source < 0) {
gf_log (this->name, GF_LOG_DEBUG, "failed to select source "
"for %s", local->loc.path);
@@ -2131,8 +2176,14 @@ afr_lookup (call_frame_t *frame, xlator_t *this,
} else {
LOCK (&priv->read_child_lock);
{
- local->read_child_index = (++priv->read_child_rr)
- % (priv->child_count);
+ if (priv->hash_mode) {
+ local->read_child_index = -1;
+ }
+ else {
+ local->read_child_index =
+ (++priv->read_child_rr) %
+ (priv->child_count);
+ }
}
UNLOCK (&priv->read_child_lock);
local->cont.lookup.fresh_lookup = _gf_true;
diff --git a/xlators/cluster/afr/src/afr-dir-write.c b/xlators/cluster/afr/src/afr-dir-write.c
index 9f2b975df6f..0b804bef580 100644
--- a/xlators/cluster/afr/src/afr-dir-write.c
+++ b/xlators/cluster/afr/src/afr-dir-write.c
@@ -196,7 +196,8 @@ unlock:
afr_set_read_ctx_from_policy (this, inode,
local->fresh_children,
local->read_child_index,
- priv->read_child);
+ priv->read_child,
+ local->cont.create.buf.ia_gfid);
local->transaction.unwind (frame, this);
local->transaction.resume (frame, this);
@@ -429,7 +430,8 @@ afr_mknod_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
afr_set_read_ctx_from_policy (this, inode,
local->fresh_children,
local->read_child_index,
- priv->read_child);
+ priv->read_child,
+ local->cont.mknod.buf.ia_gfid);
local->transaction.unwind (frame, this);
local->transaction.resume (frame, this);
@@ -657,7 +659,8 @@ afr_mkdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
afr_set_read_ctx_from_policy (this, inode,
local->fresh_children,
local->read_child_index,
- priv->read_child);
+ priv->read_child,
+ local->cont.mkdir.buf.ia_gfid);
local->transaction.unwind (frame, this);
local->transaction.resume (frame, this);
@@ -887,7 +890,8 @@ afr_link_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
afr_set_read_ctx_from_policy (this, inode,
local->fresh_children,
local->read_child_index,
- priv->read_child);
+ priv->read_child,
+ local->cont.link.buf.ia_gfid);
local->transaction.unwind (frame, this);
local->transaction.resume (frame, this);
@@ -1110,7 +1114,8 @@ afr_symlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
afr_set_read_ctx_from_policy (this, inode,
local->fresh_children,
local->read_child_index,
- priv->read_child);
+ priv->read_child,
+ local->cont.symlink.buf.ia_gfid);
local->transaction.unwind (frame, this);
local->transaction.resume (frame, this);
diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c
index acc96697ff4..93b64529543 100644
--- a/xlators/cluster/afr/src/afr-self-heal-data.c
+++ b/xlators/cluster/afr/src/afr-self-heal-data.c
@@ -796,7 +796,8 @@ afr_sh_data_fxattrop_fstat_done (call_frame_t *frame, xlator_t *this)
int
afr_lookup_select_read_child_by_txn_type (xlator_t *this, afr_local_t *local,
dict_t **xattr,
- afr_transaction_type txn_type)
+ afr_transaction_type txn_type,
+ uuid_t gfid)
{
afr_private_t *priv = NULL;
int read_child = -1;
@@ -855,7 +856,8 @@ afr_lookup_select_read_child_by_txn_type (xlator_t *this, afr_local_t *local,
priv->child_count,
prev_read_child,
config_read_child,
- sources);
+ sources,
+ priv->hash_mode, gfid);
out:
afr_matrix_cleanup (pending_matrix, priv->child_count);
gf_log (this->name, GF_LOG_DEBUG, "returning read_child: %d",
diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h
index 627a2115a09..2efc1116d33 100644
--- a/xlators/cluster/afr/src/afr-self-heal.h
+++ b/xlators/cluster/afr/src/afr-self-heal.h
@@ -45,5 +45,6 @@ afr_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode);
int
afr_lookup_select_read_child_by_txn_type (xlator_t *this, afr_local_t *local,
dict_t **xattr,
- afr_transaction_type txn_type);
+ afr_transaction_type txn_type,
+ uuid_t gfid);
#endif /* __AFR_SELF_HEAL_H__ */
diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c
index 8e94d549737..b7ba2619711 100644
--- a/xlators/cluster/afr/src/afr.c
+++ b/xlators/cluster/afr/src/afr.c
@@ -149,6 +149,9 @@ reconfigure (xlator_t *this, dict_t *options)
GF_OPTION_RECONF ("read-subvolume", read_subvol, options, xlator, out);
+ GF_OPTION_RECONF ("read-hash-mode", priv->hash_mode,
+ options, uint32, out);
+
if (read_subvol) {
index = xlator_subvolume_index (this, read_subvol);
if (index == -1) {
@@ -237,6 +240,8 @@ init (xlator_t *this)
}
}
+ GF_OPTION_INIT ("read-hash-mode", priv->hash_mode, uint32, out);
+
priv->favorite_child = -1;
GF_OPTION_INIT ("favorite-child", fav_child, xlator, out);
if (fav_child) {
@@ -494,6 +499,15 @@ struct volume_options options[] = {
{ .key = {"read-subvolume" },
.type = GF_OPTION_TYPE_XLATOR
},
+ { .key = {"read-hash-mode" },
+ .type = GF_OPTION_TYPE_INT,
+ .min = 0,
+ .max = 2,
+ .default_value = "0",
+ .description = "0 = first responder, "
+ "1 = hash by GFID (all clients use same subvolume), "
+ "2 = hash by GFID and client PID",
+ },
{ .key = {"favorite-child"},
.type = GF_OPTION_TYPE_XLATOR
},
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index 27cb83a5791..a1a30562bf1 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -130,6 +130,7 @@ typedef struct _afr_private {
gf_boolean_t entry_change_log; /* on/off */
int read_child; /* read-subvolume */
+ unsigned int hash_mode; /* for when read_child is not set */
int favorite_child; /* subvolume to be preferred in resolving
split-brain cases */
@@ -936,12 +937,13 @@ afr_first_up_child (unsigned char *child_up, size_t child_count);
int
afr_select_read_child_from_policy (int32_t *fresh_children, int32_t child_count,
int32_t prev_read_child,
- int32_t config_read_child, int32_t *sources);
+ int32_t config_read_child, int32_t *sources,
+ unsigned int hmode, uuid_t gfid);
void
afr_set_read_ctx_from_policy (xlator_t *this, inode_t *inode,
int32_t *fresh_children, int32_t prev_read_child,
- int32_t config_read_child);
+ int32_t config_read_child, uuid_t gfid);
int32_t
afr_get_call_child (xlator_t *this, unsigned char *child_up, int32_t read_child,