author     Jeff Darcy <jdarcy@redhat.com>          2012-03-12 09:32:40 -0400
committer  Anand Avati <avati@redhat.com>          2012-05-31 17:29:01 -0700
commit     ddc044bfa2840981de4003c3b9efcac84387dc2b (patch)
tree       a83d476702cac7ecc7ae59057c368f622a51af4c /xlators/cluster/afr/src/afr-common.c
parent     e066a5fea7bdaa5da78e49c9a5bf344af2f33d3c (diff)
replicate: add hashed read-child method.
Both the first-to-respond method and the round-robin method are susceptible
to clients repeatedly choosing the same servers across a series of opens,
creating hot spots. Also, the code to handle a replica being down will ignore
both methods and just choose the first remaining (which is not an issue for
two-way but can be otherwise). The hashed method more reliably avoids such
hot spots.

There are three values/modes:

0: use the old (broken) methods.

1: select a read-child based on a hash of the file's GFID, so all clients
   will choose the same subvolume for a file (ensuring maximum consistency)
   but will distribute load for a set of files.

2: select a read-child based on a hash of the file's GFID plus the client's
   PID, so different children will distribute load even for one file.

Mode 2 will probably be optimal for most cases. Using response time when we
open the file is problematic, both because a single sample might not have
been representative even then and because load might have shifted in the
hours or days since (for long-lived files). Trying to use more current load
information can lead to "herd following" behavior which is just as bad.
Pseudo-random distribution is likely to be the best we can reasonably do,
just as it is for DHT.

Change-Id: I798c2760411eacf32e82a85f03bb7b08a4a49461
BUG: 802513
Signed-off-by: Jeff Darcy <jdarcy@redhat.com>
Reviewed-on: http://review.gluster.com/2926
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Anand Avati <avati@redhat.com>
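As a rough illustration of the selection logic described above, here is a
minimal standalone sketch in C. It is not the AFR code itself: pick_read_child
and hash_bytes are hypothetical names, the 16-byte GFID is hard-coded for the
demo, and SuperFastHash (used by the actual patch) is replaced with a simple
FNV-1a hash purely so the example compiles without GlusterFS headers. What it
shows is the idea from the message: mode 1 hashes only the GFID, so every
client maps a given file to the same replica, while mode 2 folds the client's
PID into the buffer before hashing, so even a single hot file is spread across
replicas.

/*
 * Hashed read-child selection, sketched outside of GlusterFS.
 * hmode 0: hashing disabled (caller falls back to the old policies).
 * hmode 1: hash the GFID only  -> all clients pick the same child per file.
 * hmode 2: hash GFID plus PID  -> clients spread out even for one file.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>

#define GFID_LEN 16

/* Stand-in for SuperFastHash: FNV-1a over a byte buffer. */
static uint32_t
hash_bytes (const unsigned char *buf, size_t len)
{
        uint32_t hash = 2166136261u;
        size_t   i;

        for (i = 0; i < len; i++) {
                hash ^= buf[i];
                hash *= 16777619u;
        }
        return hash;
}

/* Returns a child index in [0, child_count), or -1 if hashing is disabled. */
static int
pick_read_child (unsigned int hmode, const unsigned char *gfid,
                 int child_count)
{
        unsigned char gfid_copy[GFID_LEN] = {0};
        pid_t         pid;

        if (!hmode || child_count <= 0)
                return -1;

        memcpy (gfid_copy, gfid, GFID_LEN);
        if (hmode > 1) {
                /* Fold the client PID into the buffer, as the patch does. */
                pid = getpid ();
                memcpy (gfid_copy, &pid, sizeof (pid));
        }

        return (int)(hash_bytes (gfid_copy, GFID_LEN) % child_count);
}

int
main (void)
{
        /* Hypothetical GFID, hard-coded only for this demo. */
        unsigned char gfid[GFID_LEN] = {
                0x9c, 0x1a, 0x44, 0x03, 0x7e, 0x55, 0x4b, 0x21,
                0x8f, 0x02, 0xd1, 0x6a, 0x33, 0x90, 0xab, 0xcd
        };

        printf ("mode 1 -> child %d\n", pick_read_child (1, gfid, 3));
        printf ("mode 2 -> child %d\n", pick_read_child (2, gfid, 3));
        return 0;
}

Either way the result is taken modulo the replica count, so the choice is
stable for a given (file, client) pair but pseudo-randomly distributed across
the set, which is the DHT-like behavior the commit message argues for.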
Diffstat (limited to 'xlators/cluster/afr/src/afr-common.c')
-rw-r--r--  xlators/cluster/afr/src/afr-common.c | 67
1 file changed, 59 insertions(+), 8 deletions(-)
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index 300ab92e..21a2be3d 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -546,6 +546,10 @@ afr_is_read_child (int32_t *success_children, int32_t *sources,
gf_boolean_t success_child = _gf_false;
gf_boolean_t source = _gf_false;
+ if (child < 0) {
+ return _gf_false;
+ }
+
GF_ASSERT (success_children);
GF_ASSERT (child_count > 0);
@@ -562,13 +566,44 @@ out:
return (success_child && source);
}
+int32_t
+afr_hash_child (int32_t *success_children, int32_t child_count,
+ unsigned int hmode, uuid_t gfid)
+{
+ uuid_t gfid_copy = {0,};
+
+ if (!hmode) {
+ return -1;
+ }
+
+ if (gfid) {
+ uuid_copy(gfid_copy,gfid);
+ }
+ if (hmode > 1) {
+ /*
+ * Why getpid? Because it's one of the cheapest calls
+ * available - faster than gethostname etc. - and returns a
+ * constant-length value that's sure to be shorter than a UUID.
+ * It's still very unlikely to be the same across clients, so
+ * it still provides good mixing. We're not trying for
+ * perfection here. All we need is a low probability that
+ * multiple clients will converge on the same subvolume.
+ */
+ *((pid_t *)gfid_copy) = getpid();
+ }
+
+ return SuperFastHash((char *)gfid_copy,
+ sizeof(gfid_copy)) % child_count;
+}
+
/* If sources is NULL the xattrs are assumed to be of source for all
* success_children.
*/
int
-afr_select_read_child_from_policy (int32_t *success_children, int32_t child_count,
- int32_t prev_read_child,
- int32_t config_read_child, int32_t *sources)
+afr_select_read_child_from_policy (int32_t *success_children,
+ int32_t child_count, int32_t prev_read_child,
+ int32_t config_read_child, int32_t *sources,
+ unsigned int hmode, uuid_t gfid)
{
int32_t read_child = -1;
int i = 0;
@@ -585,6 +620,13 @@ afr_select_read_child_from_policy (int32_t *success_children, int32_t child_coun
read_child))
goto out;
+ read_child = afr_hash_child (success_children, child_count,
+ hmode, gfid);
+ if (afr_is_read_child (success_children, sources, child_count,
+ read_child)) {
+ goto out;
+ }
+
for (i = 0; i < child_count; i++) {
read_child = success_children[i];
if (read_child < 0)
@@ -604,7 +646,7 @@ out:
void
afr_set_read_ctx_from_policy (xlator_t *this, inode_t *inode,
int32_t *fresh_children, int32_t prev_read_child,
- int32_t config_read_child)
+ int32_t config_read_child, uuid_t gfid)
{
int read_child = -1;
afr_private_t *priv = NULL;
@@ -614,7 +656,8 @@ afr_set_read_ctx_from_policy (xlator_t *this, inode_t *inode,
priv->child_count,
prev_read_child,
config_read_child,
- NULL);
+ NULL,
+ priv->hash_mode, gfid);
if (read_child >= 0)
afr_inode_set_read_ctx (this, inode, read_child,
fresh_children);
@@ -1271,6 +1314,7 @@ afr_lookup_select_read_child (afr_local_t *local, xlator_t *this,
dict_t **xattrs = NULL;
int32_t *success_children = NULL;
afr_transaction_type type = AFR_METADATA_TRANSACTION;
+ uuid_t *gfid = NULL;
GF_ASSERT (local);
GF_ASSERT (this);
@@ -1284,8 +1328,9 @@ afr_lookup_select_read_child (afr_local_t *local, xlator_t *this,
ia_type = local->cont.lookup.bufs[success_children[0]].ia_type;
type = afr_transaction_type_get (ia_type);
xattrs = local->cont.lookup.xattrs;
+ gfid = &local->cont.lookup.buf.ia_gfid;
source = afr_lookup_select_read_child_by_txn_type (this, local, xattrs,
- type);
+ type, *gfid);
if (source < 0) {
gf_log (this->name, GF_LOG_DEBUG, "failed to select source "
"for %s", local->loc.path);
@@ -2131,8 +2176,14 @@ afr_lookup (call_frame_t *frame, xlator_t *this,
} else {
LOCK (&priv->read_child_lock);
{
- local->read_child_index = (++priv->read_child_rr)
- % (priv->child_count);
+ if (priv->hash_mode) {
+ local->read_child_index = -1;
+ }
+ else {
+ local->read_child_index =
+ (++priv->read_child_rr) %
+ (priv->child_count);
+ }
}
UNLOCK (&priv->read_child_lock);
local->cont.lookup.fresh_lookup = _gf_true;