summaryrefslogtreecommitdiffstats
path: root/xlators/cluster/afr/src/afr-common.c
diff options
context:
space:
mode:
authorKevin Vigor <kvigor@fb.com>2017-03-21 08:23:25 -0700
committerPranith Kumar Karampuri <pkarampu@redhat.com>2017-05-02 10:23:53 +0000
commit07cc8679cdf3b29680f4f105d0222da168d8bfc1 (patch)
treeda838a4f8afc045253300604e2536f97c533e41d /xlators/cluster/afr/src/afr-common.c
parent9374338f9c2f126c6608625f750d5ea1f7ef6a06 (diff)
Halo Replication feature for AFR translator
Summary: Halo Geo-replication is a feature which allows Gluster or NFS clients to write locally to their region (as defined by a latency "halo" or threshold if you like), and have their writes asynchronously propagate from their origin to the rest of the cluster. Clients can also write synchronously to the cluster simply by specifying a halo-latency which is very large (e.g. 10seconds) which will include all bricks. In other words, it allows clients to decide at mount time if they desire synchronous or asynchronous IO into a cluster and the cluster can support both of these modes to any number of clients simultaneously. There are a few new volume options due to this feature: halo-shd-latency: The threshold below which self-heal daemons will consider children (bricks) connected. halo-nfsd-latency: The threshold below which NFS daemons will consider children (bricks) connected. halo-latency: The threshold below which all other clients will consider children (bricks) connected. halo-min-replicas: The minimum number of replicas which are to be enforced regardless of latency specified in the above 3 options. If the number of children falls below this threshold the next best (chosen by latency) shall be swapped in. New FUSE mount options: halo-latency & halo-min-replicas: As descripted above. This feature combined with multi-threaded SHD support (D1271745) results in some pretty cool geo-replication possibilities. Operational Notes: - Global consistency is gaurenteed for synchronous clients, this is provided by the existing entry-locking mechanism. - Asynchronous clients on the other hand and merely consistent to their region. Writes & deletes will be protected via entry-locks as usual preventing concurrent writes into files which are undergoing replication. Read operations on the other hand should never block. - Writes are allowed from _any_ region and propagated from the origin to all other regions. The take away from this is care should be taken to ensure multiple writers do not write the same files resulting in a gfid split-brain which will require resolution via split-brain policies (majority, mtime & size). Recommended method for preventing this is using the nfs-auth feature to define which region for each share has RW permissions, tiers not in the origin region should have RO perms. TODO: - Synchronous clients (including the SHD) should choose clients from their own region as preferred sources for reads. Most of the plumbing is in place for this via the child_latency array. - Better GFID split brain handling & better dent type split brain handling (i.e. create a trash can and move the offending files into it). - Tagging in addition to latency as a means of defining which children you wish to synchronously write to Test Plan: - The usual suspects, clang, gcc w/ address sanitizer & valgrind - Prove tests Reviewers: jackl, dph, cjh, meyering Reviewed By: meyering Subscribers: ethanr Differential Revision: https://phabricator.fb.com/D1272053 Tasks: 4117827 Change-Id: I694a9ab429722da538da171ec528406e77b5e6d1 BUG: 1428061 Signed-off-by: Kevin Vigor <kvigor@fb.com> Reviewed-on: http://review.gluster.org/16099 Reviewed-on: https://review.gluster.org/16177 Tested-by: Pranith Kumar Karampuri <pkarampu@redhat.com> Smoke: Gluster Build System <jenkins@build.gluster.org> NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org> CentOS-regression: Gluster Build System <jenkins@build.gluster.org> Reviewed-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
Diffstat (limited to 'xlators/cluster/afr/src/afr-common.c')
-rw-r--r--xlators/cluster/afr/src/afr-common.c364
1 files changed, 310 insertions, 54 deletions
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index ac834e9..17943d7 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -2531,7 +2531,6 @@ unwind:
return 0;
}
-
int
afr_lookup_entry_heal (call_frame_t *frame, xlator_t *this)
{
@@ -3227,7 +3226,7 @@ afr_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
}
UNLOCK (&frame->lock);
- call_count = afr_frame_return (frame);
+ call_count = afr_frame_return (frame);
if (call_count == 0)
AFR_STACK_UNWIND (flush, frame, local->op_ret,
@@ -4655,20 +4654,292 @@ __get_heard_from_all_status (xlator_t *this)
return heard_from_all;
}
+static int
+find_best_down_child (xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ int i = -1;
+ int32_t best_child = -1;
+ int64_t best_latency = INT64_MAX;
+
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (priv->child_up[i] &&
+ priv->child_latency[i] >= 0 &&
+ priv->child_latency[i] < best_latency) {
+ best_child = i;
+ best_latency = priv->child_latency[i];
+ }
+ }
+ if (best_child >= 0) {
+ gf_msg_debug (this->name, 0, "Found best down child (%d) "
+ "@ %ld ms latency", best_child, best_latency);
+ }
+ return best_child;
+}
+
+int
+find_worst_up_child (xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ int i = -1;
+ int32_t worst_child = -1;
+ int64_t worst_latency = INT64_MIN;
+
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (priv->child_up[i] &&
+ priv->child_latency[i] >= 0 &&
+ priv->child_latency[i] > worst_latency) {
+ worst_child = i;
+ worst_latency = priv->child_latency[i];
+ }
+ }
+ if (worst_child >= 0) {
+ gf_msg_debug (this->name, 0, "Found worst up child (%d)"
+ " @ %ld ms latency", worst_child, worst_latency);
+ }
+ return worst_child;
+}
+
+void
+__afr_handle_ping_event (xlator_t *this, xlator_t *child_xlator,
+ const int idx, int64_t halo_max_latency_msec, int32_t *event,
+ int64_t child_latency_msec)
+{
+ afr_private_t *priv = NULL;
+ int up_children = 0;
+
+ priv = this->private;
+
+ priv->child_latency[idx] = child_latency_msec;
+ gf_msg_debug (child_xlator->name, 0, "Client ping @ %ld ms",
+ child_latency_msec);
+
+ up_children = __afr_get_up_children_count (priv);
+
+ if (child_latency_msec > halo_max_latency_msec &&
+ priv->child_up[idx] == 1 &&
+ up_children > priv->halo_min_replicas) {
+ if ((up_children - 1) <
+ priv->halo_min_replicas) {
+ gf_log (child_xlator->name, GF_LOG_INFO,
+ "Overriding halo threshold, "
+ "min replicas: %d",
+ priv->halo_min_replicas);
+ } else {
+ gf_log (child_xlator->name, GF_LOG_INFO,
+ "Child latency (%ld ms) "
+ "exceeds halo threshold (%ld), "
+ "marking child down.",
+ child_latency_msec,
+ halo_max_latency_msec);
+ *event = GF_EVENT_CHILD_DOWN;
+ }
+ } else if (child_latency_msec < halo_max_latency_msec &&
+ priv->child_up[idx] == 0) {
+ if (up_children < priv->halo_max_replicas) {
+ gf_log (child_xlator->name, GF_LOG_INFO,
+ "Child latency (%ld ms) "
+ "below halo threshold (%ld), "
+ "marking child up.",
+ child_latency_msec,
+ halo_max_latency_msec);
+ *event = GF_EVENT_CHILD_UP;
+ } else {
+ gf_log (child_xlator->name, GF_LOG_INFO,
+ "Not marking child %d up, "
+ "max replicas (%d) reached.", idx,
+ priv->halo_max_replicas);
+ }
+ }
+}
+
+void
+__afr_handle_child_up_event (xlator_t *this, xlator_t *child_xlator,
+ const int idx, int64_t halo_max_latency_msec,
+ int32_t *event, int32_t *call_psh, int32_t *up_child)
+{
+ afr_private_t *priv = NULL;
+ int up_children = 0;
+ int worst_up_child = -1;
+
+ priv = this->private;
+
+ /*
+ * This only really counts if the child was never up
+ * (value = -1) or had been down (value = 0). See
+ * comment at GF_EVENT_CHILD_DOWN for a more detailed
+ * explanation.
+ */
+ if (priv->child_up[idx] != 1) {
+ priv->event_generation++;
+ }
+ priv->child_up[idx] = 1;
+
+ *call_psh = 1;
+ *up_child = idx;
+ up_children = __afr_get_up_children_count (priv);
+
+ /*
+ * Handle the edge case where we exceed
+ * halo_min_replicas and we've got a child which is
+ * marked up as it was helping to satisfy the
+ * halo_min_replicas even though it's latency exceeds
+ * halo_max_latency_msec.
+ */
+ if (up_children > priv->halo_min_replicas) {
+ worst_up_child = find_worst_up_child (this);
+ if (worst_up_child >= 0 &&
+ priv->child_latency[worst_up_child] >
+ halo_max_latency_msec) {
+ gf_msg_debug (this->name, 0, "Marking child %d down, "
+ "doesn't meet halo threshold (%ld), and > "
+ "halo_min_replicas (%d)",
+ worst_up_child, halo_max_latency_msec,
+ priv->halo_min_replicas);
+ priv->child_up[worst_up_child] = 0;
+ up_children--;
+ }
+ }
+ if (up_children > priv->halo_max_replicas &&
+ !priv->shd.iamshd) {
+ worst_up_child = find_worst_up_child (this);
+ if (worst_up_child < 0) {
+ worst_up_child = idx;
+ }
+ priv->child_up[worst_up_child] = 0;
+ up_children--;
+ gf_msg_debug (this->name, 0, "Marking child %d down, "
+ "up_children (%d) > halo_max_replicas (%d)",
+ worst_up_child, up_children, priv->halo_max_replicas);
+ }
+
+ if (up_children == 1) {
+ gf_msg (this->name, GF_LOG_INFO, 0, AFR_MSG_SUBVOL_UP,
+ "Subvolume '%s' came back up; "
+ "going online.",
+ child_xlator->name);
+ } else {
+ *event = GF_EVENT_SOME_DESCENDENT_UP;
+ }
+
+ priv->last_event[idx] = *event;
+}
+
+void
+__afr_handle_child_down_event (xlator_t *this, xlator_t *child_xlator,
+ int idx, int64_t child_latency_msec, int32_t *event,
+ int32_t *call_psh, int32_t *up_child)
+{
+ afr_private_t *priv = NULL;
+ int i = 0;
+ int up_children = 0;
+ int down_children = 0;
+ int best_down_child = -1;
+
+ priv = this->private;
+
+ /*
+ * If a brick is down when we start, we'll get a
+ * CHILD_DOWN to indicate its initial state. There
+ * was never a CHILD_UP in this case, so if we
+ * increment "down_count" the difference between than
+ * and "up_count" will no longer be the number of
+ * children that are currently up. This has serious
+ * implications e.g. for quorum enforcement, so we
+ * don't increment these values unless the event
+ * represents an actual state transition between "up"
+ * (value = 1) and anything else.
+ */
+ if (priv->child_up[idx] == 1) {
+ priv->event_generation++;
+ }
+
+ /*
+ * If this is an _actual_ CHILD_DOWN event, we
+ * want to set the child_latency to < 0 to indicate
+ * the child is really disconnected.
+ */
+ if (child_latency_msec < 0) {
+ priv->child_latency[idx] = child_latency_msec;
+ }
+ priv->child_up[idx] = 0;
+
+ up_children = __afr_get_up_children_count (priv);
+ /*
+ * Handle the edge case where we need to find the
+ * next best child (to mark up) as marking this child
+ * down would cause us to fall below halo_min_replicas.
+ * We will also force the SHD to heal this child _now_
+ * as we want it to be up to date if we are going to
+ * begin using it synchronously.
+ */
+ if (up_children < priv->halo_min_replicas) {
+ best_down_child = find_best_down_child (this);
+ if (best_down_child >= 0) {
+ gf_msg_debug (this->name, 0,
+ "Swapping out child %d for "
+ "child %d to satisfy halo_min_replicas (%d).",
+ idx, best_down_child, priv->halo_min_replicas);
+ priv->child_up[best_down_child] = 1;
+ *call_psh = 1;
+ *up_child = best_down_child;
+ }
+ }
+
+ for (i = 0; i < priv->child_count; i++)
+ if (priv->child_up[i] == 0)
+ down_children++;
+ if (down_children == priv->child_count) {
+ gf_msg (this->name, GF_LOG_ERROR, 0, AFR_MSG_SUBVOLS_DOWN,
+ "All subvolumes are down. Going "
+ "offline until atleast one of them "
+ "comes back up.");
+ } else {
+ *event = GF_EVENT_SOME_DESCENDENT_DOWN;
+ }
+ priv->last_event[idx] = *event;
+}
+
+static int64_t
+afr_get_halo_latency (xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ int64_t halo_max_latency_msec = 0;
+
+ priv = this->private;
+
+ if (priv->shd.iamshd) {
+ halo_max_latency_msec = priv->shd.halo_max_latency_msec;
+ } else if (priv->nfsd.iamnfsd) {
+ halo_max_latency_msec =
+ priv->nfsd.halo_max_latency_msec;
+ } else {
+ halo_max_latency_msec = priv->halo_max_latency_msec;
+ }
+ gf_msg_debug (this->name, 0, "Using halo latency %ld",
+ halo_max_latency_msec);
+ return halo_max_latency_msec;
+}
+
+
int32_t
afr_notify (xlator_t *this, int32_t event,
void *data, void *data2)
{
afr_private_t *priv = NULL;
+ xlator_t *child_xlator = NULL;
int i = -1;
- int up_children = 0;
- int down_children = 0;
int propagate = 0;
int had_heard_from_all = 0;
int have_heard_from_all = 0;
int idx = -1;
int ret = -1;
int call_psh = 0;
+ int up_child = -1;
dict_t *input = NULL;
dict_t *output = NULL;
gf_boolean_t had_quorum = _gf_false;
@@ -4677,6 +4948,10 @@ afr_notify (xlator_t *this, int32_t event,
struct gf_upcall_cache_invalidation *up_ci = NULL;
inode_table_t *itable = NULL;
inode_t *inode = NULL;
+ int64_t halo_max_latency_msec = 0;
+ int64_t child_latency_msec = -1;
+
+ child_xlator = (xlator_t *)data;
priv = this->private;
@@ -4701,7 +4976,7 @@ afr_notify (xlator_t *this, int32_t event,
* subsequent revalidate lookup happens on all the dht's subvolumes
* which triggers afr self-heals if any.
*/
- idx = find_child_index (this, data);
+ idx = find_child_index (this, child_xlator);
if (idx < 0) {
gf_msg (this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_CHILD_UP,
"Received child_up from invalid subvolume");
@@ -4710,6 +4985,30 @@ afr_notify (xlator_t *this, int32_t event,
had_quorum = priv->quorum_count && afr_has_quorum (priv->child_up,
this);
+ if (priv->halo_enabled) {
+ halo_max_latency_msec = afr_get_halo_latency (this);
+
+ if (event == GF_EVENT_CHILD_PING) {
+ /* Calculates the child latency and sets event
+ */
+ child_latency_msec = (int64_t)(uintptr_t)data2;
+ LOCK (&priv->lock);
+ {
+ __afr_handle_ping_event (this, child_xlator,
+ idx, halo_max_latency_msec, &event,
+ child_latency_msec);
+ }
+ UNLOCK (&priv->lock);
+ }
+ }
+
+ if (event == GF_EVENT_CHILD_PING) {
+ /* This is the only xlator that handles PING, no reason to
+ * propagate.
+ */
+ goto out;
+ }
+
if (event == GF_EVENT_TRANSLATOR_OP) {
LOCK (&priv->lock);
{
@@ -4736,57 +5035,15 @@ afr_notify (xlator_t *this, int32_t event,
propagate = 1;
break;
case GF_EVENT_CHILD_UP:
- /*
- * This only really counts if the child was never up
- * (value = -1) or had been down (value = 0). See
- * comment at GF_EVENT_CHILD_DOWN for a more detailed
- * explanation.
- */
- if (priv->child_up[idx] != 1) {
- priv->event_generation++;
- }
- priv->child_up[idx] = 1;
-
- call_psh = 1;
- up_children = __afr_get_up_children_count (priv);
- if (up_children == 1) {
- gf_msg (this->name, GF_LOG_INFO, 0,
- AFR_MSG_SUBVOL_UP,
- "Subvolume '%s' came back up; "
- "going online.", ((xlator_t *)data)->name);
- gf_event (EVENT_AFR_SUBVOL_UP,
- "subvol=%s", this->name);
-
- } else {
- event = GF_EVENT_SOME_DESCENDENT_UP;
- }
-
- priv->last_event[idx] = event;
-
+ __afr_handle_child_up_event (this, child_xlator,
+ idx, halo_max_latency_msec, &event, &call_psh,
+ &up_child);
break;
case GF_EVENT_CHILD_DOWN:
- if (priv->child_up[idx] == 1) {
- priv->event_generation++;
- }
- priv->child_up[idx] = 0;
-
- for (i = 0; i < priv->child_count; i++)
- if (priv->child_up[i] == 0)
- down_children++;
- if (down_children == priv->child_count) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- AFR_MSG_SUBVOLS_DOWN,
- "All subvolumes are down. Going offline "
- "until atleast one of them comes back up.");
- gf_event (EVENT_AFR_SUBVOLS_DOWN,
- "subvol=%s", this->name);
- } else {
- event = GF_EVENT_SOME_DESCENDENT_DOWN;
- }
-
- priv->last_event[idx] = event;
-
+ __afr_handle_child_down_event (this, child_xlator, idx,
+ child_latency_msec, &event, &call_psh,
+ &up_child);
break;
case GF_EVENT_CHILD_CONNECTING:
@@ -4839,7 +5096,6 @@ afr_notify (xlator_t *this, int32_t event,
had come up, propagate CHILD_UP, but only this time
*/
event = GF_EVENT_CHILD_DOWN;
- up_children = __afr_get_up_children_count (priv);
for (i = 0; i < priv->child_count; i++) {
if (priv->last_event[i] == GF_EVENT_CHILD_UP) {
event = GF_EVENT_CHILD_UP;