summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorRichard Wareing <rwareing@fb.com>2015-08-21 21:44:44 -0700
committerKevin Vigor <kvigor@fb.com>2017-03-06 19:53:31 -0500
commitf6cc23fb1d8f157ec598e0bbb63081c881388380 (patch)
treebdb0a579a0a548e3e2113d5641ffa951bb3fbaa9
parent259d65ffb7296415cb9110ba1877d0378265bf52 (diff)
cluster/afr: AFR2 discovery should always do entry heal flow
Summary: - Fixes a case where, when a brick is completely wiped, the AFR2 discovery mechanism would potentially (1/R chance, where R is your replication factor) pin an NFSd or client to the wiped brick. This would in turn prevent the client from seeing the contents of the (degraded) subvolume. - The fix proposed in this patch is to force the entry self-heal code path when the discovery process happens, and furthermore to force a conservative merge in the case where no brick is found to be degraded. - This also restores the property of our 3.4.x builds whereby bricks automagically rebuild via the SHDs without having to run any sort of "full heal". SHDs are given enough signal via this patch to figure out what they need to heal. Test Plan: Run "prove -v tests/bugs/fb8149516.t" Output: https://phabricator.fb.com/P19989638 Prove test showing failed run on v3.6.3-fb_10 without the patch -> https://phabricator.fb.com/P19989643 Reviewers: dph, moox, sshreyas Reviewed By: sshreyas FB-commit-id: 3d6f171 Change-Id: I7e0dec82c160a2981837d3f07e3aa6f6a701703f Signed-off-by: Kevin Vigor <kvigor@fb.com> Reviewed-on: https://review.gluster.org/16862 CentOS-regression: Gluster Build System <jenkins@build.gluster.org> NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org> Smoke: Gluster Build System <jenkins@build.gluster.org> Reviewed-by: Shreyas Siravara <sshreyas@fb.com>
-rw-r--r--tests/bugs/fb8149516.t40
-rw-r--r--xlators/cluster/afr/src/afr-common.c18
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-common.c32
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-entry.c4
-rw-r--r--xlators/cluster/afr/src/afr.c1
-rw-r--r--xlators/cluster/afr/src/afr.h2
6 files changed, 83 insertions, 14 deletions
diff --git a/tests/bugs/fb8149516.t b/tests/bugs/fb8149516.t
new file mode 100644
index 00000000000..54372794c6f
--- /dev/null
+++ b/tests/bugs/fb8149516.t
@@ -0,0 +1,40 @@
+#!/bin/bash
+
+. $(dirname $0)/../include.rc
+. $(dirname $0)/../volume.rc
+
+cleanup;
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
+TEST $CLI volume set $V0 cluster.read-subvolume-index 2
+TEST $CLI volume set $V0 cluster.background-self-heal-count 0
+TEST $CLI volume set $V0 cluster.heal-timeout 30
+TEST $CLI volume set $V0 cluster.choose-local off
+TEST $CLI volume set $V0 cluster.entry-self-heal off
+TEST $CLI volume set $V0 cluster.data-self-heal off
+TEST $CLI volume set $V0 cluster.metadata-self-heal off
+TEST $CLI volume set $V0 nfs.disable off
+TEST $CLI volume start $V0
+TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0
+cd $M0
+for i in {1..10}
+do
+ dd if=/dev/urandom of=testfile$i bs=1M count=1 2>/dev/null
+done
+cd ~
+TEST kill_brick $V0 $H0 $B0/${V0}2
+TEST rm -rf $B0/${V0}2/testfile*
+TEST rm -rf $B0/${V0}2/.glusterfs
+
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN 20 "1" afr_child_up_status_in_shd $V0 2
+
+# Verify we see all ten files when ls'ing, without the patch this should
+# return no files and fail.
+FILE_LIST=($(\ls $M0))
+TEST "((${#FILE_LIST[@]} == 10))"
+EXPECT_WITHIN 30 "0" get_pending_heal_count $V0
+
+cleanup
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index ebadba99a05..cfea53208c8 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -1085,7 +1085,8 @@ refresh_done:
int
afr_inode_refresh_done (call_frame_t *frame, xlator_t *this)
{
- call_frame_t *heal_frame = NULL;
+ afr_private_t *priv = NULL;
+ call_frame_t *heal_frame = NULL;
afr_local_t *local = NULL;
gf_boolean_t start_heal = _gf_false;
afr_local_t *heal_local = NULL;
@@ -1094,13 +1095,15 @@ afr_inode_refresh_done (call_frame_t *frame, xlator_t *this)
int err = 0;
local = frame->local;
+ priv = this->private;
ret = afr_replies_interpret (frame, this, local->refreshinode,
&start_heal);
err = afr_inode_refresh_err (frame, this);
- if (ret && afr_selfheal_enabled (this) && start_heal) {
+ if (priv->did_discovery == _gf_false ||
+ (afr_selfheal_enabled (this) && start_heal)) {
heal_frame = copy_frame (frame);
if (!heal_frame)
goto refresh_done;
@@ -2580,6 +2583,8 @@ unwind:
local->op_errno = ENOTCONN;
}
+ priv->did_discovery = _gf_true;
+
AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno,
local->inode, &local->replies[read_subvol].poststat,
local->replies[read_subvol].xdata,
@@ -2612,7 +2617,7 @@ afr_discover_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
local->replies[child_index].xdata = dict_ref (xdata);
}
- if (local->do_discovery && (op_ret == 0))
+ if (local->do_local_discovery && (op_ret == 0))
afr_attempt_local_discovery (this, child_index);
if (xdata) {
@@ -2717,12 +2722,12 @@ afr_discover (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req
if (!priv->root_inode)
priv->root_inode = inode_ref (loc->inode);
- if (priv->choose_local && !priv->did_discovery) {
+ if (priv->choose_local && !priv->did_local_discovery) {
/* Logic to detect which subvolumes of AFR are
local, in order to prefer them for reads
*/
- local->do_discovery = _gf_true;
- priv->did_discovery = _gf_true;
+ local->do_local_discovery = _gf_true;
+ priv->did_local_discovery = _gf_true;
}
}
@@ -4951,6 +4956,7 @@ afr_notify (xlator_t *this, int32_t event,
* that we could end up issuing N lookups to the first subvolume, and
* O(N^2) overall, but N is small for AFR so it shouldn't be an issue.
*/
+ priv->did_local_discovery = _gf_false;
priv->did_discovery = _gf_false;
latency_samples = child_xlator->client_latency.count;
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c
index c11ca11fdd9..bc3a3ee5ca1 100644
--- a/xlators/cluster/afr/src/afr-self-heal-common.c
+++ b/xlators/cluster/afr/src/afr-self-heal-common.c
@@ -1763,8 +1763,10 @@ afr_selfheal_unlocked_inspect (call_frame_t *frame, xlator_t *this,
afr_is_metadata_set (this, replies[i].xdata))
*metadata_selfheal = _gf_true;
- if (entry_selfheal && afr_is_entry_set (this, replies[i].xdata))
- *entry_selfheal = _gf_true;
+ if (priv->did_discovery == _gf_false ||
+ (entry_selfheal &&
+ afr_is_entry_set (this, replies[i].xdata)))
+ *entry_selfheal = _gf_true;
valid_cnt++;
if (valid_cnt == 1) {
@@ -1919,6 +1921,7 @@ afr_selfheal_newentry_mark (call_frame_t *frame, xlator_t *this, inode_t *inode,
{
int ret = 0;
int i = 0;
+ int source_count = 0;
afr_private_t *priv = NULL;
dict_t *xattr = NULL;
int **changelog = NULL;
@@ -1937,11 +1940,26 @@ afr_selfheal_newentry_mark (call_frame_t *frame, xlator_t *this, inode_t *inode,
if (!changelog)
goto out;
- for (i = 0; i < priv->child_count; i++) {
- if (!sources[i])
- continue;
- afr_selfheal_post_op (frame, this, inode, i, xattr, NULL);
- }
+ /* Pre-compute how many sources we have, if we made it in here
+ * without any sources defined, we are doing a conservative
+ * merge
+ */
+ for (i = 0; i < priv->child_count; i++) {
+ if (sources[i]) {
+ source_count++;
+ }
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ /* If there are no sources we are doing a conservative
+ * merge. In such a case ensure we mark the changelog
+ * on all replicas.
+ */
+ if (!sources[i] && source_count) {
+ continue;
+ }
+ afr_selfheal_post_op (frame, this, inode, i, xattr, NULL);
+ }
out:
if (changelog)
afr_matrix_cleanup (changelog, priv->child_count);
diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c
index 9ab566b8242..12a5940e7cd 100644
--- a/xlators/cluster/afr/src/afr-self-heal-entry.c
+++ b/xlators/cluster/afr/src/afr-self-heal-entry.c
@@ -443,7 +443,9 @@ __afr_selfheal_entry_finalize_source (xlator_t *this, unsigned char *sources,
sources_count = AFR_COUNT (sources, priv->child_count);
if ((AFR_CMP (locked_on, healed_sinks, priv->child_count) == 0)
- || !sources_count || afr_does_witness_exist (this, witness)) {
+ || !sources_count || afr_does_witness_exist (this, witness)
+ || (sources_count == priv->child_count &&
+ priv->did_discovery == _gf_false)) {
memset (sources, 0, sizeof (*sources) * priv->child_count);
afr_mark_active_sinks (this, sources, locked_on, healed_sinks);
diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c
index ae9b28c7fb4..6d4b3c73429 100644
--- a/xlators/cluster/afr/src/afr.c
+++ b/xlators/cluster/afr/src/afr.c
@@ -292,6 +292,7 @@ reconfigure (xlator_t *this, dict_t *options)
if (afr_set_favorite_child_policy (priv, fav_child_policy) == -1)
goto out;
+ priv->did_local_discovery = _gf_false;
priv->did_discovery = _gf_false;
ret = 0;
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index aa19f1eeb37..bbb444c7974 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -163,6 +163,7 @@ typedef struct _afr_private {
uint32_t event_generation;
gf_boolean_t choose_local;
+ gf_boolean_t did_local_discovery;
gf_boolean_t did_discovery;
uint64_t sh_readdir_size;
gf_boolean_t ensure_durability;
@@ -813,6 +814,7 @@ typedef struct _afr_local {
mode_t umask;
int xflag;
gf_boolean_t do_discovery;
+ gf_boolean_t do_local_discovery;
struct afr_reply *replies;
/* For client side background heals. */