summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--tests/bugs/ec/bug-1547662.t41
-rw-r--r--xlators/cluster/ec/src/ec-heal.c9
-rw-r--r--xlators/cluster/ec/src/ec-heald.c27
-rw-r--r--xlators/cluster/ec/src/ec-heald.h4
-rw-r--r--xlators/cluster/ec/src/ec.c101
5 files changed, 134 insertions, 48 deletions
diff --git a/tests/bugs/ec/bug-1547662.t b/tests/bugs/ec/bug-1547662.t
new file mode 100644
index 00000000000..5748218587e
--- /dev/null
+++ b/tests/bugs/ec/bug-1547662.t
@@ -0,0 +1,41 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+
+# Immediately after replace-brick, trusted.ec.version will be absent, so if it
+# is present we can assume that heal was started on root
+function root_heal_attempted {
+ if [ -z $(get_hex_xattr trusted.ec.version $1) ]; then
+ echo "N"
+ else
+ echo "Y"
+ fi
+}
+
+cleanup
+
+TEST glusterd
+TEST pidof glusterd
+TEST ${CLI} volume create ${V0} disperse 6 redundancy 2 ${H0}:${B0}/${V0}{0..5}
+TEST ${CLI} volume start ${V0}
+TEST ${GFS} --volfile-server ${H0} --volfile-id ${V0} ${M0}
+EXPECT_WITHIN ${CHILD_UP_TIMEOUT} "6" ec_child_up_count ${V0} 0
+
+TEST mkdir ${M0}/base
+TEST mkdir ${M0}/base/dir.{1,2}
+TEST mkdir ${M0}/base/dir.{1,2}/dir.{1,2}
+TEST mkdir ${M0}/base/dir.{1,2}/dir.{1,2}/dir.{1,2}
+TEST mkdir ${M0}/base/dir.{1,2}/dir.{1,2}/dir.{1,2}/dir.{1,2}
+TEST mkdir ${M0}/base/dir.{1,2}/dir.{1,2}/dir.{1,2}/dir.{1,2}/dir.{1,2}
+TEST mkdir ${M0}/base/dir.{1,2}/dir.{1,2}/dir.{1,2}/dir.{1,2}/dir.{1,2}/dir.{1,2}
+
+TEST ${CLI} volume replace-brick ${V0} ${H0}:${B0}/${V0}5 ${H0}:${B0}/${V0}6 commit force
+EXPECT_WITHIN ${CHILD_UP_TIMEOUT} "6" ec_child_up_count ${V0} 0
+EXPECT_WITHIN ${PROCESS_UP_TIMEOUT} "Y" glustershd_up_status
+EXPECT_WITHIN ${CHILD_UP_TIMEOUT} "6" ec_child_up_count_shd ${V0} 0
+EXPECT_WITHIN ${HEAL_TIMEOUT} "Y" root_heal_attempted ${B0}/${V0}6
+EXPECT_WITHIN ${HEAL_TIMEOUT} "^0$" get_pending_heal_count ${V0}
+EXPECT "^127$" echo $(find ${B0}/${V0}6/base -type d | wc -l)
+
+cleanup;
diff --git a/xlators/cluster/ec/src/ec-heal.c b/xlators/cluster/ec/src/ec-heal.c
index 6562adf9e24..d1e40607e33 100644
--- a/xlators/cluster/ec/src/ec-heal.c
+++ b/xlators/cluster/ec/src/ec-heal.c
@@ -25,6 +25,7 @@
#include "ec-combine.h"
#include "ec-method.h"
#include "ec-fops.h"
+#include "ec-heald.h"
#define alloca0(size) ({void *__ptr; __ptr = alloca(size); memset(__ptr, 0, size); __ptr; })
#define EC_COUNT(array, max) ({int __i; int __res = 0; for (__i = 0; __i < max; __i++) if (array[__i]) __res++; __res; })
@@ -2769,6 +2770,14 @@ ec_replace_heal (ec_t *ec, inode_t *inode)
gf_msg_debug (ec->xl->name, 0,
"Heal failed for replace brick ret = %d", ret);
+ /* Once the root inode has been checked, it might have triggered a
+ * self-heal on it after a replace brick command or for some other
+ * reason. It can also happen that the volume already had damaged
+ * files in the index, even if the heal on the root directory failed.
+ * In both cases we need to wake all index healers to continue
+ * healing remaining entries that are marked as dirty. */
+ ec_shd_index_healer_wake(ec);
+
loc_wipe (&loc);
return ret;
}
diff --git a/xlators/cluster/ec/src/ec-heald.c b/xlators/cluster/ec/src/ec-heald.c
index b4fa6f87189..a703379a59b 100644
--- a/xlators/cluster/ec/src/ec-heald.c
+++ b/xlators/cluster/ec/src/ec-heald.c
@@ -184,8 +184,19 @@ ec_shd_index_purge (xlator_t *subvol, inode_t *inode, char *name)
int
ec_shd_selfheal (struct subvol_healer *healer, int child, loc_t *loc)
{
- return syncop_getxattr (healer->this, loc, NULL, EC_XATTR_HEAL, NULL,
- NULL);
+ int32_t ret;
+
+ ret = syncop_getxattr (healer->this, loc, NULL, EC_XATTR_HEAL, NULL,
+ NULL);
+ if ((ret >= 0) && (loc->inode->ia_type == IA_IFDIR)) {
+ /* If we have just healed a directory, it's possible that
+ * other index entries have appeared to be healed. We put a
+ * mark so that we can check it later and restart a scan
+ * without delay. */
+ healer->rerun = _gf_true;
+ }
+
+ return ret;
}
@@ -472,11 +483,15 @@ ec_shd_index_healer_spawn (xlator_t *this, int subvol)
}
void
-ec_selfheal_childup (ec_t *ec, int child)
+ec_shd_index_healer_wake(ec_t *ec)
{
- if (!ec->shd.iamshd)
- return;
- ec_shd_index_healer_spawn (ec->xl, child);
+ int32_t i;
+
+ for (i = 0; i < ec->nodes; i++) {
+ if (((ec->xl_up >> i) & 1) != 0) {
+ ec_shd_index_healer_spawn(ec->xl, i);
+ }
+ }
}
int
diff --git a/xlators/cluster/ec/src/ec-heald.h b/xlators/cluster/ec/src/ec-heald.h
index 4ae02e2df3c..2a8488124c4 100644
--- a/xlators/cluster/ec/src/ec-heald.h
+++ b/xlators/cluster/ec/src/ec-heald.h
@@ -20,6 +20,8 @@ ec_xl_op (xlator_t *this, dict_t *input, dict_t *output);
int
ec_selfheal_daemon_init (xlator_t *this);
-void ec_selfheal_childup (ec_t *ec, int child);
+
+void
+ec_shd_index_healer_wake(ec_t *ec);
#endif /* __EC_HEALD_H__ */
diff --git a/xlators/cluster/ec/src/ec.c b/xlators/cluster/ec/src/ec.c
index 59b4aeefe69..f25760049c3 100644
--- a/xlators/cluster/ec/src/ec.c
+++ b/xlators/cluster/ec/src/ec.c
@@ -322,7 +322,7 @@ ec_get_event_from_state (ec_t *ec)
/* If ec is up but some subvolumes are yet to notify, give
* grace time for other subvols to notify to prevent start of
* I/O which may result in self-heals */
- if (ec->timer && ec->xl_notify_count < ec->nodes)
+ if (ec->xl_notify_count < ec->nodes)
return GF_EVENT_MAXVAL;
return GF_EVENT_CHILD_UP;
@@ -344,8 +344,8 @@ ec_up (xlator_t *this, ec_t *ec)
}
ec->up = 1;
- gf_msg (this->name, GF_LOG_INFO, 0,
- EC_MSG_EC_UP, "Going UP");
+ gf_msg (this->name, GF_LOG_INFO, 0, EC_MSG_EC_UP, "Going UP");
+
gf_event (EVENT_EC_MIN_BRICKS_UP, "subvol=%s", this->name);
}
@@ -358,8 +358,8 @@ ec_down (xlator_t *this, ec_t *ec)
}
ec->up = 0;
- gf_msg (this->name, GF_LOG_INFO, 0,
- EC_MSG_EC_DOWN, "Going DOWN");
+ gf_msg (this->name, GF_LOG_INFO, 0, EC_MSG_EC_DOWN, "Going DOWN");
+
gf_event (EVENT_EC_MIN_BRICKS_NOT_UP, "subvol=%s", this->name);
}
@@ -383,31 +383,38 @@ ec_notify_cbk (void *data)
gf_timer_call_cancel (ec->xl->ctx, ec->timer);
ec->timer = NULL;
+ /* The timeout has expired, so any subvolume that has not
+ * already reported its state, will be considered to be down.
+ * We mark as if all bricks had reported. */
+ ec->xl_notify = (1ULL << ec->nodes) - 1ULL;
+ ec->xl_notify_count = ec->nodes;
+
+ /* Since we have marked all subvolumes as notified, it's
+ * guaranteed that ec_get_event_from_state() will return
+ * CHILD_UP or CHILD_DOWN, but not MAXVAL. */
event = ec_get_event_from_state (ec);
- /* If event is still MAXVAL then enough subvolumes didn't
- * notify, treat it as CHILD_DOWN. */
- if (event == GF_EVENT_MAXVAL) {
- event = GF_EVENT_CHILD_DOWN;
- ec->xl_notify = (1ULL << ec->nodes) - 1ULL;
- ec->xl_notify_count = ec->nodes;
- } else if (event == GF_EVENT_CHILD_UP) {
- /* Rest of the bricks are still not coming up,
- * notify that ec is up. Files/directories will be
- * healed as in when they come up. */
+ if (event == GF_EVENT_CHILD_UP) {
+ /* We are ready to bring the volume up. If there are
+ * still bricks DOWN, they will be healed when they
+ * come up. */
ec_up (ec->xl, ec);
}
- /* CHILD_DOWN should not come here as no grace period is given
- * for notifying CHILD_DOWN. */
-
propagate = _gf_true;
}
unlock:
UNLOCK(&ec->lock);
if (propagate) {
+ if ((event == GF_EVENT_CHILD_UP) && ec->shd.iamshd) {
+ /* We have just brought the volume UP, so we trigger
+ * a self-heal check on the root directory. */
+ ec_launch_replace_heal (ec);
+ }
+
default_notify (ec->xl, event, NULL);
}
+
}
void
@@ -442,7 +449,7 @@ ec_pending_fops_completed(ec_t *ec)
}
}
-static void
+static gf_boolean_t
ec_set_up_state(ec_t *ec, uintptr_t index_mask, uintptr_t new_state)
{
uintptr_t current_state = 0;
@@ -455,7 +462,11 @@ ec_set_up_state(ec_t *ec, uintptr_t index_mask, uintptr_t new_state)
if (current_state != new_state) {
ec->xl_up ^= index_mask;
ec->xl_up_count += (current_state ? -1 : 1);
+
+ return _gf_true;
}
+
+ return _gf_false;
}
static gf_boolean_t
@@ -498,15 +509,16 @@ ec_upcall(ec_t *ec, struct gf_upcall *upcall)
int32_t
ec_notify (xlator_t *this, int32_t event, void *data, void *data2)
{
- ec_t *ec = this->private;
- int32_t idx = 0;
- int32_t error = 0;
- glusterfs_event_t old_event = GF_EVENT_MAXVAL;
- dict_t *input = NULL;
- dict_t *output = NULL;
- gf_boolean_t propagate = _gf_true;
- int32_t orig_event = event;
- uintptr_t mask = 0;
+ ec_t *ec = this->private;
+ int32_t idx = 0;
+ int32_t error = 0;
+ glusterfs_event_t old_event = GF_EVENT_MAXVAL;
+ dict_t *input = NULL;
+ dict_t *output = NULL;
+ gf_boolean_t propagate = _gf_true;
+ gf_boolean_t needs_shd_check = _gf_false;
+ int32_t orig_event = event;
+ uintptr_t mask = 0;
gf_msg_trace (this->name, 0, "NOTIFY(%d): %p, %p",
event, data, data2);
@@ -529,8 +541,6 @@ ec_notify (xlator_t *this, int32_t event, void *data, void *data2)
for (idx = 0; idx < ec->nodes; idx++) {
if (ec->xl_list[idx] == data) {
- if (event == GF_EVENT_CHILD_UP)
- ec_selfheal_childup (ec, idx);
break;
}
}
@@ -556,17 +566,27 @@ ec_notify (xlator_t *this, int32_t event, void *data, void *data2)
mask = 1ULL << idx;
if (event == GF_EVENT_CHILD_UP) {
- ec_set_up_state(ec, mask, mask);
+ /* We need to trigger a selfheal if a brick changes
+ * to UP state. */
+ needs_shd_check = ec_set_up_state(ec, mask, mask);
} else if (event == GF_EVENT_CHILD_DOWN) {
- ec_set_up_state(ec, mask, 0);
+ ec_set_up_state(ec, mask, 0);
}
event = ec_get_event_from_state (ec);
- if (event == GF_EVENT_CHILD_UP && !ec->up) {
- ec_up (this, ec);
- } else if (event == GF_EVENT_CHILD_DOWN && ec->up) {
- ec_down (this, ec);
+ if (event == GF_EVENT_CHILD_UP) {
+ if (!ec->up) {
+ ec_up (this, ec);
+ }
+ } else {
+ /* If the volume is not UP, it's irrelevant if one
+ * brick has come up. We cannot heal anything. */
+ needs_shd_check = _gf_false;
+
+ if ((event == GF_EVENT_CHILD_DOWN) && ec->up) {
+ ec_down (this, ec);
+ }
}
if (event != GF_EVENT_MAXVAL) {
@@ -585,14 +605,13 @@ unlock:
done:
if (propagate) {
+ if (needs_shd_check && ec->shd.iamshd) {
+ ec_launch_replace_heal (ec);
+ }
+
error = default_notify (this, event, data);
}
- if (ec->shd.iamshd &&
- ec->xl_notify_count == ec->nodes &&
- event == GF_EVENT_CHILD_UP) {
- ec_launch_replace_heal (ec);
- }
out:
return error;
}