summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorPranith Kumar K <pkarampu@redhat.com>2015-01-08 15:39:40 +0530
committerRaghavendra Bhat <raghavendra@redhat.com>2015-03-30 00:20:38 -0700
commitd1eb4f520b35c1057c7cb3427a51dd6ae75cc61f (patch)
tree521fcac11f8571dd395e0a028498c7bc55a13b95
parentae75db655b683bb47df1590c7815c90b7cbefe4b (diff)
cluster/ec: Handle CHILD UP/DOWN in all cases
Backport of http://review.gluster.org/9396 Problem: When all the bricks are down at the time of mounting the volume, the mount command hangs. Fix: 1. Ignore all CHILD_CONNECTING events coming from subvolumes. 2. On timer expiration (without enough up or down children) send CHILD_DOWN. 3. Once enough up or down subvolumes are detected, send the appropriate event. When the rest of the subvols go up/down without changing the overall ec-up/ec-down state, send CHILD_MODIFIED to parent subvols. BUG: 1188471 Change-Id: If92bd84107d49495cd104deb34601afe7f9b155c Signed-off-by: Pranith Kumar K <pkarampu@redhat.com> Reviewed-on: http://review.gluster.org/9551 Reviewed-by: Xavier Hernandez <xhernandez@datalab.es> Tested-by: Gluster Build System <jenkins@build.gluster.com> Reviewed-by: Raghavendra Bhat <raghavendra@redhat.com>
-rw-r--r--tests/basic/ec/ec-notify.t79
-rw-r--r--tests/include.rc1
-rw-r--r--xlators/cluster/ec/src/ec.c236
-rw-r--r--xlators/cluster/ec/src/ec.h2
4 files changed, 214 insertions, 104 deletions
diff --git a/tests/basic/ec/ec-notify.t b/tests/basic/ec/ec-notify.t
new file mode 100644
index 0000000..586be91
--- /dev/null
+++ b/tests/basic/ec/ec-notify.t
@@ -0,0 +1,79 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+
+# This test checks notify part of ec
+
+cleanup
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 disperse 3 redundancy 1 $H0:$B0/${V0}{0..2}
+TEST $CLI volume start $V0
+
+#First time mount tests.
+# When all the bricks are up, mount should succeed and up-children
+# count should be 3
+TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0;
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0
+TEST stat $M0
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
+
+# When the volume is stopped mount succeeds and up-children will be 0
+TEST $CLI volume stop $V0
+TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0;
+# Wait for 5 seconds; even after that, up_count should still show 0
+sleep 5;
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "0" ec_child_up_count $V0 0
+TEST ! stat $M0
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
+
+# When 2 bricks are up, mount should succeed and up-children
+# count should be 2
+
+TEST $CLI volume start $V0
+TEST kill_brick $V0 $H0 $B0/${V0}2
+TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0;
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "2" ec_child_up_count $V0 0
+TEST stat $M0
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
+
+# When only 1 brick is up, mount succeeds but accessing it (stat) should fail.
+TEST kill_brick $V0 $H0 $B0/${V0}1
+TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0;
+# Wait for 5 seconds; even after that, up_count should still show 1
+sleep 5
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" ec_child_up_count $V0 0
+TEST ! stat $M0
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
+
+# Mount already succeeded. Test that bricks going up/down lead to the correct
+# state changes in ec.
+TEST $CLI volume stop $V0
+TEST $CLI volume start $V0 force
+TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0;
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0
+TEST touch $M0/a
+
+# kill 1 brick and the up_count should become 2, fops should still succeed
+TEST kill_brick $V0 $H0 $B0/${V0}1
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "2" ec_child_up_count $V0 0
+TEST touch $M0/b
+
+# kill one more brick and the up_count should become 1, fops should fail
+TEST kill_brick $V0 $H0 $B0/${V0}2
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" ec_child_up_count $V0 0
+TEST ! touch $M0/c
+
+# kill one more brick and the up_count should become 0, fops should still fail
+TEST kill_brick $V0 $H0 $B0/${V0}0
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "0" ec_child_up_count $V0 0
+TEST ! touch $M0/c
+
+# Bring all the bricks up and see that up_count is 3 and fops are succeeding
+# again.
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0
+TEST touch $M0/c
+
+cleanup
diff --git a/tests/include.rc b/tests/include.rc
index 599ad1b..d4e0ed2 100644
--- a/tests/include.rc
+++ b/tests/include.rc
@@ -41,6 +41,7 @@ UMOUNT_TIMEOUT=5
statedumpdir=`gluster --print-statedumpdir`; # Default directory for statedump
CLI="gluster --mode=script --wignore";
+GFS="glusterfs --attribute-timeout=0 --entry-timeout=0";
mkdir -p $B0;
mkdir -p $M0 $M1;
diff --git a/xlators/cluster/ec/src/ec.c b/xlators/cluster/ec/src/ec.c
index bd3fbc7..bb79f2c 100644
--- a/xlators/cluster/ec/src/ec.c
+++ b/xlators/cluster/ec/src/ec.c
@@ -189,151 +189,179 @@ int32_t reconfigure(xlator_t * this, dict_t * options)
return -1;
}
-void ec_up(xlator_t * this, ec_t * ec)
+glusterfs_event_t
+ec_get_event_from_state (ec_t *ec)
{
- if (ec->timer != NULL)
- {
- gf_timer_call_cancel(this->ctx, ec->timer);
- ec->timer = NULL;
- }
+ int down_count = 0;
- if (!ec->up && (ec->xl_up_count >= ec->fragments))
- {
- if (ec->xl_up_count < ec->nodes)
- {
- gf_log("ec", GF_LOG_WARNING, "Starting volume with only %d bricks",
- ec->xl_up_count);
+ if (ec->xl_up_count >= ec->fragments)
+ return GF_EVENT_CHILD_UP;
+
+ down_count = ec->xl_notify_count - ec->xl_up_count;
+ if (down_count > ec->redundancy)
+ return GF_EVENT_CHILD_DOWN;
+
+ return GF_EVENT_MAXVAL;
+}
+
+void
+ec_up (xlator_t *this, ec_t *ec)
+{
+ if (ec->timer != NULL) {
+ gf_timer_call_cancel (this->ctx, ec->timer);
+ ec->timer = NULL;
}
ec->up = 1;
gf_log(this->name, GF_LOG_INFO, "Going UP");
-
- default_notify(this, GF_EVENT_CHILD_UP, NULL);
- }
}
-void ec_down(xlator_t * this, ec_t * ec)
+void
+ec_down (xlator_t *this, ec_t *ec)
{
- if (ec->timer != NULL)
- {
- gf_timer_call_cancel(this->ctx, ec->timer);
- ec->timer = NULL;
- }
+ if (ec->timer != NULL) {
+ gf_timer_call_cancel(this->ctx, ec->timer);
+ ec->timer = NULL;
+ }
- if (ec->up)
- {
ec->up = 0;
gf_log(this->name, GF_LOG_INFO, "Going DOWN");
-
- default_notify(this, GF_EVENT_CHILD_DOWN, NULL);
- }
}
-void ec_notify_up_cbk(void * data)
+void
+ec_notify_down (void *data)
{
- ec_t * ec = data;
-
- LOCK(&ec->lock);
-
- if (ec->timer != NULL)
- {
- ec_up(ec->xl, ec);
- }
+ ec_t *ec = data;
- UNLOCK(&ec->lock);
+ LOCK(&ec->lock);
+ {
+ if (!ec->timer) {
+ /*
+ * Either child_up/child_down is already sent to parent
+ * This is a spurious wake up.
+ */
+ goto unlock;
+ }
+
+ gf_timer_call_cancel (ec->xl->ctx, ec->timer);
+ ec->timer = NULL;
+
+ if (GF_EVENT_MAXVAL == ec_get_event_from_state (ec)) {
+ /* Change the state as if the bricks are down */
+ ec->xl_notify = (1ULL << ec->nodes) - 1ULL;
+ ec->xl_notify_count = ec->nodes;
+ default_notify (ec->xl, GF_EVENT_CHILD_DOWN, NULL);
+ }
+ }
+unlock:
+ UNLOCK(&ec->lock);
}
-int32_t ec_notify_up(xlator_t * this, ec_t * ec, int32_t idx)
+void
+ec_launch_notify_child_down_timer (xlator_t *this, ec_t *ec)
{
- struct timespec delay = {0, };
-
- if (((ec->xl_up >> idx) & 1) == 0)
- {
- ec->xl_up |= 1ULL << idx;
- ec->xl_up_count++;
-
- gf_log("ec", GF_LOG_DEBUG, "Child %d is UP (%lX, %u)", idx, ec->xl_up,
- ec->xl_up_count);
+ struct timespec delay = {0, };
- if (ec->xl_up_count == ec->fragments)
- {
- gf_log("ec", GF_LOG_DEBUG, "Initiating up timer");
-
- delay.tv_sec = 5;
- delay.tv_nsec = 0;
- ec->timer = gf_timer_call_after(this->ctx, delay, ec_notify_up_cbk,
- ec);
- if (ec->timer == NULL)
- {
- gf_log(this->name, GF_LOG_ERROR, "Cannot create timer for "
- "delayed initialization");
-
- return ENOMEM;
- }
+ gf_log (this->name, GF_LOG_DEBUG, "Initiating child-down timer");
+ delay.tv_sec = 10;
+ delay.tv_nsec = 0;
+ ec->timer = gf_timer_call_after (this->ctx, delay, ec_notify_down, ec);
+ if (ec->timer == NULL) {
+ gf_log(this->name, GF_LOG_ERROR, "Cannot create timer "
+ "for delayed initialization");
}
- else if (ec->xl_up_count == ec->nodes)
- {
- ec_up(this, ec);
+}
+
+void
+ec_handle_up (xlator_t *this, ec_t *ec, int32_t idx)
+{
+ if (((ec->xl_notify >> idx) & 1) == 0) {
+ ec->xl_notify |= 1ULL << idx;
+ ec->xl_notify_count++;
}
- }
- return EAGAIN;
+ if (((ec->xl_up >> idx) & 1) == 0) { /* Ignore duplicate UP events */
+ ec->xl_up |= 1ULL << idx;
+ ec->xl_up_count++;
+ }
}
-int32_t ec_notify_down(xlator_t * this, ec_t * ec, int32_t idx)
+void
+ec_handle_down (xlator_t *this, ec_t *ec, int32_t idx)
{
- if (((ec->xl_up >> idx) & 1) != 0)
- {
- gf_log("ec", GF_LOG_DEBUG, "Child %d is DOWN", idx);
-
- ec->xl_up ^= 1ULL << idx;
- if (ec->xl_up_count-- == ec->fragments)
- {
- ec_down(this, ec);
+ if (((ec->xl_notify >> idx) & 1) == 0) {
+ ec->xl_notify |= 1ULL << idx;
+ ec->xl_notify_count++;
}
- }
- return EAGAIN;
+ if (((ec->xl_up >> idx) & 1) != 0) { /* Duplicate event */
+ gf_log(this->name, GF_LOG_DEBUG, "Child %d is DOWN", idx);
+
+ ec->xl_up ^= 1ULL << idx;
+ ec->xl_up_count--;
+ }
}
-int32_t notify(xlator_t * this, int32_t event, void * data, ...)
+int32_t
+notify (xlator_t *this, int32_t event, void *data, ...)
{
ec_t * ec = this->private;
int32_t idx = 0;
int32_t error = 0;
+ glusterfs_event_t old_event = GF_EVENT_MAXVAL;
+ glusterfs_event_t new_event = GF_EVENT_MAXVAL;
+
+ LOCK (&ec->lock);
+
+ if (event == GF_EVENT_PARENT_UP) {
+ /*
+ * Start a timer which sends CHILD_DOWN event to parent
+ * xlator to prevent the 'mount' syscall from hanging.
+ */
+ ec_launch_notify_child_down_timer (this, ec);
+ goto unlock;
+ }
- LOCK(&ec->lock);
-
- for (idx = 0; idx < ec->nodes; idx++)
- {
- if (ec->xl_list[idx] == data)
- {
- break;
+ for (idx = 0; idx < ec->nodes; idx++) {
+ if (ec->xl_list[idx] == data)
+ break;
}
- }
- gf_log("ec", GF_LOG_TRACE, "NOTIFY(%d): %p, %d", event, data, idx);
+ gf_log (this->name, GF_LOG_TRACE, "NOTIFY(%d): %p, %d",
+ event, data, idx);
- if (idx < ec->nodes)
- {
- if (event == GF_EVENT_CHILD_UP)
- {
- error = ec_notify_up(this, ec, idx);
- }
- else if (event == GF_EVENT_CHILD_DOWN)
- {
- error = ec_notify_down(this, ec, idx);
- }
- }
+ if (idx < ec->nodes) { /* CHILD_* events */
- UNLOCK(&ec->lock);
+ old_event = ec_get_event_from_state (ec);
- if (error == 0)
- {
- return default_notify(this, event, data);
- }
+ if (event == GF_EVENT_CHILD_UP) {
+ ec_handle_up (this, ec, idx);
+ } else if (event == GF_EVENT_CHILD_DOWN) {
+ ec_handle_down (this, ec, idx);
+ }
- return 0;
+ new_event = ec_get_event_from_state (ec);
+
+ if (new_event == GF_EVENT_CHILD_UP && !ec->up) {
+ ec_up (this, ec);
+ } else if (new_event == GF_EVENT_CHILD_DOWN && ec->up) {
+ ec_down (this, ec);
+ }
+
+ if ((new_event == old_event) && (new_event != GF_EVENT_MAXVAL))
+ new_event = GF_EVENT_CHILD_MODIFIED;
+
+ event = GF_EVENT_MAXVAL;/* Take care of notifying inside lock */
+ if (new_event != GF_EVENT_MAXVAL)
+ error = default_notify (this, new_event, data);
+ }
+unlock:
+ UNLOCK (&ec->lock);
+
+ if (event != GF_EVENT_MAXVAL)
+ return default_notify (this, event, data);
+
+ return error;
}
int32_t init(xlator_t * this)
diff --git a/xlators/cluster/ec/src/ec.h b/xlators/cluster/ec/src/ec.h
index 2a042ae..6a97698 100644
--- a/xlators/cluster/ec/src/ec.h
+++ b/xlators/cluster/ec/src/ec.h
@@ -36,6 +36,8 @@ struct _ec
uint32_t idx;
uint32_t xl_up_count;
uintptr_t xl_up;
+ uint32_t xl_notify_count;
+ uintptr_t xl_notify;
uintptr_t node_mask;
xlator_t ** xl_list;
gf_lock_t lock;