From bd7f4451aef70c4c968d3ca4e5996ffc96cf64fa Mon Sep 17 00:00:00 2001 From: Pranith Kumar K Date: Sun, 1 Feb 2015 15:03:46 +0530 Subject: cluster/ec: Wait for all bricks to notify before notifying parent Backport of http://review.gluster.org/9523 This is to prevent spurious heals that can result in self-heal. BUG: 1188471 Change-Id: Iaea335d59431d8d85a236963a365f5c791fc7c49 Signed-off-by: Pranith Kumar K Reviewed-on: http://review.gluster.org/9552 Reviewed-by: Xavier Hernandez Tested-by: Gluster Build System Reviewed-by: Raghavendra Bhat --- xlators/cluster/ec/src/ec.c | 48 ++++++++++++++++++++++++++++++++------------- 1 file changed, 34 insertions(+), 14 deletions(-) diff --git a/xlators/cluster/ec/src/ec.c b/xlators/cluster/ec/src/ec.c index bb79f2cf02c..1d681d14c84 100644 --- a/xlators/cluster/ec/src/ec.c +++ b/xlators/cluster/ec/src/ec.c @@ -192,14 +192,21 @@ int32_t reconfigure(xlator_t * this, dict_t * options) glusterfs_event_t ec_get_event_from_state (ec_t *ec) { - int down_count = 0; + int down_count = 0; - if (ec->xl_up_count >= ec->fragments) - return GF_EVENT_CHILD_UP; + if (ec->xl_up_count >= ec->fragments) { + /* If ec is up but some subvolumes are yet to notify, give + * grace time for other subvols to notify to prevent start of + * I/O which may result in self-heals */ + if (ec->timer && ec->xl_notify_count < ec->nodes) + return GF_EVENT_MAXVAL; - down_count = ec->xl_notify_count - ec->xl_up_count; - if (down_count > ec->redundancy) - return GF_EVENT_CHILD_DOWN; + return GF_EVENT_CHILD_UP; + } else { + down_count = ec->xl_notify_count - ec->xl_up_count; + if (down_count > ec->redundancy) + return GF_EVENT_CHILD_DOWN; + } return GF_EVENT_MAXVAL; } @@ -229,9 +236,10 @@ ec_down (xlator_t *this, ec_t *ec) } void -ec_notify_down (void *data) +ec_notify_cbk (void *data) { ec_t *ec = data; + glusterfs_event_t event = GF_EVENT_MAXVAL; LOCK(&ec->lock); { @@ -246,26 +254,38 @@ ec_notify_down (void *data) gf_timer_call_cancel (ec->xl->ctx, ec->timer); 
ec->timer = NULL; - if (GF_EVENT_MAXVAL == ec_get_event_from_state (ec)) { - /* Change the state as if the bricks are down */ + event = ec_get_event_from_state (ec); + /* If event is still MAXVAL then enough subvolumes didn't + * notify, treat it as CHILD_DOWN. */ + if (event == GF_EVENT_MAXVAL) { + event = GF_EVENT_CHILD_DOWN; ec->xl_notify = (1ULL << ec->nodes) - 1ULL; ec->xl_notify_count = ec->nodes; - default_notify (ec->xl, GF_EVENT_CHILD_DOWN, NULL); + } else if (event == GF_EVENT_CHILD_UP) { + /* Rest of the bricks are still not coming up, + * notify that ec is up. Files/directories will be + * healed as and when they come up. */ + ec_up (ec->xl, ec); } + + /* CHILD_DOWN should not come here as no grace period is given + * for notifying CHILD_DOWN. */ + + default_notify (ec->xl, event, NULL); } unlock: UNLOCK(&ec->lock); } void -ec_launch_notify_child_down_timer (xlator_t *this, ec_t *ec) +ec_launch_notify_timer (xlator_t *this, ec_t *ec) { struct timespec delay = {0, }; gf_log (this->name, GF_LOG_DEBUG, "Initiating child-down timer"); delay.tv_sec = 10; delay.tv_nsec = 0; - ec->timer = gf_timer_call_after (this->ctx, delay, ec_notify_down, ec); + ec->timer = gf_timer_call_after (this->ctx, delay, ec_notify_cbk, ec); if (ec->timer == NULL) { gf_log(this->name, GF_LOG_ERROR, "Cannot create timer " "for delayed initialization"); @@ -315,10 +335,10 @@ notify (xlator_t *this, int32_t event, void *data, ...) if (event == GF_EVENT_PARENT_UP) { /* - * Start a timer which sends CHILD_DOWN event to parent + * Start a timer which sends appropriate event to parent * xlator to prevent the 'mount' syscall from hanging. */ - ec_launch_notify_child_down_timer (this, ec); + ec_launch_notify_timer (this, ec); goto unlock; } -- cgit