diff options
| author | Pranith Kumar K <pkarampu@redhat.com> | 2015-01-08 15:39:40 +0530 | 
|---|---|---|
| committer | Raghavendra Bhat <raghavendra@redhat.com> | 2015-03-30 00:20:38 -0700 | 
| commit | d1eb4f520b35c1057c7cb3427a51dd6ae75cc61f (patch) | |
| tree | 521fcac11f8571dd395e0a028498c7bc55a13b95 | |
| parent | ae75db655b683bb47df1590c7815c90b7cbefe4b (diff) | |
cluster/ec: Handle CHILD UP/DOWN in all cases
        Backport of http://review.gluster.org/9396
Problem:
When all the bricks are down at the time of mounting the volume, the mount
command hangs.
Fix:
1. Ignore all CHILD_CONNECTING events coming from subvolumes.
2. On timer expiration (without enough up or down children) send
   CHILD_DOWN.
3. Once enough up or down subvolumes are detected, send the appropriate event.
   When the rest of the subvols go up/down without changing the overall
   ec-up/ec-down state, send CHILD_MODIFIED to the parent subvols.
BUG: 1188471
Change-Id: If92bd84107d49495cd104deb34601afe7f9b155c
Signed-off-by: Pranith Kumar K <pkarampu@redhat.com>
Reviewed-on: http://review.gluster.org/9551
Reviewed-by: Xavier Hernandez <xhernandez@datalab.es>
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Raghavendra Bhat <raghavendra@redhat.com>
| -rw-r--r-- | tests/basic/ec/ec-notify.t | 79 | ||||
| -rw-r--r-- | tests/include.rc | 1 | ||||
| -rw-r--r-- | xlators/cluster/ec/src/ec.c | 236 | ||||
| -rw-r--r-- | xlators/cluster/ec/src/ec.h | 2 | 
4 files changed, 214 insertions, 104 deletions
diff --git a/tests/basic/ec/ec-notify.t b/tests/basic/ec/ec-notify.t new file mode 100644 index 00000000000..586be91bdbe --- /dev/null +++ b/tests/basic/ec/ec-notify.t @@ -0,0 +1,79 @@ +#!/bin/bash + +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc + +# This test checks notify part of ec + +cleanup +TEST glusterd +TEST pidof glusterd +TEST $CLI volume create $V0 disperse 3 redundancy 1 $H0:$B0/${V0}{0..2} +TEST $CLI volume start $V0 + +#First time mount tests. +# When all the bricks are up, mount should succeed and up-children +# count should be 3 +TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0; +EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0 +TEST stat $M0 +EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0 + +# When the volume is stopped mount succeeds and up-children will be 0 +TEST $CLI volume stop $V0 +TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0; +# Wait for 5 seconds even after that up_count should show 0 +sleep 5; +EXPECT_WITHIN $CHILD_UP_TIMEOUT "0" ec_child_up_count $V0 0 +TEST ! stat $M0 +EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0 + +# When 2 bricks are up, mount should succeed and up-children +# count should be 2 + +TEST $CLI volume start $V0 +TEST kill_brick $V0 $H0 $B0/${V0}2 +TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0; +EXPECT_WITHIN $CHILD_UP_TIMEOUT "2" ec_child_up_count $V0 0 +TEST stat $M0 +EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0 + +# When only 1 brick is up mount should fail. +TEST kill_brick $V0 $H0 $B0/${V0}1 +TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0; +# Wait for 5 seconds even after that up_count should show 1 +sleep 5 +EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" ec_child_up_count $V0 0 +TEST ! stat $M0 +EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0 + +# Mount already succeeded. Test that the brick up down are leading to correct +# state changes in ec. 
+TEST $CLI volume stop $V0 +TEST $CLI volume start $V0 force +TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0; +EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0 +TEST touch $M0/a + +# kill 1 brick and the up_count should become 2, fops should still succeed +TEST kill_brick $V0 $H0 $B0/${V0}1 +EXPECT_WITHIN $CHILD_UP_TIMEOUT "2" ec_child_up_count $V0 0 +TEST touch $M0/b + +# kill one more brick and the up_count should become 1, fops should fail +TEST kill_brick $V0 $H0 $B0/${V0}2 +EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" ec_child_up_count $V0 0 +TEST ! touch $M0/c + +# kill one more brick and the up_count should become 0, fops should still fail +TEST kill_brick $V0 $H0 $B0/${V0}0 +EXPECT_WITHIN $CHILD_UP_TIMEOUT "0" ec_child_up_count $V0 0 +TEST ! touch $M0/c + +# Bring up all the bricks up and see that up_count is 3 and fops are succeeding +# again. +TEST $CLI volume start $V0 force +EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0 +TEST touch $M0/c + +cleanup diff --git a/tests/include.rc b/tests/include.rc index 599ad1bd4c2..d4e0ed25556 100644 --- a/tests/include.rc +++ b/tests/include.rc @@ -41,6 +41,7 @@ UMOUNT_TIMEOUT=5  statedumpdir=`gluster --print-statedumpdir`; # Default directory for statedump  CLI="gluster --mode=script --wignore"; +GFS="glusterfs --attribute-timeout=0 --entry-timeout=0";  mkdir -p $B0;  mkdir -p $M0 $M1; diff --git a/xlators/cluster/ec/src/ec.c b/xlators/cluster/ec/src/ec.c index bd3fbc717e5..bb79f2cf02c 100644 --- a/xlators/cluster/ec/src/ec.c +++ b/xlators/cluster/ec/src/ec.c @@ -189,151 +189,179 @@ int32_t reconfigure(xlator_t * this, dict_t * options)      return -1;  } -void ec_up(xlator_t * this, ec_t * ec) +glusterfs_event_t +ec_get_event_from_state (ec_t *ec)  { -    if (ec->timer != NULL) -    { -        gf_timer_call_cancel(this->ctx, ec->timer); -        ec->timer = NULL; -    } +        int     down_count = 0; -    if (!ec->up && (ec->xl_up_count >= ec->fragments)) -    { -        if (ec->xl_up_count < 
ec->nodes) -        { -            gf_log("ec", GF_LOG_WARNING, "Starting volume with only %d bricks", -                   ec->xl_up_count); +        if (ec->xl_up_count >= ec->fragments) +                return GF_EVENT_CHILD_UP; + +        down_count = ec->xl_notify_count - ec->xl_up_count; +        if (down_count > ec->redundancy) +                return GF_EVENT_CHILD_DOWN; + +        return GF_EVENT_MAXVAL; +} + +void +ec_up (xlator_t *this, ec_t *ec) +{ +        if (ec->timer != NULL) { +                gf_timer_call_cancel (this->ctx, ec->timer); +                ec->timer = NULL;          }          ec->up = 1;          gf_log(this->name, GF_LOG_INFO, "Going UP"); - -        default_notify(this, GF_EVENT_CHILD_UP, NULL); -    }  } -void ec_down(xlator_t * this, ec_t * ec) +void +ec_down (xlator_t *this, ec_t *ec)  { -    if (ec->timer != NULL) -    { -        gf_timer_call_cancel(this->ctx, ec->timer); -        ec->timer = NULL; -    } +        if (ec->timer != NULL) { +                gf_timer_call_cancel(this->ctx, ec->timer); +                ec->timer = NULL; +        } -    if (ec->up) -    {          ec->up = 0;          gf_log(this->name, GF_LOG_INFO, "Going DOWN"); - -        default_notify(this, GF_EVENT_CHILD_DOWN, NULL); -    }  } -void ec_notify_up_cbk(void * data) +void +ec_notify_down (void *data)  { -    ec_t * ec = data; - -    LOCK(&ec->lock); - -    if (ec->timer != NULL) -    { -        ec_up(ec->xl, ec); -    } +        ec_t *ec = data; -    UNLOCK(&ec->lock); +        LOCK(&ec->lock); +        { +                if (!ec->timer) { +                        /* +                         * Either child_up/child_down is already sent to parent +                         * This is a spurious wake up. 
+                         */ +                        goto unlock; +                } + +                gf_timer_call_cancel (ec->xl->ctx, ec->timer); +                ec->timer = NULL; + +                if (GF_EVENT_MAXVAL == ec_get_event_from_state (ec)) { +                        /* Change the state as if the bricks are down */ +                        ec->xl_notify = (1ULL << ec->nodes) - 1ULL; +                        ec->xl_notify_count = ec->nodes; +                        default_notify (ec->xl, GF_EVENT_CHILD_DOWN, NULL); +                } +        } +unlock: +        UNLOCK(&ec->lock);  } -int32_t ec_notify_up(xlator_t * this, ec_t * ec, int32_t idx) +void +ec_launch_notify_child_down_timer (xlator_t *this, ec_t *ec)  { -    struct timespec delay = {0, }; - -    if (((ec->xl_up >> idx) & 1) == 0) -    { -        ec->xl_up |= 1ULL << idx; -        ec->xl_up_count++; - -        gf_log("ec", GF_LOG_DEBUG, "Child %d is UP (%lX, %u)", idx, ec->xl_up, -               ec->xl_up_count); +        struct timespec delay = {0, }; -        if (ec->xl_up_count == ec->fragments) -        { -            gf_log("ec", GF_LOG_DEBUG, "Initiating up timer"); - -            delay.tv_sec = 5; -            delay.tv_nsec = 0; -            ec->timer = gf_timer_call_after(this->ctx, delay, ec_notify_up_cbk, -                                            ec); -            if (ec->timer == NULL) -            { -                gf_log(this->name, GF_LOG_ERROR, "Cannot create timer for " -                                                 "delayed initialization"); - -                return ENOMEM; -            } +        gf_log (this->name, GF_LOG_DEBUG, "Initiating child-down timer"); +        delay.tv_sec = 10; +        delay.tv_nsec = 0; +        ec->timer = gf_timer_call_after (this->ctx, delay, ec_notify_down, ec); +        if (ec->timer == NULL) { +                gf_log(this->name, GF_LOG_ERROR, "Cannot create timer " +                       "for delayed initialization");        
  } -        else if (ec->xl_up_count == ec->nodes) -        { -            ec_up(this, ec); +} + +void +ec_handle_up (xlator_t *this, ec_t *ec, int32_t idx) +{ +        if (((ec->xl_notify >> idx) & 1) == 0) { +                ec->xl_notify |= 1ULL << idx; +                ec->xl_notify_count++;          } -    } -    return EAGAIN; +        if (((ec->xl_up >> idx) & 1) == 0) { /* Duplicate event */ +                ec->xl_up |= 1ULL << idx; +                ec->xl_up_count++; +        }  } -int32_t ec_notify_down(xlator_t * this, ec_t * ec, int32_t idx) +void +ec_handle_down (xlator_t *this, ec_t *ec, int32_t idx)  { -    if (((ec->xl_up >> idx) & 1) != 0) -    { -        gf_log("ec", GF_LOG_DEBUG, "Child %d is DOWN", idx); - -        ec->xl_up ^= 1ULL << idx; -        if (ec->xl_up_count-- == ec->fragments) -        { -            ec_down(this, ec); +        if (((ec->xl_notify >> idx) & 1) == 0) { +                ec->xl_notify |= 1ULL << idx; +                ec->xl_notify_count++;          } -    } -    return EAGAIN; +        if (((ec->xl_up >> idx) & 1) != 0) { /* Duplicate event */ +                gf_log(this->name, GF_LOG_DEBUG, "Child %d is DOWN", idx); + +                ec->xl_up ^= 1ULL << idx; +                ec->xl_up_count--; +        }  } -int32_t notify(xlator_t * this, int32_t event, void * data, ...) +int32_t +notify (xlator_t *this, int32_t event, void *data, ...)  {      ec_t * ec = this->private;      int32_t idx = 0;      int32_t error = 0; +    glusterfs_event_t old_event = GF_EVENT_MAXVAL; +    glusterfs_event_t new_event = GF_EVENT_MAXVAL; + +        LOCK (&ec->lock); + +        if (event == GF_EVENT_PARENT_UP) { +                /* +                 * Start a timer which sends CHILD_DOWN event to parent +                 * xlator to prevent the 'mount' syscall from hanging. 
+                 */ +                ec_launch_notify_child_down_timer (this, ec); +                goto unlock; +        } -    LOCK(&ec->lock); - -    for (idx = 0; idx < ec->nodes; idx++) -    { -        if (ec->xl_list[idx] == data) -        { -            break; +        for (idx = 0; idx < ec->nodes; idx++) { +                if (ec->xl_list[idx] == data) +                        break;          } -    } -    gf_log("ec", GF_LOG_TRACE, "NOTIFY(%d): %p, %d", event, data, idx); +        gf_log (this->name, GF_LOG_TRACE, "NOTIFY(%d): %p, %d", +                event, data, idx); -    if (idx < ec->nodes) -    { -        if (event == GF_EVENT_CHILD_UP) -        { -            error = ec_notify_up(this, ec, idx); -        } -        else if (event == GF_EVENT_CHILD_DOWN) -        { -            error = ec_notify_down(this, ec, idx); -        } -    } +        if (idx < ec->nodes) { /* CHILD_* events */ -    UNLOCK(&ec->lock); +                old_event = ec_get_event_from_state (ec); -    if (error == 0) -    { -        return default_notify(this, event, data); -    } +                if (event == GF_EVENT_CHILD_UP) { +                        ec_handle_up (this, ec, idx); +                } else if (event == GF_EVENT_CHILD_DOWN) { +                        ec_handle_down (this, ec, idx); +                } -    return 0; +                new_event = ec_get_event_from_state (ec); + +                if (new_event == GF_EVENT_CHILD_UP && !ec->up) { +                        ec_up (this, ec); +                } else if (new_event == GF_EVENT_CHILD_DOWN && ec->up) { +                        ec_down (this, ec); +                } + +                if ((new_event == old_event) && (new_event != GF_EVENT_MAXVAL)) +                        new_event = GF_EVENT_CHILD_MODIFIED; + +                event = GF_EVENT_MAXVAL;/* Take care of notifying inside lock */ +                if (new_event != GF_EVENT_MAXVAL) +                        error = default_notify (this, new_event, 
data); +        } +unlock: +        UNLOCK (&ec->lock); + +        if (event != GF_EVENT_MAXVAL) +                return default_notify (this, event, data); + +        return error;  }  int32_t init(xlator_t * this) diff --git a/xlators/cluster/ec/src/ec.h b/xlators/cluster/ec/src/ec.h index 2a042ae577f..6a976980074 100644 --- a/xlators/cluster/ec/src/ec.h +++ b/xlators/cluster/ec/src/ec.h @@ -36,6 +36,8 @@ struct _ec      uint32_t          idx;      uint32_t          xl_up_count;      uintptr_t         xl_up; +    uint32_t          xl_notify_count; +    uintptr_t         xl_notify;      uintptr_t         node_mask;      xlator_t **       xl_list;      gf_lock_t         lock;  | 
