From 61cfcf65f0d4ad70fc8a47395c583d4b5bf1efbe Mon Sep 17 00:00:00 2001 From: Xavier Hernandez Date: Thu, 14 May 2015 20:07:10 +0200 Subject: cluster/ec: Correctly cleanup delayed locks When a delayed lock is pending, a graph switch doesn't correctly terminate it. This means that the update of version and size xattrs is lost, causing EIO errors. This patch handles GF_EVENT_PARENT_DOWN event to correctly finish pending udpdates before completing the graph switch. Change-Id: I394f3b8d41df8d83cdd36636aeb62330f30a66d5 BUG: 1188145 Signed-off-by: Xavier Hernandez Reviewed-on: http://review.gluster.org/10787 Tested-by: NetBSD Build System Tested-by: Gluster Build System Reviewed-by: Pranith Kumar Karampuri --- xlators/cluster/ec/src/ec-common.c | 74 ++++++++++++++++++++++------- xlators/cluster/ec/src/ec-common.h | 1 + xlators/cluster/ec/src/ec-data.c | 13 ++--- xlators/cluster/ec/src/ec-data.h | 31 ++++++------ xlators/cluster/ec/src/ec-heal.c | 7 +-- xlators/cluster/ec/src/ec.c | 97 +++++++++++++++++++++++++++++--------- xlators/cluster/ec/src/ec.h | 5 ++ 7 files changed, 163 insertions(+), 65 deletions(-) (limited to 'xlators') diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c index afd46c095f3..9f312e0c37c 100644 --- a/xlators/cluster/ec/src/ec-common.c +++ b/xlators/cluster/ec/src/ec-common.c @@ -1531,31 +1531,46 @@ void ec_unlock_now(ec_fop_data_t *fop, ec_lock_t *lock) ec_resume(fop, 0); } -void ec_unlock_timer_cbk(void *data) +void +ec_unlock_timer_del(ec_fop_data_t *fop, ec_lock_t *lock) { - ec_lock_link_t *link = data; - ec_lock_t *lock = link->lock; - ec_fop_data_t *fop = NULL; + inode_t *inode; + gf_boolean_t now = _gf_false; + + /* A race condition can happen if timer expires, calls this function + * and the lock is released (lock->loc is wiped) but the fop is not + * fully completed yet (it's still on the list of pending fops). In + * this case, this function can also be called if ec_unlock_force() is + * called. */ + inode = lock->loc.inode; + if (inode == NULL) { + return; + } - LOCK(&lock->loc.inode->lock); + LOCK(&inode->lock); - if (lock->timer != NULL) { - fop = link->fop; + if (lock->timer != NULL) { + ec_trace("UNLOCK_DELAYED", fop, "lock=%p", lock); - ec_trace("UNLOCK_DELAYED", fop, "lock=%p", lock); + gf_timer_call_cancel(fop->xl->ctx, lock->timer); + lock->timer = NULL; + *lock->plock = NULL; - GF_ASSERT(lock->refs == 1); + now = _gf_true; + } - gf_timer_call_cancel(fop->xl->ctx, lock->timer); - lock->timer = NULL; - *lock->plock = NULL; - } + UNLOCK(&inode->lock); - UNLOCK(&lock->loc.inode->lock); + if (now) { + ec_unlock_now(fop, lock); + } +} - if (fop != NULL) { - ec_unlock_now(fop, lock); - } +void ec_unlock_timer_cbk(void *data) +{ + ec_lock_link_t *link = data; + + ec_unlock_timer_del(link->fop, link->lock); } void ec_unlock_timer_add(ec_lock_link_t *link) @@ -1626,6 +1641,18 @@ void ec_unlock(ec_fop_data_t *fop) } } +void +ec_unlock_force(ec_fop_data_t *fop) +{ + int32_t i; + + for (i = 0; i < fop->lock_count; i++) { + ec_trace("UNLOCK_FORCED", fop, "lock=%p", &fop->locks[i]); + + ec_unlock_timer_del(fop, fop->locks[i].lock); + } +} + void ec_flush_size_version(ec_fop_data_t * fop) { ec_lock_t * lock; @@ -1740,8 +1767,21 @@ void __ec_manager(ec_fop_data_t * fop, int32_t error) } if ((fop->state == EC_STATE_END) || (fop->state == -EC_STATE_END)) { + gf_boolean_t notify; + + LOCK(&ec->lock); + + list_del_init(&fop->pending_list); + notify = list_empty(&ec->pending_fops); + + UNLOCK(&ec->lock); + ec_fop_data_release(fop); + if (notify) { + ec_pending_fops_completed(ec); + } + break; } diff --git a/xlators/cluster/ec/src/ec-common.h b/xlators/cluster/ec/src/ec-common.h index 9e0aaa0f079..08993f03c8f 100644 --- a/xlators/cluster/ec/src/ec-common.h +++ b/xlators/cluster/ec/src/ec-common.h @@ -87,6 +87,7 @@ void ec_lock_prepare_fd(ec_fop_data_t *fop, fd_t *fd, int32_t update); void ec_lock(ec_fop_data_t * fop); void ec_lock_reuse(ec_fop_data_t *fop); void ec_unlock(ec_fop_data_t * fop); +void ec_unlock_force(ec_fop_data_t *fop); void ec_get_size_version(ec_fop_data_t * fop); void ec_prepare_update(ec_fop_data_t *fop); diff --git a/xlators/cluster/ec/src/ec-data.c b/xlators/cluster/ec/src/ec-data.c index fb47aea90a8..b747fc42348 100644 --- a/xlators/cluster/ec/src/ec-data.c +++ b/xlators/cluster/ec/src/ec-data.c @@ -165,17 +165,18 @@ ec_fop_data_t * ec_fop_data_allocate(call_frame_t * frame, xlator_t * this, parent = frame->local; if (parent != NULL) { - LOCK(&parent->lock); - - parent->jobs++; - parent->refs++; - - UNLOCK(&parent->lock); + ec_sleep(parent); } fop->parent = parent; } + LOCK(&ec->lock); + + list_add_tail(&fop->pending_list, &ec->pending_fops); + + UNLOCK(&ec->lock); + return fop; } diff --git a/xlators/cluster/ec/src/ec-data.h b/xlators/cluster/ec/src/ec-data.h index 9e5c92dd5b8..8a58ffb288b 100644 --- a/xlators/cluster/ec/src/ec-data.h +++ b/xlators/cluster/ec/src/ec-data.h @@ -172,13 +172,14 @@ struct _ec_fop_data int32_t winds; int32_t jobs; int32_t error; - ec_fop_data_t * parent; - xlator_t * xl; - call_frame_t * req_frame; // frame of the calling xlator - call_frame_t * frame; // frame used by this fop - struct list_head cbk_list; // sorted list of groups of answers - struct list_head answer_list; // list of answers - ec_cbk_data_t * answer; // accepted answer + ec_fop_data_t *parent; + xlator_t *xl; + call_frame_t *req_frame; /* frame of the calling xlator */ + call_frame_t *frame; /* frame used by this fop */ + struct list_head cbk_list; /* sorted list of groups of answers */ + struct list_head answer_list; /* list of answers */ + struct list_head pending_list; /* member of ec_t.pending_fops */ + ec_cbk_data_t *answer; /* accepted answer */ int32_t lock_count; int32_t locked; ec_lock_link_t locks[2]; @@ -203,7 +204,7 @@ struct _ec_fop_data ec_handler_f handler; ec_resume_f resume; ec_cbk_t cbks; - void * data; + void *data; ec_heal_t *heal; uint64_t user_size; @@ -211,8 +212,8 @@ struct _ec_fop_data int32_t use_fd; - dict_t * xdata; - dict_t * dict; + dict_t *xdata; + dict_t *dict; int32_t int32; uint32_t uint32; uint64_t size; @@ -222,14 +223,14 @@ struct _ec_fop_data entrylk_type entrylk_type; gf_xattrop_flags_t xattrop_flags; dev_t dev; - inode_t * inode; - fd_t * fd; + inode_t *inode; + fd_t *fd; struct iatt iatt; - char * str[2]; + char *str[2]; loc_t loc[2]; struct gf_flock flock; - struct iovec * vector; - struct iobref * buffers; + struct iovec *vector; + struct iobref *buffers; }; struct _ec_cbk_data diff --git a/xlators/cluster/ec/src/ec-heal.c b/xlators/cluster/ec/src/ec-heal.c index ceddfeb6ac7..4fe5754cbb8 100644 --- a/xlators/cluster/ec/src/ec-heal.c +++ b/xlators/cluster/ec/src/ec-heal.c @@ -532,12 +532,7 @@ ec_heal_init (ec_fop_data_t * fop) gf_log("ec", GF_LOG_INFO, "Healing '%s', gfid %s", heal->loc.path, uuid_utoa(heal->loc.gfid)); } else { - LOCK(&fop->lock); - - fop->jobs++; - fop->refs++; - - UNLOCK(&fop->lock); + ec_sleep(fop); } list_add_tail(&heal->list, &ctx->heal); diff --git a/xlators/cluster/ec/src/ec.c b/xlators/cluster/ec/src/ec.c index 3dd04299541..4028aa4d2bb 100644 --- a/xlators/cluster/ec/src/ec.c +++ b/xlators/cluster/ec/src/ec.c @@ -278,6 +278,7 @@ ec_notify_cbk (void *data) { ec_t *ec = data; glusterfs_event_t event = GF_EVENT_MAXVAL; + gf_boolean_t propagate = _gf_false; LOCK(&ec->lock); { @@ -309,10 +310,14 @@ ec_notify_cbk (void *data) /* CHILD_DOWN should not come here as no grace period is given * for notifying CHILD_DOWN. */ - default_notify (ec->xl, event, NULL); + propagate = _gf_true; } unlock: UNLOCK(&ec->lock); + + if (propagate) { + default_notify (ec->xl, event, NULL); + } } void @@ -360,6 +365,49 @@ ec_handle_down (xlator_t *this, ec_t *ec, int32_t idx) } } +gf_boolean_t +ec_force_unlocks(ec_t *ec) +{ + struct list_head list; + ec_fop_data_t *fop; + + if (list_empty(&ec->pending_fops)) { + return _gf_true; + } + + INIT_LIST_HEAD(&list); + + /* All pending fops when GF_EVENT_PARENT_DOWN is received should only + * be fops waiting for a delayed unlock. However the unlock can + * generate new fops. We don't want to trverse these new fops while + * forcing unlocks, so we move all fops to a temporal list. To process + * them without interferences.*/ + list_splice_init(&ec->pending_fops, &list); + + while (!list_empty(&list)) { + fop = list_entry(list.next, ec_fop_data_t, pending_list); + list_move_tail(&fop->pending_list, &ec->pending_fops); + + UNLOCK(&ec->lock); + + ec_unlock_force(fop); + + LOCK(&ec->lock); + } + + ec->shutdown = _gf_true; + + return list_empty(&ec->pending_fops); +} + +void +ec_pending_fops_completed(ec_t *ec) +{ + if (ec->shutdown) { + default_notify(ec->xl, GF_EVENT_PARENT_DOWN, NULL); + } +} + int32_t ec_notify (xlator_t *this, int32_t event, void *data, void *data2) { @@ -367,14 +415,16 @@ ec_notify (xlator_t *this, int32_t event, void *data, void *data2) int32_t idx = 0; int32_t error = 0; glusterfs_event_t old_event = GF_EVENT_MAXVAL; - glusterfs_event_t new_event = GF_EVENT_MAXVAL; dict_t *input = NULL; dict_t *output = NULL; + gf_boolean_t propagate = _gf_true; + + gf_log (this->name, GF_LOG_TRACE, "NOTIFY(%d): %p, %p", + event, data, data2); if (event == GF_EVENT_TRANSLATOR_OP) { if (!ec->up) { error = -1; - goto out; } else { input = data; output = data2; @@ -400,13 +450,14 @@ ec_notify (xlator_t *this, int32_t event, void *data, void *data2) */ ec_launch_notify_timer (this, ec); goto unlock; + } else if (event == GF_EVENT_PARENT_DOWN) { + /* If there aren't pending fops running after we have waken up + * them, we immediately propagate the notification. */ + propagate = ec_force_unlocks(ec); + goto unlock; } - gf_log (this->name, GF_LOG_TRACE, "NOTIFY(%d): %p, %d", - event, data, idx); - if (idx < ec->nodes) { /* CHILD_* events */ - old_event = ec_get_event_from_state (ec); if (event == GF_EVENT_CHILD_UP) { @@ -415,28 +466,30 @@ ec_notify (xlator_t *this, int32_t event, void *data, void *data2) ec_handle_down (this, ec, idx); } - new_event = ec_get_event_from_state (ec); + event = ec_get_event_from_state (ec); - if (new_event == GF_EVENT_CHILD_UP && !ec->up) { + if (event == GF_EVENT_CHILD_UP && !ec->up) { ec_up (this, ec); - } else if (new_event == GF_EVENT_CHILD_DOWN && ec->up) { + } else if (event == GF_EVENT_CHILD_DOWN && ec->up) { ec_down (this, ec); } - if ((new_event == old_event) && (new_event != GF_EVENT_MAXVAL)) - new_event = GF_EVENT_CHILD_MODIFIED; - - event = GF_EVENT_MAXVAL;/* Take care of notifying inside lock */ - if (new_event != GF_EVENT_MAXVAL) - error = default_notify (this, new_event, data); + if (event != GF_EVENT_MAXVAL) { + if (event == old_event) { + event = GF_EVENT_CHILD_MODIFIED; + } + } else { + propagate = _gf_false; + } } - unlock: - UNLOCK (&ec->lock); +unlock: + UNLOCK (&ec->lock); - if (event != GF_EVENT_MAXVAL) - return default_notify (this, event, data); + if (propagate) { + error = default_notify (this, event, data); + } out: - return error; + return error; } int32_t @@ -478,6 +531,8 @@ init (xlator_t *this) ec->xl = this; LOCK_INIT(&ec->lock); + INIT_LIST_HEAD(&ec->pending_fops); + ec->fop_pool = mem_pool_new(ec_fop_data_t, 1024); ec->cbk_pool = mem_pool_new(ec_cbk_data_t, 4096); ec->lock_pool = mem_pool_new(ec_lock_t, 1024); diff --git a/xlators/cluster/ec/src/ec.h b/xlators/cluster/ec/src/ec.h index b8f3e288197..fdedb89ec18 100644 --- a/xlators/cluster/ec/src/ec.h +++ b/xlators/cluster/ec/src/ec.h @@ -44,6 +44,8 @@ struct _ec xlator_t ** xl_list; gf_lock_t lock; gf_timer_t * timer; + gf_boolean_t shutdown; + struct list_head pending_fops; struct mem_pool * fop_pool; struct mem_pool * cbk_pool; struct mem_pool * lock_pool; @@ -51,4 +53,7 @@ struct _ec char vol_uuid[UUID_SIZE + 1]; dict_t *leaf_to_subvolid; }; + +void ec_pending_fops_completed(ec_t *ec); + #endif /* __EC_H__ */ -- cgit