summaryrefslogtreecommitdiffstats
path: root/xlators
diff options
context:
space:
mode:
authorXavier Hernandez <xhernandez@datalab.es>2016-06-13 12:42:47 +0200
committerXavier Hernandez <xhernandez@datalab.es>2016-07-17 23:29:05 -0700
commit74d2aaf51c7ff601e4394cad9f8e23092267af55 (patch)
tree900ec3b53c2e0d62e77aedeb4fb1b9735a42a16a /xlators
parent338aa4634fe0aa8723349763871b3933b14d54ff (diff)
cluster/ec: Fix race in timer cancellation
A race in timer cancellation for delayed unlock could cause a crash if the cancelling thread fails to cancel the timer because it has already been fired but not executed, and the callback is scheduled out of the CPU, delaying it until the thread has released important resources needed by the callback. This patch improves the handling of this case to make it robust. Backport of: > Change-Id: I5c8a8c6610c5136f71b938aa78b5878ba05238d4 > BUG: 1345855 > Signed-off-by: Xavier Hernandez <xhernandez@datalab.es> > Reviewed-on: http://review.gluster.org/14712 > Smoke: Gluster Build System <jenkins@build.gluster.com> > NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org> > CentOS-regression: Gluster Build System <jenkins@build.gluster.com> > Reviewed-by: Pranith Kumar Karampuri <pkarampu@redhat.com> Change-Id: I5c8a8c6610c5136f71b938aa78b5878ba05238d4 BUG: 1346156 Signed-off-by: Xavier Hernandez <xhernandez@datalab.es> Reviewed-on: http://review.gluster.org/14724 Smoke: Gluster Build System <jenkins@build.gluster.org> Reviewed-by: Pranith Kumar Karampuri <pkarampu@redhat.com> NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org> CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
Diffstat (limited to 'xlators')
-rw-r--r--xlators/cluster/ec/src/ec-common.c71
1 files changed, 56 insertions, 15 deletions
diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c
index d1cdda8..43e7fcf 100644
--- a/xlators/cluster/ec/src/ec-common.c
+++ b/xlators/cluster/ec/src/ec-common.c
@@ -1529,8 +1529,23 @@ ec_lock_assign_owner(ec_lock_link_t *link)
timer_link = lock->timer->data;
GF_ASSERT(timer_link != NULL);
- gf_timer_call_cancel(fop->xl->ctx, lock->timer);
- ec_trace("UNLOCK_CANCELLED", timer_link->fop, "lock=%p", lock);
+ if (gf_timer_call_cancel(fop->xl->ctx, lock->timer) < 0) {
+ /* It's too late to avoid the execution of the timer callback.
+ * Since we need to be sure that the callback has access to all
+ * needed resources, we cannot resume the execution of the timer
+ * fop now. This will be done in the callback.
+ */
+ timer_link = NULL;
+ } else {
+ /* The timer has been cancelled, so we need to release the owner
+ * reference that was held by the fop waiting for the timer. This
+ * can be the last reference, but we'll immediately increment it
+ * for the current fop, so no need to check it.
+ */
+ lock->refs_owners--;
+
+ ec_trace("UNLOCK_CANCELLED", timer_link->fop, "lock=%p", lock);
+ }
/* We have two options here:
*
@@ -1548,13 +1563,8 @@ ec_lock_assign_owner(ec_lock_link_t *link)
* safe until we release it. In this case we can safely clear
* lock->timer. This will cause that the timer callback does nothing
* once it acquires the mutex.
- *
- * In both cases we must release the owner reference assigned to the
- * fop that was handling the unlock because ec_unlock_now() won't be
- * called for that fop.
*/
lock->timer = NULL;
- lock->refs_owners--;
}
lock->exclusive |= (fop->flags & EC_FLAG_LOCK_SHARED) == 0;
@@ -1965,6 +1975,8 @@ ec_unlock_now(ec_lock_link_t *link)
ec_resume(link->fop, 0);
}
+void ec_unlock_timer_add(ec_lock_link_t *link);
+
void
ec_unlock_timer_del(ec_lock_link_t *link)
{
@@ -1972,17 +1984,21 @@ ec_unlock_timer_del(ec_lock_link_t *link)
inode_t *inode;
gf_boolean_t now = _gf_false;
+ /* If we are here, it means that the timer has expired before having
+ * been cancelled. This guarantees that 'link' is still valid because
+ * the fop that contains it must be pending (if timer cancellation in
+ * ec_lock_assign_owner() fails, the fop is left sleeping).
+ *
+ * At the same time, the fop still has a reference to the lock, so
+ * it must also be valid.
+ */
lock = link->lock;
- /* A race condition can happen if timer expires, calls this function
- * and the lock is released (lock->loc is wiped) but the fop is not
- * fully completed yet (it's still on the list of pending fops). In
- * this case, this function can also be called if ec_unlock_force() is
- * called. */
+ /* 'lock' must have a valid inode since it can only be destroyed
+ * when the lock itself is destroyed, but we have a reference to the
+ * lock to avoid this.
+ */
inode = lock->loc.inode;
- if (inode == NULL) {
- return;
- }
LOCK(&inode->lock);
@@ -2013,6 +2029,31 @@ ec_unlock_timer_del(ec_lock_link_t *link)
if (now) {
ec_unlock_now(link);
+ } else {
+ /* The timer has been cancelled just after firing it but before
+ * getting here. This means that another fop has used the lock
+ * and everything should be handled as if this callback were
+ * have not been executed. However we still have an owner
+ * reference.
+ *
+ * We need to release our reference. If this is not the last
+ * reference (the most common case because another fop has
+ * taken another ref) we only need to decrement the counter.
+ * Otherwise we have been delayed enough so that the other fop
+ * has had time to acquire the reference, do its operation and
+ * release it. At the time of releasing it, the fop did found
+ * that the ref counter was > 1 (our reference), so the delayed
+ * unlock timer wasn't started. We need to start it again if we
+ * are the last reference.
+ *
+ * ec_unlock_timer_add() handles both cases.
+ */
+ ec_unlock_timer_add(link);
+
+ /* We need to resume the fop that was waiting for the delayed
+ * unlock.
+ */
+ ec_resume(link->fop, 0);
}
}