cluster/ec: Fix race in timer cancellation

A race in timer cancellation for delayed unlock could cause a crash if the cancelling thread fails to cancel the timer because it has already been fired but not executed, and the callback is scheduled out of the CPU, delaying it until the thread has released important resources needed by the callback. This patch improves the handling of this case to make it robust. Change-Id: I5c8a8c6610c5136f71b938aa78b5878ba05238d4 BUG: 1345855 Signed-off-by: Xavier Hernandez <xhernandez@datalab.es> Reviewed-on: http://review.gluster.org/14712 Smoke: Gluster Build System <jenkins@build.gluster.com> NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org> CentOS-regression: Gluster Build System <jenkins@build.gluster.com> Reviewed-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
author: Xavier Hernandez <xhernandez@datalab.es> 2016-06-13 12:42:47 +0200
committer: Pranith Kumar Karampuri <pkarampu@redhat.com> 2016-06-13 18:03:18 -0700
commit: fb013a9db2cc019d36b07644f24e6c15ed39725c (patch)
tree: d8a6d3f462fd3601a21096971ed434730da9726c /xlators/cluster/ec
parent: e0b057cf28abc534f1fc74f4ea921c78a52ab6fe (diff)
1 files changed, 56 insertions, 15 deletions
diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c
index c32a28d2ed1..175f6dfa71f 100644
--- a/xlators/cluster/ec/src/ec-common.c
+++ b/xlators/cluster/ec/src/ec-common.c
@@ -1529,8 +1529,23 @@ ec_lock_assign_owner(ec_lock_link_t *link)
         timer_link = lock->timer->data;
         GF_ASSERT(timer_link != NULL);
 
-        gf_timer_call_cancel(fop->xl->ctx, lock->timer);
-        ec_trace("UNLOCK_CANCELLED", timer_link->fop, "lock=%p", lock);
+        if (gf_timer_call_cancel(fop->xl->ctx, lock->timer) < 0) {
+            /* It's too late to avoid the execution of the timer callback.
+             * Since we need to be sure that the callback has access to all
+             * needed resources, we cannot resume the execution of the timer
+             * fop now. This will be done in the callback.
+             */
+            timer_link = NULL;
+        } else {
+            /* The timer has been cancelled, so we need to release the owner
+             * reference that was held by the fop waiting for the timer. This
+             * can be the last reference, but we'll immediately increment it
+             * for the current fop, so no need to check it.
+             */
+            lock->refs_owners--;
+
+            ec_trace("UNLOCK_CANCELLED", timer_link->fop, "lock=%p", lock);
+        }
 
         /* We have two options here:
          *
@@ -1548,13 +1563,8 @@ ec_lock_assign_owner(ec_lock_link_t *link)
          *    safe until we release it. In this case we can safely clear
          *    lock->timer. This will cause that the timer callback does nothing
          *    once it acquires the mutex.
-         *
-         * In both cases we must release the owner reference assigned to the
-         * fop that was handling the unlock because ec_unlock_now() won't be
-         * called for that fop.
          */
         lock->timer = NULL;
-        lock->refs_owners--;
     }
 
     lock->exclusive |= (fop->flags & EC_FLAG_LOCK_SHARED) == 0;
@@ -1965,6 +1975,8 @@ ec_unlock_now(ec_lock_link_t *link)
     ec_resume(link->fop, 0);
 }
 
+void ec_unlock_timer_add(ec_lock_link_t *link);
+
 void
 ec_unlock_timer_del(ec_lock_link_t *link)
 {
@@ -1972,17 +1984,21 @@ ec_unlock_timer_del(ec_lock_link_t *link)
         inode_t *inode;
         gf_boolean_t now = _gf_false;
 
+        /* If we are here, it means that the timer has expired before having
+         * been cancelled. This guarantees that 'link' is still valid because
+         * the fop that contains it must be pending (if timer cancellation in
+         * ec_lock_assign_owner() fails, the fop is left sleeping).
+         *
+         * At the same time, the fop still has a reference to the lock, so
+         * it must also be valid.
+         */
         lock = link->lock;
 
-        /* A race condition can happen if timer expires, calls this function
-         * and the lock is released (lock->loc is wiped) but the fop is not
-         * fully completed yet (it's still on the list of pending fops). In
-         * this case, this function can also be called if ec_unlock_force() is
-         * called. */
+        /* 'lock' must have a valid inode since it can only be destroyed
+         * when the lock itself is destroyed, but we have a reference to the
+         * lock to avoid this.
+         */
         inode = lock->loc.inode;
-        if (inode == NULL) {
-                return;
-        }
 
         LOCK(&inode->lock);
 
@@ -2013,6 +2029,31 @@ ec_unlock_timer_del(ec_lock_link_t *link)
 
         if (now) {
                 ec_unlock_now(link);
+        } else {
+                /* The timer has been cancelled just after firing it but before
+                 * getting here. This means that another fop has used the lock
+                 * and everything should be handled as if this callback were
+                 * have not been executed. However we still have an owner
+                 * reference.
+                 *
+                 * We need to release our reference. If this is not the last
+                 * reference (the most common case because another fop has
+                 * taken another ref) we only need to decrement the counter.
+                 * Otherwise we have been delayed enough so that the other fop
+                 * has had time to acquire the reference, do its operation and
+                 * release it. At the time of releasing it, the fop did found
+                 * that the ref counter was > 1 (our reference), so the delayed
+                 * unlock timer wasn't started. We need to start it again if we
+                 * are the last reference.
+                 *
+                 * ec_unlock_timer_add() handles both cases.
+                 */
+                ec_unlock_timer_add(link);
+
+                /* We need to resume the fop that was waiting for the delayed
+                 * unlock.
+                 */
+                ec_resume(link->fop, 0);
         }
 }
author	Xavier Hernandez <xhernandez@datalab.es>	2016-06-13 12:42:47 +0200
committer	Pranith Kumar Karampuri <pkarampu@redhat.com>	2016-06-13 18:03:18 -0700
commit	fb013a9db2cc019d36b07644f24e6c15ed39725c (patch)
tree	d8a6d3f462fd3601a21096971ed434730da9726c /xlators/cluster/ec
parent	e0b057cf28abc534f1fc74f4ea921c78a52ab6fe (diff)