From 7ba7a4b27d124f4ee16fe4776a4670cd5b0160c4 Mon Sep 17 00:00:00 2001
From: Xavier Hernandez <xhernandez@datalab.es>
Date: Wed, 15 Jun 2016 14:42:19 +0200
Subject: locks: added inodelk/entrylk contention upcall notifications

The locks xlator now is able to send a contention notification to
the current owner of the lock.

This is only a notification that can be used to improve performance
of some client side operations that might benefit from extended
duration of lock ownership. Nothing is done if the lock owner decides
to ignore the message and to not release the lock. For forced
release of acquired resources, leases must be used.

Change-Id: I7f1ad32a0b4b445505b09908a050080ad848f8e0
Signed-off-by: Xavier Hernandez <xhernandez@datalab.es>
---
 xlators/cluster/ec/src/ec-common.c | 253 ++++++++++++++++++++++++-------------
 xlators/cluster/ec/src/ec-common.h |   1 +
 xlators/cluster/ec/src/ec-types.h  |   2 +
 xlators/cluster/ec/src/ec.c        |  77 ++++++++++-
 4 files changed, 239 insertions(+), 94 deletions(-)

(limited to 'xlators/cluster')

diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c
index cb627a92c9c..fbc7ac97aa0 100644
--- a/xlators/cluster/ec/src/ec-common.c
+++ b/xlators/cluster/ec/src/ec-common.c
@@ -1847,6 +1847,67 @@ gf_boolean_t ec_lock_acquire(ec_lock_link_t *link)
     return _gf_true;
 }
 
+static ec_lock_link_t *
+ec_lock_timer_cancel(xlator_t *xl, ec_lock_t *lock)
+{
+        ec_lock_link_t *timer_link;
+
+        /* If we don't have any timer, there's nothing to cancel. */
+        if (lock->timer == NULL) {
+                return NULL;
+        }
+
+        /* We are trying to access a lock that has an unlock timer active.
+         * This means that the lock must be idle, i.e. no fop can be in the
+         * owner, waiting or frozen lists. It also means that the lock cannot
+         * have been marked as being released (this is done without timers).
+         * There should only be one owner reference, but it's possible that
+         * some fops are being prepared to use this lock. */
+        GF_ASSERT ((lock->refs_owners == 1) &&
+                   list_empty(&lock->owners) && list_empty(&lock->waiting));
+
+        /* We take the timer_link before cancelling the timer, since a
+         * successful cancellation will destroy it. It must not be NULL
+         * because it references the fop responsible for the delayed unlock
+         * that we are currently trying to cancel. */
+        timer_link = lock->timer->data;
+        GF_ASSERT(timer_link != NULL);
+
+        if (gf_timer_call_cancel(xl->ctx, lock->timer) < 0) {
+                /* It's too late to avoid the execution of the timer callback.
+                 * Since we need to be sure that the callback has access to all
+                 * needed resources, we cannot resume the execution of the
+                 * timer fop now. This will be done in the callback. */
+                timer_link = NULL;
+        } else {
+                /* The timer has been cancelled. The fop referenced by
+                 * timer_link holds the last reference. The caller is
+                 * responsible to release it when not needed anymore. */
+                ec_trace("UNLOCK_CANCELLED", timer_link->fop, "lock=%p", lock);
+        }
+
+        /* We have two options here:
+         *
+         * 1. The timer has been successfully cancelled.
+         *
+         *    This is the easiest case and we can continue with the currently
+         *    acquired lock.
+         *
+         * 2. The timer callback has already been fired.
+         *
+         *    In this case we have not been able to cancel the timer before
+         *    the timer callback has been fired, but we also know that
+         *    lock->timer != NULL. This means that the timer callback is still
+         *    trying to acquire the inode mutex that we currently own. We are
+         *    safe until we release it. In this case we can safely clear
+         *    lock->timer. This will cause that the timer callback does nothing
+         *    once it acquires the mutex.
+         */
+        lock->timer = NULL;
+
+        return timer_link;
+}
+
 static gf_boolean_t
 ec_lock_assign_owner(ec_lock_link_t *link)
 {
@@ -1891,61 +1952,7 @@ ec_lock_assign_owner(ec_lock_link_t *link)
      * empty. */
     GF_ASSERT(list_empty(&lock->frozen));
 
-    if (lock->timer != NULL) {
-        /* We are trying to acquire a lock that has an unlock timer active.
-         * This means that the lock must be idle, i.e. no fop can be in the
-         * owner, waiting or frozen lists. It also means that the lock cannot
-         * have been marked as being released (this is done without timers).
-         * There should only be one owner reference, but it's possible that
-         * some fops are being prepared to use this lock.
-         */
-        GF_ASSERT ((lock->refs_owners == 1) &&
-                   list_empty(&lock->owners) && list_empty(&lock->waiting));
-
-        /* We take the timer_link before cancelling the timer, since a
-         * successful cancellation will destroy it. It must not be NULL
-         * because it references the fop responsible for the delayed unlock
-         * that we are currently trying to cancel. */
-        timer_link = lock->timer->data;
-        GF_ASSERT(timer_link != NULL);
-
-        if (gf_timer_call_cancel(fop->xl->ctx, lock->timer) < 0) {
-            /* It's too late to avoid the execution of the timer callback.
-             * Since we need to be sure that the callback has access to all
-             * needed resources, we cannot resume the execution of the timer
-             * fop now. This will be done in the callback.
-             */
-            timer_link = NULL;
-        } else {
-            /* The timer has been cancelled, so we need to release the owner
-             * reference that was held by the fop waiting for the timer. This
-             * can be the last reference, but we'll immediately increment it
-             * for the current fop, so no need to check it.
-             */
-            lock->refs_owners--;
-
-            ec_trace("UNLOCK_CANCELLED", timer_link->fop, "lock=%p", lock);
-        }
-
-        /* We have two options here:
-         *
-         * 1. The timer has been successfully cancelled.
-         *
-         *    This is the easiest case and we can continue with the currently
-         *    acquired lock.
-         *
-         * 2. The timer callback has already been fired.
-         *
-         *    In this case we have not been able to cancel the timer before
-         *    the timer callback has been fired, but we also know that
-         *    lock->timer != NULL. This means that the timer callback is still
-         *    trying to acquire the inode mutex that we currently own. We are
-         *    safe until we release it. In this case we can safely clear
-         *    lock->timer. This will cause that the timer callback does nothing
-         *    once it acquires the mutex.
-         */
-        lock->timer = NULL;
-    }
+    timer_link = ec_lock_timer_cancel(fop->xl, lock);
 
     if (!list_empty(&lock->owners)) {
         /* There are other owners of this lock. We can only take ownership if
@@ -1965,7 +1972,13 @@ ec_lock_assign_owner(ec_lock_link_t *link)
     }
 
     list_add_tail(&link->owner_list, &lock->owners);
-    lock->refs_owners++;
+
+    /* If timer_link is not NULL, it means that we have inherited the owner
+     * reference assigned to the timer fop. In this case we simply reuse it.
+     * Otherwise we need to increase the number of owners. */
+    if (timer_link == NULL) {
+            lock->refs_owners++;
+    }
 
     assigned = _gf_true;
 
@@ -2383,6 +2396,48 @@ ec_unlock_now(ec_lock_link_t *link)
     ec_resume(link->fop, 0);
 }
 
+void
+ec_lock_release(ec_t *ec, inode_t *inode)
+{
+        ec_lock_t *lock;
+        ec_inode_t *ctx;
+        ec_lock_link_t *timer_link = NULL;
+
+        LOCK(&inode->lock);
+
+        ctx = __ec_inode_get(inode, ec->xl);
+        if (ctx == NULL) {
+                goto done;
+        }
+        lock = ctx->inode_lock;
+        if ((lock == NULL) || !lock->acquired || lock->release) {
+                goto done;
+        }
+
+        gf_msg_debug(ec->xl->name, 0,
+                     "Releasing inode %p due to lock contention", inode);
+
+        /* The lock is not marked to be released, so the frozen list should be
+         * empty. */
+        GF_ASSERT(list_empty(&lock->frozen));
+
+        timer_link = ec_lock_timer_cancel(ec->xl, lock);
+
+        /* We mark the lock to be released as soon as possible. */
+        lock->release = _gf_true;
+
+done:
+        UNLOCK(&inode->lock);
+
+        /* If we have cancelled the timer, we need to start the unlock of the
+         * inode. If there was a timer but we have been unable to cancel it
+         * because it was just triggered, the timer callback will take care
+         * of releasing the inode. */
+        if (timer_link != NULL) {
+                ec_unlock_now(timer_link);
+        }
+}
+
 void ec_unlock_timer_add(ec_lock_link_t *link);
 
 void
@@ -2470,9 +2525,60 @@ void ec_unlock_timer_cbk(void *data)
         ec_unlock_timer_del(data);
 }
 
+static gf_boolean_t
+ec_eager_lock_used(ec_t *ec, ec_fop_data_t *fop)
+{
+        /* Fops with no locks at this point mean that they are sent as sub-fops
+         * of other higher level fops. In this case we simply assume that the
+         * parent fop will take correct care of the eager lock. */
+        if (fop->lock_count == 0) {
+                return _gf_true;
+        }
+
+        /* We may have more than one lock, but this only happens in the rename
+         * fop, and both locks will reference an inode of the same type (a
+         * directory in this case), so we only need to check the first lock. */
+        if (fop->locks[0].lock->loc.inode->ia_type == IA_IFREG) {
+                return ec->eager_lock;
+        }
+
+        return ec->other_eager_lock;
+}
+
+static uint32_t
+ec_eager_lock_timeout(ec_t *ec, ec_lock_t *lock)
+{
+        if (lock->loc.inode->ia_type == IA_IFREG) {
+                return ec->eager_lock_timeout;
+        }
+
+        return ec->other_eager_lock_timeout;
+}
+
+static gf_boolean_t
+ec_lock_delay_create(ec_lock_link_t *link)
+{
+        struct timespec delay;
+        ec_fop_data_t *fop = link->fop;
+        ec_lock_t *lock = link->lock;
+
+        delay.tv_sec = ec_eager_lock_timeout(fop->xl->private, lock);
+        delay.tv_nsec = 0;
+        lock->timer = gf_timer_call_after(fop->xl->ctx, delay,
+                                          ec_unlock_timer_cbk, link);
+        if (lock->timer == NULL) {
+                gf_msg(fop->xl->name, GF_LOG_WARNING, ENOMEM,
+                       EC_MSG_UNLOCK_DELAY_FAILED,
+                       "Unable to delay an unlock");
+
+                return _gf_false;
+        }
+
+        return _gf_true;
+}
+
 void ec_unlock_timer_add(ec_lock_link_t *link)
 {
-    struct timespec delay;
     ec_fop_data_t *fop = link->fop;
     ec_lock_t *lock = link->lock;
     gf_boolean_t now = _gf_false;
@@ -2526,19 +2632,12 @@ void ec_unlock_timer_add(ec_lock_link_t *link)
             ec_trace("UNLOCK_DELAY", fop, "lock=%p, release=%d", lock,
                      lock->release);
 
-            delay.tv_sec = 1;
-            delay.tv_nsec = 0;
-            lock->timer = gf_timer_call_after(fop->xl->ctx, delay,
-                                              ec_unlock_timer_cbk, link);
-            if (lock->timer == NULL) {
-                gf_msg(fop->xl->name, GF_LOG_WARNING, ENOMEM,
-                       EC_MSG_UNLOCK_DELAY_FAILED,
-                       "Unable to delay an unlock");
-
+            if (!ec_lock_delay_create(link)) {
                 /* We are unable to create a new timer. We immediately release
                  * the lock. */
                 lock->release = now = _gf_true;
             }
+
         } else {
             ec_trace("UNLOCK_FORCE", fop, "lock=%p, release=%d", lock,
                      lock->release);
@@ -2583,26 +2682,6 @@ void ec_flush_size_version(ec_fop_data_t * fop)
     ec_update_info(&fop->locks[0]);
 }
 
-static gf_boolean_t
-ec_use_eager_lock(ec_t *ec, ec_fop_data_t *fop)
-{
-        /* Fops with no locks at this point mean that they are sent as sub-fops
-         * of other higher level fops. In this case we simply assume that the
-         * parent fop will take correct care of the eager lock. */
-        if (fop->lock_count == 0) {
-                return _gf_true;
-        }
-
-        /* We may have more than one lock, but this only happens in the rename
-         * fop, and both locks will reference an inode of the same type (a
-         * directory in this case), so we only need to check the first lock. */
-        if (fop->locks[0].lock->loc.inode->ia_type == IA_IFREG) {
-                return ec->eager_lock;
-        }
-
-        return ec->other_eager_lock;
-}
-
 static void
 ec_update_stripe(ec_t *ec, ec_stripe_list_t *stripe_cache, ec_stripe_t *stripe,
                  ec_fop_data_t *fop)
@@ -2708,7 +2787,7 @@ void ec_lock_reuse(ec_fop_data_t *fop)
     ec = fop->xl->private;
     cbk = fop->answer;
 
-    if (ec_use_eager_lock(ec, fop) && cbk != NULL) {
+    if (ec_eager_lock_used(ec, fop) && cbk != NULL) {
         if (cbk->xdata != NULL) {
             if ((dict_get_int32(cbk->xdata, GLUSTERFS_INODELK_COUNT,
                                 &count) == 0) && (count > 1)) {
diff --git a/xlators/cluster/ec/src/ec-common.h b/xlators/cluster/ec/src/ec-common.h
index c3e291585ef..99e2f0653be 100644
--- a/xlators/cluster/ec/src/ec-common.h
+++ b/xlators/cluster/ec/src/ec-common.h
@@ -102,6 +102,7 @@ void ec_lock_prepare_fd(ec_fop_data_t *fop, fd_t *fd, uint32_t flags,
 void ec_lock(ec_fop_data_t * fop);
 void ec_lock_reuse(ec_fop_data_t *fop);
 void ec_unlock(ec_fop_data_t * fop);
+void ec_lock_release(ec_t *ec, inode_t *inode);
 
 gf_boolean_t ec_get_inode_size(ec_fop_data_t *fop, inode_t *inode,
                                uint64_t *size);
diff --git a/xlators/cluster/ec/src/ec-types.h b/xlators/cluster/ec/src/ec-types.h
index 23dc434bc42..15b4c77abfe 100644
--- a/xlators/cluster/ec/src/ec-types.h
+++ b/xlators/cluster/ec/src/ec-types.h
@@ -669,6 +669,8 @@ struct _ec {
     uint32_t           background_heals;
     uint32_t           heal_wait_qlen;
     uint32_t           self_heal_window_size; /* max size of read/writes */
+    uint32_t           eager_lock_timeout;
+    uint32_t           other_eager_lock_timeout;
     struct list_head   pending_fops;
     struct list_head   heal_waiting;
     struct list_head   healing;
diff --git a/xlators/cluster/ec/src/ec.c b/xlators/cluster/ec/src/ec.c
index 4c80f1283f1..30b0bdcb29c 100644
--- a/xlators/cluster/ec/src/ec.c
+++ b/xlators/cluster/ec/src/ec.c
@@ -271,6 +271,11 @@ reconfigure (xlator_t *this, dict_t *options)
                           bool, failed);
         GF_OPTION_RECONF ("other-eager-lock", ec->other_eager_lock, options,
                           bool, failed);
+        GF_OPTION_RECONF ("eager-lock-timeout", ec->eager_lock_timeout,
+                          options, uint32, failed);
+        GF_OPTION_RECONF ("other-eager-lock-timeout",
+                          ec->other_eager_lock_timeout, options, uint32,
+                          failed);
         GF_OPTION_RECONF ("background-heals", background_heals, options,
                           uint32, failed);
         GF_OPTION_RECONF ("heal-wait-qlength", heal_wait_qlen, options,
@@ -453,6 +458,43 @@ ec_set_up_state(ec_t *ec, uintptr_t index_mask, uintptr_t new_state)
         }
 }
 
+static gf_boolean_t
+ec_upcall(ec_t *ec, struct gf_upcall *upcall)
+{
+        struct gf_upcall_cache_invalidation *ci = NULL;
+        struct gf_upcall_inodelk_contention *lc = NULL;
+        inode_t *inode;
+
+        switch (upcall->event_type) {
+        case GF_UPCALL_CACHE_INVALIDATION:
+                ci = upcall->data;
+                ci->flags |= UP_INVAL_ATTR;
+                return _gf_true;
+
+        case GF_UPCALL_INODELK_CONTENTION:
+                lc = upcall->data;
+                if (strcmp(lc->domain, ec->xl->name) != 0) {
+                        /* The lock is not owned by EC, ignore it. */
+                        return _gf_true;
+                }
+                inode = inode_find(((xlator_t *)ec->xl->graph->top)->itable,
+                                   upcall->gfid);
+                /* If inode is not found, it means that it's already released,
+                 * so we can ignore it. Probably it has been released and
+                 * destroyed while the contention notification was being sent.
+                 */
+                if (inode != NULL) {
+                        ec_lock_release(ec, inode);
+                        inode_unref(inode);
+                }
+
+                return _gf_false;
+
+        default:
+                return _gf_true;
+        }
+}
+
 int32_t
 ec_notify (xlator_t *this, int32_t event, void *data, void *data2)
 {
@@ -464,19 +506,13 @@ ec_notify (xlator_t *this, int32_t event, void *data, void *data2)
         dict_t            *output   = NULL;
         gf_boolean_t      propagate = _gf_true;
         int32_t           orig_event = event;
-        struct gf_upcall *up_data   = NULL;
-        struct gf_upcall_cache_invalidation *up_ci = NULL;
         uintptr_t mask = 0;
 
         gf_msg_trace (this->name, 0, "NOTIFY(%d): %p, %p",
                 event, data, data2);
 
         if (event == GF_EVENT_UPCALL) {
-                up_data = (struct gf_upcall *)data;
-                if (up_data->event_type == GF_UPCALL_CACHE_INVALIDATION) {
-                        up_ci = (struct gf_upcall_cache_invalidation *)up_data->data;
-                        up_ci->flags |= UP_INVAL_ATTR;
-                }
+                propagate = ec_upcall(ec, data);
                 goto done;
         }
 
@@ -664,6 +700,10 @@ init (xlator_t *this)
     GF_OPTION_INIT ("iam-self-heal-daemon", ec->shd.iamshd, bool, failed);
     GF_OPTION_INIT ("eager-lock", ec->eager_lock, bool, failed);
     GF_OPTION_INIT ("other-eager-lock", ec->other_eager_lock, bool, failed);
+    GF_OPTION_INIT ("eager-lock-timeout", ec->eager_lock_timeout, uint32,
+                    failed);
+    GF_OPTION_INIT ("other-eager-lock-timeout", ec->other_eager_lock_timeout,
+                    uint32, failed);
     GF_OPTION_INIT ("background-heals", ec->background_heals, uint32, failed);
     GF_OPTION_INIT ("heal-wait-qlength", ec->heal_wait_qlen, uint32, failed);
     GF_OPTION_INIT ("self-heal-window-size", ec->self_heal_window_size, uint32,
@@ -1456,6 +1496,29 @@ struct volume_options options[] =
       .description = "It's equivalent to the eager-lock option but for non "
                      "regular files."
     },
+    { .key = {"eager-lock-timeout"},
+      .type = GF_OPTION_TYPE_INT,
+      .min = 1,
+      .max = 60,
+      .default_value = "1",
+      .op_version = { GD_OP_VERSION_4_0_0 },
+      .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC,
+      .tags = { "disperse", "locks", "timeout" },
+      .description = "Maximum time (in seconds) that a lock on an inode is "
+                     "kept held if no new operations on the inode are "
+                     "received."
+    },
+    { .key = {"other-eager-lock-timeout"},
+      .type = GF_OPTION_TYPE_INT,
+      .min = 1,
+      .max = 60,
+      .default_value = "1",
+      .op_version = { GD_OP_VERSION_4_0_0 },
+      .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC,
+      .tags = { "disperse", "locks", "timeout" },
+      .description = "It's equivalent ot eager-lock-timeout option but for "
+                     "non regular files."
+    },
     { .key = {"background-heals"},
       .type = GF_OPTION_TYPE_INT,
       .min = 0,/*Disabling background heals*/
-- 
cgit