author     Pranith Kumar K <pkarampu@redhat.com>            2017-06-25 16:34:01 +0530
committer  Pranith Kumar Karampuri <pkarampu@redhat.com>    2017-10-24 09:30:25 +0000
commit     a8e16c81e3d14b577ac738574e19a16543df419e (patch)
tree       963abf9f48d65fc469564d9f558aa6fe60bfe296    /xlators/cluster/ec/src/ec-common.c
parent     1260ee53b1674234e6f083563bdcd258e46a6faa (diff)
cluster/ec: Allow parallel writes in EC if possible
Problem: EC currently sends one modification fop after another, so if some
of the disks become slow for a while, the wait time for the writes queued
behind them becomes very long.

Fix: Allow parallel writes when possible. This needs three changes:
1) Each fop now carries the range parameters it will be updating.
2) Xattrop is changed to handle parallel xattrop requests, where some
   requests may be modifying just the dirty xattr.
3) Fops that refer to size now take locks and update the locks.

Fixes #251
Change-Id: Ibc3c15372f91bbd6fb617f0d99399b3149fa64b2
Signed-off-by: Pranith Kumar K <pkarampu@redhat.com>
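The heart of the patch is an inclusive byte-range intersection test: two
writes may run in parallel only if their [fl_start, fl_end] ranges do not
overlap. The following standalone sketch is hypothetical demo code, not part
of the patch; its helpers mirror the new ec_range_end_get() and
ec_is_range_conflict() from the diff below, with simplified types:

    /* Sketch of the patch's range-conflict test (simplified). */
    #include <limits.h>
    #include <stdbool.h>
    #include <stdio.h>
    #include <sys/types.h>

    typedef struct { off_t fl_start; off_t fl_end; } range_t;

    /* Inclusive end of a range; zero-size collapses to fl_start,
     * LLONG_MAX means "to infinity", and overflow saturates. */
    static off_t range_end_get(off_t fl_start, size_t fl_size)
    {
        off_t fl_end;
        switch (fl_size) {
        case 0:
            return fl_start;
        case LLONG_MAX: /* infinity */
            return LLONG_MAX;
        default:
            fl_end = fl_start + fl_size - 1;
            return (fl_end < 0) ? LLONG_MAX /* overflow */ : fl_end;
        }
    }

    /* Two inclusive ranges intersect iff each starts before the
     * other one ends. */
    static bool ranges_conflict(range_t *a, range_t *b)
    {
        return (a->fl_end >= b->fl_start) && (b->fl_end >= a->fl_start);
    }

    int main(void)
    {
        range_t w1 = { 0,    range_end_get(0, 4096)    }; /* [0, 4095]    */
        range_t w2 = { 4096, range_end_get(4096, 4096) }; /* [4096, 8191] */
        range_t w3 = { 100,  range_end_get(100, 200)   }; /* [100, 299]   */

        printf("w1 vs w2: %s\n",
               ranges_conflict(&w1, &w2) ? "conflict" : "parallel ok");
        printf("w1 vs w3: %s\n",
               ranges_conflict(&w1, &w3) ? "conflict" : "parallel ok");
        return 0;
    }

Note how overflow saturates to LLONG_MAX, which the patch treats as
"infinity"; this is what lets full-inode locks (taken with fl_size ==
LLONG_MAX, as in ec_lock_prepare_parent_inode() below) conflict with every
write to the same inode.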
Diffstat (limited to 'xlators/cluster/ec/src/ec-common.c')
-rw-r--r--  xlators/cluster/ec/src/ec-common.c | 191
1 file changed, 132 insertions(+), 59 deletions(-)
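One structural pattern worth noting before the diff: the size accessors are
split into a lock-free __-prefixed worker plus a locking wrapper
(__ec_get_inode_size()/ec_get_inode_size() and
__ec_set_inode_size()/ec_set_inode_size()), so code paths that already hold
inode->lock can call the worker directly. A minimal sketch of the pattern,
assuming simplified types and plain pthreads in place of Gluster's
LOCK()/UNLOCK() macros:

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    typedef struct {
        pthread_mutex_t lock;
        bool            have_size;
        uint64_t        size;
    } inode_ctx_t;

    /* Worker: caller must already hold ctx->lock. */
    static bool __ctx_get_size(inode_ctx_t *ctx, uint64_t *size)
    {
        if (!ctx->have_size)
            return false;
        *size = ctx->size;
        return true;
    }

    /* Wrapper: takes the lock around the worker, as ec_get_inode_size()
     * now does around __ec_get_inode_size(). */
    static bool ctx_get_size(inode_ctx_t *ctx, uint64_t *size)
    {
        bool found;

        pthread_mutex_lock(&ctx->lock);
        {
            found = __ctx_get_size(ctx, size);
        }
        pthread_mutex_unlock(&ctx->lock);

        return found;
    }

    int main(void)
    {
        inode_ctx_t ctx = { PTHREAD_MUTEX_INITIALIZER, true, 4096 };
        uint64_t size = 0;

        if (ctx_get_size(&ctx, &size))
            printf("size=%llu\n", (unsigned long long)size); /* 4096 */
        return 0;
    }

The braced block inside the lock/unlock pair matches the style the patch
itself uses in the new ec_get_inode_size() and ec_set_inode_size() wrappers.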
diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c
index 9b9cde8cbc5..f3463ba4489 100644
--- a/xlators/cluster/ec/src/ec-common.c
+++ b/xlators/cluster/ec/src/ec-common.c
@@ -26,6 +26,40 @@
EC_FLAG_WAITING_DATA_DIRTY |\
EC_FLAG_WAITING_METADATA_DIRTY)
+off_t
+ec_range_end_get (off_t fl_start, size_t fl_size)
+{
+ off_t fl_end = 0;
+ switch (fl_size) {
+ case 0:
+ return fl_start;
+ case LLONG_MAX: /*Infinity*/
+ return LLONG_MAX;
+ default:
+ fl_end = fl_start + fl_size - 1;
+ if (fl_end < 0) /*over-flow*/
+ return LLONG_MAX;
+ else
+ return fl_end;
+ }
+}
+
+static gf_boolean_t
+ec_is_range_conflict (ec_lock_link_t *l1, ec_lock_link_t *l2)
+{
+ return ((l1->fl_end >= l2->fl_start) && (l2->fl_end >= l1->fl_start));
+}
+
+static gf_boolean_t
+ec_lock_conflict (ec_lock_link_t *l1, ec_lock_link_t *l2)
+{
+ if ((l1->fop->flags & EC_FLAG_LOCK_SHARED) &&
+ (l2->fop->flags & EC_FLAG_LOCK_SHARED))
+ return _gf_false;
+
+ return ec_is_range_conflict (l1, l2);
+}
+
uint32_t
ec_select_first_by_read_policy (ec_t *ec, ec_fop_data_t *fop)
{
@@ -729,7 +763,7 @@ int32_t ec_lock_compare(ec_lock_t * lock1, ec_lock_t * lock2)
}
void ec_lock_insert(ec_fop_data_t *fop, ec_lock_t *lock, uint32_t flags,
- loc_t *base)
+ loc_t *base, off_t fl_start, size_t fl_size)
{
ec_lock_link_t *link;
@@ -763,12 +797,15 @@ void ec_lock_insert(ec_fop_data_t *fop, ec_lock_t *lock, uint32_t flags,
link->update[EC_DATA_TXN] = (flags & EC_UPDATE_DATA) != 0;
link->update[EC_METADATA_TXN] = (flags & EC_UPDATE_META) != 0;
link->base = base;
+ link->fl_start = fl_start;
+ link->fl_end = ec_range_end_get (fl_start, fl_size);
lock->refs_pending++;
}
void ec_lock_prepare_inode_internal(ec_fop_data_t *fop, loc_t *loc,
- uint32_t flags, loc_t *base)
+ uint32_t flags, loc_t *base,
+ off_t fl_start, size_t fl_size)
{
ec_lock_t *lock = NULL;
ec_inode_t *ctx;
@@ -829,16 +866,17 @@ void ec_lock_prepare_inode_internal(ec_fop_data_t *fop, loc_t *loc,
ctx->inode_lock = lock;
insert:
- ec_lock_insert(fop, lock, flags, base);
+ ec_lock_insert(fop, lock, flags, base, fl_start, fl_size);
update_query:
lock->query |= (flags & EC_QUERY_INFO) != 0;
unlock:
UNLOCK(&loc->inode->lock);
}
-void ec_lock_prepare_inode(ec_fop_data_t *fop, loc_t *loc, uint32_t flags)
+void ec_lock_prepare_inode(ec_fop_data_t *fop, loc_t *loc, uint32_t flags,
+ off_t fl_start, size_t fl_size)
{
- ec_lock_prepare_inode_internal(fop, loc, flags, NULL);
+ ec_lock_prepare_inode_internal(fop, loc, flags, NULL, fl_start, fl_size);
}
void ec_lock_prepare_parent_inode(ec_fop_data_t *fop, loc_t *loc, loc_t *base,
@@ -864,12 +902,13 @@ void ec_lock_prepare_parent_inode(ec_fop_data_t *fop, loc_t *loc, loc_t *base,
base = NULL;
}
- ec_lock_prepare_inode_internal(fop, &tmp, flags, base);
+ ec_lock_prepare_inode_internal(fop, &tmp, flags, base, 0, LLONG_MAX);
loc_wipe(&tmp);
}
-void ec_lock_prepare_fd(ec_fop_data_t *fop, fd_t *fd, uint32_t flags)
+void ec_lock_prepare_fd(ec_fop_data_t *fop, fd_t *fd, uint32_t flags,
+ off_t fl_start, size_t fl_size)
{
loc_t loc;
int32_t err;
@@ -885,7 +924,7 @@ void ec_lock_prepare_fd(ec_fop_data_t *fop, fd_t *fd, uint32_t flags)
return;
}
- ec_lock_prepare_inode_internal(fop, &loc, flags, NULL);
+ ec_lock_prepare_inode_internal(fop, &loc, flags, NULL, fl_start, fl_size);
loc_wipe(&loc);
}
@@ -1319,17 +1358,16 @@ out:
}
}
-gf_boolean_t ec_get_inode_size(ec_fop_data_t *fop, inode_t *inode,
- uint64_t *size)
+gf_boolean_t
+__ec_get_inode_size(ec_fop_data_t *fop, inode_t *inode,
+ uint64_t *size)
{
ec_inode_t *ctx;
gf_boolean_t found = _gf_false;
- LOCK(&inode->lock);
-
ctx = __ec_inode_get(inode, fop->xl);
if (ctx == NULL) {
- goto unlock;
+ goto out;
}
if (ctx->have_size) {
@@ -1337,23 +1375,35 @@ gf_boolean_t ec_get_inode_size(ec_fop_data_t *fop, inode_t *inode,
found = _gf_true;
}
-unlock:
+out:
+ return found;
+}
+
+gf_boolean_t
+ec_get_inode_size(ec_fop_data_t *fop, inode_t *inode,
+ uint64_t *size)
+{
+ gf_boolean_t found = _gf_false;
+
+ LOCK(&inode->lock);
+ {
+ found = __ec_get_inode_size (fop, inode, size);
+ }
UNLOCK(&inode->lock);
return found;
}
-gf_boolean_t ec_set_inode_size(ec_fop_data_t *fop, inode_t *inode,
- uint64_t size)
+gf_boolean_t
+__ec_set_inode_size(ec_fop_data_t *fop, inode_t *inode,
+ uint64_t size)
{
ec_inode_t *ctx;
gf_boolean_t found = _gf_false;
- LOCK(&inode->lock);
-
ctx = __ec_inode_get(inode, fop->xl);
if (ctx == NULL) {
- goto unlock;
+ goto out;
}
/* Normal fops always have ctx->have_size set. However self-heal calls this
@@ -1368,8 +1418,21 @@ gf_boolean_t ec_set_inode_size(ec_fop_data_t *fop, inode_t *inode,
found = _gf_true;
-unlock:
- UNLOCK(&inode->lock);
+out:
+ return found;
+}
+
+gf_boolean_t
+ec_set_inode_size(ec_fop_data_t *fop, inode_t *inode,
+ uint64_t size)
+{
+ gf_boolean_t found = _gf_false;
+
+ LOCK (&inode->lock);
+ {
+ found = __ec_set_inode_size (fop, inode, size);
+ }
+ UNLOCK (&inode->lock);
return found;
}
@@ -1476,34 +1539,47 @@ ec_lock_update_fd(ec_lock_t *lock, ec_fop_data_t *fop)
}
}
+static gf_boolean_t
+ec_link_has_lock_conflict (ec_lock_link_t *link, struct list_head *owners)
+{
+ ec_lock_link_t *owner_link = NULL;
+ ec_t *ec = link->fop->xl->private;
+
+ if (!ec->parallel_writes)
+ return _gf_true;
+
+ list_for_each_entry (owner_link, owners, owner_list) {
+ if (ec_lock_conflict (owner_link, link))
+ return _gf_true;
+ }
+ return _gf_false;
+}
+
static void
ec_lock_wake_shared(ec_lock_t *lock, struct list_head *list)
{
ec_fop_data_t *fop;
ec_lock_link_t *link;
- gf_boolean_t exclusive = _gf_false;
+ gf_boolean_t conflict = _gf_false;
- while (!exclusive && !list_empty(&lock->waiting)) {
+ while (!conflict && !list_empty(&lock->waiting)) {
link = list_entry(lock->waiting.next, ec_lock_link_t, wait_list);
fop = link->fop;
/* If lock is not acquired, at most one fop can be assigned as owner.
* The following fops will need to wait in the lock->waiting queue
* until the lock has been fully acquired. */
- exclusive = !lock->acquired;
+ conflict = !lock->acquired;
/* If the fop is not shareable, only this fop can be assigned as owner.
* Other fops will need to wait until this one finishes. */
- if ((fop->flags & EC_FLAG_LOCK_SHARED) == 0) {
- exclusive = _gf_true;
-
- /* Avoid other requests to be assigned as owners. */
- lock->exclusive = 1;
+ if (ec_link_has_lock_conflict (link, &lock->owners)) {
+ conflict = _gf_true;
}
/* If only one fop is allowed, it can be assigned as the owner of the
* lock only if there weren't any other owner. */
- if (exclusive && !list_empty(&lock->owners)) {
+ if (conflict && !list_empty(&lock->owners)) {
break;
}
@@ -1570,9 +1646,7 @@ void ec_lock_acquired(ec_lock_link_t *link)
lock->acquired = _gf_true;
ec_lock_update_fd(lock, fop);
- if ((fop->flags & EC_FLAG_LOCK_SHARED) != 0) {
- ec_lock_wake_shared(lock, &list);
- }
+ ec_lock_wake_shared(lock, &list);
UNLOCK(&lock->loc.inode->lock);
@@ -1683,11 +1757,11 @@ ec_lock_assign_owner(ec_lock_link_t *link)
/* We are trying to acquire a lock that has an unlock timer active.
* This means that the lock must be idle, i.e. no fop can be in the
* owner, waiting or frozen lists. It also means that the lock cannot
- * have been marked as being released (this is done without timers)
- * and it must not be exclusive. There should only be one owner
- * reference, but it's possible that some fops are being prepared to
- * use this lock. */
- GF_ASSERT ((lock->exclusive == 0) && (lock->refs_owners == 1) &&
+ * have been marked as being released (this is done without timers).
+ * There should only be one owner reference, but it's possible that
+ * some fops are being prepared to use this lock.
+ */
+ GF_ASSERT ((lock->refs_owners == 1) &&
list_empty(&lock->owners) && list_empty(&lock->waiting));
/* We take the timer_link before cancelling the timer, since a
@@ -1735,13 +1809,15 @@ ec_lock_assign_owner(ec_lock_link_t *link)
lock->timer = NULL;
}
- lock->exclusive |= (fop->flags & EC_FLAG_LOCK_SHARED) == 0;
-
if (!list_empty(&lock->owners)) {
/* There are other owners of this lock. We can only take ownership if
- * the lock is already acquired and can be shared. Otherwise we need
- * to wait. */
- if (!lock->acquired || (lock->exclusive != 0)) {
+ * the lock is already acquired and doesn't have conflict with existing
+ * owners, or waiters(to prevent starvation).
+ * Otherwise we need to wait.
+ */
+ if (!lock->acquired ||
+ ec_link_has_lock_conflict (link, &lock->owners) ||
+ ec_link_has_lock_conflict (link, &lock->waiting)) {
ec_trace("LOCK_QUEUE_WAIT", fop, "lock=%p", lock);
list_add_tail(&link->wait_list, &lock->waiting);
@@ -1819,10 +1895,7 @@ ec_lock_next_owner(ec_lock_link_t *link, ec_cbk_data_t *cbk,
}
ec_lock_update_good(lock, fop);
- lock->exclusive -= (fop->flags & EC_FLAG_LOCK_SHARED) == 0;
- if (list_empty(&lock->owners)) {
- ec_lock_wake_shared(lock, &list);
- }
+ ec_lock_wake_shared(lock, &list);
UNLOCK(&lock->loc.inode->lock);
@@ -1876,11 +1949,11 @@ ec_lock_unfreeze(ec_lock_link_t *link)
lock->acquired = _gf_false;
/* We are unfreezing a lock. This means that the lock has already been
- * released. In this state it shouldn't be exclusive nor have a pending
- * timer nor have any owner, and the waiting list should be empty. Only
- * the frozen list can contain some fop. */
- GF_ASSERT((lock->exclusive == 0) && (lock->timer == NULL) &&
- list_empty(&lock->waiting) && list_empty(&lock->owners));
+ * released. In this state it shouldn't have a pending timer nor have any
+ * owner, and the waiting list should be empty. Only the frozen list can
+ * contain some fop. */
+ GF_ASSERT((lock->timer == NULL) && list_empty(&lock->waiting) &&
+ list_empty(&lock->owners));
/* We move all frozen fops to the waiting list. */
list_splice_init(&lock->frozen, &lock->waiting);
@@ -2013,7 +2086,7 @@ ec_update_size_version(ec_lock_link_t *link, uint64_t *version,
ec_fop_data_t *fop;
ec_lock_t *lock;
ec_inode_t *ctx;
- dict_t * dict;
+ dict_t *dict = NULL;
uintptr_t update_on = 0;
int32_t err = -ENOMEM;
@@ -2203,12 +2276,12 @@ ec_unlock_timer_del(ec_lock_link_t *link)
ec_trace("UNLOCK_DELAYED", link->fop, "lock=%p", lock);
/* The unlock timer has expired without anyone cancelling it.
- * This means that it shouldn't have any owner, and the
- * waiting and frozen lists should be empty. It shouldn't have
- * been marked as release nor be exclusive either. It must have
- * only one owner reference, but there can be fops being
- * prepared though. */
- GF_ASSERT(!lock->release && (lock->exclusive == 0) &&
+ * This means that it shouldn't have any owner, and the waiting
+ * and frozen lists should be empty. It must have only one
+ * owner reference, but there can be fops being prepared
+ * though.
+ * */
+ GF_ASSERT(!lock->release &&
(lock->refs_owners == 1) &&
list_empty(&lock->owners) &&
list_empty(&lock->waiting) &&