diff options
author | Ashish Pandey <aspandey@redhat.com> | 2017-09-18 14:07:31 +0530 |
---|---|---|
committer | Jeff Darcy <jeff@pl.atyp.us> | 2017-11-10 22:15:37 +0000 |
commit | a87abbd42e8b02deabbdfe290b16ed0d2f2e4c45 (patch) | |
tree | ac36ba79b6e7a104cc49a9413a330d8499aee647 /xlators/cluster/ec/src/ec-common.c | |
parent | 83558c69736878d2554ba77af3a6e27574da9447 (diff) |
cluster/ec: Keep last written stripe in in-memory cache
Problem:
Consider an EC volume with configuration 4 + 2.
The stripe size for this would be 512 * 4 = 2048.
That means, 2048 bytes of user data stored in one
stripe. Let's say 2048 + 512 = 2560 bytes are
already written on this volume. The last 512 bytes
would be in the second stripe. Now, if there is a
sequential write at offset 2560 of size 1 byte, we have
to read the whole stripe, encode it with 1 Byte and
then again have to write it back. Next, write with
offset 2561 and size of 1 Byte will again
READ-MODIFY-WRITE the whole stripe. This is causing
bad performance because of lots of READ request
travelling over the network.
There are some tools and scenarios where this kind
of load occurs and users are not aware of it.
Example: fio and zip
Solution:
One possible solution to deal with this issue is to
keep the last stripe in memory. This way, we need not
read it again, saving a READ fop from going over the
network. Considering the above example, we have to
keep last 2048 bytes (maximum) in memory per file.
Change-Id: I3f95e6fc3ff81953646d374c445a40c6886b0b85
BUG: 1471753
Signed-off-by: Ashish Pandey <aspandey@redhat.com>
Diffstat (limited to 'xlators/cluster/ec/src/ec-common.c')
-rw-r--r-- | xlators/cluster/ec/src/ec-common.c | 115 |
1 files changed, 115 insertions, 0 deletions
diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c index ea5773c9879..29ab66f374c 100644 --- a/xlators/cluster/ec/src/ec-common.c +++ b/xlators/cluster/ec/src/ec-common.c @@ -1437,6 +1437,23 @@ ec_set_inode_size(ec_fop_data_t *fop, inode_t *inode, return found; } +static void +ec_release_stripe_cache (ec_inode_t *ctx) +{ + ec_stripe_list_t *stripe_cache = NULL; + ec_stripe_t *stripe = NULL; + + stripe_cache = &ctx->stripe_cache; + while (!list_empty (&stripe_cache->lru)) { + stripe = list_first_entry (&stripe_cache->lru, ec_stripe_t, + lru); + list_del (&stripe->lru); + GF_FREE (stripe); + } + stripe_cache->count = 0; + stripe_cache->max = 0; +} + void ec_clear_inode_info(ec_fop_data_t *fop, inode_t *inode) { ec_inode_t *ctx; @@ -1448,6 +1465,7 @@ void ec_clear_inode_info(ec_fop_data_t *fop, inode_t *inode) goto unlock; } + ec_release_stripe_cache (ctx); ctx->have_info = _gf_false; ctx->have_config = _gf_false; ctx->have_version = _gf_false; @@ -2465,6 +2483,102 @@ ec_use_eager_lock(ec_t *ec, ec_fop_data_t *fop) return ec->other_eager_lock; } +static void +ec_update_stripe(ec_t *ec, ec_stripe_list_t *stripe_cache, ec_stripe_t *stripe, + ec_fop_data_t *fop) +{ + off_t base; + + /* On write fops, we only update existing fragments if the write has + * succeeded. Otherwise, we remove them from the cache. */ + if ((fop->id == GF_FOP_WRITE) && (fop->answer != NULL) && + (fop->answer->op_ret >= 0)) { + base = stripe->frag_offset - fop->frag_range.first; + base *= ec->fragments; + + /* We check if the stripe offset falls inside the real region + * modified by the write fop (a write request is allowed, + * though uncommon, to write less bytes than requested). The + * current write fop implementation doesn't allow partial + * writes of fragments, so if there's no error, we are sure + * that a full stripe has been completely modified or not + * touched at all. 
The value of op_ret may not be a multiple + * of the stripe size because it depends on the requested + * size by the user, so we update the stripe if the write has + * modified at least one byte (meaning ec has written the full + * stripe). */ + if (base < fop->answer->op_ret) { + memcpy(stripe->data, fop->vector[0].iov_base + base, + ec->stripe_size); + list_move_tail(&stripe->lru, &stripe_cache->lru); + + GF_ATOMIC_INC(ec->stats.stripe_cache.updates); + } + } else { + stripe->frag_offset = -1; + list_move (&stripe->lru, &stripe_cache->lru); + + GF_ATOMIC_INC(ec->stats.stripe_cache.invals); + } +} + +static void +ec_update_cached_stripes (ec_fop_data_t *fop) +{ + uint64_t first; + uint64_t last; + ec_stripe_t *stripe = NULL; + ec_inode_t *ctx = NULL; + ec_stripe_list_t *stripe_cache = NULL; + inode_t *inode = NULL; + struct list_head *temp; + struct list_head sentinel; + + first = fop->frag_range.first; + /* 'last' represents the first stripe not touched by the operation */ + last = fop->frag_range.last; + + /* If there are no modified stripes, we don't need to do anything + * else. */ + if (last <= first) { + return; + } + + if (!fop->use_fd) { + inode = fop->loc[0].inode; + } else { + inode = fop->fd->inode; + } + + LOCK(&inode->lock); + + ctx = __ec_inode_get (inode, fop->xl); + if (ctx == NULL) { + goto out; + } + stripe_cache = &ctx->stripe_cache; + + /* Since we'll be moving elements of the list to the tail, we might + * end in an infinite loop. To avoid it, we insert a sentinel element + * into the list, so that it will be used to detect when we have + * traversed all existing elements once. 
*/ + list_add_tail(&sentinel, &stripe_cache->lru); + temp = stripe_cache->lru.next; + while (temp != &sentinel) { + stripe = list_entry(temp, ec_stripe_t, lru); + temp = temp->next; + if ((first <= stripe->frag_offset) && + (stripe->frag_offset < last)) { + ec_update_stripe (fop->xl->private, stripe_cache, + stripe, fop); + } + } + list_del(&sentinel); + +out: + UNLOCK(&inode->lock); +} + void ec_lock_reuse(ec_fop_data_t *fop) { ec_cbk_data_t *cbk; @@ -2491,6 +2605,7 @@ void ec_lock_reuse(ec_fop_data_t *fop) * the lock. */ release = _gf_true; } + ec_update_cached_stripes (fop); for (i = 0; i < fop->lock_count; i++) { ec_lock_next_owner(&fop->locks[i], cbk, release); |