path: root/xlators/cluster/ec/src/ec.c
author     Ashish Pandey <aspandey@redhat.com>    2017-09-18 14:07:31 +0530
committer  Jeff Darcy <jeff@pl.atyp.us>           2017-11-10 22:15:37 +0000
commit     a87abbd42e8b02deabbdfe290b16ed0d2f2e4c45 (patch)
tree       ac36ba79b6e7a104cc49a9413a330d8499aee647 /xlators/cluster/ec/src/ec.c
parent     83558c69736878d2554ba77af3a6e27574da9447 (diff)
cluster/ec: Keep last written stripe in in-memory cache
Problem: Consider an EC volume with a 4 + 2 configuration. The stripe size would be 512 * 4 = 2048, i.e. 2048 bytes of user data are stored in one stripe. Suppose 2048 + 512 = 2560 bytes have already been written to this volume, so 512 bytes sit in the second stripe. Now, if a sequential write arrives at offset 2560 with a size of 1 byte, we have to read the whole stripe, encode it with that 1 byte, and then write it back. The next write, at offset 2561 and again 1 byte in size, will READ-MODIFY-WRITE the whole stripe once more. This causes bad performance because of the many READ requests travelling over the network. Some tools and workloads generate exactly this kind of load without users being aware of it, for example fio and zip.

Solution: One possible solution is to keep the last stripe in memory. That way we do not need to read it again, and we save the READ fop that would otherwise go over the network. For the example above, we have to keep at most the last 2048 bytes in memory per file.

Change-Id: I3f95e6fc3ff81953646d374c445a40c6886b0b85
BUG: 1471753
Signed-off-by: Ashish Pandey <aspandey@redhat.com>
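To make the arithmetic in the problem statement concrete, here is a small, self-contained C illustration (not code from this patch; all names are made up for the example) of how a 1-byte write at offset 2560 lands in the second 2048-byte stripe and why, without a cache, the whole stripe must be read back first:

/* Illustration only, not part of the patch: the stripe arithmetic from the
 * commit message, for a 4 + 2 volume with 512-byte fragments. */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int
main (void)
{
        uint64_t frag_size   = 512;                     /* bytes per data brick    */
        uint64_t data_bricks = 4;                       /* the "4" in 4 + 2        */
        uint64_t stripe_size = frag_size * data_bricks; /* 512 * 4 = 2048          */

        uint64_t offset = 2560;                         /* 1-byte sequential write */

        uint64_t stripe_idx   = offset / stripe_size;          /* stripe 1           */
        uint64_t stripe_start = stripe_idx * stripe_size;      /* 2048               */
        uint64_t in_stripe    = offset - stripe_start;         /* byte 512 of stripe */

        /* Without a cached copy of this stripe, the 1-byte write forces a READ of
         * all 2048 bytes before re-encoding; with the last written stripe kept in
         * memory, only the WRITE goes over the network. */
        printf ("write at %" PRIu64 " -> stripe %" PRIu64 ", offset %" PRIu64 " within it\n",
                offset, stripe_idx, in_stripe);
        return 0;
}

With the last written stripe cached, the write at offset 2561 and every following sequential write skip that READ entirely.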
Diffstat (limited to 'xlators/cluster/ec/src/ec.c')
-rw-r--r--  xlators/cluster/ec/src/ec.c  52
1 file changed, 52 insertions, 0 deletions
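Before the diff itself, a note on the data structure the new code leans on: ec_gf_forget() below asserts that the per-inode stripe_cache.lru list is empty, and ec_statistics_init() seeds a set of cache counters, which implies a per-inode LRU of recently written stripes. The actual definitions live elsewhere in the translator (ec-types.h); the sketch below is only an assumed approximation with illustrative names:

/* Assumed sketch only; type and field names are illustrative, not the actual
 * GlusterFS definitions. */
#include <stdint.h>
#include <stdio.h>

struct sketch_list {                     /* stand-in for GlusterFS's list_head      */
        struct sketch_list *next;
        struct sketch_list *prev;
};

typedef struct {
        struct sketch_list lru;          /* link into the per-inode LRU             */
        uint64_t           offset;       /* stripe-aligned offset being cached      */
        char              *data;         /* last written stripe, stripe_size bytes  */
} sketch_stripe_t;

typedef struct {
        struct sketch_list lru;          /* cached stripes, most recent first       */
        uint32_t           count;        /* stripes currently held                  */
        uint32_t           max;          /* the new "stripe-cache" option value     */
} sketch_stripe_cache_t;

int
main (void)
{
        /* Nothing to run; just show the shapes compile. */
        printf ("cache header: %zu bytes, cached stripe: %zu bytes\n",
                sizeof (sketch_stripe_cache_t), sizeof (sketch_stripe_t));
        return 0;
}

The "stripe-cache" option added at the bottom of this patch would bound such a cache, with 0 disabling it.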
diff --git a/xlators/cluster/ec/src/ec.c b/xlators/cluster/ec/src/ec.c
index 9f361a54aa3..275dd15a302 100644
--- a/xlators/cluster/ec/src/ec.c
+++ b/xlators/cluster/ec/src/ec.c
@@ -299,6 +299,8 @@ reconfigure (xlator_t *this, dict_t *options)
options, bool, failed);
GF_OPTION_RECONF ("parallel-writes", ec->parallel_writes,
options, bool, failed);
+ GF_OPTION_RECONF ("stripe-cache", ec->stripe_cache, options, uint32,
+ failed);
ret = 0;
if (ec_assign_read_policy (ec, read_policy)) {
ret = -1;
@@ -581,6 +583,18 @@ notify (xlator_t *this, int32_t event, void *data, ...)
return ret;
}
+static void
+ec_statistics_init(ec_t *ec)
+{
+ GF_ATOMIC_INIT(ec->stats.stripe_cache.hits, 0);
+ GF_ATOMIC_INIT(ec->stats.stripe_cache.misses, 0);
+ GF_ATOMIC_INIT(ec->stats.stripe_cache.updates, 0);
+ GF_ATOMIC_INIT(ec->stats.stripe_cache.invals, 0);
+ GF_ATOMIC_INIT(ec->stats.stripe_cache.evicts, 0);
+ GF_ATOMIC_INIT(ec->stats.stripe_cache.allocs, 0);
+ GF_ATOMIC_INIT(ec->stats.stripe_cache.errors, 0);
+}
+
int32_t
init (xlator_t *this)
{
@@ -671,6 +685,7 @@ init (xlator_t *this)
GF_OPTION_INIT ("shd-wait-qlength", ec->shd.wait_qlength, uint32, failed);
GF_OPTION_INIT ("optimistic-change-log", ec->optimistic_changelog, bool, failed);
GF_OPTION_INIT ("parallel-writes", ec->parallel_writes, bool, failed);
+ GF_OPTION_INIT ("stripe-cache", ec->stripe_cache, uint32, failed);
this->itable = inode_table_new (EC_SHD_INODE_LRU_LIMIT, this);
if (!this->itable)
@@ -697,6 +712,8 @@ init (xlator_t *this)
goto failed;
}
+ ec_statistics_init(ec);
+
return 0;
failed:
@@ -1252,6 +1269,9 @@ int32_t ec_gf_forget(xlator_t * this, inode_t * inode)
if ((inode_ctx_del(inode, this, &value) == 0) && (value != 0))
{
ctx = (ec_inode_t *)(uintptr_t)value;
+ /* We can only forget an inode if it has been unlocked, so the stripe
+ * cache should also be empty. */
+ GF_ASSERT(list_empty(&ctx->stripe_cache.lru));
GF_FREE(ctx);
}
@@ -1313,6 +1333,25 @@ int32_t ec_dump_private(xlator_t *this)
gf_proc_dump_write("heal-waiters", "%d", ec->heal_waiters);
gf_proc_dump_write("read-policy", "%s", ec_read_policies[ec->read_policy]);
+ snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.%s.stats.stripe_cache",
+ this->type, this->name);
+ gf_proc_dump_add_section(key_prefix);
+
+ gf_proc_dump_write("hits", "%llu",
+ GF_ATOMIC_GET(ec->stats.stripe_cache.hits));
+ gf_proc_dump_write("misses", "%llu",
+ GF_ATOMIC_GET(ec->stats.stripe_cache.misses));
+ gf_proc_dump_write("updates", "%llu",
+ GF_ATOMIC_GET(ec->stats.stripe_cache.updates));
+ gf_proc_dump_write("invalidations", "%llu",
+ GF_ATOMIC_GET(ec->stats.stripe_cache.invals));
+ gf_proc_dump_write("evicts", "%llu",
+ GF_ATOMIC_GET(ec->stats.stripe_cache.evicts));
+ gf_proc_dump_write("allocations", "%llu",
+ GF_ATOMIC_GET(ec->stats.stripe_cache.allocs));
+ gf_proc_dump_write("errors", "%llu",
+ GF_ATOMIC_GET(ec->stats.stripe_cache.errors));
+
return 0;
}
@@ -1512,5 +1551,18 @@ struct volume_options options[] =
.description = "This controls if writes can be wound in parallel as long"
"as it doesn't modify same stripes"
},
+ { .key = {"stripe-cache"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 0, /* 0 disables the stripe cache */
+ .max = EC_STRIPE_CACHE_MAX_SIZE,
+ .default_value = "0",
+ .description = "This option will keep the last stripe of a write fop "
+ "in memory. If the next write falls in this stripe, we do "
+ "not need to read it again from the backend and can save a "
+ "READ fop going over the network. This will improve "
+ "performance, especially for sequential writes. However, it "
+ "will also lead to extra memory consumption, at most "
+ "(cache size * stripe size) bytes per open file."
+ },
{ .key = {NULL} }
};
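As a usage note on the statistics wired up above: the counters initialised in ec_statistics_init() and printed by ec_dump_private() surface in a statedump under the stats.stripe_cache section. The self-contained sketch below uses plain uint64_t counters as stand-ins for GF_ATOMIC_T and a hypothetical lookup_stripe() helper, just to show how hit/miss events map onto those keys:

/* Illustration only; not code from the patch. */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct cache_stats {
        uint64_t hits;      /* write landed in the cached stripe             */
        uint64_t misses;    /* stripe not cached, READ had to hit the bricks */
        uint64_t updates;   /* cached stripe refreshed after a write         */
        uint64_t invals;    /* cached stripe invalidated                     */
        uint64_t evicts;    /* dropped to respect the stripe-cache limit     */
        uint64_t allocs;    /* stripe buffers allocated                      */
        uint64_t errors;    /* allocation or lookup failures                 */
};

static int
lookup_stripe (struct cache_stats *st, int cached)
{
        if (cached) {
                st->hits++;        /* no READ over the network       */
                return 1;
        }
        st->misses++;              /* fall back to read-modify-write */
        return 0;
}

int
main (void)
{
        struct cache_stats st;

        memset (&st, 0, sizeof (st));
        lookup_stripe (&st, 0);    /* first small write: miss    */
        lookup_stripe (&st, 1);    /* next sequential write: hit */

        printf ("hits=%" PRIu64 " misses=%" PRIu64 "\n", st.hits, st.misses);
        return 0;
}

In the translator itself the equivalent bumps would presumably be GF_ATOMIC_INC() calls on ec->stats.stripe_cache at the points where the cache is consulted, refreshed, invalidated, or trimmed.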