summaryrefslogtreecommitdiffstats
path: root/xlators/cluster/ec/src/ec-inode-write.c
diff options
context:
space:
mode:
authorAshish Pandey <aspandey@redhat.com>2017-09-18 14:07:31 +0530
committerJeff Darcy <jeff@pl.atyp.us>2017-11-10 22:15:37 +0000
commita87abbd42e8b02deabbdfe290b16ed0d2f2e4c45 (patch)
treeac36ba79b6e7a104cc49a9413a330d8499aee647 /xlators/cluster/ec/src/ec-inode-write.c
parent83558c69736878d2554ba77af3a6e27574da9447 (diff)
cluster/ec: Keep last written strip in in-memory cache
Problem: Consider an EC volume with configuration 4 + 2. The stripe size for this would be 512 * 4 = 2048. That means, 2048 bytes of user data stored in one stripe. Let's say 2048 + 512 = 2560 bytes are already written on this volume. 512 Bytes would be in second stripe. Now, if there are sequential writes with offset 2560 and of size 1 Byte, we have to read the whole stripe, encode it with 1 Byte and then again have to write it back. Next, write with offset 2561 and size of 1 Byte will again READ-MODIFY-WRITE the whole stripe. This is causing bad performance because of lots of READ request travelling over the network. There are some tools and scenario's where such kind of load is coming and users are not aware of that. Example: fio and zip Solution: One possible solution to deal with this issue is to keep last stripe in memory. This way, we need not to read it again and we can save READ fop going over the network. Considering the above example, we have to keep last 2048 bytes (maximum) in memory per file. Change-Id: I3f95e6fc3ff81953646d374c445a40c6886b0b85 BUG: 1471753 Signed-off-by: Ashish Pandey <aspandey@redhat.com>
Diffstat (limited to 'xlators/cluster/ec/src/ec-inode-write.c')
-rw-r--r--xlators/cluster/ec/src/ec-inode-write.c211
1 files changed, 197 insertions, 14 deletions
diff --git a/xlators/cluster/ec/src/ec-inode-write.c b/xlators/cluster/ec/src/ec-inode-write.c
index ae5120226e3..2f8170d1ac0 100644
--- a/xlators/cluster/ec/src/ec-inode-write.c
+++ b/xlators/cluster/ec/src/ec-inode-write.c
@@ -18,6 +18,7 @@
#include "ec-combine.h"
#include "ec-method.h"
#include "ec-fops.h"
+#include "ec-mem-types.h"
int32_t
ec_update_writev_cbk (call_frame_t *frame, void *cookie,
@@ -1169,12 +1170,14 @@ void ec_discard_adjust_offset_size(ec_fop_data_t *fop)
* write zeros.
*/
fop->int32 = ec_adjust_offset_up(ec, &fop->offset, _gf_true);
+ fop->frag_range.first = fop->offset;
if (fop->size < fop->int32) {
fop->size = 0;
} else {
fop->size -= fop->int32;
ec_adjust_size_down(ec, &fop->size, _gf_true);
}
+ fop->frag_range.last = fop->offset + fop->size;
}
int32_t ec_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
@@ -1436,6 +1439,8 @@ int32_t ec_manager_truncate(ec_fop_data_t * fop, int32_t state)
case EC_STATE_INIT:
fop->user_size = fop->offset;
ec_adjust_offset_up(fop->xl->private, &fop->offset, _gf_true);
+ fop->frag_range.first = fop->offset;
+ fop->frag_range.last = UINT64_MAX;
/* Fall through */
@@ -1696,6 +1701,78 @@ out:
}
/* FOP: writev */
+static ec_stripe_t *
+ec_allocate_stripe (ec_t *ec, ec_stripe_list_t *stripe_cache)
+{
+ ec_stripe_t *stripe = NULL;
+
+ if (stripe_cache->count >= stripe_cache->max) {
+ GF_ASSERT (!list_empty(&stripe_cache->lru));
+ stripe = list_first_entry(&stripe_cache->lru, ec_stripe_t, lru);
+ list_move_tail(&stripe->lru, &stripe_cache->lru);
+ GF_ATOMIC_INC(ec->stats.stripe_cache.evicts);
+ } else {
+ stripe = GF_MALLOC (sizeof (ec_stripe_t) + ec->stripe_size,
+ ec_mt_ec_stripe_t);
+ if (stripe != NULL) {
+ stripe_cache->count++;
+ list_add_tail (&stripe->lru, &stripe_cache->lru);
+ GF_ATOMIC_INC(ec->stats.stripe_cache.allocs);
+ } else {
+ GF_ATOMIC_INC(ec->stats.stripe_cache.errors);
+ }
+ }
+
+ return stripe;
+}
+
+static void
+ec_write_stripe_data (ec_t *ec, ec_fop_data_t *fop,
+ ec_stripe_t *stripe)
+{
+ off_t base;
+
+ base = fop->size - ec->stripe_size;
+ memcpy(stripe->data, fop->vector[0].iov_base + base, ec->stripe_size);
+ stripe->frag_offset = fop->frag_range.last - ec->fragment_size;
+}
+
+static void
+ec_add_stripe_in_cache (ec_t *ec, ec_fop_data_t *fop)
+{
+ ec_inode_t *ctx = NULL;
+ ec_stripe_t *stripe = NULL;
+ ec_stripe_list_t *stripe_cache = NULL;
+ gf_boolean_t failed = _gf_true;
+
+ LOCK(&fop->fd->inode->lock);
+
+ ctx = __ec_inode_get (fop->fd->inode, fop->xl);
+ if (ctx == NULL) {
+ goto out;
+ }
+
+ stripe_cache = &ctx->stripe_cache;
+ if (stripe_cache->max > 0) {
+ stripe = ec_allocate_stripe (ec, stripe_cache);
+ if (stripe == NULL) {
+ goto out;
+ }
+
+ ec_write_stripe_data (ec, fop, stripe);
+ }
+
+ failed = _gf_false;
+
+out:
+ UNLOCK(&fop->fd->inode->lock);
+
+ if (failed) {
+ gf_msg (ec->xl->name, GF_LOG_DEBUG, ENOMEM,
+ EC_MSG_FILE_DESC_REF_FAIL,
+ "Failed to create and add stripe in cache");
+ }
+}
int32_t ec_writev_merge_tail(call_frame_t * frame, void * cookie,
xlator_t * this, int32_t op_ret, int32_t op_errno,
@@ -1725,8 +1802,11 @@ int32_t ec_writev_merge_tail(call_frame_t * frame, void * cookie,
{
memset(fop->vector[0].iov_base + fop->size - size, 0, size);
}
- }
+ if (ec->stripe_cache) {
+ ec_add_stripe_in_cache (ec, fop);
+ }
+ }
return 0;
}
@@ -1775,7 +1855,7 @@ ec_make_internal_fop_xdata (dict_t **xdata)
dict_t *dict = NULL;
if (*xdata)
- return 0;
+ return 0;
dict = dict_new();
if (!dict)
@@ -1802,8 +1882,10 @@ ec_writev_prepare_buffers(ec_t *ec, ec_fop_data_t *fop)
fop->user_size = iov_length(fop->vector, fop->int32);
fop->head = ec_adjust_offset_down(ec, &fop->offset, _gf_false);
+ fop->frag_range.first = fop->offset / ec->fragments;
fop->size = fop->user_size + fop->head;
ec_adjust_size_up(ec, &fop->size, _gf_false);
+ fop->frag_range.last = fop->frag_range.first + fop->size / ec->fragments;
if ((fop->int32 != 1) || (fop->head != 0) ||
(fop->size > fop->user_size) ||
@@ -1850,6 +1932,98 @@ out:
return err;
}
+static void
+ec_merge_stripe_head_locked (ec_t *ec, ec_fop_data_t *fop, ec_stripe_t *stripe)
+{
+ size_t head, size;
+
+ head = fop->head;
+ memcpy(fop->vector[0].iov_base, stripe->data, head);
+
+ size = ec->stripe_size - head;
+ if (size > fop->user_size) {
+ head += fop->user_size;
+ size = ec->stripe_size - head;
+ memcpy(fop->vector[0].iov_base + head, stripe->data + head,
+ size);
+ }
+}
+
+static void
+ec_merge_stripe_tail_locked (ec_t *ec, ec_fop_data_t *fop, ec_stripe_t *stripe)
+{
+ size_t head, tail;
+ off_t offset;
+
+ offset = fop->user_size + fop->head;
+ tail = fop->size - offset;
+ head = ec->stripe_size - tail;
+
+ memcpy(fop->vector[0].iov_base + offset, stripe->data + head, tail);
+}
+
+static ec_stripe_t *
+ec_get_stripe_from_cache_locked (ec_t *ec, ec_fop_data_t *fop,
+ uint64_t frag_offset)
+{
+ ec_inode_t *ctx = NULL;
+ ec_stripe_t *stripe = NULL;
+ ec_stripe_list_t *stripe_cache = NULL;
+
+ ctx = __ec_inode_get (fop->fd->inode, fop->xl);
+ if (ctx == NULL) {
+ GF_ATOMIC_INC(ec->stats.stripe_cache.errors);
+ return NULL;
+ }
+
+ stripe_cache = &ctx->stripe_cache;
+ list_for_each_entry (stripe, &stripe_cache->lru, lru) {
+ if (stripe->frag_offset == frag_offset) {
+ list_move_tail (&stripe->lru, &stripe_cache->lru);
+ GF_ATOMIC_INC(ec->stats.stripe_cache.hits);
+ return stripe;
+ }
+ }
+
+ GF_ATOMIC_INC(ec->stats.stripe_cache.misses);
+
+ return NULL;
+}
+
+static gf_boolean_t
+ec_get_and_merge_stripe (ec_t *ec, ec_fop_data_t *fop, ec_stripe_part_t which)
+{
+ uint64_t frag_offset;
+ ec_stripe_t *stripe = NULL;
+ gf_boolean_t found = _gf_false;
+
+ if (!ec->stripe_cache) {
+ return found;
+ }
+
+ LOCK(&fop->fd->inode->lock);
+ if (which == EC_STRIPE_HEAD) {
+ frag_offset = fop->frag_range.first;
+ stripe = ec_get_stripe_from_cache_locked(ec, fop, frag_offset);
+ if (stripe) {
+ ec_merge_stripe_head_locked (ec, fop, stripe);
+ found = _gf_true;
+ }
+ }
+
+ if (which == EC_STRIPE_TAIL) {
+ frag_offset = fop->frag_range.last - ec->fragment_size;
+ stripe = ec_get_stripe_from_cache_locked(ec, fop, frag_offset);
+ if (stripe) {
+ ec_merge_stripe_tail_locked (ec, fop, stripe);
+ found = _gf_true;
+ }
+ }
+ UNLOCK(&fop->fd->inode->lock);
+
+ return found;
+}
+
void ec_writev_start(ec_fop_data_t *fop)
{
ec_t *ec = fop->xl->private;
@@ -1858,6 +2032,7 @@ void ec_writev_start(ec_fop_data_t *fop)
dict_t *xdata = NULL;
uint64_t tail, current;
int32_t err = -ENOMEM;
+ gf_boolean_t found_stripe = _gf_false;
/* This shouldn't fail because we have the inode locked. */
GF_ASSERT(ec_get_inode_size(fop, fop->fd->inode, &current));
@@ -1884,15 +2059,19 @@ void ec_writev_start(ec_fop_data_t *fop)
if (err != 0) {
goto failed_fd;
}
-
if (fop->head > 0) {
- if (ec_make_internal_fop_xdata (&xdata)) {
- err = -ENOMEM;
- goto failed_xdata;
+ found_stripe = ec_get_and_merge_stripe (ec, fop, EC_STRIPE_HEAD);
+ if (!found_stripe) {
+ if (ec_make_internal_fop_xdata (&xdata)) {
+ err = -ENOMEM;
+ goto failed_xdata;
+ }
+ ec_readv(fop->frame, fop->xl, -1, EC_MINIMUM_MIN,
+ ec_writev_merge_head,
+ NULL, fd, ec->stripe_size, fop->offset, 0, xdata);
}
- ec_readv(fop->frame, fop->xl, -1, EC_MINIMUM_MIN, ec_writev_merge_head,
- NULL, fd, ec->stripe_size, fop->offset, 0, xdata);
}
+
tail = fop->size - fop->user_size - fop->head;
if ((tail > 0) && ((fop->head == 0) || (fop->size > ec->stripe_size))) {
/* Current locking scheme will make sure the 'current' below will
@@ -1900,13 +2079,17 @@ void ec_writev_start(ec_fop_data_t *fop)
* work as expected
*/
if (current > fop->offset + fop->head + fop->user_size) {
- if (ec_make_internal_fop_xdata (&xdata)) {
- err = -ENOMEM;
- goto failed_xdata;
+ found_stripe = ec_get_and_merge_stripe (ec, fop, EC_STRIPE_TAIL);
+ if (!found_stripe) {
+ if (ec_make_internal_fop_xdata (&xdata)) {
+ err = -ENOMEM;
+ goto failed_xdata;
+ }
+ ec_readv(fop->frame, fop->xl, -1, EC_MINIMUM_MIN,
+ ec_writev_merge_tail, NULL, fd, ec->stripe_size,
+ fop->offset + fop->size - ec->stripe_size,
+ 0, xdata);
}
- ec_readv(fop->frame, fop->xl, -1, EC_MINIMUM_MIN,
- ec_writev_merge_tail, NULL, fd, ec->stripe_size,
- fop->offset + fop->size - ec->stripe_size, 0, xdata);
} else {
memset(fop->vector[0].iov_base + fop->size - tail, 0, tail);
}