diff options
author | Ashish Pandey <aspandey@redhat.com> | 2017-09-18 14:07:31 +0530 |
---|---|---|
committer | Jeff Darcy <jeff@pl.atyp.us> | 2017-11-10 22:15:37 +0000 |
commit | a87abbd42e8b02deabbdfe290b16ed0d2f2e4c45 (patch) | |
tree | ac36ba79b6e7a104cc49a9413a330d8499aee647 /xlators/cluster/ec/src/ec-inode-write.c | |
parent | 83558c69736878d2554ba77af3a6e27574da9447 (diff) |
cluster/ec: Keep last written strip in in-memory cache
Problem:
Consider an EC volume with configuration 4 + 2.
The stripe size for this would be 512 * 4 = 2048.
That means, 2048 bytes of user data stored in one
stripe. Let's say 2048 + 512 = 2560 bytes are
already written on this volume. 512 Bytes would
be in second stripe. Now, if there are sequential
writes with offset 2560 and of size 1 Byte, we have
to read the whole stripe, encode it with 1 Byte and
then again have to write it back. Next, write with
offset 2561 and size of 1 Byte will again
READ-MODIFY-WRITE the whole stripe. This is causing
bad performance because of lots of READ request
travelling over the network.
There are some tools and scenario's where such kind
of load is coming and users are not aware of that.
Example: fio and zip
Solution:
One possible solution to deal with this issue is to
keep last stripe in memory. This way, we need not to
read it again and we can save READ fop going over the
network. Considering the above example, we have to
keep last 2048 bytes (maximum) in memory per file.
Change-Id: I3f95e6fc3ff81953646d374c445a40c6886b0b85
BUG: 1471753
Signed-off-by: Ashish Pandey <aspandey@redhat.com>
Diffstat (limited to 'xlators/cluster/ec/src/ec-inode-write.c')
-rw-r--r-- | xlators/cluster/ec/src/ec-inode-write.c | 211 |
1 files changed, 197 insertions, 14 deletions
diff --git a/xlators/cluster/ec/src/ec-inode-write.c b/xlators/cluster/ec/src/ec-inode-write.c index ae5120226e3..2f8170d1ac0 100644 --- a/xlators/cluster/ec/src/ec-inode-write.c +++ b/xlators/cluster/ec/src/ec-inode-write.c @@ -18,6 +18,7 @@ #include "ec-combine.h" #include "ec-method.h" #include "ec-fops.h" +#include "ec-mem-types.h" int32_t ec_update_writev_cbk (call_frame_t *frame, void *cookie, @@ -1169,12 +1170,14 @@ void ec_discard_adjust_offset_size(ec_fop_data_t *fop) * write zeros. */ fop->int32 = ec_adjust_offset_up(ec, &fop->offset, _gf_true); + fop->frag_range.first = fop->offset; if (fop->size < fop->int32) { fop->size = 0; } else { fop->size -= fop->int32; ec_adjust_size_down(ec, &fop->size, _gf_true); } + fop->frag_range.last = fop->offset + fop->size; } int32_t ec_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this, @@ -1436,6 +1439,8 @@ int32_t ec_manager_truncate(ec_fop_data_t * fop, int32_t state) case EC_STATE_INIT: fop->user_size = fop->offset; ec_adjust_offset_up(fop->xl->private, &fop->offset, _gf_true); + fop->frag_range.first = fop->offset; + fop->frag_range.last = UINT64_MAX; /* Fall through */ @@ -1696,6 +1701,78 @@ out: } /* FOP: writev */ +static ec_stripe_t * +ec_allocate_stripe (ec_t *ec, ec_stripe_list_t *stripe_cache) +{ + ec_stripe_t *stripe = NULL; + + if (stripe_cache->count >= stripe_cache->max) { + GF_ASSERT (!list_empty(&stripe_cache->lru)); + stripe = list_first_entry(&stripe_cache->lru, ec_stripe_t, lru); + list_move_tail(&stripe->lru, &stripe_cache->lru); + GF_ATOMIC_INC(ec->stats.stripe_cache.evicts); + } else { + stripe = GF_MALLOC (sizeof (ec_stripe_t) + ec->stripe_size, + ec_mt_ec_stripe_t); + if (stripe != NULL) { + stripe_cache->count++; + list_add_tail (&stripe->lru, &stripe_cache->lru); + GF_ATOMIC_INC(ec->stats.stripe_cache.allocs); + } else { + GF_ATOMIC_INC(ec->stats.stripe_cache.errors); + } + } + + return stripe; +} + +static void +ec_write_stripe_data (ec_t *ec, ec_fop_data_t *fop, + ec_stripe_t *stripe) +{ + off_t base; + + base = fop->size - ec->stripe_size; + memcpy(stripe->data, fop->vector[0].iov_base + base, ec->stripe_size); + stripe->frag_offset = fop->frag_range.last - ec->fragment_size; +} + +static void +ec_add_stripe_in_cache (ec_t *ec, ec_fop_data_t *fop) +{ + ec_inode_t *ctx = NULL; + ec_stripe_t *stripe = NULL; + ec_stripe_list_t *stripe_cache = NULL; + gf_boolean_t failed = _gf_true; + + LOCK(&fop->fd->inode->lock); + + ctx = __ec_inode_get (fop->fd->inode, fop->xl); + if (ctx == NULL) { + goto out; + } + + stripe_cache = &ctx->stripe_cache; + if (stripe_cache->max > 0) { + stripe = ec_allocate_stripe (ec, stripe_cache); + if (stripe == NULL) { + goto out; + } + + ec_write_stripe_data (ec, fop, stripe); + } + + failed = _gf_false; + +out: + UNLOCK(&fop->fd->inode->lock); + + if (failed) { + gf_msg (ec->xl->name, GF_LOG_DEBUG, ENOMEM, + EC_MSG_FILE_DESC_REF_FAIL, + "Failed to create and add stripe in cache"); + } +} int32_t ec_writev_merge_tail(call_frame_t * frame, void * cookie, xlator_t * this, int32_t op_ret, int32_t op_errno, @@ -1725,8 +1802,11 @@ int32_t ec_writev_merge_tail(call_frame_t * frame, void * cookie, { memset(fop->vector[0].iov_base + fop->size - size, 0, size); } - } + if (ec->stripe_cache) { + ec_add_stripe_in_cache (ec, fop); + } + } return 0; } @@ -1775,7 +1855,7 @@ ec_make_internal_fop_xdata (dict_t **xdata) dict_t *dict = NULL; if (*xdata) - return 0; + return 0; dict = dict_new(); if (!dict) @@ -1802,8 +1882,10 @@ ec_writev_prepare_buffers(ec_t *ec, ec_fop_data_t *fop) fop->user_size = iov_length(fop->vector, fop->int32); fop->head = ec_adjust_offset_down(ec, &fop->offset, _gf_false); + fop->frag_range.first = fop->offset / ec->fragments; fop->size = fop->user_size + fop->head; ec_adjust_size_up(ec, &fop->size, _gf_false); + fop->frag_range.last = fop->frag_range.first + fop->size / ec->fragments; if ((fop->int32 != 1) || (fop->head != 0) || (fop->size > fop->user_size) || @@ -1850,6 +1932,98 @@ out: return err; } +static void +ec_merge_stripe_head_locked (ec_t *ec, ec_fop_data_t *fop, ec_stripe_t *stripe) +{ + size_t head, size; + + head = fop->head; + memcpy(fop->vector[0].iov_base, stripe->data, head); + + size = ec->stripe_size - head; + if (size > fop->user_size) { + head += fop->user_size; + size = ec->stripe_size - head; + memcpy(fop->vector[0].iov_base + head, stripe->data + head, + size); + } +} + +static void +ec_merge_stripe_tail_locked (ec_t *ec, ec_fop_data_t *fop, ec_stripe_t *stripe) +{ + size_t head, tail; + off_t offset; + + offset = fop->user_size + fop->head; + tail = fop->size - offset; + head = ec->stripe_size - tail; + + memcpy(fop->vector[0].iov_base + offset, stripe->data + head, tail); +} + +static ec_stripe_t * +ec_get_stripe_from_cache_locked (ec_t *ec, ec_fop_data_t *fop, + uint64_t frag_offset) +{ + ec_inode_t *ctx = NULL; + ec_stripe_t *stripe = NULL; + ec_stripe_list_t *stripe_cache = NULL; + + ctx = __ec_inode_get (fop->fd->inode, fop->xl); + if (ctx == NULL) { + GF_ATOMIC_INC(ec->stats.stripe_cache.errors); + return NULL; + } + + stripe_cache = &ctx->stripe_cache; + list_for_each_entry (stripe, &stripe_cache->lru, lru) { + if (stripe->frag_offset == frag_offset) { + list_move_tail (&stripe->lru, &stripe_cache->lru); + GF_ATOMIC_INC(ec->stats.stripe_cache.hits); + return stripe; + } + } + + GF_ATOMIC_INC(ec->stats.stripe_cache.misses); + + return NULL; +} + +static gf_boolean_t +ec_get_and_merge_stripe (ec_t *ec, ec_fop_data_t *fop, ec_stripe_part_t which) +{ + uint64_t frag_offset; + ec_stripe_t *stripe = NULL; + gf_boolean_t found = _gf_false; + + if (!ec->stripe_cache) { + return found; + } + + LOCK(&fop->fd->inode->lock); + if (which == EC_STRIPE_HEAD) { + frag_offset = fop->frag_range.first; + stripe = ec_get_stripe_from_cache_locked(ec, fop, frag_offset); + if (stripe) { + ec_merge_stripe_head_locked (ec, fop, stripe); + found = _gf_true; + } + } + + if (which == EC_STRIPE_TAIL) { + frag_offset = fop->frag_range.last - ec->fragment_size; + stripe = ec_get_stripe_from_cache_locked(ec, fop, frag_offset); + if (stripe) { + ec_merge_stripe_tail_locked (ec, fop, stripe); + found = _gf_true; + } + } + UNLOCK(&fop->fd->inode->lock); + + return found; +} + void ec_writev_start(ec_fop_data_t *fop) { ec_t *ec = fop->xl->private; @@ -1858,6 +2032,7 @@ void ec_writev_start(ec_fop_data_t *fop) dict_t *xdata = NULL; uint64_t tail, current; int32_t err = -ENOMEM; + gf_boolean_t found_stripe = _gf_false; /* This shouldn't fail because we have the inode locked. */ GF_ASSERT(ec_get_inode_size(fop, fop->fd->inode, ¤t)); @@ -1884,15 +2059,19 @@ void ec_writev_start(ec_fop_data_t *fop) if (err != 0) { goto failed_fd; } - if (fop->head > 0) { - if (ec_make_internal_fop_xdata (&xdata)) { - err = -ENOMEM; - goto failed_xdata; + found_stripe = ec_get_and_merge_stripe (ec, fop, EC_STRIPE_HEAD); + if (!found_stripe) { + if (ec_make_internal_fop_xdata (&xdata)) { + err = -ENOMEM; + goto failed_xdata; + } + ec_readv(fop->frame, fop->xl, -1, EC_MINIMUM_MIN, + ec_writev_merge_head, + NULL, fd, ec->stripe_size, fop->offset, 0, xdata); } - ec_readv(fop->frame, fop->xl, -1, EC_MINIMUM_MIN, ec_writev_merge_head, - NULL, fd, ec->stripe_size, fop->offset, 0, xdata); } + tail = fop->size - fop->user_size - fop->head; if ((tail > 0) && ((fop->head == 0) || (fop->size > ec->stripe_size))) { /* Current locking scheme will make sure the 'current' below will @@ -1900,13 +2079,17 @@ void ec_writev_start(ec_fop_data_t *fop) * work as expected */ if (current > fop->offset + fop->head + fop->user_size) { - if (ec_make_internal_fop_xdata (&xdata)) { - err = -ENOMEM; - goto failed_xdata; + found_stripe = ec_get_and_merge_stripe (ec, fop, EC_STRIPE_TAIL); + if (!found_stripe) { + if (ec_make_internal_fop_xdata (&xdata)) { + err = -ENOMEM; + goto failed_xdata; + } + ec_readv(fop->frame, fop->xl, -1, EC_MINIMUM_MIN, + ec_writev_merge_tail, NULL, fd, ec->stripe_size, + fop->offset + fop->size - ec->stripe_size, + 0, xdata); } - ec_readv(fop->frame, fop->xl, -1, EC_MINIMUM_MIN, - ec_writev_merge_tail, NULL, fd, ec->stripe_size, - fop->offset + fop->size - ec->stripe_size, 0, xdata); } else { memset(fop->vector[0].iov_base + fop->size - tail, 0, tail); } |