author      Ashish Pandey <aspandey@redhat.com>        2017-09-18 14:07:31 +0530
committer   Jeff Darcy <jeff@pl.atyp.us>               2017-11-10 22:15:37 +0000
commit      a87abbd42e8b02deabbdfe290b16ed0d2f2e4c45 (patch)
tree        ac36ba79b6e7a104cc49a9413a330d8499aee647 /xlators/cluster/ec/src
parent      83558c69736878d2554ba77af3a6e27574da9447 (diff)
cluster/ec: Keep last written stripe in an in-memory cache

Problem: Consider an EC volume with a 4 + 2 configuration. The stripe size would be 512 * 4 = 2048, i.e. 2048 bytes of user data are stored in one stripe. Suppose 2048 + 512 = 2560 bytes have already been written to this volume, so 512 bytes sit in the second stripe. Now, if a sequential write arrives at offset 2560 with a size of 1 byte, we have to read the whole stripe, encode it with that 1 byte, and then write it back. The next write, at offset 2561 with a size of 1 byte, will READ-MODIFY-WRITE the whole stripe again. This causes bad performance because of the many READ requests travelling over the network. Some tools and scenarios generate this kind of load without users being aware of it; fio and zip are examples.

Solution: One possible solution to deal with this issue is to keep the last stripe in memory. This way we need not read it again, saving the READ fop from going over the network. Considering the example above, we have to keep at most the last 2048 bytes in memory per file.

Change-Id: I3f95e6fc3ff81953646d374c445a40c6886b0b85
BUG: 1471753
Signed-off-by: Ashish Pandey <aspandey@redhat.com>
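To make the arithmetic concrete, the following is a minimal standalone sketch (illustrative only, not part of this patch) of the read-modify-write cost for the 4 + 2 example with 512-byte fragments:

/* Illustrative sketch -- shows why a 1-byte unaligned write on a 4+2 EC
 * volume forces (almost) a full stripe to be read back first. */
#include <stdio.h>
#include <stdint.h>

int main (void)
{
        const uint64_t fragment_size = 512;  /* bytes stored per brick   */
        const uint64_t data_bricks   = 4;    /* the '4' in 4 + 2         */
        const uint64_t stripe_size   = fragment_size * data_bricks; /* 2048 */

        uint64_t offset = 2560;              /* one full stripe + 512    */
        uint64_t size   = 1;                 /* 1-byte sequential write  */

        /* Align the write to stripe boundaries, as ec must. */
        uint64_t head  = offset % stripe_size;            /* 512 */
        uint64_t total = head + size;                     /* 513 */
        uint64_t tail  = (stripe_size - total % stripe_size) % stripe_size;

        /* Without a cache, 'head + tail' bytes (2047 here) must be read
         * from the bricks before re-encoding and writing the stripe. */
        printf ("stripe=%llu head=%llu tail=%llu -> read %llu bytes "
                "to write %llu\n",
                (unsigned long long)stripe_size, (unsigned long long)head,
                (unsigned long long)tail, (unsigned long long)(head + tail),
                (unsigned long long)size);
        return 0;
}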
Diffstat (limited to 'xlators/cluster/ec/src')
-rw-r--r--  xlators/cluster/ec/src/ec-common.c      | 115
-rw-r--r--  xlators/cluster/ec/src/ec-common.h      |   1
-rw-r--r--  xlators/cluster/ec/src/ec-helpers.c     |  15
-rw-r--r--  xlators/cluster/ec/src/ec-inode-write.c | 211
-rw-r--r--  xlators/cluster/ec/src/ec-mem-types.h   |   1
-rw-r--r--  xlators/cluster/ec/src/ec-types.h       | 206
-rw-r--r--  xlators/cluster/ec/src/ec.c             |  52
-rw-r--r--  xlators/cluster/ec/src/ec.h             |   2
8 files changed, 519 insertions, 84 deletions
diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c
index ea5773c9879..29ab66f374c 100644
--- a/xlators/cluster/ec/src/ec-common.c
+++ b/xlators/cluster/ec/src/ec-common.c
@@ -1437,6 +1437,23 @@ ec_set_inode_size(ec_fop_data_t *fop, inode_t *inode,
return found;
}
+static void
+ec_release_stripe_cache (ec_inode_t *ctx)
+{
+ ec_stripe_list_t *stripe_cache = NULL;
+ ec_stripe_t *stripe = NULL;
+
+ stripe_cache = &ctx->stripe_cache;
+ while (!list_empty (&stripe_cache->lru)) {
+ stripe = list_first_entry (&stripe_cache->lru, ec_stripe_t,
+ lru);
+ list_del (&stripe->lru);
+ GF_FREE (stripe);
+ }
+ stripe_cache->count = 0;
+ stripe_cache->max = 0;
+}
+
void ec_clear_inode_info(ec_fop_data_t *fop, inode_t *inode)
{
ec_inode_t *ctx;
@@ -1448,6 +1465,7 @@ void ec_clear_inode_info(ec_fop_data_t *fop, inode_t *inode)
goto unlock;
}
+ ec_release_stripe_cache (ctx);
ctx->have_info = _gf_false;
ctx->have_config = _gf_false;
ctx->have_version = _gf_false;
@@ -2465,6 +2483,102 @@ ec_use_eager_lock(ec_t *ec, ec_fop_data_t *fop)
return ec->other_eager_lock;
}
+static void
+ec_update_stripe(ec_t *ec, ec_stripe_list_t *stripe_cache, ec_stripe_t *stripe,
+ ec_fop_data_t *fop)
+{
+ off_t base;
+
+ /* On write fops, we only update existing fragments if the write has
+ * succeeded. Otherwise, we remove them from the cache. */
+ if ((fop->id == GF_FOP_WRITE) && (fop->answer != NULL) &&
+ (fop->answer->op_ret >= 0)) {
+ base = stripe->frag_offset - fop->frag_range.first;
+ base *= ec->fragments;
+
+ /* We check if the stripe offset falls inside the real region
+ * modified by the write fop (a write request is allowed,
+ * though uncommon, to write less bytes than requested). The
+ * current write fop implementation doesn't allow partial
+ * writes of fragments, so if there's no error, we are sure
+ * that a full stripe has been completely modified or not
+ * touched at all. The value of op_ret may not be a multiple
+ * of the stripe size because it depends on the requested
+ * size by the user, so we update the stripe if the write has
+ * modified at least one byte (meaning ec has written the full
+ * stripe). */
+ if (base < fop->answer->op_ret) {
+ memcpy(stripe->data, fop->vector[0].iov_base + base,
+ ec->stripe_size);
+ list_move_tail(&stripe->lru, &stripe_cache->lru);
+
+ GF_ATOMIC_INC(ec->stats.stripe_cache.updates);
+ }
+ } else {
+ stripe->frag_offset = -1;
+ list_move (&stripe->lru, &stripe_cache->lru);
+
+ GF_ATOMIC_INC(ec->stats.stripe_cache.invals);
+ }
+}
+
+static void
+ec_update_cached_stripes (ec_fop_data_t *fop)
+{
+ uint64_t first;
+ uint64_t last;
+ ec_stripe_t *stripe = NULL;
+ ec_inode_t *ctx = NULL;
+ ec_stripe_list_t *stripe_cache = NULL;
+ inode_t *inode = NULL;
+ struct list_head *temp;
+ struct list_head sentinel;
+
+ first = fop->frag_range.first;
+ /* 'last' represents the first stripe not touched by the operation */
+ last = fop->frag_range.last;
+
+ /* If there are no modified stripes, we don't need to do anything
+ * else. */
+ if (last <= first) {
+ return;
+ }
+
+ if (!fop->use_fd) {
+ inode = fop->loc[0].inode;
+ } else {
+ inode = fop->fd->inode;
+ }
+
+ LOCK(&inode->lock);
+
+ ctx = __ec_inode_get (inode, fop->xl);
+ if (ctx == NULL) {
+ goto out;
+ }
+ stripe_cache = &ctx->stripe_cache;
+
+ /* Since we'll be moving elements of the list to the tail, we might
+ * end in an infinite loop. To avoid it, we insert a sentinel element
+ * into the list, so that it will be used to detect when we have
+ * traversed all existing elements once. */
+ list_add_tail(&sentinel, &stripe_cache->lru);
+ temp = stripe_cache->lru.next;
+ while (temp != &sentinel) {
+ stripe = list_entry(temp, ec_stripe_t, lru);
+ temp = temp->next;
+ if ((first <= stripe->frag_offset) &&
+ (stripe->frag_offset < last)) {
+ ec_update_stripe (fop->xl->private, stripe_cache,
+ stripe, fop);
+ }
+ }
+ list_del(&sentinel);
+
+out:
+ UNLOCK(&inode->lock);
+}
+
void ec_lock_reuse(ec_fop_data_t *fop)
{
ec_cbk_data_t *cbk;
@@ -2491,6 +2605,7 @@ void ec_lock_reuse(ec_fop_data_t *fop)
* the lock. */
release = _gf_true;
}
+ ec_update_cached_stripes (fop);
for (i = 0; i < fop->lock_count; i++) {
ec_lock_next_owner(&fop->locks[i], cbk, release);
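A note on the sentinel traversal in ec_update_cached_stripes() above: since entries are moved to the tail during the walk, a plain iteration could loop forever. The following standalone sketch (illustrative only; simplified re-implementations of the kernel-style list macros, assumed equivalent to the list.h helpers ec uses) shows the technique in isolation:

#include <stddef.h>
#include <stdio.h>

struct list_head { struct list_head *prev, *next; };

static void list_init (struct list_head *h) { h->prev = h->next = h; }

static void list_add_tail (struct list_head *n, struct list_head *h)
{
        n->prev = h->prev; n->next = h;
        h->prev->next = n; h->prev = n;
}

static void list_del (struct list_head *n)
{
        n->prev->next = n->next; n->next->prev = n->prev;
}

static void list_move_tail (struct list_head *n, struct list_head *h)
{
        list_del (n); list_add_tail (n, h);
}

#define list_entry(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof (type, member)))

struct stripe { int frag_offset; struct list_head lru; };

int main (void)
{
        struct list_head lru, sentinel, *tmp;
        struct stripe s[3] = { { .frag_offset = 0 }, { .frag_offset = 1 },
                               { .frag_offset = 2 } };
        int i;

        list_init (&lru);
        for (i = 0; i < 3; i++)
                list_add_tail (&s[i].lru, &lru);

        /* Insert the sentinel, then walk until it is reached. Entries
         * moved to the tail land after the sentinel, so the loop visits
         * each original entry exactly once and cannot spin forever. */
        list_add_tail (&sentinel, &lru);
        tmp = lru.next;
        while (tmp != &sentinel) {
                struct stripe *st = list_entry (tmp, struct stripe, lru);
                tmp = tmp->next;
                if (st->frag_offset >= 1)      /* "touched by the fop" */
                        list_move_tail (&st->lru, &lru);
        }
        list_del (&sentinel);

        for (tmp = lru.next; tmp != &lru; tmp = tmp->next)
                printf ("%d ", list_entry (tmp, struct stripe, lru)->frag_offset);
        printf ("\n");
        return 0;
}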
diff --git a/xlators/cluster/ec/src/ec-common.h b/xlators/cluster/ec/src/ec-common.h
index 9a35391a781..35c6a8107c2 100644
--- a/xlators/cluster/ec/src/ec-common.h
+++ b/xlators/cluster/ec/src/ec-common.h
@@ -139,4 +139,5 @@ ec_get_heal_info (xlator_t *this, loc_t *loc, dict_t **dict);
int32_t ec_lock_unlocked(call_frame_t *frame, void *cookie,
xlator_t *this, int32_t op_ret, int32_t op_errno,
dict_t *xdata);
+
#endif /* __EC_COMMON_H__ */
diff --git a/xlators/cluster/ec/src/ec-helpers.c b/xlators/cluster/ec/src/ec-helpers.c
index 122fe24b5d3..fe610748f0f 100644
--- a/xlators/cluster/ec/src/ec-helpers.c
+++ b/xlators/cluster/ec/src/ec-helpers.c
@@ -706,6 +706,17 @@ void ec_owner_copy(call_frame_t *frame, gf_lkowner_t *owner)
lk_owner_copy (&frame->root->lk_owner, owner);
}
+static void
+ec_stripe_cache_init (ec_t *ec, ec_inode_t *ctx)
+{
+ ec_stripe_list_t *stripe_cache = NULL;
+
+ stripe_cache = &(ctx->stripe_cache);
+ if (stripe_cache->max == 0) {
+ stripe_cache->max = ec->stripe_cache;
+ }
+}
+
ec_inode_t * __ec_inode_get(inode_t * inode, xlator_t * xl)
{
ec_inode_t * ctx = NULL;
@@ -718,7 +729,7 @@ ec_inode_t * __ec_inode_get(inode_t * inode, xlator_t * xl)
{
memset(ctx, 0, sizeof(*ctx));
INIT_LIST_HEAD(&ctx->heal);
-
+ INIT_LIST_HEAD(&ctx->stripe_cache.lru);
value = (uint64_t)(uintptr_t)ctx;
if (__inode_ctx_set(inode, xl, &value) != 0)
{
@@ -732,6 +743,8 @@ ec_inode_t * __ec_inode_get(inode_t * inode, xlator_t * xl)
{
ctx = (ec_inode_t *)(uintptr_t)value;
}
+ if (ctx)
+ ec_stripe_cache_init (xl->private, ctx);
return ctx;
}
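A side note on ec_stripe_cache_init() above: the per-inode limit is copied from the volume-wide option only while it is still zero, so the cache size is captured lazily. A tiny sketch of the consequence (hypothetical names, not code from the patch):

/* Sketch (hypothetical names): the limit is captured once, so a later
 * 'stripe-cache' reconfigure takes effect for an inode only after its
 * context is cleared (ec_clear_inode_info resets max to 0 through
 * ec_release_stripe_cache) and then re-initialized. */
struct cache  { unsigned int max; };
struct volume { unsigned int opt_stripe_cache; };

static void
cache_init (struct volume *vol, struct cache *c)
{
        if (c->max == 0)        /* only the first caller sets the limit */
                c->max = vol->opt_stripe_cache;
}

int main (void)
{
        struct volume vol = { .opt_stripe_cache = 4 };
        struct cache  c   = { .max = 0 };

        cache_init (&vol, &c);         /* captures 4                       */
        vol.opt_stripe_cache = 8;      /* reconfigure...                   */
        cache_init (&vol, &c);         /* ...has no effect: max stays 4    */
        return (int)c.max;
}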
diff --git a/xlators/cluster/ec/src/ec-inode-write.c b/xlators/cluster/ec/src/ec-inode-write.c
index ae5120226e3..2f8170d1ac0 100644
--- a/xlators/cluster/ec/src/ec-inode-write.c
+++ b/xlators/cluster/ec/src/ec-inode-write.c
@@ -18,6 +18,7 @@
#include "ec-combine.h"
#include "ec-method.h"
#include "ec-fops.h"
+#include "ec-mem-types.h"
int32_t
ec_update_writev_cbk (call_frame_t *frame, void *cookie,
@@ -1169,12 +1170,14 @@ void ec_discard_adjust_offset_size(ec_fop_data_t *fop)
* write zeros.
*/
fop->int32 = ec_adjust_offset_up(ec, &fop->offset, _gf_true);
+ fop->frag_range.first = fop->offset;
if (fop->size < fop->int32) {
fop->size = 0;
} else {
fop->size -= fop->int32;
ec_adjust_size_down(ec, &fop->size, _gf_true);
}
+ fop->frag_range.last = fop->offset + fop->size;
}
int32_t ec_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
@@ -1436,6 +1439,8 @@ int32_t ec_manager_truncate(ec_fop_data_t * fop, int32_t state)
case EC_STATE_INIT:
fop->user_size = fop->offset;
ec_adjust_offset_up(fop->xl->private, &fop->offset, _gf_true);
+ fop->frag_range.first = fop->offset;
+ fop->frag_range.last = UINT64_MAX;
/* Fall through */
@@ -1696,6 +1701,78 @@ out:
}
/* FOP: writev */
+static ec_stripe_t *
+ec_allocate_stripe (ec_t *ec, ec_stripe_list_t *stripe_cache)
+{
+ ec_stripe_t *stripe = NULL;
+
+ if (stripe_cache->count >= stripe_cache->max) {
+ GF_ASSERT (!list_empty(&stripe_cache->lru));
+ stripe = list_first_entry(&stripe_cache->lru, ec_stripe_t, lru);
+ list_move_tail(&stripe->lru, &stripe_cache->lru);
+ GF_ATOMIC_INC(ec->stats.stripe_cache.evicts);
+ } else {
+ stripe = GF_MALLOC (sizeof (ec_stripe_t) + ec->stripe_size,
+ ec_mt_ec_stripe_t);
+ if (stripe != NULL) {
+ stripe_cache->count++;
+ list_add_tail (&stripe->lru, &stripe_cache->lru);
+ GF_ATOMIC_INC(ec->stats.stripe_cache.allocs);
+ } else {
+ GF_ATOMIC_INC(ec->stats.stripe_cache.errors);
+ }
+ }
+
+ return stripe;
+}
+
+static void
+ec_write_stripe_data (ec_t *ec, ec_fop_data_t *fop,
+ ec_stripe_t *stripe)
+{
+ off_t base;
+
+ base = fop->size - ec->stripe_size;
+ memcpy(stripe->data, fop->vector[0].iov_base + base, ec->stripe_size);
+ stripe->frag_offset = fop->frag_range.last - ec->fragment_size;
+}
+
+static void
+ec_add_stripe_in_cache (ec_t *ec, ec_fop_data_t *fop)
+{
+ ec_inode_t *ctx = NULL;
+ ec_stripe_t *stripe = NULL;
+ ec_stripe_list_t *stripe_cache = NULL;
+ gf_boolean_t failed = _gf_true;
+
+ LOCK(&fop->fd->inode->lock);
+
+ ctx = __ec_inode_get (fop->fd->inode, fop->xl);
+ if (ctx == NULL) {
+ goto out;
+ }
+
+ stripe_cache = &ctx->stripe_cache;
+ if (stripe_cache->max > 0) {
+ stripe = ec_allocate_stripe (ec, stripe_cache);
+ if (stripe == NULL) {
+ goto out;
+ }
+
+ ec_write_stripe_data (ec, fop, stripe);
+ }
+
+ failed = _gf_false;
+
+out:
+ UNLOCK(&fop->fd->inode->lock);
+
+ if (failed) {
+ gf_msg (ec->xl->name, GF_LOG_DEBUG, ENOMEM,
+ EC_MSG_FILE_DESC_REF_FAIL,
+ "Failed to create and add stripe in cache");
+ }
+}
int32_t ec_writev_merge_tail(call_frame_t * frame, void * cookie,
xlator_t * this, int32_t op_ret, int32_t op_errno,
@@ -1725,8 +1802,11 @@ int32_t ec_writev_merge_tail(call_frame_t * frame, void * cookie,
{
memset(fop->vector[0].iov_base + fop->size - size, 0, size);
}
- }
+ if (ec->stripe_cache) {
+ ec_add_stripe_in_cache (ec, fop);
+ }
+ }
return 0;
}
@@ -1775,7 +1855,7 @@ ec_make_internal_fop_xdata (dict_t **xdata)
dict_t *dict = NULL;
if (*xdata)
- return 0;
+ return 0;
dict = dict_new();
if (!dict)
@@ -1802,8 +1882,10 @@ ec_writev_prepare_buffers(ec_t *ec, ec_fop_data_t *fop)
fop->user_size = iov_length(fop->vector, fop->int32);
fop->head = ec_adjust_offset_down(ec, &fop->offset, _gf_false);
+ fop->frag_range.first = fop->offset / ec->fragments;
fop->size = fop->user_size + fop->head;
ec_adjust_size_up(ec, &fop->size, _gf_false);
+ fop->frag_range.last = fop->frag_range.first + fop->size / ec->fragments;
if ((fop->int32 != 1) || (fop->head != 0) ||
(fop->size > fop->user_size) ||
@@ -1850,6 +1932,98 @@ out:
return err;
}
+static void
+ec_merge_stripe_head_locked (ec_t *ec, ec_fop_data_t *fop, ec_stripe_t *stripe)
+{
+ size_t head, size;
+
+ head = fop->head;
+ memcpy(fop->vector[0].iov_base, stripe->data, head);
+
+ size = ec->stripe_size - head;
+ if (size > fop->user_size) {
+ head += fop->user_size;
+ size = ec->stripe_size - head;
+ memcpy(fop->vector[0].iov_base + head, stripe->data + head,
+ size);
+ }
+}
+
+static void
+ec_merge_stripe_tail_locked (ec_t *ec, ec_fop_data_t *fop, ec_stripe_t *stripe)
+{
+ size_t head, tail;
+ off_t offset;
+
+ offset = fop->user_size + fop->head;
+ tail = fop->size - offset;
+ head = ec->stripe_size - tail;
+
+ memcpy(fop->vector[0].iov_base + offset, stripe->data + head, tail);
+}
+
+static ec_stripe_t *
+ec_get_stripe_from_cache_locked (ec_t *ec, ec_fop_data_t *fop,
+ uint64_t frag_offset)
+{
+ ec_inode_t *ctx = NULL;
+ ec_stripe_t *stripe = NULL;
+ ec_stripe_list_t *stripe_cache = NULL;
+
+ ctx = __ec_inode_get (fop->fd->inode, fop->xl);
+ if (ctx == NULL) {
+ GF_ATOMIC_INC(ec->stats.stripe_cache.errors);
+ return NULL;
+ }
+
+ stripe_cache = &ctx->stripe_cache;
+ list_for_each_entry (stripe, &stripe_cache->lru, lru) {
+ if (stripe->frag_offset == frag_offset) {
+ list_move_tail (&stripe->lru, &stripe_cache->lru);
+ GF_ATOMIC_INC(ec->stats.stripe_cache.hits);
+ return stripe;
+ }
+ }
+
+ GF_ATOMIC_INC(ec->stats.stripe_cache.misses);
+
+ return NULL;
+}
+
+static gf_boolean_t
+ec_get_and_merge_stripe (ec_t *ec, ec_fop_data_t *fop, ec_stripe_part_t which)
+{
+ uint64_t frag_offset;
+ ec_stripe_t *stripe = NULL;
+ gf_boolean_t found = _gf_false;
+
+ if (!ec->stripe_cache) {
+ return found;
+ }
+
+ LOCK(&fop->fd->inode->lock);
+ if (which == EC_STRIPE_HEAD) {
+ frag_offset = fop->frag_range.first;
+ stripe = ec_get_stripe_from_cache_locked(ec, fop, frag_offset);
+ if (stripe) {
+ ec_merge_stripe_head_locked (ec, fop, stripe);
+ found = _gf_true;
+ }
+ }
+
+ if (which == EC_STRIPE_TAIL) {
+ frag_offset = fop->frag_range.last - ec->fragment_size;
+ stripe = ec_get_stripe_from_cache_locked(ec, fop, frag_offset);
+ if (stripe) {
+ ec_merge_stripe_tail_locked (ec, fop, stripe);
+ found = _gf_true;
+ }
+ }
+ UNLOCK(&fop->fd->inode->lock);
+
+ return found;
+}
+
void ec_writev_start(ec_fop_data_t *fop)
{
ec_t *ec = fop->xl->private;
@@ -1858,6 +2032,7 @@ void ec_writev_start(ec_fop_data_t *fop)
dict_t *xdata = NULL;
uint64_t tail, current;
int32_t err = -ENOMEM;
+ gf_boolean_t found_stripe = _gf_false;
/* This shouldn't fail because we have the inode locked. */
GF_ASSERT(ec_get_inode_size(fop, fop->fd->inode, &current));
@@ -1884,15 +2059,19 @@ void ec_writev_start(ec_fop_data_t *fop)
if (err != 0) {
goto failed_fd;
}
-
if (fop->head > 0) {
- if (ec_make_internal_fop_xdata (&xdata)) {
- err = -ENOMEM;
- goto failed_xdata;
+ found_stripe = ec_get_and_merge_stripe (ec, fop, EC_STRIPE_HEAD);
+ if (!found_stripe) {
+ if (ec_make_internal_fop_xdata (&xdata)) {
+ err = -ENOMEM;
+ goto failed_xdata;
+ }
+ ec_readv(fop->frame, fop->xl, -1, EC_MINIMUM_MIN,
+ ec_writev_merge_head,
+ NULL, fd, ec->stripe_size, fop->offset, 0, xdata);
}
- ec_readv(fop->frame, fop->xl, -1, EC_MINIMUM_MIN, ec_writev_merge_head,
- NULL, fd, ec->stripe_size, fop->offset, 0, xdata);
}
+
tail = fop->size - fop->user_size - fop->head;
if ((tail > 0) && ((fop->head == 0) || (fop->size > ec->stripe_size))) {
/* Current locking scheme will make sure the 'current' below will
@@ -1900,13 +2079,17 @@ void ec_writev_start(ec_fop_data_t *fop)
* work as expected
*/
if (current > fop->offset + fop->head + fop->user_size) {
- if (ec_make_internal_fop_xdata (&xdata)) {
- err = -ENOMEM;
- goto failed_xdata;
+ found_stripe = ec_get_and_merge_stripe (ec, fop, EC_STRIPE_TAIL);
+ if (!found_stripe) {
+ if (ec_make_internal_fop_xdata (&xdata)) {
+ err = -ENOMEM;
+ goto failed_xdata;
+ }
+ ec_readv(fop->frame, fop->xl, -1, EC_MINIMUM_MIN,
+ ec_writev_merge_tail, NULL, fd, ec->stripe_size,
+ fop->offset + fop->size - ec->stripe_size,
+ 0, xdata);
}
- ec_readv(fop->frame, fop->xl, -1, EC_MINIMUM_MIN,
- ec_writev_merge_tail, NULL, fd, ec->stripe_size,
- fop->offset + fop->size - ec->stripe_size, 0, xdata);
} else {
memset(fop->vector[0].iov_base + fop->size - tail, 0, tail);
}
diff --git a/xlators/cluster/ec/src/ec-mem-types.h b/xlators/cluster/ec/src/ec-mem-types.h
index 9a4b6c58049..8109a422d9d 100644
--- a/xlators/cluster/ec/src/ec-mem-types.h
+++ b/xlators/cluster/ec/src/ec-mem-types.h
@@ -25,6 +25,7 @@ enum gf_ec_mem_types_
ec_mt_ec_code_t,
ec_mt_ec_code_builder_t,
ec_mt_ec_matrix_t,
+ ec_mt_ec_stripe_t,
ec_mt_end
};
diff --git a/xlators/cluster/ec/src/ec-types.h b/xlators/cluster/ec/src/ec-types.h
index 23b30548450..7b2b7b8247d 100644
--- a/xlators/cluster/ec/src/ec-types.h
+++ b/xlators/cluster/ec/src/ec-types.h
@@ -14,12 +14,16 @@
#include "xlator.h"
#include "timer.h"
#include "libxlator.h"
+#include "atomic.h"
#define EC_GF_MAX_REGS 16
enum _ec_heal_need;
typedef enum _ec_heal_need ec_heal_need_t;
+enum _ec_stripe_part;
+typedef enum _ec_stripe_part ec_stripe_part_t;
+
enum _ec_read_policy;
typedef enum _ec_read_policy ec_read_policy_t;
@@ -29,6 +33,9 @@ typedef struct _ec_config ec_config_t;
struct _ec_fd;
typedef struct _ec_fd ec_fd_t;
+struct _ec_fragment_range;
+typedef struct _ec_fragment_range ec_fragment_range_t;
+
struct _ec_inode;
typedef struct _ec_inode ec_inode_t;
@@ -77,6 +84,12 @@ typedef struct _ec_code_builder ec_code_builder_t;
struct _ec_code_chunk;
typedef struct _ec_code_chunk ec_code_chunk_t;
+struct _ec_stripe;
+typedef struct _ec_stripe ec_stripe_t;
+
+struct _ec_stripe_list;
+typedef struct _ec_stripe_list ec_stripe_list_t;
+
struct _ec_code_space;
typedef struct _ec_code_space ec_code_space_t;
@@ -105,6 +118,9 @@ typedef struct _ec_heal ec_heal_t;
struct _ec_self_heald;
typedef struct _ec_self_heald ec_self_heald_t;
+struct _ec_statistics;
+typedef struct _ec_statistics ec_statistics_t;
+
struct _ec;
typedef struct _ec ec_t;
@@ -124,6 +140,11 @@ enum _ec_heal_need {
EC_HEAL_MUST
};
+enum _ec_stripe_part {
+ EC_STRIPE_HEAD,
+ EC_STRIPE_TAIL
+};
+
struct _ec_config {
uint32_t version;
uint8_t algorithm;
@@ -139,6 +160,18 @@ struct _ec_fd {
int32_t flags;
};
+struct _ec_stripe {
+ struct list_head lru; /* LRU list member */
+ uint64_t frag_offset; /* Fragment offset of this stripe */
+ char data[]; /* Contents of the stripe */
+};
+
+struct _ec_stripe_list {
+ struct list_head lru;
+ uint32_t count;
+ uint32_t max;
+};
+
struct _ec_inode {
ec_lock_t *inode_lock;
gf_boolean_t have_info;
@@ -152,8 +185,10 @@ struct _ec_inode {
uint64_t post_size;
uint64_t dirty[2];
struct list_head heal;
+ ec_stripe_list_t stripe_cache;
};
+
typedef int32_t (*fop_heal_cbk_t)(call_frame_t *, void *, xlator_t *, int32_t,
int32_t, uintptr_t, uintptr_t, uintptr_t,
dict_t *);
@@ -263,75 +298,88 @@ struct _ec_lock_link {
off_t fl_end;
};
-struct _ec_fop_data {
- int32_t id;
- int32_t refs;
- int32_t state;
- int32_t minimum;
- int32_t expected;
- int32_t winds;
- int32_t jobs;
- int32_t error;
- ec_fop_data_t *parent;
- xlator_t *xl;
- call_frame_t *req_frame; /* frame of the calling xlator */
- call_frame_t *frame; /* frame used by this fop */
- struct list_head cbk_list; /* sorted list of groups of answers */
- struct list_head answer_list; /* list of answers */
- struct list_head pending_list; /* member of ec_t.pending_fops */
- ec_cbk_data_t *answer; /* accepted answer */
- int32_t lock_count;
- int32_t locked;
- ec_lock_link_t locks[2];
- int32_t first_lock;
- gf_lock_t lock;
+/* This structure keeps a range of fragment offsets affected by a fop. Since
+ * real file offsets can be difficult to handle correctly because of overflows,
+ * we use the 'scaled' offset, which corresponds to the offset of the fragment
+ * seen by the bricks, which is always smaller and cannot overflow. */
+struct _ec_fragment_range {
+ uint64_t first; /* Address of the first affected fragment as seen by the
+ bricks (offset on brick) */
+ uint64_t last; /* Address of the first non affected fragment as seen by
+ the bricks (offset on brick) */
+};
- uint32_t flags;
- uint32_t first;
- uintptr_t mask;
- uintptr_t healing; /*Dispatch is done but call is successful only
- if fop->minimum number of subvolumes succeed
- which are not healing*/
- uintptr_t remaining;
- uintptr_t received; /* Mask of responses */
- uintptr_t good;
-
- uid_t uid;
- gid_t gid;
-
- ec_wind_f wind;
- ec_handler_f handler;
- ec_resume_f resume;
- ec_cbk_t cbks;
- void *data;
- ec_heal_t *heal;
- struct list_head healer;
-
- uint64_t user_size;
- uint32_t head;
-
- int32_t use_fd;
-
- dict_t *xdata;
- dict_t *dict;
- int32_t int32;
- uint32_t uint32;
- uint64_t size;
- off_t offset;
- mode_t mode[2];
- entrylk_cmd entrylk_cmd;
- entrylk_type entrylk_type;
- gf_xattrop_flags_t xattrop_flags;
- dev_t dev;
- inode_t *inode;
- fd_t *fd;
- struct iatt iatt;
- char *str[2];
- loc_t loc[2];
- struct gf_flock flock;
- struct iovec *vector;
- struct iobref *buffers;
- gf_seek_what_t seek;
+struct _ec_fop_data {
+ int32_t id;
+ int32_t refs;
+ int32_t state;
+ int32_t minimum;
+ int32_t expected;
+ int32_t winds;
+ int32_t jobs;
+ int32_t error;
+ ec_fop_data_t *parent;
+ xlator_t *xl;
+ call_frame_t *req_frame; /* frame of the calling xlator */
+ call_frame_t *frame; /* frame used by this fop */
+ struct list_head cbk_list; /* sorted list of groups of answers */
+ struct list_head answer_list; /* list of answers */
+ struct list_head pending_list; /* member of ec_t.pending_fops */
+ ec_cbk_data_t *answer; /* accepted answer */
+ int32_t lock_count;
+ int32_t locked;
+ ec_lock_link_t locks[2];
+ int32_t first_lock;
+ gf_lock_t lock;
+
+ uint32_t flags;
+ uint32_t first;
+ uintptr_t mask;
+ uintptr_t healing; /*Dispatch is done but call is successful
+ only if fop->minimum number of subvolumes
+ succeed which are not healing*/
+ uintptr_t remaining;
+ uintptr_t received; /* Mask of responses */
+ uintptr_t good;
+
+ uid_t uid;
+ gid_t gid;
+
+ ec_wind_f wind;
+ ec_handler_f handler;
+ ec_resume_f resume;
+ ec_cbk_t cbks;
+ void *data;
+ ec_heal_t *heal;
+ struct list_head healer;
+
+ uint64_t user_size;
+ uint32_t head;
+
+ int32_t use_fd;
+
+ dict_t *xdata;
+ dict_t *dict;
+ int32_t int32;
+ uint32_t uint32;
+ uint64_t size;
+ off_t offset;
+ mode_t mode[2];
+ entrylk_cmd entrylk_cmd;
+ entrylk_type entrylk_type;
+ gf_xattrop_flags_t xattrop_flags;
+ dev_t dev;
+ inode_t *inode;
+ fd_t *fd;
+ struct iatt iatt;
+ char *str[2];
+ loc_t loc[2];
+ struct gf_flock flock;
+ struct iovec *vector;
+ struct iobref *buffers;
+ gf_seek_what_t seek;
+ ec_fragment_range_t frag_range; /* This will hold the range of stripes
+ affected by the fop. */
};
struct _ec_cbk_data {
@@ -551,6 +599,26 @@ struct _ec_self_heald {
struct subvol_healer *full_healers;
};
+struct _ec_statistics {
+ struct {
+ gf_atomic_t hits; /* Cache hits. */
+ gf_atomic_t misses; /* Cache misses. */
+ gf_atomic_t updates; /* Number of times an existing stripe has
+ been updated with new content. */
+ gf_atomic_t invals; /* Number of times an existing stripe has
+ been invalidated because of truncates
+ or discards. */
+ gf_atomic_t evicts; /* Number of times that an existing entry
+ has been evicted to make room for newer
+ entries. */
+ gf_atomic_t allocs; /* Number of memory allocations made to
+ store stripes. */
+ gf_atomic_t errors; /* Number of errors that have caused extra
+ requests. (Basically memory allocation
+ errors). */
+ } stripe_cache;
+};
+
struct _ec {
xlator_t *xl;
int32_t healers;
@@ -576,6 +644,7 @@ struct _ec {
gf_boolean_t other_eager_lock;
gf_boolean_t optimistic_changelog;
gf_boolean_t parallel_writes;
+ uint32_t stripe_cache;
uint32_t background_heals;
uint32_t heal_wait_qlen;
uint32_t self_heal_window_size; /* max size of read/writes */
@@ -590,6 +659,7 @@ struct _ec {
dict_t *leaf_to_subvolid;
ec_read_policy_t read_policy;
ec_matrix_list_t matrix;
+ ec_statistics_t stats;
};
#endif /* __EC_TYPES_H__ */
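The _ec_fragment_range comment above can be illustrated with a short sketch (illustrative only, assuming the 4 + 2 layout): brick-side fragment offsets are the file offset divided by the number of data fragments, so they stay far from the uint64_t limit:

#include <stdio.h>
#include <stdint.h>

int main (void)
{
        const uint64_t fragments = 4;          /* data bricks in 4 + 2 */

        uint64_t offset = 6144, size = 4096;   /* stripe-aligned write */

        /* Mirrors ec_writev_prepare_buffers(): the scaled, brick-side
         * range covered by the fop. */
        uint64_t first = offset / fragments;         /* 1536 */
        uint64_t last  = first + size / fragments;   /* 2560: first
                                                        untouched fragment */

        /* A cached stripe with fragment offset f is affected by this fop
         * iff first <= f && f < last -- the test performed in
         * ec_update_cached_stripes(). */
        printf ("frag_range = [%llu, %llu)\n",
                (unsigned long long)first, (unsigned long long)last);
        return 0;
}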
diff --git a/xlators/cluster/ec/src/ec.c b/xlators/cluster/ec/src/ec.c
index 9f361a54aa3..275dd15a302 100644
--- a/xlators/cluster/ec/src/ec.c
+++ b/xlators/cluster/ec/src/ec.c
@@ -299,6 +299,8 @@ reconfigure (xlator_t *this, dict_t *options)
options, bool, failed);
GF_OPTION_RECONF ("parallel-writes", ec->parallel_writes,
options, bool, failed);
+ GF_OPTION_RECONF ("stripe-cache", ec->stripe_cache, options, uint32,
+ failed);
ret = 0;
if (ec_assign_read_policy (ec, read_policy)) {
ret = -1;
@@ -581,6 +583,18 @@ notify (xlator_t *this, int32_t event, void *data, ...)
return ret;
}
+static void
+ec_statistics_init(ec_t *ec)
+{
+ GF_ATOMIC_INIT(ec->stats.stripe_cache.hits, 0);
+ GF_ATOMIC_INIT(ec->stats.stripe_cache.misses, 0);
+ GF_ATOMIC_INIT(ec->stats.stripe_cache.updates, 0);
+ GF_ATOMIC_INIT(ec->stats.stripe_cache.invals, 0);
+ GF_ATOMIC_INIT(ec->stats.stripe_cache.evicts, 0);
+ GF_ATOMIC_INIT(ec->stats.stripe_cache.allocs, 0);
+ GF_ATOMIC_INIT(ec->stats.stripe_cache.errors, 0);
+}
+
int32_t
init (xlator_t *this)
{
@@ -671,6 +685,7 @@ init (xlator_t *this)
GF_OPTION_INIT ("shd-wait-qlength", ec->shd.wait_qlength, uint32, failed);
GF_OPTION_INIT ("optimistic-change-log", ec->optimistic_changelog, bool, failed);
GF_OPTION_INIT ("parallel-writes", ec->parallel_writes, bool, failed);
+ GF_OPTION_INIT ("stripe-cache", ec->stripe_cache, uint32, failed);
this->itable = inode_table_new (EC_SHD_INODE_LRU_LIMIT, this);
if (!this->itable)
@@ -697,6 +712,8 @@ init (xlator_t *this)
goto failed;
}
+ ec_statistics_init(ec);
+
return 0;
failed:
@@ -1252,6 +1269,9 @@ int32_t ec_gf_forget(xlator_t * this, inode_t * inode)
if ((inode_ctx_del(inode, this, &value) == 0) && (value != 0))
{
ctx = (ec_inode_t *)(uintptr_t)value;
+ /* We can only forget an inode if it has been unlocked, so the stripe
+ * cache should also be empty. */
+ GF_ASSERT(list_empty(&ctx->stripe_cache.lru));
GF_FREE(ctx);
}
@@ -1313,6 +1333,25 @@ int32_t ec_dump_private(xlator_t *this)
gf_proc_dump_write("heal-waiters", "%d", ec->heal_waiters);
gf_proc_dump_write("read-policy", "%s", ec_read_policies[ec->read_policy]);
+ snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.%s.stats.stripe_cache",
+ this->type, this->name);
+ gf_proc_dump_add_section(key_prefix);
+
+ gf_proc_dump_write("hits", "%llu",
+ GF_ATOMIC_GET(ec->stats.stripe_cache.hits));
+ gf_proc_dump_write("misses", "%llu",
+ GF_ATOMIC_GET(ec->stats.stripe_cache.misses));
+ gf_proc_dump_write("updates", "%llu",
+ GF_ATOMIC_GET(ec->stats.stripe_cache.updates));
+ gf_proc_dump_write("invalidations", "%llu",
+ GF_ATOMIC_GET(ec->stats.stripe_cache.invals));
+ gf_proc_dump_write("evicts", "%llu",
+ GF_ATOMIC_GET(ec->stats.stripe_cache.evicts));
+ gf_proc_dump_write("allocations", "%llu",
+ GF_ATOMIC_GET(ec->stats.stripe_cache.allocs));
+ gf_proc_dump_write("errors", "%llu",
+ GF_ATOMIC_GET(ec->stats.stripe_cache.errors));
+
return 0;
}
@@ -1512,5 +1551,18 @@ struct volume_options options[] =
.description = "This controls if writes can be wound in parallel as long"
"as it doesn't modify same stripes"
},
+ { .key = {"stripe-cache"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 0,/*Disabling stripe_cache*/
+ .max = EC_STRIPE_CACHE_MAX_SIZE,
+ .default_value = "0",
+ .description = "This option will keep the last stripe of write fop"
+ "in memory. If next write falls in this stripe, we need"
+ "not to read it again from backend and we can save READ"
+ "fop going over the network. This will improve performance,"
+ "specially for sequential writes. However, this will also"
+ "lead to extra memory consumption, maximum "
+ "(cache size * stripe size) Bytes per open file."
+ },
{ .key = {NULL} }
};
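Rough memory math for the new option (a sketch under assumed sizes, not code from the patch): the worst case matches the allocation in ec_allocate_stripe(), sizeof(ec_stripe_t) + stripe_size bytes per entry, capped at the configured number of entries:

#include <stdio.h>
#include <stdint.h>

int main (void)
{
        const uint64_t stripe_size = 2048;  /* 4 + 2 with 512B fragments  */
        const uint64_t cache_max   = 10;    /* EC_STRIPE_CACHE_MAX_SIZE   */
        const uint64_t header      = 24;    /* assumed sizeof(ec_stripe_t)
                                               on a 64-bit build          */

        uint64_t per_file = cache_max * (header + stripe_size);

        printf ("worst case per open file: %llu bytes (~%llu KiB)\n",
                (unsigned long long)per_file,
                (unsigned long long)(per_file / 1024));
        return 0;
}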
diff --git a/xlators/cluster/ec/src/ec.h b/xlators/cluster/ec/src/ec.h
index 648d444f595..b729fffc274 100644
--- a/xlators/cluster/ec/src/ec.h
+++ b/xlators/cluster/ec/src/ec.h
@@ -17,7 +17,7 @@
#define EC_XATTR_VERSION EC_XATTR_PREFIX"version"
#define EC_XATTR_HEAL EC_XATTR_PREFIX"heal"
#define EC_XATTR_DIRTY EC_XATTR_PREFIX"dirty"
-
+#define EC_STRIPE_CACHE_MAX_SIZE 10
#define EC_VERSION_SIZE 2
#define EC_SHD_INODE_LRU_LIMIT 10