Diffstat (limited to 'xlators/cluster/ec/src')
-rw-r--r--  xlators/cluster/ec/src/ec-common.c       | 115
-rw-r--r--  xlators/cluster/ec/src/ec-common.h       |   1
-rw-r--r--  xlators/cluster/ec/src/ec-helpers.c      |  15
-rw-r--r--  xlators/cluster/ec/src/ec-inode-write.c  | 211
-rw-r--r--  xlators/cluster/ec/src/ec-mem-types.h    |   1
-rw-r--r--  xlators/cluster/ec/src/ec-types.h        | 206
-rw-r--r--  xlators/cluster/ec/src/ec.c              |  52
-rw-r--r--  xlators/cluster/ec/src/ec.h              |   2
8 files changed, 519 insertions(+), 84 deletions(-)
diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c
index ea5773c9879..29ab66f374c 100644
--- a/xlators/cluster/ec/src/ec-common.c
+++ b/xlators/cluster/ec/src/ec-common.c
@@ -1437,6 +1437,23 @@ ec_set_inode_size(ec_fop_data_t *fop, inode_t *inode,
return found;
}
+static void
+ec_release_stripe_cache (ec_inode_t *ctx)
+{
+ ec_stripe_list_t *stripe_cache = NULL;
+ ec_stripe_t *stripe = NULL;
+
+ stripe_cache = &ctx->stripe_cache;
+ while (!list_empty (&stripe_cache->lru)) {
+ stripe = list_first_entry (&stripe_cache->lru, ec_stripe_t,
+ lru);
+ list_del (&stripe->lru);
+ GF_FREE (stripe);
+ }
+ stripe_cache->count = 0;
+ stripe_cache->max = 0;
+}
+
void ec_clear_inode_info(ec_fop_data_t *fop, inode_t *inode)
{
ec_inode_t *ctx;
@@ -1448,6 +1465,7 @@ void ec_clear_inode_info(ec_fop_data_t *fop, inode_t *inode)
goto unlock;
}
+ ec_release_stripe_cache (ctx);
ctx->have_info = _gf_false;
ctx->have_config = _gf_false;
ctx->have_version = _gf_false;
@@ -2465,6 +2483,102 @@ ec_use_eager_lock(ec_t *ec, ec_fop_data_t *fop)
return ec->other_eager_lock;
}
+static void
+ec_update_stripe(ec_t *ec, ec_stripe_list_t *stripe_cache, ec_stripe_t *stripe,
+ ec_fop_data_t *fop)
+{
+ off_t base;
+
+ /* On write fops, we only update cached stripes if the write has
+ * succeeded. Otherwise, we invalidate them in the cache. */
+ if ((fop->id == GF_FOP_WRITE) && (fop->answer != NULL) &&
+ (fop->answer->op_ret >= 0)) {
+ base = stripe->frag_offset - fop->frag_range.first;
+ base *= ec->fragments;
+
+ /* We check whether the stripe offset falls inside the region really
+ * modified by the write fop (a write request may, though it's
+ * uncommon, write fewer bytes than requested). The current write
+ * fop implementation doesn't allow partial writes of fragments, so
+ * if there's no error, we are sure that each full stripe has either
+ * been completely modified or not touched at all. The value of
+ * op_ret may not be a multiple of the stripe size because it
+ * depends on the size requested by the user, so we update the
+ * stripe if the write has modified at least one byte of it (meaning
+ * ec has written the full stripe). */
+ if (base < fop->answer->op_ret) {
+ memcpy(stripe->data, fop->vector[0].iov_base + base,
+ ec->stripe_size);
+ list_move_tail(&stripe->lru, &stripe_cache->lru);
+
+ GF_ATOMIC_INC(ec->stats.stripe_cache.updates);
+ }
+ } else {
+ stripe->frag_offset = -1;
+ list_move (&stripe->lru, &stripe_cache->lru);
+
+ GF_ATOMIC_INC(ec->stats.stripe_cache.invals);
+ }
+}
+
+static void
+ec_update_cached_stripes (ec_fop_data_t *fop)
+{
+ uint64_t first;
+ uint64_t last;
+ ec_stripe_t *stripe = NULL;
+ ec_inode_t *ctx = NULL;
+ ec_stripe_list_t *stripe_cache = NULL;
+ inode_t *inode = NULL;
+ struct list_head *temp;
+ struct list_head sentinel;
+
+ first = fop->frag_range.first;
+ /* 'last' represents the first stripe not touched by the operation */
+ last = fop->frag_range.last;
+
+ /* If there are no modified stripes, we don't need to do anything
+ * else. */
+ if (last <= first) {
+ return;
+ }
+
+ if (!fop->use_fd) {
+ inode = fop->loc[0].inode;
+ } else {
+ inode = fop->fd->inode;
+ }
+
+ LOCK(&inode->lock);
+
+ ctx = __ec_inode_get (inode, fop->xl);
+ if (ctx == NULL) {
+ goto out;
+ }
+ stripe_cache = &ctx->stripe_cache;
+
+ /* Since we'll be moving elements of the list to the tail, we could
+ * end up in an infinite loop. To avoid it, we insert a sentinel
+ * element into the list and use it to detect when we have traversed
+ * all pre-existing elements exactly once. */
+ list_add_tail(&sentinel, &stripe_cache->lru);
+ temp = stripe_cache->lru.next;
+ while (temp != &sentinel) {
+ stripe = list_entry(temp, ec_stripe_t, lru);
+ temp = temp->next;
+ if ((first <= stripe->frag_offset) &&
+ (stripe->frag_offset < last)) {
+ ec_update_stripe (fop->xl->private, stripe_cache,
+ stripe, fop);
+ }
+ }
+ list_del(&sentinel);
+
+out:
+ UNLOCK(&inode->lock);
+}
+
void ec_lock_reuse(ec_fop_data_t *fop)
{
ec_cbk_data_t *cbk;
@@ -2491,6 +2605,7 @@ void ec_lock_reuse(ec_fop_data_t *fop)
* the lock. */
release = _gf_true;
}
+ ec_update_cached_stripes (fop);
for (i = 0; i < fop->lock_count; i++) {
ec_lock_next_owner(&fop->locks[i], cbk, release);
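
A note on the sentinel technique used by ec_update_cached_stripes() above: since matching stripes are moved to the tail of the LRU list while it is being walked, a plain traversal could revisit them forever. The following standalone sketch reproduces the idea with a hand-rolled doubly-linked list in place of glusterfs' list.h macros; 'struct node' and the frag == 2 predicate are illustrative stand-ins, not EC code.

/* Sketch: sentinel-bounded LRU walk, as in ec_update_cached_stripes().
 * Standalone and hypothetical; only the traversal pattern matches. */
#include <stdio.h>

struct node { struct node *prev, *next; int frag; };

static void list_init(struct node *h) { h->prev = h->next = h; }

static void list_del(struct node *n)
{
    n->prev->next = n->next;
    n->next->prev = n->prev;
}

static void list_add_tail(struct node *n, struct node *h)
{
    n->prev = h->prev;
    n->next = h;
    h->prev->next = n;
    h->prev = n;
}

int main(void)
{
    struct node head, sentinel;
    struct node a = { .frag = 1 }, b = { .frag = 2 }, c = { .frag = 3 };
    struct node *tmp, *cur;

    list_init(&head);
    list_add_tail(&a, &head);
    list_add_tail(&b, &head);
    list_add_tail(&c, &head);

    /* The sentinel marks the current tail. Everything before it existed
     * when the walk started, so each element is visited exactly once
     * even if the walk moves it behind the sentinel. */
    list_add_tail(&sentinel, &head);

    tmp = head.next;
    while (tmp != &sentinel) {
        cur = tmp;
        tmp = tmp->next;                /* save successor before any move */
        if (cur->frag == 2) {           /* stand-in for the frag_range check */
            list_del(cur);
            list_add_tail(cur, &head);  /* like list_move_tail() */
        }
    }
    list_del(&sentinel);

    for (tmp = head.next; tmp != &head; tmp = tmp->next)
        printf("frag %d\n", tmp->frag); /* prints 1, 3, 2 */
    return 0;
}
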
diff --git a/xlators/cluster/ec/src/ec-common.h b/xlators/cluster/ec/src/ec-common.h
index 9a35391a781..35c6a8107c2 100644
--- a/xlators/cluster/ec/src/ec-common.h
+++ b/xlators/cluster/ec/src/ec-common.h
@@ -139,4 +139,5 @@ ec_get_heal_info (xlator_t *this, loc_t *loc, dict_t **dict);
int32_t ec_lock_unlocked(call_frame_t *frame, void *cookie,
xlator_t *this, int32_t op_ret, int32_t op_errno,
dict_t *xdata);
+
#endif /* __EC_COMMON_H__ */
diff --git a/xlators/cluster/ec/src/ec-helpers.c b/xlators/cluster/ec/src/ec-helpers.c
index 122fe24b5d3..fe610748f0f 100644
--- a/xlators/cluster/ec/src/ec-helpers.c
+++ b/xlators/cluster/ec/src/ec-helpers.c
@@ -706,6 +706,17 @@ void ec_owner_copy(call_frame_t *frame, gf_lkowner_t *owner)
lk_owner_copy (&frame->root->lk_owner, owner);
}
+static void
+ec_stripe_cache_init (ec_t *ec, ec_inode_t *ctx)
+{
+ ec_stripe_list_t *stripe_cache = NULL;
+
+ stripe_cache = &(ctx->stripe_cache);
+ if (stripe_cache->max == 0) {
+ stripe_cache->max = ec->stripe_cache;
+ }
+}
+
ec_inode_t * __ec_inode_get(inode_t * inode, xlator_t * xl)
{
ec_inode_t * ctx = NULL;
@@ -718,7 +729,7 @@ ec_inode_t * __ec_inode_get(inode_t * inode, xlator_t * xl)
{
memset(ctx, 0, sizeof(*ctx));
INIT_LIST_HEAD(&ctx->heal);
-
+ INIT_LIST_HEAD(&ctx->stripe_cache.lru);
value = (uint64_t)(uintptr_t)ctx;
if (__inode_ctx_set(inode, xl, &value) != 0)
{
@@ -732,6 +743,8 @@ ec_inode_t * __ec_inode_get(inode_t * inode, xlator_t * xl)
{
ctx = (ec_inode_t *)(uintptr_t)value;
}
+ if (ctx)
+ ec_stripe_cache_init (xl->private, ctx);
return ctx;
}
diff --git a/xlators/cluster/ec/src/ec-inode-write.c b/xlators/cluster/ec/src/ec-inode-write.c
index ae5120226e3..2f8170d1ac0 100644
--- a/xlators/cluster/ec/src/ec-inode-write.c
+++ b/xlators/cluster/ec/src/ec-inode-write.c
@@ -18,6 +18,7 @@
#include "ec-combine.h"
#include "ec-method.h"
#include "ec-fops.h"
+#include "ec-mem-types.h"
int32_t
ec_update_writev_cbk (call_frame_t *frame, void *cookie,
@@ -1169,12 +1170,14 @@ void ec_discard_adjust_offset_size(ec_fop_data_t *fop)
* write zeros.
*/
fop->int32 = ec_adjust_offset_up(ec, &fop->offset, _gf_true);
+ fop->frag_range.first = fop->offset;
if (fop->size < fop->int32) {
fop->size = 0;
} else {
fop->size -= fop->int32;
ec_adjust_size_down(ec, &fop->size, _gf_true);
}
+ fop->frag_range.last = fop->offset + fop->size;
}
int32_t ec_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
@@ -1436,6 +1439,8 @@ int32_t ec_manager_truncate(ec_fop_data_t * fop, int32_t state)
case EC_STATE_INIT:
fop->user_size = fop->offset;
ec_adjust_offset_up(fop->xl->private, &fop->offset, _gf_true);
+ fop->frag_range.first = fop->offset;
+ fop->frag_range.last = UINT64_MAX;
/* Fall through */
@@ -1696,6 +1701,78 @@ out:
}
/* FOP: writev */
+static ec_stripe_t *
+ec_allocate_stripe (ec_t *ec, ec_stripe_list_t *stripe_cache)
+{
+ ec_stripe_t *stripe = NULL;
+
+ if (stripe_cache->count >= stripe_cache->max) {
+ GF_ASSERT (!list_empty(&stripe_cache->lru));
+ stripe = list_first_entry(&stripe_cache->lru, ec_stripe_t, lru);
+ list_move_tail(&stripe->lru, &stripe_cache->lru);
+ GF_ATOMIC_INC(ec->stats.stripe_cache.evicts);
+ } else {
+ stripe = GF_MALLOC (sizeof (ec_stripe_t) + ec->stripe_size,
+ ec_mt_ec_stripe_t);
+ if (stripe != NULL) {
+ stripe_cache->count++;
+ list_add_tail (&stripe->lru, &stripe_cache->lru);
+ GF_ATOMIC_INC(ec->stats.stripe_cache.allocs);
+ } else {
+ GF_ATOMIC_INC(ec->stats.stripe_cache.errors);
+ }
+ }
+
+ return stripe;
+}
+
+static void
+ec_write_stripe_data (ec_t *ec, ec_fop_data_t *fop,
+ ec_stripe_t *stripe)
+{
+ off_t base;
+
+ base = fop->size - ec->stripe_size;
+ memcpy(stripe->data, fop->vector[0].iov_base + base, ec->stripe_size);
+ stripe->frag_offset = fop->frag_range.last - ec->fragment_size;
+}
+
+static void
+ec_add_stripe_in_cache (ec_t *ec, ec_fop_data_t *fop)
+{
+ ec_inode_t *ctx = NULL;
+ ec_stripe_t *stripe = NULL;
+ ec_stripe_list_t *stripe_cache = NULL;
+ gf_boolean_t failed = _gf_true;
+
+ LOCK(&fop->fd->inode->lock);
+
+ ctx = __ec_inode_get (fop->fd->inode, fop->xl);
+ if (ctx == NULL) {
+ goto out;
+ }
+
+ stripe_cache = &ctx->stripe_cache;
+ if (stripe_cache->max > 0) {
+ stripe = ec_allocate_stripe (ec, stripe_cache);
+ if (stripe == NULL) {
+ goto out;
+ }
+
+ ec_write_stripe_data (ec, fop, stripe);
+ }
+
+ failed = _gf_false;
+
+out:
+ UNLOCK(&fop->fd->inode->lock);
+
+ if (failed) {
+ gf_msg (ec->xl->name, GF_LOG_DEBUG, ENOMEM,
+ EC_MSG_FILE_DESC_REF_FAIL,
+ "Failed to create and add stripe in cache");
+ }
+}
int32_t ec_writev_merge_tail(call_frame_t * frame, void * cookie,
xlator_t * this, int32_t op_ret, int32_t op_errno,
@@ -1725,8 +1802,11 @@ int32_t ec_writev_merge_tail(call_frame_t * frame, void * cookie,
{
memset(fop->vector[0].iov_base + fop->size - size, 0, size);
}
- }
+ if (ec->stripe_cache) {
+ ec_add_stripe_in_cache (ec, fop);
+ }
+ }
return 0;
}
@@ -1775,7 +1855,7 @@ ec_make_internal_fop_xdata (dict_t **xdata)
dict_t *dict = NULL;
if (*xdata)
- return 0;
+ return 0;
dict = dict_new();
if (!dict)
@@ -1802,8 +1882,10 @@ ec_writev_prepare_buffers(ec_t *ec, ec_fop_data_t *fop)
fop->user_size = iov_length(fop->vector, fop->int32);
fop->head = ec_adjust_offset_down(ec, &fop->offset, _gf_false);
+ fop->frag_range.first = fop->offset / ec->fragments;
fop->size = fop->user_size + fop->head;
ec_adjust_size_up(ec, &fop->size, _gf_false);
+ fop->frag_range.last = fop->frag_range.first + fop->size / ec->fragments;
if ((fop->int32 != 1) || (fop->head != 0) ||
(fop->size > fop->user_size) ||
@@ -1850,6 +1932,98 @@ out:
return err;
}
+static void
+ec_merge_stripe_head_locked (ec_t *ec, ec_fop_data_t *fop, ec_stripe_t *stripe)
+{
+ size_t head, size;
+
+ head = fop->head;
+ memcpy(fop->vector[0].iov_base, stripe->data, head);
+
+ size = ec->stripe_size - head;
+ if (size > fop->user_size) {
+ head += fop->user_size;
+ size = ec->stripe_size - head;
+ memcpy(fop->vector[0].iov_base + head, stripe->data + head,
+ size);
+ }
+}
+
+static void
+ec_merge_stripe_tail_locked (ec_t *ec, ec_fop_data_t *fop, ec_stripe_t *stripe)
+{
+ size_t head, tail;
+ off_t offset;
+
+ offset = fop->user_size + fop->head;
+ tail = fop->size - offset;
+ head = ec->stripe_size - tail;
+
+ memcpy(fop->vector[0].iov_base + offset, stripe->data + head, tail);
+}
+
+static ec_stripe_t *
+ec_get_stripe_from_cache_locked (ec_t *ec, ec_fop_data_t *fop,
+ uint64_t frag_offset)
+{
+ ec_inode_t *ctx = NULL;
+ ec_stripe_t *stripe = NULL;
+ ec_stripe_list_t *stripe_cache = NULL;
+
+ ctx = __ec_inode_get (fop->fd->inode, fop->xl);
+ if (ctx == NULL) {
+ GF_ATOMIC_INC(ec->stats.stripe_cache.errors);
+ return NULL;
+ }
+
+ stripe_cache = &ctx->stripe_cache;
+ list_for_each_entry (stripe, &stripe_cache->lru, lru) {
+ if (stripe->frag_offset == frag_offset) {
+ list_move_tail (&stripe->lru, &stripe_cache->lru);
+ GF_ATOMIC_INC(ec->stats.stripe_cache.hits);
+ return stripe;
+ }
+ }
+
+ GF_ATOMIC_INC(ec->stats.stripe_cache.misses);
+
+ return NULL;
+}
+
+static gf_boolean_t
+ec_get_and_merge_stripe (ec_t *ec, ec_fop_data_t *fop, ec_stripe_part_t which)
+{
+ uint64_t frag_offset;
+ ec_stripe_t *stripe = NULL;
+ gf_boolean_t found = _gf_false;
+
+ if (!ec->stripe_cache) {
+ return found;
+ }
+
+ LOCK(&fop->fd->inode->lock);
+ if (which == EC_STRIPE_HEAD) {
+ frag_offset = fop->frag_range.first;
+ stripe = ec_get_stripe_from_cache_locked(ec, fop, frag_offset);
+ if (stripe) {
+ ec_merge_stripe_head_locked (ec, fop, stripe);
+ found = _gf_true;
+ }
+ }
+
+ if (which == EC_STRIPE_TAIL) {
+ frag_offset = fop->frag_range.last - ec->fragment_size;
+ stripe = ec_get_stripe_from_cache_locked(ec, fop, frag_offset);
+ if (stripe) {
+ ec_merge_stripe_tail_locked (ec, fop, stripe);
+ found = _gf_true;
+ }
+ }
+ UNLOCK(&fop->fd->inode->lock);
+
+ return found;
+}
+
void ec_writev_start(ec_fop_data_t *fop)
{
ec_t *ec = fop->xl->private;
@@ -1858,6 +2032,7 @@ void ec_writev_start(ec_fop_data_t *fop)
dict_t *xdata = NULL;
uint64_t tail, current;
int32_t err = -ENOMEM;
+ gf_boolean_t found_stripe = _gf_false;
/* This shouldn't fail because we have the inode locked. */
GF_ASSERT(ec_get_inode_size(fop, fop->fd->inode, &current));
@@ -1884,15 +2059,19 @@ void ec_writev_start(ec_fop_data_t *fop)
if (err != 0) {
goto failed_fd;
}
-
if (fop->head > 0) {
- if (ec_make_internal_fop_xdata (&xdata)) {
- err = -ENOMEM;
- goto failed_xdata;
+ found_stripe = ec_get_and_merge_stripe (ec, fop, EC_STRIPE_HEAD);
+ if (!found_stripe) {
+ if (ec_make_internal_fop_xdata (&xdata)) {
+ err = -ENOMEM;
+ goto failed_xdata;
+ }
+ ec_readv(fop->frame, fop->xl, -1, EC_MINIMUM_MIN,
+ ec_writev_merge_head,
+ NULL, fd, ec->stripe_size, fop->offset, 0, xdata);
}
- ec_readv(fop->frame, fop->xl, -1, EC_MINIMUM_MIN, ec_writev_merge_head,
- NULL, fd, ec->stripe_size, fop->offset, 0, xdata);
}
+
tail = fop->size - fop->user_size - fop->head;
if ((tail > 0) && ((fop->head == 0) || (fop->size > ec->stripe_size))) {
/* Current locking scheme will make sure the 'current' below will
@@ -1900,13 +2079,17 @@ void ec_writev_start(ec_fop_data_t *fop)
* work as expected
*/
if (current > fop->offset + fop->head + fop->user_size) {
- if (ec_make_internal_fop_xdata (&xdata)) {
- err = -ENOMEM;
- goto failed_xdata;
+ found_stripe = ec_get_and_merge_stripe (ec, fop, EC_STRIPE_TAIL);
+ if (!found_stripe) {
+ if (ec_make_internal_fop_xdata (&xdata)) {
+ err = -ENOMEM;
+ goto failed_xdata;
+ }
+ ec_readv(fop->frame, fop->xl, -1, EC_MINIMUM_MIN,
+ ec_writev_merge_tail, NULL, fd, ec->stripe_size,
+ fop->offset + fop->size - ec->stripe_size,
+ 0, xdata);
}
- ec_readv(fop->frame, fop->xl, -1, EC_MINIMUM_MIN,
- ec_writev_merge_tail, NULL, fd, ec->stripe_size,
- fop->offset + fop->size - ec->stripe_size, 0, xdata);
} else {
memset(fop->vector[0].iov_base + fop->size - tail, 0, tail);
}
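
To make the offset bookkeeping in ec_writev_prepare_buffers() and ec_writev_start() concrete, here is a worked example of the frag_range arithmetic as a standalone sketch. The geometry is an assumption (4 data fragments of 512 bytes each, giving a 2048-byte stripe), and the inline rounding stands in for the real ec_adjust_offset_down()/ec_adjust_size_up() helpers.

/* Worked example: writev at offset 1000 for 3000 bytes, assuming a
 * 2048-byte stripe split across 4 data fragments. Illustrative only. */
#include <stdio.h>

int main(void)
{
    unsigned long long fragments = 4;
    unsigned long long fragment_size = 512;                      /* per brick */
    unsigned long long stripe_size = fragments * fragment_size;  /* 2048 */

    unsigned long long offset = 1000, user_size = 3000;

    /* ec_adjust_offset_down(): round the offset down to a stripe
     * boundary; 'head' is the distance we moved back. */
    unsigned long long head = offset % stripe_size;              /* 1000 */
    offset -= head;                                              /* 0 */

    /* frag_range.first: brick-side offset of the first fragment. */
    unsigned long long frag_first = offset / fragments;          /* 0 */

    /* ec_adjust_size_up(): round the total size up to whole stripes. */
    unsigned long long size = user_size + head;                  /* 4000 */
    size = (size + stripe_size - 1) / stripe_size * stripe_size; /* 4096 */

    /* frag_range.last: brick-side offset of the first fragment NOT
     * touched by the operation. */
    unsigned long long frag_last = frag_first + size / fragments; /* 1024 */

    /* Bytes past the user data in the last stripe; these must be merged
     * from the cache, read from the bricks, or zeroed. */
    unsigned long long tail = size - user_size - head;           /* 96 */

    printf("head=%llu first=%llu last=%llu tail=%llu\n",
           head, frag_first, frag_last, tail);
    return 0;
}
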
diff --git a/xlators/cluster/ec/src/ec-mem-types.h b/xlators/cluster/ec/src/ec-mem-types.h
index 9a4b6c58049..8109a422d9d 100644
--- a/xlators/cluster/ec/src/ec-mem-types.h
+++ b/xlators/cluster/ec/src/ec-mem-types.h
@@ -25,6 +25,7 @@ enum gf_ec_mem_types_
ec_mt_ec_code_t,
ec_mt_ec_code_builder_t,
ec_mt_ec_matrix_t,
+ ec_mt_ec_stripe_t,
ec_mt_end
};
diff --git a/xlators/cluster/ec/src/ec-types.h b/xlators/cluster/ec/src/ec-types.h
index 23b30548450..7b2b7b8247d 100644
--- a/xlators/cluster/ec/src/ec-types.h
+++ b/xlators/cluster/ec/src/ec-types.h
@@ -14,12 +14,16 @@
#include "xlator.h"
#include "timer.h"
#include "libxlator.h"
+#include "atomic.h"
#define EC_GF_MAX_REGS 16
enum _ec_heal_need;
typedef enum _ec_heal_need ec_heal_need_t;
+enum _ec_stripe_part;
+typedef enum _ec_stripe_part ec_stripe_part_t;
+
enum _ec_read_policy;
typedef enum _ec_read_policy ec_read_policy_t;
@@ -29,6 +33,9 @@ typedef struct _ec_config ec_config_t;
struct _ec_fd;
typedef struct _ec_fd ec_fd_t;
+struct _ec_fragment_range;
+typedef struct _ec_fragment_range ec_fragment_range_t;
+
struct _ec_inode;
typedef struct _ec_inode ec_inode_t;
@@ -77,6 +84,12 @@ typedef struct _ec_code_builder ec_code_builder_t;
struct _ec_code_chunk;
typedef struct _ec_code_chunk ec_code_chunk_t;
+struct _ec_stripe;
+typedef struct _ec_stripe ec_stripe_t;
+
+struct _ec_stripe_list;
+typedef struct _ec_stripe_list ec_stripe_list_t;
+
struct _ec_code_space;
typedef struct _ec_code_space ec_code_space_t;
@@ -105,6 +118,9 @@ typedef struct _ec_heal ec_heal_t;
struct _ec_self_heald;
typedef struct _ec_self_heald ec_self_heald_t;
+struct _ec_statistics;
+typedef struct _ec_statistics ec_statistics_t;
+
struct _ec;
typedef struct _ec ec_t;
@@ -124,6 +140,11 @@ enum _ec_heal_need {
EC_HEAL_MUST
};
+enum _ec_stripe_part {
+ EC_STRIPE_HEAD,
+ EC_STRIPE_TAIL
+};
+
struct _ec_config {
uint32_t version;
uint8_t algorithm;
@@ -139,6 +160,18 @@ struct _ec_fd {
int32_t flags;
};
+struct _ec_stripe {
+ struct list_head lru; /* LRU list member */
+ uint64_t frag_offset; /* Fragment offset of this stripe */
+ char data[]; /* Contents of the stripe */
+};
+
+struct _ec_stripe_list {
+ struct list_head lru;
+ uint32_t count;
+ uint32_t max;
+};
+
struct _ec_inode {
ec_lock_t *inode_lock;
gf_boolean_t have_info;
@@ -152,8 +185,10 @@ struct _ec_inode {
uint64_t post_size;
uint64_t dirty[2];
struct list_head heal;
+ ec_stripe_list_t stripe_cache;
};
+
typedef int32_t (*fop_heal_cbk_t)(call_frame_t *, void *, xlator_t *, int32_t,
int32_t, uintptr_t, uintptr_t, uintptr_t,
dict_t *);
@@ -263,75 +298,88 @@ struct _ec_lock_link {
off_t fl_end;
};
-struct _ec_fop_data {
- int32_t id;
- int32_t refs;
- int32_t state;
- int32_t minimum;
- int32_t expected;
- int32_t winds;
- int32_t jobs;
- int32_t error;
- ec_fop_data_t *parent;
- xlator_t *xl;
- call_frame_t *req_frame; /* frame of the calling xlator */
- call_frame_t *frame; /* frame used by this fop */
- struct list_head cbk_list; /* sorted list of groups of answers */
- struct list_head answer_list; /* list of answers */
- struct list_head pending_list; /* member of ec_t.pending_fops */
- ec_cbk_data_t *answer; /* accepted answer */
- int32_t lock_count;
- int32_t locked;
- ec_lock_link_t locks[2];
- int32_t first_lock;
- gf_lock_t lock;
+/* This structure keeps the range of fragment offsets affected by a fop.
+ * Real file offsets are hard to handle correctly because of possible
+ * overflows, so we use the 'scaled' offset instead: the offset of the
+ * fragment as seen by the bricks, which is always smaller and cannot
+ * overflow. */
+struct _ec_fragment_range {
+ uint64_t first; /* Address of the first affected fragment as seen by the
+ bricks (offset on brick) */
+ uint64_t last; /* Address of the first non affected fragment as seen by
+ the bricks (offset on brick) */
+};
- uint32_t flags;
- uint32_t first;
- uintptr_t mask;
- uintptr_t healing; /*Dispatch is done but call is successful only
- if fop->minimum number of subvolumes succeed
- which are not healing*/
- uintptr_t remaining;
- uintptr_t received; /* Mask of responses */
- uintptr_t good;
-
- uid_t uid;
- gid_t gid;
-
- ec_wind_f wind;
- ec_handler_f handler;
- ec_resume_f resume;
- ec_cbk_t cbks;
- void *data;
- ec_heal_t *heal;
- struct list_head healer;
-
- uint64_t user_size;
- uint32_t head;
-
- int32_t use_fd;
-
- dict_t *xdata;
- dict_t *dict;
- int32_t int32;
- uint32_t uint32;
- uint64_t size;
- off_t offset;
- mode_t mode[2];
- entrylk_cmd entrylk_cmd;
- entrylk_type entrylk_type;
- gf_xattrop_flags_t xattrop_flags;
- dev_t dev;
- inode_t *inode;
- fd_t *fd;
- struct iatt iatt;
- char *str[2];
- loc_t loc[2];
- struct gf_flock flock;
- struct iovec *vector;
- struct iobref *buffers;
- gf_seek_what_t seek;
+struct _ec_fop_data {
+ int32_t id;
+ int32_t refs;
+ int32_t state;
+ int32_t minimum;
+ int32_t expected;
+ int32_t winds;
+ int32_t jobs;
+ int32_t error;
+ ec_fop_data_t *parent;
+ xlator_t *xl;
+ call_frame_t *req_frame; /* frame of the calling xlator */
+ call_frame_t *frame; /* frame used by this fop */
+ struct list_head cbk_list; /* sorted list of groups of answers */
+ struct list_head answer_list; /* list of answers */
+ struct list_head pending_list; /* member of ec_t.pending_fops */
+ ec_cbk_data_t *answer; /* accepted answer */
+ int32_t lock_count;
+ int32_t locked;
+ ec_lock_link_t locks[2];
+ int32_t first_lock;
+ gf_lock_t lock;
+
+ uint32_t flags;
+ uint32_t first;
+ uintptr_t mask;
+ uintptr_t healing; /* Dispatch is done, but the call only succeeds
+ if at least fop->minimum non-healing
+ subvolumes succeed. */
+ uintptr_t remaining;
+ uintptr_t received; /* Mask of responses */
+ uintptr_t good;
+
+ uid_t uid;
+ gid_t gid;
+
+ ec_wind_f wind;
+ ec_handler_f handler;
+ ec_resume_f resume;
+ ec_cbk_t cbks;
+ void *data;
+ ec_heal_t *heal;
+ struct list_head healer;
+
+ uint64_t user_size;
+ uint32_t head;
+
+ int32_t use_fd;
+
+ dict_t *xdata;
+ dict_t *dict;
+ int32_t int32;
+ uint32_t uint32;
+ uint64_t size;
+ off_t offset;
+ mode_t mode[2];
+ entrylk_cmd entrylk_cmd;
+ entrylk_type entrylk_type;
+ gf_xattrop_flags_t xattrop_flags;
+ dev_t dev;
+ inode_t *inode;
+ fd_t *fd;
+ struct iatt iatt;
+ char *str[2];
+ loc_t loc[2];
+ struct gf_flock flock;
+ struct iovec *vector;
+ struct iobref *buffers;
+ gf_seek_what_t seek;
+ ec_fragment_range_t frag_range; /* Range of fragment offsets (and thus
+ stripes) affected by the fop. */
};
struct _ec_cbk_data {
@@ -551,6 +599,26 @@ struct _ec_self_heald {
struct subvol_healer *full_healers;
};
+struct _ec_statistics {
+ struct {
+ gf_atomic_t hits; /* Cache hits. */
+ gf_atomic_t misses; /* Cache misses. */
+ gf_atomic_t updates; /* Number of times an existing stripe has
+ been updated with new content. */
+ gf_atomic_t invals; /* Number of times an existing stripe has
+ been invalidated because of truncates
+ or discards. */
+ gf_atomic_t evicts; /* Number of times that an existing entry
+ has been evicted to make room for newer
+ entries. */
+ gf_atomic_t allocs; /* Number of memory allocations made to
+ store stripes. */
+ gf_atomic_t errors; /* Number of errors that have caused extra
+ requests. (Basically memory allocation
+ errors). */
+ } stripe_cache;
+};
+
struct _ec {
xlator_t *xl;
int32_t healers;
@@ -576,6 +644,7 @@ struct _ec {
gf_boolean_t other_eager_lock;
gf_boolean_t optimistic_changelog;
gf_boolean_t parallel_writes;
+ uint32_t stripe_cache;
uint32_t background_heals;
uint32_t heal_wait_qlen;
uint32_t self_heal_window_size; /* max size of read/writes */
@@ -590,6 +659,7 @@ struct _ec {
dict_t *leaf_to_subvolid;
ec_read_policy_t read_policy;
ec_matrix_list_t matrix;
+ ec_statistics_t stats;
};
#endif /* __EC_TYPES_H__ */
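
The char data[] flexible array member declared in _ec_stripe above is why ec_allocate_stripe() asks for sizeof(ec_stripe_t) + ec->stripe_size bytes: header and stripe contents live in one allocation and are freed together. A minimal sketch of that layout, with plain malloc() standing in for GF_MALLOC() and the LRU list member omitted:

/* Sketch: single-allocation stripe layout (header + payload). */
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

struct stripe {
    uint64_t frag_offset;
    char data[];           /* payload follows the header in memory */
};

int main(void)
{
    size_t stripe_size = 2048;  /* assumed geometry, as in the example above */
    struct stripe *s = malloc(sizeof(*s) + stripe_size);
    if (s == NULL)
        return 1;
    s->frag_offset = 0;
    memset(s->data, 0, stripe_size);  /* one contiguous block */
    free(s);                          /* one free releases both parts */
    return 0;
}
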
diff --git a/xlators/cluster/ec/src/ec.c b/xlators/cluster/ec/src/ec.c
index 9f361a54aa3..275dd15a302 100644
--- a/xlators/cluster/ec/src/ec.c
+++ b/xlators/cluster/ec/src/ec.c
@@ -299,6 +299,8 @@ reconfigure (xlator_t *this, dict_t *options)
options, bool, failed);
GF_OPTION_RECONF ("parallel-writes", ec->parallel_writes,
options, bool, failed);
+ GF_OPTION_RECONF ("stripe-cache", ec->stripe_cache, options, uint32,
+ failed);
ret = 0;
if (ec_assign_read_policy (ec, read_policy)) {
ret = -1;
@@ -581,6 +583,18 @@ notify (xlator_t *this, int32_t event, void *data, ...)
return ret;
}
+static void
+ec_statistics_init(ec_t *ec)
+{
+ GF_ATOMIC_INIT(ec->stats.stripe_cache.hits, 0);
+ GF_ATOMIC_INIT(ec->stats.stripe_cache.misses, 0);
+ GF_ATOMIC_INIT(ec->stats.stripe_cache.updates, 0);
+ GF_ATOMIC_INIT(ec->stats.stripe_cache.invals, 0);
+ GF_ATOMIC_INIT(ec->stats.stripe_cache.evicts, 0);
+ GF_ATOMIC_INIT(ec->stats.stripe_cache.allocs, 0);
+ GF_ATOMIC_INIT(ec->stats.stripe_cache.errors, 0);
+}
+
int32_t
init (xlator_t *this)
{
@@ -671,6 +685,7 @@ init (xlator_t *this)
GF_OPTION_INIT ("shd-wait-qlength", ec->shd.wait_qlength, uint32, failed);
GF_OPTION_INIT ("optimistic-change-log", ec->optimistic_changelog, bool, failed);
GF_OPTION_INIT ("parallel-writes", ec->parallel_writes, bool, failed);
+ GF_OPTION_INIT ("stripe-cache", ec->stripe_cache, uint32, failed);
this->itable = inode_table_new (EC_SHD_INODE_LRU_LIMIT, this);
if (!this->itable)
@@ -697,6 +712,8 @@ init (xlator_t *this)
goto failed;
}
+ ec_statistics_init(ec);
+
return 0;
failed:
@@ -1252,6 +1269,9 @@ int32_t ec_gf_forget(xlator_t * this, inode_t * inode)
if ((inode_ctx_del(inode, this, &value) == 0) && (value != 0))
{
ctx = (ec_inode_t *)(uintptr_t)value;
+ /* We can only forget an inode if it has been unlocked, so the stripe
+ * cache should also be empty. */
+ GF_ASSERT(list_empty(&ctx->stripe_cache.lru));
GF_FREE(ctx);
}
@@ -1313,6 +1333,25 @@ int32_t ec_dump_private(xlator_t *this)
gf_proc_dump_write("heal-waiters", "%d", ec->heal_waiters);
gf_proc_dump_write("read-policy", "%s", ec_read_policies[ec->read_policy]);
+ snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.%s.stats.stripe_cache",
+ this->type, this->name);
+ gf_proc_dump_add_section(key_prefix);
+
+ gf_proc_dump_write("hits", "%llu",
+ GF_ATOMIC_GET(ec->stats.stripe_cache.hits));
+ gf_proc_dump_write("misses", "%llu",
+ GF_ATOMIC_GET(ec->stats.stripe_cache.misses));
+ gf_proc_dump_write("updates", "%llu",
+ GF_ATOMIC_GET(ec->stats.stripe_cache.updates));
+ gf_proc_dump_write("invalidations", "%llu",
+ GF_ATOMIC_GET(ec->stats.stripe_cache.invals));
+ gf_proc_dump_write("evicts", "%llu",
+ GF_ATOMIC_GET(ec->stats.stripe_cache.evicts));
+ gf_proc_dump_write("allocations", "%llu",
+ GF_ATOMIC_GET(ec->stats.stripe_cache.allocs));
+ gf_proc_dump_write("errors", "%llu",
+ GF_ATOMIC_GET(ec->stats.stripe_cache.errors));
+
return 0;
}
@@ -1512,5 +1551,18 @@ struct volume_options options[] =
.description = "This controls if writes can be wound in parallel as long"
"as it doesn't modify same stripes"
},
+ { .key = {"stripe-cache"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 0, /* 0 disables the stripe cache */
+ .max = EC_STRIPE_CACHE_MAX_SIZE,
+ .default_value = "0",
+ .description = "This option will keep the last stripe of write fop"
+ "in memory. If next write falls in this stripe, we need"
+ "not to read it again from backend and we can save READ"
+ "fop going over the network. This will improve performance,"
+ "specially for sequential writes. However, this will also"
+ "lead to extra memory consumption, maximum "
+ "(cache size * stripe size) Bytes per open file."
+ },
{ .key = {NULL} }
};
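
Once the cache is enabled (presumably via something like 'gluster volume set <volname> disperse.stripe-cache 4', assuming the option is exposed under the usual disperse namespace), the counters dumped by ec_dump_private() above should appear in a statedump section shaped roughly like the following; the section name and values are hypothetical:

[cluster/disperse.myvol-disperse-0.stats.stripe_cache]
hits=412
misses=37
updates=395
invalidations=2
evicts=18
allocations=10
errors=0
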
diff --git a/xlators/cluster/ec/src/ec.h b/xlators/cluster/ec/src/ec.h
index 648d444f595..b729fffc274 100644
--- a/xlators/cluster/ec/src/ec.h
+++ b/xlators/cluster/ec/src/ec.h
@@ -17,7 +17,7 @@
#define EC_XATTR_VERSION EC_XATTR_PREFIX"version"
#define EC_XATTR_HEAL EC_XATTR_PREFIX"heal"
#define EC_XATTR_DIRTY EC_XATTR_PREFIX"dirty"
-
+#define EC_STRIPE_CACHE_MAX_SIZE 10
#define EC_VERSION_SIZE 2
#define EC_SHD_INODE_LRU_LIMIT 10
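
As a quick sanity check of EC_STRIPE_CACHE_MAX_SIZE: with the cap of 10 stripes and an assumed 2048-byte stripe (4 data fragments of 512 bytes each), a fully populated cache costs 10 * 2048 = 20480 bytes, about 20 KiB plus per-stripe headers, per open file. This matches the "(cache size * stripe size) bytes per open file" bound given in the stripe-cache option description.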