summaryrefslogtreecommitdiffstats
path: root/xlators/cluster/stripe
diff options
context:
space:
mode:
authorBrian Foster <bfoster@redhat.com>2012-07-19 15:01:13 -0400
committerAnand Avati <avati@redhat.com>2012-07-30 11:39:03 -0700
commit879c01087d58742515524664e8a193a04a0d4146 (patch)
tree9c824d09ea4a814e085f9df6b2ff292a767c6868 /xlators/cluster/stripe
parentb3025cb8dd79751c12b7bb375ec701b532688cbc (diff)
cluster/stripe: handle short writes and errors in writev callback
cluster/stripe write callback handling is broken in the event of server side errors and short writes due to crudely summing up the return values from each node. This can produce incorrect results or cause an application to rewrite the wrong portions of a buffer in an attempt to handle this condition. Modify cluster/stripe writev handling to record the requested size of each write and use this data to return the number of consecutive bytes written from the original request. This allows an application to retry a write at the point of error (and potentially consume said error). BUG: 809975 Change-Id: Ic35cb1e092c29545205aa32e352485c507534ce0 Signed-off-by: Brian Foster <bfoster@redhat.com> Reviewed-on: http://review.gluster.com/3700 Tested-by: Gluster Build System <jenkins@build.gluster.com> Reviewed-by: Shishir Gowda <sgowda@redhat.com> Reviewed-by: Amar Tumballi <amarts@redhat.com> Reviewed-by: Anand Avati <avati@redhat.com>
Diffstat (limited to 'xlators/cluster/stripe')
-rw-r--r--xlators/cluster/stripe/src/stripe-mem-types.h2
-rw-r--r--xlators/cluster/stripe/src/stripe.c136
-rw-r--r--xlators/cluster/stripe/src/stripe.h6
3 files changed, 106 insertions, 38 deletions
diff --git a/xlators/cluster/stripe/src/stripe-mem-types.h b/xlators/cluster/stripe/src/stripe-mem-types.h
index c8781d7d761..e05ba0c29c3 100644
--- a/xlators/cluster/stripe/src/stripe-mem-types.h
+++ b/xlators/cluster/stripe/src/stripe-mem-types.h
@@ -16,7 +16,7 @@
enum gf_stripe_mem_types_ {
gf_stripe_mt_iovec = gf_common_mt_end + 1,
- gf_stripe_mt_readv_replies,
+ gf_stripe_mt_stripe_replies,
gf_stripe_mt_stripe_fd_ctx_t,
gf_stripe_mt_char,
gf_stripe_mt_int8_t,
diff --git a/xlators/cluster/stripe/src/stripe.c b/xlators/cluster/stripe/src/stripe.c
index e2176eeae66..6588a44996c 100644
--- a/xlators/cluster/stripe/src/stripe.c
+++ b/xlators/cluster/stripe/src/stripe.c
@@ -3482,8 +3482,8 @@ stripe_readv (call_frame_t *frame, xlator_t *this, fd_t *fd,
frame->local = local;
/* This is where all the vectors should be copied. */
- local->replies = GF_CALLOC (num_stripe, sizeof (struct readv_replies),
- gf_stripe_mt_readv_replies);
+ local->replies = GF_CALLOC (num_stripe, sizeof (struct stripe_replies),
+ gf_stripe_mt_stripe_replies);
if (!local->replies) {
op_errno = ENOMEM;
goto err;
@@ -3543,7 +3543,11 @@ stripe_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
{
int32_t callcnt = 0;
stripe_local_t *local = NULL;
+ stripe_local_t *mlocal = NULL;
call_frame_t *prev = NULL;
+ call_frame_t *mframe = NULL;
+ struct stripe_replies *reply = NULL;
+ int32_t i = 0;
if (!this || !frame || !frame->local || !cookie) {
gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref");
@@ -3552,48 +3556,75 @@ stripe_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
prev = cookie;
local = frame->local;
+ mframe = local->orig_frame;
+ mlocal = mframe->local;
LOCK(&frame->lock);
{
- callcnt = ++local->call_count;
+ callcnt = ++mlocal->call_count;
+
+ mlocal->replies[local->node_index].op_ret = op_ret;
+ mlocal->replies[local->node_index].op_errno = op_errno;
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_DEBUG,
- "%s returned error %s",
- prev->this->name, strerror (op_errno));
- local->op_errno = op_errno;
- local->op_ret = -1;
- }
if (op_ret >= 0) {
- local->op_ret += op_ret;
- local->post_buf = *postbuf;
- local->pre_buf = *prebuf;
+ mlocal->post_buf = *postbuf;
+ mlocal->pre_buf = *prebuf;
- local->prebuf_blocks += prebuf->ia_blocks;
- local->postbuf_blocks += postbuf->ia_blocks;
+ mlocal->prebuf_blocks += prebuf->ia_blocks;
+ mlocal->postbuf_blocks += postbuf->ia_blocks;
- correct_file_size(prebuf, local->fctx, prev);
- correct_file_size(postbuf, local->fctx, prev);
+ correct_file_size(prebuf, mlocal->fctx, prev);
+ correct_file_size(postbuf, mlocal->fctx, prev);
- if (local->prebuf_size < prebuf->ia_size)
- local->prebuf_size = prebuf->ia_size;
- if (local->postbuf_size < postbuf->ia_size)
- local->postbuf_size = postbuf->ia_size;
+ if (mlocal->prebuf_size < prebuf->ia_size)
+ mlocal->prebuf_size = prebuf->ia_size;
+ if (mlocal->postbuf_size < postbuf->ia_size)
+ mlocal->postbuf_size = postbuf->ia_size;
}
}
UNLOCK (&frame->lock);
- if ((callcnt == local->wind_count) && local->unwind) {
- local->pre_buf.ia_size = local->prebuf_size;
- local->pre_buf.ia_blocks = local->prebuf_blocks;
- local->post_buf.ia_size = local->postbuf_size;
- local->post_buf.ia_blocks = local->postbuf_blocks;
+ if ((callcnt == mlocal->wind_count) && mlocal->unwind) {
+ mlocal->pre_buf.ia_size = mlocal->prebuf_size;
+ mlocal->pre_buf.ia_blocks = mlocal->prebuf_blocks;
+ mlocal->post_buf.ia_size = mlocal->postbuf_size;
+ mlocal->post_buf.ia_blocks = mlocal->postbuf_blocks;
+
+ /*
+ * Only return the number of consecutively written bytes up until
+ * the first error. Only return an error if it occurs first.
+ *
+ * When a short write occurs, the application should retry at the
+ * appropriate offset, at which point we'll potentially pass back
+ * the error.
+ */
+ for (i = 0, reply = mlocal->replies; i < mlocal->wind_count;
+ i++, reply++) {
+ if (reply->op_ret == -1) {
+ gf_log(this->name, GF_LOG_DEBUG, "reply %d "
+ "returned error %s", i,
+ strerror(reply->op_errno));
+ if (!mlocal->op_ret) {
+ mlocal->op_ret = -1;
+ mlocal->op_errno = reply->op_errno;
+ }
+ break;
+ }
- STRIPE_STACK_UNWIND (writev, frame, local->op_ret,
- local->op_errno, &local->pre_buf,
- &local->post_buf, NULL);
+ mlocal->op_ret += reply->op_ret;
+
+ if (reply->op_ret < reply->requested_size)
+ break;
+ }
+
+ GF_FREE(mlocal->replies);
+
+ STRIPE_STACK_UNWIND (writev, mframe, mlocal->op_ret,
+ mlocal->op_errno, &mlocal->pre_buf,
+ &mlocal->post_buf, NULL);
}
out:
+ STRIPE_STACK_DESTROY(frame);
return 0;
}
@@ -3615,6 +3646,11 @@ stripe_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
uint64_t stripe_size = 0;
uint64_t tmp_fctx = 0;
off_t dest_offset = 0;
+ off_t rounded_start = 0;
+ off_t rounded_end = 0;
+ int32_t total_chunks = 0;
+ call_frame_t *wframe = NULL;
+ stripe_local_t *wlocal = NULL;
VALIDATE_OR_GOTO (frame, err);
VALIDATE_OR_GOTO (this, err);
@@ -3653,7 +3689,27 @@ stripe_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
goto err;
}
+ rounded_start = floor(offset, stripe_size);
+ rounded_end = roof(offset + total_size, stripe_size);
+ total_chunks = (rounded_end - rounded_start) / stripe_size;
+ local->replies = GF_CALLOC(total_chunks, sizeof(struct stripe_replies),
+ gf_stripe_mt_stripe_replies);
+ if (!local->replies) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ total_chunks = 0;
while (1) {
+ wframe = copy_frame(frame);
+ wlocal = mem_get0(this->local_pool);
+ if (!wlocal) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+ wlocal->orig_frame = frame;
+ wframe->local = wlocal;
+
/* Send striped chunk of the vector to child
nodes appropriately. */
idx = (((offset + offset_offset) /
@@ -3681,25 +3737,37 @@ stripe_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
if (remaining_size == 0)
local->unwind = 1;
+ /*
+ * Store off the request index (with respect to the chunk of the
+ * initial offset) and the size of the request. This is required
+ * in the callback to calculate an appropriate return value in
+ * the event of a write failure in one or more requests.
+ */
+ wlocal->node_index = total_chunks;
+ local->replies[total_chunks].requested_size = fill_size;
+
+ dest_offset = offset + offset_offset;
if (fctx->stripe_coalesce)
- dest_offset = coalesced_offset(offset + offset_offset,
- local->stripe_size, fctx->stripe_count);
- else
- dest_offset = offset + offset_offset;
+ dest_offset = coalesced_offset(dest_offset,
+ local->stripe_size, fctx->stripe_count);
- STACK_WIND (frame, stripe_writev_cbk, fctx->xl_array[idx],
+ STACK_WIND (wframe, stripe_writev_cbk, fctx->xl_array[idx],
fctx->xl_array[idx]->fops->writev, fd, tmp_vec,
tmp_count, dest_offset, flags, iobref,
xdata);
GF_FREE (tmp_vec);
offset_offset += fill_size;
+ total_chunks++;
if (remaining_size == 0)
break;
}
return 0;
err:
+ if (wframe)
+ STRIPE_STACK_DESTROY(wframe);
+
STRIPE_STACK_UNWIND (writev, frame, -1, op_errno, NULL, NULL, NULL);
return 0;
}
diff --git a/xlators/cluster/stripe/src/stripe.h b/xlators/cluster/stripe/src/stripe.h
index 1b9e660c126..a440f87ba1f 100644
--- a/xlators/cluster/stripe/src/stripe.h
+++ b/xlators/cluster/stripe/src/stripe.h
@@ -106,9 +106,9 @@ struct stripe_private {
};
/**
- * Used to keep info about the replies received from fops->readv calls
+ * Used to keep info about the replies received from readv/writev calls
*/
-struct readv_replies {
+struct stripe_replies {
struct iovec *vector;
int32_t count; //count of vector
int32_t op_ret; //op_ret of readv
@@ -156,7 +156,7 @@ struct stripe_local {
blkcnt_t preparent_blocks;
blkcnt_t postparent_blocks;
- struct readv_replies *replies;
+ struct stripe_replies *replies;
struct statvfs statvfs_buf;
dir_entry_t *entry;