summaryrefslogtreecommitdiffstats
path: root/xlators/cluster
diff options
context:
space:
mode:
authorBrian Foster <bfoster@redhat.com>2012-11-01 09:46:12 -0400
committerVijay Bellur <vbellur@redhat.com>2012-11-29 09:00:28 -0800
commit0314f16ec59d8c22597c8c14b53a473b736b8b1f (patch)
tree1e0e1dc470be5d04ff41c470c3812bc10bd0afb3 /xlators/cluster
parentc85a3eee54b4028573c905829d5b46c0b6512c56 (diff)
afr: handle short writes in afr_writev_wind and self-heal to avoid corruption
The current failure to handle short writes on writev fops leaves us open to file corruption. A short write on a user request is ignored and leaves replicas in an inconsistent state. A short write during a self-heal is ignored and incorrectly marks the files as consistent if the heal completes. Modify user writev handling to return the best case return value from each of the replicas. Short writes that occur relative to this value are marked as failed and will require a heal. Modify self-heal to set an error on a short write and abort the heal. BUG: 853690 Change-Id: I18b30f58702326249230eeebb361b29e40b535f5 Signed-off-by: Brian Foster <bfoster@redhat.com> Reviewed-on: http://review.gluster.org/4150 Reviewed-by: Jeff Darcy <jdarcy@redhat.com> Reviewed-by: Pranith Kumar Karampuri <pkarampu@redhat.com> Tested-by: Gluster Build System <jenkins@build.gluster.com>
Diffstat (limited to 'xlators/cluster')
-rw-r--r--xlators/cluster/afr/src/afr-common.c2
-rw-r--r--xlators/cluster/afr/src/afr-inode-write.c61
-rw-r--r--xlators/cluster/afr/src/afr-mem-types.h1
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-algorithm.c20
-rw-r--r--xlators/cluster/afr/src/afr.h7
5 files changed, 75 insertions, 16 deletions
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index a1dd4a5ce57..2e339986621 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -879,6 +879,8 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this)
if (local->dict)
dict_unref (local->dict);
+ GF_FREE(local->replies);
+
GF_FREE (local->child_up);
GF_FREE (local->child_errno);
diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c
index 94700ddae78..d9d3800fc47 100644
--- a/xlators/cluster/afr/src/afr-inode-write.c
+++ b/xlators/cluster/afr/src/afr-inode-write.c
@@ -77,8 +77,11 @@ afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int child_index = (long) cookie;
int call_count = -1;
int read_child = 0;
+ afr_private_t *priv = NULL;
+ int i = 0;
local = frame->local;
+ priv = this->private;
read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL);
@@ -88,31 +91,50 @@ afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
local->read_child_returned = _gf_true;
}
+ local->replies[child_index].valid = 1;
+ local->replies[child_index].op_ret = op_ret;
+ local->replies[child_index].op_errno = op_errno;
+
if (afr_fop_failed (op_ret, op_errno))
afr_transaction_fop_failed (frame, this, child_index);
- if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- local->cont.writev.prebuf = *prebuf;
- local->cont.writev.postbuf = *postbuf;
- }
-
- if (child_index == read_child) {
- local->cont.writev.prebuf = *prebuf;
- local->cont.writev.postbuf = *postbuf;
- }
- }
-
- local->op_errno = op_errno;
+ /* stage the best case return value for unwind */
+ if ((local->success_count == 0) || (op_ret > local->op_ret)) {
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+ }
+
+ if (op_ret != -1) {
+ if ((local->success_count == 0) ||
+ (child_index == read_child)) {
+ local->cont.writev.prebuf = *prebuf;
+ local->cont.writev.postbuf = *postbuf;
+ }
+ local->success_count++;
+ }
}
UNLOCK (&frame->lock);
call_count = afr_frame_return (frame);
if (call_count == 0) {
- local->transaction.unwind (frame, this);
+ /*
+ * We already have the best case result of the writev calls staged
+ * as the return value. Any writev that returns some value less
+ * than the best case is now out of sync, so mark the fop as
+ * failed. Note that fops that have returned with errors have
+ * already been marked as failed.
+ */
+ for (i = 0; i < priv->child_count; i++) {
+ if ((!local->replies[i].valid) ||
+ (local->replies[i].op_ret == -1))
+ continue;
+
+ if (local->replies[i].op_ret < local->op_ret)
+ afr_transaction_fop_failed(frame, this, i);
+ }
+ local->transaction.unwind (frame, this);
local->transaction.resume (frame, this);
}
return 0;
@@ -138,6 +160,15 @@ afr_writev_wind (call_frame_t *frame, xlator_t *this)
}
local->call_count = call_count;
+ local->replies = GF_CALLOC(priv->child_count, sizeof(*local->replies),
+ gf_afr_mt_reply_t);
+ if (!local->replies) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ local->transaction.unwind(frame, this);
+ local->transaction.resume(frame, this);
+ return 0;
+ }
for (i = 0; i < priv->child_count; i++) {
if (local->transaction.pre_op[i]) {
diff --git a/xlators/cluster/afr/src/afr-mem-types.h b/xlators/cluster/afr/src/afr-mem-types.h
index 7e684801fae..e01ab366f4f 100644
--- a/xlators/cluster/afr/src/afr-mem-types.h
+++ b/xlators/cluster/afr/src/afr-mem-types.h
@@ -41,6 +41,7 @@ enum gf_afr_mem_types_ {
gf_afr_mt_shd_event_t,
gf_afr_mt_time_t,
gf_afr_mt_pos_data_t,
+ gf_afr_mt_reply_t,
gf_afr_mt_end
};
#endif
diff --git a/xlators/cluster/afr/src/afr-self-heal-algorithm.c b/xlators/cluster/afr/src/afr-self-heal-algorithm.c
index 9ad40592088..e03e1cc45f0 100644
--- a/xlators/cluster/afr/src/afr-self-heal-algorithm.c
+++ b/xlators/cluster/afr/src/afr-self-heal-algorithm.c
@@ -434,11 +434,20 @@ sh_loop_write_cbk (call_frame_t *loop_frame, void *cookie, xlator_t *this,
sh->op_failed = 1;
afr_sh_set_error (loop_sh, op_errno);
- }
+ } else if (op_ret < loop_local->cont.writev.vector->iov_len) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "incomplete write to %s on subvolume %s "
+ "(expected %lu, returned %d)", sh_local->loc.path,
+ priv->children[child_index]->name,
+ loop_local->cont.writev.vector->iov_len, op_ret);
+ sh->op_failed = 1;
+ }
call_count = afr_frame_return (loop_frame);
if (call_count == 0) {
+ iobref_unref(loop_local->cont.writev.iobref);
+
sh_loop_return (sh_frame, this, loop_frame,
loop_sh->op_ret, loop_sh->op_errno);
}
@@ -527,8 +536,17 @@ sh_loop_read_cbk (call_frame_t *loop_frame, void *cookie,
sh_loop_return (sh_frame, this, loop_frame, 0, 0);
goto out;
}
+
loop_local->call_count = call_count;
+ /*
+ * We only really need the request size at the moment, but the buffer
+ * is required if we want to issue a retry in the event of a short write.
+ * Therefore, we duplicate the vector and ref the iobref here...
+ */
+ loop_local->cont.writev.vector = iov_dup(vector, count);
+ loop_local->cont.writev.iobref = iobref_ref(iobref);
+
for (i = 0; i < priv->child_count; i++) {
if (!loop_sh->write_needed[i])
continue;
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index a0d1f3a7466..48dfbf37eb8 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -373,6 +373,12 @@ typedef struct _afr_locked_fd {
struct list_head list;
} afr_locked_fd_t;
+struct afr_reply {
+ int valid;
+ int32_t op_ret;
+ int32_t op_errno;
+};
+
typedef struct _afr_local {
int uid;
int gid;
@@ -665,6 +671,7 @@ typedef struct _afr_local {
mode_t umask;
int xflag;
gf_boolean_t do_discovery;
+ struct afr_reply *replies;
} afr_local_t;
typedef enum {