diff options
| -rwxr-xr-x | tests/bugs/bug-853690.t | 94 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-common.c | 2 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-inode-write.c | 61 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-mem-types.h | 1 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-algorithm.c | 20 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr.h | 7 | 
6 files changed, 169 insertions, 16 deletions
diff --git a/tests/bugs/bug-853690.t b/tests/bugs/bug-853690.t new file mode 100755 index 00000000..77a581f5 --- /dev/null +++ b/tests/bugs/bug-853690.t @@ -0,0 +1,94 @@ +#!/bin/bash +# +# Bug 853690 - Test that short writes do not lead to corruption. +# +# Mismanagement of short writes in AFR leads to corruption and immediately +# detectable split-brain. Write a file to a replica volume using error-gen +# to cause short writes on one replica. +# +# Short writes are also possible during heal. If ignored, the files are marked +# consistent and silently differ. After reading the file, cause a lookup, wait +# for self-heal and verify that the afr xattrs do not match. +# +######## + +. $(dirname $0)/../include.rc + +cleanup; + +TEST mkdir -p $B0/test{1,2} + +# Our graph is a two brick replica with 100% frequency of short writes on one +# side of the replica. This guarantees a single write fop leads to an out-of-sync +# situation. +cat > $B0/test.vol <<EOF +volume test-posix-0 +    type storage/posix +    option directory $B0/test1 +end-volume + +volume test-error-0 +    type debug/error-gen +    option failure 100 +    option enable writev +    option error-no GF_ERROR_SHORT_WRITE +    subvolumes test-posix-0 +end-volume + +volume test-locks-0 +    type features/locks +    subvolumes test-error-0 +end-volume + +volume test-posix-1 +    type storage/posix +    option directory $B0/test2 +end-volume + +volume test-locks-1 +    type features/locks +    subvolumes test-posix-1 +end-volume + +volume test-replicate-0 +    type cluster/replicate +    option background-self-heal-count 0 +    subvolumes test-locks-0 test-locks-1 +end-volume +EOF + +TEST glusterd + +TEST glusterfs --volfile=$B0/test.vol --attribute-timeout=0 --entry-timeout=0 $M0 + +# Send a single write, guaranteed to be short on one replica, and attempt to +# read the data back. Failure to detect the short write results in different +# file sizes and immediate split-brain (EIO). +TEST dd if=/dev/zero of=$M0/file bs=128k count=1 +TEST dd if=$M0/file of=/dev/null bs=128k count=1 + +######## +# +# Test self-heal with short writes... +# +######## + +# Cause a lookup and wait a few seconds for posterity. This self-heal also fails +# due to a short write. +TEST ls $M0/file + +# Verify the attributes on the healthy replica do not reflect consistency with +# the other replica. +TEST "getfattr -n trusted.afr.test-locks-0 $B0/test2/file --only-values > $B0/out1 2> /dev/null" +TEST "getfattr -n trusted.afr.test-locks-1 $B0/test2/file --only-values > $B0/out2 2> /dev/null" +TEST ! cmp $B0/out1 $B0/out2 + +TEST rm -f $B0/out1 $B0/out2 +TEST rm -f $M0/file +TEST umount $M0 + +rm -f $B0/test.vol +rm -rf $B0/test1 $B0/test2 + +cleanup; + diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index a1dd4a5c..2e339986 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -879,6 +879,8 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this)          if (local->dict)                  dict_unref (local->dict); +	GF_FREE(local->replies); +          GF_FREE (local->child_up);          GF_FREE (local->child_errno); diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c index 94700dda..d9d3800f 100644 --- a/xlators/cluster/afr/src/afr-inode-write.c +++ b/xlators/cluster/afr/src/afr-inode-write.c @@ -77,8 +77,11 @@ afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,          int child_index = (long) cookie;          int call_count  = -1;          int read_child  = 0; +	afr_private_t *priv = NULL; +	int i = 0;          local = frame->local; +	priv = this->private;          read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); @@ -88,31 +91,50 @@ afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                          local->read_child_returned = _gf_true;                  } +		local->replies[child_index].valid = 1; +		local->replies[child_index].op_ret = op_ret; +		local->replies[child_index].op_errno = op_errno; +                  if (afr_fop_failed (op_ret, op_errno))                          afr_transaction_fop_failed (frame, this, child_index); -                if (op_ret != -1) { -                        if (local->success_count == 0) { -                                local->op_ret              = op_ret; -                                local->cont.writev.prebuf  = *prebuf; -                                local->cont.writev.postbuf = *postbuf; -                        } - -                        if (child_index == read_child) { -                                local->cont.writev.prebuf  = *prebuf; -                                local->cont.writev.postbuf = *postbuf; -                        } -                } - -                local->op_errno = op_errno; +		/* stage the best case return value for unwind */ +                if ((local->success_count == 0) || (op_ret > local->op_ret)) { +                        local->op_ret              = op_ret; +			local->op_errno		   = op_errno; +		} + +		if (op_ret != -1) { +			if ((local->success_count == 0) || +			    (child_index == read_child)) { +				local->cont.writev.prebuf  = *prebuf; +				local->cont.writev.postbuf = *postbuf; +			} +			local->success_count++; +		}          }          UNLOCK (&frame->lock);          call_count = afr_frame_return (frame);          if (call_count == 0) { -                local->transaction.unwind (frame, this); +		/* +		 * We already have the best case result of the writev calls staged +		 * as the return value. Any writev that returns some value less +		 * than the best case is now out of sync, so mark the fop as +		 * failed. Note that fops that have returned with errors have +		 * already been marked as failed. +		 */ +		for (i = 0; i < priv->child_count; i++) { +			if ((!local->replies[i].valid) || +			    (local->replies[i].op_ret == -1)) +				continue; + +			if (local->replies[i].op_ret < local->op_ret) +				afr_transaction_fop_failed(frame, this, i); +		} +                local->transaction.unwind (frame, this);                  local->transaction.resume (frame, this);          }          return 0; @@ -138,6 +160,15 @@ afr_writev_wind (call_frame_t *frame, xlator_t *this)          }          local->call_count = call_count; +	local->replies = GF_CALLOC(priv->child_count, sizeof(*local->replies), +				   gf_afr_mt_reply_t); +	if (!local->replies) { +		local->op_ret = -1; +		local->op_errno = ENOMEM; +		local->transaction.unwind(frame, this); +		local->transaction.resume(frame, this); +		return 0; +	}          for (i = 0; i < priv->child_count; i++) {                  if (local->transaction.pre_op[i]) { diff --git a/xlators/cluster/afr/src/afr-mem-types.h b/xlators/cluster/afr/src/afr-mem-types.h index 7e684801..e01ab366 100644 --- a/xlators/cluster/afr/src/afr-mem-types.h +++ b/xlators/cluster/afr/src/afr-mem-types.h @@ -41,6 +41,7 @@ enum gf_afr_mem_types_ {          gf_afr_mt_shd_event_t,          gf_afr_mt_time_t,          gf_afr_mt_pos_data_t, +	gf_afr_mt_reply_t,          gf_afr_mt_end  };  #endif diff --git a/xlators/cluster/afr/src/afr-self-heal-algorithm.c b/xlators/cluster/afr/src/afr-self-heal-algorithm.c index 9ad40592..e03e1cc4 100644 --- a/xlators/cluster/afr/src/afr-self-heal-algorithm.c +++ b/xlators/cluster/afr/src/afr-self-heal-algorithm.c @@ -434,11 +434,20 @@ sh_loop_write_cbk (call_frame_t *loop_frame, void *cookie, xlator_t *this,                  sh->op_failed = 1;                  afr_sh_set_error (loop_sh, op_errno); -        } +        } else if (op_ret < loop_local->cont.writev.vector->iov_len) { +		gf_log(this->name, GF_LOG_ERROR, +		       "incomplete write to %s on subvolume %s " +		       "(expected %lu, returned %d)", sh_local->loc.path, +		       priv->children[child_index]->name, +		       loop_local->cont.writev.vector->iov_len, op_ret); +		sh->op_failed = 1; +	}          call_count = afr_frame_return (loop_frame);          if (call_count == 0) { +		iobref_unref(loop_local->cont.writev.iobref); +                  sh_loop_return (sh_frame, this, loop_frame,                                  loop_sh->op_ret, loop_sh->op_errno);          } @@ -527,8 +536,17 @@ sh_loop_read_cbk (call_frame_t *loop_frame, void *cookie,                  sh_loop_return (sh_frame, this, loop_frame, 0, 0);                  goto out;          } +          loop_local->call_count = call_count; +	/* +	 * We only really need the request size at the moment, but the buffer +	 * is required if we want to issue a retry in the event of a short write. +	 * Therefore, we duplicate the vector and ref the iobref here... +	 */ +	loop_local->cont.writev.vector = iov_dup(vector, count); +	loop_local->cont.writev.iobref = iobref_ref(iobref); +          for (i = 0; i < priv->child_count; i++) {                  if (!loop_sh->write_needed[i])                          continue; diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index a0d1f3a7..48dfbf37 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -373,6 +373,12 @@ typedef struct _afr_locked_fd {          struct list_head list;  } afr_locked_fd_t; +struct afr_reply { +	int	valid; +	int32_t	op_ret; +	int32_t	op_errno; +}; +  typedef struct _afr_local {          int     uid;          int     gid; @@ -665,6 +671,7 @@ typedef struct _afr_local {          mode_t          umask;          int             xflag;          gf_boolean_t    do_discovery; +	struct afr_reply *replies;  } afr_local_t;  typedef enum {  | 
