diff options
Diffstat (limited to 'xlators/cluster')
| -rw-r--r-- | xlators/cluster/afr/src/afr-common.c | 2 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-inode-write.c | 61 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-mem-types.h | 1 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-algorithm.c | 20 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr.h | 7 | 
5 files changed, 75 insertions, 16 deletions
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index a1dd4a5ce..2e3399866 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -879,6 +879,8 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this)          if (local->dict)                  dict_unref (local->dict); +	GF_FREE(local->replies); +          GF_FREE (local->child_up);          GF_FREE (local->child_errno); diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c index 94700ddae..d9d3800fc 100644 --- a/xlators/cluster/afr/src/afr-inode-write.c +++ b/xlators/cluster/afr/src/afr-inode-write.c @@ -77,8 +77,11 @@ afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,          int child_index = (long) cookie;          int call_count  = -1;          int read_child  = 0; +	afr_private_t *priv = NULL; +	int i = 0;          local = frame->local; +	priv = this->private;          read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); @@ -88,31 +91,50 @@ afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                          local->read_child_returned = _gf_true;                  } +		local->replies[child_index].valid = 1; +		local->replies[child_index].op_ret = op_ret; +		local->replies[child_index].op_errno = op_errno; +                  if (afr_fop_failed (op_ret, op_errno))                          afr_transaction_fop_failed (frame, this, child_index); -                if (op_ret != -1) { -                        if (local->success_count == 0) { -                                local->op_ret              = op_ret; -                                local->cont.writev.prebuf  = *prebuf; -                                local->cont.writev.postbuf = *postbuf; -                        } - -                        if (child_index == read_child) { -                                local->cont.writev.prebuf  = *prebuf; -                                local->cont.writev.postbuf = *postbuf; -                        } -                } - -                local->op_errno = op_errno; +		/* stage the best case return value for unwind */ +                if ((local->success_count == 0) || (op_ret > local->op_ret)) { +                        local->op_ret              = op_ret; +			local->op_errno		   = op_errno; +		} + +		if (op_ret != -1) { +			if ((local->success_count == 0) || +			    (child_index == read_child)) { +				local->cont.writev.prebuf  = *prebuf; +				local->cont.writev.postbuf = *postbuf; +			} +			local->success_count++; +		}          }          UNLOCK (&frame->lock);          call_count = afr_frame_return (frame);          if (call_count == 0) { -                local->transaction.unwind (frame, this); +		/* +		 * We already have the best case result of the writev calls staged +		 * as the return value. Any writev that returns some value less +		 * than the best case is now out of sync, so mark the fop as +		 * failed. Note that fops that have returned with errors have +		 * already been marked as failed. +		 */ +		for (i = 0; i < priv->child_count; i++) { +			if ((!local->replies[i].valid) || +			    (local->replies[i].op_ret == -1)) +				continue; + +			if (local->replies[i].op_ret < local->op_ret) +				afr_transaction_fop_failed(frame, this, i); +		} +                local->transaction.unwind (frame, this);                  local->transaction.resume (frame, this);          }          return 0; @@ -138,6 +160,15 @@ afr_writev_wind (call_frame_t *frame, xlator_t *this)          }          local->call_count = call_count; +	local->replies = GF_CALLOC(priv->child_count, sizeof(*local->replies), +				   gf_afr_mt_reply_t); +	if (!local->replies) { +		local->op_ret = -1; +		local->op_errno = ENOMEM; +		local->transaction.unwind(frame, this); +		local->transaction.resume(frame, this); +		return 0; +	}          for (i = 0; i < priv->child_count; i++) {                  if (local->transaction.pre_op[i]) { diff --git a/xlators/cluster/afr/src/afr-mem-types.h b/xlators/cluster/afr/src/afr-mem-types.h index 7e684801f..e01ab366f 100644 --- a/xlators/cluster/afr/src/afr-mem-types.h +++ b/xlators/cluster/afr/src/afr-mem-types.h @@ -41,6 +41,7 @@ enum gf_afr_mem_types_ {          gf_afr_mt_shd_event_t,          gf_afr_mt_time_t,          gf_afr_mt_pos_data_t, +	gf_afr_mt_reply_t,          gf_afr_mt_end  };  #endif diff --git a/xlators/cluster/afr/src/afr-self-heal-algorithm.c b/xlators/cluster/afr/src/afr-self-heal-algorithm.c index 9ad405920..e03e1cc45 100644 --- a/xlators/cluster/afr/src/afr-self-heal-algorithm.c +++ b/xlators/cluster/afr/src/afr-self-heal-algorithm.c @@ -434,11 +434,20 @@ sh_loop_write_cbk (call_frame_t *loop_frame, void *cookie, xlator_t *this,                  sh->op_failed = 1;                  afr_sh_set_error (loop_sh, op_errno); -        } +        } else if (op_ret < loop_local->cont.writev.vector->iov_len) { +		gf_log(this->name, GF_LOG_ERROR, +		       "incomplete write to %s on subvolume %s " +		       "(expected %lu, returned %d)", sh_local->loc.path, +		       priv->children[child_index]->name, +		       loop_local->cont.writev.vector->iov_len, op_ret); +		sh->op_failed = 1; +	}          call_count = afr_frame_return (loop_frame);          if (call_count == 0) { +		iobref_unref(loop_local->cont.writev.iobref); +                  sh_loop_return (sh_frame, this, loop_frame,                                  loop_sh->op_ret, loop_sh->op_errno);          } @@ -527,8 +536,17 @@ sh_loop_read_cbk (call_frame_t *loop_frame, void *cookie,                  sh_loop_return (sh_frame, this, loop_frame, 0, 0);                  goto out;          } +          loop_local->call_count = call_count; +	/* +	 * We only really need the request size at the moment, but the buffer +	 * is required if we want to issue a retry in the event of a short write. +	 * Therefore, we duplicate the vector and ref the iobref here... +	 */ +	loop_local->cont.writev.vector = iov_dup(vector, count); +	loop_local->cont.writev.iobref = iobref_ref(iobref); +          for (i = 0; i < priv->child_count; i++) {                  if (!loop_sh->write_needed[i])                          continue; diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index a0d1f3a74..48dfbf37e 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -373,6 +373,12 @@ typedef struct _afr_locked_fd {          struct list_head list;  } afr_locked_fd_t; +struct afr_reply { +	int	valid; +	int32_t	op_ret; +	int32_t	op_errno; +}; +  typedef struct _afr_local {          int     uid;          int     gid; @@ -665,6 +671,7 @@ typedef struct _afr_local {          mode_t          umask;          int             xflag;          gf_boolean_t    do_discovery; +	struct afr_reply *replies;  } afr_local_t;  typedef enum {  | 
