summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAnand Avati <avati@redhat.com>2013-03-24 12:19:56 -0700
committerAnand Avati <avati@redhat.com>2013-03-27 22:15:53 -0700
commitca10fdc81a72a71ac67ac9fc8c5ad5b92febd875 (patch)
tree7c8cef642d283bf3a132499a1c82f3351ced475f
parent1f7dadccd45863ebea8f60339f297ac551e89899 (diff)
cluster/afr: ensure DATA operations are made durable before POST-OP
The changelogging scheme of AFR stores information about the state of all replicas in all replicas (in the extended attribute of the respective files on each server) in the form of 'pending counts' of operations (effectively "dirty flags"). These xattrs are blindly trusted while performing self-heal, and therefore utmost care has to be taken while updating and maintaing them. The most critical updation is the clearing of the pending counts corresponding to the *other* server in the changelog of a given server. Before clearing the pending count, we need durability guarantee of the write which was performed on the other server. To obtain such a guarantee, it may be necessary to explicitly introduce an fsync() phase (if the file itself wasn't already opened with O_SYNC). This patch introduces the detection of unstable stable writes on a file and issues explicit fsync() on the servers before performing the POST-OP clearing of pending flags. Change-Id: I2171b86a74ec91e40e5877eef0a4e7379578ecf7 BUG: 927146 Signed-off-by: Anand Avati <avati@redhat.com> Reviewed-on: http://review.gluster.org/4721 Reviewed-by: Pranith Kumar Karampuri <pkarampu@redhat.com> Reviewed-by: Krishnan Parthasarathi <kparthas@redhat.com> Tested-by: Gluster Build System <jenkins@build.gluster.com>
-rw-r--r--tests/bugs/bug-888174.t4
-rw-r--r--xlators/cluster/afr/src/afr-common.c23
-rw-r--r--xlators/cluster/afr/src/afr-inode-write.c8
-rw-r--r--xlators/cluster/afr/src/afr-transaction.c281
-rw-r--r--xlators/cluster/afr/src/afr.h24
5 files changed, 316 insertions, 24 deletions
diff --git a/tests/bugs/bug-888174.t b/tests/bugs/bug-888174.t
index 6cf8d04ad7c..76ca3c961f4 100644
--- a/tests/bugs/bug-888174.t
+++ b/tests/bugs/bug-888174.t
@@ -36,14 +36,14 @@ inodelk_max_latency=$($CLI volume profile $V0 info | grep INODELK | awk 'BEGIN {
TEST [ -z $inodelk_max_latency ]
-TEST dd of=$M0/a if=/dev/urandom bs=1M count=10
+TEST dd of=$M0/a if=/dev/urandom bs=1M count=10 conv=fsync
#Check for no trace of pending changelog. Flush should make sure of it.
EXPECT "0x000000000000000000000000" afr_get_changelog_xattr $B0/r2_0/a trusted.afr.$V0-client-0
EXPECT "0x000000000000000000000000" afr_get_changelog_xattr $B0/r2_0/a trusted.afr.$V0-client-1
EXPECT "0x000000000000000000000000" afr_get_changelog_xattr $B0/r2_1/a trusted.afr.$V0-client-0
EXPECT "0x000000000000000000000000" afr_get_changelog_xattr $B0/r2_1/a trusted.afr.$V0-client-1
-dd of=$M0/a if=/dev/urandom bs=1M count=1024 2>/dev/null &
+dd of=$M0/a if=/dev/urandom bs=1M count=1024 oflag=sync 2>/dev/null &
p=$!
#trigger graph switches, tests for fsync not leaving any pending flags
TEST $CLI volume set $V0 performance.quick-read off
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index 4157174e40c..5b96a7789f9 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -850,6 +850,8 @@ afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this)
loc_wipe (&local->transaction.parent_loc);
loc_wipe (&local->transaction.new_parent_loc);
+
+ GF_FREE (local->transaction.postop_piggybacked);
}
@@ -2675,6 +2677,14 @@ afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
call_count = afr_frame_return (frame);
if (call_count == 0) {
+ /* If no new unstable writes happened between the
+ time we cleared the unstable write witness flag in afr_fsync
+ and now, calling afr_delayed_changelog_wake_up() should
+ wake up and skip over the fsync phase and go straight to
+ afr_changelog_post_op_now()
+ */
+ afr_delayed_changelog_wake_up (this, local->fd);
+
AFR_STACK_UNWIND (fsync, frame, local->op_ret, local->op_errno,
&local->cont.fsync.prebuf,
&local->cont.fsync.postbuf,
@@ -2713,7 +2723,9 @@ afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd,
local->fd = fd_ref (fd);
- afr_delayed_changelog_wake_up (this, fd);
+ if (afr_fd_has_witnessed_unstable_write (this, fd)) {
+ /* don't care. we only wanted to CLEAR the bit */
+ }
for (i = 0; i < priv->child_count; i++) {
if (local->child_up[i]) {
@@ -3872,6 +3884,15 @@ afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno)
goto out;
}
+ local->transaction.postop_piggybacked = GF_CALLOC (priv->child_count,
+ sizeof (int),
+ gf_afr_mt_int32_t);
+ if (!local->transaction.postop_piggybacked) {
+ if (op_errno)
+ *op_errno = ENOMEM;
+ goto out;
+ }
+
ret = 0;
out:
return ret;
diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c
index a83223569d0..ce4fbf22698 100644
--- a/xlators/cluster/afr/src/afr-inode-write.c
+++ b/xlators/cluster/afr/src/afr-inode-write.c
@@ -176,6 +176,9 @@ afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (call_count == 0) {
+ if (!local->stable_write)
+ afr_fd_report_unstable_write (this, local->fd);
+
afr_writev_handle_short_writes (frame, this);
/*
* Generally inode-write fops do transaction.unwind then
@@ -471,6 +474,11 @@ afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
local->fd = fd_ref (fd);
+ /* detect here, but set it in writev_wind_cbk *after* the unstable
+ write is performed
+ */
+ local->stable_write = !!((fd->flags|flags)&(O_SYNC|O_DSYNC));
+
afr_open_fd_fix (fd, this);
afr_do_writev (frame, this);
diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c
index d20928d151d..76697f06b5c 100644
--- a/xlators/cluster/afr/src/afr-transaction.c
+++ b/xlators/cluster/afr/src/afr-transaction.c
@@ -574,6 +574,29 @@ afr_set_postop_dict (afr_local_t *local, xlator_t *this, dict_t *xattr,
"failed to set pending entry");
}
+
+gf_boolean_t
+afr_txn_nothing_failed (call_frame_t *frame, xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int index = -1;
+ int i = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ index = afr_index_for_transaction_type (local->transaction.type);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->pending[i][index] == 0)
+ return _gf_false;
+ }
+
+ return _gf_true;
+}
+
+
int
afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this)
{
@@ -586,7 +609,6 @@ afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this)
afr_fd_ctx_t *fdctx = NULL;
dict_t **xattr = NULL;
int piggyback = 0;
- int index = 0;
int nothing_failed = 1;
local = frame->local;
@@ -622,15 +644,7 @@ afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this)
goto out;
}
- /* check if something has failed, to handle piggybacking */
- nothing_failed = 1;
- index = afr_index_for_transaction_type (local->transaction.type);
- for (i = 0; i < priv->child_count; i++) {
- if (local->pending[i][index] == 0) {
- nothing_failed = 0;
- break;
- }
- }
+ nothing_failed = afr_txn_nothing_failed (frame, this);
afr_compute_txn_changelog (local , priv);
@@ -656,15 +670,14 @@ afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this)
break;
}
- LOCK (&local->fd->lock);
- {
- piggyback = 0;
- if (fdctx->pre_op_piggyback[i]) {
- fdctx->pre_op_piggyback[i]--;
- piggyback = 1;
- }
- }
- UNLOCK (&local->fd->lock);
+ /* local->transaction.postop_piggybacked[] was
+ precomputed in is_piggyback_postop() when called from
+ afr_changelog_post_op_safe()
+ */
+
+ piggyback = 0;
+ if (local->transaction.postop_piggybacked[i])
+ piggyback = 1;
afr_set_postop_dict (local, this, xattr[i],
piggyback, i);
@@ -1339,6 +1352,232 @@ afr_delayed_changelog_wake_up_cbk (void *data)
}
+/*
+ Check if the frame is destined to get optimized away
+ with changelog piggybacking
+*/
+static gf_boolean_t
+is_piggyback_post_op (call_frame_t *frame, fd_t *fd)
+{
+ afr_fd_ctx_t *fdctx = NULL;
+ afr_local_t *local = NULL;
+ gf_boolean_t piggyback = _gf_true;
+ afr_private_t *priv = NULL;
+ int i = 0;
+
+ priv = frame->this->private;
+ local = frame->local;
+ fdctx = afr_fd_ctx_get (fd, frame->this);
+
+ if (!afr_txn_nothing_failed (frame, frame->this))
+ /* something failed in this transaction,
+ we will be performing a hard post-op
+ */
+ return _gf_false;
+
+ LOCK(&fd->lock);
+ {
+ piggyback = _gf_true;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->transaction.pre_op[i])
+ continue;
+ if (fdctx->pre_op_piggyback[i]) {
+ fdctx->pre_op_piggyback[i]--;
+ local->transaction.postop_piggybacked[i] = 1;
+ } else {
+ /* For at least _one_ subvolume we cannot
+ piggyback on the changelog, and have to
+ perform a hard POST-OP and therefore fsync
+ if necesssary
+ */
+ piggyback = _gf_false;
+ }
+ }
+ }
+ UNLOCK(&fd->lock);
+
+ return piggyback;
+}
+
+
+/* SET operation */
+int
+afr_fd_report_unstable_write (xlator_t *this, fd_t *fd)
+{
+ afr_fd_ctx_t *fdctx = NULL;
+
+ fdctx = afr_fd_ctx_get (fd, this);
+
+ LOCK(&fd->lock);
+ {
+ fdctx->witnessed_unstable_write = _gf_true;
+ }
+ UNLOCK(&fd->lock);
+
+ return 0;
+}
+
+/* TEST and CLEAR operation */
+gf_boolean_t
+afr_fd_has_witnessed_unstable_write (xlator_t *this, fd_t *fd)
+{
+ afr_fd_ctx_t *fdctx = NULL;
+ gf_boolean_t witness = _gf_false;
+
+ fdctx = afr_fd_ctx_get (fd, this);
+
+ LOCK(&fd->lock);
+ {
+ if (fdctx->witnessed_unstable_write) {
+ witness = _gf_true;
+ fdctx->witnessed_unstable_write = _gf_false;
+ }
+ }
+ UNLOCK (&fd->lock);
+
+ return witness;
+}
+
+
+int
+afr_changelog_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *pre,
+ struct iatt *post, dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ int child_index = (long) cookie;
+ int call_count = -1;
+ afr_local_t *local = NULL;
+
+ priv = this->private;
+ local = frame->local;
+
+ if (afr_fop_failed (op_ret, op_errno)) {
+ /* Failure of fsync() is as good as failure of previous
+ write(). So treat it like one.
+ */
+ gf_log (this->name, GF_LOG_WARNING,
+ "fsync(%s) failed on subvolume %s. Transaction was %s",
+ uuid_utoa (local->fd->inode->gfid),
+ priv->children[child_index]->name,
+ gf_fop_list[local->op]);
+
+ afr_transaction_fop_failed (frame, this, child_index);
+ }
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0)
+ afr_changelog_post_op_now (frame, this);
+
+ return 0;
+}
+
+
+int
+afr_changelog_fsync (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ int i = 0;
+ int call_count = 0;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ call_count = afr_pre_op_done_children_count (local->transaction.pre_op,
+ priv->child_count);
+
+ if (!call_count) {
+ /* will go straight to unlock */
+ afr_changelog_post_op_now (frame, this);
+ return 0;
+ }
+
+ local->call_count = call_count;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->transaction.pre_op[i])
+ continue;
+
+ STACK_WIND_COOKIE (frame, afr_changelog_fsync_cbk,
+ (void *) (long) i, priv->children[i],
+ priv->children[i]->fops->fsync, local->fd,
+ 1, NULL);
+ if (!--call_count)
+ break;
+ }
+
+ return 0;
+}
+
+
+int
+afr_changelog_post_op_safe (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+
+ local = frame->local;
+
+ if (!local->fd || local->transaction.type != AFR_DATA_TRANSACTION) {
+ afr_changelog_post_op_now (frame, this);
+ return 0;
+ }
+
+ if (is_piggyback_post_op (frame, local->fd)) {
+ /* just detected that this post-op is about to
+ be optimized away as a new write() has
+ already piggybacked on this frame's changelog.
+ */
+ afr_changelog_post_op_now (frame, this);
+ return 0;
+ }
+
+ /* Calling afr_changelog_post_op_now() now will result in
+ issuing ->[f]xattrop().
+
+ Performing a hard POST-OP (->[f]xattrop() FOP) is a more
+ responsible operation that what it might appear on the surface.
+
+ The changelog of a file (in the xattr of the file on the server)
+ stores information (pending count) about the state of the file
+ on the OTHER server. This changelog is blindly trusted, and must
+ therefore be updated in such a way it remains trustworthy. This
+ implies that decrementing the pending count (essentially "clearing
+ the dirty flag") must be done STRICTLY after we are sure that the
+ operation on the other server has reached stable storage.
+
+ While the backend filesystem on that server will eventually flush
+ it to stable storage, we (being in userspace) have no mechanism
+ to get notified when the write became "stable".
+
+ This means we need take matter into our own hands and issue an
+ fsync() EVEN IF THE APPLICATION WAS PERFORMING UNSTABLE WRITES,
+ and get an acknowledgement for it. And we need to wait for the
+ fsync() acknowledgement before initiating the hard POST-OP.
+
+ However if the FD itself was opened in O_SYNC or O_DSYNC then
+ we are already guaranteed that the writes were made stable as
+ part of the FOP itself. The same holds true for NFS stable
+ writes which happen on an anonymous FD with O_DSYNC or O_SYNC
+ flag set in the writev() @flags param. For all other write types,
+ mark a flag in the fdctx whenever an unstable write is witnessed.
+ */
+
+ if (!afr_fd_has_witnessed_unstable_write (this, local->fd)) {
+ afr_changelog_post_op_now (frame, this);
+ return 0;
+ }
+
+ /* Time to fsync() */
+
+ afr_changelog_fsync (frame, this);
+
+ return 0;
+}
+
+
void
afr_delayed_changelog_post_op (xlator_t *this, call_frame_t *frame, fd_t *fd)
{
@@ -1374,7 +1613,7 @@ unlock:
pthread_mutex_unlock (&fd_ctx->delay_lock);
if (prev_frame) {
- afr_changelog_post_op_now (prev_frame, this);
+ afr_changelog_post_op_safe (prev_frame, this);
}
}
@@ -1389,7 +1628,7 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this)
if (is_afr_delayed_changelog_post_op_needed (frame, this))
afr_delayed_changelog_post_op (this, frame, local->fd);
else
- afr_changelog_post_op_now (frame, this);
+ afr_changelog_post_op_safe (frame, this);
}
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index 978339017a2..5d9f752b95a 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -448,6 +448,12 @@ typedef struct _afr_local {
int optimistic_change_log;
gf_boolean_t delayed_post_op;
+ /* Is the current writev() going to perform a stable write?
+ i.e, is fd->flags or @flags writev param have O_SYNC or
+ O_DSYNC?
+ */
+ gf_boolean_t stable_write;
+
/*
This struct contains the arguments for the "continuation"
(scheme-like) of fops
@@ -662,6 +668,12 @@ typedef struct _afr_local {
afr_transaction_type type;
+ /* pre-compute the post piggyback status before
+ entering POST-OP phase
+ */
+ int *postop_piggybacked;
+
+
int32_t **txn_changelog;//changelog after pre+post ops
unsigned char *pre_op;
@@ -723,6 +735,11 @@ typedef struct {
gf_timer_t *delay_timer;
call_frame_t *delay_frame;
int call_child;
+
+ /* set if any write on this fd was a non stable write
+ (i.e, without O_SYNC or O_DSYNC)
+ */
+ gf_boolean_t witnessed_unstable_write;
} afr_fd_ctx_t;
@@ -1078,4 +1095,11 @@ afr_xattr_array_destroy (dict_t **xattr, unsigned int child_count);
} \
} while (0);
+
+int
+afr_fd_report_unstable_write (xlator_t *this, fd_t *fd);
+
+gf_boolean_t
+afr_fd_has_witnessed_unstable_write (xlator_t *this, fd_t *fd);
+
#endif /* __AFR_H__ */