summaryrefslogtreecommitdiffstats
path: root/xlators
diff options
context:
space:
mode:
authorAnand Avati <avati@redhat.com>2013-07-24 03:53:16 -0700
committerAnand Avati <avati@redhat.com>2013-08-13 23:45:03 -0700
commit8360037701788d49471cc0228fa873aa18382023 (patch)
tree6c0aff80595683322507102ddb04986915511729 /xlators
parent0d756dc618c1a4b659a3531aec449506ce577f50 (diff)
afr: treat appending writes as stable writes.
Durability of appending writes is implicit in the file size. Therefore performing an explicit fsync() is unnecessary in such cases, as self-heal can check the size of the file when the pending changelog is ambiguous. Change-Id: I05446180a91d20e0dbee5de5a7085b87d57f178a BUG: 927146 Signed-off-by: Anand Avati <avati@redhat.com> Reviewed-on: http://review.gluster.org/5501 Tested-by: Gluster Build System <jenkins@build.gluster.com> Reviewed-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
Diffstat (limited to 'xlators')
-rw-r--r--xlators/cluster/afr/src/afr-common.c2
-rw-r--r--xlators/cluster/afr/src/afr-inode-write.c23
-rw-r--r--xlators/cluster/afr/src/afr.h5
-rw-r--r--xlators/storage/posix/src/posix.c41
4 files changed, 68 insertions, 3 deletions
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index 030256417f0..691c1d4d7b3 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -4487,6 +4487,8 @@ afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno)
goto out;
}
+ local->append_write = _gf_false;
+
ret = 0;
out:
return ret;
diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c
index 68570f15afe..a7441676881 100644
--- a/xlators/cluster/afr/src/afr-inode-write.c
+++ b/xlators/cluster/afr/src/afr-inode-write.c
@@ -139,6 +139,7 @@ afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int read_child = 0;
int ret = 0;
uint32_t open_fd_count = 0;
+ uint32_t write_is_append = 0;
local = frame->local;
@@ -173,6 +174,13 @@ afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
local->open_fd_count = open_fd_count;
local->update_open_fd_count = _gf_true;
}
+
+ write_is_append = 0;
+ ret = dict_get_uint32 (xdata,
+ GLUSTERFS_WRITE_IS_APPEND,
+ &write_is_append);
+ if (ret || !write_is_append)
+ local->append_write = _gf_false;
}
if ((local->success_count == 0) ||
@@ -192,7 +200,13 @@ afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (local->update_open_fd_count)
afr_handle_open_fd_count (frame, this);
- if (!local->stable_write)
+ if (!local->stable_write && !local->append_write)
+ /* An appended write removes the necessity to
+ fsync() the file. This is because self-heal
+ has the logic to check for larger file when
+ the xattrs are not reliably pointing at
+ a stale file.
+ */
afr_fd_report_unstable_write (this, local->fd);
afr_writev_handle_short_writes (frame, this);
@@ -251,6 +265,13 @@ afr_writev_wind (call_frame_t *frame, xlator_t *this)
if (xdata) {
ret = dict_set_uint32 (xdata, GLUSTERFS_OPEN_FD_COUNT,
sizeof (uint32_t));
+ ret = dict_set_uint32 (xdata, GLUSTERFS_WRITE_IS_APPEND,
+ 0);
+ /* Set append_write to be true speculatively. If on any
server it turns out not to be true, we unset it in the
+ callback.
+ */
+ local->append_write = _gf_true;
}
for (i = 0; i < priv->child_count; i++) {
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index 49d281acae1..2023613f834 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -496,6 +496,11 @@ typedef struct _afr_local {
*/
gf_boolean_t stable_write;
+ /* This write appended to the file. Not necessarily O_APPEND,
+ just means the offset of write was at the end of file.
+ */
+ gf_boolean_t append_write;
+
/*
This struct contains the arguments for the "continuation"
(scheme-like) of fops
diff --git a/xlators/storage/posix/src/posix.c b/xlators/storage/posix/src/posix.c
index 49d1effbcab..fc7c259e948 100644
--- a/xlators/storage/posix/src/posix.c
+++ b/xlators/storage/posix/src/posix.c
@@ -2199,7 +2199,7 @@ err:
}
dict_t*
-_fill_open_fd_count (fd_t *fd, dict_t *xdata, xlator_t *this)
+_fill_writev_xdata (fd_t *fd, dict_t *xdata, xlator_t *this, int is_append)
{
dict_t *rsp_xdata = NULL;
int32_t ret = 0;
@@ -2229,6 +2229,14 @@ _fill_open_fd_count (fd_t *fd, dict_t *xdata, xlator_t *this)
"dictionary value for %s", uuid_utoa (fd->inode->gfid),
GLUSTERFS_OPEN_FD_COUNT);
}
+
+ ret = dict_set_uint32 (rsp_xdata, GLUSTERFS_WRITE_IS_APPEND,
+ is_append);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING, "%s: Failed to set "
+ "dictionary value for %s", uuid_utoa (fd->inode->gfid),
+ GLUSTERFS_WRITE_IS_APPEND);
+ }
out:
return rsp_xdata;
}
@@ -2247,6 +2255,8 @@ posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
struct iatt postop = {0,};
int ret = -1;
dict_t *rsp_xdata = NULL;
+ int is_append = 0;
+ gf_boolean_t locked = _gf_false;
VALIDATE_OR_GOTO (frame, out);
VALIDATE_OR_GOTO (this, out);
@@ -2268,6 +2278,17 @@ posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
_fd = pfd->fd;
+ if (xdata && dict_get (xdata, GLUSTERFS_WRITE_IS_APPEND)) {
+ /* The write_is_append check and write must happen
+ atomically. Else another write can overtake this
+ write after the check and get written earlier.
+
+ So lock before preop-stat and unlock after write.
+ */
+ locked = _gf_true;
+ LOCK(&fd->inode->lock);
+ }
+
op_ret = posix_fdstat (this, _fd, &preop);
if (op_ret == -1) {
op_errno = errno;
@@ -2277,8 +2298,19 @@ posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
goto out;
}
+ if (locked) {
+ if (preop.ia_size == offset || (fd->flags & O_APPEND))
+ is_append = 1;
+ }
+
op_ret = __posix_writev (_fd, vector, count, offset,
(pfd->flags & O_DIRECT));
+
+ if (locked) {
+ UNLOCK (&fd->inode->lock);
+ locked = _gf_false;
+ }
+
if (op_ret < 0) {
op_errno = -op_ret;
op_ret = -1;
@@ -2294,7 +2326,7 @@ posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
UNLOCK (&priv->lock);
if (op_ret >= 0) {
- rsp_xdata = _fill_open_fd_count (fd, xdata, this);
+ rsp_xdata = _fill_writev_xdata (fd, xdata, this, is_append);
/* wiretv successful, we also need to get the stat of
* the file we wrote to
*/
@@ -2324,6 +2356,11 @@ posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
out:
+ if (locked) {
+ UNLOCK (&fd->inode->lock);
+ locked = _gf_false;
+ }
+
STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, &preop, &postop,
rsp_xdata);