From 8360037701788d49471cc0228fa873aa18382023 Mon Sep 17 00:00:00 2001 From: Anand Avati Date: Wed, 24 Jul 2013 03:53:16 -0700 Subject: afr: treat appending writes as stable writes. Durability of appending writes is implicit in the file size. Therefore performing an explicit fsync() is unnecessary in such cases as self-heal can check for the size of file when pending changelog is not unambiguous. Change-Id: I05446180a91d20e0dbee5de5a7085b87d57f178a BUG: 927146 Signed-off-by: Anand Avati Reviewed-on: http://review.gluster.org/5501 Tested-by: Gluster Build System Reviewed-by: Pranith Kumar Karampuri --- libglusterfs/src/glusterfs.h | 1 + xlators/cluster/afr/src/afr-common.c | 2 ++ xlators/cluster/afr/src/afr-inode-write.c | 23 ++++++++++++++++- xlators/cluster/afr/src/afr.h | 5 ++++ xlators/storage/posix/src/posix.c | 41 +++++++++++++++++++++++++++++-- 5 files changed, 69 insertions(+), 3 deletions(-) diff --git a/libglusterfs/src/glusterfs.h b/libglusterfs/src/glusterfs.h index 763968c9e..324e3f5b4 100644 --- a/libglusterfs/src/glusterfs.h +++ b/libglusterfs/src/glusterfs.h @@ -107,6 +107,7 @@ #define ZR_FILE_CONTENT_STR "glusterfs.file." 
#define ZR_FILE_CONTENT_STRLEN 15 +#define GLUSTERFS_WRITE_IS_APPEND "glusterfs.write-is-append" #define GLUSTERFS_OPEN_FD_COUNT "glusterfs.open-fd-count" #define GLUSTERFS_INODELK_COUNT "glusterfs.inodelk-count" #define GLUSTERFS_ENTRYLK_COUNT "glusterfs.entrylk-count" diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index 030256417..691c1d4d7 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -4487,6 +4487,8 @@ afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno) goto out; } + local->append_write = _gf_false; + ret = 0; out: return ret; diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c index 68570f15a..a74416768 100644 --- a/xlators/cluster/afr/src/afr-inode-write.c +++ b/xlators/cluster/afr/src/afr-inode-write.c @@ -139,6 +139,7 @@ afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int read_child = 0; int ret = 0; uint32_t open_fd_count = 0; + uint32_t write_is_append = 0; local = frame->local; @@ -173,6 +174,13 @@ afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->open_fd_count = open_fd_count; local->update_open_fd_count = _gf_true; } + + write_is_append = 0; + ret = dict_get_uint32 (xdata, + GLUSTERFS_WRITE_IS_APPEND, + &write_is_append); + if (ret || !write_is_append) + local->append_write = _gf_false; } if ((local->success_count == 0) || @@ -192,7 +200,13 @@ afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (local->update_open_fd_count) afr_handle_open_fd_count (frame, this); - if (!local->stable_write) + if (!local->stable_write && !local->append_write) + /* An appended write removes the necessity to + fsync() the file. This is because self-heal + has the logic to check for larger file when + the xattrs are not reliably pointing at + a stale file. 
+ */ afr_fd_report_unstable_write (this, local->fd); afr_writev_handle_short_writes (frame, this); @@ -251,6 +265,13 @@ afr_writev_wind (call_frame_t *frame, xlator_t *this) if (xdata) { ret = dict_set_uint32 (xdata, GLUSTERFS_OPEN_FD_COUNT, sizeof (uint32_t)); + ret = dict_set_uint32 (xdata, GLUSTERFS_WRITE_IS_APPEND, + 0); + /* Set append_write to be true speculatively. If on any + server it turns out not to be true, we unset it in the + callback. + */ + local->append_write = _gf_true; } for (i = 0; i < priv->child_count; i++) { diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index 49d281aca..2023613f8 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -496,6 +496,11 @@ typedef struct _afr_local { */ gf_boolean_t stable_write; + /* This write appended to the file. Not necessarily O_APPEND, + just means the offset of write was at the end of file. + */ + gf_boolean_t append_write; + /* This struct contains the arguments for the "continuation" (scheme-like) of fops diff --git a/xlators/storage/posix/src/posix.c b/xlators/storage/posix/src/posix.c index 49d1effbc..fc7c259e9 100644 --- a/xlators/storage/posix/src/posix.c +++ b/xlators/storage/posix/src/posix.c @@ -2199,7 +2199,7 @@ err: } dict_t* -_fill_open_fd_count (fd_t *fd, dict_t *xdata, xlator_t *this) +_fill_writev_xdata (fd_t *fd, dict_t *xdata, xlator_t *this, int is_append) { dict_t *rsp_xdata = NULL; int32_t ret = 0; @@ -2229,6 +2229,14 @@ _fill_open_fd_count (fd_t *fd, dict_t *xdata, xlator_t *this) "dictionary value for %s", uuid_utoa (fd->inode->gfid), GLUSTERFS_OPEN_FD_COUNT); } + + ret = dict_set_uint32 (rsp_xdata, GLUSTERFS_WRITE_IS_APPEND, + is_append); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, "%s: Failed to set " + "dictionary value for %s", uuid_utoa (fd->inode->gfid), + GLUSTERFS_WRITE_IS_APPEND); + } out: return rsp_xdata; } @@ -2247,6 +2255,8 @@ posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt postop = 
{0,}; int ret = -1; dict_t *rsp_xdata = NULL; + int is_append = 0; + gf_boolean_t locked = _gf_false; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); @@ -2268,6 +2278,17 @@ posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, _fd = pfd->fd; + if (xdata && dict_get (xdata, GLUSTERFS_WRITE_IS_APPEND)) { + /* The write_is_append check and write must happen + atomically. Else another write can overtake this + write after the check and get written earlier. + + So lock before preop-stat and unlock after write. + */ + locked = _gf_true; + LOCK(&fd->inode->lock); + } + op_ret = posix_fdstat (this, _fd, &preop); if (op_ret == -1) { op_errno = errno; @@ -2277,8 +2298,19 @@ posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, goto out; } + if (locked) { + if (preop.ia_size == offset || (fd->flags & O_APPEND)) + is_append = 1; + } + op_ret = __posix_writev (_fd, vector, count, offset, (pfd->flags & O_DIRECT)); + + if (locked) { + UNLOCK (&fd->inode->lock); + locked = _gf_false; + } + if (op_ret < 0) { op_errno = -op_ret; op_ret = -1; @@ -2294,7 +2326,7 @@ posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, UNLOCK (&priv->lock); if (op_ret >= 0) { - rsp_xdata = _fill_open_fd_count (fd, xdata, this); + rsp_xdata = _fill_writev_xdata (fd, xdata, this, is_append); /* wiretv successful, we also need to get the stat of * the file we wrote to */ @@ -2324,6 +2356,11 @@ posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, out: + if (locked) { + UNLOCK (&fd->inode->lock); + locked = _gf_false; + } + STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, &preop, &postop, rsp_xdata); -- cgit