summaryrefslogtreecommitdiffstats
path: root/xlators/performance/write-behind/src/write-behind.c
diff options
context:
space:
mode:
authorNiels de Vos <ndevos@redhat.com>2014-02-28 10:03:14 +0100
committerNiels de Vos <ndevos@redhat.com>2014-05-09 07:33:39 -0700
commita256094f34df6500bcd19845fd0f32a9f3690751 (patch)
tree3b7afbb9e12226c120188cf5ce45f7f0aeee43b4 /xlators/performance/write-behind/src/write-behind.c
parent94282fcae8ecb0d53399baa4e96f3aaa443979f0 (diff)
write-behind: track filesize when doing extending writes
A program that calls mmap() on a newly created sparse file, may receive a SIGBUS signal. If SIGBUS is not handled, a segmentation fault will occur and the program will exit. A bug in the write-behind translator can cause the creation of a sparse file created with open(), seek(), write() to be cached. The last write() may not be sent to the server, until write-behind deems this necessary. * open(.., O_TRUNC, ...)/creat() the file, it is 0 bytes big * seek() into the file, use offset 31 * write() 1 byte to the file * the range from byte 0-30 are unwritten so called 'sparse' The following illustration tries to capture this: Legend: [ = start of file _ = unallocated/unwritten bytes # = allocated bytes in the file ] = end of file [_______________#] | | '- byte 0 '- byte 31 Without this change, reading from byte 0-30 will return an error, and reading the same area through an mmap()'d pointer will trigger a SIGBUS. Reading from this range did not trigger the outstanding write() to be flushed. The brick that receives the read() (translated over the network from mmap()) does not know that the file has been extended, and returns -EINVAL. This error gets transported back from the brick to the glusterfs-fuse client, and translated by the Linux kernel/VFS into SIGBUS triggered by mmap(). In order to solve this, a new attribute to the wb_inode structure is introduced; the current size of the file. All FOPs that can modify the size, are expected to update wb_inode->size. This makes it possible for extending writes with an offset bigger than EOF to mark the unwritten area as modified/pending. Cherry picked from commit b0515e2a4a08b657ef7e9715fb8c6222c700e78c: > Change-Id: If5ba6646732e6be26568541ea9b12852a5d0b988 > BUG: 1058663 > Signed-off-by: Niels de Vos <ndevos@redhat.com> > Reviewed-on: http://review.gluster.org/6835 > Tested-by: Gluster Build System <jenkins@build.gluster.com> > Reviewed-by: Raghavendra G <rgowdapp@redhat.com> > Reviewed-by: Anand Avati <avati@redhat.com> Change-Id: Ic73288202c6cb37d8c066628e9c489c6fade49bd BUG: 1071191 Signed-off-by: Niels de Vos <ndevos@redhat.com> Reviewed-on: http://review.gluster.org/7166 Tested-by: Gluster Build System <jenkins@build.gluster.com>
Diffstat (limited to 'xlators/performance/write-behind/src/write-behind.c')
-rw-r--r--xlators/performance/write-behind/src/write-behind.c165
1 files changed, 161 insertions, 4 deletions
diff --git a/xlators/performance/write-behind/src/write-behind.c b/xlators/performance/write-behind/src/write-behind.c
index 2be211fbf7b..9586c5ddffc 100644
--- a/xlators/performance/write-behind/src/write-behind.c
+++ b/xlators/performance/write-behind/src/write-behind.c
@@ -108,6 +108,7 @@ typedef struct wb_inode {
after it arrived (i.e, those that have a
liability generation higher than itself)
*/
+ size_t size; /* Size of the file to catch write after EOF. */
gf_lock_t lock;
xlator_t *this;
} wb_inode_t;
@@ -503,8 +504,23 @@ wb_enqueue_common (wb_inode_t *wb_inode, call_stub_t *stub, int tempted)
switch (stub->fop) {
case GF_FOP_WRITE:
- req->ordering.off = stub->args.offset;
- req->ordering.size = req->write_size;
+ LOCK (&wb_inode->lock);
+ {
+ if (wb_inode->size < stub->args.offset) {
+ req->ordering.off = wb_inode->size;
+ req->ordering.size = stub->args.offset
+ + req->write_size
+ - wb_inode->size;
+ } else {
+ req->ordering.off = stub->args.offset;
+ req->ordering.size = req->write_size;
+ }
+
+ if (wb_inode->size < stub->args.offset + req->write_size)
+ wb_inode->size = stub->args.offset
+ + req->write_size;
+ }
+ UNLOCK (&wb_inode->lock);
req->fd = fd_ref (stub->args.fd);
@@ -519,10 +535,20 @@ wb_enqueue_common (wb_inode_t *wb_inode, call_stub_t *stub, int tempted)
case GF_FOP_TRUNCATE:
req->ordering.off = stub->args.offset;
req->ordering.size = 0; /* till infinity */
+ LOCK (&wb_inode->lock);
+ {
+ wb_inode->size = req->ordering.off;
+ }
+ UNLOCK (&wb_inode->lock);
break;
case GF_FOP_FTRUNCATE:
req->ordering.off = stub->args.offset;
req->ordering.size = 0; /* till infinity */
+ LOCK (&wb_inode->lock);
+ {
+ wb_inode->size = req->ordering.off;
+ }
+ UNLOCK (&wb_inode->lock);
req->fd = fd_ref (stub->args.fd);
@@ -1196,6 +1222,20 @@ wb_process_queue (wb_inode_t *wb_inode)
}
+void
+wb_set_inode_size(wb_inode_t *wb_inode, struct iatt *postbuf)
+{
+ GF_ASSERT (wb_inode);
+ GF_ASSERT (postbuf);
+
+ LOCK (&wb_inode->lock);
+ {
+ wb_inode->size = postbuf->ia_size;
+ }
+ UNLOCK (&wb_inode->lock);
+}
+
+
int
wb_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno,
@@ -1576,11 +1616,29 @@ noqueue:
}
+int32_t
+wb_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ GF_ASSERT (frame->local);
+
+ if (op_ret == 0)
+ wb_set_inode_size (frame->local, postbuf);
+
+ frame->local = NULL;
+
+ STACK_UNWIND_STRICT (truncate, frame, op_ret, op_errno, prebuf,
+ postbuf, xdata);
+ return 0;
+}
+
+
int
wb_truncate_helper (call_frame_t *frame, xlator_t *this, loc_t *loc,
off_t offset, dict_t *xdata)
{
- STACK_WIND (frame, default_truncate_cbk, FIRST_CHILD(this),
+ STACK_WIND (frame, wb_truncate_cbk, FIRST_CHILD(this),
FIRST_CHILD(this)->fops->truncate, loc, offset, xdata);
return 0;
}
@@ -1597,6 +1655,8 @@ wb_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
if (!wb_inode)
goto unwind;
+ frame->local = wb_inode;
+
stub = fop_truncate_stub (frame, wb_truncate_helper, loc,
offset, xdata);
if (!stub)
@@ -1619,11 +1679,29 @@ unwind:
}
+int32_t
+wb_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ GF_ASSERT (frame->local);
+
+ if (op_ret == 0)
+ wb_set_inode_size (frame->local, postbuf);
+
+ frame->local = NULL;
+
+ STACK_UNWIND_STRICT (ftruncate, frame, op_ret, op_errno, prebuf,
+ postbuf, xdata);
+ return 0;
+}
+
+
int
wb_ftruncate_helper (call_frame_t *frame, xlator_t *this, fd_t *fd,
off_t offset, dict_t *xdata)
{
- STACK_WIND (frame, default_ftruncate_cbk, FIRST_CHILD(this),
+ STACK_WIND (frame, wb_ftruncate_cbk, FIRST_CHILD(this),
FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata);
return 0;
}
@@ -1646,6 +1724,8 @@ wb_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
if (wb_fd_err (fd, this, &op_errno))
goto unwind;
+ frame->local = wb_inode;
+
stub = fop_ftruncate_stub (frame, wb_ftruncate_helper, fd,
offset, xdata);
if (!stub) {
@@ -1663,6 +1743,8 @@ wb_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
return 0;
unwind:
+ frame->local = NULL;
+
STACK_UNWIND_STRICT (ftruncate, frame, -1, op_errno, NULL, NULL, NULL);
if (stub)
@@ -1763,6 +1845,81 @@ noqueue:
}
+int32_t
+wb_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+ mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata)
+{
+ wb_inode_t *wb_inode = NULL;
+
+ wb_inode = wb_inode_create (this, fd->inode);
+ if (!wb_inode)
+ goto unwind;
+
+ if (((flags & O_RDWR) || (flags & O_WRONLY)) && (flags & O_TRUNC))
+ wb_inode->size = 0;
+
+ STACK_WIND_TAIL (frame, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->create, loc, flags, mode,
+ umask, fd, xdata);
+ return 0;
+
+unwind:
+ STACK_UNWIND_STRICT (create, frame, -1, ENOMEM, NULL, NULL, NULL, NULL,
+ NULL, NULL);
+ return 0;
+}
+
+
+int32_t
+wb_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+ fd_t *fd, dict_t *xdata)
+{
+ wb_inode_t *wb_inode = NULL;
+
+ wb_inode = wb_inode_create (this, fd->inode);
+ if (!wb_inode)
+ goto unwind;
+
+ if (((flags & O_RDWR) || (flags & O_WRONLY)) && (flags & O_TRUNC))
+ wb_inode->size = 0;
+
+ STACK_WIND_TAIL (frame, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata);
+ return 0;
+
+unwind:
+ STACK_UNWIND_STRICT (open, frame, -1, ENOMEM, NULL, NULL);
+ return 0;
+}
+
+
+int32_t
+wb_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, dict_t *xdata, struct iatt *postparent)
+{
+ if (op_ret == 0) {
+ wb_inode_t *wb_inode = wb_inode_ctx_get (this, inode);
+ if (wb_inode)
+ wb_set_inode_size (wb_inode, buf);
+ }
+
+ STACK_UNWIND_STRICT (lookup, frame, op_ret, op_errno, inode, buf,
+ xdata, postparent);
+ return 0;
+}
+
+
+int32_t
+wb_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ dict_t *xdata)
+{
+ STACK_WIND (frame, wb_lookup_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->lookup, loc, xdata);
+ return 0;
+}
+
+
int
wb_forget (xlator_t *this, inode_t *inode)
{