summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJeff Darcy <jdarcy@redhat.com>2012-01-19 17:49:42 -0500
committerAnand Avati <avati@gluster.com>2012-01-30 05:09:35 -0800
commitc3aa99d907591f72b6302287b9b8899514fb52f1 (patch)
tree40dceb4deb7e1987caf4004eb76ead91ef4e6fd6
parent20d74c540879d3994d56b9baf7044c79ae5df5e3 (diff)
Fix race between read-ahead and write.
Change-Id: I0ed1aca585733302b5e3840f392849e12f0b0f0d BUG: 783313 Signed-off-by: Jeff Darcy <jdarcy@redhat.com> Reviewed-on: http://review.gluster.com/2666 Tested-by: Gluster Build System <jenkins@build.gluster.com> Reviewed-by: Anand Avati <avati@gluster.com>
-rw-r--r--xlators/performance/read-ahead/src/page.c31
-rw-r--r--xlators/performance/read-ahead/src/read-ahead.c60
-rw-r--r--xlators/performance/read-ahead/src/read-ahead.h3
3 files changed, 68 insertions, 26 deletions
diff --git a/xlators/performance/read-ahead/src/page.c b/xlators/performance/read-ahead/src/page.c
index 9778ef542..0c9a61853 100644
--- a/xlators/performance/read-ahead/src/page.c
+++ b/xlators/performance/read-ahead/src/page.c
@@ -175,14 +175,8 @@ ra_fault_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (op_ret >= 0)
file->stbuf = *stbuf;
- if (op_ret < 0) {
- page = ra_page_get (file, pending_offset);
- if (page)
- waitq = ra_page_error (page, op_ret, op_errno);
- goto unlock;
- }
-
page = ra_page_get (file, pending_offset);
+
if (!page) {
gf_log (this->name, GF_LOG_TRACE,
"wasted copy: %"PRId64"[+%"PRId64"] file=%p",
@@ -190,6 +184,29 @@ ra_fault_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
goto unlock;
}
+ /*
+ * "Dirty" means that the request was a pure read-ahead; it's
+ * set for requests we issue ourselves, and cleared when user
+ * requests are issued or put on the waitq. "Poisoned" means
+ * that we got a write while a read was still in flight, and we
+ * couldn't stop it so we marked it instead. If it's both
+ * dirty and poisoned by the time we get here, we cancel its
+ * effect so that a subsequent user read doesn't get data that
+ * we know is stale (because we made it stale ourselves). We
+ * can't use ESTALE because that has special significance.
+ * ECANCELED has no such special meaning, and is close to what
+ * we're trying to indicate.
+ */
+ if (page->dirty && page->poisoned) {
+ op_ret = -1;
+ op_errno = ECANCELED;
+ }
+
+ if (op_ret < 0) {
+ waitq = ra_page_error (page, op_ret, op_errno);
+ goto unlock;
+ }
+
if (page->vector) {
iobref_unref (page->iobref);
GF_FREE (page->vector);
diff --git a/xlators/performance/read-ahead/src/read-ahead.c b/xlators/performance/read-ahead/src/read-ahead.c
index 37f34f2eb..928d8b7bd 100644
--- a/xlators/performance/read-ahead/src/read-ahead.c
+++ b/xlators/performance/read-ahead/src/read-ahead.c
@@ -231,7 +231,8 @@ ra_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
*/
static void
-flush_region (call_frame_t *frame, ra_file_t *file, off_t offset, off_t size)
+flush_region (call_frame_t *frame, ra_file_t *file, off_t offset, off_t size,
+ int for_write)
{
ra_page_t *trav = NULL;
ra_page_t *next = NULL;
@@ -243,8 +244,13 @@ flush_region (call_frame_t *frame, ra_file_t *file, off_t offset, off_t size)
&& trav->offset < (offset + size)) {
next = trav->next;
- if (trav->offset >= offset && !trav->waitq) {
- ra_page_purge (trav);
+ if (trav->offset >= offset) {
+ if (!trav->waitq) {
+ ra_page_purge (trav);
+ }
+ else if (for_write) {
+ trav->poisoned = 1;
+ }
}
trav = next;
}
@@ -392,15 +398,15 @@ dispatch_requests (call_frame_t *frame, ra_file_t *file)
trav = ra_page_get (file, trav_offset);
if (!trav) {
trav = ra_page_create (file, trav_offset);
+ if (!trav) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ goto unlock;
+ }
fault = 1;
need_atime_update = 0;
}
-
- if (!trav) {
- local->op_ret = -1;
- local->op_errno = ENOMEM;
- goto unlock;
- }
+ trav->dirty = 0;
if (trav->ready) {
gf_log (frame->this->name, GF_LOG_TRACE,
@@ -513,7 +519,7 @@ ra_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
}
if (!expected_offset) {
- flush_region (frame, file, 0, file->pages.prev->offset + 1);
+ flush_region (frame, file, 0, file->pages.prev->offset + 1, 0);
}
local = (void *) GF_CALLOC (1, sizeof (*local), gf_ra_mt_ra_local_t);
@@ -536,7 +542,7 @@ ra_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
dispatch_requests (frame, file);
- flush_region (frame, file, 0, floor (offset, file->page_size));
+ flush_region (frame, file, 0, floor (offset, file->page_size), 0);
read_ahead (frame, file);
@@ -596,7 +602,7 @@ ra_flush (call_frame_t *frame, xlator_t *this, fd_t *fd)
file = (ra_file_t *)(long)tmp_file;
if (file) {
- flush_region (frame, file, 0, file->pages.prev->offset+1);
+ flush_region (frame, file, 0, file->pages.prev->offset+1, 0);
}
STACK_WIND (frame, ra_flush_cbk, FIRST_CHILD (this),
@@ -624,7 +630,7 @@ ra_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync)
file = (ra_file_t *)(long)tmp_file;
if (file) {
- flush_region (frame, file, 0, file->pages.prev->offset+1);
+ flush_region (frame, file, 0, file->pages.prev->offset+1, 0);
}
STACK_WIND (frame, ra_fsync_cbk, FIRST_CHILD (this),
@@ -649,7 +655,7 @@ ra_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
file = frame->local;
if (file) {
- flush_region (frame, file, 0, file->pages.prev->offset+1);
+ flush_region (frame, file, 0, file->pages.prev->offset+1, 1);
}
frame->local = NULL;
@@ -673,7 +679,7 @@ ra_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector,
fd_ctx_get (fd, this, &tmp_file);
file = (ra_file_t *)(long)tmp_file;
if (file) {
- flush_region (frame, file, 0, file->pages.prev->offset+1);
+ flush_region (frame, file, 0, file->pages.prev->offset+1, 1);
frame->local = file;
/* reset the read-ahead counters too */
file->expected = file->page_count = 0;
@@ -739,8 +745,16 @@ ra_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset)
if (!file)
continue;
+ /*
+ * Truncation invalidates reads just like writing does.
+ * TBD: this seems to flush more than it should. The
+ * only time we should flush at all is when we're
+ * shortening (not lengthening) the file, and then only
+ * from new EOF to old EOF. The same problem exists in
+ * ra_ftruncate.
+ */
flush_region (frame, file, 0,
- file->pages.prev->offset + 1);
+ file->pages.prev->offset + 1, 1);
}
}
UNLOCK (&inode->lock);
@@ -775,6 +789,8 @@ ra_page_dump (struct ra_page *page)
gf_proc_dump_write ("dirty", "%s", page->dirty ? "yes" : "no");
+ gf_proc_dump_write ("poisoned", "%s", page->poisoned ? "yes" : "no");
+
gf_proc_dump_write ("ready", "%s", page->ready ? "yes" : "no");
for (trav = page->waitq; trav; trav = trav->next) {
@@ -870,7 +886,7 @@ ra_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd)
if (!file)
continue;
flush_region (frame, file, 0,
- file->pages.prev->offset + 1);
+ file->pages.prev->offset + 1, 0);
}
}
UNLOCK (&inode->lock);
@@ -907,8 +923,16 @@ ra_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset)
file = (ra_file_t *)(long)tmp_file;
if (!file)
continue;
+ /*
+ * Truncation invalidates reads just like writing does.
+ * TBD: this seems to flush more than it should. The
+ * only time we should flush at all is when we're
+ * shortening (not lengthening) the file, and then only
+ * from new EOF to old EOF. The same problem exists in
+ * ra_truncate.
+ */
flush_region (frame, file, 0,
- file->pages.prev->offset + 1);
+ file->pages.prev->offset + 1, 1);
}
}
UNLOCK (&inode->lock);
diff --git a/xlators/performance/read-ahead/src/read-ahead.h b/xlators/performance/read-ahead/src/read-ahead.h
index d0bbcde81..f5e73cb66 100644
--- a/xlators/performance/read-ahead/src/read-ahead.h
+++ b/xlators/performance/read-ahead/src/read-ahead.h
@@ -76,7 +76,8 @@ struct ra_page {
struct ra_page *next;
struct ra_page *prev;
struct ra_file *file;
- char dirty;
+ char dirty; /* Internal request, not from user. */
+ char poisoned; /* Pending read invalidated by write. */
char ready;
struct iovec *vector;
int32_t count;