summaryrefslogtreecommitdiffstats
path: root/xlators/performance/read-ahead
diff options
context:
space:
mode:
authorVikas Gorur <vikas@zresearch.com>2009-02-18 17:36:07 +0530
committerVikas Gorur <vikas@zresearch.com>2009-02-18 17:36:07 +0530
commit77adf4cd648dce41f89469dd185deec6b6b53a0b (patch)
tree02e155a5753b398ee572b45793f889b538efab6b /xlators/performance/read-ahead
parentf3b2e6580e5663292ee113c741343c8a43ee133f (diff)
Added all files
Diffstat (limited to 'xlators/performance/read-ahead')
-rw-r--r--xlators/performance/read-ahead/Makefile.am3
-rw-r--r--xlators/performance/read-ahead/src/Makefile.am14
-rw-r--r--xlators/performance/read-ahead/src/page.c487
-rw-r--r--xlators/performance/read-ahead/src/read-ahead.c890
-rw-r--r--xlators/performance/read-ahead/src/read-ahead.h194
5 files changed, 1588 insertions, 0 deletions
diff --git a/xlators/performance/read-ahead/Makefile.am b/xlators/performance/read-ahead/Makefile.am
new file mode 100644
index 00000000000..d471a3f9243
--- /dev/null
+++ b/xlators/performance/read-ahead/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = src
+
+CLEANFILES =
diff --git a/xlators/performance/read-ahead/src/Makefile.am b/xlators/performance/read-ahead/src/Makefile.am
new file mode 100644
index 00000000000..7bb90228227
--- /dev/null
+++ b/xlators/performance/read-ahead/src/Makefile.am
@@ -0,0 +1,14 @@
+xlator_LTLIBRARIES = read-ahead.la
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance
+
+read_ahead_la_LDFLAGS = -module -avoidversion
+
+read_ahead_la_SOURCES = read-ahead.c page.c
+read_ahead_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+noinst_HEADERS = read-ahead.h
+
+AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS)\
+ -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS)
+
+CLEANFILES =
diff --git a/xlators/performance/read-ahead/src/page.c b/xlators/performance/read-ahead/src/page.c
new file mode 100644
index 00000000000..3b8d4d2093e
--- /dev/null
+++ b/xlators/performance/read-ahead/src/page.c
@@ -0,0 +1,487 @@
+/*
+ Copyright (c) 2006, 2007, 2008, 2009 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "logging.h"
+#include "dict.h"
+#include "xlator.h"
+#include "read-ahead.h"
+#include <assert.h>
+
+
+ra_page_t *
+ra_page_get (ra_file_t *file,
+ off_t offset)
+{
+ ra_page_t *page = NULL;
+ off_t rounded_offset = 0;
+
+ page = file->pages.next;
+ rounded_offset = floor (offset, file->page_size);
+
+ while (page != &file->pages && page->offset < rounded_offset)
+ page = page->next;
+
+ if (page == &file->pages || page->offset != rounded_offset)
+ page = NULL;
+
+ return page;
+}
+
+
+ra_page_t *
+ra_page_create (ra_file_t *file, off_t offset)
+{
+ ra_page_t *page = NULL;
+ off_t rounded_offset = 0;
+ ra_page_t *newpage = NULL;
+
+ page = file->pages.next;
+ rounded_offset = floor (offset, file->page_size);
+
+ while (page != &file->pages && page->offset < rounded_offset)
+ page = page->next;
+
+ if (page == &file->pages || page->offset != rounded_offset) {
+ newpage = CALLOC (1, sizeof (*newpage));
+ if (!newpage)
+ return NULL;
+
+ newpage->offset = rounded_offset;
+ newpage->prev = page->prev;
+ newpage->next = page;
+ newpage->file = file;
+ page->prev->next = newpage;
+ page->prev = newpage;
+
+ page = newpage;
+ }
+
+ return page;
+}
+
+
+void
+ra_wait_on_page (ra_page_t *page, call_frame_t *frame)
+{
+ ra_waitq_t *waitq = NULL;
+ ra_local_t *local = NULL;
+
+
+ local = frame->local;
+ waitq = CALLOC (1, sizeof (*waitq));
+ if (!waitq) {
+ gf_log (frame->this->name, GF_LOG_ERROR,
+ "out of memory :(");
+ return;
+ }
+
+ waitq->data = frame;
+ waitq->next = page->waitq;
+ page->waitq = waitq;
+
+ ra_local_lock (local);
+ {
+ local->wait_count++;
+ }
+ ra_local_unlock (local);
+}
+
+
+void
+ra_waitq_return (ra_waitq_t *waitq)
+{
+ ra_waitq_t *trav = NULL;
+ ra_waitq_t *next = NULL;
+ call_frame_t *frame = NULL;
+
+ for (trav = waitq; trav; trav = next) {
+ next = trav->next;
+
+ frame = trav->data;
+ ra_frame_return (frame);
+ free (trav);
+ }
+}
+
+
+int
+ra_fault_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iovec *vector,
+ int32_t count, struct stat *stbuf)
+{
+ ra_local_t *local = NULL;
+ off_t pending_offset = 0;
+ ra_file_t *file = NULL;
+ ra_page_t *page = NULL;
+ off_t trav_offset = 0;
+ size_t payload_size = 0;
+ ra_waitq_t *waitq = NULL;
+ fd_t *fd = NULL;
+ int ret = 0;
+ uint64_t tmp_file = 0;
+
+ local = frame->local;
+ fd = local->fd;
+
+ ret = fd_ctx_get (fd, this, &tmp_file);
+
+ file = (ra_file_t *)(long)tmp_file;
+ pending_offset = local->pending_offset;
+ trav_offset = pending_offset;
+ payload_size = op_ret;
+
+ ra_file_lock (file);
+ {
+ if (op_ret >= 0)
+ file->stbuf = *stbuf;
+
+ if (op_ret < 0) {
+ page = ra_page_get (file, pending_offset);
+ if (page)
+ waitq = ra_page_error (page, op_ret, op_errno);
+ goto unlock;
+ }
+
+ page = ra_page_get (file, pending_offset);
+ if (!page) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "wasted copy: %"PRId64"[+%"PRId64"] file=%p",
+ pending_offset, file->page_size, file);
+ goto unlock;
+ }
+
+ if (page->vector) {
+ dict_unref (page->ref);
+ free (page->vector);
+ }
+
+ page->vector = iov_dup (vector, count);
+ page->count = count;
+ page->ref = dict_ref (frame->root->rsp_refs);
+ page->ready = 1;
+
+ page->size = iov_length (vector, count);
+
+ waitq = ra_page_wakeup (page);
+ }
+unlock:
+ ra_file_unlock (file);
+
+ ra_waitq_return (waitq);
+
+ fd_unref (local->fd);
+
+ free (frame->local);
+ frame->local = NULL;
+
+ STACK_DESTROY (frame->root);
+ return 0;
+}
+
+
+void
+ra_page_fault (ra_file_t *file,
+ call_frame_t *frame,
+ off_t offset)
+{
+ call_frame_t *fault_frame = NULL;
+ ra_local_t *fault_local = NULL;
+
+ fault_frame = copy_frame (frame);
+ fault_local = CALLOC (1, sizeof (ra_local_t));
+
+ fault_frame->local = fault_local;
+ fault_local->pending_offset = offset;
+ fault_local->pending_size = file->page_size;
+
+ fault_local->fd = fd_ref (file->fd);
+
+ STACK_WIND (fault_frame, ra_fault_cbk,
+ FIRST_CHILD (fault_frame->this),
+ FIRST_CHILD (fault_frame->this)->fops->readv,
+ file->fd, file->page_size, offset);
+ return;
+}
+
+void
+ra_frame_fill (ra_page_t *page, call_frame_t *frame)
+{
+ ra_local_t *local = NULL;
+ ra_fill_t *fill = NULL;
+ off_t src_offset = 0;
+ off_t dst_offset = 0;
+ ssize_t copy_size = 0;
+ ra_fill_t *new = NULL;
+
+
+ local = frame->local;
+ fill = &local->fill;
+
+ if (local->op_ret != -1 && page->size) {
+ if (local->offset > page->offset)
+ src_offset = local->offset - page->offset;
+ else
+ dst_offset = page->offset - local->offset;
+
+ copy_size = min (page->size - src_offset,
+ local->size - dst_offset);
+
+ if (copy_size < 0) {
+ /* if page contains fewer bytes and the required offset
+ is beyond the page size in the page */
+ copy_size = src_offset = 0;
+ }
+
+ fill = fill->next;
+ while (fill != &local->fill) {
+ if (fill->offset > page->offset) {
+ break;
+ }
+ fill = fill->next;
+ }
+
+ new = CALLOC (1, sizeof (*new));
+
+ new->offset = page->offset;
+ new->size = copy_size;
+ new->refs = dict_ref (page->ref);
+ new->count = iov_subset (page->vector, page->count,
+ src_offset, src_offset+copy_size,
+ NULL);
+ new->vector = CALLOC (new->count, sizeof (struct iovec));
+
+ new->count = iov_subset (page->vector, page->count,
+ src_offset, src_offset+copy_size,
+ new->vector);
+
+ new->next = fill;
+ new->prev = new->next->prev;
+ new->next->prev = new;
+ new->prev->next = new;
+
+ local->op_ret += copy_size;
+ }
+}
+
+
+void
+ra_frame_unwind (call_frame_t *frame)
+{
+ ra_local_t *local = NULL;
+ ra_fill_t *fill = NULL;
+ int32_t count = 0;
+ struct iovec *vector;
+ int32_t copied = 0;
+ dict_t *refs = NULL;
+ ra_fill_t *next = NULL;
+ fd_t *fd = NULL;
+ ra_file_t *file = NULL;
+ int ret = 0;
+ uint64_t tmp_file = 0;
+
+ local = frame->local;
+ fill = local->fill.next;
+
+ refs = get_new_dict ();
+
+ frame->local = NULL;
+
+ while (fill != &local->fill) {
+ count += fill->count;
+ fill = fill->next;
+ }
+
+ vector = CALLOC (count, sizeof (*vector));
+
+ fill = local->fill.next;
+
+ while (fill != &local->fill) {
+ next = fill->next;
+
+ memcpy (((char *)vector) + copied, fill->vector,
+ fill->count * sizeof (*vector));
+
+ copied += (fill->count * sizeof (*vector));
+ dict_copy (fill->refs, refs);
+
+ fill->next->prev = fill->prev;
+ fill->prev->next = fill->prev;
+
+ dict_unref (fill->refs);
+ free (fill->vector);
+ free (fill);
+
+ fill = next;
+ }
+
+ frame->root->rsp_refs = dict_ref (refs);
+
+ fd = local->fd;
+ ret = fd_ctx_get (fd, frame->this, &tmp_file);
+ file = (ra_file_t *)(long)tmp_file;
+
+ STACK_UNWIND (frame, local->op_ret, local->op_errno,
+ vector, count, &file->stbuf);
+
+ dict_unref (refs);
+ pthread_mutex_destroy (&local->local_lock);
+ free (local);
+ free (vector);
+
+ return;
+}
+
+/*
+ * ra_frame_return -
+ * @frame:
+ *
+ */
+void
+ra_frame_return (call_frame_t *frame)
+{
+ ra_local_t *local = NULL;
+ int32_t wait_count = 0;
+
+ local = frame->local;
+ assert (local->wait_count > 0);
+
+ ra_local_lock (local);
+ {
+ wait_count = --local->wait_count;
+ }
+ ra_local_unlock (local);
+
+ if (!wait_count)
+ ra_frame_unwind (frame);
+
+ return;
+}
+
+/*
+ * ra_page_wakeup -
+ * @page:
+ *
+ */
+ra_waitq_t *
+ra_page_wakeup (ra_page_t *page)
+{
+ ra_waitq_t *waitq = NULL, *trav = NULL;
+ call_frame_t *frame;
+
+ waitq = page->waitq;
+ page->waitq = NULL;
+
+ trav = waitq;
+ for (trav = waitq; trav; trav = trav->next) {
+ frame = trav->data;
+ ra_frame_fill (page, frame);
+ }
+
+ return waitq;
+}
+
+/*
+ * ra_page_purge -
+ * @page:
+ *
+ */
+void
+ra_page_purge (ra_page_t *page)
+{
+ page->prev->next = page->next;
+ page->next->prev = page->prev;
+
+ if (page->ref) {
+ dict_unref (page->ref);
+ }
+ free (page->vector);
+ free (page);
+}
+
+/*
+ * ra_page_error -
+ * @page:
+ * @op_ret:
+ * @op_errno:
+ *
+ */
+ra_waitq_t *
+ra_page_error (ra_page_t *page, int32_t op_ret, int32_t op_errno)
+{
+
+ ra_waitq_t *waitq = NULL;
+ ra_waitq_t *trav = NULL;
+ call_frame_t *frame = NULL;
+ ra_local_t *local = NULL;
+
+ waitq = page->waitq;
+ page->waitq = NULL;
+
+ trav = waitq;
+ for (trav = waitq; trav; trav = trav->next) {
+ frame = trav->data;
+
+ local = frame->local;
+ if (local->op_ret != -1) {
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+ }
+ }
+
+ ra_page_purge (page);
+
+ return waitq;
+}
+
+/*
+ * ra_file_destroy -
+ * @file:
+ *
+ */
+void
+ra_file_destroy (ra_file_t *file)
+{
+ ra_conf_t *conf = NULL;
+ ra_page_t *trav = NULL;
+
+ conf = file->conf;
+
+ ra_conf_lock (conf);
+ {
+ file->prev->next = file->next;
+ file->next->prev = file->prev;
+ }
+ ra_conf_unlock (conf);
+
+ trav = file->pages.next;
+ while (trav != &file->pages) {
+ ra_page_error (trav, -1, EINVAL);
+ trav = file->pages.next;
+ }
+
+ pthread_mutex_destroy (&file->file_lock);
+ free (file);
+}
+
diff --git a/xlators/performance/read-ahead/src/read-ahead.c b/xlators/performance/read-ahead/src/read-ahead.c
new file mode 100644
index 00000000000..0060e00fd41
--- /dev/null
+++ b/xlators/performance/read-ahead/src/read-ahead.c
@@ -0,0 +1,890 @@
+/*
+ Copyright (c) 2006, 2007, 2008, 2009 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+/*
+ TODO:
+ - handle O_DIRECT
+ - maintain offset, flush on lseek
+ - ensure efficient memory managment in case of random seek
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "logging.h"
+#include "dict.h"
+#include "xlator.h"
+#include "read-ahead.h"
+#include <assert.h>
+#include <sys/time.h>
+
+
+static void
+read_ahead (call_frame_t *frame,
+ ra_file_t *file);
+
+
+int
+ra_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, fd_t *fd)
+{
+ ra_conf_t *conf = NULL;
+ ra_file_t *file = NULL;
+ int ret = 0;
+
+ conf = this->private;
+
+ if (op_ret == -1) {
+ goto unwind;
+ }
+
+ file = CALLOC (1, sizeof (*file));
+ if (!file) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "out of memory :(");
+ goto unwind;
+ }
+
+ ret = fd_ctx_set (fd, this, (uint64_t)(long)file);
+
+ /* If mandatory locking has been enabled on this file,
+ we disable caching on it */
+
+ if ((fd->inode->st_mode & S_ISGID) && !(fd->inode->st_mode & S_IXGRP))
+ file->disabled = 1;
+
+ /* If O_DIRECT open, we disable caching on it */
+
+ if ((fd->flags & O_DIRECT) || (fd->flags & O_WRONLY))
+ file->disabled = 1;
+
+ file->offset = (unsigned long long) 0;
+ file->conf = conf;
+ file->pages.next = &file->pages;
+ file->pages.prev = &file->pages;
+ file->pages.offset = (unsigned long long) 0;
+ file->pages.file = file;
+
+ ra_conf_lock (conf);
+ {
+ file->next = conf->files.next;
+ conf->files.next = file;
+ file->next->prev = file;
+ file->prev = &conf->files;
+ }
+ ra_conf_unlock (conf);
+
+ file->fd = fd;
+ file->page_count = conf->page_count;
+ file->page_size = conf->page_size;
+ pthread_mutex_init (&file->file_lock, NULL);
+
+ if (!file->disabled) {
+ file->page_count = 1;
+ }
+
+unwind:
+ STACK_UNWIND (frame, op_ret, op_errno, fd);
+
+ return 0;
+}
+
+
+int
+ra_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ fd_t *fd, inode_t *inode, struct stat *buf)
+{
+ ra_conf_t *conf = NULL;
+ ra_file_t *file = NULL;
+ int ret = 0;
+
+ conf = this->private;
+
+ if (op_ret == -1) {
+ goto unwind;
+ }
+
+ file = CALLOC (1, sizeof (*file));
+ if (!file) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "out of memory :(");
+ goto unwind;
+ }
+
+ ret = fd_ctx_set (fd, this, (uint64_t)(long)file);
+
+ /* If mandatory locking has been enabled on this file,
+ we disable caching on it */
+
+ if ((fd->inode->st_mode & S_ISGID) && !(fd->inode->st_mode & S_IXGRP))
+ file->disabled = 1;
+
+ /* If O_DIRECT open, we disable caching on it */
+
+ if ((fd->flags & O_DIRECT) || (fd->flags & O_WRONLY))
+ file->disabled = 1;
+
+ file->offset = (unsigned long long) 0;
+ //file->size = fd->inode->buf.st_size;
+ file->conf = conf;
+ file->pages.next = &file->pages;
+ file->pages.prev = &file->pages;
+ file->pages.offset = (unsigned long long) 0;
+ file->pages.file = file;
+
+ ra_conf_lock (conf);
+ {
+ file->next = conf->files.next;
+ conf->files.next = file;
+ file->next->prev = file;
+ file->prev = &conf->files;
+ }
+ ra_conf_unlock (conf);
+
+ file->fd = fd;
+ file->page_count = conf->page_count;
+ file->page_size = conf->page_size;
+ pthread_mutex_init (&file->file_lock, NULL);
+
+unwind:
+ STACK_UNWIND (frame, op_ret, op_errno, fd, inode, buf);
+
+ return 0;
+}
+
+
+int
+ra_open (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, int32_t flags, fd_t *fd)
+{
+ STACK_WIND (frame, ra_open_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->open,
+ loc, flags, fd);
+
+ return 0;
+}
+
+int
+ra_create (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, int32_t flags, mode_t mode, fd_t *fd)
+{
+ STACK_WIND (frame, ra_create_cbk,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->create,
+ loc, flags, mode, fd);
+
+ return 0;
+}
+
+/* free cache pages between offset and offset+size,
+ does not touch pages with frames waiting on it
+*/
+
+static void
+flush_region (call_frame_t *frame,
+ ra_file_t *file,
+ off_t offset,
+ off_t size)
+{
+ ra_page_t *trav = NULL;
+ ra_page_t *next = NULL;
+
+
+ ra_file_lock (file);
+ {
+ trav = file->pages.next;
+ while (trav != &file->pages
+ && trav->offset < (offset + size)) {
+
+ next = trav->next;
+ if (trav->offset >= offset && !trav->waitq) {
+ ra_page_purge (trav);
+ }
+ trav = next;
+ }
+ }
+ ra_file_unlock (file);
+}
+
+
+
+int
+ra_release (xlator_t *this,
+ fd_t *fd)
+{
+ uint64_t tmp_file = 0;
+ int ret = 0;
+
+ ret = fd_ctx_del (fd, this, &tmp_file);
+
+ if (!ret) {
+ ra_file_destroy ((ra_file_t *)(long)tmp_file);
+ }
+
+ return 0;
+}
+
+
+void
+read_ahead (call_frame_t *frame, ra_file_t *file)
+{
+ off_t ra_offset = 0;
+ size_t ra_size = 0;
+ off_t trav_offset = 0;
+ ra_page_t *trav = NULL;
+ off_t cap = 0;
+ char fault = 0;
+
+ if (!file->page_count)
+ return;
+
+ ra_size = file->page_size * file->page_count;
+ ra_offset = floor (file->offset, file->page_size);
+ cap = file->size ? file->size : file->offset + ra_size;
+
+ while (ra_offset < min (file->offset + ra_size, cap)) {
+
+ ra_file_lock (file);
+ {
+ trav = ra_page_get (file, ra_offset);
+ }
+ ra_file_unlock (file);
+
+ if (!trav)
+ break;
+
+ ra_offset += file->page_size;
+ }
+
+ if (trav)
+ /* comfortable enough */
+ return;
+
+ trav_offset = ra_offset;
+
+ trav = file->pages.next;
+ cap = file->size ? file->size : ra_offset + ra_size;
+
+ while (trav_offset < min(ra_offset + ra_size, cap)) {
+ fault = 0;
+ ra_file_lock (file);
+ {
+ trav = ra_page_get (file, trav_offset);
+ if (!trav) {
+ fault = 1;
+ trav = ra_page_create (file, trav_offset);
+ if (trav)
+ trav->dirty = 1;
+ }
+ }
+ ra_file_unlock (file);
+
+ if (!trav) {
+ /* OUT OF MEMORY */
+ break;
+ }
+
+ if (fault) {
+ gf_log (frame->this->name, GF_LOG_DEBUG,
+ "RA at offset=%"PRId64, trav_offset);
+ ra_page_fault (file, frame, trav_offset);
+ }
+ trav_offset += file->page_size;
+ }
+
+ return;
+}
+
+
+int
+ra_need_atime_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iovec *vector,
+ int32_t count, struct stat *stbuf)
+{
+ STACK_DESTROY (frame->root);
+ return 0;
+}
+
+
+static void
+dispatch_requests (call_frame_t *frame,
+ ra_file_t *file)
+{
+ ra_local_t *local = NULL;
+ ra_conf_t *conf = NULL;
+ off_t rounded_offset = 0;
+ off_t rounded_end = 0;
+ off_t trav_offset = 0;
+ ra_page_t *trav = NULL;
+ call_frame_t *ra_frame = NULL;
+ char need_atime_update = 1;
+ char fault = 0;
+
+
+ local = frame->local;
+ conf = file->conf;
+
+ rounded_offset = floor (local->offset, file->page_size);
+ rounded_end = roof (local->offset + local->size, file->page_size);
+
+ trav_offset = rounded_offset;
+ trav = file->pages.next;
+
+ while (trav_offset < rounded_end) {
+ fault = 0;
+
+ ra_file_lock (file);
+ {
+ trav = ra_page_get (file, trav_offset);
+ if (!trav) {
+ trav = ra_page_create (file, trav_offset);
+ fault = 1;
+ need_atime_update = 0;
+ }
+
+ if (!trav)
+ goto unlock;
+
+ if (trav->ready) {
+ gf_log (frame->this->name, GF_LOG_DEBUG,
+ "HIT at offset=%"PRId64".",
+ trav_offset);
+ ra_frame_fill (trav, frame);
+ } else {
+ gf_log (frame->this->name, GF_LOG_DEBUG,
+ "IN-TRANSIT at offset=%"PRId64".",
+ trav_offset);
+ ra_wait_on_page (trav, frame);
+ need_atime_update = 0;
+ }
+ }
+ unlock:
+ ra_file_unlock (file);
+
+ if (fault) {
+ gf_log (frame->this->name, GF_LOG_DEBUG,
+ "MISS at offset=%"PRId64".",
+ trav_offset);
+ ra_page_fault (file, frame, trav_offset);
+ }
+
+ trav_offset += file->page_size;
+ }
+
+ if (need_atime_update && conf->force_atime_update) {
+ /* TODO: use untimens() since readv() can confuse underlying
+ io-cache and others */
+ ra_frame = copy_frame (frame);
+ STACK_WIND (ra_frame, ra_need_atime_cbk,
+ FIRST_CHILD (frame->this),
+ FIRST_CHILD (frame->this)->fops->readv,
+ file->fd, 1, 1);
+ }
+
+ return ;
+}
+
+
+int
+ra_readv_disabled_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ struct iovec *vector, int32_t count, struct stat *stbuf)
+{
+ STACK_UNWIND (frame, op_ret, op_errno, vector, count, stbuf);
+
+ return 0;
+}
+
+
+int
+ra_readv (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, size_t size, off_t offset)
+{
+ ra_file_t *file = NULL;
+ ra_local_t *local = NULL;
+ ra_conf_t *conf = NULL;
+ int op_errno = 0;
+ int ret = 0;
+ char expected_offset = 1;
+ uint64_t tmp_file = 0;
+
+ conf = this->private;
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "NEW REQ at offset=%"PRId64" for size=%"GF_PRI_SIZET"",
+ offset, size);
+
+ ret = fd_ctx_get (fd, this, &tmp_file);
+ file = (ra_file_t *)(long)tmp_file;
+
+ if (file->offset != offset) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "unexpected offset (%"PRId64" != %"PRId64") resetting",
+ file->offset, offset);
+
+ expected_offset = file->expected = file->page_count = 0;
+ } else {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "expected offset (%"PRId64") when page_count=%d",
+ offset, file->page_count);
+
+ if (file->expected < (conf->page_size * conf->page_count)) {
+ file->expected += size;
+ file->page_count = min ((file->expected / file->page_size),
+ conf->page_count);
+ }
+ }
+
+ if (!expected_offset) {
+ flush_region (frame, file, 0, file->pages.prev->offset + 1);
+ }
+
+ if (file->disabled) {
+ STACK_WIND (frame, ra_readv_disabled_cbk,
+ FIRST_CHILD (frame->this),
+ FIRST_CHILD (frame->this)->fops->readv,
+ file->fd, size, offset);
+ return 0;
+ }
+
+ local = (void *) CALLOC (1, sizeof (*local));
+ if (!local) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "out of memory :(");
+ op_errno = ENOMEM;
+ goto unwind;
+ }
+
+ local->fd = fd;
+ local->offset = offset;
+ local->size = size;
+ local->wait_count = 1;
+
+ local->fill.next = &local->fill;
+ local->fill.prev = &local->fill;
+
+ pthread_mutex_init (&local->local_lock, NULL);
+
+ frame->local = local;
+
+ dispatch_requests (frame, file);
+
+ flush_region (frame, file, 0, floor (offset, file->page_size));
+
+ read_ahead (frame, file);
+
+ ra_frame_return (frame);
+
+ file->offset = offset + size;
+
+ return 0;
+
+unwind:
+ STACK_UNWIND (frame, -1, op_errno, NULL, 0, NULL);
+
+ return 0;
+}
+
+
+int
+ra_flush_cbk (call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno)
+{
+ STACK_UNWIND (frame, op_ret, op_errno);
+ return 0;
+}
+
+
+int
+ra_flush (call_frame_t *frame, xlator_t *this, fd_t *fd)
+{
+ ra_file_t *file = NULL;
+ int ret = 0;
+ uint64_t tmp_file = 0;
+
+ ret = fd_ctx_get (fd, this, &tmp_file);
+ file = (ra_file_t *)(long)tmp_file;
+
+ if (file) {
+ flush_region (frame, file, 0, file->pages.prev->offset+1);
+ }
+
+ STACK_WIND (frame, ra_flush_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->flush,
+ fd);
+ return 0;
+}
+
+
+int
+ra_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ int32_t datasync)
+{
+ ra_file_t *file = NULL;
+ int ret = 0;
+ uint64_t tmp_file = 0;
+
+ ret = fd_ctx_get (fd, this, &tmp_file);
+ file = (ra_file_t *)(long)tmp_file;
+
+ if (file) {
+ flush_region (frame, file, 0, file->pages.prev->offset+1);
+ }
+
+ STACK_WIND (frame, ra_flush_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->fsync,
+ fd, datasync);
+ return 0;
+}
+
+
+int
+ra_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct stat *stbuf)
+{
+ fd_t *fd = NULL;
+ ra_file_t *file = NULL;
+ int ret = 0;
+ uint64_t tmp_file = 0;
+
+ fd = frame->local;
+
+ ret = fd_ctx_get (fd, this, &tmp_file);
+ file = (ra_file_t *)(long)tmp_file;
+
+ if (file) {
+ flush_region (frame, file, 0, file->pages.prev->offset+1);
+ }
+
+ frame->local = NULL;
+ STACK_UNWIND (frame, op_ret, op_errno, stbuf);
+ return 0;
+}
+
+
+int
+ra_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ struct iovec *vector, int32_t count, off_t offset)
+{
+ ra_file_t *file = NULL;
+ int ret = 0;
+ uint64_t tmp_file = 0;
+
+ ret = fd_ctx_get (fd, this, &tmp_file);
+ file = (ra_file_t *)(long)tmp_file;
+
+ if (file) {
+ flush_region (frame, file, 0, file->pages.prev->offset+1);
+
+ /* reset the read-ahead counters too */
+ file->expected = file->page_count = 0;
+ }
+
+ frame->local = fd;
+
+ STACK_WIND (frame, ra_writev_cbk,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->writev,
+ fd, vector, count, offset);
+
+ return 0;
+}
+
+
+int
+ra_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct stat *buf)
+{
+ STACK_UNWIND (frame, op_ret, op_errno, buf);
+ return 0;
+}
+
+
+int
+ra_truncate (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, off_t offset)
+{
+ ra_file_t *file = NULL;
+ fd_t *iter_fd = NULL;
+ inode_t *inode = NULL;
+ int ret = 0;
+ uint64_t tmp_file = 0;
+
+ inode = loc->inode;
+
+ LOCK (&inode->lock);
+ {
+ list_for_each_entry (iter_fd, &inode->fd_list, inode_list) {
+ ret = fd_ctx_get (iter_fd, this, &tmp_file);
+ file = (ra_file_t *)(long)tmp_file;
+
+ if (!file)
+ continue;
+ flush_region (frame, file, 0,
+ file->pages.prev->offset + 1);
+ }
+ }
+ UNLOCK (&inode->lock);
+
+ STACK_WIND (frame, ra_attr_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->truncate,
+ loc, offset);
+ return 0;
+}
+
+
+int
+ra_fstat (call_frame_t *frame, xlator_t *this,
+ fd_t *fd)
+{
+ ra_file_t *file = NULL;
+ fd_t *iter_fd = NULL;
+ inode_t *inode = NULL;
+ int ret = 0;
+ uint64_t tmp_file = 0;
+
+ inode = fd->inode;
+
+ LOCK (&inode->lock);
+ {
+ list_for_each_entry (iter_fd, &inode->fd_list, inode_list) {
+ ret = fd_ctx_get (iter_fd, this, &tmp_file);
+ file = (ra_file_t *)(long)tmp_file;
+
+ if (!file)
+ continue;
+ flush_region (frame, file, 0,
+ file->pages.prev->offset + 1);
+ }
+ }
+ UNLOCK (&inode->lock);
+
+ STACK_WIND (frame, ra_attr_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->fstat,
+ fd);
+ return 0;
+}
+
+
+int
+ra_fchown (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, uid_t uid, gid_t gid)
+{
+ ra_file_t *file = NULL;
+ fd_t *iter_fd = NULL;
+ inode_t *inode = NULL;
+ int ret = 0;
+ uint64_t tmp_file = 0;
+
+ inode = fd->inode;
+
+ LOCK (&inode->lock);
+ {
+ list_for_each_entry (iter_fd, &inode->fd_list, inode_list) {
+ ret = fd_ctx_get (iter_fd, this, &tmp_file);
+ file = (ra_file_t *)(long)tmp_file;
+
+ if (!file)
+ continue;
+ flush_region (frame, file, 0,
+ file->pages.prev->offset + 1);
+ }
+ }
+ UNLOCK (&inode->lock);
+
+ STACK_WIND (frame, ra_attr_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->fchown,
+ fd, uid, gid);
+ return 0;
+}
+
+
+int
+ra_ftruncate (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, off_t offset)
+{
+ ra_file_t *file = NULL;
+ fd_t *iter_fd = NULL;
+ inode_t *inode = NULL;
+ int ret = 0;
+ uint64_t tmp_file = 0;
+
+ inode = fd->inode;
+
+ LOCK (&inode->lock);
+ {
+ list_for_each_entry (iter_fd, &inode->fd_list, inode_list) {
+ ret = fd_ctx_get (iter_fd, this, &tmp_file);
+ file = (ra_file_t *)(long)tmp_file;
+ if (!file)
+ continue;
+ flush_region (frame, file, 0,
+ file->pages.prev->offset + 1);
+ }
+ }
+ UNLOCK (&inode->lock);
+
+ STACK_WIND (frame, ra_attr_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->ftruncate,
+ fd, offset);
+ return 0;
+}
+
+
+int
+init (xlator_t *this)
+{
+ ra_conf_t *conf;
+ dict_t *options = this->options;
+ char *page_size_string = NULL;
+ char *page_count_string = NULL;
+
+ if (!this->children || this->children->next) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "FATAL: read-ahead not configured with exactly one child");
+ return -1;
+ }
+
+ if (!this->parents) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "dangling volume. check volfile ");
+ }
+
+ conf = (void *) CALLOC (1, sizeof (*conf));
+ ERR_ABORT (conf);
+ conf->page_size = 256 * 1024;
+ conf->page_count = 2;
+
+ if (dict_get (options, "page-size"))
+ page_size_string = data_to_str (dict_get (options,
+ "page-size"));
+ if (page_size_string)
+ {
+ if (gf_string2bytesize (page_size_string, &conf->page_size) != 0)
+ {
+ gf_log ("read-ahead",
+ GF_LOG_ERROR,
+ "invalid number format \"%s\" of \"option page-size\"",
+ page_size_string);
+ return -1;
+ }
+
+ gf_log (this->name, GF_LOG_DEBUG, "Using conf->page_size = %"PRIu64"",
+ conf->page_size);
+ }
+
+ if (dict_get (options, "page-count"))
+ page_count_string = data_to_str (dict_get (options,
+ "page-count"));
+ if (page_count_string)
+ {
+ if (gf_string2uint_base10 (page_count_string, &conf->page_count) != 0)
+ {
+ gf_log ("read-ahead",
+ GF_LOG_ERROR,
+ "invalid number format \"%s\" of \"option page-count\"",
+ page_count_string);
+ return -1;
+ }
+ gf_log (this->name, GF_LOG_DEBUG, "Using conf->page_count = %u",
+ conf->page_count);
+ }
+
+ if (dict_get (options, "force-atime-update")) {
+ char *force_atime_update_str = data_to_str (dict_get (options,
+ "force-atime-update"));
+ if (gf_string2boolean (force_atime_update_str, &conf->force_atime_update) == -1) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "'force-atime-update' takes only boolean options");
+ return -1;
+ }
+ if (conf->force_atime_update)
+ gf_log (this->name, GF_LOG_DEBUG, "Forcing atime updates on cache hit");
+ }
+
+ conf->files.next = &conf->files;
+ conf->files.prev = &conf->files;
+
+ pthread_mutex_init (&conf->conf_lock, NULL);
+ this->private = conf;
+ return 0;
+}
+
+void
+fini (xlator_t *this)
+{
+ ra_conf_t *conf = this->private;
+
+ pthread_mutex_destroy (&conf->conf_lock);
+ FREE (conf);
+
+ this->private = NULL;
+ return;
+}
+
+struct xlator_fops fops = {
+ .open = ra_open,
+ .create = ra_create,
+ .readv = ra_readv,
+ .writev = ra_writev,
+ .flush = ra_flush,
+ .fsync = ra_fsync,
+ .truncate = ra_truncate,
+ .ftruncate = ra_ftruncate,
+ .fstat = ra_fstat,
+ .fchown = ra_fchown,
+};
+
+struct xlator_mops mops = {
+};
+
+struct xlator_cbks cbks = {
+ .release = ra_release,
+};
+
+struct volume_options options[] = {
+ { .key = {"force-atime-update"},
+ .type = GF_OPTION_TYPE_BOOL
+ },
+ { .key = {"page-size"},
+ .type = GF_OPTION_TYPE_SIZET,
+ .min = 64 * GF_UNIT_KB,
+ .max = 2 * GF_UNIT_MB
+ },
+ { .key = {"page-count"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 1,
+ .max = 16
+ },
+ { .key = {NULL} },
+};
diff --git a/xlators/performance/read-ahead/src/read-ahead.h b/xlators/performance/read-ahead/src/read-ahead.h
new file mode 100644
index 00000000000..d624ca8abc8
--- /dev/null
+++ b/xlators/performance/read-ahead/src/read-ahead.h
@@ -0,0 +1,194 @@
+/*
+ Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef __READ_AHEAD_H
+#define __READ_AHEAD_H
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+
+#include "glusterfs.h"
+#include "logging.h"
+#include "dict.h"
+#include "xlator.h"
+#include "common-utils.h"
+
+struct ra_conf;
+struct ra_local;
+struct ra_page;
+struct ra_file;
+struct ra_waitq;
+
+
+struct ra_waitq {
+ struct ra_waitq *next;
+ void *data;
+};
+
+
+struct ra_fill {
+ struct ra_fill *next;
+ struct ra_fill *prev;
+ off_t offset;
+ size_t size;
+ struct iovec *vector;
+ int32_t count;
+ dict_t *refs;
+};
+
+
+struct ra_local {
+ mode_t mode;
+ struct ra_fill fill;
+ off_t offset;
+ size_t size;
+ int32_t op_ret;
+ int32_t op_errno;
+ off_t pending_offset;
+ size_t pending_size;
+ fd_t *fd;
+ int32_t wait_count;
+ pthread_mutex_t local_lock;
+};
+
+
+struct ra_page {
+ struct ra_page *next;
+ struct ra_page *prev;
+ struct ra_file *file;
+ char dirty;
+ char ready;
+ struct iovec *vector;
+ int32_t count;
+ off_t offset;
+ size_t size;
+ struct ra_waitq *waitq;
+ dict_t *ref;
+};
+
+
+struct ra_file {
+ struct ra_file *next;
+ struct ra_file *prev;
+ struct ra_conf *conf;
+ fd_t *fd;
+ int disabled;
+ size_t expected;
+ struct ra_page pages;
+ off_t offset;
+ size_t size;
+ int32_t refcount;
+ pthread_mutex_t file_lock;
+ struct stat stbuf;
+ uint64_t page_size;
+ uint32_t page_count;
+};
+
+
+struct ra_conf {
+ uint64_t page_size;
+ uint32_t page_count;
+ void *cache_block;
+ struct ra_file files;
+ gf_boolean_t force_atime_update;
+ pthread_mutex_t conf_lock;
+};
+
+
+typedef struct ra_conf ra_conf_t;
+typedef struct ra_local ra_local_t;
+typedef struct ra_page ra_page_t;
+typedef struct ra_file ra_file_t;
+typedef struct ra_waitq ra_waitq_t;
+typedef struct ra_fill ra_fill_t;
+
+ra_page_t *
+ra_page_get (ra_file_t *file,
+ off_t offset);
+ra_page_t *
+ra_page_create (ra_file_t *file,
+ off_t offset);
+void
+ra_page_fault (ra_file_t *file,
+ call_frame_t *frame,
+ off_t offset);
+void
+ra_wait_on_page (ra_page_t *page,
+ call_frame_t *frame);
+ra_waitq_t *
+ra_page_wakeup (ra_page_t *page);
+
+void
+ra_page_flush (ra_page_t *page);
+
+ra_waitq_t *
+ra_page_error (ra_page_t *page,
+ int32_t op_ret,
+ int32_t op_errno);
+void
+ra_page_purge (ra_page_t *page);
+
+void
+ra_frame_return (call_frame_t *frame);
+void
+ra_frame_fill (ra_page_t *page,
+ call_frame_t *frame);
+
+void
+ra_file_destroy (ra_file_t *file);
+
+static inline void
+ra_file_lock (ra_file_t *file)
+{
+ pthread_mutex_lock (&file->file_lock);
+}
+
+static inline void
+ra_file_unlock (ra_file_t *file)
+{
+ pthread_mutex_unlock (&file->file_lock);
+}
+
+static inline void
+ra_conf_lock (ra_conf_t *conf)
+{
+ pthread_mutex_lock (&conf->conf_lock);
+}
+
+static inline void
+ra_conf_unlock (ra_conf_t *conf)
+{
+ pthread_mutex_unlock (&conf->conf_lock);
+}
+static inline void
+ra_local_lock (ra_local_t *local)
+{
+ pthread_mutex_lock (&local->local_lock);
+}
+
+static inline void
+ra_local_unlock (ra_local_t *local)
+{
+ pthread_mutex_unlock (&local->local_lock);
+}
+
+#endif /* __READ_AHEAD_H */