diff options
Diffstat (limited to 'xlators/performance/read-ahead/src')
| -rw-r--r-- | xlators/performance/read-ahead/src/Makefile.am | 10 | ||||
| -rw-r--r-- | xlators/performance/read-ahead/src/page.c | 874 | ||||
| -rw-r--r-- | xlators/performance/read-ahead/src/read-ahead-mem-types.h | 25 | ||||
| -rw-r--r-- | xlators/performance/read-ahead/src/read-ahead-messages.h | 31 | ||||
| -rw-r--r-- | xlators/performance/read-ahead/src/read-ahead.c | 1835 | ||||
| -rw-r--r-- | xlators/performance/read-ahead/src/read-ahead.h | 201 |
6 files changed, 1681 insertions, 1295 deletions
diff --git a/xlators/performance/read-ahead/src/Makefile.am b/xlators/performance/read-ahead/src/Makefile.am index 7bb90228227..99efca3660c 100644 --- a/xlators/performance/read-ahead/src/Makefile.am +++ b/xlators/performance/read-ahead/src/Makefile.am @@ -1,14 +1,16 @@ xlator_LTLIBRARIES = read-ahead.la xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance -read_ahead_la_LDFLAGS = -module -avoidversion +read_ahead_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS) read_ahead_la_SOURCES = read-ahead.c page.c read_ahead_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la -noinst_HEADERS = read-ahead.h +noinst_HEADERS = read-ahead.h read-ahead-mem-types.h read-ahead-messages.h -AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS)\ - -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ + -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src + +AM_CFLAGS = -Wall $(GF_CFLAGS) CLEANFILES = diff --git a/xlators/performance/read-ahead/src/page.c b/xlators/performance/read-ahead/src/page.c index 07ab84ed8d7..8a58ad8bb7a 100644 --- a/xlators/performance/read-ahead/src/page.c +++ b/xlators/performance/read-ahead/src/page.c @@ -1,415 +1,455 @@ /* - Copyright (c) 2006-2009 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" -#include "logging.h" -#include "dict.h" -#include "xlator.h" +#include <glusterfs/glusterfs.h> +#include <glusterfs/logging.h> +#include <glusterfs/dict.h> +#include <glusterfs/xlator.h> #include "read-ahead.h" #include <assert.h> +#include "read-ahead-messages.h" ra_page_t * -ra_page_get (ra_file_t *file, off_t offset) +ra_page_get(ra_file_t *file, off_t offset) { - ra_page_t *page = NULL; - off_t rounded_offset = 0; + ra_page_t *page = NULL; + off_t rounded_offset = 0; - page = file->pages.next; - rounded_offset = floor (offset, file->page_size); + GF_VALIDATE_OR_GOTO("read-ahead", file, out); - while (page != &file->pages && page->offset < rounded_offset) - page = page->next; + page = file->pages.next; + rounded_offset = gf_floor(offset, file->page_size); - if (page == &file->pages || page->offset != rounded_offset) - page = NULL; + while (page != &file->pages && page->offset < rounded_offset) + page = page->next; - return page; -} + if (page == &file->pages || page->offset != rounded_offset) + page = NULL; +out: + return page; +} ra_page_t * -ra_page_create (ra_file_t *file, off_t offset) +ra_page_create(ra_file_t *file, off_t offset) { - ra_page_t *page = NULL; - off_t rounded_offset = 0; - ra_page_t *newpage = NULL; + ra_page_t *page = NULL; + off_t rounded_offset = 0; + ra_page_t *newpage = NULL; - page = file->pages.next; - rounded_offset = floor (offset, file->page_size); + GF_VALIDATE_OR_GOTO("read-ahead", file, out); - while (page != &file->pages && page->offset < rounded_offset) - page = page->next; + page = file->pages.next; + rounded_offset = gf_floor(offset, file->page_size); - if (page == &file->pages || page->offset != rounded_offset) { - newpage = CALLOC (1, sizeof (*newpage)); - if (!newpage) - return NULL; + while (page != &file->pages && page->offset < rounded_offset) + page = page->next; - newpage->offset = rounded_offset; - newpage->prev = page->prev; - newpage->next = page; - newpage->file = file; - page->prev->next = newpage; - page->prev = newpage; + if (page == &file->pages || page->offset != rounded_offset) { + newpage = GF_CALLOC(1, sizeof(*newpage), gf_ra_mt_ra_page_t); + if (!newpage) { + goto out; + } - page = newpage; - } + newpage->offset = rounded_offset; + newpage->prev = page->prev; + newpage->next = page; + newpage->file = file; + page->prev->next = newpage; + page->prev = newpage; - return page; -} + page = newpage; + } +out: + return page; +} void -ra_wait_on_page (ra_page_t *page, call_frame_t *frame) +ra_wait_on_page(ra_page_t *page, call_frame_t *frame) { - ra_waitq_t *waitq = NULL; - ra_local_t *local = NULL; - - local = frame->local; - waitq = CALLOC (1, sizeof (*waitq)); - if (!waitq) { - gf_log (frame->this->name, GF_LOG_ERROR, - "out of memory"); - local->op_ret = -1; - local->op_errno = ENOMEM; - goto out; - } + ra_waitq_t *waitq = NULL; + ra_local_t *local = NULL; + + GF_VALIDATE_OR_GOTO("read-ahead", frame, out); + GF_VALIDATE_OR_GOTO(frame->this->name, page, out); + + local = frame->local; + + waitq = GF_CALLOC(1, sizeof(*waitq), gf_ra_mt_ra_waitq_t); + if (!waitq) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto out; + } - waitq->data = frame; - waitq->next = page->waitq; - page->waitq = waitq; + waitq->data = frame; + waitq->next = page->waitq; + page->waitq = waitq; - ra_local_lock (local); - { - local->wait_count++; - } - ra_local_unlock (local); + ra_local_lock(local); + { + local->wait_count++; + } + ra_local_unlock(local); out: - return; + return; } - void -ra_waitq_return (ra_waitq_t *waitq) +ra_waitq_return(ra_waitq_t *waitq) { - ra_waitq_t *trav = NULL; - ra_waitq_t *next = NULL; - call_frame_t *frame = NULL; + ra_waitq_t *trav = NULL; + ra_waitq_t *next = NULL; + call_frame_t *frame = NULL; - for (trav = waitq; trav; trav = next) { - next = trav->next; + for (trav = waitq; trav; trav = next) { + next = trav->next; - frame = trav->data; - ra_frame_return (frame); - free (trav); - } -} + frame = trav->data; + ra_frame_return(frame); + GF_FREE(trav); + } + return; +} int -ra_fault_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iovec *vector, - int32_t count, struct stat *stbuf, struct iobref *iobref) +ra_fault_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iovec *vector, int32_t count, + struct iatt *stbuf, struct iobref *iobref, dict_t *xdata) { - ra_local_t *local = NULL; - off_t pending_offset = 0; - ra_file_t *file = NULL; - ra_page_t *page = NULL; - off_t trav_offset = 0; - size_t payload_size = 0; - ra_waitq_t *waitq = NULL; - fd_t *fd = NULL; - int ret = 0; - uint64_t tmp_file = 0; - - local = frame->local; - fd = local->fd; - - ret = fd_ctx_get (fd, this, &tmp_file); - - file = (ra_file_t *)(long)tmp_file; - pending_offset = local->pending_offset; - trav_offset = pending_offset; - payload_size = op_ret; - - ra_file_lock (file); - { - if (op_ret >= 0) - file->stbuf = *stbuf; - - if (op_ret < 0) { - page = ra_page_get (file, pending_offset); - if (page) - waitq = ra_page_error (page, op_ret, op_errno); - goto unlock; - } - - page = ra_page_get (file, pending_offset); - if (!page) { - gf_log (this->name, GF_LOG_DEBUG, - "wasted copy: %"PRId64"[+%"PRId64"] file=%p", - pending_offset, file->page_size, file); - goto unlock; - } - - if (page->vector) { - iobref_unref (page->iobref); - free (page->vector); - } - - page->vector = iov_dup (vector, count); - if (page->vector == NULL) { - waitq = ra_page_error (page, -1, ENOMEM); - goto unlock; - } - - page->count = count; - page->iobref = iobref_ref (iobref); - page->ready = 1; - - page->size = iov_length (vector, count); - - waitq = ra_page_wakeup (page); - } -unlock: - ra_file_unlock (file); - - ra_waitq_return (waitq); - - fd_unref (local->fd); + ra_local_t *local = NULL; + off_t pending_offset = 0; + ra_file_t *file = NULL; + ra_page_t *page = NULL; + ra_waitq_t *waitq = NULL; + fd_t *fd = NULL; + uint64_t tmp_file = 0; + gf_boolean_t stale = _gf_false; + + GF_ASSERT(frame); + + local = frame->local; + fd = local->fd; + + fd_ctx_get(fd, this, &tmp_file); + + file = (ra_file_t *)(long)tmp_file; + pending_offset = local->pending_offset; + + if (file == NULL) { + gf_msg(this->name, GF_LOG_WARNING, EBADF, + READ_AHEAD_MSG_FD_CONTEXT_NOT_SET, + "read-ahead context not set in fd (%p)", fd); + op_ret = -1; + op_errno = EBADF; + goto out; + } + + ra_file_lock(file); + { + if (op_ret >= 0) + file->stbuf = *stbuf; + + page = ra_page_get(file, pending_offset); + + if (!page) { + gf_msg_trace(this->name, 0, + "wasted copy: " + "%" PRId64 "[+%" PRId64 "] file=%p", + pending_offset, file->page_size, file); + goto unlock; + } - free (frame->local); - frame->local = NULL; + if (page->stale) { + page->stale = 0; + page->ready = 0; + stale = 1; + goto unlock; + } - STACK_DESTROY (frame->root); - return 0; -} + /* + * "Dirty" means that the request was a pure read-ahead; it's + * set for requests we issue ourselves, and cleared when user + * requests are issued or put on the waitq. "Poisoned" means + * that we got a write while a read was still in flight, and we + * couldn't stop it so we marked it instead. If it's both + * dirty and poisoned by the time we get here, we cancel its + * effect so that a subsequent user read doesn't get data that + * we know is stale (because we made it stale ourselves). We + * can't use ESTALE because that has special significance. + * ECANCELED has no such special meaning, and is close to what + * we're trying to indicate. + */ + if (page->dirty && page->poisoned) { + op_ret = -1; + op_errno = ECANCELED; + } + if (op_ret < 0) { + waitq = ra_page_error(page, op_ret, op_errno); + goto unlock; + } -void -ra_page_fault (ra_file_t *file, call_frame_t *frame, off_t offset) -{ - call_frame_t *fault_frame = NULL; - ra_local_t *fault_local = NULL, *local = NULL; - ra_page_t *page = NULL; - ra_waitq_t *waitq = NULL; - int32_t op_ret = -1, op_errno = -1; - - local = frame->local; - fault_frame = copy_frame (frame); - if (fault_frame == NULL) { - op_ret = -1; - op_errno = ENOMEM; - goto err; + if (page->vector) { + iobref_unref(page->iobref); + GF_FREE(page->vector); } - fault_local = CALLOC (1, sizeof (ra_local_t)); - if (fault_local == NULL) { - STACK_DESTROY (fault_frame->root); - op_ret = -1; - op_errno = ENOMEM; - goto err; + page->vector = iov_dup(vector, count); + if (page->vector == NULL) { + waitq = ra_page_error(page, -1, ENOMEM); + goto unlock; } - fault_frame->local = fault_local; - fault_local->pending_offset = offset; - fault_local->pending_size = file->page_size; + page->count = count; + page->iobref = iobref_ref(iobref); + page->ready = 1; + + page->size = iov_length(vector, count); - fault_local->fd = fd_ref (file->fd); + waitq = ra_page_wakeup(page); + } +unlock: + ra_file_unlock(file); - STACK_WIND (fault_frame, ra_fault_cbk, - FIRST_CHILD (fault_frame->this), - FIRST_CHILD (fault_frame->this)->fops->readv, - file->fd, file->page_size, offset); + if (stale) { + STACK_WIND(frame, ra_fault_cbk, FIRST_CHILD(frame->this), + FIRST_CHILD(frame->this)->fops->readv, local->fd, + local->pending_size, local->pending_offset, 0, NULL); - return; + return 0; + } -err: - ra_file_lock (file); - { - page = ra_page_get (file, offset); - if (page) - waitq = ra_page_error (page, op_ret, - op_errno); - } - ra_file_unlock (file); - - if (waitq != NULL) { - ra_waitq_return (waitq); - } + ra_waitq_return(waitq); + + fd_unref(local->fd); + + mem_put(frame->local); + frame->local = NULL; + +out: + STACK_DESTROY(frame->root); + return 0; } void -ra_frame_fill (ra_page_t *page, call_frame_t *frame) +ra_page_fault(ra_file_t *file, call_frame_t *frame, off_t offset) { - ra_local_t *local = NULL; - ra_fill_t *fill = NULL; - off_t src_offset = 0; - off_t dst_offset = 0; - ssize_t copy_size = 0; - ra_fill_t *new = NULL; - - local = frame->local; - fill = &local->fill; - - if (local->op_ret != -1 && page->size) { - if (local->offset > page->offset) - src_offset = local->offset - page->offset; - else - dst_offset = page->offset - local->offset; - - copy_size = min (page->size - src_offset, - local->size - dst_offset); - - if (copy_size < 0) { - /* if page contains fewer bytes and the required offset - is beyond the page size in the page */ - copy_size = src_offset = 0; - } - - fill = fill->next; - while (fill != &local->fill) { - if (fill->offset > page->offset) { - break; - } - fill = fill->next; - } - - new = CALLOC (1, sizeof (*new)); - if (new == NULL) { - local->op_ret = -1; - local->op_errno = ENOMEM; - goto out; - } - - new->offset = page->offset; - new->size = copy_size; - new->iobref = iobref_ref (page->iobref); - new->count = iov_subset (page->vector, page->count, - src_offset, src_offset+copy_size, - NULL); - new->vector = CALLOC (new->count, sizeof (struct iovec)); - if (new->vector == NULL) { - local->op_ret = -1; - local->op_errno = ENOMEM; - FREE (new); - goto out; - } - - new->count = iov_subset (page->vector, page->count, - src_offset, src_offset+copy_size, - new->vector); - - new->next = fill; - new->prev = new->next->prev; - new->next->prev = new; - new->prev->next = new; - - local->op_ret += copy_size; - } + call_frame_t *fault_frame = NULL; + ra_local_t *fault_local = NULL; + ra_page_t *page = NULL; + ra_waitq_t *waitq = NULL; + int32_t op_ret = -1, op_errno = -1; + + GF_VALIDATE_OR_GOTO("read-ahead", frame, out); + GF_VALIDATE_OR_GOTO(frame->this->name, file, out); + + fault_frame = copy_frame(frame); + if (fault_frame == NULL) { + op_ret = -1; + op_errno = ENOMEM; + goto err; + } + + fault_local = mem_get0(THIS->local_pool); + if (fault_local == NULL) { + STACK_DESTROY(fault_frame->root); + op_ret = -1; + op_errno = ENOMEM; + goto err; + } + + fault_frame->local = fault_local; + fault_local->pending_offset = offset; + fault_local->pending_size = file->page_size; + + fault_local->fd = fd_ref(file->fd); + + STACK_WIND(fault_frame, ra_fault_cbk, FIRST_CHILD(fault_frame->this), + FIRST_CHILD(fault_frame->this)->fops->readv, file->fd, + file->page_size, offset, 0, NULL); + + return; + +err: + ra_file_lock(file); + { + page = ra_page_get(file, offset); + if (page) + waitq = ra_page_error(page, op_ret, op_errno); + } + ra_file_unlock(file); + + if (waitq != NULL) { + ra_waitq_return(waitq); + } out: - return; + return; } - void -ra_frame_unwind (call_frame_t *frame) +ra_frame_fill(ra_page_t *page, call_frame_t *frame) { - ra_local_t *local = NULL; - ra_fill_t *fill = NULL; - int32_t count = 0; - struct iovec *vector; - int32_t copied = 0; - struct iobref *iobref = NULL; - ra_fill_t *next = NULL; - fd_t *fd = NULL; - ra_file_t *file = NULL; - int ret = 0; - uint64_t tmp_file = 0; - - local = frame->local; - fill = local->fill.next; - - iobref = iobref_new (); - if (iobref == NULL) { - local->op_ret = -1; - local->op_errno = ENOMEM; + ra_local_t *local = NULL; + ra_fill_t *fill = NULL; + off_t src_offset = 0; + off_t dst_offset = 0; + ssize_t copy_size = 0; + ra_fill_t *new = NULL; + + GF_VALIDATE_OR_GOTO("read-ahead", frame, out); + GF_VALIDATE_OR_GOTO(frame->this->name, page, out); + + local = frame->local; + fill = &local->fill; + + if (local->op_ret != -1 && page->size) { + if (local->offset > page->offset) + src_offset = local->offset - page->offset; + else + dst_offset = page->offset - local->offset; + + copy_size = min(page->size - src_offset, local->size - dst_offset); + + if (copy_size < 0) { + /* if page contains fewer bytes and the required offset + is beyond the page size in the page */ + copy_size = src_offset = 0; } - frame->local = NULL; + fill = fill->next; + while (fill != &local->fill) { + if (fill->offset > page->offset) { + break; + } + fill = fill->next; + } - while (fill != &local->fill) { - count += fill->count; - fill = fill->next; - } + new = GF_CALLOC(1, sizeof(*new), gf_ra_mt_ra_fill_t); + if (new == NULL) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto out; + } - vector = CALLOC (count, sizeof (*vector)); - if (vector == NULL) { - local->op_ret = -1; - local->op_errno = ENOMEM; - iobref_unref (iobref); - iobref = NULL; + new->offset = page->offset; + new->size = copy_size; + new->iobref = iobref_ref(page->iobref); + new->count = iov_subset(page->vector, page->count, src_offset, + copy_size, &new->vector, 0); + if (new->count < 0) { + local->op_ret = -1; + local->op_errno = ENOMEM; + iobref_unref(new->iobref); + GF_FREE(new); + goto out; } - fill = local->fill.next; + new->next = fill; + new->prev = new->next->prev; + new->next->prev = new; + new->prev->next = new; - while (fill != &local->fill) { - next = fill->next; + local->op_ret += copy_size; + } + +out: + return; +} - if ((vector != NULL) && (iobref != NULL)) { - memcpy (((char *)vector) + copied, fill->vector, - fill->count * sizeof (*vector)); - - copied += (fill->count * sizeof (*vector)); - iobref_merge (iobref, fill->iobref); - } +void +ra_frame_unwind(call_frame_t *frame) +{ + ra_local_t *local = NULL; + ra_fill_t *fill = NULL; + int32_t count = 0; + struct iovec *vector = NULL; + int32_t copied = 0; + struct iobref *iobref = NULL; + ra_fill_t *next = NULL; + fd_t *fd = NULL; + ra_file_t *file = NULL; + uint64_t tmp_file = 0; + + GF_VALIDATE_OR_GOTO("read-ahead", frame, out); + + local = frame->local; + fill = local->fill.next; + + iobref = iobref_new(); + if (iobref == NULL) { + local->op_ret = -1; + local->op_errno = ENOMEM; + } + + frame->local = NULL; + + while (fill != &local->fill) { + count += fill->count; + fill = fill->next; + } + + vector = GF_CALLOC(count, sizeof(*vector), gf_ra_mt_iovec); + if (vector == NULL) { + local->op_ret = -1; + local->op_errno = ENOMEM; + iobref_unref(iobref); + iobref = NULL; + } + + fill = local->fill.next; + + while (fill != &local->fill) { + next = fill->next; + + if ((vector != NULL) && (iobref != NULL)) { + memcpy(((char *)vector) + copied, fill->vector, + fill->count * sizeof(*vector)); + + copied += (fill->count * sizeof(*vector)); + if (iobref_merge(iobref, fill->iobref)) { + local->op_ret = -1; + local->op_errno = ENOMEM; + iobref_unref(iobref); + iobref = NULL; + } + } - fill->next->prev = fill->prev; - fill->prev->next = fill->prev; + fill->next->prev = fill->prev; + fill->prev->next = fill->prev; - iobref_unref (fill->iobref); - free (fill->vector); - free (fill); + iobref_unref(fill->iobref); + GF_FREE(fill->vector); + GF_FREE(fill); - fill = next; - } + fill = next; + } - fd = local->fd; - ret = fd_ctx_get (fd, frame->this, &tmp_file); - file = (ra_file_t *)(long)tmp_file; + fd = local->fd; + fd_ctx_get(fd, frame->this, &tmp_file); + file = (ra_file_t *)(long)tmp_file; - STACK_UNWIND_STRICT (readv, frame, local->op_ret, local->op_errno, - vector, count, &file->stbuf, iobref); + STACK_UNWIND_STRICT(readv, frame, local->op_ret, local->op_errno, vector, + count, &file->stbuf, iobref, NULL); - iobref_unref (iobref); - pthread_mutex_destroy (&local->local_lock); - free (local); - free (vector); + iobref_unref(iobref); + pthread_mutex_destroy(&local->local_lock); + mem_put(local); + GF_FREE(vector); - return; +out: + return; } /* @@ -418,47 +458,55 @@ ra_frame_unwind (call_frame_t *frame) * */ void -ra_frame_return (call_frame_t *frame) +ra_frame_return(call_frame_t *frame) { - ra_local_t *local = NULL; - int32_t wait_count = 0; + ra_local_t *local = NULL; + int32_t wait_count = 0; - local = frame->local; - assert (local->wait_count > 0); + GF_VALIDATE_OR_GOTO("read-ahead", frame, out); - ra_local_lock (local); - { - wait_count = --local->wait_count; - } - ra_local_unlock (local); + local = frame->local; + GF_ASSERT(local->wait_count > 0); - if (!wait_count) - ra_frame_unwind (frame); + ra_local_lock(local); + { + wait_count = --local->wait_count; + } + ra_local_unlock(local); - return; + if (!wait_count) + ra_frame_unwind(frame); + +out: + return; } -/* +/* * ra_page_wakeup - * @page: * */ ra_waitq_t * -ra_page_wakeup (ra_page_t *page) +ra_page_wakeup(ra_page_t *page) { - ra_waitq_t *waitq = NULL, *trav = NULL; - call_frame_t *frame; + ra_waitq_t *waitq = NULL, *trav = NULL; + call_frame_t *frame = NULL; + + GF_VALIDATE_OR_GOTO("read-ahead", page, out); - waitq = page->waitq; - page->waitq = NULL; + waitq = page->waitq; + page->waitq = NULL; - trav = waitq; - for (trav = waitq; trav; trav = trav->next) { - frame = trav->data; - ra_frame_fill (page, frame); - } + for (trav = waitq; trav; trav = trav->next) { + frame = trav->data; + ra_frame_fill(page, frame); + } - return waitq; + if (page->stale) { + ra_page_purge(page); + } +out: + return waitq; } /* @@ -467,16 +515,22 @@ ra_page_wakeup (ra_page_t *page) * */ void -ra_page_purge (ra_page_t *page) +ra_page_purge(ra_page_t *page) { - page->prev->next = page->next; - page->next->prev = page->prev; - - if (page->iobref) { - iobref_unref (page->iobref); - } - free (page->vector); - free (page); + GF_VALIDATE_OR_GOTO("read-ahead", page, out); + + page->prev->next = page->next; + page->next->prev = page->prev; + + if (page->iobref) { + iobref_unref(page->iobref); + } + + GF_FREE(page->vector); + GF_FREE(page); + +out: + return; } /* @@ -487,59 +541,65 @@ ra_page_purge (ra_page_t *page) * */ ra_waitq_t * -ra_page_error (ra_page_t *page, int32_t op_ret, int32_t op_errno) +ra_page_error(ra_page_t *page, int32_t op_ret, int32_t op_errno) { + ra_waitq_t *waitq = NULL; + ra_waitq_t *trav = NULL; + call_frame_t *frame = NULL; + ra_local_t *local = NULL; - ra_waitq_t *waitq = NULL; - ra_waitq_t *trav = NULL; - call_frame_t *frame = NULL; - ra_local_t *local = NULL; + GF_VALIDATE_OR_GOTO("read-ahead", page, out); - waitq = page->waitq; - page->waitq = NULL; + waitq = page->waitq; + page->waitq = NULL; - trav = waitq; - for (trav = waitq; trav; trav = trav->next) { - frame = trav->data; + for (trav = waitq; trav; trav = trav->next) { + frame = trav->data; - local = frame->local; - if (local->op_ret != -1) { - local->op_ret = op_ret; - local->op_errno = op_errno; - } - } + local = frame->local; + if (local->op_ret != -1) { + local->op_ret = op_ret; + local->op_errno = op_errno; + } + } - ra_page_purge (page); + ra_page_purge(page); - return waitq; +out: + return waitq; } -/* +/* * ra_file_destroy - * @file: * */ void -ra_file_destroy (ra_file_t *file) +ra_file_destroy(ra_file_t *file) { - ra_conf_t *conf = NULL; - ra_page_t *trav = NULL; - - conf = file->conf; - - ra_conf_lock (conf); - { - file->prev->next = file->next; - file->next->prev = file->prev; - } - ra_conf_unlock (conf); - - trav = file->pages.next; - while (trav != &file->pages) { - ra_page_error (trav, -1, EINVAL); - trav = file->pages.next; - } - - pthread_mutex_destroy (&file->file_lock); - free (file); + ra_conf_t *conf = NULL; + ra_page_t *trav = NULL; + + GF_VALIDATE_OR_GOTO("read-ahead", file, out); + + conf = file->conf; + + ra_conf_lock(conf); + { + file->prev->next = file->next; + file->next->prev = file->prev; + } + ra_conf_unlock(conf); + + trav = file->pages.next; + while (trav != &file->pages) { + ra_page_error(trav, -1, EINVAL); + trav = file->pages.next; + } + + pthread_mutex_destroy(&file->file_lock); + GF_FREE(file); + +out: + return; } diff --git a/xlators/performance/read-ahead/src/read-ahead-mem-types.h b/xlators/performance/read-ahead/src/read-ahead-mem-types.h new file mode 100644 index 00000000000..f07cfc5bba5 --- /dev/null +++ b/xlators/performance/read-ahead/src/read-ahead-mem-types.h @@ -0,0 +1,25 @@ +/* + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef __RA_MEM_TYPES_H__ +#define __RA_MEM_TYPES_H__ + +#include <glusterfs/mem-types.h> + +enum gf_ra_mem_types_ { + gf_ra_mt_ra_file_t = gf_common_mt_end + 1, + gf_ra_mt_ra_conf_t, + gf_ra_mt_ra_page_t, + gf_ra_mt_ra_waitq_t, + gf_ra_mt_ra_fill_t, + gf_ra_mt_iovec, + gf_ra_mt_end +}; +#endif diff --git a/xlators/performance/read-ahead/src/read-ahead-messages.h b/xlators/performance/read-ahead/src/read-ahead-messages.h new file mode 100644 index 00000000000..0302b7a7122 --- /dev/null +++ b/xlators/performance/read-ahead/src/read-ahead-messages.h @@ -0,0 +1,31 @@ +/*Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _READ_AHEAD_MESSAGES_H_ +#define _READ_AHEAD_MESSAGES_H_ + +#include <glusterfs/glfs-message-id.h> + +/* To add new message IDs, append new identifiers at the end of the list. + * + * Never remove a message ID. If it's not used anymore, you can rename it or + * leave it as it is, but not delete it. This is to prevent reutilization of + * IDs by other messages. + * + * The component name must match one of the entries defined in + * glfs-message-id.h. + */ + +GLFS_MSGID(READ_AHEAD, READ_AHEAD_MSG_XLATOR_CHILD_MISCONFIGURED, + READ_AHEAD_MSG_VOL_MISCONFIGURED, READ_AHEAD_MSG_NO_MEMORY, + READ_AHEAD_MSG_FD_CONTEXT_NOT_SET, + READ_AHEAD_MSG_UNDESTROYED_FILE_FOUND, + READ_AHEAD_MSG_XLATOR_CONF_NULL); + +#endif /* _READ_AHEAD_MESSAGES_H_ */ diff --git a/xlators/performance/read-ahead/src/read-ahead.c b/xlators/performance/read-ahead/src/read-ahead.c index e4c1ab2dab0..5246e1317d2 100644 --- a/xlators/performance/read-ahead/src/read-ahead.c +++ b/xlators/performance/read-ahead/src/read-ahead.c @@ -1,226 +1,201 @@ /* - Copyright (c) 2006-2009 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ -/* - TODO: - - handle O_DIRECT - - maintain offset, flush on lseek - - ensure efficient memory managment in case of random seek +/* + TODO: + - handle O_DIRECT + - maintain offset, flush on lseek + - ensure efficient memory management in case of random seek */ -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "glusterfs.h" -#include "logging.h" -#include "dict.h" -#include "xlator.h" +#include <glusterfs/glusterfs.h> +#include <glusterfs/logging.h> +#include <glusterfs/dict.h> +#include <glusterfs/xlator.h> #include "read-ahead.h" -#include "statedump.h" +#include <glusterfs/statedump.h> #include <assert.h> #include <sys/time.h> +#include "read-ahead-messages.h" static void -read_ahead (call_frame_t *frame, ra_file_t *file); - +read_ahead(call_frame_t *frame, ra_file_t *file); int -ra_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd) +ra_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, fd_t *fd, dict_t *xdata) { - ra_conf_t *conf = NULL; - ra_file_t *file = NULL; - int ret = 0; - long wbflags = 0; - - conf = this->private; - - if (op_ret == -1) { - goto unwind; - } - - wbflags = (long)frame->local; - - file = CALLOC (1, sizeof (*file)); - if (!file) { - op_ret = -1; - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, - "out of memory"); - goto unwind; - } - - /* If mandatory locking has been enabled on this file, - we disable caching on it */ - - if ((fd->inode->st_mode & S_ISGID) && !(fd->inode->st_mode & S_IXGRP)) - file->disabled = 1; - - /* If O_DIRECT open, we disable caching on it */ - - if ((fd->flags & O_DIRECT) || ((fd->flags & O_ACCMODE) == O_WRONLY)) - file->disabled = 1; - - if (wbflags & GF_OPEN_NOWB) { - file->disabled = 1; - } - - file->offset = (unsigned long long) 0; - file->conf = conf; - file->pages.next = &file->pages; - file->pages.prev = &file->pages; - file->pages.offset = (unsigned long long) 0; - file->pages.file = file; - - ra_conf_lock (conf); - { - file->next = conf->files.next; - conf->files.next = file; - file->next->prev = file; - file->prev = &conf->files; - } - ra_conf_unlock (conf); - - file->fd = fd; - file->page_count = conf->page_count; - file->page_size = conf->page_size; - pthread_mutex_init (&file->file_lock, NULL); - - if (!file->disabled) { - file->page_count = 1; - } - - ret = fd_ctx_set (fd, this, (uint64_t)(long)file); - if (ret == -1) { - ra_file_destroy (file); - op_ret = -1; - op_errno = ENOMEM; - } + ra_conf_t *conf = NULL; + ra_file_t *file = NULL; + int ret = 0; + + GF_ASSERT(frame); + GF_VALIDATE_OR_GOTO(frame->this->name, this, unwind); + + conf = this->private; + + if (op_ret == -1) { + goto unwind; + } + + file = GF_CALLOC(1, sizeof(*file), gf_ra_mt_ra_file_t); + if (!file) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + + /* If O_DIRECT open, we disable caching on it */ + + if ((fd->flags & O_DIRECT) || ((fd->flags & O_ACCMODE) == O_WRONLY)) + file->disabled = 1; + + file->offset = (unsigned long long)0; + file->conf = conf; + file->pages.next = &file->pages; + file->pages.prev = &file->pages; + file->pages.offset = (unsigned long long)0; + file->pages.file = file; + + ra_conf_lock(conf); + { + file->next = conf->files.next; + conf->files.next = file; + file->next->prev = file; + file->prev = &conf->files; + } + ra_conf_unlock(conf); + + file->fd = fd; + file->page_count = conf->page_count; + file->page_size = conf->page_size; + pthread_mutex_init(&file->file_lock, NULL); + + if (!file->disabled) { + file->page_count = 1; + } + + ret = fd_ctx_set(fd, this, (uint64_t)(long)file); + if (ret == -1) { + gf_msg(frame->this->name, GF_LOG_WARNING, 0, READ_AHEAD_MSG_NO_MEMORY, + "cannot set read-ahead context" + "information in fd (%p)", + fd); + ra_file_destroy(file); + op_ret = -1; + op_errno = ENOMEM; + } unwind: - frame->local = NULL; + frame->local = NULL; - STACK_UNWIND_STRICT (open, frame, op_ret, op_errno, fd); + STACK_UNWIND_STRICT(open, frame, op_ret, op_errno, fd, xdata); - return 0; + return 0; } - int -ra_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode, - struct stat *buf, struct stat *preparent, - struct stat *postparent) -{ - ra_conf_t *conf = NULL; - ra_file_t *file = NULL; - int ret = 0; - - conf = this->private; - - if (op_ret == -1) { - goto unwind; - } - - file = CALLOC (1, sizeof (*file)); - if (!file) { - op_ret = -1; - op_errno = ENOMEM; - gf_log (this->name, GF_LOG_ERROR, - "out of memory"); - goto unwind; - } - - /* If mandatory locking has been enabled on this file, - we disable caching on it */ - - if ((fd->inode->st_mode & S_ISGID) && !(fd->inode->st_mode & S_IXGRP)) - file->disabled = 1; - - /* If O_DIRECT open, we disable caching on it */ - - if ((fd->flags & O_DIRECT) || ((fd->flags & O_ACCMODE) == O_WRONLY)) - file->disabled = 1; - - file->offset = (unsigned long long) 0; - //file->size = fd->inode->buf.st_size; - file->conf = conf; - file->pages.next = &file->pages; - file->pages.prev = &file->pages; - file->pages.offset = (unsigned long long) 0; - file->pages.file = file; - - ra_conf_lock (conf); - { - file->next = conf->files.next; - conf->files.next = file; - file->next->prev = file; - file->prev = &conf->files; - } - ra_conf_unlock (conf); - - file->fd = fd; - file->page_count = conf->page_count; - file->page_size = conf->page_size; - pthread_mutex_init (&file->file_lock, NULL); - - ret = fd_ctx_set (fd, this, (uint64_t)(long)file); - if (ret == -1) { - ra_file_destroy (file); - op_ret = -1; - op_errno = ENOMEM; - } +ra_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, fd_t *fd, inode_t *inode, struct iatt *buf, + struct iatt *preparent, struct iatt *postparent, dict_t *xdata) +{ + ra_conf_t *conf = NULL; + ra_file_t *file = NULL; + int ret = 0; + + GF_ASSERT(frame); + GF_VALIDATE_OR_GOTO(frame->this->name, this, unwind); + + conf = this->private; + + if (op_ret == -1) { + goto unwind; + } + + file = GF_CALLOC(1, sizeof(*file), gf_ra_mt_ra_file_t); + if (!file) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + + /* If O_DIRECT open, we disable caching on it */ + + if ((fd->flags & O_DIRECT) || ((fd->flags & O_ACCMODE) == O_WRONLY)) + file->disabled = 1; + + file->offset = (unsigned long long)0; + // file->size = fd->inode->buf.ia_size; + file->conf = conf; + file->pages.next = &file->pages; + file->pages.prev = &file->pages; + file->pages.offset = (unsigned long long)0; + file->pages.file = file; + + ra_conf_lock(conf); + { + file->next = conf->files.next; + conf->files.next = file; + file->next->prev = file; + file->prev = &conf->files; + } + ra_conf_unlock(conf); + + file->fd = fd; + file->page_count = conf->page_count; + file->page_size = conf->page_size; + pthread_mutex_init(&file->file_lock, NULL); + + ret = fd_ctx_set(fd, this, (uint64_t)(long)file); + if (ret == -1) { + gf_msg(this->name, GF_LOG_WARNING, 0, READ_AHEAD_MSG_NO_MEMORY, + "cannot set read ahead context" + "information in fd (%p)", + fd); + ra_file_destroy(file); + op_ret = -1; + op_errno = ENOMEM; + } unwind: - STACK_UNWIND_STRICT (create, frame, op_ret, op_errno, fd, inode, buf, - preparent, postparent); + STACK_UNWIND_STRICT(create, frame, op_ret, op_errno, fd, inode, buf, + preparent, postparent, xdata); - return 0; + return 0; } - int -ra_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, - fd_t *fd, int32_t wbflags) +ra_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + fd_t *fd, dict_t *xdata) { - frame->local = (void *)(long)wbflags; + GF_ASSERT(frame); + GF_ASSERT(this); - STACK_WIND (frame, ra_open_cbk, - FIRST_CHILD (this), - FIRST_CHILD (this)->fops->open, - loc, flags, fd, wbflags); + STACK_WIND(frame, ra_open_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata); - return 0; + return 0; } int -ra_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, - mode_t mode, fd_t *fd) +ra_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) { - STACK_WIND (frame, ra_create_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->create, - loc, flags, mode, fd); + GF_ASSERT(frame); + GF_ASSERT(this); + + STACK_WIND(frame, ra_create_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->create, loc, flags, mode, umask, fd, + xdata); - return 0; + return 0; } /* free cache pages between offset and offset+size, @@ -228,750 +203,1070 @@ ra_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, */ static void -flush_region (call_frame_t *frame, ra_file_t *file, off_t offset, off_t size) +flush_region(call_frame_t *frame, ra_file_t *file, off_t offset, off_t size, + int for_write) { - ra_page_t *trav = NULL; - ra_page_t *next = NULL; + ra_page_t *trav = NULL; + ra_page_t *next = NULL; + + ra_file_lock(file); + { + trav = file->pages.next; + while (trav != &file->pages && trav->offset < (offset + size)) { + next = trav->next; + if (trav->offset >= offset) { + if (!trav->waitq) { + ra_page_purge(trav); + } else { + trav->stale = 1; + + if (for_write) { + trav->poisoned = 1; + } + } + } + trav = next; + } + } + ra_file_unlock(file); +} + +int +ra_release(xlator_t *this, fd_t *fd) +{ + uint64_t tmp_file = 0; + int ret = 0; - ra_file_lock (file); - { - trav = file->pages.next; - while (trav != &file->pages - && trav->offset < (offset + size)) { + GF_VALIDATE_OR_GOTO("read-ahead", this, out); + GF_VALIDATE_OR_GOTO(this->name, fd, out); - next = trav->next; - if (trav->offset >= offset && !trav->waitq) { - ra_page_purge (trav); - } - trav = next; - } - } - ra_file_unlock (file); + ret = fd_ctx_del(fd, this, &tmp_file); + + if (!ret) { + ra_file_destroy((ra_file_t *)(long)tmp_file); + } + +out: + return 0; } +void +read_ahead(call_frame_t *frame, ra_file_t *file) +{ + off_t ra_offset = 0; + size_t ra_size = 0; + off_t trav_offset = 0; + ra_page_t *trav = NULL; + off_t cap = 0; + char fault = 0; + + GF_VALIDATE_OR_GOTO("read-ahead", frame, out); + GF_VALIDATE_OR_GOTO(frame->this->name, file, out); + + if (!file->page_count) { + goto out; + } + + ra_size = file->page_size * file->page_count; + ra_offset = gf_floor(file->offset, file->page_size); + cap = file->size ? file->size : file->offset + ra_size; + + while (ra_offset < min(file->offset + ra_size, cap)) { + ra_file_lock(file); + { + trav = ra_page_get(file, ra_offset); + } + ra_file_unlock(file); + + if (!trav) + break; + + ra_offset += file->page_size; + } + + if (trav) { + /* comfortable enough */ + goto out; + } + + trav_offset = ra_offset; + + cap = file->size ? file->size : ra_offset + ra_size; + + while (trav_offset < min(ra_offset + ra_size, cap)) { + fault = 0; + ra_file_lock(file); + { + trav = ra_page_get(file, trav_offset); + if (!trav) { + fault = 1; + trav = ra_page_create(file, trav_offset); + if (trav) + trav->dirty = 1; + } + } + ra_file_unlock(file); + + if (!trav) { + /* OUT OF MEMORY */ + break; + } + + if (fault) { + gf_msg_trace(frame->this->name, 0, "RA at offset=%" PRId64, + trav_offset); + ra_page_fault(file, frame, trav_offset); + } + trav_offset += file->page_size; + } + +out: + return; +} int -ra_release (xlator_t *this, fd_t *fd) +ra_need_atime_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iovec *vector, + int32_t count, struct iatt *stbuf, struct iobref *iobref, + dict_t *xdata) +{ + GF_ASSERT(frame); + STACK_DESTROY(frame->root); + return 0; +} + +static void +dispatch_requests(call_frame_t *frame, ra_file_t *file) { - uint64_t tmp_file = 0; - int ret = 0; + ra_local_t *local = NULL; + ra_conf_t *conf = NULL; + off_t rounded_offset = 0; + off_t rounded_end = 0; + off_t trav_offset = 0; + ra_page_t *trav = NULL; + call_frame_t *ra_frame = NULL; + char need_atime_update = 1; + char fault = 0; + + GF_VALIDATE_OR_GOTO("read-ahead", frame, out); + GF_VALIDATE_OR_GOTO(frame->this->name, file, out); + + local = frame->local; + conf = file->conf; + + rounded_offset = gf_floor(local->offset, file->page_size); + rounded_end = gf_roof(local->offset + local->size, file->page_size); + + trav_offset = rounded_offset; + + while (trav_offset < rounded_end) { + fault = 0; + + ra_file_lock(file); + { + trav = ra_page_get(file, trav_offset); + if (!trav) { + trav = ra_page_create(file, trav_offset); + if (!trav) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto unlock; + } + fault = 1; + need_atime_update = 0; + } + trav->dirty = 0; + + if (trav->ready) { + gf_msg_trace(frame->this->name, 0, "HIT at offset=%" PRId64 ".", + trav_offset); + ra_frame_fill(trav, frame); + } else { + gf_msg_trace(frame->this->name, 0, + "IN-TRANSIT at " + "offset=%" PRId64 ".", + trav_offset); + ra_wait_on_page(trav, frame); + need_atime_update = 0; + } + } + unlock: + ra_file_unlock(file); + + if (local->op_ret == -1) { + goto out; + } + + if (fault) { + gf_msg_trace(frame->this->name, 0, "MISS at offset=%" PRId64 ".", + trav_offset); + ra_page_fault(file, frame, trav_offset); + } + + trav_offset += file->page_size; + } + + if (need_atime_update && conf->force_atime_update) { + /* TODO: use untimens() since readv() can confuse underlying + io-cache and others */ + ra_frame = copy_frame(frame); + if (ra_frame == NULL) { + goto out; + } - ret = fd_ctx_del (fd, this, &tmp_file); - - if (!ret) { - ra_file_destroy ((ra_file_t *)(long)tmp_file); - } + STACK_WIND(ra_frame, ra_need_atime_cbk, FIRST_CHILD(frame->this), + FIRST_CHILD(frame->this)->fops->readv, file->fd, 1, 1, 0, + NULL); + } - return 0; +out: + return; } +int +ra_readv_disabled_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iovec *vector, + int32_t count, struct iatt *stbuf, struct iobref *iobref, + dict_t *xdata) +{ + GF_ASSERT(frame); + + STACK_UNWIND_STRICT(readv, frame, op_ret, op_errno, vector, count, stbuf, + iobref, xdata); -void -read_ahead (call_frame_t *frame, ra_file_t *file) + return 0; +} + +int +ra_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata) { - off_t ra_offset = 0; - size_t ra_size = 0; - off_t trav_offset = 0; - ra_page_t *trav = NULL; - off_t cap = 0; - char fault = 0; + ra_file_t *file = NULL; + ra_local_t *local = NULL; + ra_conf_t *conf = NULL; + int op_errno = EINVAL; + char expected_offset = 1; + uint64_t tmp_file = 0; + + GF_ASSERT(frame); + GF_VALIDATE_OR_GOTO(frame->this->name, this, unwind); + GF_VALIDATE_OR_GOTO(frame->this->name, fd, unwind); + + conf = this->private; + + gf_msg_trace(this->name, 0, + "NEW REQ at offset=%" PRId64 " for size=%" GF_PRI_SIZET "", + offset, size); + + fd_ctx_get(fd, this, &tmp_file); + file = (ra_file_t *)(long)tmp_file; + + if (!file || file->disabled) { + goto disabled; + } + + if (file->offset != offset) { + gf_msg_trace(this->name, 0, + "unexpected offset (%" PRId64 " != %" PRId64 + ") " + "resetting", + file->offset, offset); + + expected_offset = file->expected = file->page_count = 0; + } else { + gf_msg_trace(this->name, 0, + "expected offset (%" PRId64 ") when page_count=%d", offset, + file->page_count); + + if (file->expected < (file->page_size * conf->page_count)) { + file->expected += size; + file->page_count = min((file->expected / file->page_size), + conf->page_count); + } + } - if (!file->page_count) - return; + if (!expected_offset) { + flush_region(frame, file, 0, file->pages.prev->offset + 1, 0); + } - ra_size = file->page_size * file->page_count; - ra_offset = floor (file->offset, file->page_size); - cap = file->size ? file->size : file->offset + ra_size; + local = mem_get0(this->local_pool); + if (!local) { + op_errno = ENOMEM; + goto unwind; + } - while (ra_offset < min (file->offset + ra_size, cap)) { + local->fd = fd; + local->offset = offset; + local->size = size; + local->wait_count = 1; - ra_file_lock (file); - { - trav = ra_page_get (file, ra_offset); - } - ra_file_unlock (file); + local->fill.next = &local->fill; + local->fill.prev = &local->fill; - if (!trav) - break; + pthread_mutex_init(&local->local_lock, NULL); - ra_offset += file->page_size; - } + frame->local = local; - if (trav) - /* comfortable enough */ - return; + dispatch_requests(frame, file); - trav_offset = ra_offset; + flush_region(frame, file, 0, gf_floor(offset, file->page_size), 0); - trav = file->pages.next; - cap = file->size ? file->size : ra_offset + ra_size; + read_ahead(frame, file); - while (trav_offset < min(ra_offset + ra_size, cap)) { - fault = 0; - ra_file_lock (file); - { - trav = ra_page_get (file, trav_offset); - if (!trav) { - fault = 1; - trav = ra_page_create (file, trav_offset); - if (trav) - trav->dirty = 1; - } - } - ra_file_unlock (file); + file->offset = offset + size; - if (!trav) { - /* OUT OF MEMORY */ - break; - } + ra_frame_return(frame); - if (fault) { - gf_log (frame->this->name, GF_LOG_TRACE, - "RA at offset=%"PRId64, trav_offset); - ra_page_fault (file, frame, trav_offset); - } - trav_offset += file->page_size; - } + return 0; - return; +unwind: + STACK_UNWIND_STRICT(readv, frame, -1, op_errno, NULL, 0, NULL, NULL, NULL); + + return 0; + +disabled: + STACK_WIND(frame, ra_readv_disabled_cbk, FIRST_CHILD(frame->this), + FIRST_CHILD(frame->this)->fops->readv, fd, size, offset, flags, + xdata); + return 0; } +int +ra_flush_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xdata) +{ + GF_ASSERT(frame); + STACK_UNWIND_STRICT(flush, frame, op_ret, op_errno, xdata); + return 0; +} int -ra_need_atime_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iovec *vector, - int32_t count, struct stat *stbuf, struct iobref *iobref) +ra_fsync_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf, + dict_t *xdata) { - STACK_DESTROY (frame->root); - return 0; + GF_ASSERT(frame); + STACK_UNWIND_STRICT(fsync, frame, op_ret, op_errno, prebuf, postbuf, xdata); + return 0; } +int +ra_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ + int32_t op_errno = EINVAL; -static void -dispatch_requests (call_frame_t *frame, ra_file_t *file) -{ - ra_local_t *local = NULL; - ra_conf_t *conf = NULL; - off_t rounded_offset = 0; - off_t rounded_end = 0; - off_t trav_offset = 0; - ra_page_t *trav = NULL; - call_frame_t *ra_frame = NULL; - char need_atime_update = 1; - char fault = 0; - - local = frame->local; - conf = file->conf; - - rounded_offset = floor (local->offset, file->page_size); - rounded_end = roof (local->offset + local->size, file->page_size); - - trav_offset = rounded_offset; - trav = file->pages.next; - - while (trav_offset < rounded_end) { - fault = 0; - - ra_file_lock (file); - { - trav = ra_page_get (file, trav_offset); - if (!trav) { - trav = ra_page_create (file, trav_offset); - fault = 1; - need_atime_update = 0; - } - - if (!trav) { - local->op_ret = -1; - local->op_errno = ENOMEM; - goto unlock; - } - - if (trav->ready) { - gf_log (frame->this->name, GF_LOG_TRACE, - "HIT at offset=%"PRId64".", - trav_offset); - ra_frame_fill (trav, frame); - } else { - gf_log (frame->this->name, GF_LOG_TRACE, - "IN-TRANSIT at offset=%"PRId64".", - trav_offset); - ra_wait_on_page (trav, frame); - need_atime_update = 0; - } - } - unlock: - ra_file_unlock (file); - - if (fault) { - gf_log (frame->this->name, GF_LOG_TRACE, - "MISS at offset=%"PRId64".", - trav_offset); - ra_page_fault (file, frame, trav_offset); - } - - trav_offset += file->page_size; - } - - if (need_atime_update && conf->force_atime_update) { - /* TODO: use untimens() since readv() can confuse underlying - io-cache and others */ - ra_frame = copy_frame (frame); - if (ra_frame == NULL) { - goto out; - } + GF_ASSERT(frame); + GF_VALIDATE_OR_GOTO(frame->this->name, this, unwind); + GF_VALIDATE_OR_GOTO(frame->this->name, fd, unwind); - STACK_WIND (ra_frame, ra_need_atime_cbk, - FIRST_CHILD (frame->this), - FIRST_CHILD (frame->this)->fops->readv, - file->fd, 1, 1); - } + STACK_WIND(frame, ra_flush_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->flush, fd, xdata); + return 0; -out: - return ; +unwind: + STACK_UNWIND_STRICT(flush, frame, -1, op_errno, NULL); + return 0; } - int -ra_readv_disabled_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iovec *vector, - int32_t count, struct stat *stbuf, struct iobref *iobref) +ra_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, + dict_t *xdata) { - STACK_UNWIND_STRICT (readv, frame, op_ret, op_errno, vector, count, - stbuf, iobref); - - return 0; -} + int32_t op_errno = EINVAL; + GF_ASSERT(frame); + GF_VALIDATE_OR_GOTO(frame->this->name, this, unwind); + GF_VALIDATE_OR_GOTO(frame->this->name, fd, unwind); -int -ra_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t offset) -{ - ra_file_t *file = NULL; - ra_local_t *local = NULL; - ra_conf_t *conf = NULL; - int op_errno = 0; - int ret = 0; - char expected_offset = 1; - uint64_t tmp_file = 0; - - conf = this->private; - - gf_log (this->name, GF_LOG_TRACE, - "NEW REQ at offset=%"PRId64" for size=%"GF_PRI_SIZET"", - offset, size); - - ret = fd_ctx_get (fd, this, &tmp_file); - file = (ra_file_t *)(long)tmp_file; - - if (file == NULL) { - op_errno = EBADF; - gf_log (this->name, GF_LOG_DEBUG, "readv received on fd with no" - " file set in its context"); - goto unwind; - } + STACK_WIND(frame, ra_fsync_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsync, fd, datasync, xdata); + return 0; - if (file->offset != offset) { - gf_log (this->name, GF_LOG_DEBUG, - "unexpected offset (%"PRId64" != %"PRId64") resetting", - file->offset, offset); +unwind: + STACK_UNWIND_STRICT(fsync, frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} - expected_offset = file->expected = file->page_count = 0; - } else { - gf_log (this->name, GF_LOG_TRACE, - "expected offset (%"PRId64") when page_count=%d", - offset, file->page_count); +int +ra_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf, + dict_t *xdata) +{ + ra_file_t *file = NULL; - if (file->expected < (conf->page_size * conf->page_count)) { - file->expected += size; - file->page_count = min ((file->expected / file->page_size), - conf->page_count); - } - } + GF_ASSERT(frame); - if (!expected_offset) { - flush_region (frame, file, 0, file->pages.prev->offset + 1); - } + file = frame->local; - if (file->disabled) { - STACK_WIND (frame, ra_readv_disabled_cbk, - FIRST_CHILD (frame->this), - FIRST_CHILD (frame->this)->fops->readv, - file->fd, size, offset); - return 0; - } + if (file) { + flush_region(frame, file, 0, file->pages.prev->offset + 1, 1); + } - local = (void *) CALLOC (1, sizeof (*local)); - if (!local) { - gf_log (this->name, GF_LOG_ERROR, - "out of memory"); - op_errno = ENOMEM; - goto unwind; - } + frame->local = NULL; + STACK_UNWIND_STRICT(writev, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + return 0; +} - local->fd = fd; - local->offset = offset; - local->size = size; - local->wait_count = 1; +int +ra_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, + int32_t count, off_t offset, uint32_t flags, struct iobref *iobref, + dict_t *xdata) +{ + ra_file_t *file = NULL; + uint64_t tmp_file = 0; + int32_t op_errno = EINVAL; + inode_t *inode = NULL; + fd_t *iter_fd = NULL; - local->fill.next = &local->fill; - local->fill.prev = &local->fill; + GF_ASSERT(frame); + GF_VALIDATE_OR_GOTO(frame->this->name, this, unwind); + GF_VALIDATE_OR_GOTO(frame->this->name, fd, unwind); - pthread_mutex_init (&local->local_lock, NULL); + inode = fd->inode; - frame->local = local; + LOCK(&inode->lock); + { + list_for_each_entry(iter_fd, &inode->fd_list, inode_list) + { + tmp_file = 0; + fd_ctx_get(iter_fd, this, &tmp_file); + file = (ra_file_t *)(long)tmp_file; - dispatch_requests (frame, file); + if (!file) + continue; - flush_region (frame, file, 0, floor (offset, file->page_size)); + if (iter_fd == fd) + frame->local = file; - read_ahead (frame, file); + flush_region(frame, file, 0, file->pages.prev->offset + 1, 1); - ra_frame_return (frame); + /* reset the read-ahead counters too */ + file->expected = file->page_count = 0; + } + } + UNLOCK(&inode->lock); - file->offset = offset + size; + STACK_WIND(frame, ra_writev_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->writev, fd, vector, count, offset, + flags, iobref, xdata); - return 0; + return 0; unwind: - STACK_UNWIND_STRICT (readv, frame, -1, op_errno, NULL, 0, NULL, NULL); - - return 0; + STACK_UNWIND_STRICT(writev, frame, -1, op_errno, NULL, NULL, NULL); + return 0; } - int -ra_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, - int32_t op_errno) +ra_truncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - STACK_UNWIND_STRICT (flush, frame, op_ret, op_errno); - return 0; -} - + GF_ASSERT(frame); + STACK_UNWIND_STRICT(truncate, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + return 0; +} int -ra_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, - int32_t op_errno, struct stat *prebuf, struct stat *postbuf) +ra_attr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *buf, dict_t *xdata) { - STACK_UNWIND_STRICT (fsync, frame, op_ret, op_errno, prebuf, postbuf); - return 0; -} + GF_ASSERT(frame); + STACK_UNWIND_STRICT(stat, frame, op_ret, op_errno, buf, xdata); + return 0; +} int -ra_flush (call_frame_t *frame, xlator_t *this, fd_t *fd) -{ - ra_file_t *file = NULL; - int ret = 0; - uint64_t tmp_file = 0; - int32_t op_errno = 0; - - ret = fd_ctx_get (fd, this, &tmp_file); - file = (ra_file_t *)(long)tmp_file; - if (file == NULL) { - op_errno = EBADF; - gf_log (this->name, GF_LOG_DEBUG, "flush received on fd with no" - " file set in its context"); - goto unwind; +ra_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + dict_t *xdata) +{ + ra_file_t *file = NULL; + fd_t *iter_fd = NULL; + inode_t *inode = NULL; + uint64_t tmp_file = 0; + int32_t op_errno = EINVAL; + + GF_ASSERT(frame); + GF_VALIDATE_OR_GOTO(frame->this->name, this, unwind); + GF_VALIDATE_OR_GOTO(frame->this->name, loc, unwind); + + inode = loc->inode; + + LOCK(&inode->lock); + { + list_for_each_entry(iter_fd, &inode->fd_list, inode_list) + { + tmp_file = 0; + fd_ctx_get(iter_fd, this, &tmp_file); + file = (ra_file_t *)(long)tmp_file; + + if (!file) + continue; + /* + * Truncation invalidates reads just like writing does. + * TBD: this seems to flush more than it should. The + * only time we should flush at all is when we're + * shortening (not lengthening) the file, and then only + * from new EOF to old EOF. The same problem exists in + * ra_ftruncate. + */ + flush_region(frame, file, 0, file->pages.prev->offset + 1, 1); } + } + UNLOCK(&inode->lock); - flush_region (frame, file, 0, file->pages.prev->offset+1); - - STACK_WIND (frame, ra_flush_cbk, - FIRST_CHILD (this), - FIRST_CHILD (this)->fops->flush, - fd); - return 0; + STACK_WIND(frame, ra_truncate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->truncate, loc, offset, xdata); + return 0; unwind: - STACK_UNWIND_STRICT (flush, frame, -1, op_errno); - return 0; + STACK_UNWIND_STRICT(truncate, frame, -1, op_errno, NULL, NULL, NULL); + return 0; } +void +ra_page_dump(struct ra_page *page) +{ + int i = 0; + call_frame_t *frame = NULL; + char key[GF_DUMP_MAX_BUF_LEN] = { + 0, + }; + ra_waitq_t *trav = NULL; -int -ra_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync) -{ - ra_file_t *file = NULL; - int ret = 0; - uint64_t tmp_file = 0; - int32_t op_errno = 0; - - ret = fd_ctx_get (fd, this, &tmp_file); - file = (ra_file_t *)(long)tmp_file; - if (file == NULL) { - op_errno = EBADF; - gf_log (this->name, GF_LOG_DEBUG, "fsync received on fd with no" - " file set in its context"); - goto unwind; - } + if (page == NULL) { + goto out; + } - if (file) { - flush_region (frame, file, 0, file->pages.prev->offset+1); - } + gf_proc_dump_write("offset", "%" PRId64, page->offset); - STACK_WIND (frame, ra_fsync_cbk, - FIRST_CHILD (this), - FIRST_CHILD (this)->fops->fsync, - fd, datasync); - return 0; + gf_proc_dump_write("size", "%" GF_PRI_SIZET, page->size); -unwind: - STACK_UNWIND_STRICT (fsync, frame, -1, op_errno, NULL, NULL); - return 0; -} + gf_proc_dump_write("dirty", "%s", page->dirty ? "yes" : "no"); + gf_proc_dump_write("poisoned", "%s", page->poisoned ? "yes" : "no"); -int -ra_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct stat *prebuf, - struct stat *postbuf) + gf_proc_dump_write("ready", "%s", page->ready ? "yes" : "no"); + + for (trav = page->waitq; trav; trav = trav->next) { + frame = trav->data; + sprintf(key, "waiting-frame[%d]", i++); + gf_proc_dump_write(key, "%" PRId64, frame->root->unique); + } + +out: + return; +} + +int32_t +ra_fdctx_dump(xlator_t *this, fd_t *fd) { - fd_t *fd = NULL; - ra_file_t *file = NULL; - int ret = 0; - uint64_t tmp_file = 0; + ra_file_t *file = NULL; + ra_page_t *page = NULL; + int32_t ret = 0, i = 0; + uint64_t tmp_file = 0; + char *path = NULL; + char key_prefix[GF_DUMP_MAX_BUF_LEN] = { + 0, + }; + + fd_ctx_get(fd, this, &tmp_file); + file = (ra_file_t *)(long)tmp_file; + + if (file == NULL) { + ret = 0; + goto out; + } - fd = frame->local; + gf_proc_dump_build_key(key_prefix, "xlator.performance.read-ahead", "file"); - ret = fd_ctx_get (fd, this, &tmp_file); - file = (ra_file_t *)(long)tmp_file; + gf_proc_dump_add_section("%s", key_prefix); - flush_region (frame, file, 0, file->pages.prev->offset+1); + ret = __inode_path(fd->inode, NULL, &path); + if (path != NULL) { + gf_proc_dump_write("path", "%s", path); + GF_FREE(path); + } - frame->local = NULL; - STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, prebuf, postbuf); - return 0; -} + gf_proc_dump_write("fd", "%p", fd); + gf_proc_dump_write("disabled", "%s", file->disabled ? "yes" : "no"); -int -ra_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, - int32_t count, off_t offset, struct iobref *iobref) -{ - ra_file_t *file = NULL; - int ret = 0; - uint64_t tmp_file = 0; - int32_t op_errno = 0; - - ret = fd_ctx_get (fd, this, &tmp_file); - file = (ra_file_t *)(long)tmp_file; - if (file == NULL) { - op_errno = EBADF; - gf_log (this->name, GF_LOG_DEBUG, "writev received on fd with" - "no file set in its context"); - goto unwind; - } + if (file->disabled) { + ret = 0; + goto out; + } - flush_region (frame, file, 0, file->pages.prev->offset+1); + gf_proc_dump_write("page-size", "%" PRId64, file->page_size); - /* reset the read-ahead counters too */ - file->expected = file->page_count = 0; + gf_proc_dump_write("page-count", "%u", file->page_count); - frame->local = fd; + gf_proc_dump_write("next-expected-offset-for-sequential-reads", "%" PRId64, + file->offset); - STACK_WIND (frame, ra_writev_cbk, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->writev, - fd, vector, count, offset, iobref); + for (page = file->pages.next; page != &file->pages; page = page->next) { + gf_proc_dump_write("page", "%d: %p", i++, (void *)page); + ra_page_dump(page); + } - return 0; + ret = 0; +out: + return ret; +} + +int +ra_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ + ra_file_t *file = NULL; + fd_t *iter_fd = NULL; + inode_t *inode = NULL; + uint64_t tmp_file = 0; + int32_t op_errno = EINVAL; + ra_conf_t *conf = NULL; + + conf = this->private; + + GF_ASSERT(frame); + GF_VALIDATE_OR_GOTO(frame->this->name, this, unwind); + GF_VALIDATE_OR_GOTO(frame->this->name, fd, unwind); + + inode = fd->inode; + + if (conf->force_atime_update) { + LOCK(&inode->lock); + { + list_for_each_entry(iter_fd, &inode->fd_list, inode_list) + { + tmp_file = 0; + fd_ctx_get(iter_fd, this, &tmp_file); + file = (ra_file_t *)(long)tmp_file; + + if (!file) + continue; + flush_region(frame, file, 0, file->pages.prev->offset + 1, 0); + } + } + UNLOCK(&inode->lock); + } + + STACK_WIND(frame, ra_attr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fstat, fd, xdata); + return 0; unwind: - STACK_UNWIND_STRICT (writev, frame, -1, op_errno, NULL, NULL); - return 0; + STACK_UNWIND_STRICT(stat, frame, -1, op_errno, NULL, NULL); + return 0; } +int +ra_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata) +{ + ra_file_t *file = NULL; + fd_t *iter_fd = NULL; + inode_t *inode = NULL; + uint64_t tmp_file = 0; + int32_t op_errno = EINVAL; + + GF_ASSERT(frame); + GF_VALIDATE_OR_GOTO(frame->this->name, this, unwind); + GF_VALIDATE_OR_GOTO(frame->this->name, fd, unwind); + + inode = fd->inode; + + LOCK(&inode->lock); + { + list_for_each_entry(iter_fd, &inode->fd_list, inode_list) + { + tmp_file = 0; + fd_ctx_get(iter_fd, this, &tmp_file); + file = (ra_file_t *)(long)tmp_file; + if (!file) + continue; + /* + * Truncation invalidates reads just like writing does. + * TBD: this seems to flush more than it should. The + * only time we should flush at all is when we're + * shortening (not lengthening) the file, and then only + * from new EOF to old EOF. The same problem exists in + * ra_truncate. + */ + flush_region(frame, file, 0, file->pages.prev->offset + 1, 1); + } + } + UNLOCK(&inode->lock); + + STACK_WIND(frame, ra_truncate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata); + return 0; + +unwind: + STACK_UNWIND_STRICT(truncate, frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} int -ra_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct stat *prebuf, - struct stat *postbuf) +ra_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - STACK_UNWIND_STRICT (truncate, frame, op_ret, op_errno, prebuf, - postbuf); - return 0; + GF_ASSERT(frame); + + STACK_UNWIND_STRICT(discard, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + return 0; } +static int +ra_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + ra_file_t *file = NULL; + fd_t *iter_fd = NULL; + inode_t *inode = NULL; + uint64_t tmp_file = 0; + int32_t op_errno = EINVAL; + + GF_ASSERT(frame); + GF_VALIDATE_OR_GOTO(frame->this->name, this, unwind); + GF_VALIDATE_OR_GOTO(frame->this->name, fd, unwind); + + inode = fd->inode; + + LOCK(&inode->lock); + { + list_for_each_entry(iter_fd, &inode->fd_list, inode_list) + { + tmp_file = 0; + fd_ctx_get(iter_fd, this, &tmp_file); + file = (ra_file_t *)(long)tmp_file; + if (!file) + continue; + + flush_region(frame, file, offset, len, 1); + } + } + UNLOCK(&inode->lock); + + STACK_WIND(frame, ra_discard_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->discard, fd, offset, len, xdata); + return 0; + +unwind: + STACK_UNWIND_STRICT(discard, frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} int -ra_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct stat *buf) +ra_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - STACK_UNWIND_STRICT (stat, frame, op_ret, op_errno, buf); - return 0; + GF_ASSERT(frame); + + STACK_UNWIND_STRICT(zerofill, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + return 0; } +static int +ra_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + off_t len, dict_t *xdata) +{ + ra_file_t *file = NULL; + fd_t *iter_fd = NULL; + inode_t *inode = NULL; + uint64_t tmp_file = 0; + int32_t op_errno = EINVAL; + + GF_ASSERT(frame); + GF_VALIDATE_OR_GOTO(frame->this->name, this, unwind); + GF_VALIDATE_OR_GOTO(frame->this->name, fd, unwind); + + inode = fd->inode; + + LOCK(&inode->lock); + { + list_for_each_entry(iter_fd, &inode->fd_list, inode_list) + { + tmp_file = 0; + fd_ctx_get(iter_fd, this, &tmp_file); + file = (ra_file_t *)(long)tmp_file; + if (!file) + continue; + + flush_region(frame, file, offset, len, 1); + } + } + UNLOCK(&inode->lock); + + STACK_WIND(frame, ra_zerofill_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->zerofill, fd, offset, len, xdata); + return 0; + +unwind: + STACK_UNWIND_STRICT(zerofill, frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} int -ra_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset) +ra_priv_dump(xlator_t *this) +{ + ra_conf_t *conf = NULL; + int ret = -1; + char key_prefix[GF_DUMP_MAX_BUF_LEN] = { + 0, + }; + + if (!this) { + goto out; + } + + conf = this->private; + if (!conf) { + gf_msg(this->name, GF_LOG_WARNING, 0, READ_AHEAD_MSG_XLATOR_CONF_NULL, + "conf null in xlator"); + goto out; + } + + gf_proc_dump_build_key(key_prefix, "xlator.performance.read-ahead", "priv"); + + gf_proc_dump_add_section("%s", key_prefix); + + ret = pthread_mutex_trylock(&conf->conf_lock); + if (ret) + goto out; + { + gf_proc_dump_write("page_size", "%" PRIu64, conf->page_size); + gf_proc_dump_write("page_count", "%d", conf->page_count); + gf_proc_dump_write("force_atime_update", "%d", + conf->force_atime_update); + } + pthread_mutex_unlock(&conf->conf_lock); + + ret = 0; +out: + if (ret && conf) { + gf_proc_dump_write("Unable to dump priv", + "(Lock acquisition failed) %s", this->name); + } + return ret; +} + +int32_t +mem_acct_init(xlator_t *this) { - ra_file_t *file = NULL; - fd_t *iter_fd = NULL; - inode_t *inode = NULL; - int ret = 0; - uint64_t tmp_file = 0; + int ret = -1; - inode = loc->inode; + if (!this) { + goto out; + } - LOCK (&inode->lock); - { - list_for_each_entry (iter_fd, &inode->fd_list, inode_list) { - ret = fd_ctx_get (iter_fd, this, &tmp_file); - file = (ra_file_t *)(long)tmp_file; + ret = xlator_mem_acct_init(this, gf_ra_mt_end + 1); - if (!file) - continue; - flush_region (frame, file, 0, - file->pages.prev->offset + 1); - } - } - UNLOCK (&inode->lock); + if (ret != 0) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, READ_AHEAD_MSG_NO_MEMORY, + "Memory accounting init" + "failed"); + } - STACK_WIND (frame, ra_truncate_cbk, - FIRST_CHILD (this), - FIRST_CHILD (this)->fops->truncate, - loc, offset); - return 0; +out: + return ret; } - int -ra_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd) +reconfigure(xlator_t *this, dict_t *options) { - ra_file_t *file = NULL; - fd_t *iter_fd = NULL; - inode_t *inode = NULL; - int ret = 0; - uint64_t tmp_file = 0; + ra_conf_t *conf = NULL; + int ret = -1; - inode = fd->inode; + GF_VALIDATE_OR_GOTO("read-ahead", this, out); + GF_VALIDATE_OR_GOTO("read-ahead", this->private, out); - LOCK (&inode->lock); - { - list_for_each_entry (iter_fd, &inode->fd_list, inode_list) { - ret = fd_ctx_get (iter_fd, this, &tmp_file); - file = (ra_file_t *)(long)tmp_file; + conf = this->private; - if (!file) - continue; - flush_region (frame, file, 0, - file->pages.prev->offset + 1); - } - } - UNLOCK (&inode->lock); + GF_OPTION_RECONF("page-count", conf->page_count, options, uint32, out); - STACK_WIND (frame, ra_attr_cbk, - FIRST_CHILD (this), - FIRST_CHILD (this)->fops->fstat, - fd); - return 0; -} + GF_OPTION_RECONF("page-size", conf->page_size, options, size_uint64, out); + GF_OPTION_RECONF("pass-through", this->pass_through, options, bool, out); -int -ra_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset) -{ - ra_file_t *file = NULL; - fd_t *iter_fd = NULL; - inode_t *inode = NULL; - int ret = 0; - uint64_t tmp_file = 0; - - inode = fd->inode; - - LOCK (&inode->lock); - { - list_for_each_entry (iter_fd, &inode->fd_list, inode_list) { - ret = fd_ctx_get (iter_fd, this, &tmp_file); - file = (ra_file_t *)(long)tmp_file; - if (!file) - continue; - flush_region (frame, file, 0, - file->pages.prev->offset + 1); - } - } - UNLOCK (&inode->lock); - - STACK_WIND (frame, ra_truncate_cbk, - FIRST_CHILD (this), - FIRST_CHILD (this)->fops->ftruncate, - fd, offset); - return 0; + ret = 0; +out: + return ret; } int -ra_priv_dump (xlator_t *this) +init(xlator_t *this) { - ra_conf_t *conf = NULL; - int ret = -1; - char key[GF_DUMP_MAX_BUF_LEN]; - char key_prefix[GF_DUMP_MAX_BUF_LEN]; + ra_conf_t *conf = NULL; + int32_t ret = -1; - if (!this) - return -1; + GF_VALIDATE_OR_GOTO("read-ahead", this, out); - conf = this->private; - if (!conf) { - gf_log (this->name, GF_LOG_WARNING, - "conf null in xlator"); - return -1; - } + if (!this->children || this->children->next) { + gf_msg(this->name, GF_LOG_ERROR, 0, + READ_AHEAD_MSG_XLATOR_CHILD_MISCONFIGURED, + "FATAL: read-ahead not configured with exactly one" + " child"); + goto out; + } - ret = pthread_mutex_trylock (&conf->conf_lock); - if (ret) { - gf_log ("", GF_LOG_WARNING, "Unable to lock client %s" - " errno: %d", this->name, errno); - return -1; - } + if (!this->parents) { + gf_msg(this->name, GF_LOG_WARNING, 0, READ_AHEAD_MSG_VOL_MISCONFIGURED, + "dangling volume. check volfile "); + } + conf = (void *)GF_CALLOC(1, sizeof(*conf), gf_ra_mt_ra_conf_t); + if (conf == NULL) { + goto out; + } - gf_proc_dump_build_key (key_prefix, - "xlator.performance.read-ahead", - "priv"); + conf->page_size = this->ctx->page_size; - gf_proc_dump_add_section (key_prefix); - gf_proc_dump_build_key (key, key_prefix, "page_size"); - gf_proc_dump_write (key, "%d", conf->page_size); - gf_proc_dump_build_key (key, key_prefix, "page_count"); - gf_proc_dump_write (key, "%d", conf->page_count); - gf_proc_dump_build_key (key, key_prefix, "force_atime_update"); - gf_proc_dump_write (key, "%d", conf->force_atime_update); + GF_OPTION_INIT("page-size", conf->page_size, size_uint64, out); - pthread_mutex_unlock (&conf->conf_lock); + GF_OPTION_INIT("page-count", conf->page_count, uint32, out); - return 0; -} + GF_OPTION_INIT("force-atime-update", conf->force_atime_update, bool, out); -int -init (xlator_t *this) -{ - ra_conf_t *conf = NULL; - dict_t *options = this->options; - char *page_count_string = NULL; - int32_t ret = -1; - - if (!this->children || this->children->next) { - gf_log (this->name, GF_LOG_ERROR, - "FATAL: read-ahead not configured with exactly one" - " child"); - goto out; - } - - if (!this->parents) { - gf_log (this->name, GF_LOG_WARNING, - "dangling volume. check volfile "); - } - - conf = (void *) CALLOC (1, sizeof (*conf)); - if (conf == NULL) { - gf_log (this->name, GF_LOG_ERROR, - "FATAL: Out of memory"); - goto out; - } + GF_OPTION_INIT("pass-through", this->pass_through, bool, out); - conf->page_size = this->ctx->page_size; - conf->page_count = 4; - - if (dict_get (options, "page-count")) - page_count_string = data_to_str (dict_get (options, - "page-count")); - if (page_count_string) - { - if (gf_string2uint_base10 (page_count_string, &conf->page_count) - != 0) - { - gf_log ("read-ahead", - GF_LOG_ERROR, - "invalid number format \"%s\" of \"option " - "page-count\"", - page_count_string); - goto out; - } - gf_log (this->name, GF_LOG_DEBUG, "Using conf->page_count = %u", - conf->page_count); - } - - if (dict_get (options, "force-atime-update")) { - char *force_atime_update_str = data_to_str (dict_get (options, - "force-atime-update")); - if (gf_string2boolean (force_atime_update_str, - &conf->force_atime_update) == -1) { - gf_log (this->name, GF_LOG_ERROR, - "'force-atime-update' takes only boolean " - "options"); - goto out; - } - if (conf->force_atime_update) - gf_log (this->name, GF_LOG_DEBUG, "Forcing atime " - "updates on cache hit"); - } - - conf->files.next = &conf->files; - conf->files.prev = &conf->files; - - pthread_mutex_init (&conf->conf_lock, NULL); - this->private = conf; - ret = 0; + conf->files.next = &conf->files; + conf->files.prev = &conf->files; + + pthread_mutex_init(&conf->conf_lock, NULL); + + this->local_pool = mem_pool_new(ra_local_t, 64); + if (!this->local_pool) { + ret = -1; + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, READ_AHEAD_MSG_NO_MEMORY, + "failed to create local_t's memory pool"); + goto out; + } + + this->private = conf; + ret = 0; out: - if (ret == -1) { - if (conf != NULL) { - FREE (conf); - } - } + if (ret == -1) { + GF_FREE(conf); + } - return ret; + return ret; } void -fini (xlator_t *this) +fini(xlator_t *this) { - ra_conf_t *conf = this->private; + ra_conf_t *conf = NULL; + + GF_VALIDATE_OR_GOTO("read-ahead", this, out); + + conf = this->private; + if (conf == NULL) { + goto out; + } + + this->private = NULL; - if (conf == NULL) - return; + /* The files structures allocated in open and create are not deleted. + * until that is freed, marking the below assert as warning. + GF_ASSERT ((conf->files.next == &conf->files) + && (conf->files.prev == &conf->files)); + */ + if (!((conf->files.next == &conf->files) && + (conf->files.prev == &conf->files))) { + gf_msg(this->name, GF_LOG_INFO, 0, + READ_AHEAD_MSG_UNDESTROYED_FILE_FOUND, + "undestroyed read ahead file structures found"); + } - pthread_mutex_destroy (&conf->conf_lock); - FREE (conf); + pthread_mutex_destroy(&conf->conf_lock); + GF_FREE(conf); - this->private = NULL; - return; +out: + return; } struct xlator_fops fops = { - .open = ra_open, - .create = ra_create, - .readv = ra_readv, - .writev = ra_writev, - .flush = ra_flush, - .fsync = ra_fsync, - .truncate = ra_truncate, - .ftruncate = ra_ftruncate, - .fstat = ra_fstat, -}; - -struct xlator_mops mops = { + .open = ra_open, + .create = ra_create, + .readv = ra_readv, + .writev = ra_writev, + .flush = ra_flush, + .fsync = ra_fsync, + .truncate = ra_truncate, + .ftruncate = ra_ftruncate, + .fstat = ra_fstat, + .discard = ra_discard, + .zerofill = ra_zerofill, }; struct xlator_cbks cbks = { - .release = ra_release, + .release = ra_release, }; struct xlator_dumpops dumpops = { - .priv = ra_priv_dump, + .priv = ra_priv_dump, + .fdctx = ra_fdctx_dump, }; struct volume_options options[] = { - { .key = {"force-atime-update"}, - .type = GF_OPTION_TYPE_BOOL - }, - { .key = {"page-count"}, - .type = GF_OPTION_TYPE_INT, - .min = 1, - .max = 16 - }, - { .key = {NULL} }, + { + .key = {"read-ahead"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "enable/disable read-ahead", + .op_version = {GD_OP_VERSION_6_0}, + .flags = OPT_FLAG_SETTABLE, + }, + {.key = {"force-atime-update"}, + .type = GF_OPTION_TYPE_BOOL, + .op_version = {1}, + .tags = {"read-ahead"}, + .default_value = "false"}, + {.key = {"page-count"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = 16, + .default_value = "4", + .op_version = {1}, + .tags = {"read-ahead"}, + .description = "Number of pages that will be pre-fetched"}, + {.key = {"page-size"}, + .type = GF_OPTION_TYPE_SIZET, + .min = 4096, + .max = 1048576 * 64, + .default_value = "131072", + .op_version = {1}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC | OPT_FLAG_CLIENT_OPT, + .tags = {"read-ahead"}, + .description = "Page size with which read-ahead performs server I/O"}, + {.key = {"pass-through"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "false", + .op_version = {GD_OP_VERSION_4_1_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC | OPT_FLAG_CLIENT_OPT, + .tags = {"read-ahead"}, + .description = "Enable/Disable read ahead translator"}, + {.key = {NULL}}, +}; + +xlator_api_t xlator_api = { + .init = init, + .fini = fini, + .reconfigure = reconfigure, + .mem_acct_init = mem_acct_init, + .op_version = {1}, /* Present from the initial version */ + .dumpops = &dumpops, + .fops = &fops, + .cbks = &cbks, + .options = options, + .identifier = "read-ahead", + .category = GF_MAINTAINED, }; diff --git a/xlators/performance/read-ahead/src/read-ahead.h b/xlators/performance/read-ahead/src/read-ahead.h index d11143551f0..e9432fb47cc 100644 --- a/xlators/performance/read-ahead/src/read-ahead.h +++ b/xlators/performance/read-ahead/src/read-ahead.h @@ -1,36 +1,22 @@ /* - Copyright (c) 2006-2009 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef __READ_AHEAD_H #define __READ_AHEAD_H -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - - -#include "glusterfs.h" -#include "logging.h" -#include "dict.h" -#include "xlator.h" -#include "common-utils.h" +#include <glusterfs/glusterfs.h> +#include <glusterfs/logging.h> +#include <glusterfs/dict.h> +#include <glusterfs/xlator.h> +#include <glusterfs/common-utils.h> +#include "read-ahead-mem-types.h" struct ra_conf; struct ra_local; @@ -38,82 +24,77 @@ struct ra_page; struct ra_file; struct ra_waitq; - struct ra_waitq { - struct ra_waitq *next; - void *data; + struct ra_waitq *next; + void *data; }; - struct ra_fill { - struct ra_fill *next; - struct ra_fill *prev; - off_t offset; - size_t size; - struct iovec *vector; - int32_t count; - struct iobref *iobref; + struct ra_fill *next; + struct ra_fill *prev; + off_t offset; + size_t size; + struct iovec *vector; + int32_t count; + struct iobref *iobref; }; - struct ra_local { - mode_t mode; - struct ra_fill fill; - off_t offset; - size_t size; - int32_t op_ret; - int32_t op_errno; - off_t pending_offset; - size_t pending_size; - fd_t *fd; - int32_t wait_count; - pthread_mutex_t local_lock; + mode_t mode; + struct ra_fill fill; + off_t offset; + size_t size; + int32_t op_ret; + int32_t op_errno; + off_t pending_offset; + size_t pending_size; + fd_t *fd; + int32_t wait_count; + pthread_mutex_t local_lock; }; - struct ra_page { - struct ra_page *next; - struct ra_page *prev; - struct ra_file *file; - char dirty; - char ready; - struct iovec *vector; - int32_t count; - off_t offset; - size_t size; - struct ra_waitq *waitq; - struct iobref *iobref; + struct ra_page *next; + struct ra_page *prev; + struct ra_file *file; + char dirty; /* Internal request, not from user. */ + char poisoned; /* Pending read invalidated by write. */ + char ready; + struct iovec *vector; + int32_t count; + off_t offset; + size_t size; + struct ra_waitq *waitq; + struct iobref *iobref; + char stale; }; - struct ra_file { - struct ra_file *next; - struct ra_file *prev; - struct ra_conf *conf; - fd_t *fd; - int disabled; - size_t expected; - struct ra_page pages; - off_t offset; - size_t size; - int32_t refcount; - pthread_mutex_t file_lock; - struct stat stbuf; - uint64_t page_size; - uint32_t page_count; + struct ra_file *next; + struct ra_file *prev; + struct ra_conf *conf; + fd_t *fd; + int disabled; + size_t expected; + struct ra_page pages; + off_t offset; + size_t size; + int32_t refcount; + pthread_mutex_t file_lock; + struct iatt stbuf; + uint64_t page_size; + uint32_t page_count; }; - struct ra_conf { - uint64_t page_size; - uint32_t page_count; - void *cache_block; - struct ra_file files; - gf_boolean_t force_atime_update; - pthread_mutex_t conf_lock; + uint64_t page_size; + uint32_t page_count; + void *cache_block; + struct ra_file files; + gf_boolean_t force_atime_update; + pthread_mutex_t conf_lock; }; - typedef struct ra_conf ra_conf_t; typedef struct ra_local ra_local_t; typedef struct ra_page ra_page_t; @@ -122,77 +103,69 @@ typedef struct ra_waitq ra_waitq_t; typedef struct ra_fill ra_fill_t; ra_page_t * -ra_page_get (ra_file_t *file, - off_t offset); +ra_page_get(ra_file_t *file, off_t offset); ra_page_t * -ra_page_create (ra_file_t *file, - off_t offset); +ra_page_create(ra_file_t *file, off_t offset); void -ra_page_fault (ra_file_t *file, - call_frame_t *frame, - off_t offset); +ra_page_fault(ra_file_t *file, call_frame_t *frame, off_t offset); void -ra_wait_on_page (ra_page_t *page, - call_frame_t *frame); +ra_wait_on_page(ra_page_t *page, call_frame_t *frame); ra_waitq_t * -ra_page_wakeup (ra_page_t *page); +ra_page_wakeup(ra_page_t *page); void -ra_page_flush (ra_page_t *page); +ra_page_flush(ra_page_t *page); ra_waitq_t * -ra_page_error (ra_page_t *page, - int32_t op_ret, - int32_t op_errno); +ra_page_error(ra_page_t *page, int32_t op_ret, int32_t op_errno); void -ra_page_purge (ra_page_t *page); +ra_page_purge(ra_page_t *page); void -ra_frame_return (call_frame_t *frame); +ra_frame_return(call_frame_t *frame); void -ra_frame_fill (ra_page_t *page, - call_frame_t *frame); +ra_frame_fill(ra_page_t *page, call_frame_t *frame); void -ra_file_destroy (ra_file_t *file); +ra_file_destroy(ra_file_t *file); static inline void -ra_file_lock (ra_file_t *file) +ra_file_lock(ra_file_t *file) { - pthread_mutex_lock (&file->file_lock); + pthread_mutex_lock(&file->file_lock); } static inline void -ra_file_unlock (ra_file_t *file) +ra_file_unlock(ra_file_t *file) { - pthread_mutex_unlock (&file->file_lock); + pthread_mutex_unlock(&file->file_lock); } static inline void -ra_conf_lock (ra_conf_t *conf) +ra_conf_lock(ra_conf_t *conf) { - pthread_mutex_lock (&conf->conf_lock); + pthread_mutex_lock(&conf->conf_lock); } static inline void -ra_conf_unlock (ra_conf_t *conf) +ra_conf_unlock(ra_conf_t *conf) { - pthread_mutex_unlock (&conf->conf_lock); + pthread_mutex_unlock(&conf->conf_lock); } static inline void -ra_local_lock (ra_local_t *local) +ra_local_lock(ra_local_t *local) { - pthread_mutex_lock (&local->local_lock); + pthread_mutex_lock(&local->local_lock); } static inline void -ra_local_unlock (ra_local_t *local) +ra_local_unlock(ra_local_t *local) { - pthread_mutex_unlock (&local->local_lock); + pthread_mutex_unlock(&local->local_lock); } #endif /* __READ_AHEAD_H */ |
