diff options
Diffstat (limited to 'xlators/performance/read-ahead/src')
| -rw-r--r-- | xlators/performance/read-ahead/src/Makefile.am | 14 | ||||
| -rw-r--r-- | xlators/performance/read-ahead/src/page.c | 487 | ||||
| -rw-r--r-- | xlators/performance/read-ahead/src/read-ahead.c | 890 | ||||
| -rw-r--r-- | xlators/performance/read-ahead/src/read-ahead.h | 194 | 
4 files changed, 1585 insertions, 0 deletions
diff --git a/xlators/performance/read-ahead/src/Makefile.am b/xlators/performance/read-ahead/src/Makefile.am new file mode 100644 index 00000000000..7bb90228227 --- /dev/null +++ b/xlators/performance/read-ahead/src/Makefile.am @@ -0,0 +1,14 @@ +xlator_LTLIBRARIES = read-ahead.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance + +read_ahead_la_LDFLAGS = -module -avoidversion + +read_ahead_la_SOURCES = read-ahead.c page.c +read_ahead_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = read-ahead.h + +AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS)\ +	-I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) + +CLEANFILES =  diff --git a/xlators/performance/read-ahead/src/page.c b/xlators/performance/read-ahead/src/page.c new file mode 100644 index 00000000000..3b8d4d2093e --- /dev/null +++ b/xlators/performance/read-ahead/src/page.c @@ -0,0 +1,487 @@ +/* +  Copyright (c) 2006, 2007, 2008, 2009 Z RESEARCH, Inc. <http://www.zresearch.com> +  This file is part of GlusterFS. + +  GlusterFS is free software; you can redistribute it and/or modify +  it under the terms of the GNU General Public License as published +  by the Free Software Foundation; either version 3 of the License, +  or (at your option) any later version. + +  GlusterFS is distributed in the hope that it will be useful, but +  WITHOUT ANY WARRANTY; without even the implied warranty of +  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU +  General Public License for more details. + +  You should have received a copy of the GNU General Public License +  along with this program.  If not, see +  <http://www.gnu.org/licenses/>. +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "logging.h" +#include "dict.h" +#include "xlator.h" +#include "read-ahead.h" +#include <assert.h> + + +ra_page_t * +ra_page_get (ra_file_t *file, +	     off_t offset) +{ +	ra_page_t *page = NULL; +	off_t      rounded_offset = 0; + +	page = file->pages.next; +	rounded_offset = floor (offset, file->page_size); + +	while (page != &file->pages && page->offset < rounded_offset) +		page = page->next; + +	if (page == &file->pages || page->offset != rounded_offset) +		page = NULL; + +	return page; +} + + +ra_page_t * +ra_page_create (ra_file_t *file, off_t offset) +{ +	ra_page_t *page      = NULL; +	off_t      rounded_offset = 0; +	ra_page_t *newpage   = NULL; + +	page           = file->pages.next; +	rounded_offset = floor (offset, file->page_size); + +	while (page != &file->pages && page->offset < rounded_offset) +		page = page->next; + +	if (page == &file->pages || page->offset != rounded_offset) { +		newpage = CALLOC (1, sizeof (*newpage)); +		if (!newpage) +			return NULL; + +		newpage->offset = rounded_offset; +		newpage->prev = page->prev; +		newpage->next = page; +		newpage->file = file; +		page->prev->next = newpage; +		page->prev = newpage; + +		page = newpage; +	} + +	return page; +} + + +void +ra_wait_on_page (ra_page_t *page, call_frame_t *frame) +{ +	ra_waitq_t *waitq = NULL; +	ra_local_t *local = NULL; + + +	local = frame->local; +	waitq = CALLOC (1, sizeof (*waitq)); +	if (!waitq) { +		gf_log (frame->this->name, GF_LOG_ERROR, +			"out of memory :("); +		return; +	} + +	waitq->data = frame; +	waitq->next = page->waitq; +	page->waitq = waitq; + +	ra_local_lock (local); +	{ +		local->wait_count++; +	} +	ra_local_unlock (local); +} + + +void +ra_waitq_return (ra_waitq_t *waitq) +{ +	ra_waitq_t   *trav = NULL; +	ra_waitq_t   *next = NULL; +	call_frame_t *frame = NULL; + +	for (trav = waitq; trav; trav = next) { +		next = trav->next; + +		frame = trav->data; +		ra_frame_return (frame); +		free (trav); +	} +} + + +int +ra_fault_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +	      int32_t op_ret, int32_t op_errno, struct iovec *vector, +	      int32_t count, struct stat *stbuf) +{ +	ra_local_t   *local = NULL; +	off_t         pending_offset = 0; +	ra_file_t    *file = NULL; +	ra_page_t    *page = NULL; +	off_t         trav_offset = 0; +	size_t        payload_size = 0; +	ra_waitq_t   *waitq = NULL; +	fd_t         *fd = NULL; +	int           ret = 0; +	uint64_t      tmp_file = 0; + +	local = frame->local; +	fd  = local->fd; + +	ret = fd_ctx_get (fd, this, &tmp_file); + +	file = (ra_file_t *)(long)tmp_file; +	pending_offset = local->pending_offset; +	trav_offset    = pending_offset;   +	payload_size   = op_ret; + +	ra_file_lock (file); +	{ +		if (op_ret >= 0) +			file->stbuf = *stbuf; + +		if (op_ret < 0) { +			page = ra_page_get (file, pending_offset); +			if (page) +				waitq = ra_page_error (page, op_ret, op_errno); +			goto unlock; +		} + +		page = ra_page_get (file, pending_offset); +		if (!page) { +			gf_log (this->name, GF_LOG_DEBUG, +				"wasted copy: %"PRId64"[+%"PRId64"] file=%p",  +				pending_offset, file->page_size, file); +			goto unlock; +		} + +		if (page->vector) { +			dict_unref (page->ref); +			free (page->vector); +		} + +		page->vector = iov_dup (vector, count); +		page->count = count; +		page->ref = dict_ref (frame->root->rsp_refs); +		page->ready = 1; + +		page->size = iov_length (vector, count); + +		waitq = ra_page_wakeup (page); +	} +unlock: +	ra_file_unlock (file); + +	ra_waitq_return (waitq); + +	fd_unref (local->fd); + +	free (frame->local); +	frame->local = NULL; + +	STACK_DESTROY (frame->root); +	return 0; +} + + +void +ra_page_fault (ra_file_t *file, +	       call_frame_t *frame, +	       off_t offset) +{ +	call_frame_t *fault_frame = NULL; +	ra_local_t   *fault_local = NULL; +     +	fault_frame = copy_frame (frame); +	fault_local = CALLOC (1, sizeof (ra_local_t)); + +	fault_frame->local = fault_local; +	fault_local->pending_offset = offset; +	fault_local->pending_size = file->page_size; + +	fault_local->fd = fd_ref (file->fd); + +	STACK_WIND (fault_frame, ra_fault_cbk, +		    FIRST_CHILD (fault_frame->this), +		    FIRST_CHILD (fault_frame->this)->fops->readv, +		    file->fd, file->page_size, offset); +	return; +} + +void +ra_frame_fill (ra_page_t *page, call_frame_t *frame) +{ +	ra_local_t *local = NULL; +	ra_fill_t  *fill = NULL; +	off_t       src_offset = 0; +	off_t       dst_offset = 0; +	ssize_t     copy_size = 0; +	ra_fill_t  *new = NULL; + + +	local = frame->local; +	fill  = &local->fill; + +	if (local->op_ret != -1 && page->size) { +		if (local->offset > page->offset) +			src_offset = local->offset - page->offset; +		else +			dst_offset = page->offset - local->offset; + +		copy_size = min (page->size - src_offset, +				 local->size - dst_offset); + +		if (copy_size < 0) { +			/* if page contains fewer bytes and the required offset +			   is beyond the page size in the page */ +			copy_size = src_offset = 0; +		} + +		fill = fill->next; +		while (fill != &local->fill) { +			if (fill->offset > page->offset) { +				break; +			} +			fill = fill->next; +		} + +		new = CALLOC (1, sizeof (*new)); + +		new->offset = page->offset; +		new->size = copy_size; +		new->refs = dict_ref (page->ref); +		new->count = iov_subset (page->vector, page->count, +					 src_offset, src_offset+copy_size, +					 NULL); +		new->vector = CALLOC (new->count, sizeof (struct iovec)); + +		new->count = iov_subset (page->vector, page->count, +					 src_offset, src_offset+copy_size, +					 new->vector); + +		new->next = fill; +		new->prev = new->next->prev; +		new->next->prev = new; +		new->prev->next = new; + +		local->op_ret += copy_size; +	} +} + + +void +ra_frame_unwind (call_frame_t *frame) +{ +	ra_local_t   *local = NULL; +	ra_fill_t    *fill = NULL; +	int32_t       count = 0; +	struct iovec *vector; +	int32_t       copied = 0; +	dict_t       *refs = NULL; +	ra_fill_t    *next = NULL; +	fd_t         *fd = NULL; +	ra_file_t    *file = NULL; +	int           ret = 0; +	uint64_t      tmp_file = 0; + +	local = frame->local; +	fill  = local->fill.next; + +	refs  = get_new_dict (); + +	frame->local = NULL; + +	while (fill != &local->fill) { +		count += fill->count; +		fill = fill->next; +	} + +	vector = CALLOC (count, sizeof (*vector)); + +	fill = local->fill.next; + +	while (fill != &local->fill) { +		next = fill->next; + +		memcpy (((char *)vector) + copied, fill->vector, +			fill->count * sizeof (*vector)); + +		copied += (fill->count * sizeof (*vector)); +		dict_copy (fill->refs, refs); + +		fill->next->prev = fill->prev; +		fill->prev->next = fill->prev; + +		dict_unref (fill->refs); +		free (fill->vector); +		free (fill); + +		fill = next; +	} + +	frame->root->rsp_refs = dict_ref (refs); + +	fd = local->fd; +	ret = fd_ctx_get (fd, frame->this, &tmp_file); +	file = (ra_file_t *)(long)tmp_file; +	 +	STACK_UNWIND (frame, local->op_ret, local->op_errno, +		      vector, count, &file->stbuf); +   +	dict_unref (refs); +	pthread_mutex_destroy (&local->local_lock); +	free (local); +	free (vector); + +	return; +} + +/* + * ra_frame_return - + * @frame: + * + */ +void +ra_frame_return (call_frame_t *frame) +{ +	ra_local_t *local = NULL; +	int32_t     wait_count = 0; + +	local = frame->local; +	assert (local->wait_count > 0); + +	ra_local_lock (local); +	{ +		wait_count = --local->wait_count; +	} +	ra_local_unlock (local); + +	if (!wait_count) +		ra_frame_unwind (frame); + +	return; +} + +/*  + * ra_page_wakeup - + * @page: + * + */ +ra_waitq_t * +ra_page_wakeup (ra_page_t *page) +{ +	ra_waitq_t *waitq = NULL, *trav = NULL; +	call_frame_t *frame; + +	waitq = page->waitq; +	page->waitq = NULL; + +	trav = waitq; +	for (trav = waitq; trav; trav = trav->next) { +		frame = trav->data; +		ra_frame_fill (page, frame); +	} + +	return waitq; +} + +/* + * ra_page_purge - + * @page: + * + */ +void +ra_page_purge (ra_page_t *page) +{ +	page->prev->next = page->next; +	page->next->prev = page->prev; + +	if (page->ref) { +		dict_unref (page->ref); +	} +	free (page->vector); +	free (page); +} + +/* + * ra_page_error - + * @page: + * @op_ret: + * @op_errno: + * + */ +ra_waitq_t * +ra_page_error (ra_page_t *page, int32_t op_ret, int32_t op_errno) +{ + +	ra_waitq_t   *waitq = NULL; +	ra_waitq_t   *trav = NULL; +	call_frame_t *frame = NULL; +	ra_local_t   *local = NULL; + +	waitq = page->waitq; +	page->waitq = NULL; + +	trav = waitq; +	for (trav = waitq; trav; trav = trav->next) { +		frame = trav->data; + +		local = frame->local; +		if (local->op_ret != -1) { +			local->op_ret   = op_ret; +			local->op_errno = op_errno; +		} +	} + +	ra_page_purge (page); + +	return waitq; +} + +/*  + * ra_file_destroy - + * @file: + * + */ +void +ra_file_destroy (ra_file_t *file) +{ +	ra_conf_t *conf = NULL; +	ra_page_t *trav = NULL; + +	conf = file->conf; + +	ra_conf_lock (conf); +	{ +		file->prev->next = file->next; +		file->next->prev = file->prev; +	} +	ra_conf_unlock (conf); + +	trav = file->pages.next; +	while (trav != &file->pages) { +		ra_page_error (trav, -1, EINVAL); +		trav = file->pages.next; +	} + +	pthread_mutex_destroy (&file->file_lock); +	free (file); +} + diff --git a/xlators/performance/read-ahead/src/read-ahead.c b/xlators/performance/read-ahead/src/read-ahead.c new file mode 100644 index 00000000000..0060e00fd41 --- /dev/null +++ b/xlators/performance/read-ahead/src/read-ahead.c @@ -0,0 +1,890 @@ +/* +  Copyright (c) 2006, 2007, 2008, 2009 Z RESEARCH, Inc. <http://www.zresearch.com> +  This file is part of GlusterFS. + +  GlusterFS is free software; you can redistribute it and/or modify +  it under the terms of the GNU General Public License as published +  by the Free Software Foundation; either version 3 of the License, +  or (at your option) any later version. + +  GlusterFS is distributed in the hope that it will be useful, but +  WITHOUT ANY WARRANTY; without even the implied warranty of +  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU +  General Public License for more details. + +  You should have received a copy of the GNU General Public License +  along with this program.  If not, see +  <http://www.gnu.org/licenses/>. +*/ + +/*  +   TODO: +   - handle O_DIRECT +   - maintain offset, flush on lseek +   - ensure efficient memory managment in case of random seek +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "logging.h" +#include "dict.h" +#include "xlator.h" +#include "read-ahead.h" +#include <assert.h> +#include <sys/time.h> + + +static void +read_ahead (call_frame_t *frame, +            ra_file_t *file); + + +int +ra_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +             int32_t op_ret, int32_t op_errno, fd_t *fd) +{ +	ra_conf_t  *conf = NULL; +	ra_file_t  *file = NULL; +	int         ret = 0; + +	conf  = this->private; + +	if (op_ret == -1) { +		goto unwind; +	} + +	file = CALLOC (1, sizeof (*file)); +	if (!file) { +		gf_log (this->name, GF_LOG_ERROR, +			"out of memory :("); +		goto unwind; +	} + +	ret = fd_ctx_set (fd, this, (uint64_t)(long)file); + +	/* If mandatory locking has been enabled on this file, +	   we disable caching on it */ + +	if ((fd->inode->st_mode & S_ISGID) && !(fd->inode->st_mode & S_IXGRP)) +		file->disabled = 1; + +	/* If O_DIRECT open, we disable caching on it */ + +	if ((fd->flags & O_DIRECT) || (fd->flags & O_WRONLY)) +		file->disabled = 1; + +	file->offset = (unsigned long long) 0; +	file->conf = conf; +	file->pages.next = &file->pages; +	file->pages.prev = &file->pages; +	file->pages.offset = (unsigned long long) 0; +	file->pages.file = file; + +	ra_conf_lock (conf); +	{ +		file->next = conf->files.next; +		conf->files.next = file; +		file->next->prev = file; +		file->prev = &conf->files; +	} +	ra_conf_unlock (conf); + +	file->fd = fd; +	file->page_count = conf->page_count; +	file->page_size = conf->page_size; +	pthread_mutex_init (&file->file_lock, NULL); + +	if (!file->disabled) { +		file->page_count = 1; +	} + +unwind: +	STACK_UNWIND (frame, op_ret, op_errno, fd); + +	return 0; +} + + +int +ra_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +               int32_t op_ret, int32_t op_errno, +	       fd_t *fd, inode_t *inode, struct stat *buf) +{ +	ra_conf_t  *conf = NULL; +	ra_file_t  *file = NULL; +	int         ret = 0; + +	conf  = this->private; + +	if (op_ret == -1) { +		goto unwind; +	} + +	file = CALLOC (1, sizeof (*file)); +	if (!file) { +		gf_log (this->name, GF_LOG_ERROR, +			"out of memory :("); +		goto unwind; +	} + +	ret = fd_ctx_set (fd, this, (uint64_t)(long)file); + +	/* If mandatory locking has been enabled on this file, +	   we disable caching on it */ + +	if ((fd->inode->st_mode & S_ISGID) && !(fd->inode->st_mode & S_IXGRP)) +		file->disabled = 1; + +	/* If O_DIRECT open, we disable caching on it */ + +	if ((fd->flags & O_DIRECT) || (fd->flags & O_WRONLY)) +			file->disabled = 1; + +	file->offset = (unsigned long long) 0; +	//file->size = fd->inode->buf.st_size; +	file->conf = conf; +	file->pages.next = &file->pages; +	file->pages.prev = &file->pages; +	file->pages.offset = (unsigned long long) 0; +	file->pages.file = file; + +	ra_conf_lock (conf); +	{ +		file->next = conf->files.next; +		conf->files.next = file; +		file->next->prev = file; +		file->prev = &conf->files; +	} +	ra_conf_unlock (conf); + +	file->fd = fd; +	file->page_count = conf->page_count; +	file->page_size = conf->page_size; +	pthread_mutex_init (&file->file_lock, NULL); + +unwind: +	STACK_UNWIND (frame, op_ret, op_errno, fd, inode, buf); + +	return 0; +} + + +int +ra_open (call_frame_t *frame, xlator_t *this, +         loc_t *loc, int32_t flags, fd_t *fd) +{ +	STACK_WIND (frame, ra_open_cbk, +		    FIRST_CHILD (this), +		    FIRST_CHILD (this)->fops->open, +		    loc, flags, fd); + +	return 0; +} + +int +ra_create (call_frame_t *frame, xlator_t *this, +	   loc_t *loc, int32_t flags, mode_t mode, fd_t *fd) +{ +	STACK_WIND (frame, ra_create_cbk, +		    FIRST_CHILD(this), +		    FIRST_CHILD(this)->fops->create, +		    loc, flags, mode, fd); + +	return 0; +} + +/* free cache pages between offset and offset+size, +   does not touch pages with frames waiting on it +*/ + +static void +flush_region (call_frame_t *frame, +              ra_file_t *file, +              off_t offset, +              off_t size) +{ +	ra_page_t *trav = NULL; +	ra_page_t *next = NULL; + + +	ra_file_lock (file); +	{ +		trav = file->pages.next; +		while (trav != &file->pages +		       && trav->offset < (offset + size)) { + +			next = trav->next; +			if (trav->offset >= offset && !trav->waitq) { +				ra_page_purge (trav); +			} +			trav = next; +		} +	} +	ra_file_unlock (file); +} + + + +int +ra_release (xlator_t *this, +	    fd_t *fd) +{ +	uint64_t tmp_file = 0; +	int      ret = 0; + +	ret = fd_ctx_del (fd, this, &tmp_file); +	 +	if (!ret) { +		ra_file_destroy ((ra_file_t *)(long)tmp_file); +	} + +	return 0; +} + + +void +read_ahead (call_frame_t *frame, ra_file_t *file) +{ +	off_t      ra_offset = 0; +	size_t     ra_size = 0; +	off_t      trav_offset = 0; +	ra_page_t *trav = NULL; +	off_t      cap = 0; +	char       fault = 0; + +	if (!file->page_count) +		return; + +	ra_size   = file->page_size * file->page_count; +	ra_offset = floor (file->offset, file->page_size); +	cap       = file->size ? file->size : file->offset + ra_size; + +	while (ra_offset < min (file->offset + ra_size, cap)) { + +		ra_file_lock (file); +		{ +			trav = ra_page_get (file, ra_offset); +		} +		ra_file_unlock (file); + +		if (!trav) +			break; + +		ra_offset += file->page_size; +	} + +	if (trav) +		/* comfortable enough */ +		return; + +	trav_offset = ra_offset; + +	trav = file->pages.next; +	cap  = file->size ? file->size : ra_offset + ra_size; + +	while (trav_offset < min(ra_offset + ra_size, cap)) { +		fault = 0; +		ra_file_lock (file); +		{ +			trav = ra_page_get (file, trav_offset); +			if (!trav) { +				fault = 1; +				trav = ra_page_create (file, trav_offset); +				if (trav)  +					trav->dirty = 1; +			} +		} +		ra_file_unlock (file); + +		if (!trav) { +			/* OUT OF MEMORY */ +			break; +		} + +		if (fault) { +			gf_log (frame->this->name, GF_LOG_DEBUG, +				"RA at offset=%"PRId64, trav_offset); +			ra_page_fault (file, frame, trav_offset); +		} +		trav_offset += file->page_size; +	} + +	return; +} + + +int +ra_need_atime_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                   int32_t op_ret, int32_t op_errno, struct iovec *vector, +                   int32_t count, struct stat *stbuf) +{ +	STACK_DESTROY (frame->root); +	return 0; +} + + +static void +dispatch_requests (call_frame_t *frame, +                   ra_file_t *file) +{ +	ra_local_t   *local = NULL; +	ra_conf_t    *conf = NULL; +	off_t         rounded_offset = 0; +	off_t         rounded_end = 0; +	off_t         trav_offset = 0; +	ra_page_t    *trav = NULL; +	call_frame_t *ra_frame = NULL; +	char          need_atime_update = 1; +	char          fault = 0; + + +	local = frame->local; +	conf  = file->conf; + +	rounded_offset = floor (local->offset, file->page_size); +	rounded_end    = roof (local->offset + local->size, file->page_size); + +	trav_offset = rounded_offset; +	trav        = file->pages.next; + +	while (trav_offset < rounded_end) { +		fault = 0; + +		ra_file_lock (file); +		{ +			trav = ra_page_get (file, trav_offset); +			if (!trav) { +				trav = ra_page_create (file, trav_offset); +				fault = 1; +				need_atime_update = 0; +			} + +			if (!trav) +				goto unlock; + +			if (trav->ready) { +				gf_log (frame->this->name, GF_LOG_DEBUG, +					"HIT at offset=%"PRId64".", +					trav_offset); +				ra_frame_fill (trav, frame); +			} else { +				gf_log (frame->this->name, GF_LOG_DEBUG, +					"IN-TRANSIT at offset=%"PRId64".", +					trav_offset); +				ra_wait_on_page (trav, frame); +				need_atime_update = 0; +			} +		} +	unlock: +		ra_file_unlock (file); + +		if (fault) { +			gf_log (frame->this->name, GF_LOG_DEBUG, +				"MISS at offset=%"PRId64".", +				trav_offset); +			ra_page_fault (file, frame, trav_offset); +		} + +		trav_offset += file->page_size; +	} + +	if (need_atime_update && conf->force_atime_update) { +		/* TODO: use untimens() since readv() can confuse underlying +		   io-cache and others */ +		ra_frame = copy_frame (frame); +		STACK_WIND (ra_frame, ra_need_atime_cbk, +			    FIRST_CHILD (frame->this),  +			    FIRST_CHILD (frame->this)->fops->readv, +			    file->fd, 1, 1); +	} + +	return ; +} + + +int +ra_readv_disabled_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                       int32_t op_ret, int32_t op_errno, +		       struct iovec *vector, int32_t count, struct stat *stbuf) +{ +	STACK_UNWIND (frame, op_ret, op_errno, vector, count, stbuf); + +	return 0; +} + + +int +ra_readv (call_frame_t *frame, xlator_t *this, +	  fd_t *fd, size_t size, off_t offset) +{ +	ra_file_t    *file = NULL; +	ra_local_t   *local = NULL; +	ra_conf_t    *conf = NULL; +	int           op_errno = 0; +	int           ret = 0; +	char expected_offset = 1; +	uint64_t tmp_file = 0; + +	conf = this->private; + +	gf_log (this->name, GF_LOG_DEBUG, +		"NEW REQ at offset=%"PRId64" for size=%"GF_PRI_SIZET"", +		offset, size); + +	ret = fd_ctx_get (fd, this, &tmp_file); +	file = (ra_file_t *)(long)tmp_file; + +	if (file->offset != offset) { +		gf_log (this->name, GF_LOG_DEBUG, +			"unexpected offset (%"PRId64" != %"PRId64") resetting", +			file->offset, offset); + +		expected_offset = file->expected = file->page_count = 0; +	} else { +		gf_log (this->name, GF_LOG_DEBUG, +			"expected offset (%"PRId64") when page_count=%d", +			offset, file->page_count); + +		if (file->expected < (conf->page_size * conf->page_count)) { +			file->expected += size; +			file->page_count = min ((file->expected / file->page_size), +						conf->page_count); +		} +	} + +	if (!expected_offset) { +		flush_region (frame, file, 0, file->pages.prev->offset + 1); +	} + +	if (file->disabled) { +		STACK_WIND (frame, ra_readv_disabled_cbk, +			    FIRST_CHILD (frame->this),  +			    FIRST_CHILD (frame->this)->fops->readv, +			    file->fd, size, offset); +		return 0; +	} + +	local = (void *) CALLOC (1, sizeof (*local)); +	if (!local) { +		gf_log (this->name, GF_LOG_ERROR, +			"out of memory :("); +		op_errno = ENOMEM; +		goto unwind; +	} + +	local->fd         = fd; +	local->offset     = offset; +	local->size       = size; +	local->wait_count = 1; + +	local->fill.next  = &local->fill; +	local->fill.prev  = &local->fill; + +	pthread_mutex_init (&local->local_lock, NULL); + +	frame->local = local; + +	dispatch_requests (frame, file); + +	flush_region (frame, file, 0, floor (offset, file->page_size)); + +	read_ahead (frame, file); + +	ra_frame_return (frame); + +	file->offset = offset + size; + +	return 0; + +unwind: +	STACK_UNWIND (frame, -1, op_errno, NULL, 0, NULL); + +	return 0; +} + + +int +ra_flush_cbk (call_frame_t *frame, +              void *cookie, +              xlator_t *this, +              int32_t op_ret, +              int32_t op_errno) +{ +	STACK_UNWIND (frame, op_ret, op_errno); +	return 0; +} + + +int +ra_flush (call_frame_t *frame, xlator_t *this, fd_t *fd) +{ +	ra_file_t *file = NULL; +	int        ret = 0; +	uint64_t tmp_file = 0; + +	ret = fd_ctx_get (fd, this, &tmp_file); +	file = (ra_file_t *)(long)tmp_file; + +	if (file) { +		flush_region (frame, file, 0, file->pages.prev->offset+1); +	} + +	STACK_WIND (frame, ra_flush_cbk, +		    FIRST_CHILD (this), +		    FIRST_CHILD (this)->fops->flush, +		    fd); +	return 0; +} + + +int +ra_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, +          int32_t datasync) +{ +	ra_file_t *file = NULL; +	int        ret = 0; +	uint64_t tmp_file = 0; + +	ret = fd_ctx_get (fd, this, &tmp_file); +	file = (ra_file_t *)(long)tmp_file; + +	if (file) { +		flush_region (frame, file, 0, file->pages.prev->offset+1); +	} + +	STACK_WIND (frame, ra_flush_cbk, +		    FIRST_CHILD (this), +		    FIRST_CHILD (this)->fops->fsync, +		    fd, datasync); +	return 0; +} + + +int +ra_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +               int32_t op_ret, int32_t op_errno, struct stat *stbuf) +{ +	fd_t      *fd = NULL; +	ra_file_t *file = NULL; +	int        ret = 0; +	uint64_t tmp_file = 0; + +	fd = frame->local; + +	ret = fd_ctx_get (fd, this, &tmp_file); +	file = (ra_file_t *)(long)tmp_file; + +	if (file) { +		flush_region (frame, file, 0, file->pages.prev->offset+1); +	} + +	frame->local = NULL; +	STACK_UNWIND (frame, op_ret, op_errno, stbuf); +	return 0; +} + + +int +ra_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, +           struct iovec *vector, int32_t count, off_t offset) +{ +	ra_file_t *file = NULL; +	int        ret = 0; +	uint64_t tmp_file = 0; + +	ret = fd_ctx_get (fd, this, &tmp_file); +	file = (ra_file_t *)(long)tmp_file; + +	if (file) { +		flush_region (frame, file, 0, file->pages.prev->offset+1); + +		/* reset the read-ahead counters too */ +		file->expected = file->page_count = 0; +	} + +	frame->local = fd; + +	STACK_WIND (frame, ra_writev_cbk, +		    FIRST_CHILD(this), +		    FIRST_CHILD(this)->fops->writev, +		    fd, vector, count, offset); + +	return 0; +} + + +int +ra_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +	     int32_t op_ret, int32_t op_errno, struct stat *buf) +{ +	STACK_UNWIND (frame, op_ret, op_errno, buf); +	return 0; +} + + +int +ra_truncate (call_frame_t *frame, xlator_t *this, +             loc_t *loc, off_t offset) +{ +	ra_file_t *file = NULL; +	fd_t      *iter_fd = NULL; +	inode_t   *inode = NULL; +	int        ret = 0; +	uint64_t tmp_file = 0; + +	inode = loc->inode; + +	LOCK (&inode->lock); +	{ +		list_for_each_entry (iter_fd, &inode->fd_list, inode_list) { +			ret = fd_ctx_get (iter_fd, this, &tmp_file); +			file = (ra_file_t *)(long)tmp_file; + +			if (!file) +				continue; +			flush_region (frame, file, 0, +				      file->pages.prev->offset + 1); +		} +	} +	UNLOCK (&inode->lock); + +	STACK_WIND (frame, ra_attr_cbk, +		    FIRST_CHILD (this), +		    FIRST_CHILD (this)->fops->truncate, +		    loc, offset); +	return 0; +} + + +int +ra_fstat (call_frame_t *frame, xlator_t *this, +	  fd_t *fd) +{ +	ra_file_t *file = NULL; +	fd_t      *iter_fd = NULL; +	inode_t   *inode = NULL; +	int        ret = 0; +	uint64_t tmp_file = 0; + +	inode = fd->inode; + +	LOCK (&inode->lock); +	{ +		list_for_each_entry (iter_fd, &inode->fd_list, inode_list) { +			ret = fd_ctx_get (iter_fd, this, &tmp_file); +			file = (ra_file_t *)(long)tmp_file; + +			if (!file) +				continue; +			flush_region (frame, file, 0, +				      file->pages.prev->offset + 1); +		} +	} +	UNLOCK (&inode->lock); + +	STACK_WIND (frame, ra_attr_cbk, +		    FIRST_CHILD (this), +		    FIRST_CHILD (this)->fops->fstat, +		    fd); +	return 0; +} + + +int +ra_fchown (call_frame_t *frame, xlator_t *this, +	   fd_t *fd, uid_t uid, gid_t gid) +{ +	ra_file_t *file = NULL; +	fd_t      *iter_fd = NULL; +	inode_t   *inode = NULL; +	int        ret = 0; +	uint64_t tmp_file = 0; + +	inode = fd->inode; + +	LOCK (&inode->lock); +	{ +		list_for_each_entry (iter_fd, &inode->fd_list, inode_list) { +			ret = fd_ctx_get (iter_fd, this, &tmp_file); +			file = (ra_file_t *)(long)tmp_file; + +			if (!file) +				continue; +			flush_region (frame, file, 0, +				      file->pages.prev->offset + 1); +		} +	} +	UNLOCK (&inode->lock); + +	STACK_WIND (frame, ra_attr_cbk, +		    FIRST_CHILD (this), +		    FIRST_CHILD (this)->fops->fchown, +		    fd, uid, gid); +	return 0; +} + + +int +ra_ftruncate (call_frame_t *frame, xlator_t *this, +              fd_t *fd, off_t offset) +{ +	ra_file_t *file = NULL; +	fd_t      *iter_fd = NULL; +	inode_t   *inode = NULL; +	int        ret = 0; +	uint64_t tmp_file = 0; + +	inode = fd->inode; + +	LOCK (&inode->lock); +	{ +		list_for_each_entry (iter_fd, &inode->fd_list, inode_list) { +			ret = fd_ctx_get (iter_fd, this, &tmp_file); +			file = (ra_file_t *)(long)tmp_file; +			if (!file) +				continue; +			flush_region (frame, file, 0, +				      file->pages.prev->offset + 1); +		} +	} +	UNLOCK (&inode->lock); + +	STACK_WIND (frame, ra_attr_cbk, +		    FIRST_CHILD (this), +		    FIRST_CHILD (this)->fops->ftruncate, +		    fd, offset); +	return 0; +} + + +int +init (xlator_t *this) +{ +	ra_conf_t *conf; +	dict_t *options = this->options; +	char *page_size_string = NULL; +	char *page_count_string = NULL; + +	if (!this->children || this->children->next) { +		gf_log (this->name,  GF_LOG_ERROR, +			"FATAL: read-ahead not configured with exactly one child"); +		return -1; +	} + +	if (!this->parents) { +		gf_log (this->name, GF_LOG_WARNING, +			"dangling volume. check volfile "); +	} +  +	conf = (void *) CALLOC (1, sizeof (*conf)); +	ERR_ABORT (conf); +	conf->page_size = 256 * 1024; +	conf->page_count = 2; + +	if (dict_get (options, "page-size")) +		page_size_string = data_to_str (dict_get (options, +							  "page-size")); +	if (page_size_string) +	{ +		if (gf_string2bytesize (page_size_string, &conf->page_size) != 0) +		{ +			gf_log ("read-ahead",  +				GF_LOG_ERROR,  +				"invalid number format \"%s\" of \"option page-size\"",  +				page_size_string); +			return -1; +		} +       +		gf_log (this->name, GF_LOG_DEBUG, "Using conf->page_size = %"PRIu64"", +			conf->page_size); +	} +   +	if (dict_get (options, "page-count")) +		page_count_string = data_to_str (dict_get (options,  +							   "page-count")); +	if (page_count_string) +	{ +		if (gf_string2uint_base10 (page_count_string, &conf->page_count) != 0) +		{ +			gf_log ("read-ahead",  +				GF_LOG_ERROR,  +				"invalid number format \"%s\" of \"option page-count\"",  +				page_count_string); +			return -1; +		} +		gf_log (this->name, GF_LOG_DEBUG, "Using conf->page_count = %u", +			conf->page_count); +	} +   +	if (dict_get (options, "force-atime-update")) { +		char *force_atime_update_str = data_to_str (dict_get (options, +								      "force-atime-update")); +		if (gf_string2boolean (force_atime_update_str, &conf->force_atime_update) == -1) { +			gf_log (this->name, GF_LOG_ERROR, +				"'force-atime-update' takes only boolean options"); +			return -1; +		} +		if (conf->force_atime_update) +			gf_log (this->name, GF_LOG_DEBUG, "Forcing atime updates on cache hit"); +	} + +	conf->files.next = &conf->files; +	conf->files.prev = &conf->files; + +	pthread_mutex_init (&conf->conf_lock, NULL); +	this->private = conf; +	return 0; +} + +void +fini (xlator_t *this) +{ +	ra_conf_t *conf = this->private; + +	pthread_mutex_destroy (&conf->conf_lock); +	FREE (conf); + +	this->private = NULL; +	return; +} + +struct xlator_fops fops = { +	.open        = ra_open, +	.create      = ra_create, +	.readv       = ra_readv, +	.writev      = ra_writev, +	.flush       = ra_flush, +	.fsync       = ra_fsync, +	.truncate    = ra_truncate, +	.ftruncate   = ra_ftruncate, +	.fstat       = ra_fstat, +	.fchown      = ra_fchown, +}; + +struct xlator_mops mops = { +}; + +struct xlator_cbks cbks = { +	.release       = ra_release, +}; + +struct volume_options options[] = { +	{ .key  = {"force-atime-update"},  +	  .type = GF_OPTION_TYPE_BOOL  +	}, +	{ .key  = {"page-size"},  +	  .type = GF_OPTION_TYPE_SIZET,  +	  .min  = 64 * GF_UNIT_KB,  +	  .max  = 2 * GF_UNIT_MB  +	}, +	{ .key  = {"page-count"},  +	  .type = GF_OPTION_TYPE_INT,  +	  .min  = 1,  +	  .max  = 16  +	}, +	{ .key = {NULL} }, +}; diff --git a/xlators/performance/read-ahead/src/read-ahead.h b/xlators/performance/read-ahead/src/read-ahead.h new file mode 100644 index 00000000000..d624ca8abc8 --- /dev/null +++ b/xlators/performance/read-ahead/src/read-ahead.h @@ -0,0 +1,194 @@ +/* +  Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> +  This file is part of GlusterFS. + +  GlusterFS is free software; you can redistribute it and/or modify +  it under the terms of the GNU General Public License as published +  by the Free Software Foundation; either version 3 of the License, +  or (at your option) any later version. + +  GlusterFS is distributed in the hope that it will be useful, but +  WITHOUT ANY WARRANTY; without even the implied warranty of +  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU +  General Public License for more details. + +  You should have received a copy of the GNU General Public License +  along with this program.  If not, see +  <http://www.gnu.org/licenses/>. +*/ + +#ifndef __READ_AHEAD_H +#define __READ_AHEAD_H + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + + +#include "glusterfs.h" +#include "logging.h" +#include "dict.h" +#include "xlator.h" +#include "common-utils.h" + +struct ra_conf; +struct ra_local; +struct ra_page; +struct ra_file; +struct ra_waitq; + + +struct ra_waitq { +	struct ra_waitq *next; +	void            *data; +}; + + +struct ra_fill { +	struct ra_fill *next; +	struct ra_fill *prev; +	off_t           offset; +	size_t          size; +	struct iovec   *vector; +	int32_t         count; +	dict_t         *refs; +}; + + +struct ra_local { +	mode_t            mode; +	struct ra_fill    fill; +	off_t             offset; +	size_t            size; +	int32_t           op_ret; +	int32_t           op_errno; +	off_t             pending_offset; +	size_t            pending_size; +	fd_t             *fd; +	int32_t           wait_count; +	pthread_mutex_t   local_lock; +}; + + +struct ra_page { +	struct ra_page   *next; +	struct ra_page   *prev; +	struct ra_file   *file; +	char              dirty; +	char              ready; +	struct iovec     *vector; +	int32_t           count; +	off_t             offset; +	size_t            size; +	struct ra_waitq  *waitq; +	dict_t           *ref; +}; + + +struct ra_file { +	struct ra_file    *next; +	struct ra_file    *prev; +	struct ra_conf    *conf; +	fd_t              *fd; +	int                disabled; +	size_t             expected; +	struct ra_page     pages; +	off_t              offset; +	size_t             size; +	int32_t            refcount; +	pthread_mutex_t    file_lock; +	struct stat        stbuf; +	uint64_t           page_size; +	uint32_t           page_count; +}; + + +struct ra_conf { +	uint64_t          page_size; +	uint32_t          page_count; +	void             *cache_block; +	struct ra_file    files; +	gf_boolean_t      force_atime_update; +	pthread_mutex_t   conf_lock; +}; + + +typedef struct ra_conf ra_conf_t; +typedef struct ra_local ra_local_t; +typedef struct ra_page ra_page_t; +typedef struct ra_file ra_file_t; +typedef struct ra_waitq ra_waitq_t; +typedef struct ra_fill ra_fill_t; + +ra_page_t * +ra_page_get (ra_file_t *file, +	     off_t offset); +ra_page_t * +ra_page_create (ra_file_t *file, +		off_t offset); +void +ra_page_fault (ra_file_t *file, +	       call_frame_t *frame, +	       off_t offset); +void +ra_wait_on_page (ra_page_t *page, +		 call_frame_t *frame); +ra_waitq_t * +ra_page_wakeup (ra_page_t *page); + +void +ra_page_flush (ra_page_t *page); + +ra_waitq_t * +ra_page_error (ra_page_t *page, +	       int32_t op_ret, +	       int32_t op_errno); +void +ra_page_purge (ra_page_t *page); + +void +ra_frame_return (call_frame_t *frame); +void +ra_frame_fill (ra_page_t *page, +	       call_frame_t *frame); + +void +ra_file_destroy (ra_file_t *file); + +static inline void +ra_file_lock (ra_file_t *file) +{ +	pthread_mutex_lock (&file->file_lock); +} + +static inline void +ra_file_unlock (ra_file_t *file) +{ +	pthread_mutex_unlock (&file->file_lock); +} + +static inline void +ra_conf_lock (ra_conf_t *conf) +{ +	pthread_mutex_lock (&conf->conf_lock); +} + +static inline void +ra_conf_unlock (ra_conf_t *conf) +{ +	pthread_mutex_unlock (&conf->conf_lock); +} +static inline void +ra_local_lock (ra_local_t *local) +{ +	pthread_mutex_lock (&local->local_lock); +} + +static inline void +ra_local_unlock (ra_local_t *local) +{ +	pthread_mutex_unlock (&local->local_lock); +} + +#endif /* __READ_AHEAD_H */  | 
