From fb8efa4c6ab4bc1af49d05b0bc6b16eb188ea3b1 Mon Sep 17 00:00:00 2001 From: Anand Avati Date: Fri, 30 Sep 2011 13:29:18 +0530 Subject: storage/posix: implement native linux AIO support Configurable via cli with "storage.linux-aio" settable option Change-Id: I9929e0d6fc1bbc2a0fe1fb67bfc8d15d8a483d3f BUG: 837495 Signed-off-by: Anand Avati Reviewed-on: http://review.gluster.com/3627 Tested-by: Gluster Build System Reviewed-by: Amar Tumballi --- xlators/mgmt/glusterd/src/glusterd-volgen.c | 1 + xlators/storage/posix/src/Makefile.am | 6 +- xlators/storage/posix/src/posix-aio.c | 519 ++++++++++++++++++++++++++++ xlators/storage/posix/src/posix-aio.h | 49 +++ xlators/storage/posix/src/posix-helpers.c | 26 +- xlators/storage/posix/src/posix-mem-types.h | 1 + xlators/storage/posix/src/posix.c | 48 ++- xlators/storage/posix/src/posix.h | 13 +- 8 files changed, 657 insertions(+), 6 deletions(-) create mode 100644 xlators/storage/posix/src/posix-aio.c create mode 100644 xlators/storage/posix/src/posix-aio.h (limited to 'xlators') diff --git a/xlators/mgmt/glusterd/src/glusterd-volgen.c b/xlators/mgmt/glusterd/src/glusterd-volgen.c index 1e22560aaed..ff35b8b085e 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volgen.c +++ b/xlators/mgmt/glusterd/src/glusterd-volgen.c @@ -237,6 +237,7 @@ static struct volopt_map_entry glusterd_volopt_map[] = { {"features.grace-timeout", "protocol/server", "grace-timeout", NULL, DOC, 0}, {"features.read-only", "features/read-only", "!read-only", "off", DOC, 0}, {"features.worm", "features/worm", "!worm", "off", DOC, 0}, + {"storage.linux-aio", "storage/posix", NULL, NULL, DOC, 0}, {NULL, } }; diff --git a/xlators/storage/posix/src/Makefile.am b/xlators/storage/posix/src/Makefile.am index 408dcb80ddf..4eae9798b15 100644 --- a/xlators/storage/posix/src/Makefile.am +++ b/xlators/storage/posix/src/Makefile.am @@ -4,10 +4,10 @@ xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/storage posix_la_LDFLAGS = -module -avoidversion -posix_la_SOURCES = posix.c posix-helpers.c posix-handle.c -posix_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la +posix_la_SOURCES = posix.c posix-helpers.c posix-handle.c posix-aio.c +posix_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la $(LIBAIO) -noinst_HEADERS = posix.h posix-mem-types.h posix-handle.h +noinst_HEADERS = posix.h posix-mem-types.h posix-handle.h posix-aio.h AM_CFLAGS = -fPIC -fno-strict-aliasing -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE \ -D$(GF_HOST_OS) -Wall -I$(top_srcdir)/libglusterfs/src -shared \ diff --git a/xlators/storage/posix/src/posix-aio.c b/xlators/storage/posix/src/posix-aio.c new file mode 100644 index 00000000000..ac0ce870506 --- /dev/null +++ b/xlators/storage/posix/src/posix-aio.c @@ -0,0 +1,519 @@ +/* + Copyright (c) 2006-2012 Red Hat, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "xlator.h" +#include "glusterfs.h" +#include "posix.h" +#include + +#ifdef HAVE_LIBAIO +#include + + +struct posix_aio_cb { + struct iocb iocb; + call_frame_t *frame; + struct iobuf *iobuf; + struct iobref *iobref; + struct iatt prebuf; + int fd; + int op; + off_t offset; +}; + + +int +posix_aio_readv_complete (struct posix_aio_cb *paiocb, int res, int res2) +{ + call_frame_t *frame = NULL; + xlator_t *this = NULL; + struct iobuf *iobuf = NULL; + struct iatt prebuf = {0,}; + struct iatt postbuf = {0,}; + int _fd = -1; + int op_ret = -1; + int op_errno = 0; + struct iovec iov; + struct iobref *iobref = NULL; + int ret = 0; + off_t offset = 0; + struct posix_private * priv = NULL; + + + frame = paiocb->frame; + this = frame->this; + priv = this->private; + iobuf = paiocb->iobuf; + prebuf = paiocb->prebuf; + _fd = paiocb->fd; + offset = paiocb->offset; + + if (res < 0) { + op_ret = -1; + op_errno = -res; + gf_log (this->name, GF_LOG_ERROR, + "readv(async) failed fd=%d,size=%lu,offset=%llu (%d/%s)", + _fd, paiocb->iocb.u.c.nbytes, + (unsigned long long) paiocb->offset, + res, strerror (op_errno)); + goto out; + } + + ret = posix_fdstat (this, _fd, &postbuf); + if (ret != 0) { + op_ret = -1; + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "fstat failed on fd=%d: %s", _fd, + strerror (op_errno)); + goto out; + } + + op_ret = res; + op_errno = 0; + + iobref = iobref_new (); + if (!iobref) { + op_ret = -1; + op_errno = ENOMEM; + goto out; + } + + iobref_add (iobref, iobuf); + + iov.iov_base = iobuf_ptr (iobuf); + iov.iov_len = op_ret; + + + /* Hack to notify higher layers of EOF. */ + if (postbuf.ia_size == 0) + op_errno = ENOENT; + else if ((offset + iov.iov_len) == postbuf.ia_size) + op_errno = ENOENT; + else if (offset > postbuf.ia_size) + op_errno = ENOENT; + + LOCK (&priv->lock); + { + priv->read_value += op_ret; + } + UNLOCK (&priv->lock); + +out: + STACK_UNWIND_STRICT (readv, frame, op_ret, op_errno, &iov, 1, + &postbuf, iobref, NULL); + if (iobuf) + iobuf_unref (iobuf); + if (iobref) + iobref_unref (iobref); + + GF_FREE (paiocb); + + return 0; +} + + +int +posix_aio_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, + size_t size, off_t offset, uint32_t flags, dict_t *xdata) +{ + int32_t op_errno = EINVAL; + int _fd = -1; + struct iobuf *iobuf = NULL; + struct posix_fd * pfd = NULL; + int ret = -1; + struct posix_aio_cb *paiocb = NULL; + struct posix_private *priv = NULL; + struct iocb *iocb = NULL; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + priv = this->private; + + ret = posix_fd_ctx_get_off (fd, this, &pfd, offset); + if (ret < 0) { + op_errno = -ret; + gf_log (this->name, GF_LOG_WARNING, + "pfd is NULL from fd=%p", fd); + goto err; + } + _fd = pfd->fd; + + if (!size) { + op_errno = EINVAL; + gf_log (this->name, GF_LOG_WARNING, "size=%"GF_PRI_SIZET, size); + goto err; + } + + iobuf = iobuf_get2 (this->ctx->iobuf_pool, size); + if (!iobuf) { + op_errno = ENOMEM; + goto err; + } + + paiocb = GF_CALLOC (1, sizeof (*paiocb), gf_posix_mt_paiocb); + if (!paiocb) { + op_errno = ENOMEM; + goto err; + } + + + paiocb->frame = frame; + paiocb->iobuf = iobuf; + paiocb->offset = offset; + paiocb->fd = _fd; + paiocb->op = GF_FOP_READ; + + paiocb->iocb.data = paiocb; + paiocb->iocb.aio_fildes = _fd; + paiocb->iocb.aio_lio_opcode = IO_CMD_PREAD; + paiocb->iocb.aio_reqprio = 0; + paiocb->iocb.u.c.buf = iobuf_ptr (iobuf); + paiocb->iocb.u.c.nbytes = size; + paiocb->iocb.u.c.offset = offset; + + iocb = &paiocb->iocb; + + ret = io_submit (priv->ctxp, 1, &iocb); + if (ret != 1) { + gf_log (this->name, GF_LOG_ERROR, + "io_submit() returned %d", ret); + op_errno = -ret; + goto err; + } + + return 0; +err: + STACK_UNWIND_STRICT (readv, frame, -1, op_errno, 0, 0, 0, 0, 0); + if (iobuf) + iobuf_unref (iobuf); + + if (paiocb) + GF_FREE (paiocb); + + return 0; +} + + +int +posix_aio_writev_complete (struct posix_aio_cb *paiocb, int res, int res2) +{ + call_frame_t *frame = NULL; + xlator_t *this = NULL; + struct iatt prebuf = {0,}; + struct iatt postbuf = {0,}; + int _fd = -1; + int op_ret = -1; + int op_errno = 0; + int ret = 0; + struct posix_private * priv = NULL; + + + frame = paiocb->frame; + this = frame->this; + priv = this->private; + prebuf = paiocb->prebuf; + _fd = paiocb->fd; + + if (res < 0) { + op_ret = -1; + op_errno = -res; + gf_log (this->name, GF_LOG_ERROR, + "writev(async) failed fd=%d,offset=%llu (%d/%s)", + _fd, (unsigned long long) paiocb->offset, res, + strerror (op_errno)); + + goto out; + } + + ret = posix_fdstat (this, _fd, &postbuf); + if (ret != 0) { + op_ret = -1; + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "fstat failed on fd=%d: %s", _fd, + strerror (op_errno)); + goto out; + } + + + op_ret = res; + op_errno = 0; + + LOCK (&priv->lock); + { + priv->write_value += op_ret; + } + UNLOCK (&priv->lock); + +out: + STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, &prebuf, &postbuf, + NULL); + + if (paiocb) { + if (paiocb->iobref) + iobref_unref (paiocb->iobref); + GF_FREE (paiocb); + } + + return 0; +} + + +int +posix_aio_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *iov, int count, off_t offset, uint32_t flags, + struct iobref *iobref, dict_t *xdata) +{ + int32_t op_errno = EINVAL; + int _fd = -1; + struct posix_fd * pfd = NULL; + int ret = -1; + struct posix_aio_cb *paiocb = NULL; + struct posix_private *priv = NULL; + struct iocb *iocb = NULL; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + priv = this->private; + + ret = posix_fd_ctx_get_off (fd, this, &pfd, offset); + if (ret < 0) { + op_errno = -ret; + gf_log (this->name, GF_LOG_WARNING, + "pfd is NULL from fd=%p", fd); + goto err; + } + _fd = pfd->fd; + + paiocb = GF_CALLOC (1, sizeof (*paiocb), gf_posix_mt_paiocb); + if (!paiocb) { + op_errno = ENOMEM; + goto err; + } + + + paiocb->frame = frame; + paiocb->offset = offset; + paiocb->fd = _fd; + paiocb->op = GF_FOP_WRITE; + + paiocb->iocb.data = paiocb; + paiocb->iocb.aio_fildes = _fd; + paiocb->iobref = iobref_ref (iobref); + paiocb->iocb.aio_lio_opcode = IO_CMD_PWRITEV; + paiocb->iocb.aio_reqprio = 0; + paiocb->iocb.u.v.vec = iov; + paiocb->iocb.u.v.nr = count; + paiocb->iocb.u.v.offset = offset; + + iocb = &paiocb->iocb; + + ret = posix_fdstat (this, _fd, &paiocb->prebuf); + if (ret != 0) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "fstat failed on fd=%p: %s", fd, + strerror (op_errno)); + goto err; + } + + + ret = io_submit (priv->ctxp, 1, &iocb); + if (ret != 1) { + gf_log (this->name, GF_LOG_ERROR, + "io_submit() returned %d", ret); + op_errno = -ret; + goto err; + } + + return 0; +err: + STACK_UNWIND_STRICT (writev, frame, -1, op_errno, 0, 0, 0); + + if (paiocb) { + if (paiocb->iobref) + iobref_unref (paiocb->iobref); + GF_FREE (paiocb); + } + + return 0; +} + + +void * +posix_aio_thread (void *data) +{ + xlator_t *this = NULL; + struct posix_private *priv = NULL; + int ret = 0; + int i = 0; + struct io_event events[POSIX_AIO_MAX_NR_GETEVENTS]; + struct io_event *event = NULL; + struct posix_aio_cb *paiocb = NULL; + + this = data; + THIS = this; + priv = this->private; + + for (;;) { + memset (&events[0], 0, sizeof (events)); + ret = io_getevents (priv->ctxp, 1, POSIX_AIO_MAX_NR_GETEVENTS, + &events[0], NULL); + if (ret <= 0) { + gf_log (this->name, GF_LOG_ERROR, + "io_getevents() returned %d", ret); + if (ret == -EINTR) + continue; + break; + } + + for (i = 0; i < ret; i++) { + event = &events[i]; + + paiocb = event->data; + + switch (paiocb->op) { + case GF_FOP_READ: + posix_aio_readv_complete (paiocb, event->res, + event->res2); + break; + case GF_FOP_WRITE: + posix_aio_writev_complete (paiocb, event->res, + event->res2); + break; + default: + gf_log (this->name, GF_LOG_ERROR, + "unknown op %d found in piocb", + paiocb->op); + break; + } + } + } + + return NULL; +} + + +int +posix_aio_init (xlator_t *this) +{ + struct posix_private *priv = NULL; + int ret = 0; + + priv = this->private; + + ret = io_setup (POSIX_AIO_MAX_NR_EVENTS, &priv->ctxp); + if ((ret == -1 && errno == ENOSYS) || ret == -ENOSYS) { + gf_log (this->name, GF_LOG_WARNING, + "Linux AIO not availble at run-time." + " Continuing with synchronous IO"); + ret = 0; + goto out; + } + + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "io_setup() failed. ret=%d, errno=%d", + ret, errno); + goto out; + } + + ret = pthread_create (&priv->aiothread, NULL, + posix_aio_thread, this); + if (ret != 0) { + io_destroy (priv->ctxp); + goto out; + } + + this->fops->readv = posix_aio_readv; + this->fops->writev = posix_aio_writev; +out: + return ret; +} + + +int +posix_aio_on (xlator_t *this) +{ + struct posix_private *priv = NULL; + int ret = 0; + + priv = this->private; + + if (!priv->aio_init_done) { + ret = posix_aio_init (this); + if (ret == 0) + priv->aio_capable = _gf_true; + else + priv->aio_capable = _gf_false; + priv->aio_init_done = _gf_true; + } + + if (priv->aio_capable) { + this->fops->readv = posix_aio_readv; + this->fops->writev = posix_aio_writev; + } + + return ret; +} + +int +posix_aio_off (xlator_t *this) +{ + this->fops->readv = posix_readv; + this->fops->writev = posix_writev; + + return 0; +} + + +#else + + +int +posix_aio_on (xlator_t *this) +{ + gf_log (this->name, GF_LOG_INFO, + "Linux AIO not availble at build-time." + " Continuing with synchronous IO"); + return 0; +} + +int +posix_aio_off (xlator_t *this) +{ + gf_log (this->name, GF_LOG_INFO, + "Linux AIO not availble at build-time." + " Continuing with synchronous IO"); + return 0; +} + +#endif diff --git a/xlators/storage/posix/src/posix-aio.h b/xlators/storage/posix/src/posix-aio.h new file mode 100644 index 00000000000..545130a2889 --- /dev/null +++ b/xlators/storage/posix/src/posix-aio.h @@ -0,0 +1,49 @@ +/* + Copyright (c) 2006-2012 Red Hat, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . +*/ + +#ifndef _POSIX_AIO_H +#define _POSIX_AIO_H + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "xlator.h" +#include "glusterfs.h" + +// Maximum number of concurrently submitted IO events. The heaviest load +// GlusterFS has been able to handle had 60-80 concurrent calls +#define POSIX_AIO_MAX_NR_EVENTS 256 + +// Maximum number of completed IO operations to reap per getevents syscall +#define POSIX_AIO_MAX_NR_GETEVENTS 16 + + +int posix_aio_on (xlator_t *this); +int posix_aio_off (xlator_t *this); + +int posix_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata); + +int posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *vector, int32_t count, off_t offset, + uint32_t flags, struct iobref *iobref, dict_t *xdata); + +#endif /* !_POSIX_AIO_H */ diff --git a/xlators/storage/posix/src/posix-helpers.c b/xlators/storage/posix/src/posix-helpers.c index c649482f4a5..3c19ba7a3fd 100644 --- a/xlators/storage/posix/src/posix-helpers.c +++ b/xlators/storage/posix/src/posix-helpers.c @@ -1058,5 +1058,29 @@ int posix_fd_ctx_get_off (fd_t *fd, xlator_t *this, struct posix_fd **pfd, off_t offset) { - return posix_fd_ctx_get (fd, this, pfd); + int ret; + int flags; + + LOCK (&fd->inode->lock); + { + ret = __posix_fd_ctx_get (fd, this, pfd); + if (ret) + goto unlock; + + if ((offset & 0xfff) && (*pfd)->odirect) { + flags = fcntl ((*pfd)->fd, F_GETFL); + ret = fcntl ((*pfd)->fd, F_SETFL, (flags & (~O_DIRECT))); + (*pfd)->odirect = 0; + } + + if (((offset & 0xfff) == 0) && (!(*pfd)->odirect)) { + flags = fcntl ((*pfd)->fd, F_GETFL); + ret = fcntl ((*pfd)->fd, F_SETFL, (flags | O_DIRECT)); + (*pfd)->odirect = 1; + } + } +unlock: + UNLOCK (&fd->inode->lock); + + return ret; } diff --git a/xlators/storage/posix/src/posix-mem-types.h b/xlators/storage/posix/src/posix-mem-types.h index 10aa75edc11..b8d81248c2d 100644 --- a/xlators/storage/posix/src/posix-mem-types.h +++ b/xlators/storage/posix/src/posix-mem-types.h @@ -30,6 +30,7 @@ enum gf_posix_mem_types_ { gf_posix_mt_int32_t, gf_posix_mt_posix_dev_t, gf_posix_mt_trash_path, + gf_posix_mt_paiocb, gf_posix_mt_end }; #endif diff --git a/xlators/storage/posix/src/posix.c b/xlators/storage/posix/src/posix.c index 23ea046a984..ce0f0be0a13 100644 --- a/xlators/storage/posix/src/posix.c +++ b/xlators/storage/posix/src/posix.c @@ -59,6 +59,7 @@ #include "timer.h" #include "glusterfs3-xdr.h" #include "hashfn.h" +#include "posix-aio.h" extern char *marker_xattrs[]; @@ -3918,6 +3919,29 @@ mem_acct_init (xlator_t *this) return ret; } + +int +reconfigure (xlator_t *this, dict_t *options) +{ + int ret = -1; + struct posix_private *priv = NULL; + + priv = this->private; + + GF_OPTION_RECONF ("linux-aio", priv->aio_configured, + options, bool, out); + + if (priv->aio_configured) + posix_aio_on (this); + else + posix_aio_off (this); + + ret = 0; +out: + return ret; +} + + /** * init - */ @@ -4248,7 +4272,23 @@ init (xlator_t *this) "Posix landfill setup failed"); ret = -1; goto out; - } + } + + _private->aio_init_done = _gf_false; + _private->aio_capable = _gf_false; + + GF_OPTION_INIT ("linux-aio", _private->aio_configured, bool, out); + + if (_private->aio_configured) { + op_ret = posix_aio_on (this); + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "Posix AIO init failed"); + ret = -1; + goto out; + } + } pthread_mutex_init (&_private->janitor_lock, NULL); pthread_cond_init (&_private->janitor_cond, NULL); @@ -4347,5 +4387,11 @@ struct volume_options options[] = { .type = GF_OPTION_TYPE_ANY }, { .key = {"glusterd-uuid"}, .type = GF_OPTION_TYPE_STR }, + { + .key = {"linux-aio"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "Support for native Linux AIO" + }, { .key = {NULL} } }; diff --git a/xlators/storage/posix/src/posix.h b/xlators/storage/posix/src/posix.h index 7c2b47bb02b..ce5c94c1686 100644 --- a/xlators/storage/posix/src/posix.h +++ b/xlators/storage/posix/src/posix.h @@ -54,6 +54,11 @@ #include "posix-mem-types.h" #include "posix-handle.h" +#ifdef HAVE_LIBAIO +#include +#include "posix-aio.h" +#endif + /** * posix_fd - internal structure common to file and directory fd's */ @@ -64,7 +69,6 @@ struct posix_fd { DIR * dir; /* handle returned by the kernel */ int flushwrites; int odirect; - int op_performed; struct list_head list; /* to add to the janitor list */ }; @@ -124,6 +128,13 @@ struct posix_private { /* uuid of glusterd that swapned the brick process */ uuid_t glusterd_uuid; + gf_boolean_t aio_configured; + gf_boolean_t aio_init_done; + gf_boolean_t aio_capable; +#ifdef HAVE_LIBAIO + io_context_t ctxp; + pthread_t aiothread; +#endif }; #define POSIX_BASE_PATH(this) (((struct posix_private *)this->private)->base_path) -- cgit