diff options
Diffstat (limited to 'xlators/storage/posix/src')
| -rw-r--r-- | xlators/storage/posix/src/Makefile.am | 17 | ||||
| -rw-r--r-- | xlators/storage/posix/src/posix-aio.c | 569 | ||||
| -rw-r--r-- | xlators/storage/posix/src/posix-aio.h | 39 | ||||
| -rw-r--r-- | xlators/storage/posix/src/posix-handle.c | 319 | ||||
| -rw-r--r-- | xlators/storage/posix/src/posix-handle.h | 137 | ||||
| -rw-r--r-- | xlators/storage/posix/src/posix-helpers.c | 789 | ||||
| -rw-r--r-- | xlators/storage/posix/src/posix-mem-types.h | 21 | ||||
| -rw-r--r-- | xlators/storage/posix/src/posix.c | 2665 | ||||
| -rw-r--r-- | xlators/storage/posix/src/posix.h | 117 |
9 files changed, 3903 insertions, 770 deletions
diff --git a/xlators/storage/posix/src/Makefile.am b/xlators/storage/posix/src/Makefile.am index 408dcb80d..88efcc784 100644 --- a/xlators/storage/posix/src/Makefile.am +++ b/xlators/storage/posix/src/Makefile.am @@ -2,17 +2,18 @@ xlator_LTLIBRARIES = posix.la xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/storage -posix_la_LDFLAGS = -module -avoidversion +posix_la_LDFLAGS = -module -avoid-version -posix_la_SOURCES = posix.c posix-helpers.c posix-handle.c -posix_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la +posix_la_SOURCES = posix.c posix-helpers.c posix-handle.c posix-aio.c +posix_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la $(LIBAIO) -noinst_HEADERS = posix.h posix-mem-types.h posix-handle.h +noinst_HEADERS = posix.h posix-mem-types.h posix-handle.h posix-aio.h -AM_CFLAGS = -fPIC -fno-strict-aliasing -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE \ - -D$(GF_HOST_OS) -Wall -I$(top_srcdir)/libglusterfs/src -shared \ - -nostartfiles -I$(top_srcdir)/rpc/xdr/src \ - -I$(top_srcdir)/rpc/rpc-lib/src $(GF_CFLAGS) +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ + -I$(top_srcdir)/rpc/xdr/src \ + -I$(top_srcdir)/rpc/rpc-lib/src + +AM_CFLAGS = -fno-strict-aliasing -Wall $(GF_CFLAGS) CLEANFILES = diff --git a/xlators/storage/posix/src/posix-aio.c b/xlators/storage/posix/src/posix-aio.c new file mode 100644 index 000000000..c3bbddd67 --- /dev/null +++ b/xlators/storage/posix/src/posix-aio.c @@ -0,0 +1,569 @@ +/* + Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "xlator.h" +#include "glusterfs.h" +#include "posix.h" +#include <sys/uio.h> + +#ifdef HAVE_LIBAIO +#include <libaio.h> + + +void +__posix_fd_set_odirect (fd_t *fd, struct posix_fd *pfd, int opflags, + off_t offset, size_t size) +{ + int odirect = 0; + int flags = 0; + int ret = 0; + + odirect = pfd->odirect; + + if ((fd->flags|opflags) & O_DIRECT) { + /* if instructed, use O_DIRECT always */ + odirect = 1; + } else { + /* else use O_DIRECT when feasible */ + if ((offset|size) & 0xfff) + odirect = 0; + else + odirect = 1; + } + + if (!odirect && pfd->odirect) { + flags = fcntl (pfd->fd, F_GETFL); + ret = fcntl (pfd->fd, F_SETFL, (flags & (~O_DIRECT))); + pfd->odirect = 0; + } + + if (odirect && !pfd->odirect) { + flags = fcntl (pfd->fd, F_GETFL); + ret = fcntl (pfd->fd, F_SETFL, (flags | O_DIRECT)); + pfd->odirect = 1; + } + + if (ret) { + gf_log (THIS->name, GF_LOG_WARNING, + "fcntl() failed (%s). fd=%d flags=%d pfd->odirect=%d", + strerror (errno), pfd->fd, flags, pfd->odirect); + } +} + + +struct posix_aio_cb { + struct iocb iocb; + call_frame_t *frame; + struct iobuf *iobuf; + struct iobref *iobref; + struct iatt prebuf; + int fd; + int op; + off_t offset; +}; + + +int +posix_aio_readv_complete (struct posix_aio_cb *paiocb, int res, int res2) +{ + call_frame_t *frame = NULL; + xlator_t *this = NULL; + struct iobuf *iobuf = NULL; + struct iatt postbuf = {0,}; + int _fd = -1; + int op_ret = -1; + int op_errno = 0; + struct iovec iov; + struct iobref *iobref = NULL; + int ret = 0; + off_t offset = 0; + struct posix_private * priv = NULL; + + + frame = paiocb->frame; + this = frame->this; + priv = this->private; + iobuf = paiocb->iobuf; + _fd = paiocb->fd; + offset = paiocb->offset; + + if (res < 0) { + op_ret = -1; + op_errno = -res; + gf_log (this->name, GF_LOG_ERROR, + "readv(async) failed fd=%d,size=%lu,offset=%llu (%d/%s)", + _fd, paiocb->iocb.u.c.nbytes, + (unsigned long long) paiocb->offset, + res, strerror (op_errno)); + goto out; + } + + ret = posix_fdstat (this, _fd, &postbuf); + if (ret != 0) { + op_ret = -1; + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "fstat failed on fd=%d: %s", _fd, + strerror (op_errno)); + goto out; + } + + op_ret = res; + op_errno = 0; + + iobref = iobref_new (); + if (!iobref) { + op_ret = -1; + op_errno = ENOMEM; + goto out; + } + + iobref_add (iobref, iobuf); + + iov.iov_base = iobuf_ptr (iobuf); + iov.iov_len = op_ret; + + + /* Hack to notify higher layers of EOF. */ + if (!postbuf.ia_size || (offset + iov.iov_len) >= postbuf.ia_size) + op_errno = ENOENT; + + LOCK (&priv->lock); + { + priv->read_value += op_ret; + } + UNLOCK (&priv->lock); + +out: + STACK_UNWIND_STRICT (readv, frame, op_ret, op_errno, &iov, 1, + &postbuf, iobref, NULL); + if (iobuf) + iobuf_unref (iobuf); + if (iobref) + iobref_unref (iobref); + + GF_FREE (paiocb); + + return 0; +} + + +int +posix_aio_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, + size_t size, off_t offset, uint32_t flags, dict_t *xdata) +{ + int32_t op_errno = EINVAL; + int _fd = -1; + struct iobuf *iobuf = NULL; + struct posix_fd * pfd = NULL; + int ret = -1; + struct posix_aio_cb *paiocb = NULL; + struct posix_private *priv = NULL; + struct iocb *iocb = NULL; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + priv = this->private; + + ret = posix_fd_ctx_get (fd, this, &pfd); + if (ret < 0) { + op_errno = -ret; + gf_log (this->name, GF_LOG_WARNING, + "pfd is NULL from fd=%p", fd); + goto err; + } + _fd = pfd->fd; + + if (!size) { + op_errno = EINVAL; + gf_log (this->name, GF_LOG_WARNING, "size=%"GF_PRI_SIZET, size); + goto err; + } + + iobuf = iobuf_get2 (this->ctx->iobuf_pool, size); + if (!iobuf) { + op_errno = ENOMEM; + goto err; + } + + paiocb = GF_CALLOC (1, sizeof (*paiocb), gf_posix_mt_paiocb); + if (!paiocb) { + op_errno = ENOMEM; + goto err; + } + + + paiocb->frame = frame; + paiocb->iobuf = iobuf; + paiocb->offset = offset; + paiocb->fd = _fd; + paiocb->op = GF_FOP_READ; + + paiocb->iocb.data = paiocb; + paiocb->iocb.aio_fildes = _fd; + paiocb->iocb.aio_lio_opcode = IO_CMD_PREAD; + paiocb->iocb.aio_reqprio = 0; + paiocb->iocb.u.c.buf = iobuf_ptr (iobuf); + paiocb->iocb.u.c.nbytes = size; + paiocb->iocb.u.c.offset = offset; + + iocb = &paiocb->iocb; + + LOCK (&fd->lock); + { + __posix_fd_set_odirect (fd, pfd, flags, offset, size); + + ret = io_submit (priv->ctxp, 1, &iocb); + } + UNLOCK (&fd->lock); + + if (ret != 1) { + gf_log (this->name, GF_LOG_ERROR, + "io_submit() returned %d", ret); + op_errno = -ret; + goto err; + } + + return 0; +err: + STACK_UNWIND_STRICT (readv, frame, -1, op_errno, 0, 0, 0, 0, 0); + if (iobuf) + iobuf_unref (iobuf); + + if (paiocb) + GF_FREE (paiocb); + + return 0; +} + + +int +posix_aio_writev_complete (struct posix_aio_cb *paiocb, int res, int res2) +{ + call_frame_t *frame = NULL; + xlator_t *this = NULL; + struct iatt prebuf = {0,}; + struct iatt postbuf = {0,}; + int _fd = -1; + int op_ret = -1; + int op_errno = 0; + int ret = 0; + struct posix_private * priv = NULL; + + + frame = paiocb->frame; + this = frame->this; + priv = this->private; + prebuf = paiocb->prebuf; + _fd = paiocb->fd; + + if (res < 0) { + op_ret = -1; + op_errno = -res; + gf_log (this->name, GF_LOG_ERROR, + "writev(async) failed fd=%d,offset=%llu (%d/%s)", + _fd, (unsigned long long) paiocb->offset, res, + strerror (op_errno)); + + goto out; + } + + ret = posix_fdstat (this, _fd, &postbuf); + if (ret != 0) { + op_ret = -1; + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "fstat failed on fd=%d: %s", _fd, + strerror (op_errno)); + goto out; + } + + + op_ret = res; + op_errno = 0; + + LOCK (&priv->lock); + { + priv->write_value += op_ret; + } + UNLOCK (&priv->lock); + +out: + STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, &prebuf, &postbuf, + NULL); + + if (paiocb) { + if (paiocb->iobref) + iobref_unref (paiocb->iobref); + GF_FREE (paiocb); + } + + return 0; +} + + +int +posix_aio_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *iov, int count, off_t offset, uint32_t flags, + struct iobref *iobref, dict_t *xdata) +{ + int32_t op_errno = EINVAL; + int _fd = -1; + struct posix_fd * pfd = NULL; + int ret = -1; + struct posix_aio_cb *paiocb = NULL; + struct posix_private *priv = NULL; + struct iocb *iocb = NULL; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + priv = this->private; + + ret = posix_fd_ctx_get (fd, this, &pfd); + if (ret < 0) { + op_errno = -ret; + gf_log (this->name, GF_LOG_WARNING, + "pfd is NULL from fd=%p", fd); + goto err; + } + _fd = pfd->fd; + + paiocb = GF_CALLOC (1, sizeof (*paiocb), gf_posix_mt_paiocb); + if (!paiocb) { + op_errno = ENOMEM; + goto err; + } + + + paiocb->frame = frame; + paiocb->offset = offset; + paiocb->fd = _fd; + paiocb->op = GF_FOP_WRITE; + + paiocb->iocb.data = paiocb; + paiocb->iocb.aio_fildes = _fd; + paiocb->iobref = iobref_ref (iobref); + paiocb->iocb.aio_lio_opcode = IO_CMD_PWRITEV; + paiocb->iocb.aio_reqprio = 0; + paiocb->iocb.u.v.vec = iov; + paiocb->iocb.u.v.nr = count; + paiocb->iocb.u.v.offset = offset; + + iocb = &paiocb->iocb; + + ret = posix_fdstat (this, _fd, &paiocb->prebuf); + if (ret != 0) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "fstat failed on fd=%p: %s", fd, + strerror (op_errno)); + goto err; + } + + + LOCK (&fd->lock); + { + __posix_fd_set_odirect (fd, pfd, flags, offset, + iov_length (iov, count)); + + ret = io_submit (priv->ctxp, 1, &iocb); + } + UNLOCK (&fd->lock); + + if (ret != 1) { + gf_log (this->name, GF_LOG_ERROR, + "io_submit() returned %d", ret); + op_errno = -ret; + goto err; + } + + return 0; +err: + STACK_UNWIND_STRICT (writev, frame, -1, op_errno, 0, 0, 0); + + if (paiocb) { + if (paiocb->iobref) + iobref_unref (paiocb->iobref); + GF_FREE (paiocb); + } + + return 0; +} + + +void * +posix_aio_thread (void *data) +{ + xlator_t *this = NULL; + struct posix_private *priv = NULL; + int ret = 0; + int i = 0; + struct io_event events[POSIX_AIO_MAX_NR_GETEVENTS]; + struct io_event *event = NULL; + struct posix_aio_cb *paiocb = NULL; + + this = data; + THIS = this; + priv = this->private; + + for (;;) { + memset (&events[0], 0, sizeof (events)); + ret = io_getevents (priv->ctxp, 1, POSIX_AIO_MAX_NR_GETEVENTS, + &events[0], NULL); + if (ret <= 0) { + gf_log (this->name, GF_LOG_ERROR, + "io_getevents() returned %d", ret); + if (ret == -EINTR) + continue; + break; + } + + for (i = 0; i < ret; i++) { + event = &events[i]; + + paiocb = event->data; + + switch (paiocb->op) { + case GF_FOP_READ: + posix_aio_readv_complete (paiocb, event->res, + event->res2); + break; + case GF_FOP_WRITE: + posix_aio_writev_complete (paiocb, event->res, + event->res2); + break; + default: + gf_log (this->name, GF_LOG_ERROR, + "unknown op %d found in piocb", + paiocb->op); + break; + } + } + } + + return NULL; +} + + +int +posix_aio_init (xlator_t *this) +{ + struct posix_private *priv = NULL; + int ret = 0; + + priv = this->private; + + ret = io_setup (POSIX_AIO_MAX_NR_EVENTS, &priv->ctxp); + if ((ret == -1 && errno == ENOSYS) || ret == -ENOSYS) { + gf_log (this->name, GF_LOG_WARNING, + "Linux AIO not available at run-time." + " Continuing with synchronous IO"); + ret = 0; + goto out; + } + + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "io_setup() failed. ret=%d, errno=%d", + ret, errno); + goto out; + } + + ret = gf_thread_create (&priv->aiothread, NULL, + posix_aio_thread, this); + if (ret != 0) { + io_destroy (priv->ctxp); + goto out; + } + + this->fops->readv = posix_aio_readv; + this->fops->writev = posix_aio_writev; +out: + return ret; +} + + +int +posix_aio_on (xlator_t *this) +{ + struct posix_private *priv = NULL; + int ret = 0; + + priv = this->private; + + if (!priv->aio_init_done) { + ret = posix_aio_init (this); + if (ret == 0) + priv->aio_capable = _gf_true; + else + priv->aio_capable = _gf_false; + priv->aio_init_done = _gf_true; + } + + if (priv->aio_capable) { + this->fops->readv = posix_aio_readv; + this->fops->writev = posix_aio_writev; + } + + return ret; +} + +int +posix_aio_off (xlator_t *this) +{ + this->fops->readv = posix_readv; + this->fops->writev = posix_writev; + + return 0; +} + + +#else + + +int +posix_aio_on (xlator_t *this) +{ + gf_log (this->name, GF_LOG_INFO, + "Linux AIO not available at build-time." + " Continuing with synchronous IO"); + return 0; +} + +int +posix_aio_off (xlator_t *this) +{ + gf_log (this->name, GF_LOG_INFO, + "Linux AIO not available at build-time." + " Continuing with synchronous IO"); + return 0; +} + +void +__posix_fd_set_odirect (fd_t *fd, struct posix_fd *pfd, int opflags, + off_t offset, size_t size) +{ + xlator_t *this = THIS; + gf_log (this->name, GF_LOG_INFO, + "Linux AIO not available at build-time." + " Continuing with synchronous IO"); + return; +} +#endif diff --git a/xlators/storage/posix/src/posix-aio.h b/xlators/storage/posix/src/posix-aio.h new file mode 100644 index 000000000..5bde71601 --- /dev/null +++ b/xlators/storage/posix/src/posix-aio.h @@ -0,0 +1,39 @@ +/* + Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ +#ifndef _POSIX_AIO_H +#define _POSIX_AIO_H + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "xlator.h" +#include "glusterfs.h" + +// Maximum number of concurrently submitted IO events. The heaviest load +// GlusterFS has been able to handle had 60-80 concurrent calls +#define POSIX_AIO_MAX_NR_EVENTS 256 + +// Maximum number of completed IO operations to reap per getevents syscall +#define POSIX_AIO_MAX_NR_GETEVENTS 16 + + +int posix_aio_on (xlator_t *this); +int posix_aio_off (xlator_t *this); + +int posix_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata); + +int posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *vector, int32_t count, off_t offset, + uint32_t flags, struct iobref *iobref, dict_t *xdata); + +#endif /* !_POSIX_AIO_H */ diff --git a/xlators/storage/posix/src/posix-handle.c b/xlators/storage/posix/src/posix-handle.c index 9b6df445e..adb8acc07 100644 --- a/xlators/storage/posix/src/posix-handle.c +++ b/xlators/storage/posix/src/posix-handle.c @@ -1,22 +1,12 @@ /* - Copyright (c) 2011 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. -*/ + Copyright (c) 2011-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ #ifndef _CONFIG_H #define _CONFIG_H #include "config.h" @@ -27,18 +17,176 @@ #include <sys/stat.h> #include <unistd.h> #include <libgen.h> +#ifdef GF_LINUX_HOST_OS #include <alloca.h> +#endif #include "posix-handle.h" #include "posix.h" #include "xlator.h" +#include "syscall.h" + +inode_t * +posix_resolve (xlator_t *this, inode_table_t *itable, inode_t *parent, + char *bname, struct iatt *iabuf) +{ + inode_t *inode = NULL, *linked_inode = NULL; + int ret = -1; + + ret = posix_istat (this, parent->gfid, bname, iabuf); + if (ret < 0) + goto out; + + inode = inode_find (itable, iabuf->ia_gfid); + if (inode == NULL) { + inode = inode_new (itable); + } + + linked_inode = inode_link (inode, parent, bname, iabuf); + inode_unref (inode); -#define HANDLE_PFX ".glusterfs" +out: + return linked_inode; +} -#define UUID0_STR "00000000-0000-0000-0000-000000000000" -#define SLEN(str) (sizeof(str) - 1) +int +posix_make_ancestral_node (const char *priv_base_path, char *path, int pathsize, + gf_dirent_t *head, + char *dir_name, struct iatt *iabuf, inode_t *inode, + int type, dict_t *xdata) +{ + gf_dirent_t *entry = NULL; + char real_path[PATH_MAX + 1] = {0, }, len = 0; + loc_t loc = {0, }; + int ret = -1; + len = strlen (path) + strlen (dir_name) + 1; + if (len > pathsize) { + goto out; + } + + strcat (path, dir_name); + + if (type & POSIX_ANCESTRY_DENTRY) { + entry = gf_dirent_for_name (dir_name); + if (!entry) { + gf_log (THIS->name, GF_LOG_ERROR, + "could not create gf_dirent for entry %s: (%s)", + dir_name, strerror (errno)); + goto out; + } + + entry->d_stat = *iabuf; + entry->inode = inode_ref (inode); + + list_add_tail (&entry->list, &head->list); + strcpy (real_path, priv_base_path); + strcat (real_path, "/"); + strcat (real_path, path); + loc.inode = inode_ref (inode); + uuid_copy (loc.gfid, inode->gfid); + + entry->dict = posix_lookup_xattr_fill (THIS, real_path, &loc, + xdata, iabuf); + loc_wipe (&loc); + } + + ret = 0; + +out: + return ret; +} + +int +posix_make_ancestryfromgfid (xlator_t *this, char *path, int pathsize, + gf_dirent_t *head, int type, uuid_t gfid, + const size_t handle_size, + const char *priv_base_path, inode_table_t *itable, + inode_t **parent, dict_t *xdata) +{ + char *linkname = NULL; /* "../../<gfid[0]>/<gfid[1]/" + "<gfidstr>/<NAME_MAX>" */ + char *dir_handle = NULL; + char *dir_name = NULL; + char *pgfidstr = NULL; + char *saveptr = NULL; + ssize_t len = 0; + inode_t *inode = NULL; + struct iatt iabuf = {0, }; + int ret = -1; + uuid_t tmp_gfid = {0, }; + + if (!path || !parent || !priv_base_path || uuid_is_null (gfid)) { + goto out; + } + + if (__is_root_gfid (gfid)) { + if (parent) { + if (*parent) { + inode_unref (*parent); + } + + *parent = inode_ref (itable->root); + } + + inode = itable->root; + + memset (&iabuf, 0, sizeof (iabuf)); + uuid_copy (iabuf.ia_gfid, inode->gfid); + iabuf.ia_type = inode->ia_type; + + ret = posix_make_ancestral_node (priv_base_path, path, pathsize, + head, "/", &iabuf, inode, type, + xdata); + return ret; + } + + dir_handle = alloca (handle_size); + linkname = alloca (PATH_MAX); + snprintf (dir_handle, handle_size, "%s/%s/%02x/%02x/%s", + priv_base_path, GF_HIDDEN_PATH, gfid[0], gfid[1], + uuid_utoa (gfid)); + + len = readlink (dir_handle, linkname, PATH_MAX); + if (len < 0) { + gf_log (this->name, GF_LOG_ERROR, "could not read the link " + "from the gfid handle %s (%s)", dir_handle, + strerror (errno)); + goto out; + } + + linkname[len] = '\0'; + + pgfidstr = strtok_r (linkname + SLEN("../../00/00/"), "/", &saveptr); + dir_name = strtok_r (NULL, "/", &saveptr); + strcat (dir_name, "/"); + + uuid_parse (pgfidstr, tmp_gfid); + + ret = posix_make_ancestryfromgfid (this, path, pathsize, head, type, + tmp_gfid, handle_size, + priv_base_path, itable, parent, + xdata); + if (ret < 0) { + goto out; + } + + memset (&iabuf, 0, sizeof (iabuf)); + + inode = posix_resolve (this, itable, *parent, dir_name, &iabuf); + + ret = posix_make_ancestral_node (priv_base_path, path, pathsize, head, + dir_name, &iabuf, inode, type, xdata); + if (*parent != NULL) { + inode_unref (*parent); + } + + *parent = inode; + +out: + return ret; +} int posix_handle_relpath (xlator_t *this, uuid_t gfid, const char *basename, @@ -104,7 +252,7 @@ posix_handle_pump (xlator_t *this, char *buf, int len, int maxlen, if ((ret == 8) && memcmp (linkname, "../../..", 8) == 0) { if (strcmp (base_str, buf) == 0) { - strncpy (buf + pfx_len, "..", 3); + strcpy (buf + pfx_len, ".."); } goto out; } @@ -195,13 +343,13 @@ posix_handle_path (xlator_t *this, uuid_t gfid, const char *basename, buf = alloca (maxlen); } - base_len = (priv->base_path_length + SLEN(HANDLE_PFX) + 45); + base_len = (priv->base_path_length + SLEN(GF_HIDDEN_PATH) + 45); base_str = alloca (base_len + 1); base_len = snprintf (base_str, base_len + 1, "%s/%s/%02x/%02x/%s", - priv->base_path, HANDLE_PFX, gfid[0], gfid[1], + priv->base_path, GF_HIDDEN_PATH, gfid[0], gfid[1], uuid_str); - pfx_len = priv->base_path_length + 1 + SLEN(HANDLE_PFX) + 1; + pfx_len = priv->base_path_length + 1 + SLEN(GF_HIDDEN_PATH) + 1; if (basename) { len = snprintf (buf, maxlen, "%s/%s", base_str, basename); @@ -243,7 +391,7 @@ posix_handle_gfid_path (xlator_t *this, uuid_t gfid, const char *basename, len = priv->base_path_length /* option directory "/export" */ + SLEN("/") - + SLEN(HANDLE_PFX) + + SLEN(GF_HIDDEN_PATH) + SLEN("/") + SLEN("00/") + SLEN("00/") @@ -274,10 +422,10 @@ posix_handle_gfid_path (xlator_t *this, uuid_t gfid, const char *basename, if (basename) { len = snprintf (buf, buflen, "%s/%s/%02x/%02x/%s/%s", priv->base_path, - HANDLE_PFX, gfid[0], gfid[1], uuid_str, basename); + GF_HIDDEN_PATH, gfid[0], gfid[1], uuid_str, basename); } else { len = snprintf (buf, buflen, "%s/%s/%02x/%02x/%s", priv->base_path, - HANDLE_PFX, gfid[0], gfid[1], uuid_str); + GF_HIDDEN_PATH, gfid[0], gfid[1], uuid_str); } out: return len; @@ -306,10 +454,10 @@ posix_handle_init (xlator_t *this) return -1; } - handle_pfx = alloca (priv->base_path_length + 1 + strlen (HANDLE_PFX) + handle_pfx = alloca (priv->base_path_length + 1 + strlen (GF_HIDDEN_PATH) + 1); - sprintf (handle_pfx, "%s/%s", priv->base_path, HANDLE_PFX); + sprintf (handle_pfx, "%s/%s", priv->base_path, GF_HIDDEN_PATH); ret = stat (handle_pfx, &stbuf); switch (ret) { @@ -391,6 +539,107 @@ posix_handle_init (xlator_t *this) return 0; } +gf_boolean_t +posix_does_old_trash_exists (char *old_trash) +{ + uuid_t gfid = {0}; + gf_boolean_t exists = _gf_false; + struct stat stbuf = {0}; + int ret = 0; + + ret = lstat (old_trash, &stbuf); + if ((ret == 0) && S_ISDIR (stbuf.st_mode)) { + ret = sys_lgetxattr (old_trash, "trusted.gfid", gfid, 16); + if ((ret < 0) && (errno == ENODATA)) + exists = _gf_true; + } + return exists; +} + +int +posix_handle_new_trash_init (xlator_t *this, char *trash) +{ + int ret = 0; + struct stat stbuf = {0}; + + ret = lstat (trash, &stbuf); + switch (ret) { + case -1: + if (errno == ENOENT) { + ret = mkdir (trash, 0755); + if (ret != 0) { + gf_log (this->name, GF_LOG_ERROR, + "Creating directory %s failed: %s", + trash, strerror (errno)); + } + } else { + gf_log (this->name, GF_LOG_ERROR, "Checking for %s " + "failed: %s", trash, strerror (errno)); + } + break; + case 0: + if (!S_ISDIR (stbuf.st_mode)) { + gf_log (this->name, GF_LOG_ERROR, + "Not a directory: %s", trash); + ret = -1; + } + break; + default: + break; + } + return ret; +} + +int +posix_mv_old_trash_into_new_trash (xlator_t *this, char *old, char *new) +{ + char dest_old[PATH_MAX] = {0}; + int ret = 0; + uuid_t dest_name = {0}; + + if (!posix_does_old_trash_exists (old)) + goto out; + uuid_generate (dest_name); + snprintf (dest_old, sizeof (dest_old), "%s/%s", new, + uuid_utoa (dest_name)); + ret = rename (old, dest_old); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, "Not able to move " + "%s -> %s (%s)", old, dest_old, strerror (errno)); + } +out: + return ret; +} + +int +posix_handle_trash_init (xlator_t *this) +{ + int ret = -1; + struct posix_private *priv = NULL; + char old_trash[PATH_MAX] = {0}; + + priv = this->private; + + priv->trash_path = GF_CALLOC (1, priv->base_path_length + strlen ("/") + + strlen (GF_HIDDEN_PATH) + strlen ("/") + + strlen (TRASH_DIR) + 1, + gf_posix_mt_trash_path); + + if (!priv->trash_path) + goto out; + + strncpy (priv->trash_path, priv->base_path, priv->base_path_length); + strcat (priv->trash_path, "/" GF_HIDDEN_PATH "/" TRASH_DIR); + ret = posix_handle_new_trash_init (this, priv->trash_path); + if (ret) + goto out; + snprintf (old_trash, sizeof (old_trash), "%s/.landfill", + priv->base_path); + ret = posix_mv_old_trash_into_new_trash (this, old_trash, + priv->trash_path); +out: + return ret; +} int posix_handle_mkdir_hashes (xlator_t *this, const char *newpath) @@ -452,7 +701,8 @@ posix_handle_hard (xlator_t *this, const char *oldpath, uuid_t gfid, struct stat return -1; } - ret = link (oldpath, newpath); + ret = sys_link (oldpath, newpath); + if (ret) { gf_log (this->name, GF_LOG_WARNING, "link %s -> %s failed (%s)", @@ -469,13 +719,6 @@ posix_handle_hard (xlator_t *this, const char *oldpath, uuid_t gfid, struct stat } } - ret = lstat (newpath, &newbuf); - if (ret) { - gf_log (this->name, GF_LOG_WARNING, - "lstat on %s failed (%s)", newpath, strerror (errno)); - return -1; - } - if (newbuf.st_ino != oldbuf->st_ino || newbuf.st_dev != oldbuf->st_dev) { gf_log (this->name, GF_LOG_WARNING, @@ -631,7 +874,7 @@ posix_create_link_if_gfid_exists (xlator_t *this, uuid_t gfid, MAKE_HANDLE_PATH (newpath, this, gfid, NULL); ret = lstat (newpath, &stbuf); if (!ret) { - ret = link (newpath, real_path); + ret = sys_link (newpath, real_path); } return ret; diff --git a/xlators/storage/posix/src/posix-handle.h b/xlators/storage/posix/src/posix-handle.h index a8fd9228a..31cbf83fd 100644 --- a/xlators/storage/posix/src/posix-handle.h +++ b/xlators/storage/posix/src/posix-handle.h @@ -1,22 +1,12 @@ /* - Copyright (c) 2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2011-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ - #ifndef _POSIX_HANDLE_H #define _POSIX_HANDLE_H @@ -27,9 +17,85 @@ #include <sys/types.h> #include "xlator.h" +#include "gf-dirent.h" + +#define TRASH_DIR "landfill" + +#define UUID0_STR "00000000-0000-0000-0000-000000000000" +#define SLEN(str) (sizeof(str) - 1) + +#define LOC_HAS_ABSPATH(loc) (loc && (loc->path) && (loc->path[0] == '/')) + +#define MAKE_PGFID_XATTR_KEY(var, prefix, pgfid) do { \ + var = alloca (strlen (prefix) + UUID_CANONICAL_FORM_LEN + 1); \ + strcpy (var, prefix); \ + strcat (var, uuid_utoa (pgfid)); \ + } while (0) + +#define SET_PGFID_XATTR(path, key, value, flags, op_ret, this, label) do { \ + value = hton32 (value); \ + op_ret = sys_lsetxattr (path, key, &value, sizeof (value), \ + flags); \ + if (op_ret == -1) { \ + op_errno = errno; \ + gf_log (this->name, GF_LOG_WARNING, \ + "setting xattr failed on %s: key = %s (%s)", \ + path, key, strerror (op_errno)); \ + goto label; \ + } \ + } while (0) -#define LOC_HAS_ABSPATH(loc) ((loc) && (loc->path) && (loc->path[0] == '/')) +#define REMOVE_PGFID_XATTR(path, key, op_ret, this, label) do { \ + op_ret = sys_lremovexattr (path, key); \ + if (op_ret == -1) { \ + op_errno = errno; \ + gf_log (this->name, GF_LOG_WARNING, "removing xattr " \ + "failed on %s: key = %s (%s)", path, key, \ + strerror (op_errno)); \ + goto label; \ + } \ + } while (0) + +/* should be invoked holding a lock */ +#define LINK_MODIFY_PGFID_XATTR(path, key, value, flags, op_ret, this, label) do { \ + op_ret = sys_lgetxattr (path, key, &value, sizeof (value)); \ + if (op_ret == -1) { \ + op_errno = errno; \ + if (op_errno == ENOATTR) { \ + value = 1; \ + } else { \ + gf_log (this->name, GF_LOG_WARNING,"getting xattr " \ + "failed on %s: key = %s (%s)", path, key, \ + strerror (op_errno)); \ + goto label; \ + } \ + } else { \ + value = ntoh32 (value); \ + value++; \ + } \ + SET_PGFID_XATTR (path, key, value, flags, op_ret, this, label); \ + } while (0) + +/* should be invoked holding a lock */ +#define UNLINK_MODIFY_PGFID_XATTR(path, key, value, flags, op_ret, this, label) do { \ + op_ret = sys_lgetxattr (path, key, &value, sizeof (value)); \ + if (op_ret == -1) { \ + op_errno = errno; \ + gf_log (this->name, GF_LOG_WARNING, "getting xattr failed on " \ + "%s: key = %s (%s)", path, key, strerror (op_errno)); \ + goto label; \ + } else { \ + value = ntoh32 (value); \ + value--; \ + if (value > 0) { \ + SET_PGFID_XATTR (path, key, value, flags, op_ret, \ + this, label); \ + } else { \ + REMOVE_PGFID_XATTR (path, key, op_ret, this, label); \ + } \ + } \ + } while (0) #define MAKE_REAL_PATH(var, this, path) do { \ var = alloca (strlen (path) + POSIX_BASE_PATH_LEN(this) + 2); \ @@ -37,7 +103,6 @@ strcpy (&var[POSIX_BASE_PATH_LEN(this)], path); \ } while (0) - #define MAKE_HANDLE_PATH(var, this, gfid, base) do { \ int __len; \ __len = posix_handle_path (this, gfid, base, NULL, 0); \ @@ -71,7 +136,12 @@ #define MAKE_INODE_HANDLE(rpath, this, loc, iatt_p) do { \ if (uuid_is_null (loc->gfid)) { \ gf_log (this->name, GF_LOG_ERROR, \ - "null gfid for path %s", loc->path); \ + "null gfid for path %s", (loc)->path); \ + break; \ + } \ + if (LOC_HAS_ABSPATH (loc)) { \ + MAKE_REAL_PATH (rpath, this, (loc)->path); \ + op_ret = posix_pstat (this, (loc)->gfid, rpath, iatt_p); \ break; \ } \ errno = 0; \ @@ -81,11 +151,6 @@ break; \ } \ /* __ret == -1 && errno == ELOOP */ \ - if (LOC_HAS_ABSPATH (loc)) { \ - MAKE_REAL_PATH (rpath, this, loc->path); \ - op_ret = posix_pstat (this, loc->gfid, rpath, iatt_p); \ - break; \ - } \ } while (0) @@ -98,6 +163,13 @@ break; \ } \ \ + if (LOC_HAS_ABSPATH (loc)) { \ + MAKE_REAL_PATH (entp, this, loc->path); \ + __parp = strdupa (entp); \ + parp = dirname (__parp); \ + op_ret = posix_pstat (this, NULL, entp, ent_p); \ + break; \ + } \ errno = 0; \ op_ret = posix_istat (this, loc->pargfid, loc->name, ent_p); \ if (errno != ELOOP) { \ @@ -106,21 +178,24 @@ break; \ } \ /* __ret == -1 && errno == ELOOP */ \ - if (LOC_HAS_ABSPATH (loc)) { \ - MAKE_REAL_PATH (entp, this, loc->path); \ - __parp = strdupa (entp); \ - parp = dirname (__parp); \ - op_ret = posix_pstat (this, NULL, entp, ent_p); \ - break; \ - } \ /* expand ELOOP */ \ } while (0) +#define POSIX_ANCESTRY_PATH (1 << 0) +#define POSIX_ANCESTRY_DENTRY (1 << 1) int posix_handle_path (xlator_t *this, uuid_t gfid, const char *basename, char *buf, size_t len); + +int +posix_make_ancestryfromgfid (xlator_t *this, char *path, int pathsize, + gf_dirent_t *head, int type, uuid_t gfid, + const size_t handle_size, + const char *priv_base_path, + inode_table_t *table, inode_t **parent, + dict_t *xdata); int posix_handle_path_safe (xlator_t *this, uuid_t gfid, const char *basename, char *buf, size_t len); @@ -148,4 +223,6 @@ int posix_handle_init (xlator_t *this); int posix_create_link_if_gfid_exists (xlator_t *this, uuid_t gfid, char *real_path); +int +posix_handle_trash_init (xlator_t *this); #endif /* !_POSIX_HANDLE_H */ diff --git a/xlators/storage/posix/src/posix-helpers.c b/xlators/storage/posix/src/posix-helpers.c index f4334302f..ab46f7f7e 100644 --- a/xlators/storage/posix/src/posix-helpers.c +++ b/xlators/storage/posix/src/posix-helpers.c @@ -1,22 +1,12 @@ /* - Copyright (c) 2006-2011 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. -*/ + Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ #ifndef _CONFIG_H #define _CONFIG_H #include "config.h" @@ -32,6 +22,7 @@ #include <pthread.h> #include <ftw.h> #include <sys/stat.h> +#include <signal.h> #ifndef GF_BSD_HOST_OS #include <alloca.h> @@ -54,25 +45,22 @@ #include "timer.h" #include "glusterfs3-xdr.h" #include "hashfn.h" +#include "glusterfs-acl.h" #include <fnmatch.h> -typedef struct { - xlator_t *this; - const char *real_path; - dict_t *xattr; - struct iatt *stbuf; - loc_t *loc; -} posix_xattr_filler_t; - char *marker_xattrs[] = {"trusted.glusterfs.quota.*", "trusted.glusterfs.*.xtime", NULL}; +char *marker_contri_key = "trusted.*.*.contri"; + static char* posix_ignore_xattrs[] = { "gfid-req", GLUSTERFS_ENTRYLK_COUNT, GLUSTERFS_INODELK_COUNT, GLUSTERFS_POSIXLK_COUNT, + GLUSTERFS_PARENT_ENTRYLK, + GF_GFIDLESS_LOOKUP, NULL }; @@ -117,15 +105,143 @@ out: return ignore; } -static void +static int +_posix_xattr_get_set_from_backend (posix_xattr_filler_t *filler, char *key) +{ + ssize_t xattr_size = -1; + int ret = 0; + char *value = NULL; + + xattr_size = sys_lgetxattr (filler->real_path, key, NULL, 0); + + if (xattr_size > 0) { + value = GF_CALLOC (1, xattr_size + 1, + gf_posix_mt_char); + if (!value) + goto out; + + xattr_size = sys_lgetxattr (filler->real_path, key, value, + xattr_size); + if (xattr_size <= 0) { + gf_log (filler->this->name, GF_LOG_WARNING, + "getxattr failed. path: %s, key: %s", + filler->real_path, key); + GF_FREE (value); + goto out; + } + + value[xattr_size] = '\0'; + ret = dict_set_bin (filler->xattr, key, + value, xattr_size); + if (ret < 0) { + gf_log (filler->this->name, GF_LOG_DEBUG, + "dict set failed. path: %s, key: %s", + filler->real_path, key); + GF_FREE (value); + goto out; + } + } + ret = 0; +out: + return ret; +} + +static int gf_posix_xattr_enotsup_log; + +static int +_posix_get_marker_all_contributions (posix_xattr_filler_t *filler) +{ + ssize_t size = -1, remaining_size = -1, list_offset = 0; + int ret = -1; + char *list = NULL, key[4096] = {0, }; + + size = sys_llistxattr (filler->real_path, NULL, 0); + if (size == -1) { + if ((errno == ENOTSUP) || (errno == ENOSYS)) { + GF_LOG_OCCASIONALLY (gf_posix_xattr_enotsup_log, + THIS->name, GF_LOG_WARNING, + "Extended attributes not " + "supported (try remounting brick" + " with 'user_xattr' flag)"); + + } else { + gf_log (THIS->name, GF_LOG_WARNING, + "listxattr failed on %s: %s", + filler->real_path, strerror (errno)); + + } + + goto out; + } + + if (size == 0) { + ret = 0; + goto out; + } + + list = alloca (size + 1); + if (!list) { + goto out; + } + + size = sys_llistxattr (filler->real_path, list, size); + if (size <= 0) { + ret = size; + goto out; + } + + remaining_size = size; + list_offset = 0; + + while (remaining_size > 0) { + if (*(list + list_offset) == '\0') + break; + strcpy (key, list + list_offset); + if (fnmatch (marker_contri_key, key, 0) == 0) { + ret = _posix_xattr_get_set_from_backend (filler, key); + } + + remaining_size -= strlen (key) + 1; + list_offset += strlen (key) + 1; + } + + ret = 0; + +out: + return ret; +} + +static int +_posix_get_marker_quota_contributions (posix_xattr_filler_t *filler, char *key) +{ + char *saveptr = NULL, *token = NULL, *tmp_key = NULL; + char *ptr = NULL; + int i = 0, ret = 0; + + tmp_key = ptr = gf_strdup (key); + for (i = 0; i < 4; i++) { + token = strtok_r (tmp_key, ".", &saveptr); + tmp_key = NULL; + } + + if (strncmp (token, "contri", strlen ("contri")) == 0) { + ret = _posix_get_marker_all_contributions (filler); + } else { + ret = _posix_xattr_get_set_from_backend (filler, key); + } + + GF_FREE (ptr); + + return ret; +} + +static int _posix_xattr_get_set (dict_t *xattr_req, char *key, data_t *data, void *xattrargs) { posix_xattr_filler_t *filler = xattrargs; - char *value = NULL; - ssize_t xattr_size = -1; int ret = -1; char *databuf = NULL; int _fd = -1; @@ -150,6 +266,16 @@ _posix_xattr_get_set (dict_t *xattr_req, goto err; } + /* + * There could be a situation where the ia_size is + * zero. GF_CALLOC will return a pointer to the + * memory initialized by gf_mem_set_acct_info. + * This function adds a header and a footer to + * the allocated memory. The returned pointer + * points to the memory just after the header, but + * when size is zero, there is no space for user + * data. The memory can be freed by calling GF_FREE. + */ databuf = GF_CALLOC (1, filler->stbuf->ia_size, gf_posix_mt_char); if (!databuf) { @@ -187,47 +313,41 @@ _posix_xattr_get_set (dict_t *xattr_req, err: if (_fd != -1) close (_fd); - if (databuf) - GF_FREE (databuf); + GF_FREE (databuf); } } else if (!strcmp (key, GLUSTERFS_OPEN_FD_COUNT)) { loc = filler->loc; - if (loc && !list_empty (&loc->inode->fd_list)) { - ret = dict_set_uint32 (filler->xattr, key, 1); - if (ret < 0) - gf_log (filler->this->name, GF_LOG_WARNING, - "Failed to set dictionary value for %s", - key); - } else { - ret = dict_set_uint32 (filler->xattr, key, 0); + if (loc) { + ret = dict_set_uint32 (filler->xattr, key, + loc->inode->fd_count); if (ret < 0) gf_log (filler->this->name, GF_LOG_WARNING, "Failed to set dictionary value for %s", key); } - } else { - xattr_size = sys_lgetxattr (filler->real_path, key, NULL, 0); - - if (xattr_size > 0) { - value = GF_CALLOC (1, xattr_size + 1, - gf_posix_mt_char); - if (!value) - return; - - sys_lgetxattr (filler->real_path, key, value, - xattr_size); + } else if (!strcmp (key, GET_ANCESTRY_PATH_KEY)) { + char *path = NULL; + ret = posix_get_ancestry (filler->this, filler->loc->inode, + NULL, &path, POSIX_ANCESTRY_PATH, + &filler->op_errno, xattr_req); + if (ret < 0) { + goto out; + } - value[xattr_size] = '\0'; - ret = dict_set_bin (filler->xattr, key, - value, xattr_size); - if (ret < 0) - gf_log (filler->this->name, GF_LOG_DEBUG, - "dict set failed. path: %s, key: %s", - filler->real_path, key); + ret = dict_set_dynstr (filler->xattr, GET_ANCESTRY_PATH_KEY, + path); + if (ret < 0) { + GF_FREE (path); + goto out; } + + } else if (fnmatch (marker_contri_key, key, 0) == 0) { + ret = _posix_get_marker_quota_contributions (filler, key); + } else { + ret = _posix_xattr_get_set_from_backend (filler, key); } out: - return; + return 0; } @@ -235,14 +355,17 @@ int posix_fill_gfid_path (xlator_t *this, const char *path, struct iatt *iatt) { int ret = 0; + ssize_t size = 0; if (!iatt) return 0; - ret = sys_lgetxattr (path, GFID_XATTR_KEY, iatt->ia_gfid, 16); + size = sys_lgetxattr (path, GFID_XATTR_KEY, iatt->ia_gfid, 16); /* Return value of getxattr */ - if ((ret == 16) || (ret == -1)) + if ((size == 16) || (size == -1)) ret = 0; + else + ret = size; return ret; } @@ -252,14 +375,17 @@ int posix_fill_gfid_fd (xlator_t *this, int fd, struct iatt *iatt) { int ret = 0; + ssize_t size = 0; if (!iatt) return 0; - ret = sys_fgetxattr (fd, GFID_XATTR_KEY, iatt->ia_gfid, 16); + size = sys_fgetxattr (fd, GFID_XATTR_KEY, iatt->ia_gfid, 16); /* Return value of getxattr */ - if ((ret == 16) || (ret == -1)) + if ((size == 16) || (size == -1)) ret = 0; + else + ret = size; return ret; } @@ -277,7 +403,7 @@ posix_fill_ino_from_gfid (xlator_t *this, struct iatt *buf) goto out; } for (i = 15; i > (15 - 8); i--) { - temp_ino += (uint64_t)(buf->ia_gfid[i]) << j; + temp_ino += (uint64_t)(buf->ia_gfid[i]) << j; j += 8; } buf->ia_ino = temp_ino; @@ -332,11 +458,21 @@ posix_istat (xlator_t *this, uuid_t gfid, const char *basename, ret = lstat (real_path, &lstatbuf); - if (ret == -1) { - if (errno != ENOENT && errno != ELOOP) - gf_log (this->name, GF_LOG_WARNING, - "lstat failed on %s (%s)", - real_path, strerror (errno)); + if (ret != 0) { + if (ret == -1) { + if (errno != ENOENT && errno != ELOOP) + gf_log (this->name, GF_LOG_WARNING, + "lstat failed on %s (%s)", + real_path, strerror (errno)); + } else { + // may be some backend filesystem issue + gf_log (this->name, GF_LOG_ERROR, "lstat failed on " + "%s and return value is %d instead of -1. " + "Please see dmesg output to check whether the " + "failure is due to backend filesystem issue", + real_path, ret); + ret = -1; + } goto out; } @@ -380,11 +516,21 @@ posix_pstat (xlator_t *this, uuid_t gfid, const char *path, ret = lstat (path, &lstatbuf); - if (ret == -1) { - if (errno != ENOENT) - gf_log (this->name, GF_LOG_WARNING, - "lstat failed on %s (%s)", - path, strerror (errno)); + if (ret != 0) { + if (ret == -1) { + if (errno != ENOENT) + gf_log (this->name, GF_LOG_WARNING, + "lstat failed on %s (%s)", + path, strerror (errno)); + } else { + // may be some backend filesytem issue + gf_log (this->name, GF_LOG_ERROR, "lstat failed on " + "%s and return value is %d instead of -1. " + "Please see dmesg output to check whether the " + "failure is due to backend filesystem issue", + path, ret); + ret = -1; + } goto out; } @@ -443,6 +589,7 @@ posix_gfid_set (xlator_t *this, const char *path, loc_t *loc, dict_t *xattr_req) void *uuid_req = NULL; uuid_t uuid_curr; int ret = 0; + ssize_t size = 0; struct stat stat = {0, }; @@ -452,8 +599,8 @@ posix_gfid_set (xlator_t *this, const char *path, loc_t *loc, dict_t *xattr_req) if (sys_lstat (path, &stat) != 0) goto out; - ret = sys_lgetxattr (path, GFID_XATTR_KEY, uuid_curr, 16); - if (ret == 16) { + size = sys_lgetxattr (path, GFID_XATTR_KEY, uuid_curr, 16); + if (size == 16) { ret = 0; goto verify_handle; } @@ -487,8 +634,8 @@ out: int -posix_set_file_contents (xlator_t *this, const char *path, data_pair_t *trav, - int flags) +posix_set_file_contents (xlator_t *this, const char *path, char *keyp, + data_t *value, int flags) { char * key = NULL; char real_path[PATH_MAX]; @@ -500,7 +647,7 @@ posix_set_file_contents (xlator_t *this, const char *path, data_pair_t *trav, /* XXX: does not handle assigning GFID to created files */ return -1; - key = &(trav->key[15]); + key = &(keyp[15]); sprintf (real_path, "%s/%s", path, key); if (flags & XATTR_REPLACE) { @@ -512,9 +659,8 @@ posix_set_file_contents (xlator_t *this, const char *path, data_pair_t *trav, goto create; } - if (trav->value->len) { - ret = write (file_fd, trav->value->data, - trav->value->len); + if (value->len) { + ret = write (file_fd, value->data, value->len); if (ret == -1) { op_ret = -errno; gf_log (this->name, GF_LOG_ERROR, @@ -546,7 +692,7 @@ posix_set_file_contents (xlator_t *this, const char *path, data_pair_t *trav, goto out; } - ret = write (file_fd, trav->value->data, trav->value->len); + ret = write (file_fd, value->data, value->len); if (ret == -1) { op_ret = -errno; gf_log (this->name, GF_LOG_ERROR, @@ -630,8 +776,7 @@ posix_get_file_contents (xlator_t *this, uuid_t pargfid, out: if (op_ret < 0) { - if (*contents) - GF_FREE (*contents); + GF_FREE (*contents); if (file_fd != -1) close (file_fd); } @@ -639,52 +784,81 @@ out: return op_ret; } +#ifdef GF_DARWIN_HOST_OS +static +void posix_dump_buffer (xlator_t *this, const char *real_path, const char *key, + data_t *value, int flags) +{ + char buffer[3*value->len+1]; + int index = 0; + buffer[0] = 0; + gf_loglevel_t log_level = gf_log_get_loglevel (); + if (log_level == GF_LOG_TRACE) { + char *data = (char *) value->data; + for (index = 0; index < value->len; index++) + sprintf(buffer+3*index, " %02x", data[index]); + } + gf_log (this->name, GF_LOG_DEBUG, + "Dump %s: key:%s flags: %u length:%u data:%s ", + real_path, key, flags, value->len, + (log_level == GF_LOG_TRACE ? buffer : "<skipped in DEBUG>")); +} +#endif + static int gf_xattr_enotsup_log; int posix_handle_pair (xlator_t *this, const char *real_path, - data_pair_t *trav, int flags) + char *key, data_t *value, int flags) { int sys_ret = -1; int ret = 0; - if (ZR_FILE_CONTENT_REQUEST(trav->key)) { - ret = posix_set_file_contents (this, real_path, trav, flags); + if (XATTR_IS_PATHINFO (key)) { + ret = -EACCES; + goto out; + } else if (ZR_FILE_CONTENT_REQUEST(key)) { + ret = posix_set_file_contents (this, real_path, key, value, + flags); } else { - sys_ret = sys_lsetxattr (real_path, trav->key, - trav->value->data, - trav->value->len, flags); - + sys_ret = sys_lsetxattr (real_path, key, value->data, + value->len, flags); +#ifdef GF_DARWIN_HOST_OS + posix_dump_buffer(this, real_path, key, value, flags); +#endif if (sys_ret < 0) { + ret = -errno; if (errno == ENOTSUP) { GF_LOG_OCCASIONALLY(gf_xattr_enotsup_log, this->name,GF_LOG_WARNING, "Extended attributes not " - "supported"); - } else if (errno == ENOENT && - !posix_special_xattr (marker_xattrs, - trav->key)) { - gf_log (this->name, GF_LOG_ERROR, - "setxattr on %s failed: %s", real_path, - strerror (errno)); + "supported (try remounting " + "brick with 'user_xattr' " + "flag)"); + } else if (errno == ENOENT) { + if (!posix_special_xattr (marker_xattrs, + key)) { + gf_log (this->name, GF_LOG_ERROR, + "setxattr on %s failed: %s", + real_path, strerror (errno)); + } } else { #ifdef GF_DARWIN_HOST_OS gf_log (this->name, ((errno == EINVAL) ? GF_LOG_DEBUG : GF_LOG_ERROR), - "%s: key:%s error:%s", - real_path, trav->key, + "%s: key:%s flags: %u length:%d error:%s", + real_path, key, flags, value->len, strerror (errno)); #else /* ! DARWIN */ gf_log (this->name, GF_LOG_ERROR, - "%s: key:%s error:%s", - real_path, trav->key, + "%s: key:%s flags: %u length:%d error:%s", + real_path, key, flags, value->len, strerror (errno)); #endif /* DARWIN */ } - ret = -errno; goto out; } } @@ -694,20 +868,28 @@ out: int posix_fhandle_pair (xlator_t *this, int fd, - data_pair_t *trav, int flags) + char *key, data_t *value, int flags) { int sys_ret = -1; int ret = 0; - sys_ret = sys_fsetxattr (fd, trav->key, trav->value->data, - trav->value->len, flags); + if (XATTR_IS_PATHINFO (key)) { + ret = -EACCES; + goto out; + } + + sys_ret = sys_fsetxattr (fd, key, value->data, + value->len, flags); if (sys_ret < 0) { + ret = -errno; if (errno == ENOTSUP) { GF_LOG_OCCASIONALLY(gf_xattr_enotsup_log, this->name,GF_LOG_WARNING, "Extended attributes not " - "supported"); + "supported (try remounting " + "brick with 'user_xattr' " + "flag)"); } else if (errno == ENOENT) { gf_log (this->name, GF_LOG_ERROR, "fsetxattr on fd=%d failed: %s", fd, @@ -719,17 +901,14 @@ posix_fhandle_pair (xlator_t *this, int fd, ((errno == EINVAL) ? GF_LOG_DEBUG : GF_LOG_ERROR), "fd=%d: key:%s error:%s", - fd, trav->key, - strerror (errno)); + fd, key, strerror (errno)); #else /* ! DARWIN */ gf_log (this->name, GF_LOG_ERROR, "fd=%d: key:%s error:%s", - fd, trav->key, - strerror (errno)); + fd, key, strerror (errno)); #endif /* DARWIN */ } - ret = -errno; goto out; } @@ -829,7 +1008,7 @@ posix_janitor_thread_proc (void *data) time (&now); if ((now - priv->last_landfill_check) > priv->janitor_sleep_duration) { gf_log (this->name, GF_LOG_TRACE, - "janitor cleaning out /" GF_REPLICATE_TRASH_DIR); + "janitor cleaning out %s", priv->trash_path); nftw (priv->trash_path, janitor_walker, @@ -870,8 +1049,8 @@ posix_spawn_janitor_thread (xlator_t *this) LOCK (&priv->lock); { if (!priv->janitor_present) { - ret = pthread_create (&priv->janitor, NULL, - posix_janitor_thread_proc, this); + ret = gf_thread_create (&priv->janitor, NULL, + posix_janitor_thread_proc, this); if (ret < 0) { gf_log (this->name, GF_LOG_ERROR, @@ -887,6 +1066,74 @@ unlock: UNLOCK (&priv->lock); } +static int +is_fresh_file (struct stat *stat) +{ + struct timeval tv; + + gettimeofday (&tv, NULL); + + if ((stat->st_ctime >= (tv.tv_sec - 1)) + && (stat->st_ctime <= tv.tv_sec)) + return 1; + + return 0; +} + + +int +posix_gfid_heal (xlator_t *this, const char *path, loc_t *loc, dict_t *xattr_req) +{ + /* The purpose of this function is to prevent a race + where an inode creation FOP (like mkdir/mknod/create etc) + races with lookup in the following way: + + {create thread} | {lookup thread} + | + t0 + mkdir ("name") | + t1 + | posix_gfid_set ("name", 2); + t2 + posix_gfid_set ("name", 1); | + t3 + lstat ("name"); | lstat ("name"); + + In the above case mkdir FOP would have resulted with GFID 2 while + it should have been GFID 1. It matters in the case where GFID would + have gotten set to 1 on other subvolumes of replciate/distribute + + The "solution" here is that, if we detect lookup is attempting to + set a GFID on a file which is created very recently, but does not + yet have a GFID (i.e, between t1 and t2), then "fake" it as though + posix_gfid_heal was called at t0 instead. + */ + + uuid_t uuid_curr; + int ret = 0; + struct stat stat = {0, }; + + if (!xattr_req) + goto out; + + if (sys_lstat (path, &stat) != 0) + goto out; + + ret = sys_lgetxattr (path, GFID_XATTR_KEY, uuid_curr, 16); + if (ret != 16) { + if (is_fresh_file (&stat)) { + ret = -1; + errno = ENOENT; + goto out; + } + } + + ret = posix_gfid_set (this, path, loc, xattr_req); +out: + return ret; +} + + int posix_acl_xattr_set (xlator_t *this, const char *path, dict_t *xattr_req) { @@ -900,17 +1147,17 @@ posix_acl_xattr_set (xlator_t *this, const char *path, dict_t *xattr_req) if (sys_lstat (path, &stat) != 0) goto out; - data = dict_get (xattr_req, "system.posix_acl_access"); + data = dict_get (xattr_req, POSIX_ACL_ACCESS_XATTR); if (data) { - ret = sys_lsetxattr (path, "system.posix_acl_access", + ret = sys_lsetxattr (path, POSIX_ACL_ACCESS_XATTR, data->data, data->len, 0); if (ret != 0) goto out; } - data = dict_get (xattr_req, "system.posix_acl_default"); + data = dict_get (xattr_req, POSIX_ACL_DEFAULT_XATTR); if (data) { - ret = sys_lsetxattr (path, "system.posix_acl_default", + ret = sys_lsetxattr (path, POSIX_ACL_DEFAULT_XATTR, data->data, data->len, 0); if (ret != 0) goto out; @@ -920,37 +1167,47 @@ out: return ret; } +static int +_handle_entry_create_keyvalue_pair (dict_t *d, char *k, data_t *v, + void *tmp) +{ + int ret = -1; + posix_xattr_filler_t *filler = NULL; + + filler = tmp; + + if (!strcmp (GFID_XATTR_KEY, k) || + !strcmp ("gfid-req", k) || + !strcmp (POSIX_ACL_DEFAULT_XATTR, k) || + !strcmp (POSIX_ACL_ACCESS_XATTR, k) || + ZR_FILE_CONTENT_REQUEST(k)) { + return 0; + } + + ret = posix_handle_pair (filler->this, filler->real_path, k, v, + XATTR_CREATE); + if (ret < 0) { + errno = -ret; + return -1; + } + return 0; +} + int posix_entry_create_xattr_set (xlator_t *this, const char *path, dict_t *dict) { - data_pair_t *trav = NULL; int ret = -1; + posix_xattr_filler_t filler = {0,}; + if (!dict) goto out; - trav = dict->members_list; - while (trav) { - if (!strcmp (GFID_XATTR_KEY, trav->key) || - !strcmp ("gfid-req", trav->key) || - !strcmp ("system.posix_acl_default", trav->key) || - !strcmp ("system.posix_acl_access", trav->key) || - ZR_FILE_CONTENT_REQUEST(trav->key)) { - trav = trav->next; - continue; - } + filler.this = this; + filler.real_path = path; - ret = posix_handle_pair (this, path, trav, XATTR_CREATE); - if (ret < 0) { - errno = -ret; - ret = -1; - goto out; - } - trav = trav->next; - } - - ret = 0; + ret = dict_foreach (dict, _handle_entry_create_keyvalue_pair, &filler); out: return ret; @@ -974,7 +1231,7 @@ __posix_fd_ctx_get (fd_t *fd, xlator_t *this, struct posix_fd **pfd_p) goto out; } - if (fd->pid != -1) + if (!fd_is_anonymous(fd)) /* anonymous fd */ goto out; @@ -1041,10 +1298,256 @@ posix_fd_ctx_get (fd_t *fd, xlator_t *this, struct posix_fd **pfd) return ret; } +static void * +posix_health_check_thread_proc (void *data) +{ + xlator_t *this = NULL; + struct posix_private *priv = NULL; + uint32_t interval = 0; + int ret = -1; + struct stat sb = {0, }; + + this = data; + priv = this->private; + + /* prevent races when the interval is updated */ + interval = priv->health_check_interval; + if (interval == 0) + goto out; + + gf_log (this->name, GF_LOG_DEBUG, "health-check thread started, " + "interval = %d seconds", interval); + + while (1) { + /* aborting sleep() is a request to exit this thread, sleep() + * will normally not return when cancelled */ + ret = sleep (interval); + if (ret > 0) + break; + + /* prevent thread errors while doing the health-check(s) */ + pthread_setcancelstate (PTHREAD_CANCEL_DISABLE, NULL); + + /* Do the health-check, it should be moved to its own function + * in case it gets more complex. */ + ret = stat (priv->base_path, &sb); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "stat() on %s returned: %s", priv->base_path, + strerror (errno)); + goto abort; + } + + pthread_setcancelstate (PTHREAD_CANCEL_ENABLE, NULL); + } + +out: + gf_log (this->name, GF_LOG_DEBUG, "health-check thread exiting"); + + LOCK (&priv->lock); + { + priv->health_check_active = _gf_false; + } + UNLOCK (&priv->lock); + + return NULL; + +abort: + /* health-check failed */ + gf_log (this->name, GF_LOG_EMERG, "health-check failed, going down"); + xlator_notify (this->parents->xlator, GF_EVENT_CHILD_DOWN, this); + + ret = sleep (30); + if (ret == 0) { + gf_log (this->name, GF_LOG_EMERG, "still alive! -> SIGTERM"); + kill (getpid(), SIGTERM); + } + + ret = sleep (30); + if (ret == 0) { + gf_log (this->name, GF_LOG_EMERG, "still alive! -> SIGKILL"); + kill (getpid(), SIGKILL); + } + + return NULL; +} + +void +posix_spawn_health_check_thread (xlator_t *xl) +{ + struct posix_private *priv = NULL; + int ret = -1; + + priv = xl->private; + + LOCK (&priv->lock); + { + /* cancel the running thread */ + if (priv->health_check_active == _gf_true) { + pthread_cancel (priv->health_check); + priv->health_check_active = _gf_false; + } + + /* prevent scheduling a check in a tight loop */ + if (priv->health_check_interval == 0) + goto unlock; + + ret = gf_thread_create (&priv->health_check, NULL, + posix_health_check_thread_proc, xl); + if (ret < 0) { + priv->health_check_interval = 0; + priv->health_check_active = _gf_false; + gf_log (xl->name, GF_LOG_ERROR, + "unable to setup health-check thread: %s", + strerror (errno)); + goto unlock; + } + + /* run the thread detached, resources will be freed on exit */ + pthread_detach (priv->health_check); + priv->health_check_active = _gf_true; + } +unlock: + UNLOCK (&priv->lock); +} int -posix_fd_ctx_get_off (fd_t *fd, xlator_t *this, struct posix_fd **pfd, - off_t offset) +posix_fsyncer_pick (xlator_t *this, struct list_head *head) { - return posix_fd_ctx_get (fd, this, pfd); + struct posix_private *priv = NULL; + int count = 0; + + priv = this->private; + pthread_mutex_lock (&priv->fsync_mutex); + { + while (list_empty (&priv->fsyncs)) + pthread_cond_wait (&priv->fsync_cond, + &priv->fsync_mutex); + + count = priv->fsync_queue_count; + priv->fsync_queue_count = 0; + list_splice_init (&priv->fsyncs, head); + } + pthread_mutex_unlock (&priv->fsync_mutex); + + return count; +} + + +void +posix_fsyncer_process (xlator_t *this, call_stub_t *stub, gf_boolean_t do_fsync) +{ + struct posix_fd *pfd = NULL; + int ret = -1; + struct posix_private *priv = NULL; + + priv = this->private; + + ret = posix_fd_ctx_get (stub->args.fd, this, &pfd); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "could not get fdctx for fd(%s)", + uuid_utoa (stub->args.fd->inode->gfid)); + call_unwind_error (stub, -1, EINVAL); + return; + } + + if (do_fsync) { + if (stub->args.datasync) + ret = sys_fdatasync (pfd->fd); + else + ret = sys_fsync (pfd->fd); + } else { + ret = 0; + } + + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "could not fstat fd(%s)", + uuid_utoa (stub->args.fd->inode->gfid)); + call_unwind_error (stub, -1, errno); + return; + } + + call_unwind_error (stub, 0, 0); +} + + +static void +posix_fsyncer_syncfs (xlator_t *this, struct list_head *head) +{ + call_stub_t *stub = NULL; + struct posix_fd *pfd = NULL; + int ret = -1; + + stub = list_entry (head->prev, call_stub_t, list); + ret = posix_fd_ctx_get (stub->args.fd, this, &pfd); + if (ret) + return; + +#ifdef GF_LINUX_HOST_OS + /* syncfs() is not "declared" in RHEL's glibc even though + the kernel has support. + */ +#include <sys/syscall.h> +#include <unistd.h> +#ifdef SYS_syncfs + syscall (SYS_syncfs, pfd->fd); +#else + sync(); +#endif +#else + sync(); +#endif +} + + +void * +posix_fsyncer (void *d) +{ + xlator_t *this = d; + struct posix_private *priv = NULL; + call_stub_t *stub = NULL; + call_stub_t *tmp = NULL; + struct list_head list; + int count = 0; + gf_boolean_t do_fsync = _gf_true; + + priv = this->private; + + for (;;) { + INIT_LIST_HEAD (&list); + + count = posix_fsyncer_pick (this, &list); + + usleep (priv->batch_fsync_delay_usec); + + gf_log (this->name, GF_LOG_DEBUG, + "picked %d fsyncs", count); + + switch (priv->batch_fsync_mode) { + case BATCH_NONE: + case BATCH_REVERSE_FSYNC: + break; + case BATCH_SYNCFS: + case BATCH_SYNCFS_SINGLE_FSYNC: + case BATCH_SYNCFS_REVERSE_FSYNC: + posix_fsyncer_syncfs (this, &list); + break; + } + + if (priv->batch_fsync_mode == BATCH_SYNCFS) + do_fsync = _gf_false; + else + do_fsync = _gf_true; + + list_for_each_entry_safe_reverse (stub, tmp, &list, list) { + list_del_init (&stub->list); + + posix_fsyncer_process (this, stub, do_fsync); + + if (priv->batch_fsync_mode == BATCH_SYNCFS_SINGLE_FSYNC) + do_fsync = _gf_false; + } + } } diff --git a/xlators/storage/posix/src/posix-mem-types.h b/xlators/storage/posix/src/posix-mem-types.h index 10aa75edc..81752c17e 100644 --- a/xlators/storage/posix/src/posix-mem-types.h +++ b/xlators/storage/posix/src/posix-mem-types.h @@ -1,22 +1,12 @@ /* - Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ - #ifndef __POSIX_MEM_TYPES_H__ #define __POSIX_MEM_TYPES_H__ @@ -30,6 +20,7 @@ enum gf_posix_mem_types_ { gf_posix_mt_int32_t, gf_posix_mt_posix_dev_t, gf_posix_mt_trash_path, + gf_posix_mt_paiocb, gf_posix_mt_end }; #endif diff --git a/xlators/storage/posix/src/posix.c b/xlators/storage/posix/src/posix.c index 168e7f7d2..f7800184e 100644 --- a/xlators/storage/posix/src/posix.c +++ b/xlators/storage/posix/src/posix.c @@ -1,22 +1,12 @@ /* - Copyright (c) 2006-2011 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. -*/ + Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ #ifndef _CONFIG_H #define _CONFIG_H #include "config.h" @@ -33,6 +23,8 @@ #include <pthread.h> #include <ftw.h> #include <sys/stat.h> +#include <signal.h> +#include <sys/uio.h> #ifndef GF_BSD_HOST_OS #include <alloca.h> @@ -59,8 +51,11 @@ #include "timer.h" #include "glusterfs3-xdr.h" #include "hashfn.h" +#include "posix-aio.h" +#include "glusterfs-acl.h" extern char *marker_xattrs[]; +#define ALIGN_SIZE 4096 #undef HAVE_SET_FSID #ifdef HAVE_SET_FSID @@ -84,7 +79,6 @@ extern char *marker_xattrs[]; #define SET_TO_OLD_FS_ID() #endif - int posix_forget (xlator_t *this, inode_t *inode) { @@ -110,16 +104,18 @@ posix_lookup (call_frame_t *frame, xlator_t *this, char * par_path = NULL; struct iatt postparent = {0,}; int32_t gfidless = 0; + struct posix_private *priv = NULL; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); VALIDATE_OR_GOTO (loc, out); - VALIDATE_OR_GOTO (loc->path, out); + + priv = this->private; /* The Hidden directory should be for housekeeping purpose and it should not get any gfid on it */ - if (__is_root_gfid (loc->pargfid) && - (strcmp (loc->name, GF_HIDDEN_PATH) == 0)) { + if (__is_root_gfid (loc->pargfid) && loc->name + && (strcmp (loc->name, GF_HIDDEN_PATH) == 0)) { gf_log (this->name, GF_LOG_WARNING, "Lookup issued on %s, which is not permitted", GF_HIDDEN_PATH); @@ -130,14 +126,14 @@ posix_lookup (call_frame_t *frame, xlator_t *this, op_ret = dict_get_int32 (xdata, GF_GFIDLESS_LOOKUP, &gfidless); op_ret = -1; - if (uuid_is_null (loc->pargfid)) { + if (uuid_is_null (loc->pargfid) || (loc->name == NULL)) { /* nameless lookup */ MAKE_INODE_HANDLE (real_path, this, loc, &buf); } else { MAKE_ENTRY_HANDLE (real_path, par_path, this, loc, &buf); if (uuid_is_null (loc->inode->gfid)) { - posix_gfid_set (this, real_path, loc, xdata); + posix_gfid_heal (this, real_path, loc, xdata); MAKE_ENTRY_HANDLE (real_path, par_path, this, loc, &buf); } @@ -169,6 +165,12 @@ parent: gf_log (this->name, GF_LOG_ERROR, "post-operation lstat on parent %s failed: %s", par_path, strerror (op_errno)); + if (op_errno == ENOENT) + /* If parent directory is missing in a lookup, + errno should be ESTALE (bad handle) and not + ENOENT (missing entry) + */ + op_errno = ESTALE; goto out; } } @@ -218,7 +220,8 @@ posix_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) if (op_ret == -1) { op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, + gf_log (this->name, (op_errno == ENOENT)? + GF_LOG_DEBUG:GF_LOG_ERROR, "lstat on %s failed: %s", real_path, strerror (op_errno)); goto out; @@ -359,23 +362,23 @@ posix_setattr (call_frame_t *frame, xlator_t *this, goto out; } - if (valid & GF_SET_ATTR_MODE) { - op_ret = posix_do_chmod (this, real_path, stbuf); + if (valid & (GF_SET_ATTR_UID | GF_SET_ATTR_GID)){ + op_ret = posix_do_chown (this, real_path, stbuf, valid); if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_ERROR, - "setattr (chmod) on %s failed: %s", real_path, + "setattr (chown) on %s failed: %s", real_path, strerror (op_errno)); goto out; } } - if (valid & (GF_SET_ATTR_UID | GF_SET_ATTR_GID)){ - op_ret = posix_do_chown (this, real_path, stbuf, valid); + if (valid & GF_SET_ATTR_MODE) { + op_ret = posix_do_chmod (this, real_path, stbuf); if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_ERROR, - "setattr (chown) on %s failed: %s", real_path, + "setattr (chmod) on %s failed: %s", real_path, strerror (op_errno)); goto out; } @@ -569,6 +572,318 @@ out: return 0; } +#ifdef FALLOC_FL_KEEP_SIZE +static int32_t +posix_do_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags, + off_t offset, size_t len, struct iatt *statpre, + struct iatt *statpost) +{ + struct posix_fd *pfd = NULL; + int32_t ret = -1; + + DECLARE_OLD_FS_ID_VAR; + + SET_FS_ID (frame->root->uid, frame->root->gid); + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (fd, out); + + ret = posix_fd_ctx_get (fd, this, &pfd); + if (ret < 0) { + gf_log (this->name, GF_LOG_DEBUG, + "pfd is NULL from fd=%p", fd); + goto out; + } + + ret = posix_fdstat (this, pfd->fd, statpre); + if (ret == -1) { + ret = -errno; + gf_log (this->name, GF_LOG_ERROR, + "fallocate (fstat) failed on fd=%p: %s", fd, + strerror (errno)); + goto out; + } + + ret = sys_fallocate(pfd->fd, flags, offset, len); + if (ret == -1) { + ret = -errno; + goto out; + } + + ret = posix_fdstat (this, pfd->fd, statpost); + if (ret == -1) { + ret = -errno; + gf_log (this->name, GF_LOG_ERROR, + "fallocate (fstat) failed on fd=%p: %s", fd, + strerror (errno)); + goto out; + } + +out: + SET_TO_OLD_FS_ID (); + + return ret; +} +#endif /* FALLOC_FL_KEEP_SIZE */ + +char* +_page_aligned_alloc (size_t size, char **aligned_buf) +{ + char *alloc_buf = NULL; + char *buf = NULL; + + alloc_buf = GF_CALLOC (1, (size + ALIGN_SIZE), gf_posix_mt_char); + if (!alloc_buf) + goto out; + /* page aligned buffer */ + buf = GF_ALIGN_BUF (alloc_buf, ALIGN_SIZE); + *aligned_buf = buf; +out: + return alloc_buf; +} + +static int32_t +_posix_do_zerofill(int fd, off_t offset, off_t len, int o_direct) +{ + off_t num_vect = 0; + off_t num_loop = 1; + off_t idx = 0; + int32_t op_ret = -1; + int32_t vect_size = VECTOR_SIZE; + off_t remain = 0; + off_t extra = 0; + struct iovec *vector = NULL; + char *iov_base = NULL; + char *alloc_buf = NULL; + + if (len == 0) + return 0; + if (len < VECTOR_SIZE) + vect_size = len; + + num_vect = len / (vect_size); + remain = len % vect_size ; + if (num_vect > MAX_NO_VECT) { + extra = num_vect % MAX_NO_VECT; + num_loop = num_vect / MAX_NO_VECT; + num_vect = MAX_NO_VECT; + } + + vector = GF_CALLOC (num_vect, sizeof(struct iovec), + gf_common_mt_iovec); + if (!vector) + return -1; + if (o_direct) { + alloc_buf = _page_aligned_alloc(vect_size, &iov_base); + if (!alloc_buf) { + gf_log ("_posix_do_zerofill", GF_LOG_DEBUG, + "memory alloc failed, vect_size %d: %s", + vect_size, strerror(errno)); + GF_FREE(vector); + return -1; + } + } else { + iov_base = GF_CALLOC (vect_size, sizeof(char), + gf_common_mt_char); + if (!iov_base) { + GF_FREE(vector); + return -1; + } + } + + for (idx = 0; idx < num_vect; idx++) { + vector[idx].iov_base = iov_base; + vector[idx].iov_len = vect_size; + } + if (lseek(fd, offset, SEEK_SET) < 0) { + op_ret = -1; + goto err; + } + + for (idx = 0; idx < num_loop; idx++) { + op_ret = writev(fd, vector, num_vect); + if (op_ret < 0) + goto err; + } + if (extra) { + op_ret = writev(fd, vector, extra); + if (op_ret < 0) + goto err; + } + if (remain) { + vector[0].iov_len = remain; + op_ret = writev(fd, vector , 1); + if (op_ret < 0) + goto err; + } +err: + if (o_direct) + GF_FREE(alloc_buf); + else + GF_FREE(iov_base); + GF_FREE(vector); + return op_ret; +} + +static int32_t +posix_do_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, + off_t offset, off_t len, struct iatt *statpre, + struct iatt *statpost) +{ + struct posix_fd *pfd = NULL; + int32_t ret = -1; + + DECLARE_OLD_FS_ID_VAR; + + SET_FS_ID (frame->root->uid, frame->root->gid); + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (fd, out); + + ret = posix_fd_ctx_get (fd, this, &pfd); + if (ret < 0) { + gf_log (this->name, GF_LOG_DEBUG, + "pfd is NULL from fd=%p", fd); + goto out; + } + + ret = posix_fdstat (this, pfd->fd, statpre); + if (ret == -1) { + ret = -errno; + gf_log (this->name, GF_LOG_ERROR, + "pre-operation fstat failed on fd = %p: %s", fd, + strerror (errno)); + goto out; + } + ret = _posix_do_zerofill(pfd->fd, offset, len, pfd->flags & O_DIRECT); + if (ret < 0) { + ret = -errno; + gf_log(this->name, GF_LOG_ERROR, + "zerofill failed on fd %d length %" PRId64 " %s", + pfd->fd, len, strerror(errno)); + goto out; + } + if (pfd->flags & (O_SYNC|O_DSYNC)) { + ret = fsync (pfd->fd); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "fsync() in writev on fd %d failed: %s", + pfd->fd, strerror (errno)); + ret = -errno; + goto out; + } + } + + ret = posix_fdstat (this, pfd->fd, statpost); + if (ret == -1) { + ret = -errno; + gf_log (this->name, GF_LOG_ERROR, + "post operation fstat failed on fd=%p: %s", fd, + strerror (errno)); + goto out; + } + +out: + SET_TO_OLD_FS_ID (); + + return ret; +} + +static int32_t +_posix_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t keep_size, + off_t offset, size_t len, dict_t *xdata) +{ + int32_t ret; +#ifndef FALLOC_FL_KEEP_SIZE + ret = EOPNOTSUPP; + +#else /* FALLOC_FL_KEEP_SIZE */ + int32_t flags = 0; + struct iatt statpre = {0,}; + struct iatt statpost = {0,}; + + if (keep_size) + flags = FALLOC_FL_KEEP_SIZE; + + ret = posix_do_fallocate(frame, this, fd, flags, offset, len, + &statpre, &statpost); + if (ret < 0) + goto err; + + STACK_UNWIND_STRICT(fallocate, frame, 0, 0, &statpre, &statpost, NULL); + return 0; + +err: +#endif /* FALLOC_FL_KEEP_SIZE */ + STACK_UNWIND_STRICT(fallocate, frame, -1, -ret, NULL, NULL, NULL); + return 0; +} + +static int32_t +posix_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + int32_t ret; +#ifndef FALLOC_FL_KEEP_SIZE + ret = EOPNOTSUPP; + +#else /* FALLOC_FL_KEEP_SIZE */ + int32_t flags = FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE; + struct iatt statpre = {0,}; + struct iatt statpost = {0,}; + + ret = posix_do_fallocate(frame, this, fd, flags, offset, len, + &statpre, &statpost); + if (ret < 0) + goto err; + + STACK_UNWIND_STRICT(discard, frame, 0, 0, &statpre, &statpost, NULL); + return 0; + +err: +#endif /* FALLOC_FL_KEEP_SIZE */ + STACK_UNWIND_STRICT(discard, frame, -1, -ret, NULL, NULL, NULL); + return 0; +} + +static int32_t +posix_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + off_t len, dict_t *xdata) +{ + int32_t ret = 0; + struct iatt statpre = {0,}; + struct iatt statpost = {0,}; + + ret = posix_do_zerofill(frame, this, fd, offset, len, + &statpre, &statpost); + if (ret < 0) + goto err; + + STACK_UNWIND_STRICT(zerofill, frame, 0, 0, &statpre, &statpost, NULL); + return 0; + +err: + STACK_UNWIND_STRICT(zerofill, frame, -1, -ret, NULL, NULL, NULL); + return 0; + +} + +static int32_t +posix_ipc (call_frame_t *frame, xlator_t *this, int32_t op, dict_t *xdata) +{ + /* + * IPC is for inter-translator communication. If one gets here, it + * means somebody sent one that nobody else recognized, which is an + * error much like an uncaught exception. + */ + gf_log (this->name, GF_LOG_ERROR, "GF_LOG_IPC(%d) not handled", op); + STACK_UNWIND_STRICT (ipc, frame, -1, -EOPNOTSUPP, NULL); + return 0; + +} + int32_t posix_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, dict_t *xdata) @@ -584,7 +899,6 @@ posix_opendir (call_frame_t *frame, xlator_t *this, VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); VALIDATE_OR_GOTO (loc, out); - VALIDATE_OR_GOTO (loc->path, out); VALIDATE_OR_GOTO (fd, out); SET_FS_ID (frame->root->uid, frame->root->gid); @@ -736,18 +1050,20 @@ int posix_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, dev_t dev, mode_t umask, dict_t *xdata) { - int tmp_fd = 0; - int32_t op_ret = -1; - int32_t op_errno = 0; - char *real_path = 0; - char *par_path = 0; - struct iatt stbuf = { 0, }; - char was_present = 1; - struct posix_private *priv = NULL; - gid_t gid = 0; - struct iatt preparent = {0,}; - struct iatt postparent = {0,}; - void * uuid_req = NULL; + int tmp_fd = 0; + int32_t op_ret = -1; + int32_t op_errno = 0; + char *real_path = 0; + char *par_path = 0; + struct iatt stbuf = { 0, }; + char was_present = 1; + struct posix_private *priv = NULL; + gid_t gid = 0; + struct iatt preparent = {0,}; + struct iatt postparent = {0,}; + void * uuid_req = NULL; + int32_t nlink_samepgfid = 0; + char *pgfid_xattr_key = NULL; DECLARE_OLD_FS_ID_VAR; @@ -850,6 +1166,16 @@ post_op: strerror (errno)); } + if (priv->update_pgfid_nlinks) { + MAKE_PGFID_XATTR_KEY (pgfid_xattr_key, PGFID_XATTR_KEY_PREFIX, + loc->pargfid); + nlink_samepgfid = 1; + + SET_PGFID_XATTR (real_path, pgfid_xattr_key, nlink_samepgfid, + XATTR_CREATE, op_ret, this, ignore); + } + +ignore: op_ret = posix_entry_create_xattr_set (this, real_path, xdata); if (op_ret) { gf_log (this->name, GF_LOG_ERROR, @@ -913,6 +1239,18 @@ posix_mkdir (call_frame_t *frame, xlator_t *this, VALIDATE_OR_GOTO (this, out); VALIDATE_OR_GOTO (loc, out); + /* The Hidden directory should be for housekeeping purpose and it + should not get created from a user request */ + if (__is_root_gfid (loc->pargfid) && + (strcmp (loc->name, GF_HIDDEN_PATH) == 0)) { + gf_log (this->name, GF_LOG_WARNING, + "mkdir issued on %s, which is not permitted", + GF_HIDDEN_PATH); + op_errno = EPERM; + op_ret = -1; + goto out; + } + priv = this->private; VALIDATE_OR_GOTO (priv, out); @@ -966,7 +1304,6 @@ posix_mkdir (call_frame_t *frame, xlator_t *this, goto out; } #endif - op_ret = posix_acl_xattr_set (this, real_path, xdata); if (op_ret) { gf_log (this->name, GF_LOG_ERROR, @@ -1020,15 +1357,17 @@ int32_t posix_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, dict_t *xdata) { - int32_t op_ret = -1; - int32_t op_errno = 0; - char *real_path = NULL; - char *par_path = NULL; - int32_t fd = -1; - struct iatt stbuf; - struct posix_private *priv = NULL; - struct iatt preparent = {0,}; - struct iatt postparent = {0,}; + int32_t op_ret = -1; + int32_t op_errno = 0; + char *real_path = NULL; + char *par_path = NULL; + int32_t fd = -1; + struct iatt stbuf = {0,}; + struct posix_private *priv = NULL; + struct iatt preparent = {0,}; + struct iatt postparent = {0,}; + char *pgfid_xattr_key = NULL; + int32_t nlink_samepgfid = 0; DECLARE_OLD_FS_ID_VAR; @@ -1066,6 +1405,26 @@ posix_unlink (call_frame_t *frame, xlator_t *this, } } + if (priv->update_pgfid_nlinks && (stbuf.ia_nlink > 1)) { + MAKE_PGFID_XATTR_KEY (pgfid_xattr_key, PGFID_XATTR_KEY_PREFIX, + loc->pargfid); + LOCK (&loc->inode->lock); + { + UNLINK_MODIFY_PGFID_XATTR (real_path, pgfid_xattr_key, + nlink_samepgfid, 0, op_ret, + this, unlock); + } + unlock: + UNLOCK (&loc->inode->lock); + + if (op_ret < 0) { + gf_log (this->name, GF_LOG_WARNING, "modification of " + "parent gfid xattr failed (path:%s gfid:%s)", + real_path, uuid_utoa (loc->inode->gfid)); + goto out; + } + } + op_ret = sys_unlink (real_path); if (op_ret == -1) { op_errno = errno; @@ -1108,6 +1467,7 @@ posix_rmdir (call_frame_t *frame, xlator_t *this, int32_t op_errno = 0; char * real_path = NULL; char * par_path = NULL; + char * gfid_str = NULL; struct iatt preparent = {0,}; struct iatt postparent = {0,}; struct iatt stbuf; @@ -1147,12 +1507,13 @@ posix_rmdir (call_frame_t *frame, xlator_t *this, } if (flags) { - uint32_t hashval = 0; - char *tmp_path = alloca (strlen (priv->trash_path) + 16); + gfid_str = uuid_utoa (stbuf.ia_gfid); + char *tmp_path = alloca (strlen (priv->trash_path) + + strlen ("/") + + strlen (gfid_str) + 1); mkdir (priv->trash_path, 0755); - hashval = gf_dm_hashfn (real_path, strlen (real_path)); - sprintf (tmp_path, "%s/%u", priv->trash_path, hashval); + sprintf (tmp_path, "%s/%s", priv->trash_path, gfid_str); op_ret = rename (real_path, tmp_path); } else { op_ret = rmdir (real_path); @@ -1205,16 +1566,18 @@ int posix_symlink (call_frame_t *frame, xlator_t *this, const char *linkname, loc_t *loc, mode_t umask, dict_t *xdata) { - int32_t op_ret = -1; - int32_t op_errno = 0; - char * real_path = 0; - char * par_path = 0; - struct iatt stbuf = { 0, }; - struct posix_private *priv = NULL; - gid_t gid = 0; - char was_present = 1; - struct iatt preparent = {0,}; - struct iatt postparent = {0,}; + int32_t op_ret = -1; + int32_t op_errno = 0; + char * real_path = 0; + char * par_path = 0; + struct iatt stbuf = { 0, }; + struct posix_private *priv = NULL; + gid_t gid = 0; + char was_present = 1; + struct iatt preparent = {0,}; + struct iatt postparent = {0,}; + char *pgfid_xattr_key = NULL; + int32_t nlink_samepgfid = 0; DECLARE_OLD_FS_ID_VAR; @@ -1275,7 +1638,6 @@ posix_symlink (call_frame_t *frame, xlator_t *this, goto out; } #endif - op_ret = posix_acl_xattr_set (this, real_path, xdata); if (op_ret) { gf_log (this->name, GF_LOG_ERROR, @@ -1283,6 +1645,14 @@ posix_symlink (call_frame_t *frame, xlator_t *this, strerror (errno)); } + if (priv->update_pgfid_nlinks) { + MAKE_PGFID_XATTR_KEY (pgfid_xattr_key, PGFID_XATTR_KEY_PREFIX, + loc->pargfid); + nlink_samepgfid = 1; + SET_PGFID_XATTR (real_path, pgfid_xattr_key, nlink_samepgfid, + XATTR_CREATE, op_ret, this, ignore); + } +ignore: op_ret = posix_entry_create_xattr_set (this, real_path, xdata); if (op_ret) { gf_log (this->name, GF_LOG_ERROR, @@ -1329,24 +1699,26 @@ int posix_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, dict_t *xdata) { - int32_t op_ret = -1; - int32_t op_errno = 0; - char *real_oldpath = NULL; - char *real_newpath = NULL; - char *par_oldpath = NULL; - char *par_newpath = NULL; - struct iatt stbuf = {0, }; - struct posix_private *priv = NULL; - char was_present = 1; - struct iatt preoldparent = {0, }; - struct iatt postoldparent = {0, }; - struct iatt prenewparent = {0, }; - struct iatt postnewparent = {0, }; + int32_t op_ret = -1; + int32_t op_errno = 0; + char *real_oldpath = NULL; + char *real_newpath = NULL; + char *par_oldpath = NULL; + char *par_newpath = NULL; + struct iatt stbuf = {0, }; + struct posix_private *priv = NULL; + char was_present = 1; + struct iatt preoldparent = {0, }; + struct iatt postoldparent = {0, }; + struct iatt prenewparent = {0, }; + struct iatt postnewparent = {0, }; char olddirid[64]; char newdirid[64]; - uuid_t victim = {0}; - int was_dir = 0; - int nlink = 0; + uuid_t victim = {0}; + int was_dir = 0; + int nlink = 0; + char *pgfid_xattr_key = NULL; + int32_t nlink_samepgfid = 0; DECLARE_OLD_FS_ID_VAR; @@ -1411,17 +1783,64 @@ posix_rename (call_frame_t *frame, xlator_t *this, goto out; } - if (IA_ISDIR (oldloc->inode->ia_type)) { + if (IA_ISDIR (oldloc->inode->ia_type)) posix_handle_unset (this, oldloc->inode->gfid, NULL); + + LOCK (&oldloc->inode->lock); + { + if (!IA_ISDIR (oldloc->inode->ia_type) + && priv->update_pgfid_nlinks) { + MAKE_PGFID_XATTR_KEY (pgfid_xattr_key, + PGFID_XATTR_KEY_PREFIX, + oldloc->pargfid); + UNLINK_MODIFY_PGFID_XATTR (real_oldpath, + pgfid_xattr_key, + nlink_samepgfid, 0, + op_ret, + this, unlock); + } + + op_ret = sys_rename (real_oldpath, real_newpath); + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, + (op_errno == ENOTEMPTY ? GF_LOG_DEBUG + : GF_LOG_ERROR), + "rename of %s to %s failed: %s", + real_oldpath, real_newpath, + strerror (op_errno)); + + if (priv->update_pgfid_nlinks + && !IA_ISDIR (oldloc->inode->ia_type)) { + LINK_MODIFY_PGFID_XATTR (real_oldpath, + pgfid_xattr_key, + nlink_samepgfid, 0, + op_ret, + this, unlock); + } + + goto unlock; + } + + if (!IA_ISDIR (oldloc->inode->ia_type) + && priv->update_pgfid_nlinks) { + MAKE_PGFID_XATTR_KEY (pgfid_xattr_key, + PGFID_XATTR_KEY_PREFIX, + newloc->pargfid); + LINK_MODIFY_PGFID_XATTR (real_newpath, + pgfid_xattr_key, + nlink_samepgfid, 0, + op_ret, + this, unlock); + } } +unlock: + UNLOCK (&oldloc->inode->lock); - op_ret = sys_rename (real_oldpath, real_newpath); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, - (op_errno == ENOTEMPTY ? GF_LOG_DEBUG : GF_LOG_ERROR), - "rename of %s to %s failed: %s", - real_oldpath, real_newpath, strerror (op_errno)); + if (op_ret < 0) { + gf_log (this->name, GF_LOG_WARNING, "modification of " + "parent gfid xattr failed (gfid:%s)", + uuid_utoa (oldloc->inode->gfid)); goto out; } @@ -1485,16 +1904,18 @@ int posix_link (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, dict_t *xdata) { - int32_t op_ret = -1; - int32_t op_errno = 0; - char *real_oldpath = 0; - char *real_newpath = 0; - char *par_newpath = 0; - struct iatt stbuf = {0, }; - struct posix_private *priv = NULL; - char was_present = 1; - struct iatt preparent = {0,}; - struct iatt postparent = {0,}; + int32_t op_ret = -1; + int32_t op_errno = 0; + char *real_oldpath = 0; + char *real_newpath = 0; + char *par_newpath = 0; + struct iatt stbuf = {0, }; + struct posix_private *priv = NULL; + char was_present = 1; + struct iatt preparent = {0,}; + struct iatt postparent = {0,}; + int32_t nlink_samepgfid = 0; + char *pgfid_xattr_key = NULL; DECLARE_OLD_FS_ID_VAR; @@ -1522,18 +1943,9 @@ posix_link (call_frame_t *frame, xlator_t *this, goto out; } -#ifdef HAVE_LINKAT - /* - * On most systems (Linux being the notable exception), link(2) - * first resolves symlinks. If the target is a directory or - * is nonexistent, it will fail. linkat(2) operates on the - * symlink instead of its target when the AT_SYMLINK_FOLLOW - * flag is not supplied. - */ - op_ret = linkat (AT_FDCWD, real_oldpath, AT_FDCWD, real_newpath, 0); -#else - op_ret = link (real_oldpath, real_newpath); -#endif + + op_ret = sys_link (real_oldpath, real_newpath); + if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_ERROR, @@ -1559,6 +1971,27 @@ posix_link (call_frame_t *frame, xlator_t *this, goto out; } + if (priv->update_pgfid_nlinks) { + MAKE_PGFID_XATTR_KEY (pgfid_xattr_key, PGFID_XATTR_KEY_PREFIX, + newloc->pargfid); + + LOCK (&newloc->inode->lock); + { + LINK_MODIFY_PGFID_XATTR (real_newpath, pgfid_xattr_key, + nlink_samepgfid, 0, op_ret, + this, unlock); + } + unlock: + UNLOCK (&newloc->inode->lock); + + if (op_ret < 0) { + gf_log (this->name, GF_LOG_WARNING, "modification of " + "parent gfid xattr failed (path:%s gfid:%s)", + real_newpath, uuid_utoa (newloc->inode->gfid)); + goto out; + } + } + op_ret = 0; out: @@ -1625,7 +2058,6 @@ posix_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, } op_ret = 0; - out: SET_TO_OLD_FS_ID (); @@ -1641,20 +2073,23 @@ posix_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) { - int32_t op_ret = -1; - int32_t op_errno = 0; - int32_t _fd = -1; - int _flags = 0; - char * real_path = NULL; - char * par_path = NULL; - struct iatt stbuf = {0, }; - struct posix_fd * pfd = NULL; - struct posix_private * priv = NULL; - char was_present = 1; - - gid_t gid = 0; - struct iatt preparent = {0,}; - struct iatt postparent = {0,}; + int32_t op_ret = -1; + int32_t op_errno = 0; + int32_t _fd = -1; + int _flags = 0; + char * real_path = NULL; + char * par_path = NULL; + struct iatt stbuf = {0, }; + struct posix_fd * pfd = NULL; + struct posix_private * priv = NULL; + char was_present = 1; + + gid_t gid = 0; + struct iatt preparent = {0,}; + struct iatt postparent = {0,}; + + int nlink_samepgfid = 0; + char * pgfid_xattr_key = NULL; DECLARE_OLD_FS_ID_VAR; @@ -1712,6 +2147,9 @@ posix_create (call_frame_t *frame, xlator_t *this, goto out; } + if (was_present) + goto fill_stat; + op_ret = posix_gfid_set (this, real_path, loc, xdata); if (op_ret) { gf_log (this->name, GF_LOG_ERROR, @@ -1727,7 +2165,6 @@ posix_create (call_frame_t *frame, xlator_t *this, real_path, strerror (op_errno)); } #endif - op_ret = posix_acl_xattr_set (this, real_path, xdata); if (op_ret) { gf_log (this->name, GF_LOG_ERROR, @@ -1735,6 +2172,14 @@ posix_create (call_frame_t *frame, xlator_t *this, strerror (errno)); } + if (priv->update_pgfid_nlinks) { + MAKE_PGFID_XATTR_KEY (pgfid_xattr_key, PGFID_XATTR_KEY_PREFIX, + loc->pargfid); + nlink_samepgfid = 1; + SET_PGFID_XATTR (real_path, pgfid_xattr_key, nlink_samepgfid, + XATTR_CREATE, op_ret, this, ignore); + } +ignore: op_ret = posix_entry_create_xattr_set (this, real_path, xdata); if (op_ret) { gf_log (this->name, GF_LOG_ERROR, @@ -1742,6 +2187,7 @@ posix_create (call_frame_t *frame, xlator_t *this, strerror (errno)); } +fill_stat: op_ret = posix_fdstat (this, _fd, &stbuf); if (op_ret == -1) { op_errno = errno; @@ -1796,7 +2242,7 @@ out: STACK_UNWIND_STRICT (create, frame, op_ret, op_errno, fd, (loc)?loc->inode:NULL, &stbuf, &preparent, - &postparent, NULL); + &postparent, xdata); return 0; } @@ -1878,9 +2324,6 @@ out: return 0; } -#define ALIGN_BUF(ptr,bound) ((void *)((unsigned long)(ptr + bound - 1) & \ - (unsigned long)(~(bound - 1)))) - int posix_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, off_t offset, uint32_t flags, dict_t *xdata) @@ -1962,11 +2405,7 @@ posix_readv (call_frame_t *frame, xlator_t *this, } /* Hack to notify higher layers of EOF. */ - if (stbuf.ia_size == 0) - op_errno = ENOENT; - else if ((offset + vec.iov_len) == stbuf.ia_size) - op_errno = ENOENT; - else if (offset > stbuf.ia_size) + if (!stbuf.ia_size || (offset + vec.iov_len) >= stbuf.ia_size) op_errno = ENOENT; op_ret = vec.iov_len; @@ -2011,14 +2450,12 @@ err: return op_ret; } - int32_t __posix_writev (int fd, struct iovec *vector, int count, off_t startoff, int odirect) { int32_t op_ret = 0; int idx = 0; - int align = 4096; int max_buf_size = 0; int retval = 0; char *buf = NULL; @@ -2034,7 +2471,7 @@ __posix_writev (int fd, struct iovec *vector, int count, off_t startoff, max_buf_size = vector[idx].iov_len; } - alloc_buf = GF_MALLOC (1 * (max_buf_size + align), gf_posix_mt_char); + alloc_buf = _page_aligned_alloc (max_buf_size, &buf); if (!alloc_buf) { op_ret = -errno; goto err; @@ -2042,9 +2479,6 @@ __posix_writev (int fd, struct iovec *vector, int count, off_t startoff, internal_off = startoff; for (idx = 0; idx < count; idx++) { - /* page aligned buffer */ - buf = ALIGN_BUF (alloc_buf, align); - memcpy (buf, vector[idx].iov_base, vector[idx].iov_len); /* not sure whether writev works on O_DIRECT'd fd */ @@ -2059,12 +2493,53 @@ __posix_writev (int fd, struct iovec *vector, int count, off_t startoff, } err: - if (alloc_buf) - GF_FREE (alloc_buf); + GF_FREE (alloc_buf); return op_ret; } +dict_t* +_fill_writev_xdata (fd_t *fd, dict_t *xdata, xlator_t *this, int is_append) +{ + dict_t *rsp_xdata = NULL; + int32_t ret = 0; + inode_t *inode = NULL; + + if (fd) + inode = fd->inode; + + if (!fd || !fd->inode || uuid_is_null (fd->inode->gfid)) { + gf_log_callingfn (this->name, GF_LOG_ERROR, "Invalid Args: " + "fd: %p inode: %p gfid:%s", fd, inode?inode:0, + inode?uuid_utoa(inode->gfid):"N/A"); + goto out; + } + + if (!xdata || !dict_get (xdata, GLUSTERFS_OPEN_FD_COUNT)) + goto out; + + rsp_xdata = dict_new(); + if (!rsp_xdata) + goto out; + + ret = dict_set_uint32 (rsp_xdata, GLUSTERFS_OPEN_FD_COUNT, + fd->inode->fd_count); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, "%s: Failed to set " + "dictionary value for %s", uuid_utoa (fd->inode->gfid), + GLUSTERFS_OPEN_FD_COUNT); + } + + ret = dict_set_uint32 (rsp_xdata, GLUSTERFS_WRITE_IS_APPEND, + is_append); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, "%s: Failed to set " + "dictionary value for %s", uuid_utoa (fd->inode->gfid), + GLUSTERFS_WRITE_IS_APPEND); + } +out: + return rsp_xdata; +} int32_t posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, @@ -2079,6 +2554,9 @@ posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt preop = {0,}; struct iatt postop = {0,}; int ret = -1; + dict_t *rsp_xdata = NULL; + int is_append = 0; + gf_boolean_t locked = _gf_false; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); @@ -2100,6 +2578,17 @@ posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, _fd = pfd->fd; + if (xdata && dict_get (xdata, GLUSTERFS_WRITE_IS_APPEND)) { + /* The write_is_append check and write must happen + atomically. Else another write can overtake this + write after the check and get written earlier. + + So lock before preop-stat and unlock after write. + */ + locked = _gf_true; + LOCK(&fd->inode->lock); + } + op_ret = posix_fdstat (this, _fd, &preop); if (op_ret == -1) { op_errno = errno; @@ -2109,8 +2598,19 @@ posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, goto out; } + if (locked) { + if (preop.ia_size == offset || (fd->flags & O_APPEND)) + is_append = 1; + } + op_ret = __posix_writev (_fd, vector, count, offset, (pfd->flags & O_DIRECT)); + + if (locked) { + UNLOCK (&fd->inode->lock); + locked = _gf_false; + } + if (op_ret < 0) { op_errno = -op_ret; op_ret = -1; @@ -2126,14 +2626,21 @@ posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, UNLOCK (&priv->lock); if (op_ret >= 0) { + rsp_xdata = _fill_writev_xdata (fd, xdata, this, is_append); /* wiretv successful, we also need to get the stat of * the file we wrote to */ - if (pfd->flushwrites) { - /* NOTE: ignore the error, if one occurs at this - * point */ - fsync (_fd); + if (flags & (O_SYNC|O_DSYNC)) { + ret = fsync (_fd); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "fsync() in writev on fd %d failed: %s", + _fd, strerror (errno)); + op_ret = -1; + op_errno = errno; + goto out; + } } ret = posix_fdstat (this, _fd, &postop); @@ -2149,9 +2656,16 @@ posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, out: + if (locked) { + UNLOCK (&fd->inode->lock); + locked = _gf_false; + } + STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, &preop, &postop, - NULL); + rsp_xdata); + if (rsp_xdata) + dict_unref (rsp_xdata); return 0; } @@ -2278,6 +2792,33 @@ out: } +int +posix_batch_fsync (call_frame_t *frame, xlator_t *this, + fd_t *fd, int datasync, dict_t *xdata) +{ + call_stub_t *stub = NULL; + struct posix_private *priv = NULL; + + priv = this->private; + + stub = fop_fsync_stub (frame, default_fsync, fd, datasync, xdata); + if (!stub) { + STACK_UNWIND_STRICT (fsync, frame, -1, ENOMEM, 0, 0, 0); + return 0; + } + + pthread_mutex_lock (&priv->fsync_mutex); + { + list_add_tail (&stub->list, &priv->fsyncs); + priv->fsync_queue_count++; + pthread_cond_signal (&priv->fsync_cond); + } + pthread_mutex_unlock (&priv->fsync_mutex); + + return 0; +} + + int32_t posix_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, dict_t *xdata) @@ -2289,6 +2830,7 @@ posix_fsync (call_frame_t *frame, xlator_t *this, int ret = -1; struct iatt preop = {0,}; struct iatt postop = {0,}; + struct posix_private *priv = NULL; DECLARE_OLD_FS_ID_VAR; @@ -2304,6 +2846,12 @@ posix_fsync (call_frame_t *frame, xlator_t *this, goto out; #endif + priv = this->private; + if (priv->batch_fsync_mode && xdata && dict_get (xdata, "batch-fsync")) { + posix_batch_fsync (frame, this, fd, datasync, xdata); + return 0; + } + ret = posix_fd_ctx_get (fd, this, &pfd); if (ret < 0) { op_errno = -ret; @@ -2325,16 +2873,14 @@ posix_fsync (call_frame_t *frame, xlator_t *this, if (datasync) { ; -#ifdef HAVE_FDATASYNC - op_ret = fdatasync (_fd); + op_ret = sys_fdatasync (_fd); if (op_ret == -1) { gf_log (this->name, GF_LOG_ERROR, "fdatasync on fd=%p failed: %s", fd, strerror (errno)); } -#endif } else { - op_ret = fsync (_fd); + op_ret = sys_fsync (_fd); if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_ERROR, @@ -2365,6 +2911,34 @@ out: } static int gf_posix_xattr_enotsup_log; +static int +_handle_setxattr_keyvalue_pair (dict_t *d, char *k, data_t *v, + void *tmp) +{ + posix_xattr_filler_t *filler = NULL; + + filler = tmp; + + return posix_handle_pair (filler->this, filler->real_path, k, v, + filler->flags); +} + +#ifdef GF_DARWIN_HOST_OS +static inline int +map_xattr_flags(int flags) +{ + /* DARWIN has different defines on XATTR_ flags. + There do not seem to be a POSIX standard + Parse any other flags over. + */ + int darwinflags = flags & ~(GF_XATTR_CREATE | GF_XATTR_REPLACE | XATTR_REPLACE); + if (GF_XATTR_CREATE & flags) + darwinflags |= XATTR_CREATE; + if (GF_XATTR_REPLACE & flags) + darwinflags |= XATTR_REPLACE; + return darwinflags; +} +#endif int32_t posix_setxattr (call_frame_t *frame, xlator_t *this, @@ -2373,8 +2947,8 @@ posix_setxattr (call_frame_t *frame, xlator_t *this, int32_t op_ret = -1; int32_t op_errno = 0; char * real_path = NULL; - data_pair_t * trav = NULL; - int ret = -1; + + posix_xattr_filler_t filler = {0,}; DECLARE_OLD_FS_ID_VAR; SET_FS_ID (frame->root->uid, frame->root->gid); @@ -2388,26 +2962,404 @@ posix_setxattr (call_frame_t *frame, xlator_t *this, op_ret = -1; dict_del (dict, GFID_XATTR_KEY); + dict_del (dict, GF_XATTR_VOL_ID_KEY); - trav = dict->members_list; + filler.real_path = real_path; + filler.this = this; +#ifdef GF_DARWIN_HOST_OS + filler.flags = map_xattr_flags(flags); +#else + filler.flags = flags; +#endif + op_ret = dict_foreach (dict, _handle_setxattr_keyvalue_pair, + &filler); + if (op_ret < 0) { + op_errno = -op_ret; + op_ret = -1; + } + +out: + SET_TO_OLD_FS_ID (); + + STACK_UNWIND_STRICT (setxattr, frame, op_ret, op_errno, NULL); + + return 0; +} + + +int +posix_xattr_get_real_filename (call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *key, dict_t *dict, dict_t *xdata) +{ + char *real_path = NULL; + struct dirent *dirent = NULL; + DIR *fd = NULL; + const char *fname = NULL; + char *found = NULL; + int ret = -1; + int op_ret = -1; + + MAKE_INODE_HANDLE (real_path, this, loc, NULL); + + fd = opendir (real_path); + if (!fd) + return -errno; + + fname = key + strlen (GF_XATTR_GET_REAL_FILENAME_KEY); + + while ((dirent = readdir (fd))) { + if (strcasecmp (dirent->d_name, fname) == 0) { + found = gf_strdup (dirent->d_name); + if (!found) { + closedir (fd); + return -ENOMEM; + } + break; + } + } + + closedir (fd); + + if (!found) + return -ENOENT; + + ret = dict_set_dynstr (dict, (char *)key, found); + if (ret) { + GF_FREE (found); + return -ENOMEM; + } + ret = strlen (found) + 1; + + return ret; +} + +int +posix_get_ancestry_directory (xlator_t *this, inode_t *leaf_inode, + gf_dirent_t *head, char **path, int type, + int32_t *op_errno, dict_t *xdata) +{ + ssize_t handle_size = 0; + struct posix_private *priv = NULL; + char dirpath[PATH_MAX+1] = {0,}; + inode_t *inode = NULL; + int ret = -1; + + priv = this->private; + + handle_size = POSIX_GFID_HANDLE_SIZE(priv->base_path_length); + + ret = posix_make_ancestryfromgfid (this, dirpath, PATH_MAX + 1, head, + type | POSIX_ANCESTRY_PATH, + leaf_inode->gfid, + handle_size, priv->base_path, + leaf_inode->table, &inode, xdata); + if (ret < 0) + goto out; + + + /* there is already a reference in loc->inode */ + inode_unref (inode); + + if ((type & POSIX_ANCESTRY_PATH) && (path != NULL)) { + if (strcmp (dirpath, "/")) + dirpath[strlen (dirpath) - 1] = '\0'; + + *path = gf_strdup (dirpath); + } + +out: + return ret; +} + +int32_t +posix_links_in_same_directory (char *dirpath, int count, inode_t *leaf_inode, + inode_t *parent, uint64_t ino, + gf_dirent_t *head, char **path, + int type, dict_t *xdata, int32_t *op_errno) +{ + DIR *dirp = NULL; + int op_ret = -1; + struct dirent *entry = NULL; + struct dirent *result = NULL; + inode_t *linked_inode = NULL; + gf_dirent_t *gf_entry = NULL; + char temppath[PATH_MAX+1] = {0,}; + xlator_t *this = NULL; + struct posix_private *priv = NULL; + char *tempv = NULL; + + this = THIS; + + priv = this->private; + + dirp = opendir (dirpath); + if (!dirp) { + *op_errno = errno; + gf_log (this->name, GF_LOG_WARNING, + "could not opendir %s: %s", dirpath, + strerror (*op_errno)); + goto out; + } + + entry = alloca (offsetof(struct dirent, d_name) + NAME_MAX + 1); + if (entry == NULL) + goto out; + + while (count > 0) { + *op_errno = readdir_r (dirp, entry, &result); + if ((result == NULL) || *op_errno) + break; + + if (entry->d_ino != ino) + continue; + + linked_inode = inode_link (leaf_inode, parent, + entry->d_name, NULL); + + GF_ASSERT (linked_inode == leaf_inode); + inode_unref (linked_inode); + + if (type & POSIX_ANCESTRY_DENTRY) { + loc_t loc = {0, }; + + loc.inode = inode_ref (leaf_inode); + uuid_copy (loc.gfid, leaf_inode->gfid); + + strcpy (temppath, dirpath); + strcat (temppath, "/"); + strcat (temppath, entry->d_name); + + gf_entry = gf_dirent_for_name (entry->d_name); + gf_entry->inode = inode_ref (leaf_inode); + gf_entry->dict + = posix_lookup_xattr_fill (this, + temppath, + &loc, xdata, + NULL); + list_add_tail (&gf_entry->list, &head->list); + loc_wipe (&loc); + } + + if (type & POSIX_ANCESTRY_PATH) { + strcpy (temppath, + &dirpath[priv->base_path_length]); + strcat (temppath, "/"); + strcat (temppath, entry->d_name); + if (!*path) { + *path = gf_strdup (temppath); + } else { + /* creating a colon separated */ + /* list of hard links */ + tempv = GF_REALLOC (*path, strlen (*path) + + 1 // ':' + + strlen (temppath) + 1 ); + if (!tempv) { + gf_log (this->name, GF_LOG_WARNING, + "realloc failed on path"); + GF_FREE (*path); + op_ret = -1; + *op_errno = ENOMEM; + goto out; + } + + *path = tempv; + strcat (*path, ":"); + strcat (*path, temppath); + } + } + + count--; + } + +out: + if (dirp) { + op_ret = closedir (dirp); + if (op_ret == -1) { + *op_errno = errno; + gf_log (this->name, GF_LOG_WARNING, + "closedir failed: %s", + strerror (*op_errno)); + } + } + + return op_ret; +} + +int +posix_get_ancestry_non_directory (xlator_t *this, inode_t *leaf_inode, + gf_dirent_t *head, char **path, int type, + int32_t *op_errno, dict_t *xdata) +{ + size_t remaining_size = 0; + char dirpath[PATH_MAX+1] = {0,}, *leaf_path = NULL; + int op_ret = -1, pathlen = -1; + ssize_t handle_size = 0; + char pgfidstr[UUID_CANONICAL_FORM_LEN+1] = {0,}; + uuid_t pgfid = {0, }; + int nlink_samepgfid = 0; + struct stat stbuf = {0,}; + char *list = NULL; + int32_t list_offset = 0; + char key[4096] = {0,}; + struct posix_private *priv = NULL; + ssize_t size = 0; + inode_t *parent = NULL; + loc_t *loc = NULL; + + priv = this->private; + + loc = GF_CALLOC (1, sizeof (*loc), gf_posix_mt_char); + if (loc == NULL) { + op_ret = -1; + *op_errno = ENOMEM; + goto out; + } + + uuid_copy (loc->gfid, leaf_inode->gfid); + + MAKE_INODE_HANDLE (leaf_path, this, loc, NULL); + + GF_FREE (loc); + + size = sys_llistxattr (leaf_path, NULL, 0); + if (size == -1) { + *op_errno = errno; + if ((errno == ENOTSUP) || (errno == ENOSYS)) { + GF_LOG_OCCASIONALLY (gf_posix_xattr_enotsup_log, + this->name, GF_LOG_WARNING, + "Extended attributes not " + "supported (try remounting brick" + " with 'user_xattr' flag)"); + + } else { + gf_log (this->name, GF_LOG_WARNING, + "listxattr failed on %s: %s", + leaf_path, strerror (*op_errno)); - while (trav) { - ret = posix_handle_pair (this, real_path, trav, flags); - if (ret < 0) { - op_errno = -ret; - goto out; } - trav = trav->next; + + goto out; + } + + if (size == 0) { + op_ret = 0; + goto out; } + list = alloca (size + 1); + if (!list) { + *op_errno = errno; + goto out; + } + + size = sys_llistxattr (leaf_path, list, size); + remaining_size = size; + list_offset = 0; + + op_ret = sys_lstat (leaf_path, &stbuf); + if (op_ret == -1) { + *op_errno = errno; + gf_log (this->name, GF_LOG_WARNING, "lstat failed" + " on %s: %s", leaf_path, + strerror (*op_errno)); + goto out; + } + + while (remaining_size > 0) { + if (*(list + list_offset) == '\0') + break; + strcpy (key, list + list_offset); + if (strncmp (key, PGFID_XATTR_KEY_PREFIX, + strlen (PGFID_XATTR_KEY_PREFIX)) != 0) + goto next; + + op_ret = sys_lgetxattr (leaf_path, key, + &nlink_samepgfid, + sizeof(nlink_samepgfid)); + if (op_ret == -1) { + *op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "getxattr failed on " + "%s: key = %s (%s)", + leaf_path, + key, + strerror (*op_errno)); + goto out; + } + + nlink_samepgfid = ntoh32 (nlink_samepgfid); + + strcpy (pgfidstr, key + strlen(PGFID_XATTR_KEY_PREFIX)); + uuid_parse (pgfidstr, pgfid); + + handle_size = POSIX_GFID_HANDLE_SIZE(priv->base_path_length); + + /* constructing the absolute real path of parent dir */ + strcpy (dirpath, priv->base_path); + pathlen = PATH_MAX + 1 - priv->base_path_length; + + op_ret = posix_make_ancestryfromgfid (this, + dirpath + priv->base_path_length, + pathlen, + head, + type | POSIX_ANCESTRY_PATH, + pgfid, + handle_size, + priv->base_path, + leaf_inode->table, + &parent, xdata); + if (op_ret < 0) { + goto next; + } + + dirpath[strlen (dirpath) - 1] = '\0'; + + posix_links_in_same_directory (dirpath, nlink_samepgfid, + leaf_inode, + parent, stbuf.st_ino, head, + path, type, xdata, op_errno); + + if (parent != NULL) { + inode_unref (parent); + parent = NULL; + } + + next: + remaining_size -= strlen (key) + 1; + list_offset += strlen (key) + 1; + } /* while (remaining_size > 0) */ + op_ret = 0; out: - SET_TO_OLD_FS_ID (); + return op_ret; +} - STACK_UNWIND_STRICT (setxattr, frame, op_ret, op_errno, NULL); +int +posix_get_ancestry (xlator_t *this, inode_t *leaf_inode, + gf_dirent_t *head, char **path, int type, int32_t *op_errno, + dict_t *xdata) +{ + int ret = -1; + struct posix_private *priv = NULL; - return 0; + priv = this->private; + + if (!priv->update_pgfid_nlinks) + goto out; + + if (IA_ISDIR (leaf_inode->ia_type)) { + ret = posix_get_ancestry_directory (this, leaf_inode, + head, path, type, op_errno, + xdata); + } else { + ret = posix_get_ancestry_non_directory (this, leaf_inode, + head, path, type, + op_errno, xdata); + } + +out: + return ret; } /** @@ -2419,23 +3371,23 @@ int32_t posix_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, const char *name, dict_t *xdata) { - struct posix_private *priv = NULL; - int32_t op_ret = -1; - int32_t op_errno = 0; - int32_t list_offset = 0; - size_t size = 0; - size_t remaining_size = 0; - char key[4096] = {0,}; - char host_buf[1024] = {0,}; - char *value = NULL; - char *list = NULL; - char *real_path = NULL; - dict_t *dict = NULL; - char *file_contents = NULL; - int ret = -1; - char *path = NULL; - char *rpath = NULL; - char *dyn_rpath = NULL; + struct posix_private *priv = NULL; + int32_t op_ret = -1; + int32_t op_errno = 0; + char host_buf[1024] = {0,}; + char *value = NULL; + char *real_path = NULL; + dict_t *dict = NULL; + char *file_contents = NULL; + int ret = -1; + char *path = NULL; + char *rpath = NULL; + char *dyn_rpath = NULL; + ssize_t size = 0; + char *list = NULL; + int32_t list_offset = 0; + size_t remaining_size = 0; + char keybuffer[4096] = {0,}; DECLARE_OLD_FS_ID_VAR; @@ -2462,12 +3414,31 @@ posix_getxattr (call_frame_t *frame, xlator_t *this, } } - /* Get the total size */ - dict = get_new_dict (); + dict = dict_new (); if (!dict) { + op_errno = ENOMEM; goto out; } + if (loc->inode && name && + (strncmp (name, GF_XATTR_GET_REAL_FILENAME_KEY, + strlen (GF_XATTR_GET_REAL_FILENAME_KEY)) == 0)) { + ret = posix_xattr_get_real_filename (frame, this, loc, + name, dict, xdata); + if (ret < 0) { + op_ret = -1; + op_errno = -ret; + gf_log (this->name, (op_errno == ENOENT) ? + GF_LOG_DEBUG : GF_LOG_WARNING, + "Failed to get real filename (%s, %s): %s", + loc->path, name, strerror (op_errno)); + goto out; + } + + size = ret; + goto done; + } + if (loc->inode && name && !strcmp (name, GLUSTERFS_OPEN_FD_COUNT)) { if (!list_empty (&loc->inode->fd_list)) { ret = dict_set_uint32 (dict, (char *)name, 1); @@ -2484,15 +3455,19 @@ posix_getxattr (call_frame_t *frame, xlator_t *this, } goto done; } - if (loc->inode && name && - (strcmp (name, GF_XATTR_PATHINFO_KEY) == 0)) { + if (loc->inode && name && (XATTR_IS_PATHINFO (name))) { if (LOC_HAS_ABSPATH (loc)) MAKE_REAL_PATH (rpath, this, loc->path); else rpath = real_path; - (void) snprintf (host_buf, 1024, "<POSIX(%s):%s:%s>", - priv->base_path, priv->hostname, rpath); + (void) snprintf (host_buf, 1024, + "<POSIX(%s):%s:%s>", priv->base_path, + ((priv->node_uuid_pathinfo + && !uuid_is_null(priv->glusterd_uuid)) + ? uuid_utoa (priv->glusterd_uuid) + : priv->hostname), + rpath); dyn_rpath = gf_strdup (host_buf); if (!dyn_rpath) { @@ -2500,12 +3475,12 @@ posix_getxattr (call_frame_t *frame, xlator_t *this, goto done; } size = strlen (dyn_rpath) + 1; - ret = dict_set_dynstr (dict, GF_XATTR_PATHINFO_KEY, - dyn_rpath); + ret = dict_set_dynstr (dict, (char *)name, dyn_rpath); if (ret < 0) { gf_log (this->name, GF_LOG_WARNING, "could not set value (%s) in dictionary", dyn_rpath); + GF_FREE (dyn_rpath); } goto done; @@ -2530,6 +3505,7 @@ posix_getxattr (call_frame_t *frame, xlator_t *this, gf_log (this->name, GF_LOG_WARNING, "could not set value (%s) in dictionary", dyn_rpath); + GF_FREE (dyn_rpath); } goto done; } @@ -2548,32 +3524,97 @@ posix_getxattr (call_frame_t *frame, xlator_t *this, gf_log (this->name, GF_LOG_WARNING, "could not set value (%s) in dictionary", host_buf); + GF_FREE (path); } goto done; } - if (name) { - strcpy (key, name); + if (loc->inode && name + && (strcmp (name, GET_ANCESTRY_PATH_KEY) == 0)) { + int type = POSIX_ANCESTRY_PATH; - size = sys_lgetxattr (real_path, key, NULL, 0); - if (size == -1) { + op_ret = posix_get_ancestry (this, loc->inode, NULL, + &path, type, &op_errno, + xdata); + if (op_ret < 0) { op_ret = -1; - op_errno = errno; + op_errno = ENODATA; goto out; } + + op_ret = dict_set_dynstr (dict, GET_ANCESTRY_PATH_KEY, path); + if (op_ret < 0) { + gf_log (this->name, GF_LOG_WARNING, "could not get " + "value for key (%s)", GET_ANCESTRY_PATH_KEY); + GF_FREE (path); + op_errno = -op_ret; + op_ret = -1; + } + + goto done; + } + + if (name) { + strcpy (keybuffer, name); + char *key = keybuffer; +#if defined(GF_DARWIN_HOST_OS_DISABLED) + if (priv->xattr_user_namespace == XATTR_STRIP) { + if (strncmp(key, "user.",5) == 0) { + key += 5; + gf_log (this->name, + GF_LOG_DEBUG, + "getxattr for file %s" + " stripping user key: %s -> %s", + real_path, keybuffer, key); + } + } +#endif + size = sys_lgetxattr (real_path, key, NULL, 0); + if (size <= 0) { + op_errno = errno; + if ((op_errno == ENOTSUP) || (op_errno == ENOSYS)) { + GF_LOG_OCCASIONALLY (gf_posix_xattr_enotsup_log, + this->name, GF_LOG_WARNING, + "Extended attributes not " + "supported (try remounting" + " brick with 'user_xattr' " + "flag)"); + } else if (op_errno == ENOATTR || + op_errno == ENODATA) { + gf_log (this->name, GF_LOG_DEBUG, + "No such attribute:%s for file %s", + key, real_path); + } else { + gf_log (this->name, GF_LOG_ERROR, + "getxattr failed on %s: %s (%s)", + real_path, key, strerror (op_errno)); + } + + goto done; + } value = GF_CALLOC (size + 1, sizeof(char), gf_posix_mt_char); if (!value) { op_ret = -1; + op_errno = ENOMEM; goto out; } - op_ret = sys_lgetxattr (real_path, key, value, size); - if (op_ret == -1) { + size = sys_lgetxattr (real_path, key, value, size); + if (size == -1) { + op_ret = -1; op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, "getxattr failed on " + "%s: key = %s (%s)", real_path, key, + strerror (op_errno)); + GF_FREE (value); goto out; } - value [op_ret] = '\0'; - op_ret = dict_set_dynptr (dict, key, value, op_ret); + value [size] = '\0'; + op_ret = dict_set_dynptr (dict, key, value, size); if (op_ret < 0) { + op_errno = -op_ret; + gf_log (this->name, GF_LOG_ERROR, "dict set operation " + "on %s for the key %s failed.", real_path, key); + GF_FREE (value); goto out; } @@ -2587,7 +3628,9 @@ posix_getxattr (call_frame_t *frame, xlator_t *this, GF_LOG_OCCASIONALLY (gf_posix_xattr_enotsup_log, this->name, GF_LOG_WARNING, "Extended attributes not " - "supported."); + "supported (try remounting" + " brick with 'user_xattr' " + "flag)"); } else { gf_log (this->name, GF_LOG_ERROR, @@ -2613,33 +3656,55 @@ posix_getxattr (call_frame_t *frame, xlator_t *this, while (remaining_size > 0) { if (*(list + list_offset) == '\0') break; - - strcpy (key, list + list_offset); - op_ret = sys_lgetxattr (real_path, key, NULL, 0); - if (op_ret == -1) + strcpy (keybuffer, list + list_offset); + size = sys_lgetxattr (real_path, keybuffer, NULL, 0); + if (size == -1) { + op_ret = -1; + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, "getxattr failed on " + "%s: key = %s (%s)", real_path, keybuffer, + strerror (op_errno)); break; + } - value = GF_CALLOC (op_ret + 1, sizeof(char), + value = GF_CALLOC (size + 1, sizeof(char), gf_posix_mt_char); if (!value) { op_errno = errno; goto out; } - op_ret = sys_lgetxattr (real_path, key, value, op_ret); - if (op_ret == -1) { + size = sys_lgetxattr (real_path, keybuffer, value, size); + if (size == -1) { + op_ret = -1; op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, "getxattr failed on " + "%s: key = %s (%s)", real_path, keybuffer, + strerror (op_errno)); + GF_FREE (value); break; } - value [op_ret] = '\0'; - op_ret = dict_set_dynptr (dict, key, value, op_ret); + value [size] = '\0'; +#ifdef GF_DARWIN_HOST_OS + /* The protocol expect namespace for now */ + char *newkey = NULL; + gf_add_prefix (XATTR_USER_PREFIX, keybuffer, &newkey); + strcpy (keybuffer, newkey); + GF_FREE (newkey); +#endif + op_ret = dict_set_dynptr (dict, keybuffer, value, size); if (op_ret < 0) { + op_errno = -op_ret; + gf_log (this->name, GF_LOG_ERROR, "dict set operation " + "on %s for the key %s failed.", real_path, + keybuffer); + GF_FREE (value); goto out; } - remaining_size -= strlen (key) + 1; - list_offset += strlen (key) + 1; + remaining_size -= strlen (keybuffer) + 1; + list_offset += strlen (keybuffer) + 1; } /* while (remaining_size > 0) */ @@ -2648,7 +3713,7 @@ done: if (dict) { dict_del (dict, GFID_XATTR_KEY); - dict_ref (dict); + dict_del (dict, GF_XATTR_VOL_ID_KEY); } out: @@ -2656,8 +3721,9 @@ out: STACK_UNWIND_STRICT (getxattr, frame, op_ret, op_errno, dict, NULL); - if (dict) + if (dict) { dict_unref (dict); + } return 0; } @@ -2672,7 +3738,7 @@ posix_fgetxattr (call_frame_t *frame, xlator_t *this, struct posix_fd * pfd = NULL; int _fd = -1; int32_t list_offset = 0; - size_t size = 0; + ssize_t size = 0; size_t remaining_size = 0; char key[4096] = {0,}; char * value = NULL; @@ -2715,21 +3781,47 @@ posix_fgetxattr (call_frame_t *frame, xlator_t *this, if (name) { strcpy (key, name); - +#ifdef GF_DARWIN_HOST_OS + struct posix_private *priv = NULL; + priv = this->private; + if (priv->xattr_user_namespace == XATTR_STRIP) { + char *newkey = NULL; + gf_add_prefix (XATTR_USER_PREFIX, key, &newkey); + strcpy (key, newkey); + GF_FREE (newkey); + } +#endif size = sys_fgetxattr (_fd, key, NULL, 0); + if (size <= 0) { + op_errno = errno; + gf_log (this->name, ((errno == ENODATA) ? + GF_LOG_DEBUG : GF_LOG_ERROR), + "fgetxattr failed on key %s (%s)", key, + strerror (op_errno)); + goto done; + } + value = GF_CALLOC (size + 1, sizeof(char), gf_posix_mt_char); if (!value) { op_ret = -1; goto out; } - op_ret = sys_fgetxattr (_fd, key, value, op_ret); - if (op_ret == -1) { + size = sys_fgetxattr (_fd, key, value, size); + if (size == -1) { + op_ret = -1; op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, "fgetxattr failed on " + "fd %p for the key %s (%s)", fd, key, + strerror (op_errno)); + GF_FREE (value); goto out; } - value [op_ret] = '\0'; - op_ret = dict_set_dynptr (dict, key, value, op_ret); + value [size] = '\0'; + op_ret = dict_set_dynptr (dict, key, value, size); if (op_ret < 0) { + gf_log (this->name, GF_LOG_ERROR, "dict set operation " + "on key %s failed", key); + GF_FREE (value); goto out; } goto done; @@ -2742,7 +3834,8 @@ posix_fgetxattr (call_frame_t *frame, xlator_t *this, GF_LOG_OCCASIONALLY (gf_posix_xattr_enotsup_log, this->name, GF_LOG_WARNING, "Extended attributes not " - "supported."); + "supported (try remounting " + "brick with 'user_xattr' flag)"); } else { gf_log (this->name, GF_LOG_ERROR, @@ -2770,24 +3863,42 @@ posix_fgetxattr (call_frame_t *frame, xlator_t *this, break; strcpy (key, list + list_offset); - op_ret = sys_fgetxattr (_fd, key, NULL, 0); - if (op_ret == -1) + size = sys_fgetxattr (_fd, key, NULL, 0); + if (size == -1) { + op_ret = -1; + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, "fgetxattr failed on " + "fd %p for the key %s (%s)", fd, key, + strerror (op_errno)); break; + } - value = GF_CALLOC (op_ret + 1, sizeof(char), + value = GF_CALLOC (size + 1, sizeof(char), gf_posix_mt_char); if (!value) { + op_ret = -1; op_errno = errno; goto out; } - op_ret = sys_fgetxattr (_fd, key, value, op_ret); - if (op_ret == -1) + size = sys_fgetxattr (_fd, key, value, size); + if (size == -1) { + op_ret = -1; + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, "fgetxattr failed on " + "the fd %p for the key %s (%s)", fd, key, + strerror (op_errno)); + GF_FREE (value); break; + } - value [op_ret] = '\0'; - op_ret = dict_set_dynptr (dict, key, value, op_ret); + value [size] = '\0'; + + op_ret = dict_set_dynptr (dict, key, value, size); if (op_ret) { + gf_log (this->name, GF_LOG_ERROR, "dict set operation " + "failed on key %s", key); + GF_FREE (value); goto out; } remaining_size -= strlen (key) + 1; @@ -2800,6 +3911,7 @@ done: if (dict) { dict_del (dict, GFID_XATTR_KEY); + dict_del (dict, GF_XATTR_VOL_ID_KEY); dict_ref (dict); } @@ -2814,6 +3926,17 @@ out: return 0; } +static int +_handle_fsetxattr_keyvalue_pair (dict_t *d, char *k, data_t *v, + void *tmp) +{ + posix_xattr_filler_t *filler = NULL; + + filler = tmp; + + return posix_fhandle_pair (filler->this, filler->fd, k, v, + filler->flags); +} int32_t posix_fsetxattr (call_frame_t *frame, xlator_t *this, @@ -2823,8 +3946,9 @@ posix_fsetxattr (call_frame_t *frame, xlator_t *this, int32_t op_errno = 0; struct posix_fd * pfd = NULL; int _fd = -1; - data_pair_t * trav = NULL; - int ret = -1; + int ret = -1; + + posix_xattr_filler_t filler = {0,}; DECLARE_OLD_FS_ID_VAR; SET_FS_ID (frame->root->uid, frame->root->gid); @@ -2844,20 +3968,22 @@ posix_fsetxattr (call_frame_t *frame, xlator_t *this, _fd = pfd->fd; dict_del (dict, GFID_XATTR_KEY); + dict_del (dict, GF_XATTR_VOL_ID_KEY); - trav = dict->members_list; - - while (trav) { - ret = posix_fhandle_pair (this, _fd, trav, flags); - if (ret < 0) { - op_errno = -ret; - goto out; - } - trav = trav->next; + filler.fd = _fd; + filler.this = this; +#ifdef GF_DARWIN_HOST_OS + filler.flags = map_xattr_flags(flags); +#else + filler.flags = flags; +#endif + op_ret = dict_foreach (dict, _handle_fsetxattr_keyvalue_pair, + &filler); + if (op_ret < 0) { + op_errno = -op_ret; + op_ret = -1; } - op_ret = 0; - out: SET_TO_OLD_FS_ID (); @@ -2866,6 +3992,40 @@ out: return 0; } +int +_posix_remove_xattr (dict_t *dict, char *key, data_t *value, void *data) +{ + int32_t op_ret = 0; + xlator_t *this = NULL; + posix_xattr_filler_t *filler = NULL; + + filler = (posix_xattr_filler_t *) data; + this = filler->this; +#ifdef GF_DARWIN_HOST_OS + struct posix_private *priv = NULL; + priv = (struct posix_private *) this->private; + char *newkey = NULL; + if (priv->xattr_user_namespace == XATTR_STRIP) { + gf_remove_prefix (XATTR_USER_PREFIX, key, &newkey); + gf_log("remove_xattr", GF_LOG_DEBUG, "key %s => %s" , key, + newkey); + key = newkey; + } +#endif + op_ret = sys_lremovexattr (filler->real_path, key); + if (op_ret == -1) { + filler->op_errno = errno; + if (errno != ENOATTR && errno != EPERM) + gf_log (this->name, GF_LOG_ERROR, + "removexattr failed on %s (for %s): %s", + filler->real_path, key, strerror (errno)); + } +#ifdef GF_DARWIN_HOST_OS + GF_FREE(newkey); +#endif + return op_ret; +} + int32_t posix_removexattr (call_frame_t *frame, xlator_t *this, @@ -2874,6 +4034,7 @@ posix_removexattr (call_frame_t *frame, xlator_t *this, int32_t op_ret = -1; int32_t op_errno = 0; char * real_path = NULL; + posix_xattr_filler_t filler = {0,}; DECLARE_OLD_FS_ID_VAR; @@ -2885,10 +4046,32 @@ posix_removexattr (call_frame_t *frame, xlator_t *this, op_ret = -1; goto out; } + if (!strcmp (GF_XATTR_VOL_ID_KEY, name)) { + gf_log (this->name, GF_LOG_WARNING, "Remove xattr called" + " on volume-id for file %s", real_path); + op_ret = -1; + goto out; + } SET_FS_ID (frame->root->uid, frame->root->gid); + /** + * sending an empty key name with xdata containing the + * list of key(s) to be removed implies "bulk remove request" + * for removexattr. + */ + if (name && (strcmp (name, "") == 0) && xdata) { + filler.real_path = real_path; + filler.this = this; + op_ret = dict_foreach (xdata, _posix_remove_xattr, &filler); + if (op_ret) { + op_errno = filler.op_errno; + } + + goto out; + } + op_ret = sys_lremovexattr (real_path, name); if (op_ret == -1) { op_errno = errno; @@ -2916,7 +4099,6 @@ posix_fremovexattr (call_frame_t *frame, xlator_t *this, int32_t op_errno = 0; struct posix_fd * pfd = NULL; int _fd = -1; - uint64_t tmp_pfd = 0; int ret = -1; DECLARE_OLD_FS_ID_VAR; @@ -2926,16 +4108,19 @@ posix_fremovexattr (call_frame_t *frame, xlator_t *this, " on gfid for file"); goto out; } + if (!strcmp (GF_XATTR_VOL_ID_KEY, name)) { + gf_log (this->name, GF_LOG_WARNING, "Remove xattr called" + " on volume-id for file"); + goto out; + } - ret = fd_ctx_get (fd, this, &tmp_pfd); + ret = posix_fd_ctx_get (fd, this, &pfd); if (ret < 0) { op_errno = -ret; gf_log (this->name, GF_LOG_WARNING, "pfd is NULL from fd=%p", fd); goto out; } - pfd = (struct posix_fd *)(long)tmp_pfd; - _fd = pfd->fd; @@ -3013,9 +4198,31 @@ posix_print_xattr (dict_t *this, static void __add_array (int32_t *dest, int32_t *src, int count) { + int i = 0; + int32_t destval = 0; + for (i = 0; i < count; i++) { + destval = ntoh32 (dest[i]); + if (destval == 0xffffffff) + continue; + dest[i] = hton32 (destval + ntoh32 (src[i])); + } +} + +static void +__or_array (int32_t *dest, int32_t *src, int count) +{ int i = 0; for (i = 0; i < count; i++) { - dest[i] = hton32 (ntoh32 (dest[i]) + ntoh32 (src[i])); + dest[i] = hton32 (ntoh32 (dest[i]) | ntoh32 (src[i])); + } +} + +static void +__and_array (int32_t *dest, int32_t *src, int count) +{ + int i = 0; + for (i = 0; i < count; i++) { + dest[i] = hton32 (ntoh32 (dest[i]) & ntoh32 (src[i])); } } @@ -3028,6 +4235,168 @@ __add_long_array (int64_t *dest, int64_t *src, int count) } } +static int +_posix_handle_xattr_keyvalue_pair (dict_t *d, char *k, data_t *v, + void *tmp) +{ + int size = 0; + int count = 0; + int op_ret = 0; + int op_errno = 0; + gf_xattrop_flags_t optype = 0; + char *array = NULL; + inode_t *inode = NULL; + xlator_t *this = NULL; + posix_xattr_filler_t *filler = NULL; + + filler = tmp; + + optype = (gf_xattrop_flags_t)(filler->flags); + this = filler->this; + inode = filler->inode; + count = v->len; + array = GF_CALLOC (count, sizeof (char), gf_posix_mt_char); + +#ifdef GF_DARWIN_HOST_OS + struct posix_private *priv = NULL; + priv = this->private; + if (priv->xattr_user_namespace == XATTR_STRIP) { + if (strncmp(k, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) == 0) { + k += XATTR_USER_PREFIX_LEN; + } + } +#endif + + LOCK (&inode->lock); + { + if (filler->real_path) { + size = sys_lgetxattr (filler->real_path, k, + (char *)array, v->len); + } else { + size = sys_fgetxattr (filler->fd, k, (char *)array, + v->len); + } + + op_errno = errno; + if ((size == -1) && (op_errno != ENODATA) && + (op_errno != ENOATTR)) { + if (op_errno == ENOTSUP) { + GF_LOG_OCCASIONALLY(gf_posix_xattr_enotsup_log, + this->name, GF_LOG_WARNING, + "Extended attributes not " + "supported by filesystem"); + } else if (op_errno != ENOENT || + !posix_special_xattr (marker_xattrs, + k)) { + if (filler->real_path) + gf_log (this->name, GF_LOG_ERROR, + "getxattr failed on %s while doing " + "xattrop: Key:%s (%s)", + filler->real_path, + k, strerror (op_errno)); + else + gf_log (this->name, GF_LOG_ERROR, + "fgetxattr failed on fd=%d while doing " + "xattrop: Key:%s (%s)", + filler->fd, + k, strerror (op_errno)); + } + + op_ret = -1; + goto unlock; + } + + switch (optype) { + + case GF_XATTROP_ADD_ARRAY: + __add_array ((int32_t *) array, (int32_t *) v->data, + v->len / 4); + break; + + case GF_XATTROP_ADD_ARRAY64: + __add_long_array ((int64_t *) array, (int64_t *) v->data, + v->len / 8); + break; + + case GF_XATTROP_OR_ARRAY: + __or_array ((int32_t *) array, + (int32_t *) v->data, + v->len / 4); + break; + + case GF_XATTROP_AND_ARRAY: + __and_array ((int32_t *) array, + (int32_t *) v->data, + v->len / 4); + break; + + default: + gf_log (this->name, GF_LOG_ERROR, + "Unknown xattrop type (%d) on %s. Please send " + "a bug report to gluster-devel@gluster.org", + optype, filler->real_path); + op_ret = -1; + op_errno = EINVAL; + goto unlock; + } + + if (filler->real_path) { + size = sys_lsetxattr (filler->real_path, k, array, + v->len, 0); + } else { + size = sys_fsetxattr (filler->fd, k, (char *)array, + v->len, 0); + } + } +unlock: + UNLOCK (&inode->lock); + + if (op_ret == -1) + goto out; + + op_errno = errno; + if (size == -1) { + if (filler->real_path) + gf_log (this->name, GF_LOG_ERROR, + "setxattr failed on %s while doing xattrop: " + "key=%s (%s)", filler->real_path, + k, strerror (op_errno)); + else + gf_log (this->name, GF_LOG_ERROR, + "fsetxattr failed on fd=%d while doing xattrop: " + "key=%s (%s)", filler->fd, + k, strerror (op_errno)); + + op_ret = -1; + goto out; + } else { + size = dict_set_bin (d, k, array, v->len); + + if (size != 0) { + if (filler->real_path) + gf_log (this->name, GF_LOG_DEBUG, + "dict_set_bin failed (path=%s): " + "key=%s (%s)", filler->real_path, + k, strerror (-size)); + else + gf_log (this->name, GF_LOG_DEBUG, + "dict_set_bin failed (fd=%d): " + "key=%s (%s)", filler->fd, + k, strerror (-size)); + + op_ret = -1; + op_errno = EINVAL; + goto out; + } + array = NULL; + } + + array = NULL; + +out: + return op_ret; +} + /** * xattrop - xattr operations - for internal use by GlusterFS * @optype: ADD_ARRAY: @@ -3039,36 +4408,24 @@ int do_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, gf_xattrop_flags_t optype, dict_t *xattr) { - char *real_path = NULL; - char *array = NULL; - int size = 0; - int count = 0; - - int op_ret = 0; - int op_errno = 0; - - int ret = 0; - int _fd = -1; - struct posix_fd *pfd = NULL; - - data_pair_t *trav = NULL; - - char * path = NULL; - inode_t * inode = NULL; + int op_ret = 0; + int op_errno = 0; + int _fd = -1; + char *real_path = NULL; + struct posix_fd *pfd = NULL; + inode_t *inode = NULL; + posix_xattr_filler_t filler = {0,}; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (xattr, out); VALIDATE_OR_GOTO (this, out); - trav = xattr->members_list; - if (fd) { - ret = posix_fd_ctx_get (fd, this, &pfd); - if (ret < 0) { + op_ret = posix_fd_ctx_get (fd, this, &pfd); + if (op_ret < 0) { gf_log (this->name, GF_LOG_WARNING, "failed to get pfd from fd=%p", fd); - op_ret = -1; op_errno = EBADFD; goto out; } @@ -3079,138 +4436,21 @@ do_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, MAKE_INODE_HANDLE (real_path, this, loc, NULL); if (real_path) { - path = gf_strdup (real_path); inode = loc->inode; } else if (fd) { inode = fd->inode; } - while (trav && inode) { - count = trav->value->len; - array = GF_CALLOC (count, sizeof (char), - gf_posix_mt_char); + filler.this = this; + filler.fd = _fd; + filler.real_path = real_path; + filler.flags = (int)optype; + filler.inode = inode; - LOCK (&inode->lock); - { - if (loc) { - size = sys_lgetxattr (real_path, trav->key, (char *)array, - trav->value->len); - } else { - size = sys_fgetxattr (_fd, trav->key, (char *)array, - trav->value->len); - } - - op_errno = errno; - if ((size == -1) && (op_errno != ENODATA) && - (op_errno != ENOATTR)) { - if (op_errno == ENOTSUP) { - GF_LOG_OCCASIONALLY(gf_posix_xattr_enotsup_log, - this->name,GF_LOG_WARNING, - "Extended attributes not " - "supported by filesystem"); - } else if (op_errno != ENOENT || - !posix_special_xattr (marker_xattrs, - trav->key)) { - if (loc) - gf_log (this->name, GF_LOG_ERROR, - "getxattr failed on %s while doing " - "xattrop: Key:%s (%s)", path, - trav->key, strerror (op_errno)); - else - gf_log (this->name, GF_LOG_ERROR, - "fgetxattr failed on fd=%d while doing " - "xattrop: Key:%s (%s)", _fd, - trav->key, strerror (op_errno)); - } - - op_ret = -1; - goto unlock; - } - - switch (optype) { - - case GF_XATTROP_ADD_ARRAY: - __add_array ((int32_t *) array, (int32_t *) trav->value->data, - trav->value->len / 4); - break; - - case GF_XATTROP_ADD_ARRAY64: - __add_long_array ((int64_t *) array, (int64_t *) trav->value->data, - trav->value->len / 8); - break; - - default: - gf_log (this->name, GF_LOG_ERROR, - "Unknown xattrop type (%d) on %s. Please send " - "a bug report to gluster-devel@nongnu.org", - optype, path); - op_ret = -1; - op_errno = EINVAL; - goto unlock; - } - - if (loc) { - size = sys_lsetxattr (real_path, trav->key, array, - trav->value->len, 0); - } else { - size = sys_fsetxattr (_fd, trav->key, (char *)array, - trav->value->len, 0); - } - } - unlock: - UNLOCK (&inode->lock); - - if (op_ret == -1) - goto out; - - op_errno = errno; - if (size == -1) { - if (loc) - gf_log (this->name, GF_LOG_ERROR, - "setxattr failed on %s while doing xattrop: " - "key=%s (%s)", path, - trav->key, strerror (op_errno)); - else - gf_log (this->name, GF_LOG_ERROR, - "fsetxattr failed on fd=%d while doing xattrop: " - "key=%s (%s)", _fd, - trav->key, strerror (op_errno)); - - op_ret = -1; - goto out; - } else { - size = dict_set_bin (xattr, trav->key, array, - trav->value->len); - - if (size != 0) { - if (loc) - gf_log (this->name, GF_LOG_DEBUG, - "dict_set_bin failed (path=%s): " - "key=%s (%s)", path, - trav->key, strerror (-size)); - else - gf_log (this->name, GF_LOG_DEBUG, - "dict_set_bin failed (fd=%d): " - "key=%s (%s)", _fd, - trav->key, strerror (-size)); - - op_ret = -1; - op_errno = EINVAL; - goto out; - } - array = NULL; - } - - array = NULL; - trav = trav->next; - } + op_ret = dict_foreach (xattr, _posix_handle_xattr_keyvalue_pair, + &filler); out: - if (array) - GF_FREE (array); - - if (path) - GF_FREE (path); STACK_UNWIND_STRICT (xattrop, frame, op_ret, op_errno, xattr, NULL); return 0; @@ -3468,7 +4708,7 @@ posix_fentrylk (call_frame_t *frame, xlator_t *this, int posix_fill_readdir (fd_t *fd, DIR *dir, off_t off, size_t size, - gf_dirent_t *entries) + gf_dirent_t *entries, xlator_t *this, int32_t skip_dirs) { off_t in_case = -1; size_t filled = 0; @@ -3478,6 +4718,18 @@ posix_fill_readdir (fd_t *fd, DIR *dir, off_t off, size_t size, int32_t this_size = -1; gf_dirent_t *this_entry = NULL; uuid_t rootgfid = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1}; + struct stat stbuf = {0,}; + char *hpath = NULL; + int len = 0; + int ret = 0; + + if (skip_dirs) { + len = posix_handle_path (this, fd->inode->gfid, NULL, NULL, 0); + hpath = alloca (len + 256); /* NAME_MAX */ + posix_handle_path (this, fd->inode->gfid, NULL, hpath, len); + len = strlen (hpath); + hpath[len] = '/'; + } if (!off) { rewinddir (dir); @@ -3509,10 +4761,6 @@ posix_fill_readdir (fd_t *fd, DIR *dir, off_t off, size_t size, break; } - if ((uuid_compare (fd->inode->gfid, rootgfid) == 0) - && (!strcmp (entry->d_name, GF_REPLICATE_TRASH_DIR))) - continue; - #ifdef __NetBSD__ /* * NetBSD with UFS1 backend uses backing files for @@ -3532,6 +4780,17 @@ posix_fill_readdir (fd_t *fd, DIR *dir, off_t off, size_t size, continue; } + if (skip_dirs) { + if (DT_ISDIR (entry->d_type)) { + continue; + } else if (hpath) { + strcpy (&hpath[len+1],entry->d_name); + ret = lstat (hpath, &stbuf); + if (!ret && S_ISDIR (stbuf.st_mode)) + continue; + } + } + this_size = max (sizeof (gf_dirent_t), sizeof (gfs3_dirplist)) + strlen (entry->d_name) + 1; @@ -3551,6 +4810,7 @@ posix_fill_readdir (fd_t *fd, DIR *dir, off_t off, size_t size, } this_entry->d_off = telldir (dir); this_entry->d_ino = entry->d_ino; + this_entry->d_type = entry->d_type; list_add_tail (&this_entry->list, &entries->list); @@ -3583,24 +4843,78 @@ posix_entry_xattr_fill (xlator_t *this, inode_t *inode, } + +int +posix_readdirp_fill (xlator_t *this, fd_t *fd, gf_dirent_t *entries, dict_t *dict) +{ + gf_dirent_t *entry = NULL; + inode_table_t *itable = NULL; + inode_t *inode = NULL; + char *hpath = NULL; + int len = 0; + struct iatt stbuf = {0, }; + uuid_t gfid; + + if (list_empty(&entries->list)) + return 0; + + itable = fd->inode->table; + + len = posix_handle_path (this, fd->inode->gfid, NULL, NULL, 0); + hpath = alloca (len + 256); /* NAME_MAX */ + posix_handle_path (this, fd->inode->gfid, NULL, hpath, len); + len = strlen (hpath); + hpath[len] = '/'; + + list_for_each_entry (entry, &entries->list, list) { + memset (gfid, 0, 16); + inode = inode_grep (fd->inode->table, fd->inode, + entry->d_name); + if (inode) + uuid_copy (gfid, inode->gfid); + + strcpy (&hpath[len+1], entry->d_name); + + posix_pstat (this, gfid, hpath, &stbuf); + + if (!inode) + inode = inode_find (itable, stbuf.ia_gfid); + + if (!inode) + inode = inode_new (itable); + + entry->inode = inode; + + if (dict) { + entry->dict = + posix_entry_xattr_fill (this, entry->inode, + fd, entry->d_name, + dict, &stbuf); + dict_ref (entry->dict); + } + + entry->d_stat = stbuf; + if (stbuf.ia_ino) + entry->d_ino = stbuf.ia_ino; + inode = NULL; + } + + return 0; +} + + int32_t posix_do_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, off_t off, int whichop, dict_t *dict) { - struct posix_fd *pfd = NULL; - DIR *dir = NULL; - int ret = -1; - int count = 0; - int32_t op_ret = -1; - int32_t op_errno = 0; - gf_dirent_t entries; - struct iatt stbuf = {0, }; - gf_dirent_t *tmp_entry = NULL; - inode_table_t *itable = NULL; -#ifdef IGNORE_READDIRP_ATTRS - uuid_t gfid; - ia_type_t entry_type = 0; -#endif + struct posix_fd *pfd = NULL; + DIR *dir = NULL; + int ret = -1; + int count = 0; + int32_t op_ret = -1; + int32_t op_errno = 0; + gf_dirent_t entries; + int32_t skip_dirs = 0; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); @@ -3623,9 +4937,30 @@ posix_do_readdir (call_frame_t *frame, xlator_t *this, "dir is NULL for fd=%p", fd); op_errno = EINVAL; goto out; - } + } - count = posix_fill_readdir (fd, dir, off, size, &entries); + /* When READDIR_FILTER option is set to on, we can filter out + * directory's entry from the entry->list. + */ + ret = dict_get_int32 (dict, GF_READDIR_SKIP_DIRS, &skip_dirs); + + LOCK (&fd->lock); + { + /* posix_fill_readdir performs multiple separate individual + readdir() calls to fill up the buffer. + + In case of NFS where the same anonymous FD is shared between + different applications, reading a common directory can + result in the anonymous fd getting re-used unsafely between + the two readdir requests (in two different io-threads). + + It would also help, in the future, to replace the loop + around readdir() with a single large getdents() call. + */ + count = posix_fill_readdir (fd, dir, off, size, &entries, this, + skip_dirs); + } + UNLOCK (&fd->lock); /* pick ENOENT to indicate EOF */ op_errno = errno; @@ -3634,43 +4969,7 @@ posix_do_readdir (call_frame_t *frame, xlator_t *this, if (whichop != GF_FOP_READDIRP) goto out; - itable = fd->inode->table; - - list_for_each_entry (tmp_entry, &entries.list, list) { -#ifdef IGNORE_READDIRP_ATTRS - ret = inode_grep_for_gfid (fd->inode->table, fd->inode, - tmp_entry->d_name, gfid, - &entry_type); - if (ret == 0) { - memset (&stbuf, 0, sizeof (stbuf)); - uuid_copy (stbuf.ia_gfid, gfid); - posix_fill_ino_from_gfid (this, &stbuf); - stbuf.ia_type = entry_type; - } else { - posix_istat (this, fd->inode->gfid, - tmp_entry->d_name, &stbuf); - } -#else - posix_istat (this, fd->inode->gfid, - tmp_entry->d_name, &stbuf); -#endif - if (stbuf.ia_ino) - tmp_entry->d_ino = stbuf.ia_ino; - - if (dict) { - tmp_entry->inode = inode_find (itable, stbuf.ia_gfid); - if (!tmp_entry->inode) - tmp_entry->inode = inode_new (itable); - - tmp_entry->dict = - posix_entry_xattr_fill (this, tmp_entry->inode, - fd, tmp_entry->d_name, - dict, &stbuf); - dict_ref (tmp_entry->dict); - } - - tmp_entry->d_stat = stbuf; - } + posix_readdirp_fill (this, fd, &entries, dict); out: STACK_UNWIND_STRICT (readdir, frame, op_ret, op_errno, &entries, NULL); @@ -3694,6 +4993,32 @@ int32_t posix_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, off_t off, dict_t *dict) { + gf_dirent_t entries; + int32_t op_ret = -1, op_errno = 0; + gf_dirent_t *entry = NULL; + + + if ((dict != NULL) && (dict_get (dict, GET_ANCESTRY_DENTRY_KEY))) { + INIT_LIST_HEAD (&entries.list); + + op_ret = posix_get_ancestry (this, fd->inode, &entries, NULL, + POSIX_ANCESTRY_DENTRY, + &op_errno, dict); + if (op_ret >= 0) { + op_ret = 0; + + list_for_each_entry (entry, &entries.list, list) { + op_ret++; + } + } + + STACK_UNWIND_STRICT (readdir, frame, op_ret, op_errno, &entries, + NULL); + + gf_dirent_free (&entries); + return 0; + } + posix_do_readdir (frame, this, fd, size, off, GF_FOP_READDIRP, dict); return 0; } @@ -3736,28 +5061,26 @@ int32_t posix_rchecksum (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, int32_t len, dict_t *xdata) { - char *buf = NULL; - - int _fd = -1; - - struct posix_fd *pfd = NULL; - - int op_ret = -1; - int op_errno = 0; - - int ret = 0; - - int32_t weak_checksum = 0; - unsigned char strong_checksum[MD5_DIGEST_LENGTH]; + char *alloc_buf = NULL; + char *buf = NULL; + int _fd = -1; + struct posix_fd *pfd = NULL; + int op_ret = -1; + int op_errno = 0; + int ret = 0; + int32_t weak_checksum = 0; + unsigned char strong_checksum[MD5_DIGEST_LENGTH] = {0}; + struct posix_private *priv = NULL; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); VALIDATE_OR_GOTO (fd, out); + priv = this->private; memset (strong_checksum, 0, MD5_DIGEST_LENGTH); - buf = GF_CALLOC (1, len, gf_posix_mt_char); - if (!buf) { + alloc_buf = _page_aligned_alloc (len, &buf); + if (!alloc_buf) { op_errno = ENOMEM; goto out; } @@ -3772,25 +5095,36 @@ posix_rchecksum (call_frame_t *frame, xlator_t *this, _fd = pfd->fd; - ret = pread (_fd, buf, len, offset); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "pread of %d bytes returned %d (%s)", - len, ret, strerror (errno)); + LOCK (&fd->lock); + { + if (priv->aio_capable && priv->aio_init_done) + __posix_fd_set_odirect (fd, pfd, 0, offset, len); + + ret = pread (_fd, buf, len, offset); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "pread of %d bytes returned %d (%s)", + len, ret, strerror (errno)); + + op_errno = errno; + } - op_errno = errno; - goto out; } + UNLOCK (&fd->lock); - weak_checksum = gf_rsync_weak_checksum ((unsigned char *) buf, (size_t) len); - gf_rsync_strong_checksum ((unsigned char *) buf, (size_t) len, (unsigned char *) strong_checksum); + if (ret < 0) + goto out; - GF_FREE (buf); + weak_checksum = gf_rsync_weak_checksum ((unsigned char *) buf, (size_t) ret); + gf_rsync_strong_checksum ((unsigned char *) buf, (size_t) ret, (unsigned char *) strong_checksum); op_ret = 0; out: STACK_UNWIND_STRICT (rchecksum, frame, op_ret, op_errno, weak_checksum, strong_checksum, NULL); + + GF_FREE (alloc_buf); + return 0; } @@ -3838,6 +5172,148 @@ mem_acct_init (xlator_t *this) return ret; } +static int +posix_set_owner (xlator_t *this, uid_t uid, gid_t gid) +{ + struct posix_private *priv = NULL; + int ret = -1; + struct stat st = {0,}; + + priv = this->private; + + ret = sys_lstat (priv->base_path, &st); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to stat " + "brick path %s (%s)", + priv->base_path, strerror (errno)); + return ret; + } + + if ((uid == -1 || st.st_uid == uid) && + (gid == -1 || st.st_gid == gid)) + return 0; + + ret = sys_chown (priv->base_path, uid, gid); + if (ret) + gf_log (this->name, GF_LOG_ERROR, "Failed to set " + "uid/gid for brick path %s, %s", + priv->base_path, strerror (errno)); + + return ret; +} + + +static int +set_batch_fsync_mode (struct posix_private *priv, const char *str) +{ + if (strcmp (str, "none") == 0) + priv->batch_fsync_mode = BATCH_NONE; + else if (strcmp (str, "syncfs") == 0) + priv->batch_fsync_mode = BATCH_SYNCFS; + else if (strcmp (str, "syncfs-single-fsync") == 0) + priv->batch_fsync_mode = BATCH_SYNCFS_SINGLE_FSYNC; + else if (strcmp (str, "syncfs-reverse-fsync") == 0) + priv->batch_fsync_mode = BATCH_SYNCFS_REVERSE_FSYNC; + else if (strcmp (str, "reverse-fsync") == 0) + priv->batch_fsync_mode = BATCH_REVERSE_FSYNC; + else + return -1; + + return 0; +} + +#ifdef GF_DARWIN_HOST_OS +static int +set_xattr_user_namespace_mode (struct posix_private *priv, const char *str) +{ + if (strcmp (str, "none") == 0) + priv->xattr_user_namespace = XATTR_NONE; + else if (strcmp (str, "strip") == 0) + priv->xattr_user_namespace = XATTR_STRIP; + else if (strcmp (str, "append") == 0) + priv->xattr_user_namespace = XATTR_APPEND; + else if (strcmp (str, "both") == 0) + priv->xattr_user_namespace = XATTR_BOTH; + else + return -1; + return 0; +} +#endif + +int +reconfigure (xlator_t *this, dict_t *options) +{ + int ret = -1; + struct posix_private *priv = NULL; + int32_t uid = -1; + int32_t gid = -1; + char *batch_fsync_mode_str = NULL; + + priv = this->private; + + GF_OPTION_RECONF ("brick-uid", uid, options, int32, out); + GF_OPTION_RECONF ("brick-gid", gid, options, int32, out); + if (uid != -1 || gid != -1) + posix_set_owner (this, uid, gid); + + GF_OPTION_RECONF ("batch-fsync-delay-usec", priv->batch_fsync_delay_usec, + options, uint32, out); + + GF_OPTION_RECONF ("batch-fsync-mode", batch_fsync_mode_str, + options, str, out); + + if (set_batch_fsync_mode (priv, batch_fsync_mode_str) != 0) { + gf_log (this->name, GF_LOG_ERROR, "Unknown mode string: %s", + batch_fsync_mode_str); + goto out; + } + +#ifdef GF_DARWIN_HOST_OS + + char *xattr_user_namespace_mode_str = NULL; + + GF_OPTION_RECONF ("xattr-user-namespace-mode", xattr_user_namespace_mode_str, + options, str, out); + + if (set_xattr_user_namespace_mode (priv, xattr_user_namespace_mode_str) != 0) { + gf_log (this->name, GF_LOG_ERROR, "Unknown xattr user namespace mode string: %s", + xattr_user_namespace_mode_str); + goto out; + } + +#endif + + GF_OPTION_RECONF ("linux-aio", priv->aio_configured, + options, bool, out); + + if (priv->aio_configured) + posix_aio_on (this); + else + posix_aio_off (this); + + GF_OPTION_RECONF ("update-link-count-parent", priv->update_pgfid_nlinks, + options, bool, out); + + GF_OPTION_RECONF ("node-uuid-pathinfo", priv->node_uuid_pathinfo, + options, bool, out); + + if (priv->node_uuid_pathinfo && + (uuid_is_null (priv->glusterd_uuid))) { + gf_log (this->name, GF_LOG_INFO, + "glusterd uuid is NULL, pathinfo xattr would" + " fallback to <hostname>:<export>"); + } + + GF_OPTION_RECONF ("health-check-interval", priv->health_check_interval, + options, uint32, out); + posix_spawn_health_check_thread (this); + + ret = 0; +out: + return ret; +} + + /** * init - */ @@ -3852,12 +5328,16 @@ init (xlator_t *this) int dict_ret = 0; int ret = 0; int op_ret = -1; + ssize_t size = -1; int32_t janitor_sleep = 0; uuid_t old_uuid = {0,}; uuid_t dict_uuid = {0,}; uuid_t gfid = {0,}; uuid_t rootgfid = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1}; char *guuid = NULL; + int32_t uid = -1; + int32_t gid = -1; + char *batch_fsync_mode_str; dir_data = dict_get (this->options, "directory"); @@ -3938,9 +5418,9 @@ init (xlator_t *this) ret = -1; goto out; } - op_ret = sys_lgetxattr (dir_data->data, - "trusted.glusterfs.volume-id", old_uuid, 16); - if (op_ret == 16) { + size = sys_lgetxattr (dir_data->data, + "trusted.glusterfs.volume-id", old_uuid, 16); + if (size == 16) { if (uuid_compare (old_uuid, dict_uuid)) { gf_log (this->name, GF_LOG_ERROR, "mismatching volume-id (%s) received. " @@ -3949,22 +5429,20 @@ init (xlator_t *this) ret = -1; goto out; } - } else if ((op_ret == -1) && (errno == ENODATA)) { - /* Using the export for first time */ - op_ret = sys_lsetxattr (dir_data->data, - "trusted.glusterfs.volume-id", - dict_uuid, 16, 0); - if (op_ret == -1) { + } else if ((size == -1) && (errno == ENODATA)) { + gf_log (this->name, GF_LOG_ERROR, - "failed to set volume id on export"); + "Extended attribute trusted.glusterfs." + "volume-id is absent"); ret = -1; goto out; - } - } else if ((op_ret == -1) && (errno != ENODATA)) { + + } else if ((size == -1) && (errno != ENODATA)) { /* Wrong 'volume-id' is set, it should be error */ gf_log (this->name, GF_LOG_WARNING, "%s: failed to fetch volume-id (%s)", dir_data->data, strerror (errno)); + ret = -1; goto out; } else { ret = -1; @@ -3976,8 +5454,8 @@ init (xlator_t *this) /* Now check if the export directory has some other 'gfid', other than that of root '/' */ - ret = sys_lgetxattr (dir_data->data, "trusted.gfid", gfid, 16); - if (ret == 16) { + size = sys_lgetxattr (dir_data->data, "trusted.gfid", gfid, 16); + if (size == 16) { if (!__is_root_gfid (gfid)) { gf_log (this->name, GF_LOG_WARNING, "%s: gfid (%s) is not that of glusterfs '/' ", @@ -3985,34 +5463,37 @@ init (xlator_t *this) ret = -1; goto out; } - } else if (ret != -1) { + } else if (size != -1) { /* Wrong 'gfid' is set, it should be error */ gf_log (this->name, GF_LOG_WARNING, "%s: wrong value set as gfid", dir_data->data); ret = -1; goto out; - } else if ((ret == -1) && (errno != ENODATA)) { + } else if ((size == -1) && (errno != ENODATA) && + (errno != ENOATTR)) { /* Wrong 'gfid' is set, it should be error */ gf_log (this->name, GF_LOG_WARNING, "%s: failed to fetch gfid (%s)", dir_data->data, strerror (errno)); + ret = -1; goto out; } else { /* First time volume, set the GFID */ - ret = sys_lsetxattr (dir_data->data, "trusted.gfid", rootgfid, + size = sys_lsetxattr (dir_data->data, "trusted.gfid", rootgfid, 16, XATTR_CREATE); - if (ret) { + if (size) { gf_log (this->name, GF_LOG_ERROR, "%s: failed to set gfid (%s)", dir_data->data, strerror (errno)); + ret = -1; goto out; } } - op_ret = sys_lgetxattr (dir_data->data, "system.posix_acl_access", - NULL, 0); - if ((op_ret < 0) && (errno == ENOTSUP)) + size = sys_lgetxattr (dir_data->data, POSIX_ACL_ACCESS_XATTR, + NULL, 0); + if ((size < 0) && (errno == ENOTSUP)) gf_log (this->name, GF_LOG_WARNING, "Posix access control list is not supported."); @@ -4027,20 +5508,6 @@ init (xlator_t *this) _private->base_path = gf_strdup (dir_data->data); _private->base_path_length = strlen (_private->base_path); - _private->trash_path = GF_CALLOC (1, _private->base_path_length - + strlen ("/") - + strlen (GF_REPLICATE_TRASH_DIR) - + 1, - gf_posix_mt_trash_path); - - if (!_private->trash_path) { - ret = -1; - goto out; - } - - strncpy (_private->trash_path, _private->base_path, _private->base_path_length); - strcat (_private->trash_path, "/" GF_REPLICATE_TRASH_DIR); - LOCK_INIT (&_private->lock); ret = dict_get_str (this->options, "hostname", &_private->hostname); @@ -4105,6 +5572,24 @@ init (xlator_t *this) "for every open)"); } + tmp_data = dict_get (this->options, "update-link-count-parent"); + if (tmp_data) { + if (gf_string2boolean (tmp_data->data, + &_private->update_pgfid_nlinks) == -1) { + ret = -1; + gf_log (this->name, GF_LOG_ERROR, + "wrong value provided for " + "'update-link-count-parent'"); + goto out; + } + if (_private->update_pgfid_nlinks) + gf_log (this->name, GF_LOG_DEBUG, + "update-link-count-parent is enabled. Thus for each " + "file an extended attribute representing the " + "number of hardlinks for that file within the " + "same parent directory is set."); + } + ret = dict_get_str (this->options, "glusterd-uuid", &guuid); if (!ret) { if (uuid_parse (guuid, _private->glusterd_uuid)) @@ -4176,11 +5661,93 @@ init (xlator_t *this) goto out; } + op_ret = posix_handle_trash_init (this); + if (op_ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "Posix landfill setup failed"); + ret = -1; + goto out; + } + + _private->aio_init_done = _gf_false; + _private->aio_capable = _gf_false; + + GF_OPTION_INIT ("brick-uid", uid, int32, out); + GF_OPTION_INIT ("brick-gid", gid, int32, out); + if (uid != -1 || gid != -1) + posix_set_owner (this, uid, gid); + + GF_OPTION_INIT ("linux-aio", _private->aio_configured, bool, out); + + if (_private->aio_configured) { + op_ret = posix_aio_on (this); + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "Posix AIO init failed"); + ret = -1; + goto out; + } + } + + GF_OPTION_INIT ("node-uuid-pathinfo", + _private->node_uuid_pathinfo, bool, out); + if (_private->node_uuid_pathinfo && + (uuid_is_null (_private->glusterd_uuid))) { + gf_log (this->name, GF_LOG_INFO, + "glusterd uuid is NULL, pathinfo xattr would" + " fallback to <hostname>:<export>"); + } + + _private->health_check_active = _gf_false; + GF_OPTION_INIT ("health-check-interval", + _private->health_check_interval, uint32, out); + if (_private->health_check_interval) + posix_spawn_health_check_thread (this); + pthread_mutex_init (&_private->janitor_lock, NULL); pthread_cond_init (&_private->janitor_cond, NULL); INIT_LIST_HEAD (&_private->janitor_fds); posix_spawn_janitor_thread (this); + + pthread_mutex_init (&_private->fsync_mutex, NULL); + pthread_cond_init (&_private->fsync_cond, NULL); + INIT_LIST_HEAD (&_private->fsyncs); + + ret = gf_thread_create (&_private->fsyncer, NULL, posix_fsyncer, this); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "fsyncer thread" + " creation failed (%s)", strerror (errno)); + goto out; + } + + GF_OPTION_INIT ("batch-fsync-mode", batch_fsync_mode_str, str, out); + + if (set_batch_fsync_mode (_private, batch_fsync_mode_str) != 0) { + gf_log (this->name, GF_LOG_ERROR, "Unknown mode string: %s", + batch_fsync_mode_str); + goto out; + } + +#ifdef GF_DARWIN_HOST_OS + + char *xattr_user_namespace_mode_str = NULL; + + GF_OPTION_INIT ("xattr-user-namespace-mode", + xattr_user_namespace_mode_str, str, out); + + if (set_xattr_user_namespace_mode (_private, + xattr_user_namespace_mode_str) != 0) { + gf_log (this->name, GF_LOG_ERROR, + "Unknown xattr user namespace mode string: %s", + xattr_user_namespace_mode_str); + goto out; + } +#endif + + GF_OPTION_INIT ("batch-fsync-delay-usec", _private->batch_fsync_delay_usec, + uint32, out); out: return ret; } @@ -4246,6 +5813,10 @@ struct xlator_fops fops = { .fxattrop = posix_fxattrop, .setattr = posix_setattr, .fsetattr = posix_fsetattr, + .fallocate = _posix_fallocate, + .discard = posix_discard, + .zerofill = posix_zerofill, + .ipc = posix_ipc, }; struct xlator_cbks cbks = { @@ -4273,5 +5844,77 @@ struct volume_options options[] = { .type = GF_OPTION_TYPE_ANY }, { .key = {"glusterd-uuid"}, .type = GF_OPTION_TYPE_STR }, + { + .key = {"linux-aio"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "Support for native Linux AIO" + }, + { + .key = {"brick-uid"}, + .type = GF_OPTION_TYPE_INT, + .min = -1, + .validate = GF_OPT_VALIDATE_MIN, + .default_value = "-1", + .description = "Support for setting uid of brick's owner" + }, + { + .key = {"brick-gid"}, + .type = GF_OPTION_TYPE_INT, + .min = -1, + .validate = GF_OPT_VALIDATE_MIN, + .default_value = "-1", + .description = "Support for setting gid of brick's owner" + }, + { .key = {"node-uuid-pathinfo"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "return glusterd's node-uuid in pathinfo xattr" + " string instead of hostname" + }, + { + .key = {"health-check-interval"}, + .type = GF_OPTION_TYPE_INT, + .min = 0, + .default_value = "30", + .validate = GF_OPT_VALIDATE_MIN, + .description = "Interval in seconds for a filesystem health check, " + "set to 0 to disable" + }, + { .key = {"batch-fsync-mode"}, + .type = GF_OPTION_TYPE_STR, + .default_value = "reverse-fsync", + .description = "Possible values:\n" + "\t- syncfs: Perform one syncfs() on behalf oa batch" + "of fsyncs.\n" + "\t- syncfs-single-fsync: Perform one syncfs() on behalf of a batch" + " of fsyncs and one fsync() per batch.\n" + "\t- syncfs-reverse-fsync: Preform one syncfs() on behalf of a batch" + " of fsyncs and fsync() each file in the batch in reverse order.\n" + " in reverse order.\n" + "\t- reverse-fsync: Perform fsync() of each file in the batch in" + " reverse order." + }, + { .key = {"batch-fsync-delay-usec"}, + .type = GF_OPTION_TYPE_INT, + .default_value = "0", + .description = "Num of usecs to wait for aggregating fsync" + " requests", + }, + { .key = {"update-link-count-parent"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "Enable placeholders for gfid to path conversion" + }, +#if GF_DARWIN_HOST_OS + { .key = {"xattr-user-namespace-mode"}, + .type = GF_OPTION_TYPE_STR, + .default_value = "none", + .description = "Option to control XATTR user namespace on the raw filesystem: " + "\t- None: Will use the user namespace, so files will be exchangable with Linux.\n" + " The raw filesystem will not be compatible with OS X Finder.\n" + "\t- Strip: Will strip the user namespace before setting. The raw filesystem will work in OS X.\n" + }, +#endif { .key = {NULL} } }; diff --git a/xlators/storage/posix/src/posix.h b/xlators/storage/posix/src/posix.h index 7c2b47bb0..c9bfc984d 100644 --- a/xlators/storage/posix/src/posix.h +++ b/xlators/storage/posix/src/posix.h @@ -1,22 +1,12 @@ /* - Copyright (c) 2006-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ - #ifndef _POSIX_H #define _POSIX_H @@ -53,6 +43,20 @@ #include "timer.h" #include "posix-mem-types.h" #include "posix-handle.h" +#include "call-stub.h" + +#ifdef HAVE_LIBAIO +#include <libaio.h> +#include "posix-aio.h" +#endif + +#define VECTOR_SIZE 64 * 1024 /* vector size 64KB*/ +#define MAX_NO_VECT 1024 + +#define POSIX_GFID_HANDLE_SIZE(base_path_len) (base_path_len + SLEN("/") \ + + SLEN(GF_HIDDEN_PATH) + SLEN("/") \ + + SLEN("00/") \ + + SLEN("00/") + SLEN(UUID0_STR) + 1) /* '\0' */; /** * posix_fd - internal structure common to file and directory fd's @@ -62,9 +66,7 @@ struct posix_fd { int fd; /* fd returned by the kernel */ int32_t flags; /* flags for open/creat */ DIR * dir; /* handle returned by the kernel */ - int flushwrites; int odirect; - int op_performed; struct list_head list; /* to add to the janitor list */ }; @@ -124,8 +126,63 @@ struct posix_private { /* uuid of glusterd that swapned the brick process */ uuid_t glusterd_uuid; + gf_boolean_t aio_configured; + gf_boolean_t aio_init_done; + gf_boolean_t aio_capable; +#ifdef HAVE_LIBAIO + io_context_t ctxp; + pthread_t aiothread; +#endif + + /* node-uuid in pathinfo xattr */ + gf_boolean_t node_uuid_pathinfo; + + pthread_t fsyncer; + struct list_head fsyncs; + pthread_mutex_t fsync_mutex; + pthread_cond_t fsync_cond; + int fsync_queue_count; + + enum { + BATCH_NONE = 0, + BATCH_SYNCFS, + BATCH_SYNCFS_SINGLE_FSYNC, + BATCH_REVERSE_FSYNC, + BATCH_SYNCFS_REVERSE_FSYNC + } batch_fsync_mode; + + uint32_t batch_fsync_delay_usec; + gf_boolean_t update_pgfid_nlinks; + + /* seconds to sleep between health checks */ + uint32_t health_check_interval; + pthread_t health_check; + gf_boolean_t health_check_active; + +#ifdef GF_DARWIN_HOST_OS + enum { + XATTR_NONE = 0, + XATTR_STRIP, + XATTR_APPEND, + XATTR_BOTH, + } xattr_user_namespace; +#endif + }; +typedef struct { + xlator_t *this; + const char *real_path; + dict_t *xattr; + struct iatt *stbuf; + loc_t *loc; + inode_t *inode; /* for all do_xattrop() key handling */ + int fd; + int flags; + int32_t op_errno; +} posix_xattr_filler_t; + + #define POSIX_BASE_PATH(this) (((struct posix_private *)this->private)->base_path) #define POSIX_BASE_PATH_LEN(this) (((struct posix_private *)this->private)->base_path_length) @@ -140,23 +197,33 @@ int posix_pstat (xlator_t *this, uuid_t gfid, const char *real_path, struct iatt *iatt); dict_t *posix_lookup_xattr_fill (xlator_t *this, const char *path, loc_t *loc, dict_t *xattr, struct iatt *buf); -int posix_handle_pair (xlator_t *this, const char *real_path, - data_pair_t *trav, int flags); -int posix_fhandle_pair (xlator_t *this, int fd, data_pair_t *trav, int flags); +int posix_handle_pair (xlator_t *this, const char *real_path, char *key, + data_t *value, int flags); +int posix_fhandle_pair (xlator_t *this, int fd, char *key, data_t *value, + int flags); void posix_spawn_janitor_thread (xlator_t *this); int posix_get_file_contents (xlator_t *this, uuid_t pargfid, const char *name, char **contents); -int posix_set_file_contents (xlator_t *this, const char *path, - data_pair_t *trav, int flags); +int posix_set_file_contents (xlator_t *this, const char *path, char *key, + data_t *value, int flags); int posix_acl_xattr_set (xlator_t *this, const char *path, dict_t *xattr_req); -int posix_gfid_heal (xlator_t *this, const char *path, dict_t *xattr_req); +int posix_gfid_heal (xlator_t *this, const char *path, loc_t *loc, dict_t *xattr_req); int posix_entry_create_xattr_set (xlator_t *this, const char *path, dict_t *dict); int posix_fd_ctx_get (fd_t *fd, xlator_t *this, struct posix_fd **pfd); -int posix_fd_ctx_get_off (fd_t *fd, xlator_t *this, struct posix_fd **pfd, - off_t off); void posix_fill_ino_from_gfid (xlator_t *this, struct iatt *buf); gf_boolean_t posix_special_xattr (char **pattern, char *key); + +void +__posix_fd_set_odirect (fd_t *fd, struct posix_fd *pfd, int opflags, + off_t offset, size_t size); +void posix_spawn_health_check_thread (xlator_t *this); + +void *posix_fsyncer (void *); +int +posix_get_ancestry (xlator_t *this, inode_t *leaf_inode, + gf_dirent_t *head, char **path, int type, int32_t *op_errno, + dict_t *xdata); #endif /* _POSIX_H */ |
