diff options
Diffstat (limited to 'xlators/storage/posix/src')
| -rw-r--r-- | xlators/storage/posix/src/Makefile.am | 17 | ||||
| -rw-r--r-- | xlators/storage/posix/src/posix-aio.c | 569 | ||||
| -rw-r--r-- | xlators/storage/posix/src/posix-aio.h | 39 | ||||
| -rw-r--r-- | xlators/storage/posix/src/posix-handle.c | 744 | ||||
| -rw-r--r-- | xlators/storage/posix/src/posix-handle.h | 143 | ||||
| -rw-r--r-- | xlators/storage/posix/src/posix-helpers.c | 893 | ||||
| -rw-r--r-- | xlators/storage/posix/src/posix-mem-types.h | 21 | ||||
| -rw-r--r-- | xlators/storage/posix/src/posix.c | 2781 | ||||
| -rw-r--r-- | xlators/storage/posix/src/posix.h | 129 |
9 files changed, 4194 insertions, 1142 deletions
diff --git a/xlators/storage/posix/src/Makefile.am b/xlators/storage/posix/src/Makefile.am index 90ea1fd51..88efcc784 100644 --- a/xlators/storage/posix/src/Makefile.am +++ b/xlators/storage/posix/src/Makefile.am @@ -2,17 +2,18 @@ xlator_LTLIBRARIES = posix.la xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/storage -posix_la_LDFLAGS = -module -avoidversion +posix_la_LDFLAGS = -module -avoid-version -posix_la_SOURCES = posix.c posix-helpers.c -posix_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la +posix_la_SOURCES = posix.c posix-helpers.c posix-handle.c posix-aio.c +posix_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la $(LIBAIO) -noinst_HEADERS = posix.h posix-mem-types.h +noinst_HEADERS = posix.h posix-mem-types.h posix-handle.h posix-aio.h -AM_CFLAGS = -fPIC -fno-strict-aliasing -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE \ - -D$(GF_HOST_OS) -Wall -I$(top_srcdir)/libglusterfs/src -shared \ - -nostartfiles -I$(top_srcdir)/contrib/md5 -I$(top_srcdir)/rpc/xdr/src \ - -I$(top_srcdir)/rpc/rpc-lib/src $(GF_CFLAGS) +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ + -I$(top_srcdir)/rpc/xdr/src \ + -I$(top_srcdir)/rpc/rpc-lib/src + +AM_CFLAGS = -fno-strict-aliasing -Wall $(GF_CFLAGS) CLEANFILES = diff --git a/xlators/storage/posix/src/posix-aio.c b/xlators/storage/posix/src/posix-aio.c new file mode 100644 index 000000000..c3bbddd67 --- /dev/null +++ b/xlators/storage/posix/src/posix-aio.c @@ -0,0 +1,569 @@ +/* + Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "xlator.h" +#include "glusterfs.h" +#include "posix.h" +#include <sys/uio.h> + +#ifdef HAVE_LIBAIO +#include <libaio.h> + + +void +__posix_fd_set_odirect (fd_t *fd, struct posix_fd *pfd, int opflags, + off_t offset, size_t size) +{ + int odirect = 0; + int flags = 0; + int ret = 0; + + odirect = pfd->odirect; + + if ((fd->flags|opflags) & O_DIRECT) { + /* if instructed, use O_DIRECT always */ + odirect = 1; + } else { + /* else use O_DIRECT when feasible */ + if ((offset|size) & 0xfff) + odirect = 0; + else + odirect = 1; + } + + if (!odirect && pfd->odirect) { + flags = fcntl (pfd->fd, F_GETFL); + ret = fcntl (pfd->fd, F_SETFL, (flags & (~O_DIRECT))); + pfd->odirect = 0; + } + + if (odirect && !pfd->odirect) { + flags = fcntl (pfd->fd, F_GETFL); + ret = fcntl (pfd->fd, F_SETFL, (flags | O_DIRECT)); + pfd->odirect = 1; + } + + if (ret) { + gf_log (THIS->name, GF_LOG_WARNING, + "fcntl() failed (%s). fd=%d flags=%d pfd->odirect=%d", + strerror (errno), pfd->fd, flags, pfd->odirect); + } +} + + +struct posix_aio_cb { + struct iocb iocb; + call_frame_t *frame; + struct iobuf *iobuf; + struct iobref *iobref; + struct iatt prebuf; + int fd; + int op; + off_t offset; +}; + + +int +posix_aio_readv_complete (struct posix_aio_cb *paiocb, int res, int res2) +{ + call_frame_t *frame = NULL; + xlator_t *this = NULL; + struct iobuf *iobuf = NULL; + struct iatt postbuf = {0,}; + int _fd = -1; + int op_ret = -1; + int op_errno = 0; + struct iovec iov; + struct iobref *iobref = NULL; + int ret = 0; + off_t offset = 0; + struct posix_private * priv = NULL; + + + frame = paiocb->frame; + this = frame->this; + priv = this->private; + iobuf = paiocb->iobuf; + _fd = paiocb->fd; + offset = paiocb->offset; + + if (res < 0) { + op_ret = -1; + op_errno = -res; + gf_log (this->name, GF_LOG_ERROR, + "readv(async) failed fd=%d,size=%lu,offset=%llu (%d/%s)", + _fd, paiocb->iocb.u.c.nbytes, + (unsigned long long) 
paiocb->offset, + res, strerror (op_errno)); + goto out; + } + + ret = posix_fdstat (this, _fd, &postbuf); + if (ret != 0) { + op_ret = -1; + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "fstat failed on fd=%d: %s", _fd, + strerror (op_errno)); + goto out; + } + + op_ret = res; + op_errno = 0; + + iobref = iobref_new (); + if (!iobref) { + op_ret = -1; + op_errno = ENOMEM; + goto out; + } + + iobref_add (iobref, iobuf); + + iov.iov_base = iobuf_ptr (iobuf); + iov.iov_len = op_ret; + + + /* Hack to notify higher layers of EOF. */ + if (!postbuf.ia_size || (offset + iov.iov_len) >= postbuf.ia_size) + op_errno = ENOENT; + + LOCK (&priv->lock); + { + priv->read_value += op_ret; + } + UNLOCK (&priv->lock); + +out: + STACK_UNWIND_STRICT (readv, frame, op_ret, op_errno, &iov, 1, + &postbuf, iobref, NULL); + if (iobuf) + iobuf_unref (iobuf); + if (iobref) + iobref_unref (iobref); + + GF_FREE (paiocb); + + return 0; +} + + +int +posix_aio_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, + size_t size, off_t offset, uint32_t flags, dict_t *xdata) +{ + int32_t op_errno = EINVAL; + int _fd = -1; + struct iobuf *iobuf = NULL; + struct posix_fd * pfd = NULL; + int ret = -1; + struct posix_aio_cb *paiocb = NULL; + struct posix_private *priv = NULL; + struct iocb *iocb = NULL; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + priv = this->private; + + ret = posix_fd_ctx_get (fd, this, &pfd); + if (ret < 0) { + op_errno = -ret; + gf_log (this->name, GF_LOG_WARNING, + "pfd is NULL from fd=%p", fd); + goto err; + } + _fd = pfd->fd; + + if (!size) { + op_errno = EINVAL; + gf_log (this->name, GF_LOG_WARNING, "size=%"GF_PRI_SIZET, size); + goto err; + } + + iobuf = iobuf_get2 (this->ctx->iobuf_pool, size); + if (!iobuf) { + op_errno = ENOMEM; + goto err; + } + + paiocb = GF_CALLOC (1, sizeof (*paiocb), gf_posix_mt_paiocb); + if (!paiocb) { + op_errno = ENOMEM; + goto err; + } + + + paiocb->frame = frame; + 
paiocb->iobuf = iobuf; + paiocb->offset = offset; + paiocb->fd = _fd; + paiocb->op = GF_FOP_READ; + + paiocb->iocb.data = paiocb; + paiocb->iocb.aio_fildes = _fd; + paiocb->iocb.aio_lio_opcode = IO_CMD_PREAD; + paiocb->iocb.aio_reqprio = 0; + paiocb->iocb.u.c.buf = iobuf_ptr (iobuf); + paiocb->iocb.u.c.nbytes = size; + paiocb->iocb.u.c.offset = offset; + + iocb = &paiocb->iocb; + + LOCK (&fd->lock); + { + __posix_fd_set_odirect (fd, pfd, flags, offset, size); + + ret = io_submit (priv->ctxp, 1, &iocb); + } + UNLOCK (&fd->lock); + + if (ret != 1) { + gf_log (this->name, GF_LOG_ERROR, + "io_submit() returned %d", ret); + op_errno = -ret; + goto err; + } + + return 0; +err: + STACK_UNWIND_STRICT (readv, frame, -1, op_errno, 0, 0, 0, 0, 0); + if (iobuf) + iobuf_unref (iobuf); + + if (paiocb) + GF_FREE (paiocb); + + return 0; +} + + +int +posix_aio_writev_complete (struct posix_aio_cb *paiocb, int res, int res2) +{ + call_frame_t *frame = NULL; + xlator_t *this = NULL; + struct iatt prebuf = {0,}; + struct iatt postbuf = {0,}; + int _fd = -1; + int op_ret = -1; + int op_errno = 0; + int ret = 0; + struct posix_private * priv = NULL; + + + frame = paiocb->frame; + this = frame->this; + priv = this->private; + prebuf = paiocb->prebuf; + _fd = paiocb->fd; + + if (res < 0) { + op_ret = -1; + op_errno = -res; + gf_log (this->name, GF_LOG_ERROR, + "writev(async) failed fd=%d,offset=%llu (%d/%s)", + _fd, (unsigned long long) paiocb->offset, res, + strerror (op_errno)); + + goto out; + } + + ret = posix_fdstat (this, _fd, &postbuf); + if (ret != 0) { + op_ret = -1; + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "fstat failed on fd=%d: %s", _fd, + strerror (op_errno)); + goto out; + } + + + op_ret = res; + op_errno = 0; + + LOCK (&priv->lock); + { + priv->write_value += op_ret; + } + UNLOCK (&priv->lock); + +out: + STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, &prebuf, &postbuf, + NULL); + + if (paiocb) { + if (paiocb->iobref) + iobref_unref (paiocb->iobref); 
+ GF_FREE (paiocb); + } + + return 0; +} + + +int +posix_aio_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *iov, int count, off_t offset, uint32_t flags, + struct iobref *iobref, dict_t *xdata) +{ + int32_t op_errno = EINVAL; + int _fd = -1; + struct posix_fd * pfd = NULL; + int ret = -1; + struct posix_aio_cb *paiocb = NULL; + struct posix_private *priv = NULL; + struct iocb *iocb = NULL; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + priv = this->private; + + ret = posix_fd_ctx_get (fd, this, &pfd); + if (ret < 0) { + op_errno = -ret; + gf_log (this->name, GF_LOG_WARNING, + "pfd is NULL from fd=%p", fd); + goto err; + } + _fd = pfd->fd; + + paiocb = GF_CALLOC (1, sizeof (*paiocb), gf_posix_mt_paiocb); + if (!paiocb) { + op_errno = ENOMEM; + goto err; + } + + + paiocb->frame = frame; + paiocb->offset = offset; + paiocb->fd = _fd; + paiocb->op = GF_FOP_WRITE; + + paiocb->iocb.data = paiocb; + paiocb->iocb.aio_fildes = _fd; + paiocb->iobref = iobref_ref (iobref); + paiocb->iocb.aio_lio_opcode = IO_CMD_PWRITEV; + paiocb->iocb.aio_reqprio = 0; + paiocb->iocb.u.v.vec = iov; + paiocb->iocb.u.v.nr = count; + paiocb->iocb.u.v.offset = offset; + + iocb = &paiocb->iocb; + + ret = posix_fdstat (this, _fd, &paiocb->prebuf); + if (ret != 0) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "fstat failed on fd=%p: %s", fd, + strerror (op_errno)); + goto err; + } + + + LOCK (&fd->lock); + { + __posix_fd_set_odirect (fd, pfd, flags, offset, + iov_length (iov, count)); + + ret = io_submit (priv->ctxp, 1, &iocb); + } + UNLOCK (&fd->lock); + + if (ret != 1) { + gf_log (this->name, GF_LOG_ERROR, + "io_submit() returned %d", ret); + op_errno = -ret; + goto err; + } + + return 0; +err: + STACK_UNWIND_STRICT (writev, frame, -1, op_errno, 0, 0, 0); + + if (paiocb) { + if (paiocb->iobref) + iobref_unref (paiocb->iobref); + GF_FREE (paiocb); + } + + return 0; +} + + +void * +posix_aio_thread (void 
*data) +{ + xlator_t *this = NULL; + struct posix_private *priv = NULL; + int ret = 0; + int i = 0; + struct io_event events[POSIX_AIO_MAX_NR_GETEVENTS]; + struct io_event *event = NULL; + struct posix_aio_cb *paiocb = NULL; + + this = data; + THIS = this; + priv = this->private; + + for (;;) { + memset (&events[0], 0, sizeof (events)); + ret = io_getevents (priv->ctxp, 1, POSIX_AIO_MAX_NR_GETEVENTS, + &events[0], NULL); + if (ret <= 0) { + gf_log (this->name, GF_LOG_ERROR, + "io_getevents() returned %d", ret); + if (ret == -EINTR) + continue; + break; + } + + for (i = 0; i < ret; i++) { + event = &events[i]; + + paiocb = event->data; + + switch (paiocb->op) { + case GF_FOP_READ: + posix_aio_readv_complete (paiocb, event->res, + event->res2); + break; + case GF_FOP_WRITE: + posix_aio_writev_complete (paiocb, event->res, + event->res2); + break; + default: + gf_log (this->name, GF_LOG_ERROR, + "unknown op %d found in piocb", + paiocb->op); + break; + } + } + } + + return NULL; +} + + +int +posix_aio_init (xlator_t *this) +{ + struct posix_private *priv = NULL; + int ret = 0; + + priv = this->private; + + ret = io_setup (POSIX_AIO_MAX_NR_EVENTS, &priv->ctxp); + if ((ret == -1 && errno == ENOSYS) || ret == -ENOSYS) { + gf_log (this->name, GF_LOG_WARNING, + "Linux AIO not available at run-time." + " Continuing with synchronous IO"); + ret = 0; + goto out; + } + + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "io_setup() failed. 
ret=%d, errno=%d", + ret, errno); + goto out; + } + + ret = gf_thread_create (&priv->aiothread, NULL, + posix_aio_thread, this); + if (ret != 0) { + io_destroy (priv->ctxp); + goto out; + } + + this->fops->readv = posix_aio_readv; + this->fops->writev = posix_aio_writev; +out: + return ret; +} + + +int +posix_aio_on (xlator_t *this) +{ + struct posix_private *priv = NULL; + int ret = 0; + + priv = this->private; + + if (!priv->aio_init_done) { + ret = posix_aio_init (this); + if (ret == 0) + priv->aio_capable = _gf_true; + else + priv->aio_capable = _gf_false; + priv->aio_init_done = _gf_true; + } + + if (priv->aio_capable) { + this->fops->readv = posix_aio_readv; + this->fops->writev = posix_aio_writev; + } + + return ret; +} + +int +posix_aio_off (xlator_t *this) +{ + this->fops->readv = posix_readv; + this->fops->writev = posix_writev; + + return 0; +} + + +#else + + +int +posix_aio_on (xlator_t *this) +{ + gf_log (this->name, GF_LOG_INFO, + "Linux AIO not available at build-time." + " Continuing with synchronous IO"); + return 0; +} + +int +posix_aio_off (xlator_t *this) +{ + gf_log (this->name, GF_LOG_INFO, + "Linux AIO not available at build-time." + " Continuing with synchronous IO"); + return 0; +} + +void +__posix_fd_set_odirect (fd_t *fd, struct posix_fd *pfd, int opflags, + off_t offset, size_t size) +{ + xlator_t *this = THIS; + gf_log (this->name, GF_LOG_INFO, + "Linux AIO not available at build-time." + " Continuing with synchronous IO"); + return; +} +#endif diff --git a/xlators/storage/posix/src/posix-aio.h b/xlators/storage/posix/src/posix-aio.h new file mode 100644 index 000000000..5bde71601 --- /dev/null +++ b/xlators/storage/posix/src/posix-aio.h @@ -0,0 +1,39 @@ +/* + Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. 
+ + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ +#ifndef _POSIX_AIO_H +#define _POSIX_AIO_H + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "xlator.h" +#include "glusterfs.h" + +// Maximum number of concurrently submitted IO events. The heaviest load +// GlusterFS has been able to handle had 60-80 concurrent calls +#define POSIX_AIO_MAX_NR_EVENTS 256 + +// Maximum number of completed IO operations to reap per getevents syscall +#define POSIX_AIO_MAX_NR_GETEVENTS 16 + + +int posix_aio_on (xlator_t *this); +int posix_aio_off (xlator_t *this); + +int posix_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata); + +int posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *vector, int32_t count, off_t offset, + uint32_t flags, struct iobref *iobref, dict_t *xdata); + +#endif /* !_POSIX_AIO_H */ diff --git a/xlators/storage/posix/src/posix-handle.c b/xlators/storage/posix/src/posix-handle.c new file mode 100644 index 000000000..219a582c9 --- /dev/null +++ b/xlators/storage/posix/src/posix-handle.c @@ -0,0 +1,744 @@ +/* + Copyright (c) 2011-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include <errno.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <unistd.h> +#include <libgen.h> +#ifdef GF_LINUX_HOST_OS +#include <alloca.h> +#endif + +#include "posix-handle.h" +#include "posix.h" +#include "xlator.h" +#include "syscall.h" + + +#define HANDLE_PFX ".glusterfs" +#define TRASH_DIR "landfill" + +#define UUID0_STR "00000000-0000-0000-0000-000000000000" +#define SLEN(str) (sizeof(str) - 1) + + +int +posix_handle_relpath (xlator_t *this, uuid_t gfid, const char *basename, + char *buf, size_t buflen) +{ + char *uuid_str = NULL; + int len = 0; + + len = SLEN("../") + + SLEN("../") + + SLEN("00/") + + SLEN("00/") + + SLEN(UUID0_STR) + + 1 /* '\0' */ + ; + + if (basename) { + len += (strlen (basename) + 1); + } + + if (buflen < len || !buf) + return len; + + uuid_str = uuid_utoa (gfid); + + if (basename) { + len = snprintf (buf, buflen, "../../%02x/%02x/%s/%s", + gfid[0], gfid[1], uuid_str, basename); + } else { + len = snprintf (buf, buflen, "../../%02x/%02x/%s", + gfid[0], gfid[1], uuid_str); + } + + return len; +} + + +/* + TODO: explain how this pump fixes ELOOP +*/ +int +posix_handle_pump (xlator_t *this, char *buf, int len, int maxlen, + char *base_str, int base_len, int pfx_len) +{ + char linkname[512] = {0,}; /* "../../<gfid>/<NAME_MAX>" */ + int ret = 0; + int blen = 0; + int link_len = 0; + + /* is a directory's symlink-handle */ + ret = readlink (base_str, linkname, 512); + if (ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "internal readlink failed on %s (%s)", + base_str, strerror (errno)); + goto err; + } + + if (ret < 512) + linkname[ret] = 0; + + link_len = ret; + + if ((ret == 8) && memcmp (linkname, "../../..", 8) == 0) { + if (strcmp (base_str, buf) == 0) { + strcpy (buf + pfx_len, ".."); + } + goto out; + } + + if (ret < 50 || ret >= 512) { + gf_log (this->name, GF_LOG_ERROR, + "malformed internal link %s for %s", + linkname, base_str); + goto err; 
+ } + + if (memcmp (linkname, "../../", 6) != 0) { + gf_log (this->name, GF_LOG_ERROR, + "malformed internal link %s for %s", + linkname, base_str); + goto err; + } + + if ((linkname[2] != '/') || + (linkname[5] != '/') || + (linkname[8] != '/') || + (linkname[11] != '/') || + (linkname[48] != '/')) { + gf_log (this->name, GF_LOG_ERROR, + "malformed internal link %s for %s", + linkname, base_str); + goto err; + } + + if ((linkname[20] != '-') || + (linkname[25] != '-') || + (linkname[30] != '-') || + (linkname[35] != '-')) { + gf_log (this->name, GF_LOG_ERROR, + "malformed internal link %s for %s", + linkname, base_str); + goto err; + } + + blen = link_len - 48; + memmove (buf + base_len + blen, buf + base_len, + (strlen (buf) - base_len) + 1); + + strncpy (base_str + pfx_len, linkname + 6, 42); + + if (len + blen < maxlen) + strncpy (buf + pfx_len, linkname + 6, link_len - 6); +out: + return len + blen; +err: + return -1; +} + + +/* + posix_handle_path differs from posix_handle_gfid_path in the way that the + path filled in @buf by posix_handle_path will return type IA_IFDIR when + an lstat() is performed on it, whereas posix_handle_gfid_path returns path + to the handle symlink (typically used for the purpose of unlinking it). 
+ + posix_handle_path also guarantees immunity to ELOOP on the path returned by it +*/ + +int +posix_handle_path (xlator_t *this, uuid_t gfid, const char *basename, + char *ubuf, size_t size) +{ + struct posix_private *priv = NULL; + char *uuid_str = NULL; + int len = 0; + int ret = -1; + struct stat stat; + char *base_str = NULL; + int base_len = 0; + int pfx_len; + int maxlen; + char *buf; + + priv = this->private; + + uuid_str = uuid_utoa (gfid); + + if (ubuf) { + buf = ubuf; + maxlen = size; + } else { + maxlen = PATH_MAX; + buf = alloca (maxlen); + } + + base_len = (priv->base_path_length + SLEN(HANDLE_PFX) + 45); + base_str = alloca (base_len + 1); + base_len = snprintf (base_str, base_len + 1, "%s/%s/%02x/%02x/%s", + priv->base_path, HANDLE_PFX, gfid[0], gfid[1], + uuid_str); + + pfx_len = priv->base_path_length + 1 + SLEN(HANDLE_PFX) + 1; + + if (basename) { + len = snprintf (buf, maxlen, "%s/%s", base_str, basename); + } else { + len = snprintf (buf, maxlen, "%s", base_str); + } + + ret = lstat (base_str, &stat); + + if (!(ret == 0 && S_ISLNK(stat.st_mode) && stat.st_nlink == 1)) + goto out; + + do { + errno = 0; + ret = posix_handle_pump (this, buf, len, maxlen, + base_str, base_len, pfx_len); + if (ret == -1) + break; + + len = ret; + + ret = lstat (buf, &stat); + } while ((ret == -1) && errno == ELOOP); + +out: + return len + 1; +} + + +int +posix_handle_gfid_path (xlator_t *this, uuid_t gfid, const char *basename, + char *buf, size_t buflen) +{ + struct posix_private *priv = NULL; + char *uuid_str = NULL; + int len = 0; + + priv = this->private; + + len = priv->base_path_length /* option directory "/export" */ + + SLEN("/") + + SLEN(HANDLE_PFX) + + SLEN("/") + + SLEN("00/") + + SLEN("00/") + + SLEN(UUID0_STR) + + 1 /* '\0' */ + ; + + if (basename) { + len += (strlen (basename) + 1); + } else { + len += 256; /* worst-case for directory's symlink-handle expansion */ + } + + if ((buflen < len) || !buf) + return len; + + uuid_str = uuid_utoa (gfid); + + if 
(__is_root_gfid (gfid)) { + if (basename) { + len = snprintf (buf, buflen, "%s/%s", priv->base_path, + basename); + } else { + strncpy (buf, priv->base_path, buflen); + } + goto out; + } + + if (basename) { + len = snprintf (buf, buflen, "%s/%s/%02x/%02x/%s/%s", priv->base_path, + HANDLE_PFX, gfid[0], gfid[1], uuid_str, basename); + } else { + len = snprintf (buf, buflen, "%s/%s/%02x/%02x/%s", priv->base_path, + HANDLE_PFX, gfid[0], gfid[1], uuid_str); + } +out: + return len; +} + + +int +posix_handle_init (xlator_t *this) +{ + struct posix_private *priv = NULL; + char *handle_pfx = NULL; + int ret = 0; + int len = 0; + struct stat stbuf; + struct stat rootbuf; + struct stat exportbuf; + char *rootstr = NULL; + uuid_t gfid = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1}; + + priv = this->private; + + ret = stat (priv->base_path, &exportbuf); + if (ret || !S_ISDIR (exportbuf.st_mode)) { + gf_log (this->name, GF_LOG_ERROR, + "Not a directory: %s", priv->base_path); + return -1; + } + + handle_pfx = alloca (priv->base_path_length + 1 + strlen (HANDLE_PFX) + + 1); + + sprintf (handle_pfx, "%s/%s", priv->base_path, HANDLE_PFX); + + ret = stat (handle_pfx, &stbuf); + switch (ret) { + case -1: + if (errno == ENOENT) { + ret = mkdir (handle_pfx, 0600); + if (ret != 0) { + gf_log (this->name, GF_LOG_ERROR, + "Creating directory %s failed: %s", + handle_pfx, strerror (errno)); + return -1; + } + } else { + gf_log (this->name, GF_LOG_ERROR, + "Checking for %s failed: %s", + handle_pfx, strerror (errno)); + return -1; + } + break; + case 0: + if (!S_ISDIR (stbuf.st_mode)) { + gf_log (this->name, GF_LOG_ERROR, + "Not a directory: %s", + handle_pfx); + return -1; + } + break; + default: + break; + } + + stat (handle_pfx, &priv->handledir); + + len = posix_handle_path (this, gfid, NULL, NULL, 0); + rootstr = alloca (len); + posix_handle_path (this, gfid, NULL, rootstr, len); + + ret = stat (rootstr, &rootbuf); + switch (ret) { + case -1: + if (errno != ENOENT) { + gf_log (this->name, 
GF_LOG_ERROR, + "%s: %s", priv->base_path, + strerror (errno)); + return -1; + } + + ret = posix_handle_mkdir_hashes (this, rootstr); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "mkdir %s failed (%s)", + rootstr, strerror (errno)); + return -1; + } + + ret = symlink ("../../..", rootstr); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "symlink %s creation failed (%s)", + rootstr, strerror (errno)); + return -1; + } + break; + case 0: + if ((exportbuf.st_ino == rootbuf.st_ino) && + (exportbuf.st_dev == rootbuf.st_dev)) + return 0; + + gf_log (this->name, GF_LOG_ERROR, + "Different dirs %s (%lld/%lld) != %s (%lld/%lld)", + priv->base_path, (long long) exportbuf.st_ino, + (long long) exportbuf.st_dev, rootstr, + (long long) rootbuf.st_ino, (long long) rootbuf.st_dev); + return -1; + + break; + } + + return 0; +} + +gf_boolean_t +posix_does_old_trash_exists (char *old_trash) +{ + uuid_t gfid = {0}; + gf_boolean_t exists = _gf_false; + struct stat stbuf = {0}; + int ret = 0; + + ret = lstat (old_trash, &stbuf); + if ((ret == 0) && S_ISDIR (stbuf.st_mode)) { + ret = sys_lgetxattr (old_trash, "trusted.gfid", gfid, 16); + if ((ret < 0) && (errno == ENODATA)) + exists = _gf_true; + } + return exists; +} + +int +posix_handle_new_trash_init (xlator_t *this, char *trash) +{ + int ret = 0; + struct stat stbuf = {0}; + + ret = lstat (trash, &stbuf); + switch (ret) { + case -1: + if (errno == ENOENT) { + ret = mkdir (trash, 0755); + if (ret != 0) { + gf_log (this->name, GF_LOG_ERROR, + "Creating directory %s failed: %s", + trash, strerror (errno)); + } + } else { + gf_log (this->name, GF_LOG_ERROR, "Checking for %s " + "failed: %s", trash, strerror (errno)); + } + break; + case 0: + if (!S_ISDIR (stbuf.st_mode)) { + gf_log (this->name, GF_LOG_ERROR, + "Not a directory: %s", trash); + ret = -1; + } + break; + default: + break; + } + return ret; +} + +int +posix_mv_old_trash_into_new_trash (xlator_t *this, char *old, char *new) +{ + char dest_old[PATH_MAX] = {0}; + int 
ret = 0; + uuid_t dest_name = {0}; + + if (!posix_does_old_trash_exists (old)) + goto out; + uuid_generate (dest_name); + snprintf (dest_old, sizeof (dest_old), "%s/%s", new, + uuid_utoa (dest_name)); + ret = rename (old, dest_old); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, "Not able to move " + "%s -> %s (%s)", old, dest_old, strerror (errno)); + } +out: + return ret; +} + +int +posix_handle_trash_init (xlator_t *this) +{ + int ret = -1; + struct posix_private *priv = NULL; + char old_trash[PATH_MAX] = {0}; + + priv = this->private; + + priv->trash_path = GF_CALLOC (1, priv->base_path_length + strlen ("/") + + strlen (HANDLE_PFX) + strlen ("/") + + strlen (TRASH_DIR) + 1, + gf_posix_mt_trash_path); + + if (!priv->trash_path) + goto out; + + strncpy (priv->trash_path, priv->base_path, priv->base_path_length); + strcat (priv->trash_path, "/" HANDLE_PFX "/" TRASH_DIR); + ret = posix_handle_new_trash_init (this, priv->trash_path); + if (ret) + goto out; + snprintf (old_trash, sizeof (old_trash), "%s/.landfill", + priv->base_path); + ret = posix_mv_old_trash_into_new_trash (this, old_trash, + priv->trash_path); +out: + return ret; +} + +int +posix_handle_mkdir_hashes (xlator_t *this, const char *newpath) +{ + char *duppath = NULL; + char *parpath = NULL; + int ret = 0; + + duppath = strdupa (newpath); + parpath = dirname (duppath); + parpath = dirname (duppath); + + ret = mkdir (parpath, 0700); + if (ret == -1 && errno != EEXIST) { + gf_log (this->name, GF_LOG_ERROR, + "error mkdir hash-1 %s (%s)", + parpath, strerror (errno)); + return -1; + } + + strcpy (duppath, newpath); + parpath = dirname (duppath); + + ret = mkdir (parpath, 0700); + if (ret == -1 && errno != EEXIST) { + gf_log (this->name, GF_LOG_ERROR, + "error mkdir hash-2 %s (%s)", + parpath, strerror (errno)); + return -1; + } + + return 0; +} + + +int +posix_handle_hard (xlator_t *this, const char *oldpath, uuid_t gfid, struct stat *oldbuf) +{ + char *newpath = NULL; + struct stat newbuf; + int 
ret = -1; + + + MAKE_HANDLE_PATH (newpath, this, gfid, NULL); + + ret = lstat (newpath, &newbuf); + if (ret == -1 && errno != ENOENT) { + gf_log (this->name, GF_LOG_WARNING, + "%s: %s", newpath, strerror (errno)); + return -1; + } + + if (ret == -1 && errno == ENOENT) { + ret = posix_handle_mkdir_hashes (this, newpath); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "mkdir %s failed (%s)", + newpath, strerror (errno)); + return -1; + } + +#ifdef HAVE_LINKAT + /* + * Use linkat if the target may be a symlink to a directory + * or without an existing target. See comment about linkat() + * usage in posix_link() in posix.c for details + */ + ret = linkat (AT_FDCWD, oldpath, AT_FDCWD, newpath, 0); +#else + ret = link (oldpath, newpath); +#endif + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "link %s -> %s failed (%s)", + oldpath, newpath, strerror (errno)); + return -1; + } + + ret = lstat (newpath, &newbuf); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "lstat on %s failed (%s)", + newpath, strerror (errno)); + return -1; + } + } + + if (newbuf.st_ino != oldbuf->st_ino || + newbuf.st_dev != oldbuf->st_dev) { + gf_log (this->name, GF_LOG_WARNING, + "mismatching ino/dev between file %s (%lld/%lld) " + "and handle %s (%lld/%lld)", + oldpath, (long long) oldbuf->st_ino, (long long) oldbuf->st_dev, + newpath, (long long) newbuf.st_ino, (long long) newbuf.st_dev); + ret = -1; + } + + return ret; +} + + +int +posix_handle_soft (xlator_t *this, const char *real_path, loc_t *loc, + uuid_t gfid, struct stat *oldbuf) +{ + char *oldpath = NULL; + char *newpath = NULL; + struct stat newbuf; + int ret = -1; + + + MAKE_HANDLE_PATH (newpath, this, gfid, NULL); + MAKE_HANDLE_RELPATH (oldpath, this, loc->pargfid, loc->name); + + + ret = lstat (newpath, &newbuf); + if (ret == -1 && errno != ENOENT) { + gf_log (this->name, GF_LOG_WARNING, + "%s: %s", newpath, strerror (errno)); + return -1; + } + + if (ret == -1 && errno == ENOENT) { + ret = posix_handle_mkdir_hashes 
(this, newpath); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "mkdir %s failed (%s)", + newpath, strerror (errno)); + return -1; + } + + ret = symlink (oldpath, newpath); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "symlink %s -> %s failed (%s)", + oldpath, newpath, strerror (errno)); + return -1; + } + + ret = lstat (newpath, &newbuf); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "stat on %s failed (%s)", + newpath, strerror (errno)); + return -1; + } + } + + ret = stat (real_path, &newbuf); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "stat on %s failed (%s)", newpath, strerror (errno)); + return -1; + } + + if (!oldbuf) + return ret; + + if (newbuf.st_ino != oldbuf->st_ino || + newbuf.st_dev != oldbuf->st_dev) { + gf_log (this->name, GF_LOG_WARNING, + "mismatching ino/dev between file %s (%lld/%lld) " + "and handle %s (%lld/%lld)", + oldpath, (long long) oldbuf->st_ino, (long long) oldbuf->st_dev, + newpath, (long long) newbuf.st_ino, (long long) newbuf.st_dev); + ret = -1; + } + + return ret; +} + + +static int +posix_handle_unset_gfid (xlator_t *this, uuid_t gfid) +{ + char *path = NULL; + int ret = 0; + struct stat stat; + + MAKE_HANDLE_GFID_PATH (path, this, gfid, NULL); + + ret = lstat (path, &stat); + + if (ret == -1) { + if (errno != ENOENT) { + gf_log (this->name, GF_LOG_WARNING, + "%s: %s", path, strerror (errno)); + } + goto out; + } + + ret = unlink (path); + if (ret == -1) { + gf_log (this->name, GF_LOG_WARNING, + "unlink %s failed (%s)", path, strerror (errno)); + } + +out: + return ret; +} + + +int +posix_handle_unset (xlator_t *this, uuid_t gfid, const char *basename) +{ + int ret; + struct iatt stat; + char *path = NULL; + + + if (!basename) { + ret = posix_handle_unset_gfid (this, gfid); + return ret; + } + + MAKE_HANDLE_PATH (path, this, gfid, basename); + + ret = posix_istat (this, gfid, basename, &stat); + + if (ret == -1) { + gf_log (this->name, GF_LOG_WARNING, + "%s: %s", path, strerror (errno)); + return 
-1; + } + + ret = posix_handle_unset_gfid (this, stat.ia_gfid); + + return ret; +} + + +int +posix_create_link_if_gfid_exists (xlator_t *this, uuid_t gfid, + char *real_path) +{ + int ret = -1; + struct stat stbuf = {0,}; + char *newpath = NULL; + + MAKE_HANDLE_PATH (newpath, this, gfid, NULL); + ret = lstat (newpath, &stbuf); + if (!ret) { +#ifdef HAVE_LINKAT + /* + * Use linkat if the target may be a symlink to a directory + * or without an existing target. See comment about linkat() + * usage in posix_link() in posix.c for details + */ + ret = linkat (AT_FDCWD, newpath, AT_FDCWD, real_path, 0); +#else + ret = link (newpath, real_path); +#endif + } + + return ret; +} diff --git a/xlators/storage/posix/src/posix-handle.h b/xlators/storage/posix/src/posix-handle.h new file mode 100644 index 000000000..f1163b727 --- /dev/null +++ b/xlators/storage/posix/src/posix-handle.h @@ -0,0 +1,143 @@ +/* + Copyright (c) 2011-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ +#ifndef _POSIX_HANDLE_H +#define _POSIX_HANDLE_H + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include <sys/types.h> +#include "xlator.h" + + +#define LOC_HAS_ABSPATH(loc) ((loc) && (loc->path) && (loc->path[0] == '/')) + +#define MAKE_REAL_PATH(var, this, path) do { \ + var = alloca (strlen (path) + POSIX_BASE_PATH_LEN(this) + 2); \ + strcpy (var, POSIX_BASE_PATH(this)); \ + strcpy (&var[POSIX_BASE_PATH_LEN(this)], path); \ + } while (0) + + +#define MAKE_HANDLE_PATH(var, this, gfid, base) do { \ + int __len; \ + __len = posix_handle_path (this, gfid, base, NULL, 0); \ + if (__len <= 0) \ + break; \ + var = alloca (__len); \ + __len = posix_handle_path (this, gfid, base, var, __len); \ + } while (0) + + +#define MAKE_HANDLE_GFID_PATH(var, this, gfid, base) do { \ + int __len = 0; \ + __len = posix_handle_gfid_path (this, gfid, base, NULL, 0); \ + if (__len <= 0) \ + break; \ + var = alloca (__len); \ + __len = posix_handle_gfid_path (this, gfid, base, var, __len); \ + } while (0) + + +#define MAKE_HANDLE_RELPATH(var, this, gfid, base) do { \ + int __len; \ + __len = posix_handle_relpath (this, gfid, base, NULL, 0); \ + if (__len <= 0) \ + break; \ + var = alloca (__len); \ + __len = posix_handle_relpath (this, gfid, base, var, __len); \ + } while (0) + + +#define MAKE_INODE_HANDLE(rpath, this, loc, iatt_p) do { \ + if (uuid_is_null (loc->gfid)) { \ + gf_log (this->name, GF_LOG_ERROR, \ + "null gfid for path %s", loc->path); \ + break; \ + } \ + if (LOC_HAS_ABSPATH (loc)) { \ + MAKE_REAL_PATH (rpath, this, loc->path); \ + op_ret = posix_pstat (this, loc->gfid, rpath, iatt_p); \ + break; \ + } \ + errno = 0; \ + op_ret = posix_istat (this, loc->gfid, NULL, iatt_p); \ + if (errno != ELOOP) { \ + MAKE_HANDLE_PATH (rpath, this, loc->gfid, NULL); \ + break; \ + } \ + /* __ret == -1 && errno == ELOOP */ \ + } while (0) + + +#define MAKE_ENTRY_HANDLE(entp, parp, this, loc, ent_p) do { \ + char *__parp; \ + \ + if (uuid_is_null 
(loc->pargfid) || !loc->name) { \ + gf_log (this->name, GF_LOG_ERROR, \ + "null pargfid/name for path %s", loc->path); \ + break; \ + } \ + \ + if (LOC_HAS_ABSPATH (loc)) { \ + MAKE_REAL_PATH (entp, this, loc->path); \ + __parp = strdupa (entp); \ + parp = dirname (__parp); \ + op_ret = posix_pstat (this, NULL, entp, ent_p); \ + break; \ + } \ + errno = 0; \ + op_ret = posix_istat (this, loc->pargfid, loc->name, ent_p); \ + if (errno != ELOOP) { \ + MAKE_HANDLE_PATH (parp, this, loc->pargfid, NULL); \ + MAKE_HANDLE_PATH (entp, this, loc->pargfid, loc->name); \ + break; \ + } \ + /* __ret == -1 && errno == ELOOP */ \ + /* expand ELOOP */ \ + } while (0) + + + +int +posix_handle_path (xlator_t *this, uuid_t gfid, const char *basename, char *buf, + size_t len); +int +posix_handle_path_safe (xlator_t *this, uuid_t gfid, const char *basename, + char *buf, size_t len); + +int +posix_handle_gfid_path (xlator_t *this, uuid_t gfid, const char *basename, + char *buf, size_t len); + +int +posix_handle_hard (xlator_t *this, const char *path, uuid_t gfid, + struct stat *buf); + + +int +posix_handle_soft (xlator_t *this, const char *real_path, loc_t *loc, + uuid_t gfid, struct stat *buf); + +int +posix_handle_unset (xlator_t *this, uuid_t gfid, const char *basename); + +int posix_handle_mkdir_hashes (xlator_t *this, const char *newpath); + +int posix_handle_init (xlator_t *this); + +int posix_create_link_if_gfid_exists (xlator_t *this, uuid_t gfid, + char *real_path); + +int +posix_handle_trash_init (xlator_t *this); +#endif /* !_POSIX_HANDLE_H */ diff --git a/xlators/storage/posix/src/posix-helpers.c b/xlators/storage/posix/src/posix-helpers.c index d2cf880fb..e295f8850 100644 --- a/xlators/storage/posix/src/posix-helpers.c +++ b/xlators/storage/posix/src/posix-helpers.c @@ -1,22 +1,12 @@ /* - Copyright (c) 2006-2011 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. 
- - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. -*/ + Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ #ifndef _CONFIG_H #define _CONFIG_H #include "config.h" @@ -32,13 +22,13 @@ #include <pthread.h> #include <ftw.h> #include <sys/stat.h> +#include <signal.h> #ifndef GF_BSD_HOST_OS #include <alloca.h> #endif /* GF_BSD_HOST_OS */ #include "glusterfs.h" -#include "md5.h" #include "checksum.h" #include "dict.h" #include "logging.h" @@ -55,15 +45,12 @@ #include "timer.h" #include "glusterfs3-xdr.h" #include "hashfn.h" +#include "glusterfs-acl.h" +#include <fnmatch.h> - -typedef struct { - xlator_t *this; - const char *real_path; - dict_t *xattr; - struct iatt *stbuf; - loc_t *loc; -} posix_xattr_filler_t; +char *marker_xattrs[] = {"trusted.glusterfs.quota.*", + "trusted.glusterfs.*.xtime", + NULL}; static char* posix_ignore_xattrs[] = { "gfid-req", @@ -73,6 +60,25 @@ static char* posix_ignore_xattrs[] = { NULL }; +gf_boolean_t +posix_special_xattr (char **pattern, char *key) +{ + int i = 0; + gf_boolean_t flag = _gf_false; + + GF_VALIDATE_OR_GOTO ("posix", pattern, out); + GF_VALIDATE_OR_GOTO 
("posix", key, out); + + for (i = 0; pattern[i]; i++) { + if (!fnmatch (pattern[i], key, 0)) { + flag = _gf_true; + break; + } + } +out: + return flag; +} + static gf_boolean_t posix_xattr_ignorable (char *key, posix_xattr_filler_t *filler) { @@ -95,7 +101,7 @@ out: return ignore; } -static void +static int _posix_xattr_get_set (dict_t *xattr_req, char *key, data_t *data, @@ -165,19 +171,13 @@ _posix_xattr_get_set (dict_t *xattr_req, err: if (_fd != -1) close (_fd); - if (databuf) - GF_FREE (databuf); + GF_FREE (databuf); } } else if (!strcmp (key, GLUSTERFS_OPEN_FD_COUNT)) { loc = filler->loc; - if (!list_empty (&loc->inode->fd_list)) { - ret = dict_set_uint32 (filler->xattr, key, 1); - if (ret < 0) - gf_log (filler->this->name, GF_LOG_WARNING, - "Failed to set dictionary value for %s", - key); - } else { - ret = dict_set_uint32 (filler->xattr, key, 0); + if (loc) { + ret = dict_set_uint32 (filler->xattr, key, + loc->inode->fd_count); if (ret < 0) gf_log (filler->this->name, GF_LOG_WARNING, "Failed to set dictionary value for %s", @@ -190,22 +190,31 @@ _posix_xattr_get_set (dict_t *xattr_req, value = GF_CALLOC (1, xattr_size + 1, gf_posix_mt_char); if (!value) - return; + return -1; - sys_lgetxattr (filler->real_path, key, value, - xattr_size); + xattr_size = sys_lgetxattr (filler->real_path, key, value, + xattr_size); + if (xattr_size <= 0) { + gf_log (filler->this->name, GF_LOG_WARNING, + "getxattr failed. path: %s, key: %s", + filler->real_path, key); + GF_FREE (value); + return -1; + } value[xattr_size] = '\0'; ret = dict_set_bin (filler->xattr, key, value, xattr_size); - if (ret < 0) + if (ret < 0) { gf_log (filler->this->name, GF_LOG_DEBUG, "dict set failed. 
path: %s, key: %s", filler->real_path, key); + GF_FREE (value); + } } } out: - return; + return 0; } @@ -213,14 +222,17 @@ int posix_fill_gfid_path (xlator_t *this, const char *path, struct iatt *iatt) { int ret = 0; + ssize_t size = 0; if (!iatt) return 0; - ret = sys_lgetxattr (path, GFID_XATTR_KEY, iatt->ia_gfid, 16); + size = sys_lgetxattr (path, GFID_XATTR_KEY, iatt->ia_gfid, 16); /* Return value of getxattr */ - if ((ret == 16) || (ret == -1)) + if ((size == 16) || (size == -1)) ret = 0; + else + ret = size; return ret; } @@ -230,14 +242,17 @@ int posix_fill_gfid_fd (xlator_t *this, int fd, struct iatt *iatt) { int ret = 0; + ssize_t size = 0; if (!iatt) return 0; - ret = sys_fgetxattr (fd, GFID_XATTR_KEY, iatt->ia_gfid, 16); + size = sys_fgetxattr (fd, GFID_XATTR_KEY, iatt->ia_gfid, 16); /* Return value of getxattr */ - if ((ret == 16) || (ret == -1)) + if ((size == 16) || (size == -1)) ret = 0; + else + ret = size; return ret; } @@ -255,7 +270,7 @@ posix_fill_ino_from_gfid (xlator_t *this, struct iatt *buf) goto out; } for (i = 15; i > (15 - 8); i--) { - temp_ino += buf->ia_gfid[i] << j; + temp_ino += (uint64_t)(buf->ia_gfid[i]) << j; j += 8; } buf->ia_ino = temp_ino; @@ -264,19 +279,22 @@ out: } int -posix_lstat_with_gfid (xlator_t *this, const char *path, struct iatt *stbuf_p) +posix_fdstat (xlator_t *this, int fd, struct iatt *stbuf_p) { int ret = 0; - struct stat lstatbuf = {0, }; + struct stat fstatbuf = {0, }; struct iatt stbuf = {0, }; - ret = lstat (path, &lstatbuf); + ret = fstat (fd, &fstatbuf); if (ret == -1) goto out; - iatt_from_stat (&stbuf, &lstatbuf); + if (fstatbuf.st_nlink && !S_ISDIR (fstatbuf.st_mode)) + fstatbuf.st_nlink--; - ret = posix_fill_gfid_path (this, path, &stbuf); + iatt_from_stat (&stbuf, &fstatbuf); + + ret = posix_fill_gfid_fd (this, fd, &stbuf); if (ret) gf_log_callingfn (this->name, GF_LOG_DEBUG, "failed to get gfid"); @@ -284,33 +302,125 @@ posix_lstat_with_gfid (xlator_t *this, const char *path, struct iatt *stbuf_p) if 
(stbuf_p) *stbuf_p = stbuf; + out: return ret; } int -posix_fstat_with_gfid (xlator_t *this, int fd, struct iatt *stbuf_p) +posix_istat (xlator_t *this, uuid_t gfid, const char *basename, + struct iatt *buf_p) { - int ret = 0; - struct stat fstatbuf = {0, }; - struct iatt stbuf = {0, }; + char *real_path = NULL; + struct stat lstatbuf = {0, }; + struct iatt stbuf = {0, }; + int ret = 0; + struct posix_private *priv = NULL; - ret = fstat (fd, &fstatbuf); - if (ret == -1) + + priv = this->private; + + MAKE_HANDLE_PATH (real_path, this, gfid, basename); + + ret = lstat (real_path, &lstatbuf); + + if (ret != 0) { + if (ret == -1) { + if (errno != ENOENT && errno != ELOOP) + gf_log (this->name, GF_LOG_WARNING, + "lstat failed on %s (%s)", + real_path, strerror (errno)); + } else { + // may be some backend filesystem issue + gf_log (this->name, GF_LOG_ERROR, "lstat failed on " + "%s and return value is %d instead of -1. " + "Please see dmesg output to check whether the " + "failure is due to backend filesystem issue", + real_path, ret); + ret = -1; + } goto out; + } - iatt_from_stat (&stbuf, &fstatbuf); + if ((lstatbuf.st_ino == priv->handledir.st_ino) && + (lstatbuf.st_dev == priv->handledir.st_dev)) { + errno = ENOENT; + return -1; + } - ret = posix_fill_gfid_fd (this, fd, &stbuf); - if (ret) - gf_log_callingfn (this->name, GF_LOG_DEBUG, "failed to get gfid"); + if (!S_ISDIR (lstatbuf.st_mode)) + lstatbuf.st_nlink --; + + iatt_from_stat (&stbuf, &lstatbuf); + + if (basename) + posix_fill_gfid_path (this, real_path, &stbuf); + else + uuid_copy (stbuf.ia_gfid, gfid); posix_fill_ino_from_gfid (this, &stbuf); - if (stbuf_p) - *stbuf_p = stbuf; + if (buf_p) + *buf_p = stbuf; +out: + return ret; +} + + +int +posix_pstat (xlator_t *this, uuid_t gfid, const char *path, + struct iatt *buf_p) +{ + struct stat lstatbuf = {0, }; + struct iatt stbuf = {0, }; + int ret = 0; + struct posix_private *priv = NULL; + + + priv = this->private; + + ret = lstat (path, &lstatbuf); + + if 
(ret != 0) { + if (ret == -1) { + if (errno != ENOENT) + gf_log (this->name, GF_LOG_WARNING, + "lstat failed on %s (%s)", + path, strerror (errno)); + } else { + // may be some backend filesytem issue + gf_log (this->name, GF_LOG_ERROR, "lstat failed on " + "%s and return value is %d instead of -1. " + "Please see dmesg output to check whether the " + "failure is due to backend filesystem issue", + path, ret); + ret = -1; + } + goto out; + } + + if ((lstatbuf.st_ino == priv->handledir.st_ino) && + (lstatbuf.st_dev == priv->handledir.st_dev)) { + errno = ENOENT; + return -1; + } + + if (!S_ISDIR (lstatbuf.st_mode)) + lstatbuf.st_nlink --; + + iatt_from_stat (&stbuf, &lstatbuf); + + if (gfid && !uuid_is_null (gfid)) + uuid_copy (stbuf.ia_gfid, gfid); + else + posix_fill_gfid_path (this, path, &stbuf); + + posix_fill_ino_from_gfid (this, &stbuf); + + if (buf_p) + *buf_p = stbuf; out: return ret; } @@ -340,83 +450,50 @@ out: } -/* - * If the parent directory of {real_path} has the setgid bit set, - * then set {gid} to the gid of the parent. Otherwise, - * leave {gid} unchanged. 
- */ - int -setgid_override (xlator_t *this, char *real_path, gid_t *gid) -{ - char * tmp_path = NULL; - char * parent_path = NULL; - struct iatt parent_stbuf; - - int op_ret = 0; - - tmp_path = gf_strdup (real_path); - if (!tmp_path) { - op_ret = -ENOMEM; - goto out; - } - - parent_path = dirname (tmp_path); - - op_ret = posix_lstat_with_gfid (this, parent_path, &parent_stbuf); - if (op_ret == -1) { - op_ret = -errno; - gf_log_callingfn (this->name, GF_LOG_ERROR, - "lstat on parent directory (%s) failed: %s", - parent_path, strerror (errno)); - goto out; - } - - if (parent_stbuf.ia_prot.sgid) { - /* - * Entries created inside a setgid directory - * should inherit the gid from the parent - */ - - *gid = parent_stbuf.ia_gid; - } -out: - - if (tmp_path) - GF_FREE (tmp_path); - - return op_ret; -} - - -int -posix_gfid_set (xlator_t *this, const char *path, dict_t *xattr_req) +posix_gfid_set (xlator_t *this, const char *path, loc_t *loc, dict_t *xattr_req) { void *uuid_req = NULL; uuid_t uuid_curr; int ret = 0; + ssize_t size = 0; struct stat stat = {0, }; + if (!xattr_req) goto out; if (sys_lstat (path, &stat) != 0) goto out; - ret = sys_lgetxattr (path, GFID_XATTR_KEY, uuid_curr, 16); - if (ret == 16) { + size = sys_lgetxattr (path, GFID_XATTR_KEY, uuid_curr, 16); + if (size == 16) { ret = 0; - goto out; + goto verify_handle; } ret = dict_get_ptr (xattr_req, "gfid-req", &uuid_req); if (ret) { - gf_log_callingfn (this->name, GF_LOG_DEBUG, - "failed to get the gfid from dict"); + gf_log (this->name, GF_LOG_DEBUG, + "failed to get the gfid from dict for %s", + loc->path); goto out; } ret = sys_lsetxattr (path, GFID_XATTR_KEY, uuid_req, 16, XATTR_CREATE); + if (ret != 0) { + gf_log (this->name, GF_LOG_WARNING, + "setting GFID on %s failed (%s)", path, + strerror (errno)); + goto out; + } + uuid_copy (uuid_curr, uuid_req); + +verify_handle: + if (!S_ISDIR (stat.st_mode)) + ret = posix_handle_hard (this, path, uuid_curr, &stat); + else + ret = posix_handle_soft (this, 
path, loc, uuid_curr, &stat); out: return ret; @@ -424,36 +501,39 @@ out: int -posix_set_file_contents (xlator_t *this, const char *real_path, - data_pair_t *trav, int flags) +posix_set_file_contents (xlator_t *this, const char *path, char *keyp, + data_t *value, int flags) { char * key = NULL; - char real_filepath[ZR_PATH_MAX] = {0,}; + char real_path[PATH_MAX]; int32_t file_fd = -1; int op_ret = 0; int ret = -1; - key = &(trav->key[15]); - sprintf (real_filepath, "%s/%s", real_path, key); + + /* XXX: does not handle assigning GFID to created files */ + return -1; + + key = &(keyp[15]); + sprintf (real_path, "%s/%s", path, key); if (flags & XATTR_REPLACE) { /* if file exists, replace it * else, error out */ - file_fd = open (real_filepath, O_TRUNC|O_WRONLY); + file_fd = open (real_path, O_TRUNC|O_WRONLY); if (file_fd == -1) { goto create; } - if (trav->value->len) { - ret = write (file_fd, trav->value->data, - trav->value->len); + if (value->len) { + ret = write (file_fd, value->data, value->len); if (ret == -1) { op_ret = -errno; gf_log (this->name, GF_LOG_ERROR, "write failed while doing setxattr " "for key %s on path %s: %s", - key, real_filepath, strerror (errno)); + key, real_path, strerror (errno)); goto out; } @@ -462,14 +542,14 @@ posix_set_file_contents (xlator_t *this, const char *real_path, op_ret = -errno; gf_log (this->name, GF_LOG_ERROR, "close failed on %s: %s", - real_filepath, strerror (errno)); + real_path, strerror (errno)); goto out; } } create: /* we know file doesn't exist, create it */ - file_fd = open (real_filepath, O_CREAT|O_WRONLY, 0644); + file_fd = open (real_path, O_CREAT|O_WRONLY, 0644); if (file_fd == -1) { op_ret = -errno; @@ -479,13 +559,13 @@ posix_set_file_contents (xlator_t *this, const char *real_path, goto out; } - ret = write (file_fd, trav->value->data, trav->value->len); + ret = write (file_fd, value->data, value->len); if (ret == -1) { op_ret = -errno; gf_log (this->name, GF_LOG_ERROR, "write failed on %s while setxattr 
with " "key %s: %s", - real_filepath, key, strerror (errno)); + real_path, key, strerror (errno)); goto out; } @@ -495,7 +575,7 @@ posix_set_file_contents (xlator_t *this, const char *real_path, gf_log (this->name, GF_LOG_ERROR, "close failed on %s while setxattr with " "key %s: %s", - real_filepath, key, strerror (errno)); + real_path, key, strerror (errno)); goto out; } } @@ -506,33 +586,32 @@ out: int -posix_get_file_contents (xlator_t *this, const char *real_path, +posix_get_file_contents (xlator_t *this, uuid_t pargfid, const char *name, char **contents) { - char real_filepath[ZR_PATH_MAX] = {0,}; - char * key = NULL; + char *real_path = NULL; int32_t file_fd = -1; struct iatt stbuf = {0,}; int op_ret = 0; int ret = -1; - key = (char *) &(name[15]); - sprintf (real_filepath, "%s/%s", real_path, key); - op_ret = posix_lstat_with_gfid (this, real_filepath, &stbuf); + MAKE_HANDLE_PATH (real_path, this, pargfid, name); + + op_ret = posix_istat (this, pargfid, name, &stbuf); if (op_ret == -1) { op_ret = -errno; gf_log (this->name, GF_LOG_ERROR, "lstat failed on %s: %s", - real_filepath, strerror (errno)); + real_path, strerror (errno)); goto out; } - file_fd = open (real_filepath, O_RDONLY); + file_fd = open (real_path, O_RDONLY); if (file_fd == -1) { op_ret = -errno; gf_log (this->name, GF_LOG_ERROR, "open failed on %s: %s", - real_filepath, strerror (errno)); + real_path, strerror (errno)); goto out; } @@ -547,7 +626,7 @@ posix_get_file_contents (xlator_t *this, const char *real_path, if (ret <= 0) { op_ret = -1; gf_log (this->name, GF_LOG_ERROR, "read on %s failed: %s", - real_filepath, strerror (errno)); + real_path, strerror (errno)); goto out; } @@ -558,14 +637,13 @@ posix_get_file_contents (xlator_t *this, const char *real_path, if (op_ret == -1) { op_ret = -errno; gf_log (this->name, GF_LOG_ERROR, "close on %s failed: %s", - real_filepath, strerror (errno)); + real_path, strerror (errno)); goto out; } out: if (op_ret < 0) { - if (*contents) - GF_FREE 
(*contents); + GF_FREE (*contents); if (file_fd != -1) close (file_fd); } @@ -577,28 +655,33 @@ static int gf_xattr_enotsup_log; int posix_handle_pair (xlator_t *this, const char *real_path, - data_pair_t *trav, int flags) + char *key, data_t *value, int flags) { int sys_ret = -1; int ret = 0; - if (ZR_FILE_CONTENT_REQUEST(trav->key)) { - ret = posix_set_file_contents (this, real_path, trav, flags); + if (ZR_FILE_CONTENT_REQUEST(key)) { + ret = posix_set_file_contents (this, real_path, key, value, + flags); } else { - sys_ret = sys_lsetxattr (real_path, trav->key, - trav->value->data, - trav->value->len, flags); + sys_ret = sys_lsetxattr (real_path, key, value->data, + value->len, flags); if (sys_ret < 0) { if (errno == ENOTSUP) { GF_LOG_OCCASIONALLY(gf_xattr_enotsup_log, this->name,GF_LOG_WARNING, "Extended attributes not " - "supported"); + "supported (try remounting " + "brick with 'user_xattr' " + "flag)"); } else if (errno == ENOENT) { - gf_log (this->name, GF_LOG_ERROR, - "setxattr on %s failed: %s", real_path, - strerror (errno)); + if (!posix_special_xattr (marker_xattrs, + key)) { + gf_log (this->name, GF_LOG_ERROR, + "setxattr on %s failed: %s", + real_path, strerror (errno)); + } } else { #ifdef GF_DARWIN_HOST_OS @@ -606,12 +689,12 @@ posix_handle_pair (xlator_t *this, const char *real_path, ((errno == EINVAL) ? GF_LOG_DEBUG : GF_LOG_ERROR), "%s: key:%s error:%s", - real_path, trav->key, + real_path, key, strerror (errno)); #else /* ! 
DARWIN */ gf_log (this->name, GF_LOG_ERROR, "%s: key:%s error:%s", - real_path, trav->key, + real_path, key, strerror (errno)); #endif /* DARWIN */ } @@ -626,20 +709,22 @@ out: int posix_fhandle_pair (xlator_t *this, int fd, - data_pair_t *trav, int flags) + char *key, data_t *value, int flags) { int sys_ret = -1; int ret = 0; - sys_ret = sys_fsetxattr (fd, trav->key, trav->value->data, - trav->value->len, flags); + sys_ret = sys_fsetxattr (fd, key, value->data, + value->len, flags); if (sys_ret < 0) { if (errno == ENOTSUP) { GF_LOG_OCCASIONALLY(gf_xattr_enotsup_log, this->name,GF_LOG_WARNING, "Extended attributes not " - "supported"); + "supported (try remounting " + "brick with 'user_xattr' " + "flag)"); } else if (errno == ENOENT) { gf_log (this->name, GF_LOG_ERROR, "fsetxattr on fd=%d failed: %s", fd, @@ -651,13 +736,11 @@ posix_fhandle_pair (xlator_t *this, int fd, ((errno == EINVAL) ? GF_LOG_DEBUG : GF_LOG_ERROR), "fd=%d: key:%s error:%s", - fd, trav->key, - strerror (errno)); + fd, key, strerror (errno)); #else /* ! 
DARWIN */ gf_log (this->name, GF_LOG_ERROR, "fd=%d: key:%s error:%s", - fd, trav->key, - strerror (errno)); + fd, key, strerror (errno)); #endif /* DARWIN */ } @@ -674,6 +757,11 @@ static int janitor_walker (const char *fpath, const struct stat *sb, int typeflag, struct FTW *ftwbuf) { + struct iatt stbuf = {0, }; + xlator_t *this = NULL; + + this = THIS; + posix_pstat (this, NULL, fpath, &stbuf); switch (sb->st_mode & S_IFMT) { case S_IFREG: case S_IFBLK: @@ -684,6 +772,8 @@ janitor_walker (const char *fpath, const struct stat *sb, gf_log (THIS->name, GF_LOG_TRACE, "unlinking %s", fpath); unlink (fpath); + if (stbuf.ia_nlink == 1) + posix_handle_unset (this, stbuf.ia_gfid, NULL); break; case S_IFDIR: @@ -692,6 +782,7 @@ janitor_walker (const char *fpath, const struct stat *sb, "removing directory %s", fpath); rmdir (fpath); + posix_handle_unset (this, stbuf.ia_gfid, NULL); } break; } @@ -753,7 +844,7 @@ posix_janitor_thread_proc (void *data) time (&now); if ((now - priv->last_landfill_check) > priv->janitor_sleep_duration) { gf_log (this->name, GF_LOG_TRACE, - "janitor cleaning out /" GF_REPLICATE_TRASH_DIR); + "janitor cleaning out %s", priv->trash_path); nftw (priv->trash_path, janitor_walker, @@ -775,9 +866,6 @@ posix_janitor_thread_proc (void *data) closedir (pfd->dir); } - if (pfd->path) - GF_FREE (pfd->path); - GF_FREE (pfd); } } @@ -797,8 +885,8 @@ posix_spawn_janitor_thread (xlator_t *this) LOCK (&priv->lock); { if (!priv->janitor_present) { - ret = pthread_create (&priv->janitor, NULL, - posix_janitor_thread_proc, this); + ret = gf_thread_create (&priv->janitor, NULL, + posix_janitor_thread_proc, this); if (ret < 0) { gf_log (this->name, GF_LOG_ERROR, @@ -814,6 +902,74 @@ unlock: UNLOCK (&priv->lock); } +static int +is_fresh_file (struct stat *stat) +{ + struct timeval tv; + + gettimeofday (&tv, NULL); + + if ((stat->st_ctime >= (tv.tv_sec - 1)) + && (stat->st_ctime <= tv.tv_sec)) + return 1; + + return 0; +} + + +int +posix_gfid_heal (xlator_t *this, 
const char *path, loc_t *loc, dict_t *xattr_req) +{ + /* The purpose of this function is to prevent a race + where an inode creation FOP (like mkdir/mknod/create etc) + races with lookup in the following way: + + {create thread} | {lookup thread} + | + t0 + mkdir ("name") | + t1 + | posix_gfid_set ("name", 2); + t2 + posix_gfid_set ("name", 1); | + t3 + lstat ("name"); | lstat ("name"); + + In the above case mkdir FOP would have resulted with GFID 2 while + it should have been GFID 1. It matters in the case where GFID would + have gotten set to 1 on other subvolumes of replciate/distribute + + The "solution" here is that, if we detect lookup is attempting to + set a GFID on a file which is created very recently, but does not + yet have a GFID (i.e, between t1 and t2), then "fake" it as though + posix_gfid_heal was called at t0 instead. + */ + + uuid_t uuid_curr; + int ret = 0; + struct stat stat = {0, }; + + if (!xattr_req) + goto out; + + if (sys_lstat (path, &stat) != 0) + goto out; + + ret = sys_lgetxattr (path, GFID_XATTR_KEY, uuid_curr, 16); + if (ret != 16) { + if (is_fresh_file (&stat)) { + ret = -1; + errno = ENOENT; + goto out; + } + } + + ret = posix_gfid_set (this, path, loc, xattr_req); +out: + return ret; +} + + int posix_acl_xattr_set (xlator_t *this, const char *path, dict_t *xattr_req) { @@ -827,17 +983,17 @@ posix_acl_xattr_set (xlator_t *this, const char *path, dict_t *xattr_req) if (sys_lstat (path, &stat) != 0) goto out; - data = dict_get (xattr_req, "system.posix_acl_access"); + data = dict_get (xattr_req, POSIX_ACL_ACCESS_XATTR); if (data) { - ret = sys_lsetxattr (path, "system.posix_acl_access", + ret = sys_lsetxattr (path, POSIX_ACL_ACCESS_XATTR, data->data, data->len, 0); if (ret != 0) goto out; } - data = dict_get (xattr_req, "system.posix_acl_default"); + data = dict_get (xattr_req, POSIX_ACL_DEFAULT_XATTR); if (data) { - ret = sys_lsetxattr (path, "system.posix_acl_default", + ret = sys_lsetxattr (path, POSIX_ACL_DEFAULT_XATTR, 
data->data, data->len, 0); if (ret != 0) goto out; @@ -847,38 +1003,389 @@ out: return ret; } +static int +_handle_entry_create_keyvalue_pair (dict_t *d, char *k, data_t *v, + void *tmp) +{ + int ret = -1; + posix_xattr_filler_t *filler = NULL; + + filler = tmp; + + if (!strcmp (GFID_XATTR_KEY, k) || + !strcmp ("gfid-req", k) || + !strcmp (POSIX_ACL_DEFAULT_XATTR, k) || + !strcmp (POSIX_ACL_ACCESS_XATTR, k) || + ZR_FILE_CONTENT_REQUEST(k)) { + return 0; + } + + ret = posix_handle_pair (filler->this, filler->real_path, k, v, + XATTR_CREATE); + if (ret < 0) { + errno = -ret; + return -1; + } + return 0; +} + int posix_entry_create_xattr_set (xlator_t *this, const char *path, dict_t *dict) { - data_pair_t *trav = NULL; int ret = -1; + posix_xattr_filler_t filler = {0,}; + if (!dict) goto out; - trav = dict->members_list; - while (trav) { - if (!strcmp (GFID_XATTR_KEY, trav->key) || - !strcmp ("gfid-req", trav->key) || - !strcmp ("system.posix_acl_default", trav->key) || - !strcmp ("system.posix_acl_access", trav->key) || - ZR_FILE_CONTENT_REQUEST(trav->key)) { - trav = trav->next; - continue; + filler.this = this; + filler.real_path = path; + + ret = dict_foreach (dict, _handle_entry_create_keyvalue_pair, &filler); + +out: + return ret; +} + + +static int +__posix_fd_ctx_get (fd_t *fd, xlator_t *this, struct posix_fd **pfd_p) +{ + uint64_t tmp_pfd = 0; + struct posix_fd *pfd = NULL; + int ret = -1; + char *real_path = NULL; + int _fd = -1; + DIR *dir = NULL; + + ret = __fd_ctx_get (fd, this, &tmp_pfd); + if (ret == 0) { + pfd = (void *)(long) tmp_pfd; + ret = 0; + goto out; + } + + if (!fd_is_anonymous(fd)) + /* anonymous fd */ + goto out; + + MAKE_HANDLE_PATH (real_path, this, fd->inode->gfid, NULL); + + pfd = GF_CALLOC (1, sizeof (*pfd), gf_posix_mt_posix_fd); + if (!pfd) { + goto out; + } + pfd->fd = -1; + + if (fd->inode->ia_type == IA_IFDIR) { + dir = opendir (real_path); + if (!dir) { + GF_FREE (pfd); + pfd = NULL; + goto out; } + _fd = dirfd (dir); + } - ret = 
posix_handle_pair (this, path, trav, XATTR_CREATE); - if (ret < 0) { - errno = -ret; - ret = -1; + if (fd->inode->ia_type == IA_IFREG) { + _fd = open (real_path, O_RDWR|O_LARGEFILE); + if (_fd == -1) { + GF_FREE (pfd); + pfd = NULL; goto out; } - trav = trav->next; } - ret = 0; + pfd->fd = _fd; + pfd->dir = dir; + + ret = __fd_ctx_set (fd, this, (uint64_t) (long) pfd); + if (ret != 0) { + if (_fd != -1) + close (_fd); + if (dir) + closedir (dir); + GF_FREE (pfd); + pfd = NULL; + goto out; + } + ret = 0; out: + if (pfd_p) + *pfd_p = pfd; + return ret; +} + + +int +posix_fd_ctx_get (fd_t *fd, xlator_t *this, struct posix_fd **pfd) +{ + int ret; + + LOCK (&fd->inode->lock); + { + ret = __posix_fd_ctx_get (fd, this, pfd); + } + UNLOCK (&fd->inode->lock); + return ret; } + +static void * +posix_health_check_thread_proc (void *data) +{ + xlator_t *this = NULL; + struct posix_private *priv = NULL; + uint32_t interval = 0; + int ret = -1; + struct stat sb = {0, }; + + this = data; + priv = this->private; + + /* prevent races when the interval is updated */ + interval = priv->health_check_interval; + if (interval == 0) + goto out; + + gf_log (this->name, GF_LOG_DEBUG, "health-check thread started, " + "interval = %d seconds", interval); + + while (1) { + /* aborting sleep() is a request to exit this thread, sleep() + * will normally not return when cancelled */ + ret = sleep (interval); + if (ret > 0) + break; + + /* prevent thread errors while doing the health-check(s) */ + pthread_setcancelstate (PTHREAD_CANCEL_DISABLE, NULL); + + /* Do the health-check, it should be moved to its own function + * in case it gets more complex. 
*/ + ret = stat (priv->base_path, &sb); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "stat() on %s returned: %s", priv->base_path, + strerror (errno)); + goto abort; + } + + pthread_setcancelstate (PTHREAD_CANCEL_ENABLE, NULL); + } + +out: + gf_log (this->name, GF_LOG_DEBUG, "health-check thread exiting"); + + LOCK (&priv->lock); + { + priv->health_check_active = _gf_false; + } + UNLOCK (&priv->lock); + + return NULL; + +abort: + /* health-check failed */ + gf_log (this->name, GF_LOG_EMERG, "health-check failed, going down"); + xlator_notify (this->parents->xlator, GF_EVENT_CHILD_DOWN, this); + + ret = sleep (30); + if (ret == 0) { + gf_log (this->name, GF_LOG_EMERG, "still alive! -> SIGTERM"); + kill (getpid(), SIGTERM); + } + + ret = sleep (30); + if (ret == 0) { + gf_log (this->name, GF_LOG_EMERG, "still alive! -> SIGKILL"); + kill (getpid(), SIGKILL); + } + + return NULL; +} + +void +posix_spawn_health_check_thread (xlator_t *xl) +{ + struct posix_private *priv = NULL; + int ret = -1; + + priv = xl->private; + + LOCK (&priv->lock); + { + /* cancel the running thread */ + if (priv->health_check_active == _gf_true) { + pthread_cancel (priv->health_check); + priv->health_check_active = _gf_false; + } + + /* prevent scheduling a check in a tight loop */ + if (priv->health_check_interval == 0) + goto unlock; + + ret = gf_thread_create (&priv->health_check, NULL, + posix_health_check_thread_proc, xl); + if (ret < 0) { + priv->health_check_interval = 0; + priv->health_check_active = _gf_false; + gf_log (xl->name, GF_LOG_ERROR, + "unable to setup health-check thread: %s", + strerror (errno)); + goto unlock; + } + + /* run the thread detached, resources will be freed on exit */ + pthread_detach (priv->health_check); + priv->health_check_active = _gf_true; + } +unlock: + UNLOCK (&priv->lock); +} + +int +posix_fsyncer_pick (xlator_t *this, struct list_head *head) +{ + struct posix_private *priv = NULL; + int count = 0; + + priv = this->private; + 
pthread_mutex_lock (&priv->fsync_mutex); + { + while (list_empty (&priv->fsyncs)) + pthread_cond_wait (&priv->fsync_cond, + &priv->fsync_mutex); + + count = priv->fsync_queue_count; + priv->fsync_queue_count = 0; + list_splice_init (&priv->fsyncs, head); + } + pthread_mutex_unlock (&priv->fsync_mutex); + + return count; +} + + +void +posix_fsyncer_process (xlator_t *this, call_stub_t *stub, gf_boolean_t do_fsync) +{ + struct posix_fd *pfd = NULL; + int ret = -1; + struct posix_private *priv = NULL; + + priv = this->private; + + ret = posix_fd_ctx_get (stub->args.fd, this, &pfd); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "could not get fdctx for fd(%s)", + uuid_utoa (stub->args.fd->inode->gfid)); + call_unwind_error (stub, -1, EINVAL); + return; + } + + if (do_fsync) { +#ifdef HAVE_FDATASYNC + if (stub->args.datasync) + ret = fdatasync (pfd->fd); + else +#endif + ret = fsync (pfd->fd); + } else { + ret = 0; + } + + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "could not fstat fd(%s)", + uuid_utoa (stub->args.fd->inode->gfid)); + call_unwind_error (stub, -1, errno); + return; + } + + call_unwind_error (stub, 0, 0); +} + + +static void +posix_fsyncer_syncfs (xlator_t *this, struct list_head *head) +{ + call_stub_t *stub = NULL; + struct posix_fd *pfd = NULL; + int ret = -1; + + stub = list_entry (head->prev, call_stub_t, list); + ret = posix_fd_ctx_get (stub->args.fd, this, &pfd); + if (ret) + return; + +#ifdef GF_LINUX_HOST_OS + /* syncfs() is not "declared" in RHEL's glibc even though + the kernel has support. 
+        */
+#include <sys/syscall.h>
+#include <unistd.h>
+#ifdef SYS_syncfs
+        syscall (SYS_syncfs, pfd->fd); /* return value is not checked */
+#else
+        sync(); /* no syncfs syscall number known at build time */
+#endif
+#else
+        sync(); /* non-Linux hosts */
+#endif
+}
+
+/* Batch-fsync worker thread routine; @d is the posix xlator.
+ *
+ * Loops forever. Each round takes all pending fsync stubs with
+ * posix_fsyncer_pick(), sleeps batch_fsync_delay_usec, issues one
+ * posix_fsyncer_syncfs() for the batch in the three BATCH_SYNCFS* modes, and
+ * then completes the stubs in reverse list order. In BATCH_SYNCFS mode no
+ * per-stub fsync is done; in BATCH_SYNCFS_SINGLE_FSYNC only the first stub
+ * processed is fsynced.
+ */
+void *
+posix_fsyncer (void *d)
+{
+        xlator_t             *this = d;
+        struct posix_private *priv = NULL;
+        call_stub_t          *stub = NULL;
+        call_stub_t          *tmp = NULL;
+        struct list_head      list;
+        int                   count = 0;
+        gf_boolean_t          do_fsync = _gf_true;
+
+        priv = this->private;
+
+        for (;;) {
+                INIT_LIST_HEAD (&list);
+
+                count = posix_fsyncer_pick (this, &list);
+
+                usleep (priv->batch_fsync_delay_usec);
+
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "picked %d fsyncs", count);
+
+                switch (priv->batch_fsync_mode) {
+                case BATCH_NONE:
+                case BATCH_REVERSE_FSYNC:
+                        break; /* no batch-wide sync in these modes */
+                case BATCH_SYNCFS:
+                case BATCH_SYNCFS_SINGLE_FSYNC:
+                case BATCH_SYNCFS_REVERSE_FSYNC:
+                        posix_fsyncer_syncfs (this, &list);
+                        break;
+                }
+
+                if (priv->batch_fsync_mode == BATCH_SYNCFS)
+                        do_fsync = _gf_false;
+                else
+                        do_fsync = _gf_true;
+
+                list_for_each_entry_safe_reverse (stub, tmp, &list, list) {
+                        list_del_init (&stub->list);
+
+                        posix_fsyncer_process (this, stub, do_fsync);
+
+                        if (priv->batch_fsync_mode == BATCH_SYNCFS_SINGLE_FSYNC)
+                                do_fsync = _gf_false; /* only the first stub is fsynced */
+                }
+        }
+}
diff --git a/xlators/storage/posix/src/posix-mem-types.h b/xlators/storage/posix/src/posix-mem-types.h
index 10aa75edc..81752c17e 100644
--- a/xlators/storage/posix/src/posix-mem-types.h
+++ b/xlators/storage/posix/src/posix-mem-types.h
@@ -1,22 +1,12 @@
 /*
-   Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com>
+   Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
    This file is part of GlusterFS.
-   GlusterFS is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3 of the License,
-   or (at your option) any later version.
- - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ - #ifndef __POSIX_MEM_TYPES_H__ #define __POSIX_MEM_TYPES_H__ @@ -30,6 +20,7 @@ enum gf_posix_mem_types_ { gf_posix_mt_int32_t, gf_posix_mt_posix_dev_t, gf_posix_mt_trash_path, + gf_posix_mt_paiocb, gf_posix_mt_end }; #endif diff --git a/xlators/storage/posix/src/posix.c b/xlators/storage/posix/src/posix.c index ab512af15..fb45c7a67 100644 --- a/xlators/storage/posix/src/posix.c +++ b/xlators/storage/posix/src/posix.c @@ -1,22 +1,12 @@ /* - Copyright (c) 2006-2011 Gluster, Inc. <http://www.gluster.com> - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. -*/ + Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. 
+ This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ #ifndef _CONFIG_H #define _CONFIG_H #include "config.h" @@ -24,6 +14,7 @@ #define __XOPEN_SOURCE 500 +#include <openssl/md5.h> #include <stdint.h> #include <sys/time.h> #include <sys/resource.h> @@ -32,6 +23,8 @@ #include <pthread.h> #include <ftw.h> #include <sys/stat.h> +#include <signal.h> +#include <sys/uio.h> #ifndef GF_BSD_HOST_OS #include <alloca.h> @@ -42,7 +35,6 @@ #endif /* HAVE_LINKAT */ #include "glusterfs.h" -#include "md5.h" #include "checksum.h" #include "dict.h" #include "logging.h" @@ -59,7 +51,11 @@ #include "timer.h" #include "glusterfs3-xdr.h" #include "hashfn.h" +#include "posix-aio.h" +#include "glusterfs-acl.h" +extern char *marker_xattrs[]; +#define ALIGN_SIZE 4096 #undef HAVE_SET_FSID #ifdef HAVE_SET_FSID @@ -98,71 +94,90 @@ posix_forget (xlator_t *this, inode_t *inode) int32_t posix_lookup (call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *xattr_req) + loc_t *loc, dict_t *xdata) { struct iatt buf = {0, }; - char * real_path = NULL; int32_t op_ret = -1; int32_t entry_ret = 0; int32_t op_errno = 0; dict_t * xattr = NULL; - char * pathdup = NULL; - char * parentpath = NULL; + char * real_path = NULL; + char * par_path = NULL; struct iatt postparent = {0,}; + int32_t gfidless = 0; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); VALIDATE_OR_GOTO (loc, out); - VALIDATE_OR_GOTO (loc->path, out); - MAKE_REAL_PATH (real_path, this, loc->path); + /* The Hidden directory should be for housekeeping purpose and it + should not get any gfid on it */ + if (__is_root_gfid (loc->pargfid) && + (strcmp (loc->name, GF_HIDDEN_PATH) == 0)) { + gf_log (this->name, GF_LOG_WARNING, + "Lookup issued on %s, which is not permitted", + GF_HIDDEN_PATH); + op_errno = EPERM; + op_ret = -1; 
+ goto out; + } + + op_ret = dict_get_int32 (xdata, GF_GFIDLESS_LOOKUP, &gfidless); + op_ret = -1; + if (uuid_is_null (loc->pargfid)) { + /* nameless lookup */ + MAKE_INODE_HANDLE (real_path, this, loc, &buf); + } else { + MAKE_ENTRY_HANDLE (real_path, par_path, this, loc, &buf); - posix_gfid_set (this, real_path, xattr_req); + if (uuid_is_null (loc->inode->gfid)) { + posix_gfid_heal (this, real_path, loc, xdata); + MAKE_ENTRY_HANDLE (real_path, par_path, this, + loc, &buf); + } + } - op_ret = posix_lstat_with_gfid (this, real_path, &buf); op_errno = errno; if (op_ret == -1) { if (op_errno != ENOENT) { gf_log (this->name, GF_LOG_ERROR, "lstat on %s failed: %s", - loc->path, strerror (op_errno)); + real_path, strerror (op_errno)); } entry_ret = -1; goto parent; } - if (xattr_req && (op_ret == 0)) { + if (xdata && (op_ret == 0)) { xattr = posix_lookup_xattr_fill (this, real_path, loc, - xattr_req, &buf); + xdata, &buf); } parent: - if (loc->parent) { - pathdup = gf_strdup (real_path); - GF_VALIDATE_OR_GOTO (this->name, pathdup, out); - - parentpath = dirname (pathdup); - - op_ret = posix_lstat_with_gfid (this, parentpath, &postparent); + if (par_path) { + op_ret = posix_pstat (this, loc->pargfid, par_path, &postparent); if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_ERROR, - "post-operation lstat on parent of %s failed: %s", - loc->path, strerror (op_errno)); + "post-operation lstat on parent %s failed: %s", + par_path, strerror (op_errno)); goto out; } } op_ret = entry_ret; out: - if (pathdup) - GF_FREE (pathdup); - if (xattr) dict_ref (xattr); + if (!op_ret && !gfidless && uuid_is_null (buf.ia_gfid)) { + gf_log (this->name, GF_LOG_ERROR, "buf->ia_gfid is null for " + "%s", (real_path) ? 
real_path: ""); + op_ret = -1; + op_errno = ENODATA; + } STACK_UNWIND_STRICT (lookup, frame, op_ret, op_errno, (loc)?loc->inode:NULL, &buf, xattr, &postparent); @@ -174,13 +189,13 @@ out: int32_t -posix_stat (call_frame_t *frame, xlator_t *this, loc_t *loc) +posix_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { struct iatt buf = {0,}; - char * real_path = NULL; int32_t op_ret = -1; int32_t op_errno = 0; struct posix_private *priv = NULL; + char *real_path = NULL; DECLARE_OLD_FS_ID_VAR; @@ -192,13 +207,14 @@ posix_stat (call_frame_t *frame, xlator_t *this, loc_t *loc) VALIDATE_OR_GOTO (priv, out); SET_FS_ID (frame->root->uid, frame->root->gid); - MAKE_REAL_PATH (real_path, this, loc->path); - op_ret = posix_lstat_with_gfid (this, real_path, &buf); + MAKE_INODE_HANDLE (real_path, this, loc, &buf); + if (op_ret == -1) { op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "lstat on %s failed: %s", loc->path, + gf_log (this->name, (op_errno == ENOENT)? + GF_LOG_DEBUG:GF_LOG_ERROR, + "lstat on %s failed: %s", real_path, strerror (op_errno)); goto out; } @@ -207,7 +223,7 @@ posix_stat (call_frame_t *frame, xlator_t *this, loc_t *loc) out: SET_TO_OLD_FS_ID(); - STACK_UNWIND_STRICT (stat, frame, op_ret, op_errno, &buf); + STACK_UNWIND_STRICT (stat, frame, op_ret, op_errno, &buf, NULL); return 0; } @@ -313,7 +329,7 @@ out: int posix_setattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, struct iatt *stbuf, int32_t valid) + loc_t *loc, struct iatt *stbuf, int32_t valid, dict_t *xdata) { int32_t op_ret = -1; int32_t op_errno = 0; @@ -328,9 +344,8 @@ posix_setattr (call_frame_t *frame, xlator_t *this, VALIDATE_OR_GOTO (loc, out); SET_FS_ID (frame->root->uid, frame->root->gid); - MAKE_REAL_PATH (real_path, this, loc->path); + MAKE_INODE_HANDLE (real_path, this, loc, &statpre); - op_ret = posix_lstat_with_gfid (this, real_path, &statpre); if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_ERROR, @@ -384,7 +399,7 @@ posix_setattr 
(call_frame_t *frame, xlator_t *this, } } - op_ret = posix_lstat_with_gfid (this, real_path, &statpost); + op_ret = posix_pstat (this, loc->gfid, real_path, &statpost); if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_ERROR, @@ -399,7 +414,7 @@ out: SET_TO_OLD_FS_ID (); STACK_UNWIND_STRICT (setattr, frame, op_ret, op_errno, - &statpre, &statpost); + &statpre, &statpost, NULL); return 0; } @@ -449,14 +464,13 @@ posix_do_futimes (xlator_t *this, int posix_fsetattr (call_frame_t *frame, xlator_t *this, - fd_t *fd, struct iatt *stbuf, int32_t valid) + fd_t *fd, struct iatt *stbuf, int32_t valid, dict_t *xdata) { int32_t op_ret = -1; int32_t op_errno = 0; struct iatt statpre = {0,}; struct iatt statpost = {0,}; struct posix_fd *pfd = NULL; - uint64_t tmp_pfd = 0; int32_t ret = -1; DECLARE_OLD_FS_ID_VAR; @@ -467,16 +481,15 @@ posix_fsetattr (call_frame_t *frame, xlator_t *this, VALIDATE_OR_GOTO (this, out); VALIDATE_OR_GOTO (fd, out); - ret = fd_ctx_get (fd, this, &tmp_pfd); + ret = posix_fd_ctx_get (fd, this, &pfd); if (ret < 0) { op_errno = -ret; gf_log (this->name, GF_LOG_DEBUG, "pfd is NULL from fd=%p", fd); goto out; } - pfd = (struct posix_fd *)(long)tmp_pfd; - op_ret = posix_fstat_with_gfid (this, pfd->fd, &statpre); + op_ret = posix_fdstat (this, pfd->fd, &statpre); if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_ERROR, @@ -531,7 +544,7 @@ posix_fsetattr (call_frame_t *frame, xlator_t *this, } } - op_ret = posix_fstat_with_gfid (this, pfd->fd, &statpost); + op_ret = posix_fdstat (this, pfd->fd, &statpost); if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_ERROR, @@ -546,14 +559,297 @@ out: SET_TO_OLD_FS_ID (); STACK_UNWIND_STRICT (fsetattr, frame, op_ret, op_errno, - &statpre, &statpost); + &statpre, &statpost, NULL); + + return 0; +} + +static int32_t +posix_do_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags, + off_t offset, size_t len, struct iatt *statpre, + struct iatt *statpost) +{ + 
struct posix_fd *pfd = NULL;
+        int32_t          ret = -1;
+
+        DECLARE_OLD_FS_ID_VAR;
+
+        SET_FS_ID (frame->root->uid, frame->root->gid);
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (fd, out);
+
+        ret = posix_fd_ctx_get (fd, this, &pfd);
+        if (ret < 0) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "pfd is NULL from fd=%p", fd);
+                goto out;
+        }
+
+        ret = posix_fdstat (this, pfd->fd, statpre);
+        if (ret == -1) {
+                ret = -errno;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "fallocate (fstat) failed on fd=%p: %s", fd,
+                        strerror (errno));
+                goto out;
+        }
+
+        ret = sys_fallocate(pfd->fd, flags, offset, len);
+        if (ret == -1) {
+                ret = -errno;
+                goto out;
+        }
+
+        ret = posix_fdstat (this, pfd->fd, statpost);
+        if (ret == -1) {
+                ret = -errno;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "fallocate (fstat) failed on fd=%p: %s", fd,
+                        strerror (errno));
+                goto out;
+        }
+
+out:
+        SET_TO_OLD_FS_ID ();
+
+        return ret;
+}
+
+/* Allocate a zeroed buffer of @size bytes aligned to ALIGN_SIZE.
+ *
+ * ALIGN_SIZE extra bytes are allocated so that an aligned address fits inside
+ * the allocation. The aligned address is stored in *aligned_buf; the return
+ * value is the start of the allocation, which is what must be passed to
+ * GF_FREE. On allocation failure NULL is returned and *aligned_buf is left
+ * untouched.
+ */
+char*
+_page_aligned_alloc (size_t size, char **aligned_buf)
+{
+        char            *alloc_buf = NULL;
+        char            *buf = NULL;
+
+        alloc_buf = GF_CALLOC (1, (size + ALIGN_SIZE), gf_posix_mt_char);
+        if (!alloc_buf)
+                goto out;
+        /* page aligned buffer */
+        buf = GF_ALIGN_BUF (alloc_buf, ALIGN_SIZE);
+        *aligned_buf = buf;
+out:
+        return alloc_buf;
+}
+
+/* Write @len zero bytes to @fd starting at @offset.
+ *
+ * One zeroed buffer of at most VECTOR_SIZE bytes is shared by every iovec
+ * entry; the range is written with writev() in groups of at most MAX_NO_VECT
+ * entries, followed by the leftover entries and the leftover bytes. When
+ * @o_direct is non-zero the buffer comes from _page_aligned_alloc().
+ *
+ * Returns a non-negative value on success (0 when @len is 0, otherwise the
+ * result of the last writev()), or -1 with errno describing the failure.
+ *
+ * NOTE: a short writev() is not retried.
+ */
+static int32_t
+_posix_do_zerofill(int fd, off_t offset, size_t len, int o_direct)
+{
+        size_t              num_vect    = 0;
+        int32_t             num_loop    = 1;
+        int32_t             idx         = 0;
+        int32_t             op_ret      = -1;
+        int32_t             saved_errno = 0;
+        int32_t             vect_size   = VECTOR_SIZE;
+        size_t              remain      = 0;
+        size_t              extra       = 0;
+        struct iovec       *vector      = NULL;
+        char               *iov_base    = NULL;
+        char               *alloc_buf   = NULL;
+
+        if (len == 0)
+                return 0;
+        if (len < VECTOR_SIZE)
+                vect_size = len;
+
+        num_vect = len / (vect_size);
+        remain = len % vect_size ;
+        if (num_vect > MAX_NO_VECT) {
+                extra = num_vect % MAX_NO_VECT;
+                num_loop = num_vect / MAX_NO_VECT;
+                num_vect = MAX_NO_VECT;
+        }
+
+        vector = GF_CALLOC (num_vect, sizeof(struct iovec),
+                            gf_common_mt_iovec);
+        if (!vector) {
+                /* the caller reports -errno, so make sure it is meaningful */
+                errno = ENOMEM;
+                return -1;
+        }
+        if (o_direct) {
+                alloc_buf = _page_aligned_alloc(vect_size, &iov_base);
+                if (!alloc_buf) {
+                        gf_log ("_posix_do_zerofill", GF_LOG_DEBUG,
+                                "memory alloc failed, vect_size %d: %s",
+                                vect_size, strerror(errno));
+                        GF_FREE(vector);
+                        errno = ENOMEM;
+                        return -1;
+                }
+        } else {
+                iov_base = GF_CALLOC (vect_size, sizeof(char),
+                                      gf_common_mt_char);
+                if (!iov_base) {
+                        GF_FREE(vector);
+                        errno = ENOMEM;
+                        return -1;
+                }
+        }
+
+        for (idx = 0; idx < num_vect; idx++) {
+                vector[idx].iov_base = iov_base;
+                vector[idx].iov_len  = vect_size;
+        }
+        /* a failed seek must abort: writing anyway would zero whatever
+         * range the file offset currently points at */
+        if (lseek(fd, offset, SEEK_SET) < 0) {
+                op_ret = -1;
+                goto err;
+        }
+        for (idx = 0; idx < num_loop; idx++) {
+                op_ret = writev(fd, vector, num_vect);
+                if (op_ret < 0)
+                        goto err;
+        }
+        if (extra) {
+                op_ret = writev(fd, vector, extra);
+                if (op_ret < 0)
+                        goto err;
+        }
+        if (remain) {
+                vector[0].iov_len = remain;
+                op_ret = writev(fd, vector , 1);
+                if (op_ret < 0)
+                        goto err;
+        }
+err:
+        /* keep the errno of the failing call intact across the cleanup */
+        saved_errno = errno;
+        if (o_direct)
+                GF_FREE(alloc_buf);
+        else
+                GF_FREE(iov_base);
+        GF_FREE(vector);
+        errno = saved_errno;
+        return op_ret;
+}
+
+/* Zero-fill @len bytes of @fd from @offset and collect the stats around it.
+ *
+ * Fills @statpre before and @statpost after the operation, and fsyncs the fd
+ * when it was opened with O_SYNC or O_DSYNC. Returns a negative errno value
+ * on failure (or the negative result of posix_fd_ctx_get()), otherwise the
+ * result of the final posix_fdstat().
+ */
+static int32_t
+posix_do_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                  off_t offset, size_t len, struct iatt *statpre,
+                  struct iatt *statpost)
+{
+        struct posix_fd *pfd = NULL;
+        int32_t          ret = -1;
+
+        DECLARE_OLD_FS_ID_VAR;
+
+        SET_FS_ID (frame->root->uid, frame->root->gid);
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (fd, out);
+
+        ret = posix_fd_ctx_get (fd, this, &pfd);
+        if (ret < 0) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "pfd is NULL from fd=%p", fd);
+                goto out;
+        }
+
+        ret = posix_fdstat (this, pfd->fd, statpre);
+        if (ret == -1) {
+                ret = -errno;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "pre-operation fstat failed on fd = %p: %s", fd,
+                        strerror (errno));
+                goto out;
+        }
+        ret = _posix_do_zerofill(pfd->fd, offset, len, pfd->flags & O_DIRECT);
+        if (ret < 0) {
+                ret = -errno;
+                /* len is a size_t, hence %zu */
+                gf_log(this->name, GF_LOG_ERROR,
+                       "zerofill failed on fd %d length %zu %s",
+                       pfd->fd, len, strerror(errno));
+                goto out;
+        }
+        if (pfd->flags & (O_SYNC|O_DSYNC)) {
+                ret = fsync (pfd->fd);
+                if (ret) {
+                        /* capture errno before gf_log() can overwrite it */
+                        ret = -errno;
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "fsync() in zerofill on fd %d failed: %s",
+                                pfd->fd, strerror (-ret));
+                        goto out;
+                }
+        }
+        ret = posix_fdstat (this, pfd->fd, statpost);
+        if (ret == -1) {
+                ret = -errno;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "post operation fstat failed on fd=%p: %s", fd,
+                        strerror (errno));
+                goto out;
+        }
+
+out:
+        SET_TO_OLD_FS_ID ();
+
+        return ret;
+}
+
+/* fallocate helper: maps @keep_size to FALLOC_FL_KEEP_SIZE, runs
+ * posix_do_fallocate() and unwinds the frame with the pre/post stats on
+ * success or with the positive errno on failure. Always returns 0.
+ */
+static int32_t
+_posix_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t keep_size,
+                 off_t offset, size_t len, dict_t *xdata)
+{
+        int32_t     ret;
+        int32_t     flags = 0;
+        struct iatt statpre = {0,};
+        struct iatt statpost = {0,};
+
+        if (keep_size)
+                flags = FALLOC_FL_KEEP_SIZE;
+
+        ret = posix_do_fallocate(frame, this, fd, flags, offset, len,
+                                 &statpre, &statpost);
+        if (ret < 0)
+                goto err;
+
+        STACK_UNWIND_STRICT(fallocate, frame, 0, 0, &statpre, &statpost, NULL);
+        return 0;
+
+err:
+        STACK_UNWIND_STRICT(fallocate, frame, -1, -ret, NULL, NULL, NULL);
+        return 0;
+}
+
+/* discard fop: punches a hole over [@offset, @offset + @len) through
+ * posix_do_fallocate() with FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE and
+ * unwinds the frame with the result. Always returns 0.
+ */
+static int32_t
+posix_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+              size_t len, dict_t *xdata)
+{
+        int32_t     ret;
+        int32_t     flags = FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE;
+        struct iatt statpre = {0,};
+        struct iatt statpost = {0,};
+
+        ret = posix_do_fallocate(frame, this, fd, flags, offset, len,
+                                 &statpre, &statpost);
+        if (ret < 0)
+                goto err;
+
+        STACK_UNWIND_STRICT(discard, frame, 0, 0, &statpre, &statpost, NULL);
+        return 0;
+
+err:
+        STACK_UNWIND_STRICT(discard, frame, -1, -ret, NULL, NULL, NULL);
+        return 0;
+
+}
+
+static int32_t
+posix_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+               size_t len, dict_t *xdata)
+{
+        int32_t     ret = 0;
+        struct iatt statpre = {0,};
+        struct iatt statpost = {0,};
+
+        ret = posix_do_zerofill(frame, this, fd, offset, len,
+                                &statpre, &statpost);
+        if (ret < 0)
+                goto err;
+
+        STACK_UNWIND_STRICT(zerofill, frame, 0, 0, &statpre, &statpost, NULL);
         return 0;
+
+err:
+
STACK_UNWIND_STRICT(zerofill, frame, -1, -ret, NULL, NULL, NULL); + return 0; + } int32_t posix_opendir (call_frame_t *frame, xlator_t *this, - loc_t *loc, fd_t *fd) + loc_t *loc, fd_t *fd, dict_t *xdata) { char * real_path = NULL; int32_t op_ret = -1; @@ -570,15 +866,16 @@ posix_opendir (call_frame_t *frame, xlator_t *this, VALIDATE_OR_GOTO (fd, out); SET_FS_ID (frame->root->uid, frame->root->gid); - MAKE_REAL_PATH (real_path, this, loc->path); + MAKE_INODE_HANDLE (real_path, this, loc, NULL); + op_ret = -1; dir = opendir (real_path); if (dir == NULL) { op_errno = errno; gf_log (this->name, GF_LOG_ERROR, "opendir failed on %s: %s", - loc->path, strerror (op_errno)); + real_path, strerror (op_errno)); goto out; } @@ -587,7 +884,7 @@ posix_opendir (call_frame_t *frame, xlator_t *this, op_errno = errno; gf_log (this->name, GF_LOG_ERROR, "dirfd() failed on %s: %s", - loc->path, strerror (op_errno)); + real_path, strerror (op_errno)); goto out; } @@ -599,16 +896,12 @@ posix_opendir (call_frame_t *frame, xlator_t *this, pfd->dir = dir; pfd->fd = dirfd (dir); - pfd->path = gf_strdup (real_path); - if (!pfd->path) { - goto out; - } op_ret = fd_ctx_set (fd, this, (uint64_t)(long)pfd); if (op_ret) gf_log (this->name, GF_LOG_WARNING, "failed to set the fd context path=%s fd=%p", - loc->path, fd); + real_path, fd); op_ret = 0; @@ -619,15 +912,13 @@ out: dir = NULL; } if (pfd) { - if (pfd->path) - GF_FREE (pfd->path); GF_FREE (pfd); pfd = NULL; } } SET_TO_OLD_FS_ID (); - STACK_UNWIND_STRICT (opendir, frame, op_ret, op_errno, fd); + STACK_UNWIND_STRICT (opendir, frame, op_ret, op_errno, fd, NULL); return 0; } @@ -654,19 +945,12 @@ posix_releasedir (xlator_t *this, pfd = (struct posix_fd *)(long)tmp_pfd; if (!pfd->dir) { gf_log (this->name, GF_LOG_WARNING, - "pfd->dir is NULL for fd=%p path=%s", - fd, pfd->path ? 
pfd->path : "<NULL>"); + "pfd->dir is NULL for fd=%p", fd); goto out; } priv = this->private; - if (!pfd->path) { - gf_log (this->name, GF_LOG_WARNING, - "pfd->path was NULL. fd=%p pfd=%p", - fd, pfd); - } - pthread_mutex_lock (&priv->janitor_lock); { INIT_LIST_HEAD (&pfd->list); @@ -682,11 +966,10 @@ out: int32_t posix_readlink (call_frame_t *frame, xlator_t *this, - loc_t *loc, size_t size) + loc_t *loc, size_t size, dict_t *xdata) { char * dest = NULL; int32_t op_ret = -1; - int32_t lstat_ret = -1; int32_t op_errno = 0; char * real_path = NULL; struct iatt stbuf = {0,}; @@ -699,33 +982,29 @@ posix_readlink (call_frame_t *frame, xlator_t *this, dest = alloca (size + 1); - MAKE_REAL_PATH (real_path, this, loc->path); - - op_ret = readlink (real_path, dest, size); + MAKE_INODE_HANDLE (real_path, this, loc, &stbuf); if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_ERROR, - "readlink on %s failed: %s", loc->path, + "lstat on %s failed: %s", real_path, strerror (op_errno)); goto out; } - dest[op_ret] = 0; - - lstat_ret = posix_lstat_with_gfid (this, real_path, &stbuf); - if (lstat_ret == -1) { - op_ret = -1; + op_ret = readlink (real_path, dest, size); + if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_ERROR, - "lstat on %s failed: %s", loc->path, + "readlink on %s failed: %s", real_path, strerror (op_errno)); goto out; } + dest[op_ret] = 0; out: SET_TO_OLD_FS_ID (); - STACK_UNWIND_STRICT (readlink, frame, op_ret, op_errno, dest, &stbuf); + STACK_UNWIND_STRICT (readlink, frame, op_ret, op_errno, dest, &stbuf, NULL); return 0; } @@ -733,20 +1012,20 @@ out: int posix_mknod (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, dev_t dev, dict_t *params) + loc_t *loc, mode_t mode, dev_t dev, mode_t umask, dict_t *xdata) { int tmp_fd = 0; int32_t op_ret = -1; int32_t op_errno = 0; char *real_path = 0; + char *par_path = 0; struct iatt stbuf = { 0, }; char was_present = 1; struct posix_private *priv = NULL; gid_t gid = 0; - char 
*pathdup = NULL; struct iatt preparent = {0,}; struct iatt postparent = {0,}; - char *parentpath = NULL; + void * uuid_req = NULL; DECLARE_OLD_FS_ID_VAR; @@ -757,32 +1036,44 @@ posix_mknod (call_frame_t *frame, xlator_t *this, priv = this->private; VALIDATE_OR_GOTO (priv, out); - MAKE_REAL_PATH (real_path, this, loc->path); + MAKE_ENTRY_HANDLE (real_path, par_path, this, loc, NULL); gid = frame->root->gid; - op_ret = setgid_override (this, real_path, &gid); - if (op_ret < 0) { - op_errno = -op_ret; - op_ret = -1; - goto out; - } - SET_FS_ID (frame->root->uid, gid); - pathdup = gf_strdup (real_path); - GF_VALIDATE_OR_GOTO (this->name, pathdup, out); - - parentpath = dirname (pathdup); - op_ret = posix_lstat_with_gfid (this, parentpath, &preparent); + op_ret = posix_pstat (this, loc->pargfid, par_path, &preparent); if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_ERROR, "pre-operation lstat on parent of %s failed: %s", - loc->path, strerror (op_errno)); + real_path, strerror (op_errno)); goto out; } + if (preparent.ia_prot.sgid) { + gid = preparent.ia_gid; + } + + /* Check if the 'gfid' already exists, because this mknod may be an + internal call from distribute for creating 'linkfile', and that + linkfile may be for a hardlinked file */ + if (dict_get (xdata, GLUSTERFS_INTERNAL_FOP_KEY)) { + dict_del (xdata, GLUSTERFS_INTERNAL_FOP_KEY); + op_ret = dict_get_ptr (xdata, "gfid-req", &uuid_req); + if (op_ret) { + gf_log (this->name, GF_LOG_DEBUG, + "failed to get the gfid from dict for %s", + loc->path); + goto real_op; + } + op_ret = posix_create_link_if_gfid_exists (this, uuid_req, + real_path); + if (!op_ret) + goto post_op; + } + +real_op: #ifdef __NetBSD__ if (S_ISFIFO(mode)) op_ret = mkfifo (real_path, mode); @@ -799,23 +1090,23 @@ posix_mknod (call_frame_t *frame, xlator_t *this, if (tmp_fd == -1) { gf_log (this->name, GF_LOG_ERROR, "create failed on %s: %s", - loc->path, strerror (errno)); + real_path, strerror (errno)); goto out; } close 
(tmp_fd); } else { gf_log (this->name, GF_LOG_ERROR, - "mknod on %s failed: %s", loc->path, + "mknod on %s failed: %s", real_path, strerror (op_errno)); goto out; } } - op_ret = posix_gfid_set (this, real_path, params); + op_ret = posix_gfid_set (this, real_path, loc, xdata); if (op_ret) { gf_log (this->name, GF_LOG_ERROR, - "setting gfid on %s failed", loc->path); + "setting gfid on %s failed", real_path); } #ifndef HAVE_SET_FSID @@ -823,54 +1114,53 @@ posix_mknod (call_frame_t *frame, xlator_t *this, if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_ERROR, - "lchown on %s failed: %s", loc->path, + "lchown on %s failed: %s", real_path, strerror (op_errno)); goto out; } #endif - op_ret = posix_acl_xattr_set (this, real_path, params); +post_op: + op_ret = posix_acl_xattr_set (this, real_path, xdata); if (op_ret) { gf_log (this->name, GF_LOG_ERROR, - "setting ACLs on %s failed (%s)", loc->path, + "setting ACLs on %s failed (%s)", real_path, strerror (errno)); } - op_ret = posix_entry_create_xattr_set (this, real_path, params); + op_ret = posix_entry_create_xattr_set (this, real_path, xdata); if (op_ret) { gf_log (this->name, GF_LOG_ERROR, - "setting xattrs on %s failed (%s)", loc->path, + "setting xattrs on %s failed (%s)", real_path, strerror (errno)); } - op_ret = posix_lstat_with_gfid (this, real_path, &stbuf); + op_ret = posix_pstat (this, NULL, real_path, &stbuf); if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_ERROR, - "mknod on %s failed: %s", loc->path, + "mknod on %s failed: %s", real_path, strerror (op_errno)); goto out; } - op_ret = posix_lstat_with_gfid (this, parentpath, &postparent); + op_ret = posix_pstat (this, loc->pargfid, par_path, &postparent); if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_ERROR, - "post-operation lstat on parent of %s failed: %s", - loc->path, strerror (op_errno)); + "post-operation lstat on parent %s failed: %s", + par_path, strerror (op_errno)); goto out; } op_ret = 0; out: - 
if (pathdup) - GF_FREE (pathdup); - SET_TO_OLD_FS_ID (); STACK_UNWIND_STRICT (mknod, frame, op_ret, op_errno, - (loc)?loc->inode:NULL, &stbuf, &preparent, &postparent); + (loc)?loc->inode:NULL, &stbuf, &preparent, + &postparent, NULL); if ((op_ret == -1) && (!was_present)) { unlink (real_path); @@ -879,19 +1169,19 @@ out: return 0; } + int posix_mkdir (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, dict_t *params) + loc_t *loc, mode_t mode, mode_t umask, dict_t *xdata) { int32_t op_ret = -1; int32_t op_errno = 0; char *real_path = NULL; + char *par_path = NULL; struct iatt stbuf = {0, }; char was_present = 1; struct posix_private *priv = NULL; gid_t gid = 0; - char *pathdup = NULL; - char *parentpath = NULL; struct iatt preparent = {0,}; struct iatt postparent = {0,}; @@ -901,54 +1191,59 @@ posix_mkdir (call_frame_t *frame, xlator_t *this, VALIDATE_OR_GOTO (this, out); VALIDATE_OR_GOTO (loc, out); + /* The Hidden directory should be for housekeeping purpose and it + should not get created from a user request */ + if (__is_root_gfid (loc->pargfid) && + (strcmp (loc->name, GF_HIDDEN_PATH) == 0)) { + gf_log (this->name, GF_LOG_WARNING, + "mkdir issued on %s, which is not permitted", + GF_HIDDEN_PATH); + op_errno = EPERM; + op_ret = -1; + goto out; + } + priv = this->private; VALIDATE_OR_GOTO (priv, out); - MAKE_REAL_PATH (real_path, this, loc->path); + MAKE_ENTRY_HANDLE (real_path, par_path, this, loc, NULL); gid = frame->root->gid; - op_ret = posix_lstat_with_gfid (this, real_path, &stbuf); + op_ret = posix_pstat (this, NULL, real_path, &stbuf); if ((op_ret == -1) && (errno == ENOENT)) { was_present = 0; } - op_ret = setgid_override (this, real_path, &gid); - if (op_ret < 0) { - op_errno = -op_ret; - op_ret = -1; - goto out; - } - SET_FS_ID (frame->root->uid, gid); - pathdup = gf_strdup (real_path); - if (!pathdup) - goto out; - parentpath = dirname (pathdup); - - op_ret = posix_lstat_with_gfid (this, parentpath, &preparent); + op_ret = posix_pstat 
(this, loc->pargfid, par_path, &preparent); if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_ERROR, - "pre-operation lstat on parent of %s failed: %s", - loc->path, strerror (op_errno)); + "pre-operation lstat on parent %s failed: %s", + par_path, strerror (op_errno)); goto out; } + if (preparent.ia_prot.sgid) { + gid = preparent.ia_gid; + mode |= S_ISGID; + } + op_ret = mkdir (real_path, mode); if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_ERROR, - "mkdir of %s failed: %s", loc->path, + "mkdir of %s failed: %s", real_path, strerror (op_errno)); goto out; } - op_ret = posix_gfid_set (this, real_path, params); + op_ret = posix_gfid_set (this, real_path, loc, xdata); if (op_ret) { gf_log (this->name, GF_LOG_ERROR, - "setting gfid on %s failed", loc->path); + "setting gfid on %s failed", real_path); } #ifndef HAVE_SET_FSID @@ -956,54 +1251,52 @@ posix_mkdir (call_frame_t *frame, xlator_t *this, if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_ERROR, - "chown on %s failed: %s", loc->path, + "chown on %s failed: %s", real_path, strerror (op_errno)); goto out; } #endif - op_ret = posix_acl_xattr_set (this, real_path, params); + op_ret = posix_acl_xattr_set (this, real_path, xdata); if (op_ret) { gf_log (this->name, GF_LOG_ERROR, - "setting ACLs on %s failed (%s)", loc->path, + "setting ACLs on %s failed (%s)", real_path, strerror (errno)); } - op_ret = posix_entry_create_xattr_set (this, real_path, params); + op_ret = posix_entry_create_xattr_set (this, real_path, xdata); if (op_ret) { gf_log (this->name, GF_LOG_ERROR, - "setting xattrs on %s failed (%s)", loc->path, + "setting xattrs on %s failed (%s)", real_path, strerror (errno)); } - op_ret = posix_lstat_with_gfid (this, real_path, &stbuf); + op_ret = posix_pstat (this, NULL, real_path, &stbuf); if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_ERROR, - "lstat on %s failed: %s", loc->path, + "lstat on %s failed: %s", real_path, strerror (op_errno)); 
goto out; } - op_ret = posix_lstat_with_gfid (this, parentpath, &postparent); + op_ret = posix_pstat (this, loc->pargfid, par_path, &postparent); if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_ERROR, "post-operation lstat on parent of %s failed: %s", - loc->path, strerror (op_errno)); + real_path, strerror (op_errno)); goto out; } op_ret = 0; out: - if (pathdup) - GF_FREE (pathdup); - SET_TO_OLD_FS_ID (); STACK_UNWIND_STRICT (mkdir, frame, op_ret, op_errno, - (loc)?loc->inode:NULL, &stbuf, &preparent, &postparent); + (loc)?loc->inode:NULL, &stbuf, &preparent, + &postparent, NULL); if ((op_ret == -1) && (!was_present)) { unlink (real_path); @@ -1015,17 +1308,17 @@ out: int32_t posix_unlink (call_frame_t *frame, xlator_t *this, - loc_t *loc) + loc_t *loc, int xflag, dict_t *xdata) { - int32_t op_ret = -1; - int32_t op_errno = 0; - char *real_path = NULL; - char *pathdup = NULL; - char *parentpath = NULL; - int32_t fd = -1; - struct posix_private *priv = NULL; - struct iatt preparent = {0,}; - struct iatt postparent = {0,}; + int32_t op_ret = -1; + int32_t op_errno = 0; + char *real_path = NULL; + char *par_path = NULL; + int32_t fd = -1; + struct iatt stbuf = {0,}; + struct posix_private *priv = NULL; + struct iatt preparent = {0,}; + struct iatt postparent = {0,}; DECLARE_OLD_FS_ID_VAR; @@ -1034,23 +1327,20 @@ posix_unlink (call_frame_t *frame, xlator_t *this, VALIDATE_OR_GOTO (loc, out); SET_FS_ID (frame->root->uid, frame->root->gid); - MAKE_REAL_PATH (real_path, this, loc->path); - - pathdup = gf_strdup (real_path); - if (!pathdup) - goto out; - - parentpath = dirname (pathdup); + MAKE_ENTRY_HANDLE (real_path, par_path, this, loc, &stbuf); - op_ret = posix_lstat_with_gfid (this, parentpath, &preparent); + op_ret = posix_pstat (this, loc->pargfid, par_path, &preparent); if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_ERROR, - "pre-operation lstat on parent of %s failed: %s", - loc->path, strerror (op_errno)); + "pre-operation 
lstat on parent %s failed: %s", + par_path, strerror (op_errno)); goto out; } + if (stbuf.ia_nlink == 1) + posix_handle_unset (this, stbuf.ia_gfid, NULL); + priv = this->private; if (priv->background_unlink) { if (IA_ISREG (loc->inode->ia_type)) { @@ -1059,7 +1349,7 @@ posix_unlink (call_frame_t *frame, xlator_t *this, op_ret = -1; op_errno = errno; gf_log (this->name, GF_LOG_ERROR, - "open of %s failed: %s", loc->path, + "open of %s failed: %s", real_path, strerror (op_errno)); goto out; } @@ -1070,30 +1360,27 @@ posix_unlink (call_frame_t *frame, xlator_t *this, if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_ERROR, - "unlink of %s failed: %s", loc->path, + "unlink of %s failed: %s", real_path, strerror (op_errno)); goto out; } - op_ret = posix_lstat_with_gfid (this, parentpath, &postparent); + op_ret = posix_pstat (this, loc->pargfid, par_path, &postparent); if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_ERROR, - "post-operation lstat on parent of %s failed: %s", - loc->path, strerror (op_errno)); + "post-operation lstat on parent %s failed: %s", + par_path, strerror (op_errno)); goto out; } op_ret = 0; out: - if (pathdup) - GF_FREE (pathdup); - SET_TO_OLD_FS_ID (); STACK_UNWIND_STRICT (unlink, frame, op_ret, op_errno, - &preparent, &postparent); + &preparent, &postparent, NULL); if (fd != -1) { close (fd); @@ -1105,15 +1392,16 @@ out: int posix_rmdir (call_frame_t *frame, xlator_t *this, - loc_t *loc, int flags) + loc_t *loc, int flags, dict_t *xdata) { int32_t op_ret = -1; int32_t op_errno = 0; char * real_path = NULL; - char * pathdup = NULL; - char * parentpath = NULL; + char * par_path = NULL; + char * gfid_str = NULL; struct iatt preparent = {0,}; struct iatt postparent = {0,}; + struct iatt stbuf; struct posix_private *priv = NULL; DECLARE_OLD_FS_ID_VAR; @@ -1122,39 +1410,51 @@ posix_rmdir (call_frame_t *frame, xlator_t *this, VALIDATE_OR_GOTO (this, out); VALIDATE_OR_GOTO (loc, out); - priv = this->private; - SET_FS_ID 
(frame->root->uid, frame->root->gid); - MAKE_REAL_PATH (real_path, this, loc->path); - pathdup = gf_strdup (real_path); - if (!pathdup) + /* The Hidden directory should be for housekeeping purpose and it + should not get deleted from inside process */ + if (__is_root_gfid (loc->pargfid) && + (strcmp (loc->name, GF_HIDDEN_PATH) == 0)) { + gf_log (this->name, GF_LOG_WARNING, + "rmdir issued on %s, which is not permitted", + GF_HIDDEN_PATH); + op_errno = EPERM; + op_ret = -1; goto out; + } - parentpath = dirname (pathdup); + priv = this->private; - op_ret = posix_lstat_with_gfid (this, parentpath, &preparent); + MAKE_ENTRY_HANDLE (real_path, par_path, this, loc, &stbuf); + + op_ret = posix_pstat (this, loc->pargfid, par_path, &preparent); if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_ERROR, - "pre-operation lstat on parent of %s failed: %s", - loc->path, strerror (op_errno)); + "pre-operation lstat on parent %s failed: %s", + par_path, strerror (op_errno)); goto out; } if (flags) { - uint32_t hashval = 0; - char *tmp_path = alloca (strlen (priv->trash_path) + 16); + gfid_str = uuid_utoa (stbuf.ia_gfid); + char *tmp_path = alloca (strlen (priv->trash_path) + + strlen ("/") + + strlen (gfid_str) + 1); mkdir (priv->trash_path, 0755); - hashval = gf_dm_hashfn (real_path, strlen (real_path)); - sprintf (tmp_path, "%s/%u", priv->trash_path, hashval); + sprintf (tmp_path, "%s/%s", priv->trash_path, gfid_str); op_ret = rename (real_path, tmp_path); } else { op_ret = rmdir (real_path); } op_errno = errno; + if (op_ret == 0) { + posix_handle_unset (this, stbuf.ia_gfid, NULL); + } + if (op_errno == EEXIST) /* Solaris sets errno = EEXIST instead of ENOTEMPTY */ op_errno = ENOTEMPTY; @@ -1162,7 +1462,7 @@ posix_rmdir (call_frame_t *frame, xlator_t *this, /* No need to log a common error as ENOTEMPTY */ if (op_ret == -1 && op_errno != ENOTEMPTY) { gf_log (this->name, GF_LOG_ERROR, - "rmdir of %s failed: %s", loc->path, + "rmdir of %s failed: %s", real_path, 
strerror (op_errno)); } @@ -1170,27 +1470,24 @@ posix_rmdir (call_frame_t *frame, xlator_t *this, gf_log (this->name, (op_errno == ENOTEMPTY) ? GF_LOG_DEBUG : GF_LOG_ERROR, "%s on %s failed", (flags) ? "rename" : "rmdir", - loc->path); + real_path); goto out; } - op_ret = posix_lstat_with_gfid (this, parentpath, &postparent); + op_ret = posix_pstat (this, loc->pargfid, par_path, &postparent); if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_ERROR, "post-operation lstat on parent of %s failed: %s", - loc->path, strerror (op_errno)); + par_path, strerror (op_errno)); goto out; } out: - if (pathdup) - GF_FREE (pathdup); - SET_TO_OLD_FS_ID (); STACK_UNWIND_STRICT (rmdir, frame, op_ret, op_errno, - &preparent, &postparent); + &preparent, &postparent, NULL); return 0; } @@ -1198,17 +1495,16 @@ out: int posix_symlink (call_frame_t *frame, xlator_t *this, - const char *linkname, loc_t *loc, dict_t *params) + const char *linkname, loc_t *loc, mode_t umask, dict_t *xdata) { int32_t op_ret = -1; int32_t op_errno = 0; char * real_path = 0; + char * par_path = 0; struct iatt stbuf = { 0, }; struct posix_private *priv = NULL; gid_t gid = 0; char was_present = 1; - char *pathdup = NULL; - char *parentpath = NULL; struct iatt preparent = {0,}; struct iatt postparent = {0,}; @@ -1222,52 +1518,43 @@ posix_symlink (call_frame_t *frame, xlator_t *this, priv = this->private; VALIDATE_OR_GOTO (priv, out); - MAKE_REAL_PATH (real_path, this, loc->path); + MAKE_ENTRY_HANDLE (real_path, par_path, this, loc, &stbuf); - op_ret = posix_lstat_with_gfid (this, real_path, &stbuf); if ((op_ret == -1) && (errno == ENOENT)){ was_present = 0; } - gid = frame->root->gid; - - op_ret = setgid_override (this, real_path, &gid); - if (op_ret < 0) { - op_errno = -op_ret; - op_ret = -1; - goto out; - } - SET_FS_ID (frame->root->uid, gid); - pathdup = gf_strdup (real_path); - if (!pathdup) - goto out; - parentpath = dirname (pathdup); + gid = frame->root->gid; - op_ret = posix_lstat_with_gfid 
(this, parentpath, &preparent); + op_ret = posix_pstat (this, loc->pargfid, par_path, &preparent); if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_ERROR, - "pre-operation lstat on parent of %s failed: %s", - loc->path, strerror (op_errno)); + "pre-operation lstat on parent %s failed: %s", + par_path, strerror (op_errno)); goto out; } + if (preparent.ia_prot.sgid) { + gid = preparent.ia_gid; + } + op_ret = symlink (linkname, real_path); if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_ERROR, "symlink of %s --> %s failed: %s", - loc->path, linkname, strerror (op_errno)); + real_path, linkname, strerror (op_errno)); goto out; } - op_ret = posix_gfid_set (this, real_path, params); + op_ret = posix_gfid_set (this, real_path, loc, xdata); if (op_ret) { gf_log (this->name, GF_LOG_ERROR, - "setting gfid on %s failed", loc->path); + "setting gfid on %s failed", real_path); } #ifndef HAVE_SET_FSID @@ -1276,53 +1563,51 @@ posix_symlink (call_frame_t *frame, xlator_t *this, op_errno = errno; gf_log (this->name, GF_LOG_ERROR, "lchown failed on %s: %s", - loc->path, strerror (op_errno)); + real_path, strerror (op_errno)); goto out; } #endif - op_ret = posix_acl_xattr_set (this, real_path, params); + op_ret = posix_acl_xattr_set (this, real_path, xdata); if (op_ret) { gf_log (this->name, GF_LOG_ERROR, - "setting ACLs on %s failed (%s)", loc->path, + "setting ACLs on %s failed (%s)", real_path, strerror (errno)); } - op_ret = posix_entry_create_xattr_set (this, real_path, params); + op_ret = posix_entry_create_xattr_set (this, real_path, xdata); if (op_ret) { gf_log (this->name, GF_LOG_ERROR, - "setting xattrs on %s failed (%s)", loc->path, + "setting xattrs on %s failed (%s)", real_path, strerror (errno)); } - op_ret = posix_lstat_with_gfid (this, real_path, &stbuf); + op_ret = posix_pstat (this, NULL, real_path, &stbuf); if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_ERROR, "lstat failed on %s: %s", - loc->path, strerror 
(op_errno)); + real_path, strerror (op_errno)); goto out; } - op_ret = posix_lstat_with_gfid (this, parentpath, &postparent); + op_ret = posix_pstat (this, loc->pargfid, par_path, &postparent); if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_ERROR, - "post-operation lstat on parent of %s failed: %s", - loc->path, strerror (op_errno)); + "post-operation lstat on parent %s failed: %s", + par_path, strerror (op_errno)); goto out; } op_ret = 0; out: - if (pathdup) - GF_FREE (pathdup); - SET_TO_OLD_FS_ID (); STACK_UNWIND_STRICT (symlink, frame, op_ret, op_errno, - (loc)?loc->inode:NULL, &stbuf, &preparent, &postparent); + (loc)?loc->inode:NULL, &stbuf, &preparent, + &postparent, NULL); if ((op_ret == -1) && (!was_present)) { unlink (real_path); @@ -1334,25 +1619,26 @@ out: int posix_rename (call_frame_t *frame, xlator_t *this, - loc_t *oldloc, loc_t *newloc) + loc_t *oldloc, loc_t *newloc, dict_t *xdata) { int32_t op_ret = -1; int32_t op_errno = 0; char *real_oldpath = NULL; char *real_newpath = NULL; + char *par_oldpath = NULL; + char *par_newpath = NULL; struct iatt stbuf = {0, }; struct posix_private *priv = NULL; char was_present = 1; - char *oldpathdup = NULL; - char *oldparentpath = NULL; - char *newpathdup = NULL; - char *newparentpath = NULL; struct iatt preoldparent = {0, }; struct iatt postoldparent = {0, }; struct iatt prenewparent = {0, }; struct iatt postnewparent = {0, }; char olddirid[64]; char newdirid[64]; + uuid_t victim = {0}; + int was_dir = 0; + int nlink = 0; DECLARE_OLD_FS_ID_VAR; @@ -1365,42 +1651,35 @@ posix_rename (call_frame_t *frame, xlator_t *this, VALIDATE_OR_GOTO (priv, out); SET_FS_ID (frame->root->uid, frame->root->gid); - MAKE_REAL_PATH (real_oldpath, this, oldloc->path); - MAKE_REAL_PATH (real_newpath, this, newloc->path); - - oldpathdup = gf_strdup (real_oldpath); - if (!oldpathdup) - goto out; + MAKE_ENTRY_HANDLE (real_oldpath, par_oldpath, this, oldloc, NULL); + MAKE_ENTRY_HANDLE (real_newpath, par_newpath, this, 
newloc, &stbuf); - oldparentpath = dirname (oldpathdup); - - op_ret = posix_lstat_with_gfid (this, oldparentpath, &preoldparent); + op_ret = posix_pstat (this, oldloc->pargfid, par_oldpath, &preoldparent); if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_ERROR, - "pre-operation lstat on parent of %s failed: %s", - oldloc->path, strerror (op_errno)); + "pre-operation lstat on parent %s failed: %s", + par_oldpath, strerror (op_errno)); goto out; } - newpathdup = gf_strdup (real_newpath); - if (!newpathdup) - goto out; - - newparentpath = dirname (newpathdup); - - op_ret = posix_lstat_with_gfid (this, newparentpath, &prenewparent); + op_ret = posix_pstat (this, newloc->pargfid, par_newpath, &prenewparent); if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_ERROR, "pre-operation lstat on parent of %s failed: %s", - newloc->path, strerror (op_errno)); + par_newpath, strerror (op_errno)); goto out; } - op_ret = posix_lstat_with_gfid (this, real_newpath, &stbuf); + op_ret = posix_pstat (this, NULL, real_newpath, &stbuf); if ((op_ret == -1) && (errno == ENOENT)){ was_present = 0; + } else { + uuid_copy (victim, stbuf.ia_gfid); + if (IA_ISDIR (stbuf.ia_type)) + was_dir = 1; + nlink = stbuf.ia_nlink; } if (was_present && IA_ISDIR(stbuf.ia_type) && !newloc->inode) { @@ -1424,17 +1703,32 @@ posix_rename (call_frame_t *frame, xlator_t *this, goto out; } + if (IA_ISDIR (oldloc->inode->ia_type)) { + posix_handle_unset (this, oldloc->inode->gfid, NULL); + } + op_ret = sys_rename (real_oldpath, real_newpath); if (op_ret == -1) { op_errno = errno; gf_log (this->name, (op_errno == ENOTEMPTY ? 
GF_LOG_DEBUG : GF_LOG_ERROR), "rename of %s to %s failed: %s", - oldloc->path, newloc->path, strerror (op_errno)); + real_oldpath, real_newpath, strerror (op_errno)); goto out; } - op_ret = posix_lstat_with_gfid (this, real_newpath, &stbuf); + if (was_dir) + posix_handle_unset (this, victim, NULL); + + if (was_present && !was_dir && nlink == 1) + posix_handle_unset (this, victim, NULL); + + if (IA_ISDIR (oldloc->inode->ia_type)) { + posix_handle_soft (this, real_newpath, newloc, + oldloc->inode->gfid, NULL); + } + + op_ret = posix_pstat (this, NULL, real_newpath, &stbuf); if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_ERROR, @@ -1443,39 +1737,33 @@ posix_rename (call_frame_t *frame, xlator_t *this, goto out; } - op_ret = posix_lstat_with_gfid (this, oldparentpath, &postoldparent); + op_ret = posix_pstat (this, oldloc->pargfid, par_oldpath, &postoldparent); if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_ERROR, - "post-operation lstat on parent of %s failed: %s", - oldloc->path, strerror (op_errno)); + "post-operation lstat on parent %s failed: %s", + par_oldpath, strerror (op_errno)); goto out; } - op_ret = posix_lstat_with_gfid (this, newparentpath, &postnewparent); + op_ret = posix_pstat (this, newloc->pargfid, par_newpath, &postnewparent); if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_ERROR, - "post-operation lstat on parent of %s failed: %s", - newloc->path, strerror (op_errno)); + "post-operation lstat on parent %s failed: %s", + par_newpath, strerror (op_errno)); goto out; } op_ret = 0; out: - if (oldpathdup) - GF_FREE (oldpathdup); - - if (newpathdup) - GF_FREE (newpathdup); - SET_TO_OLD_FS_ID (); STACK_UNWIND_STRICT (rename, frame, op_ret, op_errno, &stbuf, &preoldparent, &postoldparent, - &prenewparent, &postnewparent); + &prenewparent, &postnewparent, NULL); if ((op_ret == -1) && !was_present) { unlink (real_newpath); @@ -1487,17 +1775,16 @@ out: int posix_link (call_frame_t *frame, xlator_t *this, - 
loc_t *oldloc, loc_t *newloc) + loc_t *oldloc, loc_t *newloc, dict_t *xdata) { int32_t op_ret = -1; int32_t op_errno = 0; char *real_oldpath = 0; char *real_newpath = 0; + char *par_newpath = 0; struct iatt stbuf = {0, }; struct posix_private *priv = NULL; char was_present = 1; - char *newpathdup = NULL; - char *newparentpath = NULL; struct iatt preparent = {0,}; struct iatt postparent = {0,}; @@ -1512,26 +1799,18 @@ posix_link (call_frame_t *frame, xlator_t *this, VALIDATE_OR_GOTO (priv, out); SET_FS_ID (frame->root->uid, frame->root->gid); - MAKE_REAL_PATH (real_oldpath, this, oldloc->path); - MAKE_REAL_PATH (real_newpath, this, newloc->path); + MAKE_INODE_HANDLE (real_oldpath, this, oldloc, &stbuf); - op_ret = posix_lstat_with_gfid (this, real_newpath, &stbuf); + MAKE_ENTRY_HANDLE (real_newpath, par_newpath, this, newloc, &stbuf); if ((op_ret == -1) && (errno == ENOENT)) { was_present = 0; } - newpathdup = gf_strdup (real_newpath); - if (!newpathdup) { - op_errno = ENOMEM; - goto out; - } - - newparentpath = dirname (newpathdup); - op_ret = posix_lstat_with_gfid (this, newparentpath, &preparent); + op_ret = posix_pstat (this, newloc->pargfid, par_newpath, &preparent); if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_ERROR, "lstat failed: %s: %s", - newparentpath, strerror (op_errno)); + par_newpath, strerror (op_errno)); goto out; } @@ -1539,7 +1818,7 @@ posix_link (call_frame_t *frame, xlator_t *this, /* * On most systems (Linux being the notable exception), link(2) * first resolves symlinks. If the target is a directory or - * is nonexistent, it will fail. linkat(2) operates on the + * is nonexistent, it will fail. linkat(2) operates on the * symlink instead of its target when the AT_SYMLINK_FOLLOW * flag is not supplied. 
*/ @@ -1551,11 +1830,11 @@ posix_link (call_frame_t *frame, xlator_t *this, op_errno = errno; gf_log (this->name, GF_LOG_ERROR, "link %s to %s failed: %s", - oldloc->path, newloc->path, strerror (op_errno)); + real_oldpath, real_newpath, strerror (op_errno)); goto out; } - op_ret = posix_lstat_with_gfid (this, real_newpath, &stbuf); + op_ret = posix_pstat (this, NULL, real_newpath, &stbuf); if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_ERROR, @@ -1564,24 +1843,22 @@ posix_link (call_frame_t *frame, xlator_t *this, goto out; } - op_ret = posix_lstat_with_gfid (this, newparentpath, &postparent); + op_ret = posix_pstat (this, newloc->pargfid, par_newpath, &postparent); if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_ERROR, "lstat failed: %s: %s", - newparentpath, strerror (op_errno)); + par_newpath, strerror (op_errno)); goto out; } op_ret = 0; out: - if (newpathdup) - GF_FREE (newpathdup); SET_TO_OLD_FS_ID (); STACK_UNWIND_STRICT (link, frame, op_ret, op_errno, (oldloc)?oldloc->inode:NULL, &stbuf, &preparent, - &postparent); + &postparent, NULL); if ((op_ret == -1) && (!was_present)) { unlink (real_newpath); @@ -1592,7 +1869,8 @@ out: int32_t -posix_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset) +posix_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + dict_t *xdata) { int32_t op_ret = -1; int32_t op_errno = 0; @@ -1611,14 +1889,13 @@ posix_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset) VALIDATE_OR_GOTO (priv, out); SET_FS_ID (frame->root->uid, frame->root->gid); - MAKE_REAL_PATH (real_path, this, loc->path); - op_ret = posix_lstat_with_gfid (this, real_path, &prebuf); + MAKE_INODE_HANDLE (real_path, this, loc, &prebuf); if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_ERROR, "pre-operation lstat on %s failed: %s", - loc->path, strerror (op_errno)); + real_path, strerror (op_errno)); goto out; } @@ -1627,11 +1904,11 @@ posix_truncate 
(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset) op_errno = errno; gf_log (this->name, GF_LOG_ERROR, "truncate on %s failed: %s", - loc->path, strerror (op_errno)); + real_path, strerror (op_errno)); goto out; } - op_ret = posix_lstat_with_gfid (this, real_path, &postbuf); + op_ret = posix_pstat (this, loc->gfid, real_path, &postbuf); if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_ERROR, "lstat on %s failed: %s", @@ -1645,30 +1922,29 @@ out: SET_TO_OLD_FS_ID (); STACK_UNWIND_STRICT (truncate, frame, op_ret, op_errno, - &prebuf, &postbuf); + &prebuf, &postbuf, NULL); return 0; } -int32_t +int posix_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, mode_t mode, - fd_t *fd, dict_t *params) + mode_t umask, fd_t *fd, dict_t *xdata) { int32_t op_ret = -1; int32_t op_errno = 0; int32_t _fd = -1; int _flags = 0; char * real_path = NULL; + char * par_path = NULL; struct iatt stbuf = {0, }; struct posix_fd * pfd = NULL; struct posix_private * priv = NULL; char was_present = 1; gid_t gid = 0; - char *pathdup = NULL; - char *parentpath = NULL; struct iatt preparent = {0,}; struct iatt postparent = {0,}; @@ -1683,33 +1959,25 @@ posix_create (call_frame_t *frame, xlator_t *this, priv = this->private; VALIDATE_OR_GOTO (priv, out); - MAKE_REAL_PATH (real_path, this, loc->path); + MAKE_ENTRY_HANDLE (real_path, par_path, this, loc, &stbuf); gid = frame->root->gid; - op_ret = setgid_override (this, real_path, &gid); - if (op_ret < 0) { - op_errno = -op_ret; - op_ret = -1; - goto out; - } - SET_FS_ID (frame->root->uid, gid); - pathdup = gf_strdup (real_path); - if (!pathdup) - goto out; - parentpath = dirname (pathdup); - - op_ret = posix_lstat_with_gfid (this, parentpath, &preparent); + op_ret = posix_pstat (this, loc->pargfid, par_path, &preparent); if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_ERROR, - "pre-operation lstat on parent of %s failed: %s", - loc->path, strerror (op_errno)); + "pre-operation lstat 
on parent %s failed: %s", + par_path, strerror (op_errno)); goto out; } + if (preparent.ia_prot.sgid) { + gid = preparent.ia_gid; + } + if (!flags) { _flags = O_CREAT | O_RDWR | O_EXCL; } @@ -1717,7 +1985,7 @@ posix_create (call_frame_t *frame, xlator_t *this, _flags = flags | O_CREAT; } - op_ret = posix_lstat_with_gfid (this, real_path, &stbuf); + op_ret = posix_pstat (this, NULL, real_path, &stbuf); if ((op_ret == -1) && (errno == ENOENT)) { was_present = 0; } @@ -1731,15 +1999,18 @@ posix_create (call_frame_t *frame, xlator_t *this, op_errno = errno; op_ret = -1; gf_log (this->name, GF_LOG_ERROR, - "open on %s failed: %s", loc->path, + "open on %s failed: %s", real_path, strerror (op_errno)); goto out; } - op_ret = posix_gfid_set (this, real_path, params); + if (was_present) + goto fill_stat; + + op_ret = posix_gfid_set (this, real_path, loc, xdata); if (op_ret) { gf_log (this->name, GF_LOG_ERROR, - "setting gfid on %s failed", loc->path); + "setting gfid on %s failed", real_path); } #ifndef HAVE_SET_FSID @@ -1752,21 +2023,22 @@ posix_create (call_frame_t *frame, xlator_t *this, } #endif - op_ret = posix_acl_xattr_set (this, real_path, params); + op_ret = posix_acl_xattr_set (this, real_path, xdata); if (op_ret) { gf_log (this->name, GF_LOG_ERROR, - "setting ACLs on %s failed (%s)", loc->path, + "setting ACLs on %s failed (%s)", real_path, strerror (errno)); } - op_ret = posix_entry_create_xattr_set (this, real_path, params); + op_ret = posix_entry_create_xattr_set (this, real_path, xdata); if (op_ret) { gf_log (this->name, GF_LOG_ERROR, - "setting xattrs on %s failed (%s)", loc->path, + "setting xattrs on %s failed (%s)", real_path, strerror (errno)); } - op_ret = posix_fstat_with_gfid (this, _fd, &stbuf); +fill_stat: + op_ret = posix_fdstat (this, _fd, &stbuf); if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_ERROR, @@ -1774,12 +2046,12 @@ posix_create (call_frame_t *frame, xlator_t *this, goto out; } - op_ret = posix_lstat_with_gfid (this, 
parentpath, &postparent); + op_ret = posix_pstat (this, loc->pargfid, par_path, &postparent); if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_ERROR, - "post-operation lstat on parent of %s failed: %s", - loc->path, strerror (op_errno)); + "post-operation lstat on parent %s failed: %s", + par_path, strerror (op_errno)); goto out; } @@ -1797,7 +2069,7 @@ posix_create (call_frame_t *frame, xlator_t *this, if (op_ret) gf_log (this->name, GF_LOG_WARNING, "failed to set the fd context path=%s fd=%p", - loc->path, fd); + real_path, fd); LOCK (&priv->lock); { @@ -1808,8 +2080,6 @@ posix_create (call_frame_t *frame, xlator_t *this, op_ret = 0; out: - if (pathdup) - GF_FREE (pathdup); SET_TO_OLD_FS_ID (); if ((-1 == op_ret) && (_fd != -1)) { @@ -1822,14 +2092,14 @@ out: STACK_UNWIND_STRICT (create, frame, op_ret, op_errno, fd, (loc)?loc->inode:NULL, &stbuf, &preparent, - &postparent); + &postparent, xdata); return 0; } int32_t posix_open (call_frame_t *frame, xlator_t *this, - loc_t *loc, int32_t flags, fd_t *fd, int wbflags) + loc_t *loc, int32_t flags, fd_t *fd, dict_t *xdata) { int32_t op_ret = -1; int32_t op_errno = 0; @@ -1837,7 +2107,6 @@ posix_open (call_frame_t *frame, xlator_t *this, int32_t _fd = -1; struct posix_fd *pfd = NULL; struct posix_private *priv = NULL; - gid_t gid = 0; struct iatt stbuf = {0, }; DECLARE_OLD_FS_ID_VAR; @@ -1851,22 +2120,14 @@ posix_open (call_frame_t *frame, xlator_t *this, priv = this->private; VALIDATE_OR_GOTO (priv, out); - MAKE_REAL_PATH (real_path, this, loc->path); + MAKE_INODE_HANDLE (real_path, this, loc, &stbuf); - op_ret = setgid_override (this, real_path, &gid); - if (op_ret < 0) { - op_errno = -op_ret; - op_ret = -1; - goto out; - } - - SET_FS_ID (frame->root->uid, gid); + op_ret = -1; + SET_FS_ID (frame->root->uid, frame->root->gid); if (priv->o_direct) flags |= O_DIRECT; - op_ret = posix_lstat_with_gfid (this, real_path, &stbuf); - _fd = open (real_path, flags, 0); if (_fd == -1) { op_ret = -1; @@ -1884,37 
+2145,12 @@ posix_open (call_frame_t *frame, xlator_t *this, pfd->flags = flags; pfd->fd = _fd; - if (wbflags == GF_OPEN_FSYNC) - pfd->flushwrites = 1; op_ret = fd_ctx_set (fd, this, (uint64_t)(long)pfd); if (op_ret) gf_log (this->name, GF_LOG_WARNING, "failed to set the fd context path=%s fd=%p", - loc->path, fd); - -#ifndef HAVE_SET_FSID - if (flags & O_CREAT) { - op_ret = chown (real_path, frame->root->uid, gid); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "chown on %s failed: %s", - real_path, strerror (op_errno)); - goto out; - } - } -#endif - - if (flags & O_CREAT) { - op_ret = posix_lstat_with_gfid (this, real_path, &stbuf); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, "lstat on (%s) " - "failed: %s", real_path, strerror (op_errno)); - goto out; - } - } + real_path, fd); LOCK (&priv->lock); { @@ -1933,19 +2169,15 @@ out: SET_TO_OLD_FS_ID (); - STACK_UNWIND_STRICT (open, frame, op_ret, op_errno, fd); + STACK_UNWIND_STRICT (open, frame, op_ret, op_errno, fd, NULL); return 0; } -#define ALIGN_BUF(ptr,bound) ((void *)((unsigned long)(ptr + bound - 1) & \ - (unsigned long)(~(bound - 1)))) - int posix_readv (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t offset) + fd_t *fd, size_t size, off_t offset, uint32_t flags, dict_t *xdata) { - uint64_t tmp_pfd = 0; int32_t op_ret = -1; int32_t op_errno = 0; int _fd = -1; @@ -1965,14 +2197,13 @@ posix_readv (call_frame_t *frame, xlator_t *this, priv = this->private; VALIDATE_OR_GOTO (priv, out); - ret = fd_ctx_get (fd, this, &tmp_pfd); + ret = posix_fd_ctx_get (fd, this, &pfd); if (ret < 0) { op_errno = -ret; gf_log (this->name, GF_LOG_WARNING, "pfd is NULL from fd=%p", fd); goto out; } - pfd = (struct posix_fd *)(long)tmp_pfd; if (!size) { op_errno = EINVAL; @@ -2014,7 +2245,7 @@ posix_readv (call_frame_t *frame, xlator_t *this, * we read from */ - op_ret = posix_fstat_with_gfid (this, _fd, &stbuf); + op_ret = posix_fdstat (this, _fd, 
&stbuf); if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_ERROR, @@ -2024,18 +2255,14 @@ posix_readv (call_frame_t *frame, xlator_t *this, } /* Hack to notify higher layers of EOF. */ - if (stbuf.ia_size == 0) - op_errno = ENOENT; - else if ((offset + vec.iov_len) == stbuf.ia_size) - op_errno = ENOENT; - else if (offset > stbuf.ia_size) + if (!stbuf.ia_size || (offset + vec.iov_len) >= stbuf.ia_size) op_errno = ENOENT; op_ret = vec.iov_len; out: STACK_UNWIND_STRICT (readv, frame, op_ret, op_errno, - &vec, 1, &stbuf, iobref); + &vec, 1, &stbuf, iobref, NULL); if (iobref) iobref_unref (iobref); @@ -2073,14 +2300,12 @@ err: return op_ret; } - int32_t __posix_writev (int fd, struct iovec *vector, int count, off_t startoff, int odirect) { int32_t op_ret = 0; int idx = 0; - int align = 4096; int max_buf_size = 0; int retval = 0; char *buf = NULL; @@ -2096,7 +2321,7 @@ __posix_writev (int fd, struct iovec *vector, int count, off_t startoff, max_buf_size = vector[idx].iov_len; } - alloc_buf = GF_MALLOC (1 * (max_buf_size + align), gf_posix_mt_char); + alloc_buf = _page_aligned_alloc (max_buf_size, &buf); if (!alloc_buf) { op_ret = -errno; goto err; @@ -2104,9 +2329,6 @@ __posix_writev (int fd, struct iovec *vector, int count, off_t startoff, internal_off = startoff; for (idx = 0; idx < count; idx++) { - /* page aligned buffer */ - buf = ALIGN_BUF (alloc_buf, align); - memcpy (buf, vector[idx].iov_base, vector[idx].iov_len); /* not sure whether writev works on O_DIRECT'd fd */ @@ -2121,17 +2343,58 @@ __posix_writev (int fd, struct iovec *vector, int count, off_t startoff, } err: - if (alloc_buf) - GF_FREE (alloc_buf); + GF_FREE (alloc_buf); return op_ret; } +dict_t* +_fill_writev_xdata (fd_t *fd, dict_t *xdata, xlator_t *this, int is_append) +{ + dict_t *rsp_xdata = NULL; + int32_t ret = 0; + inode_t *inode = NULL; + + if (fd) + inode = fd->inode; + + if (!fd || !fd->inode || uuid_is_null (fd->inode->gfid)) { + gf_log_callingfn (this->name, GF_LOG_ERROR, 
"Invalid Args: " + "fd: %p inode: %p gfid:%s", fd, inode?inode:0, + inode?uuid_utoa(inode->gfid):"N/A"); + goto out; + } + + if (!xdata || !dict_get (xdata, GLUSTERFS_OPEN_FD_COUNT)) + goto out; + + rsp_xdata = dict_new(); + if (!rsp_xdata) + goto out; + + ret = dict_set_uint32 (rsp_xdata, GLUSTERFS_OPEN_FD_COUNT, + fd->inode->fd_count); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, "%s: Failed to set " + "dictionary value for %s", uuid_utoa (fd->inode->gfid), + GLUSTERFS_OPEN_FD_COUNT); + } + + ret = dict_set_uint32 (rsp_xdata, GLUSTERFS_WRITE_IS_APPEND, + is_append); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, "%s: Failed to set " + "dictionary value for %s", uuid_utoa (fd->inode->gfid), + GLUSTERFS_WRITE_IS_APPEND); + } +out: + return rsp_xdata; +} int32_t -posix_writev (call_frame_t *frame, xlator_t *this, - fd_t *fd, struct iovec *vector, int32_t count, off_t offset, - struct iobref *iobref) +posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *vector, int32_t count, off_t offset, + uint32_t flags, struct iobref *iobref, dict_t *xdata) { int32_t op_ret = -1; int32_t op_errno = 0; @@ -2141,8 +2404,9 @@ posix_writev (call_frame_t *frame, xlator_t *this, struct iatt preop = {0,}; struct iatt postop = {0,}; int ret = -1; - - uint64_t tmp_pfd = 0; + dict_t *rsp_xdata = NULL; + int is_append = 0; + gf_boolean_t locked = _gf_false; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); @@ -2154,18 +2418,28 @@ posix_writev (call_frame_t *frame, xlator_t *this, VALIDATE_OR_GOTO (priv, out); - ret = fd_ctx_get (fd, this, &tmp_pfd); + ret = posix_fd_ctx_get (fd, this, &pfd); if (ret < 0) { gf_log (this->name, GF_LOG_WARNING, "pfd is NULL from fd=%p", fd); op_errno = -ret; goto out; } - pfd = (struct posix_fd *)(long)tmp_pfd; _fd = pfd->fd; - op_ret = posix_fstat_with_gfid (this, _fd, &preop); + if (xdata && dict_get (xdata, GLUSTERFS_WRITE_IS_APPEND)) { + /* The write_is_append check and write must happen + atomically. 
Else another write can overtake this + write after the check and get written earlier. + + So lock before preop-stat and unlock after write. + */ + locked = _gf_true; + LOCK(&fd->inode->lock); + } + + op_ret = posix_fdstat (this, _fd, &preop); if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_ERROR, @@ -2174,8 +2448,19 @@ posix_writev (call_frame_t *frame, xlator_t *this, goto out; } + if (locked) { + if (preop.ia_size == offset || (fd->flags & O_APPEND)) + is_append = 1; + } + op_ret = __posix_writev (_fd, vector, count, offset, (pfd->flags & O_DIRECT)); + + if (locked) { + UNLOCK (&fd->inode->lock); + locked = _gf_false; + } + if (op_ret < 0) { op_errno = -op_ret; op_ret = -1; @@ -2191,17 +2476,24 @@ posix_writev (call_frame_t *frame, xlator_t *this, UNLOCK (&priv->lock); if (op_ret >= 0) { + rsp_xdata = _fill_writev_xdata (fd, xdata, this, is_append); /* wiretv successful, we also need to get the stat of * the file we wrote to */ - if (pfd->flushwrites) { - /* NOTE: ignore the error, if one occurs at this - * point */ - fsync (_fd); + if (flags & (O_SYNC|O_DSYNC)) { + ret = fsync (_fd); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "fsync() in writev on fd %d failed: %s", + _fd, strerror (errno)); + op_ret = -1; + op_errno = errno; + goto out; + } } - ret = posix_fstat_with_gfid (this, _fd, &postop); + ret = posix_fdstat (this, _fd, &postop); if (ret == -1) { op_ret = -1; op_errno = errno; @@ -2214,15 +2506,23 @@ posix_writev (call_frame_t *frame, xlator_t *this, out: - STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, &preop, &postop); + if (locked) { + UNLOCK (&fd->inode->lock); + locked = _gf_false; + } + + STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, &preop, &postop, + rsp_xdata); + if (rsp_xdata) + dict_unref (rsp_xdata); return 0; } int32_t posix_statfs (call_frame_t *frame, xlator_t *this, - loc_t *loc) + loc_t *loc, dict_t *xdata) { char * real_path = NULL; int32_t op_ret = -1; @@ -2235,7 +2535,7 @@ posix_statfs 
(call_frame_t *frame, xlator_t *this, VALIDATE_OR_GOTO (loc, out); VALIDATE_OR_GOTO (this->private, out); - MAKE_REAL_PATH (real_path, this, loc->path); + MAKE_INODE_HANDLE (real_path, this, loc, NULL); priv = this->private; @@ -2261,25 +2561,25 @@ posix_statfs (call_frame_t *frame, xlator_t *this, op_ret = 0; out: - STACK_UNWIND_STRICT (statfs, frame, op_ret, op_errno, &buf); + STACK_UNWIND_STRICT (statfs, frame, op_ret, op_errno, &buf, NULL); return 0; } int32_t posix_flush (call_frame_t *frame, xlator_t *this, - fd_t *fd) + fd_t *fd, dict_t *xdata) { int32_t op_ret = -1; int32_t op_errno = 0; int ret = -1; - uint64_t tmp_pfd = 0; + struct posix_fd *pfd = NULL; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); VALIDATE_OR_GOTO (fd, out); - ret = fd_ctx_get (fd, this, &tmp_pfd); + ret = posix_fd_ctx_get (fd, this, &pfd); if (ret < 0) { op_errno = -ret; gf_log (this->name, GF_LOG_WARNING, @@ -2290,15 +2590,14 @@ posix_flush (call_frame_t *frame, xlator_t *this, op_ret = 0; out: - STACK_UNWIND_STRICT (flush, frame, op_ret, op_errno); + STACK_UNWIND_STRICT (flush, frame, op_ret, op_errno, NULL); return 0; } int32_t -posix_release (xlator_t *this, - fd_t *fd) +posix_release (xlator_t *this, fd_t *fd) { struct posix_private * priv = NULL; struct posix_fd * pfd = NULL; @@ -2310,7 +2609,7 @@ posix_release (xlator_t *this, priv = this->private; - ret = fd_ctx_get (fd, this, &tmp_pfd); + ret = fd_ctx_del (fd, this, &tmp_pfd); if (ret < 0) { gf_log (this->name, GF_LOG_WARNING, "pfd is NULL from fd=%p", fd); @@ -2343,18 +2642,45 @@ out: } +int +posix_batch_fsync (call_frame_t *frame, xlator_t *this, + fd_t *fd, int datasync, dict_t *xdata) +{ + call_stub_t *stub = NULL; + struct posix_private *priv = NULL; + + priv = this->private; + + stub = fop_fsync_stub (frame, default_fsync, fd, datasync, xdata); + if (!stub) { + STACK_UNWIND_STRICT (fsync, frame, -1, ENOMEM, 0, 0, 0); + return 0; + } + + pthread_mutex_lock (&priv->fsync_mutex); + { + list_add_tail 
(&stub->list, &priv->fsyncs); + priv->fsync_queue_count++; + pthread_cond_signal (&priv->fsync_cond); + } + pthread_mutex_unlock (&priv->fsync_mutex); + + return 0; +} + + int32_t posix_fsync (call_frame_t *frame, xlator_t *this, - fd_t *fd, int32_t datasync) + fd_t *fd, int32_t datasync, dict_t *xdata) { int32_t op_ret = -1; int32_t op_errno = 0; int _fd = -1; struct posix_fd * pfd = NULL; int ret = -1; - uint64_t tmp_pfd = 0; struct iatt preop = {0,}; struct iatt postop = {0,}; + struct posix_private *priv = NULL; DECLARE_OLD_FS_ID_VAR; @@ -2370,18 +2696,23 @@ posix_fsync (call_frame_t *frame, xlator_t *this, goto out; #endif - ret = fd_ctx_get (fd, this, &tmp_pfd); + priv = this->private; + if (priv->batch_fsync_mode && xdata && dict_get (xdata, "batch-fsync")) { + posix_batch_fsync (frame, this, fd, datasync, xdata); + return 0; + } + + ret = posix_fd_ctx_get (fd, this, &pfd); if (ret < 0) { op_errno = -ret; gf_log (this->name, GF_LOG_WARNING, "pfd not found in fd's ctx"); goto out; } - pfd = (struct posix_fd *)(long)tmp_pfd; _fd = pfd->fd; - op_ret = posix_fstat_with_gfid (this, _fd, &preop); + op_ret = posix_fdstat (this, _fd, &preop); if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_WARNING, @@ -2411,7 +2742,7 @@ posix_fsync (call_frame_t *frame, xlator_t *this, } } - op_ret = posix_fstat_with_gfid (this, _fd, &postop); + op_ret = posix_fdstat (this, _fd, &postop); if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_WARNING, @@ -2425,22 +2756,34 @@ posix_fsync (call_frame_t *frame, xlator_t *this, out: SET_TO_OLD_FS_ID (); - STACK_UNWIND_STRICT (fsync, frame, op_ret, op_errno, &preop, &postop); + STACK_UNWIND_STRICT (fsync, frame, op_ret, op_errno, &preop, &postop, + NULL); return 0; } static int gf_posix_xattr_enotsup_log; +static int +_handle_setxattr_keyvalue_pair (dict_t *d, char *k, data_t *v, + void *tmp) +{ + posix_xattr_filler_t *filler = NULL; + + filler = tmp; + + return posix_handle_pair (filler->this, 
filler->real_path, k, v, + filler->flags); +} int32_t posix_setxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *dict, int flags) + loc_t *loc, dict_t *dict, int flags, dict_t *xdata) { int32_t op_ret = -1; int32_t op_errno = 0; char * real_path = NULL; - data_pair_t * trav = NULL; - int ret = -1; + + posix_xattr_filler_t filler = {0,}; DECLARE_OLD_FS_ID_VAR; SET_FS_ID (frame->root->uid, frame->root->gid); @@ -2450,31 +2793,74 @@ posix_setxattr (call_frame_t *frame, xlator_t *this, VALIDATE_OR_GOTO (loc, out); VALIDATE_OR_GOTO (dict, out); - MAKE_REAL_PATH (real_path, this, loc->path); + MAKE_INODE_HANDLE (real_path, this, loc, NULL); + op_ret = -1; dict_del (dict, GFID_XATTR_KEY); - trav = dict->members_list; - - while (trav) { - ret = posix_handle_pair (this, real_path, trav, flags); - if (ret < 0) { - op_errno = -ret; - goto out; - } - trav = trav->next; - } - - op_ret = 0; + filler.real_path = real_path; + filler.this = this; + filler.flags = flags; + op_ret = dict_foreach (dict, _handle_setxattr_keyvalue_pair, + &filler); + if (op_ret < 0) + op_errno = -op_ret; out: SET_TO_OLD_FS_ID (); - STACK_UNWIND_STRICT (setxattr, frame, op_ret, op_errno); + STACK_UNWIND_STRICT (setxattr, frame, op_ret, op_errno, NULL); return 0; } + +int +posix_xattr_get_real_filename (call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *key, dict_t *dict, dict_t *xdata) +{ + char *real_path = NULL; + struct dirent *dirent = NULL; + DIR *fd = NULL; + const char *fname = NULL; + char *found = NULL; + int ret = -1; + int op_ret = -1; + + MAKE_INODE_HANDLE (real_path, this, loc, NULL); + + fd = opendir (real_path); + if (!fd) + return -errno; + + fname = key + strlen (GF_XATTR_GET_REAL_FILENAME_KEY); + + while ((dirent = readdir (fd))) { + if (strcasecmp (dirent->d_name, fname) == 0) { + found = gf_strdup (dirent->d_name); + if (!found) { + closedir (fd); + return -ENOMEM; + } + break; + } + } + + closedir (fd); + + if (!found) + return -ENOENT; + + ret = 
dict_set_dynstr (dict, (char *)key, found); + if (ret) { + GF_FREE (found); + return -ENOMEM; + } + ret = strlen (found) + 1; + + return ret; +} + /** * posix_getxattr - this function returns a dictionary with all the * key:value pair present as xattr. used for @@ -2482,22 +2868,25 @@ out: */ int32_t posix_getxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, const char *name) + loc_t *loc, const char *name, dict_t *xdata) { - struct posix_private *priv = NULL; - int32_t op_ret = -1; - int32_t op_errno = 0; - int32_t list_offset = 0; - size_t size = 0; - size_t remaining_size = 0; - char key[4096] = {0,}; - char host_buf[1024] = {0,}; - char * value = NULL; - char * list = NULL; - char * real_path = NULL; - dict_t * dict = NULL; - char * file_contents = NULL; - int ret = -1; + struct posix_private *priv = NULL; + int32_t op_ret = -1; + int32_t op_errno = 0; + int32_t list_offset = 0; + ssize_t size = 0; + size_t remaining_size = 0; + char key[4096] = {0,}; + char host_buf[1024] = {0,}; + char *value = NULL; + char *list = NULL; + char *real_path = NULL; + dict_t *dict = NULL; + char *file_contents = NULL; + int ret = -1; + char *path = NULL; + char *rpath = NULL; + char *dyn_rpath = NULL; DECLARE_OLD_FS_ID_VAR; @@ -2506,13 +2895,14 @@ posix_getxattr (call_frame_t *frame, xlator_t *this, VALIDATE_OR_GOTO (loc, out); SET_FS_ID (frame->root->uid, frame->root->gid); - MAKE_REAL_PATH (real_path, this, loc->path); + MAKE_INODE_HANDLE (real_path, this, loc, NULL); + op_ret = -1; priv = this->private; if (loc->inode && IA_ISDIR(loc->inode->ia_type) && name && ZR_FILE_CONTENT_REQUEST(name)) { - ret = posix_get_file_contents (this, real_path, name, + ret = posix_get_file_contents (this, loc->gfid, &name[15], &file_contents); if (ret < 0) { op_errno = -ret; @@ -2523,12 +2913,31 @@ posix_getxattr (call_frame_t *frame, xlator_t *this, } } - /* Get the total size */ - dict = get_new_dict (); + dict = dict_new (); if (!dict) { + op_errno = ENOMEM; goto out; } + if 
(loc->inode && name && + (strncmp (name, GF_XATTR_GET_REAL_FILENAME_KEY, + strlen (GF_XATTR_GET_REAL_FILENAME_KEY)) == 0)) { + ret = posix_xattr_get_real_filename (frame, this, loc, + name, dict, xdata); + if (ret < 0) { + op_ret = -1; + op_errno = -ret; + gf_log (this->name, (op_errno == ENOENT) ? + GF_LOG_DEBUG : GF_LOG_WARNING, + "Failed to get real filename (%s, %s): %s", + loc->path, name, strerror (op_errno)); + goto out; + } + + size = ret; + goto done; + } + if (loc->inode && name && !strcmp (name, GLUSTERFS_OPEN_FD_COUNT)) { if (!list_empty (&loc->inode->fd_list)) { ret = dict_set_uint32 (dict, (char *)name, 1); @@ -2547,15 +2956,76 @@ posix_getxattr (call_frame_t *frame, xlator_t *this, } if (loc->inode && name && (strcmp (name, GF_XATTR_PATHINFO_KEY) == 0)) { - snprintf (host_buf, 1024, "<POSIX:%s:%s>", priv->hostname, - real_path); - size = strlen (host_buf) + 1; - ret = dict_set_str (dict, GF_XATTR_PATHINFO_KEY, - host_buf); + if (LOC_HAS_ABSPATH (loc)) + MAKE_REAL_PATH (rpath, this, loc->path); + else + rpath = real_path; + + (void) snprintf (host_buf, 1024, + "<POSIX(%s):%s:%s>", priv->base_path, + ((priv->node_uuid_pathinfo + && !uuid_is_null(priv->glusterd_uuid)) + ? 
uuid_utoa (priv->glusterd_uuid) + : priv->hostname), + rpath); + + dyn_rpath = gf_strdup (host_buf); + if (!dyn_rpath) { + ret = -1; + goto done; + } + size = strlen (dyn_rpath) + 1; + ret = dict_set_dynstr (dict, GF_XATTR_PATHINFO_KEY, + dyn_rpath); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "could not set value (%s) in dictionary", + dyn_rpath); + GF_FREE (dyn_rpath); + } + + goto done; + } + + if (loc->inode && name && + (strcmp (name, GF_XATTR_NODE_UUID_KEY) == 0) + && !uuid_is_null (priv->glusterd_uuid)) { + (void) snprintf (host_buf, 1024, "%s", + uuid_utoa (priv->glusterd_uuid)); + + dyn_rpath = gf_strdup (host_buf); + if (!dyn_rpath) { + ret = -1; + goto done; + } + + size = strlen (dyn_rpath) + 1; + ret = dict_set_dynstr (dict, GF_XATTR_NODE_UUID_KEY, + dyn_rpath); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "could not set value (%s) in dictionary", + dyn_rpath); + GF_FREE (dyn_rpath); + } + goto done; + } + + if (loc->inode && name && + (strcmp (name, GFID_TO_PATH_KEY) == 0)) { + ret = inode_path (loc->inode, NULL, &path); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, "%s: could not get " + "inode path", uuid_utoa (loc->inode->gfid)); + goto done; + } + + ret = dict_set_dynstr (dict, GFID_TO_PATH_KEY, path); if (ret < 0) { gf_log (this->name, GF_LOG_WARNING, "could not set value (%s) in dictionary", host_buf); + GF_FREE (path); } goto done; } @@ -2564,19 +3034,49 @@ posix_getxattr (call_frame_t *frame, xlator_t *this, strcpy (key, name); size = sys_lgetxattr (real_path, key, NULL, 0); + if (size <= 0) { + op_errno = errno; + if ((op_errno == ENOTSUP) || (op_errno == ENOSYS)) { + GF_LOG_OCCASIONALLY (gf_posix_xattr_enotsup_log, + this->name, GF_LOG_WARNING, + "Extended attributes not " + "supported (try remounting" + " brick with 'user_xattr' " + "flag)"); + } else if (op_errno == ENOATTR || + op_errno == ENODATA) { + gf_log (this->name, GF_LOG_DEBUG, + "No such attribute:%s for file %s", + key, real_path); + } else { + 
gf_log (this->name, GF_LOG_ERROR, + "getxattr failed on %s: %s (%s)", + real_path, key, strerror (op_errno)); + } + + goto done; + } value = GF_CALLOC (size + 1, sizeof(char), gf_posix_mt_char); if (!value) { op_ret = -1; goto out; } - op_ret = sys_lgetxattr (real_path, key, value, op_ret); - if (op_ret == -1) { + size = sys_lgetxattr (real_path, key, value, size); + if (size == -1) { + op_ret = -1; op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, "getxattr failed on " + "%s: key = %s (%s)", real_path, key, + strerror (op_errno)); + GF_FREE (value); goto out; } - value [op_ret] = '\0'; - op_ret = dict_set_dynptr (dict, key, value, op_ret); + value [size] = '\0'; + op_ret = dict_set_dynptr (dict, key, value, size); if (op_ret < 0) { + gf_log (this->name, GF_LOG_ERROR, "dict set operation " + "on %s for the key %s failed.", real_path, key); + GF_FREE (value); goto out; } @@ -2590,7 +3090,9 @@ posix_getxattr (call_frame_t *frame, xlator_t *this, GF_LOG_OCCASIONALLY (gf_posix_xattr_enotsup_log, this->name, GF_LOG_WARNING, "Extended attributes not " - "supported."); + "supported (try remounting" + " brick with 'user_xattr' " + "flag)"); } else { gf_log (this->name, GF_LOG_ERROR, @@ -2618,26 +3120,40 @@ posix_getxattr (call_frame_t *frame, xlator_t *this, break; strcpy (key, list + list_offset); - op_ret = sys_lgetxattr (real_path, key, NULL, 0); - if (op_ret == -1) + size = sys_lgetxattr (real_path, key, NULL, 0); + if (size == -1) { + op_ret = -1; + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, "getxattr failed on " + "%s: key = %s (%s)", real_path, key, + strerror (op_errno)); break; + } - value = GF_CALLOC (op_ret + 1, sizeof(char), + value = GF_CALLOC (size + 1, sizeof(char), gf_posix_mt_char); if (!value) { op_errno = errno; goto out; } - op_ret = sys_lgetxattr (real_path, key, value, op_ret); - if (op_ret == -1) { + size = sys_lgetxattr (real_path, key, value, size); + if (size == -1) { + op_ret = -1; op_errno = errno; + gf_log (this->name, 
GF_LOG_ERROR, "getxattr failed on " + "%s: key = %s (%s)", real_path, key, + strerror (op_errno)); + GF_FREE (value); break; } - value [op_ret] = '\0'; - op_ret = dict_set_dynptr (dict, key, value, op_ret); + value [size] = '\0'; + op_ret = dict_set_dynptr (dict, key, value, size); if (op_ret < 0) { + gf_log (this->name, GF_LOG_ERROR, "dict set operation " + "on %s for the key %s failed.", real_path, key); + GF_FREE (value); goto out; } @@ -2651,13 +3167,12 @@ done: if (dict) { dict_del (dict, GFID_XATTR_KEY); - dict_ref (dict); } out: SET_TO_OLD_FS_ID (); - STACK_UNWIND_STRICT (getxattr, frame, op_ret, op_errno, dict); + STACK_UNWIND_STRICT (getxattr, frame, op_ret, op_errno, dict, NULL); if (dict) dict_unref (dict); @@ -2668,15 +3183,14 @@ out: int32_t posix_fgetxattr (call_frame_t *frame, xlator_t *this, - fd_t *fd, const char *name) + fd_t *fd, const char *name, dict_t *xdata) { int32_t op_ret = -1; int32_t op_errno = ENOENT; - uint64_t tmp_pfd = 0; struct posix_fd * pfd = NULL; int _fd = -1; int32_t list_offset = 0; - size_t size = 0; + ssize_t size = 0; size_t remaining_size = 0; char key[4096] = {0,}; char * value = NULL; @@ -2692,14 +3206,13 @@ posix_fgetxattr (call_frame_t *frame, xlator_t *this, SET_FS_ID (frame->root->uid, frame->root->gid); - ret = fd_ctx_get (fd, this, &tmp_pfd); + ret = posix_fd_ctx_get (fd, this, &pfd); if (ret < 0) { op_errno = -ret; gf_log (this->name, GF_LOG_WARNING, "pfd is NULL from fd=%p", fd); goto out; } - pfd = (struct posix_fd *)(long)tmp_pfd; _fd = pfd->fd; @@ -2722,19 +3235,34 @@ posix_fgetxattr (call_frame_t *frame, xlator_t *this, strcpy (key, name); size = sys_fgetxattr (_fd, key, NULL, 0); + if (size <= 0) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, "fgetxattr failed on " + "key %s (%s)", key, strerror (op_errno)); + goto done; + } + value = GF_CALLOC (size + 1, sizeof(char), gf_posix_mt_char); if (!value) { op_ret = -1; goto out; } - op_ret = sys_fgetxattr (_fd, key, value, op_ret); - if (op_ret == -1) 
{ + size = sys_fgetxattr (_fd, key, value, size); + if (size == -1) { + op_ret = -1; op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, "fgetxattr failed on " + "fd %p for the key %s (%s)", fd, key, + strerror (op_errno)); + GF_FREE (value); goto out; } - value [op_ret] = '\0'; - op_ret = dict_set_dynptr (dict, key, value, op_ret); + value [size] = '\0'; + op_ret = dict_set_dynptr (dict, key, value, size); if (op_ret < 0) { + gf_log (this->name, GF_LOG_ERROR, "dict set operation " + "on key %s failed", key); + GF_FREE (value); goto out; } goto done; @@ -2747,7 +3275,8 @@ posix_fgetxattr (call_frame_t *frame, xlator_t *this, GF_LOG_OCCASIONALLY (gf_posix_xattr_enotsup_log, this->name, GF_LOG_WARNING, "Extended attributes not " - "supported."); + "supported (try remounting " + "brick with 'user_xattr' flag)"); } else { gf_log (this->name, GF_LOG_ERROR, @@ -2775,24 +3304,41 @@ posix_fgetxattr (call_frame_t *frame, xlator_t *this, break; strcpy (key, list + list_offset); - op_ret = sys_fgetxattr (_fd, key, NULL, 0); - if (op_ret == -1) + size = sys_fgetxattr (_fd, key, NULL, 0); + if (size == -1) { + op_ret = -1; + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, "fgetxattr failed on " + "fd %p for the key %s (%s)", fd, key, + strerror (op_errno)); break; + } - value = GF_CALLOC (op_ret + 1, sizeof(char), + value = GF_CALLOC (size + 1, sizeof(char), gf_posix_mt_char); if (!value) { + op_ret = -1; op_errno = errno; goto out; } - op_ret = sys_fgetxattr (_fd, key, value, op_ret); - if (op_ret == -1) + size = sys_fgetxattr (_fd, key, value, size); + if (size == -1) { + op_ret = -1; + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, "fgetxattr failed on " + "the fd %p for the key %s (%s)", fd, key, + strerror (op_errno)); + GF_FREE (value); break; + } - value [op_ret] = '\0'; - op_ret = dict_set_dynptr (dict, key, value, op_ret); + value [size] = '\0'; + op_ret = dict_set_dynptr (dict, key, value, size); if (op_ret) { + gf_log (this->name, GF_LOG_ERROR, "dict 
set operation " + "failed on key %s", key); + GF_FREE (value); goto out; } remaining_size -= strlen (key) + 1; @@ -2811,7 +3357,7 @@ done: out: SET_TO_OLD_FS_ID (); - STACK_UNWIND_STRICT (fgetxattr, frame, op_ret, op_errno, dict); + STACK_UNWIND_STRICT (fgetxattr, frame, op_ret, op_errno, dict, NULL); if (dict) dict_unref (dict); @@ -2819,18 +3365,29 @@ out: return 0; } +static int +_handle_fsetxattr_keyvalue_pair (dict_t *d, char *k, data_t *v, + void *tmp) +{ + posix_xattr_filler_t *filler = NULL; + + filler = tmp; + + return posix_fhandle_pair (filler->this, filler->fd, k, v, + filler->flags); +} int32_t posix_fsetxattr (call_frame_t *frame, xlator_t *this, - fd_t *fd, dict_t *dict, int flags) + fd_t *fd, dict_t *dict, int flags, dict_t *xdata) { int32_t op_ret = -1; int32_t op_errno = 0; struct posix_fd * pfd = NULL; - uint64_t tmp_pfd = 0; int _fd = -1; - data_pair_t * trav = NULL; - int ret = -1; + int ret = -1; + + posix_xattr_filler_t filler = {0,}; DECLARE_OLD_FS_ID_VAR; SET_FS_ID (frame->root->uid, frame->root->gid); @@ -2840,66 +3397,101 @@ posix_fsetxattr (call_frame_t *frame, xlator_t *this, VALIDATE_OR_GOTO (fd, out); VALIDATE_OR_GOTO (dict, out); - ret = fd_ctx_get (fd, this, &tmp_pfd); + ret = posix_fd_ctx_get (fd, this, &pfd); if (ret < 0) { op_errno = -ret; gf_log (this->name, GF_LOG_WARNING, "pfd is NULL from fd=%p", fd); goto out; } - pfd = (struct posix_fd *)(long)tmp_pfd; _fd = pfd->fd; dict_del (dict, GFID_XATTR_KEY); - trav = dict->members_list; - - while (trav) { - ret = posix_fhandle_pair (this, _fd, trav, flags); - if (ret < 0) { - op_errno = -ret; - goto out; - } - trav = trav->next; - } - - op_ret = 0; + filler.fd = _fd; + filler.this = this; + filler.flags = flags; + op_ret = dict_foreach (dict, _handle_fsetxattr_keyvalue_pair, + &filler); + if (op_ret < 0) + op_errno = -op_ret; out: SET_TO_OLD_FS_ID (); - STACK_UNWIND_STRICT (fsetxattr, frame, op_ret, op_errno); + STACK_UNWIND_STRICT (fsetxattr, frame, op_ret, op_errno, NULL); return 
0; } +int +_posix_remove_xattr (dict_t *dict, char *key, data_t *value, void *data) +{ + int32_t op_ret = 0; + xlator_t *this = NULL; + posix_xattr_filler_t *filler = NULL; + + filler = (posix_xattr_filler_t *) data; + this = filler->this; + + op_ret = sys_lremovexattr (filler->real_path, key); + if (op_ret == -1) { + filler->op_errno = errno; + if (errno != ENOATTR && errno != EPERM) + gf_log (this->name, GF_LOG_ERROR, + "removexattr failed on %s (for %s): %s", + filler->real_path, key, strerror (errno)); + } + + return op_ret; +} + int32_t posix_removexattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, const char *name) + loc_t *loc, const char *name, dict_t *xdata) { int32_t op_ret = -1; int32_t op_errno = 0; char * real_path = NULL; + posix_xattr_filler_t filler = {0,}; DECLARE_OLD_FS_ID_VAR; + MAKE_INODE_HANDLE (real_path, this, loc, NULL); + if (!strcmp (GFID_XATTR_KEY, name)) { gf_log (this->name, GF_LOG_WARNING, "Remove xattr called" - " on gfid for file %s", loc->path); + " on gfid for file %s", real_path); + op_ret = -1; goto out; } - MAKE_REAL_PATH (real_path, this, loc->path); SET_FS_ID (frame->root->uid, frame->root->gid); + /** + * sending an empty key name with xdata containing the + * list of key(s) to be removed implies "bulk remove request" + * for removexattr. 
+ */ + if (name && (strcmp (name, "") == 0) && xdata) { + filler.real_path = real_path; + filler.this = this; + op_ret = dict_foreach (xdata, _posix_remove_xattr, &filler); + if (op_ret) { + op_errno = filler.op_errno; + } + + goto out; + } + op_ret = sys_lremovexattr (real_path, name); if (op_ret == -1) { op_errno = errno; if (op_errno != ENOATTR && op_errno != EPERM) gf_log (this->name, GF_LOG_ERROR, - "removexattr on %s (for %s): %s", loc->path, + "removexattr on %s (for %s): %s", real_path, name, strerror (op_errno)); goto out; } @@ -2909,25 +3501,75 @@ posix_removexattr (call_frame_t *frame, xlator_t *this, out: SET_TO_OLD_FS_ID (); - STACK_UNWIND_STRICT (removexattr, frame, op_ret, op_errno); + STACK_UNWIND_STRICT (removexattr, frame, op_ret, op_errno, NULL); + return 0; +} + +int32_t +posix_fremovexattr (call_frame_t *frame, xlator_t *this, + fd_t *fd, const char *name, dict_t *xdata) +{ + int32_t op_ret = -1; + int32_t op_errno = 0; + struct posix_fd * pfd = NULL; + int _fd = -1; + int ret = -1; + + DECLARE_OLD_FS_ID_VAR; + + if (!strcmp (GFID_XATTR_KEY, name)) { + gf_log (this->name, GF_LOG_WARNING, "Remove xattr called" + " on gfid for file"); + goto out; + } + + ret = posix_fd_ctx_get (fd, this, &pfd); + if (ret < 0) { + op_errno = -ret; + gf_log (this->name, GF_LOG_WARNING, + "pfd is NULL from fd=%p", fd); + goto out; + } + _fd = pfd->fd; + + + + SET_FS_ID (frame->root->uid, frame->root->gid); + + op_ret = sys_fremovexattr (_fd, name); + if (op_ret == -1) { + op_errno = errno; + if (op_errno != ENOATTR && op_errno != EPERM) + gf_log (this->name, GF_LOG_ERROR, + "fremovexattr (for %s): %s", + name, strerror (op_errno)); + goto out; + } + + op_ret = 0; + +out: + SET_TO_OLD_FS_ID (); + + STACK_UNWIND_STRICT (fremovexattr, frame, op_ret, op_errno, NULL); return 0; } int32_t posix_fsyncdir (call_frame_t *frame, xlator_t *this, - fd_t *fd, int datasync) + fd_t *fd, int datasync, dict_t *xdata) { int32_t op_ret = -1; int32_t op_errno = 0; int ret = -1; - 
uint64_t tmp_pfd = 0; + struct posix_fd *pfd = NULL; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); VALIDATE_OR_GOTO (fd, out); - ret = fd_ctx_get (fd, this, &tmp_pfd); + ret = posix_fd_ctx_get (fd, this, &pfd); if (ret < 0) { op_errno = -ret; gf_log (this->name, GF_LOG_WARNING, @@ -2938,7 +3580,7 @@ posix_fsyncdir (call_frame_t *frame, xlator_t *this, op_ret = 0; out: - STACK_UNWIND_STRICT (fsyncdir, frame, op_ret, op_errno); + STACK_UNWIND_STRICT (fsyncdir, frame, op_ret, op_errno, NULL); return 0; } @@ -2965,9 +3607,31 @@ posix_print_xattr (dict_t *this, static void __add_array (int32_t *dest, int32_t *src, int count) { + int i = 0; + int32_t destval = 0; + for (i = 0; i < count; i++) { + destval = ntoh32 (dest[i]); + if (destval == 0xffffffff) + continue; + dest[i] = hton32 (destval + ntoh32 (src[i])); + } +} + +static void +__or_array (int32_t *dest, int32_t *src, int count) +{ int i = 0; for (i = 0; i < count; i++) { - dest[i] = hton32 (ntoh32 (dest[i]) + ntoh32 (src[i])); + dest[i] = hton32 (ntoh32 (dest[i]) | ntoh32 (src[i])); + } +} + +static void +__and_array (int32_t *dest, int32_t *src, int count) +{ + int i = 0; + for (i = 0; i < count; i++) { + dest[i] = hton32 (ntoh32 (dest[i]) & ntoh32 (src[i])); } } @@ -2980,6 +3644,159 @@ __add_long_array (int64_t *dest, int64_t *src, int count) } } +static int +_posix_handle_xattr_keyvalue_pair (dict_t *d, char *k, data_t *v, + void *tmp) +{ + int size = 0; + int count = 0; + int op_ret = 0; + int op_errno = 0; + gf_xattrop_flags_t optype = 0; + char *array = NULL; + inode_t *inode = NULL; + xlator_t *this = NULL; + posix_xattr_filler_t *filler = NULL; + + filler = tmp; + + optype = (gf_xattrop_flags_t)(filler->flags); + this = filler->this; + inode = filler->inode; + + count = v->len; + array = GF_CALLOC (count, sizeof (char), gf_posix_mt_char); + + LOCK (&inode->lock); + { + if (filler->real_path) { + size = sys_lgetxattr (filler->real_path, k, + (char *)array, v->len); + } else { + size = 
sys_fgetxattr (filler->fd, k, (char *)array, + v->len); + } + + op_errno = errno; + if ((size == -1) && (op_errno != ENODATA) && + (op_errno != ENOATTR)) { + if (op_errno == ENOTSUP) { + GF_LOG_OCCASIONALLY(gf_posix_xattr_enotsup_log, + this->name, GF_LOG_WARNING, + "Extended attributes not " + "supported by filesystem"); + } else if (op_errno != ENOENT || + !posix_special_xattr (marker_xattrs, + k)) { + if (filler->real_path) + gf_log (this->name, GF_LOG_ERROR, + "getxattr failed on %s while doing " + "xattrop: Key:%s (%s)", + filler->real_path, + k, strerror (op_errno)); + else + gf_log (this->name, GF_LOG_ERROR, + "fgetxattr failed on fd=%d while doing " + "xattrop: Key:%s (%s)", + filler->fd, + k, strerror (op_errno)); + } + + op_ret = -1; + goto unlock; + } + + switch (optype) { + + case GF_XATTROP_ADD_ARRAY: + __add_array ((int32_t *) array, (int32_t *) v->data, + v->len / 4); + break; + + case GF_XATTROP_ADD_ARRAY64: + __add_long_array ((int64_t *) array, (int64_t *) v->data, + v->len / 8); + break; + + case GF_XATTROP_OR_ARRAY: + __or_array ((int32_t *) array, + (int32_t *) v->data, + v->len / 4); + break; + + case GF_XATTROP_AND_ARRAY: + __and_array ((int32_t *) array, + (int32_t *) v->data, + v->len / 4); + break; + + default: + gf_log (this->name, GF_LOG_ERROR, + "Unknown xattrop type (%d) on %s. 
Please send " + "a bug report to gluster-devel@nongnu.org", + optype, filler->real_path); + op_ret = -1; + op_errno = EINVAL; + goto unlock; + } + + if (filler->real_path) { + size = sys_lsetxattr (filler->real_path, k, array, + v->len, 0); + } else { + size = sys_fsetxattr (filler->fd, k, (char *)array, + v->len, 0); + } + } +unlock: + UNLOCK (&inode->lock); + + if (op_ret == -1) + goto out; + + op_errno = errno; + if (size == -1) { + if (filler->real_path) + gf_log (this->name, GF_LOG_ERROR, + "setxattr failed on %s while doing xattrop: " + "key=%s (%s)", filler->real_path, + k, strerror (op_errno)); + else + gf_log (this->name, GF_LOG_ERROR, + "fsetxattr failed on fd=%d while doing xattrop: " + "key=%s (%s)", filler->fd, + k, strerror (op_errno)); + + op_ret = -1; + goto out; + } else { + size = dict_set_bin (d, k, array, v->len); + + if (size != 0) { + if (filler->real_path) + gf_log (this->name, GF_LOG_DEBUG, + "dict_set_bin failed (path=%s): " + "key=%s (%s)", filler->real_path, + k, strerror (-size)); + else + gf_log (this->name, GF_LOG_DEBUG, + "dict_set_bin failed (fd=%d): " + "key=%s (%s)", filler->fd, + k, strerror (-size)); + + op_ret = -1; + op_errno = EINVAL; + goto out; + } + array = NULL; + } + + array = NULL; + +out: + return op_ret; +} + /** * xattrop - xattr operations - for internal use by GlusterFS * @optype: ADD_ARRAY: @@ -2988,190 +3805,61 @@ __add_long_array (int64_t *dest, int64_t *src, int count) */ int -do_xattrop (call_frame_t *frame, xlator_t *this, - loc_t *loc, fd_t *fd, gf_xattrop_flags_t optype, dict_t *xattr) +do_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, + gf_xattrop_flags_t optype, dict_t *xattr) { - char *real_path = NULL; - char *array = NULL; - int size = 0; - int count = 0; - - int op_ret = 0; - int op_errno = 0; - - int ret = 0; - int _fd = -1; - uint64_t tmp_pfd = 0; - struct posix_fd *pfd = NULL; - - data_pair_t *trav = NULL; - - char * path = NULL; - inode_t * inode = NULL; + int op_ret = 0; + int 
op_errno = 0; + int _fd = -1; + char *real_path = NULL; + struct posix_fd *pfd = NULL; + inode_t *inode = NULL; + posix_xattr_filler_t filler = {0,}; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (xattr, out); VALIDATE_OR_GOTO (this, out); - trav = xattr->members_list; - if (fd) { - ret = fd_ctx_get (fd, this, &tmp_pfd); - if (ret < 0) { + op_ret = posix_fd_ctx_get (fd, this, &pfd); + if (op_ret < 0) { gf_log (this->name, GF_LOG_WARNING, "failed to get pfd from fd=%p", fd); - op_ret = -1; op_errno = EBADFD; goto out; } - pfd = (struct posix_fd *)(long)tmp_pfd; _fd = pfd->fd; } - if (loc && loc->path) - MAKE_REAL_PATH (real_path, this, loc->path); + if (loc && !uuid_is_null (loc->gfid)) + MAKE_INODE_HANDLE (real_path, this, loc, NULL); - if (loc) { - path = gf_strdup (loc->path); + if (real_path) { inode = loc->inode; } else if (fd) { inode = fd->inode; } - while (trav && inode) { - count = trav->value->len; - array = GF_CALLOC (count, sizeof (char), - gf_posix_mt_char); - - LOCK (&inode->lock); - { - if (loc) { - size = sys_lgetxattr (real_path, trav->key, (char *)array, - trav->value->len); - } else { - size = sys_fgetxattr (_fd, trav->key, (char *)array, - trav->value->len); - } - - op_errno = errno; - if ((size == -1) && (op_errno != ENODATA) && - (op_errno != ENOATTR)) { - if (op_errno == ENOTSUP) { - GF_LOG_OCCASIONALLY(gf_posix_xattr_enotsup_log, - this->name,GF_LOG_WARNING, - "Extended attributes not " - "supported by filesystem"); - } else { - if (loc) - gf_log (this->name, GF_LOG_ERROR, - "getxattr failed on %s while doing " - "xattrop: Key:%s (%s)", path, - trav->key, strerror (op_errno)); - else - gf_log (this->name, GF_LOG_ERROR, - "fgetxattr failed on fd=%d while doing " - "xattrop: Key:%s (%s)", _fd, - trav->key, strerror (op_errno)); - } - - op_ret = -1; - goto unlock; - } - - switch (optype) { - - case GF_XATTROP_ADD_ARRAY: - __add_array ((int32_t *) array, (int32_t *) trav->value->data, - trav->value->len / 4); - break; - - case 
GF_XATTROP_ADD_ARRAY64: - __add_long_array ((int64_t *) array, (int64_t *) trav->value->data, - trav->value->len / 8); - break; - - default: - gf_log (this->name, GF_LOG_ERROR, - "Unknown xattrop type (%d) on %s. Please send " - "a bug report to gluster-devel@nongnu.org", - optype, path); - op_ret = -1; - op_errno = EINVAL; - goto unlock; - } - - if (loc) { - size = sys_lsetxattr (real_path, trav->key, array, - trav->value->len, 0); - } else { - size = sys_fsetxattr (_fd, trav->key, (char *)array, - trav->value->len, 0); - } - } - unlock: - UNLOCK (&inode->lock); + filler.this = this; + filler.fd = _fd; + filler.real_path = real_path; + filler.flags = (int)optype; + filler.inode = inode; - if (op_ret == -1) - goto out; - - op_errno = errno; - if (size == -1) { - if (loc) - gf_log (this->name, GF_LOG_ERROR, - "setxattr failed on %s while doing xattrop: " - "key=%s (%s)", path, - trav->key, strerror (op_errno)); - else - gf_log (this->name, GF_LOG_ERROR, - "fsetxattr failed on fd=%d while doing xattrop: " - "key=%s (%s)", _fd, - trav->key, strerror (op_errno)); - - op_ret = -1; - goto out; - } else { - size = dict_set_bin (xattr, trav->key, array, - trav->value->len); - - if (size != 0) { - if (loc) - gf_log (this->name, GF_LOG_DEBUG, - "dict_set_bin failed (path=%s): " - "key=%s (%s)", path, - trav->key, strerror (-size)); - else - gf_log (this->name, GF_LOG_DEBUG, - "dict_set_bin failed (fd=%d): " - "key=%s (%s)", _fd, - trav->key, strerror (-size)); - - op_ret = -1; - op_errno = EINVAL; - goto out; - } - array = NULL; - } - - array = NULL; - trav = trav->next; - } + op_ret = dict_foreach (xattr, _posix_handle_xattr_keyvalue_pair, + &filler); out: - if (array) - GF_FREE (array); - if (path) - GF_FREE (path); - - STACK_UNWIND_STRICT (xattrop, frame, op_ret, op_errno, xattr); + STACK_UNWIND_STRICT (xattrop, frame, op_ret, op_errno, xattr, NULL); return 0; } int posix_xattrop (call_frame_t *frame, xlator_t *this, - loc_t *loc, gf_xattrop_flags_t optype, dict_t *xattr) 
+ loc_t *loc, gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata) { do_xattrop (frame, this, loc, NULL, optype, xattr); return 0; @@ -3180,7 +3868,7 @@ posix_xattrop (call_frame_t *frame, xlator_t *this, int posix_fxattrop (call_frame_t *frame, xlator_t *this, - fd_t *fd, gf_xattrop_flags_t optype, dict_t *xattr) + fd_t *fd, gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata) { do_xattrop (frame, this, NULL, fd, optype, xattr); return 0; @@ -3189,7 +3877,7 @@ posix_fxattrop (call_frame_t *frame, xlator_t *this, int posix_access (call_frame_t *frame, xlator_t *this, - loc_t *loc, int32_t mask) + loc_t *loc, int32_t mask, dict_t *xdata) { int32_t op_ret = -1; int32_t op_errno = 0; @@ -3202,13 +3890,13 @@ posix_access (call_frame_t *frame, xlator_t *this, VALIDATE_OR_GOTO (this, out); VALIDATE_OR_GOTO (loc, out); - MAKE_REAL_PATH (real_path, this, loc->path); + MAKE_INODE_HANDLE (real_path, this, loc, NULL); op_ret = access (real_path, mask & 07); if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_ERROR, "access failed on %s: %s", - loc->path, strerror (op_errno)); + real_path, strerror (op_errno)); goto out; } op_ret = 0; @@ -3216,14 +3904,14 @@ posix_access (call_frame_t *frame, xlator_t *this, out: SET_TO_OLD_FS_ID (); - STACK_UNWIND_STRICT (access, frame, op_ret, op_errno); + STACK_UNWIND_STRICT (access, frame, op_ret, op_errno, NULL); return 0; } int32_t posix_ftruncate (call_frame_t *frame, xlator_t *this, - fd_t *fd, off_t offset) + fd_t *fd, off_t offset, dict_t *xdata) { int32_t op_ret = -1; int32_t op_errno = 0; @@ -3232,7 +3920,6 @@ posix_ftruncate (call_frame_t *frame, xlator_t *this, struct iatt postop = {0,}; struct posix_fd *pfd = NULL; int ret = -1; - uint64_t tmp_pfd = 0; struct posix_private *priv = NULL; DECLARE_OLD_FS_ID_VAR; @@ -3245,18 +3932,17 @@ posix_ftruncate (call_frame_t *frame, xlator_t *this, priv = this->private; VALIDATE_OR_GOTO (priv, out); - ret = fd_ctx_get (fd, this, &tmp_pfd); + ret = posix_fd_ctx_get (fd, 
this, &pfd); if (ret < 0) { gf_log (this->name, GF_LOG_WARNING, "pfd is NULL, fd=%p", fd); op_errno = -ret; goto out; } - pfd = (struct posix_fd *)(long)tmp_pfd; _fd = pfd->fd; - op_ret = posix_fstat_with_gfid (this, _fd, &preop); + op_ret = posix_fdstat (this, _fd, &preop); if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_ERROR, @@ -3275,7 +3961,7 @@ posix_ftruncate (call_frame_t *frame, xlator_t *this, goto out; } - op_ret = posix_fstat_with_gfid (this, _fd, &postop); + op_ret = posix_fdstat (this, _fd, &postop); if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_ERROR, @@ -3289,7 +3975,8 @@ posix_ftruncate (call_frame_t *frame, xlator_t *this, out: SET_TO_OLD_FS_ID (); - STACK_UNWIND_STRICT (ftruncate, frame, op_ret, op_errno, &preop, &postop); + STACK_UNWIND_STRICT (ftruncate, frame, op_ret, op_errno, &preop, + &postop, NULL); return 0; } @@ -3297,14 +3984,13 @@ out: int32_t posix_fstat (call_frame_t *frame, xlator_t *this, - fd_t *fd) + fd_t *fd, dict_t *xdata) { int _fd = -1; int32_t op_ret = -1; int32_t op_errno = 0; struct iatt buf = {0,}; struct posix_fd *pfd = NULL; - uint64_t tmp_pfd = 0; int ret = -1; struct posix_private *priv = NULL; @@ -3318,18 +4004,17 @@ posix_fstat (call_frame_t *frame, xlator_t *this, priv = this->private; VALIDATE_OR_GOTO (priv, out); - ret = fd_ctx_get (fd, this, &tmp_pfd); + ret = posix_fd_ctx_get (fd, this, &pfd); if (ret < 0) { gf_log (this->name, GF_LOG_WARNING, "pfd is NULL, fd=%p", fd); op_errno = -ret; goto out; } - pfd = (struct posix_fd *)(long)tmp_pfd; _fd = pfd->fd; - op_ret = posix_fstat_with_gfid (this, _fd, &buf); + op_ret = posix_fdstat (this, _fd, &buf); if (op_ret == -1) { op_errno = errno; gf_log (this->name, GF_LOG_ERROR, "fstat failed on fd=%p: %s", @@ -3342,7 +4027,7 @@ posix_fstat (call_frame_t *frame, xlator_t *this, out: SET_TO_OLD_FS_ID (); - STACK_UNWIND_STRICT (fstat, frame, op_ret, op_errno, &buf); + STACK_UNWIND_STRICT (fstat, frame, op_ret, op_errno, &buf, NULL); 
return 0; } @@ -3350,7 +4035,7 @@ static int gf_posix_lk_log; int32_t posix_lk (call_frame_t *frame, xlator_t *this, - fd_t *fd, int32_t cmd, struct gf_flock *lock) + fd_t *fd, int32_t cmd, struct gf_flock *lock, dict_t *xdata) { struct gf_flock nullock = {0, }; @@ -3359,33 +4044,35 @@ posix_lk (call_frame_t *frame, xlator_t *this, "not loaded. You need to use it for proper " "functioning of your application."); - STACK_UNWIND_STRICT (lk, frame, -1, ENOSYS, &nullock); + STACK_UNWIND_STRICT (lk, frame, -1, ENOSYS, &nullock, NULL); return 0; } int32_t posix_inodelk (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, int32_t cmd, struct gf_flock *lock) + const char *volume, loc_t *loc, int32_t cmd, + struct gf_flock *lock, dict_t *xdata) { GF_LOG_OCCASIONALLY (gf_posix_lk_log, this->name, GF_LOG_CRITICAL, "\"features/locks\" translator is " "not loaded. You need to use it for proper " "functioning of your application."); - STACK_UNWIND_STRICT (inodelk, frame, -1, ENOSYS); + STACK_UNWIND_STRICT (inodelk, frame, -1, ENOSYS, NULL); return 0; } int32_t posix_finodelk (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, int32_t cmd, struct gf_flock *lock) + const char *volume, fd_t *fd, int32_t cmd, + struct gf_flock *lock, dict_t *xdata) { GF_LOG_OCCASIONALLY (gf_posix_lk_log, this->name, GF_LOG_CRITICAL, "\"features/locks\" translator is " "not loaded. You need to use it for proper " "functioning of your application."); - STACK_UNWIND_STRICT (finodelk, frame, -1, ENOSYS); + STACK_UNWIND_STRICT (finodelk, frame, -1, ENOSYS, NULL); return 0; } @@ -3393,45 +4080,56 @@ posix_finodelk (call_frame_t *frame, xlator_t *this, int32_t posix_entrylk (call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc, const char *basename, - entrylk_cmd cmd, entrylk_type type) + entrylk_cmd cmd, entrylk_type type, dict_t *xdata) { GF_LOG_OCCASIONALLY (gf_posix_lk_log, this->name, GF_LOG_CRITICAL, "\"features/locks\" translator is " "not loaded. 
You need to use it for proper " "functioning of your application."); - STACK_UNWIND_STRICT (entrylk, frame, -1, ENOSYS); + STACK_UNWIND_STRICT (entrylk, frame, -1, ENOSYS, NULL); return 0; } int32_t posix_fentrylk (call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, const char *basename, - entrylk_cmd cmd, entrylk_type type) + entrylk_cmd cmd, entrylk_type type, dict_t *xdata) { GF_LOG_OCCASIONALLY (gf_posix_lk_log, this->name, GF_LOG_CRITICAL, "\"features/locks\" translator is " "not loaded. You need to use it for proper " "functioning of your application."); - STACK_UNWIND_STRICT (fentrylk, frame, -1, ENOSYS); + STACK_UNWIND_STRICT (fentrylk, frame, -1, ENOSYS, NULL); return 0; } int -__posix_fill_readdir (DIR *dir, off_t off, size_t size, gf_dirent_t *entries, - const char *real_path, const char *base_path) +posix_fill_readdir (fd_t *fd, DIR *dir, off_t off, size_t size, + gf_dirent_t *entries, xlator_t *this, int32_t skip_dirs) { off_t in_case = -1; size_t filled = 0; - int ret = 0; int count = 0; + char entrybuf[sizeof(struct dirent) + 256 + 8]; struct dirent *entry = NULL; int32_t this_size = -1; gf_dirent_t *this_entry = NULL; - char hidden_path[PATH_MAX] = {0, }; - struct stat statbuf = {0, }; + uuid_t rootgfid = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1}; + struct stat stbuf = {0,}; + char *hpath = NULL; + int len = 0; + int ret = 0; + + if (skip_dirs) { + len = posix_handle_path (this, fd->inode->gfid, NULL, NULL, 0); + hpath = alloca (len + 256); /* NAME_MAX */ + posix_handle_path (this, fd->inode->gfid, NULL, hpath, len); + len = strlen (hpath); + hpath[len] = '/'; + } if (!off) { rewinddir (dir); @@ -3450,7 +4148,8 @@ __posix_fill_readdir (DIR *dir, off_t off, size_t size, gf_dirent_t *entries, } errno = 0; - entry = readdir (dir); + entry = NULL; + readdir_r (dir, (struct dirent *)entrybuf, &entry); if (!entry) { if (errno == EBADF) { @@ -3462,10 +4161,6 @@ __posix_fill_readdir (DIR *dir, off_t off, size_t size, gf_dirent_t *entries, break; } - 
if ((!strcmp (real_path, base_path)) - && (!strcmp (entry->d_name, GF_REPLICATE_TRASH_DIR))) - continue; - #ifdef __NetBSD__ /* * NetBSD with UFS1 backend uses backing files for @@ -3475,19 +4170,25 @@ __posix_fill_readdir (DIR *dir, off_t off, size_t size, gf_dirent_t *entries, * when the cluster/dht xlator decides to distribute * exended attribute backing file accross storage servers. */ - if ((!strcmp(real_path, base_path)) + if ((uuid_compare (fd->inode->gfid, rootgfid) == 0) && (!strcmp(entry->d_name, ".attribute"))) continue; #endif /* __NetBSD__ */ - if ((!strcmp (real_path, base_path)) - && (!strncmp (GF_HIDDEN_PATH, entry->d_name, - strlen (GF_HIDDEN_PATH)))) { - snprintf (hidden_path, PATH_MAX, "%s/%s", real_path, - entry->d_name); - ret = lstat (hidden_path, &statbuf); - if (!ret && S_ISDIR (statbuf.st_mode)) + if ((uuid_compare (fd->inode->gfid, rootgfid) == 0) + && (!strcmp (GF_HIDDEN_PATH, entry->d_name))) { + continue; + } + + if (skip_dirs) { + if (DT_ISDIR (entry->d_type)) { continue; + } else if (hpath) { + strcpy (&hpath[len+1],entry->d_name); + ret = lstat (hpath, &stbuf); + if (!ret && S_ISDIR (stbuf.st_mode)) + continue; + } } this_size = max (sizeof (gf_dirent_t), @@ -3509,6 +4210,7 @@ __posix_fill_readdir (DIR *dir, off_t off, size_t size, gf_dirent_t *entries, } this_entry->d_off = telldir (dir); this_entry->d_ino = entry->d_ino; + this_entry->d_type = entry->d_type; list_add_tail (&this_entry->list, &entries->list); @@ -3523,12 +4225,88 @@ out: return count; } +dict_t * +posix_entry_xattr_fill (xlator_t *this, inode_t *inode, + fd_t *fd, char *name, dict_t *dict, + struct iatt *stbuf) +{ + loc_t tmp_loc = {0,}; + char *entry_path = NULL; + + /* if we don't send the 'loc', open-fd-count be a problem. 
*/ + tmp_loc.inode = inode; + + MAKE_HANDLE_PATH (entry_path, this, fd->inode->gfid, name); + + return posix_lookup_xattr_fill (this, entry_path, + &tmp_loc, dict, stbuf); + +} + + +int +posix_readdirp_fill (xlator_t *this, fd_t *fd, gf_dirent_t *entries, dict_t *dict) +{ + gf_dirent_t *entry = NULL; + inode_table_t *itable = NULL; + inode_t *inode = NULL; + char *hpath = NULL; + int len = 0; + struct iatt stbuf = {0, }; + uuid_t gfid; + + if (list_empty(&entries->list)) + return 0; + + itable = fd->inode->table; + + len = posix_handle_path (this, fd->inode->gfid, NULL, NULL, 0); + hpath = alloca (len + 256); /* NAME_MAX */ + posix_handle_path (this, fd->inode->gfid, NULL, hpath, len); + len = strlen (hpath); + hpath[len] = '/'; + + list_for_each_entry (entry, &entries->list, list) { + memset (gfid, 0, 16); + inode = inode_grep (fd->inode->table, fd->inode, + entry->d_name); + if (inode) + uuid_copy (gfid, inode->gfid); + + strcpy (&hpath[len+1], entry->d_name); + + posix_pstat (this, gfid, hpath, &stbuf); + + if (!inode) + inode = inode_find (itable, stbuf.ia_gfid); + + if (!inode) + inode = inode_new (itable); + + entry->inode = inode; + + if (dict) { + entry->dict = + posix_entry_xattr_fill (this, entry->inode, + fd, entry->d_name, + dict, &stbuf); + dict_ref (entry->dict); + } + + entry->d_stat = stbuf; + if (stbuf.ia_ino) + entry->d_ino = stbuf.ia_ino; + inode = NULL; + } + + return 0; +} + int32_t posix_do_readdir (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t off, int whichop) + fd_t *fd, size_t size, off_t off, int whichop, dict_t *dict) { - uint64_t tmp_pfd = 0; struct posix_fd *pfd = NULL; DIR *dir = NULL; int ret = -1; @@ -3536,13 +4314,7 @@ posix_do_readdir (call_frame_t *frame, xlator_t *this, int32_t op_ret = -1; int32_t op_errno = 0; gf_dirent_t entries; - char *real_path = NULL; - int real_path_len = -1; - char *entry_path = NULL; - int entry_path_len = -1; - struct iatt stbuf = {0, }; - char base_path[PATH_MAX] = {0,}; - 
gf_dirent_t *tmp_entry = NULL; + int32_t skip_dirs = 0; VALIDATE_OR_GOTO (frame, out); @@ -3551,38 +4323,13 @@ posix_do_readdir (call_frame_t *frame, xlator_t *this, INIT_LIST_HEAD (&entries.list); - ret = fd_ctx_get (fd, this, &tmp_pfd); + ret = posix_fd_ctx_get (fd, this, &pfd); if (ret < 0) { gf_log (this->name, GF_LOG_WARNING, "pfd is NULL, fd=%p", fd); op_errno = -ret; goto out; } - pfd = (struct posix_fd *)(long)tmp_pfd; - if (!pfd->path) { - op_errno = EBADFD; - gf_log (this->name, GF_LOG_WARNING, - "pfd does not have path set (possibly file " - "fd, fd=%p)", fd); - goto out; - } - - real_path = pfd->path; - real_path_len = strlen (real_path); - - entry_path_len = real_path_len + NAME_MAX; - entry_path = alloca (entry_path_len); - - strncpy(base_path, POSIX_BASE_PATH(this), sizeof(base_path)); - base_path[strlen(base_path)] = '/'; - - if (!entry_path) { - op_errno = errno; - goto out; - } - - strncpy (entry_path, real_path, entry_path_len); - entry_path[real_path_len] = '/'; dir = pfd->dir; @@ -3591,35 +4338,42 @@ posix_do_readdir (call_frame_t *frame, xlator_t *this, "dir is NULL for fd=%p", fd); op_errno = EINVAL; goto out; - } + } + /* When READDIR_FILTER option is set to on, we can filter out + * directory's entry from the entry->list. + */ + ret = dict_get_int32 (dict, GF_READDIR_SKIP_DIRS, &skip_dirs); - LOCK (&fd->lock); - { - count = __posix_fill_readdir (dir, off, size, &entries, - real_path, base_path); + LOCK (&fd->lock); + { + /* posix_fill_readdir performs multiple separate individual + readdir() calls to fill up the buffer. - } - UNLOCK (&fd->lock); + In case of NFS where the same anonymous FD is shared between + different applications, reading a common directory can + result in the anonymous fd getting re-used unsafely between + the two readdir requests (in two different io-threads). + + It would also help, in the future, to replace the loop + around readdir() with a single large getdents() call. 
+ */ + count = posix_fill_readdir (fd, dir, off, size, &entries, this, + skip_dirs); + } + UNLOCK (&fd->lock); /* pick ENOENT to indicate EOF */ op_errno = errno; + op_ret = count; - if (whichop == GF_FOP_READDIRP) { - list_for_each_entry (tmp_entry, &entries.list, list) { - strcpy (entry_path + real_path_len + 1, - tmp_entry->d_name); - posix_lstat_with_gfid (this, entry_path, &stbuf); - if (stbuf.ia_ino) - tmp_entry->d_ino = stbuf.ia_ino; - tmp_entry->d_stat = stbuf; - } - } + if (whichop != GF_FOP_READDIRP) + goto out; - op_ret = count; + posix_readdirp_fill (this, fd, &entries, dict); out: - STACK_UNWIND_STRICT (readdir, frame, op_ret, op_errno, &entries); + STACK_UNWIND_STRICT (readdir, frame, op_ret, op_errno, &entries, NULL); gf_dirent_free (&entries); @@ -3629,18 +4383,18 @@ out: int32_t posix_readdir (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t off) + fd_t *fd, size_t size, off_t off, dict_t *xdata) { - posix_do_readdir (frame, this, fd, size, off, GF_FOP_READDIR); + posix_do_readdir (frame, this, fd, size, off, GF_FOP_READDIR, xdata); return 0; } int32_t posix_readdirp (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t off) + fd_t *fd, size_t size, off_t off, dict_t *dict) { - posix_do_readdir (frame, this, fd, size, off, GF_FOP_READDIRP); + posix_do_readdir (frame, this, fd, size, off, GF_FOP_READDIRP, dict); return 0; } @@ -3680,65 +4434,72 @@ posix_inode (xlator_t *this) int32_t posix_rchecksum (call_frame_t *frame, xlator_t *this, - fd_t *fd, off_t offset, int32_t len) + fd_t *fd, off_t offset, int32_t len, dict_t *xdata) { - char *buf = NULL; - - int _fd = -1; - uint64_t tmp_pfd = 0; - - struct posix_fd *pfd = NULL; - - int op_ret = -1; - int op_errno = 0; - - int ret = 0; - - int32_t weak_checksum = 0; - uint8_t strong_checksum[MD5_DIGEST_LEN]; + char *alloc_buf = NULL; + char *buf = NULL; + int _fd = -1; + struct posix_fd *pfd = NULL; + int op_ret = -1; + int op_errno = 0; + int ret = 0; + int32_t 
weak_checksum = 0; + unsigned char strong_checksum[MD5_DIGEST_LENGTH] = {0}; + struct posix_private *priv = NULL; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); VALIDATE_OR_GOTO (fd, out); - memset (strong_checksum, 0, MD5_DIGEST_LEN); - buf = GF_CALLOC (1, len, gf_posix_mt_char); + priv = this->private; + memset (strong_checksum, 0, MD5_DIGEST_LENGTH); - if (!buf) { + alloc_buf = _page_aligned_alloc (len, &buf); + if (!alloc_buf) { op_errno = ENOMEM; goto out; } - ret = fd_ctx_get (fd, this, &tmp_pfd); + ret = posix_fd_ctx_get (fd, this, &pfd); if (ret < 0) { gf_log (this->name, GF_LOG_WARNING, "pfd is NULL, fd=%p", fd); op_errno = -ret; goto out; } - pfd = (struct posix_fd *)(long) tmp_pfd; _fd = pfd->fd; - ret = pread (_fd, buf, len, offset); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "pread of %d bytes returned %d (%s)", - len, ret, strerror (errno)); + LOCK (&fd->lock); + { + if (priv->aio_capable && priv->aio_init_done) + __posix_fd_set_odirect (fd, pfd, 0, offset, len); + + ret = pread (_fd, buf, len, offset); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "pread of %d bytes returned %d (%s)", + len, ret, strerror (errno)); + + op_errno = errno; + } - op_errno = errno; - goto out; } + UNLOCK (&fd->lock); - weak_checksum = gf_rsync_weak_checksum (buf, len); - gf_rsync_strong_checksum (buf, len, strong_checksum); + if (ret < 0) + goto out; - GF_FREE (buf); + weak_checksum = gf_rsync_weak_checksum ((unsigned char *) buf, (size_t) len); + gf_rsync_strong_checksum ((unsigned char *) buf, (size_t) len, (unsigned char *) strong_checksum); op_ret = 0; out: STACK_UNWIND_STRICT (rchecksum, frame, op_ret, op_errno, - weak_checksum, strong_checksum); + weak_checksum, strong_checksum, NULL); + + GF_FREE (alloc_buf); + return 0; } @@ -3786,24 +4547,123 @@ mem_acct_init (xlator_t *this) return ret; } +static int +posix_set_owner (xlator_t *this, uid_t uid, gid_t gid) +{ + struct posix_private *priv = NULL; + int ret = -1; + + priv = 
this->private; + + ret = sys_chown (priv->base_path, uid, gid); + if (ret) + gf_log (this->name, GF_LOG_ERROR, "Failed to set " + "uid/gid for brick path %s, %s", + priv->base_path, strerror (errno)); + + return ret; +} + + +static int +set_batch_fsync_mode (struct posix_private *priv, const char *str) +{ + if (strcmp (str, "none") == 0) + priv->batch_fsync_mode = BATCH_NONE; + else if (strcmp (str, "syncfs") == 0) + priv->batch_fsync_mode = BATCH_SYNCFS; + else if (strcmp (str, "syncfs-single-fsync") == 0) + priv->batch_fsync_mode = BATCH_SYNCFS_SINGLE_FSYNC; + else if (strcmp (str, "syncfs-reverse-fsync") == 0) + priv->batch_fsync_mode = BATCH_SYNCFS_REVERSE_FSYNC; + else if (strcmp (str, "reverse-fsync") == 0) + priv->batch_fsync_mode = BATCH_REVERSE_FSYNC; + else + return -1; + + return 0; +} + + +int +reconfigure (xlator_t *this, dict_t *options) +{ + int ret = -1; + struct posix_private *priv = NULL; + uid_t uid = -1; + gid_t gid = -1; + char *batch_fsync_mode_str = NULL; + + priv = this->private; + + GF_OPTION_RECONF ("brick-uid", uid, options, uint32, out); + GF_OPTION_RECONF ("brick-gid", gid, options, uint32, out); + posix_set_owner (this, uid, gid); + + GF_OPTION_RECONF ("batch-fsync-delay-usec", priv->batch_fsync_delay_usec, + options, uint32, out); + + GF_OPTION_RECONF ("batch-fsync-mode", batch_fsync_mode_str, + options, str, out); + + if (set_batch_fsync_mode (priv, batch_fsync_mode_str) != 0) { + gf_log (this->name, GF_LOG_ERROR, "Unknown mode string: %s", + batch_fsync_mode_str); + goto out; + } + + GF_OPTION_RECONF ("linux-aio", priv->aio_configured, + options, bool, out); + + if (priv->aio_configured) + posix_aio_on (this); + else + posix_aio_off (this); + + GF_OPTION_RECONF ("node-uuid-pathinfo", priv->node_uuid_pathinfo, + options, bool, out); + + if (priv->node_uuid_pathinfo && + (uuid_is_null (priv->glusterd_uuid))) { + gf_log (this->name, GF_LOG_INFO, + "glusterd uuid is NULL, pathinfo xattr would" + " fallback to <hostname>:<export>"); + } 
+ + GF_OPTION_RECONF ("health-check-interval", priv->health_check_interval, + options, uint32, out); + posix_spawn_health_check_thread (this); + + ret = 0; +out: + return ret; +} + + /** * init - */ int init (xlator_t *this) { - struct posix_private *_private = NULL; - data_t *dir_data = NULL; - data_t *tmp_data = NULL; - struct stat buf = {0,}; - gf_boolean_t tmp_bool = 0; - int dict_ret = 0; - int ret = 0; - int op_ret = -1; - int32_t janitor_sleep = 0; - uuid_t old_uuid = {0,}; - uuid_t dict_uuid = {0,}; - uuid_t gfid = {0,}; + struct posix_private *_private = NULL; + data_t *dir_data = NULL; + data_t *tmp_data = NULL; + struct stat buf = {0,}; + gf_boolean_t tmp_bool = 0; + int dict_ret = 0; + int ret = 0; + int op_ret = -1; + ssize_t size = -1; + int32_t janitor_sleep = 0; + uuid_t old_uuid = {0,}; + uuid_t dict_uuid = {0,}; + uuid_t gfid = {0,}; + uuid_t rootgfid = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1}; + char *guuid = NULL; + uid_t uid = -1; + gid_t gid = -1; + char *batch_fsync_mode_str; dir_data = dict_get (this->options, "directory"); @@ -3884,33 +4744,31 @@ init (xlator_t *this) ret = -1; goto out; } - op_ret = sys_lgetxattr (dir_data->data, - "trusted.glusterfs.volume-id", old_uuid, 16); - if (op_ret == 16) { + size = sys_lgetxattr (dir_data->data, + "trusted.glusterfs.volume-id", old_uuid, 16); + if (size == 16) { if (uuid_compare (old_uuid, dict_uuid)) { gf_log (this->name, GF_LOG_ERROR, - "mismatching volume-id (%s) recieved. " + "mismatching volume-id (%s) received. " "already is a part of volume %s ", tmp_data->data, uuid_utoa (old_uuid)); ret = -1; goto out; } - } else if ((op_ret == -1) && (errno == ENODATA)) { - /* Using the export for first time */ - op_ret = sys_lsetxattr (dir_data->data, - "trusted.glusterfs.volume-id", - dict_uuid, 16, 0); - if (op_ret == -1) { + } else if ((size == -1) && (errno == ENODATA)) { + gf_log (this->name, GF_LOG_ERROR, - "failed to set volume id on export"); + "Extended attribute trusted.glusterfs." 
+ "volume-id is absent"); ret = -1; goto out; - } - } else if ((op_ret == -1) && (errno != ENODATA)) { + + } else if ((size == -1) && (errno != ENODATA)) { /* Wrong 'volume-id' is set, it should be error */ gf_log (this->name, GF_LOG_WARNING, "%s: failed to fetch volume-id (%s)", dir_data->data, strerror (errno)); + ret = -1; goto out; } else { ret = -1; @@ -3922,8 +4780,8 @@ init (xlator_t *this) /* Now check if the export directory has some other 'gfid', other than that of root '/' */ - ret = sys_lgetxattr (dir_data->data, "trusted.gfid", gfid, 16); - if (ret == 16) { + size = sys_lgetxattr (dir_data->data, "trusted.gfid", gfid, 16); + if (size == 16) { if (!__is_root_gfid (gfid)) { gf_log (this->name, GF_LOG_WARNING, "%s: gfid (%s) is not that of glusterfs '/' ", @@ -3931,24 +4789,36 @@ init (xlator_t *this) ret = -1; goto out; } - } else if (ret != -1) { + } else if (size != -1) { /* Wrong 'gfid' is set, it should be error */ gf_log (this->name, GF_LOG_WARNING, "%s: wrong value set as gfid", dir_data->data); ret = -1; goto out; - } else if ((ret == -1) && (errno != ENODATA)) { + } else if ((size == -1) && (errno != ENODATA)) { /* Wrong 'gfid' is set, it should be error */ gf_log (this->name, GF_LOG_WARNING, "%s: failed to fetch gfid (%s)", dir_data->data, strerror (errno)); + ret = -1; goto out; + } else { + /* First time volume, set the GFID */ + size = sys_lsetxattr (dir_data->data, "trusted.gfid", rootgfid, + 16, XATTR_CREATE); + if (size) { + gf_log (this->name, GF_LOG_ERROR, + "%s: failed to set gfid (%s)", + dir_data->data, strerror (errno)); + ret = -1; + goto out; + } } - op_ret = sys_lgetxattr (dir_data->data, "system.posix_acl_access", - NULL, 0); - if ((op_ret < 0) && (errno == ENOTSUP)) + size = sys_lgetxattr (dir_data->data, POSIX_ACL_ACCESS_XATTR, + NULL, 0); + if ((size < 0) && (errno == ENOTSUP)) gf_log (this->name, GF_LOG_WARNING, "Posix access control list is not supported."); @@ -3963,20 +4833,6 @@ init (xlator_t *this) _private->base_path = 
gf_strdup (dir_data->data); _private->base_path_length = strlen (_private->base_path); - _private->trash_path = GF_CALLOC (1, _private->base_path_length - + strlen ("/") - + strlen (GF_REPLICATE_TRASH_DIR) - + 1, - gf_posix_mt_trash_path); - - if (!_private->trash_path) { - ret = -1; - goto out; - } - - strncpy (_private->trash_path, _private->base_path, _private->base_path_length); - strcat (_private->trash_path, "/" GF_REPLICATE_TRASH_DIR); - LOCK_INIT (&_private->lock); ret = dict_get_str (this->options, "hostname", &_private->hostname); @@ -4041,6 +4897,19 @@ init (xlator_t *this) "for every open)"); } + ret = dict_get_str (this->options, "glusterd-uuid", &guuid); + if (!ret) { + if (uuid_parse (guuid, _private->glusterd_uuid)) + gf_log (this->name, GF_LOG_WARNING, "Cannot parse " + "glusterd (node) UUID, node-uuid xattr " + "request would return - \"No such attribute\""); + } else { + gf_log (this->name, GF_LOG_DEBUG, "No glusterd (node) UUID " + "passed - node-uuid xattr request will return " + "\"No such attribute\""); + } + ret = 0; + _private->janitor_sleep_duration = 600; dict_ret = dict_get_int32 (this->options, "janitor-sleep-duration", @@ -4091,11 +4960,84 @@ init (xlator_t *this) #endif this->private = (void *)_private; + op_ret = posix_handle_init (this); + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "Posix handle setup failed"); + ret = -1; + goto out; + } + + op_ret = posix_handle_trash_init (this); + if (op_ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "Posix landfill setup failed"); + ret = -1; + goto out; + } + + _private->aio_init_done = _gf_false; + _private->aio_capable = _gf_false; + + GF_OPTION_INIT ("brick-uid", uid, uint32, out); + GF_OPTION_INIT ("brick-gid", gid, uint32, out); + posix_set_owner (this, uid, gid); + + GF_OPTION_INIT ("linux-aio", _private->aio_configured, bool, out); + + if (_private->aio_configured) { + op_ret = posix_aio_on (this); + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "Posix AIO 
init failed"); + ret = -1; + goto out; + } + } + + GF_OPTION_INIT ("node-uuid-pathinfo", + _private->node_uuid_pathinfo, bool, out); + if (_private->node_uuid_pathinfo && + (uuid_is_null (_private->glusterd_uuid))) { + gf_log (this->name, GF_LOG_INFO, + "glusterd uuid is NULL, pathinfo xattr would" + " fallback to <hostname>:<export>"); + } + + _private->health_check_active = _gf_false; + GF_OPTION_INIT ("health-check-interval", + _private->health_check_interval, uint32, out); + if (_private->health_check_interval) + posix_spawn_health_check_thread (this); + pthread_mutex_init (&_private->janitor_lock, NULL); pthread_cond_init (&_private->janitor_cond, NULL); INIT_LIST_HEAD (&_private->janitor_fds); posix_spawn_janitor_thread (this); + + pthread_mutex_init (&_private->fsync_mutex, NULL); + pthread_cond_init (&_private->fsync_cond, NULL); + INIT_LIST_HEAD (&_private->fsyncs); + + ret = gf_thread_create (&_private->fsyncer, NULL, posix_fsyncer, this); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "fsyncer thread" + " creation failed (%s)", strerror (errno)); + goto out; + } + + GF_OPTION_INIT ("batch-fsync-mode", batch_fsync_mode_str, str, out); + + if (set_batch_fsync_mode (_private, batch_fsync_mode_str) != 0) { + gf_log (this->name, GF_LOG_ERROR, "Unknown mode string: %s", + batch_fsync_mode_str); + goto out; + } + + GF_OPTION_INIT ("batch-fsync-delay-usec", _private->batch_fsync_delay_usec, + uint32, out); out: return ret; } @@ -4146,6 +5088,7 @@ struct xlator_fops fops = { .getxattr = posix_getxattr, .fgetxattr = posix_fgetxattr, .removexattr = posix_removexattr, + .fremovexattr = posix_fremovexattr, .fsyncdir = posix_fsyncdir, .access = posix_access, .ftruncate = posix_ftruncate, @@ -4160,6 +5103,9 @@ struct xlator_fops fops = { .fxattrop = posix_fxattrop, .setattr = posix_setattr, .fsetattr = posix_fsetattr, + .fallocate = _posix_fallocate, + .discard = posix_discard, + .zerofill = posix_zerofill, }; struct xlator_cbks cbks = { @@ -4185,5 +5131,62 @@ struct 
volume_options options[] = { .type = GF_OPTION_TYPE_INT }, { .key = {"volume-id"}, .type = GF_OPTION_TYPE_ANY }, + { .key = {"glusterd-uuid"}, + .type = GF_OPTION_TYPE_STR }, + { + .key = {"linux-aio"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "Support for native Linux AIO" + }, + { + .key = {"brick-uid"}, + .type = GF_OPTION_TYPE_INT, + .min = 0, + .validate = GF_OPT_VALIDATE_MIN, + .description = "Support for setting uid of brick's owner" + }, + { + .key = {"brick-gid"}, + .type = GF_OPTION_TYPE_INT, + .min = 0, + .validate = GF_OPT_VALIDATE_MIN, + .description = "Support for setting gid of brick's owner" + }, + { .key = {"node-uuid-pathinfo"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "return glusterd's node-uuid in pathinfo xattr" + " string instead of hostname" + }, + { + .key = {"health-check-interval"}, + .type = GF_OPTION_TYPE_INT, + .min = 0, + .default_value = "30", + .validate = GF_OPT_VALIDATE_MIN, + .description = "Interval in seconds for a filesystem health check, " + "set to 0 to disable" + }, + { .key = {"batch-fsync-mode"}, + .type = GF_OPTION_TYPE_STR, + .default_value = "reverse-fsync", + .description = "Possible values:\n" + "\t- syncfs: Perform one syncfs() on behalf oa batch" + "of fsyncs.\n" + "\t- syncfs-single-fsync: Perform one syncfs() on behalf of a batch" + " of fsyncs and one fsync() per batch.\n" + "\t- syncfs-reverse-fsync: Preform one syncfs() on behalf of a batch" + " of fsyncs and fsync() each file in the batch in reverse order.\n" + " in reverse order.\n" + "\t- reverse-fsync: Perform fsync() of each file in the batch in" + " reverse order." 
+ }, + { .key = {"batch-fsync-delay-usec"}, + .type = GF_OPTION_TYPE_INT, + .default_value = "0", + .description = "Num of usecs to wait for aggregating fsync" + " requests", + }, { .key = {NULL} } }; diff --git a/xlators/storage/posix/src/posix.h b/xlators/storage/posix/src/posix.h index 1733eadd1..3121db271 100644 --- a/xlators/storage/posix/src/posix.h +++ b/xlators/storage/posix/src/posix.h @@ -1,22 +1,12 @@ /* - Copyright (c) 2006-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
*/ - #ifndef _POSIX_H #define _POSIX_H @@ -52,7 +42,16 @@ #include "compat.h" #include "timer.h" #include "posix-mem-types.h" +#include "posix-handle.h" +#include "call-stub.h" + +#ifdef HAVE_LIBAIO +#include <libaio.h> +#include "posix-aio.h" +#endif +#define VECTOR_SIZE 64 * 1024 /* vector size 64KB*/ +#define MAX_NO_VECT 1024 /** * posix_fd - internal structure common to file and directory fd's */ @@ -60,9 +59,8 @@ struct posix_fd { int fd; /* fd returned by the kernel */ int32_t flags; /* flags for open/creat */ - char * path; /* used by setdents/getdents */ DIR * dir; /* handle returned by the kernel */ - int flushwrites; + int odirect; struct list_head list; /* to add to the janitor list */ }; @@ -102,12 +100,12 @@ struct posix_private { gf_boolean_t o_direct; /* always open files in O_DIRECT mode */ -/* +/* decide whether posix_unlink does open (file), unlink (file), close (fd) instead of just unlink (file). with the former approach there is no lockout of access to parent directory during removal of very large files for the entire duration of freeing of data blocks. 
-*/ +*/ gf_boolean_t background_unlink; /* janitor thread which cleans up /.trash (created by replicate) */ @@ -116,38 +114,95 @@ struct posix_private { char * trash_path; /* lock for brick dir */ DIR *mount_lock; + + struct stat handledir; + +/* uuid of glusterd that swapned the brick process */ + uuid_t glusterd_uuid; + + gf_boolean_t aio_configured; + gf_boolean_t aio_init_done; + gf_boolean_t aio_capable; +#ifdef HAVE_LIBAIO + io_context_t ctxp; + pthread_t aiothread; +#endif + + /* node-uuid in pathinfo xattr */ + gf_boolean_t node_uuid_pathinfo; + + pthread_t fsyncer; + struct list_head fsyncs; + pthread_mutex_t fsync_mutex; + pthread_cond_t fsync_cond; + int fsync_queue_count; + + enum { + BATCH_NONE = 0, + BATCH_SYNCFS, + BATCH_SYNCFS_SINGLE_FSYNC, + BATCH_REVERSE_FSYNC, + BATCH_SYNCFS_REVERSE_FSYNC + } batch_fsync_mode; + + uint32_t batch_fsync_delay_usec; + + /* seconds to sleep between health checks */ + uint32_t health_check_interval; + pthread_t health_check; + gf_boolean_t health_check_active; }; -#define POSIX_BASE_PATH(this) (((struct posix_private *)this->private)->base_path) +typedef struct { + xlator_t *this; + const char *real_path; + dict_t *xattr; + struct iatt *stbuf; + loc_t *loc; + inode_t *inode; /* for all do_xattrop() key handling */ + int fd; + int flags; + int32_t op_errno; +} posix_xattr_filler_t; -#define POSIX_BASE_PATH_LEN(this) (((struct posix_private *)this->private)->base_path_length) -#define MAKE_REAL_PATH(var, this, path) do { \ - var = alloca (strlen (path) + POSIX_BASE_PATH_LEN(this) + 2); \ - strcpy (var, POSIX_BASE_PATH(this)); \ - strcpy (&var[POSIX_BASE_PATH_LEN(this)], path); \ - } while (0) +#define POSIX_BASE_PATH(this) (((struct posix_private *)this->private)->base_path) +#define POSIX_BASE_PATH_LEN(this) (((struct posix_private *)this->private)->base_path_length) /* Helper functions */ -int setgid_override (xlator_t *this, char *real_path, gid_t *gid); -int posix_gfid_set (xlator_t *this, const char *path, dict_t 
*xattr_req); -int posix_fstat_with_gfid (xlator_t *this, int fd, struct iatt *stbuf_p); -int posix_lstat_with_gfid (xlator_t *this, const char *path, struct iatt *buf); +int posix_gfid_set (xlator_t *this, const char *path, loc_t *loc, + dict_t *xattr_req); +int posix_fdstat (xlator_t *this, int fd, struct iatt *stbuf_p); +int posix_istat (xlator_t *this, uuid_t gfid, const char *basename, + struct iatt *iatt); +int posix_pstat (xlator_t *this, uuid_t gfid, const char *real_path, + struct iatt *iatt); dict_t *posix_lookup_xattr_fill (xlator_t *this, const char *path, loc_t *loc, dict_t *xattr, struct iatt *buf); -int posix_handle_pair (xlator_t *this, const char *real_path, - data_pair_t *trav, int flags); -int posix_fhandle_pair (xlator_t *this, int fd, data_pair_t *trav, int flags); +int posix_handle_pair (xlator_t *this, const char *real_path, char *key, + data_t *value, int flags); +int posix_fhandle_pair (xlator_t *this, int fd, char *key, data_t *value, + int flags); void posix_spawn_janitor_thread (xlator_t *this); -int posix_get_file_contents (xlator_t *this, const char *path, +int posix_get_file_contents (xlator_t *this, uuid_t pargfid, const char *name, char **contents); -int posix_set_file_contents (xlator_t *this, const char *path, - data_pair_t *trav, int flags); +int posix_set_file_contents (xlator_t *this, const char *path, char *key, + data_t *value, int flags); int posix_acl_xattr_set (xlator_t *this, const char *path, dict_t *xattr_req); -int posix_gfid_heal (xlator_t *this, const char *path, dict_t *xattr_req); +int posix_gfid_heal (xlator_t *this, const char *path, loc_t *loc, dict_t *xattr_req); int posix_entry_create_xattr_set (xlator_t *this, const char *path, dict_t *dict); +int posix_fd_ctx_get (fd_t *fd, xlator_t *this, struct posix_fd **pfd); +void posix_fill_ino_from_gfid (xlator_t *this, struct iatt *buf); + +gf_boolean_t posix_special_xattr (char **pattern, char *key); + +void +__posix_fd_set_odirect (fd_t *fd, struct posix_fd *pfd, 
int opflags, + off_t offset, size_t size); +void posix_spawn_health_check_thread (xlator_t *this); +void *posix_fsyncer (void *); #endif /* _POSIX_H */ |
