diff options
Diffstat (limited to 'xlators/storage/posix/src')
| -rw-r--r-- | xlators/storage/posix/src/posix-aio.c | 10 | ||||
| -rw-r--r-- | xlators/storage/posix/src/posix-handle.c | 7 | ||||
| -rw-r--r-- | xlators/storage/posix/src/posix-helpers.c | 342 | ||||
| -rw-r--r-- | xlators/storage/posix/src/posix.c | 700 | ||||
| -rw-r--r-- | xlators/storage/posix/src/posix.h | 34 |
5 files changed, 1037 insertions, 56 deletions
diff --git a/xlators/storage/posix/src/posix-aio.c b/xlators/storage/posix/src/posix-aio.c index fad4a7df3..c3bbddd67 100644 --- a/xlators/storage/posix/src/posix-aio.c +++ b/xlators/storage/posix/src/posix-aio.c @@ -136,11 +136,7 @@ posix_aio_readv_complete (struct posix_aio_cb *paiocb, int res, int res2) /* Hack to notify higher layers of EOF. */ - if (postbuf.ia_size == 0) - op_errno = ENOENT; - else if ((offset + iov.iov_len) == postbuf.ia_size) - op_errno = ENOENT; - else if (offset > postbuf.ia_size) + if (!postbuf.ia_size || (offset + iov.iov_len) >= postbuf.ia_size) op_errno = ENOENT; LOCK (&priv->lock); @@ -490,8 +486,8 @@ posix_aio_init (xlator_t *this) goto out; } - ret = pthread_create (&priv->aiothread, NULL, - posix_aio_thread, this); + ret = gf_thread_create (&priv->aiothread, NULL, + posix_aio_thread, this); if (ret != 0) { io_destroy (priv->ctxp); goto out; diff --git a/xlators/storage/posix/src/posix-handle.c b/xlators/storage/posix/src/posix-handle.c index 33bf3db56..219a582c9 100644 --- a/xlators/storage/posix/src/posix-handle.c +++ b/xlators/storage/posix/src/posix-handle.c @@ -573,13 +573,6 @@ posix_handle_hard (xlator_t *this, const char *oldpath, uuid_t gfid, struct stat } } - ret = lstat (newpath, &newbuf); - if (ret) { - gf_log (this->name, GF_LOG_WARNING, - "lstat on %s failed (%s)", newpath, strerror (errno)); - return -1; - } - if (newbuf.st_ino != oldbuf->st_ino || newbuf.st_dev != oldbuf->st_dev) { gf_log (this->name, GF_LOG_WARNING, diff --git a/xlators/storage/posix/src/posix-helpers.c b/xlators/storage/posix/src/posix-helpers.c index f8284d335..e295f8850 100644 --- a/xlators/storage/posix/src/posix-helpers.c +++ b/xlators/storage/posix/src/posix-helpers.c @@ -22,6 +22,7 @@ #include <pthread.h> #include <ftw.h> #include <sys/stat.h> +#include <signal.h> #ifndef GF_BSD_HOST_OS #include <alloca.h> @@ -44,6 +45,7 @@ #include "timer.h" #include "glusterfs3-xdr.h" #include "hashfn.h" +#include "glusterfs-acl.h" #include <fnmatch.h> char *marker_xattrs[] = {"trusted.glusterfs.quota.*", @@ -883,8 +885,8 @@ posix_spawn_janitor_thread (xlator_t *this) LOCK (&priv->lock); { if (!priv->janitor_present) { - ret = pthread_create (&priv->janitor, NULL, - posix_janitor_thread_proc, this); + ret = gf_thread_create (&priv->janitor, NULL, + posix_janitor_thread_proc, this); if (ret < 0) { gf_log (this->name, GF_LOG_ERROR, @@ -900,6 +902,74 @@ unlock: UNLOCK (&priv->lock); } +static int +is_fresh_file (struct stat *stat) +{ + struct timeval tv; + + gettimeofday (&tv, NULL); + + if ((stat->st_ctime >= (tv.tv_sec - 1)) + && (stat->st_ctime <= tv.tv_sec)) + return 1; + + return 0; +} + + +int +posix_gfid_heal (xlator_t *this, const char *path, loc_t *loc, dict_t *xattr_req) +{ + /* The purpose of this function is to prevent a race + where an inode creation FOP (like mkdir/mknod/create etc) + races with lookup in the following way: + + {create thread} | {lookup thread} + | + t0 + mkdir ("name") | + t1 + | posix_gfid_set ("name", 2); + t2 + posix_gfid_set ("name", 1); | + t3 + lstat ("name"); | lstat ("name"); + + In the above case mkdir FOP would have resulted with GFID 2 while + it should have been GFID 1. It matters in the case where GFID would + have gotten set to 1 on other subvolumes of replciate/distribute + + The "solution" here is that, if we detect lookup is attempting to + set a GFID on a file which is created very recently, but does not + yet have a GFID (i.e, between t1 and t2), then "fake" it as though + posix_gfid_heal was called at t0 instead. + */ + + uuid_t uuid_curr; + int ret = 0; + struct stat stat = {0, }; + + if (!xattr_req) + goto out; + + if (sys_lstat (path, &stat) != 0) + goto out; + + ret = sys_lgetxattr (path, GFID_XATTR_KEY, uuid_curr, 16); + if (ret != 16) { + if (is_fresh_file (&stat)) { + ret = -1; + errno = ENOENT; + goto out; + } + } + + ret = posix_gfid_set (this, path, loc, xattr_req); +out: + return ret; +} + + int posix_acl_xattr_set (xlator_t *this, const char *path, dict_t *xattr_req) { @@ -913,17 +983,17 @@ posix_acl_xattr_set (xlator_t *this, const char *path, dict_t *xattr_req) if (sys_lstat (path, &stat) != 0) goto out; - data = dict_get (xattr_req, "system.posix_acl_access"); + data = dict_get (xattr_req, POSIX_ACL_ACCESS_XATTR); if (data) { - ret = sys_lsetxattr (path, "system.posix_acl_access", + ret = sys_lsetxattr (path, POSIX_ACL_ACCESS_XATTR, data->data, data->len, 0); if (ret != 0) goto out; } - data = dict_get (xattr_req, "system.posix_acl_default"); + data = dict_get (xattr_req, POSIX_ACL_DEFAULT_XATTR); if (data) { - ret = sys_lsetxattr (path, "system.posix_acl_default", + ret = sys_lsetxattr (path, POSIX_ACL_DEFAULT_XATTR, data->data, data->len, 0); if (ret != 0) goto out; @@ -944,8 +1014,8 @@ _handle_entry_create_keyvalue_pair (dict_t *d, char *k, data_t *v, if (!strcmp (GFID_XATTR_KEY, k) || !strcmp ("gfid-req", k) || - !strcmp ("system.posix_acl_default", k) || - !strcmp ("system.posix_acl_access", k) || + !strcmp (POSIX_ACL_DEFAULT_XATTR, k) || + !strcmp (POSIX_ACL_ACCESS_XATTR, k) || ZR_FILE_CONTENT_REQUEST(k)) { return 0; } @@ -1063,3 +1133,259 @@ posix_fd_ctx_get (fd_t *fd, xlator_t *this, struct posix_fd **pfd) return ret; } + +static void * +posix_health_check_thread_proc (void *data) +{ + xlator_t *this = NULL; + struct posix_private *priv = NULL; + uint32_t interval = 0; + int ret = -1; + struct stat sb = {0, }; + + this = data; + priv = this->private; + + /* prevent races when the interval is updated */ + interval = priv->health_check_interval; + if (interval == 0) + goto out; + + gf_log (this->name, GF_LOG_DEBUG, "health-check thread started, " + "interval = %d seconds", interval); + + while (1) { + /* aborting sleep() is a request to exit this thread, sleep() + * will normally not return when cancelled */ + ret = sleep (interval); + if (ret > 0) + break; + + /* prevent thread errors while doing the health-check(s) */ + pthread_setcancelstate (PTHREAD_CANCEL_DISABLE, NULL); + + /* Do the health-check, it should be moved to its own function + * in case it gets more complex. */ + ret = stat (priv->base_path, &sb); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "stat() on %s returned: %s", priv->base_path, + strerror (errno)); + goto abort; + } + + pthread_setcancelstate (PTHREAD_CANCEL_ENABLE, NULL); + } + +out: + gf_log (this->name, GF_LOG_DEBUG, "health-check thread exiting"); + + LOCK (&priv->lock); + { + priv->health_check_active = _gf_false; + } + UNLOCK (&priv->lock); + + return NULL; + +abort: + /* health-check failed */ + gf_log (this->name, GF_LOG_EMERG, "health-check failed, going down"); + xlator_notify (this->parents->xlator, GF_EVENT_CHILD_DOWN, this); + + ret = sleep (30); + if (ret == 0) { + gf_log (this->name, GF_LOG_EMERG, "still alive! -> SIGTERM"); + kill (getpid(), SIGTERM); + } + + ret = sleep (30); + if (ret == 0) { + gf_log (this->name, GF_LOG_EMERG, "still alive! -> SIGKILL"); + kill (getpid(), SIGKILL); + } + + return NULL; +} + +void +posix_spawn_health_check_thread (xlator_t *xl) +{ + struct posix_private *priv = NULL; + int ret = -1; + + priv = xl->private; + + LOCK (&priv->lock); + { + /* cancel the running thread */ + if (priv->health_check_active == _gf_true) { + pthread_cancel (priv->health_check); + priv->health_check_active = _gf_false; + } + + /* prevent scheduling a check in a tight loop */ + if (priv->health_check_interval == 0) + goto unlock; + + ret = gf_thread_create (&priv->health_check, NULL, + posix_health_check_thread_proc, xl); + if (ret < 0) { + priv->health_check_interval = 0; + priv->health_check_active = _gf_false; + gf_log (xl->name, GF_LOG_ERROR, + "unable to setup health-check thread: %s", + strerror (errno)); + goto unlock; + } + + /* run the thread detached, resources will be freed on exit */ + pthread_detach (priv->health_check); + priv->health_check_active = _gf_true; + } +unlock: + UNLOCK (&priv->lock); +} + +int +posix_fsyncer_pick (xlator_t *this, struct list_head *head) +{ + struct posix_private *priv = NULL; + int count = 0; + + priv = this->private; + pthread_mutex_lock (&priv->fsync_mutex); + { + while (list_empty (&priv->fsyncs)) + pthread_cond_wait (&priv->fsync_cond, + &priv->fsync_mutex); + + count = priv->fsync_queue_count; + priv->fsync_queue_count = 0; + list_splice_init (&priv->fsyncs, head); + } + pthread_mutex_unlock (&priv->fsync_mutex); + + return count; +} + + +void +posix_fsyncer_process (xlator_t *this, call_stub_t *stub, gf_boolean_t do_fsync) +{ + struct posix_fd *pfd = NULL; + int ret = -1; + struct posix_private *priv = NULL; + + priv = this->private; + + ret = posix_fd_ctx_get (stub->args.fd, this, &pfd); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "could not get fdctx for fd(%s)", + uuid_utoa (stub->args.fd->inode->gfid)); + call_unwind_error (stub, -1, EINVAL); + return; + } + + if (do_fsync) { +#ifdef HAVE_FDATASYNC + if (stub->args.datasync) + ret = fdatasync (pfd->fd); + else +#endif + ret = fsync (pfd->fd); + } else { + ret = 0; + } + + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "could not fstat fd(%s)", + uuid_utoa (stub->args.fd->inode->gfid)); + call_unwind_error (stub, -1, errno); + return; + } + + call_unwind_error (stub, 0, 0); +} + + +static void +posix_fsyncer_syncfs (xlator_t *this, struct list_head *head) +{ + call_stub_t *stub = NULL; + struct posix_fd *pfd = NULL; + int ret = -1; + + stub = list_entry (head->prev, call_stub_t, list); + ret = posix_fd_ctx_get (stub->args.fd, this, &pfd); + if (ret) + return; + +#ifdef GF_LINUX_HOST_OS + /* syncfs() is not "declared" in RHEL's glibc even though + the kernel has support. + */ +#include <sys/syscall.h> +#include <unistd.h> +#ifdef SYS_syncfs + syscall (SYS_syncfs, pfd->fd); +#else + sync(); +#endif +#else + sync(); +#endif +} + + +void * +posix_fsyncer (void *d) +{ + xlator_t *this = d; + struct posix_private *priv = NULL; + call_stub_t *stub = NULL; + call_stub_t *tmp = NULL; + struct list_head list; + int count = 0; + gf_boolean_t do_fsync = _gf_true; + + priv = this->private; + + for (;;) { + INIT_LIST_HEAD (&list); + + count = posix_fsyncer_pick (this, &list); + + usleep (priv->batch_fsync_delay_usec); + + gf_log (this->name, GF_LOG_DEBUG, + "picked %d fsyncs", count); + + switch (priv->batch_fsync_mode) { + case BATCH_NONE: + case BATCH_REVERSE_FSYNC: + break; + case BATCH_SYNCFS: + case BATCH_SYNCFS_SINGLE_FSYNC: + case BATCH_SYNCFS_REVERSE_FSYNC: + posix_fsyncer_syncfs (this, &list); + break; + } + + if (priv->batch_fsync_mode == BATCH_SYNCFS) + do_fsync = _gf_false; + else + do_fsync = _gf_true; + + list_for_each_entry_safe_reverse (stub, tmp, &list, list) { + list_del_init (&stub->list); + + posix_fsyncer_process (this, stub, do_fsync); + + if (priv->batch_fsync_mode == BATCH_SYNCFS_SINGLE_FSYNC) + do_fsync = _gf_false; + } + } +} diff --git a/xlators/storage/posix/src/posix.c b/xlators/storage/posix/src/posix.c index 44aeca356..fb45c7a67 100644 --- a/xlators/storage/posix/src/posix.c +++ b/xlators/storage/posix/src/posix.c @@ -23,6 +23,8 @@ #include <pthread.h> #include <ftw.h> #include <sys/stat.h> +#include <signal.h> +#include <sys/uio.h> #ifndef GF_BSD_HOST_OS #include <alloca.h> @@ -50,6 +52,7 @@ #include "glusterfs3-xdr.h" #include "hashfn.h" #include "posix-aio.h" +#include "glusterfs-acl.h" extern char *marker_xattrs[]; #define ALIGN_SIZE 4096 @@ -128,7 +131,7 @@ posix_lookup (call_frame_t *frame, xlator_t *this, MAKE_ENTRY_HANDLE (real_path, par_path, this, loc, &buf); if (uuid_is_null (loc->inode->gfid)) { - posix_gfid_set (this, real_path, loc, xdata); + posix_gfid_heal (this, real_path, loc, xdata); MAKE_ENTRY_HANDLE (real_path, par_path, this, loc, &buf); } @@ -561,6 +564,289 @@ out: return 0; } +static int32_t +posix_do_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags, + off_t offset, size_t len, struct iatt *statpre, + struct iatt *statpost) +{ + struct posix_fd *pfd = NULL; + int32_t ret = -1; + + DECLARE_OLD_FS_ID_VAR; + + SET_FS_ID (frame->root->uid, frame->root->gid); + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (fd, out); + + ret = posix_fd_ctx_get (fd, this, &pfd); + if (ret < 0) { + gf_log (this->name, GF_LOG_DEBUG, + "pfd is NULL from fd=%p", fd); + goto out; + } + + ret = posix_fdstat (this, pfd->fd, statpre); + if (ret == -1) { + ret = -errno; + gf_log (this->name, GF_LOG_ERROR, + "fallocate (fstat) failed on fd=%p: %s", fd, + strerror (errno)); + goto out; + } + + ret = sys_fallocate(pfd->fd, flags, offset, len); + if (ret == -1) { + ret = -errno; + goto out; + } + + ret = posix_fdstat (this, pfd->fd, statpost); + if (ret == -1) { + ret = -errno; + gf_log (this->name, GF_LOG_ERROR, + "fallocate (fstat) failed on fd=%p: %s", fd, + strerror (errno)); + goto out; + } + +out: + SET_TO_OLD_FS_ID (); + + return ret; +} + +char* +_page_aligned_alloc (size_t size, char **aligned_buf) +{ + char *alloc_buf = NULL; + char *buf = NULL; + + alloc_buf = GF_CALLOC (1, (size + ALIGN_SIZE), gf_posix_mt_char); + if (!alloc_buf) + goto out; + /* page aligned buffer */ + buf = GF_ALIGN_BUF (alloc_buf, ALIGN_SIZE); + *aligned_buf = buf; +out: + return alloc_buf; +} + +static int32_t +_posix_do_zerofill(int fd, off_t offset, size_t len, int o_direct) +{ + size_t num_vect = 0; + int32_t num_loop = 1; + int32_t idx = 0; + int32_t op_ret = -1; + int32_t vect_size = VECTOR_SIZE; + size_t remain = 0; + size_t extra = 0; + struct iovec *vector = NULL; + char *iov_base = NULL; + char *alloc_buf = NULL; + + if (len == 0) + return 0; + if (len < VECTOR_SIZE) + vect_size = len; + + num_vect = len / (vect_size); + remain = len % vect_size ; + if (num_vect > MAX_NO_VECT) { + extra = num_vect % MAX_NO_VECT; + num_loop = num_vect / MAX_NO_VECT; + num_vect = MAX_NO_VECT; + } + + vector = GF_CALLOC (num_vect, sizeof(struct iovec), + gf_common_mt_iovec); + if (!vector) + return -1; + if (o_direct) { + alloc_buf = _page_aligned_alloc(vect_size, &iov_base); + if (!alloc_buf) { + gf_log ("_posix_do_zerofill", GF_LOG_DEBUG, + "memory alloc failed, vect_size %d: %s", + vect_size, strerror(errno)); + GF_FREE(vector); + return -1; + } + } else { + iov_base = GF_CALLOC (vect_size, sizeof(char), + gf_common_mt_char); + if (!iov_base) { + GF_FREE(vector); + return -1; + } + } + + for (idx = 0; idx < num_vect; idx++) { + vector[idx].iov_base = iov_base; + vector[idx].iov_len = vect_size; + } + lseek(fd, offset, SEEK_SET); + for (idx = 0; idx < num_loop; idx++) { + op_ret = writev(fd, vector, num_vect); + if (op_ret < 0) + goto err; + } + if (extra) { + op_ret = writev(fd, vector, extra); + if (op_ret < 0) + goto err; + } + if (remain) { + vector[0].iov_len = remain; + op_ret = writev(fd, vector , 1); + if (op_ret < 0) + goto err; + } +err: + if (o_direct) + GF_FREE(alloc_buf); + else + GF_FREE(iov_base); + GF_FREE(vector); + return op_ret; +} + +static int32_t +posix_do_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, + off_t offset, size_t len, struct iatt *statpre, + struct iatt *statpost) +{ + struct posix_fd *pfd = NULL; + int32_t ret = -1; + + DECLARE_OLD_FS_ID_VAR; + + SET_FS_ID (frame->root->uid, frame->root->gid); + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (fd, out); + + ret = posix_fd_ctx_get (fd, this, &pfd); + if (ret < 0) { + gf_log (this->name, GF_LOG_DEBUG, + "pfd is NULL from fd=%p", fd); + goto out; + } + + ret = posix_fdstat (this, pfd->fd, statpre); + if (ret == -1) { + ret = -errno; + gf_log (this->name, GF_LOG_ERROR, + "pre-operation fstat failed on fd = %p: %s", fd, + strerror (errno)); + goto out; + } + ret = _posix_do_zerofill(pfd->fd, offset, len, pfd->flags & O_DIRECT); + if (ret < 0) { + ret = -errno; + gf_log(this->name, GF_LOG_ERROR, + "zerofill failed on fd %d length %ld %s", + pfd->fd, len, strerror(errno)); + goto out; + } + if (pfd->flags & (O_SYNC|O_DSYNC)) { + ret = fsync (pfd->fd); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "fsync() in writev on fd %d failed: %s", + pfd->fd, strerror (errno)); + ret = -errno; + goto out; + } + } + + ret = posix_fdstat (this, pfd->fd, statpost); + if (ret == -1) { + ret = -errno; + gf_log (this->name, GF_LOG_ERROR, + "post operation fstat failed on fd=%p: %s", fd, + strerror (errno)); + goto out; + } + +out: + SET_TO_OLD_FS_ID (); + + return ret; +} + +static int32_t +_posix_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t keep_size, + off_t offset, size_t len, dict_t *xdata) +{ + int32_t ret; + int32_t flags = 0; + struct iatt statpre = {0,}; + struct iatt statpost = {0,}; + + if (keep_size) + flags = FALLOC_FL_KEEP_SIZE; + + ret = posix_do_fallocate(frame, this, fd, flags, offset, len, + &statpre, &statpost); + if (ret < 0) + goto err; + + STACK_UNWIND_STRICT(fallocate, frame, 0, 0, &statpre, &statpost, NULL); + return 0; + +err: + STACK_UNWIND_STRICT(fallocate, frame, -1, -ret, NULL, NULL, NULL); + return 0; +} + +static int32_t +posix_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + int32_t ret; + int32_t flags = FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE; + struct iatt statpre = {0,}; + struct iatt statpost = {0,}; + + ret = posix_do_fallocate(frame, this, fd, flags, offset, len, + &statpre, &statpost); + if (ret < 0) + goto err; + + STACK_UNWIND_STRICT(discard, frame, 0, 0, &statpre, &statpost, NULL); + return 0; + +err: + STACK_UNWIND_STRICT(discard, frame, -1, -ret, NULL, NULL, NULL); + return 0; + +} + +static int32_t +posix_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + int32_t ret = 0; + struct iatt statpre = {0,}; + struct iatt statpost = {0,}; + + ret = posix_do_zerofill(frame, this, fd, offset, len, + &statpre, &statpost); + if (ret < 0) + goto err; + + STACK_UNWIND_STRICT(zerofill, frame, 0, 0, &statpre, &statpost, NULL); + return 0; + +err: + STACK_UNWIND_STRICT(zerofill, frame, -1, -ret, NULL, NULL, NULL); + return 0; + +} + int32_t posix_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, dict_t *xdata) @@ -1969,11 +2255,7 @@ posix_readv (call_frame_t *frame, xlator_t *this, } /* Hack to notify higher layers of EOF. */ - if (stbuf.ia_size == 0) - op_errno = ENOENT; - else if ((offset + vec.iov_len) == stbuf.ia_size) - op_errno = ENOENT; - else if (offset > stbuf.ia_size) + if (!stbuf.ia_size || (offset + vec.iov_len) >= stbuf.ia_size) op_errno = ENOENT; op_ret = vec.iov_len; @@ -2018,22 +2300,6 @@ err: return op_ret; } -char* -_page_aligned_alloc (size_t size, char **aligned_buf) -{ - char *alloc_buf = NULL; - char *buf = NULL; - - alloc_buf = GF_CALLOC (1, (size + ALIGN_SIZE), gf_posix_mt_char); - if (!alloc_buf) - goto out; - /* page aligned buffer */ - buf = GF_ALIGN_BUF (alloc_buf, ALIGN_SIZE); - *aligned_buf = buf; -out: - return alloc_buf; -} - int32_t __posix_writev (int fd, struct iovec *vector, int count, off_t startoff, int odirect) @@ -2082,6 +2348,48 @@ err: return op_ret; } +dict_t* +_fill_writev_xdata (fd_t *fd, dict_t *xdata, xlator_t *this, int is_append) +{ + dict_t *rsp_xdata = NULL; + int32_t ret = 0; + inode_t *inode = NULL; + + if (fd) + inode = fd->inode; + + if (!fd || !fd->inode || uuid_is_null (fd->inode->gfid)) { + gf_log_callingfn (this->name, GF_LOG_ERROR, "Invalid Args: " + "fd: %p inode: %p gfid:%s", fd, inode?inode:0, + inode?uuid_utoa(inode->gfid):"N/A"); + goto out; + } + + if (!xdata || !dict_get (xdata, GLUSTERFS_OPEN_FD_COUNT)) + goto out; + + rsp_xdata = dict_new(); + if (!rsp_xdata) + goto out; + + ret = dict_set_uint32 (rsp_xdata, GLUSTERFS_OPEN_FD_COUNT, + fd->inode->fd_count); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, "%s: Failed to set " + "dictionary value for %s", uuid_utoa (fd->inode->gfid), + GLUSTERFS_OPEN_FD_COUNT); + } + + ret = dict_set_uint32 (rsp_xdata, GLUSTERFS_WRITE_IS_APPEND, + is_append); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, "%s: Failed to set " + "dictionary value for %s", uuid_utoa (fd->inode->gfid), + GLUSTERFS_WRITE_IS_APPEND); + } +out: + return rsp_xdata; +} int32_t posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, @@ -2096,6 +2404,9 @@ posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt preop = {0,}; struct iatt postop = {0,}; int ret = -1; + dict_t *rsp_xdata = NULL; + int is_append = 0; + gf_boolean_t locked = _gf_false; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); @@ -2117,6 +2428,17 @@ posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, _fd = pfd->fd; + if (xdata && dict_get (xdata, GLUSTERFS_WRITE_IS_APPEND)) { + /* The write_is_append check and write must happen + atomically. Else another write can overtake this + write after the check and get written earlier. + + So lock before preop-stat and unlock after write. + */ + locked = _gf_true; + LOCK(&fd->inode->lock); + } + op_ret = posix_fdstat (this, _fd, &preop); if (op_ret == -1) { op_errno = errno; @@ -2126,8 +2448,19 @@ posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, goto out; } + if (locked) { + if (preop.ia_size == offset || (fd->flags & O_APPEND)) + is_append = 1; + } + op_ret = __posix_writev (_fd, vector, count, offset, (pfd->flags & O_DIRECT)); + + if (locked) { + UNLOCK (&fd->inode->lock); + locked = _gf_false; + } + if (op_ret < 0) { op_errno = -op_ret; op_ret = -1; @@ -2143,14 +2476,21 @@ posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, UNLOCK (&priv->lock); if (op_ret >= 0) { + rsp_xdata = _fill_writev_xdata (fd, xdata, this, is_append); /* wiretv successful, we also need to get the stat of * the file we wrote to */ - if (pfd->flushwrites) { - /* NOTE: ignore the error, if one occurs at this - * point */ - fsync (_fd); + if (flags & (O_SYNC|O_DSYNC)) { + ret = fsync (_fd); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "fsync() in writev on fd %d failed: %s", + _fd, strerror (errno)); + op_ret = -1; + op_errno = errno; + goto out; + } } ret = posix_fdstat (this, _fd, &postop); @@ -2166,9 +2506,16 @@ posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, out: + if (locked) { + UNLOCK (&fd->inode->lock); + locked = _gf_false; + } + STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, &preop, &postop, - NULL); + rsp_xdata); + if (rsp_xdata) + dict_unref (rsp_xdata); return 0; } @@ -2295,6 +2642,33 @@ out: } +int +posix_batch_fsync (call_frame_t *frame, xlator_t *this, + fd_t *fd, int datasync, dict_t *xdata) +{ + call_stub_t *stub = NULL; + struct posix_private *priv = NULL; + + priv = this->private; + + stub = fop_fsync_stub (frame, default_fsync, fd, datasync, xdata); + if (!stub) { + STACK_UNWIND_STRICT (fsync, frame, -1, ENOMEM, 0, 0, 0); + return 0; + } + + pthread_mutex_lock (&priv->fsync_mutex); + { + list_add_tail (&stub->list, &priv->fsyncs); + priv->fsync_queue_count++; + pthread_cond_signal (&priv->fsync_cond); + } + pthread_mutex_unlock (&priv->fsync_mutex); + + return 0; +} + + int32_t posix_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, dict_t *xdata) @@ -2306,6 +2680,7 @@ posix_fsync (call_frame_t *frame, xlator_t *this, int ret = -1; struct iatt preop = {0,}; struct iatt postop = {0,}; + struct posix_private *priv = NULL; DECLARE_OLD_FS_ID_VAR; @@ -2321,6 +2696,12 @@ posix_fsync (call_frame_t *frame, xlator_t *this, goto out; #endif + priv = this->private; + if (priv->batch_fsync_mode && xdata && dict_get (xdata, "batch-fsync")) { + posix_batch_fsync (frame, this, fd, datasync, xdata); + return 0; + } + ret = posix_fd_ctx_get (fd, this, &pfd); if (ret < 0) { op_errno = -ret; @@ -2433,6 +2814,53 @@ out: return 0; } + +int +posix_xattr_get_real_filename (call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *key, dict_t *dict, dict_t *xdata) +{ + char *real_path = NULL; + struct dirent *dirent = NULL; + DIR *fd = NULL; + const char *fname = NULL; + char *found = NULL; + int ret = -1; + int op_ret = -1; + + MAKE_INODE_HANDLE (real_path, this, loc, NULL); + + fd = opendir (real_path); + if (!fd) + return -errno; + + fname = key + strlen (GF_XATTR_GET_REAL_FILENAME_KEY); + + while ((dirent = readdir (fd))) { + if (strcasecmp (dirent->d_name, fname) == 0) { + found = gf_strdup (dirent->d_name); + if (!found) { + closedir (fd); + return -ENOMEM; + } + break; + } + } + + closedir (fd); + + if (!found) + return -ENOENT; + + ret = dict_set_dynstr (dict, (char *)key, found); + if (ret) { + GF_FREE (found); + return -ENOMEM; + } + ret = strlen (found) + 1; + + return ret; +} + /** * posix_getxattr - this function returns a dictionary with all the * key:value pair present as xattr. used for @@ -2487,9 +2915,29 @@ posix_getxattr (call_frame_t *frame, xlator_t *this, dict = dict_new (); if (!dict) { + op_errno = ENOMEM; goto out; } + if (loc->inode && name && + (strncmp (name, GF_XATTR_GET_REAL_FILENAME_KEY, + strlen (GF_XATTR_GET_REAL_FILENAME_KEY)) == 0)) { + ret = posix_xattr_get_real_filename (frame, this, loc, + name, dict, xdata); + if (ret < 0) { + op_ret = -1; + op_errno = -ret; + gf_log (this->name, (op_errno == ENOENT) ? + GF_LOG_DEBUG : GF_LOG_WARNING, + "Failed to get real filename (%s, %s): %s", + loc->path, name, strerror (op_errno)); + goto out; + } + + size = ret; + goto done; + } + if (loc->inode && name && !strcmp (name, GLUSTERFS_OPEN_FD_COUNT)) { if (!list_empty (&loc->inode->fd_list)) { ret = dict_set_uint32 (dict, (char *)name, 1); @@ -2513,8 +2961,13 @@ posix_getxattr (call_frame_t *frame, xlator_t *this, else rpath = real_path; - (void) snprintf (host_buf, 1024, "<POSIX(%s):%s:%s>", - priv->base_path, priv->hostname, rpath); + (void) snprintf (host_buf, 1024, + "<POSIX(%s):%s:%s>", priv->base_path, + ((priv->node_uuid_pathinfo + && !uuid_is_null(priv->glusterd_uuid)) + ? uuid_utoa (priv->glusterd_uuid) + : priv->hostname), + rpath); dyn_rpath = gf_strdup (host_buf); if (!dyn_rpath) { @@ -2590,6 +3043,11 @@ posix_getxattr (call_frame_t *frame, xlator_t *this, "supported (try remounting" " brick with 'user_xattr' " "flag)"); + } else if (op_errno == ENOATTR || + op_errno == ENODATA) { + gf_log (this->name, GF_LOG_DEBUG, + "No such attribute:%s for file %s", + key, real_path); } else { gf_log (this->name, GF_LOG_ERROR, "getxattr failed on %s: %s (%s)", @@ -2966,6 +3424,28 @@ out: return 0; } +int +_posix_remove_xattr (dict_t *dict, char *key, data_t *value, void *data) +{ + int32_t op_ret = 0; + xlator_t *this = NULL; + posix_xattr_filler_t *filler = NULL; + + filler = (posix_xattr_filler_t *) data; + this = filler->this; + + op_ret = sys_lremovexattr (filler->real_path, key); + if (op_ret == -1) { + filler->op_errno = errno; + if (errno != ENOATTR && errno != EPERM) + gf_log (this->name, GF_LOG_ERROR, + "removexattr failed on %s (for %s): %s", + filler->real_path, key, strerror (errno)); + } + + return op_ret; +} + int32_t posix_removexattr (call_frame_t *frame, xlator_t *this, @@ -2974,6 +3454,7 @@ posix_removexattr (call_frame_t *frame, xlator_t *this, int32_t op_ret = -1; int32_t op_errno = 0; char * real_path = NULL; + posix_xattr_filler_t filler = {0,}; DECLARE_OLD_FS_ID_VAR; @@ -2989,6 +3470,22 @@ posix_removexattr (call_frame_t *frame, xlator_t *this, SET_FS_ID (frame->root->uid, frame->root->gid); + /** + * sending an empty key name with xdata containing the + * list of key(s) to be removed implies "bulk remove request" + * for removexattr. + */ + if (name && (strcmp (name, "") == 0) && xdata) { + filler.real_path = real_path; + filler.this = this; + op_ret = dict_foreach (xdata, _posix_remove_xattr, &filler); + if (op_ret) { + op_errno = filler.op_errno; + } + + goto out; + } + op_ret = sys_lremovexattr (real_path, name); if (op_ret == -1) { op_errno = errno; @@ -3848,8 +4345,23 @@ posix_do_readdir (call_frame_t *frame, xlator_t *this, */ ret = dict_get_int32 (dict, GF_READDIR_SKIP_DIRS, &skip_dirs); - count = posix_fill_readdir (fd, dir, off, size, &entries, this, - skip_dirs); + LOCK (&fd->lock); + { + /* posix_fill_readdir performs multiple separate individual + readdir() calls to fill up the buffer. + + In case of NFS where the same anonymous FD is shared between + different applications, reading a common directory can + result in the anonymous fd getting re-used unsafely between + the two readdir requests (in two different io-threads). + + It would also help, in the future, to replace the loop + around readdir() with a single large getdents() call. + */ + count = posix_fill_readdir (fd, dir, off, size, &entries, this, + skip_dirs); + } + UNLOCK (&fd->lock); /* pick ENOENT to indicate EOF */ op_errno = errno; @@ -4052,6 +4564,27 @@ posix_set_owner (xlator_t *this, uid_t uid, gid_t gid) return ret; } + +static int +set_batch_fsync_mode (struct posix_private *priv, const char *str) +{ + if (strcmp (str, "none") == 0) + priv->batch_fsync_mode = BATCH_NONE; + else if (strcmp (str, "syncfs") == 0) + priv->batch_fsync_mode = BATCH_SYNCFS; + else if (strcmp (str, "syncfs-single-fsync") == 0) + priv->batch_fsync_mode = BATCH_SYNCFS_SINGLE_FSYNC; + else if (strcmp (str, "syncfs-reverse-fsync") == 0) + priv->batch_fsync_mode = BATCH_SYNCFS_REVERSE_FSYNC; + else if (strcmp (str, "reverse-fsync") == 0) + priv->batch_fsync_mode = BATCH_REVERSE_FSYNC; + else + return -1; + + return 0; +} + + int reconfigure (xlator_t *this, dict_t *options) { @@ -4059,6 +4592,7 @@ reconfigure (xlator_t *this, dict_t *options) struct posix_private *priv = NULL; uid_t uid = -1; gid_t gid = -1; + char *batch_fsync_mode_str = NULL; priv = this->private; @@ -4066,6 +4600,18 @@ reconfigure (xlator_t *this, dict_t *options) GF_OPTION_RECONF ("brick-gid", gid, options, uint32, out); posix_set_owner (this, uid, gid); + GF_OPTION_RECONF ("batch-fsync-delay-usec", priv->batch_fsync_delay_usec, + options, uint32, out); + + GF_OPTION_RECONF ("batch-fsync-mode", batch_fsync_mode_str, + options, str, out); + + if (set_batch_fsync_mode (priv, batch_fsync_mode_str) != 0) { + gf_log (this->name, GF_LOG_ERROR, "Unknown mode string: %s", + batch_fsync_mode_str); + goto out; + } + GF_OPTION_RECONF ("linux-aio", priv->aio_configured, options, bool, out); @@ -4074,6 +4620,20 @@ reconfigure (xlator_t *this, dict_t *options) else posix_aio_off (this); + GF_OPTION_RECONF ("node-uuid-pathinfo", priv->node_uuid_pathinfo, + options, bool, out); + + if (priv->node_uuid_pathinfo && + (uuid_is_null (priv->glusterd_uuid))) { + gf_log (this->name, GF_LOG_INFO, + "glusterd uuid is NULL, pathinfo xattr would" + " fallback to <hostname>:<export>"); + } + + GF_OPTION_RECONF ("health-check-interval", priv->health_check_interval, + options, uint32, out); + posix_spawn_health_check_thread (this); + ret = 0; out: return ret; @@ -4103,6 +4663,7 @@ init (xlator_t *this) char *guuid = NULL; uid_t uid = -1; gid_t gid = -1; + char *batch_fsync_mode_str; dir_data = dict_get (this->options, "directory"); @@ -4255,7 +4816,7 @@ init (xlator_t *this) } } - size = sys_lgetxattr (dir_data->data, "system.posix_acl_access", + size = sys_lgetxattr (dir_data->data, POSIX_ACL_ACCESS_XATTR, NULL, 0); if ((size < 0) && (errno == ENOTSUP)) gf_log (this->name, GF_LOG_WARNING, @@ -4435,11 +4996,48 @@ init (xlator_t *this) } } + GF_OPTION_INIT ("node-uuid-pathinfo", + _private->node_uuid_pathinfo, bool, out); + if (_private->node_uuid_pathinfo && + (uuid_is_null (_private->glusterd_uuid))) { + gf_log (this->name, GF_LOG_INFO, + "glusterd uuid is NULL, pathinfo xattr would" + " fallback to <hostname>:<export>"); + } + + _private->health_check_active = _gf_false; + GF_OPTION_INIT ("health-check-interval", + _private->health_check_interval, uint32, out); + if (_private->health_check_interval) + posix_spawn_health_check_thread (this); + pthread_mutex_init (&_private->janitor_lock, NULL); pthread_cond_init (&_private->janitor_cond, NULL); INIT_LIST_HEAD (&_private->janitor_fds); posix_spawn_janitor_thread (this); + + pthread_mutex_init (&_private->fsync_mutex, NULL); + pthread_cond_init (&_private->fsync_cond, NULL); + INIT_LIST_HEAD (&_private->fsyncs); + + ret = gf_thread_create (&_private->fsyncer, NULL, posix_fsyncer, this); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "fsyncer thread" + " creation failed (%s)", strerror (errno)); + goto out; + } + + GF_OPTION_INIT ("batch-fsync-mode", batch_fsync_mode_str, str, out); + + if (set_batch_fsync_mode (_private, batch_fsync_mode_str) != 0) { + gf_log (this->name, GF_LOG_ERROR, "Unknown mode string: %s", + batch_fsync_mode_str); + goto out; + } + + GF_OPTION_INIT ("batch-fsync-delay-usec", _private->batch_fsync_delay_usec, + uint32, out); out: return ret; } @@ -4505,6 +5103,9 @@ struct xlator_fops fops = { .fxattrop = posix_fxattrop, .setattr = posix_setattr, .fsetattr = posix_fsetattr, + .fallocate = _posix_fallocate, + .discard = posix_discard, + .zerofill = posix_zerofill, }; struct xlator_cbks cbks = { @@ -4552,5 +5153,40 @@ struct volume_options options[] = { .validate = GF_OPT_VALIDATE_MIN, .description = "Support for setting gid of brick's owner" }, + { .key = {"node-uuid-pathinfo"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "return glusterd's node-uuid in pathinfo xattr" + " string instead of hostname" + }, + { + .key = {"health-check-interval"}, + .type = GF_OPTION_TYPE_INT, + .min = 0, + .default_value = "30", + .validate = GF_OPT_VALIDATE_MIN, + .description = "Interval in seconds for a filesystem health check, " + "set to 0 to disable" + }, + { .key = {"batch-fsync-mode"}, + .type = GF_OPTION_TYPE_STR, + .default_value = "reverse-fsync", + .description = "Possible values:\n" + "\t- syncfs: Perform one syncfs() on behalf oa batch" + "of fsyncs.\n" + "\t- syncfs-single-fsync: Perform one syncfs() on behalf of a batch" + " of fsyncs and one fsync() per batch.\n" + "\t- syncfs-reverse-fsync: Preform one syncfs() on behalf of a batch" + " of fsyncs and fsync() each file in the batch in reverse order.\n" + " in reverse order.\n" + "\t- reverse-fsync: Perform fsync() of each file in the batch in" + " reverse order." + }, + { .key = {"batch-fsync-delay-usec"}, + .type = GF_OPTION_TYPE_INT, + .default_value = "0", + .description = "Num of usecs to wait for aggregating fsync" + " requests", + }, { .key = {NULL} } }; diff --git a/xlators/storage/posix/src/posix.h b/xlators/storage/posix/src/posix.h index 4703a1fd4..3121db271 100644 --- a/xlators/storage/posix/src/posix.h +++ b/xlators/storage/posix/src/posix.h @@ -43,12 +43,15 @@ #include "timer.h" #include "posix-mem-types.h" #include "posix-handle.h" +#include "call-stub.h" #ifdef HAVE_LIBAIO #include <libaio.h> #include "posix-aio.h" #endif +#define VECTOR_SIZE 64 * 1024 /* vector size 64KB*/ +#define MAX_NO_VECT 1024 /** * posix_fd - internal structure common to file and directory fd's */ @@ -57,7 +60,6 @@ struct posix_fd { int fd; /* fd returned by the kernel */ int32_t flags; /* flags for open/creat */ DIR * dir; /* handle returned by the kernel */ - int flushwrites; int odirect; struct list_head list; /* to add to the janitor list */ }; @@ -125,6 +127,30 @@ struct posix_private { io_context_t ctxp; pthread_t aiothread; #endif + + /* node-uuid in pathinfo xattr */ + gf_boolean_t node_uuid_pathinfo; + + pthread_t fsyncer; + struct list_head fsyncs; + pthread_mutex_t fsync_mutex; + pthread_cond_t fsync_cond; + int fsync_queue_count; + + enum { + BATCH_NONE = 0, + BATCH_SYNCFS, + BATCH_SYNCFS_SINGLE_FSYNC, + BATCH_REVERSE_FSYNC, + BATCH_SYNCFS_REVERSE_FSYNC + } batch_fsync_mode; + + uint32_t batch_fsync_delay_usec; + + /* seconds to sleep between health checks */ + uint32_t health_check_interval; + pthread_t health_check; + gf_boolean_t health_check_active; }; typedef struct { @@ -136,6 +162,7 @@ typedef struct { inode_t *inode; /* for all do_xattrop() key handling */ int fd; int flags; + int32_t op_errno; } posix_xattr_filler_t; @@ -163,7 +190,7 @@ int posix_get_file_contents (xlator_t *this, uuid_t pargfid, int posix_set_file_contents (xlator_t *this, const char *path, char *key, data_t *value, int flags); int posix_acl_xattr_set (xlator_t *this, const char *path, dict_t *xattr_req); -int posix_gfid_heal (xlator_t *this, const char *path, dict_t *xattr_req); +int posix_gfid_heal (xlator_t *this, const char *path, loc_t *loc, dict_t *xattr_req); int posix_entry_create_xattr_set (xlator_t *this, const char *path, dict_t *dict); @@ -175,4 +202,7 @@ gf_boolean_t posix_special_xattr (char **pattern, char *key); void __posix_fd_set_odirect (fd_t *fd, struct posix_fd *pfd, int opflags, off_t offset, size_t size); +void posix_spawn_health_check_thread (xlator_t *this); + +void *posix_fsyncer (void *); #endif /* _POSIX_H */ |
